From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 4 Dec 2008 20:12:29 +0100
Subject: [PATCH 0001/2198] performance counters: core code

Implement the core kernel bits of Performance Counters subsystem.

The Linux Performance Counter subsystem provides an abstraction of
performance counter hardware capabilities. It provides per task and per
CPU counters, and it provides event capabilities on top of those.

Performance counters are accessed via special file descriptors.
There's one file descriptor per virtual counter used.

The special file descriptor is opened via the perf_counter_open()
system call:

 int
 perf_counter_open(u32 hw_event_type,
                   u32 hw_event_period,
                   u32 record_type,
                   pid_t pid,
                   int cpu);

The syscall returns the new fd. The fd can be used via the normal
VFS system calls: read() can be used to read the counter, fcntl()
can be used to set the blocking mode, etc.

Multiple counters can be kept open at a time, and the counters
can be poll()ed.

See more details in Documentation/perf-counters.txt.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/char/sysrq.c         |   2 +
 include/linux/perf_counter.h | 171 +++++++
 include/linux/sched.h        |   9 +
 include/linux/syscalls.h     |   6 +
 init/Kconfig                 |  29 ++
 kernel/Makefile              |   1 +
 kernel/fork.c                |   1 +
 kernel/perf_counter.c        | 943 +++++++++++++++++++++++++++++++++++
 kernel/sched.c               |  24 +
 kernel/sys_ni.c              |   3 +
 10 files changed, 1189 insertions(+)
 create mode 100644 include/linux/perf_counter.h
 create mode 100644 kernel/perf_counter.c

diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c
index ce0d9da52a8ab..52146c2a8d972 100644
--- a/drivers/char/sysrq.c
+++ b/drivers/char/sysrq.c
@@ -25,6 +25,7 @@
 #include <linux/kbd_kern.h>
 #include <linux/proc_fs.h>
 #include <linux/quotaops.h>
+#include <linux/perf_counter.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
@@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)
 	struct pt_regs *regs = get_irq_regs();
 	if (regs)
 		show_regs(regs);
+	perf_counter_print_debug();
 }
 static struct sysrq_key_op sysrq_showregs_op = {
 	.handler	= sysrq_handle_showregs,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
new file mode 100644
index 0000000000000..22c4469abf44c
--- /dev/null
+++ b/include/linux/perf_counter.h
@@ -0,0 +1,171 @@
+/*
+ *  Performance counters:
+ *
+ *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
+ *
+ *  Data type definitions, declarations, prototypes.
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_PERF_COUNTER_H
+#define _LINUX_PERF_COUNTER_H
+
+#include <asm/atomic.h>
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+
+struct task_struct;
+
+/*
+ * Generalized hardware event types, used by the hw_event_type parameter
+ * of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	PERF_COUNT_CYCLES,
+	PERF_COUNT_INSTRUCTIONS,
+	PERF_COUNT_CACHE_REFERENCES,
+	PERF_COUNT_CACHE_MISSES,
+	PERF_COUNT_BRANCH_INSTRUCTIONS,
+	PERF_COUNT_BRANCH_MISSES,
+	/*
+	 * If this bit is set in the type, then trigger NMI sampling:
+	 */
+	PERF_COUNT_NMI			= (1 << 30),
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_record_type {
+	PERF_RECORD_SIMPLE,
+	PERF_RECORD_IRQ,
+	PERF_RECORD_GROUP,
+};
+
+/**
+ * struct hw_perf_counter - performance counter hardware details
+ */
+struct hw_perf_counter {
+	u64			config;
+	unsigned long		config_base;
+	unsigned long		counter_base;
+	int			nmi;
+	unsigned int		idx;
+	u64			prev_count;
+	s32			next_count;
+	u64			irq_period;
+};
+
+/*
+ * Hardcoded buffer length limit for now, for IRQ-fed events:
+ */
+#define PERF_DATA_BUFLEN	2048
+
+/**
+ * struct perf_data - performance counter IRQ data sampling ...
+ */
+struct perf_data {
+	int			len;
+	int			rd_idx;
+	int			overrun;
+	u8			data[PERF_DATA_BUFLEN];
+};
+
+/**
+ * struct perf_counter - performance counter kernel representation:
+ */
+struct perf_counter {
+	struct list_head		list;
+	int				active;
+#if BITS_PER_LONG == 64
+	atomic64_t			count;
+#else
+	atomic_t			count32[2];
+#endif
+	u64				__irq_period;
+
+	struct hw_perf_counter		hw;
+
+	struct perf_counter_context	*ctx;
+	struct task_struct		*task;
+
+	/*
+	 * Protect attach/detach:
+	 */
+	struct mutex			mutex;
+
+	int				oncpu;
+	int				cpu;
+
+	s32				hw_event_type;
+	enum perf_record_type		record_type;
+
+	/* read() / irq related data */
+	wait_queue_head_t		waitq;
+	/* optional: for NMIs */
+	int				wakeup_pending;
+	struct perf_data		*irqdata;
+	struct perf_data		*usrdata;
+	struct perf_data		data[2];
+};
+
+/**
+ * struct perf_counter_context - counter context structure
+ *
+ * Used as a container for task counters and CPU counters as well:
+ */
+struct perf_counter_context {
+#ifdef CONFIG_PERF_COUNTERS
+	/*
+	 * Protect the list of counters:
+	 */
+	spinlock_t		lock;
+	struct list_head	counters;
+	int			nr_counters;
+	int			nr_active;
+	struct task_struct	*task;
+#endif
+};
+
+/**
+ * struct perf_counter_cpu_context - per cpu counter context structure
+ */
+struct perf_cpu_context {
+	struct perf_counter_context	ctx;
+	struct perf_counter_context	*task_ctx;
+	int				active_oncpu;
+	int				max_pertask;
+};
+
+/*
+ * Set by architecture code:
+ */
+extern int perf_max_counters;
+
+#ifdef CONFIG_PERF_COUNTERS
+extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
+extern void perf_counter_task_tick(struct task_struct *task, int cpu);
+extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_print_debug(void);
+#else
+static inline void
+perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
+static inline void
+perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
+static inline void
+perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
+static inline void perf_counter_init_task(struct task_struct *task)	{ }
+static inline void perf_counter_notify(struct pt_regs *regs)		{ }
+static inline void perf_counter_print_debug(void)			{ }
+#endif
+
+#endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 55e30d1144779..4c530278391b7 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,6 +71,7 @@ struct sched_param {
 #include <linux/fs_struct.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
+#include <linux/perf_counter.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
@@ -1326,6 +1327,7 @@ struct task_struct {
 	struct list_head pi_state_list;
 	struct futex_pi_state *pi_state_cache;
 #endif
+	struct perf_counter_context perf_counter_ctx;
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;
@@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk)
 #define TASK_SIZE_OF(tsk)	TASK_SIZE
 #endif
 
+/*
+ * Call the function if the target task is executing on a CPU right now:
+ */
+extern void task_oncpu_function_call(struct task_struct *p,
+				     void (*func) (void *info), void *info);
+
+
 #ifdef CONFIG_MM_OWNER
 extern void mm_update_next_owner(struct mm_struct *mm);
 extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 04fb47bfb920d..6cce728a6263e 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+		      u32 hw_event_period,
+		      u32 record_type,
+		      pid_t pid,
+		      int cpu);
 #endif
diff --git a/init/Kconfig b/init/Kconfig
index f763762d544a1..78bede218f10f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -732,6 +732,35 @@ config AIO
           by some high performance threaded applications. Disabling
           this option saves about 7k.
 
+config HAVE_PERF_COUNTERS
+	bool
+
+menu "Performance Counters"
+
+config PERF_COUNTERS
+	bool "Kernel Performance Counters"
+	depends on HAVE_PERF_COUNTERS
+	default y
+	help
+	  Enable kernel support for performance counter hardware.
+
+	  Performance counters are special hardware registers available
+	  on most modern CPUs. These registers count the number of certain
+	  types of hw events: such as instructions executed, cachemisses
+	  suffered, or branches mis-predicted - without slowing down the
+	  kernel or applications. These registers can also trigger interrupts
+	  when a threshold number of events have passed - and can thus be
+	  used to profile the code that runs on that CPU.
+
+	  The Linux Performance Counter subsystem provides an abstraction of
+	  these hardware capabilities, available via a system call. It
+	  provides per task and per CPU counters, and it provides event
+	  capabilities on top of those.
+
+	  Say Y if unsure.
+
+endmenu
+
 config VM_EVENT_COUNTERS
 	default y
 	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
diff --git a/kernel/Makefile b/kernel/Makefile
index 19fad003b19d6..1f184a1dc406a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/fork.c b/kernel/fork.c
index 2a372a0e206fa..441fadff1fa47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	rt_mutex_init_task(p);
+	perf_counter_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
new file mode 100644
index 0000000000000..20508f053658f
--- /dev/null
+++ b/kernel/perf_counter.c
@@ -0,0 +1,943 @@
+/*
+ * Performance counter core code
+ *
+ *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/fs.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
+#include <linux/poll.h>
+#include <linux/sysfs.h>
+#include <linux/ptrace.h>
+#include <linux/percpu.h>
+#include <linux/uaccess.h>
+#include <linux/syscalls.h>
+#include <linux/anon_inodes.h>
+#include <linux/perf_counter.h>
+
+/*
+ * Each CPU has a list of per CPU counters:
+ */
+DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
+
+int perf_max_counters __read_mostly;
+static int perf_reserved_percpu __read_mostly;
+static int perf_overcommit __read_mostly = 1;
+
+/*
+ * Mutex for (sysadmin-configurable) counter reservations:
+ */
+static DEFINE_MUTEX(perf_resource_mutex);
+
+/*
+ * Architecture provided APIs - weak aliases:
+ */
+
+int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
+{
+	return -EINVAL;
+}
+
+void __weak hw_perf_counter_enable(struct perf_counter *counter)	 { }
+void __weak hw_perf_counter_disable(struct perf_counter *counter)	 { }
+void __weak hw_perf_counter_read(struct perf_counter *counter)		 { }
+void __weak hw_perf_disable_all(void) { }
+void __weak hw_perf_enable_all(void) { }
+void __weak hw_perf_counter_setup(void) { }
+
+#if BITS_PER_LONG == 64
+
+/*
+ * Read the cached counter in counter safe against cross CPU / NMI
+ * modifications. 64 bit version - no complications.
+ */
+static inline u64 perf_read_counter_safe(struct perf_counter *counter)
+{
+	return (u64) atomic64_read(&counter->count);
+}
+
+#else
+
+/*
+ * Read the cached counter in counter safe against cross CPU / NMI
+ * modifications. 32 bit version.
+ */
+static u64 perf_read_counter_safe(struct perf_counter *counter)
+{
+	u32 cntl, cnth;
+
+	local_irq_disable();
+	do {
+		cnth = atomic_read(&counter->count32[1]);
+		cntl = atomic_read(&counter->count32[0]);
+	} while (cnth != atomic_read(&counter->count32[1]));
+
+	local_irq_enable();
+
+	return cntl | ((u64) cnth) << 32;
+}
+
+#endif
+
+/*
+ * Cross CPU call to remove a performance counter
+ *
+ * We disable the counter on the hardware level first. After that we
+ * remove it from the context list.
+ */
+static void __perf_remove_from_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+
+	if (counter->active) {
+		hw_perf_counter_disable(counter);
+		counter->active = 0;
+		ctx->nr_active--;
+		cpuctx->active_oncpu--;
+		counter->task = NULL;
+	}
+	ctx->nr_counters--;
+
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level. NOP for non NMI based counters.
+	 */
+	hw_perf_disable_all();
+	list_del_init(&counter->list);
+	hw_perf_enable_all();
+
+	if (!ctx->task) {
+		/*
+		 * Allow more per task counters with respect to the
+		 * reservation:
+		 */
+		cpuctx->max_pertask =
+			min(perf_max_counters - ctx->nr_counters,
+			    perf_max_counters - perf_reserved_percpu);
+	}
+
+	spin_unlock(&ctx->lock);
+}
+
+
+/*
+ * Remove the counter from a task's (or a CPU's) list of counters.
+ *
+ * Must be called with counter->mutex held.
+ *
+ * CPU counters are removed with a smp call. For task counters we only
+ * call when the task is on a CPU.
+ */
+static void perf_remove_from_context(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Per cpu counters are removed via an smp call and
+		 * the removal is always sucessful.
+		 */
+		smp_call_function_single(counter->cpu,
+					 __perf_remove_from_context,
+					 counter, 1);
+		return;
+	}
+
+retry:
+	task_oncpu_function_call(task, __perf_remove_from_context,
+				 counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active we need to retry the smp call.
+	 */
+	if (ctx->nr_active && !list_empty(&counter->list)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents that this context is scheduled in so we
+	 * can remove the counter safely, if it the call above did not
+	 * succeed.
+	 */
+	if (!list_empty(&counter->list)) {
+		ctx->nr_counters--;
+		list_del_init(&counter->list);
+		counter->task = NULL;
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Cross CPU call to install and enable a preformance counter
+ */
+static void __perf_install_in_context(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+	int cpu = smp_processor_id();
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level. NOP for non NMI based counters.
+	 */
+	hw_perf_disable_all();
+	list_add_tail(&counter->list, &ctx->counters);
+	hw_perf_enable_all();
+
+	ctx->nr_counters++;
+
+	if (cpuctx->active_oncpu < perf_max_counters) {
+		hw_perf_counter_enable(counter);
+		counter->active = 1;
+		counter->oncpu = cpu;
+		ctx->nr_active++;
+		cpuctx->active_oncpu++;
+	}
+
+	if (!ctx->task && cpuctx->max_pertask)
+		cpuctx->max_pertask--;
+
+	spin_unlock(&ctx->lock);
+}
+
+/*
+ * Attach a performance counter to a context
+ *
+ * First we add the counter to the list with the hardware enable bit
+ * in counter->hw_config cleared.
+ *
+ * If the counter is attached to a task which is on a CPU we use a smp
+ * call to enable it in the task context. The task might have been
+ * scheduled away, but we check this in the smp call again.
+ */
+static void
+perf_install_in_context(struct perf_counter_context *ctx,
+			struct perf_counter *counter,
+			int cpu)
+{
+	struct task_struct *task = ctx->task;
+
+	counter->ctx = ctx;
+	if (!task) {
+		/*
+		 * Per cpu counters are installed via an smp call and
+		 * the install is always sucessful.
+		 */
+		smp_call_function_single(cpu, __perf_install_in_context,
+					 counter, 1);
+		return;
+	}
+
+	counter->task = task;
+retry:
+	task_oncpu_function_call(task, __perf_install_in_context,
+				 counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the context is active and the counter has not been added
+	 * we need to retry the smp call.
+	 */
+	if (ctx->nr_active && list_empty(&counter->list)) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * The lock prevents that this context is scheduled in so we
+	 * can add the counter safely, if it the call above did not
+	 * succeed.
+	 */
+	if (list_empty(&counter->list)) {
+		list_add_tail(&counter->list, &ctx->counters);
+		ctx->nr_counters++;
+	}
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Called from scheduler to remove the counters of the current task,
+ * with interrupts disabled.
+ *
+ * We stop each counter and update the counter value in counter->count.
+ *
+ * This does not protect us against NMI, but hw_perf_counter_disable()
+ * sets the disabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * not restart the counter.
+ */
+void perf_counter_task_sched_out(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter *counter;
+
+	if (likely(!cpuctx->task_ctx))
+		return;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (!ctx->nr_active)
+			break;
+		if (counter->active) {
+			hw_perf_counter_disable(counter);
+			counter->active = 0;
+			counter->oncpu = -1;
+			ctx->nr_active--;
+			cpuctx->active_oncpu--;
+		}
+	}
+	spin_unlock(&ctx->lock);
+	cpuctx->task_ctx = NULL;
+}
+
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but hw_perf_counter_enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter *counter;
+
+	if (likely(!ctx->nr_counters))
+		return;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (ctx->nr_active == cpuctx->max_pertask)
+			break;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		hw_perf_counter_enable(counter);
+		counter->active = 1;
+		counter->oncpu = cpu;
+		ctx->nr_active++;
+		cpuctx->active_oncpu++;
+	}
+	spin_unlock(&ctx->lock);
+	cpuctx->task_ctx = ctx;
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter *counter;
+
+	if (likely(!ctx->nr_counters))
+		return;
+
+	perf_counter_task_sched_out(curr, cpu);
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Rotate the first entry last:
+	 */
+	hw_perf_disable_all();
+	list_for_each_entry(counter, &ctx->counters, list) {
+		list_del(&counter->list);
+		list_add_tail(&counter->list, &ctx->counters);
+		break;
+	}
+	hw_perf_enable_all();
+
+	spin_unlock(&ctx->lock);
+
+	perf_counter_task_sched_in(curr, cpu);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *task)
+{
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counters);
+	ctx->nr_counters = 0;
+	ctx->task = task;
+}
+
+/*
+ * Cross CPU call to read the hardware counter
+ */
+static void __hw_perf_counter_read(void *info)
+{
+	hw_perf_counter_read(info);
+}
+
+static u64 perf_read_counter(struct perf_counter *counter)
+{
+	/*
+	 * If counter is enabled and currently active on a CPU, update the
+	 * value in the counter structure:
+	 */
+	if (counter->active) {
+		smp_call_function_single(counter->oncpu,
+					 __hw_perf_counter_read, counter, 1);
+	}
+
+	return perf_read_counter_safe(counter);
+}
+
+/*
+ * Cross CPU call to switch performance data pointers
+ */
+static void __perf_switch_irq_data(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_data *oldirqdata = counter->irqdata;
+
+	/*
+	 * If this is a task context, we need to check whether it is
+	 * the current task context of this cpu. If not it has been
+	 * scheduled out before the smp call arrived.
+	 */
+	if (ctx->task) {
+		if (cpuctx->task_ctx != ctx)
+			return;
+		spin_lock(&ctx->lock);
+	}
+
+	/* Change the pointer NMI safe */
+	atomic_long_set((atomic_long_t *)&counter->irqdata,
+			(unsigned long) counter->usrdata);
+	counter->usrdata = oldirqdata;
+
+	if (ctx->task)
+		spin_unlock(&ctx->lock);
+}
+
+static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_data *oldirqdata = counter->irqdata;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		smp_call_function_single(counter->cpu,
+					 __perf_switch_irq_data,
+					 counter, 1);
+		return counter->usrdata;
+	}
+
+retry:
+	spin_lock_irq(&ctx->lock);
+	if (!counter->active) {
+		counter->irqdata = counter->usrdata;
+		counter->usrdata = oldirqdata;
+		spin_unlock_irq(&ctx->lock);
+		return oldirqdata;
+	}
+	spin_unlock_irq(&ctx->lock);
+	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
+	/* Might have failed, because task was scheduled out */
+	if (counter->irqdata == oldirqdata)
+		goto retry;
+
+	return counter->usrdata;
+}
+
+static void put_context(struct perf_counter_context *ctx)
+{
+	if (ctx->task)
+		put_task_struct(ctx->task);
+}
+
+static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+	struct task_struct *task;
+
+	/*
+	 * If cpu is not a wildcard then this is a percpu counter:
+	 */
+	if (cpu != -1) {
+		/* Must be root to operate on a CPU counter: */
+		if (!capable(CAP_SYS_ADMIN))
+			return ERR_PTR(-EACCES);
+
+		if (cpu < 0 || cpu > num_possible_cpus())
+			return ERR_PTR(-EINVAL);
+
+		/*
+		 * We could be clever and allow to attach a counter to an
+		 * offline CPU and activate it when the CPU comes up, but
+		 * that's for later.
+		 */
+		if (!cpu_isset(cpu, cpu_online_map))
+			return ERR_PTR(-ENODEV);
+
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		ctx = &cpuctx->ctx;
+
+		WARN_ON_ONCE(ctx->task);
+		return ctx;
+	}
+
+	rcu_read_lock();
+	if (!pid)
+		task = current;
+	else
+		task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task)
+		return ERR_PTR(-ESRCH);
+
+	ctx = &task->perf_counter_ctx;
+	ctx->task = task;
+
+	/* Reuse ptrace permission checks for now. */
+	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
+		put_context(ctx);
+		return ERR_PTR(-EACCES);
+	}
+
+	return ctx;
+}
+
+/*
+ * Called when the last reference to the file is gone.
+ */
+static int perf_release(struct inode *inode, struct file *file)
+{
+	struct perf_counter *counter = file->private_data;
+	struct perf_counter_context *ctx = counter->ctx;
+
+	file->private_data = NULL;
+
+	mutex_lock(&counter->mutex);
+
+	perf_remove_from_context(counter);
+	put_context(ctx);
+
+	mutex_unlock(&counter->mutex);
+
+	kfree(counter);
+
+	return 0;
+}
+
+/*
+ * Read the performance counter - simple non blocking version for now
+ */
+static ssize_t
+perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
+{
+	u64 cntval;
+
+	if (count != sizeof(cntval))
+		return -EINVAL;
+
+	mutex_lock(&counter->mutex);
+	cntval = perf_read_counter(counter);
+	mutex_unlock(&counter->mutex);
+
+	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+}
+
+static ssize_t
+perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
+{
+	if (!usrdata->len)
+		return 0;
+
+	count = min(count, (size_t)usrdata->len);
+	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
+		return -EFAULT;
+
+	/* Adjust the counters */
+	usrdata->len -= count;
+	if (!usrdata->len)
+		usrdata->rd_idx = 0;
+	else
+		usrdata->rd_idx += count;
+
+	return count;
+}
+
+static ssize_t
+perf_read_irq_data(struct perf_counter	*counter,
+		   char __user		*buf,
+		   size_t		count,
+		   int			nonblocking)
+{
+	struct perf_data *irqdata, *usrdata;
+	DECLARE_WAITQUEUE(wait, current);
+	ssize_t res;
+
+	irqdata = counter->irqdata;
+	usrdata = counter->usrdata;
+
+	if (usrdata->len + irqdata->len >= count)
+		goto read_pending;
+
+	if (nonblocking)
+		return -EAGAIN;
+
+	spin_lock_irq(&counter->waitq.lock);
+	__add_wait_queue(&counter->waitq, &wait);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (usrdata->len + irqdata->len >= count)
+			break;
+
+		if (signal_pending(current))
+			break;
+
+		spin_unlock_irq(&counter->waitq.lock);
+		schedule();
+		spin_lock_irq(&counter->waitq.lock);
+	}
+	__remove_wait_queue(&counter->waitq, &wait);
+	__set_current_state(TASK_RUNNING);
+	spin_unlock_irq(&counter->waitq.lock);
+
+	if (usrdata->len + irqdata->len < count)
+		return -ERESTARTSYS;
+read_pending:
+	mutex_lock(&counter->mutex);
+
+	/* Drain pending data first: */
+	res = perf_copy_usrdata(usrdata, buf, count);
+	if (res < 0 || res == count)
+		goto out;
+
+	/* Switch irq buffer: */
+	usrdata = perf_switch_irq_data(counter);
+	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+		if (!res)
+			res = -EFAULT;
+	} else {
+		res = count;
+	}
+out:
+	mutex_unlock(&counter->mutex);
+
+	return res;
+}
+
+static ssize_t
+perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct perf_counter *counter = file->private_data;
+
+	switch (counter->record_type) {
+	case PERF_RECORD_SIMPLE:
+		return perf_read_hw(counter, buf, count);
+
+	case PERF_RECORD_IRQ:
+	case PERF_RECORD_GROUP:
+		return perf_read_irq_data(counter, buf, count,
+					  file->f_flags & O_NONBLOCK);
+	}
+	return -EINVAL;
+}
+
+static unsigned int perf_poll(struct file *file, poll_table *wait)
+{
+	struct perf_counter *counter = file->private_data;
+	unsigned int events = 0;
+	unsigned long flags;
+
+	poll_wait(file, &counter->waitq, wait);
+
+	spin_lock_irqsave(&counter->waitq.lock, flags);
+	if (counter->usrdata->len || counter->irqdata->len)
+		events |= POLLIN;
+	spin_unlock_irqrestore(&counter->waitq.lock, flags);
+
+	return events;
+}
+
+static const struct file_operations perf_fops = {
+	.release		= perf_release,
+	.read			= perf_read,
+	.poll			= perf_poll,
+};
+
+/*
+ * Allocate and initialize a counter structure
+ */
+static struct perf_counter *
+perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
+{
+	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+
+	if (!counter)
+		return NULL;
+
+	mutex_init(&counter->mutex);
+	INIT_LIST_HEAD(&counter->list);
+	init_waitqueue_head(&counter->waitq);
+
+	counter->irqdata	= &counter->data[0];
+	counter->usrdata	= &counter->data[1];
+	counter->cpu		= cpu;
+	counter->record_type	= record_type;
+	counter->__irq_period	= hw_event_period;
+	counter->wakeup_pending = 0;
+
+	return counter;
+}
+
+/**
+ * sys_perf_task_open - open a performance counter associate it to a task
+ * @hw_event_type:	event type for monitoring/sampling...
+ * @pid:		target pid
+ */
+asmlinkage int
+sys_perf_counter_open(u32 hw_event_type,
+		      u32 hw_event_period,
+		      u32 record_type,
+		      pid_t pid,
+		      int cpu)
+{
+	struct perf_counter_context *ctx;
+	struct perf_counter *counter;
+	int ret;
+
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	ret = -ENOMEM;
+	counter = perf_counter_alloc(hw_event_period, cpu, record_type);
+	if (!counter)
+		goto err_put_context;
+
+	ret = hw_perf_counter_init(counter, hw_event_type);
+	if (ret)
+		goto err_free_put_context;
+
+	perf_install_in_context(ctx, counter, cpu);
+
+	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (ret < 0)
+		goto err_remove_free_put_context;
+
+	return ret;
+
+err_remove_free_put_context:
+	mutex_lock(&counter->mutex);
+	perf_remove_from_context(counter);
+	mutex_unlock(&counter->mutex);
+
+err_free_put_context:
+	kfree(counter);
+
+err_put_context:
+	put_context(ctx);
+
+	return ret;
+}
+
+static void __cpuinit perf_init_cpu(int cpu)
+{
+	struct perf_cpu_context *ctx;
+
+	ctx = &per_cpu(perf_cpu_context, cpu);
+	spin_lock_init(&ctx->ctx.lock);
+	INIT_LIST_HEAD(&ctx->ctx.counters);
+
+	mutex_lock(&perf_resource_mutex);
+	ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+	mutex_unlock(&perf_resource_mutex);
+	hw_perf_counter_setup();
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void __perf_exit_cpu(void *info)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+	struct perf_counter *counter, *tmp;
+
+	list_for_each_entry_safe(counter, tmp, &ctx->counters, list)
+		__perf_remove_from_context(counter);
+
+}
+static void perf_exit_cpu(int cpu)
+{
+	smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
+}
+#else
+static inline void perf_exit_cpu(int cpu) { }
+#endif
+
+static int __cpuinit
+perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (long)hcpu;
+
+	switch (action) {
+
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		perf_init_cpu(cpu);
+		break;
+
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		perf_exit_cpu(cpu);
+		break;
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata perf_cpu_nb = {
+	.notifier_call		= perf_cpu_notify,
+};
+
+static int __init perf_counter_init(void)
+{
+	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	register_cpu_notifier(&perf_cpu_nb);
+
+	return 0;
+}
+early_initcall(perf_counter_init);
+
+static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_reserved_percpu);
+}
+
+static ssize_t
+perf_set_reserve_percpu(struct sysdev_class *class,
+			const char *buf,
+			size_t count)
+{
+	struct perf_cpu_context *cpuctx;
+	unsigned long val;
+	int err, cpu, mpt;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > perf_max_counters)
+		return -EINVAL;
+
+	mutex_lock(&perf_resource_mutex);
+	perf_reserved_percpu = val;
+	for_each_online_cpu(cpu) {
+		cpuctx = &per_cpu(perf_cpu_context, cpu);
+		spin_lock_irq(&cpuctx->ctx.lock);
+		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters,
+			  perf_max_counters - perf_reserved_percpu);
+		cpuctx->max_pertask = mpt;
+		spin_unlock_irq(&cpuctx->ctx.lock);
+	}
+	mutex_unlock(&perf_resource_mutex);
+
+	return count;
+}
+
+static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf)
+{
+	return sprintf(buf, "%d\n", perf_overcommit);
+}
+
+static ssize_t
+perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
+{
+	unsigned long val;
+	int err;
+
+	err = strict_strtoul(buf, 10, &val);
+	if (err)
+		return err;
+	if (val > 1)
+		return -EINVAL;
+
+	mutex_lock(&perf_resource_mutex);
+	perf_overcommit = val;
+	mutex_unlock(&perf_resource_mutex);
+
+	return count;
+}
+
+static SYSDEV_CLASS_ATTR(
+				reserve_percpu,
+				0644,
+				perf_show_reserve_percpu,
+				perf_set_reserve_percpu
+			);
+
+static SYSDEV_CLASS_ATTR(
+				overcommit,
+				0644,
+				perf_show_overcommit,
+				perf_set_overcommit
+			);
+
+static struct attribute *perfclass_attrs[] = {
+	&attr_reserve_percpu.attr,
+	&attr_overcommit.attr,
+	NULL
+};
+
+static struct attribute_group perfclass_attr_group = {
+	.attrs			= perfclass_attrs,
+	.name			= "perf_counters",
+};
+
+static int __init perf_counter_sysfs_init(void)
+{
+	return sysfs_create_group(&cpu_sysdev_class.kset.kobj,
+				  &perfclass_attr_group);
+}
+device_initcall(perf_counter_sysfs_init);
+
diff --git a/kernel/sched.c b/kernel/sched.c
index b7480fb5c3dc2..254d56de2548d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:		the task to evaluate
+ * @func:	the function to be called
+ * @info:	the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+			      void (*func) (void *info), void *info)
+{
+	int cpu;
+
+	preempt_disable();
+	cpu = task_cpu(p);
+	if (task_curr(p))
+		smp_call_function_single(cpu, func, info, 1);
+	preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
 	fire_sched_out_preempt_notifiers(prev, next);
+	perf_counter_task_sched_out(prev, cpu_of(rq));
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
@@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	prev_state = prev->state;
 	finish_arch_switch(prev);
+	perf_counter_task_sched_in(current, cpu_of(rq));
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
 	if (current->sched_class->post_schedule)
@@ -4296,6 +4319,7 @@ void scheduler_tick(void)
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
+	perf_counter_task_tick(curr, cpu);
 }
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index e14a232817076..4be8bbc7577cc 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime);
 cond_syscall(compat_sys_timerfd_gettime);
 cond_syscall(sys_eventfd);
 cond_syscall(sys_eventfd2);
+
+/* performance counters: */
+cond_syscall(sys_perf_counter_open);
-- 
GitLab


From e7bc62b6b3aeaa8849f8383e0cfb7ca6c003adc6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Dec 2008 20:13:45 +0100
Subject: [PATCH 0002/2198] performance counters: documentation

Add more documentation about performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf-counters.txt | 104 ++++++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 Documentation/perf-counters.txt

diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
new file mode 100644
index 0000000000000..19033a0bb5262
--- /dev/null
+++ b/Documentation/perf-counters.txt
@@ -0,0 +1,104 @@
+
+Performance Counters for Linux
+------------------------------
+
+Performance counters are special hardware registers available on most modern
+CPUs. These registers count the number of certain types of hw events: such
+as instructions executed, cachemisses suffered, or branches mis-predicted -
+without slowing down the kernel or applications. These registers can also
+trigger interrupts when a threshold number of events have passed - and can
+thus be used to profile the code that runs on that CPU.
+
+The Linux Performance Counter subsystem provides an abstraction of these
+hardware capabilities. It provides per task and per CPU counters, and
+it provides event capabilities on top of those.
+
+Performance counters are accessed via special file descriptors.
+There's one file descriptor per virtual counter used.
+
+The special file descriptor is opened via the perf_counter_open()
+system call:
+
+ int
+ perf_counter_open(u32 hw_event_type,
+                   u32 hw_event_period,
+                   u32 record_type,
+                   pid_t pid,
+                   int cpu);
+
+The syscall returns the new fd. The fd can be used via the normal
+VFS system calls: read() can be used to read the counter, fcntl()
+can be used to set the blocking mode, etc.
+
+Multiple counters can be kept open at a time, and the counters
+can be poll()ed.
+
+When creating a new counter fd, 'hw_event_type' is one of:
+
+ enum hw_event_types {
+	PERF_COUNT_CYCLES,
+	PERF_COUNT_INSTRUCTIONS,
+	PERF_COUNT_CACHE_REFERENCES,
+	PERF_COUNT_CACHE_MISSES,
+	PERF_COUNT_BRANCH_INSTRUCTIONS,
+	PERF_COUNT_BRANCH_MISSES,
+ };
+
+These are standardized types of events that work uniformly on all CPUs
+that implements Performance Counters support under Linux. If a CPU is
+not able to count branch-misses, then the system call will return
+-EINVAL.
+
+[ Note: more hw_event_types are supported as well, but they are CPU
+  specific and are enumerated via /sys on a per CPU basis. Raw hw event
+  types can be passed in as negative numbers. For example, to count
+  "External bus cycles while bus lock signal asserted" events on Intel
+  Core CPUs, pass in a -0x4064 event type value. ]
+
+The parameter 'hw_event_period' is the number of events before waking up
+a read() that is blocked on a counter fd. Zero value means a non-blocking
+counter.
+
+'record_type' is the type of data that a read() will provide for the
+counter, and it can be one of:
+
+  enum perf_record_type {
+	PERF_RECORD_SIMPLE,
+	PERF_RECORD_IRQ,
+  };
+
+a "simple" counter is one that counts hardware events and allows
+them to be read out into a u64 count value. (read() returns 8 on
+a successful read of a simple counter.)
+
+An "irq" counter is one that will also provide an IRQ context information:
+the IP of the interrupted context. In this case read() will return
+the 8-byte counter value, plus the Instruction Pointer address of the
+interrupted context.
+
+The 'pid' parameter allows the counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so)
+
+ pid < 0: all tasks are counted (per cpu counters)
+
+The 'cpu' parameter allows a counter to be made specific to a full
+CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.
+
+A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets schedule to. Per task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
+all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
+
-- 
GitLab


From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Dec 2008 10:39:53 +0100
Subject: [PATCH 0003/2198] performance counters: x86 support

Implement performance counters for x86 Intel CPUs.

It's simplified right now: the PERFMON CPU feature is assumed,
which is available in Core2 and later Intel CPUs.

The design is flexible to be extended to more CPU types as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                              |   1 +
 arch/x86/ia32/ia32entry.S                     |   3 +-
 arch/x86/include/asm/hardirq_32.h             |   1 +
 arch/x86/include/asm/hw_irq.h                 |   2 +
 arch/x86/include/asm/intel_arch_perfmon.h     |  34 +-
 arch/x86/include/asm/irq_vectors.h            |   5 +
 .../x86/include/asm/mach-default/entry_arch.h |   5 +
 arch/x86/include/asm/pda.h                    |   1 +
 arch/x86/include/asm/thread_info.h            |   4 +-
 arch/x86/include/asm/unistd_32.h              |   1 +
 arch/x86/include/asm/unistd_64.h              |   3 +-
 arch/x86/kernel/apic.c                        |   2 +
 arch/x86/kernel/cpu/Makefile                  |  12 +-
 arch/x86/kernel/cpu/common.c                  |   2 +
 arch/x86/kernel/cpu/perf_counter.c            | 571 ++++++++++++++++++
 arch/x86/kernel/entry_64.S                    |   5 +
 arch/x86/kernel/irq.c                         |   5 +
 arch/x86/kernel/irqinit_32.c                  |   3 +
 arch/x86/kernel/irqinit_64.c                  |   5 +
 arch/x86/kernel/signal.c                      |   7 +-
 arch/x86/kernel/syscall_table_32.S            |   1 +
 21 files changed, 652 insertions(+), 21 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_counter.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d4d4cb7629ea9..f2fdc18672412 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,6 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
+	select HAVE_PERF_COUNTERS
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 256b00b61892b..3c14ed07dc4e7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -823,7 +823,8 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd4
 	.quad sys_eventfd2
 	.quad sys_epoll_create1
-	.quad sys_dup3			/* 330 */
+	.quad sys_dup3				/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
+	.quad sys_perf_counter_open
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
index 5ca135e72f2b0..b3e475dc93384 100644
--- a/arch/x86/include/asm/hardirq_32.h
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -9,6 +9,7 @@ typedef struct {
 	unsigned long idle_timestamp;
 	unsigned int __nmi_count;	/* arch dependent */
 	unsigned int apic_timer_irqs;	/* arch dependent */
+	unsigned int apic_perf_irqs;	/* arch dependent */
 	unsigned int irq0_irqs;
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 8de644b6b9599..aa93e53b85ee4 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,8 @@
 /* Interrupt handlers registered during init_IRQ */
 extern void apic_timer_interrupt(void);
 extern void error_interrupt(void);
+extern void perf_counter_interrupt(void);
+
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
 extern void reschedule_interrupt(void);
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
index fa0fd068bc2e2..71598a9eab617 100644
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -1,22 +1,24 @@
 #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
 #define _ASM_X86_INTEL_ARCH_PERFMON_H
 
-#define MSR_ARCH_PERFMON_PERFCTR0		0xc1
-#define MSR_ARCH_PERFMON_PERFCTR1		0xc2
+#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
+#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
-#define MSR_ARCH_PERFMON_EVENTSEL0		0x186
-#define MSR_ARCH_PERFMON_EVENTSEL1		0x187
+#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186
+#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187
 
-#define ARCH_PERFMON_EVENTSEL0_ENABLE	(1 << 22)
-#define ARCH_PERFMON_EVENTSEL_INT	(1 << 20)
-#define ARCH_PERFMON_EVENTSEL_OS	(1 << 17)
-#define ARCH_PERFMON_EVENTSEL_USR	(1 << 16)
+#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22)
+#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20)
+#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
+#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
 
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8)
-#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
+#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
-	(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
+
+#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
 
 union cpuid10_eax {
 	struct {
@@ -28,4 +30,12 @@ union cpuid10_eax {
 	unsigned int full;
 };
 
+#ifdef CONFIG_PERF_COUNTERS
+extern void init_hw_perf_counters(void);
+extern void perf_counters_lapic_init(int nmi);
+#else
+static inline void init_hw_perf_counters(void)		{ }
+static inline void perf_counters_lapic_init(int nmi)	{ }
+#endif
+
 #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 0005adb0f941f..b8d277f1252f9 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -86,6 +86,11 @@
  */
 #define LOCAL_TIMER_VECTOR	0xef
 
+/*
+ * Performance monitoring interrupt vector:
+ */
+#define LOCAL_PERF_VECTOR	0xee
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
index 6b1add8e31dde..ad31e5d90e908 100644
--- a/arch/x86/include/asm/mach-default/entry_arch.h
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
  * a much simpler SMP time architecture:
  */
 #ifdef CONFIG_X86_LOCAL_APIC
+
 BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
 BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
+#ifdef CONFIG_PERF_COUNTERS
+BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+#endif
+
 #ifdef CONFIG_X86_MCE_P4THERMAL
 BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
 #endif
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
index 2fbfff88df37b..90a8d9d4206b9 100644
--- a/arch/x86/include/asm/pda.h
+++ b/arch/x86/include/asm/pda.h
@@ -30,6 +30,7 @@ struct x8664_pda {
 	short isidle;
 	struct mm_struct *active_mm;
 	unsigned apic_timer_irqs;
+	unsigned apic_perf_irqs;
 	unsigned irq0_irqs;
 	unsigned irq_resched_count;
 	unsigned irq_call_count;
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index e44d379faad2b..810bf266d134f 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -80,6 +80,7 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -103,6 +104,7 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
+#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -135,7 +137,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f2bba78430a4d..7e47658b0a6f2 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -338,6 +338,7 @@
 #define __NR_dup3		330
 #define __NR_pipe2		331
 #define __NR_inotify_init1	332
+#define __NR_perf_counter_open	333
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index d2e415e6666f6..53025feaf88dc 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3)
 __SYSCALL(__NR_pipe2, sys_pipe2)
 #define __NR_inotify_init1			294
 __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
-
+#define __NR_perf_counter_open		295
+__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 16f94879b5257..8ab8c18586723 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,6 +31,7 @@
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 
+#include <asm/intel_arch_perfmon.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
@@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
+	perf_counters_lapic_init(0);
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 82ec6075c057b..89e53361fe245 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -1,5 +1,5 @@
 #
-# Makefile for x86-compatible CPU details and quirks
+# Makefile for x86-compatible CPU details, features and quirks
 #
 
 obj-y			:= intel_cacheinfo.o addon_cpuid_features.o
@@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64)	+= centaur_64.o
 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
 
-obj-$(CONFIG_X86_MCE)	+= mcheck/
-obj-$(CONFIG_MTRR)	+= mtrr/
-obj-$(CONFIG_CPU_FREQ)	+= cpufreq/
+obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o
 
-obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
+obj-$(CONFIG_X86_MCE)			+= mcheck/
+obj-$(CONFIG_MTRR)			+= mtrr/
+obj-$(CONFIG_CPU_FREQ)			+= cpufreq/
+
+obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o
 
 quiet_cmd_mkcapflags = MKCAP   $@
       cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b9c9ea0217a9b..4461011db47c6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,6 +17,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/intel_arch_perfmon.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
 #include <asm/numa.h>
@@ -750,6 +751,7 @@ void __init identify_boot_cpu(void)
 #else
 	vgetcpu_set_mode();
 #endif
+	init_hw_perf_counters();
 }
 
 void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
new file mode 100644
index 0000000000000..82440cbed0e6a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -0,0 +1,571 @@
+/*
+ * Performance counter x86 architecture code
+ *
+ *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/perf_counter.h>
+#include <linux/capability.h>
+#include <linux/notifier.h>
+#include <linux/hardirq.h>
+#include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/sched.h>
+
+#include <asm/intel_arch_perfmon.h>
+#include <asm/apic.h>
+
+static bool perf_counters_initialized __read_mostly;
+
+/*
+ * Number of (generic) HW counters:
+ */
+static int nr_hw_counters __read_mostly;
+static u32 perf_counter_mask __read_mostly;
+
+/* No support for fixed function counters yet */
+
+#define MAX_HW_COUNTERS		8
+
+struct cpu_hw_counters {
+	struct perf_counter	*counters[MAX_HW_COUNTERS];
+	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+	int			enable_all;
+};
+
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+const int intel_perfmon_event_map[] =
+{
+  [PERF_COUNT_CYCLES]			= 0x003c,
+  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
+  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+};
+
+const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+
+/*
+ * Setup the hardware configuration for a given hw_event_type
+ */
+int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (unlikely(!perf_counters_initialized))
+		return -EINVAL;
+
+	/*
+	 * Count user events, and generate PMC IRQs:
+	 * (keep 'enabled' bit clear for now)
+	 */
+	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+
+	/*
+	 * If privileged enough, count OS events too, and allow
+	 * NMI events as well:
+	 */
+	hwc->nmi = 0;
+	if (capable(CAP_SYS_ADMIN)) {
+		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
+		if (hw_event_type & PERF_COUNT_NMI)
+			hwc->nmi = 1;
+	}
+
+	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
+	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+
+	hwc->irq_period = counter->__irq_period;
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	if (!hwc->irq_period)
+		hwc->irq_period = 0x7FFFFFFF;
+
+	hwc->next_count = -((s32) hwc->irq_period);
+
+	/*
+	 * Negative event types mean raw encoded event+umask values:
+	 */
+	if (hw_event_type < 0) {
+		counter->hw_event_type = -hw_event_type;
+		counter->hw_event_type &= ~PERF_COUNT_NMI;
+	} else {
+		hw_event_type &= ~PERF_COUNT_NMI;
+		if (hw_event_type >= max_intel_perfmon_events)
+			return -EINVAL;
+		/*
+		 * The generic map:
+		 */
+		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+	}
+	hwc->config |= counter->hw_event_type;
+	counter->wakeup_pending = 0;
+
+	return 0;
+}
+
+static void __hw_perf_enable_all(void)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+}
+
+void hw_perf_enable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	cpuc->enable_all = 1;
+	__hw_perf_enable_all();
+}
+
+void hw_perf_disable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	cpuc->enable_all = 0;
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+}
+
+static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+
+	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+void hw_perf_counter_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	/* Try to get the previous counter again */
+	if (test_and_set_bit(idx, cpuc->used)) {
+		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+		set_bit(idx, cpuc->used);
+		hwc->idx = idx;
+	}
+
+	perf_counters_lapic_init(hwc->nmi);
+
+	wrmsr(hwc->config_base + idx,
+	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+	cpuc->counters[idx] = counter;
+	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	__hw_perf_counter_enable(hwc, idx);
+}
+
+#ifdef CONFIG_X86_64
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+	atomic64_set(&counter->count, val);
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic64_read(&counter->count);
+}
+#else
+/*
+ * Todo: add proper atomic64_t support to 32-bit x86:
+ */
+static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+	u32 *val32 = (void *)&val64;
+
+	atomic_set(counter->count32 + 0, *(val32 + 0));
+	atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+static inline u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic_read(counter->count32 + 0) |
+		(u64) atomic_read(counter->count32 + 1) << 32;
+}
+#endif
+
+static void __hw_perf_save_counter(struct perf_counter *counter,
+				   struct hw_perf_counter *hwc, int idx)
+{
+	s64 raw = -1;
+	s64 delta;
+	int err;
+
+	/*
+	 * Get the raw hw counter value:
+	 */
+	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
+	WARN_ON_ONCE(err);
+
+	/*
+	 * Rebase it to zero (it started counting at -irq_period),
+	 * to see the delta since ->prev_count:
+	 */
+	delta = (s64)hwc->irq_period + (s64)(s32)raw;
+
+	atomic64_counter_set(counter, hwc->prev_count + delta);
+
+	/*
+	 * Adjust the ->prev_count offset - if we went beyond
+	 * irq_period of units, then we got an IRQ and the counter
+	 * was set back to -irq_period:
+	 */
+	while (delta >= (s64)hwc->irq_period) {
+		hwc->prev_count += hwc->irq_period;
+		delta -= (s64)hwc->irq_period;
+	}
+
+	/*
+	 * Calculate the next raw counter value we'll write into
+	 * the counter at the next sched-in time:
+	 */
+	delta -= (s64)hwc->irq_period;
+
+	hwc->next_count = (s32)delta;
+}
+
+void perf_counter_print_debug(void)
+{
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	int cpu, err, idx;
+
+	local_irq_disable();
+
+	cpu = smp_processor_id();
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
+	WARN_ON_ONCE(err);
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
+	WARN_ON_ONCE(err);
+
+	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
+	WARN_ON_ONCE(err);
+
+	printk(KERN_INFO "\n");
+	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
+	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+
+	for (idx = 0; idx < nr_hw_counters; idx++) {
+		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+		WARN_ON_ONCE(err);
+
+		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
+		WARN_ON_ONCE(err);
+
+		next_count = per_cpu(prev_next_count[idx], cpu);
+
+		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
+			cpu, idx, pmc_ctrl);
+		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
+			cpu, idx, next_count);
+	}
+	local_irq_enable();
+}
+
+void hw_perf_counter_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+	unsigned int idx = hwc->idx;
+
+	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+
+	clear_bit(idx, cpuc->used);
+	cpuc->counters[idx] = NULL;
+	__hw_perf_save_counter(counter, hwc, idx);
+}
+
+void hw_perf_counter_read(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	unsigned long addr = hwc->counter_base + hwc->idx;
+	s64 offs, val = -1LL;
+	s32 val32;
+	int err;
+
+	/* Careful: NMI might modify the counter offset */
+	do {
+		offs = hwc->prev_count;
+		err = rdmsrl_safe(addr, &val);
+		WARN_ON_ONCE(err);
+	} while (offs != hwc->prev_count);
+
+	val32 = (s32) val;
+	val =  (s64)hwc->irq_period + (s64)val32;
+	atomic64_counter_set(counter, hwc->prev_count + val);
+}
+
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_save_and_restart(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int idx = hwc->idx;
+
+	wrmsr(hwc->config_base + idx,
+	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+
+	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+		__hw_perf_save_counter(counter, hwc, idx);
+		__hw_perf_counter_enable(hwc, idx);
+	}
+}
+
+static void
+perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+{
+	struct perf_counter_context *ctx = leader->ctx;
+	struct perf_counter *counter;
+	int bit;
+
+	list_for_each_entry(counter, &ctx->counters, list) {
+		if (counter->record_type != PERF_RECORD_SIMPLE ||
+		    counter == leader)
+			continue;
+
+		if (counter->active) {
+			/*
+			 * When counter was not in the overflow mask, we have to
+			 * read it from hardware. We read it as well, when it
+			 * has not been read yet and clear the bit in the
+			 * status mask.
+			 */
+			bit = counter->hw.idx;
+			if (!test_bit(bit, (unsigned long *) overflown) ||
+			    test_bit(bit, (unsigned long *) status)) {
+				clear_bit(bit, (unsigned long *) status);
+				perf_save_and_restart(counter);
+			}
+		}
+		perf_store_irq_data(leader, counter->hw_event_type);
+		perf_store_irq_data(leader, atomic64_counter_read(counter));
+	}
+}
+
+/*
+ * This handler is triggered by the local APIC, so the APIC IRQ handling
+ * rules apply:
+ */
+static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+{
+	int bit, cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc;
+	u64 ack, status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (!status) {
+		ack_APIC_irq();
+		return;
+	}
+
+	/* Disable counters globally */
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	ack_APIC_irq();
+
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+again:
+	ack = status;
+	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		clear_bit(bit, (unsigned long *) &status);
+		if (!counter)
+			continue;
+
+		perf_save_and_restart(counter);
+
+		switch (counter->record_type) {
+		case PERF_RECORD_SIMPLE:
+			continue;
+		case PERF_RECORD_IRQ:
+			perf_store_irq_data(counter, instruction_pointer(regs));
+			break;
+		case PERF_RECORD_GROUP:
+			perf_store_irq_data(counter, counter->hw_event_type);
+			perf_store_irq_data(counter,
+					    atomic64_counter_read(counter));
+			perf_handle_group(counter, &status, &ack);
+			break;
+		}
+		/*
+		 * From NMI context we cannot call into the scheduler to
+		 * do a task wakeup - but we mark these counters as
+		 * wakeup_pending and initate a wakeup callback:
+		 */
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+		} else {
+			wake_up(&counter->waitq);
+		}
+	}
+
+	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+
+	/*
+	 * Repeat if there is more work to be done:
+	 */
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (status)
+		goto again;
+
+	/*
+	 * Do not reenable when global enable is off:
+	 */
+	if (cpuc->enable_all)
+		__hw_perf_enable_all();
+}
+
+void smp_perf_counter_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+#ifdef CONFIG_X86_64
+	add_pda(apic_perf_irqs, 1);
+#else
+	per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++;
+#endif
+	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	__smp_perf_counter_interrupt(regs, 0);
+
+	irq_exit();
+}
+
+/*
+ * This handler is triggered by NMI contexts:
+ */
+void perf_counter_notify(struct pt_regs *regs)
+{
+	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
+	int bit, cpu;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	for_each_bit(bit, cpuc->used, nr_hw_counters) {
+		struct perf_counter *counter = cpuc->counters[bit];
+
+		if (!counter)
+			continue;
+
+		if (counter->wakeup_pending) {
+			counter->wakeup_pending = 0;
+			wake_up(&counter->waitq);
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+void __cpuinit perf_counters_lapic_init(int nmi)
+{
+	u32 apic_val;
+
+	if (!perf_counters_initialized)
+		return;
+	/*
+	 * Enable the performance counter vector in the APIC LVT:
+	 */
+	apic_val = apic_read(APIC_LVTERR);
+
+	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
+	if (nmi)
+		apic_write(APIC_LVTPC, APIC_DM_NMI);
+	else
+		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	apic_write(APIC_LVTERR, apic_val);
+}
+
+static int __kprobes
+perf_counter_nmi_handler(struct notifier_block *self,
+			 unsigned long cmd, void *__args)
+{
+	struct die_args *args = __args;
+	struct pt_regs *regs;
+
+	if (likely(cmd != DIE_NMI_IPI))
+		return NOTIFY_DONE;
+
+	regs = args->regs;
+
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
+	__smp_perf_counter_interrupt(regs, 1);
+
+	return NOTIFY_STOP;
+}
+
+static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
+	.notifier_call		= perf_counter_nmi_handler
+};
+
+void __init init_hw_perf_counters(void)
+{
+	union cpuid10_eax eax;
+	unsigned int unused;
+	unsigned int ebx;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	/*
+	 * Check whether the Architectural PerfMon supports
+	 * Branch Misses Retired Event or not.
+	 */
+	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
+		return;
+
+	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
+
+	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
+	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+	nr_hw_counters = eax.split.num_counters;
+	if (nr_hw_counters > MAX_HW_COUNTERS) {
+		nr_hw_counters = MAX_HW_COUNTERS;
+		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
+			nr_hw_counters, MAX_HW_COUNTERS);
+	}
+	perf_counter_mask = (1 << nr_hw_counters) - 1;
+	perf_max_counters = nr_hw_counters;
+
+	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
+
+	perf_counters_lapic_init(0);
+	register_die_notifier(&perf_counter_nmi_notifier);
+
+	perf_counters_initialized = true;
+}
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3194636a42933..fc013cfde3071 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \
 apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
+#ifdef CONFIG_PERF_COUNTERS
+apicinterrupt LOCAL_PERF_VECTOR \
+	perf_counter_interrupt smp_perf_counter_interrupt
+#endif
+
 /*
  * Exception entry points.
  */
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d1d4dc52f649c..d92bc71e41a7c 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
 	seq_printf(p, "  Local timer interrupts\n");
+	seq_printf(p, "CNT: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
+	seq_printf(p, "  Performance counter interrupts\n");
 #endif
 #ifdef CONFIG_SMP
 	seq_printf(p, "RES: ");
@@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	sum += irq_stats(cpu)->apic_timer_irqs;
+	sum += irq_stats(cpu)->apic_perf_irqs;
 #endif
 #ifdef CONFIG_SMP
 	sum += irq_stats(cpu)->irq_resched_count;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 607db63044a5e..6a33b5e30161c 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -160,6 +160,9 @@ void __init native_init_IRQ(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+# ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+# endif
 #endif
 
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8670b3ce626e7..91d785c25ad9e 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -138,6 +138,11 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	/* Performance monitoring interrupt: */
+#ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+#endif
 }
 
 void __init native_init_IRQ(void)
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index b1cc6da64208b..dee553c503d32 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,7 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-
+#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
+	if (thread_info_flags & _TIF_PERF_COUNTERS) {
+		clear_thread_flag(TIF_PERF_COUNTERS);
+		perf_counter_notify(regs);
+	}
+
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index d44395ff34c34..496726ddcea16 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -332,3 +332,4 @@ ENTRY(sys_call_table)
 	.long sys_dup3			/* 330 */
 	.long sys_pipe2
 	.long sys_inotify_init1
+	.long sys_perf_counter_open
-- 
GitLab


From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Dec 2008 14:20:16 +0100
Subject: [PATCH 0004/2198] x86, perfcounters: read out
 MSR_CORE_PERF_GLOBAL_STATUS with counters disabled

Impact: make perfcounter NMI and IRQ sequence more robust

Make __smp_perf_counter_interrupt() a bit more conservative: first disable
all counters, then read out the status. Most invocations are because there
are real events, so there's no performance impact.

Code flow gets a bit simpler as well this way.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82440cbed0e6a..615e953208ef9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	struct cpu_hw_counters *cpuc;
 	u64 ack, status;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-	if (!status) {
-		ack_APIC_irq();
-		return;
-	}
-
 	/* Disable counters globally */
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 	ack_APIC_irq();
 
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	if (!status)
+		goto out;
+
 again:
 	ack = status;
 	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
@@ -440,7 +438,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (status)
 		goto again;
-
+out:
 	/*
 	 * Do not reenable when global enable is off:
 	 */
-- 
GitLab


From 4c59e4676dc95f6f58a2cff5390b2699fa5b5549 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Dec 2008 19:38:33 +0100
Subject: [PATCH 0005/2198] perfcounters: select ANON_INODES

The perfcounters subsystem depends on CONFIG_ANON_INODES facilities,
so make sure it's selected.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/init/Kconfig b/init/Kconfig
index 78bede218f10f..7d147a36e968a 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -741,6 +741,7 @@ config PERF_COUNTERS
 	bool "Kernel Performance Counters"
 	depends on HAVE_PERF_COUNTERS
 	default y
+	select ANON_INODES
 	help
 	  Enable kernel support for performance counter hardware.
 
-- 
GitLab


From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 11:40:46 +0100
Subject: [PATCH 0006/2198] perfcounters, x86: simplify disable/enable of
 counters

Impact: fix spurious missed counter wakeups

In the case of NMI events, close a race window that can occur if an NMI
hits counter code that temporarily disables+enables a counter, and the NMI
leaks into the disabled section.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 40 +++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 615e953208ef9..7d528ffc2d261 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -136,14 +136,25 @@ void hw_perf_disable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 }
 
+static inline void
+__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+{
+	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
 static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
 {
 	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
 
 	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+}
+
+static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+{
+	wrmsr(hwc->config_base + idx,
+	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
 void hw_perf_counter_enable(struct perf_counter *counter)
@@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	wrmsr(hwc->config_base + idx,
-	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+	__hw_perf_counter_disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	__hw_perf_counter_set_period(hwc, idx);
 	__hw_perf_counter_enable(hwc, idx);
 }
 
@@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	__hw_perf_counter_disable(hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
@@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 	}
 }
 
+/*
+ * NMI-safe enable method:
+ */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
+	u64 pmc_ctrl;
+	int err;
 
-	wrmsr(hwc->config_base + idx,
-	      hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+	err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
+	WARN_ON_ONCE(err);
 
-	if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) {
-		__hw_perf_save_counter(counter, hwc, idx);
+	__hw_perf_save_counter(counter, hwc, idx);
+	__hw_perf_counter_set_period(hwc, idx);
+
+	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
 		__hw_perf_counter_enable(hwc, idx);
-	}
 }
 
 static void
-- 
GitLab


From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 12:18:18 +0100
Subject: [PATCH 0007/2198] perfcounters, x86: clean up debug code

Impact: cleanup

Get rid of unused debug code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++--------------------
 1 file changed, 12 insertions(+), 23 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d528ffc2d261..919ec46679b21 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
 {
 	s64 raw = -1;
 	s64 delta;
-	int err;
 
 	/*
 	 * Get the raw hw counter value:
 	 */
-	err = rdmsrl_safe(hwc->counter_base + idx, &raw);
-	WARN_ON_ONCE(err);
+	rdmsrl(hwc->counter_base + idx, raw);
 
 	/*
 	 * Rebase it to zero (it started counting at -irq_period),
@@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter,
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
-	int cpu, err, idx;
+	int cpu, idx;
+
+	if (!nr_hw_counters)
+		return;
 
 	local_irq_disable();
 
 	cpu = smp_processor_id();
 
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl);
-	WARN_ON_ONCE(err);
-
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status);
-	WARN_ON_ONCE(err);
-
-	err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow);
-	WARN_ON_ONCE(err);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
@@ -273,11 +269,8 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 
 	for (idx = 0; idx < nr_hw_counters; idx++) {
-		err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
-		WARN_ON_ONCE(err);
-
-		err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count);
-		WARN_ON_ONCE(err);
+		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
+		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
 		next_count = per_cpu(prev_next_count[idx], cpu);
 
@@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter)
 	unsigned long addr = hwc->counter_base + hwc->idx;
 	s64 offs, val = -1LL;
 	s32 val32;
-	int err;
 
 	/* Careful: NMI might modify the counter offset */
 	do {
 		offs = hwc->prev_count;
-		err = rdmsrl_safe(addr, &val);
-		WARN_ON_ONCE(err);
+		rdmsrl(addr, val);
 	} while (offs != hwc->prev_count);
 
 	val32 = (s32) val;
@@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
 	u64 pmc_ctrl;
-	int err;
 
-	err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl);
-	WARN_ON_ONCE(err);
+	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
 	__hw_perf_save_counter(counter, hwc, idx);
 	__hw_perf_counter_set_period(hwc, idx);
-- 
GitLab


From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 9 Dec 2008 12:23:59 +0100
Subject: [PATCH 0008/2198] perfcounters: consolidate global-disable codepaths

Impact: cleanup

Simplify global disable handling.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 23 ++++++-----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 919ec46679b21..6a93d1f04d97e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[MAX_HW_COUNTERS];
 	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
-	int			enable_all;
 };
 
 /*
@@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	return 0;
 }
 
-static void __hw_perf_enable_all(void)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
-}
-
 void hw_perf_enable_all(void)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	cpuc->enable_all = 1;
-	__hw_perf_enable_all();
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
 void hw_perf_disable_all(void)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	cpuc->enable_all = 0;
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 }
 
@@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
+	u64 ack, status, saved_global;
 	struct cpu_hw_counters *cpuc;
-	u64 ack, status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 
 	/* Disable counters globally */
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
@@ -445,10 +435,9 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		goto again;
 out:
 	/*
-	 * Do not reenable when global enable is off:
+	 * Restore - do not reenable when global enable is off:
 	 */
-	if (cpuc->enable_all)
-		__hw_perf_enable_all();
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
GitLab


From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 9 Dec 2008 21:43:39 +0100
Subject: [PATCH 0009/2198] perf counters: protect them against CSTATE
 transitions

Impact: fix rare lost events problem

There are CPUs whose performance counters misbehave on CSTATE transitions,
so provide a way to just disable/enable them around deep idle methods.

(hw_perf_enable_all() is cheap on x86.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++++++++-
 drivers/acpi/processor_idle.c      |  8 ++++++++
 include/linux/perf_counter.h       |  4 ++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6a93d1f04d97e..0a7f3bea2dc6e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -12,6 +12,7 @@
 #include <linux/notifier.h>
 #include <linux/hardirq.h>
 #include <linux/kprobes.h>
+#include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
@@ -119,10 +120,21 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_disable_all(void)
+void hw_perf_restore_ctrl(u64 ctrl)
 {
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+
+u64 hw_perf_disable_all(void)
+{
+	u64 ctrl;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	return ctrl;
 }
+EXPORT_SYMBOL_GPL(hw_perf_disable_all);
 
 static inline void
 __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 5f8d746a9b819..cca804e6f1dd4 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -270,8 +270,11 @@ static atomic_t c3_cpu_count;
 /* Common C-state entry for C2, C3, .. */
 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 {
+	u64 pctrl;
+
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
+	pctrl = hw_perf_disable_all();
 	if (cstate->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cstate);
@@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
+	hw_perf_restore_ctrl(pctrl);
 	start_critical_timings();
 }
 #endif /* !CONFIG_CPU_IDLE */
@@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
  */
 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
+	u64 pctrl;
+
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
+	pctrl = hw_perf_disable_all();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
+	hw_perf_restore_ctrl(pctrl);
 	start_critical_timings();
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 22c4469abf44c..5031b5614f25e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *task);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
+extern void hw_perf_restore_ctrl(u64 ctrl);
+extern u64 hw_perf_disable_all(void);
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *task)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
+static inline void hw_perf_restore_ctrl(u64 ctrl)			{ }
+static inline u64 hw_perf_disable_all(void)		{ return 0; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
-- 
GitLab


From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Dec 2008 19:26:59 +0100
Subject: [PATCH 0010/2198] perf counters: clean up 'raw' type API

Impact: cleanup

Introduce a separate hw_event type.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  7 +++++++
 include/linux/syscalls.h     |  8 +++-----
 kernel/perf_counter.c        | 15 ++++++++-------
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5031b5614f25e..daedd7d87c2a9 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -38,6 +38,7 @@ enum hw_event_types {
 	 * If this bit is set in the type, then trigger NMI sampling:
 	 */
 	PERF_COUNT_NMI			= (1 << 30),
+	PERF_COUNT_RAW			= (1 << 31),
 };
 
 /*
@@ -49,6 +50,12 @@ enum perf_record_type {
 	PERF_RECORD_GROUP,
 };
 
+struct perf_counter_event {
+	u32			hw_event_type;
+	u32			hw_event_period;
+	u64			hw_raw_ctrl;
+};
+
 /**
  * struct hw_perf_counter - performance counter hardware details
  */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 6cce728a6263e..3ecd73d03daa9 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,6 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
+struct perf_counter_event;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 asmlinkage int
-sys_perf_counter_open(u32 hw_event_type,
-		      u32 hw_event_period,
-		      u32 record_type,
-		      pid_t pid,
-		      int cpu);
+sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
+		      pid_t pid, int cpu, int masterfd);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 20508f053658f..96c333a5b0fcc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
  * @pid:		target pid
  */
 asmlinkage int
-sys_perf_counter_open(u32 hw_event_type,
-		      u32 hw_event_period,
-		      u32 record_type,
-		      pid_t pid,
-		      int cpu)
+sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
+		      pid_t pid, int cpu, int masterfd)
 {
 	struct perf_counter_context *ctx;
+	struct perf_counter_event event;
 	struct perf_counter *counter;
 	int ret;
 
+	if (copy_from_user(&event, uevent, sizeof(event)) != 0)
+		return -EFAULT;
+
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(hw_event_period, cpu, record_type);
+	counter = perf_counter_alloc(event.hw_event_period, cpu, record_type);
 	if (!counter)
 		goto err_put_context;
 
-	ret = hw_perf_counter_init(counter, hw_event_type);
+	ret = hw_perf_counter_init(counter, event.hw_event_type);
 	if (ret)
 		goto err_free_put_context;
 
-- 
GitLab


From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Dec 2008 19:35:37 +0100
Subject: [PATCH 0011/2198] perf counters: expand use of counter->event

Impact: change syscall, cleanup

Make use of the new perf_counters event type.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++-----------
 include/linux/perf_counter.h       |  4 +---
 kernel/perf_counter.c              | 10 +++++-----
 3 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0a7f3bea2dc6e..30e7ebf782759 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
-int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
+int hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	u32 hw_event_type = counter->event.hw_event_type;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
@@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
 	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
 
-	hwc->irq_period = counter->__irq_period;
+	hwc->irq_period = counter->event.hw_event_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type)
 	hwc->next_count = -((s32) hwc->irq_period);
 
 	/*
-	 * Negative event types mean raw encoded event+umask values:
+	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event_type < 0) {
-		counter->hw_event_type = -hw_event_type;
-		counter->hw_event_type &= ~PERF_COUNT_NMI;
+	hw_event_type &= ~PERF_COUNT_NMI;
+	if (hw_event_type == PERF_COUNT_RAW) {
+		hwc->config |= counter->event.hw_raw_ctrl;
 	} else {
-		hw_event_type &= ~PERF_COUNT_NMI;
 		if (hw_event_type >= max_intel_perfmon_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		counter->hw_event_type = intel_perfmon_event_map[hw_event_type];
+		hwc->config |= intel_perfmon_event_map[hw_event_type];
 	}
-	hwc->config |= counter->hw_event_type;
 	counter->wakeup_pending = 0;
 
 	return 0;
@@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->hw_event_type);
+		perf_store_irq_data(leader, counter->event.hw_event_type);
 		perf_store_irq_data(leader, atomic64_counter_read(counter));
 	}
 }
@@ -418,7 +417,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 			perf_store_irq_data(counter, instruction_pointer(regs));
 			break;
 		case PERF_RECORD_GROUP:
-			perf_store_irq_data(counter, counter->hw_event_type);
+			perf_store_irq_data(counter,
+					    counter->event.hw_event_type);
 			perf_store_irq_data(counter,
 					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index daedd7d87c2a9..1f0017673e77c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -96,8 +96,7 @@ struct perf_counter {
 #else
 	atomic_t			count32[2];
 #endif
-	u64				__irq_period;
-
+	struct perf_counter_event	event;
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
@@ -111,7 +110,6 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
-	s32				hw_event_type;
 	enum perf_record_type		record_type;
 
 	/* read() / irq related data */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 96c333a5b0fcc..2557c670a3bb8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex);
  * Architecture provided APIs - weak aliases:
  */
 
-int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type)
+int __weak hw_perf_counter_init(struct perf_counter *counter)
 {
 	return -EINVAL;
 }
@@ -707,7 +707,7 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
+perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
 {
 	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
@@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type)
 	counter->usrdata	= &counter->data[1];
 	counter->cpu		= cpu;
 	counter->record_type	= record_type;
-	counter->__irq_period	= hw_event_period;
+	counter->event		= *event;
 	counter->wakeup_pending = 0;
 
 	return counter;
@@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(event.hw_event_period, cpu, record_type);
+	counter = perf_counter_alloc(&event, cpu, record_type);
 	if (!counter)
 		goto err_put_context;
 
-	ret = hw_perf_counter_init(counter, event.hw_event_type);
+	ret = hw_perf_counter_init(counter);
 	if (ret)
 		goto err_free_put_context;
 
-- 
GitLab


From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 10 Dec 2008 12:33:23 +0100
Subject: [PATCH 0012/2198] perf counters: restructure the API

Impact: clean up new API

Thorough cleanup of the new perf counters API, we now get clean separation
of the various concepts:

 - introduce perf_counter_hw_event to separate out the event source details

 - move special type flags into separate attributes: PERF_COUNT_NMI,
   PERF_COUNT_RAW

 - extend the type to u64 and reserve it fully to the architecture in the
   raw type case.

And make use of all these changes in the core and x86 perfcounters code.

Also change the syscall signature to:

  asmlinkage int sys_perf_counter_open(

	struct perf_counter_hw_event	*hw_event_uptr		__user,
	pid_t				pid,
	int				cpu,
	int				group_fd);

( Note that group_fd is unused for now - it's reserved for the counter
  groups abstraction. )

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 29 +++++----
 include/linux/perf_counter.h       | 98 +++++++++++++++++++-----------
 include/linux/syscalls.h           | 12 ++--
 kernel/perf_counter.c              | 38 +++++++-----
 4 files changed, 106 insertions(+), 71 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 30e7ebf782759..ef1936a871aa0 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
  */
 int hw_perf_counter_init(struct perf_counter *counter)
 {
+	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
-	u32 hw_event_type = counter->event.hw_event_type;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
@@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter)
 	hwc->nmi = 0;
 	if (capable(CAP_SYS_ADMIN)) {
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-		if (hw_event_type & PERF_COUNT_NMI)
+		if (hw_event->nmi)
 			hwc->nmi = 1;
 	}
 
-	hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0;
-	hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
+	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;
 
-	hwc->irq_period = counter->event.hw_event_period;
+	hwc->irq_period		= hw_event->irq_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter)
 	if (!hwc->irq_period)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count = -((s32) hwc->irq_period);
+	hwc->next_count	= -(s32)hwc->irq_period;
 
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	hw_event_type &= ~PERF_COUNT_NMI;
-	if (hw_event_type == PERF_COUNT_RAW) {
-		hwc->config |= counter->event.hw_raw_ctrl;
+	if (hw_event->raw) {
+		hwc->config |= hw_event->type;
 	} else {
-		if (hw_event_type >= max_intel_perfmon_events)
+		if (hw_event->type >= max_intel_perfmon_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= intel_perfmon_event_map[hw_event_type];
+		hwc->config |= intel_perfmon_event_map[hw_event->type];
 	}
 	counter->wakeup_pending = 0;
 
@@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 	int bit;
 
 	list_for_each_entry(counter, &ctx->counters, list) {
-		if (counter->record_type != PERF_RECORD_SIMPLE ||
+		if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
 		    counter == leader)
 			continue;
 
@@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->event.hw_event_type);
+		perf_store_irq_data(leader, counter->hw_event.type);
 		perf_store_irq_data(leader, atomic64_counter_read(counter));
 	}
 }
@@ -410,7 +409,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 		perf_save_and_restart(counter);
 
-		switch (counter->record_type) {
+		switch (counter->hw_event.record_type) {
 		case PERF_RECORD_SIMPLE:
 			continue;
 		case PERF_RECORD_IRQ:
@@ -418,7 +417,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 			break;
 		case PERF_RECORD_GROUP:
 			perf_store_irq_data(counter,
-					    counter->event.hw_event_type);
+					    counter->hw_event.type);
 			perf_store_irq_data(counter,
 					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1f0017673e77c..a2b4852e2d70e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -24,65 +24,93 @@
 struct task_struct;
 
 /*
- * Generalized hardware event types, used by the hw_event_type parameter
- * of the sys_perf_counter_open() syscall:
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
  */
 enum hw_event_types {
-	PERF_COUNT_CYCLES,
-	PERF_COUNT_INSTRUCTIONS,
-	PERF_COUNT_CACHE_REFERENCES,
-	PERF_COUNT_CACHE_MISSES,
-	PERF_COUNT_BRANCH_INSTRUCTIONS,
-	PERF_COUNT_BRANCH_MISSES,
 	/*
-	 * If this bit is set in the type, then trigger NMI sampling:
+	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_NMI			= (1 << 30),
-	PERF_COUNT_RAW			= (1 << 31),
+	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
 };
 
 /*
  * IRQ-notification data record type:
  */
-enum perf_record_type {
-	PERF_RECORD_SIMPLE,
-	PERF_RECORD_IRQ,
-	PERF_RECORD_GROUP,
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
 };
 
-struct perf_counter_event {
-	u32			hw_event_type;
-	u32			hw_event_period;
-	u64			hw_raw_ctrl;
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	u64			type;
+
+	u64			irq_period;
+	u32			record_type;
+
+	u32			disabled     :  1, /* off by default */
+				nmi	     :  1, /* NMI sampling   */
+				raw	     :  1, /* raw event type */
+				__reserved_1 : 29;
+
+	u64			__reserved_2;
 };
 
+/*
+ * Kernel-internal data types:
+ */
+
 /**
- * struct hw_perf_counter - performance counter hardware details
+ * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
-	u64			config;
-	unsigned long		config_base;
-	unsigned long		counter_base;
-	int			nmi;
-	unsigned int		idx;
-	u64			prev_count;
-	s32			next_count;
-	u64			irq_period;
+	u64				config;
+	unsigned long			config_base;
+	unsigned long			counter_base;
+	int				nmi;
+	unsigned int			idx;
+	u64				prev_count;
+	u64				irq_period;
+	s32				next_count;
 };
 
 /*
  * Hardcoded buffer length limit for now, for IRQ-fed events:
  */
-#define PERF_DATA_BUFLEN	2048
+#define PERF_DATA_BUFLEN		2048
 
 /**
  * struct perf_data - performance counter IRQ data sampling ...
  */
 struct perf_data {
-	int			len;
-	int			rd_idx;
-	int			overrun;
-	u8			data[PERF_DATA_BUFLEN];
+	int				len;
+	int				rd_idx;
+	int				overrun;
+	u8				data[PERF_DATA_BUFLEN];
 };
 
 /**
@@ -96,7 +124,7 @@ struct perf_counter {
 #else
 	atomic_t			count32[2];
 #endif
-	struct perf_counter_event	event;
+	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
@@ -110,8 +138,6 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
-	enum perf_record_type		record_type;
-
 	/* read() / irq related data */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 3ecd73d03daa9..a549678b7c3c5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -54,7 +54,7 @@ struct compat_stat;
 struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
-struct perf_counter_event;
+struct perf_counter_hw_event;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
 
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
-		      pid_t pid, int cpu, int masterfd);
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2557c670a3bb8..0d323ceda3a40 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct perf_counter *counter = file->private_data;
 
-	switch (counter->record_type) {
+	switch (counter->hw_event.record_type) {
 	case PERF_RECORD_SIMPLE:
 		return perf_read_hw(counter, buf, count);
 
@@ -707,7 +707,7 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
 {
 	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
@@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type)
 	INIT_LIST_HEAD(&counter->list);
 	init_waitqueue_head(&counter->waitq);
 
-	counter->irqdata	= &counter->data[0];
-	counter->usrdata	= &counter->data[1];
-	counter->cpu		= cpu;
-	counter->record_type	= record_type;
-	counter->event		= *event;
-	counter->wakeup_pending = 0;
+	counter->irqdata		= &counter->data[0];
+	counter->usrdata		= &counter->data[1];
+	counter->cpu			= cpu;
+	counter->hw_event		= *hw_event;
+	counter->wakeup_pending		= 0;
 
 	return counter;
 }
 
 /**
- * sys_perf_task_open - open a performance counter associate it to a task
- * @hw_event_type:	event type for monitoring/sampling...
+ * sys_perf_task_open - open a performance counter, associate it to a task/cpu
+ *
+ * @hw_event_uptr:	event type attributes for monitoring/sampling
  * @pid:		target pid
+ * @cpu:		target cpu
+ * @group_fd:		group leader counter fd
  */
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
-		      pid_t pid, int cpu, int masterfd)
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd)
+
 {
 	struct perf_counter_context *ctx;
-	struct perf_counter_event event;
+	struct perf_counter_hw_event hw_event;
 	struct perf_counter *counter;
 	int ret;
 
-	if (copy_from_user(&event, uevent, sizeof(event)) != 0)
+	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
 	ctx = find_get_context(pid, cpu);
@@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type,
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(&event, cpu, record_type);
+	counter = perf_counter_alloc(&hw_event, cpu);
 	if (!counter)
 		goto err_put_context;
 
-- 
GitLab


From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 08:38:42 +0100
Subject: [PATCH 0013/2198] perf counters: add support for group counters

Impact: add group counters

This patch adds the "counter groups" abstraction.

Groups of counters behave much like normal 'single' counters, with a
few semantic and behavioral extensions on top of that.

A counter group is created by creating a new counter with the open()
syscall's group-leader group_fd file descriptor parameter pointing
to another, already existing counter.

Groups of counters are scheduled in and out in one atomic group, and
they are also roundrobin-scheduled atomically.

Counters that are member of a group can also record events with an
(atomic) extended timestamp that extends to all members of the group,
if the record type is set to PERF_RECORD_GROUP.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  28 +--
 include/linux/perf_counter.h       |   8 +-
 kernel/perf_counter.c              | 282 ++++++++++++++++++++++-------
 3 files changed, 236 insertions(+), 82 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ef1936a871aa0..54b4ad0cce683 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter)
 }
 
 static void
-perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
+perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
-	struct perf_counter_context *ctx = leader->ctx;
-	struct perf_counter *counter;
+	struct perf_counter *counter, *group_leader = sibling->group_leader;
 	int bit;
 
-	list_for_each_entry(counter, &ctx->counters, list) {
-		if (counter->hw_event.record_type != PERF_RECORD_SIMPLE ||
-		    counter == leader)
-			continue;
+	/*
+	 * Store the counter's own timestamp first:
+	 */
+	perf_store_irq_data(sibling, sibling->hw_event.type);
+	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
-		if (counter->active) {
+	/*
+	 * Then store sibling timestamps (if any):
+	 */
+	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+		if (!counter->active) {
 			/*
 			 * When counter was not in the overflow mask, we have to
 			 * read it from hardware. We read it as well, when it
@@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown)
 				perf_save_and_restart(counter);
 			}
 		}
-		perf_store_irq_data(leader, counter->hw_event.type);
-		perf_store_irq_data(leader, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, counter->hw_event.type);
+		perf_store_irq_data(sibling, atomic64_counter_read(counter));
 	}
 }
 
@@ -416,10 +420,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 			perf_store_irq_data(counter, instruction_pointer(regs));
 			break;
 		case PERF_RECORD_GROUP:
-			perf_store_irq_data(counter,
-					    counter->hw_event.type);
-			perf_store_irq_data(counter,
-					    atomic64_counter_read(counter));
 			perf_handle_group(counter, &status, &ack);
 			break;
 		}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a2b4852e2d70e..7af7d8965460a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -117,7 +117,10 @@ struct perf_data {
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
-	struct list_head		list;
+	struct list_head		list_entry;
+	struct list_head		sibling_list;
+	struct perf_counter		*group_leader;
+
 	int				active;
 #if BITS_PER_LONG == 64
 	atomic64_t			count;
@@ -158,7 +161,8 @@ struct perf_counter_context {
 	 * Protect the list of counters:
 	 */
 	spinlock_t		lock;
-	struct list_head	counters;
+
+	struct list_head	counter_list;
 	int			nr_counters;
 	int			nr_active;
 	struct task_struct	*task;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0d323ceda3a40..fa59fe8c02d5b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sysfs.h>
 #include <linux/ptrace.h>
@@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { }
  * Read the cached counter in counter safe against cross CPU / NMI
  * modifications. 64 bit version - no complications.
  */
-static inline u64 perf_read_counter_safe(struct perf_counter *counter)
+static inline u64 perf_counter_read_safe(struct perf_counter *counter)
 {
 	return (u64) atomic64_read(&counter->count);
 }
@@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter)
  * Read the cached counter in counter safe against cross CPU / NMI
  * modifications. 32 bit version.
  */
-static u64 perf_read_counter_safe(struct perf_counter *counter)
+static u64 perf_counter_read_safe(struct perf_counter *counter)
 {
 	u32 cntl, cnth;
 
@@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter)
 
 #endif
 
+static void
+list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+	struct perf_counter *group_leader = counter->group_leader;
+
+	/*
+	 * Depending on whether it is a standalone or sibling counter,
+	 * add it straight to the context's counter list, or to the group
+	 * leader's sibling list:
+	 */
+	if (counter->group_leader == counter)
+		list_add_tail(&counter->list_entry, &ctx->counter_list);
+	else
+		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+}
+
+static void
+list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
+{
+	struct perf_counter *sibling, *tmp;
+
+	list_del_init(&counter->list_entry);
+
+	if (list_empty(&counter->sibling_list))
+		return;
+
+	/*
+	 * If this was a group counter with sibling counters then
+	 * upgrade the siblings to singleton counters by adding them
+	 * to the context list directly:
+	 */
+	list_for_each_entry_safe(sibling, tmp,
+				 &counter->sibling_list, list_entry) {
+
+		list_del_init(&sibling->list_entry);
+		list_add_tail(&sibling->list_entry, &ctx->counter_list);
+		WARN_ON_ONCE(!sibling->group_leader);
+		WARN_ON_ONCE(sibling->group_leader == sibling);
+		sibling->group_leader = sibling;
+	}
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
  * We disable the counter on the hardware level first. After that we
  * remove it from the context list.
  */
-static void __perf_remove_from_context(void *info)
+static void __perf_counter_remove_from_context(void *info)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
@@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info)
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
 	hw_perf_disable_all();
-	list_del_init(&counter->list);
+	list_del_counter(counter, ctx);
 	hw_perf_enable_all();
 
 	if (!ctx->task) {
@@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info)
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
  */
-static void perf_remove_from_context(struct perf_counter *counter)
+static void perf_counter_remove_from_context(struct perf_counter *counter)
 {
 	struct perf_counter_context *ctx = counter->ctx;
 	struct task_struct *task = ctx->task;
@@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter)
 		 * the removal is always sucessful.
 		 */
 		smp_call_function_single(counter->cpu,
-					 __perf_remove_from_context,
+					 __perf_counter_remove_from_context,
 					 counter, 1);
 		return;
 	}
 
 retry:
-	task_oncpu_function_call(task, __perf_remove_from_context,
+	task_oncpu_function_call(task, __perf_counter_remove_from_context,
 				 counter);
 
 	spin_lock_irq(&ctx->lock);
 	/*
 	 * If the context is active we need to retry the smp call.
 	 */
-	if (ctx->nr_active && !list_empty(&counter->list)) {
+	if (ctx->nr_active && !list_empty(&counter->list_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
 
 	/*
 	 * The lock prevents that this context is scheduled in so we
-	 * can remove the counter safely, if it the call above did not
+	 * can remove the counter safely, if the call above did not
 	 * succeed.
 	 */
-	if (!list_empty(&counter->list)) {
+	if (!list_empty(&counter->list_entry)) {
 		ctx->nr_counters--;
-		list_del_init(&counter->list);
+		list_del_counter(counter, ctx);
 		counter->task = NULL;
 	}
 	spin_unlock_irq(&ctx->lock);
@@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info)
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
 	hw_perf_disable_all();
-	list_add_tail(&counter->list, &ctx->counters);
+	list_add_counter(counter, ctx);
 	hw_perf_enable_all();
 
 	ctx->nr_counters++;
@@ -268,7 +311,7 @@ perf_install_in_context(struct perf_counter_context *ctx,
 	 * If the context is active and the counter has not been added
 	 * we need to retry the smp call.
 	 */
-	if (ctx->nr_active && list_empty(&counter->list)) {
+	if (ctx->nr_active && list_empty(&counter->list_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
@@ -278,13 +321,45 @@ perf_install_in_context(struct perf_counter_context *ctx,
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list)) {
-		list_add_tail(&counter->list, &ctx->counters);
+	if (list_empty(&counter->list_entry)) {
+		list_add_counter(counter, ctx);
 		ctx->nr_counters++;
 	}
 	spin_unlock_irq(&ctx->lock);
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (!counter->active)
+		return;
+
+	hw_perf_counter_disable(counter);
+	counter->active	=  0;
+	counter->oncpu	= -1;
+
+	cpuctx->active_oncpu--;
+	ctx->nr_active--;
+}
+
+static void
+group_sched_out(struct perf_counter *group_counter,
+		struct perf_cpu_context *cpuctx,
+		struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	/*
+	 * Schedule out siblings (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_out(counter, cpuctx, ctx);
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 		return;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(counter, &ctx->counters, list) {
-		if (!ctx->nr_active)
-			break;
-		if (counter->active) {
-			hw_perf_counter_disable(counter);
-			counter->active = 0;
-			counter->oncpu = -1;
-			ctx->nr_active--;
-			cpuctx->active_oncpu--;
-		}
+	if (ctx->nr_active) {
+		list_for_each_entry(counter, &ctx->counter_list, list_entry)
+			group_sched_out(counter, cpuctx, ctx);
 	}
 	spin_unlock(&ctx->lock);
 	cpuctx->task_ctx = NULL;
 }
 
+static void
+counter_sched_in(struct perf_counter *counter,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_counter_context *ctx,
+		 int cpu)
+{
+	if (!counter->active)
+		return;
+
+	hw_perf_counter_enable(counter);
+	counter->active = 1;
+	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+
+	cpuctx->active_oncpu++;
+	ctx->nr_active++;
+}
+
+static void
+group_sched_in(struct perf_counter *group_counter,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx,
+	       int cpu)
+{
+	struct perf_counter *counter;
+
+	counter_sched_in(group_counter, cpuctx, ctx, cpu);
+
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_in(counter, cpuctx, ctx, cpu);
+}
+
 /*
  * Called from scheduler to add the counters of the current task
  * with interrupts disabled.
@@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 		return;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(counter, &ctx->counters, list) {
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (ctx->nr_active == cpuctx->max_pertask)
 			break;
+
+		/*
+		 * Listen to the 'cpu' scheduling filter constraint
+		 * of counters:
+		 */
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		hw_perf_counter_enable(counter);
-		counter->active = 1;
-		counter->oncpu = cpu;
-		ctx->nr_active++;
-		cpuctx->active_oncpu++;
+		group_sched_in(counter, cpuctx, ctx, cpu);
 	}
 	spin_unlock(&ctx->lock);
+
 	cpuctx->task_ctx = ctx;
 }
 
@@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	spin_lock(&ctx->lock);
 
 	/*
-	 * Rotate the first entry last:
+	 * Rotate the first entry last (works just fine for group counters too):
 	 */
 	hw_perf_disable_all();
-	list_for_each_entry(counter, &ctx->counters, list) {
-		list_del(&counter->list);
-		list_add_tail(&counter->list, &ctx->counters);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		list_del(&counter->list_entry);
+		list_add_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
 	hw_perf_enable_all();
@@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	perf_counter_task_sched_in(curr, cpu);
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	ctx->nr_counters	= 0;
+	ctx->task		= task;
+}
 /*
  * Initialize the perf_counter context in task_struct
  */
 void perf_counter_init_task(struct task_struct *task)
 {
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
-
-	spin_lock_init(&ctx->lock);
-	INIT_LIST_HEAD(&ctx->counters);
-	ctx->nr_counters = 0;
-	ctx->task = task;
+	__perf_counter_init_context(&task->perf_counter_ctx, task);
 }
 
 /*
@@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info)
 	hw_perf_counter_read(info);
 }
 
-static u64 perf_read_counter(struct perf_counter *counter)
+static u64 perf_counter_read(struct perf_counter *counter)
 {
 	/*
 	 * If counter is enabled and currently active on a CPU, update the
@@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter)
 					 __hw_perf_counter_read, counter, 1);
 	}
 
-	return perf_read_counter_safe(counter);
+	return perf_counter_read_safe(counter);
 }
 
 /*
@@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	mutex_lock(&counter->mutex);
 
-	perf_remove_from_context(counter);
+	perf_counter_remove_from_context(counter);
 	put_context(ctx);
 
 	mutex_unlock(&counter->mutex);
@@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		return -EINVAL;
 
 	mutex_lock(&counter->mutex);
-	cntval = perf_read_counter(counter);
+	cntval = perf_counter_read(counter);
 	mutex_unlock(&counter->mutex);
 
 	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
@@ -707,15 +818,25 @@ static const struct file_operations perf_fops = {
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
+perf_counter_alloc(struct perf_counter_hw_event *hw_event,
+		   int cpu,
+		   struct perf_counter *group_leader)
 {
 	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 
 	if (!counter)
 		return NULL;
 
+	/*
+	 * Single counters are their own group leaders, with an
+	 * empty sibling list:
+	 */
+	if (!group_leader)
+		group_leader = counter;
+
 	mutex_init(&counter->mutex);
-	INIT_LIST_HEAD(&counter->list);
+	INIT_LIST_HEAD(&counter->list_entry);
+	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
 	counter->irqdata		= &counter->data[0];
@@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu)
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->wakeup_pending		= 0;
+	counter->group_leader		= group_leader;
 
 	return counter;
 }
@@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open(
 	int				group_fd)
 
 {
-	struct perf_counter_context *ctx;
+	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
-	struct perf_counter *counter;
+	struct perf_counter_context *ctx;
+	struct file *group_file = NULL;
+	int fput_needed = 0;
 	int ret;
 
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
+	/*
+	 * Look up the group leader:
+	 */
+	group_leader = NULL;
+	if (group_fd != -1) {
+		ret = -EINVAL;
+		group_file = fget_light(group_fd, &fput_needed);
+		if (!group_file)
+			goto out_fput;
+		if (group_file->f_op != &perf_fops)
+			goto out_fput;
+
+		group_leader = group_file->private_data;
+		/*
+		 * Do not allow a recursive hierarchy:
+		 */
+		if (group_leader->group_leader)
+			goto out_fput;
+	}
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
 	ctx = find_get_context(pid, cpu);
 	if (IS_ERR(ctx))
 		return PTR_ERR(ctx);
 
 	ret = -ENOMEM;
-	counter = perf_counter_alloc(&hw_event, cpu);
+	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
 	if (!counter)
 		goto err_put_context;
 
@@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open(
 	if (ret < 0)
 		goto err_remove_free_put_context;
 
+out_fput:
+	fput_light(group_file, fput_needed);
+
 	return ret;
 
 err_remove_free_put_context:
 	mutex_lock(&counter->mutex);
-	perf_remove_from_context(counter);
+	perf_counter_remove_from_context(counter);
 	mutex_unlock(&counter->mutex);
 
 err_free_put_context:
@@ -783,40 +933,40 @@ asmlinkage int sys_perf_counter_open(
 err_put_context:
 	put_context(ctx);
 
-	return ret;
+	goto out_fput;
 }
 
-static void __cpuinit perf_init_cpu(int cpu)
+static void __cpuinit perf_counter_init_cpu(int cpu)
 {
-	struct perf_cpu_context *ctx;
+	struct perf_cpu_context *cpuctx;
 
-	ctx = &per_cpu(perf_cpu_context, cpu);
-	spin_lock_init(&ctx->ctx.lock);
-	INIT_LIST_HEAD(&ctx->ctx.counters);
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	__perf_counter_init_context(&cpuctx->ctx, NULL);
 
 	mutex_lock(&perf_resource_mutex);
-	ctx->max_pertask = perf_max_counters - perf_reserved_percpu;
+	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
 	mutex_unlock(&perf_resource_mutex);
+
 	hw_perf_counter_setup();
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void __perf_exit_cpu(void *info)
+static void __perf_counter_exit_cpu(void *info)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = &cpuctx->ctx;
 	struct perf_counter *counter, *tmp;
 
-	list_for_each_entry_safe(counter, tmp, &ctx->counters, list)
-		__perf_remove_from_context(counter);
+	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
+		__perf_counter_remove_from_context(counter);
 
 }
-static void perf_exit_cpu(int cpu)
+static void perf_counter_exit_cpu(int cpu)
 {
-	smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1);
+	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
 }
 #else
-static inline void perf_exit_cpu(int cpu) { }
+static inline void perf_counter_exit_cpu(int cpu) { }
 #endif
 
 static int __cpuinit
@@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
-		perf_init_cpu(cpu);
+		perf_counter_init_cpu(cpu);
 		break;
 
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
-		perf_exit_cpu(cpu);
+		perf_counter_exit_cpu(cpu);
 		break;
 
 	default:
-- 
GitLab


From ccff286d85098ba5438e22aa2ea807fc1e18cf2f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 11:26:29 +0100
Subject: [PATCH 0014/2198] perf counters: group counter, fixes

Impact: bugfix

Check that a group does not span outside the context of a CPU or a task.

Also, do not allow deep recursive hierarchies.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 39 ++++++++++++++++++++-------------------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fa59fe8c02d5b..278209c547a87 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -107,9 +107,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_del_init(&counter->list_entry);
 
-	if (list_empty(&counter->sibling_list))
-		return;
-
 	/*
 	 * If this was a group counter with sibling counters then
 	 * upgrade the siblings to singleton counters by adding them
@@ -395,9 +392,6 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (!counter->active)
-		return;
-
 	hw_perf_counter_enable(counter);
 	counter->active = 1;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
@@ -876,32 +870,39 @@ asmlinkage int sys_perf_counter_open(
 		return -EFAULT;
 
 	/*
-	 * Look up the group leader:
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * Look up the group leader (we will attach this counter to it):
 	 */
 	group_leader = NULL;
 	if (group_fd != -1) {
 		ret = -EINVAL;
 		group_file = fget_light(group_fd, &fput_needed);
 		if (!group_file)
-			goto out_fput;
+			goto err_put_context;
 		if (group_file->f_op != &perf_fops)
-			goto out_fput;
+			goto err_put_context;
 
 		group_leader = group_file->private_data;
 		/*
-		 * Do not allow a recursive hierarchy:
+		 * Do not allow a recursive hierarchy (this new sibling
+		 * becoming part of another group-sibling):
+		 */
+		if (group_leader->group_leader != group_leader)
+			goto err_put_context;
+		/*
+		 * Do not allow to attach to a group in a different
+		 * task or CPU context:
 		 */
-		if (group_leader->group_leader)
-			goto out_fput;
+		if (group_leader->ctx != ctx)
+			goto err_put_context;
 	}
 
-	/*
-	 * Get the target context (task or percpu):
-	 */
-	ctx = find_get_context(pid, cpu);
-	if (IS_ERR(ctx))
-		return PTR_ERR(ctx);
-
 	ret = -ENOMEM;
 	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
 	if (!counter)
-- 
GitLab


From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 12:46:46 +0100
Subject: [PATCH 0015/2198] perf counters: hw driver API

Impact: restructure code, introduce hw_ops driver abstraction

Introduce this abstraction to handle counter details:

 struct hw_perf_counter_ops {
	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
	void (*hw_perf_counter_read)	(struct perf_counter *counter);
 };

This will be useful to support assymetric hw details, and it will also
be useful to implement "software counters". (Counters that count kernel
managed sw events such as pagefaults, context-switches, wall-clock time
or task-local time.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 37 +++++++++++++++++-------
 include/linux/perf_counter.h       | 15 ++++++++++
 kernel/perf_counter.c              | 45 ++++++++++++++++--------------
 3 files changed, 66 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 54b4ad0cce683..718b635dece6b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
-int hw_perf_counter_init(struct perf_counter *counter)
+static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void)
 EXPORT_SYMBOL_GPL(hw_perf_disable_all);
 
 static inline void
-__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
 {
 	wrmsr(hwc->config_base + idx, hwc->config, 0);
 }
@@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
 	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
 }
 
-static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
-void hw_perf_counter_enable(struct perf_counter *counter)
+static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__hw_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
 	__hw_perf_counter_set_period(hwc, idx);
-	__hw_perf_counter_enable(hwc, idx);
+	__x86_perf_counter_enable(hwc, idx);
 }
 
 #ifdef CONFIG_X86_64
@@ -282,20 +282,20 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-void hw_perf_counter_disable(struct perf_counter *counter)
+static void x86_perf_counter_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__hw_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
 	__hw_perf_save_counter(counter, hwc, idx);
 }
 
-void hw_perf_counter_read(struct perf_counter *counter)
+static void x86_perf_counter_read(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned long addr = hwc->counter_base + hwc->idx;
@@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__hw_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(hwc, idx);
 }
 
 static void
@@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void)
 
 	perf_counters_initialized = true;
 }
+
+static struct hw_perf_counter_ops x86_perf_counter_ops = {
+	.hw_perf_counter_enable		= x86_perf_counter_enable,
+	.hw_perf_counter_disable	= x86_perf_counter_disable,
+	.hw_perf_counter_read		= x86_perf_counter_read,
+};
+
+struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+{
+	int err;
+
+	err = __hw_perf_counter_init(counter);
+	if (err)
+		return NULL;
+
+	return &x86_perf_counter_ops;
+}
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7af7d8965460a..27385641ecb60 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -113,6 +113,17 @@ struct perf_data {
 	u8				data[PERF_DATA_BUFLEN];
 };
 
+struct perf_counter;
+
+/**
+ * struct hw_perf_counter_ops - performance counter hw ops
+ */
+struct hw_perf_counter_ops {
+	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
+	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
+	void (*hw_perf_counter_read)	(struct perf_counter *counter);
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -120,6 +131,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
+	struct hw_perf_counter_ops	*hw_ops;
 
 	int				active;
 #if BITS_PER_LONG == 64
@@ -185,6 +197,9 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
+extern struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter);
+
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 278209c547a87..e6e41ca95463e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex);
 /*
  * Architecture provided APIs - weak aliases:
  */
-
-int __weak hw_perf_counter_init(struct perf_counter *counter)
+extern __weak struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
 {
-	return -EINVAL;
+	return ERR_PTR(-EINVAL);
 }
 
-void __weak hw_perf_counter_enable(struct perf_counter *counter)	 { }
-void __weak hw_perf_counter_disable(struct perf_counter *counter)	 { }
-void __weak hw_perf_counter_read(struct perf_counter *counter)		 { }
-void __weak hw_perf_disable_all(void) { }
-void __weak hw_perf_enable_all(void) { }
-void __weak hw_perf_counter_setup(void) { }
+void __weak hw_perf_disable_all(void)	 { }
+void __weak hw_perf_enable_all(void)	 { }
+void __weak hw_perf_counter_setup(void)	 { }
 
 #if BITS_PER_LONG == 64
 
@@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info)
 	spin_lock(&ctx->lock);
 
 	if (counter->active) {
-		hw_perf_counter_disable(counter);
+		counter->hw_ops->hw_perf_counter_disable(counter);
 		counter->active = 0;
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
@@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info)
 	ctx->nr_counters++;
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
-		hw_perf_counter_enable(counter);
+		counter->hw_ops->hw_perf_counter_enable(counter);
 		counter->active = 1;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
@@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter,
 	if (!counter->active)
 		return;
 
-	hw_perf_counter_disable(counter);
+	counter->hw_ops->hw_perf_counter_disable(counter);
 	counter->active	=  0;
 	counter->oncpu	= -1;
 
@@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	hw_perf_counter_enable(counter);
+	counter->hw_ops->hw_perf_counter_enable(counter);
 	counter->active = 1;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
@@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task)
  */
 static void __hw_perf_counter_read(void *info)
 {
-	hw_perf_counter_read(info);
+	struct perf_counter *counter = info;
+
+	counter->hw_ops->hw_perf_counter_read(counter);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
 		   struct perf_counter *group_leader)
 {
-	struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	struct hw_perf_counter_ops *hw_ops;
+	struct perf_counter *counter;
 
+	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
 	if (!counter)
 		return NULL;
 
@@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->hw_event		= *hw_event;
 	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
+	counter->hw_ops			= NULL;
+
+	hw_ops = hw_perf_counter_init(counter);
+	if (!hw_ops) {
+		kfree(counter);
+		return NULL;
+	}
+	counter->hw_ops = hw_ops;
 
 	return counter;
 }
@@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open(
 	if (!counter)
 		goto err_put_context;
 
-	ret = hw_perf_counter_init(counter);
-	if (ret)
-		goto err_free_put_context;
-
 	perf_install_in_context(ctx, counter, cpu);
 
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
@@ -927,8 +932,6 @@ asmlinkage int sys_perf_counter_open(
 	mutex_lock(&counter->mutex);
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&counter->mutex);
-
-err_free_put_context:
 	kfree(counter);
 
 err_put_context:
-- 
GitLab


From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 13:21:10 +0100
Subject: [PATCH 0016/2198] perf counters: implement PERF_COUNT_CPU_CLOCK

Impact: add new perf-counter type

The 'CPU clock' counter counts the amount of CPU clock time that is
elapsing, in nanoseconds. (regardless of how much of it the task is
spending on a CPU executing)

This counter type is a Linux kernel based abstraction, it is available
even if the hardware does not support native hardware performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 36 ++---------
 include/linux/perf_counter.h       |  9 ++-
 kernel/perf_counter.c              | 95 ++++++++++++++++++++++++++----
 3 files changed, 92 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 718b635dece6b..43c8e9a38b4e2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 	__x86_perf_counter_enable(hwc, idx);
 }
 
-#ifdef CONFIG_X86_64
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-	atomic64_set(&counter->count, val);
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic64_read(&counter->count);
-}
-#else
-/*
- * Todo: add proper atomic64_t support to 32-bit x86:
- */
-static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-	u32 *val32 = (void *)&val64;
-
-	atomic_set(counter->count32 + 0, *(val32 + 0));
-	atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-static inline u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic_read(counter->count32 + 0) |
-		(u64) atomic_read(counter->count32 + 1) << 32;
-}
-#endif
-
 static void __hw_perf_save_counter(struct perf_counter *counter,
 				   struct hw_perf_counter *hwc, int idx)
 {
@@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter)
 	} while (offs != hwc->prev_count);
 
 	val32 = (s32) val;
-	val =  (s64)hwc->irq_period + (s64)val32;
+	val = (s64)hwc->irq_period + (s64)val32;
 	atomic64_counter_set(counter, hwc->prev_count + val);
 }
 
@@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
-static struct hw_perf_counter_ops x86_perf_counter_ops = {
+static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
 	.hw_perf_counter_read		= x86_perf_counter_read,
 };
 
-struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter)
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
 {
 	int err;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 27385641ecb60..9a1713a1be277 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -131,7 +131,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
-	struct hw_perf_counter_ops	*hw_ops;
+	const struct hw_perf_counter_ops *hw_ops;
 
 	int				active;
 #if BITS_PER_LONG == 64
@@ -197,7 +197,7 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
-extern struct hw_perf_counter_ops *
+extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
@@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern void hw_perf_restore_ctrl(u64 ctrl);
 extern u64 hw_perf_disable_all(void);
+extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
+extern u64 atomic64_counter_read(struct perf_counter *counter);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore_ctrl(u64 ctrl)			{ }
-static inline u64 hw_perf_disable_all(void)		{ return 0; }
+static inline u64 hw_perf_disable_all(void)		      { return 0; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e6e41ca95463e..506286e5ba634 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex);
 /*
  * Architecture provided APIs - weak aliases:
  */
-extern __weak struct hw_perf_counter_ops *
+extern __weak const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
 	return ERR_PTR(-EINVAL);
 }
 
-void __weak hw_perf_disable_all(void)	 { }
-void __weak hw_perf_enable_all(void)	 { }
-void __weak hw_perf_counter_setup(void)	 { }
+u64 __weak hw_perf_disable_all(void)		{ return 0; }
+void __weak hw_perf_restore_ctrl(u64 ctrl)	{ }
+void __weak hw_perf_counter_setup(void)		{ }
 
 #if BITS_PER_LONG == 64
 
@@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter)
 	return (u64) atomic64_read(&counter->count);
 }
 
+void atomic64_counter_set(struct perf_counter *counter, u64 val)
+{
+	atomic64_set(&counter->count, val);
+}
+
+u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic64_read(&counter->count);
+}
+
 #else
 
 /*
@@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter)
 	return cntl | ((u64) cnth) << 32;
 }
 
+void atomic64_counter_set(struct perf_counter *counter, u64 val64)
+{
+	u32 *val32 = (void *)&val64;
+
+	atomic_set(counter->count32 + 0, *(val32 + 0));
+	atomic_set(counter->count32 + 1, *(val32 + 1));
+}
+
+u64 atomic64_counter_read(struct perf_counter *counter)
+{
+	return atomic_read(counter->count32 + 0) |
+		(u64) atomic_read(counter->count32 + 1) << 32;
+}
+
 #endif
 
 static void
@@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	u64 perf_flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	hw_perf_disable_all();
+	perf_flags = hw_perf_disable_all();
 	list_del_counter(counter, ctx);
-	hw_perf_enable_all();
+	hw_perf_restore_ctrl(perf_flags);
 
 	if (!ctx->task) {
 		/*
@@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	int cpu = smp_processor_id();
+	u64 perf_flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	hw_perf_disable_all();
+	perf_flags = hw_perf_disable_all();
 	list_add_counter(counter, ctx);
-	hw_perf_enable_all();
+	hw_perf_restore_ctrl(perf_flags);
 
 	ctx->nr_counters++;
 
@@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
+	u64 perf_flags;
 
 	if (likely(!ctx->nr_counters))
 		return;
@@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
-	hw_perf_disable_all();
+	perf_flags = hw_perf_disable_all();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		list_del(&counter->list_entry);
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
-	hw_perf_enable_all();
+	hw_perf_restore_ctrl(perf_flags);
 
 	spin_unlock(&ctx->lock);
 
@@ -807,6 +834,42 @@ static const struct file_operations perf_fops = {
 	.poll			= perf_poll,
 };
 
+static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+}
+
+static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
+{
+}
+
+static void cpu_clock_perf_counter_read(struct perf_counter *counter)
+{
+	int cpu = raw_smp_processor_id();
+
+	atomic64_counter_set(counter, cpu_clock(cpu));
+}
+
+static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
+	.hw_perf_counter_enable		= cpu_clock_perf_counter_enable,
+	.hw_perf_counter_disable	= cpu_clock_perf_counter_disable,
+	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
+};
+
+static const struct hw_perf_counter_ops *
+sw_perf_counter_init(struct perf_counter *counter)
+{
+	const struct hw_perf_counter_ops *hw_ops = NULL;
+
+	switch (counter->hw_event.type) {
+	case PERF_COUNT_CPU_CLOCK:
+		hw_ops = &perf_ops_cpu_clock;
+		break;
+	default:
+		break;
+	}
+	return hw_ops;
+}
+
 /*
  * Allocate and initialize a counter structure
  */
@@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
 		   struct perf_counter *group_leader)
 {
-	struct hw_perf_counter_ops *hw_ops;
+	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
 
 	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
@@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 
-	hw_ops = hw_perf_counter_init(counter);
+	hw_ops = NULL;
+	if (!hw_event->raw && hw_event->type < 0)
+		hw_ops = sw_perf_counter_init(counter);
+	if (!hw_ops) {
+		hw_ops = hw_perf_counter_init(counter);
+	}
+
 	if (!hw_ops) {
 		kfree(counter);
 		return NULL;
@@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open(
 			goto err_put_context;
 	}
 
-	ret = -ENOMEM;
+	ret = -EINVAL;
 	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
 	if (!counter)
 		goto err_put_context;
-- 
GitLab


From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 13:45:51 +0100
Subject: [PATCH 0017/2198] perf counters: consolidate hw_perf save/restore
 APIs

Impact: cleanup

Rename them to better match up the usual IRQ disable/enable APIs:

 hw_perf_disable_all()  => hw_perf_save_disable()
 hw_perf_restore_ctrl() => hw_perf_restore()

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  8 ++++----
 drivers/acpi/processor_idle.c      | 10 +++++-----
 include/linux/perf_counter.h       | 10 +++++-----
 kernel/perf_counter.c              | 16 ++++++++--------
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 43c8e9a38b4e2..3e1dbebe22b9c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -118,13 +118,13 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore_ctrl(u64 ctrl)
+void hw_perf_restore(u64 ctrl)
 {
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 }
-EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl);
+EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-u64 hw_perf_disable_all(void)
+u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
 
@@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
 	return ctrl;
 }
-EXPORT_SYMBOL_GPL(hw_perf_disable_all);
+EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 static inline void
 __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index cca804e6f1dd4..a3e66a33b7a2e 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -270,11 +270,11 @@ static atomic_t c3_cpu_count;
 /* Common C-state entry for C2, C3, .. */
 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 {
-	u64 pctrl;
+	u64 perf_flags;
 
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	pctrl = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	if (cstate->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cstate);
@@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	hw_perf_restore_ctrl(pctrl);
+	hw_perf_restore(perf_flags);
 	start_critical_timings();
 }
 #endif /* !CONFIG_CPU_IDLE */
@@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	pctrl = hw_perf_disable_all();
+	pctrl = hw_perf_save_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	hw_perf_restore_ctrl(pctrl);
+	hw_perf_restore(pctrl);
 	start_critical_timings();
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9a1713a1be277..68f6e3ad531fb 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -67,7 +67,7 @@ enum perf_counter_record_type {
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	u64			type;
+	s64			type;
 
 	u64			irq_period;
 	u32			record_type;
@@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *task);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
-extern void hw_perf_restore_ctrl(u64 ctrl);
-extern u64 hw_perf_disable_all(void);
+extern u64 hw_perf_save_disable(void);
+extern void hw_perf_restore(u64 ctrl);
 extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
 extern u64 atomic64_counter_read(struct perf_counter *counter);
 
@@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *task)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
-static inline void hw_perf_restore_ctrl(u64 ctrl)			{ }
-static inline u64 hw_perf_disable_all(void)		      { return 0; }
+static inline void hw_perf_restore(u64 ctrl)			{ }
+static inline u64 hw_perf_save_disable(void)		      { return 0; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 506286e5ba634..0e93fea17120c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter)
 	return ERR_PTR(-EINVAL);
 }
 
-u64 __weak hw_perf_disable_all(void)		{ return 0; }
-void __weak hw_perf_restore_ctrl(u64 ctrl)	{ }
+u64 __weak hw_perf_save_disable(void)		{ return 0; }
+void __weak hw_perf_restore(u64 ctrl)	{ }
 void __weak hw_perf_counter_setup(void)		{ }
 
 #if BITS_PER_LONG == 64
@@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	list_del_counter(counter, ctx);
-	hw_perf_restore_ctrl(perf_flags);
+	hw_perf_restore(perf_flags);
 
 	if (!ctx->task) {
 		/*
@@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	list_add_counter(counter, ctx);
-	hw_perf_restore_ctrl(perf_flags);
+	hw_perf_restore(perf_flags);
 
 	ctx->nr_counters++;
 
@@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
-	perf_flags = hw_perf_disable_all();
+	perf_flags = hw_perf_save_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		list_del(&counter->list_entry);
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
-	hw_perf_restore_ctrl(perf_flags);
+	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
 
-- 
GitLab


From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 14:03:20 +0100
Subject: [PATCH 0018/2198] perf counters: implement PERF_COUNT_TASK_CLOCK

Impact: add new perf-counter type

The 'task clock' counter counts the amount of time a task is executing,
in nanoseconds. It stops ticking when a task is scheduled out either due
to it blocking, sleeping or it being preempted.

This counter type is a Linux kernel based abstraction, it is available
even if the hardware does not support native hardware performance counters.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  7 +++++--
 kernel/perf_counter.c        | 22 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 68f6e3ad531fb..30c0ec8c1ee35 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -50,8 +50,11 @@ enum hw_event_types {
 	 */
 	PERF_COUNT_CPU_CLOCK		= -1,
 	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	/*
+	 * Future software events:
+	 */
+	/* PERF_COUNT_PAGE_FAULTS	= -3,
+	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0e93fea17120c..a0fe8474ee297 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
 };
 
+static void task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
+{
+}
+
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+	atomic64_counter_set(counter, current->se.sum_exec_runtime);
+}
+
+static const struct hw_perf_counter_ops perf_ops_task_clock = {
+	.hw_perf_counter_enable		= task_clock_perf_counter_enable,
+	.hw_perf_counter_disable	= task_clock_perf_counter_disable,
+	.hw_perf_counter_read		= task_clock_perf_counter_read,
+};
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_CPU_CLOCK:
 		hw_ops = &perf_ops_cpu_clock;
 		break;
+	case PERF_COUNT_TASK_CLOCK:
+		hw_ops = &perf_ops_task_clock;
+		break;
 	default:
 		break;
 	}
-- 
GitLab


From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 14:59:31 +0100
Subject: [PATCH 0019/2198] perf counters: add prctl interface to
 disable/enable counters

Add a way for self-monitoring tasks to disable/enable counters summarily,
via a prctl:

	PR_TASK_PERF_COUNTERS_DISABLE		31
	PR_TASK_PERF_COUNTERS_ENABLE		32

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 ++
 include/linux/prctl.h        |  3 ++
 kernel/perf_counter.c        | 86 +++++++++++++++++++++++++++++++++---
 kernel/sys.c                 |  7 +++
 4 files changed, 93 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 30c0ec8c1ee35..97d86c293ee8f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
 extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
 extern u64 atomic64_counter_read(struct perf_counter *counter);
+extern int perf_counter_task_disable(void);
+extern int perf_counter_task_enable(void);
 
 #else
 static inline void
@@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)			{ }
 static inline u64 hw_perf_save_disable(void)		      { return 0; }
+static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
+static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 #endif
 
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 48d887e3c6e73..b00df4c79c633 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -85,4 +85,7 @@
 #define PR_SET_TIMERSLACK 29
 #define PR_GET_TIMERSLACK 30
 
+#define PR_TASK_PERF_COUNTERS_DISABLE		31
+#define PR_TASK_PERF_COUNTERS_ENABLE		32
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a0fe8474ee297..4e679b91d8bb3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
+	if (counter->active == -1)
+		return;
+
 	counter->hw_ops->hw_perf_counter_enable(counter);
 	counter->active = 1;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
@@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 	cpuctx->task_ctx = ctx;
 }
 
+int perf_counter_task_disable(void)
+{
+	struct task_struct *curr = current;
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter *counter;
+	u64 perf_flags;
+	int cpu;
+
+	if (likely(!ctx->nr_counters))
+		return 0;
+
+	local_irq_disable();
+	cpu = smp_processor_id();
+
+	perf_counter_task_sched_out(curr, cpu);
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Disable all the counters:
+	 */
+	perf_flags = hw_perf_save_disable();
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		WARN_ON_ONCE(counter->active == 1);
+		counter->active = -1;
+	}
+	hw_perf_restore(perf_flags);
+
+	spin_unlock(&ctx->lock);
+
+	local_irq_enable();
+
+	return 0;
+}
+
+int perf_counter_task_enable(void)
+{
+	struct task_struct *curr = current;
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter *counter;
+	u64 perf_flags;
+	int cpu;
+
+	if (likely(!ctx->nr_counters))
+		return 0;
+
+	local_irq_disable();
+	cpu = smp_processor_id();
+
+	spin_lock(&ctx->lock);
+
+	/*
+	 * Disable all the counters:
+	 */
+	perf_flags = hw_perf_save_disable();
+
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->active != -1)
+			continue;
+		counter->active = 0;
+	}
+	hw_perf_restore(perf_flags);
+
+	spin_unlock(&ctx->lock);
+
+	perf_counter_task_sched_in(curr, cpu);
+
+	local_irq_enable();
+
+	return 0;
+}
+
 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
@@ -951,13 +1027,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd)
-
+asmlinkage int
+sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
+		      pid_t pid, int cpu, int group_fd)
 {
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
diff --git a/kernel/sys.c b/kernel/sys.c
index 31deba8f7d160..0f66633be319b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -14,6 +14,7 @@
 #include <linux/prctl.h>
 #include <linux/highuid.h>
 #include <linux/fs.h>
+#include <linux/perf_counter.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
 #include <linux/kexec.h>
@@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
 		case PR_SET_TSC:
 			error = SET_TSC_CTL(arg2);
 			break;
+		case PR_TASK_PERF_COUNTERS_DISABLE:
+			error = perf_counter_task_disable();
+			break;
+		case PR_TASK_PERF_COUNTERS_ENABLE:
+			error = perf_counter_task_enable();
+			break;
 		case PR_GET_TIMERSLACK:
 			error = current->timer_slack_ns;
 			break;
-- 
GitLab


From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 15:17:03 +0100
Subject: [PATCH 0020/2198] perf counters: clean up state transitions

Impact: cleanup

Introduce a proper enum for the 3 states of a counter:

	PERF_COUNTER_STATE_OFF		= -1
	PERF_COUNTER_STATE_INACTIVE	=  0
	PERF_COUNTER_STATE_ACTIVE	=  1

and rename counter->active to counter->state and propagate the
changes everywhere.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  2 +-
 include/linux/perf_counter.h       | 11 ++++++++++-
 kernel/perf_counter.c              | 29 ++++++++++++++---------------
 3 files changed, 25 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3e1dbebe22b9c..4854cca7fffd4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	 * Then store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (!counter->active) {
+		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 			/*
 			 * When counter was not in the overflow mask, we have to
 			 * read it from hardware. We read it as well, when it
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 97d86c293ee8f..8cb095fa442c3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -127,6 +127,15 @@ struct hw_perf_counter_ops {
 	void (*hw_perf_counter_read)	(struct perf_counter *counter);
 };
 
+/**
+ * enum perf_counter_active_state - the states of a counter
+ */
+enum perf_counter_active_state {
+	PERF_COUNTER_STATE_OFF		= -1,
+	PERF_COUNTER_STATE_INACTIVE	=  0,
+	PERF_COUNTER_STATE_ACTIVE	=  1,
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -136,7 +145,7 @@ struct perf_counter {
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
-	int				active;
+	enum perf_counter_active_state	state;
 #if BITS_PER_LONG == 64
 	atomic64_t			count;
 #else
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4e679b91d8bb3..559130b8774de 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info)
 
 	spin_lock(&ctx->lock);
 
-	if (counter->active) {
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->hw_perf_counter_disable(counter);
-		counter->active = 0;
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
 		counter->task = NULL;
@@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info)
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
 		counter->hw_ops->hw_perf_counter_enable(counter);
-		counter->active = 1;
+		counter->state = PERF_COUNTER_STATE_ACTIVE;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
@@ -328,7 +328,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 
 	spin_lock_irq(&ctx->lock);
 	/*
-	 * If the context is active and the counter has not been added
 	 * we need to retry the smp call.
 	 */
 	if (ctx->nr_active && list_empty(&counter->list_entry)) {
@@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter,
 		  struct perf_cpu_context *cpuctx,
 		  struct perf_counter_context *ctx)
 {
-	if (!counter->active)
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return;
 
 	counter->hw_ops->hw_perf_counter_disable(counter);
-	counter->active	=  0;
-	counter->oncpu	= -1;
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->oncpu = -1;
 
 	cpuctx->active_oncpu--;
 	ctx->nr_active--;
@@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (counter->active == -1)
+	if (counter->state == PERF_COUNTER_STATE_OFF)
 		return;
 
 	counter->hw_ops->hw_perf_counter_enable(counter);
-	counter->active = 1;
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
 	cpuctx->active_oncpu++;
@@ -506,8 +505,8 @@ int perf_counter_task_disable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		WARN_ON_ONCE(counter->active == 1);
-		counter->active = -1;
+		WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 	hw_perf_restore(perf_flags);
 
@@ -540,9 +539,9 @@ int perf_counter_task_enable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->active != -1)
+		if (counter->state != PERF_COUNTER_STATE_OFF)
 			continue;
-		counter->active = 0;
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
 	}
 	hw_perf_restore(perf_flags);
 
@@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	 * If counter is enabled and currently active on a CPU, update the
 	 * value in the counter structure:
 	 */
-	if (counter->active) {
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
 					 __hw_perf_counter_read, counter, 1);
 	}
@@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
 
 retry:
 	spin_lock_irq(&ctx->lock);
-	if (!counter->active) {
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
 		counter->irqdata = counter->usrdata;
 		counter->usrdata = oldirqdata;
 		spin_unlock_irq(&ctx->lock);
-- 
GitLab


From 447557ac7ce120306b4a31d6003faef39cb1bf14 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Dec 2008 20:40:18 +0100
Subject: [PATCH 0021/2198] perf counters: update docs

Impact: update docs

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf-counters.txt | 107 ++++++++++++++++++++++----------
 1 file changed, 75 insertions(+), 32 deletions(-)

diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt
index 19033a0bb5262..fddd32189a500 100644
--- a/Documentation/perf-counters.txt
+++ b/Documentation/perf-counters.txt
@@ -10,8 +10,8 @@ trigger interrupts when a threshold number of events have passed - and can
 thus be used to profile the code that runs on that CPU.
 
 The Linux Performance Counter subsystem provides an abstraction of these
-hardware capabilities. It provides per task and per CPU counters, and
-it provides event capabilities on top of those.
+hardware capabilities. It provides per task and per CPU counters, counter
+groups, and it provides event capabilities on top of those.
 
 Performance counters are accessed via special file descriptors.
 There's one file descriptor per virtual counter used.
@@ -19,12 +19,8 @@ There's one file descriptor per virtual counter used.
 The special file descriptor is opened via the perf_counter_open()
 system call:
 
- int
- perf_counter_open(u32 hw_event_type,
-                   u32 hw_event_period,
-                   u32 record_type,
-                   pid_t pid,
-                   int cpu);
+   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+			     pid_t pid, int cpu, int group_fd);
 
 The syscall returns the new fd. The fd can be used via the normal
 VFS system calls: read() can be used to read the counter, fcntl()
@@ -33,39 +29,78 @@ can be used to set the blocking mode, etc.
 Multiple counters can be kept open at a time, and the counters
 can be poll()ed.
 
-When creating a new counter fd, 'hw_event_type' is one of:
-
- enum hw_event_types {
-	PERF_COUNT_CYCLES,
-	PERF_COUNT_INSTRUCTIONS,
-	PERF_COUNT_CACHE_REFERENCES,
-	PERF_COUNT_CACHE_MISSES,
-	PERF_COUNT_BRANCH_INSTRUCTIONS,
-	PERF_COUNT_BRANCH_MISSES,
- };
+When creating a new counter fd, 'perf_counter_hw_event' is:
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	s64			type;
+
+	u64			irq_period;
+	u32			record_type;
+
+	u32			disabled     :  1, /* off by default */
+				nmi	     :  1, /* NMI sampling   */
+				raw	     :  1, /* raw event type */
+				__reserved_1 : 29;
+
+	u64			__reserved_2;
+};
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	/*
+	 * Future software events:
+	 */
+	/* PERF_COUNT_PAGE_FAULTS	= -3,
+	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+};
 
 These are standardized types of events that work uniformly on all CPUs
 that implements Performance Counters support under Linux. If a CPU is
 not able to count branch-misses, then the system call will return
 -EINVAL.
 
-[ Note: more hw_event_types are supported as well, but they are CPU
-  specific and are enumerated via /sys on a per CPU basis. Raw hw event
-  types can be passed in as negative numbers. For example, to count
-  "External bus cycles while bus lock signal asserted" events on Intel
-  Core CPUs, pass in a -0x4064 event type value. ]
-
-The parameter 'hw_event_period' is the number of events before waking up
-a read() that is blocked on a counter fd. Zero value means a non-blocking
-counter.
+More hw_event_types are supported as well, but they are CPU
+specific and are enumerated via /sys on a per CPU basis. Raw hw event
+types can be passed in under hw_event.type if hw_event.raw is 1.
+For example, to count "External bus cycles while bus lock signal asserted"
+events on Intel Core CPUs, pass in a 0x4064 event type value and set
+hw_event.raw to 1.
 
 'record_type' is the type of data that a read() will provide for the
 counter, and it can be one of:
 
-  enum perf_record_type {
-	PERF_RECORD_SIMPLE,
-	PERF_RECORD_IRQ,
-  };
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
 
 a "simple" counter is one that counts hardware events and allows
 them to be read out into a u64 count value. (read() returns 8 on
@@ -76,6 +111,10 @@ the IP of the interrupted context. In this case read() will return
 the 8-byte counter value, plus the Instruction Pointer address of the
 interrupted context.
 
+The parameter 'hw_event_period' is the number of events before waking up
+a read() that is blocked on a counter fd. Zero value means a non-blocking
+counter.
+
 The 'pid' parameter allows the counter to be specific to a task:
 
  pid == 0: if the pid parameter is zero, the counter is attached to the
@@ -92,7 +131,7 @@ CPU:
  cpu >= 0: the counter is restricted to a specific CPU
  cpu == -1: the counter counts on all CPUs
 
-Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
 
 A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts
 events of that task and 'follows' that task to whatever CPU the task
@@ -102,3 +141,7 @@ their own tasks.
 A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
 all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
 
+Group counters are created by passing in a group_fd of another counter.
+Groups are scheduled at once and can be used with PERF_RECORD_GROUP
+to record multi-dimensional timestamps.
+
-- 
GitLab


From 9b194e831fb2c322ed81a373e49620f34edc2778 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 20:22:35 +0100
Subject: [PATCH 0022/2198] x86: implement atomic64_t on 32-bit

Impact: new API

Implement the atomic64_t APIs on 32-bit as well. Will be used by
the performance counters code.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/atomic_32.h | 218 +++++++++++++++++++++++++++++++
 1 file changed, 218 insertions(+)

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index ad5b9f6ecddf3..9927e01b03c2c 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+/* An 64bit atomic type */
+
+typedef struct {
+	unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val)	{ (val) }
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @v: pointer of type atomic64_t
+ *
+ * Atomically reads the value of @v.
+ * Doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr)		((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+	asm volatile(
+
+		LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+		     :		"=A" (old)
+
+		     : [ptr]	"D" (ptr),
+				"A" (old),
+				"b" (ll_low(new)),
+				"c" (ll_high(new))
+
+		     : "memory");
+
+	return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+		 unsigned long long new_val)
+{
+	return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	unsigned long long old_val;
+
+	do {
+		old_val = atomic_read(ptr);
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+	unsigned long long curr_val;
+
+	do {
+		curr_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+	return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val, new_val;
+
+	do {
+		old_val = atomic_read(ptr);
+		new_val = old_val + delta;
+
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+	return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+	return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+	return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract the atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+	return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+	atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+	atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */
+static inline int
+atomic64_add_negative(unsigned long long delta, atomic64_t *ptr)
+{
+	long long old_val = atomic64_add_return(delta, ptr);
+
+	return old_val < 0;
+}
+
 #include <asm-generic/atomic.h>
 #endif /* _ASM_X86_ATOMIC_32_H */
-- 
GitLab


From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 13 Dec 2008 09:00:03 +0100
Subject: [PATCH 0023/2198] perfcounters: restructure x86 counter math

Impact: restructure code

Change counter math from absolute values to clear delta logic.

We try to extract elapsed deltas from the raw hw counter - and put
that into the generic counter.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                   |   2 +-
 arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++-------------
 include/linux/perf_counter.h       |  15 +-
 kernel/perf_counter.c              |  68 +--------
 4 files changed, 137 insertions(+), 178 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2fdc18672412..fe94490bab614 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -643,7 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
-	select HAVE_PERF_COUNTERS
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b903f8df72bb5..5afae13d8d59b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] =
 
 const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static void
+x86_perf_counter_update(struct perf_counter *counter,
+			struct hw_perf_counter *hwc, int idx)
+{
+	u64 prev_raw_count, new_raw_count, delta;
+
+	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
+	/*
+	 * Careful: an NMI might modify the previous counter value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
+	 *
+	 * Careful, not all hw sign-extends above the physical width
+	 * of the count, so we do that by clipping the delta to 32 bits:
+	 */
+	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	WARN_ON_ONCE((int)delta < 0);
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if (!hwc->irq_period)
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
 		hwc->irq_period = 0x7FFFFFFF;
 
-	hwc->next_count	= -(s32)hwc->irq_period;
+	atomic64_set(&hwc->period_left, hwc->irq_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -118,12 +160,6 @@ void hw_perf_enable_all(void)
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore(u64 ctrl)
-{
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
-}
-EXPORT_SYMBOL_GPL(hw_perf_restore);
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void)
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+void hw_perf_restore(u64 ctrl)
+{
+	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore);
+
 static inline void
-__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct perf_counter *counter,
+			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	wrmsr(hwc->config_base + idx, hwc->config, 0);
+	int err;
+
+	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+	WARN_ON_ONCE(err);
 }
 
-static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static void
+__hw_perf_counter_set_period(struct perf_counter *counter,
+			     struct hw_perf_counter *hwc, int idx)
 {
-	per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+	s32 left = atomic64_read(&hwc->period_left);
+	s32 period = hwc->irq_period;
+
+	WARN_ON_ONCE(period <= 0);
+
+	/*
+	 * If we are way outside a reasoable range then just skip forward:
+	 */
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_set(&hwc->period_left, left);
+	}
 
-	wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+	WARN_ON_ONCE(left <= 0);
+
+	per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+	/*
+	 * The hw counter starts counting from this counter offset,
+	 * mark it to be able to extra future deltas:
+	 */
+	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+
+	wrmsr(hwc->counter_base + idx, -left, 0);
 }
 
-static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void
+__x86_perf_counter_enable(struct perf_counter *counter,
+			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
 static void x86_perf_counter_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 
-	__hw_perf_counter_set_period(hwc, idx);
-	__x86_perf_counter_enable(hwc, idx);
-}
-
-static void __hw_perf_save_counter(struct perf_counter *counter,
-				   struct hw_perf_counter *hwc, int idx)
-{
-	s64 raw = -1;
-	s64 delta;
-
-	/*
-	 * Get the raw hw counter value:
-	 */
-	rdmsrl(hwc->counter_base + idx, raw);
-
-	/*
-	 * Rebase it to zero (it started counting at -irq_period),
-	 * to see the delta since ->prev_count:
-	 */
-	delta = (s64)hwc->irq_period + (s64)(s32)raw;
-
-	atomic64_counter_set(counter, hwc->prev_count + delta);
-
-	/*
-	 * Adjust the ->prev_count offset - if we went beyond
-	 * irq_period of units, then we got an IRQ and the counter
-	 * was set back to -irq_period:
-	 */
-	while (delta >= (s64)hwc->irq_period) {
-		hwc->prev_count += hwc->irq_period;
-		delta -= (s64)hwc->irq_period;
-	}
-
-	/*
-	 * Calculate the next raw counter value we'll write into
-	 * the counter at the next sched-in time:
-	 */
-	delta -= (s64)hwc->irq_period;
-
-	hwc->next_count = (s32)delta;
+	__hw_perf_counter_set_period(counter, hwc, idx);
+	__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
 	if (!nr_hw_counters)
@@ -241,14 +286,14 @@ void perf_counter_print_debug(void)
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
-		next_count = per_cpu(prev_next_count[idx], cpu);
+		prev_left = per_cpu(prev_left[idx], cpu);
 
 		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
 		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
-			cpu, idx, next_count);
+		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+			cpu, idx, prev_left);
 	}
 	local_irq_enable();
 }
@@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(hwc, idx);
+	__x86_perf_counter_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
-	__hw_perf_save_counter(counter, hwc, idx);
-}
 
-static void x86_perf_counter_read(struct perf_counter *counter)
-{
-	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned long addr = hwc->counter_base + hwc->idx;
-	s64 offs, val = -1LL;
-	s32 val32;
-
-	/* Careful: NMI might modify the counter offset */
-	do {
-		offs = hwc->prev_count;
-		rdmsrl(addr, val);
-	} while (offs != hwc->prev_count);
-
-	val32 = (s32) val;
-	val = (s64)hwc->irq_period + (s64)val32;
-	atomic64_counter_set(counter, hwc->prev_count + val);
+	/*
+	 * Drain the remaining delta count out of a counter
+	 * that we are disabling:
+	 */
+	x86_perf_counter_update(counter, hwc, idx);
 }
 
 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
@@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 }
 
 /*
- * NMI-safe enable method:
+ * Save and restart an expired counter. Called by NMI contexts,
+ * so it has to be careful about preempting normal counter ops:
  */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
@@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter)
 
 	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
-	__hw_perf_save_counter(counter, hwc, idx);
-	__hw_perf_counter_set_period(hwc, idx);
+	x86_perf_counter_update(counter, hwc, idx);
+	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(hwc, idx);
+		__x86_perf_counter_enable(counter, hwc, idx);
 }
 
 static void
 perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
 	struct perf_counter *counter, *group_leader = sibling->group_leader;
-	int bit;
-
-	/*
-	 * Store the counter's own timestamp first:
-	 */
-	perf_store_irq_data(sibling, sibling->hw_event.type);
-	perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
 	/*
-	 * Then store sibling timestamps (if any):
+	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-			/*
-			 * When counter was not in the overflow mask, we have to
-			 * read it from hardware. We read it as well, when it
-			 * has not been read yet and clear the bit in the
-			 * status mask.
-			 */
-			bit = counter->hw.idx;
-			if (!test_bit(bit, (unsigned long *) overflown) ||
-			    test_bit(bit, (unsigned long *) status)) {
-				clear_bit(bit, (unsigned long *) status);
-				perf_save_and_restart(counter);
-			}
-		}
+		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
-		perf_store_irq_data(sibling, atomic64_counter_read(counter));
+		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
 
@@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void)
 	perf_counters_initialized = true;
 }
 
+static void x86_perf_counter_read(struct perf_counter *counter)
+{
+	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
 	.hw_perf_counter_enable		= x86_perf_counter_enable,
 	.hw_perf_counter_disable	= x86_perf_counter_disable,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8cb095fa442c3..72460289c654a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -91,14 +91,16 @@ struct perf_counter_hw_event {
  * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
 	u64				config;
 	unsigned long			config_base;
 	unsigned long			counter_base;
 	int				nmi;
 	unsigned int			idx;
-	u64				prev_count;
+	atomic64_t			prev_count;
 	u64				irq_period;
-	s32				next_count;
+	atomic64_t			period_left;
+#endif
 };
 
 /*
@@ -140,17 +142,15 @@ enum perf_counter_active_state {
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
 	struct list_head		list_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
 	enum perf_counter_active_state	state;
-#if BITS_PER_LONG == 64
 	atomic64_t			count;
-#else
-	atomic_t			count32[2];
-#endif
+
 	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
@@ -172,6 +172,7 @@ struct perf_counter {
 	struct perf_data		*irqdata;
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
+#endif
 };
 
 /**
@@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
-extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
-extern u64 atomic64_counter_read(struct perf_counter *counter);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 559130b8774de..416861ce8b272 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)	{ }
+void __weak hw_perf_restore(u64 ctrl)		{ }
 void __weak hw_perf_counter_setup(void)		{ }
 
-#if BITS_PER_LONG == 64
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 64 bit version - no complications.
- */
-static inline u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-	return (u64) atomic64_read(&counter->count);
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-	atomic64_set(&counter->count, val);
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic64_read(&counter->count);
-}
-
-#else
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 32 bit version.
- */
-static u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-	u32 cntl, cnth;
-
-	local_irq_disable();
-	do {
-		cnth = atomic_read(&counter->count32[1]);
-		cntl = atomic_read(&counter->count32[0]);
-	} while (cnth != atomic_read(&counter->count32[1]));
-
-	local_irq_enable();
-
-	return cntl | ((u64) cnth) << 32;
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-	u32 *val32 = (void *)&val64;
-
-	atomic_set(counter->count32 + 0, *(val32 + 0));
-	atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-	return atomic_read(counter->count32 + 0) |
-		(u64) atomic_read(counter->count32 + 1) << 32;
-}
-
-#endif
-
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info)
 	ctx->nr_counters++;
 
 	if (cpuctx->active_oncpu < perf_max_counters) {
-		counter->hw_ops->hw_perf_counter_enable(counter);
 		counter->state = PERF_COUNTER_STATE_ACTIVE;
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
+		counter->hw_ops->hw_perf_counter_enable(counter);
 	}
 
 	if (!ctx->task && cpuctx->max_pertask)
@@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 					 __hw_perf_counter_read, counter, 1);
 	}
 
-	return perf_counter_read_safe(counter);
+	return atomic64_read(&counter->count);
 }
 
 /*
@@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
 
-	atomic64_counter_set(counter, cpu_clock(cpu));
+	atomic64_set(&counter->count, cpu_clock(cpu));
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
@@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	atomic64_counter_set(counter, current->se.sum_exec_runtime);
+	atomic64_set(&counter->count, current->se.sum_exec_runtime);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
GitLab


From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 12 Dec 2008 13:49:45 +0100
Subject: [PATCH 0024/2198] perfcounters: implement "counter inheritance"

Impact: implement new performance feature

Counter inheritance can be used to run performance counters in a workload,
transparently - and pipe back the counter results to the parent counter.

Inheritance for performance counters works the following way: when creating
a counter it can be marked with the .inherit=1 flag. Such counters are then
'inherited' by all child tasks (be they fork()-ed or clone()-ed). These
counters get inherited through exec() boundaries as well (except through
setuid boundaries).

The counter values get added back to the parent counter(s) when the child
task(s) exit - much like stime/utime statistics are gathered. So inherited
counters are ideal to gather summary statistics about an application's
behavior via shell commands, without having to modify that application.

The timec.c command utilizes counter inheritance:

  http://redhat.com/~mingo/perfcounters/timec.c

Sample output:

   $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null

   Performance counter stats for 'ls':

           163516953 instructions
                2295 cache-misses
             2855182 branch-misses

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  24 +++-
 kernel/exit.c                |   7 +-
 kernel/perf_counter.c        | 248 +++++++++++++++++++++++++++++------
 3 files changed, 228 insertions(+), 51 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 72460289c654a..e5d25bf8f74e8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -75,10 +75,11 @@ struct perf_counter_hw_event {
 	u64			irq_period;
 	u32			record_type;
 
-	u32			disabled     :  1, /* off by default */
-				nmi	     :  1, /* NMI sampling   */
-				raw	     :  1, /* raw event type */
-				__reserved_1 : 29;
+	u32			disabled     :  1, /* off by default      */
+				nmi	     :  1, /* NMI sampling        */
+				raw	     :  1, /* raw event type      */
+				inherit	     :  1, /* children inherit it */
+				__reserved_1 : 28;
 
 	u64			__reserved_2;
 };
@@ -138,6 +139,8 @@ enum perf_counter_active_state {
 	PERF_COUNTER_STATE_ACTIVE	=  1,
 };
 
+struct file;
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -156,7 +159,10 @@ struct perf_counter {
 
 	struct perf_counter_context	*ctx;
 	struct task_struct		*task;
+	struct file			*filp;
 
+	unsigned int			nr_inherited;
+	struct perf_counter		*parent;
 	/*
 	 * Protect attach/detach:
 	 */
@@ -210,13 +216,16 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
+extern void
+perf_counter_show(struct perf_counter *counter, char *str, int trace);
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *task);
+extern void perf_counter_init_task(struct task_struct *child);
+extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
@@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void);
 
 #else
 static inline void
+perf_counter_show(struct perf_counter *counter, char *str, int trace)   { }
+static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline void perf_counter_init_task(struct task_struct *task)	{ }
+static inline void perf_counter_init_task(struct task_struct *child)	{ }
+static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)			{ }
diff --git a/kernel/exit.c b/kernel/exit.c
index 2d8be7ebb0f73..d336c90a5f133 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-#ifdef CONFIG_FUTEX
 	/*
-	 * This must happen late, after the PID is not
-	 * hashed anymore:
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
 	 */
+	perf_counter_exit_task(tsk);
+#ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
 	if (unlikely(current->pi_state_cache))
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 416861ce8b272..f5e81dd193d1c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 		list_del_init(&sibling->list_entry);
 		list_add_tail(&sibling->list_entry, &ctx->counter_list);
-		WARN_ON_ONCE(!sibling->group_leader);
-		WARN_ON_ONCE(sibling->group_leader == sibling);
 		sibling->group_leader = sibling;
 	}
 }
@@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->hw_perf_counter_disable(counter);
@@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock(&ctx->lock);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	int cpu = smp_processor_id();
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info)
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
-	spin_unlock(&ctx->lock);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -446,10 +446,9 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry)
 		counter->state = PERF_COUNTER_STATE_OFF;
-	}
+
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
@@ -525,26 +524,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	perf_counter_task_sched_in(curr, cpu);
 }
 
-/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-			    struct task_struct *task)
-{
-	spin_lock_init(&ctx->lock);
-	INIT_LIST_HEAD(&ctx->counter_list);
-	ctx->nr_counters	= 0;
-	ctx->task		= task;
-}
-/*
- * Initialize the perf_counter context in task_struct
- */
-void perf_counter_init_task(struct task_struct *task)
-{
-	__perf_counter_init_context(&task->perf_counter_ctx, task);
-}
-
 /*
  * Cross CPU call to read the hardware counter
  */
@@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
 
-		WARN_ON_ONCE(ctx->task);
 		return ctx;
 	}
 
@@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
-		   struct perf_counter *group_leader)
+		   struct perf_counter *group_leader,
+		   gfp_t gfpflags)
 {
 	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
 
-	counter = kzalloc(sizeof(*counter), GFP_KERNEL);
+	counter = kzalloc(sizeof(*counter), gfpflags);
 	if (!counter)
 		return NULL;
 
@@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops) {
+	if (!hw_ops)
 		hw_ops = hw_perf_counter_init(counter);
-	}
 
 	if (!hw_ops) {
 		kfree(counter);
@@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
 	struct perf_counter_context *ctx;
+	struct file *counter_file = NULL;
 	struct file *group_file = NULL;
 	int fput_needed = 0;
+	int fput_needed2 = 0;
 	int ret;
 
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
@@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader);
+	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
-	perf_install_in_context(ctx, counter, cpu);
-
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
 	if (ret < 0)
-		goto err_remove_free_put_context;
+		goto err_free_put_context;
+
+	counter_file = fget_light(ret, &fput_needed2);
+	if (!counter_file)
+		goto err_free_put_context;
+
+	counter->filp = counter_file;
+	perf_install_in_context(ctx, counter, cpu);
+
+	fput_light(counter_file, fput_needed2);
 
 out_fput:
 	fput_light(group_file, fput_needed);
 
 	return ret;
 
-err_remove_free_put_context:
-	mutex_lock(&counter->mutex);
-	perf_counter_remove_from_context(counter);
-	mutex_unlock(&counter->mutex);
+err_free_put_context:
 	kfree(counter);
 
 err_put_context:
@@ -1044,6 +1028,186 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	goto out_fput;
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	ctx->task = task;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static int
+inherit_counter(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *child_counter;
+
+	child_counter = perf_counter_alloc(&parent_counter->hw_event,
+					    parent_counter->cpu, NULL,
+					    GFP_ATOMIC);
+	if (!child_counter)
+		return -ENOMEM;
+
+	/*
+	 * Link it up in the child's context:
+	 */
+	child_counter->ctx = child_ctx;
+	child_counter->task = child;
+	list_add_counter(child_counter, child_ctx);
+	child_ctx->nr_counters++;
+
+	child_counter->parent = parent_counter;
+	parent_counter->nr_inherited++;
+	/*
+	 * inherit into child's child as well:
+	 */
+	child_counter->hw_event.inherit = 1;
+
+	/*
+	 * Get a reference to the parent filp - we will fput it
+	 * when the child counter exits. This is safe to do because
+	 * we are in the parent and we know that the filp still
+	 * exists and has a nonzero count:
+	 */
+	atomic_long_inc(&parent_counter->filp->f_count);
+
+	return 0;
+}
+
+static void
+__perf_counter_exit_task(struct task_struct *child,
+			 struct perf_counter *child_counter,
+			 struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *parent_counter;
+	u64 parent_val, child_val;
+	u64 perf_flags;
+
+	/*
+	 * Disable and unlink this counter.
+	 *
+	 * Be careful about zapping the list - IRQ/NMI context
+	 * could still be processing it:
+	 */
+	local_irq_disable();
+	perf_flags = hw_perf_save_disable();
+
+	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+	list_del_init(&child_counter->list_entry);
+
+	hw_perf_restore(perf_flags);
+	local_irq_enable();
+
+	parent_counter = child_counter->parent;
+	/*
+	 * It can happen that parent exits first, and has counters
+	 * that are still around due to the child reference. These
+	 * counters need to be zapped - but otherwise linger.
+	 */
+	if (!parent_counter)
+		return;
+
+	parent_val = atomic64_read(&parent_counter->count);
+	child_val = atomic64_read(&child_counter->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_counter->count);
+
+	fput(parent_counter->filp);
+
+	kfree(child_counter);
+}
+
+/*
+ * When a child task exist, feed back counter values to parent counters.
+ *
+ * Note: we are running in child context, but the PID is not hashed
+ * anymore so new counters will not be added.
+ */
+void perf_counter_exit_task(struct task_struct *child)
+{
+	struct perf_counter *child_counter, *tmp;
+	struct perf_counter_context *child_ctx;
+
+	child_ctx = &child->perf_counter_ctx;
+
+	if (likely(!child_ctx->nr_counters))
+		return;
+
+	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
+				 list_entry)
+		__perf_counter_exit_task(child, child_counter, child_ctx);
+}
+
+/*
+ * Initialize the perf_counter context in task_struct
+ */
+void perf_counter_init_task(struct task_struct *child)
+{
+	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter *counter, *parent_counter;
+	struct task_struct *parent = current;
+	unsigned long flags;
+
+	child_ctx  =  &child->perf_counter_ctx;
+	parent_ctx = &parent->perf_counter_ctx;
+
+	__perf_counter_init_context(child_ctx, child);
+
+	/*
+	 * This is executed from the parent task context, so inherit
+	 * counters that have been marked for cloning:
+	 */
+
+	if (likely(!parent_ctx->nr_counters))
+		return;
+
+	/*
+	 * Lock the parent list. No need to lock the child - not PID
+	 * hashed yet and not running, so nobody can access it.
+	 */
+	spin_lock_irqsave(&parent_ctx->lock, flags);
+
+	/*
+	 * We dont have to disable NMIs - we are only looking at
+	 * the list, not manipulating it:
+	 */
+	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+		if (!counter->hw_event.inherit || counter->group_leader != counter)
+			continue;
+
+		/*
+		 * Instead of creating recursive hierarchies of counters,
+		 * we link inheritd counters back to the original parent,
+		 * which has a filp for sure, which we use as the reference
+		 * count:
+		 */
+		parent_counter = counter;
+		if (counter->parent)
+			parent_counter = counter->parent;
+
+		if (inherit_counter(parent_counter, parent,
+				  parent_ctx, child, child_ctx))
+			break;
+	}
+
+	spin_unlock_irqrestore(&parent_ctx->lock, flags);
+}
+
 static void __cpuinit perf_counter_init_cpu(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
-- 
GitLab


From 8cb391e8786c8072367f0aeb90551903fef074ba Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 12:22:31 +0100
Subject: [PATCH 0025/2198] perfcounters: fix task clock counter

Impact: bugfix

Update the task clock counter to the new math.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f5e81dd193d1c..1f81cde0dc430 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -850,17 +850,36 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
 };
 
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static void task_clock_perf_counter_update(struct perf_counter *counter)
 {
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = current->se.sum_exec_runtime;
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
 }
 
-static void task_clock_perf_counter_disable(struct perf_counter *counter)
+static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
+	task_clock_perf_counter_update(counter);
 }
 
-static void task_clock_perf_counter_read(struct perf_counter *counter)
+static void task_clock_perf_counter_enable(struct perf_counter *counter)
+{
+	atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+}
+
+static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->count, current->se.sum_exec_runtime);
+	task_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
GitLab


From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 12:28:33 +0100
Subject: [PATCH 0026/2198] perfcounters: add context switch counter

Impact: add new feature, new sw counter

Add a counter that counts the number of context-switches a task
is doing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 +--
 kernel/perf_counter.c        | 51 ++++++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e5d25bf8f74e8..d2a16563415fe 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -53,8 +53,8 @@ enum hw_event_types {
 	/*
 	 * Future software events:
 	 */
-	/* PERF_COUNT_PAGE_FAULTS	= -3,
-	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1f81cde0dc430..09287091c5266 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.hw_perf_counter_read		= task_clock_perf_counter_read,
 };
 
+static u64 get_context_switches(void)
+{
+	struct task_struct *curr = current;
+
+	return curr->nvcsw + curr->nivcsw;
+}
+
+static void context_switches_perf_counter_update(struct perf_counter *counter)
+{
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = get_context_switches();
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
+}
+
+static void context_switches_perf_counter_read(struct perf_counter *counter)
+{
+	context_switches_perf_counter_update(counter);
+}
+
+static void context_switches_perf_counter_enable(struct perf_counter *counter)
+{
+	/*
+	 * ->nvcsw + curr->nivcsw is a per-task value already,
+	 * so we dont have to clear it on switch-in.
+	 */
+}
+
+static void context_switches_perf_counter_disable(struct perf_counter *counter)
+{
+	context_switches_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_context_switches = {
+	.hw_perf_counter_enable		= context_switches_perf_counter_enable,
+	.hw_perf_counter_disable	= context_switches_perf_counter_disable,
+	.hw_perf_counter_read		= context_switches_perf_counter_read,
+};
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_TASK_CLOCK:
 		hw_ops = &perf_ops_task_clock;
 		break;
+	case PERF_COUNT_CONTEXT_SWITCHES:
+		hw_ops = &perf_ops_context_switches;
+		break;
 	default:
 		break;
 	}
-- 
GitLab


From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 12:34:15 +0100
Subject: [PATCH 0027/2198] perfcounters: add task migrations counter

Impact: add new feature, new sw counter

Add a counter that counts the number of cross-CPU migrations a
task is suffering.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  8 +++---
 include/linux/sched.h        |  3 ++-
 kernel/perf_counter.c        | 49 ++++++++++++++++++++++++++++++++++++
 kernel/sched.c               |  7 ++++--
 4 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d2a16563415fe..f30486fc55d72 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -42,6 +42,8 @@ enum hw_event_types {
 	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
 	PERF_COUNT_BRANCH_MISSES	=  5,
 
+	PERF_HW_EVENTS_MAX		=  6,
+
 	/*
 	 * Special "software" counters provided by the kernel, even if
 	 * the hardware does not support performance counters. These
@@ -50,11 +52,11 @@ enum hw_event_types {
 	 */
 	PERF_COUNT_CPU_CLOCK		= -1,
 	PERF_COUNT_TASK_CLOCK		= -2,
-	/*
-	 * Future software events:
-	 */
 	PERF_COUNT_PAGE_FAULTS		= -3,
 	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
 };
 
 /*
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4c530278391b7..2e15be8fc792e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1014,6 +1014,8 @@ struct sched_entity {
 	u64			last_wakeup;
 	u64			avg_overlap;
 
+	u64			nr_migrations;
+
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_max;
@@ -1029,7 +1031,6 @@ struct sched_entity {
 	u64			exec_max;
 	u64			slice_max;
 
-	u64			nr_migrations;
 	u64			nr_migrations_cold;
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 09287091c5266..fb11e351e44e3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.hw_perf_counter_read		= context_switches_perf_counter_read,
 };
 
+static inline u64 get_cpu_migrations(void)
+{
+	return current->se.nr_migrations;
+}
+
+static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
+{
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = get_cpu_migrations();
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
+}
+
+static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+{
+	cpu_migrations_perf_counter_update(counter);
+}
+
+static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+{
+	/*
+	 * se.nr_migrations is a per-task value already,
+	 * so we dont have to clear it on switch-in.
+	 */
+}
+
+static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
+{
+	cpu_migrations_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
+	.hw_perf_counter_enable		= cpu_migrations_perf_counter_enable,
+	.hw_perf_counter_disable	= cpu_migrations_perf_counter_disable,
+	.hw_perf_counter_read		= cpu_migrations_perf_counter_read,
+};
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_CONTEXT_SWITCHES:
 		hw_ops = &perf_ops_context_switches;
 		break;
+	case PERF_COUNT_CPU_MIGRATIONS:
+		hw_ops = &perf_ops_cpu_migrations;
+		break;
 	default:
 		break;
 	}
diff --git a/kernel/sched.c b/kernel/sched.c
index 5c3f4106314ee..382cfdb5e38d8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		p->se.sleep_start -= clock_offset;
 	if (p->se.block_start)
 		p->se.block_start -= clock_offset;
+#endif
 	if (old_cpu != new_cpu) {
-		schedstat_inc(p, se.nr_migrations);
+		p->se.nr_migrations++;
+#ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
-	}
 #endif
+	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
 
@@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.exec_start		= 0;
 	p->se.sum_exec_runtime		= 0;
 	p->se.prev_sum_exec_runtime	= 0;
+	p->se.nr_migrations		= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
 
-- 
GitLab


From e06c61a879910869aa5bf3f8f634abfee1a7bebc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 14:44:31 +0100
Subject: [PATCH 0028/2198] perfcounters: add nr-of-faults counter

Impact: add new feature, new sw counter

Add a counter that counts the number of pagefaults a task
is experiencing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fb11e351e44e3..59c52f9ee431a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.hw_perf_counter_read		= task_clock_perf_counter_read,
 };
 
+static u64 get_page_faults(void)
+{
+	struct task_struct *curr = current;
+
+	return curr->maj_flt + curr->min_flt;
+}
+
+static void page_faults_perf_counter_update(struct perf_counter *counter)
+{
+	u64 prev, now;
+	s64 delta;
+
+	prev = atomic64_read(&counter->hw.prev_count);
+	now = get_page_faults();
+
+	atomic64_set(&counter->hw.prev_count, now);
+
+	delta = now - prev;
+	if (WARN_ON_ONCE(delta < 0))
+		delta = 0;
+
+	atomic64_add(delta, &counter->count);
+}
+
+static void page_faults_perf_counter_read(struct perf_counter *counter)
+{
+	page_faults_perf_counter_update(counter);
+}
+
+static void page_faults_perf_counter_enable(struct perf_counter *counter)
+{
+	/*
+	 * page-faults is a per-task value already,
+	 * so we dont have to clear it on switch-in.
+	 */
+}
+
+static void page_faults_perf_counter_disable(struct perf_counter *counter)
+{
+	page_faults_perf_counter_update(counter);
+}
+
+static const struct hw_perf_counter_ops perf_ops_page_faults = {
+	.hw_perf_counter_enable		= page_faults_perf_counter_enable,
+	.hw_perf_counter_disable	= page_faults_perf_counter_disable,
+	.hw_perf_counter_read		= page_faults_perf_counter_read,
+};
+
 static u64 get_context_switches(void)
 {
 	struct task_struct *curr = current;
@@ -994,6 +1042,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_TASK_CLOCK:
 		hw_ops = &perf_ops_task_clock;
 		break;
+	case PERF_COUNT_PAGE_FAULTS:
+		hw_ops = &perf_ops_page_faults;
+		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
 		hw_ops = &perf_ops_context_switches;
 		break;
-- 
GitLab


From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 18:36:30 +0100
Subject: [PATCH 0029/2198] perfcounters: fix non-intel-perfmon CPUs

Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5afae13d8d59b..6d30f603b62c3 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 void hw_perf_enable_all(void)
 {
+	if (unlikely(!perf_counters_initialized))
+		return;
+
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
@@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
 
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+
 	return ctrl;
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 void hw_perf_restore(u64 ctrl)
 {
+	if (unlikely(!perf_counters_initialized))
+		return;
+
 	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
-- 
GitLab


From 088e2852c858159d47f71ee8da38e0fb1b21f806 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 20:21:00 +0100
Subject: [PATCH 0030/2198] perfcounters, x86: fix sw counters on non-PMC CPUs

Make perf_max_counters default to at least 1 - this allows the sw
counters to be used.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 59c52f9ee431a..539fa8283a06f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -25,7 +25,7 @@
  */
 DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context);
 
-int perf_max_counters __read_mostly;
+int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
-- 
GitLab


From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 21:58:46 +0100
Subject: [PATCH 0031/2198] perfcounters: fix lapic initialization

Fix non-working NMI sampling in certain bootup scenarios.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6d30f603b62c3..8a154bd7ba94a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void)
 	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
 	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
 
+	perf_counters_initialized = true;
+
 	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
-
-	perf_counters_initialized = true;
 }
 
 static void x86_perf_counter_read(struct perf_counter *counter)
-- 
GitLab


From 0cc0c027d4e028632933f1be2dc4cd730358183b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 14 Dec 2008 23:20:36 +0100
Subject: [PATCH 0032/2198] perfcounters: release CPU context when exiting task
 counters

If counters are exiting via do_exit() not via filp close, then
the CPU context needs to be released - otherwise future percpu
counter creations might fail.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 539fa8283a06f..16396e9406fa8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1273,8 +1273,19 @@ __perf_counter_exit_task(struct task_struct *child,
 	local_irq_disable();
 	perf_flags = hw_perf_save_disable();
 
-	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE)
+	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+		struct perf_cpu_context *cpuctx;
+
+		cpuctx = &__get_cpu_var(perf_cpu_context);
+
 		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+		child_counter->oncpu = -1;
+
+		cpuctx->active_oncpu--;
+		child_ctx->nr_active--;
+	}
+
 	list_del_init(&child_counter->list_entry);
 
 	hw_perf_restore(perf_flags);
@@ -1539,4 +1550,3 @@ static int __init perf_counter_sysfs_init(void)
 				  &perfclass_attr_group);
 }
 device_initcall(perf_counter_sysfs_init);
-
-- 
GitLab


From f65cb45cba63f249458b669aa67069eabc37b2f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 16 Dec 2008 13:40:44 +0100
Subject: [PATCH 0033/2198] perfcounters: flush on setuid exec

Pavel Machek pointed out that performance counters should be flushed
when crossing protection domains on setuid execution.

Reported-by: Pavel Machek <pavel@suse.cz>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/exec.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index ec5df9a383133..d5165d899a49e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -33,6 +33,7 @@
 #include <linux/string.h>
 #include <linux/init.h>
 #include <linux/pagemap.h>
+#include <linux/perf_counter.h>
 #include <linux/highmem.h>
 #include <linux/spinlock.h>
 #include <linux/key.h>
@@ -1017,6 +1018,13 @@ int flush_old_exec(struct linux_binprm * bprm)
 		set_dumpable(current->mm, suid_dumpable);
 	}
 
+	/*
+	 * Flush performance counters when crossing a
+	 * security domain:
+	 */
+	if (!get_dumpable(current->mm))
+		perf_counter_exit_task(current);
+
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 
-- 
GitLab


From a86ed50859d65a08beec9474df97b88438a996df Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 00:43:10 +0100
Subject: [PATCH 0034/2198] perfcounters: use hw_event.disable flag

Impact: implement default-off counters

Make sure that counters that are created with counter.hw_event.disabled=1,
get created in disabled state.

They can be enabled via:

        prctl(PR_TASK_PERF_COUNTERS_ENABLE);

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 16396e9406fa8..5431e790b5d60 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1093,6 +1093,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 
+	if (hw_event->disabled)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-- 
GitLab


From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001
From: Jaswinder Singh <jaswinder@infradead.org>
Date: Fri, 19 Dec 2008 22:37:58 +0530
Subject: [PATCH 0035/2198] x86: perf_counter.c intel_perfmon_event_map and
 max_intel_perfmon_events should be static

Impact: cleanup, avoid sparse warnings, reduce kernel size a bit

Fixes these sparse warnings:
 arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static?
 arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh <jaswinder@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8a154bd7ba94a..bdbdb56eaa340 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -41,7 +41,7 @@ struct cpu_hw_counters {
  */
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
-const int intel_perfmon_event_map[] =
+static const int intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CYCLES]			= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
 /*
  * Propagate counter elapsed time into the generic counter.
-- 
GitLab


From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:04:16 +0100
Subject: [PATCH 0036/2198] perfcounters: remove warnings

Impact: remove debug checks

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 7 -------
 include/linux/perf_counter.h       | 4 ----
 kernel/perf_counter.c              | 8 --------
 3 files changed, 19 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index bdbdb56eaa340..89fad5d4fb37d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter,
 {
 	u64 prev_raw_count, new_raw_count, delta;
 
-	WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
 	/*
 	 * Careful: an NMI might modify the previous counter value.
 	 *
@@ -89,7 +88,6 @@ x86_perf_counter_update(struct perf_counter *counter,
 	 * of the count, so we do that by clipping the delta to 32 bits:
 	 */
 	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
-	WARN_ON_ONCE((int)delta < 0);
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
@@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter,
 	int err;
 
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
-	WARN_ON_ONCE(err);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
@@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 	s32 left = atomic64_read(&hwc->period_left);
 	s32 period = hwc->irq_period;
 
-	WARN_ON_ONCE(period <= 0);
-
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
 	 */
@@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 		atomic64_set(&hwc->period_left, left);
 	}
 
-	WARN_ON_ONCE(left <= 0);
-
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f30486fc55d72..d038450de87d5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -218,8 +218,6 @@ struct perf_cpu_context {
 extern int perf_max_counters;
 
 #ifdef CONFIG_PERF_COUNTERS
-extern void
-perf_counter_show(struct perf_counter *counter, char *str, int trace);
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
@@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void);
 
 #else
 static inline void
-perf_counter_show(struct perf_counter *counter, char *str, int trace)   { }
-static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
 perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5431e790b5d60..aab6c123b02c2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
@@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
@@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
@@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
 	atomic64_set(&counter->hw.prev_count, now);
 
 	delta = now - prev;
-	if (WARN_ON_ONCE(delta < 0))
-		delta = 0;
 
 	atomic64_add(delta, &counter->count);
 }
-- 
GitLab


From 7995888fcb0246543ee8027bf2835a250ba8c925 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 08:54:56 +0100
Subject: [PATCH 0037/2198] perfcounters: tweak group scheduling

Impact: schedule in groups atomically

If there are multiple groups in a task, make sure they are scheduled
in and out atomically.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index aab6c123b02c2..f8a4d9a5d5d35 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -367,21 +367,26 @@ counter_sched_in(struct perf_counter *counter,
 	ctx->nr_active++;
 }
 
-static void
+static int
 group_sched_in(struct perf_counter *group_counter,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx,
 	       int cpu)
 {
 	struct perf_counter *counter;
+	int was_group = 0;
 
 	counter_sched_in(group_counter, cpuctx, ctx, cpu);
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
 		counter_sched_in(counter, cpuctx, ctx, cpu);
+		was_group = 1;
+	}
+
+	return was_group;
 }
 
 /*
@@ -416,7 +421,12 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		group_sched_in(counter, cpuctx, ctx, cpu);
+		/*
+		 * If we scheduled in a group atomically and
+		 * exclusively, break out:
+		 */
+		if (group_sched_in(counter, cpuctx, ctx, cpu))
+			break;
 	}
 	spin_unlock(&ctx->lock);
 
-- 
GitLab


From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 09:02:19 +0100
Subject: [PATCH 0038/2198] x86, perfcounters: rename intel_arch_perfmon.h =>
 perf_counter.h

Impact: rename include file

We'll be providing an asm/perf_counter.h to the generic perfcounter code,
so use the already existing x86 file for this purpose and rename it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../include/asm/{intel_arch_perfmon.h => perf_counter.h}    | 6 +++---
 arch/x86/kernel/apic.c                                      | 2 +-
 arch/x86/kernel/cpu/common.c                                | 2 +-
 arch/x86/kernel/cpu/perf_counter.c                          | 2 +-
 arch/x86/kernel/cpu/perfctr-watchdog.c                      | 2 +-
 arch/x86/oprofile/op_model_ppro.c                           | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)
 rename arch/x86/include/asm/{intel_arch_perfmon.h => perf_counter.h} (90%)

diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/perf_counter.h
similarity index 90%
rename from arch/x86/include/asm/intel_arch_perfmon.h
rename to arch/x86/include/asm/perf_counter.h
index 71598a9eab617..9dadce1124ee6 100644
--- a/arch/x86/include/asm/intel_arch_perfmon.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -1,5 +1,5 @@
-#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
-#define _ASM_X86_INTEL_ARCH_PERFMON_H
+#ifndef _ASM_X86_PERF_COUNTER_H
+#define _ASM_X86_PERF_COUNTER_H
 
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
@@ -38,4 +38,4 @@ static inline void init_hw_perf_counters(void)		{ }
 static inline void perf_counters_lapic_init(int nmi)	{ }
 #endif
 
-#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
+#endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 0579ec1cd6e31..4f859acb1563c 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -31,7 +31,7 @@
 #include <linux/dmi.h>
 #include <linux/dmar.h>
 
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/atomic.h>
 #include <asm/smp.h>
 #include <asm/mtrr.h>
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4461011db47c6..ad331b4d62387 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -17,7 +17,7 @@
 #include <asm/mmu_context.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/pat.h>
 #include <asm/asm.h>
 #include <asm/numa.h>
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 89fad5d4fb37d..a4a3a09a654b5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -16,7 +16,7 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 #include <asm/apic.h>
 
 static bool perf_counters_initialized __read_mostly;
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 9abd48b226741..d6f5b9fbde325 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -20,7 +20,7 @@
 #include <linux/kprobes.h>
 
 #include <asm/apic.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 struct nmi_watchdog_ctlblk {
 	unsigned int cccr_msr;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index e9f80c744cf34..07c914555a5ee 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -18,7 +18,7 @@
 #include <asm/msr.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
-#include <asm/intel_arch_perfmon.h>
+#include <asm/perf_counter.h>
 
 #include "op_x86_model.h"
 #include "op_counter.h"
-- 
GitLab


From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 09:09:13 +0100
Subject: [PATCH 0039/2198] x86, perfcounters: prepare for fixed-mode PMCs

Impact: refactor the x86 code for fixed-mode PMCs

Extend the data structures and rename the existing facilities
to allow for a 'generic' versus 'fixed' counter distinction.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h | 11 ++++++
 arch/x86/kernel/cpu/perf_counter.c  | 53 ++++++++++++++---------------
 include/linux/perf_counter.h        |  1 +
 3 files changed, 38 insertions(+), 27 deletions(-)

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 9dadce1124ee6..dd5a4a559e2dc 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -1,6 +1,13 @@
 #ifndef _ASM_X86_PERF_COUNTER_H
 #define _ASM_X86_PERF_COUNTER_H
 
+/*
+ * Performance counter hw details:
+ */
+
+#define X86_PMC_MAX_GENERIC					8
+#define X86_PMC_MAX_FIXED					3
+
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
@@ -20,6 +27,10 @@
 
 #define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6
 
+/*
+ * Intel "Architectural Performance Monitoring" CPUID
+ * detection/enumeration details:
+ */
 union cpuid10_eax {
 	struct {
 		unsigned int version_id:8;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a4a3a09a654b5..fc3af86882326 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_hw_counters __read_mostly;
 static u32 perf_counter_mask __read_mostly;
 
-/* No support for fixed function counters yet */
-
-#define MAX_HW_COUNTERS		8
-
 struct cpu_hw_counters {
-	struct perf_counter	*counters[MAX_HW_COUNTERS];
-	unsigned long		used[BITS_TO_LONGS(MAX_HW_COUNTERS)];
+	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
+	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
+
+	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
+	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
 };
 
 /*
@@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl)
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline void
-__x86_perf_counter_disable(struct perf_counter *counter,
+__pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
 	int err;
@@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter,
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
-static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
@@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 }
 
 static void
-__x86_perf_counter_enable(struct perf_counter *counter,
+__pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
 	wrmsr(hwc->config_base + idx,
@@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter,
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static void x86_perf_counter_enable(struct perf_counter *counter)
+static void pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_perf_counter_disable(counter, hwc, idx);
+	__pmc_generic_disable(counter, hwc, idx);
 
-	cpuc->counters[idx] = counter;
+	cpuc->generic[idx] = counter;
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
-	__x86_perf_counter_enable(counter, hwc, idx);
+	__pmc_generic_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
@@ -301,16 +300,16 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-static void x86_perf_counter_disable(struct perf_counter *counter)
+static void pmc_generic_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__x86_perf_counter_disable(counter, hwc, idx);
+	__pmc_generic_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
-	cpuc->counters[idx] = NULL;
+	cpuc->generic[idx] = NULL;
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-		__x86_perf_counter_enable(counter, hwc, idx);
+		__pmc_generic_enable(counter, hwc, idx);
 }
 
 static void
@@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 again:
 	ack = status;
 	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->counters[bit];
+		struct perf_counter *counter = cpuc->generic[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
 		if (!counter)
@@ -412,7 +411,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		}
 		/*
 		 * From NMI context we cannot call into the scheduler to
-		 * do a task wakeup - but we mark these counters as
+		 * do a task wakeup - but we mark these generic as
 		 * wakeup_pending and initate a wakeup callback:
 		 */
 		if (nmi) {
@@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs)
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	for_each_bit(bit, cpuc->used, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->counters[bit];
+		struct perf_counter *counter = cpuc->generic[bit];
 
 		if (!counter)
 			continue;
@@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void)
 	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
 	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
 	nr_hw_counters = eax.split.num_counters;
-	if (nr_hw_counters > MAX_HW_COUNTERS) {
-		nr_hw_counters = MAX_HW_COUNTERS;
+	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
+		nr_hw_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_hw_counters, MAX_HW_COUNTERS);
+			nr_hw_counters, X86_PMC_MAX_GENERIC);
 	}
 	perf_counter_mask = (1 << nr_hw_counters) - 1;
 	perf_max_counters = nr_hw_counters;
@@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void x86_perf_counter_read(struct perf_counter *counter)
+static void pmc_generic_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
 
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.hw_perf_counter_enable		= x86_perf_counter_enable,
-	.hw_perf_counter_disable	= x86_perf_counter_disable,
-	.hw_perf_counter_read		= x86_perf_counter_read,
+	.hw_perf_counter_enable		= pmc_generic_enable,
+	.hw_perf_counter_disable	= pmc_generic_disable,
+	.hw_perf_counter_read		= pmc_generic_read,
 };
 
 const struct hw_perf_counter_ops *
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d038450de87d5..984da540224b6 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,6 +14,7 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
+#include <asm/perf_counter.h>
 
 #include <linux/list.h>
 #include <linux/mutex.h>
-- 
GitLab


From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 10:51:15 +0100
Subject: [PATCH 0040/2198] perfcounters: add fixed-mode PMC enumeration

Enumerate fixed-mode PMCs based on CPUID, and feed that into the
perfcounter code.

Does not use fixed-mode PMCs yet.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h | 23 +++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_counter.c  | 23 +++++++++++++++++------
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index dd5a4a559e2dc..945a315e6d62f 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -41,6 +41,29 @@ union cpuid10_eax {
 	unsigned int full;
 };
 
+union cpuid10_edx {
+	struct {
+		unsigned int num_counters_fixed:4;
+		unsigned int reserved:28;
+	} split;
+	unsigned int full;
+};
+
+
+/*
+ * Fixed-purpose performance counters:
+ */
+
+/* Instr_Retired.Any: */
+#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+
+/* CPU_CLK_Unhalted.Core: */
+#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+
+/* CPU_CLK_Unhalted.Ref: */
+#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc3af86882326..2fca50c45979d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_hw_counters __read_mostly;
 static u32 perf_counter_mask __read_mostly;
 
+static int nr_hw_counters_fixed __read_mostly;
+
 struct cpu_hw_counters {
 	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
@@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 void __init init_hw_perf_counters(void)
 {
 	union cpuid10_eax eax;
-	unsigned int unused;
 	unsigned int ebx;
+	unsigned int unused;
+	union cpuid10_edx edx;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
@@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void)
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
 	 */
-	cpuid(10, &(eax.full), &ebx, &unused, &unused);
+	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return;
 
 	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
 
-	printk(KERN_INFO "... version:      %d\n", eax.split.version_id);
-	printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters);
+	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
+	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
 	nr_hw_counters = eax.split.num_counters;
 	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
 		nr_hw_counters = X86_PMC_MAX_GENERIC;
@@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_hw_counters) - 1;
 	perf_max_counters = nr_hw_counters;
 
-	printk(KERN_INFO "... bit_width:    %d\n", eax.split.bit_width);
-	printk(KERN_INFO "... mask_length:  %d\n", eax.split.mask_length);
+	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+
+	nr_hw_counters_fixed = edx.split.num_counters_fixed;
+	if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
+		nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
+			nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+	}
+	printk(KERN_INFO "... fixed counters:  %d\n", nr_hw_counters_fixed);
 
 	perf_counters_initialized = true;
 
-- 
GitLab


From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 13:09:20 +0100
Subject: [PATCH 0041/2198] x86, perfcounters: refactor code for fixed-function
 PMCs

Impact: clean up

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h | 14 +++++-
 arch/x86/kernel/cpu/perf_counter.c  | 73 +++++++++++++++--------------
 2 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 945a315e6d62f..13745deb16c8b 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -8,6 +8,10 @@
 #define X86_PMC_MAX_GENERIC					8
 #define X86_PMC_MAX_FIXED					3
 
+#define X86_PMC_IDX_GENERIC				        0
+#define X86_PMC_IDX_FIXED				       32
+#define X86_PMC_IDX_MAX					       64
+
 #define MSR_ARCH_PERFMON_PERFCTR0			      0xc1
 #define MSR_ARCH_PERFMON_PERFCTR1			      0xc2
 
@@ -54,6 +58,15 @@ union cpuid10_edx {
  * Fixed-purpose performance counters:
  */
 
+/*
+ * All 3 fixed-mode PMCs are configured via this single MSR:
+ */
+#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d
+
+/*
+ * The counts are available in three separate MSRs:
+ */
+
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
 
@@ -63,7 +76,6 @@ union cpuid10_edx {
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 
-
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2fca50c45979d..358af52664078 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly;
 /*
  * Number of (generic) HW counters:
  */
-static int nr_hw_counters __read_mostly;
-static u32 perf_counter_mask __read_mostly;
+static int nr_counters_generic __read_mostly;
+static u64 perf_counter_mask __read_mostly;
 
-static int nr_hw_counters_fixed __read_mostly;
+static int nr_counters_fixed __read_mostly;
 
 struct cpu_hw_counters {
-	struct perf_counter	*generic[X86_PMC_MAX_GENERIC];
-	unsigned long		used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)];
-
-	struct perf_counter	*fixed[X86_PMC_MAX_FIXED];
-	unsigned long		used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)];
+	struct perf_counter	*counters[X86_PMC_IDX_MAX];
+	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 };
 
 /*
@@ -159,7 +156,7 @@ void hw_perf_enable_all(void)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
 }
 
 u64 hw_perf_save_disable(void)
@@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void)
 		return 0;
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
 	return ctrl;
 }
@@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
@@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+static int fixed_mode_idx(struct hw_perf_counter *hwc)
+{
+	return -1;
+}
+
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
@@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	/* Try to get the previous counter again */
 	if (test_and_set_bit(idx, cpuc->used)) {
-		idx = find_first_zero_bit(cpuc->used, nr_hw_counters);
+		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	__pmc_generic_disable(counter, hwc, idx);
 
-	cpuc->generic[idx] = counter;
+	cpuc->counters[idx] = counter;
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -270,7 +272,7 @@ void perf_counter_print_debug(void)
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
 	int cpu, idx;
 
-	if (!nr_hw_counters)
+	if (!nr_counters_generic)
 		return;
 
 	local_irq_disable();
@@ -286,7 +288,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 
-	for (idx = 0; idx < nr_hw_counters; idx++) {
+	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
@@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	__pmc_generic_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
-	cpuc->generic[idx] = NULL;
+	cpuc->counters[idx] = NULL;
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 
 	/* Disable counters globally */
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 again:
 	ack = status;
-	for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->generic[bit];
+	for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
 		if (!counter)
@@ -424,7 +426,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		}
 	}
 
-	wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 
 	/*
 	 * Repeat if there is more work to be done:
@@ -436,7 +438,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	/*
 	 * Restore - do not reenable when global enable is off:
 	 */
-	wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
@@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	for_each_bit(bit, cpuc->used, nr_hw_counters) {
-		struct perf_counter *counter = cpuc->generic[bit];
+	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
+		struct perf_counter *counter = cpuc->counters[bit];
 
 		if (!counter)
 			continue;
@@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void)
 
 	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
 	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
-	nr_hw_counters = eax.split.num_counters;
-	if (nr_hw_counters > X86_PMC_MAX_GENERIC) {
-		nr_hw_counters = X86_PMC_MAX_GENERIC;
+	nr_counters_generic = eax.split.num_counters;
+	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
+		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_hw_counters, X86_PMC_MAX_GENERIC);
+			nr_counters_generic, X86_PMC_MAX_GENERIC);
 	}
-	perf_counter_mask = (1 << nr_hw_counters) - 1;
-	perf_max_counters = nr_hw_counters;
+	perf_counter_mask = (1 << nr_counters_generic) - 1;
+	perf_max_counters = nr_counters_generic;
 
 	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
 	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
 
-	nr_hw_counters_fixed = edx.split.num_counters_fixed;
-	if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) {
-		nr_hw_counters_fixed = X86_PMC_MAX_FIXED;
+	nr_counters_fixed = edx.split.num_counters_fixed;
+	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
+		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-			nr_hw_counters_fixed, X86_PMC_MAX_FIXED);
+			nr_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	printk(KERN_INFO "... fixed counters:  %d\n", nr_hw_counters_fixed);
+	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);
+
+	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
+	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
-- 
GitLab


From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 14:20:28 +0100
Subject: [PATCH 0042/2198] perfcounters: hw ops rename

Impact: rename field names

Shorten them.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  6 ++--
 include/linux/perf_counter.h       |  6 ++--
 kernel/perf_counter.c              | 50 +++++++++++++++---------------
 3 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 358af52664078..b675571214250 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.hw_perf_counter_enable		= pmc_generic_enable,
-	.hw_perf_counter_disable	= pmc_generic_disable,
-	.hw_perf_counter_read		= pmc_generic_read,
+	.enable		= pmc_generic_enable,
+	.disable	= pmc_generic_disable,
+	.read		= pmc_generic_read,
 };
 
 const struct hw_perf_counter_ops *
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 984da540224b6..48f76d2e54c2a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,9 +128,9 @@ struct perf_counter;
  * struct hw_perf_counter_ops - performance counter hw ops
  */
 struct hw_perf_counter_ops {
-	void (*hw_perf_counter_enable)	(struct perf_counter *counter);
-	void (*hw_perf_counter_disable)	(struct perf_counter *counter);
-	void (*hw_perf_counter_read)	(struct perf_counter *counter);
+	void (*enable)			(struct perf_counter *counter);
+	void (*disable)			(struct perf_counter *counter);
+	void (*read)			(struct perf_counter *counter);
 };
 
 /**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f8a4d9a5d5d35..961d651aa5748 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info)
 	spin_lock_irqsave(&ctx->lock, flags);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->hw_ops->hw_perf_counter_disable(counter);
+		counter->hw_ops->disable(counter);
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
@@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info)
 		counter->oncpu = cpu;
 		ctx->nr_active++;
 		cpuctx->active_oncpu++;
-		counter->hw_ops->hw_perf_counter_enable(counter);
+		counter->hw_ops->enable(counter);
 	}
 
 	if (!ctx->task && cpuctx->max_pertask)
@@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter,
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return;
 
-	counter->hw_ops->hw_perf_counter_disable(counter);
+	counter->hw_ops->disable(counter);
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->oncpu = -1;
 
@@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter,
  *
  * We stop each counter and update the counter value in counter->count.
  *
- * This does not protect us against NMI, but hw_perf_counter_disable()
+ * This does not protect us against NMI, but disable()
  * sets the disabled bit in the control field of counter _before_
  * accessing the counter control register. If a NMI hits, then it will
  * not restart the counter.
@@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter,
 	if (counter->state == PERF_COUNTER_STATE_OFF)
 		return;
 
-	counter->hw_ops->hw_perf_counter_enable(counter);
+	counter->hw_ops->enable(counter);
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
@@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter,
  *
  * We restore the counter value and then enable it.
  *
- * This does not protect us against NMI, but hw_perf_counter_enable()
+ * This does not protect us against NMI, but enable()
  * sets the enabled bit in the control field of counter _before_
  * accessing the counter control register. If a NMI hits, then it will
  * keep the counter running.
@@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 /*
  * Cross CPU call to read the hardware counter
  */
-static void __hw_perf_counter_read(void *info)
+static void __read(void *info)
 {
 	struct perf_counter *counter = info;
 
-	counter->hw_ops->hw_perf_counter_read(counter);
+	counter->hw_ops->read(counter);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	 */
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
-					 __hw_perf_counter_read, counter, 1);
+					 __read, counter, 1);
 	}
 
 	return atomic64_read(&counter->count);
@@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
-	.hw_perf_counter_enable		= cpu_clock_perf_counter_enable,
-	.hw_perf_counter_disable	= cpu_clock_perf_counter_disable,
-	.hw_perf_counter_read		= cpu_clock_perf_counter_read,
+	.enable		= cpu_clock_perf_counter_enable,
+	.disable	= cpu_clock_perf_counter_disable,
+	.read		= cpu_clock_perf_counter_read,
 };
 
 static void task_clock_perf_counter_update(struct perf_counter *counter)
@@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-	.hw_perf_counter_enable		= task_clock_perf_counter_enable,
-	.hw_perf_counter_disable	= task_clock_perf_counter_disable,
-	.hw_perf_counter_read		= task_clock_perf_counter_read,
+	.enable		= task_clock_perf_counter_enable,
+	.disable	= task_clock_perf_counter_disable,
+	.read		= task_clock_perf_counter_read,
 };
 
 static u64 get_page_faults(void)
@@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_page_faults = {
-	.hw_perf_counter_enable		= page_faults_perf_counter_enable,
-	.hw_perf_counter_disable	= page_faults_perf_counter_disable,
-	.hw_perf_counter_read		= page_faults_perf_counter_read,
+	.enable		= page_faults_perf_counter_enable,
+	.disable	= page_faults_perf_counter_disable,
+	.read		= page_faults_perf_counter_read,
 };
 
 static u64 get_context_switches(void)
@@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_context_switches = {
-	.hw_perf_counter_enable		= context_switches_perf_counter_enable,
-	.hw_perf_counter_disable	= context_switches_perf_counter_disable,
-	.hw_perf_counter_read		= context_switches_perf_counter_read,
+	.enable		= context_switches_perf_counter_enable,
+	.disable	= context_switches_perf_counter_disable,
+	.read		= context_switches_perf_counter_read,
 };
 
 static inline u64 get_cpu_migrations(void)
@@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
-	.hw_perf_counter_enable		= cpu_migrations_perf_counter_enable,
-	.hw_perf_counter_disable	= cpu_migrations_perf_counter_disable,
-	.hw_perf_counter_read		= cpu_migrations_perf_counter_read,
+	.enable		= cpu_migrations_perf_counter_enable,
+	.disable	= cpu_migrations_perf_counter_disable,
+	.read		= cpu_migrations_perf_counter_read,
 };
 
 static const struct hw_perf_counter_ops *
@@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		child_counter->hw_ops->hw_perf_counter_disable(child_counter);
+		child_counter->hw_ops->disable(child_counter);
 		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
 		child_counter->oncpu = -1;
 
-- 
GitLab


From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 17 Dec 2008 14:10:57 +0100
Subject: [PATCH 0043/2198] perfcounters: fix task clock counter

Impact: fix per task clock counter precision

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |  8 +++++
 kernel/exit.c               | 17 ++++++---
 kernel/perf_counter.c       | 70 ++++++++++++++++++++++++++++---------
 kernel/sched.c              | 49 ++++++++++++++++++++++++--
 4 files changed, 120 insertions(+), 24 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 4a145caeee075..1b2e3242497cc 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 	return sum;
 }
 
+
+/*
+ * Lock/unlock the current runqueue - to extract task statistics:
+ */
+extern void curr_rq_lock_irq_save(unsigned long *flags);
+extern void curr_rq_unlock_irq_restore(unsigned long *flags);
+extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
+
 extern void account_user_time(struct task_struct *, cputime_t);
 extern void account_user_time_scaled(struct task_struct *, cputime_t);
 extern void account_system_time(struct task_struct *, int, cputime_t);
diff --git a/kernel/exit.c b/kernel/exit.c
index d336c90a5f133..244edfd96865e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	forget_original_parent(tsk);
 	exit_task_namespaces(tsk);
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code)
 	mpol_put(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-	/*
-	 * These must happen late, after the PID is not
-	 * hashed anymore, but still at a point that may sleep:
-	 */
-	perf_counter_exit_task(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
@@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
+	/*
+	 * These must happen late, after the PID is not
+	 * hashed anymore, but still at a point that may sleep:
+	 */
+	perf_counter_exit_task(tsk);
+
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 961d651aa5748..f1110ac1267bc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -18,6 +18,7 @@
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
+#include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
 
 /*
@@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		counter->hw_ops->disable(counter);
@@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
 }
 
 
@@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info)
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
 }
 
 /*
@@ -438,15 +443,19 @@ int perf_counter_task_disable(void)
 	struct task_struct *curr = current;
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
+	unsigned long flags;
 	u64 perf_flags;
 	int cpu;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	local_irq_disable();
+	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
+	/* force the update of the task clock: */
+	__task_delta_exec(curr, 1);
+
 	perf_counter_task_sched_out(curr, cpu);
 
 	spin_lock(&ctx->lock);
@@ -463,7 +472,7 @@ int perf_counter_task_disable(void)
 
 	spin_unlock(&ctx->lock);
 
-	local_irq_enable();
+	curr_rq_unlock_irq_restore(&flags);
 
 	return 0;
 }
@@ -473,15 +482,19 @@ int perf_counter_task_enable(void)
 	struct task_struct *curr = current;
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
+	unsigned long flags;
 	u64 perf_flags;
 	int cpu;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	local_irq_disable();
+	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
+	/* force the update of the task clock: */
+	__task_delta_exec(curr, 1);
+
 	spin_lock(&ctx->lock);
 
 	/*
@@ -493,6 +506,7 @@ int perf_counter_task_enable(void)
 		if (counter->state != PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
 
@@ -500,7 +514,7 @@ int perf_counter_task_enable(void)
 
 	perf_counter_task_sched_in(curr, cpu);
 
-	local_irq_enable();
+	curr_rq_unlock_irq_restore(&flags);
 
 	return 0;
 }
@@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
 	struct perf_counter *counter = info;
+	unsigned long flags;
 
+	curr_rq_lock_irq_save(&flags);
 	counter->hw_ops->read(counter);
+	curr_rq_unlock_irq_restore(&flags);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.read		= cpu_clock_perf_counter_read,
 };
 
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+/*
+ * Called from within the scheduler:
+ */
+static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 {
-	u64 prev, now;
+	struct task_struct *curr = counter->task;
+	u64 delta;
+
+	WARN_ON_ONCE(counter->task != current);
+
+	delta = __task_delta_exec(curr, update);
+
+	return curr->se.sum_exec_runtime + delta;
+}
+
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+{
+	u64 prev;
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = current->se.sum_exec_runtime;
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	task_clock_perf_counter_update(counter);
+	u64 now = task_clock_perf_counter_val(counter, 1);
+
+	task_clock_perf_counter_update(counter, now);
 }
 
 static void task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime);
+	u64 now = task_clock_perf_counter_val(counter, 0);
+
+	atomic64_set(&counter->hw.prev_count, now);
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	task_clock_perf_counter_update(counter);
+	u64 now = task_clock_perf_counter_val(counter, 0);
+
+	task_clock_perf_counter_update(counter, now);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child,
 {
 	struct perf_counter *parent_counter;
 	u64 parent_val, child_val;
+	unsigned long flags;
 	u64 perf_flags;
 
 	/*
@@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	 * Be careful about zapping the list - IRQ/NMI context
 	 * could still be processing it:
 	 */
-	local_irq_disable();
+	curr_rq_lock_irq_save(&flags);
 	perf_flags = hw_perf_save_disable();
 
 	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
@@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	list_del_init(&child_counter->list_entry);
 
 	hw_perf_restore(perf_flags);
-	local_irq_enable();
+	curr_rq_unlock_irq_restore(&flags);
 
 	parent_counter = child_counter->parent;
 	/*
diff --git a/kernel/sched.c b/kernel/sched.c
index 382cfdb5e38d8..4d84ff4c8774b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
 	rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
+void curr_rq_lock_irq_save(unsigned long *flags)
+	__acquires(rq->lock)
+{
+	struct rq *rq;
+
+	local_irq_save(*flags);
+	rq = cpu_rq(smp_processor_id());
+	spin_lock(&rq->lock);
+}
+
+void curr_rq_unlock_irq_restore(unsigned long *flags)
+	__releases(rq->lock)
+{
+	struct rq *rq;
+
+	rq = cpu_rq(smp_processor_id());
+	spin_unlock(&rq->lock);
+	local_irq_restore(*flags);
+}
+
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
@@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
 		    struct task_struct *next)
 {
 	fire_sched_out_preempt_notifiers(prev, next);
-	perf_counter_task_sched_out(prev, cpu_of(rq));
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
 }
@@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
+/*
+ * Return any ns on the sched_clock that have not yet been banked in
+ * @p in case that task is currently running.
+ */
+unsigned long long __task_delta_exec(struct task_struct *p, int update)
+{
+	s64 delta_exec;
+	struct rq *rq;
+
+	rq = task_rq(p);
+	WARN_ON_ONCE(!runqueue_is_locked());
+	WARN_ON_ONCE(!task_current(rq, p));
+
+	if (update)
+		update_rq_clock(rq);
+
+	delta_exec = rq->clock - p->se.exec_start;
+
+	WARN_ON_ONCE(delta_exec < 0);
+
+	return delta_exec;
+}
+
 /*
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
@@ -4316,13 +4358,13 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
+	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
 #endif
-	perf_counter_task_tick(curr, cpu);
 }
 
 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -4512,6 +4554,7 @@ asmlinkage void __sched schedule(void)
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
+		perf_counter_task_sched_out(prev, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
-- 
GitLab


From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 19 Dec 2008 10:20:42 +0100
Subject: [PATCH 0044/2198] perfcounters: pull inherited counters

Change counter inheritance from a 'push' to a 'pull' model: instead of
child tasks pushing their final counts to the parent, reuse the wait4
infrastructure to pull counters as child tasks are exit-processed,
much like how cutime/cstime is collected.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h |  9 +++++++++
 kernel/exit.c             | 21 +++++++++------------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd8909b9e52..54fa2fa2c8e4b 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -113,6 +113,14 @@ extern struct group_info init_groups;
 # define CAP_INIT_BSET  CAP_INIT_EFF_SET
 #endif
 
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk)					\
+	.perf_counter_ctx.counter_list =				\
+		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -180,6 +188,7 @@ extern struct group_info init_groups;
 	INIT_IDS							\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
+	INIT_PERF_COUNTERS(tsk)						\
 }
 
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 244edfd96865e..101b7eeff44c8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 {
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
+#ifdef CONFIG_PERF_COUNTERS
+	WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+#endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
 }
@@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	forget_original_parent(tsk);
 	exit_task_namespaces(tsk);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	write_lock_irq(&tasklist_lock);
 	if (group_dead)
 		kill_orphaned_pgrp(tsk->group_leader, NULL);
@@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code)
 	if (tsk->splice_pipe)
 		__free_pipe_info(tsk->splice_pipe);
 
-	/*
-	 * These must happen late, after the PID is not
-	 * hashed anymore, but still at a point that may sleep:
-	 */
-	perf_counter_exit_task(tsk);
-
 	preempt_disable();
 	/* causes final put_task_struct in finish_task_switch(). */
 	tsk->state = TASK_DEAD;
@@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
-- 
GitLab


From 78b6084c907cea15bb40a564b974e072f5163781 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 15:07:49 +0100
Subject: [PATCH 0045/2198] perfcounters: fix init context lock

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 54fa2fa2c8e4b..467cff545c308 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -116,7 +116,9 @@ extern struct group_info init_groups;
 #ifdef CONFIG_PERF_COUNTERS
 # define INIT_PERF_COUNTERS(tsk)					\
 	.perf_counter_ctx.counter_list =				\
-		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),
+		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),	\
+	.perf_counter_ctx.lock =					\
+		__SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
 #else
 # define INIT_PERF_COUNTERS(tsk)
 #endif
-- 
GitLab


From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 13:50:42 +0100
Subject: [PATCH 0046/2198] perfcounters: enable lowlevel pmc code to schedule
 counters

Allow lowlevel ->enable() op to return an error if a counter can not be
added. This can be used to handle counter constraints.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  6 ++-
 include/linux/perf_counter.h       |  2 +-
 kernel/perf_counter.c              | 62 ++++++++++++++++++++++--------
 3 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b675571214250..74090a393a7ce 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc)
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static void pmc_generic_enable(struct perf_counter *counter)
+static int pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
 	/* Try to get the previous counter again */
 	if (test_and_set_bit(idx, cpuc->used)) {
 		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+		if (idx == nr_counters_generic)
+			return -EAGAIN;
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter)
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
+
+	return 0;
 }
 
 void perf_counter_print_debug(void)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 48f76d2e54c2a..53af11d3767bf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -128,7 +128,7 @@ struct perf_counter;
  * struct hw_perf_counter_ops - performance counter hw ops
  */
 struct hw_perf_counter_ops {
-	void (*enable)			(struct perf_counter *counter);
+	int (*enable)			(struct perf_counter *counter);
 	void (*disable)			(struct perf_counter *counter);
 	void (*read)			(struct perf_counter *counter);
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f1110ac1267bc..2e73929a69595 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	cpuctx->task_ctx = NULL;
 }
 
-static void
+static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
 	if (counter->state == PERF_COUNTER_STATE_OFF)
-		return;
+		return 0;
+
+	if (counter->hw_ops->enable(counter))
+		return -EAGAIN;
 
-	counter->hw_ops->enable(counter);
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
 
 	cpuctx->active_oncpu++;
 	ctx->nr_active++;
+
+	return 0;
 }
 
 static int
@@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter,
 	       struct perf_counter_context *ctx,
 	       int cpu)
 {
-	struct perf_counter *counter;
-	int was_group = 0;
+	struct perf_counter *counter, *partial_group;
+	int ret = 0;
 
-	counter_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+		return -EAGAIN;
 
 	/*
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		counter_sched_in(counter, cpuctx, ctx, cpu);
-		was_group = 1;
+		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+			partial_group = counter;
+			goto group_error;
+		}
+		ret = -EAGAIN;
 	}
 
-	return was_group;
+	return ret;
+
+group_error:
+	/*
+	 * Groups can be scheduled in as one unit only, so undo any
+	 * partial group before returning:
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		if (counter == partial_group)
+			break;
+		counter_sched_out(counter, cpuctx, ctx);
+	}
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	return -EAGAIN;
 }
 
 /*
@@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 
 	spin_lock(&ctx->lock);
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (ctx->nr_active == cpuctx->max_pertask)
-			break;
-
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of counters:
@@ -856,8 +875,9 @@ static const struct file_operations perf_fops = {
 	.poll			= perf_poll,
 };
 
-static void cpu_clock_perf_counter_enable(struct perf_counter *counter)
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 {
+	return 0;
 }
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
@@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
 	task_clock_perf_counter_update(counter, now);
 }
 
-static void task_clock_perf_counter_enable(struct perf_counter *counter)
+static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
 	u64 now = task_clock_perf_counter_val(counter, 0);
 
 	atomic64_set(&counter->hw.prev_count, now);
+
+	return 0;
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
@@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 	page_faults_perf_counter_update(counter);
 }
 
-static void page_faults_perf_counter_enable(struct perf_counter *counter)
+static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
 	/*
 	 * page-faults is a per-task value already,
 	 * so we dont have to clear it on switch-in.
 	 */
+
+	return 0;
 }
 
 static void page_faults_perf_counter_disable(struct perf_counter *counter)
@@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 	context_switches_perf_counter_update(counter);
 }
 
-static void context_switches_perf_counter_enable(struct perf_counter *counter)
+static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
 	/*
 	 * ->nvcsw + curr->nivcsw is a per-task value already,
 	 * so we dont have to clear it on switch-in.
 	 */
+
+	return 0;
 }
 
 static void context_switches_perf_counter_disable(struct perf_counter *counter)
@@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 	cpu_migrations_perf_counter_update(counter);
 }
 
-static void cpu_migrations_perf_counter_enable(struct perf_counter *counter)
+static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
 	/*
 	 * se.nr_migrations is a per-task value already,
 	 * so we dont have to clear it on switch-in.
 	 */
+
+	return 0;
 }
 
 static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-- 
GitLab


From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:28:12 +0100
Subject: [PATCH 0047/2198] x86, perfcounters: print out the ->used bitmask

Impact: extend debug printouts

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 74090a393a7ce..f3359c2b39107 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
 		if (idx == nr_counters_generic)
 			return -EAGAIN;
+
 		set_bit(idx, cpuc->used);
 		hwc->idx = idx;
 	}
@@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
 	if (!nr_counters_generic)
@@ -282,6 +284,7 @@ void perf_counter_print_debug(void)
 	local_irq_disable();
 
 	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
@@ -291,6 +294,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
-- 
GitLab


From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:29:25 +0100
Subject: [PATCH 0048/2198] perfcounters: remove ->nr_inherited

Impact: remove dead code

nr_inherited was not maintained correctly (not decremented) - and also
not used - remove it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 1 -
 kernel/perf_counter.c        | 1 -
 2 files changed, 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 53af11d3767bf..1ea08e9f31ceb 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -164,7 +164,6 @@ struct perf_counter {
 	struct task_struct		*task;
 	struct file			*filp;
 
-	unsigned int			nr_inherited;
 	struct perf_counter		*parent;
 	/*
 	 * Protect attach/detach:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2e73929a69595..48e1dbcdc1cd9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter,
 	child_ctx->nr_counters++;
 
 	child_counter->parent = parent_counter;
-	parent_counter->nr_inherited++;
 	/*
 	 * inherit into child's child as well:
 	 */
-- 
GitLab


From 235c7fc7c500e4fd1700c4ad01b5612bcdc1b449 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 21 Dec 2008 14:43:25 +0100
Subject: [PATCH 0049/2198] perfcounters: generalize the counter scheduler

Impact: clean up and refactor code

refactor the counter scheduler: separate out in/out functions and
introduce a counter-rotation function as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 220 +++++++++++++++++++++++++++---------------
 1 file changed, 142 insertions(+), 78 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 48e1dbcdc1cd9..d7a79f321b1c3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -111,11 +111,12 @@ static void __perf_counter_remove_from_context(void *info)
 	spin_lock(&ctx->lock);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->hw_ops->disable(counter);
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->hw_ops->disable(counter);
 		ctx->nr_active--;
 		cpuctx->active_oncpu--;
 		counter->task = NULL;
+		counter->oncpu = -1;
 	}
 	ctx->nr_counters--;
 
@@ -192,8 +193,36 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
+static int
+counter_sched_in(struct perf_counter *counter,
+		 struct perf_cpu_context *cpuctx,
+		 struct perf_counter_context *ctx,
+		 int cpu)
+{
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
+	/*
+	 * The new state must be visible before we turn it on in the hardware:
+	 */
+	smp_wmb();
+
+	if (counter->hw_ops->enable(counter)) {
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->oncpu = -1;
+		return -EAGAIN;
+	}
+
+	cpuctx->active_oncpu++;
+	ctx->nr_active++;
+
+	return 0;
+}
+
 /*
- * Cross CPU call to install and enable a preformance counter
+ * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
 {
@@ -220,22 +249,17 @@ static void __perf_install_in_context(void *info)
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
 	perf_flags = hw_perf_save_disable();
-	list_add_counter(counter, ctx);
-	hw_perf_restore(perf_flags);
 
+	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
-	if (cpuctx->active_oncpu < perf_max_counters) {
-		counter->state = PERF_COUNTER_STATE_ACTIVE;
-		counter->oncpu = cpu;
-		ctx->nr_active++;
-		cpuctx->active_oncpu++;
-		counter->hw_ops->enable(counter);
-	}
+	counter_sched_in(counter, cpuctx, ctx, cpu);
 
 	if (!ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
+	hw_perf_restore(perf_flags);
+
 	spin_unlock(&ctx->lock);
 	curr_rq_unlock_irq_restore(&flags);
 }
@@ -302,8 +326,8 @@ counter_sched_out(struct perf_counter *counter,
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return;
 
-	counter->hw_ops->disable(counter);
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
 	cpuctx->active_oncpu--;
@@ -326,6 +350,22 @@ group_sched_out(struct perf_counter *group_counter,
 		counter_sched_out(counter, cpuctx, ctx);
 }
 
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+			      struct perf_cpu_context *cpuctx)
+{
+	struct perf_counter *counter;
+
+	if (likely(!ctx->nr_counters))
+		return;
+
+	spin_lock(&ctx->lock);
+	if (ctx->nr_active) {
+		list_for_each_entry(counter, &ctx->counter_list, list_entry)
+			group_sched_out(counter, cpuctx, ctx);
+	}
+	spin_unlock(&ctx->lock);
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -341,39 +381,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = &task->perf_counter_ctx;
-	struct perf_counter *counter;
 
 	if (likely(!cpuctx->task_ctx))
 		return;
 
-	spin_lock(&ctx->lock);
-	if (ctx->nr_active) {
-		list_for_each_entry(counter, &ctx->counter_list, list_entry)
-			group_sched_out(counter, cpuctx, ctx);
-	}
-	spin_unlock(&ctx->lock);
+	__perf_counter_sched_out(ctx, cpuctx);
+
 	cpuctx->task_ctx = NULL;
 }
 
-static int
-counter_sched_in(struct perf_counter *counter,
-		 struct perf_cpu_context *cpuctx,
-		 struct perf_counter_context *ctx,
-		 int cpu)
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
-	if (counter->state == PERF_COUNTER_STATE_OFF)
-		return 0;
-
-	if (counter->hw_ops->enable(counter))
-		return -EAGAIN;
-
-	counter->state = PERF_COUNTER_STATE_ACTIVE;
-	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */
-
-	cpuctx->active_oncpu++;
-	ctx->nr_active++;
-
-	return 0;
+	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 }
 
 static int
@@ -416,21 +435,10 @@ group_sched_in(struct perf_counter *group_counter,
 	return -EAGAIN;
 }
 
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+			struct perf_cpu_context *cpuctx, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
 	struct perf_counter *counter;
 
 	if (likely(!ctx->nr_counters))
@@ -453,10 +461,35 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 			break;
 	}
 	spin_unlock(&ctx->lock);
+}
 
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+	__perf_counter_sched_in(ctx, cpuctx, cpu);
 	cpuctx->task_ctx = ctx;
 }
 
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+
+	__perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
 int perf_counter_task_disable(void)
 {
 	struct task_struct *curr = current;
@@ -514,6 +547,8 @@ int perf_counter_task_enable(void)
 	/* force the update of the task clock: */
 	__task_delta_exec(curr, 1);
 
+	perf_counter_task_sched_out(curr, cpu);
+
 	spin_lock(&ctx->lock);
 
 	/*
@@ -538,19 +573,18 @@ int perf_counter_task_enable(void)
 	return 0;
 }
 
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
 {
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
 	u64 perf_flags;
 
-	if (likely(!ctx->nr_counters))
+	if (!ctx->nr_counters)
 		return;
 
-	perf_counter_task_sched_out(curr, cpu);
-
 	spin_lock(&ctx->lock);
-
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
@@ -563,7 +597,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	const int rotate_percpu = 0;
+
+	if (rotate_percpu)
+		perf_counter_cpu_sched_out(cpuctx);
+	perf_counter_task_sched_out(curr, cpu);
 
+	if (rotate_percpu)
+		rotate_ctx(&cpuctx->ctx);
+	rotate_ctx(ctx);
+
+	if (rotate_percpu)
+		perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
 }
 
@@ -905,8 +956,6 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
 	struct task_struct *curr = counter->task;
 	u64 delta;
 
-	WARN_ON_ONCE(counter->task != current);
-
 	delta = __task_delta_exec(curr, update);
 
 	return curr->se.sum_exec_runtime + delta;
@@ -1160,6 +1209,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
@@ -1331,35 +1381,49 @@ __perf_counter_exit_task(struct task_struct *child,
 {
 	struct perf_counter *parent_counter;
 	u64 parent_val, child_val;
-	unsigned long flags;
-	u64 perf_flags;
 
 	/*
-	 * Disable and unlink this counter.
-	 *
-	 * Be careful about zapping the list - IRQ/NMI context
-	 * could still be processing it:
+	 * If we do not self-reap then we have to wait for the
+	 * child task to unschedule (it will happen for sure),
+	 * so that its counter is at its final count. (This
+	 * condition triggers rarely - child tasks usually get
+	 * off their CPU before the parent has a chance to
+	 * get this far into the reaping action)
 	 */
-	curr_rq_lock_irq_save(&flags);
-	perf_flags = hw_perf_save_disable();
-
-	if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+	if (child != current) {
+		wait_task_inactive(child, 0);
+		list_del_init(&child_counter->list_entry);
+	} else {
 		struct perf_cpu_context *cpuctx;
+		unsigned long flags;
+		u64 perf_flags;
+
+		/*
+		 * Disable and unlink this counter.
+		 *
+		 * Be careful about zapping the list - IRQ/NMI context
+		 * could still be processing it:
+		 */
+		curr_rq_lock_irq_save(&flags);
+		perf_flags = hw_perf_save_disable();
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		child_counter->hw_ops->disable(child_counter);
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-		child_counter->oncpu = -1;
+		if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+			child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+			child_counter->hw_ops->disable(child_counter);
+			cpuctx->active_oncpu--;
+			child_ctx->nr_active--;
+			child_counter->oncpu = -1;
+		}
 
-		cpuctx->active_oncpu--;
-		child_ctx->nr_active--;
-	}
+		list_del_init(&child_counter->list_entry);
 
-	list_del_init(&child_counter->list_entry);
+		child_ctx->nr_counters--;
 
-	hw_perf_restore(perf_flags);
-	curr_rq_unlock_irq_restore(&flags);
+		hw_perf_restore(perf_flags);
+		curr_rq_unlock_irq_restore(&flags);
+	}
 
 	parent_counter = child_counter->parent;
 	/*
-- 
GitLab


From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 23 Dec 2008 12:17:29 +0100
Subject: [PATCH 0050/2198] perfcounters: add PERF_COUNT_BUS_CYCLES

Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref
on x86. (which is a good enough approximation)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 3 ++-
 include/linux/perf_counter.h       | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f3359c2b39107..86b2fdd344a62 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
 static const int intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CYCLES]			= 0x003c,
+  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
   [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
   [PERF_COUNT_CACHE_MISSES]		= 0x412e,
   [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
 static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1ea08e9f31ceb..ec77d1643d370 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -36,14 +36,15 @@ enum hw_event_types {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CYCLES		=  0,
+	PERF_COUNT_CPU_CYCLES		=  0,
 	PERF_COUNT_INSTRUCTIONS		=  1,
 	PERF_COUNT_CACHE_REFERENCES	=  2,
 	PERF_COUNT_CACHE_MISSES		=  3,
 	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
 	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
 
-	PERF_HW_EVENTS_MAX		=  6,
+	PERF_HW_EVENTS_MAX		=  7,
 
 	/*
 	 * Special "software" counters provided by the kernel, even if
-- 
GitLab


From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 22 Dec 2008 11:10:42 +0100
Subject: [PATCH 0051/2198] x86, perfcounters: add support for fixed-function
 pmcs

Impact: extend performance counter support on x86 Intel CPUs

Modern Intel CPUs have 3 "fixed-function" performance counters, which
count these hardware events:

    Instr_Retired.Any
    CPU_CLK_Unhalted.Core
    CPU_CLK_Unhalted.Ref

Add support for them to the performance counters subsystem.

Their use is transparent to user-space: the counter scheduler is
extended to automatically recognize the cases where a fixed-function
PMC can be utilized instead of a generic PMC. In such cases the
generic PMC is kept available for more counters.

The above fixed-function events map to these generic counter hw events:

        PERF_COUNT_INSTRUCTIONS
        PERF_COUNT_CPU_CYCLES
        PERF_COUNT_BUS_CYCLES

(The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed
 frequency.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h |   8 ++
 arch/x86/kernel/cpu/perf_counter.c  | 149 +++++++++++++++++++++++-----
 2 files changed, 133 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 13745deb16c8b..2e08ed736647e 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -23,6 +23,11 @@
 #define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17)
 #define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16)
 
+/*
+ * Includes eventsel and unit mask as well:
+ */
+#define ARCH_PERFMON_EVENT_MASK				    0xffff
+
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8)
 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0
@@ -69,12 +74,15 @@ union cpuid10_edx {
 
 /* Instr_Retired.Any: */
 #define MSR_ARCH_PERFMON_FIXED_CTR0			0x309
+#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0)
 
 /* CPU_CLK_Unhalted.Core: */
 #define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a
+#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1)
 
 /* CPU_CLK_Unhalted.Ref: */
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
+#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 86b2fdd344a62..da46eca125432 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly;
  */
 static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
+static u64 counter_value_mask __read_mostly;
 
 static int nr_counters_fixed __read_mostly;
 
@@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			hwc->nmi = 1;
 	}
 
-	hwc->config_base	= MSR_ARCH_PERFMON_EVENTSEL0;
-	hwc->counter_base	= MSR_ARCH_PERFMON_PERFCTR0;
-
 	hwc->irq_period		= hw_event->irq_period;
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
@@ -183,16 +181,34 @@ void hw_perf_restore(u64 ctrl)
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
+static inline void
+__pmc_fixed_disable(struct perf_counter *counter,
+		    struct hw_perf_counter *hwc, unsigned int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, mask;
+	int err;
+
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
 static inline void
 __pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
 	int err;
 
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+		return __pmc_fixed_disable(counter, hwc, idx);
+
 	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
-static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]);
+static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
 
 /*
  * Set the next IRQ period, based on the hwc->period_left value.
@@ -202,8 +218,9 @@ static void
 __hw_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
-	s32 left = atomic64_read(&hwc->period_left);
+	s64 left = atomic64_read(&hwc->period_left);
 	s32 period = hwc->irq_period;
+	int err;
 
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
@@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 	 * The hw counter starts counting from this counter offset,
 	 * mark it to be able to extra future deltas:
 	 */
-	atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+	atomic64_set(&hwc->prev_count, (u64)-left);
 
-	wrmsr(hwc->counter_base + idx, -left, 0);
+	err = checking_wrmsrl(hwc->counter_base + idx,
+			     (u64)(-left) & counter_value_mask);
+}
+
+static inline void
+__pmc_fixed_enable(struct perf_counter *counter,
+		   struct hw_perf_counter *hwc, unsigned int __idx)
+{
+	int idx = __idx - X86_PMC_IDX_FIXED;
+	u64 ctrl_val, bits, mask;
+	int err;
+
+	/*
+	 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
+	 * and enable ring-0 counting if allowed:
+	 */
+	bits = 0x8ULL | 0x2ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+		bits |= 0x1;
+	bits <<= (idx * 4);
+	mask = 0xfULL << (idx * 4);
+
+	rdmsrl(hwc->config_base, ctrl_val);
+	ctrl_val &= ~mask;
+	ctrl_val |= bits;
+	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
 static void
 __pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
+		return __pmc_fixed_enable(counter, hwc, idx);
+
 	wrmsr(hwc->config_base + idx,
 	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
-static int fixed_mode_idx(struct hw_perf_counter *hwc)
+static int
+fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
+	unsigned int event;
+
+	if (unlikely(hwc->nmi))
+		return -1;
+
+	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
+
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
+		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
+		return X86_PMC_IDX_FIXED_CPU_CYCLES;
+	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
+		return X86_PMC_IDX_FIXED_BUS_CYCLES;
+
 	return -1;
 }
 
@@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
-	int idx = hwc->idx;
+	int idx;
 
-	/* Try to get the previous counter again */
-	if (test_and_set_bit(idx, cpuc->used)) {
-		idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
-		if (idx == nr_counters_generic)
-			return -EAGAIN;
+	idx = fixed_mode_idx(counter, hwc);
+	if (idx >= 0) {
+		/*
+		 * Try to get the fixed counter, if that is already taken
+		 * then try to get a generic counter:
+		 */
+		if (test_and_set_bit(idx, cpuc->used))
+			goto try_generic;
 
-		set_bit(idx, cpuc->used);
+		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
+		/*
+		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to
+		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
+		 */
+		hwc->counter_base =
+			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
 		hwc->idx = idx;
+	} else {
+		idx = hwc->idx;
+		/* Try to get the previous generic counter again */
+		if (test_and_set_bit(idx, cpuc->used)) {
+try_generic:
+			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
+			if (idx == nr_counters_generic)
+				return -EAGAIN;
+
+			set_bit(idx, cpuc->used);
+			hwc->idx = idx;
+		}
+		hwc->config_base  = MSR_ARCH_PERFMON_EVENTSEL0;
+		hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter)
 	__pmc_generic_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
+	/*
+	 * Make it visible before enabling the hw:
+	 */
+	smp_wmb();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 
 void perf_counter_print_debug(void)
 {
-	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
+	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
 	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
@@ -290,11 +377,13 @@ void perf_counter_print_debug(void)
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
 
 	printk(KERN_INFO "\n");
 	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
+	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
@@ -303,13 +392,19 @@ void perf_counter_print_debug(void)
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
-		printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
-		printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
+	for (idx = 0; idx < nr_counters_fixed; idx++) {
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
+
+		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
+			cpu, idx, pmc_count);
+	}
 	local_irq_enable();
 }
 
@@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter)
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
+	/*
+	 * Make sure the cleared pointer becomes visible before we
+	 * (potentially) free the counter:
+	 */
+	smp_wmb();
 
 	/*
 	 * Drain the remaining delta count out of a counter
@@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
-	u64 pmc_ctrl;
-
-	rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
 	x86_perf_counter_update(counter, hwc, idx);
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
-	if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		__pmc_generic_enable(counter, hwc, idx);
 }
 
@@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	 * Store sibling timestamps (if any):
 	 */
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+
 		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 		perf_store_irq_data(sibling, counter->hw_event.type);
 		perf_store_irq_data(sibling, atomic64_read(&counter->count));
@@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 again:
 	ack = status;
-	for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) {
+	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
@@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void)
 	perf_max_counters = nr_counters_generic;
 
 	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
+
 	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
 
 	nr_counters_fixed = edx.split.num_counters_fixed;
-- 
GitLab


From e44aef58ecfbe061eb4c53b939bcc35fb1bee82d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 25 Dec 2008 09:02:11 +0100
Subject: [PATCH 0052/2198] perfcounters: include asm/perf_counter.h only if
 CONFIG_PERF_COUNTERS=y

Impact: build fix on ia64

KOSAKI Motohiro reported that -tip doesnt build on ia64 because
asm/perf_counter.h only exists on x86 for now. Fix it.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index ec77d1643d370..cc3a75a239a9a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,7 +14,10 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
-#include <asm/perf_counter.h>
+
+#ifdef CONFIG_PERF_COUNTERS
+# include <asm/perf_counter.h>
+#endif
 
 #include <linux/list.h>
 #include <linux/mutex.h>
-- 
GitLab


From 01ea1ccaa24dea3552e103be13b7897211607a8b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 26 Dec 2008 21:05:06 -0800
Subject: [PATCH 0053/2198] perf_counter: more barrier in blank weak function

Impact: fix panic possible panic

Some versions of GCC inline the weak global function if it's empty.
Add a barrier() to work it around.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d7a79f321b1c3..37f771691f938 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -45,8 +45,8 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)		{ }
-void __weak hw_perf_counter_setup(void)		{ }
+void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
+void __weak hw_perf_counter_setup(void)		{ barrier(); }
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-- 
GitLab


From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@infradead.org>
Date: Sat, 27 Dec 2008 19:15:43 +0530
Subject: [PATCH 0054/2198] x86: perf_counter remove unwanted
 hw_perf_enable_all

Impact: clean, reduce kernel size a bit, avoid sparse warnings

Fixes sparse warnings:

 arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static?
 arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression
 arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression
 arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index da46eca125432..9376771f757b2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-void hw_perf_enable_all(void)
-{
-	if (unlikely(!perf_counters_initialized))
-		return;
-
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask);
-}
-
 u64 hw_perf_save_disable(void)
 {
 	u64 ctrl;
@@ -200,12 +192,10 @@ static inline void
 __pmc_generic_disable(struct perf_counter *counter,
 			   struct hw_perf_counter *hwc, unsigned int idx)
 {
-	int err;
-
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		return __pmc_fixed_disable(counter, hwc, idx);
-
-	err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+		__pmc_fixed_disable(counter, hwc, idx);
+	else
+		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter,
 			  struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		return __pmc_fixed_enable(counter, hwc, idx);
-
-	wrmsr(hwc->config_base + idx,
-	      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+		__pmc_fixed_enable(counter, hwc, idx);
+	else
+		wrmsr(hwc->config_base + idx,
+		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
 static int
-- 
GitLab


From ff6f05416ece2caec1a7a1f8180d6598e0ab9272 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:19:25 +1100
Subject: [PATCH 0055/2198] perf_counter: Fix return value from dummy
 hw_perf_counter_init

Impact: fix oops-causing bug

Currently, if you try to use perf_counters on an architecture that has
no hardware support, and you select an event that doesn't map to any of
the defined software counters, you get an oops rather than an error.
This is because the dummy hw_perf_counter_init returns ERR_PTR(-EINVAL)
but the caller (perf_counter_alloc) only tests for NULL.

This makes the dummy hw_perf_counter_init return NULL instead.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 37f771691f938..4be1a8d872b43 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -41,7 +41,7 @@ static DEFINE_MUTEX(perf_resource_mutex);
 extern __weak const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
-	return ERR_PTR(-EINVAL);
+	return NULL;
 }
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
-- 
GitLab


From 9abf8a08bc8f18a3b125f834f00e2e71b49c15d2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:26:43 +1100
Subject: [PATCH 0056/2198] perf_counter: Fix the cpu_clock software counter

Impact: bug fix

Currently if you do (e.g.) timec -e -1 ls, it will report 0 for the
value of the cpu_clock counter.  The reason is that the core assumes
that a counter's count field is up-to-date when the counter is inactive,
and doesn't call the counter's read function.  However, the cpu_clock
counter code only updates the count in the read function.

This fixes it by making both the read and disable functions update the
count.  It also makes the counter ignore time passing while the counter
is disabled, by making the enable function update the hw.prev_count field.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/perf_counter.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4be1a8d872b43..b7a027a2ef02c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -928,18 +928,32 @@ static const struct file_operations perf_fops = {
 
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 {
+	int cpu = raw_smp_processor_id();
+
+	atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
 	return 0;
 }
 
+static void cpu_clock_perf_counter_update(struct perf_counter *counter)
+{
+	int cpu = raw_smp_processor_id();
+	s64 prev;
+	u64 now;
+
+	now = cpu_clock(cpu);
+	prev = atomic64_read(&counter->hw.prev_count);
+	atomic64_set(&counter->hw.prev_count, now);
+	atomic64_add(now - prev, &counter->count);
+}
+
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
+	cpu_clock_perf_counter_update(counter);
 }
 
 static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
-	int cpu = raw_smp_processor_id();
-
-	atomic64_set(&counter->count, cpu_clock(cpu));
+	cpu_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
-- 
GitLab


From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:43:42 +1100
Subject: [PATCH 0057/2198] perf_counter: Add optional hw_perf_group_sched_in
 arch function

Impact: extend perf_counter infrastructure

This adds an optional hw_perf_group_sched_in() arch function that enables
a whole group of counters in one go.  It returns 1 if it added the group
successfully, 0 if it did nothing (and therefore the core needs to add
the counters individually), or a negative number if an error occurred.
It should add all the counters and enable any software counters in the
group, or else add none of them and return an error.

There are a couple of related changes/improvements in the group handling
here:

* As an optimization, group_sched_out() and group_sched_in() now check the
  state of the group leader, and do nothing if the leader is not active
  or disabled.

* We now call hw_perf_save_disable/hw_perf_restore around the complete
  set of counter enable/disable calls in __perf_counter_sched_in/out,
  to give the arch code the opportunity to defer updating the hardware
  state until the hw_perf_restore call if it wants.

* We no longer stop adding groups after we get to a group that has more
  than one counter.  We will ultimately add an option for a group to be
  exclusive.  The current code doesn't really implement exclusive groups
  anyway, since a group could end up going on with other counters that
  get added before it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/perf_counter.h |  3 +++
 kernel/perf_counter.c        | 31 ++++++++++++++++++++++++++-----
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index cc3a75a239a9a..b21d1ea4c054d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -236,6 +236,9 @@ extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
+extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu);
 
 #else
 static inline void
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b7a027a2ef02c..9ad11e44d9ab5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -47,6 +47,12 @@ hw_perf_counter_init(struct perf_counter *counter)
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
 void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
 void __weak hw_perf_counter_setup(void)		{ barrier(); }
+int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	return 0;
+}
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
@@ -341,6 +347,9 @@ group_sched_out(struct perf_counter *group_counter,
 {
 	struct perf_counter *counter;
 
+	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
 	counter_sched_out(group_counter, cpuctx, ctx);
 
 	/*
@@ -354,15 +363,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
 	struct perf_counter *counter;
+	u64 flags;
 
 	if (likely(!ctx->nr_counters))
 		return;
 
 	spin_lock(&ctx->lock);
+	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
 		list_for_each_entry(counter, &ctx->counter_list, list_entry)
 			group_sched_out(counter, cpuctx, ctx);
 	}
+	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
 }
 
@@ -402,7 +414,14 @@ group_sched_in(struct perf_counter *group_counter,
 	       int cpu)
 {
 	struct perf_counter *counter, *partial_group;
-	int ret = 0;
+	int ret;
+
+	if (group_counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (ret)
+		return ret < 0 ? ret : 0;
 
 	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 		return -EAGAIN;
@@ -415,10 +434,9 @@ group_sched_in(struct perf_counter *group_counter,
 			partial_group = counter;
 			goto group_error;
 		}
-		ret = -EAGAIN;
 	}
 
-	return ret;
+	return 0;
 
 group_error:
 	/*
@@ -440,11 +458,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct perf_counter *counter;
+	u64 flags;
 
 	if (likely(!ctx->nr_counters))
 		return;
 
 	spin_lock(&ctx->lock);
+	flags = hw_perf_save_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
@@ -454,12 +474,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 			continue;
 
 		/*
-		 * If we scheduled in a group atomically and
-		 * exclusively, break out:
+		 * If we scheduled in a group atomically and exclusively,
+		 * or if this group can't go on, break out:
 		 */
 		if (group_sched_in(counter, cpuctx, ctx, cpu))
 			break;
 	}
+	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
 }
 
-- 
GitLab


From 4eb96fcfe07b7f2a05577e57533840f8e26bea53 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 17:24:34 +1100
Subject: [PATCH 0058/2198] perf_counter: Add dummy perf_counter_print_debug
 function

Impact: minimize requirements on architectures

Currently, an architecture just enabling CONFIG_PERF_COUNTERS but not
providing any extra functions will fail to build with
perf_counter_print_debug being undefined, since we don't provide an
empty dummy definition like we do with the hw_perf_* functions.

This provides an empty dummy perf_counter_print_debug() to make it
easier for architectures to turn on CONFIG_PERF_COUNTERS.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 9ad11e44d9ab5..4c0dccb756adf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -54,6 +54,8 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 	return 0;
 }
 
+void __weak perf_counter_print_debug(void)	{ }
+
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
-- 
GitLab


From d662ed26734473d4cb5f3d78cebfec8f9126e97c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 17:01:53 +1100
Subject: [PATCH 0059/2198] powerpc/perf_counter: Add perf_counter system call
 on powerpc

... with an empty/dummy asm/perf_counter.h so it builds.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/perf_counter.h | 10 ++++++++++
 arch/powerpc/include/asm/systbl.h       |  1 +
 arch/powerpc/include/asm/unistd.h       |  3 ++-
 arch/powerpc/platforms/Kconfig.cputype  |  1 +
 4 files changed, 14 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/perf_counter.h

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
new file mode 100644
index 0000000000000..59530ae1d53c5
--- /dev/null
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -0,0 +1,10 @@
+/*
+ * Performance counter support - PowerPC-specific definitions.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 803def236654a..da300c4d28889 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
+SYSCALL(perf_counter_open)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index e07d0c76ed779..7cef5afe89d84 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -341,10 +341,11 @@
 #define __NR_dup3		316
 #define __NR_pipe2		317
 #define __NR_inotify_init1	318
+#define __NR_perf_counter_open	319
 
 #ifdef __KERNEL__
 
-#define __NR_syscalls		319
+#define __NR_syscalls		320
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype
index 3d0c776f888d8..94dd1fb9a0043 100644
--- a/arch/powerpc/platforms/Kconfig.cputype
+++ b/arch/powerpc/platforms/Kconfig.cputype
@@ -1,6 +1,7 @@
 config PPC64
 	bool "64-bit kernel"
 	default n
+	select HAVE_PERF_COUNTERS
 	help
 	  This option selects whether a 32-bit or a 64-bit kernel
 	  will be built.
-- 
GitLab


From 93a6d3ce6962044fe9badf528fed46b455d58292 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 16:52:19 +1100
Subject: [PATCH 0060/2198] powerpc: Provide a way to defer perf counter work
 until interrupts are enabled

Because 64-bit powerpc uses lazy (soft) interrupt disabling, it is
possible for a performance monitor exception to come in when the
kernel thinks interrupts are disabled (i.e. when they are
soft-disabled but hard-enabled).  In such a situation the performance
monitor exception handler might have some processing to do (such as
process wakeups) which can't be done in what is effectively an NMI
handler.

This provides a way to defer that work until interrupts get enabled,
either in raw_local_irq_restore() or by returning from an interrupt
handler to code that had interrupts enabled.  We have a per-processor
flag that indicates that there is work pending to do when interrupts
subsequently get re-enabled.  This flag is checked in the interrupt
return path and in raw_local_irq_restore(), and if it is set,
perf_counter_do_pending() is called to do the pending work.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_irq.h | 31 +++++++++++++++++++++++++++++++
 arch/powerpc/include/asm/paca.h   |  1 +
 arch/powerpc/kernel/asm-offsets.c |  1 +
 arch/powerpc/kernel/entry_64.S    |  9 +++++++++
 arch/powerpc/kernel/irq.c         | 10 ++++++++++
 5 files changed, 52 insertions(+)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index f75a5fc64d2e6..e10f151c3db64 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags)
  */
 struct hw_interrupt_type;
 
+#ifdef CONFIG_PERF_COUNTERS
+static inline unsigned long get_perf_counter_pending(void)
+{
+	unsigned long x;
+
+	asm volatile("lbz %0,%1(13)"
+		: "=r" (x)
+		: "i" (offsetof(struct paca_struct, perf_counter_pending)));
+	return x;
+}
+
+static inline void set_perf_counter_pending(int x)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (x),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+extern void perf_counter_do_pending(void);
+
+#else
+
+static inline unsigned long get_perf_counter_pending(void)
+{
+	return 0;
+}
+
+static inline void set_perf_counter_pending(int x) {}
+static inline void perf_counter_do_pending(void) {}
+#endif /* CONFIG_PERF_COUNTERS */
+
 #endif	/* __KERNEL__ */
 #endif	/* _ASM_POWERPC_HW_IRQ_H */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index 082b3aedf145b..6ef055723019f 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -99,6 +99,7 @@ struct paca_struct {
 	u8 soft_enabled;		/* irq soft-enable flag */
 	u8 hard_enabled;		/* set if irqs are enabled in MSR */
 	u8 io_sync;			/* writel() needs spin_unlock sync */
+	u8 perf_counter_pending;	/* PM interrupt while soft-disabled */
 
 	/* Stuff for accurate time accounting */
 	u64 user_time;			/* accumulated usermode TB ticks */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 661d07d2146bc..cea462900119f 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -127,6 +127,7 @@ int main(void)
 	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));
 	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));
 	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled));
+	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));
 	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));
 	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));
 	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id));
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 383ed6eb00850..f30b4e553c53b 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)
 2:
 	TRACE_AND_RESTORE_IRQ(r5);
 
+#ifdef CONFIG_PERF_COUNTERS
+	/* check paca->perf_counter_pending if we're enabling ints */
+	lbz	r3,PACAPERFPEND(r13)
+	and.	r3,r3,r5
+	beq	27f
+	bl	.perf_counter_do_pending
+27:
+#endif /* CONFIG_PERF_COUNTERS */
+
 	/* extract EE bit and use it to restore paca->hard_enabled */
 	ld	r3,_MSR(r1)
 	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index ac222d0ab12e8..4efb886ea4393 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
+#ifdef CONFIG_PERF_COUNTERS
+notrace void __weak perf_counter_do_pending(void)
+{
+	set_perf_counter_pending(0);
+}
+#endif
+
 notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
@@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
+	if (get_perf_counter_pending())
+		perf_counter_do_pending();
+
 	/*
 	 * if (get_paca()->hard_enabled) return;
 	 * But again we need to take care that gcc gets hard_enabled directly
-- 
GitLab


From 4574910e5087085a1f330ff8373cee4503f5c77c Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 20:21:55 +1100
Subject: [PATCH 0061/2198] powerpc/perf_counter: Add generic support for
 POWER-family PMU hardware

This provides the architecture-specific functions needed to access
PMU hardware on the 64-bit PowerPC processors.  It has been designed
for the IBM POWER family (POWER 4/4+/5/5+/6 and PPC970) but will
hopefully also suit other 64-bit PowerPC machines (although probably
not Cell given how different it is in this area).  This doesn't
include back-ends for any specific processors.

This implements a system which allows back-ends to express the
constraints that their hardware has on what events can be counted
simultaneously.  The constraints are expressed as a 64-bit mask +
64-bit value for each event, and the encoding is capable of
expressing the constraints arising from having a set of multiplexers
feeding an event bus, with some events being available through
multiple multiplexer settings, such as we get on POWER4 and PPC970.
Furthermore, the back-end can supply alternative event codes for
each event, and the constraint checking code will try all possible
combinations of alternative event codes to try to find a combination
that will fit.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/perf_counter.h |  62 ++
 arch/powerpc/kernel/Makefile            |   1 +
 arch/powerpc/kernel/perf_counter.c      | 754 ++++++++++++++++++++++++
 3 files changed, 817 insertions(+)
 create mode 100644 arch/powerpc/kernel/perf_counter.c

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 59530ae1d53c5..9d7ff6d7fb566 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -8,3 +8,65 @@
  * as published by the Free Software Foundation; either version
  * 2 of the License, or (at your option) any later version.
  */
+#include <linux/types.h>
+
+#define MAX_HWCOUNTERS		8
+#define MAX_EVENT_ALTERNATIVES	8
+
+/*
+ * This struct provides the constants and functions needed to
+ * describe the PMU on a particular POWER-family CPU.
+ */
+struct power_pmu {
+	int	n_counter;
+	int	max_alternatives;
+	u64	add_fields;
+	u64	test_adder;
+	int	(*compute_mmcr)(unsigned int events[], int n_ev,
+				unsigned int hwc[], u64 mmcr[]);
+	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(unsigned int event, unsigned int alt[]);
+	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	n_generic;
+	int	*generic_events;
+};
+
+extern struct power_pmu *ppmu;
+
+/*
+ * The power_pmu.get_constraint function returns a 64-bit value and
+ * a 64-bit mask that express the constraints between this event and
+ * other events.
+ *
+ * The value and mask are divided up into (non-overlapping) bitfields
+ * of three different types:
+ *
+ * Select field: this expresses the constraint that some set of bits
+ * in MMCR* needs to be set to a specific value for this event.  For a
+ * select field, the mask contains 1s in every bit of the field, and
+ * the value contains a unique value for each possible setting of the
+ * MMCR* bits.  The constraint checking code will ensure that two events
+ * that set the same field in their masks have the same value in their
+ * value dwords.
+ *
+ * Add field: this expresses the constraint that there can be at most
+ * N events in a particular class.  A field of k bits can be used for
+ * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field
+ * set (and the other bits 0), and the value has only the least significant
+ * bit of the field set.  In addition, the 'add_fields' and 'test_adder'
+ * in the struct power_pmu for this processor come into play.  The
+ * add_fields value contains 1 in the LSB of the field, and the
+ * test_adder contains 2^(k-1) - 1 - N in the field.
+ *
+ * NAND field: this expresses the constraint that you may not have events
+ * in all of a set of classes.  (For example, on PPC970, you can't select
+ * events from the FPU, ISU and IDU simultaneously, although any two are
+ * possible.)  For N classes, the field is N+1 bits wide, and each class
+ * is assigned one bit from the least-significant N bits.  The mask has
+ * only the most-significant bit set, and the value has only the bit
+ * for the event's class set.  The test_adder has the least significant
+ * bit set in the field.
+ *
+ * If an event is not subject to the constraint expressed by a particular
+ * field, then it will have 0 in both the mask and value for that field.
+ */
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 1308a86e90708..fde190bbb2bd0 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
new file mode 100644
index 0000000000000..c7d4c2966a5c1
--- /dev/null
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -0,0 +1,754 @@
+/*
+ * Performance counter support - powerpc architecture code
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/perf_counter.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <asm/reg.h>
+#include <asm/pmc.h>
+
+struct cpu_hw_counters {
+	int n_counters;
+	int n_percpu;
+	int disabled;
+	int n_added;
+	struct perf_counter *counter[MAX_HWCOUNTERS];
+	unsigned int events[MAX_HWCOUNTERS];
+	u64 mmcr[3];
+};
+DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+
+struct power_pmu *ppmu;
+
+void perf_counter_print_debug(void)
+{
+}
+
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
+/*
+ * Read one performance monitor counter (PMC).
+ */
+static unsigned long read_pmc(int idx)
+{
+	unsigned long val;
+
+	switch (idx) {
+	case 1:
+		val = mfspr(SPRN_PMC1);
+		break;
+	case 2:
+		val = mfspr(SPRN_PMC2);
+		break;
+	case 3:
+		val = mfspr(SPRN_PMC3);
+		break;
+	case 4:
+		val = mfspr(SPRN_PMC4);
+		break;
+	case 5:
+		val = mfspr(SPRN_PMC5);
+		break;
+	case 6:
+		val = mfspr(SPRN_PMC6);
+		break;
+	case 7:
+		val = mfspr(SPRN_PMC7);
+		break;
+	case 8:
+		val = mfspr(SPRN_PMC8);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to read PMC%d\n", idx);
+		val = 0;
+	}
+	return val;
+}
+
+/*
+ * Write one PMC.
+ */
+static void write_pmc(int idx, unsigned long val)
+{
+	switch (idx) {
+	case 1:
+		mtspr(SPRN_PMC1, val);
+		break;
+	case 2:
+		mtspr(SPRN_PMC2, val);
+		break;
+	case 3:
+		mtspr(SPRN_PMC3, val);
+		break;
+	case 4:
+		mtspr(SPRN_PMC4, val);
+		break;
+	case 5:
+		mtspr(SPRN_PMC5, val);
+		break;
+	case 6:
+		mtspr(SPRN_PMC6, val);
+		break;
+	case 7:
+		mtspr(SPRN_PMC7, val);
+		break;
+	case 8:
+		mtspr(SPRN_PMC8, val);
+		break;
+	default:
+		printk(KERN_ERR "oops trying to write PMC%d\n", idx);
+	}
+}
+
+/*
+ * Check if a set of events can all go on the PMU at once.
+ * If they can't, this will look at alternative codes for the events
+ * and see if any combination of alternative codes is feasible.
+ * The feasible set is returned in event[].
+ */
+static int power_check_constraints(unsigned int event[], int n_ev)
+{
+	u64 mask, value, nv;
+	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
+	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS];
+	int i, j;
+	u64 addf = ppmu->add_fields;
+	u64 tadd = ppmu->test_adder;
+
+	if (n_ev > ppmu->n_counter)
+		return -1;
+
+	/* First see if the events will go on as-is */
+	for (i = 0; i < n_ev; ++i) {
+		alternatives[i][0] = event[i];
+		if (ppmu->get_constraint(event[i], &amasks[i][0],
+					 &avalues[i][0]))
+			return -1;
+		choice[i] = 0;
+	}
+	value = mask = 0;
+	for (i = 0; i < n_ev; ++i) {
+		nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf);
+		if ((((nv + tadd) ^ value) & mask) != 0 ||
+		    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0)
+			break;
+		value = nv;
+		mask |= amasks[i][0];
+	}
+	if (i == n_ev)
+		return 0;	/* all OK */
+
+	/* doesn't work, gather alternatives... */
+	if (!ppmu->get_alternatives)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+		for (j = 1; j < n_alt[i]; ++j)
+			ppmu->get_constraint(alternatives[i][j],
+					     &amasks[i][j], &avalues[i][j]);
+	}
+
+	/* enumerate all possibilities and see if any will work */
+	i = 0;
+	j = -1;
+	value = mask = nv = 0;
+	while (i < n_ev) {
+		if (j >= 0) {
+			/* we're backtracking, restore context */
+			value = svalues[i];
+			mask = smasks[i];
+			j = choice[i];
+		}
+		/*
+		 * See if any alternative k for event i,
+		 * where k > j, will satisfy the constraints.
+		 */
+		while (++j < n_alt[i]) {
+			nv = (value | avalues[i][j]) +
+				(value & avalues[i][j] & addf);
+			if ((((nv + tadd) ^ value) & mask) == 0 &&
+			    (((nv + tadd) ^ avalues[i][j])
+			     & amasks[i][j]) == 0)
+				break;
+		}
+		if (j >= n_alt[i]) {
+			/*
+			 * No feasible alternative, backtrack
+			 * to event i-1 and continue enumerating its
+			 * alternatives from where we got up to.
+			 */
+			if (--i < 0)
+				return -1;
+		} else {
+			/*
+			 * Found a feasible alternative for event i,
+			 * remember where we got up to with this event,
+			 * go on to the next event, and start with
+			 * the first alternative for it.
+			 */
+			choice[i] = j;
+			svalues[i] = value;
+			smasks[i] = mask;
+			value = nv;
+			mask |= amasks[i][j];
+			++i;
+			j = -1;
+		}
+	}
+
+	/* OK, we have a feasible combination, tell the caller the solution */
+	for (i = 0; i < n_ev; ++i)
+		event[i] = alternatives[i][choice[i]];
+	return 0;
+}
+
+static void power_perf_read(struct perf_counter *counter)
+{
+	long val, delta, prev;
+
+	if (!counter->hw.idx)
+		return;
+	/*
+	 * Performance monitor interrupts come even when interrupts
+	 * are soft-disabled, as long as interrupts are hard-enabled.
+	 * Therefore we treat them like NMIs.
+	 */
+	do {
+		prev = atomic64_read(&counter->hw.prev_count);
+		barrier();
+		val = read_pmc(counter->hw.idx);
+	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev);
+
+	/* The counters are only 32 bits wide */
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &counter->hw.period_left);
+}
+
+/*
+ * Disable all counters to prevent PMU interrupts and to allow
+ * counters to be added or removed.
+ */
+u64 hw_perf_save_disable(void)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long ret;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+
+	ret = cpuhw->disabled;
+	if (!ret) {
+		cpuhw->disabled = 1;
+		cpuhw->n_added = 0;
+
+		/*
+		 * Set the 'freeze counters' bit.
+		 * The barrier is to make sure the mtspr has been
+		 * executed and the PMU has frozen the counters
+		 * before we return.
+		 */
+		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		mb();
+	}
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Re-enable all counters if disable == 0.
+ * If we were previously disabled and counters were added, then
+ * put the new config on the PMU.
+ */
+void hw_perf_restore(u64 disable)
+{
+	struct perf_counter *counter;
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	long i;
+	unsigned long val;
+	s64 left;
+	unsigned int hwc_index[MAX_HWCOUNTERS];
+
+	if (disable)
+		return;
+	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	cpuhw->disabled = 0;
+
+	/*
+	 * If we didn't change anything, or only removed counters,
+	 * no need to recalculate MMCR* settings and reset the PMCs.
+	 * Just reenable the PMU with the current MMCR* settings
+	 * (possibly updated for removal of counters).
+	 */
+	if (!cpuhw->n_added) {
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+		goto out;
+	}
+
+	/*
+	 * Compute MMCR* values for the new set of counters
+	 */
+	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index,
+			       cpuhw->mmcr)) {
+		/* shouldn't ever get here */
+		printk(KERN_ERR "oops compute_mmcr failed\n");
+		goto out;
+	}
+
+	/*
+	 * Write the new configuration to MMCR* with the freeze
+	 * bit set and set the hardware counters to their initial values.
+	 * Then unfreeze the counters.
+	 */
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
+	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
+				| MMCR0_FC);
+
+	/*
+	 * Read off any pre-existing counters that need to move
+	 * to another PMC.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
+			power_perf_read(counter);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+		}
+	}
+
+	/*
+	 * Initialize the PMCs for all the new and moved counters.
+	 */
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter->hw.idx)
+			continue;
+		val = 0;
+		if (counter->hw_event.irq_period) {
+			left = atomic64_read(&counter->hw.period_left);
+			if (left < 0x80000000L)
+				val = 0x80000000L - left;
+		}
+		atomic64_set(&counter->hw.prev_count, val);
+		counter->hw.idx = hwc_index[i] + 1;
+		write_pmc(counter->hw.idx, val);
+	}
+	mb();
+	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+ out:
+	local_irq_restore(flags);
+}
+
+static int collect_events(struct perf_counter *group, int max_count,
+			  struct perf_counter *ctrs[], unsigned int *events)
+{
+	int n = 0;
+	struct perf_counter *counter;
+
+	if (!is_software_counter(group)) {
+		if (n >= max_count)
+			return -1;
+		ctrs[n] = group;
+		events[n++] = group->hw.config;
+	}
+	list_for_each_entry(counter, &group->sibling_list, list_entry) {
+		if (!is_software_counter(counter) &&
+		    counter->state != PERF_COUNTER_STATE_OFF) {
+			if (n >= max_count)
+				return -1;
+			ctrs[n] = counter;
+			events[n++] = counter->hw.config;
+		}
+	}
+	return n;
+}
+
+static void counter_sched_in(struct perf_counter *counter, int cpu)
+{
+	counter->state = PERF_COUNTER_STATE_ACTIVE;
+	counter->oncpu = cpu;
+	if (is_software_counter(counter))
+		counter->hw_ops->enable(counter);
+}
+
+/*
+ * Called to enable a whole group of counters.
+ * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
+ * Assumes the caller has disabled interrupts and has
+ * frozen the PMU with hw_perf_save_disable.
+ */
+int hw_perf_group_sched_in(struct perf_counter *group_leader,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx, int cpu)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i, n, n0;
+	struct perf_counter *sub;
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	n = collect_events(group_leader, ppmu->n_counter - n0,
+			   &cpuhw->counter[n0], &cpuhw->events[n0]);
+	if (n < 0)
+		return -EAGAIN;
+	if (power_check_constraints(cpuhw->events, n + n0))
+		return -EAGAIN;
+	cpuhw->n_counters = n0 + n;
+	cpuhw->n_added += n;
+
+	/*
+	 * OK, this group can go on; update counter states etc.,
+	 * and enable any software counters
+	 */
+	for (i = n0; i < n0 + n; ++i)
+		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	n = 1;
+	counter_sched_in(group_leader, cpu);
+	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
+		if (sub->state != PERF_COUNTER_STATE_OFF) {
+			counter_sched_in(sub, cpu);
+			++n;
+		}
+	}
+	cpuctx->active_oncpu += n;
+	ctx->nr_active += n;
+
+	return 1;
+}
+
+/*
+ * Add a counter to the PMU.
+ * If all counters are not already frozen, then we disable and
+ * re-enable the PMU in order to get hw_perf_restore to do the
+ * actual work of reconfiguring the PMU.
+ */
+static int power_perf_enable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	unsigned long flags;
+	u64 pmudis;
+	int n0;
+	int ret = -EAGAIN;
+
+	local_irq_save(flags);
+	pmudis = hw_perf_save_disable();
+
+	/*
+	 * Add the counter to the list (if there is room)
+	 * and check whether the total set is still feasible.
+	 */
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	n0 = cpuhw->n_counters;
+	if (n0 >= ppmu->n_counter)
+		goto out;
+	cpuhw->counter[n0] = counter;
+	cpuhw->events[n0] = counter->hw.config;
+	if (power_check_constraints(cpuhw->events, n0 + 1))
+		goto out;
+
+	counter->hw.config = cpuhw->events[n0];
+	++cpuhw->n_counters;
+	++cpuhw->n_added;
+
+	ret = 0;
+ out:
+	hw_perf_restore(pmudis);
+	local_irq_restore(flags);
+	return ret;
+}
+
+/*
+ * Remove a counter from the PMU.
+ */
+static void power_perf_disable(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuhw;
+	long i;
+	u64 pmudis;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pmudis = hw_perf_save_disable();
+
+	power_perf_read(counter);
+
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		if (counter == cpuhw->counter[i]) {
+			while (++i < cpuhw->n_counters)
+				cpuhw->counter[i-1] = cpuhw->counter[i];
+			--cpuhw->n_counters;
+			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
+			write_pmc(counter->hw.idx, 0);
+			counter->hw.idx = 0;
+			break;
+		}
+	}
+	if (cpuhw->n_counters == 0) {
+		/* disable exceptions if no counters are running */
+		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
+	}
+
+	hw_perf_restore(pmudis);
+	local_irq_restore(flags);
+}
+
+struct hw_perf_counter_ops power_perf_ops = {
+	.enable = power_perf_enable,
+	.disable = power_perf_disable,
+	.read = power_perf_read
+};
+
+const struct hw_perf_counter_ops *
+hw_perf_counter_init(struct perf_counter *counter)
+{
+	unsigned long ev;
+	struct perf_counter *ctrs[MAX_HWCOUNTERS];
+	unsigned int events[MAX_HWCOUNTERS];
+	int n;
+
+	if (!ppmu)
+		return NULL;
+	if ((s64)counter->hw_event.irq_period < 0)
+		return NULL;
+	ev = counter->hw_event.type;
+	if (!counter->hw_event.raw) {
+		if (ev >= ppmu->n_generic ||
+		    ppmu->generic_events[ev] == 0)
+			return NULL;
+		ev = ppmu->generic_events[ev];
+	}
+	counter->hw.config_base = ev;
+	counter->hw.idx = 0;
+
+	/*
+	 * If this is in a group, check if it can go on with all the
+	 * other hardware counters in the group.  We assume the counter
+	 * hasn't been linked into its leader's sibling list at this point.
+	 */
+	n = 0;
+	if (counter->group_leader != counter) {
+		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
+				   ctrs, events);
+		if (n < 0)
+			return NULL;
+	}
+	events[n++] = ev;
+	if (power_check_constraints(events, n))
+		return NULL;
+
+	counter->hw.config = events[n - 1];
+	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+	return &power_perf_ops;
+}
+
+/*
+ * Handle wakeups.
+ */
+void perf_counter_do_pending(void)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+
+	set_perf_counter_pending(0);
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		if (counter && counter->wakeup_pending) {
+			counter->wakeup_pending = 0;
+			wake_up(&counter->waitq);
+		}
+	}
+}
+
+/*
+ * Record data for an irq counter.
+ * This function was lifted from the x86 code; maybe it should
+ * go in the core?
+ */
+static void perf_store_irq_data(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+/*
+ * Record all the values of the counters in a group
+ */
+static void perf_handle_group(struct perf_counter *counter)
+{
+	struct perf_counter *leader, *sub;
+
+	leader = counter->group_leader;
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		if (sub != counter)
+			sub->hw_ops->read(sub);
+		perf_store_irq_data(counter, sub->hw_event.type);
+		perf_store_irq_data(counter, atomic64_read(&sub->count));
+	}
+}
+
+/*
+ * A counter has overflowed; update its count and record
+ * things if requested.  Note that interrupts are hard-disabled
+ * here so there is no possibility of being interrupted.
+ */
+static void record_and_restart(struct perf_counter *counter, long val,
+			       struct pt_regs *regs)
+{
+	s64 prev, delta, left;
+	int record = 0;
+
+	/* we don't have to worry about interrupts here */
+	prev = atomic64_read(&counter->hw.prev_count);
+	delta = (val - prev) & 0xfffffffful;
+	atomic64_add(delta, &counter->count);
+
+	/*
+	 * See if the total period for this counter has expired,
+	 * and update for the next period.
+	 */
+	val = 0;
+	left = atomic64_read(&counter->hw.period_left) - delta;
+	if (counter->hw_event.irq_period) {
+		if (left <= 0) {
+			left += counter->hw_event.irq_period;
+			if (left <= 0)
+				left = counter->hw_event.irq_period;
+			record = 1;
+		}
+		if (left < 0x80000000L)
+			val = 0x80000000L - left;
+	}
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+
+	/*
+	 * Finally record data if requested.
+	 */
+	if (record) {
+		switch (counter->hw_event.record_type) {
+		case PERF_RECORD_SIMPLE:
+			break;
+		case PERF_RECORD_IRQ:
+			perf_store_irq_data(counter, instruction_pointer(regs));
+			counter->wakeup_pending = 1;
+			break;
+		case PERF_RECORD_GROUP:
+			perf_handle_group(counter);
+			counter->wakeup_pending = 1;
+			break;
+		}
+	}
+}
+
+/*
+ * Performance monitor interrupt stuff
+ */
+static void perf_counter_interrupt(struct pt_regs *regs)
+{
+	int i;
+	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
+	struct perf_counter *counter;
+	long val;
+	int need_wakeup = 0, found = 0;
+
+	for (i = 0; i < cpuhw->n_counters; ++i) {
+		counter = cpuhw->counter[i];
+		val = read_pmc(counter->hw.idx);
+		if ((int)val < 0) {
+			/* counter has overflowed */
+			found = 1;
+			record_and_restart(counter, val, regs);
+			if (counter->wakeup_pending)
+				need_wakeup = 1;
+		}
+	}
+
+	/*
+	 * In case we didn't find and reset the counter that caused
+	 * the interrupt, scan all counters and reset any that are
+	 * negative, to avoid getting continual interrupts.
+	 * Any that we processed in the previous loop will not be negative.
+	 */
+	if (!found) {
+		for (i = 0; i < ppmu->n_counter; ++i) {
+			val = read_pmc(i + 1);
+			if ((int)val < 0)
+				write_pmc(i + 1, 0);
+		}
+	}
+
+	/*
+	 * Reset MMCR0 to its normal value.  This will set PMXE and
+	 * clear FC (freeze counters) and PMAO (perf mon alert occurred)
+	 * and thus allow interrupts to occur again.
+	 * XXX might want to use MSR.PM to keep the counters frozen until
+	 * we get back out of this interrupt.
+	 */
+	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+
+	/*
+	 * If we need a wakeup, check whether interrupts were soft-enabled
+	 * when we took the interrupt.  If they were, we can wake stuff up
+	 * immediately; otherwise we'll have to set a flag and do the
+	 * wakeup when interrupts get soft-enabled.
+	 */
+	if (need_wakeup) {
+		if (regs->softe) {
+			irq_enter();
+			perf_counter_do_pending();
+			irq_exit();
+		} else {
+			set_perf_counter_pending(1);
+		}
+	}
+}
+
+static int init_perf_counters(void)
+{
+	if (reserve_pmc_hardware(perf_counter_interrupt)) {
+		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
+		return -EBUSY;
+	}
+
+	return 0;
+}
+
+arch_initcall(init_perf_counters);
-- 
GitLab


From 16b067993dee3dfde61b20027e0b168dc06201ee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 10 Jan 2009 16:34:07 +1100
Subject: [PATCH 0062/2198] powerpc/perf_counter: Add support for PPC970 family

This adds the back-end for the PMU on the PPC970 family.

The PPC970 allows events from the ISU to be selected in two different
ways.  Rather than use alternative event codes to express this, we
instead use a single encoding for ISU events and express the
resulting constraint (that you can't select events from all three
of FPU/IFU/VPU, ISU and IDU/STS at the same time, since they all come
in through only 2 multiplexers) using a NAND constraint field, and
work out which multiplexer is used for ISU events at compute_mmcr
time.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   2 +-
 arch/powerpc/kernel/perf_counter.c |  13 +
 arch/powerpc/kernel/ppc970-pmu.c   | 375 +++++++++++++++++++++++++++++
 3 files changed, 389 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/ppc970-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fde190bbb2bd0..45798f6fb1379 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c7d4c2966a5c1..5561ecb02a4bd 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -741,13 +741,26 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	}
 }
 
+extern struct power_pmu ppc970_pmu;
+
 static int init_perf_counters(void)
 {
+	unsigned long pvr;
+
 	if (reserve_pmc_hardware(perf_counter_interrupt)) {
 		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
 		return -EBUSY;
 	}
 
+	/* XXX should get this from cputable */
+	pvr = mfspr(SPRN_PVR);
+	switch (PVR_VER(pvr)) {
+	case PV_970:
+	case PV_970FX:
+	case PV_970MP:
+		ppmu = &ppc970_pmu;
+		break;
+	}
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
new file mode 100644
index 0000000000000..c3256580be1ac
--- /dev/null
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -0,0 +1,375 @@
+/*
+ * Performance counter support for PPC970-family processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/string.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for PPC970
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	0xf
+
+/* Values in PM_UNIT field */
+#define PM_NONE		0
+#define PM_FPU		1
+#define PM_VPU		2
+#define PM_ISU		3
+#define PM_IFU		4
+#define PM_IDU		5
+#define PM_STS		6
+#define PM_LSU0		7
+#define PM_LSU1U	8
+#define PM_LSU1L	9
+#define PM_LASTUNIT	9
+
+/*
+ * Bits in MMCR0 for PPC970
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for PPC970
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                 <><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *                 T0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * T0 - TTM0 constraint
+ *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS
+ *     43: UC3 error 0x0800_0000_0000
+ *     42: FPU|IFU|VPU events needed 0x0400_0000_0000
+ *     41: ISU events needed 0x0200_0000_0000
+ *     40: IDU|STS events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P1
+ *     15: P1 error 0x8000
+ *     14-15: Count of events needing PMC1
+ *
+ * P2..P8
+ *     0-13: Count of events needing PMC2..PMC8
+ */
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
+	[PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
+	[PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
+	[PM_IFU] =   { 0xc80000000000ull, 0x840000000000ull },
+	[PM_IDU] =   { 0x380000000000ull, 0x010000000000ull },
+	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit) {
+		if (unit > PM_LASTUNIT)
+			return -1;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static int p970_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	alt[0] = event;
+
+	/* 2 alternatives for LSU empty */
+	if (event == 0x2002 || event == 0x3002) {
+		alt[1] = event ^ 0x1000;
+		return 2;
+	}
+		
+	return 1;
+}
+
+static int p970_compute_mmcr(unsigned int event[], int n_ev,
+			     unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 };
+	unsigned char ttmuse[2];
+	unsigned char pmcsel[8];
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (unit) {
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU]))
+		unitmap[PM_ISU] = 2 | 4;	/* move ISU to TTM1 */
+	/* Set TTM[01]SEL fields. */
+	ttmuse[0] = ttmuse[1] = 0;
+	for (i = PM_FPU; i <= PM_STS; ++i) {
+		if (!unituse[i])
+			continue;
+		ttm = unitmap[i];
+		++ttmuse[(ttm >> 2) & 1];
+		mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH;
+	}
+	/* Check only one unit per TTMx */
+	if (ttmuse[0] > 1 || ttmuse[1] > 1)
+		return -1;
+
+	/* Set byte lane select fields and TTM3SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit <= PM_STS)
+			ttm = (unitmap[unit] >> 2) & 1;
+		else if (unit == PM_LSU0)
+			ttm = 2;
+		else {
+			ttm = 3;
+			if (unit == PM_LSU1L && byte >= 2)
+				mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	memset(pmcsel, 0x8, sizeof(pmcsel));	/* 8 means don't count */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			else
+				psel |= 8;
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+		}
+		pmcsel[pmc] = psel;
+		hwc[i] = pmc;
+	}
+	for (pmc = 0; pmc < 2; ++pmc)
+		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
+	for (; pmc < 8; ++pmc)
+		mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	int shift, i;
+
+	if (pmc <= 1) {
+		shift = MMCR0_PMC1SEL_SH - 7 * pmc;
+		i = 0;
+	} else {
+		shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2);
+		i = 1;
+	}
+	/*
+	 * Setting the PMCxSEL field to 0x08 disables PMC x.
+	 */
+	mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift);
+}
+
+static int ppc970_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 7,
+	[PERF_COUNT_INSTRUCTIONS] = 1,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
+	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+};
+
+struct power_pmu ppc970_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 2,
+	.add_fields = 0x001100005555ull,
+	.test_adder = 0x013300000000ull,
+	.compute_mmcr = p970_compute_mmcr,
+	.get_constraint = p970_get_constraint,
+	.get_alternatives = p970_get_alternatives,
+	.disable_pmc = p970_disable_pmc,
+	.n_generic = ARRAY_SIZE(ppc970_generic_events),
+	.generic_events = ppc970_generic_events,
+};
-- 
GitLab


From f78628374a13bc150db77c6e02d4f2c0a7f932ef Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 9 Jan 2009 21:05:35 +1100
Subject: [PATCH 0063/2198] powerpc/perf_counter: Add support for POWER6

This adds the back-end for the PMU on the POWER6 processor.
Fortunately, the event selection hardware is somewhat simpler on
POWER6 than on other POWER family processors, so the constraints
fit into only 32 bits.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   2 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power6-pmu.c   | 283 +++++++++++++++++++++++++++++
 3 files changed, 288 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power6-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 45798f6fb1379..0ebf4d04d4b95 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5561ecb02a4bd..df3fe057dee99 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -742,6 +742,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 }
 
 extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
 {
@@ -760,6 +761,9 @@ static int init_perf_counters(void)
 	case PV_970MP:
 		ppmu = &ppc970_pmu;
 		break;
+	case 0x3e:
+		ppmu = &power6_pmu;
+		break;
 	}
 	return 0;
 }
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
new file mode 100644
index 0000000000000..b1f61f3c97bb9
--- /dev/null
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -0,0 +1,283 @@
+/*
+ * Performance counter support for POWER6 processors.
+ *
+ * Copyright 2008-2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER6
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0x7
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* Unit event comes (TTMxSEL encoding) */
+#define PM_UNIT_MSK	0xf
+#define PM_UNIT_MSKS	(PM_UNIT_MSK << PM_UNIT_SH)
+#define PM_LLAV		0x8000	/* Load lookahead match value */
+#define PM_LLA		0x4000	/* Load lookahead match enable */
+#define PM_BYTE_SH	12	/* Byte of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_SUBUNIT_SH	8	/* Subunit event comes from (NEST_SEL enc.) */
+#define PM_SUBUNIT_MSK	7
+#define PM_SUBUNIT_MSKS	(PM_SUBUNIT_MSK << PM_SUBUNIT_SH)
+#define PM_PMCSEL_MSK	0xff	/* PMCxSEL value */
+#define PM_BUSEVENT_MSK	0xf3700
+
+/*
+ * Bits in MMCR1 for POWER6
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTMSEL_SH(n)	(MMCR1_TTM0SEL_SH - (n) * 4)
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_TTMSEL(m, n)	(((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK)
+#define MMCR1_NESTSEL_SH	45
+#define MMCR1_NESTSEL_MSK	0x7
+#define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK)
+#define MMCR1_PMC1_LLA		((u64)1 << 44)
+#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39)
+#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35)
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Assign PMC numbers and compute MMCR1 value for a set of events
+ */
+static int p6_compute_mmcr(unsigned int event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	int i;
+	unsigned int pmc, ev, b, u, s, psel;
+	unsigned int ttmset = 0;
+	unsigned int pmc_inuse = 0;
+
+	if (n_ev > 4)
+		return -1;
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;	/* collision! */
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+	for (i = 0; i < n_ev; ++i) {
+		ev = event[i];
+		pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			--pmc;
+		} else {
+			/* can go on any PMC; find a free one */
+			for (pmc = 0; pmc < 4; ++pmc)
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			pmc_inuse |= 1 << pmc;
+		}
+		hwc[i] = pmc;
+		psel = ev & PM_PMCSEL_MSK;
+		if (ev & PM_BUSEVENT_MSK) {
+			/* this event uses the event bus */
+			b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK;
+			u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK;
+			/* check for conflict on this byte of event bus */
+			if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u)
+				return -1;
+			mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b);
+			ttmset |= 1 << b;
+			if (u == 5) {
+				/* Nest events have a further mux */
+				s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+				if ((ttmset & 0x10) &&
+				    MMCR1_NESTSEL(mmcr1) != s)
+					return -1;
+				ttmset |= 0x10;
+				mmcr1 |= (u64)s << MMCR1_NESTSEL_SH;
+			}
+			if (0x30 <= psel && psel <= 0x3d) {
+				/* these need the PMCx_ADDR_SEL bits */
+				if (b >= 2)
+					mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc;
+			}
+			/* bus select values are different for PMC3/4 */
+			if (pmc >= 2 && (psel & 0x90) == 0x80)
+				psel ^= 0x20;
+		}
+		if (ev & PM_LLA) {
+			mmcr1 |= MMCR1_PMC1_LLA >> pmc;
+			if (ev & PM_LLAV)
+				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
+		}
+		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+	}
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0xe)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+/*
+ * Layout of constraint bits:
+ *
+ *	0-1	add field: number of uses of PMC1 (max 1)
+ *	2-3, 4-5, 6-7: ditto for PMC2, 3, 4
+ *	8-10	select field: nest (subunit) event selector
+ *	16-19	select field: unit on byte 0 of event bus
+ *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ */
+static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, sh;
+	unsigned int mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		sh = byte * 4;
+		mask |= PM_UNIT_MSKS << sh;
+		value |= (event & PM_UNIT_MSKS) << sh;
+		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
+			mask |= PM_SUBUNIT_MSKS;
+			value |= event & PM_SUBUNIT_MSKS;
+		}
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	4	/* at most 4 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
+	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
+	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
+	{ 0x10000a, 0x2000f4 },			/* PM_RUN_CYC */
+	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
+	{ 0x10000e, 0x400010 },			/* PM_PURR */
+	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
+	{ 0x10001a, 0x200010 },			/* PM_MRK_INST_DISP */
+	{ 0x100026, 0x3000f8 },			/* PM_TB_BIT_TRANS */
+	{ 0x100054, 0x2000f0 },			/* PM_ST_FIN */
+	{ 0x100056, 0x2000fc },			/* PM_L1_ICACHE_MISS */
+	{ 0x1000f0, 0x40000a },			/* PM_INST_IMC_MATCH_CMPL */
+	{ 0x1000f8, 0x200008 },			/* PM_GCT_EMPTY_CYC */
+	{ 0x1000fc, 0x400006 },			/* PM_LSU_DERAT_MISS_CYC */
+	{ 0x20000e, 0x400007 },			/* PM_LSU_DERAT_MISS */
+	{ 0x200012, 0x300012 },			/* PM_INST_DISP */
+	{ 0x2000f2, 0x3000f2 },			/* PM_INST_DISP */
+	{ 0x2000f8, 0x300010 },			/* PM_EXT_INT */
+	{ 0x2000fe, 0x300056 },			/* PM_DATA_FROM_L2MISS */
+	{ 0x2d0030, 0x30001a },			/* PM_MRK_FPU_FIN */
+	{ 0x30000a, 0x400018 },			/* PM_MRK_INST_FIN */
+	{ 0x3000f6, 0x40000e },			/* PM_L1_DCACHE_RELOAD_VALID */
+	{ 0x3000fe, 0x400056 },			/* PM_DATA_FROM_L3MISS */
+};
+
+/*
+ * This could be made more efficient with a binary search on
+ * a presorted list, if necessary
+ */
+static int find_alternatives_list(unsigned int event)
+{
+	int i, j;
+	unsigned int alt;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			return -1;
+		for (j = 0; j < MAX_ALT; ++j) {
+			alt = event_alternatives[i][j];
+			if (!alt || event < alt)
+				break;
+			if (event == alt)
+				return i;
+		}
+	}
+	return -1;
+}
+
+static int p6_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j;
+	unsigned int aevent, psel, pmc;
+	unsigned int nalt = 1;
+
+	alt[0] = event;
+
+	/* check the alternatives table */
+	i = find_alternatives_list(event);
+	if (i >= 0) {
+		/* copy out alternatives from list */
+		for (j = 0; j < MAX_ALT; ++j) {
+			aevent = event_alternatives[i][j];
+			if (!aevent)
+				break;
+			if (aevent != event)
+				alt[nalt++] = aevent;
+		}
+
+	} else {
+		/* Check for alternative ways of computing sum events */
+		/* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */
+		psel = event & (PM_PMCSEL_MSK & ~1);	/* ignore edge bit */
+		pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc && (psel == 0x32 || psel == 0x34))
+			alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) |
+				((5 - pmc) << PM_PMC_SH);
+
+		/* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */
+		if (pmc && (psel == 0x38 || psel == 0x3a))
+			alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) |
+				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
+	}
+
+	return nalt;
+}
+
+static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/* Set PMCxSEL to 0 to disable PMCx */
+	mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power6_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_INSTRUCTIONS] = 2,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+};
+
+struct power_pmu power6_pmu = {
+	.n_counter = 4,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x55,
+	.test_adder = 0,
+	.compute_mmcr = p6_compute_mmcr,
+	.get_constraint = p6_get_constraint,
+	.get_alternatives = p6_get_alternatives,
+	.disable_pmc = p6_disable_pmc,
+	.n_generic = ARRAY_SIZE(power6_generic_events),
+	.generic_events = power6_generic_events,
+};
-- 
GitLab


From dd0e6ba22ea21bcc2c420b385a170593c58f4c08 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 12 Jan 2009 15:11:00 +1100
Subject: [PATCH 0064/2198] perf_counter: Always schedule all software counters
 in

Software counters aren't subject to the limitations imposed by the
fixed number of hardware counter registers, so there is no reason not
to enable them all in __perf_counter_sched_in.  Previously we used to
break out of the loop when we got to a group that wouldn't fit on the
PMU; with this we continue through the list but only schedule in
software counters (or groups containing only software counters) from
there on.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 kernel/perf_counter.c | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4c0dccb756adf..3aef3062ff78b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -455,12 +455,37 @@ group_sched_in(struct perf_counter *group_counter,
 	return -EAGAIN;
 }
 
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+	return 1;
+}
+
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct perf_counter *counter;
 	u64 flags;
+	int can_add_hw = 1;
 
 	if (likely(!ctx->nr_counters))
 		return;
@@ -477,10 +502,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 		/*
 		 * If we scheduled in a group atomically and exclusively,
-		 * or if this group can't go on, break out:
+		 * or if this group can't go on, don't add any more
+		 * hardware counters.
 		 */
-		if (group_sched_in(counter, cpuctx, ctx, cpu))
-			break;
+		if (can_add_hw || is_software_only_group(counter))
+			if (group_sched_in(counter, cpuctx, ctx, cpu))
+				can_add_hw = 0;
 	}
 	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
-- 
GitLab


From 01d0287f068de2934109ba9b989d8807526cccc2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Jan 2009 13:44:19 +1100
Subject: [PATCH 0065/2198] powerpc/perf_counter: Make sure PMU gets enabled
 properly

This makes sure that we call the platform-specific ppc_md.enable_pmcs
function on each CPU before we try to use the PMU on that CPU.  If the
CPU goes off-line and then on-line, we need to do the enable_pmcs call
again, so we use the hw_perf_counter_setup hook to ensure that.  It gets
called as each CPU comes online, but it isn't called on the CPU that is
coming up, so this adds the CPU number as an argument to it (there were
no non-empty instances of hw_perf_counter_setup before).

This also arranges to set the pmcregs_in_use field of the lppaca (data
structure shared with the hypervisor) on each CPU when we are using the
PMU and clear it when we are not.  This allows the hypervisor to optimize
partition switches by not saving/restoring the PMU registers when we
aren't using the PMU.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 22 ++++++++++++++++++++++
 kernel/perf_counter.c              |  4 ++--
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df3fe057dee99..85ad25923c2c3 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -15,6 +15,7 @@
 #include <linux/hardirq.h>
 #include <asm/reg.h>
 #include <asm/pmc.h>
+#include <asm/machdep.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -24,6 +25,7 @@ struct cpu_hw_counters {
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
 	u64 mmcr[3];
+	u8 pmcs_enabled;
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -261,6 +263,15 @@ u64 hw_perf_save_disable(void)
 		cpuhw->disabled = 1;
 		cpuhw->n_added = 0;
 
+		/*
+		 * Check if we ever enabled the PMU on this cpu.
+		 */
+		if (!cpuhw->pmcs_enabled) {
+			if (ppc_md.enable_pmcs)
+				ppc_md.enable_pmcs();
+			cpuhw->pmcs_enabled = 1;
+		}
+
 		/*
 		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
@@ -305,6 +316,8 @@ void hw_perf_restore(u64 disable)
 		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+		if (cpuhw->n_counters == 0)
+			get_lppaca()->pmcregs_in_use = 0;
 		goto out;
 	}
 
@@ -323,6 +336,7 @@ void hw_perf_restore(u64 disable)
 	 * bit set and set the hardware counters to their initial values.
 	 * Then unfreeze the counters.
 	 */
+	get_lppaca()->pmcregs_in_use = 1;
 	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
@@ -741,6 +755,14 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	}
 }
 
+void hw_perf_counter_setup(int cpu)
+{
+	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu);
+
+	memset(cpuhw, 0, sizeof(*cpuhw));
+	cpuhw->mmcr[0] = MMCR0_FC;
+}
+
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power6_pmu;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3aef3062ff78b..52f2f526248e8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,7 +46,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 u64 __weak hw_perf_save_disable(void)		{ return 0; }
 void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
-void __weak hw_perf_counter_setup(void)		{ barrier(); }
+void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu)
@@ -1598,7 +1598,7 @@ static void __cpuinit perf_counter_init_cpu(int cpu)
 	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
 	mutex_unlock(&perf_resource_mutex);
 
-	hw_perf_counter_setup();
+	hw_perf_counter_setup(cpu);
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-- 
GitLab


From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Jan 2009 21:00:30 +1100
Subject: [PATCH 0066/2198] perf_counter: Add support for pinned and exclusive
 counter groups

Impact: New perf_counter features

A pinned counter group is one that the user wants to have on the CPU
whenever possible, i.e. whenever the associated task is running, for
a per-task group, or always for a per-cpu group.  If the system
cannot satisfy that, it puts the group into an error state where
it is not scheduled any more and reads from it return EOF (i.e. 0
bytes read).  The group can be released from error state and made
readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE).  When we
have finer-grained enable/disable controls on counters we'll be able
to reset the error state on individual groups.

An exclusive group is one that the user wants to be the only group
using the CPU performance monitor hardware whenever it is on.  The
counter group scheduler will not schedule an exclusive group if there
are already other groups on the CPU and will not schedule other groups
onto the CPU if there is an exclusive group scheduled (that statement
does not apply to groups containing only software counters, which can
always go on and which do not prevent an exclusive group from going on).
With an exclusive group, we will be able to let users program PMU
registers at a low level without the concern that those settings will
perturb other measurements.

Along the way this reorganizes things a little:
- is_software_counter() is moved to perf_counter.h.
- cpuctx->active_oncpu now records the number of hardware counters on
  the CPU, i.e. it now excludes software counters.  Nothing was reading
  cpuctx->active_oncpu before, so this change is harmless.
- A new cpuctx->exclusive field records whether we currently have an
  exclusive group on the CPU.
- counter_sched_out moves higher up in perf_counter.c and gets called
  from __perf_counter_remove_from_context and __perf_counter_exit_task,
  where we used to have essentially the same code.
- __perf_counter_sched_in now goes through the counter list twice, doing
  the pinned counters in the first loop and the non-pinned counters in
  the second loop, in order to give the pinned counters the best chance
  to be scheduled in.

Note that only a group leader can be exclusive or pinned, and that
attribute applies to the whole group.  This avoids some awkwardness in
some corner cases (e.g. where a group leader is closed and the other
group members get added to the context list).  If we want to relax that
restriction later, we can, and it is easier to relax a restriction than
to apply a new one.

This doesn't yet handle the case where a pinned counter is inherited
and goes into error state in the child - the error state is not
propagated up to the parent when the child exits, and arguably it
should.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c |  10 +-
 include/linux/perf_counter.h       |  15 +-
 kernel/perf_counter.c              | 226 ++++++++++++++++++++---------
 3 files changed, 169 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 85ad25923c2c3..5b0211348c73a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -35,14 +35,6 @@ void perf_counter_print_debug(void)
 {
 }
 
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
 /*
  * Read one performance monitor counter (PMC).
  */
@@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	 */
 	for (i = n0; i < n0 + n; ++i)
 		cpuhw->counter[i]->hw.config = cpuhw->events[i];
+	cpuctx->active_oncpu += n;
 	n = 1;
 	counter_sched_in(group_leader, cpu);
 	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) {
@@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 			++n;
 		}
 	}
-	cpuctx->active_oncpu += n;
 	ctx->nr_active += n;
 
 	return 1;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b21d1ea4c054d..7ab8e5f96f5bf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -86,7 +86,10 @@ struct perf_counter_hw_event {
 				nmi	     :  1, /* NMI sampling        */
 				raw	     :  1, /* raw event type      */
 				inherit	     :  1, /* children inherit it */
-				__reserved_1 : 28;
+				pinned	     :  1, /* must always be on PMU */
+				exclusive    :  1, /* only counter on PMU */
+
+				__reserved_1 : 26;
 
 	u64			__reserved_2;
 };
@@ -141,6 +144,7 @@ struct hw_perf_counter_ops {
  * enum perf_counter_active_state - the states of a counter
  */
 enum perf_counter_active_state {
+	PERF_COUNTER_STATE_ERROR	= -2,
 	PERF_COUNTER_STATE_OFF		= -1,
 	PERF_COUNTER_STATE_INACTIVE	=  0,
 	PERF_COUNTER_STATE_ACTIVE	=  1,
@@ -214,6 +218,7 @@ struct perf_cpu_context {
 	struct perf_counter_context	*task_ctx;
 	int				active_oncpu;
 	int				max_pertask;
+	int				exclusive;
 };
 
 /*
@@ -240,6 +245,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu);
 
+/*
+ * Return 1 for a software counter, 0 for a hardware counter
+ */
+static inline int is_software_counter(struct perf_counter *counter)
+{
+	return !counter->hw_event.raw && counter->hw_event.type < 0;
+}
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52f2f526248e8..faf671b295661 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	}
 }
 
+static void
+counter_sched_out(struct perf_counter *counter,
+		  struct perf_cpu_context *cpuctx,
+		  struct perf_counter_context *ctx)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->hw_ops->disable(counter);
+	counter->oncpu = -1;
+
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu--;
+	ctx->nr_active--;
+	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+		cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info)
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
 
-	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->hw_ops->disable(counter);
-		ctx->nr_active--;
-		cpuctx->active_oncpu--;
-		counter->task = NULL;
-		counter->oncpu = -1;
-	}
+	counter_sched_out(counter, cpuctx, ctx);
+
+	counter->task = NULL;
 	ctx->nr_counters--;
 
 	/*
@@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter,
 		 struct perf_counter_context *ctx,
 		 int cpu)
 {
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state <= PERF_COUNTER_STATE_OFF)
 		return 0;
 
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
@@ -223,12 +237,63 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
-	cpuctx->active_oncpu++;
+	if (!is_software_counter(counter))
+		cpuctx->active_oncpu++;
 	ctx->nr_active++;
 
+	if (counter->hw_event.exclusive)
+		cpuctx->exclusive = 1;
+
 	return 0;
 }
 
+/*
+ * Return 1 for a group consisting entirely of software counters,
+ * 0 if the group contains any hardware counters.
+ */
+static int is_software_only_group(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	if (!is_software_counter(leader))
+		return 0;
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		if (!is_software_counter(counter))
+			return 0;
+	return 1;
+}
+
+/*
+ * Work out whether we can put this counter group on the CPU now.
+ */
+static int group_can_go_on(struct perf_counter *counter,
+			   struct perf_cpu_context *cpuctx,
+			   int can_add_hw)
+{
+	/*
+	 * Groups consisting entirely of software counters can always go on.
+	 */
+	if (is_software_only_group(counter))
+		return 1;
+	/*
+	 * If an exclusive group is already on, no other hardware
+	 * counters can go on.
+	 */
+	if (cpuctx->exclusive)
+		return 0;
+	/*
+	 * If this group is exclusive and there are already
+	 * counters on the CPU, it can't go on.
+	 */
+	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+		return 0;
+	/*
+	 * Otherwise, try to add it if all previous groups were able
+	 * to go on.
+	 */
+	return can_add_hw;
+}
+
 /*
  * Cross CPU call to install and enable a performance counter
  */
@@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info)
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	u64 perf_flags;
+	int err;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info)
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
-	counter_sched_in(counter, cpuctx, ctx, cpu);
+	/*
+	 * An exclusive counter can't go on if there are already active
+	 * hardware counters, and no hardware counter can go on if there
+	 * is already an exclusive counter on.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
+	    !group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx, cpu);
+
+	if (err && counter->hw_event.pinned)
+		counter->state = PERF_COUNTER_STATE_ERROR;
 
-	if (!ctx->task && cpuctx->max_pertask)
+	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
 	hw_perf_restore(perf_flags);
@@ -326,22 +404,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 	spin_unlock_irq(&ctx->lock);
 }
 
-static void
-counter_sched_out(struct perf_counter *counter,
-		  struct perf_cpu_context *cpuctx,
-		  struct perf_counter_context *ctx)
-{
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return;
-
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->hw_ops->disable(counter);
-	counter->oncpu = -1;
-
-	cpuctx->active_oncpu--;
-	ctx->nr_active--;
-}
-
 static void
 group_sched_out(struct perf_counter *group_counter,
 		struct perf_cpu_context *cpuctx,
@@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter,
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->hw_event.exclusive)
+		cpuctx->exclusive = 0;
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -455,30 +520,6 @@ group_sched_in(struct perf_counter *group_counter,
 	return -EAGAIN;
 }
 
-/*
- * Return 1 for a software counter, 0 for a hardware counter
- */
-static inline int is_software_counter(struct perf_counter *counter)
-{
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
-}
-
-/*
- * Return 1 for a group consisting entirely of software counters,
- * 0 if the group contains any hardware counters.
- */
-static int is_software_only_group(struct perf_counter *leader)
-{
-	struct perf_counter *counter;
-
-	if (!is_software_counter(leader))
-		return 0;
-	list_for_each_entry(counter, &leader->sibling_list, list_entry)
-		if (!is_software_counter(counter))
-			return 0;
-	return 1;
-}
-
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
@@ -492,7 +533,38 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
+
+	/*
+	 * First go through the list and put on any pinned groups
+	 * in order to give them the best chance of going on.
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    !counter->hw_event.pinned)
+			continue;
+		if (counter->cpu != -1 && counter->cpu != cpu)
+			continue;
+
+		if (group_can_go_on(counter, cpuctx, 1))
+			group_sched_in(counter, cpuctx, ctx, cpu);
+
+		/*
+		 * If this pinned group hasn't been scheduled,
+		 * put it in error state.
+		 */
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			counter->state = PERF_COUNTER_STATE_ERROR;
+	}
+
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		/*
+		 * Ignore counters in OFF or ERROR state, and
+		 * ignore pinned counters since we did them already.
+		 */
+		if (counter->state <= PERF_COUNTER_STATE_OFF ||
+		    counter->hw_event.pinned)
+			continue;
+
 		/*
 		 * Listen to the 'cpu' scheduling filter constraint
 		 * of counters:
@@ -500,14 +572,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		/*
-		 * If we scheduled in a group atomically and exclusively,
-		 * or if this group can't go on, don't add any more
-		 * hardware counters.
-		 */
-		if (can_add_hw || is_software_only_group(counter))
+		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
 			if (group_sched_in(counter, cpuctx, ctx, cpu))
 				can_add_hw = 0;
+		}
 	}
 	hw_perf_restore(flags);
 	spin_unlock(&ctx->lock);
@@ -567,8 +635,10 @@ int perf_counter_task_disable(void)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_for_each_entry(counter, &ctx->counter_list, list_entry)
-		counter->state = PERF_COUNTER_STATE_OFF;
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ERROR)
+			counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	hw_perf_restore(perf_flags);
 
@@ -607,7 +677,7 @@ int perf_counter_task_enable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_OFF)
+		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->hw_event.disabled = 0;
@@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (count != sizeof(cntval))
 		return -EINVAL;
 
+	/*
+	 * Return end-of-file for a read on a counter that is in
+	 * error state (i.e. because it was pinned but it couldn't be
+	 * scheduled on to the CPU at some point).
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		return 0;
+
 	mutex_lock(&counter->mutex);
 	cntval = perf_counter_read(counter);
 	mutex_unlock(&counter->mutex);
@@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter	*counter,
 {
 	struct perf_data *irqdata, *usrdata;
 	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res;
+	ssize_t res, res2;
 
 	irqdata = counter->irqdata;
 	usrdata = counter->usrdata;
@@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter	*counter,
 		if (signal_pending(current))
 			break;
 
+		if (counter->state == PERF_COUNTER_STATE_ERROR)
+			break;
+
 		spin_unlock_irq(&counter->waitq.lock);
 		schedule();
 		spin_lock_irq(&counter->waitq.lock);
@@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter	*counter,
 	__set_current_state(TASK_RUNNING);
 	spin_unlock_irq(&counter->waitq.lock);
 
-	if (usrdata->len + irqdata->len < count)
+	if (usrdata->len + irqdata->len < count &&
+	    counter->state != PERF_COUNTER_STATE_ERROR)
 		return -ERESTARTSYS;
 read_pending:
 	mutex_lock(&counter->mutex);
@@ -925,11 +1007,12 @@ perf_read_irq_data(struct perf_counter	*counter,
 
 	/* Switch irq buffer: */
 	usrdata = perf_switch_irq_data(counter);
-	if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) {
+	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
+	if (res2 < 0) {
 		if (!res)
 			res = -EFAULT;
 	} else {
-		res = count;
+		res += res2;
 	}
 out:
 	mutex_unlock(&counter->mutex);
@@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 		 */
 		if (group_leader->ctx != ctx)
 			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (hw_event.exclusive || hw_event.pinned)
+			goto err_put_context;
 	}
 
 	ret = -EINVAL;
@@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
-			child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-			child_counter->hw_ops->disable(child_counter);
-			cpuctx->active_oncpu--;
-			child_ctx->nr_active--;
-			child_counter->oncpu = -1;
-		}
+		counter_sched_out(child_counter, cpuctx, child_ctx);
 
 		list_del_init(&child_counter->list_entry);
 
-- 
GitLab


From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 17 Jan 2009 18:10:22 +1100
Subject: [PATCH 0067/2198] perf_counter: Add counter enable/disable ioctls

Impact: New perf_counter features

This primarily adds a way for perf_counter users to enable and disable
counters and groups.  Enabling or disabling a counter or group also
enables or disables all of the child counters that have been cloned
from it to monitor children of the task monitored by the top-level
counter.  The userspace interface to enable/disable counters is via
ioctl on the counter file descriptor.

Along the way this extends the code that handles child counters to
handle child counter groups properly.  A group with multiple counters
will be cloned to child tasks if and only if the group leader has the
hw_event.inherit bit set - if it is set the whole group is cloned as a
group in the child task.

In order to be able to enable or disable all child counters of a given
top-level counter, we need a way to find them all.  Hence I have added
a child_list field to struct perf_counter, which is the head of the
list of children for a top-level counter, or the link in that list for
a child counter.  That list is protected by the perf_counter.mutex
field.

This also adds a mutex to the perf_counter_context struct.  Previously
the list of counters was protected just by the lock field in the
context, which meant that perf_counter_init_task had to take that lock
and then take whatever lock/mutex protects the top-level counter's
child_list.  But the counter enable/disable functions need to take
that lock in order to traverse the list, then for each counter take
the lock in that counter's context in order to change the counter's
state safely, which will lead to a deadlock.

To solve this, we now have both a mutex and a spinlock in the context,
and taking either is sufficient to ensure the list of counters can't
change - you have to take both before changing the list.  Now
perf_counter_init_task takes the mutex instead of the lock (which
incidentally means that inherit_counter can use GFP_KERNEL instead of
GFP_ATOMIC) and thus avoids the possible deadlock.  Similarly the new
enable/disable functions can take the mutex while traversing the list
of child counters without incurring a possible deadlock when the
counter manipulation code locks the context for a child counter.

We also had an misfeature that the first counter added to a context
would possibly not go on until the next sched-in, because we were
using ctx->nr_active to detect if the context was running on a CPU.
But nr_active is the number of active counters, and if that was zero
(because the context didn't have any counters yet) it would look like
the context wasn't running on a cpu and so the retry code in
__perf_install_in_context wouldn't retry.  So this adds an 'is_active'
field that is set when the context is on a CPU, even if it has no
counters.  The is_active field is only used for task contexts, not for
per-cpu contexts.

If we enable a subsidiary counter in a group that is active on a CPU,
and the arch code can't enable the counter, then we have to pull the
whole group off the CPU.  We do this with group_sched_out, which gets
moved up in the file so it comes before all its callers.  This also
adds similar logic to __perf_install_in_context so that the "all on,
or none" invariant of groups is preserved when adding a new counter to
a group.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/perf_counter.h |  21 +-
 kernel/perf_counter.c        | 455 ++++++++++++++++++++++++++++++-----
 2 files changed, 415 insertions(+), 61 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7ab8e5f96f5bf..33ba9fe0a7810 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -14,6 +14,7 @@
 #define _LINUX_PERF_COUNTER_H
 
 #include <asm/atomic.h>
+#include <asm/ioctl.h>
 
 #ifdef CONFIG_PERF_COUNTERS
 # include <asm/perf_counter.h>
@@ -94,6 +95,12 @@ struct perf_counter_hw_event {
 	u64			__reserved_2;
 };
 
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
 /*
  * Kernel-internal data types:
  */
@@ -173,8 +180,10 @@ struct perf_counter {
 	struct file			*filp;
 
 	struct perf_counter		*parent;
+	struct list_head		child_list;
+
 	/*
-	 * Protect attach/detach:
+	 * Protect attach/detach and child_list:
 	 */
 	struct mutex			mutex;
 
@@ -199,13 +208,21 @@ struct perf_counter {
 struct perf_counter_context {
 #ifdef CONFIG_PERF_COUNTERS
 	/*
-	 * Protect the list of counters:
+	 * Protect the states of the counters in the list,
+	 * nr_active, and the list:
 	 */
 	spinlock_t		lock;
+	/*
+	 * Protect the list of counters.  Locking either mutex or lock
+	 * is sufficient to ensure the list doesn't change; to change
+	 * the list you need to lock both the mutex and the spinlock.
+	 */
+	struct mutex		mutex;
 
 	struct list_head	counter_list;
 	int			nr_counters;
 	int			nr_active;
+	int			is_active;
 	struct task_struct	*task;
 #endif
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index faf671b295661..1ac18daa424fd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter,
 		cpuctx->exclusive = 0;
 }
 
+static void
+group_sched_out(struct perf_counter *group_counter,
+		struct perf_cpu_context *cpuctx,
+		struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+
+	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return;
+
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	/*
+	 * Schedule out siblings (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
+		counter_sched_out(counter, cpuctx, ctx);
+
+	if (group_counter->hw_event.exclusive)
+		cpuctx->exclusive = 0;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info)
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex held.
+ * Must be called with counter->mutex and ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -215,6 +237,99 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Cross CPU call to disable a performance counter
+ */
+static void __perf_counter_disable(void *info)
+{
+	struct perf_counter *counter = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long flags;
+
+	/*
+	 * If this is a per-task counter, need to check whether this
+	 * counter's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
+		return;
+
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
+
+	/*
+	 * If the counter is on, turn it off.
+	 * If it is in error state, leave it in error state.
+	 */
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		if (counter == counter->group_leader)
+			group_sched_out(counter, cpuctx, ctx);
+		else
+			counter_sched_out(counter, cpuctx, ctx);
+		counter->state = PERF_COUNTER_STATE_OFF;
+	}
+
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Disable a counter.
+ */
+static void perf_counter_disable(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Disable the counter on the cpu that it's on
+		 */
+		smp_call_function_single(counter->cpu, __perf_counter_disable,
+					 counter, 1);
+		return;
+	}
+
+ retry:
+	task_oncpu_function_call(task, __perf_counter_disable, counter);
+
+	spin_lock_irq(&ctx->lock);
+	/*
+	 * If the counter is still active, we need to retry the cross-call.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
+		spin_unlock_irq(&ctx->lock);
+		goto retry;
+	}
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Disable a counter and all its children.
+ */
+static void perf_counter_disable_family(struct perf_counter *counter)
+{
+	struct perf_counter *child;
+
+	perf_counter_disable(counter);
+
+	/*
+	 * Lock the mutex to protect the list of children
+	 */
+	mutex_lock(&counter->mutex);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_disable(child);
+	mutex_unlock(&counter->mutex);
+}
+
 static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
@@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *leader = counter->group_leader;
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	u64 perf_flags;
@@ -327,23 +443,40 @@ static void __perf_install_in_context(void *info)
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 
+	/*
+	 * Don't put the counter on if it is disabled or if
+	 * it is in a group and the group isn't on.
+	 */
+	if (counter->state != PERF_COUNTER_STATE_INACTIVE ||
+	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE))
+		goto unlock;
+
 	/*
 	 * An exclusive counter can't go on if there are already active
 	 * hardware counters, and no hardware counter can go on if there
 	 * is already an exclusive counter on.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE &&
-	    !group_can_go_on(counter, cpuctx, 1))
+	if (!group_can_go_on(counter, cpuctx, 1))
 		err = -EEXIST;
 	else
 		err = counter_sched_in(counter, cpuctx, ctx, cpu);
 
-	if (err && counter->hw_event.pinned)
-		counter->state = PERF_COUNTER_STATE_ERROR;
+	if (err) {
+		/*
+		 * This counter couldn't go on.  If it is in a group
+		 * then we have to pull the whole group off.
+		 * If the counter group is pinned then put it in error state.
+		 */
+		if (leader != counter)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->hw_event.pinned)
+			leader->state = PERF_COUNTER_STATE_ERROR;
+	}
 
 	if (!err && !ctx->task && cpuctx->max_pertask)
 		cpuctx->max_pertask--;
 
+ unlock:
 	hw_perf_restore(perf_flags);
 
 	spin_unlock(&ctx->lock);
@@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info)
  * If the counter is attached to a task which is on a CPU we use a smp
  * call to enable it in the task context. The task might have been
  * scheduled away, but we check this in the smp call again.
+ *
+ * Must be called with ctx->mutex held.
  */
 static void
 perf_install_in_context(struct perf_counter_context *ctx,
@@ -387,7 +522,7 @@ perf_install_in_context(struct perf_counter_context *ctx,
 	/*
 	 * we need to retry the smp call.
 	 */
-	if (ctx->nr_active && list_empty(&counter->list_entry)) {
+	if (ctx->is_active && list_empty(&counter->list_entry)) {
 		spin_unlock_irq(&ctx->lock);
 		goto retry;
 	}
@@ -404,26 +539,131 @@ perf_install_in_context(struct perf_counter_context *ctx,
 	spin_unlock_irq(&ctx->lock);
 }
 
-static void
-group_sched_out(struct perf_counter *group_counter,
-		struct perf_cpu_context *cpuctx,
-		struct perf_counter_context *ctx)
+/*
+ * Cross CPU call to enable a performance counter
+ */
+static void __perf_counter_enable(void *info)
 {
-	struct perf_counter *counter;
+	struct perf_counter *counter = info;
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *leader = counter->group_leader;
+	unsigned long flags;
+	int err;
 
-	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE)
+	/*
+	 * If this is a per-task counter, need to check whether this
+	 * counter's task is the current task on this cpu.
+	 */
+	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	counter_sched_out(group_counter, cpuctx, ctx);
+	curr_rq_lock_irq_save(&flags);
+	spin_lock(&ctx->lock);
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		goto unlock;
+	counter->state = PERF_COUNTER_STATE_INACTIVE;
 
 	/*
-	 * Schedule out siblings (if any):
+	 * If the counter is in a group and isn't the group leader,
+	 * then don't put it on unless the group is on.
 	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
-		counter_sched_out(counter, cpuctx, ctx);
+	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
+		goto unlock;
 
-	if (group_counter->hw_event.exclusive)
-		cpuctx->exclusive = 0;
+	if (!group_can_go_on(counter, cpuctx, 1))
+		err = -EEXIST;
+	else
+		err = counter_sched_in(counter, cpuctx, ctx,
+				       smp_processor_id());
+
+	if (err) {
+		/*
+		 * If this counter can't go on and it's part of a
+		 * group, then the whole group has to come off.
+		 */
+		if (leader != counter)
+			group_sched_out(leader, cpuctx, ctx);
+		if (leader->hw_event.pinned)
+			leader->state = PERF_COUNTER_STATE_ERROR;
+	}
+
+ unlock:
+	spin_unlock(&ctx->lock);
+	curr_rq_unlock_irq_restore(&flags);
+}
+
+/*
+ * Enable a counter.
+ */
+static void perf_counter_enable(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		/*
+		 * Enable the counter on the cpu that it's on
+		 */
+		smp_call_function_single(counter->cpu, __perf_counter_enable,
+					 counter, 1);
+		return;
+	}
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		goto out;
+
+	/*
+	 * If the counter is in error state, clear that first.
+	 * That way, if we see the counter in error state below, we
+	 * know that it has gone back into error state, as distinct
+	 * from the task having been scheduled away before the
+	 * cross-call arrived.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_ERROR)
+		counter->state = PERF_COUNTER_STATE_OFF;
+
+ retry:
+	spin_unlock_irq(&ctx->lock);
+	task_oncpu_function_call(task, __perf_counter_enable, counter);
+
+	spin_lock_irq(&ctx->lock);
+
+	/*
+	 * If the context is active and the counter is still off,
+	 * we need to retry the cross-call.
+	 */
+	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF)
+		goto retry;
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (counter->state == PERF_COUNTER_STATE_OFF)
+		counter->state = PERF_COUNTER_STATE_INACTIVE;
+ out:
+	spin_unlock_irq(&ctx->lock);
+}
+
+/*
+ * Enable a counter and all its children.
+ */
+static void perf_counter_enable_family(struct perf_counter *counter)
+{
+	struct perf_counter *child;
+
+	perf_counter_enable(counter);
+
+	/*
+	 * Lock the mutex to protect the list of children
+	 */
+	mutex_lock(&counter->mutex);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_enable(child);
+	mutex_unlock(&counter->mutex);
 }
 
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
@@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	struct perf_counter *counter;
 	u64 flags;
 
+	spin_lock(&ctx->lock);
+	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
-		return;
+		goto out;
 
-	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
 		list_for_each_entry(counter, &ctx->counter_list, list_entry)
 			group_sched_out(counter, cpuctx, ctx);
 	}
 	hw_perf_restore(flags);
+ out:
 	spin_unlock(&ctx->lock);
 }
 
@@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	u64 flags;
 	int can_add_hw = 1;
 
+	spin_lock(&ctx->lock);
+	ctx->is_active = 1;
 	if (likely(!ctx->nr_counters))
-		return;
+		goto out;
 
-	spin_lock(&ctx->lock);
 	flags = hw_perf_save_disable();
 
 	/*
@@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		}
 	}
 	hw_perf_restore(flags);
+ out:
 	spin_unlock(&ctx->lock);
 }
 
@@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	file->private_data = NULL;
 
+	mutex_lock(&ctx->mutex);
 	mutex_lock(&counter->mutex);
 
 	perf_counter_remove_from_context(counter);
 	put_context(ctx);
 
 	mutex_unlock(&counter->mutex);
+	mutex_unlock(&ctx->mutex);
 
 	kfree(counter);
 
@@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	return events;
 }
 
+static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct perf_counter *counter = file->private_data;
+	int err = 0;
+
+	switch (cmd) {
+	case PERF_COUNTER_IOC_ENABLE:
+		perf_counter_enable_family(counter);
+		break;
+	case PERF_COUNTER_IOC_DISABLE:
+		perf_counter_disable_family(counter);
+		break;
+	default:
+		err = -ENOTTY;
+	}
+	return err;
+}
+
 static const struct file_operations perf_fops = {
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
+	.unlocked_ioctl		= perf_ioctl,
+	.compat_ioctl		= perf_ioctl,
 };
 
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
@@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
+	INIT_LIST_HEAD(&counter->child_list);
+
 	counter->irqdata		= &counter->data[0];
 	counter->usrdata		= &counter->data[1];
 	counter->cpu			= cpu;
@@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 		goto err_free_put_context;
 
 	counter->filp = counter_file;
+	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, counter, cpu);
+	mutex_unlock(&ctx->mutex);
 
 	fput_light(counter_file, fput_needed2);
 
@@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 {
 	memset(ctx, 0, sizeof(*ctx));
 	spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	ctx->task = task;
 }
@@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 /*
  * inherit a counter from parent task to child task:
  */
-static int
+static struct perf_counter *
 inherit_counter(struct perf_counter *parent_counter,
 	      struct task_struct *parent,
 	      struct perf_counter_context *parent_ctx,
 	      struct task_struct *child,
+	      struct perf_counter *group_leader,
 	      struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *child_counter;
 
+	/*
+	 * Instead of creating recursive hierarchies of counters,
+	 * we link inherited counters back to the original parent,
+	 * which has a filp for sure, which we use as the reference
+	 * count:
+	 */
+	if (parent_counter->parent)
+		parent_counter = parent_counter->parent;
+
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
-					    parent_counter->cpu, NULL,
-					    GFP_ATOMIC);
+					    parent_counter->cpu, group_leader,
+					    GFP_KERNEL);
 	if (!child_counter)
-		return -ENOMEM;
+		return NULL;
 
 	/*
 	 * Link it up in the child's context:
@@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter,
 	 */
 	atomic_long_inc(&parent_counter->filp->f_count);
 
+	/*
+	 * Link this into the parent counter's child list
+	 */
+	mutex_lock(&parent_counter->mutex);
+	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
+
+	/*
+	 * Make the child state follow the state of the parent counter,
+	 * not its hw_event.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_counter_{en,dis}able_family.
+	 */
+	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+	else
+		child_counter->state = PERF_COUNTER_STATE_OFF;
+
+	mutex_unlock(&parent_counter->mutex);
+
+	return child_counter;
+}
+
+static int inherit_group(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *leader;
+	struct perf_counter *sub;
+
+	leader = inherit_counter(parent_counter, parent, parent_ctx,
+				 child, NULL, child_ctx);
+	if (!leader)
+		return -ENOMEM;
+	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
+		if (!inherit_counter(sub, parent, parent_ctx,
+				     child, leader, child_ctx))
+			return -ENOMEM;
+	}
 	return 0;
 }
 
+static void sync_child_counter(struct perf_counter *child_counter,
+			       struct perf_counter *parent_counter)
+{
+	u64 parent_val, child_val;
+
+	parent_val = atomic64_read(&parent_counter->count);
+	child_val = atomic64_read(&child_counter->count);
+
+	/*
+	 * Add back the child's count to the parent's count:
+	 */
+	atomic64_add(child_val, &parent_counter->count);
+
+	/*
+	 * Remove this counter from the parent's list
+	 */
+	mutex_lock(&parent_counter->mutex);
+	list_del_init(&child_counter->child_list);
+	mutex_unlock(&parent_counter->mutex);
+
+	/*
+	 * Release the parent counter, if this was the last
+	 * reference to it.
+	 */
+	fput(parent_counter->filp);
+}
+
 static void
 __perf_counter_exit_task(struct task_struct *child,
 			 struct perf_counter *child_counter,
 			 struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *parent_counter;
-	u64 parent_val, child_val;
+	struct perf_counter *sub, *tmp;
 
 	/*
 	 * If we do not self-reap then we have to wait for the
@@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
-		counter_sched_out(child_counter, cpuctx, child_ctx);
+		group_sched_out(child_counter, cpuctx, child_ctx);
 
 		list_del_init(&child_counter->list_entry);
 
@@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child,
 	 * that are still around due to the child reference. These
 	 * counters need to be zapped - but otherwise linger.
 	 */
-	if (!parent_counter)
-		return;
-
-	parent_val = atomic64_read(&parent_counter->count);
-	child_val = atomic64_read(&child_counter->count);
-
-	/*
-	 * Add back the child's count to the parent's count:
-	 */
-	atomic64_add(child_val, &parent_counter->count);
-
-	fput(parent_counter->filp);
+	if (parent_counter) {
+		sync_child_counter(child_counter, parent_counter);
+		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
+					 list_entry) {
+			if (sub->parent)
+				sync_child_counter(sub, sub->parent);
+			kfree(sub);
+		}
+	}
 
 	kfree(child_counter);
 }
 
 /*
- * When a child task exist, feed back counter values to parent counters.
+ * When a child task exits, feed back counter values to parent counters.
  *
- * Note: we are running in child context, but the PID is not hashed
+ * Note: we may be running in child context, but the PID is not hashed
  * anymore so new counters will not be added.
  */
 void perf_counter_exit_task(struct task_struct *child)
@@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child)
 void perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
-	struct perf_counter *counter, *parent_counter;
+	struct perf_counter *counter;
 	struct task_struct *parent = current;
-	unsigned long flags;
 
 	child_ctx  =  &child->perf_counter_ctx;
 	parent_ctx = &parent->perf_counter_ctx;
@@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child)
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
 	 */
-	spin_lock_irqsave(&parent_ctx->lock, flags);
+	mutex_lock(&parent_ctx->mutex);
 
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
 	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
-		if (!counter->hw_event.inherit || counter->group_leader != counter)
+		if (!counter->hw_event.inherit)
 			continue;
 
-		/*
-		 * Instead of creating recursive hierarchies of counters,
-		 * we link inheritd counters back to the original parent,
-		 * which has a filp for sure, which we use as the reference
-		 * count:
-		 */
-		parent_counter = counter;
-		if (counter->parent)
-			parent_counter = counter->parent;
-
-		if (inherit_counter(parent_counter, parent,
+		if (inherit_group(counter, parent,
 				  parent_ctx, child, child_ctx))
 			break;
 	}
 
-	spin_unlock_irqrestore(&parent_ctx->lock, flags);
+	mutex_unlock(&parent_ctx->mutex);
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
@@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info)
 
 	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry)
 		__perf_counter_remove_from_context(counter);
-
 }
 static void perf_counter_exit_cpu(int cpu)
 {
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx = &cpuctx->ctx;
+
+	mutex_lock(&ctx->mutex);
 	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1);
+	mutex_unlock(&ctx->mutex);
 }
 #else
 static inline void perf_counter_exit_cpu(int cpu) { }
-- 
GitLab


From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 10:13:01 +0100
Subject: [PATCH 0068/2198] perfcounters: throttle on too high IRQ rates

Starting kerneltop with only -c 100 seems to be a bad idea, it can
easily lock the system due to perfcounter IRQ overload.

So add throttling: if a new IRQ arrives in a shorter than
PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them
from the next timer tick.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic.c             |  2 ++
 arch/x86/kernel/cpu/perf_counter.c | 38 +++++++++++++++++++++++++-----
 include/linux/perf_counter.h       |  4 ++++
 3 files changed, 38 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c
index 7b434e5b14c93..849c23009bf55 100644
--- a/arch/x86/kernel/apic.c
+++ b/arch/x86/kernel/apic.c
@@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void)
 	inc_irq_stat(apic_timer_irqs);
 
 	evt->event_handler(evt);
+
+	perf_counter_unthrottle();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9376771f757b2..1a040b179b537 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	u64			last_interrupt;
+	u64			global_enable;
+	int			throttled;
 };
 
 /*
@@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
-	u64 ack, status, saved_global;
-	struct cpu_hw_counters *cpuc;
+	u64 ack, status, now;
+	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 
 	/* Disable counters globally */
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
+	now = sched_clock();
+	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
+		cpuc->throttled = 1;
+	cpuc->last_interrupt = now;
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (!status)
@@ -533,9 +539,29 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		goto again;
 out:
 	/*
-	 * Restore - do not reenable when global enable is off:
+	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global);
+	if (!cpuc->throttled)
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+}
+
+void perf_counter_unthrottle(void)
+{
+	struct cpu_hw_counters *cpuc;
+
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
+	if (cpuc->throttled) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n");
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		cpuc->throttled = 0;
+	}
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 33ba9fe0a7810..91f1ca4c01c08 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
+extern void perf_counter_unthrottle(void);
 extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
 extern int perf_counter_task_disable(void);
@@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter)
 	return !counter->hw_event.raw && counter->hw_event.type < 0;
 }
 
+#define PERFMON_MIN_PERIOD_NS 10000
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
+static inline void perf_counter_unthrottle(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)			{ }
 static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
-- 
GitLab


From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 14:36:16 +0100
Subject: [PATCH 0069/2198] perfcounters: ratelimit performance counter
 interrupts

Ratelimit performance counter interrupts to 100KHz per CPU.

This replaces the irq-delta-time based method.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------
 include/linux/perf_counter.h       |  2 --
 2 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1a040b179b537..a56d4cf92f30e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	u64			last_interrupt;
+	unsigned long		interrupts;
 	u64			global_enable;
-	int			throttled;
 };
 
 /*
@@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	}
 }
 
+/*
+ * Maximum interrupt frequency of 100KHz per CPU
+ */
+#define PERFMON_MAX_INTERRUPTS 100000/HZ
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
-	u64 ack, status, now;
+	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
@@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 	ack_APIC_irq();
 
-	now = sched_clock();
-	if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS)
-		cpuc->throttled = 1;
-	cpuc->last_interrupt = now;
-
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	if (!status)
 		goto out;
@@ -541,13 +540,14 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	/*
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	if (!cpuc->throttled)
+	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
 		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 }
 
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
+	u64 global_enable;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
@@ -556,12 +556,15 @@ void perf_counter_unthrottle(void)
 		return;
 
 	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
-	if (cpuc->throttled) {
+	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-			printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n");
+			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
 		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
-		cpuc->throttled = 0;
 	}
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
+	if (unlikely(cpuc->global_enable && !global_enable))
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+	cpuc->interrupts = 0;
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 91f1ca4c01c08..f55381fbcac98 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter)
 	return !counter->hw_event.raw && counter->hw_event.type < 0;
 }
 
-#define PERFMON_MIN_PERIOD_NS 10000
-
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
-- 
GitLab


From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Jan 2009 14:16:53 +0100
Subject: [PATCH 0070/2198] perfcounters fix section mismatch warning in
 perf_counter.c::perf_counters_lapic_init()

Fix:

WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init()
The function pmc_generic_enable() references
the function __cpuinit perf_counters_lapic_init().
This is often because pmc_generic_enable lacks a __cpuinit
annotation or the annotation of perf_counters_lapic_init is wrong.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a56d4cf92f30e..46c436cdd732b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs)
 	local_irq_restore(flags);
 }
 
-void __cpuinit perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
 
-- 
GitLab


From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 25 Jan 2009 02:38:09 -0800
Subject: [PATCH 0071/2198] x86: make irqinit_32.c more like irqinit_64.c, v2

Impact: cleanup

1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit
2. move the apic_intr_init() call before set gate with interrupt[i]
3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector.

[ v2: should use !test_bit() instead of test_bit() with 32bit ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 56 +++++++++++++++++++++---------------
 arch/x86/kernel/irqinit_64.c |  7 +++--
 arch/x86/kernel/traps.c      | 15 ++++------
 3 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index c56496f8c6fcb..ddf3eb72f864f 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }
 
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
+static void __init smp_intr_init(void)
 {
-	int i;
-
-	/* all the set up before the call gates are initialised */
-	pre_intr_init_hook();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-	}
-
-
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -170,8 +150,13 @@ void __init native_init_IRQ(void)
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+}
 
+static void __init apic_intr_init(void)
+{
 #ifdef CONFIG_X86_LOCAL_APIC
+	smp_intr_init();
+
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -181,12 +166,37 @@ void __init native_init_IRQ(void)
 # ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 # endif
-#endif
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+# ifdef CONFIG_X86_MCE_P4THERMAL
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+# endif
 #endif
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* all the set up before the call gates are initialised */
+	pre_intr_init_hook();
+
+	apic_intr_init();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
+		int vector = FIRST_EXTERNAL_VECTOR + i;
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (!test_bit(vector, used_vectors))
+			set_intr_gate(vector, interrupt[i]);
+	}
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 6a71bfc51e51a..16e1fc687504b 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -162,6 +162,9 @@ void __init native_init_IRQ(void)
 	int i;
 
 	init_ISA_irqs();
+
+	apic_intr_init();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
@@ -169,12 +172,10 @@ void __init native_init_IRQ(void)
 	 */
 	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
 		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
+		if (!test_bit(vector, used_vectors))
 			set_intr_gate(vector, interrupt[i]);
 	}
 
-	apic_intr_init();
-
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ed5aee5f3fcc2..d36a502d87abf 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -979,8 +979,13 @@ void __init trap_init(void)
 #endif
 	set_intr_gate(19, &simd_coprocessor_error);
 
+	/* Reserve all the builtin and the syscall vector: */
+	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+		set_bit(i, used_vectors);
+
 #ifdef CONFIG_IA32_EMULATION
 	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
+	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
 #endif
 
 #ifdef CONFIG_X86_32
@@ -997,17 +1002,9 @@ void __init trap_init(void)
 	}
 
 	set_system_trap_gate(SYSCALL_VECTOR, &system_call);
-#endif
-
-	/* Reserve all the builtin and the syscall vector: */
-	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
-		set_bit(i, used_vectors);
-
-#ifdef CONFIG_X86_64
-	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
 	set_bit(SYSCALL_VECTOR, used_vectors);
 #endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
GitLab


From 65d370862f64973611a271ced61864b5f9bb6fc0 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 29 Jan 2009 14:06:52 +0100
Subject: [PATCH 0072/2198] perfcounters: fix refcounting bug

don't kfree in use counters.

Running...

	while true; do perfstat -e 1 -c true; done

...on all cores for a while doesn't seem to be eating ram, and my oops
is gone.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1ac18daa424fd..f27a7e9f3c417 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1934,7 +1934,8 @@ __perf_counter_exit_task(struct task_struct *child,
 		}
 	}
 
-	kfree(child_counter);
+	if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count))
+		kfree(child_counter);
 }
 
 /*
-- 
GitLab


From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 1 Feb 2009 22:07:39 +0530
Subject: [PATCH 0073/2198] x86: irqinit_32.c fix compilation warning

Fix:

  arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index ddf3eb72f864f..520e6c1c5d229 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -154,9 +154,9 @@ static void __init smp_intr_init(void)
 
 static void __init apic_intr_init(void)
 {
-#ifdef CONFIG_X86_LOCAL_APIC
 	smp_intr_init();
 
+#ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
-- 
GitLab


From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 4 Feb 2009 17:11:34 +0100
Subject: [PATCH 0074/2198] perfcounters: fix "perf counters kill oprofile" bug

With oprofile as a module, and unloaded by profiling script,
both oprofile and kerneltop work fine.. unless you leave kerneltop
running when you start profiling, then you may see badness.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 +++-
 arch/x86/oprofile/nmi_int.c        | 7 ++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 46c436cdd732b..8bb213323fe84 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-	.notifier_call		= perf_counter_nmi_handler
+	.notifier_call		= perf_counter_nmi_handler,
+	.next			= NULL,
+	.priority		= 1
 };
 
 void __init init_hw_perf_counters(void)
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7e..c638685136e1c 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,
 
 	switch (val) {
 	case DIE_NMI:
-		if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)))
-			ret = NOTIFY_STOP;
+	case DIE_NMI_IPI:
+		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu));
+		ret = NOTIFY_STOP;
 		break;
 	default:
 		break;
@@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)
 static struct notifier_block profile_exceptions_nb = {
 	.notifier_call = profile_exceptions_notify,
 	.next = NULL,
-	.priority = 0
+	.priority = 2
 };
 
 static int nmi_setup(void)
-- 
GitLab


From 82aa9a1829199233f9bdaf26e2ee271114f4701e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Feb 2009 15:23:08 +0100
Subject: [PATCH 0075/2198] perfcounters: fix "perf counters kills oprofile"
 bug, v2

Impact: fix kernel crash

Both oprofile and perfcounters register an NMI die handler, but only one
can handle the NMI.  Conveniently, oprofile unregisters it's notifier
when not actively in use, so setting it's notifier priority higher than
perfcounter's allows oprofile to borrow the NMI for the duration of it's
run.  Tested/works both as module and built-in.

While testing, I found that if kerneltop was generating NMIs at very
high frequency, the kernel may panic when oprofile registered it's
handler.  This turned out to be because oprofile registers it's handler
before reset_value has been allocated, so if an NMI comes in while it's
still setting up, kabOom.  Rather than try more invasive changes, I
followed the lead of other places in op_model_ppro.c, and simply
returned in that highly unlikely event.  (debug warnings attached)

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/oprofile/op_model_ppro.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 07c914555a5ee..85eb6268374f5 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 	u64 val;
 	int i;
 
+	/*
+	 * This can happen if perf counters are in use when
+	 * we steal the die notifier NMI.
+	 */
+	if (unlikely(!reset_value))
+		goto out;
+
 	for (i = 0 ; i < num_counters; ++i) {
 		if (!reset_value[i])
 			continue;
@@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
 		}
 	}
 
+out:
 	/* Only P6 based Pentium M need to re-unmask the apic vector but it
 	 * doesn't hurt other P6 variant */
 	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
-- 
GitLab


From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 9 Feb 2009 22:42:47 +1100
Subject: [PATCH 0076/2198] perf_counters: make software counters work as
 per-cpu counters

Impact: kernel crash fix

Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software
counter as a per-cpu counter would reliably crash the system, because
it calls __task_delta_exec with a null pointer.  The page fault,
context switch and cpu migration counters also won't function
correctly as per-cpu counters since they reference the current task.

This fixes the problem by redirecting the task_clock counter to the
cpu_clock counter when used as a per-cpu counter, and by implementing
per-cpu page fault, context switch and cpu migration counters.

Along the way, this:

- Initializes counter->ctx earlier, in perf_counter_alloc, so that
  sw_perf_counter_init can use it
- Adds code to kernel/sched.c to count task migrations into each
  cpu, in rq->nr_migrations_in
- Exports the per-cpu context switch and task migration counts
  via new functions added to kernel/sched.c
- Makes sure that if sw_perf_counter_init fails, we don't try to
  initialize the counter as a hardware counter.  Since the user has
  passed a negative, non-raw event type, they clearly don't intend
  for it to be interpreted as a hardware event.

Reported-by: "Zhang Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/perf_counter.c | 78 +++++++++++++++++++++++++------------------
 kernel/sched.c        | 17 ++++++++++
 3 files changed, 64 insertions(+), 33 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b85b10abf770e..1e5f70062a9c2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,6 +137,8 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern u64 cpu_nr_switches(int cpu);
+extern u64 cpu_nr_migrations(int cpu);
 
 struct seq_file;
 struct cfs_rq;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f27a7e9f3c417..544193cbc4784 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -20,6 +20,8 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 {
 	struct task_struct *task = ctx->task;
 
-	counter->ctx = ctx;
 	if (!task) {
 		/*
 		 * Per cpu counters are installed via an smp call and
@@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
-static u64 get_page_faults(void)
+#ifdef CONFIG_VM_EVENT_COUNTERS
+#define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
+#else
+#define cpu_page_faults()	0
+#endif
+
+static u64 get_page_faults(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->maj_flt + curr->min_flt;
+	if (curr)
+		return curr->maj_flt + curr->min_flt;
+	return cpu_page_faults();
 }
 
 static void page_faults_perf_counter_update(struct perf_counter *counter)
@@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_page_faults();
+	now = get_page_faults(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 
 static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * page-faults is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
 	return 0;
 }
 
@@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
 	.read		= page_faults_perf_counter_read,
 };
 
-static u64 get_context_switches(void)
+static u64 get_context_switches(struct perf_counter *counter)
 {
-	struct task_struct *curr = current;
+	struct task_struct *curr = counter->ctx->task;
 
-	return curr->nvcsw + curr->nivcsw;
+	if (curr)
+		return curr->nvcsw + curr->nivcsw;
+	return cpu_nr_switches(smp_processor_id());
 }
 
 static void context_switches_perf_counter_update(struct perf_counter *counter)
@@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_context_switches();
+	now = get_context_switches(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 
 static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * ->nvcsw + curr->nivcsw is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
 	return 0;
 }
 
@@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.read		= context_switches_perf_counter_read,
 };
 
-static inline u64 get_cpu_migrations(void)
+static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
-	return current->se.nr_migrations;
+	struct task_struct *curr = counter->ctx->task;
+
+	if (curr)
+		return curr->se.nr_migrations;
+	return cpu_nr_migrations(smp_processor_id());
 }
 
 static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
@@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
 	s64 delta;
 
 	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_cpu_migrations();
+	now = get_cpu_migrations(counter);
 
 	atomic64_set(&counter->hw.prev_count, now);
 
@@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 
 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-	/*
-	 * se.nr_migrations is a per-task value already,
-	 * so we dont have to clear it on switch-in.
-	 */
-
+	atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
 	return 0;
 }
 
@@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter)
 		hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
-		hw_ops = &perf_ops_task_clock;
+		/*
+		 * If the user instantiates this as a per-cpu counter,
+		 * use the cpu_clock counter instead.
+		 */
+		if (counter->ctx->task)
+			hw_ops = &perf_ops_task_clock;
+		else
+			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 		hw_ops = &perf_ops_page_faults;
@@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 static struct perf_counter *
 perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   int cpu,
+		   struct perf_counter_context *ctx,
 		   struct perf_counter *group_leader,
 		   gfp_t gfpflags)
 {
@@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
+	counter->ctx			= ctx;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
@@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	hw_ops = NULL;
 	if (!hw_event->raw && hw_event->type < 0)
 		hw_ops = sw_perf_counter_init(counter);
-	if (!hw_ops)
+	else
 		hw_ops = hw_perf_counter_init(counter);
 
 	if (!hw_ops) {
@@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
 	}
 
 	ret = -EINVAL;
-	counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL);
+	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+				     GFP_KERNEL);
 	if (!counter)
 		goto err_put_context;
 
@@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter,
 		parent_counter = parent_counter->parent;
 
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
-					    parent_counter->cpu, group_leader,
-					    GFP_KERNEL);
+					   parent_counter->cpu, child_ctx,
+					   group_leader, GFP_KERNEL);
 	if (!child_counter)
 		return NULL;
 
 	/*
 	 * Link it up in the child's context:
 	 */
-	child_counter->ctx = child_ctx;
 	child_counter->task = child;
 	list_add_counter(child_counter, child_ctx);
 	child_ctx->nr_counters++;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8db1a4cf2082f..173768f142ad7 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -558,6 +558,7 @@ struct rq {
 	struct load_weight load;
 	unsigned long nr_load_updates;
 	u64 nr_switches;
+	u64 nr_migrations_in;
 
 	struct cfs_rq cfs;
 	struct rt_rq rt;
@@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 #endif
 	if (old_cpu != new_cpu) {
 		p->se.nr_migrations++;
+		new_rq->nr_migrations_in++;
 #ifdef CONFIG_SCHEDSTATS
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
@@ -2810,6 +2812,21 @@ unsigned long nr_active(void)
 	return running + uninterruptible;
 }
 
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_switches(cpu) - number of context switches on that cpu
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_switches(int cpu)
+{
+	return cpu_rq(cpu)->nr_switches;
+}
+
+u64 cpu_nr_migrations(int cpu)
+{
+	return cpu_rq(cpu)->nr_migrations_in;
+}
+
 /*
  * Update rq->cpu_load[] statistics. This function is usually called every
  * scheduler tick (TICK_NSEC).
-- 
GitLab


From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 9 Feb 2009 07:38:50 +0100
Subject: [PATCH 0077/2198] perf_counters: account NMI interrupts

I noticed that kerneltop interrupts were accounted as NMI, but not their
perf counter origin.

Account NMI performance counter interrupts.

Signed-off-by: Mike Galbraith  <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>

 arch/x86/kernel/cpu/perf_counter.c |    2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8bb213323fe84..9901e46998d1a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		goto out;
 
 again:
+	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
 		struct perf_counter *counter = cpuc->counters[bit];
@@ -570,7 +571,6 @@ void perf_counter_unthrottle(void)
 void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
-	inc_irq_stat(apic_perf_irqs);
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	__smp_perf_counter_interrupt(regs, 0);
 
-- 
GitLab


From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 11 Feb 2009 14:35:35 +1100
Subject: [PATCH 0078/2198] perf_counters: allow users to count user, kernel
 and/or hypervisor events

Impact: new perf_counter feature

This extends the perf_counter_hw_event struct with bits that specify
that events in user, kernel and/or hypervisor mode should not be
counted (i.e. should be excluded), and adds code to program the PMU
mode selection bits accordingly on x86 and powerpc.

For software counters, we don't currently have the infrastructure to
distinguish which mode an event occurs in, so we currently fail the
counter initialization if the setting of the hw_event.exclude_* bits
would require us to distinguish.  Context switches and CPU migrations
are currently considered to occur in kernel mode.

On x86, this changes the previous policy that only root can count
kernel events.  Now non-root users can count kernel events or exclude
them.  Non-root users still can't use NMI events, though.  On x86 we
don't appear to have any way to control whether hypervisor events are
counted or not, so hw_event.exclude_hv is ignored.

On powerpc, the selection of whether to count events in user, kernel
and/or hypervisor mode is PMU-wide, not per-counter, so this adds a
check that the hw_event.exclude_* settings are the same as other events
on the PMU.  Counters being added to a group have to have the same
settings as the other hardware counters in the group.  Counters and
groups can only be enabled in hw_perf_group_sched_in or power_perf_enable
if they have the same settings as any other counters already on the
PMU.  If we are not running on a hypervisor, the exclude_hv setting
is ignored (by forcing it to 0) since we can't ever get any
hypervisor events.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++--
 arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++------
 include/linux/perf_counter.h       | 19 +++++----
 kernel/perf_counter.c              | 26 ++++++++++--
 4 files changed, 117 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5b0211348c73a..bd6ba85beb549 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -16,6 +16,7 @@
 #include <asm/reg.h>
 #include <asm/pmc.h>
 #include <asm/machdep.h>
+#include <asm/firmware.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 	return 0;
 }
 
+/*
+ * Check if newly-added counters have consistent settings for
+ * exclude_{user,kernel,hv} with each other and any previously
+ * added counters.
+ */
+static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+{
+	int eu, ek, eh;
+	int i, n;
+	struct perf_counter *counter;
+
+	n = n_prev + n_new;
+	if (n <= 1)
+		return 0;
+
+	eu = ctrs[0]->hw_event.exclude_user;
+	ek = ctrs[0]->hw_event.exclude_kernel;
+	eh = ctrs[0]->hw_event.exclude_hv;
+	if (n_prev == 0)
+		n_prev = 1;
+	for (i = n_prev; i < n; ++i) {
+		counter = ctrs[i];
+		if (counter->hw_event.exclude_user != eu ||
+		    counter->hw_event.exclude_kernel != ek ||
+		    counter->hw_event.exclude_hv != eh)
+			return -EAGAIN;
+	}
+	return 0;
+}
+
 static void power_perf_read(struct perf_counter *counter)
 {
 	long val, delta, prev;
@@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable)
 		goto out;
 	}
 
+	/*
+	 * Add in MMCR0 freeze bits corresponding to the
+	 * hw_event.exclude_* bits for the first counter.
+	 * We have already checked that all counters have the
+	 * same values for these bits as the first counter.
+	 */
+	counter = cpuhw->counter[0];
+	if (counter->hw_event.exclude_user)
+		cpuhw->mmcr[0] |= MMCR0_FCP;
+	if (counter->hw_event.exclude_kernel)
+		cpuhw->mmcr[0] |= MMCR0_FCS;
+	if (counter->hw_event.exclude_hv)
+		cpuhw->mmcr[0] |= MMCR0_FCHV;
+
 	/*
 	 * Write the new configuration to MMCR* with the freeze
 	 * bit set and set the hardware counters to their initial values.
@@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 			   &cpuhw->counter[n0], &cpuhw->events[n0]);
 	if (n < 0)
 		return -EAGAIN;
+	if (check_excludes(cpuhw->counter, n0, n))
+		return -EAGAIN;
 	if (power_check_constraints(cpuhw->events, n + n0))
 		return -EAGAIN;
 	cpuhw->n_counters = n0 + n;
@@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter)
 		goto out;
 	cpuhw->counter[n0] = counter;
 	cpuhw->events[n0] = counter->hw.config;
+	if (check_excludes(cpuhw->counter, n0, 1))
+		goto out;
 	if (power_check_constraints(cpuhw->events, n0 + 1))
 		goto out;
 
@@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter)
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
 
+	/*
+	 * If we are not running on a hypervisor, force the
+	 * exclude_hv bit to 0 so that we don't care what
+	 * the user set it to.  This also means that we don't
+	 * set the MMCR0_FCHV bit, which unconditionally freezes
+	 * the counters on the PPC970 variants used in Apple G5
+	 * machines (since MSR.HV is always 1 on those machines).
+	 */
+	if (!firmware_has_feature(FW_FEATURE_LPAR))
+		counter->hw_event.exclude_hv = 0;
+	
 	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware counters in the group.  We assume the counter
@@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		if (n < 0)
 			return NULL;
 	}
-	events[n++] = ev;
-	if (power_check_constraints(events, n))
+	events[n] = ev;
+	if (check_excludes(ctrs, n, 1))
+		return NULL;
+	if (power_check_constraints(events, n + 1))
 		return NULL;
 
-	counter->hw.config = events[n - 1];
+	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
 	return &power_perf_ops;
 }
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9901e46998d1a..383d4c6423a1c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		return -EINVAL;
 
 	/*
-	 * Count user events, and generate PMC IRQs:
+	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
 	 */
-	hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT;
+	hwc->config = ARCH_PERFMON_EVENTSEL_INT;
 
 	/*
-	 * If privileged enough, count OS events too, and allow
-	 * NMI events as well:
+	 * Count user and OS events unless requested not to.
 	 */
-	hwc->nmi = 0;
-	if (capable(CAP_SYS_ADMIN)) {
+	if (!hw_event->exclude_user)
+		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
+	if (!hw_event->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
-		if (hw_event->nmi)
-			hwc->nmi = 1;
-	}
+
+	/*
+	 * If privileged enough, allow NMI events:
+	 */
+	hwc->nmi = 0;
+	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+		hwc->nmi = 1;
 
 	hwc->irq_period		= hw_event->irq_period;
 	/*
@@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter,
 	int err;
 
 	/*
-	 * Enable IRQ generation (0x8) and ring-3 counting (0x2),
-	 * and enable ring-0 counting if allowed:
+	 * Enable IRQ generation (0x8),
+	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
+	 * if requested:
 	 */
-	bits = 0x8ULL | 0x2ULL;
+	bits = 0x8ULL;
+	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
+		bits |= 0x2;
 	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
 		bits |= 0x1;
 	bits <<= (idx * 4);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f55381fbcac98..c83f51d6e359f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -83,14 +83,17 @@ struct perf_counter_hw_event {
 	u64			irq_period;
 	u32			record_type;
 
-	u32			disabled     :  1, /* off by default      */
-				nmi	     :  1, /* NMI sampling        */
-				raw	     :  1, /* raw event type      */
-				inherit	     :  1, /* children inherit it */
-				pinned	     :  1, /* must always be on PMU */
-				exclusive    :  1, /* only counter on PMU */
-
-				__reserved_1 : 26;
+	u32			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+
+				__reserved_1 : 23;
 
 	u64			__reserved_2;
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 544193cbc4784..89d5e3fe9700e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter)
 {
 	const struct hw_perf_counter_ops *hw_ops = NULL;
 
+	/*
+	 * Software counters (currently) can't in general distinguish
+	 * between user, kernel and hypervisor events.
+	 * However, context switches and cpu migrations are considered
+	 * to be kernel events, and page faults are never hypervisor
+	 * events.
+	 */
 	switch (counter->hw_event.type) {
 	case PERF_COUNT_CPU_CLOCK:
-		hw_ops = &perf_ops_cpu_clock;
+		if (!(counter->hw_event.exclude_user ||
+		      counter->hw_event.exclude_kernel ||
+		      counter->hw_event.exclude_hv))
+			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
+		if (counter->hw_event.exclude_user ||
+		    counter->hw_event.exclude_kernel ||
+		    counter->hw_event.exclude_hv)
+			break;
 		/*
 		 * If the user instantiates this as a per-cpu counter,
 		 * use the cpu_clock counter instead.
@@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
-		hw_ops = &perf_ops_page_faults;
+		if (!(counter->hw_event.exclude_user ||
+		      counter->hw_event.exclude_kernel))
+			hw_ops = &perf_ops_page_faults;
 		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		hw_ops = &perf_ops_context_switches;
+		if (!counter->hw_event.exclude_kernel)
+			hw_ops = &perf_ops_context_switches;
 		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
-		hw_ops = &perf_ops_cpu_migrations;
+		if (!counter->hw_event.exclude_kernel)
+			hw_ops = &perf_ops_cpu_migrations;
 		break;
 	default:
 		break;
-- 
GitLab


From 5af759176cc767e7426f89764bde4996ebaaf419 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 11 Feb 2009 10:53:37 +0100
Subject: [PATCH 0079/2198] perfcounters: fix use after free in perf_release()

running...

  while true; do
    foo -d 1 -f 1 -c 100000 & sleep 1
    kerneltop -d 1 -f 1 -e 1 -c 25000 -p `pidof foo`
  done

  while true; do
    killall foo; killall kerneltop; sleep 2
  done

...in two shells with SLUB_DEBUG enabled produces flood of:
BUG task_struct: Poison overwritten.

Fix the use-after-free bug in perf_release().

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 89d5e3fe9700e..e0576c3fdb509 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1145,12 +1145,12 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_lock(&counter->mutex);
 
 	perf_counter_remove_from_context(counter);
-	put_context(ctx);
 
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
 	kfree(counter);
+	put_context(ctx);
 
 	return 0;
 }
-- 
GitLab


From 4bcf349a0f90d1e69eb35c6df0fa285c886c1cd6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 11 Feb 2009 13:53:19 +0100
Subject: [PATCH 0080/2198] perfcounters: fix refcounting bug, take 2

Only free child_counter if it has a parent; if it doesn't, then it
has a file pointing to it and we'll free it in perf_release.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e0576c3fdb509..fcefb0a726f30 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1958,14 +1958,13 @@ __perf_counter_exit_task(struct task_struct *child,
 		sync_child_counter(child_counter, parent_counter);
 		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
 					 list_entry) {
-			if (sub->parent)
+			if (sub->parent) {
 				sync_child_counter(sub, sub->parent);
-			kfree(sub);
+				kfree(sub);
+			}
 		}
-	}
-
-	if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count))
 		kfree(child_counter);
+	}
 }
 
 /*
-- 
GitLab


From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 13 Feb 2009 22:10:34 +1100
Subject: [PATCH 0081/2198] perfcounters: make context switch and migration
 software counters work again

Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the
context switch and migration software counters to report zero always.
With that commit, the software counters only count events that occur
between sched-in and sched-out for a task.  This is necessary for the
counter enable/disable prctls and ioctls to work.  However, the
context switch and migration counts are incremented after sched-out
for one task and before sched-in for the next.  Since the increment
doesn't occur while a task is scheduled in (as far as the software
counters are concerned) it doesn't count towards any counter.

Thus the context switch and migration counters need to count events
that occur at any time, provided the counter is enabled, not just
those that occur while the task is scheduled in (from the perf_counter
subsystem's point of view).  The problem though is that the software
counter code can't tell the difference between being enabled and being
scheduled in, and between being disabled and being scheduled out,
since we use the one pair of enable/disable entry points for both.
That is, the high-level disable operation simply arranges for the
counter to not be scheduled in any more, and the high-level enable
operation arranges for it to be scheduled in again.

One way to solve this would be to have sched_in/out operations in the
hw_perf_counter_ops struct as well as enable/disable.  However, this
takes a simpler approach: it adds a 'prev_state' field to the
perf_counter struct that allows a counter's enable method to know
whether the counter was previously disabled or just inactive
(scheduled out), and therefore whether the enable method is being
called as a result of a high-level enable or a schedule-in operation.

This then allows the context switch, migration and page fault counters
to reset their hw.prev_count value in their enable functions only if
they are called as a result of a high-level enable operation.
Although page faults would normally only occur while the counter is
scheduled in, this changes the page fault counter code too in case
there are ever circumstances where page faults get counted against a
task while its counters are not scheduled in.

Reported-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  1 +
 kernel/perf_counter.c        | 21 +++++++++++++++------
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c83f51d6e359f..32cd1acb7386c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -173,6 +173,7 @@ struct perf_counter {
 	const struct hw_perf_counter_ops *hw_ops;
 
 	enum perf_counter_active_state	state;
+	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
 	struct perf_counter_hw_event	hw_event;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fcefb0a726f30..ad62965828d3d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -444,6 +444,7 @@ static void __perf_install_in_context(void *info)
 
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
+	counter->prev_state = PERF_COUNTER_STATE_OFF;
 
 	/*
 	 * Don't put the counter on if it is disabled or if
@@ -562,6 +563,7 @@ static void __perf_counter_enable(void *info)
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
 
+	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
@@ -733,6 +735,7 @@ group_sched_in(struct perf_counter *group_counter,
 	if (ret)
 		return ret < 0 ? ret : 0;
 
+	group_counter->prev_state = group_counter->state;
 	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 		return -EAGAIN;
 
@@ -740,6 +743,7 @@ group_sched_in(struct perf_counter *group_counter,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		counter->prev_state = counter->state;
 		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 			partial_group = counter;
 			goto group_error;
@@ -1398,9 +1402,9 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
 
 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-	u64 now = task_clock_perf_counter_val(counter, 0);
-
-	atomic64_set(&counter->hw.prev_count, now);
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count,
+			     task_clock_perf_counter_val(counter, 0));
 
 	return 0;
 }
@@ -1455,7 +1459,8 @@ static void page_faults_perf_counter_read(struct perf_counter *counter)
 
 static int page_faults_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
 	return 0;
 }
 
@@ -1501,7 +1506,9 @@ static void context_switches_perf_counter_read(struct perf_counter *counter)
 
 static int context_switches_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, get_context_switches(counter));
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count,
+			     get_context_switches(counter));
 	return 0;
 }
 
@@ -1547,7 +1554,9 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
 
 static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
 {
-	atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter));
+	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
+		atomic64_set(&counter->hw.prev_count,
+			     get_cpu_migrations(counter));
 	return 0;
 }
 
-- 
GitLab


From 73ca2f8380311115723c7afe811f3ed1f0ba945e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Feb 2009 01:08:17 +0100
Subject: [PATCH 0082/2198] perfcounters: remove duplicate definition of
 LOCAL_PERF_VECTOR

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/irq_vectors.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index b66b518ff0003..b07278c55e9ef 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -111,11 +111,6 @@
  */
 #define LOCAL_PERF_VECTOR		0xee
 
-/*
- * Performance monitoring interrupt vector:
- */
-#define LOCAL_PERF_VECTOR	0xee
-
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
-- 
GitLab


From 37a25424252b6cff4dd4b1937ab6a1dbfcadabcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 16 Feb 2009 15:32:23 +0100
Subject: [PATCH 0083/2198] perfcounters: fix acpi_idle_do_entry() workaround

Fix merge error in drivers/acpi/processor_idle.c. This
resulted in non-working perfcounters on certain Nehalem
systems.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/acpi/processor_idle.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 259f6e806314c..08def2f20cd97 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -826,12 +826,9 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
 	u64 perf_flags;
 
-	u64 pctrl;
-
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
 	perf_flags = hw_perf_save_disable();
-	pctrl = hw_perf_save_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -847,7 +844,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
 	hw_perf_restore(perf_flags);
-	hw_perf_restore(pctrl);
 	start_critical_timings();
 }
 
-- 
GitLab


From d095cd46dac104e4d2a4967c7c19b55a12f78240 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Feb 2009 23:01:28 +1100
Subject: [PATCH 0084/2198] perfcounters/powerpc: Make exclude_kernel bit work
 on Apple G5 processors

Currently, setting hw_event.exclude_kernel does nothing on the PPC970
variants used in Apple G5 machines, because they have the HV (hypervisor)
bit in the MSR forced to 1, so as far as the PMU is concerned, the
kernel runs in hypervisor mode.  Thus we have to use the MMCR0_FCHV
(freeze counters in hypervisor mode) bit rather than the MMCR0_FCS
(freeze counters in supervisor mode) bit.

This checks the MSR.HV bit at startup, and if it is set, we set the
freeze_counters_kernel variable to MMCR0_FCHV (it was initialized to
MMCR0_FCS).  We then use that whenever we need to exclude kernel events.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bd6ba85beb549..6e27913ec0d88 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -32,6 +32,15 @@ DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
 struct power_pmu *ppmu;
 
+/*
+ * Normally, to ignore kernel events we set the FCS (freeze counters
+ * in supervisor mode) bit in MMCR0, but if the kernel runs with the
+ * hypervisor bit set in the MSR, or if we are running on a processor
+ * where the hypervisor bit is forced to 1 (as on Apple G5 processors),
+ * then we need to use the FCHV bit to ignore kernel events.
+ */
+static unsigned int freeze_counters_kernel = MMCR0_FCS;
+
 void perf_counter_print_debug(void)
 {
 }
@@ -364,7 +373,7 @@ void hw_perf_restore(u64 disable)
 	if (counter->hw_event.exclude_user)
 		cpuhw->mmcr[0] |= MMCR0_FCP;
 	if (counter->hw_event.exclude_kernel)
-		cpuhw->mmcr[0] |= MMCR0_FCS;
+		cpuhw->mmcr[0] |= freeze_counters_kernel;
 	if (counter->hw_event.exclude_hv)
 		cpuhw->mmcr[0] |= MMCR0_FCHV;
 
@@ -606,10 +615,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * If we are not running on a hypervisor, force the
 	 * exclude_hv bit to 0 so that we don't care what
-	 * the user set it to.  This also means that we don't
-	 * set the MMCR0_FCHV bit, which unconditionally freezes
-	 * the counters on the PPC970 variants used in Apple G5
-	 * machines (since MSR.HV is always 1 on those machines).
+	 * the user set it to.
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		counter->hw_event.exclude_hv = 0;
@@ -841,6 +847,13 @@ static int init_perf_counters(void)
 		ppmu = &power6_pmu;
 		break;
 	}
+
+	/*
+	 * Use FCHV to ignore kernel events if MSR.HV is set.
+	 */
+	if (mfmsr() & MSR_HV)
+		freeze_counters_kernel = MMCR0_FCHV;
+
 	return 0;
 }
 
-- 
GitLab


From 742bd95ba96e19b3f7196c3a0834ebc17c8ba006 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 24 Feb 2009 11:33:56 +1100
Subject: [PATCH 0085/2198] perfcounters/powerpc: Add support for POWER5
 processors

This adds the back-end for the PMU on the POWER5 processor.  This knows
how to use the fixed-function PMC5 and PMC6 (instructions completed and
run cycles).  Unlike POWER6, PMC5/6 obey the freeze conditions and can
generate interrupts, so their use doesn't impose any extra restrictions.

POWER5+ is different and is not supported by this patch.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   3 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power5-pmu.c   | 475 +++++++++++++++++++++++++++++
 3 files changed, 481 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power5-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 7c941ec3b23e3..b4c6f466164b6 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,8 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power6-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power5-pmu.o \
+				   power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6e27913ec0d88..112332d07fc26 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu)
 }
 
 extern struct power_pmu ppc970_pmu;
+extern struct power_pmu power5_pmu;
 extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
@@ -843,6 +844,9 @@ static int init_perf_counters(void)
 	case PV_970MP:
 		ppmu = &ppc970_pmu;
 		break;
+	case PV_POWER5:
+		ppmu = &power5_pmu;
+		break;
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
new file mode 100644
index 0000000000000..379ed1087ccaa
--- /dev/null
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -0,0 +1,475 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5 (not POWER5++)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *         <><>[  ><><>< ><> [  >[ >[ ><  ><  ><  ><  ><><><><><><>
+ *         T0T1 NC G0G1G2 G3  UC PS1PS2 B0  B1  B2  B3 P6P5P4P3P2P1
+ *
+ * T0 - TTM0 constraint
+ *     54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     37: UC3 error 0x20_0000_0000
+ *     36: FPU|IFU|ISU1 events needed 0x10_0000_0000
+ *     35: ISU0 events needed 0x08_0000_0000
+ *     34: IDU|GRS events needed 0x04_0000_0000
+ *
+ * PS1
+ *     33: PS1 error 0x2_0000_0000
+ *     31-32: count of events needing PMC1/2 0x1_8000_0000
+ *
+ * PS2
+ *     30: PS2 error 0x4000_0000
+ *     28-29: count of events needing PMC3/4 0x3000_0000
+ *
+ * B0
+ *     24-27: Byte 0 event source 0x0f00_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
+ *
+ * P1..P6
+ *     0-11: Count of events needing PMC1..PMC6
+ */
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull },
+	[PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull },
+	[PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull },
+	[PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull },
+	[PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull },
+	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
+};
+
+static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc <= 4)
+			grp = (pmc - 1) >> 1;
+		else if (event != 0x500009 && event != 0x600005)
+			return -1;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2; bytes 1 and 3 on PMC3/4.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2 field */
+		mask  |= 0x200000000ull;
+		value |= 0x080000000ull;
+	} else if (grp == 1) {
+		/* increment PMC3/4 field */
+		mask  |= 0x40000000ull;
+		value |= 0x10000000ull;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
+	{ 0x100009, 0x200009, 0x500009 },	/* PM_INST_CMPL */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.
+ */
+static int find_alternative_bdecode(unsigned int event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+	return -1;
+}
+
+static int power5_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, ae, nalt = 1;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+static int power5_compute_mmcr(unsigned int event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm, grp;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 6)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2 vs 3/4 use */
+			if (pmc <= 4)
+				++pmc_grp_use[(pmc - 1) >> 1];
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+	if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (isbus) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 2) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else if (pmc <= 4) {
+			/* Direct event */
+			--pmc;
+			if ((psel == 8 || psel == 0x10) && isbus && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+};
+
+struct power_pmu power5_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000090000555ull,
+	.test_adder = 0x3000490000000ull,
+	.compute_mmcr = power5_compute_mmcr,
+	.get_constraint = power5_get_constraint,
+	.get_alternatives = power5_get_alternatives,
+	.disable_pmc = power5_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5_generic_events),
+	.generic_events = power5_generic_events,
+};
-- 
GitLab


From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 26 Feb 2009 22:43:46 +1100
Subject: [PATCH 0086/2198] perfcounters: fix a few minor cleanliness issues

This fixes three issues noticed by Arnd Bergmann:

- Add #ifdef __KERNEL__ and move some things around in perf_counter.h
  to make sure only the bits that userspace needs are exported to
  userspace.

- Use __u64, __s64, __u32 types in the structs exported to userspace
  rather than u64, s64, u32.

- Make the sys_perf_counter_open syscall available to the SPUs on
  Cell platforms.

And one issue that I noticed in looking at the code again:

- Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get
  the proper handling of int arguments on ppc64 (and some other 64-bit
  architectures).

Reported-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/systbl.h |  2 +-
 include/linux/perf_counter.h      | 43 +++++++++++++++++--------------
 include/linux/syscalls.h          |  9 +++----
 kernel/perf_counter.c             |  6 ++---
 4 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index 4c8095f6bec0b..d312eec8abb93 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1)
 SYSCALL_SPU(dup3)
 SYSCALL_SPU(pipe2)
 SYSCALL(inotify_init1)
-SYSCALL(perf_counter_open)
+SYSCALL_SPU(perf_counter_open)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 32cd1acb7386c..186efaf496650 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -13,20 +13,8 @@
 #ifndef _LINUX_PERF_COUNTER_H
 #define _LINUX_PERF_COUNTER_H
 
-#include <asm/atomic.h>
-#include <asm/ioctl.h>
-
-#ifdef CONFIG_PERF_COUNTERS
-# include <asm/perf_counter.h>
-#endif
-
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/rculist.h>
-#include <linux/rcupdate.h>
-#include <linux/spinlock.h>
-
-struct task_struct;
+#include <linux/types.h>
+#include <linux/ioctl.h>
 
 /*
  * User-space ABI bits:
@@ -78,12 +66,12 @@ enum perf_counter_record_type {
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	s64			type;
+	__s64			type;
 
-	u64			irq_period;
-	u32			record_type;
+	__u64			irq_period;
+	__u32			record_type;
 
-	u32			disabled       :  1, /* off by default        */
+	__u32			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
 				raw	       :  1, /* raw event type        */
 				inherit	       :  1, /* children inherit it   */
@@ -95,7 +83,7 @@ struct perf_counter_hw_event {
 
 				__reserved_1 : 23;
 
-	u64			__reserved_2;
+	__u64			__reserved_2;
 };
 
 /*
@@ -104,10 +92,24 @@ struct perf_counter_hw_event {
 #define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
 #define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
 
+#ifdef __KERNEL__
 /*
- * Kernel-internal data types:
+ * Kernel-internal data types and definitions:
  */
 
+#ifdef CONFIG_PERF_COUNTERS
+# include <asm/perf_counter.h>
+#endif
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+
+struct task_struct;
+
 /**
  * struct hw_perf_counter - performance counter hardware details:
  */
@@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 #endif
 
+#endif /* __KERNEL__ */
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 88255d3261a42..28ef2be839c70 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *);
 int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd);
+asmlinkage long sys_perf_counter_open(
+		const struct perf_counter_hw_event __user *hw_event_uptr,
+		pid_t pid, int cpu, int group_fd);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ad62965828d3d..16b14ba99d34d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
-asmlinkage int
-sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user,
-		      pid_t pid, int cpu, int group_fd)
+SYSCALL_DEFINE4(perf_counter_open,
+		const struct perf_counter_hw_event __user *, hw_event_uptr,
+		pid_t, pid, int, cpu, int, group_fd)
 {
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
-- 
GitLab


From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 27 Feb 2009 18:09:09 +0530
Subject: [PATCH 0087/2198] x86: prepare perf_counter to add more cpus

Introduced  struct pmc_x86_ops to add more cpus.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++--------
 1 file changed, 78 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 383d4c6423a1c..a3c88529bb72e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -3,6 +3,7 @@
  *
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
+ *  Copyright(C) 2009 Jaswinder Singh Rajput
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -38,10 +39,24 @@ struct cpu_hw_counters {
 };
 
 /*
- * Intel PerfMon v3. Used on Core2 and later.
+ * struct pmc_x86_ops - performance counter x86 ops
  */
+struct pmc_x86_ops {
+	u64 (*save_disable_all)		(void);
+	void (*restore_all)		(u64 ctrl);
+	unsigned eventsel;
+	unsigned perfctr;
+	int (*event_map)		(int event);
+	int max_events;
+};
+
+static struct pmc_x86_ops *pmc_ops;
+
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
+/*
+ * Intel PerfMon v3. Used on Core2 and later.
+ */
 static const int intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x003c,
@@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
+static int pmc_intel_event_map(int event)
+{
+	return intel_perfmon_event_map[event];
+}
 
 /*
  * Propagate counter elapsed time into the generic counter.
@@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (hw_event->raw) {
 		hwc->config |= hw_event->type;
 	} else {
-		if (hw_event->type >= max_intel_perfmon_events)
+		if (hw_event->type >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= intel_perfmon_event_map[hw_event->type];
+		hwc->config |= pmc_ops->event_map(hw_event->type);
 	}
 	counter->wakeup_pending = 0;
 
 	return 0;
 }
 
-u64 hw_perf_save_disable(void)
+static u64 pmc_intel_save_disable_all(void)
 {
 	u64 ctrl;
 
-	if (unlikely(!perf_counters_initialized))
-		return 0;
-
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
 
 	return ctrl;
 }
+
+u64 hw_perf_save_disable(void)
+{
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
+	return pmc_ops->save_disable_all();
+}
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+static void pmc_intel_restore_all(u64 ctrl)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+}
+
 void hw_perf_restore(u64 ctrl)
 {
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	pmc_ops->restore_all(ctrl);
 }
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
@@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES]))
+	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -339,8 +367,8 @@ static int pmc_generic_enable(struct perf_counter *counter)
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = MSR_ARCH_PERFMON_EVENTSEL0;
-		hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0;
+		hwc->config_base  = pmc_ops->eventsel;
+		hwc->counter_base = pmc_ops->perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -386,8 +414,8 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
-		rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
+		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
+		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
-void __init init_hw_perf_counters(void)
+static struct pmc_x86_ops pmc_intel_ops = {
+	.save_disable_all	= pmc_intel_save_disable_all,
+	.restore_all		= pmc_intel_restore_all,
+	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
+	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
+	.event_map		= pmc_intel_event_map,
+	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+};
+
+static struct pmc_x86_ops *pmc_intel_init(void)
 {
 	union cpuid10_eax eax;
 	unsigned int ebx;
 	unsigned int unused;
 	union cpuid10_edx edx;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
 	 */
 	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return;
+		return NULL;
 
 	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
-
 	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
-	printk(KERN_INFO "... num counters:    %d\n", eax.split.num_counters);
+	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
+	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+
 	nr_counters_generic = eax.split.num_counters;
+	nr_counters_fixed = edx.split.num_counters_fixed;
+	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+
+	return &pmc_intel_ops;
+}
+
+void __init init_hw_perf_counters(void)
+{
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return;
+
+	switch (boot_cpu_data.x86_vendor) {
+	case X86_VENDOR_INTEL:
+		pmc_ops = pmc_intel_init();
+		break;
+	}
+	if (!pmc_ops)
+		return;
+
+	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
 	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
 		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_counters_generic) - 1;
 	perf_max_counters = nr_counters_generic;
 
-	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
-	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
 
-	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
-
-	nr_counters_fixed = edx.split.num_counters_fixed;
 	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
 		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-- 
GitLab


From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Fri, 27 Feb 2009 20:15:14 +0530
Subject: [PATCH 0088/2198] x86: AMD Support for perf_counter

Supported basic performance counter for AMD K7 and later:

$ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null

 Performance counter stats for 'ls':

      12.298610  task clock ticks     (msecs)

        3298477  CPU cycles           (events)
        1406354  instructions         (events)
         749035  cache references     (events)
          16939  cache misses         (events)
         100589  branches             (events)
          11159  branch misses        (events)
       7.627540  cpu clock ticks      (msecs)
      12.298610  task clock ticks     (msecs)
            500  pagefaults           (events)
              6  context switches     (events)
              3  CPU migrations       (events)

 Wall-clock time elapsed:     8.672290 msecs

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c          |  4 ++
 arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 25423a5b80ed2..edcde52bd1709 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
+	/* Enable Performance counter for K7 and later */
+	if (c->x86 > 6 && c->x86 <= 0x11)
+		set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
+
 	if (!c->x86_model_id[0]) {
 		switch (c->x86) {
 		case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a3c88529bb72e..266618aa1a032 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -73,6 +73,24 @@ static int pmc_intel_event_map(int event)
 	return intel_perfmon_event_map[event];
 }
 
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const int amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
+  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+};
+
+static int pmc_amd_event_map(int event)
+{
+	return amd_perfmon_event_map[event];
+}
+
 /*
  * Propagate counter elapsed time into the generic counter.
  * Can only be executed on the CPU where the counter is active.
@@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * so we install an artificial 1<<31 period regardless of
 	 * the generic counter period:
 	 */
-	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-		hwc->irq_period = 0x7FFFFFFF;
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
+			hwc->irq_period = 0x7FFFFFFF;
 
 	atomic64_set(&hwc->period_left, hwc->irq_period);
 
@@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void)
 	return ctrl;
 }
 
+static u64 pmc_amd_save_disable_all(void)
+{
+	int idx;
+	u64 val, ctrl = 0;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			ctrl |= (1 << idx);
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+	}
+
+	return ctrl;
+}
+
 u64 hw_perf_save_disable(void)
 {
 	if (unlikely(!perf_counters_initialized))
@@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 
+static void pmc_amd_restore_all(u64 ctrl)
+{
+	u64 val;
+	int idx;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		if (ctrl & (1 << idx)) {
+			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		}
+	}
+}
+
 void hw_perf_restore(u64 ctrl)
 {
 	if (unlikely(!perf_counters_initialized))
@@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return -1;
+
 	if (unlikely(hwc->nmi))
 		return -1;
 
@@ -401,6 +453,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -411,6 +464,7 @@ void perf_counter_print_debug(void)
 	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
 	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
 	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
+	}
 	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
@@ -588,6 +642,9 @@ void perf_counter_unthrottle(void)
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return;
+
 	if (unlikely(!perf_counters_initialized))
 		return;
 
@@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = {
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
+static struct pmc_x86_ops pmc_amd_ops = {
+	.save_disable_all	= pmc_amd_save_disable_all,
+	.restore_all		= pmc_amd_restore_all,
+	.eventsel		= MSR_K7_EVNTSEL0,
+	.perfctr		= MSR_K7_PERFCTR0,
+	.event_map		= pmc_amd_event_map,
+	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+};
+
 static struct pmc_x86_ops *pmc_intel_init(void)
 {
 	union cpuid10_eax eax;
@@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	return &pmc_intel_ops;
 }
 
+static struct pmc_x86_ops *pmc_amd_init(void)
+{
+	nr_counters_generic = 4;
+	nr_counters_fixed = 0;
+
+	printk(KERN_INFO "AMD Performance Monitoring support detected.\n");
+
+	return &pmc_amd_ops;
+}
+
 void __init init_hw_perf_counters(void)
 {
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void)
 	case X86_VENDOR_INTEL:
 		pmc_ops = pmc_intel_init();
 		break;
+	case X86_VENDOR_AMD:
+		pmc_ops = pmc_amd_init();
+		break;
 	}
 	if (!pmc_ops)
 		return;
-- 
GitLab


From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 28 Feb 2009 18:37:49 +0530
Subject: [PATCH 0089/2198] x86: decent declarations in perf_counter.c

Impact: cleanup

making decent declrations for struct pmc_x86_ops and
fix checkpatch error:
 ERROR: Macros with complex values should be enclosed in parenthesis

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 266618aa1a032..a1f3646a3e8e6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -42,12 +42,12 @@ struct cpu_hw_counters {
  * struct pmc_x86_ops - performance counter x86 ops
  */
 struct pmc_x86_ops {
-	u64 (*save_disable_all)		(void);
-	void (*restore_all)		(u64 ctrl);
-	unsigned eventsel;
-	unsigned perfctr;
-	int (*event_map)		(int event);
-	int max_events;
+	u64		(*save_disable_all)(void);
+	void		(*restore_all)(u64 ctrl);
+	unsigned	eventsel;
+	unsigned	perfctr;
+	int		(*event_map)(int event);
+	int		max_events;
 };
 
 static struct pmc_x86_ops *pmc_ops;
@@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 /*
  * Maximum interrupt frequency of 100KHz per CPU
  */
-#define PERFMON_MAX_INTERRUPTS 100000/HZ
+#define PERFMON_MAX_INTERRUPTS (100000/HZ)
 
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
-- 
GitLab


From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Sat, 28 Feb 2009 18:45:39 +0530
Subject: [PATCH 0090/2198] x86: use pr_info in perf_counter.c

Impact: cleanup

using pr_info in perf_counter.c fixes various 80 characters warnings and
also indenting for conditional statement

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++---------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a1f3646a3e8e6..3b65f19a6681a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -454,18 +454,18 @@ void perf_counter_print_debug(void)
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-	rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
-	rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-
-	printk(KERN_INFO "\n");
-	printk(KERN_INFO "CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
-	printk(KERN_INFO "CPU#%d: status:     %016llx\n", cpu, status);
-	printk(KERN_INFO "CPU#%d: overflow:   %016llx\n", cpu, overflow);
-	printk(KERN_INFO "CPU#%d: fixed:      %016llx\n", cpu, fixed);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
+		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+
+		pr_info("\n");
+		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
+		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
+		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
+		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	}
-	printk(KERN_INFO "CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
@@ -473,17 +473,17 @@ void perf_counter_print_debug(void)
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n",
 			cpu, idx, pmc_ctrl);
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d count: %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
-		printk(KERN_INFO "CPU#%d:   gen-PMC%d left:  %016llx\n",
+		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
 	for (idx = 0; idx < nr_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
-		printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n",
+		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
 	}
 	local_irq_enable();
@@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return NULL;
 
-	printk(KERN_INFO "Intel Performance Monitoring support detected.\n");
-	printk(KERN_INFO "... version:         %d\n", eax.split.version_id);
-	printk(KERN_INFO "... bit width:       %d\n", eax.split.bit_width);
-	printk(KERN_INFO "... mask length:     %d\n", eax.split.mask_length);
+	pr_info("Intel Performance Monitoring support detected.\n");
+	pr_info("... version:         %d\n", eax.split.version_id);
+	pr_info("... bit width:       %d\n", eax.split.bit_width);
+	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
@@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 
-	printk(KERN_INFO "AMD Performance Monitoring support detected.\n");
+	pr_info("AMD Performance Monitoring support detected.\n");
 
 	return &pmc_amd_ops;
 }
@@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void)
 	if (!pmc_ops)
 		return;
 
-	printk(KERN_INFO "... num counters:    %d\n", nr_counters_generic);
+	pr_info("... num counters:    %d\n", nr_counters_generic);
 	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
 		nr_counters_generic = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << nr_counters_generic) - 1;
 	perf_max_counters = nr_counters_generic;
 
-	printk(KERN_INFO "... value mask:      %016Lx\n", counter_value_mask);
+	pr_info("... value mask:      %016Lx\n", counter_value_mask);
 
 	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
 		nr_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 			nr_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	printk(KERN_INFO "... fixed counters:  %d\n", nr_counters_fixed);
+	pr_info("... fixed counters:  %d\n", nr_counters_fixed);
 
 	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	printk(KERN_INFO "... counter mask:    %016Lx\n", perf_counter_mask);
+	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
-- 
GitLab


From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 4 Mar 2009 20:36:51 +1100
Subject: [PATCH 0091/2198] perfcounters: provide expansion room in the ABI

Impact: ABI change

This expands several fields in the perf_counter_hw_event struct and adds
a "flags" argument to the perf_counter_open system call, in order that
features can be added in future without ABI changes.

In particular the record_type field is expanded to 64 bits, and the
space for flag bits has been expanded from 32 to 64 bits.

This also adds some new fields:

* read_format (64 bits) is intended to provide a way to specify what
  userspace wants to get back when it does a read() on a simple
  (non-interrupting) counter;

* exclude_idle (1 bit) provides a way for userspace to ask that events
  that occur when the cpu is idle be excluded;

* extra_config_len will provide a way for userspace to supply an
  arbitrary amount of extra machine-specific PMU configuration data
  immediately following the perf_counter_hw_event struct, to allow
  sophisticated users to program things such as instruction matching
  CAMs and address range registers;

* __reserved_3 and __reserved_4 provide space for future expansion.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 include/linux/perf_counter.h | 12 +++++++++---
 include/linux/syscalls.h     |  2 +-
 kernel/perf_counter.c        | 10 +++++++---
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 186efaf496650..c42455ab15589 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -69,9 +69,10 @@ struct perf_counter_hw_event {
 	__s64			type;
 
 	__u64			irq_period;
-	__u32			record_type;
+	__u64			record_type;
+	__u64			read_format;
 
-	__u32			disabled       :  1, /* off by default        */
+	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
 				raw	       :  1, /* raw event type        */
 				inherit	       :  1, /* children inherit it   */
@@ -80,10 +81,15 @@ struct perf_counter_hw_event {
 				exclude_user   :  1, /* don't count user      */
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
 
-				__reserved_1 : 23;
+				__reserved_1 : 55;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
 
 	__u64			__reserved_2;
+	__u64			__reserved_3;
 };
 
 /*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 28ef2be839c70..ab1d772473954 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 asmlinkage long sys_perf_counter_open(
 		const struct perf_counter_hw_event __user *hw_event_uptr,
-		pid_t pid, int cpu, int group_fd);
+		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 16b14ba99d34d..b2e838959f3e4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 }
 
 /**
- * sys_perf_task_open - open a performance counter, associate it to a task/cpu
+ * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
  * @hw_event_uptr:	event type attributes for monitoring/sampling
  * @pid:		target pid
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
-SYSCALL_DEFINE4(perf_counter_open,
+SYSCALL_DEFINE5(perf_counter_open,
 		const struct perf_counter_hw_event __user *, hw_event_uptr,
-		pid_t, pid, int, cpu, int, group_fd)
+		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_counter *counter, *group_leader;
 	struct perf_counter_hw_event hw_event;
@@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open,
 	int fput_needed2 = 0;
 	int ret;
 
+	/* for future expandability... */
+	if (flags)
+		return -EINVAL;
+
 	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
 		return -EFAULT;
 
-- 
GitLab


From 2485e5184452ccbda34ff83883883d9107800dff Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 12:33:16 +0100
Subject: [PATCH 0092/2198] perfcounters: fix reserved bits sizing

The sum of bits is 65 currently not 64 - so reduce the # of reserved bits
from 55 to 54.

Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c42455ab15589..dde564517b661 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -83,7 +83,7 @@ struct perf_counter_hw_event {
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
 
-				__reserved_1 : 55;
+				__reserved_1   : 54;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
-- 
GitLab


From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Mar 2009 18:08:27 +0100
Subject: [PATCH 0093/2198] perfcounters: IRQ and NMI support on AMD CPUs

The below completes the K7+ performance counter support:

 - IRQ support
 - NMI support

KernelTop output works now as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1236273633.5187.286.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 272 ++++++++++++++++++++++++-----
 1 file changed, 228 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3b65f19a6681a..6ebe9abf6aefb 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly;
 static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 static u64 counter_value_mask __read_mostly;
+static int counter_value_bits __read_mostly;
 
 static int nr_counters_fixed __read_mostly;
 
@@ -35,7 +36,9 @@ struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
-	u64			global_enable;
+	u64			throttle_ctrl;
+	u64			active_mask;
+	int			enabled;
 };
 
 /*
@@ -43,21 +46,28 @@ struct cpu_hw_counters {
  */
 struct pmc_x86_ops {
 	u64		(*save_disable_all)(void);
-	void		(*restore_all)(u64 ctrl);
+	void		(*restore_all)(u64);
+	u64		(*get_status)(u64);
+	void		(*ack_status)(u64);
+	void		(*enable)(int, u64);
+	void		(*disable)(int, u64);
 	unsigned	eventsel;
 	unsigned	perfctr;
-	int		(*event_map)(int event);
+	u64		(*event_map)(int);
+	u64		(*raw_event)(u64);
 	int		max_events;
 };
 
 static struct pmc_x86_ops *pmc_ops;
 
-static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
+static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
+	.enabled = 1,
+};
 
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
-static const int intel_perfmon_event_map[] =
+static const u64 intel_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x003c,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static int pmc_intel_event_map(int event)
+static u64 pmc_intel_event_map(int event)
 {
 	return intel_perfmon_event_map[event];
 }
 
+static u64 pmc_intel_raw_event(u64 event)
+{
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FF
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000
+
+#define CORE_EVNTSEL_MASK 		\
+	(CORE_EVNTSEL_EVENT_MASK |	\
+	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_COUNTER_MASK)
+
+	return event & CORE_EVNTSEL_MASK;
+}
+
 /*
  * AMD Performance Monitor K7 and later.
  */
-static const int amd_perfmon_event_map[] =
+static const u64 amd_perfmon_event_map[] =
 {
   [PERF_COUNT_CPU_CYCLES]		= 0x0076,
   [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
@@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-static int pmc_amd_event_map(int event)
+static u64 pmc_amd_event_map(int event)
 {
 	return amd_perfmon_event_map[event];
 }
 
+static u64 pmc_amd_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FF
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000
+
+#define K7_EVNTSEL_MASK			\
+	(K7_EVNTSEL_EVENT_MASK |	\
+	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_COUNTER_MASK)
+
+	return event & K7_EVNTSEL_MASK;
+}
+
 /*
  * Propagate counter elapsed time into the generic counter.
  * Can only be executed on the CPU where the counter is active.
@@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (hw_event->raw) {
-		hwc->config |= hw_event->type;
+		hwc->config |= pmc_ops->raw_event(hw_event->type);
 	} else {
 		if (hw_event->type >= pmc_ops->max_events)
 			return -EINVAL;
@@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void)
 
 static u64 pmc_amd_save_disable_all(void)
 {
-	int idx;
-	u64 val, ctrl = 0;
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	int enabled, idx;
+
+	enabled = cpuc->enabled;
+	cpuc->enabled = 0;
+	barrier();
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
+		u64 val;
+
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
-			ctrl |= (1 << idx);
-		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
+			val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		}
 	}
 
-	return ctrl;
+	return enabled;
 }
 
 u64 hw_perf_save_disable(void)
@@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void)
 
 	return pmc_ops->save_disable_all();
 }
+/*
+ * Exported because of ACPI idle
+ */
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
 static void pmc_intel_restore_all(u64 ctrl)
@@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl)
 
 static void pmc_amd_restore_all(u64 ctrl)
 {
-	u64 val;
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
 
+	cpuc->enabled = ctrl;
+	barrier();
+	if (!ctrl)
+		return;
+
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (ctrl & (1 << idx)) {
+		if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) {
+			u64 val;
+
 			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
@@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl)
 
 	pmc_ops->restore_all(ctrl);
 }
+/*
+ * Exported because of ACPI idle
+ */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
+static u64 pmc_intel_get_status(u64 mask)
+{
+	u64 status;
+
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+
+	return status;
+}
+
+static u64 pmc_amd_get_status(u64 mask)
+{
+	u64 status = 0;
+	int idx;
+
+	for (idx = 0; idx < nr_counters_generic; idx++) {
+		s64 val;
+
+		if (!(mask & (1 << idx)))
+			continue;
+
+		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
+		val <<= (64 - counter_value_bits);
+		if (val >= 0)
+			status |= (1 << idx);
+	}
+
+	return status;
+}
+
+static u64 hw_perf_get_status(u64 mask)
+{
+	if (unlikely(!perf_counters_initialized))
+		return 0;
+
+	return pmc_ops->get_status(mask);
+}
+
+static void pmc_intel_ack_status(u64 ack)
+{
+	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+}
+
+static void pmc_amd_ack_status(u64 ack)
+{
+}
+
+static void hw_perf_ack_status(u64 ack)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->ack_status(ack);
+}
+
+static void pmc_intel_enable(int idx, u64 config)
+{
+	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
+			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
+}
+
+static void pmc_amd_enable(int idx, u64 config)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	set_bit(idx, (unsigned long *)&cpuc->active_mask);
+	if (cpuc->enabled)
+		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
+}
+
+static void hw_perf_enable(int idx, u64 config)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->enable(idx, config);
+}
+
+static void pmc_intel_disable(int idx, u64 config)
+{
+	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
+}
+
+static void pmc_amd_disable(int idx, u64 config)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	clear_bit(idx, (unsigned long *)&cpuc->active_mask);
+	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
+
+}
+
+static void hw_perf_disable(int idx, u64 config)
+{
+	if (unlikely(!perf_counters_initialized))
+		return;
+
+	pmc_ops->disable(idx, config);
+}
+
 static inline void
 __pmc_fixed_disable(struct perf_counter *counter,
 		    struct hw_perf_counter *hwc, unsigned int __idx)
@@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter,
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
 	else
-		wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+		hw_perf_disable(idx, hwc->config);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter,
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_enable(counter, hwc, idx);
 	else
-		wrmsr(hwc->config_base + idx,
-		      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
+		hw_perf_enable(idx, hwc->config);
 }
 
 static int
@@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+	int ret = 0;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
-
-	/* Disable counters globally */
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-	ack_APIC_irq();
+	cpuc->throttle_ctrl = hw_perf_save_disable();
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	status = hw_perf_get_status(cpuc->throttle_ctrl);
 	if (!status)
 		goto out;
 
+	ret = 1;
 again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
@@ -618,12 +773,12 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 		}
 	}
 
-	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
+	hw_perf_ack_status(ack);
 
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
+	status = hw_perf_get_status(cpuc->throttle_ctrl);
 	if (status)
 		goto again;
 out:
@@ -631,32 +786,27 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
 	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		hw_perf_restore(cpuc->throttle_ctrl);
+
+	return ret;
 }
 
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
-	u64 global_enable;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		return;
-
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	cpuc = &per_cpu(cpu_hw_counters, smp_processor_id());
+	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
 			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
+		hw_perf_restore(cpuc->throttle_ctrl);
 	}
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable);
-	if (unlikely(cpuc->global_enable && !global_enable))
-		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable);
 	cpuc->interrupts = 0;
 }
 
@@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
+	ack_APIC_irq();
 	__smp_perf_counter_interrupt(regs, 0);
-
 	irq_exit();
 }
 
@@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
 	struct die_args *args = __args;
 	struct pt_regs *regs;
+	int ret;
+
+	switch (cmd) {
+	case DIE_NMI:
+	case DIE_NMI_IPI:
+		break;
 
-	if (likely(cmd != DIE_NMI_IPI))
+	default:
 		return NOTIFY_DONE;
+	}
 
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	__smp_perf_counter_interrupt(regs, 1);
+	ret = __smp_perf_counter_interrupt(regs, 1);
 
-	return NOTIFY_STOP;
+	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
@@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct pmc_x86_ops pmc_intel_ops = {
 	.save_disable_all	= pmc_intel_save_disable_all,
 	.restore_all		= pmc_intel_restore_all,
+	.get_status		= pmc_intel_get_status,
+	.ack_status		= pmc_intel_ack_status,
+	.enable			= pmc_intel_enable,
+	.disable		= pmc_intel_disable,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
 	.event_map		= pmc_intel_event_map,
+	.raw_event		= pmc_intel_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
 static struct pmc_x86_ops pmc_amd_ops = {
 	.save_disable_all	= pmc_amd_save_disable_all,
 	.restore_all		= pmc_amd_restore_all,
+	.get_status		= pmc_amd_get_status,
+	.ack_status		= pmc_amd_ack_status,
+	.enable			= pmc_amd_enable,
+	.disable		= pmc_amd_disable,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
 	.event_map		= pmc_amd_event_map,
+	.raw_event		= pmc_amd_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
@@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 
 static struct pmc_x86_ops *pmc_amd_init(void)
 {
+	u64 old;
+	int bits;
+
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
+	counter_value_mask = ~0ULL;
+
+	rdmsrl(MSR_K7_PERFCTR0, old);
+	wrmsrl(MSR_K7_PERFCTR0, counter_value_mask);
+	/*
+	 * read the truncated mask
+	 */
+	rdmsrl(MSR_K7_PERFCTR0, counter_value_mask);
+	wrmsrl(MSR_K7_PERFCTR0, old);
+
+	bits = 32 + fls(counter_value_mask >> 32);
+	if (bits == 32)
+		bits = fls((u32)counter_value_mask);
+	counter_value_bits = bits;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-- 
GitLab


From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 5 Mar 2009 20:34:21 +0100
Subject: [PATCH 0094/2198] perfcounters: IRQ and NMI support on AMD CPUs, fix

The BKGD suggests that counter width on AMD CPUs is 48 for all
existing models (it certainly is for mine).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6ebe9abf6aefb..f5853718d4d37 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
-	counter_value_mask = ~0ULL;
-
-	rdmsrl(MSR_K7_PERFCTR0, old);
-	wrmsrl(MSR_K7_PERFCTR0, counter_value_mask);
-	/*
-	 * read the truncated mask
-	 */
-	rdmsrl(MSR_K7_PERFCTR0, counter_value_mask);
-	wrmsrl(MSR_K7_PERFCTR0, old);
-
-	bits = 32 + fls(counter_value_mask >> 32);
-	if (bits == 32)
-		bits = fls((u32)counter_value_mask);
-	counter_value_bits = bits;
+	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
+	counter_value_bits = 48;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-- 
GitLab


From 86028598de16538f02519141756ccf4accfc29a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 5 Mar 2009 14:05:57 +1100
Subject: [PATCH 0095/2198] perfcounters/powerpc: fix oops with multiple
 counters in a group

Impact: fix oops-causing bug

This fixes a bug in the powerpc hw_perf_counter_init where the code
didn't initialize ctrs[n] before passing the ctrs array to check_excludes,
leading to possible oopses and other incorrect behaviour.  This fixes it
by initializing ctrs[n] correctly.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 112332d07fc26..4fec112386fce 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -633,6 +633,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 			return NULL;
 	}
 	events[n] = ev;
+	ctrs[n] = counter;
 	if (check_excludes(ctrs, n, 1))
 		return NULL;
 	if (power_check_constraints(events, n + 1))
-- 
GitLab


From aabbaa6036fd847c583f585c6bae82b5a033e6c7 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 6 Mar 2009 16:27:10 +1100
Subject: [PATCH 0096/2198] perfcounters/powerpc: add support for POWER5+
 processors

Impact: more hardware support

This adds the back-end for the PMU on the POWER5+ processors (i.e. GS,
including GS DD3 aka POWER5++).  This doesn't use the fixed-function
PMC5 and PMC6 since they don't respect the freeze conditions and don't
generate interrupts, as on POWER6.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   4 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power5+-pmu.c  | 452 +++++++++++++++++++++++++++++
 3 files changed, 458 insertions(+), 2 deletions(-)
 create mode 100644 arch/powerpc/kernel/power5+-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index b4c6f466164b6..49851e0d8fdeb 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,8 +94,8 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o power5-pmu.o \
-				   power6-pmu.o
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o \
+				   power5-pmu.o power5+-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4fec112386fce..162f3981fa27b 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -826,6 +826,7 @@ void hw_perf_counter_setup(int cpu)
 
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
+extern struct power_pmu power5p_pmu;
 extern struct power_pmu power6_pmu;
 
 static int init_perf_counters(void)
@@ -848,6 +849,9 @@ static int init_perf_counters(void)
 	case PV_POWER5:
 		ppmu = &power5_pmu;
 		break;
+	case PV_POWER5p:
+		ppmu = &power5p_pmu;
+		break;
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
new file mode 100644
index 0000000000000..cec21ea65b0e2
--- /dev/null
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -0,0 +1,452 @@
+/*
+ * Performance counter support for POWER5 (not POWER5++) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3)
+ */
+#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_BYTE_SH	12	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	7
+#define PM_GRS_SH	8	/* Storage subsystem mux select */
+#define PM_GRS_MSK	7
+#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */
+#define PM_PMCSEL_MSK	0x7f
+
+/* Values in PM_UNIT field */
+#define PM_FPU		0
+#define PM_ISU0		1
+#define PM_IFU		2
+#define PM_ISU1		3
+#define PM_IDU		4
+#define PM_ISU0_ALT	6
+#define PM_GRS		7
+#define PM_LSU0		8
+#define PM_LSU1		0xc
+#define PM_LASTUNIT	0xc
+
+/*
+ * Bits in MMCR1 for POWER5+
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTM1SEL_SH	60
+#define MMCR1_TTM2SEL_SH	58
+#define MMCR1_TTM3SEL_SH	56
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	54
+#define MMCR1_TD_CP_DBG1SEL_SH	52
+#define MMCR1_TD_CP_DBG2SEL_SH	50
+#define MMCR1_TD_CP_DBG3SEL_SH	48
+#define MMCR1_GRS_L2SEL_SH	46
+#define MMCR1_GRS_L2SEL_MSK	3
+#define MMCR1_GRS_L3SEL_SH	44
+#define MMCR1_GRS_L3SEL_MSK	3
+#define MMCR1_GRS_MCSEL_SH	41
+#define MMCR1_GRS_MCSEL_MSK	7
+#define MMCR1_GRS_FABSEL_SH	39
+#define MMCR1_GRS_FABSEL_MSK	3
+#define MMCR1_PMC1_ADDER_SEL_SH	35
+#define MMCR1_PMC2_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC1SEL_SH	25
+#define MMCR1_PMC2SEL_SH	17
+#define MMCR1_PMC3SEL_SH	9
+#define MMCR1_PMC4SEL_SH	1
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0x7f
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *             [  ><><>< ><> <><>[  >      <  ><  ><  ><  ><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC        B0  B1  B2  B3 P4P3P2P1
+ *
+ * NC - number of counters
+ *     51: NC error 0x0008_0000_0000_0000
+ *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000
+ *
+ * G0..G3 - GRS mux constraints
+ *     46-47: GRS_L2SEL value
+ *     44-45: GRS_L3SEL value
+ *     41-44: GRS_MCSEL value
+ *     39-40: GRS_FABSEL value
+ *	Note that these match up with their bit positions in MMCR1
+ *
+ * T0 - TTM0 constraint
+ *     36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000
+ *
+ * T1 - TTM1 constraint
+ *     34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000
+ *
+ * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS
+ *     33: UC3 error 0x02_0000_0000
+ *     32: FPU|IFU|ISU1 events needed 0x01_0000_0000
+ *     31: ISU0 events needed 0x01_8000_0000
+ *     30: IDU|GRS events needed 0x00_4000_0000
+ *
+ * B0
+ *     20-23: Byte 0 event source 0x00f0_0000
+ *	      Encoding as for the event code
+ *
+ * B1, B2, B3
+ *     16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
+ *
+ * P4
+ *     7: P1 error 0x80
+ *     6-7: Count of events needing PMC4
+ *
+ * P1..P3
+ *     0-6: Count of events needing PMC1..PMC3
+ */
+
+static const int grsel_shift[8] = {
+	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH,
+	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH,
+	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH
+};
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0x3200000000ull, 0x0100000000ull },
+	[PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull },
+	[PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull },
+	[PM_IFU] =   { 0x3200000000ull, 0x2100000000ull },
+	[PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull },
+	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
+};
+
+static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh;
+	int bit, fmask;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 4)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+	}
+	if (event & PM_BUSEVENT_MSK) {
+		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+		if (unit > PM_LASTUNIT)
+			return -1;
+		if (unit == PM_ISU0_ALT)
+			unit = PM_ISU0;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		if (byte >= 4) {
+			if (unit != PM_LSU1)
+				return -1;
+			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */
+			++unit;
+			byte &= 3;
+		}
+		if (unit == PM_GRS) {
+			bit = event & 7;
+			fmask = (bit == 6)? 7: 3;
+			sh = grsel_shift[bit];
+			mask |= (u64)fmask << sh;
+			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
+		}
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (20 - 4 * byte);
+		value |= (u64)unit << (20 - 4 * byte);
+	}
+	mask  |= 0x8000000000000ull;
+	value |= 0x1000000000000ull;
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	3	/* at most 3 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x100c0,  0x40001f },			/* PM_GCT_FULL_CYC */
+	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */
+	{ 0x230e2,  0x323087 },			/* PM_BR_PRED_CR */
+	{ 0x230e3,  0x223087, 0x3230a0 },	/* PM_BR_PRED_TA */
+	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
+	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
+	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
+	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
+	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(unsigned int event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static const unsigned char bytedecode_alternatives[4][4] = {
+	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 },
+	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e },
+	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 },
+	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e }
+};
+
+/*
+ * Some direct events for decodes of event bus byte 3 have alternative
+ * PMCSEL values on other counters.  This returns the alternative
+ * event code for those that do, or -1 otherwise.  This also handles
+ * alternative PCMSEL values for add events.
+ */
+static int find_alternative_bdecode(unsigned int event)
+{
+	int pmc, altpmc, pp, j;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc == 0 || pmc > 4)
+		return -1;
+	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */
+	pp = event & PM_PMCSEL_MSK;
+	for (j = 0; j < 4; ++j) {
+		if (bytedecode_alternatives[pmc - 1][j] == pp) {
+			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) |
+				(altpmc << PM_PMC_SH) |
+				bytedecode_alternatives[altpmc - 1][j];
+		}
+	}
+
+	/* new decode alternatives for power5+ */
+	if (pmc == 1 && (pp == 0x0d || pp == 0x0e))
+		return event + (2 << PM_PMC_SH) + (0x2e - 0x0d);
+	if (pmc == 3 && (pp == 0x2e || pp == 0x2f))
+		return event - (2 << PM_PMC_SH) - (0x2e - 0x0d);
+
+	/* alternative add event encodings */
+	if (pp == 0x10 || pp == 0x28)
+		return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) |
+			(altpmc << PM_PMC_SH);
+
+	return -1;
+}
+
+static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, ae, nalt = 1;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_bdecode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+	return nalt;
+}
+
+static int power5p_compute_mmcr(unsigned int event[], int n_ev,
+				unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	unsigned int pmc, unit, byte, psel;
+	unsigned int ttm;
+	int i, isbus, bit, grsel;
+	unsigned int pmc_inuse = 0;
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	int ttmuse;
+
+	if (n_ev > 4)
+		return -1;
+
+	/* First pass to count resource use */
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 4)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+		if (event[i] & PM_BUSEVENT_MSK) {
+			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+			if (unit > PM_LASTUNIT)
+				return -1;
+			if (unit == PM_ISU0_ALT)
+				unit = PM_ISU0;
+			if (byte >= 4) {
+				if (unit != PM_LSU1)
+					return -1;
+				++unit;
+				byte &= 3;
+			}
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			unituse[unit] = 1;
+		}
+	}
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only
+	 * choice we have to deal with.
+	 */
+	if (unituse[PM_ISU0] &
+	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) {
+		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */
+		unituse[PM_ISU0] = 0;
+	}
+	/* Set TTM[01]SEL fields. */
+	ttmuse = 0;
+	for (i = PM_FPU; i <= PM_ISU1; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH;
+	}
+	ttmuse = 0;
+	for (; i <= PM_GRS; ++i) {
+		if (!unituse[i])
+			continue;
+		if (ttmuse++)
+			return -1;
+		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH;
+	}
+	if (ttmuse > 1)
+		return -1;
+
+	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) {
+			/* get ISU0 through TTM1 rather than TTM0 */
+			unit = PM_ISU0_ALT;
+		} else if (unit == PM_LSU1 + 1) {
+			/* select lower word of LSU1 for this byte */
+			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte);
+		}
+		ttm = unit >> 2;
+		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte);
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		isbus = event[i] & PM_BUSEVENT_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (isbus && (byte & 2) &&
+			    (psel == 8 || psel == 0x10 || psel == 0x28))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		}
+		if (isbus && unit == PM_GRS) {
+			bit = psel & 7;
+			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
+			mmcr1 |= (u64)grsel << grsel_shift[bit];
+		}
+		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
+			/* select alternate byte lane */
+			psel |= 0x10;
+		if (pmc <= 3)
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = 0;
+	return 0;
+}
+
+static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power5p_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0xf,
+	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
+	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+};
+
+struct power_pmu power5p_pmu = {
+	.n_counter = 4,
+	.max_alternatives = MAX_ALT,
+	.add_fields = 0x7000000000055ull,
+	.test_adder = 0x3000040000000ull,
+	.compute_mmcr = power5p_compute_mmcr,
+	.get_constraint = power5p_get_constraint,
+	.get_alternatives = power5p_get_alternatives,
+	.disable_pmc = power5p_disable_pmc,
+	.n_generic = ARRAY_SIZE(power5p_generic_events),
+	.generic_events = power5p_generic_events,
+};
-- 
GitLab


From 880860e392d92c457e8116cdee39ec4d109174ee Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 6 Mar 2009 16:30:52 +1100
Subject: [PATCH 0097/2198] perfcounters/powerpc: add support for POWER4
 processors

Impact: more hardware support

This adds the back-end for the PMU on the POWER4 and POWER4+ processors
(GP and GQ).  This is quite similar to the PPC970, with 8 PMCs, but has
fewer events than the PPC970.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/Makefile       |   2 +-
 arch/powerpc/kernel/perf_counter.c |   5 +
 arch/powerpc/kernel/power4-pmu.c   | 557 +++++++++++++++++++++++++++++
 3 files changed, 563 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power4-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 49851e0d8fdeb..8e5e2c74971ea 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
-obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o ppc970-pmu.o \
+obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
 				   power5-pmu.o power5+-pmu.o power6-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 162f3981fa27b..0e33d27cd4641 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu)
 	cpuhw->mmcr[0] = MMCR0_FC;
 }
 
+extern struct power_pmu power4_pmu;
 extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
 extern struct power_pmu power5p_pmu;
@@ -841,6 +842,10 @@ static int init_perf_counters(void)
 	/* XXX should get this from cputable */
 	pvr = mfspr(SPRN_PVR);
 	switch (PVR_VER(pvr)) {
+	case PV_POWER4:
+	case PV_POWER4p:
+		ppmu = &power4_pmu;
+		break;
 	case PV_970:
 	case PV_970FX:
 	case PV_970MP:
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
new file mode 100644
index 0000000000000..1407b19ab619c
--- /dev/null
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -0,0 +1,557 @@
+/*
+ * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER4
+ */
+#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_LOWER_SH	6
+#define PM_LOWER_MSK	1
+#define PM_LOWER_MSKS	0x40
+#define PM_BYTE_SH	4	/* Byte number of event bus to use */
+#define PM_BYTE_MSK	3
+#define PM_PMCSEL_MSK	7
+
+/*
+ * Unit code values
+ */
+#define PM_FPU		1
+#define PM_ISU1		2
+#define PM_IFU		3
+#define PM_IDU0		4
+#define PM_ISU1_ALT	6
+#define PM_ISU2		7
+#define PM_IFU_ALT	8
+#define PM_LSU0		9
+#define PM_LSU1		0xc
+#define PM_GPS		0xf
+
+/*
+ * Bits in MMCR0 for POWER4
+ */
+#define MMCR0_PMC1SEL_SH	8
+#define MMCR0_PMC2SEL_SH	1
+#define MMCR_PMCSEL_MSK		0x1f
+
+/*
+ * Bits in MMCR1 for POWER4
+ */
+#define MMCR1_TTM0SEL_SH	62
+#define MMCR1_TTC0SEL_SH	61
+#define MMCR1_TTM1SEL_SH	59
+#define MMCR1_TTC1SEL_SH	58
+#define MMCR1_TTM2SEL_SH	56
+#define MMCR1_TTC2SEL_SH	55
+#define MMCR1_TTM3SEL_SH	53
+#define MMCR1_TTC3SEL_SH	52
+#define MMCR1_TTMSEL_MSK	3
+#define MMCR1_TD_CP_DBG0SEL_SH	50
+#define MMCR1_TD_CP_DBG1SEL_SH	48
+#define MMCR1_TD_CP_DBG2SEL_SH	46
+#define MMCR1_TD_CP_DBG3SEL_SH	44
+#define MMCR1_DEBUG0SEL_SH	43
+#define MMCR1_DEBUG1SEL_SH	42
+#define MMCR1_DEBUG2SEL_SH	41
+#define MMCR1_DEBUG3SEL_SH	40
+#define MMCR1_PMC1_ADDER_SEL_SH	39
+#define MMCR1_PMC2_ADDER_SEL_SH	38
+#define MMCR1_PMC6_ADDER_SEL_SH	37
+#define MMCR1_PMC5_ADDER_SEL_SH	36
+#define MMCR1_PMC8_ADDER_SEL_SH	35
+#define MMCR1_PMC7_ADDER_SEL_SH	34
+#define MMCR1_PMC3_ADDER_SEL_SH	33
+#define MMCR1_PMC4_ADDER_SEL_SH	32
+#define MMCR1_PMC3SEL_SH	27
+#define MMCR1_PMC4SEL_SH	22
+#define MMCR1_PMC5SEL_SH	17
+#define MMCR1_PMC6SEL_SH	12
+#define MMCR1_PMC7SEL_SH	7
+#define MMCR1_PMC8SEL_SH	2	/* note bit 0 is in MMCRA for GP */
+
+static short mmcr1_adder_bits[8] = {
+	MMCR1_PMC1_ADDER_SEL_SH,
+	MMCR1_PMC2_ADDER_SEL_SH,
+	MMCR1_PMC3_ADDER_SEL_SH,
+	MMCR1_PMC4_ADDER_SEL_SH,
+	MMCR1_PMC5_ADDER_SEL_SH,
+	MMCR1_PMC6_ADDER_SEL_SH,
+	MMCR1_PMC7_ADDER_SEL_SH,
+	MMCR1_PMC8_ADDER_SEL_SH
+};
+
+/*
+ * Bits in MMCRA
+ */
+#define MMCRA_PMC8SEL0_SH	17	/* PMC8SEL bit 0 for GP */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ * 	  \SMPL	        ||\TTC3SEL
+ * 		        |\TTC_IFU_SEL
+ * 		        \TTM2SEL0
+ *
+ * SMPL - SAMPLE_ENABLE constraint
+ *     56: SAMPLE_ENABLE value 0x0100_0000_0000_0000
+ *
+ * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2
+ *     55: UC1 error 0x0080_0000_0000_0000
+ *     54: FPU events needed 0x0040_0000_0000_0000
+ *     53: ISU1 events needed 0x0020_0000_0000_0000
+ *     52: IDU0|ISU2 events needed 0x0010_0000_0000_0000
+ *
+ * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0
+ *     51: UC2 error 0x0008_0000_0000_0000
+ *     50: FPU events needed 0x0004_0000_0000_0000
+ *     49: IFU events needed 0x0002_0000_0000_0000
+ *     48: LSU0 events needed 0x0001_0000_0000_0000
+ *
+ * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1
+ *     47: UC3 error 0x8000_0000_0000
+ *     46: LSU0 events needed 0x4000_0000_0000
+ *     45: IFU events needed 0x2000_0000_0000
+ *     44: IDU0|ISU2 events needed 0x1000_0000_0000
+ *     43: ISU1 events needed 0x0800_0000_0000
+ *
+ * TTM2SEL0
+ *     42: 0 = IDU0 events needed
+ *     	   1 = ISU2 events needed 0x0400_0000_0000
+ *
+ * TTC_IFU_SEL
+ *     41: 0 = IFU.U events needed
+ *     	   1 = IFU.L events needed 0x0200_0000_0000
+ *
+ * TTC3SEL
+ *     40: 0 = LSU1.U events needed
+ *     	   1 = LSU1.L events needed 0x0100_0000_0000
+ *
+ * PS1
+ *     39: PS1 error 0x0080_0000_0000
+ *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000
+ *
+ * PS2
+ *     35: PS2 error 0x0008_0000_0000
+ *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000
+ *
+ * B0
+ *     28-31: Byte 0 event source 0xf000_0000
+ *     	   1 = FPU
+ * 	   2 = ISU1
+ * 	   3 = IFU
+ * 	   4 = IDU0
+ * 	   7 = ISU2
+ * 	   9 = LSU0
+ * 	   c = LSU1
+ * 	   f = GPS
+ *
+ * B1, B2, B3
+ *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources
+ *
+ * P8
+ *     15: P8 error 0x8000
+ *     14-15: Count of events needing PMC8
+ *
+ * P1..P7
+ *     0-13: Count of events needing PMC1..PMC7
+ *
+ * Note: this doesn't allow events using IFU.U to be combined with events
+ * using IFU.L, though that is feasible (using TTM0 and TTM2).  However
+ * there are no listed events for IFU.L (they are debug events not
+ * verified for performance monitoring) so this shouldn't cause a
+ * problem.
+ */
+
+static struct unitinfo {
+	u64	value, mask;
+	int	unit;
+	int	lowerbit;
+} p4_unitinfo[16] = {
+	[PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 },
+	[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_ISU1_ALT] =
+		    { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 },
+	[PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IFU_ALT] =
+		    { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 },
+	[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 },
+	[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 },
+	[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 },
+	[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 },
+	[PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 }
+};
+
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3),			/* PMC3: PM_MRK_ST_CMPL_INT */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4),			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p4_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 6)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_LSU1:
+		if (event & PM_LOWER_MSKS)
+			mask = 1 << 28;		/* byte 7 bit 4 */
+		else
+			mask = 6 << 24;		/* byte 3 bits 1 and 2 */
+		break;
+	case PM_LSU0:
+		/* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */
+		mask = 0x083dff00;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, lower, sh;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	if (unit) {
+		lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK;
+
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
+		 */
+		if (!pmc)
+			grp = byte & 1;
+
+		if (!p4_unitinfo[unit].unit)
+			return -1;
+		mask  |= p4_unitinfo[unit].mask;
+		value |= p4_unitinfo[unit].value;
+		sh = p4_unitinfo[unit].lowerbit;
+		if (sh > 1)
+			value |= (u64)lower << sh;
+		else if (lower != sh)
+			return -1;
+		unit = p4_unitinfo[unit].unit;
+
+		/* Set byte lane select field */
+		mask  |= 0xfULL << (28 - 4 * byte);
+		value |= (u64)unit << (28 - 4 * byte);
+	}
+	if (grp == 0) {
+		/* increment PMC1/2/5/6 field */
+		mask  |= 0x8000000000ull;
+		value |= 0x1000000000ull;
+	} else {
+		/* increment PMC3/4/7/8 field */
+		mask  |= 0x800000000ull;
+		value |= 0x100000000ull;
+	}
+
+	/* Marked instruction events need sample_enable set */
+	if (p4_marked_instr_event(event)) {
+		mask  |= 1ull << 56;
+		value |= 1ull << 56;
+	}
+
+	/* PMCSEL=6 decode events on byte 2 need sample_enable clear */
+	if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2)
+		mask  |= 1ull << 56;
+
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+static unsigned int ppc_inst_cmpl[] = {
+	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
+};
+
+static int p4_get_alternatives(unsigned int event, unsigned int alt[])
+{
+	int i, j, na;
+
+	alt[0] = event;
+	na = 1;
+
+	/* 2 possibilities for PM_GRP_DISP_REJECT */
+	if (event == 0x8003 || event == 0x0224) {
+		alt[1] = event ^ (0x8003 ^ 0x0224);
+		return 2;
+	}
+
+	/* 2 possibilities for PM_ST_MISS_L1 */
+	if (event == 0x0c13 || event == 0x0c23) {
+		alt[1] = event ^ (0x0c13 ^ 0x0c23);
+		return 2;
+	}
+
+	/* several possibilities for PM_INST_CMPL */
+	for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) {
+		if (event == ppc_inst_cmpl[i]) {
+			for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j)
+				if (j != i)
+					alt[na++] = ppc_inst_cmpl[j];
+			break;
+		}
+	}
+
+	return na;
+}
+
+static int p4_compute_mmcr(unsigned int event[], int n_ev,
+			   unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
+	unsigned int pmc, unit, byte, psel, lower;
+	unsigned int ttm, grp;
+	unsigned int pmc_inuse = 0;
+	unsigned int pmc_grp_use[2];
+	unsigned char busbyte[4];
+	unsigned char unituse[16];
+	unsigned int unitlower = 0;
+	int i;
+
+	if (n_ev > 8)
+		return -1;
+
+	/* First pass to count resource use */
+	pmc_grp_use[0] = pmc_grp_use[1] = 0;
+	memset(busbyte, 0, sizeof(busbyte));
+	memset(unituse, 0, sizeof(unituse));
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+			/* count 1/2/5/6 vs 3/4/7/8 use */
+			++pmc_grp_use[((pmc - 1) >> 1) & 1];
+		}
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK;
+		if (unit) {
+			if (!pmc)
+				++pmc_grp_use[byte & 1];
+			if (unit == 6 || unit == 8)
+				/* map alt ISU1/IFU codes: 6->2, 8->3 */
+				unit = (unit >> 1) - 1;
+			if (busbyte[byte] && busbyte[byte] != unit)
+				return -1;
+			busbyte[byte] = unit;
+			lower <<= unit;
+			if (unituse[unit] && lower != (unitlower & lower))
+				return -1;
+			unituse[unit] = 1;
+			unitlower |= lower;
+		}
+	}
+	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4)
+		return -1;
+
+	/*
+	 * Assign resources and set multiplexer selects.
+	 *
+	 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2.
+	 * Each TTMx can only select one unit, but since
+	 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU,
+	 * we have some choices.
+	 */
+	if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) {
+		unituse[6] = 1;		/* Move 2 to 6 */
+		unituse[2] = 0;
+	}
+	if (unituse[3] & (unituse[1] | unituse[2])) {
+		unituse[8] = 1;		/* Move 3 to 8 */
+		unituse[3] = 0;
+		unitlower = (unitlower & ~8) | ((unitlower & 8) << 5);
+	}
+	/* Check only one unit per TTMx */
+	if (unituse[1] + unituse[2] + unituse[3] > 1 ||
+	    unituse[4] + unituse[6] + unituse[7] > 1 ||
+	    unituse[8] + unituse[9] > 1 ||
+	    (unituse[5] | unituse[10] | unituse[11] |
+	     unituse[13] | unituse[14]))
+		return -1;
+
+	/* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */
+	mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH;
+	mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH;
+	mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH;
+
+	/* Set TTCxSEL fields. */
+	if (unitlower & 0xe)
+		mmcr1 |= 1ull << MMCR1_TTC0SEL_SH;
+	if (unitlower & 0xf0)
+		mmcr1 |= 1ull << MMCR1_TTC1SEL_SH;
+	if (unitlower & 0xf00)
+		mmcr1 |= 1ull << MMCR1_TTC2SEL_SH;
+	if (unitlower & 0x7000)
+		mmcr1 |= 1ull << MMCR1_TTC3SEL_SH;
+
+	/* Set byte lane select fields. */
+	for (byte = 0; byte < 4; ++byte) {
+		unit = busbyte[byte];
+		if (!unit)
+			continue;
+		if (unit == 0xf) {
+			/* special case for GPS */
+			mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte);
+		} else {
+			if (!unituse[unit])
+				ttm = unit - 1;		/* 2->1, 3->2 */
+			else
+				ttm = unit >> 2;
+			mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte);
+		}
+	}
+
+	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or 00xxx direct event (off or cycles) */
+			if (unit)
+				psel |= 0x10 | ((byte & 2) << 2);
+			for (pmc = 0; pmc < 8; ++pmc) {
+				if (pmc_inuse & (1 << pmc))
+					continue;
+				grp = (pmc >> 1) & 1;
+				if (unit) {
+					if (grp == (byte & 1))
+						break;
+				} else if (pmc_grp_use[grp] < 4) {
+					++pmc_grp_use[grp];
+					break;
+				}
+			}
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct event */
+			--pmc;
+			if (psel == 0 && (byte & 2))
+				/* add events on higher-numbered bus */
+				mmcr1 |= 1ull << mmcr1_adder_bits[pmc];
+			else if (psel == 6 && byte == 3)
+				/* seem to need to set sample_enable here */
+				mmcra |= MMCRA_SAMPLE_ENABLE;
+			psel |= 8;
+		}
+		if (pmc <= 1)
+			mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc);
+		else
+			mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2));
+		if (pmc == 7)	/* PMC8 */
+			mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH;
+		hwc[i] = pmc;
+		if (p4_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+	}
+
+	if (pmc_inuse & 1)
+		mmcr0 |= MMCR0_PMC1CE;
+	if (pmc_inuse & 0xfe)
+		mmcr0 |= MMCR0_PMCjCE;
+
+	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */
+
+	/* Return MMCRx values */
+	mmcr[0] = mmcr0;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	/*
+	 * Setting the PMCxSEL field to 0 disables PMC x.
+	 * (Note that pmc is 0-based here, not 1-based.)
+	 */
+	if (pmc <= 1) {
+		mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc));
+	} else {
+		mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)));
+		if (pmc == 7)
+			mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH);
+	}
+}
+
+static int p4_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 7,
+	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
+	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
+	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
+	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+};
+
+struct power_pmu power4_pmu = {
+	.n_counter = 8,
+	.max_alternatives = 5,
+	.add_fields = 0x0000001100005555ull,
+	.test_adder = 0x0011083300000000ull,
+	.compute_mmcr = p4_compute_mmcr,
+	.get_constraint = p4_get_constraint,
+	.get_alternatives = p4_get_alternatives,
+	.disable_pmc = p4_disable_pmc,
+	.n_generic = ARRAY_SIZE(p4_generic_events),
+	.generic_events = p4_generic_events,
+};
-- 
GitLab


From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Sun, 8 Mar 2009 11:34:19 +0100
Subject: [PATCH 0098/2198] x86: perf_counter cleanup

Use and actual unsigned long bitmap instead of casting our way around.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
LKML-Reference: <1236508459.22914.3645.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f5853718d4d37..1df421042b2a1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -37,7 +37,7 @@ struct cpu_hw_counters {
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
-	u64			active_mask;
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 };
 
@@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl)
 		return;
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) {
+		if (test_bit(idx, cpuc->active_mask)) {
 			u64 val;
 
 			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
@@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, (unsigned long *)&cpuc->active_mask);
+	set_bit(idx, cpuc->active_mask);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	clear_bit(idx, (unsigned long *)&cpuc->active_mask);
+	clear_bit(idx, cpuc->active_mask);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
-- 
GitLab


From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 8 Mar 2009 17:09:49 +0530
Subject: [PATCH 0099/2198] x86: perf_counter cleanup

Remove unused variables and duplicate header file.

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1df421042b2a1..155bc3c239b63 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -17,7 +17,6 @@
 #include <linux/kdebug.h>
 #include <linux/sched.h>
 
-#include <asm/perf_counter.h>
 #include <asm/apic.h>
 
 static bool perf_counters_initialized __read_mostly;
@@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 
 static struct pmc_x86_ops *pmc_amd_init(void)
 {
-	u64 old;
-	int bits;
-
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
-- 
GitLab


From bc44fb5f7d3e764ed7698c835a1a0f35aba2eb3d Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:42:18 +0100
Subject: [PATCH 0100/2198] x86, bts: detect size of DS fields

Impact: more robust DS feature enumeration

Detect the size of the pointer-type fields in the DS area
configuration via the DTES64 features rather than based on
the cpuid.

Rename a variable to denote that size to reflect that it only
covers the pointer-type fields.

Add more boot-time diagnostics giving the detected size and
the sizes of BTS and PEBS records.

Use the size of the BTS/PEBS record to indicate that the
respective feature is not available (if the record size is zero).

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 84 +++++++++++++++++++++++---------------------
 1 file changed, 44 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765ac..6e5ec679a0cd1 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -39,7 +39,7 @@ struct ds_configuration {
 	/* the size of one pointer-typed field in the DS structure and
 	   in the BTS and PEBS buffers in bytes;
 	   this covers the first 8 DS fields related to buffer management. */
-	unsigned char  sizeof_field;
+	unsigned char  sizeof_ptr_field;
 	/* the size of a BTS/PEBS record in bytes */
 	unsigned char  sizeof_rec[2];
 	/* a series of bit-masks to control various features indexed
@@ -142,14 +142,14 @@ enum ds_qualifier {
 static inline unsigned long ds_get(const unsigned char *base,
 				   enum ds_qualifier qual, enum ds_field field)
 {
-	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	return *(unsigned long *)base;
 }
 
 static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
 			  enum ds_field field, unsigned long value)
 {
-	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	(*(unsigned long *)base) = value;
 }
 
@@ -410,7 +410,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
  * Later architectures use 64bit pointers throughout, whereas earlier
  * architectures use 32bit pointers in 32bit mode.
  *
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
  * - the field size stored in the DS configuration
  * - the relative field position
  *
@@ -441,13 +441,13 @@ enum bts_field {
 
 static inline unsigned long bts_get(const char *base, enum bts_field field)
 {
-	base += (ds_cfg.sizeof_field * field);
+	base += (ds_cfg.sizeof_ptr_field * field);
 	return *(unsigned long *)base;
 }
 
 static inline void bts_set(char *base, enum bts_field field, unsigned long val)
 {
-	base += (ds_cfg.sizeof_field * field);;
+	base += (ds_cfg.sizeof_ptr_field * field);;
 	(*(unsigned long *)base) = val;
 }
 
@@ -593,6 +593,10 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	struct ds_context *context;
 	int error;
 
+	error = -EOPNOTSUPP;
+	if (!ds_cfg.sizeof_rec[qual])
+		goto out;
+
 	error = -EINVAL;
 	if (!base)
 		goto out;
@@ -635,10 +639,6 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	unsigned long irq;
 	int error;
 
-	error = -EOPNOTSUPP;
-	if (!ds_cfg.ctl[dsf_bts])
-		goto out;
-
 	/* buffer overflow notification is not yet implemented */
 	error = -EOPNOTSUPP;
 	if (ovfl)
@@ -848,7 +848,8 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 
 	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
 	tracer->trace.reset_value =
-		*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+		*(u64 *)(tracer->ds.context->ds +
+			 (ds_cfg.sizeof_ptr_field * 8));
 
 	return &tracer->trace;
 }
@@ -884,7 +885,8 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
 	if (!tracer)
 		return -EINVAL;
 
-	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+	*(u64 *)(tracer->ds.context->ds +
+		 (ds_cfg.sizeof_ptr_field * 8)) = value;
 
 	return 0;
 }
@@ -894,52 +896,54 @@ static const struct ds_configuration ds_cfg_netburst = {
 	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
 	.ctl[dsf_bts_kernel]	= (1 << 5),
 	.ctl[dsf_bts_user]	= (1 << 6),
-
-	.sizeof_field		= sizeof(long),
-	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
-#ifdef __i386__
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
-#else
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
-#endif
 };
 static const struct ds_configuration ds_cfg_pentium_m = {
 	.name = "Pentium M",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-
-	.sizeof_field		= sizeof(long),
-	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
-#ifdef __i386__
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
-#else
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
-#endif
 };
 static const struct ds_configuration ds_cfg_core2_atom = {
 	.name = "Core 2/Atom",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
 	.ctl[dsf_bts_kernel]	= (1 << 9),
 	.ctl[dsf_bts_user]	= (1 << 10),
-
-	.sizeof_field		= 8,
-	.sizeof_rec[ds_bts]	= 8 * 3,
-	.sizeof_rec[ds_pebs]	= 8 * 18,
 };
 
 static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+	     struct cpuinfo_x86 *cpu)
 {
+	unsigned long nr_pebs_fields = 0;
+
+	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+	nr_pebs_fields = 10;
+#else
+	nr_pebs_fields = 18;
+#endif
+
 	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
 
-	printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+	ds_cfg.sizeof_ptr_field =
+		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
 
-	if (!cpu_has_bts) {
-		ds_cfg.ctl[dsf_bts] = 0;
+	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
+	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
+
+	if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+		ds_cfg.sizeof_rec[ds_bts] = 0;
 		printk(KERN_INFO "[ds] bts not available\n");
 	}
-	if (!cpu_has_pebs)
+	if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+		ds_cfg.sizeof_rec[ds_pebs] = 0;
 		printk(KERN_INFO "[ds] pebs not available\n");
+	}
+
+	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+	       8 * ds_cfg.sizeof_ptr_field);
+	printk("bts/pebs record: %u/%u bytes\n",
+	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
 	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
 }
@@ -951,12 +955,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		switch (c->x86_model) {
 		case 0x9:
 		case 0xd: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m);
+			ds_configure(&ds_cfg_pentium_m, c);
 			break;
 		case 0xf:
 		case 0x17: /* Core2 */
 		case 0x1c: /* Atom */
-			ds_configure(&ds_cfg_core2_atom);
+			ds_configure(&ds_cfg_core2_atom, c);
 			break;
 		case 0x1a: /* i7 */
 		default:
@@ -969,7 +973,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x0:
 		case 0x1:
 		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst);
+			ds_configure(&ds_cfg_netburst, c);
 			break;
 		default:
 			/* sorry, don't know about them */
-- 
GitLab


From 8a327f6d1b05f5ce16572b4413a5df1d0e872283 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:45:07 +0100
Subject: [PATCH 0101/2198] x86, bts: add selftest for BTS

Perform a selftest of branch trace store when a cpu is initialized.

WARN and disable branch trace store support if the selftest fails.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104507.A30125@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug        |   9 ++
 arch/x86/kernel/Makefile      |   1 +
 arch/x86/kernel/ds.c          |  21 +++
 arch/x86/kernel/ds_selftest.c | 241 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/ds_selftest.h |  15 +++
 5 files changed, 287 insertions(+)
 create mode 100644 arch/x86/kernel/ds_selftest.c
 create mode 100644 arch/x86/kernel/ds_selftest.h

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index fdb45df608b62..dfd74abc03f82 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -175,6 +175,15 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
+config X86_DS_SELFTEST
+    bool "DS selftest"
+    default y
+    depends on DEBUG_KERNEL
+    depends on X86_DS
+	---help---
+	  Perform Debug Store selftests at boot time.
+	  If in doubt, say "N".
+
 config HAVE_MMIOTRACE_SUPPORT
 	def_bool y
 
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 339ce35648e60..a0c9e138b0083 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -44,6 +44,7 @@ obj-y				+= process.o
 obj-y				+= i387.o xsave.o
 obj-y				+= ptrace.o
 obj-$(CONFIG_X86_DS)		+= ds.o
+obj-$(CONFIG_X86_DS_SELFTEST)		+= ds_selftest.o
 obj-$(CONFIG_X86_32)		+= tls.o
 obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 6e5ec679a0cd1..51c936c1a3906 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -29,6 +29,7 @@
 #include <linux/mm.h>
 #include <linux/kernel.h>
 
+#include "ds_selftest.h"
 
 /*
  * The configuration for a particular DS hardware implementation.
@@ -940,6 +941,26 @@ ds_configure(const struct ds_configuration *cfg,
 		printk(KERN_INFO "[ds] pebs not available\n");
 	}
 
+	if (ds_cfg.sizeof_rec[ds_bts]) {
+		int error;
+
+		error = ds_selftest_bts();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling bts.\n");
+			ds_cfg.sizeof_rec[ds_bts] = 0;
+		}
+	}
+
+	if (ds_cfg.sizeof_rec[ds_pebs]) {
+		int error;
+
+		error = ds_selftest_pebs();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling pebs.\n");
+			ds_cfg.sizeof_rec[ds_pebs] = 0;
+		}
+	}
+
 	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
 	       8 * ds_cfg.sizeof_ptr_field);
 	printk("bts/pebs record: %u/%u bytes\n",
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644
index 0000000000000..8c46fbf38c461
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.c
@@ -0,0 +1,241 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+
+#include <asm/ds.h>
+
+
+#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
+
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+	int error = 0;
+
+	if (!trace) {
+		printk(KERN_CONT "failed to access trace...");
+		/* Bail out. Other tests are pointless. */
+		return -1;
+	}
+
+	if (!trace->read) {
+		printk(KERN_CONT "bts read not available...");
+		error = -1;
+	}
+
+	/* Do some sanity checks on the trace configuration. */
+	if (!trace->ds.n) {
+		printk(KERN_CONT "empty bts buffer...");
+		error = -1;
+	}
+	if (!trace->ds.size) {
+		printk(KERN_CONT "bad bts trace setup...");
+		error = -1;
+	}
+	if (trace->ds.end !=
+	    (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+		printk(KERN_CONT "bad bts buffer setup...");
+		error = -1;
+	}
+	if ((trace->ds.top < trace->ds.begin) ||
+	    (trace->ds.end <= trace->ds.top)) {
+		printk(KERN_CONT "bts top out of bounds...");
+		error = -1;
+	}
+
+	return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+				const struct bts_trace *trace,
+				const void *from, const void *to)
+{
+	const unsigned char *at;
+
+	/*
+	 * Check a few things which do not belong to this test.
+	 * They should be covered by other tests.
+	 */
+	if (!trace)
+		return -1;
+
+	if (!trace->read)
+		return -1;
+
+	if (to < from)
+		return -1;
+
+	if (from < trace->ds.begin)
+		return -1;
+
+	if (trace->ds.end < to)
+		return -1;
+
+	if (!trace->ds.size)
+		return -1;
+
+	/* Now to the test itself. */
+	for (at = from; (void *)at < to; at += trace->ds.size) {
+		struct bts_struct bts;
+		size_t index;
+		int error;
+
+		if (((void *)at - trace->ds.begin) % trace->ds.size) {
+			printk(KERN_CONT
+			       "read from non-integer index...");
+			return -1;
+		}
+		index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+		memset(&bts, 0, sizeof(bts));
+		error = trace->read(tracer, at, &bts);
+		if (error < 0) {
+			printk(KERN_CONT
+			       "error reading bts trace at [%lu] (0x%p)...",
+			       index, at);
+			return error;
+		}
+
+		switch (bts.qualifier) {
+		case BTS_BRANCH:
+			break;
+		default:
+			printk(KERN_CONT
+			       "unexpected bts entry %llu at [%lu] (0x%p)...",
+			       bts.qualifier, index, at);
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+int ds_selftest_bts(void)
+{
+	const struct bts_trace *trace;
+	struct bts_tracer *tracer;
+	int error = 0;
+	void *top;
+	unsigned char buffer[DS_SELFTEST_BUFFER_SIZE];
+
+	printk(KERN_INFO "[ds] bts selftest...");
+
+	tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE,
+				NULL, (size_t)-1, BTS_KERNEL);
+	if (IS_ERR(tracer)) {
+		error = PTR_ERR(tracer);
+		tracer = NULL;
+
+		printk(KERN_CONT
+		       "initialization failed (err: %d)...", error);
+		goto out;
+	}
+
+	/* The return should already give us enough trace. */
+	ds_suspend_bts(tracer);
+
+	/* Let's see if we can access the trace. */
+	trace = ds_read_bts(tracer);
+
+	error = ds_selftest_bts_consistency(trace);
+	if (error < 0)
+		goto out;
+
+	/* If everything went well, we should have a few trace entries. */
+	if (trace->ds.top == trace->ds.begin) {
+		/*
+		 * It is possible but highly unlikely that we got a
+		 * buffer overflow and end up at exactly the same
+		 * position we started from.
+		 * Let's issue a warning, but continue.
+		 */
+		printk(KERN_CONT "no trace/overflow...");
+	}
+
+	/* Let's try to read the trace we collected. */
+	error = ds_selftest_bts_read(tracer, trace,
+				     trace->ds.begin, trace->ds.top);
+	if (error < 0)
+		goto out;
+
+	/*
+	 * Let's read the trace again.
+	 * Since we suspended tracing, we should get the same result.
+	 */
+	top = trace->ds.top;
+
+	trace = ds_read_bts(tracer);
+	error = ds_selftest_bts_consistency(trace);
+	if (error < 0)
+		goto out;
+
+	if (top != trace->ds.top) {
+		printk(KERN_CONT "suspend not working...");
+		error = -1;
+		goto out;
+	}
+
+	/* Let's collect some more trace - see if resume is working. */
+	ds_resume_bts(tracer);
+	ds_suspend_bts(tracer);
+
+	trace = ds_read_bts(tracer);
+
+	error = ds_selftest_bts_consistency(trace);
+	if (error < 0)
+		goto out;
+
+	if (trace->ds.top == top) {
+		/*
+		 * It is possible but highly unlikely that we got a
+		 * buffer overflow and end up at exactly the same
+		 * position we started from.
+		 * Let's issue a warning and check the full trace.
+		 */
+		printk(KERN_CONT
+		       "no resume progress/overflow...");
+
+		error = ds_selftest_bts_read(tracer, trace,
+					     trace->ds.begin, trace->ds.end);
+	} else if (trace->ds.top < top) {
+		/*
+		 * We had a buffer overflow - the entire buffer should
+		 * contain trace records.
+		 */
+		error = ds_selftest_bts_read(tracer, trace,
+					     trace->ds.begin, trace->ds.end);
+	} else {
+		/*
+		 * It is quite likely that the buffer did not overflow.
+		 * Let's just check the delta trace.
+		 */
+		error = ds_selftest_bts_read(tracer, trace,
+					     top, trace->ds.top);
+	}
+	if (error < 0)
+		goto out;
+
+	error = 0;
+
+	/* The final test: release the tracer while tracing is suspended. */
+ out:
+	ds_release_bts(tracer);
+
+	printk(KERN_CONT "%s.\n", (error ? "failed" : "passed"));
+
+	return error;
+}
+
+int ds_selftest_pebs(void)
+{
+	return 0;
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644
index 0000000000000..0e6e19d4c7d23
--- /dev/null
+++ b/arch/x86/kernel/ds_selftest.h
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif /* CONFIG_X86_DS_SELFTEST */
-- 
GitLab


From b8e47195451c5d3f62620b2b1b5928669afd56eb Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:46:42 +0100
Subject: [PATCH 0102/2198] x86, bts: correct comment style in ds.c

Correct the comment style in ds.c.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104642.A30149@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 79 ++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 51c936c1a3906..d9cab7168058b 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -35,25 +35,22 @@
  * The configuration for a particular DS hardware implementation.
  */
 struct ds_configuration {
-	/* the name of the configuration */
+	/* The name of the configuration. */
 	const char *name;
-	/* the size of one pointer-typed field in the DS structure and
-	   in the BTS and PEBS buffers in bytes;
-	   this covers the first 8 DS fields related to buffer management. */
+	/* The size of pointer-typed fields in DS, BTS, and PEBS. */
 	unsigned char  sizeof_ptr_field;
-	/* the size of a BTS/PEBS record in bytes */
+	/* The size of a BTS/PEBS record in bytes. */
 	unsigned char  sizeof_rec[2];
-	/* a series of bit-masks to control various features indexed
-	 * by enum ds_feature */
+	/* Control bit-masks indexed by enum ds_feature. */
 	unsigned long ctl[dsf_ctl_max];
 };
 static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
 
 #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
 
-#define MAX_SIZEOF_DS (12 * 8)	/* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8)	/* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+#define MAX_SIZEOF_DS (12 * 8)	/* Maximal size of a DS configuration. */
+#define MAX_SIZEOF_BTS (3 * 8)	/* Maximal size of a BTS record. */
+#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment. */
 
 #define BTS_CONTROL \
  (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
@@ -67,28 +64,28 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
  * to identify tracers.
  */
 struct ds_tracer {
-	/* the DS context (partially) owned by this tracer */
+	/* The DS context (partially) owned by this tracer. */
 	struct ds_context *context;
-	/* the buffer provided on ds_request() and its size in bytes */
+	/* The buffer provided on ds_request() and its size in bytes. */
 	void *buffer;
 	size_t size;
 };
 
 struct bts_tracer {
-	/* the common DS part */
+	/* The common DS part. */
 	struct ds_tracer ds;
-	/* the trace including the DS configuration */
+	/* The trace including the DS configuration. */
 	struct bts_trace trace;
-	/* buffer overflow notification function */
+	/* Buffer overflow notification function. */
 	bts_ovfl_callback_t ovfl;
 };
 
 struct pebs_tracer {
-	/* the common DS part */
+	/* The common DS part. */
 	struct ds_tracer ds;
-	/* the trace including the DS configuration */
+	/* The trace including the DS configuration. */
 	struct pebs_trace trace;
-	/* buffer overflow notification function */
+	/* Buffer overflow notification function. */
 	pebs_ovfl_callback_t ovfl;
 };
 
@@ -214,18 +211,16 @@ static inline int check_tracer(struct task_struct *task)
  * deallocated when the last user puts the context.
  */
 struct ds_context {
-	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
+	/* The DS configuration; goes into MSR_IA32_DS_AREA. */
 	unsigned char ds[MAX_SIZEOF_DS];
-	/* the owner of the BTS and PEBS configuration, respectively */
+	/* The owner of the BTS and PEBS configuration, respectively. */
 	struct bts_tracer *bts_master;
 	struct pebs_tracer *pebs_master;
-	/* use count */
+	/* Use count. */
 	unsigned long count;
-	/* a pointer to the context location inside the thread_struct
-	 * or the per_cpu context array */
+	/* Pointer to the context pointer field. */
 	struct ds_context **this;
-	/* a pointer to the task owning this context, or NULL, if the
-	 * context is owned by a cpu */
+	/* The traced task; NULL for current cpu. */
 	struct task_struct *task;
 };
 
@@ -350,14 +345,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		unsigned long write_size, adj_write_size;
 
 		/*
-		 * write as much as possible without producing an
+		 * Write as much as possible without producing an
 		 * overflow interrupt.
 		 *
-		 * interrupt_threshold must either be
+		 * Interrupt_threshold must either be
 		 * - bigger than absolute_maximum or
 		 * - point to a record between buffer_base and absolute_maximum
 		 *
-		 * index points to a valid record.
+		 * Index points to a valid record.
 		 */
 		base   = ds_get(context->ds, qual, ds_buffer_base);
 		index  = ds_get(context->ds, qual, ds_index);
@@ -366,8 +361,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 
 		write_end = min(end, int_th);
 
-		/* if we are already beyond the interrupt threshold,
-		 * we fill the entire buffer */
+		/*
+		 * If we are already beyond the interrupt threshold,
+		 * we fill the entire buffer.
+		 */
 		if (write_end <= index)
 			write_end = end;
 
@@ -384,7 +381,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 		adj_write_size *= ds_cfg.sizeof_rec[qual];
 
-		/* zero out trailing bytes */
+		/* Zero out trailing bytes. */
 		memset((char *)index + write_size, 0,
 		       adj_write_size - write_size);
 		index += adj_write_size;
@@ -556,7 +553,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 			     unsigned int flags) {
 	unsigned long buffer, adj;
 
-	/* adjust the buffer address and size to meet alignment
+	/*
+	 * Adjust the buffer address and size to meet alignment
 	 * constraints:
 	 * - buffer is double-word aligned
 	 * - size is multiple of record size
@@ -578,7 +576,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 	trace->begin = (void *)buffer;
 	trace->top = trace->begin;
 	trace->end = (void *)(buffer + size);
-	/* The value for 'no threshold' is -1, which will set the
+	/*
+	 * The value for 'no threshold' is -1, which will set the
 	 * threshold outside of the buffer, just like we want it.
 	 */
 	trace->ith = (void *)(buffer + size - ith);
@@ -602,7 +601,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	if (!base)
 		goto out;
 
-	/* we require some space to do alignment adjustments below */
+	/* We require some space to do alignment adjustments below. */
 	error = -EINVAL;
 	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
 		goto out;
@@ -640,7 +639,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	unsigned long irq;
 	int error;
 
-	/* buffer overflow notification is not yet implemented */
+	/* Buffer overflow notification is not yet implemented. */
 	error = -EOPNOTSUPP;
 	if (ovfl)
 		goto out;
@@ -700,7 +699,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	unsigned long irq;
 	int error;
 
-	/* buffer overflow notification is not yet implemented */
+	/* Buffer overflow notification is not yet implemented. */
 	error = -EOPNOTSUPP;
 	if (ovfl)
 		goto out;
@@ -983,9 +982,9 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x1c: /* Atom */
 			ds_configure(&ds_cfg_core2_atom, c);
 			break;
-		case 0x1a: /* i7 */
+		case 0x1a: /* Core i7 */
 		default:
-			/* sorry, don't know about them */
+			/* Sorry, don't know about them. */
 			break;
 		}
 		break;
@@ -997,12 +996,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 			ds_configure(&ds_cfg_netburst, c);
 			break;
 		default:
-			/* sorry, don't know about them */
+			/* Sorry, don't know about them. */
 			break;
 		}
 		break;
 	default:
-		/* sorry, don't know about them */
+		/* Sorry, don't know about them. */
 		break;
 	}
 }
-- 
GitLab


From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:48:52 +0100
Subject: [PATCH 0103/2198] x86, hw-branch-tracer: keep resources on stop

Distinguish init/reset and start/stop:

init/reset will allocate and release bts tracing resources
stop/start will suspend and resume bts tracing

Return an error on init() if no cpu can be traced.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 119 ++++++++++++++++++++++---------
 1 file changed, 85 insertions(+), 34 deletions(-)

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347f3..a99a04c5e9cdd 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -19,7 +19,7 @@
 #include "trace_output.h"
 
 
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
 
 /*
  * The tracer lock protects the below per-cpu tracer array.
@@ -33,53 +33,68 @@
  */
 static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
 /*
- * Start tracing on the current cpu.
+ * Initialize the tracer for the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_lock must be locked.
  */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(void *arg)
 {
 	if (this_tracer)
 		ds_release_bts(this_tracer);
 
-	this_tracer =
-		ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-			       /* ovfl = */ NULL, /* th = */ (size_t)-1,
-			       BTS_KERNEL);
+	this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE,
+				     NULL, (size_t)-1, BTS_KERNEL);
 	if (IS_ERR(this_tracer)) {
 		this_tracer = NULL;
 		return;
 	}
 }
 
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
 {
+	int cpu, avail;
+
 	spin_lock(&bts_tracer_lock);
 
-	on_each_cpu(bts_trace_start_cpu, NULL, 1);
-	trace_hw_branches_enabled = 1;
+	hw_branch_trace = tr;
+
+	on_each_cpu(bts_trace_init_cpu, NULL, 1);
+
+	/* Check on how many cpus we could enable tracing */
+	avail = 0;
+	for_each_online_cpu(cpu)
+		if (per_cpu(tracer, cpu))
+			avail++;
+
+	trace_hw_branches_enabled = (avail ? 1 : 0);
+	trace_hw_branches_suspended = 0;
 
 	spin_unlock(&bts_tracer_lock);
+
+
+	/* If we could not enable tracing on a single cpu, we fail. */
+	return avail ? 0 : -EOPNOTSUPP;
 }
 
 /*
- * Stop tracing on the current cpu.
+ * Release the tracer for the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_lock must be locked.
  */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_release_cpu(void *arg)
 {
 	if (this_tracer) {
 		ds_release_bts(this_tracer);
@@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg)
 	}
 }
 
-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_reset(struct trace_array *tr)
 {
 	spin_lock(&bts_tracer_lock);
 
+	on_each_cpu(bts_trace_release_cpu, NULL, 1);
 	trace_hw_branches_enabled = 0;
-	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+	trace_hw_branches_suspended = 0;
+
+	spin_unlock(&bts_tracer_lock);
+}
+
+/*
+ * Resume tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_lock must be locked.
+ */
+static void bts_trace_resume_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_resume_bts(this_tracer);
+}
+
+static void bts_trace_start(struct trace_array *tr)
+{
+	spin_lock(&bts_tracer_lock);
+
+	on_each_cpu(bts_trace_resume_cpu, NULL, 1);
+	trace_hw_branches_suspended = 0;
+
+	spin_unlock(&bts_tracer_lock);
+}
+
+/*
+ * Suspend tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_lock must be locked.
+ */
+static void bts_trace_suspend_cpu(void *arg)
+{
+	if (this_tracer)
+		ds_suspend_bts(this_tracer);
+}
+
+static void bts_trace_stop(struct trace_array *tr)
+{
+	spin_lock(&bts_tracer_lock);
+
+	on_each_cpu(bts_trace_suspend_cpu, NULL, 1);
+	trace_hw_branches_suspended = 1;
 
 	spin_unlock(&bts_tracer_lock);
 }
@@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+		smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1);
+
+		if (trace_hw_branches_suspended)
+			smp_call_function_single(cpu, bts_trace_suspend_cpu,
+						 NULL, 1);
 		break;
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+		smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1);
 		break;
 	}
 
@@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int bts_trace_init(struct trace_array *tr)
-{
-	hw_branch_trace = tr;
-
-	bts_trace_start(tr);
-
-	return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-	bts_trace_stop(tr);
-}
-
 static void bts_trace_print_header(struct seq_file *m)
 {
 	seq_puts(m, "# CPU#        TO  <-  FROM\n");
@@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
  */
 static void trace_bts_cpu(void *arg)
 {
-	struct trace_array *tr = (struct trace_array *) arg;
+	struct trace_array *tr = (struct trace_array *)arg;
 	const struct bts_trace *trace;
 	unsigned char *at;
 
@@ -276,7 +326,8 @@ void trace_hw_branch_oops(void)
 {
 	spin_lock(&bts_tracer_lock);
 
-	trace_bts_cpu(hw_branch_trace);
+	if (trace_hw_branches_enabled)
+		trace_bts_cpu(hw_branch_trace);
 
 	spin_unlock(&bts_tracer_lock);
 }
-- 
GitLab


From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 13 Mar 2009 10:50:27 +0100
Subject: [PATCH 0104/2198] x86, hw-branch-tracer: add selftest

Add a selftest for the hw-branch-tracer.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h             |  2 ++
 kernel/trace/trace_hw_branches.c |  5 ++-
 kernel/trace/trace_selftest.c    | 53 ++++++++++++++++++++++++++++++++
 3 files changed, 59 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 56ce34d90b030..e7fbc826f1e93 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
 					       struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
 					 struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+					      struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index a99a04c5e9cdd..4ca82700c04e0 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly =
 	.start		= bts_trace_start,
 	.stop		= bts_trace_stop,
 	.open		= trace_bts_prepare,
-	.close		= trace_bts_close
+	.close		= trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
 };
 
 __init static int init_bts_trace(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f907a2b29028a..3c7b797d0d28f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_BRANCH:
 	case TRACE_GRAPH_ENT:
 	case TRACE_GRAPH_RET:
+	case TRACE_HW_BRANCHES:
 		return 1;
 	}
 	return 0;
@@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+				   struct trace_array *tr)
+{
+	unsigned long count;
+	int ret;
+	struct trace_iterator iter;
+	struct tracer tracer;
+
+	if (!trace->open) {
+		printk(KERN_CONT "missing open function...");
+		return -1;
+	}
+
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		return ret;
+	}
+
+	/*
+	 * The hw-branch tracer needs to collect the trace from the various
+	 * cpu trace buffers - before tracing is stopped.
+	 */
+	memset(&iter, 0, sizeof(iter));
+	memcpy(&tracer, trace, sizeof(tracer));
+
+	iter.trace = &tracer;
+	iter.tr = tr;
+	iter.pos = -1;
+	mutex_init(&iter.mutex);
+
+	trace->open(&iter);
+
+	mutex_destroy(&iter.mutex);
+
+	tracing_stop();
+
+	ret = trace_test_buffer(tr, &count);
+	trace->reset(tr);
+	tracing_start();
+
+	if (!ret && !count) {
+		printk(KERN_CONT "no entries found..");
+		ret = -1;
+	}
+
+	return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
-- 
GitLab


From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:54:40 +0100
Subject: [PATCH 0105/2198] x86, bts: cleanups

Impact: cleanup, no code changed

Cc: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c             | 142 +++++++++++++++++--------------
 arch/x86/kernel/ds_selftest.h    |   2 +-
 kernel/trace/trace_hw_branches.c |   6 +-
 kernel/trace/trace_selftest.c    |   5 +-
 4 files changed, 87 insertions(+), 68 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index d9cab7168058b..7363e01ba082f 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,43 +19,52 @@
  * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
 
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/kernel.h>
+
+#include <asm/ds.h>
 
 #include "ds_selftest.h"
 
 /*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
  */
 struct ds_configuration {
-	/* The name of the configuration. */
-	const char *name;
-	/* The size of pointer-typed fields in DS, BTS, and PEBS. */
-	unsigned char  sizeof_ptr_field;
-	/* The size of a BTS/PEBS record in bytes. */
-	unsigned char  sizeof_rec[2];
-	/* Control bit-masks indexed by enum ds_feature. */
-	unsigned long ctl[dsf_ctl_max];
+	/* The name of the configuration: */
+	const char		*name;
+
+	/* The size of pointer-typed fields in DS, BTS, and PEBS: */
+	unsigned char		sizeof_ptr_field;
+
+	/* The size of a BTS/PEBS record in bytes: */
+	unsigned char		sizeof_rec[2];
+
+	/* Control bit-masks indexed by enum ds_feature: */
+	unsigned long		ctl[dsf_ctl_max];
 };
 static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
 
 #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
 
-#define MAX_SIZEOF_DS (12 * 8)	/* Maximal size of a DS configuration. */
-#define MAX_SIZEOF_BTS (3 * 8)	/* Maximal size of a BTS record. */
-#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment. */
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS		(12 * 8)
 
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
-  ds_cfg.ctl[dsf_bts_overflow])
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS		(3 * 8)
 
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT		(1 << 3)
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL				  \
+	( ds_cfg.ctl[dsf_bts]			| \
+	  ds_cfg.ctl[dsf_bts_kernel]		| \
+	  ds_cfg.ctl[dsf_bts_user]		| \
+	  ds_cfg.ctl[dsf_bts_overflow] )
 
 /*
  * A BTS or PEBS tracer.
@@ -65,28 +74,32 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
  */
 struct ds_tracer {
 	/* The DS context (partially) owned by this tracer. */
-	struct ds_context *context;
+	struct ds_context	*context;
 	/* The buffer provided on ds_request() and its size in bytes. */
-	void *buffer;
-	size_t size;
+	void			*buffer;
+	size_t			size;
 };
 
 struct bts_tracer {
-	/* The common DS part. */
-	struct ds_tracer ds;
-	/* The trace including the DS configuration. */
-	struct bts_trace trace;
-	/* Buffer overflow notification function. */
-	bts_ovfl_callback_t ovfl;
+	/* The common DS part: */
+	struct ds_tracer	ds;
+
+	/* The trace including the DS configuration: */
+	struct bts_trace	trace;
+
+	/* Buffer overflow notification function: */
+	bts_ovfl_callback_t	ovfl;
 };
 
 struct pebs_tracer {
-	/* The common DS part. */
-	struct ds_tracer ds;
-	/* The trace including the DS configuration. */
-	struct pebs_trace trace;
-	/* Buffer overflow notification function. */
-	pebs_ovfl_callback_t ovfl;
+	/* The common DS part: */
+	struct ds_tracer	ds;
+
+	/* The trace including the DS configuration: */
+	struct pebs_trace	trace;
+
+	/* Buffer overflow notification function: */
+	pebs_ovfl_callback_t	ovfl;
 };
 
 /*
@@ -95,6 +108,7 @@ struct pebs_tracer {
  *
  * The DS configuration consists of the following fields; different
  * architetures vary in the size of those fields.
+ *
  * - double-word aligned base linear address of the BTS buffer
  * - write pointer into the BTS buffer
  * - end linear address of the BTS buffer (one byte beyond the end of
@@ -133,19 +147,20 @@ enum ds_field {
 };
 
 enum ds_qualifier {
-	ds_bts  = 0,
+	ds_bts = 0,
 	ds_pebs
 };
 
-static inline unsigned long ds_get(const unsigned char *base,
-				   enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
 {
 	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	return *(unsigned long *)base;
 }
 
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
-			  enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+       unsigned long value)
 {
 	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	(*(unsigned long *)base) = value;
@@ -157,7 +172,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
  */
 static DEFINE_SPINLOCK(ds_lock);
 
-
 /*
  * We either support (system-wide) per-cpu or per-thread allocation.
  * We distinguish the two based on the task_struct pointer, where a
@@ -211,17 +225,21 @@ static inline int check_tracer(struct task_struct *task)
  * deallocated when the last user puts the context.
  */
 struct ds_context {
-	/* The DS configuration; goes into MSR_IA32_DS_AREA. */
-	unsigned char ds[MAX_SIZEOF_DS];
-	/* The owner of the BTS and PEBS configuration, respectively. */
-	struct bts_tracer *bts_master;
-	struct pebs_tracer *pebs_master;
-	/* Use count. */
+	/* The DS configuration; goes into MSR_IA32_DS_AREA: */
+	unsigned char		ds[MAX_SIZEOF_DS];
+
+	/* The owner of the BTS and PEBS configuration, respectively: */
+	struct bts_tracer	*bts_master;
+	struct pebs_tracer	*pebs_master;
+
+	/* Use count: */
 	unsigned long count;
-	/* Pointer to the context pointer field. */
-	struct ds_context **this;
-	/* The traced task; NULL for current cpu. */
-	struct task_struct *task;
+
+	/* Pointer to the context pointer field: */
+	struct ds_context	**this;
+
+	/* The traced task; NULL for current cpu: */
+	struct task_struct	*task;
 };
 
 static DEFINE_PER_CPU(struct ds_context *, system_context_array);
@@ -328,9 +346,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
  * The remainder of any partially written record is zeroed out.
  *
  * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual:    the buffer type
+ * record:  the data to write
+ * size:    the size of the data
  */
 static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		    const void *record, size_t size)
@@ -429,12 +447,12 @@ enum bts_field {
 	bts_to,
 	bts_flags,
 
-	bts_qual = bts_from,
-	bts_jiffies = bts_to,
-	bts_pid = bts_flags,
+	bts_qual		= bts_from,
+	bts_jiffies		= bts_to,
+	bts_pid			= bts_flags,
 
-	bts_qual_mask = (bts_qual_max - 1),
-	bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+	bts_qual_mask		= (bts_qual_max - 1),
+	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)
 };
 
 static inline unsigned long bts_get(const char *base, enum bts_field field)
@@ -461,8 +479,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
  *
  * return: bytes read/written on success; -Eerrno, otherwise
  */
-static int bts_read(struct bts_tracer *tracer, const void *at,
-		    struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 {
 	if (!tracer)
 		return -EINVAL;
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
index 0e6e19d4c7d23..2ba8745c66631 100644
--- a/arch/x86/kernel/ds_selftest.h
+++ b/arch/x86/kernel/ds_selftest.h
@@ -12,4 +12,4 @@ extern int ds_selftest_pebs(void);
 #else
 static inline int ds_selftest_bts(void) { return 0; }
 static inline int ds_selftest_pebs(void) { return 0; }
-#endif /* CONFIG_X86_DS_SELFTEST */
+#endif
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 4ca82700c04e0..8b2109a6c61cc 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,5 +1,5 @@
 /*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
@@ -15,8 +15,8 @@
 
 #include <asm/ds.h>
 
-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"
 
 
 #define BTS_BUFFER_SIZE (1 << 13)
@@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m)
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 {
+	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *seq = &iter->seq;
 	struct hw_branch_entry *it;
-	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 
 	trace_assign_type(it, entry);
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 3c7b797d0d28f..b910912670676 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 #else
 # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
 #endif /* CONFIG_DYNAMIC_FTRACE */
+
 /*
  * Simple verification test of ftrace function tracer.
  * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -698,10 +699,10 @@ int
 trace_selftest_startup_hw_branches(struct tracer *trace,
 				   struct trace_array *tr)
 {
-	unsigned long count;
-	int ret;
 	struct trace_iterator iter;
 	struct tracer tracer;
+	unsigned long count;
+	int ret;
 
 	if (!trace->open) {
 		printk(KERN_CONT "missing open function...");
-- 
GitLab


From 79258a354e0c69be94ae2871809a195bf4a647b1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 12:02:08 +0100
Subject: [PATCH 0106/2198] x86, bts: detect size of DS fields, fix

Impact: build fix

One usage site was missed in the sizeof_field -> sizeof_ptr_field
rename.

Cc: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 7363e01ba082f..5fd53333c1dfc 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -983,7 +983,7 @@ ds_configure(const struct ds_configuration *cfg,
 	printk("bts/pebs record: %u/%u bytes\n",
 	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
-	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field));
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-- 
GitLab


From c78a3956b982418186e40978a51636a2b43221bc Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Wed, 18 Mar 2009 19:27:00 +0100
Subject: [PATCH 0107/2198] x86, bts: use atomic memory allocation

Ds_request_bts() needs to allocate memory. It uses GFP_KERNEL.

Hw-branch-tracer calls ds_request_bts() within on_each_cpu().

Use atomic memory allocation to allow it to be used in that context.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
LKML-Reference: <20090318192700.A6038@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 5fd53333c1dfc..b1d6e1f502fa3 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -255,8 +255,13 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 	struct ds_context *new_context = NULL;
 	unsigned long irq;
 
-	/* Chances are small that we already have a context. */
-	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
+	/*
+	 * Chances are small that we already have a context.
+	 *
+	 * Contexts for per-cpu tracing are allocated using
+	 * smp_call_function(). We must not sleep.
+	 */
+	new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC);
 	if (!new_context)
 		return NULL;
 
@@ -662,8 +667,12 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	/*
+	 * Per-cpu tracing is typically requested using smp_call_function().
+	 * We must not sleep.
+	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
 		goto out;
 	tracer->ovfl = ovfl;
@@ -722,8 +731,12 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	/*
+	 * Per-cpu tracing is typically requested using smp_call_function().
+	 * We must not sleep.
+	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
+	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
 		goto out;
 	tracer->ovfl = ovfl;
-- 
GitLab


From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 13:38:36 -0400
Subject: [PATCH 0108/2198] tracing: add handler to trace_stat

Currently, if a trace_stat user wants a handle to some private data,
the trace_stat infrastructure does not supply a way to do that.

This patch passes the trace_stat structure to the start function of
the trace_stat code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_branch.c    | 4 ++--
 kernel/trace/trace_stat.c      | 2 +-
 kernel/trace/trace_stat.h      | 2 +-
 kernel/trace/trace_workqueue.c | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ad8c22efff417..e6e32912ffb81 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
 {
 	return __start_annotated_branch_profile;
 }
@@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m)
 	return 0;
 }
 
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
 {
 	return __start_branch_profile;
 }
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f71b85b22cfe2..f8f48d84b2c38 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 	if (!ts->stat_cmp)
 		ts->stat_cmp = dummy_cmp;
 
-	stat = ts->stat_start();
+	stat = ts->stat_start(ts);
 	if (!stat)
 		goto exit;
 
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
index 202274cf7f3d0..f3546a2cd826b 100644
--- a/kernel/trace/trace_stat.h
+++ b/kernel/trace/trace_stat.h
@@ -12,7 +12,7 @@ struct tracer_stat {
 	/* The name of your stat file */
 	const char		*name;
 	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
+	void			*(*stat_start)(struct tracer_stat *trace);
 	void			*(*stat_next)(void *prev, int idx);
 	/* Compare two entries for stats sorting */
 	int			(*stat_cmp)(void *p1, void *p2);
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 9ab035b58cf14..ee533c2e161bb 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 	return ret;
 }
 
-static void *workqueue_stat_start(void)
+static void *workqueue_stat_start(struct tracer_stat *trace)
 {
 	int cpu;
 	void *ret = NULL;
-- 
GitLab


From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 20 Mar 2009 12:50:56 -0400
Subject: [PATCH 0109/2198] tracing: add function profiler

Impact: new profiling feature

This patch adds a function profiler. In debugfs/tracing/ two new
files are created.

  function_profile_enabled  - to enable or disable profiling

  trace_stat/functions   - the profiled functions.

For example:

  echo 1 > /debugfs/tracing/function_profile_enabled
  ./hackbench 50
  echo 0 > /debugfs/tracing/function_profile_enabled

yields:

  cat /debugfs/tracing/trace_stat/functions

  Function                               Hit
  --------                               ---
  _spin_lock                        10106442
  _spin_unlock                      10097492
  kfree                              6013704
  _spin_unlock_irqrestore            4423941
  _spin_lock_irqsave                 4406825
  __phys_addr                        4181686
  __slab_free                        4038222
  dput                               4030130
  path_put                           4023387
  unroll_tree_refs                   4019532
[...]

The most hit functions are listed first. Functions that are not
hit are not listed.

This feature depends on and uses dynamic function tracing. When the
function profiling is disabled, no overhead occurs. But it still
takes up around 300KB to hold the data, thus it is not recomended
to keep it enabled for systems low on memory.

When a '1' is echoed into the function_profile_enabled file, the
counters for is function is reset back to zero. Thus you can see what
functions are hit most by different programs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h |   4 +
 kernel/trace/Kconfig   |  19 +++
 kernel/trace/ftrace.c  | 313 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 334 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 015a3d22cf743..0456c3a51c669 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -153,6 +153,10 @@ struct dyn_ftrace {
 		unsigned long		flags;
 		struct dyn_ftrace	*newlist;
 	};
+#ifdef CONFIG_FUNCTION_PROFILER
+	unsigned long			counter;
+	struct hlist_node		node;
+#endif
 	struct dyn_arch_ftrace		arch;
 };
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8a4d72931042a..95e9ad5735d9c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
@@ -376,6 +377,24 @@ config DYNAMIC_FTRACE
 	 were made. If so, it runs stop_machine (stops all CPUS)
 	 and modifies the code to jump over the call to ftrace.
 
+config FUNCTION_PROFILER
+	bool "Kernel function profiler"
+	depends on DYNAMIC_FTRACE
+	default n
+	help
+	 This option enables the kernel function profiler. When the dynamic
+	 function tracing is enabled, a counter is added into the function
+	 records used by the dynamic function tracer. A file is created in
+	 debugfs called function_profile_enabled which defaults to zero.
+	 When a 1 is echoed into this file profiling begins, and when a
+	 zero is entered, profiling stops. A file in the trace_stats
+	 directory called functions, that show the list of functions that
+	 have been hit and their counters.
+
+	 This takes up around 320K more memory.
+
+	 If in doubt, say N
+
 config FTRACE_MCOUNT_RECORD
 	def_bool y
 	depends on DYNAMIC_FTRACE
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b8722baf1531..11f364c776d54 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -34,6 +34,7 @@
 #include <asm/ftrace.h>
 
 #include "trace.h"
+#include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)			\
 	do {					\
@@ -261,7 +262,6 @@ struct ftrace_func_probe {
 	struct rcu_head		rcu;
 };
 
-
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records;
 		}				\
 	}
 
+#ifdef CONFIG_FUNCTION_PROFILER
+static struct hlist_head *ftrace_profile_hash;
+static int ftrace_profile_bits;
+static int ftrace_profile_enabled;
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static void *
+function_stat_next(void *v, int idx)
+{
+	struct dyn_ftrace *rec = v;
+	struct ftrace_page *pg;
+
+	pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+	rec++;
+	if ((void *)rec >= (void *)&pg->records[pg->index]) {
+		pg = pg->next;
+		if (!pg)
+			return NULL;
+		rec = &pg->records[0];
+	}
+
+	if (rec->flags & FTRACE_FL_FREE ||
+	    rec->flags & FTRACE_FL_FAILED ||
+	    !(rec->flags & FTRACE_FL_CONVERTED) ||
+	    /* ignore non hit functions */
+	    !rec->counter)
+		goto again;
+
+	return rec;
+}
+
+static void *function_stat_start(struct tracer_stat *trace)
+{
+	return function_stat_next(&ftrace_pages_start->records[0], 0);
+}
+
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct dyn_ftrace *a = p1;
+	struct dyn_ftrace *b = p2;
+
+	if (a->counter < b->counter)
+		return -1;
+	if (a->counter > b->counter)
+		return 1;
+	else
+		return 0;
+}
+
+static int function_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "  Function                               Hit\n"
+		      "  --------                               ---\n");
+	return 0;
+}
+
+static int function_stat_show(struct seq_file *m, void *v)
+{
+	struct dyn_ftrace *rec = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+
+	seq_printf(m, "  %-30.30s  %10lu\n", str, rec->counter);
+	return 0;
+}
+
+static struct tracer_stat function_stats = {
+	.name = "functions",
+	.stat_start = function_stat_start,
+	.stat_next = function_stat_next,
+	.stat_cmp = function_stat_cmp,
+	.stat_headers = function_stat_headers,
+	.stat_show = function_stat_show
+};
+
+static void ftrace_profile_init(int nr_funcs)
+{
+	unsigned long addr;
+	int order;
+	int size;
+
+	/*
+	 * We are profiling all functions, lets make it 1/4th of the
+	 * number of functions that are in core kernel. So we have to
+	 * iterate 4 times.
+	 */
+	order = (sizeof(struct hlist_head) * nr_funcs) / 4;
+	order = get_order(order);
+	size = 1 << (PAGE_SHIFT + order);
+
+	pr_info("Allocating %d KB for profiler hash\n", size >> 10);
+
+	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
+	if (!addr) {
+		pr_warning("Could not allocate function profiler hash\n");
+		return;
+	}
+
+	ftrace_profile_hash = (void *)addr;
+
+	/*
+	 * struct hlist_head should be a pointer of 4 or 8 bytes.
+	 * And a simple bit manipulation can be done, but if for
+	 * some reason struct hlist_head is not a mulitple of 2,
+	 * then we play it safe, and simply count. This function
+	 * is done once at boot up, so it is not that critical in
+	 * performance.
+	 */
+
+	size--;
+	size /= sizeof(struct hlist_head);
+
+	for (; size; size >>= 1)
+		ftrace_profile_bits++;
+
+	pr_info("Function profiler has %d hash buckets\n",
+		1 << ftrace_profile_bits);
+
+	return;
+}
+
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static void ftrace_profile_reset(void)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+
+	do_for_each_ftrace_rec(pg, rec) {
+		rec->counter = 0;
+	} while_for_each_ftrace_rec();
+}
+
+static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip)
+{
+	struct dyn_ftrace *rec;
+	struct hlist_head *hhd;
+	struct hlist_node *n;
+	unsigned long flags;
+	unsigned long key;
+
+	if (!ftrace_profile_hash)
+		return NULL;
+
+	key = hash_long(ip, ftrace_profile_bits);
+	hhd = &ftrace_profile_hash[key];
+
+	if (hlist_empty(hhd))
+		return NULL;
+
+	local_irq_save(flags);
+	hlist_for_each_entry_rcu(rec, n, hhd, node) {
+		if (rec->ip == ip)
+			goto out;
+	}
+	rec = NULL;
+ out:
+	local_irq_restore(flags);
+
+	return rec;
+}
+
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct dyn_ftrace *rec;
+	unsigned long flags;
+
+	if (!ftrace_profile_enabled)
+		return;
+
+	local_irq_save(flags);
+	rec = ftrace_find_profiled_func(ip);
+	if (!rec)
+		goto out;
+
+	rec->counter++;
+ out:
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+	.func = function_profile_call,
+};
+
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	unsigned long val;
+	char buf[64];
+	int ret;
+
+	if (!ftrace_profile_hash) {
+		pr_info("Can not enable hash due to earlier problems\n");
+		return -ENODEV;
+	}
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	val = !!val;
+
+	mutex_lock(&ftrace_profile_lock);
+	if (ftrace_profile_enabled ^ val) {
+		if (val) {
+			ftrace_profile_reset();
+			register_ftrace_function(&ftrace_profile_ops);
+			ftrace_profile_enabled = 1;
+		} else {
+			ftrace_profile_enabled = 0;
+			unregister_ftrace_function(&ftrace_profile_ops);
+		}
+	}
+	mutex_unlock(&ftrace_profile_lock);
+
+	filp->f_pos += cnt;
+
+	return cnt;
+}
+
+static const struct file_operations ftrace_profile_fops = {
+	.open		= tracing_open_generic,
+	.read		= ftrace_profile_read,
+	.write		= ftrace_profile_write,
+};
+
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+	struct dentry *entry;
+	int ret;
+
+	ret = register_stat_tracer(&function_stats);
+	if (ret) {
+		pr_warning("Warning: could not register "
+			   "function stats\n");
+		return;
+	}
+
+	entry = debugfs_create_file("function_profile_enabled", 0644,
+				    d_tracer, NULL, &ftrace_profile_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'function_profile_enabled' entry\n");
+}
+
+static void ftrace_add_profile(struct dyn_ftrace *rec)
+{
+	unsigned long key;
+
+	if (!ftrace_profile_hash)
+		return;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+}
+
+static void ftrace_profile_release(struct dyn_ftrace *rec)
+{
+	mutex_lock(&ftrace_profile_lock);
+	hlist_del(&rec->node);
+	mutex_unlock(&ftrace_profile_lock);
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+static void ftrace_profile_init(int nr_funcs)
+{
+}
+static void ftrace_add_profile(struct dyn_ftrace *rec)
+{
+}
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+static void ftrace_profile_release(struct dyn_ftrace *rec)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
 #ifdef CONFIG_KPROBES
 
 static int frozen_record_count;
@@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size)
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE))
+		    !(rec->flags & FTRACE_FL_FREE)) {
 			ftrace_free_rec(rec);
+			ftrace_profile_release(rec);
+		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
@@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip)
 	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
+	ftrace_add_profile(rec);
+
 	return rec;
 }
 
@@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 
@@ -2225,6 +2532,8 @@ void __init ftrace_init(void)
 	if (ret)
 		goto failed;
 
+	ftrace_profile_init(count);
+
 	last_ftrace_enabled = ftrace_enabled = 1;
 
 	ret = ftrace_convert_nops(NULL,
-- 
GitLab


From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 17:12:36 -0400
Subject: [PATCH 0110/2198] tracing: move function profiler data out of
 function struct

Impact: reduce size of memory in function profiler

The function profiler originally introduces its counters into the
function records itself. There is 20 thousand different functions on
a normal system, and that is adding 20 thousand counters for profiling
event when not needed.

A normal run of the profiler yields only a couple of thousand functions
executed, depending on what is being profiled. This means we have around
18 thousand useless counters.

This patch rectifies this by moving the data out of the function
records used by dynamic ftrace. Data is preallocated to hold the functions
when the profiling begins. Checks are made during profiling to see if
more recorcds should be allocated, and they are allocated if it is safe
to do so.

This also removes the dependency from using dynamic ftrace, and also
removes the overhead by having it enabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h |   4 -
 kernel/trace/Kconfig   |  10 +-
 kernel/trace/ftrace.c  | 440 ++++++++++++++++++++++++-----------------
 3 files changed, 263 insertions(+), 191 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 0456c3a51c669..015a3d22cf743 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -153,10 +153,6 @@ struct dyn_ftrace {
 		unsigned long		flags;
 		struct dyn_ftrace	*newlist;
 	};
-#ifdef CONFIG_FUNCTION_PROFILER
-	unsigned long			counter;
-	struct hlist_node		node;
-#endif
 	struct dyn_arch_ftrace		arch;
 };
 
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 95e9ad5735d9c..8a4136096d7d9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -379,20 +379,16 @@ config DYNAMIC_FTRACE
 
 config FUNCTION_PROFILER
 	bool "Kernel function profiler"
-	depends on DYNAMIC_FTRACE
+	depends on FUNCTION_TRACER
 	default n
 	help
-	 This option enables the kernel function profiler. When the dynamic
-	 function tracing is enabled, a counter is added into the function
-	 records used by the dynamic function tracer. A file is created in
-	 debugfs called function_profile_enabled which defaults to zero.
+	 This option enables the kernel function profiler. A file is created
+	 in debugfs called function_profile_enabled which defaults to zero.
 	 When a 1 is echoed into this file profiling begins, and when a
 	 zero is entered, profiling stops. A file in the trace_stats
 	 directory called functions, that show the list of functions that
 	 have been hit and their counters.
 
-	 This takes up around 320K more memory.
-
 	 If in doubt, say N
 
 config FTRACE_MCOUNT_RECORD
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 11f364c776d54..24dac448cdc95 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -241,87 +241,48 @@ static void ftrace_update_pid_func(void)
 #endif
 }
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
-#ifdef CONFIG_DYNAMIC_FTRACE
-
-#ifndef CONFIG_FTRACE_MCOUNT_RECORD
-# error Dynamic ftrace depends on MCOUNT_RECORD
-#endif
-
-static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
-
-struct ftrace_func_probe {
-	struct hlist_node	node;
-	struct ftrace_probe_ops	*ops;
-	unsigned long		flags;
-	unsigned long		ip;
-	void			*data;
-	struct rcu_head		rcu;
+#ifdef CONFIG_FUNCTION_PROFILER
+struct ftrace_profile {
+	struct hlist_node		node;
+	unsigned long			ip;
+	unsigned long			counter;
 };
 
-enum {
-	FTRACE_ENABLE_CALLS		= (1 << 0),
-	FTRACE_DISABLE_CALLS		= (1 << 1),
-	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
-	FTRACE_ENABLE_MCOUNT		= (1 << 3),
-	FTRACE_DISABLE_MCOUNT		= (1 << 4),
-	FTRACE_START_FUNC_RET		= (1 << 5),
-	FTRACE_STOP_FUNC_RET		= (1 << 6),
+struct ftrace_profile_page {
+	struct ftrace_profile_page	*next;
+	unsigned long			index;
+	struct ftrace_profile		records[];
 };
 
-static int ftrace_filtered;
+#define PROFILE_RECORDS_SIZE						\
+	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
 
-static struct dyn_ftrace *ftrace_new_addrs;
+#define PROFILES_PER_PAGE					\
+	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static DEFINE_MUTEX(ftrace_regex_lock);
-
-struct ftrace_page {
-	struct ftrace_page	*next;
-	int			index;
-	struct dyn_ftrace	records[];
-};
+/* TODO: make these percpu, to prevent cache line bouncing */
+static struct ftrace_profile_page *profile_pages_start;
+static struct ftrace_profile_page *profile_pages;
 
-#define ENTRIES_PER_PAGE \
-  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
-
-/* estimate from running different kernels */
-#define NR_TO_INIT		10000
-
-static struct ftrace_page	*ftrace_pages_start;
-static struct ftrace_page	*ftrace_pages;
-
-static struct dyn_ftrace *ftrace_free_records;
-
-/*
- * This is a double for. Do not use 'break' to break out of the loop,
- * you must use a goto.
- */
-#define do_for_each_ftrace_rec(pg, rec)					\
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
-		int _____i;						\
-		for (_____i = 0; _____i < pg->index; _____i++) {	\
-			rec = &pg->records[_____i];
-
-#define while_for_each_ftrace_rec()		\
-		}				\
-	}
-
-#ifdef CONFIG_FUNCTION_PROFILER
 static struct hlist_head *ftrace_profile_hash;
 static int ftrace_profile_bits;
 static int ftrace_profile_enabled;
 static DEFINE_MUTEX(ftrace_profile_lock);
 
+static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+static raw_spinlock_t ftrace_profile_rec_lock =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
 static void *
 function_stat_next(void *v, int idx)
 {
-	struct dyn_ftrace *rec = v;
-	struct ftrace_page *pg;
+	struct ftrace_profile *rec = v;
+	struct ftrace_profile_page *pg;
 
-	pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK);
+	pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
 
  again:
 	rec++;
@@ -330,27 +291,22 @@ function_stat_next(void *v, int idx)
 		if (!pg)
 			return NULL;
 		rec = &pg->records[0];
+		if (!rec->counter)
+			goto again;
 	}
 
-	if (rec->flags & FTRACE_FL_FREE ||
-	    rec->flags & FTRACE_FL_FAILED ||
-	    !(rec->flags & FTRACE_FL_CONVERTED) ||
-	    /* ignore non hit functions */
-	    !rec->counter)
-		goto again;
-
 	return rec;
 }
 
 static void *function_stat_start(struct tracer_stat *trace)
 {
-	return function_stat_next(&ftrace_pages_start->records[0], 0);
+	return function_stat_next(&profile_pages_start->records[0], 0);
 }
 
 static int function_stat_cmp(void *p1, void *p2)
 {
-	struct dyn_ftrace *a = p1;
-	struct dyn_ftrace *b = p2;
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
 
 	if (a->counter < b->counter)
 		return -1;
@@ -369,7 +325,7 @@ static int function_stat_headers(struct seq_file *m)
 
 static int function_stat_show(struct seq_file *m, void *v)
 {
-	struct dyn_ftrace *rec = v;
+	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -387,115 +343,191 @@ static struct tracer_stat function_stats = {
 	.stat_show = function_stat_show
 };
 
-static void ftrace_profile_init(int nr_funcs)
+static void ftrace_profile_reset(void)
 {
-	unsigned long addr;
-	int order;
-	int size;
+	struct ftrace_profile_page *pg;
 
-	/*
-	 * We are profiling all functions, lets make it 1/4th of the
-	 * number of functions that are in core kernel. So we have to
-	 * iterate 4 times.
-	 */
-	order = (sizeof(struct hlist_head) * nr_funcs) / 4;
-	order = get_order(order);
-	size = 1 << (PAGE_SHIFT + order);
-
-	pr_info("Allocating %d KB for profiler hash\n", size >> 10);
+	pg = profile_pages = profile_pages_start;
 
-	addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order);
-	if (!addr) {
-		pr_warning("Could not allocate function profiler hash\n");
-		return;
+	while (pg) {
+		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+		pg->index = 0;
+		pg = pg->next;
 	}
 
-	ftrace_profile_hash = (void *)addr;
+	memset(ftrace_profile_hash, 0,
+	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
 
-	/*
-	 * struct hlist_head should be a pointer of 4 or 8 bytes.
-	 * And a simple bit manipulation can be done, but if for
-	 * some reason struct hlist_head is not a mulitple of 2,
-	 * then we play it safe, and simply count. This function
-	 * is done once at boot up, so it is not that critical in
-	 * performance.
-	 */
+int ftrace_profile_pages_init(void)
+{
+	struct ftrace_profile_page *pg;
+	int i;
 
-	size--;
-	size /= sizeof(struct hlist_head);
+	/* If we already allocated, do nothing */
+	if (profile_pages)
+		return 0;
 
-	for (; size; size >>= 1)
-		ftrace_profile_bits++;
+	profile_pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!profile_pages)
+		return -ENOMEM;
 
-	pr_info("Function profiler has %d hash buckets\n",
-		1 << ftrace_profile_bits);
+	pg = profile_pages_start = profile_pages;
 
-	return;
+	/* allocate 10 more pages to start */
+	for (i = 0; i < 10; i++) {
+		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+		/*
+		 * We only care about allocating profile_pages, if
+		 * we failed to allocate here, hopefully we will allocate
+		 * later.
+		 */
+		if (!pg->next)
+			break;
+		pg = pg->next;
+	}
+
+	return 0;
 }
 
-static ssize_t
-ftrace_profile_read(struct file *filp, char __user *ubuf,
-		     size_t cnt, loff_t *ppos)
+static int ftrace_profile_init(void)
 {
-	char buf[64];
-	int r;
+	int size;
 
-	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
+	if (ftrace_profile_hash) {
+		/* If the profile is already created, simply reset it */
+		ftrace_profile_reset();
+		return 0;
+	}
 
-static void ftrace_profile_reset(void)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
+	/*
+	 * We are profiling all functions, but usually only a few thousand
+	 * functions are hit. We'll make a hash of 1024 items.
+	 */
+	size = FTRACE_PROFILE_HASH_SIZE;
 
-	do_for_each_ftrace_rec(pg, rec) {
-		rec->counter = 0;
-	} while_for_each_ftrace_rec();
+	ftrace_profile_hash =
+		kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+
+	if (!ftrace_profile_hash)
+		return -ENOMEM;
+
+	size--;
+
+	for (; size; size >>= 1)
+		ftrace_profile_bits++;
+
+	/* Preallocate a few pages */
+	if (ftrace_profile_pages_init() < 0) {
+		kfree(ftrace_profile_hash);
+		ftrace_profile_hash = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
 }
 
-static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip)
+/* interrupts must be disabled */
+static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 {
-	struct dyn_ftrace *rec;
+	struct ftrace_profile *rec;
 	struct hlist_head *hhd;
 	struct hlist_node *n;
-	unsigned long flags;
 	unsigned long key;
 
-	if (!ftrace_profile_hash)
-		return NULL;
-
 	key = hash_long(ip, ftrace_profile_bits);
 	hhd = &ftrace_profile_hash[key];
 
 	if (hlist_empty(hhd))
 		return NULL;
 
-	local_irq_save(flags);
 	hlist_for_each_entry_rcu(rec, n, hhd, node) {
 		if (rec->ip == ip)
-			goto out;
+			return rec;
+	}
+
+	return NULL;
+}
+
+static void ftrace_add_profile(struct ftrace_profile *rec)
+{
+	unsigned long key;
+
+	key = hash_long(rec->ip, ftrace_profile_bits);
+	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+}
+
+/* Interrupts must be disabled calling this */
+static struct ftrace_profile *
+ftrace_profile_alloc(unsigned long ip, bool alloc_safe)
+{
+	struct ftrace_profile *rec = NULL;
+
+	/* prevent recursion */
+	if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1)
+		goto out;
+
+	__raw_spin_lock(&ftrace_profile_rec_lock);
+
+	/* Try to always keep another page available */
+	if (!profile_pages->next && alloc_safe)
+		profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
+
+	/*
+	 * Try to find the function again since another
+	 * task on another CPU could have added it
+	 */
+	rec = ftrace_find_profiled_func(ip);
+	if (rec)
+		goto out_unlock;
+
+	if (profile_pages->index == PROFILES_PER_PAGE) {
+		if (!profile_pages->next)
+			goto out_unlock;
+		profile_pages = profile_pages->next;
 	}
-	rec = NULL;
+
+	rec = &profile_pages->records[profile_pages->index++];
+	rec->ip = ip;
+	ftrace_add_profile(rec);
+
+ out_unlock:
+	__raw_spin_unlock(&ftrace_profile_rec_lock);
  out:
-	local_irq_restore(flags);
+	atomic_dec(&__get_cpu_var(ftrace_profile_disable));
 
 	return rec;
 }
 
+/*
+ * If we are not in an interrupt, or softirq and
+ * and interrupts are disabled and preemption is not enabled
+ * (not in a spinlock) then it should be safe to allocate memory.
+ */
+static bool ftrace_safe_to_allocate(void)
+{
+	return !in_interrupt() && irqs_disabled() && !preempt_count();
+}
+
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
-	struct dyn_ftrace *rec;
+	struct ftrace_profile *rec;
 	unsigned long flags;
+	bool alloc_safe;
 
 	if (!ftrace_profile_enabled)
 		return;
 
+	alloc_safe = ftrace_safe_to_allocate();
+
 	local_irq_save(flags);
 	rec = ftrace_find_profiled_func(ip);
-	if (!rec)
-		goto out;
+	if (!rec) {
+		rec = ftrace_profile_alloc(ip, alloc_safe);
+		if (!rec)
+			goto out;
+	}
 
 	rec->counter++;
  out:
@@ -515,11 +547,6 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	char buf[64];
 	int ret;
 
-	if (!ftrace_profile_hash) {
-		pr_info("Can not enable hash due to earlier problems\n");
-		return -ENODEV;
-	}
-
 	if (cnt >= sizeof(buf))
 		return -EINVAL;
 
@@ -537,7 +564,12 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	mutex_lock(&ftrace_profile_lock);
 	if (ftrace_profile_enabled ^ val) {
 		if (val) {
-			ftrace_profile_reset();
+			ret = ftrace_profile_init();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
+
 			register_ftrace_function(&ftrace_profile_ops);
 			ftrace_profile_enabled = 1;
 		} else {
@@ -545,6 +577,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			unregister_ftrace_function(&ftrace_profile_ops);
 		}
 	}
+ out:
 	mutex_unlock(&ftrace_profile_lock);
 
 	filp->f_pos += cnt;
@@ -552,6 +585,17 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+		     size_t cnt, loff_t *ppos)
+{
+	char buf[64];
+	int r;
+
+	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
 static const struct file_operations ftrace_profile_fops = {
 	.open		= tracing_open_generic,
 	.read		= ftrace_profile_read,
@@ -577,39 +621,80 @@ static void ftrace_profile_debugfs(struct dentry *d_tracer)
 			   "'function_profile_enabled' entry\n");
 }
 
-static void ftrace_add_profile(struct dyn_ftrace *rec)
-{
-	unsigned long key;
-
-	if (!ftrace_profile_hash)
-		return;
-
-	key = hash_long(rec->ip, ftrace_profile_bits);
-	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
-}
-
-static void ftrace_profile_release(struct dyn_ftrace *rec)
-{
-	mutex_lock(&ftrace_profile_lock);
-	hlist_del(&rec->node);
-	mutex_unlock(&ftrace_profile_lock);
-}
-
 #else /* CONFIG_FUNCTION_PROFILER */
-static void ftrace_profile_init(int nr_funcs)
-{
-}
-static void ftrace_add_profile(struct dyn_ftrace *rec)
-{
-}
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
 {
 }
-static void ftrace_profile_release(struct dyn_ftrace *rec)
-{
-}
 #endif /* CONFIG_FUNCTION_PROFILER */
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+#ifndef CONFIG_FTRACE_MCOUNT_RECORD
+# error Dynamic ftrace depends on MCOUNT_RECORD
+#endif
+
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_probe {
+	struct hlist_node	node;
+	struct ftrace_probe_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+enum {
+	FTRACE_ENABLE_CALLS		= (1 << 0),
+	FTRACE_DISABLE_CALLS		= (1 << 1),
+	FTRACE_UPDATE_TRACE_FUNC	= (1 << 2),
+	FTRACE_ENABLE_MCOUNT		= (1 << 3),
+	FTRACE_DISABLE_MCOUNT		= (1 << 4),
+	FTRACE_START_FUNC_RET		= (1 << 5),
+	FTRACE_STOP_FUNC_RET		= (1 << 6),
+};
+
+static int ftrace_filtered;
+
+static struct dyn_ftrace *ftrace_new_addrs;
+
+static DEFINE_MUTEX(ftrace_regex_lock);
+
+struct ftrace_page {
+	struct ftrace_page	*next;
+	int			index;
+	struct dyn_ftrace	records[];
+};
+
+#define ENTRIES_PER_PAGE \
+  ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
+
+/* estimate from running different kernels */
+#define NR_TO_INIT		10000
+
+static struct ftrace_page	*ftrace_pages_start;
+static struct ftrace_page	*ftrace_pages;
+
+static struct dyn_ftrace *ftrace_free_records;
+
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec)					\
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
+		int _____i;						\
+		for (_____i = 0; _____i < pg->index; _____i++) {	\
+			rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec()		\
+		}				\
+	}
+
 #ifdef CONFIG_KPROBES
 
 static int frozen_record_count;
@@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size)
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE)) {
+		    !(rec->flags & FTRACE_FL_FREE))
 			ftrace_free_rec(rec);
-			ftrace_profile_release(rec);
-		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
@@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip)
 	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
-	ftrace_add_profile(rec);
-
 	return rec;
 }
 
@@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-	ftrace_profile_debugfs(d_tracer);
-
 	return 0;
 }
 
@@ -2532,8 +2611,6 @@ void __init ftrace_init(void)
 	if (ret)
 		goto failed;
 
-	ftrace_profile_init(count);
-
 	last_ftrace_enabled = ftrace_enabled = 1;
 
 	ret = ftrace_convert_nops(NULL,
@@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'set_ftrace_pid' entry\n");
+
+	ftrace_profile_debugfs(d_tracer);
+
 	return 0;
 }
 fs_initcall(ftrace_init_debugfs);
-- 
GitLab


From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 23:12:58 -0400
Subject: [PATCH 0111/2198] tracing: adding function timings to function
 profiler

If the function graph trace is enabled, the function profiler will
use it to take the timing of the functions.

 cat /debug/tracing/trace_stat/functions

  Function                               Hit    Time
  --------                               ---    ----
  mwait_idle                             127    183028.4 us
  schedule                                26    151997.7 us
  __schedule                              31    151975.1 us
  sys_wait4                                2    74080.53 us
  do_wait                                  2    74077.80 us
  sys_newlstat                           138    39929.16 us
  do_path_lookup                         179    39845.79 us
  vfs_lstat_fd                           138    39761.97 us
  user_path_at                           153    39469.58 us
  path_walk                              179    39435.76 us
  __link_path_walk                       189    39143.73 us
[...]

Note the times are skewed due to the function graph tracer not taking
into account schedules.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c                | 93 ++++++++++++++++++++++++++--
 kernel/trace/trace.c                 | 11 ----
 kernel/trace/trace.h                 |  3 +-
 kernel/trace/trace_functions_graph.c | 17 ++++-
 kernel/trace/trace_output.c          | 10 +++
 kernel/trace/trace_output.h          |  2 +
 6 files changed, 117 insertions(+), 19 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 24dac448cdc95..a9ccd71fc9226 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,7 +33,7 @@
 
 #include <asm/ftrace.h>
 
-#include "trace.h"
+#include "trace_output.h"
 #include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)			\
@@ -246,6 +246,9 @@ struct ftrace_profile {
 	struct hlist_node		node;
 	unsigned long			ip;
 	unsigned long			counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	unsigned long long		time;
+#endif
 };
 
 struct ftrace_profile_page {
@@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace)
 	return function_stat_next(&profile_pages_start->records[0], 0);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* function graph compares on total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_profile *a = p1;
+	struct ftrace_profile *b = p2;
+
+	if (a->time < b->time)
+		return -1;
+	if (a->time > b->time)
+		return 1;
+	else
+		return 0;
+}
+#else
+/* not function graph compares against hits */
 static int function_stat_cmp(void *p1, void *p2)
 {
 	struct ftrace_profile *a = p1;
@@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2)
 	else
 		return 0;
 }
+#endif
 
 static int function_stat_headers(struct seq_file *m)
 {
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "  Function                               Hit    Time\n"
+		      "  --------                               ---    ----\n");
+#else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
+#endif
 	return 0;
 }
 
@@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v)
 {
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	static struct trace_seq s;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+#endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+	seq_printf(m, "    ");
+	trace_print_seq(m, &s);
+	mutex_unlock(&mutex);
+#endif
+	seq_putc(m, '\n');
 
-	seq_printf(m, "  %-30.30s  %10lu\n", str, rec->counter);
 	return 0;
 }
 
@@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+	function_profile_call(trace->func, 0);
+	return 1;
+}
+
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+	unsigned long flags;
+	struct ftrace_profile *rec;
+
+	local_irq_save(flags);
+	rec = ftrace_find_profiled_func(trace->func);
+	if (rec)
+		rec->time += trace->rettime - trace->calltime;
+	local_irq_restore(flags);
+}
+
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_graph(&profile_graph_return,
+				     &profile_graph_entry);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_graph();
+}
+#else
 static struct ftrace_ops ftrace_profile_ops __read_mostly =
 {
 	.func = function_profile_call,
 };
 
+static int register_ftrace_profiler(void)
+{
+	return register_ftrace_function(&ftrace_profile_ops);
+}
+
+static void unregister_ftrace_profiler(void)
+{
+	unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
 static ssize_t
 ftrace_profile_write(struct file *filp, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
@@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 				goto out;
 			}
 
-			register_ftrace_function(&ftrace_profile_ops);
+			ret = register_ftrace_profiler();
+			if (ret < 0) {
+				cnt = ret;
+				goto out;
+			}
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
-			unregister_ftrace_function(&ftrace_profile_ops);
+			unregister_ftrace_profiler();
 		}
 	}
  out:
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 67c6a21dd4274..821bf49771d4c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 	return cnt;
 }
 
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
-	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
-	s->buffer[len] = 0;
-	seq_puts(m, s->buffer);
-
-	trace_seq_init(s);
-}
-
 /**
  * update_max_tr - snapshot all trace buffers from global_trace to max_tr
  * @tr: tracer
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d7410bbb9a803..c66ca3b660502 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -605,6 +605,8 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
@@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
 	return 1;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
-
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a7b..85bba0f018b04 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 	return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
 	unsigned long nsecs_rem = do_div(duration, 1000);
 	/* log10(ULONG_MAX) + '\0' */
@@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_print_graph_duration(duration, s);
+	if (ret != TRACE_TYPE_HANDLED)
+		return ret;
 
 	ret = trace_seq_printf(s, "|  ");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
-	return TRACE_TYPE_HANDLED;
 
+	return TRACE_TYPE_HANDLED;
 }
 
 /* Case of a leaf function on its call entry */
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 19261fdd24558..a3b6e3fd70446 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+	int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+	s->buffer[len] = 0;
+	seq_puts(m, s->buffer);
+
+	trace_seq_init(s);
+}
+
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 35c422fb51a93..1eac2973374e3 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
-- 
GitLab


From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 20:50:39 -0400
Subject: [PATCH 0112/2198] tracing: make the function profiler per cpu

Impact: speed enhancement

By making the function profiler record in per cpu data we not only
get better readings, avoid races, we also do not have to take any
locks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 199 +++++++++++++++++++++++++++---------------
 1 file changed, 130 insertions(+), 69 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a9ccd71fc9226..ed1fc5021d441 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -257,28 +257,28 @@ struct ftrace_profile_page {
 	struct ftrace_profile		records[];
 };
 
+struct ftrace_profile_stat {
+	atomic_t			disabled;
+	struct hlist_head		*hash;
+	struct ftrace_profile_page	*pages;
+	struct ftrace_profile_page	*start;
+	struct tracer_stat		stat;
+};
+
 #define PROFILE_RECORDS_SIZE						\
 	(PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
 
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-/* TODO: make these percpu, to prevent cache line bouncing */
-static struct ftrace_profile_page *profile_pages_start;
-static struct ftrace_profile_page *profile_pages;
-
-static struct hlist_head *ftrace_profile_hash;
 static int ftrace_profile_bits;
 static int ftrace_profile_enabled;
 static DEFINE_MUTEX(ftrace_profile_lock);
 
-static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable);
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
 
 #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
 
-static raw_spinlock_t ftrace_profile_rec_lock =
-	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
-
 static void *
 function_stat_next(void *v, int idx)
 {
@@ -303,7 +303,13 @@ function_stat_next(void *v, int idx)
 
 static void *function_stat_start(struct tracer_stat *trace)
 {
-	return function_stat_next(&profile_pages_start->records[0], 0);
+	struct ftrace_profile_stat *stat =
+		container_of(trace, struct ftrace_profile_stat, stat);
+
+	if (!stat || !stat->start)
+		return NULL;
+
+	return function_stat_next(&stat->start->records[0], 0);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct tracer_stat function_stats = {
-	.name = "functions",
-	.stat_start = function_stat_start,
-	.stat_next = function_stat_next,
-	.stat_cmp = function_stat_cmp,
-	.stat_headers = function_stat_headers,
-	.stat_show = function_stat_show
-};
-
-static void ftrace_profile_reset(void)
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
 
-	pg = profile_pages = profile_pages_start;
+	pg = stat->pages = stat->start;
 
 	while (pg) {
 		memset(pg->records, 0, PROFILE_RECORDS_SIZE);
@@ -395,24 +392,24 @@ static void ftrace_profile_reset(void)
 		pg = pg->next;
 	}
 
-	memset(ftrace_profile_hash, 0,
+	memset(stat->hash, 0,
 	       FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
 }
 
-int ftrace_profile_pages_init(void)
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
 	int i;
 
 	/* If we already allocated, do nothing */
-	if (profile_pages)
+	if (stat->pages)
 		return 0;
 
-	profile_pages = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!profile_pages)
+	stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!stat->pages)
 		return -ENOMEM;
 
-	pg = profile_pages_start = profile_pages;
+	pg = stat->start = stat->pages;
 
 	/* allocate 10 more pages to start */
 	for (i = 0; i < 10; i++) {
@@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void)
 	return 0;
 }
 
-static int ftrace_profile_init(void)
+static int ftrace_profile_init_cpu(int cpu)
 {
+	struct ftrace_profile_stat *stat;
 	int size;
 
-	if (ftrace_profile_hash) {
+	stat = &per_cpu(ftrace_profile_stats, cpu);
+
+	if (stat->hash) {
 		/* If the profile is already created, simply reset it */
-		ftrace_profile_reset();
+		ftrace_profile_reset(stat);
 		return 0;
 	}
 
@@ -446,29 +446,45 @@ static int ftrace_profile_init(void)
 	 */
 	size = FTRACE_PROFILE_HASH_SIZE;
 
-	ftrace_profile_hash =
-		kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+	stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
 
-	if (!ftrace_profile_hash)
+	if (!stat->hash)
 		return -ENOMEM;
 
-	size--;
+	if (!ftrace_profile_bits) {
+		size--;
 
-	for (; size; size >>= 1)
-		ftrace_profile_bits++;
+		for (; size; size >>= 1)
+			ftrace_profile_bits++;
+	}
 
 	/* Preallocate a few pages */
-	if (ftrace_profile_pages_init() < 0) {
-		kfree(ftrace_profile_hash);
-		ftrace_profile_hash = NULL;
+	if (ftrace_profile_pages_init(stat) < 0) {
+		kfree(stat->hash);
+		stat->hash = NULL;
 		return -ENOMEM;
 	}
 
 	return 0;
 }
 
+static int ftrace_profile_init(void)
+{
+	int cpu;
+	int ret = 0;
+
+	for_each_online_cpu(cpu) {
+		ret = ftrace_profile_init_cpu(cpu);
+		if (ret)
+			break;
+	}
+
+	return ret;
+}
+
 /* interrupts must be disabled */
-static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
 {
 	struct ftrace_profile *rec;
 	struct hlist_head *hhd;
@@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 	unsigned long key;
 
 	key = hash_long(ip, ftrace_profile_bits);
-	hhd = &ftrace_profile_hash[key];
+	hhd = &stat->hash[key];
 
 	if (hlist_empty(hhd))
 		return NULL;
@@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip)
 	return NULL;
 }
 
-static void ftrace_add_profile(struct ftrace_profile *rec)
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+			       struct ftrace_profile *rec)
 {
 	unsigned long key;
 
 	key = hash_long(rec->ip, ftrace_profile_bits);
-	hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]);
+	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
 /* Interrupts must be disabled calling this */
 static struct ftrace_profile *
-ftrace_profile_alloc(unsigned long ip, bool alloc_safe)
+ftrace_profile_alloc(struct ftrace_profile_stat *stat,
+		     unsigned long ip, bool alloc_safe)
 {
 	struct ftrace_profile *rec = NULL;
 
 	/* prevent recursion */
-	if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1)
+	if (atomic_inc_return(&stat->disabled) != 1)
 		goto out;
 
-	__raw_spin_lock(&ftrace_profile_rec_lock);
-
 	/* Try to always keep another page available */
-	if (!profile_pages->next && alloc_safe)
-		profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
+	if (!stat->pages->next && alloc_safe)
+		stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
 
 	/*
 	 * Try to find the function again since another
 	 * task on another CPU could have added it
 	 */
-	rec = ftrace_find_profiled_func(ip);
+	rec = ftrace_find_profiled_func(stat, ip);
 	if (rec)
-		goto out_unlock;
+		goto out;
 
-	if (profile_pages->index == PROFILES_PER_PAGE) {
-		if (!profile_pages->next)
-			goto out_unlock;
-		profile_pages = profile_pages->next;
+	if (stat->pages->index == PROFILES_PER_PAGE) {
+		if (!stat->pages->next)
+			goto out;
+		stat->pages = stat->pages->next;
 	}
 
-	rec = &profile_pages->records[profile_pages->index++];
+	rec = &stat->pages->records[stat->pages->index++];
 	rec->ip = ip;
-	ftrace_add_profile(rec);
+	ftrace_add_profile(stat, rec);
 
- out_unlock:
-	__raw_spin_unlock(&ftrace_profile_rec_lock);
  out:
-	atomic_dec(&__get_cpu_var(ftrace_profile_disable));
+	atomic_dec(&stat->disabled);
 
 	return rec;
 }
@@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void)
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
+	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 	bool alloc_safe;
@@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	alloc_safe = ftrace_safe_to_allocate();
 
 	local_irq_save(flags);
-	rec = ftrace_find_profiled_func(ip);
+
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, ip);
 	if (!rec) {
-		rec = ftrace_profile_alloc(ip, alloc_safe);
+		rec = ftrace_profile_alloc(stat, ip, alloc_safe);
 		if (!rec)
 			goto out;
 	}
@@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
-	unsigned long flags;
+	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
+	unsigned long flags;
 
 	local_irq_save(flags);
-	rec = ftrace_find_profiled_func(trace->func);
+	stat = &__get_cpu_var(ftrace_profile_stats);
+	if (!stat->hash)
+		goto out;
+
+	rec = ftrace_find_profiled_func(stat, trace->func);
 	if (rec)
 		rec->time += trace->rettime - trace->calltime;
+ out:
 	local_irq_restore(flags);
 }
 
@@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = {
 	.write		= ftrace_profile_write,
 };
 
+/* used to initialize the real stat files */
+static struct tracer_stat function_stats __initdata = {
+	.name = "functions",
+	.stat_start = function_stat_start,
+	.stat_next = function_stat_next,
+	.stat_cmp = function_stat_cmp,
+	.stat_headers = function_stat_headers,
+	.stat_show = function_stat_show
+};
+
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
 {
+	struct ftrace_profile_stat *stat;
 	struct dentry *entry;
+	char *name;
 	int ret;
+	int cpu;
 
-	ret = register_stat_tracer(&function_stats);
-	if (ret) {
-		pr_warning("Warning: could not register "
-			   "function stats\n");
-		return;
+	for_each_possible_cpu(cpu) {
+		stat = &per_cpu(ftrace_profile_stats, cpu);
+
+		/* allocate enough for function name + cpu number */
+		name = kmalloc(32, GFP_KERNEL);
+		if (!name) {
+			/*
+			 * The files created are permanent, if something happens
+			 * we still do not free memory.
+			 */
+			kfree(stat);
+			WARN(1,
+			     "Could not allocate stat file for cpu %d\n",
+			     cpu);
+			return;
+		}
+		stat->stat = function_stats;
+		snprintf(name, 32, "function%d", cpu);
+		stat->stat.name = name;
+		ret = register_stat_tracer(&stat->stat);
+		if (ret) {
+			WARN(1,
+			     "Could not register function stat for cpu %d\n",
+			     cpu);
+			kfree(name);
+			return;
+		}
 	}
 
 	entry = debugfs_create_file("function_profile_enabled", 0644,
-- 
GitLab


From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 23:17:58 -0400
Subject: [PATCH 0113/2198] function-graph: add option to calculate graph time
 or not

graph time is the time that a function is executing another function.
Thus if function A calls B, if graph-time is set, then the time for
A includes B. This is the default behavior. But if graph-time is off,
then the time spent executing B is subtracted from A.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h               |  3 +--
 kernel/trace/ftrace.c                | 21 ++++++++++++++++++++-
 kernel/trace/trace.c                 |  4 +++-
 kernel/trace/trace.h                 |  1 +
 kernel/trace/trace_functions_graph.c |  8 ++++----
 5 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 015a3d22cf743..9e0a8d245e55d 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -365,6 +365,7 @@ struct ftrace_ret_stack {
 	unsigned long ret;
 	unsigned long func;
 	unsigned long long calltime;
+	unsigned long long subtime;
 };
 
 /*
@@ -376,8 +377,6 @@ extern void return_to_handler(void);
 
 extern int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth);
-extern void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
 
 /*
  * Sometimes we don't want to trace a function with the function
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ed1fc5021d441..71e5faef12ab9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
 static void profile_graph_return(struct ftrace_graph_ret *trace)
 {
 	struct ftrace_profile_stat *stat;
+	unsigned long long calltime;
 	struct ftrace_profile *rec;
 	unsigned long flags;
 
@@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 	if (!stat->hash)
 		goto out;
 
+	calltime = trace->rettime - trace->calltime;
+
+	if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+		int index;
+
+		index = trace->depth;
+
+		/* Append this call time to the parent time to subtract */
+		if (index)
+			current->ret_stack[index - 1].subtime += calltime;
+
+		if (current->ret_stack[index].subtime < calltime)
+			calltime -= current->ret_stack[index].subtime;
+		else
+			calltime = 0;
+	}
+
 	rec = ftrace_find_profiled_func(stat, trace->func);
 	if (rec)
-		rec->time += trace->rettime - trace->calltime;
+		rec->time += calltime;
+
  out:
 	local_irq_restore(flags);
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 821bf49771d4c..5d1a16cae3761 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+	TRACE_ITER_GRAPH_TIME;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +318,7 @@ static const char *trace_options[] = {
 	"latency-format",
 	"global-clock",
 	"sleep-time",
+	"graph-time",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c66ca3b660502..e3429a8ab0596 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -685,6 +685,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 	TRACE_ITER_SLEEP_TIME		= 0x100000,
+	TRACE_ITER_GRAPH_TIME		= 0x200000,
 };
 
 /*
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 85bba0f018b04..10f6ad7d85f65 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
 	current->ret_stack[index].calltime = calltime;
+	current->ret_stack[index].subtime = 0;
 	*depth = index;
 
 	return 0;
 }
 
 /* Retrieve a function return address to the trace stack on thread info.*/
-void
+static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 {
 	int index;
@@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 	trace->calltime = current->ret_stack[index].calltime;
 	trace->overrun = atomic_read(&current->trace_overrun);
 	trace->depth = index;
-	barrier();
-	current->curr_ret_stack--;
-
 }
 
 /*
@@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void)
 	ftrace_pop_return_trace(&trace, &ret);
 	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
+	barrier();
+	current->curr_ret_stack--;
 
 	if (unlikely(!ret)) {
 		ftrace_graph_stop();
-- 
GitLab


From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 13:26:41 -0400
Subject: [PATCH 0114/2198] tracing: clean up tracing profiler

Ingo Molnar suggested clean ups for the profiling code. This patch
makes those updates.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 71e5faef12ab9..a141d8499ab03 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
-	.func = ftrace_stub,
+	.func		= ftrace_stub,
 };
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -271,8 +271,10 @@ struct ftrace_profile_stat {
 #define PROFILES_PER_PAGE					\
 	(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
 
-static int ftrace_profile_bits;
-static int ftrace_profile_enabled;
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
 static DEFINE_MUTEX(ftrace_profile_lock);
 
 static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
@@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void)
 #else
 static struct ftrace_ops ftrace_profile_ops __read_mostly =
 {
-	.func = function_profile_call,
+	.func		= function_profile_call,
 };
 
 static int register_ftrace_profiler(void)
@@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
 	unsigned long val;
-	char buf[64];
+	char buf[64];		/* big enough to hold a number */
 	int ret;
 
 	if (cnt >= sizeof(buf))
@@ -719,7 +721,7 @@ static ssize_t
 ftrace_profile_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
-	char buf[64];
+	char buf[64];		/* big enough to hold a number */
 	int r;
 
 	r = sprintf(buf, "%u\n", ftrace_profile_enabled);
@@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = {
 
 /* used to initialize the real stat files */
 static struct tracer_stat function_stats __initdata = {
-	.name = "functions",
-	.stat_start = function_stat_start,
-	.stat_next = function_stat_next,
-	.stat_cmp = function_stat_cmp,
-	.stat_headers = function_stat_headers,
-	.stat_show = function_stat_show
+	.name		= "functions",
+	.stat_start	= function_stat_start,
+	.stat_next	= function_stat_next,
+	.stat_cmp	= function_stat_cmp,
+	.stat_headers	= function_stat_headers,
+	.stat_show	= function_stat_show
 };
 
 static void ftrace_profile_debugfs(struct dentry *d_tracer)
@@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 
 static struct ftrace_ops trace_probe_ops __read_mostly =
 {
-	.func = function_trace_probe_call,
+	.func		= function_trace_probe_call,
 };
 
 static int ftrace_probe_registered;
-- 
GitLab


From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 20:06:34 -0400
Subject: [PATCH 0115/2198] tracing: remove on the fly allocator from function
 profiler

Impact: safer code

The on the fly allocator for the function profiler was to save
memory. But at the expense of stability. Although it survived several
tests, allocating from the function tracer is just too risky, just
to save space.

This patch removes the allocator and simply allocates enough entries
at start up.

Each function gets a profiling structure of 40 bytes. With an average
of 20K functions, and this is for each CPU, we have 800K per online
CPU. This is not too bad, at least for non-embedded.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 76 ++++++++++++++++++++++++-------------------
 1 file changed, 43 insertions(+), 33 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a141d8499ab03..4d90c916b2bbd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
 int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 {
 	struct ftrace_profile_page *pg;
+	int functions;
+	int pages;
 	int i;
 
 	/* If we already allocated, do nothing */
@@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
 	if (!stat->pages)
 		return -ENOMEM;
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+	functions = ftrace_update_tot_cnt;
+#else
+	/*
+	 * We do not know the number of functions that exist because
+	 * dynamic tracing is what counts them. With past experience
+	 * we have around 20K functions. That should be more than enough.
+	 * It is highly unlikely we will execute every function in
+	 * the kernel.
+	 */
+	functions = 20000;
+#endif
+
 	pg = stat->start = stat->pages;
 
-	/* allocate 10 more pages to start */
-	for (i = 0; i < 10; i++) {
+	pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+	for (i = 0; i < pages; i++) {
 		pg->next = (void *)get_zeroed_page(GFP_KERNEL);
-		/*
-		 * We only care about allocating profile_pages, if
-		 * we failed to allocate here, hopefully we will allocate
-		 * later.
-		 */
 		if (!pg->next)
-			break;
+			goto out_free;
 		pg = pg->next;
 	}
 
 	return 0;
+
+ out_free:
+	pg = stat->start;
+	while (pg) {
+		unsigned long tmp = (unsigned long)pg;
+
+		pg = pg->next;
+		free_page(tmp);
+	}
+
+	free_page((unsigned long)stat->pages);
+	stat->pages = NULL;
+	stat->start = NULL;
+
+	return -ENOMEM;
 }
 
 static int ftrace_profile_init_cpu(int cpu)
@@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu)
 			ftrace_profile_bits++;
 	}
 
-	/* Preallocate a few pages */
+	/* Preallocate the function profiling pages */
 	if (ftrace_profile_pages_init(stat) < 0) {
 		kfree(stat->hash);
 		stat->hash = NULL;
@@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
 	hlist_add_head_rcu(&rec->node, &stat->hash[key]);
 }
 
-/* Interrupts must be disabled calling this */
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ */
 static struct ftrace_profile *
-ftrace_profile_alloc(struct ftrace_profile_stat *stat,
-		     unsigned long ip, bool alloc_safe)
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
 {
 	struct ftrace_profile *rec = NULL;
 
-	/* prevent recursion */
+	/* prevent recursion (from NMIs) */
 	if (atomic_inc_return(&stat->disabled) != 1)
 		goto out;
 
-	/* Try to always keep another page available */
-	if (!stat->pages->next && alloc_safe)
-		stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC);
-
 	/*
-	 * Try to find the function again since another
-	 * task on another CPU could have added it
+	 * Try to find the function again since an NMI
+	 * could have added it
 	 */
 	rec = ftrace_find_profiled_func(stat, ip);
 	if (rec)
@@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat,
 	return rec;
 }
 
-/*
- * If we are not in an interrupt, or softirq and
- * and interrupts are disabled and preemption is not enabled
- * (not in a spinlock) then it should be safe to allocate memory.
- */
-static bool ftrace_safe_to_allocate(void)
-{
-	return !in_interrupt() && irqs_disabled() && !preempt_count();
-}
-
 static void
 function_profile_call(unsigned long ip, unsigned long parent_ip)
 {
 	struct ftrace_profile_stat *stat;
 	struct ftrace_profile *rec;
 	unsigned long flags;
-	bool alloc_safe;
 
 	if (!ftrace_profile_enabled)
 		return;
 
-	alloc_safe = ftrace_safe_to_allocate();
-
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
@@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 
 	rec = ftrace_find_profiled_func(stat, ip);
 	if (!rec) {
-		rec = ftrace_profile_alloc(stat, ip, alloc_safe);
+		rec = ftrace_profile_alloc(stat, ip);
 		if (!rec)
 			goto out;
 	}
-- 
GitLab


From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 21:00:47 -0400
Subject: [PATCH 0116/2198] tracing: add average time in function to function
 profiler

Show the average time in the function (Time / Hit)

  Function                               Hit    Time            Avg
  --------                               ---    ----            ---
  mwait_idle                              51    140326.6 us     2751.503 us
  smp_apic_timer_interrupt                47    3517.735 us     74.845 us
  schedule                                10    2738.754 us     273.875 us
  __schedule                              10    2732.857 us     273.285 us
  hrtimer_interrupt                       47    1896.104 us     40.342 us
  irq_exit                                56    1711.833 us     30.568 us
  __run_hrtimer                           47    1315.589 us     27.991 us
  tick_sched_timer                        47    1138.690 us     24.227 us
  do_softirq                              56    1116.829 us     19.943 us
  __do_softirq                            56    1066.932 us     19.052 us
  do_IRQ                                   9    926.153 us      102.905 us

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4d90c916b2bbd..c7f4a4be05dc0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2)
 static int function_stat_headers(struct seq_file *m)
 {
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	seq_printf(m, "  Function                               Hit    Time\n"
-		      "  --------                               ---    ----\n");
+	seq_printf(m, "  Function                               "
+		   "Hit    Time            Avg\n"
+		      "  --------                               "
+		   "---    ----            ---\n");
 #else
 	seq_printf(m, "  Function                               Hit\n"
 		      "  --------                               ---\n");
@@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v)
 	struct ftrace_profile *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	static struct trace_seq s;
 	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);
-	trace_seq_init(&s);
-	trace_print_graph_duration(rec->time, &s);
+	static struct trace_seq s;
+	unsigned long long avg;
 #endif
 
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
@@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v)
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 	seq_printf(m, "    ");
+	avg = rec->time;
+	do_div(avg, rec->counter);
+
+	mutex_lock(&mutex);
+	trace_seq_init(&s);
+	trace_print_graph_duration(rec->time, &s);
+	trace_seq_puts(&s, "    ");
+	trace_print_graph_duration(avg, &s);
 	trace_print_seq(m, &s);
 	mutex_unlock(&mutex);
 #endif
-- 
GitLab


From a8a93f3f03b7a8008d720e8d91798efe599d416c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 12 Feb 2009 13:45:34 -0800
Subject: [PATCH 0117/2198] mm: disable preemption in apply_to_pte_range

Impact: bugfix

Lazy mmu mode needs preemption disabled, so if we're apply to
init_mm (which doesn't require any pte locks), then explicitly
disable preemption.  (Do it unconditionally after checking we've
successfully done the allocation to simplify the error handling.)

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 mm/memory.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memory.c b/mm/memory.c
index baa999e87cd21..b80cc31292b1e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1718,6 +1718,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
+	preempt_disable();
 	arch_enter_lazy_mmu_mode();
 
 	token = pmd_pgtable(*pmd);
@@ -1729,6 +1730,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
+	preempt_enable();
 
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
-- 
GitLab


From b8bcfe997e46150fedcc3f5b26b846400122fdd9 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:05:19 -0800
Subject: [PATCH 0118/2198] x86/paravirt: remove lazy mode in interrupts

Impact: simplification, robustness

Make paravirt_lazy_mode() always return PARAVIRT_LAZY_NONE
when in an interrupt.  This prevents interrupt code from
accidentally inheriting an outer lazy state, and instead
does everything synchronously.  Outer batched operations
are left deferred.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/paravirt.c |  3 +++
 arch/x86/mm/fault.c        |  6 ++----
 arch/x86/mm/highmem_32.c   |  2 --
 arch/x86/mm/iomap_32.c     |  1 -
 arch/x86/mm/pageattr.c     | 14 --------------
 5 files changed, 5 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 63dd358d8ee13..8ab250ac498b6 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -282,6 +282,9 @@ void paravirt_leave_lazy_cpu(void)
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 {
+	if (in_interrupt())
+		return PARAVIRT_LAZY_NONE;
+
 	return __get_cpu_var(paravirt_lazy_mode);
 }
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa01..cfbb4a7380116 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -225,12 +225,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
 	if (!pmd_present(*pmd_k))
 		return NULL;
 
-	if (!pmd_present(*pmd)) {
+	if (!pmd_present(*pmd))
 		set_pmd(pmd, *pmd_k);
-		arch_flush_lazy_mmu_mode();
-	} else {
+	else
 		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-	}
 
 	return pmd_k;
 }
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 00f127c80b0e3..e81dfa4081578 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -87,7 +87,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
 	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
 	BUG_ON(!pte_none(*(kmap_pte-idx)));
 	set_pte(kmap_pte-idx, mk_pte(page, prot));
-	arch_flush_lazy_mmu_mode();
 
 	return (void *)vaddr;
 }
@@ -117,7 +116,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 #endif
 	}
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 04102d42ff425..b6a61f3d7ef82 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -74,7 +74,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
 	if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
 		kpte_clear_flush(kmap_pte-idx, vaddr);
 
-	arch_flush_lazy_mmu_mode();
 	pagefault_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 9c4294986af77..9015e5e412b51 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -824,13 +824,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 
 	vm_unmap_aliases();
 
-	/*
-	 * If we're called with lazy mmu updates enabled, the
-	 * in-memory pte state may be stale.  Flush pending updates to
-	 * bring them up to date.
-	 */
-	arch_flush_lazy_mmu_mode();
-
 	cpa.vaddr = addr;
 	cpa.numpages = numpages;
 	cpa.mask_set = mask_set;
@@ -873,13 +866,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 	} else
 		cpa_flush_all(cache);
 
-	/*
-	 * If we've been called with lazy mmu updates enabled, then
-	 * make sure that everything gets flushed out before we
-	 * return.
-	 */
-	arch_flush_lazy_mmu_mode();
-
 out:
 	return ret;
 }
-- 
GitLab


From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:24:03 -0800
Subject: [PATCH 0119/2198] x86/pvops: replace arch_enter_lazy_cpu_mode with
 arch_start_context_switch

Impact: simplification, prepare for later changes

Make lazy cpu mode more specific to context switching, so that
it makes sense to do more context-switch specific things in
the callbacks.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/include/asm/paravirt.h |  8 +++-----
 arch/x86/kernel/paravirt.c      | 13 -------------
 arch/x86/kernel/process_32.c    |  2 +-
 arch/x86/kernel/process_64.c    |  2 +-
 arch/x86/xen/mmu.c              |  5 +----
 include/asm-frv/pgtable.h       |  4 ++--
 include/asm-generic/pgtable.h   | 21 +++++++++++----------
 kernel/sched.c                  |  2 +-
 8 files changed, 20 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 0617d5cc9712d..7b28abac323ff 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1420,19 +1420,17 @@ void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
 void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
 
-#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define  __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(void)
 {
 	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
 }
 
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(void)
 {
 	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
 }
 
-void arch_flush_lazy_cpu_mode(void);
-
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 static inline void arch_enter_lazy_mmu_mode(void)
 {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 8ab250ac498b6..5eea9548216be 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -301,19 +301,6 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_enable();
 }
 
-void arch_flush_lazy_cpu_mode(void)
-{
-	preempt_disable();
-
-	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
-		WARN_ON(preempt_count() == 1);
-		arch_leave_lazy_cpu_mode();
-		arch_enter_lazy_cpu_mode();
-	}
-
-	preempt_enable();
-}
-
 struct pv_info pv_info = {
 	.name = "bare hardware",
 	.paravirt_enabled = 0,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 14014d766cadb..57e49a8278a98 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_leave_lazy_cpu_mode();
+	arch_end_context_switch();
 
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index abb7e6a7f0c62..7115e6085326c 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_leave_lazy_cpu_mode();
+	arch_end_context_switch();
 
 	/*
 	 * Switch FS and GS.
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index cb6afa4ec95c5..6b98f87232ac0 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1119,10 +1119,8 @@ static void drop_other_mm_ref(void *info)
 
 	/* If this cpu still has a stale cr3 reference, then make sure
 	   it has been flushed. */
-	if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+	if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
 		load_cr3(swapper_pg_dir);
-		arch_flush_lazy_cpu_mode();
-	}
 }
 
 static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1135,7 +1133,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
 			load_cr3(swapper_pg_dir);
 		else
 			leave_mm(smp_processor_id());
-		arch_flush_lazy_cpu_mode();
 	}
 
 	/* Get the "official" set of cpus referring to our pagetable. */
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index e16fdb1f4f4fe..235e34a7a340b 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -73,8 +73,8 @@ static inline int pte_file(pte_t pte) { return 0; }
 #define pgtable_cache_init()		do {} while (0)
 #define arch_enter_lazy_mmu_mode()	do {} while (0)
 #define arch_leave_lazy_mmu_mode()	do {} while (0)
-#define arch_enter_lazy_cpu_mode()	do {} while (0)
-#define arch_leave_lazy_cpu_mode()	do {} while (0)
+
+#define arch_start_context_switch()	do {} while (0)
 
 #else /* !CONFIG_MMU */
 /*****************************************************************************/
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 8e6d0ca70aba9..922f03671dd8c 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #endif
 
 /*
- * A facility to provide batching of the reload of page tables with the
- * actual context switch code for paravirtualized guests.  By convention,
- * only one of the lazy modes (CPU, MMU) should be active at any given
- * time, entry should never be nested, and entry and exits should always
- * be paired.  This is for sanity of maintaining and reasoning about the
- * kernel code.
+ * A facility to provide batching of the reload of page tables and
+ * other process state with the actual context switch code for
+ * paravirtualized guests.  By convention, only one of the batched
+ * update (lazy) modes (CPU, MMU) should be active at any given time,
+ * entry should never be nested, and entry and exits should always be
+ * paired.  This is for sanity of maintaining and reasoning about the
+ * kernel code.  In this case, the exit (end of the context switch) is
+ * in architecture-specific code, and so doesn't need a generic
+ * definition.
  */
-#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-#define arch_enter_lazy_cpu_mode()	do {} while (0)
-#define arch_leave_lazy_cpu_mode()	do {} while (0)
-#define arch_flush_lazy_cpu_mode()	do {} while (0)
+#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
+#define arch_start_context_switch()	do {} while (0)
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
diff --git a/kernel/sched.c b/kernel/sched.c
index 5757e03cfac0b..7530fdd7c9829 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_enter_lazy_cpu_mode();
+	arch_start_context_switch();
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
-- 
GitLab


From b407fc57b815b2016186220baabc76cc8264206e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:46:21 -0800
Subject: [PATCH 0120/2198] x86/paravirt: flush pending mmu updates on context
 switch

Impact: allow preemption during lazy mmu updates

If we're in lazy mmu mode when context switching, leave
lazy mmu mode, but remember the task's state in
TIF_LAZY_MMU_UPDATES.  When we resume the task, check this
flag and re-enter lazy mmu mode if its set.

This sets things up for allowing lazy mmu mode while preemptible,
though that won't actually be active until the next change.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/include/asm/paravirt.h    |  1 -
 arch/x86/include/asm/thread_info.h |  2 ++
 arch/x86/kernel/kvm.c              |  2 +-
 arch/x86/kernel/paravirt.c         | 13 ++++++++++---
 arch/x86/kernel/vmi_32.c           | 14 ++++++++++----
 arch/x86/lguest/boot.c             | 14 ++++++++++----
 arch/x86/xen/enlighten.c           |  6 +++---
 arch/x86/xen/mmu.c                 |  7 ++++++-
 arch/x86/xen/xen-ops.h             |  1 -
 9 files changed, 42 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 7b28abac323ff..58d2481b01a6c 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -1418,7 +1418,6 @@ void paravirt_enter_lazy_cpu(void);
 void paravirt_leave_lazy_cpu(void);
 void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
 
 #define  __HAVE_ARCH_START_CONTEXT_SWITCH
 static inline void arch_start_context_switch(void)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index df9d5f78385e4..2f34d643b567c 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -94,6 +94,7 @@ struct thread_info {
 #define TIF_FORCED_TF		24	/* true if TF in eflags artificially */
 #define TIF_DEBUGCTLMSR		25	/* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR		26      /* uses thread_struct.ds_area_msr */
+#define TIF_LAZY_MMU_UPDATES	27	/* task is updating the mmu lazily */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
@@ -115,6 +116,7 @@ struct thread_info {
 #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF)
 #define _TIF_DEBUGCTLMSR	(1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR	(1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES	(1 << TIF_LAZY_MMU_UPDATES)
 
 /* work to do in syscall_trace_enter() */
 #define _TIF_WORK_SYSCALL_ENTRY	\
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 478bca986eca0..5d7f6e76b5dc6 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -201,7 +201,7 @@ static void kvm_leave_lazy_mmu(void)
 	struct kvm_para_state *state = kvm_para_state();
 
 	mmu_queue_flush(state);
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
+	paravirt_leave_lazy_mmu();
 	state->mode = paravirt_get_lazy_mode();
 }
 
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5eea9548216be..430a0e30577bc 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -252,7 +252,7 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode)
 	__get_cpu_var(paravirt_lazy_mode) = mode;
 }
 
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
 {
 	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
 	BUG_ON(preemptible());
@@ -267,17 +267,24 @@ void paravirt_enter_lazy_mmu(void)
 
 void paravirt_leave_lazy_mmu(void)
 {
-	paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+	leave_lazy(PARAVIRT_LAZY_MMU);
 }
 
 void paravirt_enter_lazy_cpu(void)
 {
+	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+		arch_leave_lazy_mmu_mode();
+		set_thread_flag(TIF_LAZY_MMU_UPDATES);
+	}
 	enter_lazy(PARAVIRT_LAZY_CPU);
 }
 
 void paravirt_leave_lazy_cpu(void)
 {
-	paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+	leave_lazy(PARAVIRT_LAZY_CPU);
+
+	if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES))
+		arch_enter_lazy_mmu_mode();
 }
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 2cc4a90e2cb3a..950929c607d35 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -473,16 +473,22 @@ static void vmi_enter_lazy_cpu(void)
 	vmi_ops.set_lazy_mode(2);
 }
 
+static void vmi_leave_lazy_cpu(void)
+{
+	vmi_ops.set_lazy_mode(0);
+	paravirt_leave_lazy_cpu();
+}
+
 static void vmi_enter_lazy_mmu(void)
 {
 	paravirt_enter_lazy_mmu();
 	vmi_ops.set_lazy_mode(1);
 }
 
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	vmi_ops.set_lazy_mode(0);
+	paravirt_leave_lazy_mmu();
 }
 
 static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -718,12 +724,12 @@ static inline int __init activate_vmi(void)
 
 	para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
 		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu,
 		  set_lazy_mode, SetLazyMode);
 
 	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
 		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+	para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
 		  set_lazy_mode, SetLazyMode);
 
 	/* user and kernel flush are just handled with different flags to FlushTLB */
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9fe4ddaa8f6ff..41a5562e710ed 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -147,10 +147,16 @@ static void lazy_hcall(unsigned long call,
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	paravirt_leave_lazy_mmu();
+}
+
+static void lguest_leave_lazy_cpu_mode(void)
+{
+	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	paravirt_leave_lazy_cpu();
 }
 
 /*G:033
@@ -1026,7 +1032,7 @@ __init void lguest_init(void)
 	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
 	pv_cpu_ops.wbinvd = lguest_wbinvd;
 	pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode;
 
 	/* pagetable management */
 	pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1039,7 +1045,7 @@ __init void lguest_init(void)
 	pv_mmu_ops.read_cr2 = lguest_read_cr2;
 	pv_mmu_ops.read_cr3 = lguest_read_cr3;
 	pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+	pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* apic read/write intercepts */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 82cd39a6cbd3a..f586e63b9a632 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-void xen_leave_lazy(void)
+static void xen_leave_lazy_cpu(void)
 {
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
 	xen_mc_flush();
+	paravirt_leave_lazy_cpu();
 }
 
 static unsigned long xen_store_tr(void)
@@ -819,7 +819,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.lazy_mode = {
 		.enter = paravirt_enter_lazy_cpu,
-		.leave = xen_leave_lazy,
+		.leave = xen_leave_lazy_cpu,
 	},
 };
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 6b98f87232ac0..f5f8faa4f76ca 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1816,6 +1816,11 @@ __init void xen_post_allocator_init(void)
 	xen_mark_init_mm_pinned();
 }
 
+static void xen_leave_lazy_mmu(void)
+{
+	xen_mc_flush();
+	paravirt_leave_lazy_mmu();
+}
 
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.pagetable_setup_start = xen_pagetable_setup_start,
@@ -1891,7 +1896,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.lazy_mode = {
 		.enter = paravirt_enter_lazy_mmu,
-		.leave = xen_leave_lazy,
+		.leave = xen_leave_lazy_mmu,
 	},
 
 	.set_fixmap = xen_set_fixmap,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 2f5ef2632ea23..f897cdffccb6f 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
 
-void xen_leave_lazy(void);
 void xen_post_allocator_init(void);
 
 char * __init xen_memory_setup(void);
-- 
GitLab


From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Feb 2009 11:18:57 -0800
Subject: [PATCH 0121/2198] x86/paravirt: finish change from lazy cpu to
 context switch start/end

Impact: fix lazy context switch API

Pass the previous and next tasks into the context switch start
end calls, so that the called functions can properly access the
task state (esp in end_context_switch, in which the next task
is not yet completely current).

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/include/asm/paravirt.h | 17 ++++++++++-------
 arch/x86/include/asm/pgtable.h  |  2 ++
 arch/x86/kernel/paravirt.c      | 14 ++++++--------
 arch/x86/kernel/process_32.c    |  2 +-
 arch/x86/kernel/process_64.c    |  2 +-
 arch/x86/kernel/vmi_32.c        | 12 ++++++------
 arch/x86/lguest/boot.c          |  8 ++++----
 arch/x86/xen/enlighten.c        | 10 ++++------
 include/asm-frv/pgtable.h       |  2 +-
 include/asm-generic/pgtable.h   |  2 +-
 kernel/sched.c                  |  2 +-
 11 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 58d2481b01a6c..dfdee0ca57d3a 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -56,6 +56,7 @@ struct desc_ptr;
 struct tss_struct;
 struct mm_struct;
 struct desc_struct;
+struct task_struct;
 
 /*
  * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
 
 	void (*swapgs)(void);
 
-	struct pv_lazy_ops lazy_mode;
+	void (*start_context_switch)(struct task_struct *prev);
+	void (*end_context_switch)(struct task_struct *next);
 };
 
 struct pv_irq_ops {
@@ -1414,20 +1416,21 @@ enum paravirt_lazy_mode {
 };
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
 void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
 
 #define  __HAVE_ARCH_START_CONTEXT_SWITCH
-static inline void arch_start_context_switch(void)
+static inline void arch_start_context_switch(struct task_struct *prev)
 {
-	PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+	PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
 }
 
-static inline void arch_end_context_switch(void)
+static inline void arch_end_context_switch(struct task_struct *next)
 {
-	PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+	PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
 }
 
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index d0812e155f1d6..24e42836e9214 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -83,6 +83,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
 #define pte_val(x)	native_pte_val(x)
 #define __pte(x)	native_make_pte(x)
 
+#define arch_end_context_switch(prev)	do {} while(0)
+
 #endif	/* CONFIG_PARAVIRT */
 
 /*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 430a0e30577bc..cf1437503bab7 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -270,20 +270,20 @@ void paravirt_leave_lazy_mmu(void)
 	leave_lazy(PARAVIRT_LAZY_MMU);
 }
 
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
 {
 	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
 		arch_leave_lazy_mmu_mode();
-		set_thread_flag(TIF_LAZY_MMU_UPDATES);
+		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
 	}
 	enter_lazy(PARAVIRT_LAZY_CPU);
 }
 
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
 {
 	leave_lazy(PARAVIRT_LAZY_CPU);
 
-	if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES))
+	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
 		arch_enter_lazy_mmu_mode();
 }
 
@@ -399,10 +399,8 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.set_iopl_mask = native_set_iopl_mask,
 	.io_delay = native_io_delay,
 
-	.lazy_mode = {
-		.enter = paravirt_nop,
-		.leave = paravirt_nop,
-	},
+	.start_context_switch = paravirt_nop,
+	.end_context_switch = paravirt_nop,
 };
 
 struct pv_apic_ops pv_apic_ops = {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 57e49a8278a98..d766c7616fd7d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_end_context_switch();
+	arch_end_context_switch(next_p);
 
 	/* If the task has used fpu the last 5 timeslices, just do a full
 	 * restore of the math state immediately to avoid the trap; the
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7115e6085326c..e8a9aaf9df889 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * done before math_state_restore, so the TS bit is up
 	 * to date.
 	 */
-	arch_end_context_switch();
+	arch_end_context_switch(next_p);
 
 	/*
 	 * Switch FS and GS.
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 950929c607d35..55a5d6938e5e7 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -467,16 +467,16 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
 {
-	paravirt_enter_lazy_cpu();
+	paravirt_start_context_switch(prev);
 	vmi_ops.set_lazy_mode(2);
 }
 
-static void vmi_leave_lazy_cpu(void)
+static void vmi_end_context_switch(struct task_struct *next)
 {
 	vmi_ops.set_lazy_mode(0);
-	paravirt_leave_lazy_cpu();
+	paravirt_end_context_switch(next);
 }
 
 static void vmi_enter_lazy_mmu(void)
@@ -722,9 +722,9 @@ static inline int __init activate_vmi(void)
 	para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
 	para_fill(pv_cpu_ops.io_delay, IODelay);
 
-	para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+	para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
 		  set_lazy_mode, SetLazyMode);
-	para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu,
+	para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
 		  set_lazy_mode, SetLazyMode);
 
 	para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 41a5562e710ed..5287081b35670 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -153,10 +153,10 @@ static void lguest_leave_lazy_mmu_mode(void)
 	paravirt_leave_lazy_mmu();
 }
 
-static void lguest_leave_lazy_cpu_mode(void)
+static void lguest_end_context_switch(struct task_struct *next)
 {
 	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
-	paravirt_leave_lazy_cpu();
+	paravirt_end_context_switch(next);
 }
 
 /*G:033
@@ -1031,8 +1031,8 @@ __init void lguest_init(void)
 	pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
 	pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
 	pv_cpu_ops.wbinvd = lguest_wbinvd;
-	pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-	pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode;
+	pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+	pv_cpu_ops.end_context_switch = lguest_end_context_switch;
 
 	/* pagetable management */
 	pv_mmu_ops.write_cr3 = lguest_write_cr3;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index f586e63b9a632..70b355d3a86c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-static void xen_leave_lazy_cpu(void)
+static void xen_end_context_switch(struct task_struct *next)
 {
 	xen_mc_flush();
-	paravirt_leave_lazy_cpu();
+	paravirt_end_context_switch(next);
 }
 
 static unsigned long xen_store_tr(void)
@@ -817,10 +817,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	/* Xen takes care of %gs when switching to usermode for us */
 	.swapgs = paravirt_nop,
 
-	.lazy_mode = {
-		.enter = paravirt_enter_lazy_cpu,
-		.leave = xen_leave_lazy_cpu,
-	},
+	.start_context_switch = paravirt_start_context_switch,
+	.end_context_switch = xen_end_context_switch,
 };
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h
index 235e34a7a340b..09887045d03f0 100644
--- a/include/asm-frv/pgtable.h
+++ b/include/asm-frv/pgtable.h
@@ -74,7 +74,7 @@ static inline int pte_file(pte_t pte) { return 0; }
 #define arch_enter_lazy_mmu_mode()	do {} while (0)
 #define arch_leave_lazy_mmu_mode()	do {} while (0)
 
-#define arch_start_context_switch()	do {} while (0)
+#define arch_start_context_switch(prev)	do {} while (0)
 
 #else /* !CONFIG_MMU */
 /*****************************************************************************/
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 922f03671dd8c..e410f602cab1b 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -291,7 +291,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
  * definition.
  */
 #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
-#define arch_start_context_switch()	do {} while (0)
+#define arch_start_context_switch(prev)	do {} while (0)
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
diff --git a/kernel/sched.c b/kernel/sched.c
index 7530fdd7c9829..133762aece50e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	 * combine the page table reload and the switch backend into
 	 * one hypercall.
 	 */
-	arch_start_context_switch();
+	arch_start_context_switch(prev);
 
 	if (unlikely(!mm)) {
 		next->active_mm = oldmm;
-- 
GitLab


From 2829b449276aed45f3d649efb21e3418e39dd5d1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Feb 2009 23:53:19 -0800
Subject: [PATCH 0122/2198] x86/paravirt: allow preemption with lazy mmu mode

Impact: remove obsolete checks, simplification

Lift restrictions on preemption with lazy mmu mode, as it is now allowed.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/kernel/paravirt.c | 7 ++++---
 arch/x86/xen/mmu.c         | 8 +-------
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index cf1437503bab7..bf2e86eee80ce 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -247,7 +247,6 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
 	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-	BUG_ON(preemptible());
 
 	__get_cpu_var(paravirt_lazy_mode) = mode;
 }
@@ -255,7 +254,6 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode)
 static void leave_lazy(enum paravirt_lazy_mode mode)
 {
 	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
-	BUG_ON(preemptible());
 
 	__get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
 }
@@ -272,6 +270,8 @@ void paravirt_leave_lazy_mmu(void)
 
 void paravirt_start_context_switch(struct task_struct *prev)
 {
+	BUG_ON(preemptible());
+
 	if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
 		arch_leave_lazy_mmu_mode();
 		set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
@@ -281,6 +281,8 @@ void paravirt_start_context_switch(struct task_struct *prev)
 
 void paravirt_end_context_switch(struct task_struct *next)
 {
+	BUG_ON(preemptible());
+
 	leave_lazy(PARAVIRT_LAZY_CPU);
 
 	if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
@@ -300,7 +302,6 @@ void arch_flush_lazy_mmu_mode(void)
 	preempt_disable();
 
 	if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-		WARN_ON(preempt_count() == 1);
 		arch_leave_lazy_mmu_mode();
 		arch_enter_lazy_mmu_mode();
 	}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index f5f8faa4f76ca..3f2d0fe5e6a8d 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -419,10 +419,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 		    pte_t *ptep, pte_t pteval)
 {
-	/* updates to init_mm may be done without lock */
-	if (mm == &init_mm)
-		preempt_disable();
-
 	ADD_STATS(set_pte_at, 1);
 //	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
 	ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -443,9 +439,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	}
 	xen_set_pte(ptep, pteval);
 
-out:
-	if (mm == &init_mm)
-		preempt_enable();
+out:	return;
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
-- 
GitLab


From 252a6bf2a3a7e7add56b17d48aecf3f3ef213103 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Feb 2009 00:11:28 -0800
Subject: [PATCH 0123/2198] mm: allow preemption in apply_to_pte_range

Impact: allow preemption in apply_to_pte_range updates to init_mm

Preemption is now allowed for lazy mmu mode, so don't disable
it for the inner loop of apply_to_pte_range.  This only applies
when doing updates to init_mm; user pagetables are still modified
under the pte lock, so preemption is disabled anyway.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 mm/memory.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index b80cc31292b1e..baa999e87cd21 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1718,7 +1718,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 
 	BUG_ON(pmd_huge(*pmd));
 
-	preempt_disable();
 	arch_enter_lazy_mmu_mode();
 
 	token = pmd_pgtable(*pmd);
@@ -1730,7 +1729,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
 	arch_leave_lazy_mmu_mode();
-	preempt_enable();
 
 	if (mm != &init_mm)
 		pte_unmap_unlock(pte-1, ptl);
-- 
GitLab


From ab2f75f0b760d2b0c9a875b669a1b51dce02c85a Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 18 Feb 2009 00:18:50 -0800
Subject: [PATCH 0124/2198] x86/paravirt: use percpu_ rather than __get_cpu_var

Impact: minor optimisation

percpu_read/write is a slightly more direct way of getting
to percpu data.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/kernel/paravirt.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bf2e86eee80ce..254e8aa8bfdb1 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -246,16 +246,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
+	BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
 
-	__get_cpu_var(paravirt_lazy_mode) = mode;
+	percpu_write(paravirt_lazy_mode, mode);
 }
 
 static void leave_lazy(enum paravirt_lazy_mode mode)
 {
-	BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
+	BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
 
-	__get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+	percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
 }
 
 void paravirt_enter_lazy_mmu(void)
@@ -294,7 +294,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 	if (in_interrupt())
 		return PARAVIRT_LAZY_NONE;
 
-	return __get_cpu_var(paravirt_lazy_mode);
+	return percpu_read(paravirt_lazy_mode);
 }
 
 void arch_flush_lazy_mmu_mode(void)
-- 
GitLab


From 5caecb9432428241d0c641897f07ff4003f1b55f Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 20 Feb 2009 23:01:26 -0800
Subject: [PATCH 0125/2198] xen: disable preempt for leave_lazy_mmu

xen_mc_flush() requires preemption to be disabled for its own sanity,
so disable it while we're flushing.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3f2d0fe5e6a8d..0e572380413bb 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1812,8 +1812,10 @@ __init void xen_post_allocator_init(void)
 
 static void xen_leave_lazy_mmu(void)
 {
+	preempt_disable();
 	xen_mc_flush();
 	paravirt_leave_lazy_mmu();
+	preempt_enable();
 }
 
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
-- 
GitLab


From 59d7187142bbe9b404a403ed0f874d3227305f26 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 26 Feb 2009 15:48:33 -0800
Subject: [PATCH 0126/2198] xen: separate p2m allocation from setting

When doing very early p2m setting, we need to separate setting
from allocation, so split things up accordingly.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 61 +++++++++++++++++++++++++++++++++-------------
 arch/x86/xen/mmu.h |  3 +++
 2 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0e572380413bb..e0a55b7a6cebf 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -233,47 +233,74 @@ unsigned long get_phys_to_machine(unsigned long pfn)
 }
 EXPORT_SYMBOL_GPL(get_phys_to_machine);
 
-static void alloc_p2m(unsigned long **pp, unsigned long *mfnp)
+/* install a  new p2m_top page */
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
 {
-	unsigned long *p;
+	unsigned topidx = p2m_top_index(pfn);
+	unsigned long **pfnp, *mfnp;
 	unsigned i;
 
-	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
-	BUG_ON(p == NULL);
+	pfnp = &p2m_top[topidx];
+	mfnp = &p2m_top_mfn[topidx];
 
 	for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
 		p[i] = INVALID_P2M_ENTRY;
 
-	if (cmpxchg(pp, p2m_missing, p) != p2m_missing)
-		free_page((unsigned long)p);
-	else
+	if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
 		*mfnp = virt_to_mfn(p);
+		return true;
+	}
+
+	return false;
 }
 
-void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+static void alloc_p2m(unsigned long pfn)
 {
-	unsigned topidx, idx;
+	unsigned long *p;
 
-	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
-		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
-		return;
-	}
+	p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
+	BUG_ON(p == NULL);
+
+	if (!install_p2mtop_page(pfn, p))
+		free_page((unsigned long)p);
+}
+
+/* Try to install p2m mapping; fail if intermediate bits missing */
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	unsigned topidx, idx;
 
 	if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
 		BUG_ON(mfn != INVALID_P2M_ENTRY);
-		return;
+		return true;
 	}
 
 	topidx = p2m_top_index(pfn);
 	if (p2m_top[topidx] == p2m_missing) {
-		/* no need to allocate a page to store an invalid entry */
 		if (mfn == INVALID_P2M_ENTRY)
-			return;
-		alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]);
+			return true;
+		return false;
 	}
 
 	idx = p2m_index(pfn);
 	p2m_top[topidx][idx] = mfn;
+
+	return true;
+}
+
+void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
+{
+	if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
+		BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
+		return;
+	}
+
+	if (unlikely(!__set_phys_to_machine(pfn, mfn)))  {
+		alloc_p2m(pfn);
+
+		if (!__set_phys_to_machine(pfn, mfn))
+			BUG();
+	}
 }
 
 unsigned long arbitrary_virt_to_mfn(void *vaddr)
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 24d1b44a337d6..da73026248978 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -11,6 +11,9 @@ enum pt_level {
 };
 
 
+bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
+bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
+
 void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
 
 
-- 
GitLab


From a2bcd4731f77cb77ae4b5e4a3d7f5471cf346c33 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 29 Mar 2009 23:47:48 +0200
Subject: [PATCH 0127/2198] x86/mm: further cleanups of fault.c's include file
 section

Impact: cleanup

Eliminate more than 20 unnecessary #include lines in fault.c

Also fix include file dependency bug in asm/traps.h. (this was
masked before, by implicit inclusion)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <tip-56aea8468746e673a4bf50b6a13d97b2d1cbe1e8@git.kernel.org>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/traps.h |  1 +
 arch/x86/mm/fault.c          | 44 ++++++++----------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b869..37fb07a9cda06 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -2,6 +2,7 @@
 #define _ASM_X86_TRAPS_H
 
 #include <asm/debugreg.h>
+#include <asm/siginfo.h>			/* TRAP_TRACE, ... */
 
 #ifdef CONFIG_X86_32
 #define dotraplinkage
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa01..24a36a6426abd 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -3,40 +3,16 @@
  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
  */
-#include <linux/interrupt.h>
-#include <linux/mmiotrace.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/highmem.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/vt_kern.h>
-#include <linux/signal.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/errno.h>
-#include <linux/magic.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/tty.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm-generic/sections.h>
-
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/segment.h>
-#include <asm/system.h>
-#include <asm/proto.h>
-#include <asm/traps.h>
-#include <asm/desc.h>
+#include <linux/magic.h>		/* STACK_END_MAGIC		*/
+#include <linux/sched.h>		/* test_thread_flag(), ...	*/
+#include <linux/kdebug.h>		/* oops_begin/end, ...		*/
+#include <linux/module.h>		/* search_exception_table	*/
+#include <linux/bootmem.h>		/* max_low_pfn			*/
+#include <linux/kprobes.h>		/* __kprobes, ...		*/
+#include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
+
+#include <asm/traps.h>			/* dotraplinkage, ...		*/
+#include <asm/pgalloc.h>		/* pgd_*(), ...			*/
 
 /*
  * Page fault error code bits:
-- 
GitLab


From 7571a60446030d2576d881438447e86a0755a83b Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Feb 2009 15:34:59 -0800
Subject: [PATCH 0128/2198] xen: split construction of p2m mfn tables from
 registration

Build the p2m_mfn_list_list early with the rest of the p2m table, but
register it later when the real shared_info structure is in place.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index e0a55b7a6cebf..67d2ab45cd90c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -184,7 +184,7 @@ static inline unsigned p2m_index(unsigned long pfn)
 }
 
 /* Build the parallel p2m_top_mfn structures */
-void xen_setup_mfn_list_list(void)
+static void __init xen_build_mfn_list_list(void)
 {
 	unsigned pfn, idx;
 
@@ -198,7 +198,10 @@ void xen_setup_mfn_list_list(void)
 		unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
 		p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
 	}
+}
 
+void xen_setup_mfn_list_list(void)
+{
 	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
 
 	HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
@@ -218,6 +221,8 @@ void __init xen_build_dynamic_phys_to_machine(void)
 
 		p2m_top[topidx] = &mfn_list[pfn];
 	}
+
+	xen_build_mfn_list_list();
 }
 
 unsigned long get_phys_to_machine(unsigned long pfn)
-- 
GitLab


From 6ed6bf428aff64fe37cdc54b239d598fee6016f1 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 13:02:18 -0800
Subject: [PATCH 0129/2198] xen: clean up xen_load_gdt

Makes the logic a bit clearer.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 70b355d3a86c4..5776dc270297e 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -301,10 +301,21 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	frames = mcs.args;
 
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
-		frames[f] = arbitrary_virt_to_mfn((void *)va);
+		int level;
+		pte_t *ptep = lookup_address(va, &level);
+		unsigned long pfn, mfn;
+		void *virt;
+
+		BUG_ON(ptep == NULL);
+
+		pfn = pte_pfn(*ptep);
+		mfn = pfn_to_mfn(pfn);
+		virt = __va(PFN_PHYS(pfn));
+
+		frames[f] = mfn;
 
 		make_lowmem_page_readonly((void *)va);
-		make_lowmem_page_readonly(mfn_to_virt(frames[f]));
+		make_lowmem_page_readonly(virt);
 	}
 
 	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-- 
GitLab


From 3ce5fa7ebff74b6a4dc5fdcdc22e6979f5a4ff85 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 15:26:00 -0800
Subject: [PATCH 0130/2198] xen: make xen_load_gdt simpler

Remove use of multicall machinery which is unused (gdt loading
is never performance critical).  This removes the implicit use
of percpu variables, which simplifies understanding how
the percpu code's use of load_gdt interacts with this code.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5776dc270297e..48b399bd6e0de 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -284,12 +284,11 @@ static void xen_set_ldt(const void *addr, unsigned entries)
 
 static void xen_load_gdt(const struct desc_ptr *dtr)
 {
-	unsigned long *frames;
 	unsigned long va = dtr->address;
 	unsigned int size = dtr->size + 1;
 	unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+	unsigned long frames[pages];
 	int f;
-	struct multicall_space mcs;
 
 	/* A GDT can be up to 64k in size, which corresponds to 8192
 	   8-byte entries, or 16 4k pages.. */
@@ -297,9 +296,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 	BUG_ON(size > 65536);
 	BUG_ON(va & ~PAGE_MASK);
 
-	mcs = xen_mc_entry(sizeof(*frames) * pages);
-	frames = mcs.args;
-
 	for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) {
 		int level;
 		pte_t *ptep = lookup_address(va, &level);
@@ -314,13 +310,15 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 
 		frames[f] = mfn;
 
+		printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n",
+		       f, (void *)va, mfn, pfn, virt);
+
 		make_lowmem_page_readonly((void *)va);
 		make_lowmem_page_readonly(virt);
 	}
 
-	MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct));
-
-	xen_mc_issue(PARAVIRT_LAZY_CPU);
+	if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct)))
+		BUG();
 }
 
 static void load_TLS_descriptor(struct thread_struct *t,
-- 
GitLab


From b4b7e58590d0e94ed78bd6be1aa163caba7b6c74 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 16:34:27 -0800
Subject: [PATCH 0131/2198] xen: remove xen_load_gdt debug

Don't need the noise.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 48b399bd6e0de..75b7a0f903806 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -310,9 +310,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
 
 		frames[f] = mfn;
 
-		printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n",
-		       f, (void *)va, mfn, pfn, virt);
-
 		make_lowmem_page_readonly((void *)va);
 		make_lowmem_page_readonly(virt);
 	}
-- 
GitLab


From e9e2d1ffcfdb38bed11a3064aa74bea9ee38ed80 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Thu, 5 Mar 2009 20:13:57 +0100
Subject: [PATCH 0132/2198] NULL noise: arch/x86/xen/smp.c

Fix this sparse warnings:
  arch/x86/xen/smp.c:316:52: warning: Using plain integer as NULL pointer
  arch/x86/xen/smp.c:421:60: warning: Using plain integer as NULL pointer

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/smp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 8d470562ffc9f..304d832710cd5 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -317,7 +317,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
 	BUG_ON(rc);
 
 	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
-		HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 		barrier();
 	}
 
@@ -422,7 +422,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask)
 	/* Make sure other vcpus get a chance to run if they need to. */
 	for_each_cpu(cpu, mask) {
 		if (xen_vcpu_stolen(cpu)) {
-			HYPERVISOR_sched_op(SCHEDOP_yield, 0);
+			HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
 			break;
 		}
 	}
-- 
GitLab


From e826fe1ba1563a9272345da8e3279a930ac160a7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 7 Mar 2009 17:09:27 -0800
Subject: [PATCH 0133/2198] xen: mask XSAVE from cpuid

Xen leaves XSAVE set in cpuid, but doesn't allow cr4.OSXSAVE
to be set.  This confuses the kernel and it ends up crashing on
an xsetbv instruction.

At boot time, try to set cr4.OSXSAVE, and mask XSAVE out of
cpuid it we can't.  This will produce a spurious error from Xen,
but allows us to support XSAVE if/when Xen does.

This also factors out the cpuid mask decisions to boot time.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 50 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 75b7a0f903806..da33e0c5870d9 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -168,21 +168,23 @@ static void __init xen_banner(void)
 	       xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
 }
 
+static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
+static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
+
 static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		      unsigned int *cx, unsigned int *dx)
 {
+	unsigned maskecx = ~0;
 	unsigned maskedx = ~0;
 
 	/*
 	 * Mask out inconvenient features, to try and disable as many
 	 * unsupported kernel subsystems as possible.
 	 */
-	if (*ax == 1)
-		maskedx = ~((1 << X86_FEATURE_APIC) |  /* disable APIC */
-			    (1 << X86_FEATURE_ACPI) |  /* disable ACPI */
-			    (1 << X86_FEATURE_MCE)  |  /* disable MCE */
-			    (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-			    (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+	if (*ax == 1) {
+		maskecx = cpuid_leaf1_ecx_mask;
+		maskedx = cpuid_leaf1_edx_mask;
+	}
 
 	asm(XEN_EMULATE_PREFIX "cpuid"
 		: "=a" (*ax),
@@ -190,9 +192,43 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
 		  "=c" (*cx),
 		  "=d" (*dx)
 		: "0" (*ax), "2" (*cx));
+
+	*cx &= maskecx;
 	*dx &= maskedx;
 }
 
+static __init void xen_init_cpuid_mask(void)
+{
+	unsigned int ax, bx, cx, dx;
+
+	cpuid_leaf1_edx_mask =
+		~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
+		  (1 << X86_FEATURE_MCA)  |  /* disable MCA */
+		  (1 << X86_FEATURE_ACC));   /* thermal monitoring */
+
+	if (!xen_initial_domain())
+		cpuid_leaf1_edx_mask &=
+			~((1 << X86_FEATURE_APIC) |  /* disable local APIC */
+			  (1 << X86_FEATURE_ACPI));  /* disable ACPI */
+
+	ax = 1;
+	xen_cpuid(&ax, &bx, &cx, &dx);
+
+	/* cpuid claims we support xsave; try enabling it to see what happens */
+	if (cx & (1 << (X86_FEATURE_XSAVE % 32))) {
+		unsigned long cr4;
+
+		set_in_cr4(X86_CR4_OSXSAVE);
+		
+		cr4 = read_cr4();
+
+		if ((cr4 & X86_CR4_OSXSAVE) == 0)
+			cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
+
+		clear_in_cr4(X86_CR4_OSXSAVE);
+	}
+}
+
 static void xen_set_debugreg(int reg, unsigned long val)
 {
 	HYPERVISOR_set_debugreg(reg, val);
@@ -901,6 +937,8 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_init_irq_ops();
 
+	xen_init_cpuid_mask();
+
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * set up the basic apic ops.
-- 
GitLab


From 68509cdcde6583ee1a9542899d1270449c7d5903 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 8 Mar 2009 03:59:04 -0700
Subject: [PATCH 0134/2198] x86-64: remove PGE from must-have feature list

PGE may not be available when running paravirtualized, so test the cpuid
bit before using it.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/required-features.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index d5cd6c586881f..a4737dddfd587 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -50,7 +50,7 @@
 #ifdef CONFIG_X86_64
 #define NEED_PSE	0
 #define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
-#define NEED_PGE	(1<<(X86_FEATURE_PGE & 31))
+#define NEED_PGE	0
 #define NEED_FXSR	(1<<(X86_FEATURE_FXSR & 31))
 #define NEED_XMM	(1<<(X86_FEATURE_XMM & 31))
 #define NEED_XMM2	(1<<(X86_FEATURE_XMM2 & 31))
-- 
GitLab


From 1e7449730853e7c9ae9a2458b2ced7ba12559a0e Mon Sep 17 00:00:00 2001
From: Alex Nixon <alex.nixon@citrix.com>
Date: Mon, 9 Feb 2009 12:05:46 -0800
Subject: [PATCH 0135/2198] Xen: Add virt_to_pfn helper function

Signed-off-by: Alex Nixon <alex.nixon@citrix.com>
---
 arch/x86/include/asm/xen/page.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index 1a918dde46b50..018a0a400799c 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -124,7 +124,8 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
 
 /* VIRT <-> MACHINE conversion */
 #define virt_to_machine(v)	(phys_to_machine(XPADDR(__pa(v))))
-#define virt_to_mfn(v)		(pfn_to_mfn(PFN_DOWN(__pa(v))))
+#define virt_to_pfn(v)          (PFN_DOWN(__pa(v)))
+#define virt_to_mfn(v)		(pfn_to_mfn(virt_to_pfn(v)))
 #define mfn_to_virt(m)		(__va(mfn_to_pfn(m) << PAGE_SHIFT))
 
 static inline unsigned long pte_mfn(pte_t pte)
-- 
GitLab


From 5f241e65f2be4661a33e1937e1c829252a80b2b8 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Mon, 16 Mar 2009 17:08:48 -0700
Subject: [PATCH 0136/2198] x86-64: non-paravirt systems always has PSE and PGE

A paravirtualized system may not have PSE or PGE available to
guests, so they are not required features.  However, without
paravirt we can assume that any x86-64 implementation will have
them available.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/required-features.h | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
index a4737dddfd587..64cf2d24fad1c 100644
--- a/arch/x86/include/asm/required-features.h
+++ b/arch/x86/include/asm/required-features.h
@@ -48,9 +48,15 @@
 #endif
 
 #ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
 #define NEED_PSE	0
-#define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
 #define NEED_PGE	0
+#else
+#define NEED_PSE	(1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE	(1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR	(1<<(X86_FEATURE_MSR & 31))
 #define NEED_FXSR	(1<<(X86_FEATURE_FXSR & 31))
 #define NEED_XMM	(1<<(X86_FEATURE_XMM & 31))
 #define NEED_XMM2	(1<<(X86_FEATURE_XMM2 & 31))
-- 
GitLab


From 4185f35404dc96f8525298c7c548aee419f3b3f4 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 17 Mar 2009 13:30:55 -0700
Subject: [PATCH 0137/2198] xen/mmu: some early pagetable cleanups

1. make sure early-allocated ptes are pinned, so they can be later
   unpinned
2. don't pin pmd+pud, just make them RO
3. scatter some __inits around

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c     | 40 ++++++++++++++++++++++++++++------------
 arch/x86/xen/xen-ops.h |  2 --
 2 files changed, 28 insertions(+), 14 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 67d2ab45cd90c..df87c803cecc9 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1013,7 +1013,7 @@ static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
 	return 0;
 }
 
-void __init xen_mark_init_mm_pinned(void)
+static void __init xen_mark_init_mm_pinned(void)
 {
 	xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
 }
@@ -1461,10 +1461,29 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 }
 #endif
 
+static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+{
+	struct mmuext_op op;
+	op.cmd = cmd;
+	op.arg1.mfn = pfn_to_mfn(pfn);
+	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
+		BUG();
+}
+
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
 static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
 {
+#ifdef CONFIG_FLATMEM
+	BUG_ON(mem_map);	/* should only be used early */
+#endif
+	make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
+	pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
+}
+
+/* Used for pmd and pud */
+static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
+{
 #ifdef CONFIG_FLATMEM
 	BUG_ON(mem_map);	/* should only be used early */
 #endif
@@ -1473,18 +1492,15 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
 
 /* Early release_pte assumes that all pts are pinned, since there's
    only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(unsigned long pfn)
+static __init void xen_release_pte_init(unsigned long pfn)
 {
+	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
-static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
+static __init void xen_release_pmd_init(unsigned long pfn)
 {
-	struct mmuext_op op;
-	op.cmd = cmd;
-	op.arg1.mfn = pfn_to_mfn(pfn);
-	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-		BUG();
+	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
 
 /* This needs to make sure the new pte page is pinned iff its being
@@ -1873,9 +1889,9 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
 	.alloc_pte = xen_alloc_pte_init,
 	.release_pte = xen_release_pte_init,
-	.alloc_pmd = xen_alloc_pte_init,
+	.alloc_pmd = xen_alloc_pmd_init,
 	.alloc_pmd_clone = paravirt_nop,
-	.release_pmd = xen_release_pte_init,
+	.release_pmd = xen_release_pmd_init,
 
 #ifdef CONFIG_HIGHPTE
 	.kmap_atomic_pte = xen_kmap_atomic_pte,
@@ -1914,8 +1930,8 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
 	.set_pgd = xen_set_pgd_hyper,
 
-	.alloc_pud = xen_alloc_pte_init,
-	.release_pud = xen_release_pte_init,
+	.alloc_pud = xen_alloc_pmd_init,
+	.release_pud = xen_release_pmd_init,
 #endif	/* PAGETABLE_LEVELS == 4 */
 
 	.activate_mm = xen_activate_mm,
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index f897cdffccb6f..5c50a1017a37b 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -56,8 +56,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
 bool xen_vcpu_stolen(int vcpu);
 
-void xen_mark_init_mm_pinned(void);
-
 void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
-- 
GitLab


From 8de07bbdede03598801cf33ab23dcbcd28a918d2 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Wed, 4 Mar 2009 17:36:57 -0800
Subject: [PATCH 0138/2198] xen/mmu: weaken flush_tlb_other test

Impact: fixes crashing bug

There's no particular problem with getting an empty cpu mask,
so just shortcut-return if we get one.

Avoids crash reported by Christophe Saout <christophe@saout.de>

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index df87c803cecc9..e425a32e0a908 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1293,8 +1293,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 	} *args;
 	struct multicall_space mcs;
 
-	BUG_ON(cpumask_empty(cpus));
-	BUG_ON(!mm);
+	if (cpumask_empty(cpus))
+		return;		/* nothing to do */
 
 	mcs = xen_mc_entry(sizeof(*args));
 	args = mcs.args;
-- 
GitLab


From 1e6fcf840e11ceff8a656a678c6e4b0560a98e08 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Wed, 25 Mar 2009 17:46:42 +0000
Subject: [PATCH 0139/2198] xen: resume interrupts before system devices.

Impact: bugfix Xen domain restore

Otherwise the first timer interrupt after resume is missed and we never
get another.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/manage.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 3ccd348d112d6..b703dd2c9f11e 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -68,15 +68,15 @@ static int xen_suspend(void *data)
 	gnttab_resume();
 	xen_mm_unpin_all();
 
-	sysdev_resume();
-	device_power_up(PMSG_RESUME);
-
 	if (!*cancelled) {
 		xen_irq_resume();
 		xen_console_resume();
 		xen_timer_resume();
 	}
 
+	sysdev_resume();
+	device_power_up(PMSG_RESUME);
+
 	return 0;
 }
 
-- 
GitLab


From 707ebbc81c61eb480d8a51ca61e355e240df1d32 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 27 Mar 2009 11:29:02 -0700
Subject: [PATCH 0140/2198] xen: set _PAGE_NX in __supported_pte_mask before
 pagetable construction

Some 64-bit machines don't support the NX flag in ptes.
Check for NX before constructing the kernel pagetables.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index da33e0c5870d9..80f4c53434957 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -42,6 +42,7 @@
 #include <asm/xen/hypervisor.h>
 #include <asm/fixmap.h>
 #include <asm/processor.h>
+#include <asm/proto.h>
 #include <asm/msr-index.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
@@ -912,7 +913,6 @@ static const struct machine_ops __initdata xen_machine_ops = {
 	.emergency_restart = xen_emergency_restart,
 };
 
-
 /* First C function to be called on Xen boot */
 asmlinkage void __init xen_start_kernel(void)
 {
@@ -980,6 +980,11 @@ asmlinkage void __init xen_start_kernel(void)
 	if (!xen_initial_domain())
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
+#ifdef CONFIG_X86_64
+	/* Work out if we support NX */
+	check_efer();
+#endif
+
 	/* Don't do the full vcpu_info placement stuff until we have a
 	   possible map and a non-dummy shared_info. */
 	per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
-- 
GitLab


From 6d02c42698f99eccb290ac53d4f10ca883b9f90c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 29 Mar 2009 22:57:15 -0700
Subject: [PATCH 0141/2198] xen: clean up gate trap/interrupt constants

Use GATE_INTERRUPT/TRAP rather than 0xe/f.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 80f4c53434957..12a3159333bc0 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -428,7 +428,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
-	if (val->type != 0xf && val->type != 0xe)
+	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
@@ -436,8 +436,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 	info->cs = gate_segment(*val);
 	info->flags = val->dpl;
 	/* interrupt gates clear IF */
-	if (val->type == 0xe)
-		info->flags |= 4;
+	if (val->type == GATE_INTERRUPT)
+		info->flags |= 1 << 2;
 
 	return 1;
 }
-- 
GitLab


From d4c045364d3107603187f21a56ec231e74d26441 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 6 Feb 2009 19:20:31 -0800
Subject: [PATCH 0142/2198] xen: add irq_from_evtchn

Given an evtchn, return the corresponding irq.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/events.c | 6 ++++++
 include/xen/events.h | 3 +++
 2 files changed, 9 insertions(+)

diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 30963af5dba02..1cd2a0e15ae8e 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq)
 	return info_for_irq(irq)->evtchn;
 }
 
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+	return evtchn_to_irq[evtchn];
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
 static enum ipi_vector ipi_from_irq(unsigned irq)
 {
 	struct irq_info *info = info_for_irq(irq);
diff --git a/include/xen/events.h b/include/xen/events.h
index 0d5f1adc0363e..e68d59a90ca88 100644
--- a/include/xen/events.h
+++ b/include/xen/events.h
@@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq);
    irq will be disabled so it won't deliver an interrupt. */
 void xen_poll_irq(int irq);
 
+/* Determine the IRQ which is bound to an event channel */
+unsigned irq_from_evtchn(unsigned int evtchn);
+
 #endif	/* _XEN_EVENTS_H */
-- 
GitLab


From f7116284c734f3a47180cd9c907944a1837ccb3c Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 6 Feb 2009 19:21:19 -0800
Subject: [PATCH 0143/2198] xen: add /dev/xen/evtchn driver

This driver is used by application which wish to receive notifications
from the hypervisor or other guests via Xen's event channel
mechanism. In particular it is used by the xenstore daemon in domain
0.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/Kconfig  |  10 +
 drivers/xen/Makefile |   3 +-
 drivers/xen/evtchn.c | 494 +++++++++++++++++++++++++++++++++++++++++++
 include/xen/evtchn.h |  88 ++++++++
 4 files changed, 594 insertions(+), 1 deletion(-)
 create mode 100644 drivers/xen/evtchn.c
 create mode 100644 include/xen/evtchn.h

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 526187c8a12de..1bbb9108f31e4 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES
 	  secure, but slightly less efficient.
 	  If in doubt, say yes.
 
+config XEN_DEV_EVTCHN
+	tristate "Xen /dev/xen/evtchn device"
+	depends on XEN
+	default y
+	help
+	  The evtchn driver allows a userspace process to triger event
+	  channels and to receive notification of an event channel
+	  firing.
+	  If in doubt, say yes.
+
 config XENFS
 	tristate "Xen filesystem"
 	depends on XEN
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ff8accc9e103f..1567639847e7b 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,5 @@ obj-y	+= xenbus/
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
-obj-$(CONFIG_XENFS)		+= xenfs/
\ No newline at end of file
+obj-$(CONFIG_XEN_DEV_EVTCHN)	+= evtchn.o
+obj-$(CONFIG_XENFS)		+= xenfs/
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644
index 0000000000000..517b9ee63e18f
--- /dev/null
+++ b/drivers/xen/evtchn.c
@@ -0,0 +1,494 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <xen/events.h>
+#include <xen/evtchn.h>
+#include <asm/xen/hypervisor.h>
+
+struct per_user_data {
+	/* Notification ring, accessed via /dev/xen/evtchn. */
+#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
+	evtchn_port_t *ring;
+	unsigned int ring_cons, ring_prod, ring_overflow;
+	struct mutex ring_cons_mutex; /* protect against concurrent readers */
+
+	/* Processes wait on this queue when ring is empty. */
+	wait_queue_head_t evtchn_wait;
+	struct fasync_struct *evtchn_async_queue;
+	const char *name;
+};
+
+/* Who's bound to each port? */
+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+static DEFINE_SPINLOCK(port_user_lock);
+
+irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+	unsigned int port = (unsigned long)data;
+	struct per_user_data *u;
+
+	spin_lock(&port_user_lock);
+
+	u = port_user[port];
+
+	disable_irq_nosync(irq);
+
+	if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+		u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+		wmb(); /* Ensure ring contents visible */
+		if (u->ring_cons == u->ring_prod++) {
+			wake_up_interruptible(&u->evtchn_wait);
+			kill_fasync(&u->evtchn_async_queue,
+				    SIGIO, POLL_IN);
+		}
+	} else {
+		u->ring_overflow = 1;
+	}
+
+	spin_unlock(&port_user_lock);
+
+	return IRQ_HANDLED;
+}
+
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+			   size_t count, loff_t *ppos)
+{
+	int rc;
+	unsigned int c, p, bytes1 = 0, bytes2 = 0;
+	struct per_user_data *u = file->private_data;
+
+	/* Whole number of ports. */
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	if (count == 0)
+		return 0;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	for (;;) {
+		mutex_lock(&u->ring_cons_mutex);
+
+		rc = -EFBIG;
+		if (u->ring_overflow)
+			goto unlock_out;
+
+		c = u->ring_cons;
+		p = u->ring_prod;
+		if (c != p)
+			break;
+
+		mutex_unlock(&u->ring_cons_mutex);
+
+		if (file->f_flags & O_NONBLOCK)
+			return -EAGAIN;
+
+		rc = wait_event_interruptible(u->evtchn_wait,
+					      u->ring_cons != u->ring_prod);
+		if (rc)
+			return rc;
+	}
+
+	/* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+	if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
+		bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
+			sizeof(evtchn_port_t);
+		bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
+	} else {
+		bytes1 = (p - c) * sizeof(evtchn_port_t);
+		bytes2 = 0;
+	}
+
+	/* Truncate chunks according to caller's maximum byte count. */
+	if (bytes1 > count) {
+		bytes1 = count;
+		bytes2 = 0;
+	} else if ((bytes1 + bytes2) > count) {
+		bytes2 = count - bytes1;
+	}
+
+	rc = -EFAULT;
+	rmb(); /* Ensure that we see the port before we copy it. */
+	if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
+	    ((bytes2 != 0) &&
+	     copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+		goto unlock_out;
+
+	u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+	rc = bytes1 + bytes2;
+
+ unlock_out:
+	mutex_unlock(&u->ring_cons_mutex);
+	return rc;
+}
+
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int rc, i;
+	evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+	struct per_user_data *u = file->private_data;
+
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	/* Whole number of ports. */
+	count &= ~(sizeof(evtchn_port_t)-1);
+
+	rc = 0;
+	if (count == 0)
+		goto out;
+
+	if (count > PAGE_SIZE)
+		count = PAGE_SIZE;
+
+	rc = -EFAULT;
+	if (copy_from_user(kbuf, buf, count) != 0)
+		goto out;
+
+	spin_lock_irq(&port_user_lock);
+	for (i = 0; i < (count/sizeof(evtchn_port_t)); i++)
+		if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u))
+			enable_irq(irq_from_evtchn(kbuf[i]));
+	spin_unlock_irq(&port_user_lock);
+
+	rc = count;
+
+ out:
+	free_page((unsigned long)kbuf);
+	return rc;
+}
+
+static int evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+	int irq;
+	int rc = 0;
+
+	spin_lock_irq(&port_user_lock);
+
+	BUG_ON(port_user[port] != NULL);
+
+	irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+					u->name, (void *)(unsigned long)port);
+	if (rc < 0)
+		goto fail;
+
+	port_user[port] = u;
+
+fail:
+	spin_unlock_irq(&port_user_lock);
+	return rc;
+}
+
+static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+{
+	int irq = irq_from_evtchn(port);
+
+	unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+	port_user[port] = NULL;
+}
+
+static long evtchn_ioctl(struct file *file,
+			 unsigned int cmd, unsigned long arg)
+{
+	int rc;
+	struct per_user_data *u = file->private_data;
+	void __user *uarg = (void __user *) arg;
+
+	switch (cmd) {
+	case IOCTL_EVTCHN_BIND_VIRQ: {
+		struct ioctl_evtchn_bind_virq bind;
+		struct evtchn_bind_virq bind_virq;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_virq.virq = bind.virq;
+		bind_virq.vcpu = 0;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+						 &bind_virq);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind_virq.port);
+		if (rc == 0)
+			rc = bind_virq.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+		struct ioctl_evtchn_bind_interdomain bind;
+		struct evtchn_bind_interdomain bind_interdomain;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		bind_interdomain.remote_dom  = bind.remote_domain;
+		bind_interdomain.remote_port = bind.remote_port;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+						 &bind_interdomain);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
+		if (rc == 0)
+			rc = bind_interdomain.local_port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+		struct ioctl_evtchn_bind_unbound_port bind;
+		struct evtchn_alloc_unbound alloc_unbound;
+
+		rc = -EFAULT;
+		if (copy_from_user(&bind, uarg, sizeof(bind)))
+			break;
+
+		alloc_unbound.dom        = DOMID_SELF;
+		alloc_unbound.remote_dom = bind.remote_domain;
+		rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+						 &alloc_unbound);
+		if (rc != 0)
+			break;
+
+		rc = evtchn_bind_to_user(u, alloc_unbound.port);
+		if (rc == 0)
+			rc = alloc_unbound.port;
+		break;
+	}
+
+	case IOCTL_EVTCHN_UNBIND: {
+		struct ioctl_evtchn_unbind unbind;
+
+		rc = -EFAULT;
+		if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+			break;
+
+		rc = -EINVAL;
+		if (unbind.port >= NR_EVENT_CHANNELS)
+			break;
+
+		spin_lock_irq(&port_user_lock);
+
+		rc = -ENOTCONN;
+		if (port_user[unbind.port] != u) {
+			spin_unlock_irq(&port_user_lock);
+			break;
+		}
+
+		evtchn_unbind_from_user(u, unbind.port);
+
+		spin_unlock_irq(&port_user_lock);
+
+		rc = 0;
+		break;
+	}
+
+	case IOCTL_EVTCHN_NOTIFY: {
+		struct ioctl_evtchn_notify notify;
+
+		rc = -EFAULT;
+		if (copy_from_user(&notify, uarg, sizeof(notify)))
+			break;
+
+		if (notify.port >= NR_EVENT_CHANNELS) {
+			rc = -EINVAL;
+		} else if (port_user[notify.port] != u) {
+			rc = -ENOTCONN;
+		} else {
+			notify_remote_via_evtchn(notify.port);
+			rc = 0;
+		}
+		break;
+	}
+
+	case IOCTL_EVTCHN_RESET: {
+		/* Initialise the ring to empty. Clear errors. */
+		mutex_lock(&u->ring_cons_mutex);
+		spin_lock_irq(&port_user_lock);
+		u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+		spin_unlock_irq(&port_user_lock);
+		mutex_unlock(&u->ring_cons_mutex);
+		rc = 0;
+		break;
+	}
+
+	default:
+		rc = -ENOSYS;
+		break;
+	}
+
+	return rc;
+}
+
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+	unsigned int mask = POLLOUT | POLLWRNORM;
+	struct per_user_data *u = file->private_data;
+
+	poll_wait(file, &u->evtchn_wait, wait);
+	if (u->ring_cons != u->ring_prod)
+		mask |= POLLIN | POLLRDNORM;
+	if (u->ring_overflow)
+		mask = POLLERR;
+	return mask;
+}
+
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+	struct per_user_data *u = filp->private_data;
+	return fasync_helper(fd, filp, on, &u->evtchn_async_queue);
+}
+
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+	struct per_user_data *u;
+
+	u = kzalloc(sizeof(*u), GFP_KERNEL);
+	if (u == NULL)
+		return -ENOMEM;
+
+	u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+	if (u->name == NULL) {
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	init_waitqueue_head(&u->evtchn_wait);
+
+	u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+	if (u->ring == NULL) {
+		kfree(u->name);
+		kfree(u);
+		return -ENOMEM;
+	}
+
+	mutex_init(&u->ring_cons_mutex);
+
+	filp->private_data = u;
+
+	return 0;
+}
+
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+	int i;
+	struct per_user_data *u = filp->private_data;
+
+	spin_lock_irq(&port_user_lock);
+
+	free_page((unsigned long)u->ring);
+
+	for (i = 0; i < NR_EVENT_CHANNELS; i++) {
+		if (port_user[i] != u)
+			continue;
+
+		evtchn_unbind_from_user(port_user[i], i);
+	}
+
+	spin_unlock_irq(&port_user_lock);
+
+	kfree(u->name);
+	kfree(u);
+
+	return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+	.owner   = THIS_MODULE,
+	.read    = evtchn_read,
+	.write   = evtchn_write,
+	.unlocked_ioctl = evtchn_ioctl,
+	.poll    = evtchn_poll,
+	.fasync  = evtchn_fasync,
+	.open    = evtchn_open,
+	.release = evtchn_release,
+};
+
+static struct miscdevice evtchn_miscdev = {
+	.minor        = MISC_DYNAMIC_MINOR,
+	.name         = "evtchn",
+	.fops         = &evtchn_fops,
+};
+static int __init evtchn_init(void)
+{
+	int err;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	spin_lock_init(&port_user_lock);
+	memset(port_user, 0, sizeof(port_user));
+
+	/* Create '/dev/misc/evtchn'. */
+	err = misc_register(&evtchn_miscdev);
+	if (err != 0) {
+		printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+		return err;
+	}
+
+	printk(KERN_INFO "Event-channel device installed.\n");
+
+	return 0;
+}
+
+static void __exit evtchn_cleanup(void)
+{
+	misc_deregister(&evtchn_miscdev);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("GPL");
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h
new file mode 100644
index 0000000000000..14e833ee4e0bd
--- /dev/null
+++ b/include/xen/evtchn.h
@@ -0,0 +1,88 @@
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/evtchn.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_EVTCHN_H__
+#define __LINUX_PUBLIC_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ				\
+	_IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
+struct ioctl_evtchn_bind_virq {
+	unsigned int virq;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN			\
+	_IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
+struct ioctl_evtchn_bind_interdomain {
+	unsigned int remote_domain, remote_port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT			\
+	_IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
+struct ioctl_evtchn_bind_unbound_port {
+	unsigned int remote_domain;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND				\
+	_IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
+struct ioctl_evtchn_unbind {
+	unsigned int port;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY				\
+	_IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
+struct ioctl_evtchn_notify {
+	unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET				\
+	_IOC(_IOC_NONE, 'E', 5, 0)
+
+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
-- 
GitLab


From c5cfef0f79cacc3aa438fc28f4747f0d10c54d0d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Fri, 6 Feb 2009 19:21:19 -0800
Subject: [PATCH 0144/2198] xen: export ioctl headers to userspace

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 include/Kbuild     | 1 +
 include/xen/Kbuild | 1 +
 2 files changed, 2 insertions(+)
 create mode 100644 include/xen/Kbuild

diff --git a/include/Kbuild b/include/Kbuild
index d8c3e3cbf4160..fe36accd43283 100644
--- a/include/Kbuild
+++ b/include/Kbuild
@@ -8,3 +8,4 @@ header-y += mtd/
 header-y += rdma/
 header-y += video/
 header-y += drm/
+header-y += xen/
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
new file mode 100644
index 0000000000000..4e65c16a445b0
--- /dev/null
+++ b/include/xen/Kbuild
@@ -0,0 +1 @@
+header-y += evtchn.h
-- 
GitLab


From 0a4666b539a0e896ec4e8396a034a479e3573125 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 12 Feb 2009 13:03:24 -0800
Subject: [PATCH 0145/2198] xen/dev-evtchn: clean up locking in evtchn

Define a new per_user_data mutex to serialize bind/unbind operations
to prevent them from racing with each other.  Fix error returns
and don't do a bind while holding a spinlock.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/evtchn.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
index 517b9ee63e18f..af031950f9b1a 100644
--- a/drivers/xen/evtchn.c
+++ b/drivers/xen/evtchn.c
@@ -54,6 +54,8 @@
 #include <asm/xen/hypervisor.h>
 
 struct per_user_data {
+	struct mutex bind_mutex; /* serialize bind/unbind operations */
+
 	/* Notification ring, accessed via /dev/xen/evtchn. */
 #define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
 #define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
@@ -69,7 +71,7 @@ struct per_user_data {
 
 /* Who's bound to each port? */
 static struct per_user_data *port_user[NR_EVENT_CHANNELS];
-static DEFINE_SPINLOCK(port_user_lock);
+static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
 
 irqreturn_t evtchn_interrupt(int irq, void *data)
 {
@@ -210,22 +212,24 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf,
 
 static int evtchn_bind_to_user(struct per_user_data *u, int port)
 {
-	int irq;
 	int rc = 0;
 
-	spin_lock_irq(&port_user_lock);
-
+	/*
+	 * Ports are never reused, so every caller should pass in a
+	 * unique port.
+	 *
+	 * (Locking not necessary because we haven't registered the
+	 * interrupt handler yet, and our caller has already
+	 * serialized bind operations.)
+	 */
 	BUG_ON(port_user[port] != NULL);
-
-	irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
-					u->name, (void *)(unsigned long)port);
-	if (rc < 0)
-		goto fail;
-
 	port_user[port] = u;
 
-fail:
-	spin_unlock_irq(&port_user_lock);
+	rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+				       u->name, (void *)(unsigned long)port);
+	if (rc >= 0)
+		rc = 0;
+
 	return rc;
 }
 
@@ -234,6 +238,10 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port)
 	int irq = irq_from_evtchn(port);
 
 	unbind_from_irqhandler(irq, (void *)(unsigned long)port);
+
+	/* make sure we unbind the irq handler before clearing the port */
+	barrier();
+
 	port_user[port] = NULL;
 }
 
@@ -244,6 +252,9 @@ static long evtchn_ioctl(struct file *file,
 	struct per_user_data *u = file->private_data;
 	void __user *uarg = (void __user *) arg;
 
+	/* Prevent bind from racing with unbind */
+	mutex_lock(&u->bind_mutex);
+
 	switch (cmd) {
 	case IOCTL_EVTCHN_BIND_VIRQ: {
 		struct ioctl_evtchn_bind_virq bind;
@@ -368,6 +379,7 @@ static long evtchn_ioctl(struct file *file,
 		rc = -ENOSYS;
 		break;
 	}
+	mutex_unlock(&u->bind_mutex);
 
 	return rc;
 }
@@ -414,6 +426,7 @@ static int evtchn_open(struct inode *inode, struct file *filp)
 		return -ENOMEM;
 	}
 
+	mutex_init(&u->bind_mutex);
 	mutex_init(&u->ring_cons_mutex);
 
 	filp->private_data = u;
-- 
GitLab


From a1ce1be578365a4da7e7d7db4812539d2d5da763 Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Mon, 9 Feb 2009 12:05:50 -0800
Subject: [PATCH 0146/2198] xen: remove suspend_cancel hook

Remove suspend_cancel hook from xenbus_driver, in preparation for using
the device model for suspending.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/xenbus/xenbus_probe.c | 23 -----------------------
 include/xen/xenbus.h              |  1 -
 2 files changed, 24 deletions(-)

diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 773d1cf232833..bd20361fb0997 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -689,27 +689,6 @@ static int suspend_dev(struct device *dev, void *data)
 	return 0;
 }
 
-static int suspend_cancel_dev(struct device *dev, void *data)
-{
-	int err = 0;
-	struct xenbus_driver *drv;
-	struct xenbus_device *xdev;
-
-	DPRINTK("");
-
-	if (dev->driver == NULL)
-		return 0;
-	drv = to_xenbus_driver(dev->driver);
-	xdev = container_of(dev, struct xenbus_device, dev);
-	if (drv->suspend_cancel)
-		err = drv->suspend_cancel(xdev);
-	if (err)
-		printk(KERN_WARNING
-		       "xenbus: suspend_cancel %s failed: %i\n",
-		       dev_name(dev), err);
-	return 0;
-}
-
 static int resume_dev(struct device *dev, void *data)
 {
 	int err;
@@ -777,8 +756,6 @@ EXPORT_SYMBOL_GPL(xenbus_resume);
 void xenbus_suspend_cancel(void)
 {
 	xs_suspend_cancel();
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
-	xenbus_backend_resume(suspend_cancel_dev);
 }
 EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
 
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index f87f9614844db..0836772b96861 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -92,7 +92,6 @@ struct xenbus_driver {
 				 enum xenbus_state backend_state);
 	int (*remove)(struct xenbus_device *dev);
 	int (*suspend)(struct xenbus_device *dev);
-	int (*suspend_cancel)(struct xenbus_device *dev);
 	int (*resume)(struct xenbus_device *dev);
 	int (*uevent)(struct xenbus_device *, char **, int, char *, int);
 	struct device_driver driver;
-- 
GitLab


From de5b31bd47de7e6f41be2e271318dbc8f1af354d Mon Sep 17 00:00:00 2001
From: Ian Campbell <ian.campbell@citrix.com>
Date: Mon, 9 Feb 2009 12:05:50 -0800
Subject: [PATCH 0147/2198] xen: use device model for suspending xenbus devices

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/manage.c              |  9 ++++----
 drivers/xen/xenbus/xenbus_probe.c | 37 ++++++++-----------------------
 drivers/xen/xenbus/xenbus_xs.c    |  2 ++
 include/xen/xenbus.h              |  2 +-
 4 files changed, 16 insertions(+), 34 deletions(-)

diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index b703dd2c9f11e..5269bb4d24969 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -104,9 +104,8 @@ static void do_suspend(void)
 		goto out;
 	}
 
-	printk("suspending xenbus...\n");
-	/* XXX use normal device tree? */
-	xenbus_suspend();
+	printk(KERN_DEBUG "suspending xenstore...\n");
+	xs_suspend();
 
 	err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
 	if (err) {
@@ -116,9 +115,9 @@ static void do_suspend(void)
 
 	if (!cancelled) {
 		xen_arch_resume();
-		xenbus_resume();
+		xs_resume();
 	} else
-		xenbus_suspend_cancel();
+		xs_suspend_cancel();
 
 	device_resume(PMSG_RESUME);
 
diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index bd20361fb0997..4649213bed9e4 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name);
 
 static void xenbus_dev_shutdown(struct device *_dev);
 
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+static int xenbus_dev_resume(struct device *dev);
+
 /* If something in array of ids matches this device, return it. */
 static const struct xenbus_device_id *
 match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = {
 		.remove    = xenbus_dev_remove,
 		.shutdown  = xenbus_dev_shutdown,
 		.dev_attrs = xenbus_dev_attrs,
+
+		.suspend   = xenbus_dev_suspend,
+		.resume    = xenbus_dev_resume,
 	},
 };
 
@@ -669,7 +675,7 @@ static struct xenbus_watch fe_watch = {
 	.callback = frontend_changed,
 };
 
-static int suspend_dev(struct device *dev, void *data)
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
 {
 	int err = 0;
 	struct xenbus_driver *drv;
@@ -682,14 +688,14 @@ static int suspend_dev(struct device *dev, void *data)
 	drv = to_xenbus_driver(dev->driver);
 	xdev = container_of(dev, struct xenbus_device, dev);
 	if (drv->suspend)
-		err = drv->suspend(xdev);
+		err = drv->suspend(xdev, state);
 	if (err)
 		printk(KERN_WARNING
 		       "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
 	return 0;
 }
 
-static int resume_dev(struct device *dev, void *data)
+static int xenbus_dev_resume(struct device *dev)
 {
 	int err;
 	struct xenbus_driver *drv;
@@ -734,31 +740,6 @@ static int resume_dev(struct device *dev, void *data)
 	return 0;
 }
 
-void xenbus_suspend(void)
-{
-	DPRINTK("");
-
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
-	xenbus_backend_suspend(suspend_dev);
-	xs_suspend();
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend);
-
-void xenbus_resume(void)
-{
-	xb_init_comms();
-	xs_resume();
-	bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
-	xenbus_backend_resume(resume_dev);
-}
-EXPORT_SYMBOL_GPL(xenbus_resume);
-
-void xenbus_suspend_cancel(void)
-{
-	xs_suspend_cancel();
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
-
 /* A flag to determine if xenstored is 'ready' (i.e. has started) */
 int xenstored_ready = 0;
 
diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c
index e325eab4724d8..eab33f1dbdf70 100644
--- a/drivers/xen/xenbus/xenbus_xs.c
+++ b/drivers/xen/xenbus/xenbus_xs.c
@@ -673,6 +673,8 @@ void xs_resume(void)
 	struct xenbus_watch *watch;
 	char token[sizeof(watch) * 2 + 1];
 
+	xb_init_comms();
+
 	mutex_unlock(&xs_state.response_mutex);
 	mutex_unlock(&xs_state.request_mutex);
 	up_write(&xs_state.transaction_mutex);
diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h
index 0836772b96861..b9763badbd77d 100644
--- a/include/xen/xenbus.h
+++ b/include/xen/xenbus.h
@@ -91,7 +91,7 @@ struct xenbus_driver {
 	void (*otherend_changed)(struct xenbus_device *dev,
 				 enum xenbus_state backend_state);
 	int (*remove)(struct xenbus_device *dev);
-	int (*suspend)(struct xenbus_device *dev);
+	int (*suspend)(struct xenbus_device *dev, pm_message_t state);
 	int (*resume)(struct xenbus_device *dev);
 	int (*uevent)(struct xenbus_device *, char **, int, char *, int);
 	struct device_driver driver;
-- 
GitLab


From c6a960ce8858f20036cc3afc3b9422670d0d9021 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Mon, 9 Feb 2009 12:05:53 -0800
Subject: [PATCH 0148/2198] xen/xenbus: export xenbus_dev_changed

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/xenbus/xenbus_probe.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c
index 4649213bed9e4..d42e25d5968dc 100644
--- a/drivers/xen/xenbus/xenbus_probe.c
+++ b/drivers/xen/xenbus/xenbus_probe.c
@@ -660,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
 
 	kfree(root);
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_changed);
 
 static void frontend_changed(struct xenbus_watch *watch,
 			     const char **vec, unsigned int len)
-- 
GitLab


From cff7e81b3dd7c25cd2248cd7a04c5764552d5d55 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 10 Mar 2009 14:39:59 -0700
Subject: [PATCH 0149/2198] xen: add /sys/hypervisor support

Adds support for Xen info under /sys/hypervisor.  Taken from Novell 2.6.27
backport tree.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/Kconfig             |  10 +
 drivers/xen/Makefile            |   3 +-
 drivers/xen/sys-hypervisor.c    | 475 ++++++++++++++++++++++++++++++++
 include/xen/interface/version.h |   3 +
 4 files changed, 490 insertions(+), 1 deletion(-)
 create mode 100644 drivers/xen/sys-hypervisor.c

diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 526187c8a12de..88bca1c42db4a 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -41,3 +41,13 @@ config XEN_COMPAT_XENFS
          a xen platform.
          If in doubt, say yes.
 
+config XEN_SYS_HYPERVISOR
+       bool "Create xen entries under /sys/hypervisor"
+       depends on XEN && SYSFS
+       select SYS_HYPERVISOR
+       default y
+       help
+         Create entries under /sys/hypervisor describing the Xen
+	 hypervisor environment.  When running native or in another
+	 virtual environment, /sys/hypervisor will still be present,
+	 but will have no xen contents.
\ No newline at end of file
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index ff8accc9e103f..f3603a39db55f 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -4,4 +4,5 @@ obj-y	+= xenbus/
 obj-$(CONFIG_HOTPLUG_CPU)	+= cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)	+= xencomm.o
 obj-$(CONFIG_XEN_BALLOON)	+= balloon.o
-obj-$(CONFIG_XENFS)		+= xenfs/
\ No newline at end of file
+obj-$(CONFIG_XENFS)		+= xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)	+= sys-hypervisor.o
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644
index 0000000000000..cb29d1ccf0d98
--- /dev/null
+++ b/drivers/xen/sys-hypervisor.c
@@ -0,0 +1,475 @@
+/*
+ *  copyright (c) 2006 IBM Corporation
+ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct hyp_sysfs_attr *, char *);
+	ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
+	void *hyp_attr_data;
+};
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return sprintf(buffer, "xen\n");
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+}
+
+static void xen_sysfs_type_destroy(void)
+{
+	sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+}
+
+/* xen version attributes */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+	if (version)
+		return sprintf(buffer, "%d\n", version >> 16);
+	return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+	if (version)
+		return sprintf(buffer, "%d\n", version & 0xff);
+	return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *extra;
+
+	extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+	if (extra) {
+		ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", extra);
+		kfree(extra);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {
+	&major_attr.attr,
+	&minor_attr.attr,
+	&extra_attr.attr,
+	NULL
+};
+
+static struct attribute_group version_group = {
+	.name = "version",
+	.attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &version_group);
+}
+
+static void xen_sysfs_version_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &version_group);
+}
+
+/* UUID */
+
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	char *vm, *val;
+	int ret;
+	extern int xenstored_ready;
+
+	if (!xenstored_ready)
+		return -EBUSY;
+
+	vm = xenbus_read(XBT_NIL, "vm", "", NULL);
+	if (IS_ERR(vm))
+		return PTR_ERR(vm);
+	val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
+	kfree(vm);
+	if (IS_ERR(val))
+		return PTR_ERR(val);
+	ret = sprintf(buffer, "%s\n", val);
+	kfree(val);
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+static void xen_sysfs_uuid_destroy(void)
+{
+	sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+/* xen compilation attributes */
+
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_compile_info *info;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (info) {
+		ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compiler);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_compile_info *info;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (info) {
+		ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compile_by);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_compile_info *info;
+
+	info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+	if (info) {
+		ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", info->compile_date);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+	&compiler_attr.attr,
+	&compiled_by_attr.attr,
+	&compile_date_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+	.name = "compilation",
+	.attrs = xen_compile_attrs,
+};
+
+int __init static xen_compilation_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *caps;
+
+	caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+	if (caps) {
+		ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", caps);
+		kfree(caps);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	char *cset;
+
+	cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+	if (cset) {
+		ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+		if (!ret)
+			ret = sprintf(buffer, "%s\n", cset);
+		kfree(cset);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_platform_parameters *parms;
+
+	parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+	if (parms) {
+		ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+					     parms);
+		if (!ret)
+			ret = sprintf(buffer, "%lx\n", parms->virt_start);
+		kfree(parms);
+	}
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	int ret;
+
+	ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+	if (ret > 0)
+		ret = sprintf(buffer, "%x\n", ret);
+
+	return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+/* eventually there will be several more features to export */
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+	int ret = -ENOMEM;
+	struct xen_feature_info *info;
+
+	info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
+	if (info) {
+		info->submap_idx = index;
+		ret = HYPERVISOR_xen_version(XENVER_get_features, info);
+		if (!ret)
+			ret = sprintf(buffer, "%d\n", info->submap);
+		kfree(info);
+	}
+
+	return ret;
+}
+
+static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+	return xen_feature_show(XENFEAT_writable_page_tables, buffer);
+}
+
+HYPERVISOR_ATTR_RO(writable_pt);
+
+static struct attribute *xen_properties_attrs[] = {
+	&capabilities_attr.attr,
+	&changeset_attr.attr,
+	&virtual_start_attr.attr,
+	&pagesize_attr.attr,
+	&writable_pt_attr.attr,
+	NULL
+};
+
+static struct attribute_group xen_properties_group = {
+	.name = "properties",
+	.attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+	return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+}
+
+#ifdef CONFIG_KEXEC
+
+extern size_t vmcoreinfo_size_xen;
+extern unsigned long paddr_vmcoreinfo_xen;
+
+static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
+{
+	return sprintf(page, "%lx %zx\n",
+		paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
+}
+
+HYPERVISOR_ATTR_RO(vmcoreinfo);
+
+static int __init xen_sysfs_vmcoreinfo_init(void)
+{
+	return sysfs_create_file(hypervisor_kobj,
+				 &vmcoreinfo_attr.attr);
+}
+
+static void xen_sysfs_vmcoreinfo_destroy(void)
+{
+	sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
+}
+
+#endif
+
+static int __init hyper_sysfs_init(void)
+{
+	int ret;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	ret = xen_sysfs_type_init();
+	if (ret)
+		goto out;
+	ret = xen_sysfs_version_init();
+	if (ret)
+		goto version_out;
+	ret = xen_compilation_init();
+	if (ret)
+		goto comp_out;
+	ret = xen_sysfs_uuid_init();
+	if (ret)
+		goto uuid_out;
+	ret = xen_properties_init();
+	if (ret)
+		goto prop_out;
+#ifdef CONFIG_KEXEC
+	if (vmcoreinfo_size_xen != 0) {
+		ret = xen_sysfs_vmcoreinfo_init();
+		if (ret)
+			goto vmcoreinfo_out;
+	}
+#endif
+
+	goto out;
+
+#ifdef CONFIG_KEXEC
+vmcoreinfo_out:
+#endif
+	xen_properties_destroy();
+prop_out:
+	xen_sysfs_uuid_destroy();
+uuid_out:
+	xen_compilation_destroy();
+comp_out:
+	xen_sysfs_version_destroy();
+version_out:
+	xen_sysfs_type_destroy();
+out:
+	return ret;
+}
+
+static void __exit hyper_sysfs_exit(void)
+{
+#ifdef CONFIG_KEXEC
+	if (vmcoreinfo_size_xen != 0)
+		xen_sysfs_vmcoreinfo_destroy();
+#endif
+	xen_properties_destroy();
+	xen_compilation_destroy();
+	xen_sysfs_uuid_destroy();
+	xen_sysfs_version_destroy();
+	xen_sysfs_type_destroy();
+
+}
+module_init(hyper_sysfs_init);
+module_exit(hyper_sysfs_exit);
+
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+			      struct attribute *attr,
+			      char *buffer)
+{
+	struct hyp_sysfs_attr *hyp_attr;
+	hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+	if (hyp_attr->show)
+		return hyp_attr->show(hyp_attr, buffer);
+	return 0;
+}
+
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+			       struct attribute *attr,
+			       const char *buffer,
+			       size_t len)
+{
+	struct hyp_sysfs_attr *hyp_attr;
+	hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+	if (hyp_attr->store)
+		return hyp_attr->store(hyp_attr, buffer, len);
+	return 0;
+}
+
+static struct sysfs_ops hyp_sysfs_ops = {
+	.show = hyp_sysfs_show,
+	.store = hyp_sysfs_store,
+};
+
+static struct kobj_type hyp_sysfs_kobj_type = {
+	.sysfs_ops = &hyp_sysfs_ops,
+};
+
+static int __init hypervisor_subsys_init(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+	return 0;
+}
+device_initcall(hypervisor_subsys_init);
diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h
index 453235e923f0b..e8b6519d47e9d 100644
--- a/include/xen/interface/version.h
+++ b/include/xen/interface/version.h
@@ -57,4 +57,7 @@ struct xen_feature_info {
 /* Declares the features reported by XENVER_get_features. */
 #include "features.h"
 
+/* arg == NULL; returns host memory page size. */
+#define XENVER_pagesize 7
+
 #endif /* __XEN_PUBLIC_VERSION_H__ */
-- 
GitLab


From a649b720614d5675dc402bef75a92576143fede7 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 10 Mar 2009 17:17:41 -0700
Subject: [PATCH 0150/2198] xen/sys/hypervisor: change writable_pt to features

/sys/hypervisor/properties/writable_pt was misnamed.  Rename to features,
expressed as a bit array in hex.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/sys-hypervisor.c | 41 +++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index cb29d1ccf0d98..1267d6fcc0c72 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -293,37 +293,48 @@ static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
 
 HYPERVISOR_ATTR_RO(pagesize);
 
-/* eventually there will be several more features to export */
 static ssize_t xen_feature_show(int index, char *buffer)
 {
-	int ret = -ENOMEM;
-	struct xen_feature_info *info;
+	ssize_t ret;
+	struct xen_feature_info info;
 
-	info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL);
-	if (info) {
-		info->submap_idx = index;
-		ret = HYPERVISOR_xen_version(XENVER_get_features, info);
-		if (!ret)
-			ret = sprintf(buffer, "%d\n", info->submap);
-		kfree(info);
-	}
+	info.submap_idx = index;
+	ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
+	if (!ret)
+		ret = sprintf(buffer, "%08x", info.submap);
 
 	return ret;
 }
 
-static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer)
+static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
 {
-	return xen_feature_show(XENFEAT_writable_page_tables, buffer);
+	ssize_t len;
+	int i;
+
+	len = 0;
+	for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
+		int ret = xen_feature_show(i, buffer + len);
+		if (ret < 0) {
+			if (len == 0)
+				len = ret;
+			break;
+		}
+		len += ret;
+	}
+	if (len > 0)
+		buffer[len++] = '\n';
+
+	return len;
 }
 
-HYPERVISOR_ATTR_RO(writable_pt);
+HYPERVISOR_ATTR_RO(features);
 
 static struct attribute *xen_properties_attrs[] = {
 	&capabilities_attr.attr,
 	&changeset_attr.attr,
 	&virtual_start_attr.attr,
 	&pagesize_attr.attr,
-	&writable_pt_attr.attr,
+	&features_attr.attr,
 	NULL
 };
 
-- 
GitLab


From f0783708bf63a2827863cf2be57c08a24843e6bd Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Wed, 11 Mar 2009 10:19:54 +0000
Subject: [PATCH 0151/2198] xen: drop kexec bits from /sys/hypervisor since
 kexec isn't implemented yet

I needed this to compile since there is no kexec yet in pvops kernel
  CC      drivers/xen/sys-hypervisor.o
drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_init':
drivers/xen/sys-hypervisor.c:405: error: 'vmcoreinfo_size_xen' undeclared (first use in this function)
drivers/xen/sys-hypervisor.c:405: error: (Each undeclared identifier is reported only once
drivers/xen/sys-hypervisor.c:405: error: for each function it appears in.)
drivers/xen/sys-hypervisor.c:406: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_init'
drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_exit':
drivers/xen/sys-hypervisor.c:433: error: 'vmcoreinfo_size_xen' undeclared (first use in this function)
drivers/xen/sys-hypervisor.c:434: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_destroy'

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 drivers/xen/sys-hypervisor.c | 41 ------------------------------------
 1 file changed, 41 deletions(-)

diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
index 1267d6fcc0c72..88a60e03ccf07 100644
--- a/drivers/xen/sys-hypervisor.c
+++ b/drivers/xen/sys-hypervisor.c
@@ -353,32 +353,6 @@ static void xen_properties_destroy(void)
 	sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
 }
 
-#ifdef CONFIG_KEXEC
-
-extern size_t vmcoreinfo_size_xen;
-extern unsigned long paddr_vmcoreinfo_xen;
-
-static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page)
-{
-	return sprintf(page, "%lx %zx\n",
-		paddr_vmcoreinfo_xen, vmcoreinfo_size_xen);
-}
-
-HYPERVISOR_ATTR_RO(vmcoreinfo);
-
-static int __init xen_sysfs_vmcoreinfo_init(void)
-{
-	return sysfs_create_file(hypervisor_kobj,
-				 &vmcoreinfo_attr.attr);
-}
-
-static void xen_sysfs_vmcoreinfo_destroy(void)
-{
-	sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr);
-}
-
-#endif
-
 static int __init hyper_sysfs_init(void)
 {
 	int ret;
@@ -401,20 +375,9 @@ static int __init hyper_sysfs_init(void)
 	ret = xen_properties_init();
 	if (ret)
 		goto prop_out;
-#ifdef CONFIG_KEXEC
-	if (vmcoreinfo_size_xen != 0) {
-		ret = xen_sysfs_vmcoreinfo_init();
-		if (ret)
-			goto vmcoreinfo_out;
-	}
-#endif
 
 	goto out;
 
-#ifdef CONFIG_KEXEC
-vmcoreinfo_out:
-#endif
-	xen_properties_destroy();
 prop_out:
 	xen_sysfs_uuid_destroy();
 uuid_out:
@@ -429,10 +392,6 @@ static int __init hyper_sysfs_init(void)
 
 static void __exit hyper_sysfs_exit(void)
 {
-#ifdef CONFIG_KEXEC
-	if (vmcoreinfo_size_xen != 0)
-		xen_sysfs_vmcoreinfo_destroy();
-#endif
 	xen_properties_destroy();
 	xen_compilation_destroy();
 	xen_sysfs_uuid_destroy();
-- 
GitLab


From 818fd20673df82031e604bb784d836f1fc2e2451 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Fri, 6 Feb 2009 18:46:47 -0800
Subject: [PATCH 0152/2198] xen: add "capabilities" file

The xenfs capabilities file allows usermode to determine what
capabilities the domain has.  The only one at present is "control_d"
in a privileged domain.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 drivers/xen/xenfs/super.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c
index 515741a8e6b8e..6559e0c752ce1 100644
--- a/drivers/xen/xenfs/super.c
+++ b/drivers/xen/xenfs/super.c
@@ -20,10 +20,27 @@
 MODULE_DESCRIPTION("Xen filesystem");
 MODULE_LICENSE("GPL");
 
+static ssize_t capabilities_read(struct file *file, char __user *buf,
+				 size_t size, loff_t *off)
+{
+	char *tmp = "";
+
+	if (xen_initial_domain())
+		tmp = "control_d\n";
+
+	return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
+}
+
+static const struct file_operations capabilities_file_ops = {
+	.read = capabilities_read,
+};
+
 static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr xenfs_files[] = {
-		[2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR},
+		[1] = {},
+		{ "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+		{ "capabilities", &capabilities_file_ops, S_IRUGO },
 		{""},
 	};
 
-- 
GitLab


From 8a6f83afd0c5355db6d11394a798e94950306239 Mon Sep 17 00:00:00 2001
From: KaiGai Kohei <kaigai@ak.jp.nec.com>
Date: Wed, 1 Apr 2009 10:07:57 +0900
Subject: [PATCH 0153/2198] Permissive domain in userspace object manager

This patch enables applications to handle permissive domain correctly.

Since the v2.6.26 kernel, SELinux has supported an idea of permissive
domain which allows certain processes to work as if permissive mode,
even if the global setting is enforcing mode.
However, we don't have an application program interface to inform
what domains are permissive one, and what domains are not.
It means applications focuses on SELinux (XACE/SELinux, SE-PostgreSQL
and so on) cannot handle permissive domain correctly.

This patch add the sixth field (flags) on the reply of the /selinux/access
interface which is used to make an access control decision from userspace.
If the first bit of the flags field is positive, it means the required
access control decision is on permissive domain, so application should
allow any required actions, as the kernel doing.

This patch also has a side benefit. The av_decision.flags is set at
context_struct_compute_av(). It enables to check required permissions
without read_lock(&policy_rwlock).

Signed-off-by: KaiGai Kohei <kaigai@ak.jp.nec.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Acked-by: Eric Paris <eparis@redhat.com>
--
 security/selinux/avc.c              |    2 +-
 security/selinux/include/security.h |    4 +++-
 security/selinux/selinuxfs.c        |    4 ++--
 security/selinux/ss/services.c      |   30 +++++-------------------------
 4 files changed, 11 insertions(+), 29 deletions(-)
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/selinux/avc.c              |  2 +-
 security/selinux/include/security.h |  4 +++-
 security/selinux/selinuxfs.c        |  4 ++--
 security/selinux/ss/services.c      | 30 +++++------------------------
 4 files changed, 11 insertions(+), 29 deletions(-)

diff --git a/security/selinux/avc.c b/security/selinux/avc.c
index 7f9b5fac87793..b2ab608598325 100644
--- a/security/selinux/avc.c
+++ b/security/selinux/avc.c
@@ -927,7 +927,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid,
 	if (denied) {
 		if (flags & AVC_STRICT)
 			rc = -EACCES;
-		else if (!selinux_enforcing || security_permissive_sid(ssid))
+		else if (!selinux_enforcing || (avd->flags & AVD_FLAGS_PERMISSIVE))
 			avc_update_node(AVC_CALLBACK_GRANT, requested, ssid,
 					tsid, tclass, avd->seqno);
 		else
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index 5c3434f7626fd..a7be3f01fb081 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -91,9 +91,11 @@ struct av_decision {
 	u32 auditallow;
 	u32 auditdeny;
 	u32 seqno;
+	u32 flags;
 };
 
-int security_permissive_sid(u32 sid);
+/* definitions of av_decision.flags */
+#define AVD_FLAGS_PERMISSIVE	0x0001
 
 int security_compute_av(u32 ssid, u32 tsid,
 	u16 tclass, u32 requested,
diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 2d5136ec3d545..8d4007fbe0e96 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -527,10 +527,10 @@ static ssize_t sel_write_access(struct file *file, char *buf, size_t size)
 		goto out2;
 
 	length = scnprintf(buf, SIMPLE_TRANSACTION_LIMIT,
-			  "%x %x %x %x %u",
+			  "%x %x %x %x %u %x",
 			  avd.allowed, 0xffffffff,
 			  avd.auditallow, avd.auditdeny,
-			  avd.seqno);
+			  avd.seqno, avd.flags);
 out2:
 	kfree(tcon);
 out:
diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c
index deeec6c013aef..500e6f78e1159 100644
--- a/security/selinux/ss/services.c
+++ b/security/selinux/ss/services.c
@@ -410,6 +410,7 @@ static int context_struct_compute_av(struct context *scontext,
 	avd->auditallow = 0;
 	avd->auditdeny = 0xffffffff;
 	avd->seqno = latest_granting;
+	avd->flags = 0;
 
 	/*
 	 * Check for all the invalid cases.
@@ -528,31 +529,6 @@ static int context_struct_compute_av(struct context *scontext,
 	return 0;
 }
 
-/*
- * Given a sid find if the type has the permissive flag set
- */
-int security_permissive_sid(u32 sid)
-{
-	struct context *context;
-	u32 type;
-	int rc;
-
-	read_lock(&policy_rwlock);
-
-	context = sidtab_search(&sidtab, sid);
-	BUG_ON(!context);
-
-	type = context->type;
-	/*
-	 * we are intentionally using type here, not type-1, the 0th bit may
-	 * someday indicate that we are globally setting permissive in policy.
-	 */
-	rc = ebitmap_get_bit(&policydb.permissive_map, type);
-
-	read_unlock(&policy_rwlock);
-	return rc;
-}
-
 static int security_validtrans_handle_fail(struct context *ocontext,
 					   struct context *ncontext,
 					   struct context *tcontext,
@@ -767,6 +743,10 @@ int security_compute_av(u32 ssid,
 
 	rc = context_struct_compute_av(scontext, tcontext, tclass,
 				       requested, avd);
+
+	/* permissive domain? */
+	if (ebitmap_get_bit(&policydb.permissive_map, scontext->type))
+	    avd->flags |= AVD_FLAGS_PERMISSIVE;
 out:
 	read_unlock(&policy_rwlock);
 	return rc;
-- 
GitLab


From 53152f957d4a5dfd537d17c823afeb1a2c03753e Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Thu, 2 Apr 2009 13:24:28 +0100
Subject: [PATCH 0154/2198] xen: honour VCPU availability on boot

If a VM is booted with offline VCPUs then unplug them during boot. Determining
the availability of a VCPU requires access to XenStore which is not available
at the point smp_prepare_cpus() is called, therefore we bring up all VCPUS
initially and unplug the offline ones as soon as XenStore becomes available.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
---
 drivers/xen/cpu_hotplug.c | 40 +++++++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c
index 974f56d1ebe1f..411cb1fc92795 100644
--- a/drivers/xen/cpu_hotplug.c
+++ b/drivers/xen/cpu_hotplug.c
@@ -21,29 +21,41 @@ static void disable_hotplug_cpu(int cpu)
 	cpu_clear(cpu, cpu_present_map);
 }
 
-static void vcpu_hotplug(unsigned int cpu)
+static int vcpu_online(unsigned int cpu)
 {
 	int err;
 	char dir[32], state[32];
 
-	if (!cpu_possible(cpu))
-		return;
-
 	sprintf(dir, "cpu/%u", cpu);
 	err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state);
 	if (err != 1) {
 		printk(KERN_ERR "XENBUS: Unable to read cpu state\n");
-		return;
+		return err;
 	}
 
-	if (strcmp(state, "online") == 0) {
+	if (strcmp(state, "online") == 0)
+		return 1;
+	else if (strcmp(state, "offline") == 0)
+		return 0;
+
+	printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu);
+	return -EINVAL;
+}
+static void vcpu_hotplug(unsigned int cpu)
+{
+	if (!cpu_possible(cpu))
+		return;
+
+	switch (vcpu_online(cpu)) {
+	case 1:
 		enable_hotplug_cpu(cpu);
-	} else if (strcmp(state, "offline") == 0) {
+		break;
+	case 0:
 		(void)cpu_down(cpu);
 		disable_hotplug_cpu(cpu);
-	} else {
-		printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n",
-		       state, cpu);
+		break;
+	default:
+		break;
 	}
 }
 
@@ -64,12 +76,20 @@ static void handle_vcpu_hotplug_event(struct xenbus_watch *watch,
 static int setup_cpu_watcher(struct notifier_block *notifier,
 			      unsigned long event, void *data)
 {
+	int cpu;
 	static struct xenbus_watch cpu_watch = {
 		.node = "cpu",
 		.callback = handle_vcpu_hotplug_event};
 
 	(void)register_xenbus_watch(&cpu_watch);
 
+	for_each_possible_cpu(cpu) {
+		if (vcpu_online(cpu) == 0) {
+			(void)cpu_down(cpu);
+			cpu_clear(cpu, cpu_present_map);
+		}
+	}
+
 	return NOTIFY_DONE;
 }
 
-- 
GitLab


From 3d43321b7015387cfebbe26436d0e9d299162ea1 Mon Sep 17 00:00:00 2001
From: Kees Cook <kees@ubuntu.com>
Date: Thu, 2 Apr 2009 15:49:29 -0700
Subject: [PATCH 0155/2198] modules: sysctl to block module loading

Implement a sysctl file that disables module-loading system-wide since
there is no longer a viable way to remove CAP_SYS_MODULE after the system
bounding capability set was removed in 2.6.25.

Value can only be set to "1", and is tested only if standard capability
checks allow CAP_SYS_MODULE.  Given existing /dev/mem protections, this
should allow administrators a one-way method to block module loading
after initial boot-time module loading has finished.

Signed-off-by: Kees Cook <kees.cook@canonical.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/sysctl/kernel.txt | 11 +++++++++++
 kernel/module.c                 |  7 +++++--
 kernel/sysctl.c                 | 12 ++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index a4ccdd1981cfb..02b134956273f 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -30,6 +30,7 @@ show up in /proc/sys/kernel:
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
+- modules_disabled
 - msgmax
 - msgmnb
 - msgmni
@@ -179,6 +180,16 @@ kernel stack.
 
 ==============================================================
 
+modules_disabled:
+
+A toggle value indicating if modules are allowed to be loaded
+in an otherwise modular kernel.  This toggle defaults to off
+(0), but can be set true (1).  Once true, modules can be
+neither loaded nor unloaded, and the toggle cannot be set back
+to false.
+
+==============================================================
+
 osrelease, ostype & version:
 
 # cat osrelease
diff --git a/kernel/module.c b/kernel/module.c
index f77ac320d0b51..eeb3f7b1383c9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -778,6 +778,9 @@ static void wait_for_zero_refcount(struct module *mod)
 	mutex_lock(&module_mutex);
 }
 
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
@@ -785,7 +788,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 	char name[MODULE_NAME_LEN];
 	int ret, forced = 0;
 
-	if (!capable(CAP_SYS_MODULE))
+	if (!capable(CAP_SYS_MODULE) || modules_disabled)
 		return -EPERM;
 
 	if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -2349,7 +2352,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
 	int ret = 0;
 
 	/* Must have permission */
-	if (!capable(CAP_SYS_MODULE))
+	if (!capable(CAP_SYS_MODULE) || modules_disabled)
 		return -EPERM;
 
 	/* Only one module load at a time, please */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44ff850f5..2fb4246d27ded 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -113,6 +113,7 @@ static int ngroups_max = NGROUPS_MAX;
 
 #ifdef CONFIG_MODULES
 extern char modprobe_path[];
+extern int modules_disabled;
 #endif
 #ifdef CONFIG_CHR_DEV_SG
 extern int sg_big_buff;
@@ -533,6 +534,17 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dostring,
 		.strategy	= &sysctl_string,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "modules_disabled",
+		.data		= &modules_disabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		/* only handle a transition from default "0" to "1" */
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one,
+	},
 #endif
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
 	{
-- 
GitLab


From b5f22a59c0356655a501190959db9f7f5dd07e3f Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 2 Apr 2009 18:47:14 -0500
Subject: [PATCH 0156/2198] don't raise all privs on setuid-root file with fE
 set (v2)

Distributions face a backward compatibility problem with starting to use
file capabilities.  For instance, removing setuid root from ping and
doing setcap cap_net_raw=pe means that booting with an older kernel
or one compiled without file capabilities means ping won't work for
non-root users.

In order to replace the setuid root bit on a capability-unaware
program, one has to set the effective, or legacy, file capability,
which makes the capability effective immediately.  This patch
uses the legacy bit as a queue to not automatically add full
privilege to a setuid-root program.

So, with this patch, an ordinary setuid-root program will run with
privilege.  But if /bin/ping has both setuid-root and cap_net_raw in
fP and fE, then ping (when run by non-root user) will not run
with only cap_net_raw.

Changelog:
	Apr 2 2009: Print a message once when such a binary is loaded,
		as per James Morris' suggestion.
	Apr 2 2009: Fix the condition to only catch uid!=0 && euid==0.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/commoncap.c | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/security/commoncap.c b/security/commoncap.c
index 7cd61a5f52055..97ac1f1677176 100644
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -28,6 +28,28 @@
 #include <linux/prctl.h>
 #include <linux/securebits.h>
 
+/*
+ * If a non-root user executes a setuid-root binary in
+ * !secure(SECURE_NOROOT) mode, then we raise capabilities.
+ * However if fE is also set, then the intent is for only
+ * the file capabilities to be applied, and the setuid-root
+ * bit is left on either to change the uid (plausible) or
+ * to get full privilege on a kernel without file capabilities
+ * support.  So in that case we do not raise capabilities.
+ *
+ * Warn if that happens, once per boot.
+ */
+static void warn_setuid_and_fcaps_mixed(char *fname)
+{
+	static int warned;
+	if (!warned) {
+		printk(KERN_INFO "warning: `%s' has both setuid-root and"
+			" effective capabilities. Therefore not raising all"
+			" capabilities.\n", fname);
+		warned = 1;
+	}
+}
+
 int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
 	NETLINK_CB(skb).eff_cap = current_cap();
@@ -463,6 +485,15 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 		return ret;
 
 	if (!issecure(SECURE_NOROOT)) {
+		/*
+		 * If the legacy file capability is set, then don't set privs
+		 * for a setuid root binary run by a non-root user.  Do set it
+		 * for a root user just to cause least surprise to an admin.
+		 */
+		if (effective && new->uid != 0 && new->euid == 0) {
+			warn_setuid_and_fcaps_mixed(bprm->filename);
+			goto skip;
+		}
 		/*
 		 * To support inheritance of root-permissions and suid-root
 		 * executables under compatibility mode, we override the
@@ -478,6 +509,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
 		if (new->euid == 0)
 			effective = true;
 	}
+skip:
 
 	/* Don't let someone trace a set[ug]id/setpcap binary with the revised
 	 * credentials unless they have the appropriate permit
-- 
GitLab


From 31c9a24ec82926fcae49483e53566d231e705057 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 21:06:25 -0700
Subject: [PATCH 0157/2198] RCU: make treercu be default

Impact: switch default config from CLASSIC_RCU to TREE_RCU

Given that I have not gotten any complaints or bug reports on treercu
recently, this patch makes it be the default.  There are a number of
other defconfig files that explicitly call out CLASSIC_RCU, but which
have comment headers saying not to edit them.  Probably holdovers from
one of the flavors of "make config", but...

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: niv@us.ibm.com
Cc: manfred@colorfullife.com
Cc: peterz@infradead.org
LKML-Reference: <20090403040625.GA9473@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index 14c483d2b7c90..26ac8772464df 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -302,7 +302,7 @@ menu "RCU Subsystem"
 
 choice
 	prompt "RCU Implementation"
-	default CLASSIC_RCU
+	default TREE_RCU
 
 config CLASSIC_RCU
 	bool "Classic RCU"
-- 
GitLab


From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:28 +0100
Subject: [PATCH 0158/2198] perf_counter: x86: fix 32-bit irq_period assumption

No need to assume the irq_period is 32bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 155bc3c239b63..1cedc3468ce5b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s32 period = hwc->irq_period;
+	s64 period = hwc->irq_period;
 	int err;
 
 	/*
-- 
GitLab


From 755642322aa66fbc5421a35fd3e1733f73e20083 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:29 +0100
Subject: [PATCH 0159/2198] perf_counter: use list_move_tail()

Instead of del/add use a move list-op.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b2e838959f3e4..0fe22c916e294 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -89,8 +89,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	list_for_each_entry_safe(sibling, tmp,
 				 &counter->sibling_list, list_entry) {
 
-		list_del_init(&sibling->list_entry);
-		list_add_tail(&sibling->list_entry, &ctx->counter_list);
+		list_move_tail(&sibling->list_entry, &ctx->counter_list);
 		sibling->group_leader = sibling;
 	}
 }
@@ -959,8 +958,7 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 	 */
 	perf_flags = hw_perf_save_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		list_del(&counter->list_entry);
-		list_add_tail(&counter->list_entry, &ctx->counter_list);
+		list_move_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
 	hw_perf_restore(perf_flags);
-- 
GitLab


From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:30 +0100
Subject: [PATCH 0160/2198] perf_counter: add comment to barrier

We need to ensure the enabled=0 write happens before we
start disabling the actual counters, so that a pcm_amd_enable()
will not enable one underneath us.

I think the race is impossible anyway, we always balance the
ops within any one context and perform enable() with IRQs disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1cedc3468ce5b..a2e3b76bfdc15 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void)
 
 	enabled = cpuc->enabled;
 	cpuc->enabled = 0;
+	/*
+	 * ensure we write the disable before we start disabling the
+	 * counters proper, so that pcm_amd_enable() does the right thing.
+	 */
 	barrier();
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-- 
GitLab


From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:31 +0100
Subject: [PATCH 0161/2198] perf_counter: x86: use ULL postfix for 64bit
 constants

Fix a build warning on 32bit machines by explicitly marking the
constants as 64-bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a2e3b76bfdc15..22dab06c08a42 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event)
 
 static u64 pmc_intel_raw_event(u64 event)
 {
-#define CORE_EVNTSEL_EVENT_MASK		0x000000FF
-#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00
-#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000
+#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK 		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
@@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event)
 
 static u64 pmc_amd_raw_event(u64 event)
 {
-#define K7_EVNTSEL_EVENT_MASK	0x7000000FF
-#define K7_EVNTSEL_UNIT_MASK	0x00000FF00
-#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK			\
 	(K7_EVNTSEL_EVENT_MASK |	\
-- 
GitLab


From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:32 +0100
Subject: [PATCH 0162/2198] perf_counter: software counter event infrastructure

Provide generic software counter infrastructure that supports
software events.

This will be used to allow sample based profiling based on software
events such as pagefaults. The current infrastructure can only
provide a count of such events, no place information.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |   8 +-
 kernel/perf_counter.c        | 201 +++++++++++++++++++++++++++++++++++
 2 files changed, 208 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index dde564517b661..3fefc3b8150df 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -126,6 +126,7 @@ struct hw_perf_counter {
 	unsigned long			counter_base;
 	int				nmi;
 	unsigned int			idx;
+	atomic64_t			count; /* software */
 	atomic64_t			prev_count;
 	u64				irq_period;
 	atomic64_t			period_left;
@@ -283,6 +284,8 @@ static inline int is_software_counter(struct perf_counter *counter)
 	return !counter->hw_event.raw && counter->hw_event.type < 0;
 }
 
+extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -295,10 +298,13 @@ static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_notify(struct pt_regs *regs)		{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
-static inline void hw_perf_restore(u64 ctrl)			{ }
+static inline void hw_perf_restore(u64 ctrl)				{ }
 static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
+
+static inline void perf_swcounter_event(enum hw_event_types event, u64 nr,
+					int nmi, struct pt_regs *regs)	{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0fe22c916e294..eeb1b46cf7070 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1328,6 +1328,185 @@ static const struct file_operations perf_fops = {
 	.compat_ioctl		= perf_ioctl,
 };
 
+/*
+ * Generic software counter infrastructure
+ */
+
+static void perf_swcounter_update(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 prev, now;
+	s64 delta;
+
+again:
+	prev = atomic64_read(&hwc->prev_count);
+	now = atomic64_read(&hwc->count);
+	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev)
+		goto again;
+
+	delta = now - prev;
+
+	atomic64_add(delta, &counter->count);
+	atomic64_sub(delta, &hwc->period_left);
+}
+
+static void perf_swcounter_set_period(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	s64 left = atomic64_read(&hwc->period_left);
+	s64 period = hwc->irq_period;
+
+	if (unlikely(left <= -period)) {
+		left = period;
+		atomic64_set(&hwc->period_left, left);
+	}
+
+	if (unlikely(left <= 0)) {
+		left += period;
+		atomic64_add(period, &hwc->period_left);
+	}
+
+	atomic64_set(&hwc->prev_count, -left);
+	atomic64_set(&hwc->count, -left);
+}
+
+static void perf_swcounter_save_and_restart(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+	perf_swcounter_set_period(counter);
+}
+
+static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_swcounter_handle_group(struct perf_counter *sibling)
+{
+	struct perf_counter *counter, *group_leader = sibling->group_leader;
+
+	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
+		perf_swcounter_update(counter);
+		perf_swcounter_store_irq(sibling, counter->hw_event.type);
+		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
+	}
+}
+
+static void perf_swcounter_interrupt(struct perf_counter *counter,
+				     int nmi, struct pt_regs *regs)
+{
+	perf_swcounter_save_and_restart(counter);
+
+	switch (counter->hw_event.record_type) {
+	case PERF_RECORD_SIMPLE:
+		break;
+
+	case PERF_RECORD_IRQ:
+		perf_swcounter_store_irq(counter, instruction_pointer(regs));
+		break;
+
+	case PERF_RECORD_GROUP:
+		perf_swcounter_handle_group(counter);
+		break;
+	}
+
+	if (nmi) {
+		counter->wakeup_pending = 1;
+		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+	} else
+		wake_up(&counter->waitq);
+}
+
+static int perf_swcounter_match(struct perf_counter *counter,
+				enum hw_event_types event,
+				struct pt_regs *regs)
+{
+	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+		return 0;
+
+	if (counter->hw_event.raw)
+		return 0;
+
+	if (counter->hw_event.type != event)
+		return 0;
+
+	if (counter->hw_event.exclude_user && user_mode(regs))
+		return 0;
+
+	if (counter->hw_event.exclude_kernel && !user_mode(regs))
+		return 0;
+
+	return 1;
+}
+
+static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
+				     enum hw_event_types event, u64 nr,
+				     int nmi, struct pt_regs *regs)
+{
+	struct perf_counter *counter;
+	unsigned long flags;
+	int neg;
+
+	if (list_empty(&ctx->counter_list))
+		return;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+
+	/*
+	 * XXX: make counter_list RCU safe
+	 */
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (perf_swcounter_match(counter, event, regs)) {
+			neg = atomic64_add_negative(nr, &counter->hw.count);
+			if (counter->hw.irq_period && !neg)
+				perf_swcounter_interrupt(counter, nmi, regs);
+		}
+	}
+
+	spin_unlock_irqrestore(&ctx->lock, flags);
+}
+
+void perf_swcounter_event(enum hw_event_types event, u64 nr,
+			  int nmi, struct pt_regs *regs)
+{
+	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+
+	perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
+	if (cpuctx->task_ctx)
+		perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
+
+	put_cpu_var(perf_cpu_context);
+}
+
+static void perf_swcounter_read(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+}
+
+static int perf_swcounter_enable(struct perf_counter *counter)
+{
+	perf_swcounter_set_period(counter);
+	return 0;
+}
+
+static void perf_swcounter_disable(struct perf_counter *counter)
+{
+	perf_swcounter_update(counter);
+}
+
+/*
+ * Software counter: cpu wall time clock
+ */
+
 static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
@@ -1364,6 +1543,10 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
 	.read		= cpu_clock_perf_counter_read,
 };
 
+/*
+ * Software counter: task time clock
+ */
+
 /*
  * Called from within the scheduler:
  */
@@ -1420,6 +1603,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
+/*
+ * Software counter: page faults
+ */
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 #define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
 #else
@@ -1473,6 +1660,10 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = {
 	.read		= page_faults_perf_counter_read,
 };
 
+/*
+ * Software counter: context switches
+ */
+
 static u64 get_context_switches(struct perf_counter *counter)
 {
 	struct task_struct *curr = counter->ctx->task;
@@ -1521,6 +1712,10 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = {
 	.read		= context_switches_perf_counter_read,
 };
 
+/*
+ * Software counter: cpu migrations
+ */
+
 static inline u64 get_cpu_migrations(struct perf_counter *counter)
 {
 	struct task_struct *curr = counter->ctx->task;
@@ -1572,7 +1767,9 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
+	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	const struct hw_perf_counter_ops *hw_ops = NULL;
+	struct hw_perf_counter *hwc = &counter->hw;
 
 	/*
 	 * Software counters (currently) can't in general distinguish
@@ -1618,6 +1815,10 @@ sw_perf_counter_init(struct perf_counter *counter)
 	default:
 		break;
 	}
+
+	if (hw_ops)
+		hwc->irq_period = hw_event->irq_period;
+
 	return hw_ops;
 }
 
-- 
GitLab


From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:33 +0100
Subject: [PATCH 0163/2198] perf_counter: provide pagefault software events

We use the generic software counter infrastructure to provide
page fault events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/mm/fault.c |  3 +++
 arch/x86/mm/fault.c     |  3 +++
 kernel/perf_counter.c   | 53 +++--------------------------------------
 3 files changed, 9 insertions(+), 50 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 76993941cac95..eda5b0ca4af26 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -29,6 +29,7 @@
 #include <linux/module.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
+#include <linux/perf_counter.h>
 
 #include <asm/firmware.h>
 #include <asm/page.h>
@@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
 	 * kernel and should generate an OOPS.  Unfortunately, in the case of an
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a03b7279efa01..c8725752b6cdb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -27,6 +27,7 @@
 #include <linux/tty.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
+#include <linux/perf_counter.h>
 
 #include <asm-generic/sections.h>
 
@@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+
 	/*
 	 * If we're in an interrupt, have no user context or are running
 	 * in an atomic region then we must not take the fault:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index eeb1b46cf7070..1773c5d7427dc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1607,57 +1607,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
  * Software counter: page faults
  */
 
-#ifdef CONFIG_VM_EVENT_COUNTERS
-#define cpu_page_faults()	__get_cpu_var(vm_event_states).event[PGFAULT]
-#else
-#define cpu_page_faults()	0
-#endif
-
-static u64 get_page_faults(struct perf_counter *counter)
-{
-	struct task_struct *curr = counter->ctx->task;
-
-	if (curr)
-		return curr->maj_flt + curr->min_flt;
-	return cpu_page_faults();
-}
-
-static void page_faults_perf_counter_update(struct perf_counter *counter)
-{
-	u64 prev, now;
-	s64 delta;
-
-	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_page_faults(counter);
-
-	atomic64_set(&counter->hw.prev_count, now);
-
-	delta = now - prev;
-
-	atomic64_add(delta, &counter->count);
-}
-
-static void page_faults_perf_counter_read(struct perf_counter *counter)
-{
-	page_faults_perf_counter_update(counter);
-}
-
-static int page_faults_perf_counter_enable(struct perf_counter *counter)
-{
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count, get_page_faults(counter));
-	return 0;
-}
-
-static void page_faults_perf_counter_disable(struct perf_counter *counter)
-{
-	page_faults_perf_counter_update(counter);
-}
-
 static const struct hw_perf_counter_ops perf_ops_page_faults = {
-	.enable		= page_faults_perf_counter_enable,
-	.disable	= page_faults_perf_counter_disable,
-	.read		= page_faults_perf_counter_read,
+	.enable		= perf_swcounter_enable,
+	.disable	= perf_swcounter_disable,
+	.read		= perf_swcounter_read,
 };
 
 /*
-- 
GitLab


From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:34 +0100
Subject: [PATCH 0164/2198] perf_counter: provide major/minor page fault
 software events

Provide separate sw counters for major and minor page faults.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/mm/fault.c      |  5 ++++-
 arch/x86/mm/fault.c          |  7 +++++--
 include/linux/perf_counter.h |  4 +++-
 kernel/perf_counter.c        | 22 +++++++++-------------
 4 files changed, 21 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index eda5b0ca4af26..17bbf6f91fbe6 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -312,6 +312,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -319,8 +320,10 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 			preempt_enable();
 		}
 #endif
-	} else
+	} else {
 		current->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+	}
 	up_read(&mm->mmap_sem);
 	return 0;
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index c8725752b6cdb..f2d3324d92152 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1140,10 +1140,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 		return;
 	}
 
-	if (fault & VM_FAULT_MAJOR)
+	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-	else
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+	} else {
 		tsk->min_flt++;
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+	}
 
 	check_v8086_mode(regs, address, tsk);
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3fefc3b8150df..4b14a8e9dbf5d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -49,8 +49,10 @@ enum hw_event_types {
 	PERF_COUNT_PAGE_FAULTS		= -3,
 	PERF_COUNT_CONTEXT_SWITCHES	= -4,
 	PERF_COUNT_CPU_MIGRATIONS	= -5,
+	PERF_COUNT_PAGE_FAULTS_MIN	= -6,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= -7,
 
-	PERF_SW_EVENTS_MIN		= -6,
+	PERF_SW_EVENTS_MIN		= -8,
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 1773c5d7427dc..68950a3a52bfb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter)
 	perf_swcounter_update(counter);
 }
 
+static const struct hw_perf_counter_ops perf_ops_generic = {
+	.enable		= perf_swcounter_enable,
+	.disable	= perf_swcounter_disable,
+	.read		= perf_swcounter_read,
+};
+
 /*
  * Software counter: cpu wall time clock
  */
@@ -1603,16 +1609,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
-/*
- * Software counter: page faults
- */
-
-static const struct hw_perf_counter_ops perf_ops_page_faults = {
-	.enable		= perf_swcounter_enable,
-	.disable	= perf_swcounter_disable,
-	.read		= perf_swcounter_read,
-};
-
 /*
  * Software counter: context switches
  */
@@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_cpu_clock;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
-		if (!(counter->hw_event.exclude_user ||
-		      counter->hw_event.exclude_kernel))
-			hw_ops = &perf_ops_page_faults;
+	case PERF_COUNT_PAGE_FAULTS_MIN:
+	case PERF_COUNT_PAGE_FAULTS_MAJ:
+		hw_ops = &perf_ops_generic;
 		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
 		if (!counter->hw_event.exclude_kernel)
-- 
GitLab


From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:35 +0100
Subject: [PATCH 0165/2198] perf_counter: hrtimer based sampling for software
 time events

Use hrtimers to profile timer based sampling for the software time
counters.

This allows platforms without hardware counter support to still
perform sample based profiling.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  20 ++++--
 kernel/perf_counter.c        | 123 ++++++++++++++++++++++++-----------
 2 files changed, 100 insertions(+), 43 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4b14a8e9dbf5d..dfb4c7ce18b3b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -114,6 +114,7 @@ struct perf_counter_hw_event {
 #include <linux/rculist.h>
 #include <linux/rcupdate.h>
 #include <linux/spinlock.h>
+#include <linux/hrtimer.h>
 #include <asm/atomic.h>
 
 struct task_struct;
@@ -123,12 +124,19 @@ struct task_struct;
  */
 struct hw_perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
-	u64				config;
-	unsigned long			config_base;
-	unsigned long			counter_base;
-	int				nmi;
-	unsigned int			idx;
-	atomic64_t			count; /* software */
+	union {
+		struct { /* hardware */
+			u64				config;
+			unsigned long			config_base;
+			unsigned long			counter_base;
+			int				nmi;
+			unsigned int			idx;
+		};
+		union { /* software */
+			atomic64_t			count;
+			struct hrtimer			hrtimer;
+		};
+	};
 	atomic64_t			prev_count;
 	u64				irq_period;
 	atomic64_t			period_left;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 68950a3a52bfb..f9330d5827cfc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1395,7 +1395,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling)
 	struct perf_counter *counter, *group_leader = sibling->group_leader;
 
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		perf_swcounter_update(counter);
+		counter->hw_ops->read(counter);
 		perf_swcounter_store_irq(sibling, counter->hw_event.type);
 		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
 	}
@@ -1404,8 +1404,6 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling)
 static void perf_swcounter_interrupt(struct perf_counter *counter,
 				     int nmi, struct pt_regs *regs)
 {
-	perf_swcounter_save_and_restart(counter);
-
 	switch (counter->hw_event.record_type) {
 	case PERF_RECORD_SIMPLE:
 		break;
@@ -1426,6 +1424,38 @@ static void perf_swcounter_interrupt(struct perf_counter *counter,
 		wake_up(&counter->waitq);
 }
 
+static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
+{
+	struct perf_counter *counter;
+	struct pt_regs *regs;
+
+	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
+	counter->hw_ops->read(counter);
+
+	regs = get_irq_regs();
+	/*
+	 * In case we exclude kernel IPs or are somehow not in interrupt
+	 * context, provide the next best thing, the user IP.
+	 */
+	if ((counter->hw_event.exclude_kernel || !regs) &&
+			!counter->hw_event.exclude_user)
+		regs = task_pt_regs(current);
+
+	if (regs)
+		perf_swcounter_interrupt(counter, 0, regs);
+
+	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
+
+	return HRTIMER_RESTART;
+}
+
+static void perf_swcounter_overflow(struct perf_counter *counter,
+				    int nmi, struct pt_regs *regs)
+{
+	perf_swcounter_save_and_restart(counter);
+	perf_swcounter_interrupt(counter, nmi, regs);
+}
+
 static int perf_swcounter_match(struct perf_counter *counter,
 				enum hw_event_types event,
 				struct pt_regs *regs)
@@ -1448,13 +1478,20 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	return 1;
 }
 
+static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
+			       int nmi, struct pt_regs *regs)
+{
+	int neg = atomic64_add_negative(nr, &counter->hw.count);
+	if (counter->hw.irq_period && !neg)
+		perf_swcounter_overflow(counter, nmi, regs);
+}
+
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum hw_event_types event, u64 nr,
 				     int nmi, struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 	unsigned long flags;
-	int neg;
 
 	if (list_empty(&ctx->counter_list))
 		return;
@@ -1465,11 +1502,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 	 * XXX: make counter_list RCU safe
 	 */
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (perf_swcounter_match(counter, event, regs)) {
-			neg = atomic64_add_negative(nr, &counter->hw.count);
-			if (counter->hw.irq_period && !neg)
-				perf_swcounter_interrupt(counter, nmi, regs);
-		}
+		if (perf_swcounter_match(counter, event, regs))
+			perf_swcounter_add(counter, nr, nmi, regs);
 	}
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
@@ -1513,14 +1547,6 @@ static const struct hw_perf_counter_ops perf_ops_generic = {
  * Software counter: cpu wall time clock
  */
 
-static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
-{
-	int cpu = raw_smp_processor_id();
-
-	atomic64_set(&counter->hw.prev_count, cpu_clock(cpu));
-	return 0;
-}
-
 static void cpu_clock_perf_counter_update(struct perf_counter *counter)
 {
 	int cpu = raw_smp_processor_id();
@@ -1533,8 +1559,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter)
 	atomic64_add(now - prev, &counter->count);
 }
 
+static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	int cpu = raw_smp_processor_id();
+
+	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	if (hwc->irq_period) {
+		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		hwc->hrtimer.function = perf_swcounter_hrtimer;
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(hwc->irq_period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+
+	return 0;
+}
+
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
+	hrtimer_cancel(&counter->hw.hrtimer);
 	cpu_clock_perf_counter_update(counter);
 }
 
@@ -1580,27 +1624,33 @@ static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now
 	atomic64_add(delta, &counter->count);
 }
 
-static void task_clock_perf_counter_read(struct perf_counter *counter)
-{
-	u64 now = task_clock_perf_counter_val(counter, 1);
-
-	task_clock_perf_counter_update(counter, now);
-}
-
 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count,
-			     task_clock_perf_counter_val(counter, 0));
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
+	if (hwc->irq_period) {
+		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		hwc->hrtimer.function = perf_swcounter_hrtimer;
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(hwc->irq_period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
 
 	return 0;
 }
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	u64 now = task_clock_perf_counter_val(counter, 0);
+	hrtimer_cancel(&counter->hw.hrtimer);
+	task_clock_perf_counter_update(counter,
+			task_clock_perf_counter_val(counter, 0));
+}
 
-	task_clock_perf_counter_update(counter, now);
+static void task_clock_perf_counter_read(struct perf_counter *counter)
+{
+	task_clock_perf_counter_update(counter,
+			task_clock_perf_counter_val(counter, 1));
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
@@ -1729,16 +1779,12 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 */
 	switch (counter->hw_event.type) {
 	case PERF_COUNT_CPU_CLOCK:
-		if (!(counter->hw_event.exclude_user ||
-		      counter->hw_event.exclude_kernel ||
-		      counter->hw_event.exclude_hv))
-			hw_ops = &perf_ops_cpu_clock;
+		hw_ops = &perf_ops_cpu_clock;
+
+		if (hw_event->irq_period && hw_event->irq_period < 10000)
+			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
-		if (counter->hw_event.exclude_user ||
-		    counter->hw_event.exclude_kernel ||
-		    counter->hw_event.exclude_hv)
-			break;
 		/*
 		 * If the user instantiates this as a per-cpu counter,
 		 * use the cpu_clock counter instead.
@@ -1747,6 +1793,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_task_clock;
 		else
 			hw_ops = &perf_ops_cpu_clock;
+
+		if (hw_event->irq_period && hw_event->irq_period < 10000)
+			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 	case PERF_COUNT_PAGE_FAULTS_MIN:
-- 
GitLab


From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:36 +0100
Subject: [PATCH 0166/2198] perf_counter: add an event_list

I noticed that the counter_list only includes top-level counters, thus
perf_swcounter_event() will miss sw-counters in groups.

Since perf_swcounter_event() also wants an RCU safe list, create a new
event_list that includes all counters and uses RCU list ops and use call_rcu
to free the counter structure.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 ++++
 kernel/perf_counter.c        | 30 +++++++++++++++++++-----------
 2 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index dfb4c7ce18b3b..08c11a6afebc1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -187,6 +187,7 @@ struct file;
 struct perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
 	struct list_head		list_entry;
+	struct list_head		event_entry;
 	struct list_head		sibling_list;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
@@ -220,6 +221,8 @@ struct perf_counter {
 	struct perf_data		*irqdata;
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
+
+	struct rcu_head			rcu_head;
 #endif
 };
 
@@ -243,6 +246,7 @@ struct perf_counter_context {
 	struct mutex		mutex;
 
 	struct list_head	counter_list;
+	struct list_head	event_list;
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f9330d5827cfc..8d6ecfa64c0e5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -22,6 +22,7 @@
 #include <linux/perf_counter.h>
 #include <linux/mm.h>
 #include <linux/vmstat.h>
+#include <linux/rculist.h>
 
 /*
  * Each CPU has a list of per CPU counters:
@@ -72,6 +73,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 	else
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+
+	list_add_rcu(&counter->event_entry, &ctx->event_list);
 }
 
 static void
@@ -80,6 +83,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	struct perf_counter *sibling, *tmp;
 
 	list_del_init(&counter->list_entry);
+	list_del_rcu(&counter->event_entry);
 
 	/*
 	 * If this was a group counter with sibling counters then
@@ -1133,6 +1137,14 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	return ctx;
 }
 
+static void free_counter_rcu(struct rcu_head *head)
+{
+	struct perf_counter *counter;
+
+	counter = container_of(head, struct perf_counter, rcu_head);
+	kfree(counter);
+}
+
 /*
  * Called when the last reference to the file is gone.
  */
@@ -1151,7 +1163,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
-	kfree(counter);
+	call_rcu(&counter->rcu_head, free_counter_rcu);
 	put_context(ctx);
 
 	return 0;
@@ -1491,22 +1503,16 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     int nmi, struct pt_regs *regs)
 {
 	struct perf_counter *counter;
-	unsigned long flags;
 
-	if (list_empty(&ctx->counter_list))
+	if (list_empty(&ctx->event_list))
 		return;
 
-	spin_lock_irqsave(&ctx->lock, flags);
-
-	/*
-	 * XXX: make counter_list RCU safe
-	 */
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
 		if (perf_swcounter_match(counter, event, regs))
 			perf_swcounter_add(counter, nr, nmi, regs);
 	}
-
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	rcu_read_unlock();
 }
 
 void perf_swcounter_event(enum hw_event_types event, u64 nr,
@@ -1846,6 +1852,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	mutex_init(&counter->mutex);
 	INIT_LIST_HEAD(&counter->list_entry);
+	INIT_LIST_HEAD(&counter->event_entry);
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
@@ -1992,6 +1999,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	spin_lock_init(&ctx->lock);
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
+	INIT_LIST_HEAD(&ctx->event_list);
 	ctx->task = task;
 }
 
-- 
GitLab


From 039fc91e064b81c2820ff16c304be5aba35fd126 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 16:43:47 +0100
Subject: [PATCH 0167/2198] perf_counter: fix hrtimer sampling

Impact: fix deadlock with perfstat

Fix for the perfstat fubar..

We cannot unconditionally call hrtimer_cancel() without ever having done
hrtimer_init() on the thing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <1236959027.22447.149.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d6ecfa64c0e5..d6cc22271ef45 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1571,9 +1571,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 	int cpu = raw_smp_processor_id();
 
 	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
-		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		hwc->hrtimer.function = perf_swcounter_hrtimer;
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(hwc->irq_period), 0,
 				HRTIMER_MODE_REL, 0);
@@ -1635,9 +1635,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 
 	atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
-		hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-		hwc->hrtimer.function = perf_swcounter_hrtimer;
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(hwc->irq_period), 0,
 				HRTIMER_MODE_REL, 0);
-- 
GitLab


From 4e193bd4dfdc983d12969b51439b4a1fbaf2daad Mon Sep 17 00:00:00 2001
From: Tim Blechmann <tim@klingt.org>
Date: Sat, 14 Mar 2009 14:29:25 +0100
Subject: [PATCH 0168/2198] perf_counter: include missing header

Impact: build fix

In order to compile a kernel with performance counter patches,
<asm/irq_regs.h> has to be included to provide the declaration of
struct pt_regs *get_irq_regs(void);

[ This bug was masked by unrelated x86 header file changes in the
  x86 tree, but occurs in the tip:perfcounters/core standalone
  tree. ]

Signed-off-by: Tim Blechmann <tim@klingt.org>
Orig-LKML-Reference: <20090314142925.49c29c17@thinkpad>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d6cc22271ef45..0018c5e81249e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -24,6 +24,8 @@
 #include <linux/vmstat.h>
 #include <linux/rculist.h>
 
+#include <asm/irq_regs.h>
+
 /*
  * Each CPU has a list of per CPU counters:
  */
-- 
GitLab


From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Mar 2009 08:59:21 +0100
Subject: [PATCH 0169/2198] perf_counter: fix crash on perfmon v1 systems

Impact: fix boot crash on Intel Perfmon Version 1 systems

Intel Perfmon v1 does not support the global MSRs, nor does
it offer the generalized MSR ranges. So support v2 and later
CPUs only.

Also mark pmc_ops as read-mostly - to avoid false cacheline
sharing.

Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 22dab06c08a42..6cba9d47b7114 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -57,12 +57,14 @@ struct pmc_x86_ops {
 	int		max_events;
 };
 
-static struct pmc_x86_ops *pmc_ops;
+static struct pmc_x86_ops *pmc_ops __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
+static __read_mostly int intel_perfmon_version;
+
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -613,7 +615,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) {
+	if (intel_perfmon_version >= 2) {
 		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = {
 
 static struct pmc_x86_ops *pmc_intel_init(void)
 {
+	union cpuid10_edx edx;
 	union cpuid10_eax eax;
-	unsigned int ebx;
 	unsigned int unused;
-	union cpuid10_edx edx;
+	unsigned int ebx;
 
 	/*
 	 * Check whether the Architectural PerfMon supports
@@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return NULL;
 
+	intel_perfmon_version = eax.split.version_id;
+	if (intel_perfmon_version < 2)
+		return NULL;
+
 	pr_info("Intel Performance Monitoring support detected.\n");
-	pr_info("... version:         %d\n", eax.split.version_id);
+	pr_info("... version:         %d\n", intel_perfmon_version);
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
-- 
GitLab


From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 16 Mar 2009 21:00:00 +1100
Subject: [PATCH 0170/2198] perf_counter: abstract wakeup flag setting in core
 to fix powerpc build

Impact: build fix for powerpc

Commit bd753921015e7905 ("perf_counter: software counter event
infrastructure") introduced a use of TIF_PERF_COUNTERS into the core
perfcounter code.  This breaks the build on powerpc because we use
a flag in a per-cpu area to signal wakeups on powerpc rather than
a thread_info flag, because the thread_info flags have to be
manipulated with atomic operations and are thus slower than per-cpu
flags.

This fixes the by changing the core to use an abstracted
set_perf_counter_pending() function, which is defined on x86 to set
the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag
(paca->perf_counter_pending).  It changes the previous powerpc
definition of set_perf_counter_pending to not take an argument and
adds a clear_perf_counter_pending, so as to simplify the definition
on x86.

On x86, set_perf_counter_pending() is defined as a macro.  Defining
it as a static inline in arch/x86/include/asm/perf_counters.h causes
compile failures because <asm/perf_counters.h> gets included early in
<linux/sched.h>, and the definitions of set_tsk_thread_flag etc. are
therefore not available in <asm/perf_counters.h>.  (On powerpc this
problem is avoided by defining set_perf_counter_pending etc. in
<asm/hw_irq.h>.)

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/include/asm/hw_irq.h   | 14 +++++++++++---
 arch/powerpc/kernel/irq.c           | 11 +++--------
 arch/powerpc/kernel/perf_counter.c  |  3 +--
 arch/x86/include/asm/perf_counter.h |  3 +++
 kernel/perf_counter.c               |  2 +-
 5 files changed, 19 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index b43076ff92c9b..cb32d571c9c7f 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -142,10 +142,17 @@ static inline unsigned long get_perf_counter_pending(void)
 	return x;
 }
 
-static inline void set_perf_counter_pending(int x)
+static inline void set_perf_counter_pending(void)
 {
 	asm volatile("stb %0,%1(13)" : :
-		"r" (x),
+		"r" (1),
+		"i" (offsetof(struct paca_struct, perf_counter_pending)));
+}
+
+static inline void clear_perf_counter_pending(void)
+{
+	asm volatile("stb %0,%1(13)" : :
+		"r" (0),
 		"i" (offsetof(struct paca_struct, perf_counter_pending)));
 }
 
@@ -158,7 +165,8 @@ static inline unsigned long get_perf_counter_pending(void)
 	return 0;
 }
 
-static inline void set_perf_counter_pending(int x) {}
+static inline void set_perf_counter_pending(void) {}
+static inline void clear_perf_counter_pending(void) {}
 static inline void perf_counter_do_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 0d2e37c57738f..469e9635ff04b 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -104,13 +104,6 @@ static inline notrace void set_soft_enabled(unsigned long enable)
 	: : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled)));
 }
 
-#ifdef CONFIG_PERF_COUNTERS
-notrace void __weak perf_counter_do_pending(void)
-{
-	set_perf_counter_pending(0);
-}
-#endif
-
 notrace void raw_local_irq_restore(unsigned long en)
 {
 	/*
@@ -142,8 +135,10 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending())
+	if (get_perf_counter_pending()) {
+		clear_perf_counter_pending();
 		perf_counter_do_pending();
+	}
 
 	/*
 	 * if (get_paca()->hard_enabled) return;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0e33d27cd4641..5008762e8bf4c 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -653,7 +653,6 @@ void perf_counter_do_pending(void)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 
-	set_perf_counter_pending(0);
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter && counter->wakeup_pending) {
@@ -811,7 +810,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 			perf_counter_do_pending();
 			irq_exit();
 		} else {
-			set_perf_counter_pending(1);
+			set_perf_counter_pending();
 		}
 	}
 }
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 2e08ed736647e..1662043b340fb 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,6 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
+#define set_perf_counter_pending()	\
+		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
 extern void perf_counters_lapic_init(int nmi);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0018c5e81249e..b39456ad74a14 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1433,7 +1433,7 @@ static void perf_swcounter_interrupt(struct perf_counter *counter,
 
 	if (nmi) {
 		counter->wakeup_pending = 1;
-		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+		set_perf_counter_pending();
 	} else
 		wake_up(&counter->waitq);
 }
-- 
GitLab


From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:11 +0100
Subject: [PATCH 0171/2198] perf_counter: fix uninitialized usage of event_list

Impact: fix boot crash

When doing the generic context switch event I ran into some early
boot hangs, which were caused by inf func recursion (event, fault,
event, fault).

I eventually tracked it down to event_list not being initialized
at the time of the first event. Fix this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.195392657@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h | 2 ++
 kernel/perf_counter.c     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 219748d00262e..ca226a91abeea 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -124,6 +124,8 @@ extern struct cred init_cred;
 # define INIT_PERF_COUNTERS(tsk)					\
 	.perf_counter_ctx.counter_list =				\
 		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),	\
+	.perf_counter_ctx.event_list =					\
+		LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list),	\
 	.perf_counter_ctx.lock =					\
 		__SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
 #else
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b39456ad74a14..4c4e9eb37ab0c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1506,7 +1506,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 {
 	struct perf_counter *counter;
 
-	if (list_empty(&ctx->event_list))
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
 		return;
 
 	rcu_read_lock();
-- 
GitLab


From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:12 +0100
Subject: [PATCH 0172/2198] perf_counter: generic context switch event

Impact: cleanup

Use the generic software events for context switches.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.283522645@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 -
 kernel/perf_counter.c | 60 +++----------------------------------------
 kernel/sched.c        |  6 -----
 3 files changed, 4 insertions(+), 63 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75b2fc5306d8e..7ed41f7c5aced 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -138,7 +138,6 @@ extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
-extern u64 cpu_nr_switches(int cpu);
 extern u64 cpu_nr_migrations(int cpu);
 
 extern unsigned long get_parent_ip(unsigned long addr);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4c4e9eb37ab0c..99d5930f0a525 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -710,10 +710,13 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct pt_regs *regs;
 
 	if (likely(!cpuctx->task_ctx))
 		return;
 
+	regs = task_pt_regs(task);
+	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 	__perf_counter_sched_out(ctx, cpuctx);
 
 	cpuctx->task_ctx = NULL;
@@ -1667,58 +1670,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = {
 	.read		= task_clock_perf_counter_read,
 };
 
-/*
- * Software counter: context switches
- */
-
-static u64 get_context_switches(struct perf_counter *counter)
-{
-	struct task_struct *curr = counter->ctx->task;
-
-	if (curr)
-		return curr->nvcsw + curr->nivcsw;
-	return cpu_nr_switches(smp_processor_id());
-}
-
-static void context_switches_perf_counter_update(struct perf_counter *counter)
-{
-	u64 prev, now;
-	s64 delta;
-
-	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_context_switches(counter);
-
-	atomic64_set(&counter->hw.prev_count, now);
-
-	delta = now - prev;
-
-	atomic64_add(delta, &counter->count);
-}
-
-static void context_switches_perf_counter_read(struct perf_counter *counter)
-{
-	context_switches_perf_counter_update(counter);
-}
-
-static int context_switches_perf_counter_enable(struct perf_counter *counter)
-{
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count,
-			     get_context_switches(counter));
-	return 0;
-}
-
-static void context_switches_perf_counter_disable(struct perf_counter *counter)
-{
-	context_switches_perf_counter_update(counter);
-}
-
-static const struct hw_perf_counter_ops perf_ops_context_switches = {
-	.enable		= context_switches_perf_counter_enable,
-	.disable	= context_switches_perf_counter_disable,
-	.read		= context_switches_perf_counter_read,
-};
-
 /*
  * Software counter: cpu migrations
  */
@@ -1808,11 +1759,8 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_PAGE_FAULTS:
 	case PERF_COUNT_PAGE_FAULTS_MIN:
 	case PERF_COUNT_PAGE_FAULTS_MAJ:
-		hw_ops = &perf_ops_generic;
-		break;
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		if (!counter->hw_event.exclude_kernel)
-			hw_ops = &perf_ops_context_switches;
+		hw_ops = &perf_ops_generic;
 		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
 		if (!counter->hw_event.exclude_kernel)
diff --git a/kernel/sched.c b/kernel/sched.c
index 39e7086021697..f76e3c0188a2e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2900,14 +2900,8 @@ unsigned long nr_active(void)
 
 /*
  * Externally visible per-cpu scheduler statistics:
- * cpu_nr_switches(cpu) - number of context switches on that cpu
  * cpu_nr_migrations(cpu) - number of migrations into that cpu
  */
-u64 cpu_nr_switches(int cpu)
-{
-	return cpu_rq(cpu)->nr_switches;
-}
-
 u64 cpu_nr_migrations(int cpu)
 {
 	return cpu_rq(cpu)->nr_migrations_in;
-- 
GitLab


From f16009527595ee562308653bc3d0039166d2ab15 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:16 +0100
Subject: [PATCH 0173/2198] perf_counter: fix up counter free paths

Impact: fix crash during perfcounters use

I found another counter free path, create a free_counter() call to
accomodate generic tear-down.

Fixes an RCU bug.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.652078652@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 99d5930f0a525..97f891ffeb406 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1150,6 +1150,11 @@ static void free_counter_rcu(struct rcu_head *head)
 	kfree(counter);
 }
 
+static void free_counter(struct perf_counter *counter)
+{
+	call_rcu(&counter->rcu_head, free_counter_rcu);
+}
+
 /*
  * Called when the last reference to the file is gone.
  */
@@ -1168,7 +1173,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
-	call_rcu(&counter->rcu_head, free_counter_rcu);
+	free_counter(counter);
 	put_context(ctx);
 
 	return 0;
@@ -2128,10 +2133,10 @@ __perf_counter_exit_task(struct task_struct *child,
 					 list_entry) {
 			if (sub->parent) {
 				sync_child_counter(sub, sub->parent);
-				kfree(sub);
+				free_counter(sub);
 			}
 		}
-		kfree(child_counter);
+		free_counter(child_counter);
 	}
 }
 
-- 
GitLab


From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:17 +0100
Subject: [PATCH 0174/2198] perf_counter: hook up the tracepoint events

Impact: new perfcounters feature

Enable usage of tracepoints as perf counter events.

tracepoint event ids can be found in /debug/tracing/event/*/*/id
and (for now) are represented as -65536+id in the type field.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.744044174@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  3 +++
 init/Kconfig                 |  5 +++++
 kernel/perf_counter.c        | 43 ++++++++++++++++++++++++++++++++++++
 3 files changed, 51 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 08c11a6afebc1..065984c1ff579 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -53,6 +53,8 @@ enum hw_event_types {
 	PERF_COUNT_PAGE_FAULTS_MAJ	= -7,
 
 	PERF_SW_EVENTS_MIN		= -8,
+
+	PERF_TP_EVENTS_MIN		= -65536
 };
 
 /*
@@ -222,6 +224,7 @@ struct perf_counter {
 	struct perf_data		*usrdata;
 	struct perf_data		data[2];
 
+	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
 #endif
 };
diff --git a/init/Kconfig b/init/Kconfig
index 38a2ecd47c379..4f647142f2e64 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -947,6 +947,11 @@ config PERF_COUNTERS
 
 	  Say Y if unsure.
 
+config EVENT_PROFILE
+	bool "Tracepoint profile sources"
+	depends on PERF_COUNTERS && EVENT_TRACER
+	default y
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 97f891ffeb406..0bbe3e45ba0d5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1152,6 +1152,9 @@ static void free_counter_rcu(struct rcu_head *head)
 
 static void free_counter(struct perf_counter *counter)
 {
+	if (counter->destroy)
+		counter->destroy(counter);
+
 	call_rcu(&counter->rcu_head, free_counter_rcu);
 }
 
@@ -1727,6 +1730,45 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
 	.read		= cpu_migrations_perf_counter_read,
 };
 
+#ifdef CONFIG_EVENT_PROFILE
+void perf_tpcounter_event(int event_id)
+{
+	perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1,
+			task_pt_regs(current));
+}
+
+extern int ftrace_profile_enable(int);
+extern void ftrace_profile_disable(int);
+
+static void tp_perf_counter_destroy(struct perf_counter *counter)
+{
+	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
+
+	ftrace_profile_disable(event_id);
+}
+
+static const struct hw_perf_counter_ops *
+tp_perf_counter_init(struct perf_counter *counter)
+{
+	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
+	int ret;
+
+	ret = ftrace_profile_enable(event_id);
+	if (ret)
+		return NULL;
+
+	counter->destroy = tp_perf_counter_destroy;
+
+	return &perf_ops_generic;
+}
+#else
+static const struct hw_perf_counter_ops *
+tp_perf_counter_init(struct perf_counter *counter)
+{
+	return NULL;
+}
+#endif
+
 static const struct hw_perf_counter_ops *
 sw_perf_counter_init(struct perf_counter *counter)
 {
@@ -1772,6 +1814,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 			hw_ops = &perf_ops_cpu_migrations;
 		break;
 	default:
+		hw_ops = tp_perf_counter_init(counter);
 		break;
 	}
 
-- 
GitLab


From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:18 +0100
Subject: [PATCH 0175/2198] perf_counter: revamp syscall input ABI

Impact: modify ABI

The hardware/software classification in hw_event->type became a little
strained due to the addition of tracepoint tracing.

Instead split up the field and provide a type field to explicitly specify
the counter type, while using the event_id field to specify which event to
use.

Raw counters still work as before, only the raw config now goes into
raw_event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.836807573@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  4 +-
 arch/x86/kernel/cpu/perf_counter.c | 10 ++--
 include/linux/perf_counter.h       | 95 +++++++++++++++++++-----------
 kernel/perf_counter.c              | 83 +++++++++++++++-----------
 4 files changed, 117 insertions(+), 75 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5008762e8bf4c..26f69dc7130e9 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	ev = counter->hw_event.type;
+	ev = counter->hw_event.event_id;
 	if (!counter->hw_event.raw) {
 		if (ev >= ppmu->n_generic ||
 		    ppmu->generic_events[ev] == 0)
@@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter)
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_store_irq_data(counter, sub->hw_event.type);
+		perf_store_irq_data(counter, sub->hw_event.event_config);
 		perf_store_irq_data(counter, atomic64_read(&sub->count));
 	}
 }
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6cba9d47b7114..d844ae41d5a34 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event->raw) {
-		hwc->config |= pmc_ops->raw_event(hw_event->type);
+	if (hw_event->raw_type) {
+		hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
 	} else {
-		if (hw_event->type >= pmc_ops->max_events)
+		if (hw_event->event_id >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(hw_event->type);
+		hwc->config |= pmc_ops->event_map(hw_event->event_id);
 	}
 	counter->wakeup_pending = 0;
 
@@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
 
 		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-		perf_store_irq_data(sibling, counter->hw_event.type);
+		perf_store_irq_data(sibling, counter->hw_event.event_config);
 		perf_store_irq_data(sibling, atomic64_read(&counter->count));
 	}
 }
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 065984c1ff579..8f9394905502e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -21,56 +21,81 @@
  */
 
 /*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
+ * hw_event.type
  */
-enum hw_event_types {
+enum perf_event_types {
+	PERF_TYPE_HARDWARE		= 0,
+	PERF_TYPE_SOFTWARE		= 1,
+	PERF_TYPE_TRACEPOINT		= 2,
+
 	/*
-	 * Common hardware events, generalized by the kernel:
+	 * available TYPE space, raw is the max value.
 	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
 
-	PERF_HW_EVENTS_MAX		=  7,
+	PERF_TYPE_RAW			= 128,
+};
 
+/*
+ * Generalized performance counter event types, used by the hw_event.event_id
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_ids {
 	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
+	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-	PERF_COUNT_PAGE_FAULTS_MIN	= -6,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= -7,
-
-	PERF_SW_EVENTS_MIN		= -8,
+	PERF_COUNT_CPU_CYCLES		= 0,
+	PERF_COUNT_INSTRUCTIONS		= 1,
+	PERF_COUNT_CACHE_REFERENCES	= 2,
+	PERF_COUNT_CACHE_MISSES		= 3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_BRANCH_MISSES	= 5,
+	PERF_COUNT_BUS_CYCLES		= 6,
+
+	PERF_HW_EVENTS_MAX		= 7,
+};
 
-	PERF_TP_EVENTS_MIN		= -65536
+/*
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
+ */
+enum sw_event_ids {
+	PERF_COUNT_CPU_CLOCK		= 0,
+	PERF_COUNT_TASK_CLOCK		= 1,
+	PERF_COUNT_PAGE_FAULTS		= 2,
+	PERF_COUNT_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
+
+	PERF_SW_EVENTS_MAX		= 7,
 };
 
 /*
  * IRQ-notification data record type:
  */
 enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
+	PERF_RECORD_SIMPLE		= 0,
+	PERF_RECORD_IRQ			= 1,
+	PERF_RECORD_GROUP		= 2,
 };
 
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	__s64			type;
+	union {
+		struct {
+			__u64			event_id	: 56,
+						type		:  8;
+		};
+		struct {
+			__u64			raw_event_id	: 63,
+						raw_type	:  1;
+		};
+		__u64		event_config;
+	};
 
 	__u64			irq_period;
 	__u64			record_type;
@@ -78,7 +103,6 @@ struct perf_counter_hw_event {
 
 	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
 				inherit	       :  1, /* children inherit it   */
 				pinned	       :  1, /* must always be on PMU */
 				exclusive      :  1, /* only group on PMU     */
@@ -87,7 +111,7 @@ struct perf_counter_hw_event {
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
 
-				__reserved_1   : 54;
+				__reserved_1   : 55;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !counter->hw_event.raw && counter->hw_event.type < 0;
+	return !counter->hw_event.raw_type &&
+		counter->hw_event.type != PERF_TYPE_HARDWARE;
 }
 
-extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *);
+extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
 
 #else
 static inline void
@@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
-static inline void perf_swcounter_event(enum hw_event_types event, u64 nr,
+static inline void perf_swcounter_event(u32 event, u64 nr,
 					int nmi, struct pt_regs *regs)	{ }
 #endif
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0bbe3e45ba0d5..68a56a68bc747 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 	atomic64_set(&hwc->count, -left);
 }
 
-static void perf_swcounter_save_and_restart(struct perf_counter *counter)
-{
-	perf_swcounter_update(counter);
-	perf_swcounter_set_period(counter);
-}
-
 static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
 {
 	struct perf_data *irqdata = counter->irqdata;
@@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling)
 
 	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
 		counter->hw_ops->read(counter);
-		perf_swcounter_store_irq(sibling, counter->hw_event.type);
+		perf_swcounter_store_irq(sibling, counter->hw_event.event_config);
 		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
 	}
 }
@@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct pt_regs *regs)
 {
-	perf_swcounter_save_and_restart(counter);
+	perf_swcounter_update(counter);
+	perf_swcounter_set_period(counter);
 	perf_swcounter_interrupt(counter, nmi, regs);
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-				enum hw_event_types event,
-				struct pt_regs *regs)
+				enum perf_event_types type,
+				u32 event, struct pt_regs *regs)
 {
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return 0;
 
-	if (counter->hw_event.raw)
+	if (counter->hw_event.raw_type)
+		return 0;
+
+	if (counter->hw_event.type != type)
 		return 0;
 
-	if (counter->hw_event.type != event)
+	if (counter->hw_event.event_id != event)
 		return 0;
 
 	if (counter->hw_event.exclude_user && user_mode(regs))
@@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum hw_event_types event, u64 nr,
-				     int nmi, struct pt_regs *regs)
+				     enum perf_event_types type, u32 event,
+				     u64 nr, int nmi, struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 
@@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, event, regs))
+		if (perf_swcounter_match(counter, type, event, regs))
 			perf_swcounter_add(counter, nr, nmi, regs);
 	}
 	rcu_read_unlock();
 }
 
-void perf_swcounter_event(enum hw_event_types event, u64 nr,
-			  int nmi, struct pt_regs *regs)
+static void __perf_swcounter_event(enum perf_event_types type, u32 event,
+				   u64 nr, int nmi, struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 
-	perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs);
-	if (cpuctx->task_ctx)
-		perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs);
+	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
+	if (cpuctx->task_ctx) {
+		perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
+				nr, nmi, regs);
+	}
 
 	put_cpu_var(perf_cpu_context);
 }
 
+void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
+{
+	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
+}
+
 static void perf_swcounter_read(struct perf_counter *counter)
 {
 	perf_swcounter_update(counter);
@@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
-	perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1,
-			task_pt_regs(current));
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
+
+	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
 }
 
 extern int ftrace_profile_enable(int);
@@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
-
-	ftrace_profile_disable(event_id);
+	ftrace_profile_disable(counter->hw_event.event_id);
 }
 
 static const struct hw_perf_counter_ops *
 tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN;
+	int event_id = counter->hw_event.event_id;
 	int ret;
 
 	ret = ftrace_profile_enable(event_id);
@@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
+	counter->hw.irq_period = counter->hw_event.irq_period;
 
 	return &perf_ops_generic;
 }
@@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (counter->hw_event.type) {
+	switch (counter->hw_event.event_id) {
 	case PERF_COUNT_CPU_CLOCK:
 		hw_ops = &perf_ops_cpu_clock;
 
@@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter)
 		if (!counter->hw_event.exclude_kernel)
 			hw_ops = &perf_ops_cpu_migrations;
 		break;
-	default:
-		hw_ops = tp_perf_counter_init(counter);
-		break;
 	}
 
 	if (hw_ops)
@@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		counter->state = PERF_COUNTER_STATE_OFF;
 
 	hw_ops = NULL;
-	if (!hw_event->raw && hw_event->type < 0)
-		hw_ops = sw_perf_counter_init(counter);
-	else
+
+	if (hw_event->raw_type)
+		hw_ops = hw_perf_counter_init(counter);
+	else switch (hw_event->type) {
+	case PERF_TYPE_HARDWARE:
 		hw_ops = hw_perf_counter_init(counter);
+		break;
+
+	case PERF_TYPE_SOFTWARE:
+		hw_ops = sw_perf_counter_init(counter);
+		break;
+
+	case PERF_TYPE_TRACEPOINT:
+		hw_ops = tp_perf_counter_init(counter);
+		break;
+	}
 
 	if (!hw_ops) {
 		kfree(counter);
-- 
GitLab


From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:19 +0100
Subject: [PATCH 0176/2198] perf_counter: unify irq output code

Impact: cleanup

Having 3 slightly different copies of the same code around does nobody
any good. First step in revamping the output format.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194233.929962222@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  51 +-------------
 arch/x86/kernel/cpu/perf_counter.c |  53 +--------------
 include/linux/perf_counter.h       |   2 +
 kernel/perf_counter.c              | 106 +++++++++++++++--------------
 4 files changed, 61 insertions(+), 151 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 26f69dc7130e9..88b72eb4af12b 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -662,41 +662,6 @@ void perf_counter_do_pending(void)
 	}
 }
 
-/*
- * Record data for an irq counter.
- * This function was lifted from the x86 code; maybe it should
- * go in the core?
- */
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
-/*
- * Record all the values of the counters in a group
- */
-static void perf_handle_group(struct perf_counter *counter)
-{
-	struct perf_counter *leader, *sub;
-
-	leader = counter->group_leader;
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		if (sub != counter)
-			sub->hw_ops->read(sub);
-		perf_store_irq_data(counter, sub->hw_event.event_config);
-		perf_store_irq_data(counter, atomic64_read(&sub->count));
-	}
-}
-
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	/*
 	 * Finally record data if requested.
 	 */
-	if (record) {
-		switch (counter->hw_event.record_type) {
-		case PERF_RECORD_SIMPLE:
-			break;
-		case PERF_RECORD_IRQ:
-			perf_store_irq_data(counter, instruction_pointer(regs));
-			counter->wakeup_pending = 1;
-			break;
-		case PERF_RECORD_GROUP:
-			perf_handle_group(counter);
-			counter->wakeup_pending = 1;
-			break;
-		}
-	}
+	if (record)
+		perf_counter_output(counter, 1, regs);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d844ae41d5a34..902282d68b0c8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	x86_perf_counter_update(counter, hwc, idx);
 }
 
-static void perf_store_irq_data(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
 /*
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
@@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter)
 		__pmc_generic_enable(counter, hwc, idx);
 }
 
-static void
-perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
-{
-	struct perf_counter *counter, *group_leader = sibling->group_leader;
-
-	/*
-	 * Store sibling timestamps (if any):
-	 */
-	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-
-		x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
-		perf_store_irq_data(sibling, counter->hw_event.event_config);
-		perf_store_irq_data(sibling, atomic64_read(&counter->count));
-	}
-}
-
 /*
  * Maximum interrupt frequency of 100KHz per CPU
  */
@@ -754,28 +724,7 @@ static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 			continue;
 
 		perf_save_and_restart(counter);
-
-		switch (counter->hw_event.record_type) {
-		case PERF_RECORD_SIMPLE:
-			continue;
-		case PERF_RECORD_IRQ:
-			perf_store_irq_data(counter, instruction_pointer(regs));
-			break;
-		case PERF_RECORD_GROUP:
-			perf_handle_group(counter, &status, &ack);
-			break;
-		}
-		/*
-		 * From NMI context we cannot call into the scheduler to
-		 * do a task wakeup - but we mark these generic as
-		 * wakeup_pending and initate a wakeup callback:
-		 */
-		if (nmi) {
-			counter->wakeup_pending = 1;
-			set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
-		} else {
-			wake_up(&counter->waitq);
-		}
+		perf_counter_output(counter, nmi, regs);
 	}
 
 	hw_perf_ack_status(ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8f9394905502e..a4b76c0175f34 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu);
 
+extern void perf_counter_output(struct perf_counter *counter,
+				int nmi, struct pt_regs *regs);
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 68a56a68bc747..f054b8c9bf96b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1353,6 +1353,60 @@ static const struct file_operations perf_fops = {
 	.compat_ioctl		= perf_ioctl,
 };
 
+/*
+ * Output
+ */
+
+static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+{
+	struct perf_data *irqdata = counter->irqdata;
+
+	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
+		irqdata->overrun++;
+	} else {
+		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+
+		*p = data;
+		irqdata->len += sizeof(u64);
+	}
+}
+
+static void perf_counter_handle_group(struct perf_counter *counter)
+{
+	struct perf_counter *leader, *sub;
+
+	leader = counter->group_leader;
+	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		if (sub != counter)
+			sub->hw_ops->read(sub);
+		perf_counter_store_irq(counter, sub->hw_event.event_config);
+		perf_counter_store_irq(counter, atomic64_read(&sub->count));
+	}
+}
+
+void perf_counter_output(struct perf_counter *counter,
+			 int nmi, struct pt_regs *regs)
+{
+	switch (counter->hw_event.record_type) {
+	case PERF_RECORD_SIMPLE:
+		return;
+
+	case PERF_RECORD_IRQ:
+		perf_counter_store_irq(counter, instruction_pointer(regs));
+		break;
+
+	case PERF_RECORD_GROUP:
+		perf_counter_handle_group(counter);
+		break;
+	}
+
+	if (nmi) {
+		counter->wakeup_pending = 1;
+		set_perf_counter_pending();
+	} else
+		wake_up(&counter->waitq);
+}
+
 /*
  * Generic software counter infrastructure
  */
@@ -1395,54 +1449,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 	atomic64_set(&hwc->count, -left);
 }
 
-static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data)
-{
-	struct perf_data *irqdata = counter->irqdata;
-
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
-
-		*p = data;
-		irqdata->len += sizeof(u64);
-	}
-}
-
-static void perf_swcounter_handle_group(struct perf_counter *sibling)
-{
-	struct perf_counter *counter, *group_leader = sibling->group_leader;
-
-	list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-		counter->hw_ops->read(counter);
-		perf_swcounter_store_irq(sibling, counter->hw_event.event_config);
-		perf_swcounter_store_irq(sibling, atomic64_read(&counter->count));
-	}
-}
-
-static void perf_swcounter_interrupt(struct perf_counter *counter,
-				     int nmi, struct pt_regs *regs)
-{
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		break;
-
-	case PERF_RECORD_IRQ:
-		perf_swcounter_store_irq(counter, instruction_pointer(regs));
-		break;
-
-	case PERF_RECORD_GROUP:
-		perf_swcounter_handle_group(counter);
-		break;
-	}
-
-	if (nmi) {
-		counter->wakeup_pending = 1;
-		set_perf_counter_pending();
-	} else
-		wake_up(&counter->waitq);
-}
-
 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	struct perf_counter *counter;
@@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 		regs = task_pt_regs(current);
 
 	if (regs)
-		perf_swcounter_interrupt(counter, 0, regs);
+		perf_counter_output(counter, 0, regs);
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
 
@@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 {
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	perf_swcounter_interrupt(counter, nmi, regs);
+	perf_counter_output(counter, nmi, regs);
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-- 
GitLab


From db4fb5acf20295063d1d5105e67724eb51440207 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 19 Mar 2009 20:26:20 +0100
Subject: [PATCH 0177/2198] perf_counter: powerpc: clean up
 perc_counter_interrupt

Impact: cleanup

This updates the powerpc perf_counter_interrupt following on from the
"perf_counter: unify irq output code" patch.  Since we now use the
generic perf_counter_output code, which sets the perf_counter_pending
flag directly, we no longer need the need_wakeup variable.

This removes need_wakeup and makes perf_counter_interrupt use
get_perf_counter_pending() instead.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Orig-LKML-Reference: <20090319194234.024464535@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 88b72eb4af12b..830ca9c4494ce 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -723,8 +723,6 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 			/* counter has overflowed */
 			found = 1;
 			record_and_restart(counter, val, regs);
-			if (counter->wakeup_pending)
-				need_wakeup = 1;
 		}
 	}
 
@@ -754,17 +752,14 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	/*
 	 * If we need a wakeup, check whether interrupts were soft-enabled
 	 * when we took the interrupt.  If they were, we can wake stuff up
-	 * immediately; otherwise we'll have to set a flag and do the
-	 * wakeup when interrupts get soft-enabled.
+	 * immediately; otherwise we'll have do the wakeup when interrupts
+	 * get soft-enabled.
 	 */
-	if (need_wakeup) {
-		if (regs->softe) {
-			irq_enter();
-			perf_counter_do_pending();
-			irq_exit();
-		} else {
-			set_perf_counter_pending();
-		}
+	if (get_perf_counter_pending() && regs->softe) {
+		irq_enter();
+		clear_perf_counter_pending();
+		perf_counter_do_pending();
+		irq_exit();
 	}
 }
 
-- 
GitLab


From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Sat, 21 Mar 2009 15:31:47 +1100
Subject: [PATCH 0178/2198] perf_counter: fix type/event_id layout on
 big-endian systems

Impact: build fix for powerpc

Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI")
expanded the hw_event.type field into a union of structs containing
bitfields.  In particular it introduced a type field and a raw_type
field, with the intention that the 1-bit raw_type field should
overlay the most-significant bit of the 8-bit type field, and in fact
perf_counter_alloc() now assumes that (or at least, assumes that
raw_type doesn't overlay any of the bits that are 1 in the values of
PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}).

Unfortunately this is not true on big-endian systems such as PowerPC,
where bitfields are laid out from left to right, i.e. from most
significant bit to least significant.  This means that setting
hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1.

This fixes it by making the layout depend on whether or not
__BIG_ENDIAN_BITFIELD is defined.  It's a bit ugly, but that's what
we get for using bitfields in a user/kernel ABI.

Also, that commit didn't fix up some places in arch/powerpc/kernel/
perf_counter.c where hw_event.raw and hw_event.event_id were used.
This fixes them too.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 arch/powerpc/kernel/perf_counter.c |  9 +++++----
 include/linux/perf_counter.h       | 12 ++++++++++++
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 830ca9c4494ce..6413d9c0313b2 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	ev = counter->hw_event.event_id;
-	if (!counter->hw_event.raw) {
-		if (ev >= ppmu->n_generic ||
-		    ppmu->generic_events[ev] == 0)
+	if (!counter->hw_event.raw_type) {
+		ev = counter->hw_event.event_id;
+		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return NULL;
 		ev = ppmu->generic_events[ev];
+	} else {
+		ev = counter->hw_event.raw_event_id;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a4b76c0175f34..98f5990be1e19 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -15,6 +15,7 @@
 
 #include <linux/types.h>
 #include <linux/ioctl.h>
+#include <asm/byteorder.h>
 
 /*
  * User-space ABI bits:
@@ -86,6 +87,7 @@ enum perf_counter_record_type {
  */
 struct perf_counter_hw_event {
 	union {
+#ifndef __BIG_ENDIAN_BITFIELD
 		struct {
 			__u64			event_id	: 56,
 						type		:  8;
@@ -94,6 +96,16 @@ struct perf_counter_hw_event {
 			__u64			raw_event_id	: 63,
 						raw_type	:  1;
 		};
+#else
+		struct {
+			__u64			type		:  8,
+						event_id	: 56;
+		};
+		struct {
+			__u64			raw_type	:  1,
+						raw_event_id	: 63;
+		};
+#endif /* __BIT_ENDIAN_BITFIELD */
 		__u64		event_config;
 	};
 
-- 
GitLab


From 6f9f791eb53b56097cd311a1a5517a8e89bdaf35 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 21:26:52 +0100
Subject: [PATCH 0179/2198] perf_counter: create Documentation/perf_counter/
 and move perfcounters.txt there

We'll have more files in that directory, prepare for that.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/{perf-counters.txt => perf_counter/design.txt} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename Documentation/{perf-counters.txt => perf_counter/design.txt} (100%)

diff --git a/Documentation/perf-counters.txt b/Documentation/perf_counter/design.txt
similarity index 100%
rename from Documentation/perf-counters.txt
rename to Documentation/perf_counter/design.txt
-- 
GitLab


From e0143bad9dbf2a8fad4c5430562bceba196b66ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 21:29:59 +0100
Subject: [PATCH 0180/2198] perf_counter: add sample user-space to
 Documentation/perf_counter/

Initial version of kerneltop.c and perfstat.c.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile    |  12 +
 Documentation/perf_counter/kerneltop.c | 956 +++++++++++++++++++++++++
 Documentation/perf_counter/perfstat.c  | 521 ++++++++++++++
 3 files changed, 1489 insertions(+)
 create mode 100644 Documentation/perf_counter/Makefile
 create mode 100644 Documentation/perf_counter/kerneltop.c
 create mode 100644 Documentation/perf_counter/perfstat.c

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
new file mode 100644
index 0000000000000..b45749753fcb7
--- /dev/null
+++ b/Documentation/perf_counter/Makefile
@@ -0,0 +1,12 @@
+BINS = kerneltop perfstat
+
+all: $(BINS)
+
+kerneltop: kerneltop.c perfcounters.h
+	cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
+
+perfstat: kerneltop
+	ln -sf kerneltop perfstat
+
+clean:
+	rm $(BINS)
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
new file mode 100644
index 0000000000000..cf0e30bab5d56
--- /dev/null
+++ b/Documentation/perf_counter/kerneltop.c
@@ -0,0 +1,956 @@
+/*
+ * kerneltop.c: show top kernel functions - performance counters showcase
+
+   Build with:
+
+     cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
+
+   Sample output:
+
+------------------------------------------------------------------------------
+ KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+------------------------------------------------------------------------------
+
+             weight         RIP          kernel function
+             ______   ________________   _______________
+
+              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
+              33.00 - ffffffff804cb740 : sock_alloc_send_skb
+              31.26 - ffffffff804ce808 : skb_push
+              22.43 - ffffffff80510004 : tcp_established_options
+              19.00 - ffffffff8027d250 : find_get_page
+              15.76 - ffffffff804e4fc9 : eth_type_trans
+              15.20 - ffffffff804d8baa : dst_release
+              14.86 - ffffffff804cf5d8 : skb_release_head_state
+              14.00 - ffffffff802217d5 : read_hpet
+              12.00 - ffffffff804ffb7f : __ip_local_out
+              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
+               8.54 - ffffffff805001a3 : ip_queue_xmit
+
+  Started by Ingo Molnar <mingo@redhat.com>
+
+  Improvements and fixes by:
+
+    Arjan van de Ven <arjan@linux.intel.com>
+    Yanmin Zhang <yanmin.zhang@intel.com>
+    Mike Galbraith <efault@gmx.de>
+
+  Released under the GPL v2. (and only v2, not any later version)
+
+ */
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+
+#include <glib.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+
+#include <linux/unistd.h>
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open	295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int		__u32;
+typedef unsigned long long	__u64;
+typedef long long		__s64;
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
+
+	PERF_HW_EVENTS_MAX		=  7,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	__s64			type;
+
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+
+				__reserved_1   : 54;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
+
+	__u64			__reserved_2;
+	__u64			__reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd,
+	unsigned long			flags)
+{
+	int ret;
+
+	ret = syscall(
+		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+	if (ret < 0 && ret > -4096) {
+		errno = -ret;
+		ret = -1;
+	}
+#endif
+	return ret;
+}
+
+const char *event_types [] = {
+	"CPU cycles",
+	"instructions",
+	"cache-refs",
+	"cache-misses",
+	"branches",
+	"branch-misses",
+	"bus cycles"
+};
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE           31
+#define PR_TASK_PERF_COUNTERS_ENABLE            32
+
+#define MAX_COUNTERS		8
+
+static int			nr_counters			= -1;
+
+static __u64			count_filter		       = 100;
+
+#define MAX_NR_CPUS		256
+
+static int			event_count[MAX_COUNTERS];
+static unsigned long		event_id[MAX_COUNTERS];
+static int			event_raw[MAX_COUNTERS];
+
+static int			tid				= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static int			nmi				=  1;
+static int			group				=  0;
+
+static char			*vmlinux;
+
+static char			*sym_filter;
+static unsigned long		filter_start;
+static unsigned long		filter_end;
+
+static int			delay_secs			=  2;
+static int			zero;
+static int			dump_symtab;
+
+struct source_line {
+	uint64_t		EIP;
+	unsigned long		count;
+	char			*line;
+};
+
+static GList			*lines;
+
+static void display_help(void)
+{
+	printf(
+	"Usage: kerneltop [<options>]\n\n"
+	"KernelTop Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+	printf(
+	" -e EID    --event_id=EID     # event type ID                    [default:  0]\n"
+	"                                   0: CPU cycles\n"
+	"                                   1: instructions\n"
+	"                                   2: cache accesses\n"
+	"                                   3: cache misses\n"
+	"                                   4: branch instructions\n"
+	"                                   5: branch prediction misses\n"
+	"                                   6: bus cycles\n\n"
+	"                                rNNN: raw PMU events (eventsel+umask)\n\n"
+	" -c CNT    --count=CNT        # event period to sample\n\n"
+	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
+	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
+	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
+	" -f CNT --filter=CNT          # min-event-count filter          [default: 100]\n\n"
+	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
+	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use:\n"
+	" -z        --zero             # zero counts after display\n"
+	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
+	"\n");
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event_id",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "c:C:d:De:f:g:hn:p:s:x:z",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			if (nr_counters == -1)
+				nr_counters = 0;
+			event_count[nr_counters]	=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+		case 'd': delay_secs			=   atoi(optarg); break;
+		case 'D': dump_symtab			=              1; break;
+
+		case 'e':
+			nr_counters++;
+			if (nr_counters == MAX_COUNTERS) {
+				error = 1;
+				break;
+			}
+			if (*optarg == 'r') {
+				event_raw[nr_counters] = 1;
+				++optarg;
+			}
+			event_id[nr_counters] = strtol(optarg, NULL, 16);
+			break;
+
+		case 'f': count_filter			=   atoi(optarg); break;
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 's': sym_filter			= strdup(optarg); break;
+		case 'x': vmlinux			= strdup(optarg); break;
+		case 'z': zero				=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	nr_counters++;
+	if (nr_counters < 1)
+		nr_counters = 1;
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		if (event_id[counter] < PERF_HW_EVENTS_MAX)
+			event_count[counter] = default_count[event_id[counter]];
+		else
+			event_count[counter] = 100000;
+	}
+}
+
+static uint64_t			min_ip;
+static uint64_t			max_ip = -1ll;
+
+struct sym_entry {
+	unsigned long long	addr;
+	char			*sym;
+	unsigned long		count[MAX_COUNTERS];
+	int			skip;
+	GList			*source;
+};
+
+#define MAX_SYMS		100000
+
+static int sym_table_count;
+
+struct sym_entry		*sym_filter_entry;
+
+static struct sym_entry		sym_table[MAX_SYMS];
+
+static void show_details(struct sym_entry *sym);
+
+/*
+ * Ordering weight: count-1 * count-1 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight;
+	int counter;
+
+	weight = sym->count[0];
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static int compare(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym_weight(sym1) < sym_weight(sym2);
+}
+
+static time_t			last_refresh;
+static long			events;
+static long			userspace_events;
+static const char		CONSOLE_CLEAR[] = "[H[2J";
+
+static struct sym_entry		tmp[MAX_SYMS];
+
+static void print_sym_table(void)
+{
+	int i, printed;
+	int counter;
+	float events_per_sec = events/delay_secs;
+	float kevents_per_sec = (events-userspace_events)/delay_secs;
+
+	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
+	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
+
+	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
+
+	printf(
+"------------------------------------------------------------------------------\n");
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
+		events_per_sec,
+		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
+		nmi ? "NMI" : "IRQ");
+
+	if (nr_counters == 1)
+		printf("%d ", event_count[0]);
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (counter)
+			printf("/");
+
+		if (event_id[counter] < PERF_HW_EVENTS_MAX)
+			printf( "%s", event_types[event_id[counter]]);
+		else
+			printf( "raw:%04lx", event_id[counter]);
+	}
+
+	printf( "], ");
+
+	if (tid != -1)
+		printf(" (tid: %d", tid);
+	else
+		printf(" (all");
+
+	if (profile_cpu != -1)
+		printf(", cpu: %d)\n", profile_cpu);
+	else {
+		if (tid != -1)
+			printf(")\n");
+		else
+			printf(", %d CPUs)\n", nr_cpus);
+	}
+
+	printf("------------------------------------------------------------------------------\n\n");
+
+	if (nr_counters == 1)
+		printf("             events");
+	else
+		printf("  weight     events");
+
+	printf("         RIP          kernel function\n"
+	       	       "  ______     ______   ________________   _______________\n\n"
+	);
+
+	printed = 0;
+	for (i = 0; i < sym_table_count; i++) {
+		int count;
+
+		if (nr_counters == 1) {
+			if (printed <= 18 &&
+					tmp[i].count[0] >= count_filter) {
+				printf("%19.2f - %016llx : %s\n",
+				  sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
+				printed++;
+			}
+		} else {
+			if (printed <= 18 &&
+					tmp[i].count[0] >= count_filter) {
+				printf("%8.1f %10ld - %016llx : %s\n",
+				  sym_weight(tmp + i),
+				  tmp[i].count[0],
+				  tmp[i].addr, tmp[i].sym);
+				printed++;
+			}
+		}
+		/*
+		 * Add decay to the counts:
+		 */
+		for (count = 0; count < nr_counters; count++)
+			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
+	}
+
+	if (sym_filter_entry)
+		show_details(sym_filter_entry);
+
+	last_refresh = time(NULL);
+
+	{
+		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+
+		if (poll(&stdin_poll, 1, 0) == 1) {
+			printf("key pressed - exiting.\n");
+			exit(0);
+		}
+	}
+}
+
+static int read_symbol(FILE *in, struct sym_entry *s)
+{
+	static int filter_match = 0;
+	char *sym, stype;
+	char str[500];
+	int rc, pos;
+
+	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
+	if (rc == EOF)
+		return -1;
+
+	assert(rc == 3);
+
+	/* skip until end of line: */
+	pos = strlen(str);
+	do {
+		rc = fgetc(in);
+		if (rc == '\n' || rc == EOF || pos >= 499)
+			break;
+		str[pos] = rc;
+		pos++;
+	} while (1);
+	str[pos] = 0;
+
+	sym = str;
+
+	/* Filter out known duplicates and non-text symbols. */
+	if (!strcmp(sym, "_text"))
+		return 1;
+	if (!min_ip && !strcmp(sym, "_stext"))
+		return 1;
+	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
+		return 1;
+	if (stype != 'T' && stype != 't')
+		return 1;
+	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
+		return 1;
+	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
+		return 1;
+
+	s->sym = malloc(strlen(str));
+	assert(s->sym);
+
+	strcpy((char *)s->sym, str);
+	s->skip = 0;
+
+	/* Tag events to be skipped. */
+	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
+		s->skip = 1;
+	if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+		s->skip = 1;
+
+	if (filter_match == 1) {
+		filter_end = s->addr;
+		filter_match = -1;
+		if (filter_end - filter_start > 10000) {
+			printf("hm, too large filter symbol <%s> - skipping.\n",
+				sym_filter);
+			printf("symbol filter start: %016lx\n", filter_start);
+			printf("                end: %016lx\n", filter_end);
+			filter_end = filter_start = 0;
+			sym_filter = NULL;
+			sleep(1);
+		}
+	}
+	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
+		filter_match = 1;
+		filter_start = s->addr;
+	}
+
+	return 0;
+}
+
+int compare_addr(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym1->addr > sym2->addr;
+}
+
+static void sort_symbol_table(void)
+{
+	int i, dups;
+
+	do {
+		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
+		for (i = 0, dups = 0; i < sym_table_count; i++) {
+			if (sym_table[i].addr == sym_table[i+1].addr) {
+				sym_table[i+1].addr = -1ll;
+				dups++;
+			}
+		}
+		sym_table_count -= dups;
+	} while(dups);
+}
+
+static void parse_symbols(void)
+{
+	struct sym_entry *last;
+
+	FILE *kallsyms = fopen("/proc/kallsyms", "r");
+
+	if (!kallsyms) {
+		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
+		exit(-1);
+	}
+
+	while (!feof(kallsyms)) {
+		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
+			sym_table_count++;
+			assert(sym_table_count <= MAX_SYMS);
+		}
+	}
+
+	sort_symbol_table();
+	min_ip = sym_table[0].addr;
+	max_ip = sym_table[sym_table_count-1].addr;
+	last = sym_table + sym_table_count++;
+
+	last->addr = -1ll;
+	last->sym = "<end>";
+
+	if (filter_end) {
+		int count;
+		for (count=0; count < sym_table_count; count ++) {
+			if (!strcmp(sym_table[count].sym, sym_filter)) {
+				sym_filter_entry = &sym_table[count];
+				break;
+			}
+		}
+	}
+	if (dump_symtab) {
+		int i;
+
+		for (i = 0; i < sym_table_count; i++)
+			fprintf(stderr, "%llx %s\n",
+				sym_table[i].addr, sym_table[i].sym);
+	}
+}
+
+
+static void parse_vmlinux(char *filename)
+{
+	FILE *file;
+	char command[PATH_MAX*2];
+	if (!filename)
+		return;
+
+	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	while (!feof(file)) {
+		struct source_line *src;
+		size_t dummy = 0;
+		char *c;
+
+		src = malloc(sizeof(struct source_line));
+		assert(src != NULL);	
+		memset(src, 0, sizeof(struct source_line));
+
+		if (getline(&src->line, &dummy, file) < 0)
+			break;
+		if (!src->line)
+			break;
+
+		c = strchr(src->line, '\n');
+		if (c)
+			*c = 0;
+
+		lines = g_list_prepend(lines, src);
+
+		if (strlen(src->line)>8 && src->line[8] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+		if (strlen(src->line)>8 && src->line[16] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+	}
+	pclose(file);
+	lines = g_list_reverse(lines);
+}
+
+static void record_precise_ip(uint64_t ip)
+{
+	struct source_line *line;
+	GList *item;
+
+	item = g_list_first(lines);
+	while (item) {
+		line = item->data;
+		if (line->EIP == ip)
+			line->count++;
+		if (line->EIP > ip)
+			break;
+		item = g_list_next(item);
+	}
+}
+
+static void lookup_sym_in_vmlinux(struct sym_entry *sym)
+{
+	struct source_line *line;
+	GList *item;
+	char pattern[PATH_MAX];
+	sprintf(pattern, "<%s>:", sym->sym);
+
+	item = g_list_first(lines);
+	while (item) {
+		line = item->data;
+		if (strstr(line->line, pattern)) {
+			sym->source = item;
+			break;
+		}
+		item = g_list_next(item);
+	}
+}
+
+void show_lines(GList *item_queue, int item_queue_count)
+{
+	int i;
+	struct source_line *line;
+
+	for (i = 0; i < item_queue_count; i++) {
+		line = item_queue->data;
+		printf("%8li\t%s\n", line->count, line->line);
+		item_queue = g_list_next(item_queue);
+	}
+}
+
+#define TRACE_COUNT     3
+
+static void show_details(struct sym_entry *sym)
+{
+	struct source_line *line;
+	GList *item;
+	int displayed = 0;
+	GList *item_queue = NULL;
+	int item_queue_count = 0;
+
+	if (!sym->source)
+		lookup_sym_in_vmlinux(sym);
+	if (!sym->source)
+		return;
+
+	printf("Showing details for %s\n", sym->sym);
+
+	item = sym->source;
+	while (item) {
+		line = item->data;
+		if (displayed && strstr(line->line, ">:"))
+			break;
+
+		if (!item_queue_count)
+			item_queue = item;
+		item_queue_count ++;
+
+		if (line->count >= count_filter) {
+			show_lines(item_queue, item_queue_count);
+			item_queue_count = 0;
+			item_queue = NULL;
+		} else if (item_queue_count > TRACE_COUNT) {
+			item_queue = g_list_next(item_queue);
+			item_queue_count --;
+		}
+
+		line->count = 0;
+		displayed++;
+		if (displayed > 300)
+			break;
+		item = g_list_next(item);
+	}
+}
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(uint64_t ip, int counter)
+{
+	int left_idx, middle_idx, right_idx, idx;
+	unsigned long left, middle, right;
+
+	record_precise_ip(ip);
+
+	left_idx = 0;
+	right_idx = sym_table_count-1;
+	assert(ip <= max_ip && ip >= min_ip);
+
+	while (left_idx + 1 < right_idx) {
+		middle_idx = (left_idx + right_idx) / 2;
+
+		left   = sym_table[  left_idx].addr;
+		middle = sym_table[middle_idx].addr;
+		right  = sym_table[ right_idx].addr;
+
+		if (!(left <= middle && middle <= right)) {
+			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
+			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
+		}
+		assert(left <= middle && middle <= right);
+		if (!(left <= ip && ip <= right)) {
+			printf(" left: %016lx\n", left);
+			printf("   ip: %016lx\n", ip);
+			printf("right: %016lx\n", right);
+		}
+		assert(left <= ip && ip <= right);
+		/*
+		 * [ left .... target .... middle .... right ]
+		 *   => right := middle
+		 */
+		if (ip < middle) {
+			right_idx = middle_idx;
+			continue;
+		}
+		/*
+		 * [ left .... middle ... target ... right ]
+		 *   => left := middle
+		 */
+		left_idx = middle_idx;
+	}
+
+	idx = left_idx;
+
+	if (!sym_table[idx].skip)
+		sym_table[idx].count[counter]++;
+	else events--;
+}
+
+static void process_event(uint64_t ip, int counter)
+{
+	events++;
+
+	if (ip < min_ip || ip > max_ip) {
+		userspace_events++;
+		return;
+	}
+
+	record_ip(ip, counter);
+}
+
+int main(int argc, char *argv[])
+{
+	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	int fd[MAX_NR_CPUS][MAX_COUNTERS];
+	int i, counter, group_fd;
+	unsigned int cpu;
+	uint64_t ip;
+	ssize_t res;
+	int ret;
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (tid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	assert(nr_cpus <= MAX_NR_CPUS);
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			cpu	= profile_cpu;
+			if (tid == -1 && profile_cpu == -1)
+				cpu = i;
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.type		= event_id[counter];
+			hw_event.raw		= event_raw[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IRQ;
+			hw_event.nmi		= nmi;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+			if (fd[i][counter] < 0) {
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(-fd[i][counter]));
+				if (fd[i][counter] == -1)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[i][counter].fd = fd[i][counter];
+			event_array[i][counter].events = POLLIN;
+		}
+	}
+
+	parse_symbols();
+	if (vmlinux && sym_filter_entry)
+		parse_vmlinux(vmlinux);
+
+	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+	last_refresh = time(NULL);
+
+	while (1) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++) {
+				res = read(fd[i][counter], (char *) &ip, sizeof(ip));
+				if (res > 0) {
+					assert(res == sizeof(ip));
+
+					process_event(ip, counter);
+				}
+			}
+		}
+
+		if (time(NULL) >= last_refresh + delay_secs) {
+			print_sym_table();
+			events = userspace_events = 0;
+		}
+
+		if (hits == events)
+			ret = poll(event_array[0], nr_cpus, 1000);
+		hits = events;
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
new file mode 100644
index 0000000000000..9a5808fbcf90b
--- /dev/null
+++ b/Documentation/perf_counter/perfstat.c
@@ -0,0 +1,521 @@
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
+ *
+ *        It summarizes the counter events of all tasks (and child tasks),
+ *        covering all CPUs that the command (or workload) executes on.
+ *        It only counts the per-task events of the workload started,
+ *        independent of how many other tasks run on those CPUs.
+ *
+ * Build with:       cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c
+ *
+ * Sample output:
+ *
+
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
+
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Released under the GPLv2 (not later).
+ *
+ * Percpu counter support by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
+ * Symbolic event options by: Wu Fengguang <fengguang.wu@intel.com>
+ */
+#define _GNU_SOURCE
+
+#include <assert.h>
+#include <getopt.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <time.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+
+#include <linux/unistd.h>
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open	295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int		__u32;
+typedef unsigned long long	__u64;
+typedef long long		__s64;
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
+
+	PERF_HW_EVENTS_MAX		=  7,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	__s64			type;
+
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+
+				__reserved_1   : 54;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
+
+	__u64			__reserved_2;
+	__u64			__reserved_3;
+};
+
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd,
+	unsigned long			flags)
+{
+	int ret;
+
+	ret = syscall(
+		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+	if (ret < 0 && ret > -4096) {
+		errno = -ret;
+		ret = -1;
+	}
+#endif
+	return ret;
+}
+
+
+static char *hw_event_names [] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names [] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+};
+
+struct event_symbol {
+	int event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols [] = {
+	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
+	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
+	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
+	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
+	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
+	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
+	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
+	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
+	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
+	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
+	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
+	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
+	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
+	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+};
+
+#define MAX_COUNTERS					64
+#define MAX_NR_CPUS					256
+
+static int			nr_counters		= 0;
+static int			nr_cpus			= 0;
+
+static int			event_id[MAX_COUNTERS]	=
+					 { -2, -5, -4, -3, 0, 1, 2, 3};
+
+static int			event_raw[MAX_COUNTERS];
+
+static int			system_wide		= 0;
+
+static void display_help(void)
+{
+	unsigned int i;
+	int e;
+
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+	printf(
+	" -e EVENT     --event=EVENT        #  symbolic-name        abbreviations");
+
+	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
+		if (e != event_symbols[i].event) {
+			e = event_symbols[i].event;
+			printf(
+	"\n                                  %2d: %-20s", e, event_symbols[i].symbol);
+		} else
+			printf(" %s", event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                                rNNN: raw event type\n\n"
+	" -s                                # system-wide collection\n\n"
+	" -c <cmd..>   --command=<cmd..>    # command+arguments to be timed.\n"
+	"\n");
+	exit(0);
+}
+
+static int type_valid(int type)
+{
+	if (type >= PERF_HW_EVENTS_MAX)
+		return 0;
+	if (type <= PERF_SW_EVENTS_MIN)
+		return 0;
+
+	return 1;
+}
+
+static char *event_name(int ctr)
+{
+	int type = event_id[ctr];
+	static char buf[32];
+
+	if (event_raw[ctr]) {
+		sprintf(buf, "raw 0x%x", type);
+		return buf;
+	}
+	if (!type_valid(type))
+		return "unknown";
+
+	if (type >= 0)
+		return hw_event_names[type];
+
+	return sw_event_names[-type-1];
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int match_event_symbols(char *str)
+{
+	unsigned int i;
+
+	if (isdigit(str[0]) || str[0] == '-')
+		return atoi(str);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return PERF_HW_EVENTS_MAX;
+}
+
+static void parse_events(char *str)
+{
+	int type, raw;
+
+again:
+	nr_counters++;
+	if (nr_counters == MAX_COUNTERS)
+		display_help();
+
+	raw = 0;
+	if (*str == 'r') {
+		raw = 1;
+		++str;
+		type = strtol(str, NULL, 16);
+	} else {
+		type = match_event_symbols(str);
+		if (!type_valid(type))
+			display_help();
+	}
+
+	event_id[nr_counters] = type;
+	event_raw[nr_counters] = raw;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+}
+
+static void process_options(int argc, char *argv[])
+{
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"event",	required_argument,	NULL, 'e'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"command",	no_argument,		NULL, 'c'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:e:c:s",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c':
+			break;
+		case 's':
+			system_wide = 1;
+			break;
+		case 'e':
+			parse_events(optarg);
+			break;
+		default:
+			break;
+		}
+	}
+	if (optind == argc)
+		goto err;
+
+	if (!nr_counters)
+		nr_counters = 8;
+	else
+		nr_counters++;
+	return;
+
+err:
+	display_help();
+}
+
+char fault_here[1000000];
+
+#define PR_TASK_PERF_COUNTERS_DISABLE           31
+#define PR_TASK_PERF_COUNTERS_ENABLE            32
+
+static int fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static void create_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.type		= event_id[counter];
+	hw_event.raw		= event_raw[counter];
+	hw_event.record_type	= PERF_RECORD_SIMPLE;
+	hw_event.nmi		= 0;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+			
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+
+#define rdclock()					\
+({							\
+	struct timespec ts;				\
+							\
+	clock_gettime(CLOCK_MONOTONIC, &ts);		\
+	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
+})
+
+int main(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	process_options(argc, argv);
+
+	if (system_wide) {
+		nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+		assert(nr_cpus <= MAX_NR_CPUS);
+		assert(nr_cpus >= 0);
+	} else
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		int cpu;
+		__u64 count, single_count;
+
+		count = 0;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+					(char *) &single_count, sizeof(single_count));
+			assert(res == sizeof(single_count));
+			count += single_count;
+		}
+
+		if (!event_raw[counter] &&
+		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
+		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)\n",
+				count, event_name(counter));
+		}
+		if (!counter)
+			fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
-- 
GitLab


From cea92ce5b07078cd62c4712d51390b09a43dba2e Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:02 +0800
Subject: [PATCH 0181/2198] perf_counter tools: Merge common code into
 perfcounters.h

kerneltop's MAX_COUNTERS is increased from 8 to 64(the value used by perfstat).

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c    | 132 +--------------------
 Documentation/perf_counter/perfcounters.h | 137 ++++++++++++++++++++++
 Documentation/perf_counter/perfstat.c     | 134 +--------------------
 3 files changed, 139 insertions(+), 264 deletions(-)
 create mode 100644 Documentation/perf_counter/perfcounters.h

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index cf0e30bab5d56..fe70a2c92a8e8 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -65,126 +65,7 @@
 
 #include <linux/unistd.h>
 
-#ifdef __x86_64__
-# define __NR_perf_counter_open	295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int		__u32;
-typedef unsigned long long	__u64;
-typedef long long		__s64;
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
-
-	PERF_HW_EVENTS_MAX		=  7,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-
-	PERF_SW_EVENTS_MIN		= -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	__s64			type;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 54;
-
-	__u32			extra_config_len;
-	__u32			__reserved_4;
-
-	__u64			__reserved_2;
-	__u64			__reserved_3;
-};
-
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
-
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd,
-	unsigned long			flags)
-{
-	int ret;
-
-	ret = syscall(
-		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-	if (ret < 0 && ret > -4096) {
-		errno = -ret;
-		ret = -1;
-	}
-#endif
-	return ret;
-}
+#include "perfcounters.h"
 
 const char *event_types [] = {
 	"CPU cycles",
@@ -205,21 +86,10 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE           31
-#define PR_TASK_PERF_COUNTERS_ENABLE            32
-
-#define MAX_COUNTERS		8
-
 static int			nr_counters			= -1;
 
 static __u64			count_filter		       = 100;
 
-#define MAX_NR_CPUS		256
-
 static int			event_count[MAX_COUNTERS];
 static unsigned long		event_id[MAX_COUNTERS];
 static int			event_raw[MAX_COUNTERS];
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
new file mode 100644
index 0000000000000..8c1559b25f10b
--- /dev/null
+++ b/Documentation/perf_counter/perfcounters.h
@@ -0,0 +1,137 @@
+/*
+ * Ioctls that can be done on a perf counter fd:
+ */
+#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE	31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int		__u32;
+typedef unsigned long long	__u64;
+typedef long long		__s64;
+
+/*
+ * User-space ABI bits:
+ */
+
+/*
+ * Generalized performance counter event types, used by the hw_event.type
+ * parameter of the sys_perf_counter_open() syscall:
+ */
+enum hw_event_types {
+	/*
+	 * Common hardware events, generalized by the kernel:
+	 */
+	PERF_COUNT_CPU_CYCLES		=  0,
+	PERF_COUNT_INSTRUCTIONS		=  1,
+	PERF_COUNT_CACHE_REFERENCES	=  2,
+	PERF_COUNT_CACHE_MISSES		=  3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
+	PERF_COUNT_BRANCH_MISSES	=  5,
+	PERF_COUNT_BUS_CYCLES		=  6,
+
+	PERF_HW_EVENTS_MAX		=  7,
+
+	/*
+	 * Special "software" counters provided by the kernel, even if
+	 * the hardware does not support performance counters. These
+	 * counters measure various physical and sw events of the
+	 * kernel (and allow the profiling of them as well):
+	 */
+	PERF_COUNT_CPU_CLOCK		= -1,
+	PERF_COUNT_TASK_CLOCK		= -2,
+	PERF_COUNT_PAGE_FAULTS		= -3,
+	PERF_COUNT_CONTEXT_SWITCHES	= -4,
+	PERF_COUNT_CPU_MIGRATIONS	= -5,
+
+	PERF_SW_EVENTS_MIN		= -6,
+};
+
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		=  0,
+	PERF_RECORD_IRQ			=  1,
+	PERF_RECORD_GROUP		=  2,
+};
+
+/*
+ * Hardware event to monitor via a performance monitoring counter:
+ */
+struct perf_counter_hw_event {
+	__s64			type;
+
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
+
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				raw	       :  1, /* raw event type        */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
+
+				__reserved_1   : 54;
+
+	__u32			extra_config_len;
+	__u32			__reserved_4;
+
+	__u64			__reserved_2;
+	__u64			__reserved_3;
+};
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open	295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+asmlinkage int sys_perf_counter_open(
+
+	struct perf_counter_hw_event	*hw_event_uptr		__user,
+	pid_t				pid,
+	int				cpu,
+	int				group_fd,
+	unsigned long			flags)
+{
+	int ret;
+
+	ret = syscall(
+		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+	if (ret < 0 && ret > -4096) {
+		errno = -ret;
+		ret = -1;
+	}
+#endif
+	return ret;
+}
+
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
index 9a5808fbcf90b..a3d4a7a602f66 100644
--- a/Documentation/perf_counter/perfstat.c
+++ b/Documentation/perf_counter/perfstat.c
@@ -52,133 +52,7 @@
 
 #include <linux/unistd.h>
 
-#ifdef __x86_64__
-# define __NR_perf_counter_open	295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#endif
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int		__u32;
-typedef unsigned long long	__u64;
-typedef long long		__s64;
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
-
-	PERF_HW_EVENTS_MAX		=  7,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-
-	PERF_SW_EVENTS_MIN		= -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	__s64			type;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 54;
-
-	__u32			extra_config_len;
-	__u32			__reserved_4;
-
-	__u64			__reserved_2;
-	__u64			__reserved_3;
-};
-
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
-
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd,
-	unsigned long			flags)
-{
-	int ret;
-
-	ret = syscall(
-		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-	if (ret < 0 && ret > -4096) {
-		errno = -ret;
-		ret = -1;
-	}
-#endif
-	return ret;
-}
-
+#include "perfcounters.h"
 
 static char *hw_event_names [] = {
 	"CPU cycles",
@@ -224,9 +98,6 @@ static struct event_symbol event_symbols [] = {
 	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
 };
 
-#define MAX_COUNTERS					64
-#define MAX_NR_CPUS					256
-
 static int			nr_counters		= 0;
 static int			nr_cpus			= 0;
 
@@ -388,9 +259,6 @@ static void process_options(int argc, char *argv[])
 
 char fault_here[1000000];
 
-#define PR_TASK_PERF_COUNTERS_DISABLE           31
-#define PR_TASK_PERF_COUNTERS_ENABLE            32
-
 static int fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static void create_counter(int counter)
-- 
GitLab


From f49012fad4ed2231c7380c0b1901122242b3eab0 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:03 +0800
Subject: [PATCH 0182/2198] perf_counter tools: Move perfstat supporting code
 into perfcounters.h

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perfcounters.h | 130 ++++++++++++++++++++++
 Documentation/perf_counter/perfstat.c     | 130 ----------------------
 2 files changed, 130 insertions(+), 130 deletions(-)

diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
index 8c1559b25f10b..0f3764aa52abd 100644
--- a/Documentation/perf_counter/perfcounters.h
+++ b/Documentation/perf_counter/perfcounters.h
@@ -16,6 +16,14 @@
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
+#define rdclock()					\
+({							\
+	struct timespec ts;				\
+							\
+	clock_gettime(CLOCK_MONOTONIC, &ts);		\
+	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
+})
+
 /*
  * Pick up some kernel type conventions:
  */
@@ -135,3 +143,125 @@ asmlinkage int sys_perf_counter_open(
 	return ret;
 }
 
+static char *hw_event_names [] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names [] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+};
+
+struct event_symbol {
+	int event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols [] = {
+	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
+	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
+	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
+	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
+	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
+	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
+	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
+	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
+	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
+	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
+	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
+	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
+	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
+	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+};
+
+static int type_valid(int type)
+{
+	if (type >= PERF_HW_EVENTS_MAX)
+		return 0;
+	if (type <= PERF_SW_EVENTS_MIN)
+		return 0;
+
+	return 1;
+}
+
+static char *event_name(int ctr)
+{
+	int type = event_id[ctr];
+	static char buf[32];
+
+	if (event_raw[ctr]) {
+		sprintf(buf, "raw 0x%x", type);
+		return buf;
+	}
+	if (!type_valid(type))
+		return "unknown";
+
+	if (type >= 0)
+		return hw_event_names[type];
+
+	return sw_event_names[-type-1];
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int match_event_symbols(char *str)
+{
+	unsigned int i;
+
+	if (isdigit(str[0]) || str[0] == '-')
+		return atoi(str);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return PERF_HW_EVENTS_MAX;
+}
+
+static void parse_events(char *str)
+{
+	int type, raw;
+
+again:
+	nr_counters++;
+	if (nr_counters == MAX_COUNTERS)
+		display_help();
+
+	raw = 0;
+	if (*str == 'r') {
+		raw = 1;
+		++str;
+		type = strtol(str, NULL, 16);
+	} else {
+		type = match_event_symbols(str);
+		if (!type_valid(type))
+			display_help();
+	}
+
+	event_id[nr_counters] = type;
+	event_raw[nr_counters] = raw;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+}
+
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
index a3d4a7a602f66..3364dcb9dd9d3 100644
--- a/Documentation/perf_counter/perfstat.c
+++ b/Documentation/perf_counter/perfstat.c
@@ -54,50 +54,6 @@
 
 #include "perfcounters.h"
 
-static char *hw_event_names [] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names [] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-};
-
-struct event_symbol {
-	int event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols [] = {
-	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
-	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
-	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
-	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
-	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
-	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
-	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
-	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
-	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
-	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
-	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
-	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
-	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
-	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
-};
-
 static int			nr_counters		= 0;
 static int			nr_cpus			= 0;
 
@@ -137,84 +93,6 @@ static void display_help(void)
 	exit(0);
 }
 
-static int type_valid(int type)
-{
-	if (type >= PERF_HW_EVENTS_MAX)
-		return 0;
-	if (type <= PERF_SW_EVENTS_MIN)
-		return 0;
-
-	return 1;
-}
-
-static char *event_name(int ctr)
-{
-	int type = event_id[ctr];
-	static char buf[32];
-
-	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%x", type);
-		return buf;
-	}
-	if (!type_valid(type))
-		return "unknown";
-
-	if (type >= 0)
-		return hw_event_names[type];
-
-	return sw_event_names[-type-1];
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static int match_event_symbols(char *str)
-{
-	unsigned int i;
-
-	if (isdigit(str[0]) || str[0] == '-')
-		return atoi(str);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return PERF_HW_EVENTS_MAX;
-}
-
-static void parse_events(char *str)
-{
-	int type, raw;
-
-again:
-	nr_counters++;
-	if (nr_counters == MAX_COUNTERS)
-		display_help();
-
-	raw = 0;
-	if (*str == 'r') {
-		raw = 1;
-		++str;
-		type = strtol(str, NULL, 16);
-	} else {
-		type = match_event_symbols(str);
-		if (!type_valid(type))
-			display_help();
-	}
-
-	event_id[nr_counters] = type;
-	event_raw[nr_counters] = raw;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-}
-
 static void process_options(int argc, char *argv[])
 {
 	for (;;) {
@@ -296,14 +174,6 @@ static void create_counter(int counter)
 }
 
 
-#define rdclock()					\
-({							\
-	struct timespec ts;				\
-							\
-	clock_gettime(CLOCK_MONOTONIC, &ts);		\
-	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
-})
-
 int main(int argc, char *argv[])
 {
 	unsigned long long t0, t1;
-- 
GitLab


From 95bb3be1b3ca4a71cc168787b675d5b7852fc6be Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:04 +0800
Subject: [PATCH 0183/2198] perf_counter tools: support symbolic event names in
 kerneltop

- kerneltop: --event_id => --event
- kerneltop: can accept SW event types now
- perfstat: it used to implicitly add event -2(task-clock),
	    the new code no longer does this. Shall we?

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c    | 28 +++++------------------
 Documentation/perf_counter/perfcounters.h | 15 ++++++++----
 Documentation/perf_counter/perfstat.c     |  8 -------
 3 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index fe70a2c92a8e8..edc5b09fb586f 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -86,13 +86,9 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-static int			nr_counters			= -1;
-
 static __u64			count_filter		       = 100;
 
 static int			event_count[MAX_COUNTERS];
-static unsigned long		event_id[MAX_COUNTERS];
-static int			event_raw[MAX_COUNTERS];
 
 static int			tid				= -1;
 static int			profile_cpu			= -1;
@@ -125,7 +121,7 @@ static void display_help(void)
 	"KernelTop Options (up to %d event types can be specified at once):\n\n",
 		 MAX_COUNTERS);
 	printf(
-	" -e EID    --event_id=EID     # event type ID                    [default:  0]\n"
+	" -e EID    --event=EID        # event type ID                    [default:  0]\n"
 	"                                   0: CPU cycles\n"
 	"                                   1: instructions\n"
 	"                                   2: cache accesses\n"
@@ -160,7 +156,7 @@ static void process_options(int argc, char *argv[])
 			{"cpu",		required_argument,	NULL, 'C'},
 			{"delay",	required_argument,	NULL, 'd'},
 			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event_id",	required_argument,	NULL, 'e'},
+			{"event",	required_argument,	NULL, 'e'},
 			{"filter",	required_argument,	NULL, 'f'},
 			{"group",	required_argument,	NULL, 'g'},
 			{"help",	no_argument,		NULL, 'h'},
@@ -178,8 +174,6 @@ static void process_options(int argc, char *argv[])
 
 		switch (c) {
 		case 'c':
-			if (nr_counters == -1)
-				nr_counters = 0;
 			event_count[nr_counters]	=   atoi(optarg); break;
 		case 'C':
 			/* CPU and PID are mutually exclusive */
@@ -192,18 +186,7 @@ static void process_options(int argc, char *argv[])
 		case 'd': delay_secs			=   atoi(optarg); break;
 		case 'D': dump_symtab			=              1; break;
 
-		case 'e':
-			nr_counters++;
-			if (nr_counters == MAX_COUNTERS) {
-				error = 1;
-				break;
-			}
-			if (*optarg == 'r') {
-				event_raw[nr_counters] = 1;
-				++optarg;
-			}
-			event_id[nr_counters] = strtol(optarg, NULL, 16);
-			break;
+		case 'e': error				= parse_events(optarg); break;
 
 		case 'f': count_filter			=   atoi(optarg); break;
 		case 'g': group				=   atoi(optarg); break;
@@ -226,9 +209,10 @@ static void process_options(int argc, char *argv[])
 	if (error)
 		display_help();
 
-	nr_counters++;
-	if (nr_counters < 1)
+	if (!nr_counters) {
 		nr_counters = 1;
+		event_id[0] = 0;
+	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
index 0f3764aa52abd..99a90d833e127 100644
--- a/Documentation/perf_counter/perfcounters.h
+++ b/Documentation/perf_counter/perfcounters.h
@@ -143,6 +143,10 @@ asmlinkage int sys_perf_counter_open(
 	return ret;
 }
 
+static int			nr_counters			= 0;
+static long			event_id[MAX_COUNTERS]		= { -2, -5, -4, -3, 0, 1, 2, 3};
+static int			event_raw[MAX_COUNTERS];
+
 static char *hw_event_names [] = {
 	"CPU cycles",
 	"instructions",
@@ -235,14 +239,13 @@ static int match_event_symbols(char *str)
 	return PERF_HW_EVENTS_MAX;
 }
 
-static void parse_events(char *str)
+static int parse_events(char *str)
 {
 	int type, raw;
 
 again:
-	nr_counters++;
 	if (nr_counters == MAX_COUNTERS)
-		display_help();
+		return -1;
 
 	raw = 0;
 	if (*str == 'r') {
@@ -252,16 +255,18 @@ static void parse_events(char *str)
 	} else {
 		type = match_event_symbols(str);
 		if (!type_valid(type))
-			display_help();
+			return -1;
 	}
 
 	event_id[nr_counters] = type;
 	event_raw[nr_counters] = raw;
+	nr_counters++;
 
 	str = strstr(str, ",");
 	if (str) {
 		str++;
 		goto again;
 	}
-}
 
+	return 0;
+}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
index 3364dcb9dd9d3..fd594468e655b 100644
--- a/Documentation/perf_counter/perfstat.c
+++ b/Documentation/perf_counter/perfstat.c
@@ -54,14 +54,8 @@
 
 #include "perfcounters.h"
 
-static int			nr_counters		= 0;
 static int			nr_cpus			= 0;
 
-static int			event_id[MAX_COUNTERS]	=
-					 { -2, -5, -4, -3, 0, 1, 2, 3};
-
-static int			event_raw[MAX_COUNTERS];
-
 static int			system_wide		= 0;
 
 static void display_help(void)
@@ -127,8 +121,6 @@ static void process_options(int argc, char *argv[])
 
 	if (!nr_counters)
 		nr_counters = 8;
-	else
-		nr_counters++;
 	return;
 
 err:
-- 
GitLab


From e3908612d93dff9d7746d85d37c42593254bf282 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:05 +0800
Subject: [PATCH 0184/2198] perf_counter tools: Reuse event_name() in kerneltop

- can handle sw counters now
- the outputs will look slightly different

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 15 +--------------
 1 file changed, 1 insertion(+), 14 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index edc5b09fb586f..cba5cb0a97f97 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -67,16 +67,6 @@
 
 #include "perfcounters.h"
 
-const char *event_types [] = {
-	"CPU cycles",
-	"instructions",
-	"cache-refs",
-	"cache-misses",
-	"branches",
-	"branch-misses",
-	"bus cycles"
-};
-
 const unsigned int default_count[] = {
 	1000000,
 	1000000,
@@ -304,10 +294,7 @@ static void print_sym_table(void)
 		if (counter)
 			printf("/");
 
-		if (event_id[counter] < PERF_HW_EVENTS_MAX)
-			printf( "%s", event_types[event_id[counter]]);
-		else
-			printf( "raw:%04lx", event_id[counter]);
+		printf("%s", event_name(counter));
 	}
 
 	printf( "], ");
-- 
GitLab


From f7524bda8be8be98db356d6a83ac1da451ecdb2e Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:06 +0800
Subject: [PATCH 0185/2198] perf_counter tools: move remaining code into
 kerneltop.c

- perfstat.c can be safely removed now
- perfstat: -s => -a for system wide accounting
- kerneltop: add -S/--stat for perfstat mode
- minor adjustments to kerneltop --help, perfstat --help

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c    | 530 ++++++++++++++++++----
 Documentation/perf_counter/perfcounters.h | 132 +-----
 2 files changed, 432 insertions(+), 230 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index cba5cb0a97f97..9db65a4f10426 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -3,7 +3,7 @@
 
    Build with:
 
-     cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
+     cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
 
    Sample output:
 
@@ -26,18 +26,40 @@
               12.00 - ffffffff804ffb7f : __ip_local_out
               11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
                8.54 - ffffffff805001a3 : ip_queue_xmit
+ */
 
-  Started by Ingo Molnar <mingo@redhat.com>
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
 
-  Improvements and fixes by:
+          It summarizes the counter events of all tasks (and child tasks),
+          covering all CPUs that the command (or workload) executes on.
+          It only counts the per-task events of the workload started,
+          independent of how many other tasks run on those CPUs.
 
-    Arjan van de Ven <arjan@linux.intel.com>
-    Yanmin Zhang <yanmin.zhang@intel.com>
-    Mike Galbraith <efault@gmx.de>
+   Sample output:
 
-  Released under the GPL v2. (and only v2, not any later version)
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
 
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
  */
+
+ /*
+  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+  *
+  * Improvements and fixes by:
+  *
+  *   Arjan van de Ven <arjan@linux.intel.com>
+  *   Yanmin Zhang <yanmin.zhang@intel.com>
+  *   Wu Fengguang <fengguang.wu@intel.com>
+  *   Mike Galbraith <efault@gmx.de>
+  *
+  * Released under the GPL v2. (and only v2, not any later version)
+  */
+
 #define _GNU_SOURCE
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -67,18 +89,22 @@
 
 #include "perfcounters.h"
 
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
 
-static __u64			count_filter		       = 100;
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define DEF_PERFSTAT_EVENTS		{ -2, -5, -4, -3, 0, 1, 2, 3}
+
+static int			run_perfstat			=  0;
+static int			system_wide			=  0;
 
+static int			nr_counters			=  0;
+static long			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
+static int			event_raw[MAX_COUNTERS];
 static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static __u64			count_filter		       = 100;
 
 static int			tid				= -1;
 static int			profile_cpu			= -1;
@@ -96,125 +122,335 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
+static GList			*lines;
+
 struct source_line {
 	uint64_t		EIP;
 	unsigned long		count;
 	char			*line;
 };
 
-static GList			*lines;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+};
+
+struct event_symbol {
+	int event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
+	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
+	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
+	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
+	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
+	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
+	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
+	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
+	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
+	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
+	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
+	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
+	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
+	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
+	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
+	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+};
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	int e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
+		if (e != event_symbols[i].event) {
+			e = event_symbols[i].event;
+			printf(
+	"\n                             %2d: %-20s", e, event_symbols[i].symbol);
+		} else
+			printf(" %s", event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_perfstat_help(void)
+{
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -a                           # system-wide collection\n");
+	exit(0);
+}
 
 static void display_help(void)
 {
+	if (run_perfstat)
+		return display_perfstat_help();
+
 	printf(
-	"Usage: kerneltop [<options>]\n\n"
+	"Usage: kerneltop [<options>]\n"
+	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
 	"KernelTop Options (up to %d event types can be specified at once):\n\n",
 		 MAX_COUNTERS);
+
+	display_events_help();
+
 	printf(
-	" -e EID    --event=EID        # event type ID                    [default:  0]\n"
-	"                                   0: CPU cycles\n"
-	"                                   1: instructions\n"
-	"                                   2: cache accesses\n"
-	"                                   3: cache misses\n"
-	"                                   4: branch instructions\n"
-	"                                   5: branch prediction misses\n"
-	"                                   6: bus cycles\n\n"
-	"                                rNNN: raw PMU events (eventsel+umask)\n\n"
+	" -S        --stat             # perfstat COMMAND\n"
+	" -a                           # system-wide collection (for perfstat)\n\n"
 	" -c CNT    --count=CNT        # event period to sample\n\n"
 	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
 	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
 	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
-	" -f CNT --filter=CNT          # min-event-count filter          [default: 100]\n\n"
+	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
 	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
-	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use:\n"
+	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
 	" -z        --zero             # zero counts after display\n"
 	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
-	"\n");
+	);
 
 	exit(0);
 }
 
-static void process_options(int argc, char *argv[])
+static int type_valid(int type)
 {
-	int error = 0, counter;
+	if (type >= PERF_HW_EVENTS_MAX)
+		return 0;
+	if (type <= PERF_SW_EVENTS_MIN)
+		return 0;
 
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "c:C:d:De:f:g:hn:p:s:x:z",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
+	return 1;
+}
 
-		switch (c) {
-		case 'c':
-			event_count[nr_counters]	=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-		case 'd': delay_secs			=   atoi(optarg); break;
-		case 'D': dump_symtab			=              1; break;
+static char *event_name(int ctr)
+{
+	int type = event_id[ctr];
+	static char buf[32];
 
-		case 'e': error				= parse_events(optarg); break;
+	if (event_raw[ctr]) {
+		sprintf(buf, "raw 0x%x", type);
+		return buf;
+	}
+	if (!type_valid(type))
+		return "unknown";
 
-		case 'f': count_filter			=   atoi(optarg); break;
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
+	if (type >= 0)
+		return hw_event_names[type];
+
+	return sw_event_names[-type-1];
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static int match_event_symbols(char *str)
+{
+	unsigned int i;
+
+	if (isdigit(str[0]) || str[0] == '-')
+		return atoi(str);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return PERF_HW_EVENTS_MAX;
+}
+
+static int parse_events(char *str)
+{
+	int type, raw;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	raw = 0;
+	if (*str == 'r') {
+		raw = 1;
+		++str;
+		type = strtol(str, NULL, 16);
+	} else {
+		type = match_event_symbols(str);
+		if (!type_valid(type))
+			return -1;
+	}
+
+	event_id[nr_counters] = type;
+	event_raw[nr_counters] = raw;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+
+/*
+ * perfstat
+ */
+
+char fault_here[1000000];
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.type		= event_id[counter];
+	hw_event.raw		= event_raw[counter];
+	hw_event.record_type	= PERF_RECORD_SIMPLE;
+	hw_event.nmi		= 0;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
 			}
-			tid				=   atoi(optarg); break;
-		case 's': sym_filter			= strdup(optarg); break;
-		case 'x': vmlinux			= strdup(optarg); break;
-		case 'z': zero				=              1; break;
-		default: error = 1; break;
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
 		}
 	}
-	if (error)
-		display_help();
+}
 
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
+int do_perfstat(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
 	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
+		int cpu;
+		__u64 count, single_count;
+
+		count = 0;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+					(char *) &single_count, sizeof(single_count));
+			assert(res == sizeof(single_count));
+			count += single_count;
+		}
 
-		if (event_id[counter] < PERF_HW_EVENTS_MAX)
-			event_count[counter] = default_count[event_id[counter]];
-		else
-			event_count[counter] = 100000;
+		if (!event_raw[counter] &&
+		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
+		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)\n",
+				count, event_name(counter));
+		}
+		if (!counter)
+			fprintf(stderr, "\n");
 	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
 }
 
+/*
+ * Symbols
+ */
+
 static uint64_t			min_ip;
 static uint64_t			max_ip = -1ll;
 
@@ -507,6 +743,9 @@ static void parse_symbols(void)
 	}
 }
 
+/*
+ * Source lines
+ */
 
 static void parse_vmlinux(char *filename)
 {
@@ -527,7 +766,7 @@ static void parse_vmlinux(char *filename)
 		char *c;
 
 		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);	
+		assert(src != NULL);
 		memset(src, 0, sizeof(struct source_line));
 
 		if (getline(&src->line, &dummy, file) < 0)
@@ -706,11 +945,100 @@ static void process_event(uint64_t ip, int counter)
 	record_ip(ip, counter);
 }
 
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	if (strstr(argv[0], "perfstat"))
+		run_perfstat = 1;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"stat",	no_argument,		NULL, 'S'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a': system_wide			=	       1; break;
+		case 'c': event_count[nr_counters]	=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+		case 'd': delay_secs			=   atoi(optarg); break;
+		case 'D': dump_symtab			=              1; break;
+
+		case 'e': error				= parse_events(optarg); break;
+
+		case 'f': count_filter			=   atoi(optarg); break;
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 's': sym_filter			= strdup(optarg); break;
+		case 'S': run_perfstat			=	       1; break;
+		case 'x': vmlinux			= strdup(optarg); break;
+		case 'z': zero				=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		if (run_perfstat)
+			nr_counters = 8;
+		else {
+			nr_counters = 1;
+			event_id[0] = 0;
+		}
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		if (event_id[counter] < PERF_HW_EVENTS_MAX)
+			event_count[counter] = default_count[event_id[counter]];
+		else
+			event_count[counter] = 100000;
+	}
+}
+
 int main(int argc, char *argv[])
 {
 	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
-	int fd[MAX_NR_CPUS][MAX_COUNTERS];
 	int i, counter, group_fd;
 	unsigned int cpu;
 	uint64_t ip;
@@ -720,11 +1048,15 @@ int main(int argc, char *argv[])
 	process_options(argc, argv);
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (run_perfstat)
+		return do_perfstat(argc, argv);
+
 	if (tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
-	assert(nr_cpus <= MAX_NR_CPUS);
-
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
 		for (counter = 0; counter < nr_counters; counter++) {
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
index 99a90d833e127..32e24b9154ab6 100644
--- a/Documentation/perf_counter/perfcounters.h
+++ b/Documentation/perf_counter/perfcounters.h
@@ -11,9 +11,6 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE	31
 #define PR_TASK_PERF_COUNTERS_ENABLE    32
 
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 #define rdclock()					\
@@ -110,6 +107,7 @@ struct perf_counter_hw_event {
 	__u64			__reserved_3;
 };
 
+
 #ifdef __x86_64__
 # define __NR_perf_counter_open	295
 #endif
@@ -142,131 +140,3 @@ asmlinkage int sys_perf_counter_open(
 #endif
 	return ret;
 }
-
-static int			nr_counters			= 0;
-static long			event_id[MAX_COUNTERS]		= { -2, -5, -4, -3, 0, 1, 2, 3};
-static int			event_raw[MAX_COUNTERS];
-
-static char *hw_event_names [] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names [] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-};
-
-struct event_symbol {
-	int event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols [] = {
-	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
-	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
-	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
-	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
-	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
-	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
-	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
-	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
-	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
-	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
-	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
-	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
-	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
-	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
-};
-
-static int type_valid(int type)
-{
-	if (type >= PERF_HW_EVENTS_MAX)
-		return 0;
-	if (type <= PERF_SW_EVENTS_MIN)
-		return 0;
-
-	return 1;
-}
-
-static char *event_name(int ctr)
-{
-	int type = event_id[ctr];
-	static char buf[32];
-
-	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%x", type);
-		return buf;
-	}
-	if (!type_valid(type))
-		return "unknown";
-
-	if (type >= 0)
-		return hw_event_names[type];
-
-	return sw_event_names[-type-1];
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static int match_event_symbols(char *str)
-{
-	unsigned int i;
-
-	if (isdigit(str[0]) || str[0] == '-')
-		return atoi(str);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return PERF_HW_EVENTS_MAX;
-}
-
-static int parse_events(char *str)
-{
-	int type, raw;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	raw = 0;
-	if (*str == 'r') {
-		raw = 1;
-		++str;
-		type = strtol(str, NULL, 16);
-	} else {
-		type = match_event_symbols(str);
-		if (!type_valid(type))
-			return -1;
-	}
-
-	event_id[nr_counters] = type;
-	event_raw[nr_counters] = raw;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-- 
GitLab


From ef45fa9e6c1694d3e8063f39749097a6e496b12c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:07 +0800
Subject: [PATCH 0186/2198] perf_counter tools: fix comment for sym_weight()

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 9db65a4f10426..7bf2a516f18b9 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -473,7 +473,7 @@ static struct sym_entry		sym_table[MAX_SYMS];
 static void show_details(struct sym_entry *sym);
 
 /*
- * Ordering weight: count-1 * count-1 * ... / count-n
+ * Ordering weight: count-1 * count-2 * ... / count-n
  */
 static double sym_weight(const struct sym_entry *sym)
 {
-- 
GitLab


From 3ab8d792b1348eaabfe550ba60902d781d160dd4 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:08 +0800
Subject: [PATCH 0187/2198] perf_counter tools: fix event_id type

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7bf2a516f18b9..7bfb0f0d8005b 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -99,7 +99,7 @@ static int			run_perfstat			=  0;
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
-static long			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
+static __s64			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
 static int			event_raw[MAX_COUNTERS];
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
@@ -261,11 +261,11 @@ static int type_valid(int type)
 
 static char *event_name(int ctr)
 {
-	int type = event_id[ctr];
+	__s64 type = event_id[ctr];
 	static char buf[32];
 
 	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%x", type);
+		sprintf(buf, "raw 0x%llx", (long long)type);
 		return buf;
 	}
 	if (!type_valid(type))
@@ -299,7 +299,8 @@ static int match_event_symbols(char *str)
 
 static int parse_events(char *str)
 {
-	int type, raw;
+	__s64 type;
+	int raw;
 
 again:
 	if (nr_counters == MAX_COUNTERS)
-- 
GitLab


From dda7c02f33833bfa9412ba6f0e410b0a18b42c88 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:09 +0800
Subject: [PATCH 0188/2198] perf_counter tools: cut down default count for
 cpu-cycles

In my system, it takes kerneltop dozens of minutes to
show up usable numbers. Make the default count 100 times
smaller fixed this long startup latency.

I'm not sure if it's the right solution though.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7bfb0f0d8005b..0bd3c13150b1f 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -132,7 +132,7 @@ struct source_line {
 
 
 const unsigned int default_count[] = {
-	1000000,
+	  10000,
 	1000000,
 	  10000,
 	  10000,
-- 
GitLab


From af9522cf133e9be6da8525a46a9ed7e7659f0e1a Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 20 Mar 2009 10:08:10 +0800
Subject: [PATCH 0189/2198] perf_counter tools: when no command is feed to
 perfstat, display help and exit

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 0bd3c13150b1f..81a68aac137f7 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -387,6 +387,9 @@ int do_perfstat(int argc, char *argv[])
 	argc -= optind;
 	argv += optind;
 
+	if (!argc)
+		display_help();
+
 	/*
 	 * Enable counters and exec the command:
 	 */
-- 
GitLab


From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:06 +0100
Subject: [PATCH 0190/2198] perf_counter: remove the event config bitfields

Since the bitfields turned into a bit of a mess, remove them and rely on
good old masks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.059499915@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  6 +--
 arch/x86/kernel/cpu/perf_counter.c |  8 ++--
 include/linux/perf_counter.h       | 74 ++++++++++++++++++++----------
 kernel/perf_counter.c              | 22 +++++----
 4 files changed, 70 insertions(+), 40 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6413d9c0313b2..d05651584d439 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 	if ((s64)counter->hw_event.irq_period < 0)
 		return NULL;
-	if (!counter->hw_event.raw_type) {
-		ev = counter->hw_event.event_id;
+	if (!perf_event_raw(&counter->hw_event)) {
+		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return NULL;
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = counter->hw_event.raw_event_id;
+		ev = perf_event_config(&counter->hw_event);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 902282d68b0c8..3f95b0cdc550e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (hw_event->raw_type) {
-		hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id);
+	if (perf_event_raw(hw_event)) {
+		hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
 	} else {
-		if (hw_event->event_id >= pmc_ops->max_events)
+		if (perf_event_id(hw_event) >= pmc_ops->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(hw_event->event_id);
+		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
 	counter->wakeup_pending = 0;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 98f5990be1e19..56099e52970d2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -82,32 +82,37 @@ enum perf_counter_record_type {
 	PERF_RECORD_GROUP		= 2,
 };
 
+#define __PERF_COUNTER_MASK(name) 			\
+	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
+	 PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW_BITS		1
+#define PERF_COUNTER_RAW_SHIFT		63
+#define PERF_COUNTER_RAW_MASK		__PERF_COUNTER_MASK(RAW)
+
+#define PERF_COUNTER_CONFIG_BITS	63
+#define PERF_COUNTER_CONFIG_SHIFT	0
+#define PERF_COUNTER_CONFIG_MASK	__PERF_COUNTER_MASK(CONFIG)
+
+#define PERF_COUNTER_TYPE_BITS		7
+#define PERF_COUNTER_TYPE_SHIFT		56
+#define PERF_COUNTER_TYPE_MASK		__PERF_COUNTER_MASK(TYPE)
+
+#define PERF_COUNTER_EVENT_BITS		56
+#define PERF_COUNTER_EVENT_SHIFT	0
+#define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
+
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	union {
-#ifndef __BIG_ENDIAN_BITFIELD
-		struct {
-			__u64			event_id	: 56,
-						type		:  8;
-		};
-		struct {
-			__u64			raw_event_id	: 63,
-						raw_type	:  1;
-		};
-#else
-		struct {
-			__u64			type		:  8,
-						event_id	: 56;
-		};
-		struct {
-			__u64			raw_type	:  1,
-						raw_event_id	: 63;
-		};
-#endif /* __BIT_ENDIAN_BITFIELD */
-		__u64		event_config;
-	};
+	/*
+	 * The MSB of the config word signifies if the rest contains cpu
+	 * specific (raw) counter configuration data, if unset, the next
+	 * 7 bits are an event type and the rest of the bits are the event
+	 * identifier.
+	 */
+	__u64			config;
 
 	__u64			irq_period;
 	__u64			record_type;
@@ -157,6 +162,27 @@ struct perf_counter_hw_event {
 
 struct task_struct;
 
+static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
+{
+	return hw_event->config & PERF_COUNTER_RAW_MASK;
+}
+
+static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
+{
+	return hw_event->config & PERF_COUNTER_CONFIG_MASK;
+}
+
+static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
+{
+	return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
+		PERF_COUNTER_TYPE_SHIFT;
+}
+
+static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
+{
+	return hw_event->config & PERF_COUNTER_EVENT_MASK;
+}
+
 /**
  * struct hw_perf_counter - performance counter hardware details:
  */
@@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !counter->hw_event.raw_type &&
-		counter->hw_event.type != PERF_TYPE_HARDWARE;
+	return !perf_event_raw(&counter->hw_event) &&
+		perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
 }
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f054b8c9bf96b..ca14fc41ccdf7 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter)
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_counter_store_irq(counter, sub->hw_event.event_config);
+		perf_counter_store_irq(counter, sub->hw_event.config);
 		perf_counter_store_irq(counter, atomic64_read(&sub->count));
 	}
 }
@@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 		return 0;
 
-	if (counter->hw_event.raw_type)
+	if (perf_event_raw(&counter->hw_event))
 		return 0;
 
-	if (counter->hw_event.type != type)
+	if (perf_event_type(&counter->hw_event) != type)
 		return 0;
 
-	if (counter->hw_event.event_id != event)
+	if (perf_event_id(&counter->hw_event) != event)
 		return 0;
 
 	if (counter->hw_event.exclude_user && user_mode(regs))
@@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	ftrace_profile_disable(counter->hw_event.event_id);
+	ftrace_profile_disable(perf_event_id(&counter->hw_event));
 }
 
 static const struct hw_perf_counter_ops *
 tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = counter->hw_event.event_id;
+	int event_id = perf_event_id(&counter->hw_event);
 	int ret;
 
 	ret = ftrace_profile_enable(event_id);
@@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (counter->hw_event.event_id) {
+	switch (perf_event_id(&counter->hw_event)) {
 	case PERF_COUNT_CPU_CLOCK:
 		hw_ops = &perf_ops_cpu_clock;
 
@@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	hw_ops = NULL;
 
-	if (hw_event->raw_type)
+	if (perf_event_raw(hw_event)) {
 		hw_ops = hw_perf_counter_init(counter);
-	else switch (hw_event->type) {
+		goto done;
+	}
+
+	switch (perf_event_type(hw_event)) {
 	case PERF_TYPE_HARDWARE:
 		hw_ops = hw_perf_counter_init(counter);
 		break;
@@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		kfree(counter);
 		return NULL;
 	}
+done:
 	counter->hw_ops = hw_ops;
 
 	return counter;
-- 
GitLab


From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:07 +0100
Subject: [PATCH 0191/2198] perf_counter: avoid recursion

Tracepoint events like lock_acquire and software counters like
pagefaults can recurse into the perf counter code again, avoid that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.152096433@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  7 +++++++
 kernel/perf_counter.c        | 26 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 56099e52970d2..18dc17d0a61cc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -328,6 +328,13 @@ struct perf_cpu_context {
 	int				active_oncpu;
 	int				max_pertask;
 	int				exclusive;
+
+	/*
+	 * Recursion avoidance:
+	 *
+	 * task, softirq, irq, nmi context
+	 */
+	int			recursion[4];
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ca14fc41ccdf7..ce34bff07bdaa 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -23,6 +23,7 @@
 #include <linux/mm.h>
 #include <linux/vmstat.h>
 #include <linux/rculist.h>
+#include <linux/hardirq.h>
 
 #include <asm/irq_regs.h>
 
@@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 	rcu_read_unlock();
 }
 
+static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
+{
+	if (in_nmi())
+		return &cpuctx->recursion[3];
+
+	if (in_irq())
+		return &cpuctx->recursion[2];
+
+	if (in_softirq())
+		return &cpuctx->recursion[1];
+
+	return &cpuctx->recursion[0];
+}
+
 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 				   u64 nr, int nmi, struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
+	int *recursion = perf_swcounter_recursion_context(cpuctx);
+
+	if (*recursion)
+		goto out;
+
+	(*recursion)++;
+	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
 	if (cpuctx->task_ctx) {
@@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 				nr, nmi, regs);
 	}
 
+	barrier();
+	(*recursion)--;
+
+out:
 	put_cpu_var(perf_cpu_context);
 }
 
-- 
GitLab


From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Mar 2009 18:22:08 +0100
Subject: [PATCH 0192/2198] perf_counter: add an mmap method to allow userspace
 to read hardware counters

Impact: new feature giving performance improvement

This adds the ability for userspace to do an mmap on a hardware counter
fd and get access to a read-only page that contains the information
needed to translate a hardware counter value to the full 64-bit
counter value that would be returned by a read on the fd.  This is
useful on architectures that allow user programs to read the hardware
counters, such as PowerPC.

The mmap will only succeed if the counter is a hardware counter
monitoring the current process.

On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter
and translate it to the full 64-bit value in about 30ns using the
mmapped page, compared to about 830ns for the read syscall on the
counter, so this does give a significant performance improvement.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.297057964@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  6 +++
 include/linux/perf_counter.h       | 15 ++++++
 kernel/perf_counter.c              | 76 ++++++++++++++++++++++++++++++
 3 files changed, 97 insertions(+)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d05651584d439..e4349281b07de 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable)
 		atomic64_set(&counter->hw.prev_count, val);
 		counter->hw.idx = hwc_index[i] + 1;
 		write_pmc(counter->hw.idx, val);
+		if (counter->user_page)
+			perf_counter_update_userpage(counter);
 	}
 	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter)
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
+			if (counter->user_page)
+				perf_counter_update_userpage(counter);
 			break;
 		}
 	}
@@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	write_pmc(counter->hw.idx, val);
 	atomic64_set(&counter->hw.prev_count, val);
 	atomic64_set(&counter->hw.period_left, left);
+	if (counter->user_page)
+		perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 18dc17d0a61cc..40b324e91bf68 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -143,6 +143,17 @@ struct perf_counter_hw_event {
 #define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
 #define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
 
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+	__u32	version;		/* version number of this structure */
+	__u32	compat_version;		/* lowest version this is compat with */
+	__u32	lock;			/* seqlock for synchronization */
+	__u32	index;			/* hardware counter identifier */
+	__s64	offset;			/* add to hardware counter value */
+};
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -278,6 +289,9 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
+	/* pointer to page shared with userspace via mmap */
+	unsigned long			user_page;
+
 	/* read() / irq related data */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
@@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void);
 extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu);
+extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 extern void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ce34bff07bdaa..d9cfd902140e0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
+	free_page(counter->user_page);
 	free_counter(counter);
 	put_context(ctx);
 
@@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_counter_mmap_page *userpg;
+
+	if (!counter->user_page)
+		return;
+	userpg = (struct perf_counter_mmap_page *) counter->user_page;
+
+	++userpg->lock;
+	smp_wmb();
+	userpg->index = counter->hw.idx;
+	userpg->offset = atomic64_read(&counter->count);
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+	smp_wmb();
+	++userpg->lock;
+}
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	if (!counter->user_page)
+		return VM_FAULT_SIGBUS;
+
+	vmf->page = virt_to_page(counter->user_page);
+	get_page(vmf->page);
+	return 0;
+}
+
+static struct vm_operations_struct perf_mmap_vmops = {
+	.fault = perf_mmap_fault,
+};
+
+static int perf_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = file->private_data;
+	unsigned long userpg;
+
+	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
+		return -EINVAL;
+	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+		return -EINVAL;
+
+	/*
+	 * For now, restrict to the case of a hardware counter
+	 * on the current task.
+	 */
+	if (is_software_counter(counter) || counter->task != current)
+		return -EINVAL;
+
+	userpg = counter->user_page;
+	if (!userpg) {
+		userpg = get_zeroed_page(GFP_KERNEL);
+		mutex_lock(&counter->mutex);
+		if (counter->user_page) {
+			free_page(userpg);
+			userpg = counter->user_page;
+		} else {
+			counter->user_page = userpg;
+		}
+		mutex_unlock(&counter->mutex);
+		if (!userpg)
+			return -ENOMEM;
+	}
+
+	perf_counter_update_userpage(counter);
+
+	vma->vm_flags &= ~VM_MAYWRITE;
+	vma->vm_flags |= VM_RESERVED;
+	vma->vm_ops = &perf_mmap_vmops;
+	return 0;
+}
+
 static const struct file_operations perf_fops = {
 	.release		= perf_release,
 	.read			= perf_read,
 	.poll			= perf_poll,
 	.unlocked_ioctl		= perf_ioctl,
 	.compat_ioctl		= perf_ioctl,
+	.mmap			= perf_mmap,
 };
 
 /*
-- 
GitLab


From 9ab772cd535c4b256a577eae516f9c7462346b2d Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 23 Mar 2009 18:22:09 +0100
Subject: [PATCH 0193/2198] mutex: add atomic_dec_and_mutex_lock()

Much like the atomic_dec_and_lock() function in which we take an hold a
spin_lock if we drop the atomic to 0 this function takes and holds the
mutex if we dec the atomic to 0.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.410913479@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mutex.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7e0ab84..93054fc3635c9 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
 
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* dec if we can't possibly hit 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+	/* we might hit 0, so take the lock */
+	mutex_lock(lock);
+	if (!atomic_dec_and_test(cnt)) {
+		/* when we actually did the dec, we didn't hit 0 */
+		mutex_unlock(lock);
+		return 0;
+	}
+	/* we hit 0, and we hold the lock */
+	return 1;
+}
+
 #endif
-- 
GitLab


From b09d2501ed3d294619cbfbcf828ad39324d0e548 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 17:21:56 -0700
Subject: [PATCH 0194/2198] mutex: drop "inline" from mutex_lock() inside
 kernel/mutex.c

Impact: build fix

mutex_lock() is was defined inline in kernel/mutex.c, but wasn't
declared so not in <linux/mutex.h>.  This didn't cause a problem until
checkin 3a2d367d9aabac486ac4444c6c7ec7a1dab16267 added the
atomic_dec_and_mutex_lock() inline in between declaration and
definion.

This broke building with CONFIG_ALLOW_WARNINGS=n, e.g. make
allnoconfig.

Either from the source code nor the allnoconfig binary output I cannot
find any internal references to mutex_lock() in kernel/mutex.c, so
presumably this "inline" is now-useless legacy.

Cc: Eric Paris <eparis@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <tip-3a2d367d9aabac486ac4444c6c7ec7a1dab16267@git.kernel.org>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 kernel/mutex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a30..fd95eaa672e6c 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);
  *
  * This function is similar to (but not equivalent to) down().
  */
-void inline __sched mutex_lock(struct mutex *lock)
+void __sched mutex_lock(struct mutex *lock)
 {
 	might_sleep();
 	/*
-- 
GitLab


From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:10 +0100
Subject: [PATCH 0195/2198] perf_counter: new output ABI - part 1

Impact: Rework the perfcounter output ABI

use sys_read() only for instant data and provide mmap() output for all
async overflow data.

The first mmap() determines the size of the output buffer. The mmap()
size must be a PAGE_SIZE multiple of 1+pages, where pages must be a
power of 2 or 0. Further mmap()s of the same fd must have the same
size. Once all maps are gone, you can again mmap() with a new size.

In case of 0 extra pages there is no data output and the first page
only contains meta data.

When there are data pages, a poll() event will be generated for each
full page of data. Furthermore, the output is circular. This means
that although 1 page is a valid configuration, its useless, since
we'll start overwriting it the instant we report a full page.

Future work will focus on the output format (currently maintained)
where we'll likey want each entry denoted by a header which includes a
type and length.

Further future work will allow to splice() the fd, also containing the
async overflow data -- splice() would be mutually exclusive with
mmap() of the data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.470536358@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |   9 +-
 include/linux/perf_counter.h       |  36 +--
 kernel/perf_counter.c              | 464 +++++++++++++++--------------
 3 files changed, 263 insertions(+), 246 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index e4349281b07de..d48596ab65577 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable)
 		atomic64_set(&counter->hw.prev_count, val);
 		counter->hw.idx = hwc_index[i] + 1;
 		write_pmc(counter->hw.idx, val);
-		if (counter->user_page)
-			perf_counter_update_userpage(counter);
+		perf_counter_update_userpage(counter);
 	}
 	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
@@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter)
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
-			if (counter->user_page)
-				perf_counter_update_userpage(counter);
+			perf_counter_update_userpage(counter);
 			break;
 		}
 	}
@@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	write_pmc(counter->hw.idx, val);
 	atomic64_set(&counter->hw.prev_count, val);
 	atomic64_set(&counter->hw.period_left, left);
-	if (counter->user_page)
-		perf_counter_update_userpage(counter);
+	perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 40b324e91bf68..2b5e66d5ebdf0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -152,6 +152,8 @@ struct perf_counter_mmap_page {
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
 	__s64	offset;			/* add to hardware counter value */
+
+	__u32   data_head;		/* head in the data section */
 };
 
 #ifdef __KERNEL__
@@ -218,21 +220,6 @@ struct hw_perf_counter {
 #endif
 };
 
-/*
- * Hardcoded buffer length limit for now, for IRQ-fed events:
- */
-#define PERF_DATA_BUFLEN		2048
-
-/**
- * struct perf_data - performance counter IRQ data sampling ...
- */
-struct perf_data {
-	int				len;
-	int				rd_idx;
-	int				overrun;
-	u8				data[PERF_DATA_BUFLEN];
-};
-
 struct perf_counter;
 
 /**
@@ -256,6 +243,14 @@ enum perf_counter_active_state {
 
 struct file;
 
+struct perf_mmap_data {
+	struct rcu_head			rcu_head;
+	int				nr_pages;
+	atomic_t			head;
+	struct perf_counter_mmap_page   *user_page;
+	void 				*data_pages[0];
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -289,16 +284,15 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
-	/* pointer to page shared with userspace via mmap */
-	unsigned long			user_page;
+	/* mmap bits */
+	struct mutex			mmap_mutex;
+	atomic_t			mmap_count;
+	struct perf_mmap_data		*data;
 
-	/* read() / irq related data */
+	/* poll related */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
 	int				wakeup_pending;
-	struct perf_data		*irqdata;
-	struct perf_data		*usrdata;
-	struct perf_data		data[2];
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d9cfd902140e0..0dfe91094fd14 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,7 +4,8 @@
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *
- *  For licencing details see kernel-base/COPYING
+ *
+ *  For licensing details see kernel-base/COPYING
  */
 
 #include <linux/fs.h>
@@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	return atomic64_read(&counter->count);
 }
 
-/*
- * Cross CPU call to switch performance data pointers
- */
-static void __perf_switch_irq_data(void *info)
-{
-	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-	struct perf_counter *counter = info;
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-
-	/*
-	 * If this is a task context, we need to check whether it is
-	 * the current task context of this cpu. If not it has been
-	 * scheduled out before the smp call arrived.
-	 */
-	if (ctx->task) {
-		if (cpuctx->task_ctx != ctx)
-			return;
-		spin_lock(&ctx->lock);
-	}
-
-	/* Change the pointer NMI safe */
-	atomic_long_set((atomic_long_t *)&counter->irqdata,
-			(unsigned long) counter->usrdata);
-	counter->usrdata = oldirqdata;
-
-	if (ctx->task)
-		spin_unlock(&ctx->lock);
-}
-
-static struct perf_data *perf_switch_irq_data(struct perf_counter *counter)
-{
-	struct perf_counter_context *ctx = counter->ctx;
-	struct perf_data *oldirqdata = counter->irqdata;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		smp_call_function_single(counter->cpu,
-					 __perf_switch_irq_data,
-					 counter, 1);
-		return counter->usrdata;
-	}
-
-retry:
-	spin_lock_irq(&ctx->lock);
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-		counter->irqdata = counter->usrdata;
-		counter->usrdata = oldirqdata;
-		spin_unlock_irq(&ctx->lock);
-		return oldirqdata;
-	}
-	spin_unlock_irq(&ctx->lock);
-	task_oncpu_function_call(task, __perf_switch_irq_data, counter);
-	/* Might have failed, because task was scheduled out */
-	if (counter->irqdata == oldirqdata)
-		goto retry;
-
-	return counter->usrdata;
-}
-
 static void put_context(struct perf_counter_context *ctx)
 {
 	if (ctx->task)
@@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
-	free_page(counter->user_page);
 	free_counter(counter);
 	put_context(ctx);
 
@@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
 	u64 cntval;
 
-	if (count != sizeof(cntval))
+	if (count < sizeof(cntval))
 		return -EINVAL;
 
 	/*
@@ -1210,122 +1150,21 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
 }
 
-static ssize_t
-perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count)
-{
-	if (!usrdata->len)
-		return 0;
-
-	count = min(count, (size_t)usrdata->len);
-	if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count))
-		return -EFAULT;
-
-	/* Adjust the counters */
-	usrdata->len -= count;
-	if (!usrdata->len)
-		usrdata->rd_idx = 0;
-	else
-		usrdata->rd_idx += count;
-
-	return count;
-}
-
-static ssize_t
-perf_read_irq_data(struct perf_counter	*counter,
-		   char __user		*buf,
-		   size_t		count,
-		   int			nonblocking)
-{
-	struct perf_data *irqdata, *usrdata;
-	DECLARE_WAITQUEUE(wait, current);
-	ssize_t res, res2;
-
-	irqdata = counter->irqdata;
-	usrdata = counter->usrdata;
-
-	if (usrdata->len + irqdata->len >= count)
-		goto read_pending;
-
-	if (nonblocking)
-		return -EAGAIN;
-
-	spin_lock_irq(&counter->waitq.lock);
-	__add_wait_queue(&counter->waitq, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (usrdata->len + irqdata->len >= count)
-			break;
-
-		if (signal_pending(current))
-			break;
-
-		if (counter->state == PERF_COUNTER_STATE_ERROR)
-			break;
-
-		spin_unlock_irq(&counter->waitq.lock);
-		schedule();
-		spin_lock_irq(&counter->waitq.lock);
-	}
-	__remove_wait_queue(&counter->waitq, &wait);
-	__set_current_state(TASK_RUNNING);
-	spin_unlock_irq(&counter->waitq.lock);
-
-	if (usrdata->len + irqdata->len < count &&
-	    counter->state != PERF_COUNTER_STATE_ERROR)
-		return -ERESTARTSYS;
-read_pending:
-	mutex_lock(&counter->mutex);
-
-	/* Drain pending data first: */
-	res = perf_copy_usrdata(usrdata, buf, count);
-	if (res < 0 || res == count)
-		goto out;
-
-	/* Switch irq buffer: */
-	usrdata = perf_switch_irq_data(counter);
-	res2 = perf_copy_usrdata(usrdata, buf + res, count - res);
-	if (res2 < 0) {
-		if (!res)
-			res = -EFAULT;
-	} else {
-		res += res2;
-	}
-out:
-	mutex_unlock(&counter->mutex);
-
-	return res;
-}
-
 static ssize_t
 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 	struct perf_counter *counter = file->private_data;
 
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return perf_read_hw(counter, buf, count);
-
-	case PERF_RECORD_IRQ:
-	case PERF_RECORD_GROUP:
-		return perf_read_irq_data(counter, buf, count,
-					  file->f_flags & O_NONBLOCK);
-	}
-	return -EINVAL;
+	return perf_read_hw(counter, buf, count);
 }
 
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned int events = 0;
-	unsigned long flags;
+	unsigned int events = POLLIN;
 
 	poll_wait(file, &counter->waitq, wait);
 
-	spin_lock_irqsave(&counter->waitq.lock, flags);
-	if (counter->usrdata->len || counter->irqdata->len)
-		events |= POLLIN;
-	spin_unlock_irqrestore(&counter->waitq.lock, flags);
-
 	return events;
 }
 
@@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-void perf_counter_update_userpage(struct perf_counter *counter)
+static void __perf_counter_update_userpage(struct perf_counter *counter,
+					   struct perf_mmap_data *data)
 {
-	struct perf_counter_mmap_page *userpg;
-
-	if (!counter->user_page)
-		return;
-	userpg = (struct perf_counter_mmap_page *) counter->user_page;
+	struct perf_counter_mmap_page *userpg = data->user_page;
 
+	/*
+	 * Disable preemption so as to not let the corresponding user-space
+	 * spin too long if we get preempted.
+	 */
+	preempt_disable();
 	++userpg->lock;
 	smp_wmb();
 	userpg->index = counter->hw.idx;
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
+
+	userpg->data_head = atomic_read(&data->head);
 	smp_wmb();
 	++userpg->lock;
+	preempt_enable();
+}
+
+void perf_counter_update_userpage(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		__perf_counter_update_userpage(counter, data);
+	rcu_read_unlock();
 }
 
 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;
 
-	if (!counter->user_page)
-		return VM_FAULT_SIGBUS;
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff == 0) {
+		vmf->page = virt_to_page(data->user_page);
+	} else {
+		int nr = vmf->pgoff - 1;
 
-	vmf->page = virt_to_page(counter->user_page);
+		if ((unsigned)nr > data->nr_pages)
+			goto unlock;
+
+		vmf->page = virt_to_page(data->data_pages[nr]);
+	}
 	get_page(vmf->page);
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	int i;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += nr_pages * sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	data->user_page = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!data->user_page)
+		goto fail_user_page;
+
+	for (i = 0; i < nr_pages; i++) {
+		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
+		if (!data->data_pages[i])
+			goto fail_data_pages;
+	}
+
+	data->nr_pages = nr_pages;
+
+	rcu_assign_pointer(counter->data, data);
+
 	return 0;
+
+fail_data_pages:
+	for (i--; i >= 0; i--)
+		free_page((unsigned long)data->data_pages[i]);
+
+	free_page((unsigned long)data->user_page);
+
+fail_user_page:
+	kfree(data);
+
+fail:
+	return -ENOMEM;
+}
+
+static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data = container_of(rcu_head,
+			struct perf_mmap_data, rcu_head);
+	int i;
+
+	free_page((unsigned long)data->user_page);
+	for (i = 0; i < data->nr_pages; i++)
+		free_page((unsigned long)data->data_pages[i]);
+	kfree(data);
+}
+
+static void perf_mmap_data_free(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data = counter->data;
+
+	WARN_ON(atomic_read(&counter->mmap_count));
+
+	rcu_assign_pointer(counter->data, NULL);
+	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+}
+
+static void perf_mmap_open(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	atomic_inc(&counter->mmap_count);
+}
+
+static void perf_mmap_close(struct vm_area_struct *vma)
+{
+	struct perf_counter *counter = vma->vm_file->private_data;
+
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
+				      &counter->mmap_mutex)) {
+		perf_mmap_data_free(counter);
+		mutex_unlock(&counter->mmap_mutex);
+	}
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
+	.open = perf_mmap_open,
+	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
 };
 
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned long userpg;
+	unsigned long vma_size;
+	unsigned long nr_pages;
+	unsigned long locked, lock_limit;
+	int ret = 0;
 
 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
-	if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+
+	vma_size = vma->vm_end - vma->vm_start;
+	nr_pages = (vma_size / PAGE_SIZE) - 1;
+
+	if (nr_pages == 0 || !is_power_of_2(nr_pages))
 		return -EINVAL;
 
-	/*
-	 * For now, restrict to the case of a hardware counter
-	 * on the current task.
-	 */
-	if (is_software_counter(counter) || counter->task != current)
+	if (vma_size != PAGE_SIZE * (1 + nr_pages))
 		return -EINVAL;
 
-	userpg = counter->user_page;
-	if (!userpg) {
-		userpg = get_zeroed_page(GFP_KERNEL);
-		mutex_lock(&counter->mutex);
-		if (counter->user_page) {
-			free_page(userpg);
-			userpg = counter->user_page;
-		} else {
-			counter->user_page = userpg;
-		}
-		mutex_unlock(&counter->mutex);
-		if (!userpg)
-			return -ENOMEM;
-	}
+	if (vma->vm_pgoff != 0)
+		return -EINVAL;
+
+	locked = vma_size >>  PAGE_SHIFT;
+	locked += vma->vm_mm->locked_vm;
 
-	perf_counter_update_userpage(counter);
+	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
+	lock_limit >>= PAGE_SHIFT;
+
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
+		return -EPERM;
+
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count))
+		goto out;
+
+	WARN_ON(counter->data);
+	ret = perf_mmap_data_alloc(counter, nr_pages);
+	if (!ret)
+		atomic_set(&counter->mmap_count, 1);
+out:
+	mutex_unlock(&counter->mmap_mutex);
 
 	vma->vm_flags &= ~VM_MAYWRITE;
 	vma->vm_flags |= VM_RESERVED;
 	vma->vm_ops = &perf_mmap_vmops;
-	return 0;
+
+	return ret;
 }
 
 static const struct file_operations perf_fops = {
@@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = {
  * Output
  */
 
-static void perf_counter_store_irq(struct perf_counter *counter, u64 data)
+static int perf_output_write(struct perf_counter *counter, int nmi,
+			     void *buf, ssize_t size)
 {
-	struct perf_data *irqdata = counter->irqdata;
+	struct perf_mmap_data *data;
+	unsigned int offset, head, nr;
+	unsigned int len;
+	int ret, wakeup;
 
-	if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) {
-		irqdata->overrun++;
-	} else {
-		u64 *p = (u64 *) &irqdata->data[irqdata->len];
+	rcu_read_lock();
+	ret = -ENOSPC;
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto out;
+
+	if (!data->nr_pages)
+		goto out;
+
+	ret = -EINVAL;
+	if (size > PAGE_SIZE)
+		goto out;
+
+	do {
+		offset = head = atomic_read(&data->head);
+		head += sizeof(u64);
+	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
+
+	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
 
-		*p = data;
-		irqdata->len += sizeof(u64);
+	nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
+	offset &= PAGE_SIZE - 1;
+
+	len = min_t(unsigned int, PAGE_SIZE - offset, size);
+	memcpy(data->data_pages[nr] + offset, buf, len);
+	size -= len;
+
+	if (size) {
+		nr = (nr + 1) & (data->nr_pages - 1);
+		memcpy(data->data_pages[nr], buf + len, size);
+	}
+
+	/*
+	 * generate a poll() wakeup for every page boundary crossed
+	 */
+	if (wakeup) {
+		__perf_counter_update_userpage(counter, data);
+		if (nmi) {
+			counter->wakeup_pending = 1;
+			set_perf_counter_pending();
+		} else
+			wake_up(&counter->waitq);
 	}
+	ret = 0;
+out:
+	rcu_read_unlock();
+
+	return ret;
 }
 
-static void perf_counter_handle_group(struct perf_counter *counter)
+static void perf_output_simple(struct perf_counter *counter,
+			       int nmi, struct pt_regs *regs)
+{
+	u64 entry;
+
+	entry = instruction_pointer(regs);
+
+	perf_output_write(counter, nmi, &entry, sizeof(entry));
+}
+
+struct group_entry {
+	u64 event;
+	u64 counter;
+};
+
+static void perf_output_group(struct perf_counter *counter, int nmi)
 {
 	struct perf_counter *leader, *sub;
 
 	leader = counter->group_leader;
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+		struct group_entry entry;
+
 		if (sub != counter)
 			sub->hw_ops->read(sub);
-		perf_counter_store_irq(counter, sub->hw_event.config);
-		perf_counter_store_irq(counter, atomic64_read(&sub->count));
+
+		entry.event = sub->hw_event.config;
+		entry.counter = atomic64_read(&sub->count);
+
+		perf_output_write(counter, nmi, &entry, sizeof(entry));
 	}
 }
 
@@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter,
 		return;
 
 	case PERF_RECORD_IRQ:
-		perf_counter_store_irq(counter, instruction_pointer(regs));
+		perf_output_simple(counter, nmi, regs);
 		break;
 
 	case PERF_RECORD_GROUP:
-		perf_counter_handle_group(counter);
+		perf_output_group(counter, nmi);
 		break;
 	}
-
-	if (nmi) {
-		counter->wakeup_pending = 1;
-		set_perf_counter_pending();
-	} else
-		wake_up(&counter->waitq);
 }
 
 /*
@@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	INIT_LIST_HEAD(&counter->sibling_list);
 	init_waitqueue_head(&counter->waitq);
 
+	mutex_init(&counter->mmap_mutex);
+
 	INIT_LIST_HEAD(&counter->child_list);
 
-	counter->irqdata		= &counter->data[0];
-	counter->usrdata		= &counter->data[1];
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->wakeup_pending		= 0;
-- 
GitLab


From 803d4f3980f6e220b27311a283aab0a4d68b6709 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:11 +0100
Subject: [PATCH 0196/2198] perf_counter tools: update to new syscall ABI

update the kerneltop userspace to work with the latest syscall ABI

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.559643732@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 235 +++++++++++++++++--------
 1 file changed, 157 insertions(+), 78 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 81a68aac137f7..a72c9bd28071b 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -87,20 +87,90 @@
 
 #include <linux/unistd.h>
 
-#include "perfcounters.h"
+#include "include/linux/perf_counter.h"
 
 
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+typedef unsigned int            __u32;
+typedef unsigned long long      __u64;
+typedef long long               __s64;
+
+
+#ifdef __x86_64__
+# define __NR_perf_counter_open 295
+#endif
+
+#ifdef __i386__
+# define __NR_perf_counter_open 333
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#endif
+
+asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags)
+{
+        int ret;
+
+        ret = syscall(
+                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+#if defined(__x86_64__) || defined(__i386__)
+        if (ret < 0 && ret > -4096) {
+                errno = -ret;
+                ret = -1;
+        }
+#endif
+        return ret;
+}
+
 #define MAX_COUNTERS			64
 #define MAX_NR_CPUS			256
 
-#define DEF_PERFSTAT_EVENTS		{ -2, -5, -4, -3, 0, 1, 2, 3}
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
 static int			run_perfstat			=  0;
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
-static __s64			event_id[MAX_COUNTERS]		= DEF_PERFSTAT_EVENTS;
-static int			event_raw[MAX_COUNTERS];
+static __u64			event_id[MAX_COUNTERS]		= {
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
@@ -156,49 +226,63 @@ static char *sw_event_names[] = {
 	"pagefaults",
 	"context switches",
 	"CPU migrations",
+	"minor faults",
+	"major faults",
 };
 
 struct event_symbol {
-	int event;
+	__u64 event;
 	char *symbol;
 };
 
 static struct event_symbol event_symbols[] = {
-	{PERF_COUNT_CPU_CYCLES,			"cpu-cycles",		},
-	{PERF_COUNT_CPU_CYCLES,			"cycles",		},
-	{PERF_COUNT_INSTRUCTIONS,		"instructions",		},
-	{PERF_COUNT_CACHE_REFERENCES,		"cache-references",	},
-	{PERF_COUNT_CACHE_MISSES,		"cache-misses",		},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branch-instructions",	},
-	{PERF_COUNT_BRANCH_INSTRUCTIONS,	"branches",		},
-	{PERF_COUNT_BRANCH_MISSES,		"branch-misses",	},
-	{PERF_COUNT_BUS_CYCLES,			"bus-cycles",		},
-	{PERF_COUNT_CPU_CLOCK,			"cpu-ticks",		},
-	{PERF_COUNT_CPU_CLOCK,			"ticks",		},
-	{PERF_COUNT_TASK_CLOCK,			"task-ticks",		},
-	{PERF_COUNT_PAGE_FAULTS,		"page-faults",		},
-	{PERF_COUNT_PAGE_FAULTS,		"faults",		},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"context-switches",	},
-	{PERF_COUNT_CONTEXT_SWITCHES,		"cs",			},
-	{PERF_COUNT_CPU_MIGRATIONS,		"cpu-migrations",	},
-	{PERF_COUNT_CPU_MIGRATIONS,		"migrations",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
 };
 
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
 static void display_events_help(void)
 {
 	unsigned int i;
-	int e;
+	__u64 e;
 
 	printf(
 	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
 
-	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
-		if (e != event_symbols[i].event) {
-			e = event_symbols[i].event;
-			printf(
-	"\n                             %2d: %-20s", e, event_symbols[i].symbol);
-		} else
-			printf(" %s", event_symbols[i].symbol);
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
 	}
 
 	printf("\n"
@@ -249,44 +333,51 @@ static void display_help(void)
 	exit(0);
 }
 
-static int type_valid(int type)
-{
-	if (type >= PERF_HW_EVENTS_MAX)
-		return 0;
-	if (type <= PERF_SW_EVENTS_MIN)
-		return 0;
-
-	return 1;
-}
-
 static char *event_name(int ctr)
 {
-	__s64 type = event_id[ctr];
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
 	static char buf[32];
 
-	if (event_raw[ctr]) {
-		sprintf(buf, "raw 0x%llx", (long long)type);
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
 		return buf;
 	}
-	if (!type_valid(type))
-		return "unknown";
 
-	if (type >= 0)
-		return hw_event_names[type];
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
 
-	return sw_event_names[-type-1];
+	default:
+		break;
+	}
+
+	return "unknown";
 }
 
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static int match_event_symbols(char *str)
+static __u64 match_event_symbols(char *str)
 {
+	__u64 config, id;
+	int type;
 	unsigned int i;
 
-	if (isdigit(str[0]) || str[0] == '-')
-		return atoi(str);
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
@@ -294,31 +385,22 @@ static int match_event_symbols(char *str)
 			return event_symbols[i].event;
 	}
 
-	return PERF_HW_EVENTS_MAX;
+	return ~0ULL;
 }
 
 static int parse_events(char *str)
 {
-	__s64 type;
-	int raw;
+	__u64 config;
 
 again:
 	if (nr_counters == MAX_COUNTERS)
 		return -1;
 
-	raw = 0;
-	if (*str == 'r') {
-		raw = 1;
-		++str;
-		type = strtol(str, NULL, 16);
-	} else {
-		type = match_event_symbols(str);
-		if (!type_valid(type))
-			return -1;
-	}
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
 
-	event_id[nr_counters] = type;
-	event_raw[nr_counters] = raw;
+	event_id[nr_counters] = config;
 	nr_counters++;
 
 	str = strstr(str, ",");
@@ -342,8 +424,7 @@ static void create_perfstat_counter(int counter)
 	struct perf_counter_hw_event hw_event;
 
 	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.type		= event_id[counter];
-	hw_event.raw		= event_raw[counter];
+	hw_event.config		= event_id[counter];
 	hw_event.record_type	= PERF_RECORD_SIMPLE;
 	hw_event.nmi		= 0;
 
@@ -428,7 +509,7 @@ int do_perfstat(int argc, char *argv[])
 			count += single_count;
 		}
 
-		if (!event_raw[counter] &&
+		if (!PERF_COUNTER_RAW(event_id[counter]) &&
 		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
 		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
 
@@ -911,7 +992,7 @@ static void record_ip(uint64_t ip, int counter)
 		assert(left <= middle && middle <= right);
 		if (!(left <= ip && ip <= right)) {
 			printf(" left: %016lx\n", left);
-			printf("   ip: %016lx\n", ip);
+			printf("   ip: %016llx\n", ip);
 			printf("right: %016lx\n", right);
 		}
 		assert(left <= ip && ip <= right);
@@ -983,7 +1064,7 @@ static void process_options(int argc, char *argv[])
 
 		switch (c) {
 		case 'a': system_wide			=	       1; break;
-		case 'c': event_count[nr_counters]	=   atoi(optarg); break;
+		case 'c': default_interval		=   atoi(optarg); break;
 		case 'C':
 			/* CPU and PID are mutually exclusive */
 			if (tid != -1) {
@@ -1032,10 +1113,7 @@ static void process_options(int argc, char *argv[])
 		if (event_count[counter])
 			continue;
 
-		if (event_id[counter] < PERF_HW_EVENTS_MAX)
-			event_count[counter] = default_count[event_id[counter]];
-		else
-			event_count[counter] = 100000;
+		event_count[counter] = default_interval;
 	}
 }
 
@@ -1070,12 +1148,13 @@ int main(int argc, char *argv[])
 				cpu = i;
 
 			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.type		= event_id[counter];
-			hw_event.raw		= event_raw[counter];
+			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
 
+			printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
+
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 			if (fd[i][counter] < 0) {
-- 
GitLab


From bcbcb37cdb67d8100acfa66df40c4d636c28c4d1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 23 Mar 2009 18:22:12 +0100
Subject: [PATCH 0197/2198] perf_counter tools: use mmap() output

update kerneltop to use the mmap() output to gather overflow information

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 93 ++++++++++++++++++++++----
 1 file changed, 79 insertions(+), 14 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index a72c9bd28071b..80b790553ec81 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -84,6 +84,7 @@
 #include <sys/prctl.h>
 #include <sys/wait.h>
 #include <sys/uio.h>
+#include <sys/mman.h>
 
 #include <linux/unistd.h>
 
@@ -119,17 +120,25 @@ typedef long long               __s64;
 
 
 #ifdef __x86_64__
-# define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-# define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __powerpc__
 #define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
 #endif
 
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+
 asmlinkage int sys_perf_counter_open(
         struct perf_counter_hw_event    *hw_event_uptr          __user,
         pid_t                           pid,
@@ -181,6 +190,7 @@ static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static int			nmi				=  1;
 static int			group				=  0;
+static unsigned int		page_size;
 
 static char			*vmlinux;
 
@@ -1117,16 +1127,68 @@ static void process_options(int argc, char *argv[])
 	}
 }
 
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	unsigned int seq, head;
+
+repeat:
+	rmb();
+	seq = pc->lock;
+
+	if (unlikely(seq & 1)) {
+		cpu_relax();
+		goto repeat;
+	}
+
+	head = pc->data_head;
+
+	rmb();
+	if (pc->lock != seq)
+		goto repeat;
+
+	return head;
+}
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+
+	if (head - old > md->mask) {
+		printf("ERROR: failed to keep up with mmap data\n");
+		exit(-1);
+	}
+
+	for (; old != head;) {
+		__u64 *ptr = (__u64 *)&data[old & md->mask];
+		old += sizeof(__u64);
+
+		process_event(*ptr, md->counter);
+	}
+
+	md->prev = old;
+}
+
 int main(int argc, char *argv[])
 {
 	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
 	int i, counter, group_fd;
 	unsigned int cpu;
-	uint64_t ip;
-	ssize_t res;
 	int ret;
 
+	page_size = sysconf(_SC_PAGE_SIZE);
+
 	process_options(argc, argv);
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -1153,8 +1215,6 @@ int main(int argc, char *argv[])
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
 
-			printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]);
-
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 			if (fd[i][counter] < 0) {
@@ -1174,6 +1234,17 @@ int main(int argc, char *argv[])
 
 			event_array[i][counter].fd = fd[i][counter];
 			event_array[i][counter].events = POLLIN;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = 2*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
 		}
 	}
 
@@ -1188,14 +1259,8 @@ int main(int argc, char *argv[])
 		int hits = events;
 
 		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++) {
-				res = read(fd[i][counter], (char *) &ip, sizeof(ip));
-				if (res > 0) {
-					assert(res == sizeof(ip));
-
-					process_event(ip, counter);
-				}
-			}
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
 		}
 
 		if (time(NULL) >= last_refresh + delay_secs) {
-- 
GitLab


From 383c5f8cd7d253be0017d8d5a39cbb78aaf70206 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 21:49:25 +0100
Subject: [PATCH 0198/2198] perf_counter tools: tidy up in-kernel dependencies

Remove now unified perfstat.c and perf_counter.h, and link to the
in-kernel perf_counter.h.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile       |   2 +-
 Documentation/perf_counter/kerneltop.c    |   2 +-
 Documentation/perf_counter/perfcounters.h | 142 ------------
 Documentation/perf_counter/perfstat.c     | 251 ----------------------
 4 files changed, 2 insertions(+), 395 deletions(-)
 delete mode 100644 Documentation/perf_counter/perfcounters.h
 delete mode 100644 Documentation/perf_counter/perfstat.c

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index b45749753fcb7..666da95a78774 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -2,7 +2,7 @@ BINS = kerneltop perfstat
 
 all: $(BINS)
 
-kerneltop: kerneltop.c perfcounters.h
+kerneltop: kerneltop.c ../../include/linux/perf_counter.h
 	cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
 
 perfstat: kerneltop
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 80b790553ec81..25e80bc44556c 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -88,7 +88,7 @@
 
 #include <linux/unistd.h>
 
-#include "include/linux/perf_counter.h"
+#include "../../include/linux/perf_counter.h"
 
 
 /*
diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h
deleted file mode 100644
index 32e24b9154ab6..0000000000000
--- a/Documentation/perf_counter/perfcounters.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Ioctls that can be done on a perf counter fd:
- */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE	31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()					\
-({							\
-	struct timespec ts;				\
-							\
-	clock_gettime(CLOCK_MONOTONIC, &ts);		\
-	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-typedef unsigned int		__u32;
-typedef unsigned long long	__u64;
-typedef long long		__s64;
-
-/*
- * User-space ABI bits:
- */
-
-/*
- * Generalized performance counter event types, used by the hw_event.type
- * parameter of the sys_perf_counter_open() syscall:
- */
-enum hw_event_types {
-	/*
-	 * Common hardware events, generalized by the kernel:
-	 */
-	PERF_COUNT_CPU_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-	PERF_COUNT_BUS_CYCLES		=  6,
-
-	PERF_HW_EVENTS_MAX		=  7,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	PERF_COUNT_PAGE_FAULTS		= -3,
-	PERF_COUNT_CONTEXT_SWITCHES	= -4,
-	PERF_COUNT_CPU_MIGRATIONS	= -5,
-
-	PERF_SW_EVENTS_MIN		= -6,
-};
-
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
-};
-
-/*
- * Hardware event to monitor via a performance monitoring counter:
- */
-struct perf_counter_hw_event {
-	__s64			type;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				raw	       :  1, /* raw event type        */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 54;
-
-	__u32			extra_config_len;
-	__u32			__reserved_4;
-
-	__u64			__reserved_2;
-	__u64			__reserved_3;
-};
-
-
-#ifdef __x86_64__
-# define __NR_perf_counter_open	295
-#endif
-
-#ifdef __i386__
-# define __NR_perf_counter_open 333
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#endif
-
-asmlinkage int sys_perf_counter_open(
-
-	struct perf_counter_hw_event	*hw_event_uptr		__user,
-	pid_t				pid,
-	int				cpu,
-	int				group_fd,
-	unsigned long			flags)
-{
-	int ret;
-
-	ret = syscall(
-		__NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-	if (ret < 0 && ret > -4096) {
-		errno = -ret;
-		ret = -1;
-	}
-#endif
-	return ret;
-}
diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c
deleted file mode 100644
index fd594468e655b..0000000000000
--- a/Documentation/perf_counter/perfstat.c
+++ /dev/null
@@ -1,251 +0,0 @@
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
- *
- *        It summarizes the counter events of all tasks (and child tasks),
- *        covering all CPUs that the command (or workload) executes on.
- *        It only counts the per-task events of the workload started,
- *        independent of how many other tasks run on those CPUs.
- *
- * Build with:       cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c
- *
- * Sample output:
- *
-
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
-   Performance counter stats for 'ls':
-
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
-
- *
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
- *
- * Released under the GPLv2 (not later).
- *
- * Percpu counter support by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
- * Symbolic event options by: Wu Fengguang <fengguang.wu@intel.com>
- */
-#define _GNU_SOURCE
-
-#include <assert.h>
-#include <getopt.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <errno.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <time.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/prctl.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-
-#include <linux/unistd.h>
-
-#include "perfcounters.h"
-
-static int			nr_cpus			= 0;
-
-static int			system_wide		= 0;
-
-static void display_help(void)
-{
-	unsigned int i;
-	int e;
-
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-	printf(
-	" -e EVENT     --event=EVENT        #  symbolic-name        abbreviations");
-
-	for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) {
-		if (e != event_symbols[i].event) {
-			e = event_symbols[i].event;
-			printf(
-	"\n                                  %2d: %-20s", e, event_symbols[i].symbol);
-		} else
-			printf(" %s", event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                                rNNN: raw event type\n\n"
-	" -s                                # system-wide collection\n\n"
-	" -c <cmd..>   --command=<cmd..>    # command+arguments to be timed.\n"
-	"\n");
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"event",	required_argument,	NULL, 'e'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"command",	no_argument,		NULL, 'c'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:e:c:s",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'c':
-			break;
-		case 's':
-			system_wide = 1;
-			break;
-		case 'e':
-			parse_events(optarg);
-			break;
-		default:
-			break;
-		}
-	}
-	if (optind == argc)
-		goto err;
-
-	if (!nr_counters)
-		nr_counters = 8;
-	return;
-
-err:
-	display_help();
-}
-
-char fault_here[1000000];
-
-static int fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static void create_counter(int counter)
-{
-	struct perf_counter_hw_event hw_event;
-
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.type		= event_id[counter];
-	hw_event.raw		= event_raw[counter];
-	hw_event.record_type	= PERF_RECORD_SIMPLE;
-	hw_event.nmi		= 0;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-			
-		}
-	} else {
-		hw_event.inherit	= 1;
-		hw_event.disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-
-int main(int argc, char *argv[])
-{
-	unsigned long long t0, t1;
-	int counter;
-	ssize_t res;
-	int status;
-	int pid;
-
-	process_options(argc, argv);
-
-	if (system_wide) {
-		nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-		assert(nr_cpus <= MAX_NR_CPUS);
-		assert(nr_cpus >= 0);
-	} else
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_counter(counter);
-
-	argc -= optind;
-	argv += optind;
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-	while (wait(&status) >= 0)
-		;
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu;
-		__u64 count, single_count;
-
-		count = 0;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-					(char *) &single_count, sizeof(single_count));
-			assert(res == sizeof(single_count));
-			count += single_count;
-		}
-
-		if (!event_raw[counter] &&
-		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
-		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
-
-			double msecs = (double)count / 1000000;
-
-			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)\n",
-				count, event_name(counter));
-		}
-		if (!counter)
-			fprintf(stderr, "\n");
-	}
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-- 
GitLab


From 193e8df1b465f206f8a286202680702e1c153367 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 22:23:16 +0100
Subject: [PATCH 0199/2198] perf_counter tools: fix build warning in
 kerneltop.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix:

 kerneltop.c: In function ‘record_ip’:
 kerneltop.c:1005: warning: format ‘%016llx’ expects type ‘long long unsigned int’, but argument 2 has type ‘uint64_t’

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090323172417.677932499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 25e80bc44556c..8f9a303f28d8d 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1002,7 +1002,7 @@ static void record_ip(uint64_t ip, int counter)
 		assert(left <= middle && middle <= right);
 		if (!(left <= ip && ip <= right)) {
 			printf(" left: %016lx\n", left);
-			printf("   ip: %016llx\n", ip);
+			printf("   ip: %016lx\n", (unsigned long)ip);
 			printf("right: %016lx\n", right);
 		}
 		assert(left <= ip && ip <= right);
-- 
GitLab


From 81cdbe0509542324ad7d3282ab67c2b6716df663 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 22:29:50 +0100
Subject: [PATCH 0200/2198] perf_counter tools: increase cpu-cycles again

Commit b7368fdd7d decreased the CPU cycles interval 100-fold, but
this is causig kerneltop failures on my Nehalem box:

 aldebaran:/home/mingo/linux/linux/Documentation/perf_counter>
 ./kerneltop
 KernelTop refresh period: 2 seconds
 ERROR: failed to keep up with mmap data

10,000 cycles is way too short.

What we should do instead on mostly-idle systems is some sort of
read/poll timeout, so that we display something every 2 seconds
for sure.

Cc: Wu Fengguang <fengguang.wu@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 8f9a303f28d8d..2ab29b5e32e8d 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -212,7 +212,7 @@ struct source_line {
 
 
 const unsigned int default_count[] = {
-	  10000,
+	1000000,
 	1000000,
 	  10000,
 	  10000,
-- 
GitLab


From cbe46555dc4de6403cd757139d42289b5f21abb9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 24 Mar 2009 16:52:34 +1100
Subject: [PATCH 0201/2198] perf_counter tools: remove glib dependency and fix
 bugs in kerneltop.c

The glib dependency in kerneltop.c is only for a little bit of list
manipulation, and I find it inconvenient.  This adds a 'next' field to
struct source_line, which lets us link them together into a list.  The
code to do the linking ourselves turns out to be no longer or more
difficult than using glib.

This also fixes a few other problems:

- We need to #include <limits.h> to get PATH_MAX on powerpc.

- We need to #include <linux/types.h> rather than have our own
  definitions of __u64 and __s64; on powerpc the installed headers
  define them to be unsigned long and long respectively, and if we
  have our own, different definition here that causes a compile error.

- This takes out the x86 setting of errno from -ret in
  sys_perf_counter_open.  My experiments on x86 indicate that the
  glibc syscall() does this for us already.

- We had two CPU migration counters in the default set, which seems
  unnecessary; I changed one of them to a context switch counter.

- In perfstat mode we were printing CPU cycles and instructions as
  milliseconds, and the cpu clock and task clock counters as events.
  This fixes that.

- In perfstat mode we were still printing a blank line after the first
  counter, which was a holdover from when a task clock counter was
  automatically included as the first counter.  This removes the blank
  line.

- On a test machine here, parse_symbols() and parse_vmlinux() were
  taking long enough (almost 0.5 seconds) for the mmap buffer to
  overflow before we got to the first mmap_read() call, so this moves
  them before we open all the counters.

- The error message if sys_perf_counter_open fails needs to use errno,
  not -fd[i][counter].

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Orig-LKML-Reference: <18888.29986.340328.540512@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile    |   2 +-
 Documentation/perf_counter/kerneltop.c | 112 ++++++++++---------------
 2 files changed, 46 insertions(+), 68 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 666da95a78774..194b662155886 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -3,7 +3,7 @@ BINS = kerneltop perfstat
 all: $(BINS)
 
 kerneltop: kerneltop.c ../../include/linux/perf_counter.h
-	cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $<
+	cc -O6 -Wall -lrt -o $@ $<
 
 perfstat: kerneltop
 	ln -sf kerneltop perfstat
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 2ab29b5e32e8d..ea13e4e672291 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -3,7 +3,7 @@
 
    Build with:
 
-     cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c
+     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
 
    Sample output:
 
@@ -56,6 +56,7 @@
   *   Yanmin Zhang <yanmin.zhang@intel.com>
   *   Wu Fengguang <fengguang.wu@intel.com>
   *   Mike Galbraith <efault@gmx.de>
+  *   Paul Mackerras <paulus@samba.org>
   *
   * Released under the GPL v2. (and only v2, not any later version)
   */
@@ -68,6 +69,7 @@
 #include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
+#include <limits.h>
 #include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
@@ -76,8 +78,6 @@
 #include <ctype.h>
 #include <time.h>
 
-#include <glib.h>
-
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
 #include <sys/poll.h>
@@ -87,6 +87,7 @@
 #include <sys/mman.h>
 
 #include <linux/unistd.h>
+#include <linux/types.h>
 
 #include "../../include/linux/perf_counter.h"
 
@@ -114,11 +115,6 @@
 #define __user
 #define asmlinkage
 
-typedef unsigned int            __u32;
-typedef unsigned long long      __u64;
-typedef long long               __s64;
-
-
 #ifdef __x86_64__
 #define __NR_perf_counter_open 295
 #define rmb()		asm volatile("lfence" ::: "memory")
@@ -146,17 +142,8 @@ asmlinkage int sys_perf_counter_open(
         int                             group_fd,
         unsigned long                   flags)
 {
-        int ret;
-
-        ret = syscall(
+        return syscall(
                 __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-#if defined(__x86_64__) || defined(__i386__)
-        if (ret < 0 && ret > -4096) {
-                errno = -ret;
-                ret = -1;
-        }
-#endif
-        return ret;
 }
 
 #define MAX_COUNTERS			64
@@ -170,7 +157,7 @@ static int			system_wide			=  0;
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= {
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
 
@@ -202,14 +189,15 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
-static GList			*lines;
-
 struct source_line {
 	uint64_t		EIP;
 	unsigned long		count;
 	char			*line;
+	struct source_line	*next;
 };
 
+static struct source_line	*lines;
+static struct source_line	**lines_tail;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -519,9 +507,8 @@ int do_perfstat(int argc, char *argv[])
 			count += single_count;
 		}
 
-		if (!PERF_COUNTER_RAW(event_id[counter]) &&
-		    (event_id[counter] == PERF_COUNT_CPU_CLOCK ||
-		     event_id[counter] == PERF_COUNT_TASK_CLOCK)) {
+		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
+		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
 
 			double msecs = (double)count / 1000000;
 
@@ -531,8 +518,6 @@ int do_perfstat(int argc, char *argv[])
 			fprintf(stderr, " %14Ld  %-20s (events)\n",
 				count, event_name(counter));
 		}
-		if (!counter)
-			fprintf(stderr, "\n");
 	}
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
@@ -554,7 +539,7 @@ struct sym_entry {
 	char			*sym;
 	unsigned long		count[MAX_COUNTERS];
 	int			skip;
-	GList			*source;
+	struct source_line	*source;
 };
 
 #define MAX_SYMS		100000
@@ -855,6 +840,7 @@ static void parse_vmlinux(char *filename)
 	if (!file)
 		return;
 
+	lines_tail = &lines;
 	while (!feof(file)) {
 		struct source_line *src;
 		size_t dummy = 0;
@@ -873,7 +859,9 @@ static void parse_vmlinux(char *filename)
 		if (c)
 			*c = 0;
 
-		lines = g_list_prepend(lines, src);
+		src->next = NULL;
+		*lines_tail = src;
+		lines_tail = &src->next;
 
 		if (strlen(src->line)>8 && src->line[8] == ':')
 			src->EIP = strtoull(src->line, NULL, 16);
@@ -881,52 +869,43 @@ static void parse_vmlinux(char *filename)
 			src->EIP = strtoull(src->line, NULL, 16);
 	}
 	pclose(file);
-	lines = g_list_reverse(lines);
 }
 
 static void record_precise_ip(uint64_t ip)
 {
 	struct source_line *line;
-	GList *item;
 
-	item = g_list_first(lines);
-	while (item) {
-		line = item->data;
+	for (line = lines; line; line = line->next) {
 		if (line->EIP == ip)
 			line->count++;
 		if (line->EIP > ip)
 			break;
-		item = g_list_next(item);
 	}
 }
 
 static void lookup_sym_in_vmlinux(struct sym_entry *sym)
 {
 	struct source_line *line;
-	GList *item;
 	char pattern[PATH_MAX];
 	sprintf(pattern, "<%s>:", sym->sym);
 
-	item = g_list_first(lines);
-	while (item) {
-		line = item->data;
+	for (line = lines; line; line = line->next) {
 		if (strstr(line->line, pattern)) {
-			sym->source = item;
+			sym->source = line;
 			break;
 		}
-		item = g_list_next(item);
 	}
 }
 
-void show_lines(GList *item_queue, int item_queue_count)
+static void show_lines(struct source_line *line_queue, int line_queue_count)
 {
 	int i;
 	struct source_line *line;
 
-	for (i = 0; i < item_queue_count; i++) {
-		line = item_queue->data;
+	line = line_queue;
+	for (i = 0; i < line_queue_count; i++) {
 		printf("%8li\t%s\n", line->count, line->line);
-		item_queue = g_list_next(item_queue);
+		line = line->next;
 	}
 }
 
@@ -935,10 +914,9 @@ void show_lines(GList *item_queue, int item_queue_count)
 static void show_details(struct sym_entry *sym)
 {
 	struct source_line *line;
-	GList *item;
+	struct source_line *line_queue = NULL;
 	int displayed = 0;
-	GList *item_queue = NULL;
-	int item_queue_count = 0;
+	int line_queue_count = 0;
 
 	if (!sym->source)
 		lookup_sym_in_vmlinux(sym);
@@ -947,30 +925,29 @@ static void show_details(struct sym_entry *sym)
 
 	printf("Showing details for %s\n", sym->sym);
 
-	item = sym->source;
-	while (item) {
-		line = item->data;
+	line = sym->source;
+	while (line) {
 		if (displayed && strstr(line->line, ">:"))
 			break;
 
-		if (!item_queue_count)
-			item_queue = item;
-		item_queue_count ++;
+		if (!line_queue_count)
+			line_queue = line;
+		line_queue_count ++;
 
 		if (line->count >= count_filter) {
-			show_lines(item_queue, item_queue_count);
-			item_queue_count = 0;
-			item_queue = NULL;
-		} else if (item_queue_count > TRACE_COUNT) {
-			item_queue = g_list_next(item_queue);
-			item_queue_count --;
+			show_lines(line_queue, line_queue_count);
+			line_queue_count = 0;
+			line_queue = NULL;
+		} else if (line_queue_count > TRACE_COUNT) {
+			line_queue = line_queue->next;
+			line_queue_count --;
 		}
 
 		line->count = 0;
 		displayed++;
 		if (displayed > 300)
 			break;
-		item = g_list_next(item);
+		line = line->next;
 	}
 }
 
@@ -1201,6 +1178,10 @@ int main(int argc, char *argv[])
 	if (tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
+	parse_symbols();
+	if (vmlinux && sym_filter_entry)
+		parse_vmlinux(vmlinux);
+
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
 		for (counter = 0; counter < nr_counters; counter++) {
@@ -1216,15 +1197,16 @@ int main(int argc, char *argv[])
 			hw_event.nmi		= nmi;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 			if (fd[i][counter] < 0) {
+				int err = errno;
 				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(-fd[i][counter]));
-				if (fd[i][counter] == -1)
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
 					printf("Are you root?\n");
 				exit(-1);
 			}
 			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
 
 			/*
 			 * First counter acts as the group leader:
@@ -1248,10 +1230,6 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	parse_symbols();
-	if (vmlinux && sym_filter_entry)
-		parse_vmlinux(vmlinux);
-
 	printf("KernelTop refresh period: %d seconds\n", delay_secs);
 	last_refresh = time(NULL);
 
-- 
GitLab


From 0fd112e41cd6f6d4779cbe327c3632d087e31476 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 24 Mar 2009 10:50:24 +0100
Subject: [PATCH 0202/2198] perf_counter tools: remove glib dependency and fix
 bugs in kerneltop.c, fix poll()

Paul Mackerras wrote:

> I noticed the poll stuff is bogus - we have a 2D array of struct
> pollfds (MAX_NR_CPUS x MAX_COUNTERS), we fill in a sub-array (with the
> rest being uninitialized, since the array is on the stack) and then
> pass the first nr_cpus elements to poll.  Not what we really meant, I
> suspect. :)  Not even if we only have one counter, since it's the
> counter dimension that varies fastest.

This should fix the most obvious poll fubar.. not enough to fix the
full problem though..

Reported-by: Paul Mackerras <paulus@samba.org>
Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Orig-LKML-Reference: <18888.29986.340328.540512@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index ea13e4e672291..7ebde7a336c62 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1157,10 +1157,10 @@ static void mmap_read(struct mmap_data *md)
 
 int main(int argc, char *argv[])
 {
-	struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
-	int i, counter, group_fd;
+	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
 	int ret;
 
@@ -1214,8 +1214,9 @@ int main(int argc, char *argv[])
 			if (group && group_fd == -1)
 				group_fd = fd[i][counter];
 
-			event_array[i][counter].fd = fd[i][counter];
-			event_array[i][counter].events = POLLIN;
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
 
 			mmap_array[i][counter].counter = counter;
 			mmap_array[i][counter].prev = 0;
@@ -1247,7 +1248,7 @@ int main(int argc, char *argv[])
 		}
 
 		if (hits == events)
-			ret = poll(event_array[0], nr_cpus, 1000);
+			ret = poll(event_array, nr_poll, 1000);
 		hits = events;
 	}
 
-- 
GitLab


From f66c6b2066b44d4ab8e8ac1ee4cae543738fe2ac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 23 Mar 2009 10:29:36 +1100
Subject: [PATCH 0203/2198] perf_counter: update documentation

Impact: documentation fix

This updates the perfcounter documentation to reflect recent changes.

Signed-off-by: Paul Mackerras <paulus@samba.org>
---
 Documentation/perf_counter/design.txt | 268 +++++++++++++++++++-------
 1 file changed, 202 insertions(+), 66 deletions(-)

diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index fddd32189a500..aaf105c02fba7 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -11,7 +11,9 @@ thus be used to profile the code that runs on that CPU.
 
 The Linux Performance Counter subsystem provides an abstraction of these
 hardware capabilities. It provides per task and per CPU counters, counter
-groups, and it provides event capabilities on top of those.
+groups, and it provides event capabilities on top of those.  It
+provides "virtual" 64-bit counters, regardless of the width of the
+underlying hardware counters.
 
 Performance counters are accessed via special file descriptors.
 There's one file descriptor per virtual counter used.
@@ -20,7 +22,8 @@ The special file descriptor is opened via the perf_counter_open()
 system call:
 
    int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
-			     pid_t pid, int cpu, int group_fd);
+			     pid_t pid, int cpu, int group_fd,
+			     unsigned long flags);
 
 The syscall returns the new fd. The fd can be used via the normal
 VFS system calls: read() can be used to read the counter, fcntl()
@@ -32,90 +35,180 @@ can be poll()ed.
 When creating a new counter fd, 'perf_counter_hw_event' is:
 
 /*
- * Hardware event to monitor via a performance monitoring counter:
+ * Event to monitor via a performance monitoring counter:
  */
 struct perf_counter_hw_event {
-	s64			type;
+	__u64			event_config;
 
-	u64			irq_period;
-	u32			record_type;
+	__u64			irq_period;
+	__u64			record_type;
+	__u64			read_format;
 
-	u32			disabled     :  1, /* off by default */
-				nmi	     :  1, /* NMI sampling   */
-				raw	     :  1, /* raw event type */
-				__reserved_1 : 29;
+	__u64			disabled       :  1, /* off by default        */
+				nmi	       :  1, /* NMI sampling          */
+				inherit	       :  1, /* children inherit it   */
+				pinned	       :  1, /* must always be on PMU */
+				exclusive      :  1, /* only group on PMU     */
+				exclude_user   :  1, /* don't count user      */
+				exclude_kernel :  1, /* ditto kernel          */
+				exclude_hv     :  1, /* ditto hypervisor      */
+				exclude_idle   :  1, /* don't count when idle */
 
-	u64			__reserved_2;
+				__reserved_1   : 55;
+
+	__u32			extra_config_len;
+
+	__u32			__reserved_4;
+	__u64			__reserved_2;
+	__u64			__reserved_3;
 };
 
+The 'event_config' field specifies what the counter should count.  It
+is divided into 3 bit-fields:
+
+raw_type: 1 bit (most significant bit)		0x8000_0000_0000_0000
+type:	  7 bits (next most significant)	0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)		0x00ff_0000_0000_0000
+
+If 'raw_type' is 1, then the counter will count a hardware event
+specified by the remaining 63 bits of event_config.  The encoding is
+machine-specific.
+
+If 'raw_type' is 0, then the 'type' field says what kind of counter
+this is, with the following encoding:
+
+enum perf_event_types {
+	PERF_TYPE_HARDWARE		= 0,
+	PERF_TYPE_SOFTWARE		= 1,
+	PERF_TYPE_TRACEPOINT		= 2,
+};
+
+A counter of PERF_TYPE_HARDWARE will count the hardware event
+specified by 'event_id':
+
 /*
- * Generalized performance counter event types, used by the hw_event.type
+ * Generalized performance counter event types, used by the hw_event.event_id
  * parameter of the sys_perf_counter_open() syscall:
  */
-enum hw_event_types {
+enum hw_event_ids {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CYCLES		=  0,
-	PERF_COUNT_INSTRUCTIONS		=  1,
-	PERF_COUNT_CACHE_REFERENCES	=  2,
-	PERF_COUNT_CACHE_MISSES		=  3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	=  4,
-	PERF_COUNT_BRANCH_MISSES	=  5,
-
-	/*
-	 * Special "software" counters provided by the kernel, even if
-	 * the hardware does not support performance counters. These
-	 * counters measure various physical and sw events of the
-	 * kernel (and allow the profiling of them as well):
-	 */
-	PERF_COUNT_CPU_CLOCK		= -1,
-	PERF_COUNT_TASK_CLOCK		= -2,
-	/*
-	 * Future software events:
-	 */
-	/* PERF_COUNT_PAGE_FAULTS	= -3,
-	   PERF_COUNT_CONTEXT_SWITCHES	= -4, */
+	PERF_COUNT_CPU_CYCLES		= 0,
+	PERF_COUNT_INSTRUCTIONS		= 1,
+	PERF_COUNT_CACHE_REFERENCES	= 2,
+	PERF_COUNT_CACHE_MISSES		= 3,
+	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_BRANCH_MISSES	= 5,
+	PERF_COUNT_BUS_CYCLES		= 6,
 };
 
-These are standardized types of events that work uniformly on all CPUs
-that implements Performance Counters support under Linux. If a CPU is
-not able to count branch-misses, then the system call will return
--EINVAL.
+These are standardized types of events that work relatively uniformly
+on all CPUs that implement Performance Counters support under Linux,
+although there may be variations (e.g., different CPUs might count
+cache references and misses at different levels of the cache hierarchy).
+If a CPU is not able to count the selected event, then the system call
+will return -EINVAL.
 
-More hw_event_types are supported as well, but they are CPU
-specific and are enumerated via /sys on a per CPU basis. Raw hw event
-types can be passed in under hw_event.type if hw_event.raw is 1.
-For example, to count "External bus cycles while bus lock signal asserted"
-events on Intel Core CPUs, pass in a 0x4064 event type value and set
-hw_event.raw to 1.
+More hw_event_types are supported as well, but they are CPU-specific
+and accessed as raw events.  For example, to count "External bus
+cycles while bus lock signal asserted" events on Intel Core CPUs, pass
+in a 0x4064 event_id value and set hw_event.raw_type to 1.
 
-'record_type' is the type of data that a read() will provide for the
-counter, and it can be one of:
+A counter of type PERF_TYPE_SOFTWARE will count one of the available
+software events, selected by 'event_id':
 
 /*
- * IRQ-notification data record type:
+ * Special "software" counters provided by the kernel, even if the hardware
+ * does not support performance counters. These counters measure various
+ * physical and sw events of the kernel (and allow the profiling of them as
+ * well):
  */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		=  0,
-	PERF_RECORD_IRQ			=  1,
-	PERF_RECORD_GROUP		=  2,
+enum sw_event_ids {
+	PERF_COUNT_CPU_CLOCK		= 0,
+	PERF_COUNT_TASK_CLOCK		= 1,
+	PERF_COUNT_PAGE_FAULTS		= 2,
+	PERF_COUNT_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
 };
 
-a "simple" counter is one that counts hardware events and allows
-them to be read out into a u64 count value. (read() returns 8 on
-a successful read of a simple counter.)
+Counters come in two flavours: counting counters and sampling
+counters.  A "counting" counter is one that is used for counting the
+number of events that occur, and is characterised by having
+irq_period = 0 and record_type = PERF_RECORD_SIMPLE.  A read() on a
+counting counter simply returns the current value of the counter as
+an 8-byte number.
 
-An "irq" counter is one that will also provide an IRQ context information:
-the IP of the interrupted context. In this case read() will return
-the 8-byte counter value, plus the Instruction Pointer address of the
-interrupted context.
+A "sampling" counter is one that is set up to generate an interrupt
+every N events, where N is given by 'irq_period'.  A sampling counter
+has irq_period > 0 and record_type != PERF_RECORD_SIMPLE.  The
+record_type controls what data is recorded on each interrupt, and the
+available values are currently:
 
-The parameter 'hw_event_period' is the number of events before waking up
-a read() that is blocked on a counter fd. Zero value means a non-blocking
-counter.
+/*
+ * IRQ-notification data record type:
+ */
+enum perf_counter_record_type {
+	PERF_RECORD_SIMPLE		= 0,
+	PERF_RECORD_IRQ			= 1,
+	PERF_RECORD_GROUP		= 2,
+};
 
-The 'pid' parameter allows the counter to be specific to a task:
+A record_type value of PERF_RECORD_IRQ will record the instruction
+pointer (IP) at which the interrupt occurred.  A record_type value of
+PERF_RECORD_GROUP will record the event_config and counter value of
+all of the other counters in the group, and should only be used on a
+group leader (see below).  Currently these two values are mutually
+exclusive, but record_type will become a bit-mask in future and
+support other values.
+
+A sampling counter has an event queue, into which an event is placed
+on each interrupt.  A read() on a sampling counter will read the next
+event from the event queue.  If the queue is empty, the read() will
+either block or return an EAGAIN error, depending on whether the fd
+has been set to non-blocking mode or not.
+
+The 'disabled' bit specifies whether the counter starts out disabled
+or enabled.  If it is initially disabled, it can be enabled by ioctl
+or prctl (see below).
+
+The 'nmi' bit specifies, for hardware events, whether the counter
+should be set up to request non-maskable interrupts (NMIs) or normal
+interrupts.  This bit is ignored if the user doesn't have
+CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
+generate NMIs from hardware counters.
+
+The 'inherit' bit, if set, specifies that this counter should count
+events on descendant tasks as well as the task specified.  This only
+applies to new descendents, not to any existing descendents at the
+time the counter is created (nor to any new descendents of existing
+descendents).
+
+The 'pinned' bit, if set, specifies that the counter should always be
+on the CPU if at all possible.  It only applies to hardware counters
+and only to group leaders.  If a pinned counter cannot be put onto the
+CPU (e.g. because there are not enough hardware counters or because of
+a conflict with some other event), then the counter goes into an
+'error' state, where reads return end-of-file (i.e. read() returns 0)
+until the counter is subsequently enabled or disabled.
+
+The 'exclusive' bit, if set, specifies that when this counter's group
+is on the CPU, it should be the only group using the CPU's counters.
+In future, this will allow sophisticated monitoring programs to supply
+extra configuration information via 'extra_config_len' to exploit
+advanced features of the CPU's Performance Monitor Unit (PMU) that are
+not otherwise accessible and that might disrupt other hardware
+counters.
+
+The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
+way to request that counting of events be restricted to times when the
+CPU is in user, kernel and/or hypervisor mode.
+
+
+The 'pid' parameter to the perf_counter_open() system call allows the
+counter to be specific to a task:
 
  pid == 0: if the pid parameter is zero, the counter is attached to the
  current task.
@@ -125,8 +218,7 @@ The 'pid' parameter allows the counter to be specific to a task:
 
  pid < 0: all tasks are counted (per cpu counters)
 
-The 'cpu' parameter allows a counter to be made specific to a full
-CPU:
+The 'cpu' parameter allows a counter to be made specific to a CPU:
 
  cpu >= 0: the counter is restricted to a specific CPU
  cpu == -1: the counter counts on all CPUs
@@ -141,7 +233,51 @@ their own tasks.
 A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts
 all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege.
 
-Group counters are created by passing in a group_fd of another counter.
-Groups are scheduled at once and can be used with PERF_RECORD_GROUP
-to record multi-dimensional timestamps.
+The 'flags' parameter is currently unused and must be zero.
+
+The 'group_fd' parameter allows counter "groups" to be set up.  A
+counter group has one counter which is the group "leader".  The leader
+is created first, with group_fd = -1 in the perf_counter_open call
+that creates it.  The rest of the group members are created
+subsequently, with group_fd giving the fd of the group leader.
+(A single counter on its own is created with group_fd = -1 and is
+considered to be a group with only 1 member.)
+
+A counter group is scheduled onto the CPU as a unit, that is, it will
+only be put onto the CPU if all of the counters in the group can be
+put onto the CPU.  This means that the values of the member counters
+can be meaningfully compared, added, divided (to get ratios), etc.,
+with each other, since they have counted events for the same set of
+executed instructions.
+
+Counters can be enabled and disabled in two ways: via ioctl and via
+prctl.  When a counter is disabled, it doesn't count or generate
+events but does continue to exist and maintain its count value.
+
+An individual counter or counter group can be enabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+
+or disabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+Enabling or disabling the leader of a group enables or disables the
+whole group; that is, while the group leader is disabled, none of the
+counters in the group will count.  Enabling or disabling a member of a
+group other than the leader only affects that counter - disabling an
+non-leader stops that counter from counting but doesn't affect any
+other counter.
+
+A process can enable or disable all the counter groups that are
+attached to it, using prctl:
+
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+
+This applies to all counters on the current process, whether created
+by this process or by another, and doesn't affect any counters that
+this process has created on other processes.  It only enables or
+disables the group leaders, not any other members in the groups.
 
-- 
GitLab


From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Tue, 24 Mar 2009 13:18:16 +0100
Subject: [PATCH 0204/2198] perf_counter: fix perf_poll()

Impact: fix kerneltop 100% CPU usage

Only return a poll event when there's actually been one, poll_wait()
doesn't actually wait for the waitq you pass it, it only enqueues
you on it.

Only once all FDs have been iterated and none of thm returned a
poll-event will it schedule().

Also make it return POLL_HUP when there's not mmap() area to read from.

Further, fix a silly bug in the write code.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arjan van de Ven <arjan@infradead.org>
Orig-LKML-Reference: <1237897096.24918.181.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  1 +
 kernel/perf_counter.c        | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b5e66d5ebdf0..48212c15b7d6b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -246,6 +246,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;
+	atomic_t			wakeup;
 	atomic_t			head;
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0dfe91094fd14..affe227d56a04 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1161,7 +1161,16 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
-	unsigned int events = POLLIN;
+	struct perf_mmap_data *data;
+	unsigned int events;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data)
+		events = atomic_xchg(&data->wakeup, 0);
+	else
+		events = POLL_HUP;
+	rcu_read_unlock();
 
 	poll_wait(file, &counter->waitq, wait);
 
@@ -1425,7 +1434,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 
 	do {
 		offset = head = atomic_read(&data->head);
-		head += sizeof(u64);
+		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
 	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1446,6 +1455,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 	 * generate a poll() wakeup for every page boundary crossed
 	 */
 	if (wakeup) {
+		atomic_xchg(&data->wakeup, POLL_IN);
 		__perf_counter_update_userpage(counter, data);
 		if (nmi) {
 			counter->wakeup_pending = 1;
-- 
GitLab


From b9cacc7bf193df16532bfa7d7ca77fe50fc3c2e6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:22 +0100
Subject: [PATCH 0205/2198] perf_counter: more elaborate write API

Provide a begin, copy, end interface to the output buffer.

begin() reserves the space,
 copy() copies the data over, considering page boundaries,
  end() finalizes the event and does the wakeup.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113316.740550870@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 109 +++++++++++++++++++++++++++++-------------
 1 file changed, 75 insertions(+), 34 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index affe227d56a04..0422fd9bf6273 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -9,6 +9,7 @@
  */
 
 #include <linux/fs.h>
+#include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
 #include <linux/file.h>
@@ -16,15 +17,14 @@
 #include <linux/sysfs.h>
 #include <linux/ptrace.h>
 #include <linux/percpu.h>
+#include <linux/vmstat.h>
+#include <linux/hardirq.h>
+#include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/syscalls.h>
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
-#include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/rculist.h>
-#include <linux/hardirq.h>
 
 #include <asm/irq_regs.h>
 
@@ -1411,16 +1411,20 @@ static const struct file_operations perf_fops = {
  * Output
  */
 
-static int perf_output_write(struct perf_counter *counter, int nmi,
-			     void *buf, ssize_t size)
+struct perf_output_handle {
+	struct perf_counter	*counter;
+	struct perf_mmap_data	*data;
+	unsigned int		offset;
+	int			wakeup;
+};
+
+static int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size)
 {
 	struct perf_mmap_data *data;
-	unsigned int offset, head, nr;
-	unsigned int len;
-	int ret, wakeup;
+	unsigned int offset, head;
 
 	rcu_read_lock();
-	ret = -ENOSPC;
 	data = rcu_dereference(counter->data);
 	if (!data)
 		goto out;
@@ -1428,45 +1432,82 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 	if (!data->nr_pages)
 		goto out;
 
-	ret = -EINVAL;
-	if (size > PAGE_SIZE)
-		goto out;
-
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
+	handle->counter	= counter;
+	handle->data	= data;
+	handle->offset	= offset;
+	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
 
-	nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1);
-	offset &= PAGE_SIZE - 1;
+	return 0;
 
-	len = min_t(unsigned int, PAGE_SIZE - offset, size);
-	memcpy(data->data_pages[nr] + offset, buf, len);
-	size -= len;
+out:
+	rcu_read_unlock();
 
-	if (size) {
-		nr = (nr + 1) & (data->nr_pages - 1);
-		memcpy(data->data_pages[nr], buf + len, size);
-	}
+	return -ENOSPC;
+}
 
-	/*
-	 * generate a poll() wakeup for every page boundary crossed
-	 */
-	if (wakeup) {
-		atomic_xchg(&data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(counter, data);
+static void perf_output_copy(struct perf_output_handle *handle,
+			     void *buf, unsigned int len)
+{
+	unsigned int pages_mask;
+	unsigned int offset;
+	unsigned int size;
+	void **pages;
+
+	offset		= handle->offset;
+	pages_mask	= handle->data->nr_pages - 1;
+	pages		= handle->data->data_pages;
+
+	do {
+		unsigned int page_offset;
+		int nr;
+
+		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
+		page_offset = offset & (PAGE_SIZE - 1);
+		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+
+		memcpy(pages[nr] + page_offset, buf, size);
+
+		len	    -= size;
+		buf	    += size;
+		offset	    += size;
+	} while (len);
+
+	handle->offset = offset;
+}
+
+static void perf_output_end(struct perf_output_handle *handle, int nmi)
+{
+	if (handle->wakeup) {
+		(void)atomic_xchg(&handle->data->wakeup, POLL_IN);
+		__perf_counter_update_userpage(handle->counter, handle->data);
 		if (nmi) {
-			counter->wakeup_pending = 1;
+			handle->counter->wakeup_pending = 1;
 			set_perf_counter_pending();
 		} else
-			wake_up(&counter->waitq);
+			wake_up(&handle->counter->waitq);
 	}
-	ret = 0;
-out:
 	rcu_read_unlock();
+}
+
+static int perf_output_write(struct perf_counter *counter, int nmi,
+			     void *buf, ssize_t size)
+{
+	struct perf_output_handle handle;
+	int ret;
 
+	ret = perf_output_begin(&handle, counter, size);
+	if (ret)
+		goto out;
+
+	perf_output_copy(&handle, buf, size);
+	perf_output_end(&handle, nmi);
+
+out:
 	return ret;
 }
 
-- 
GitLab


From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:23 +0100
Subject: [PATCH 0206/2198] perf_counter: output objects

Provide a {type,size} header for each output entry.

This should provide extensible output, and the ability to mix multiple streams.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113316.831607932@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 11 ++++++++
 kernel/perf_counter.c        | 53 ++++++++++++++++++++++++++++--------
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 48212c15b7d6b..c256635377d43 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -156,6 +156,16 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
+struct perf_event_header {
+	__u32	type;
+	__u32	size;
+};
+
+enum perf_event_type {
+	PERF_EVENT_IP		= 0,
+	PERF_EVENT_GROUP	= 1,
+};
+
 #ifdef __KERNEL__
 /*
  * Kernel-internal data types and definitions:
@@ -260,6 +270,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
+	int 				nr_siblings;
 	struct perf_counter		*group_leader;
 	const struct hw_perf_counter_ops *hw_ops;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0422fd9bf6273..d76e3112d386b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -75,8 +75,10 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 */
 	if (counter->group_leader == counter)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
-	else
+	else {
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
+		group_leader->nr_siblings++;
+	}
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 }
@@ -89,6 +91,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
 
+	if (counter->group_leader != counter)
+		counter->group_leader->nr_siblings--;
+
 	/*
 	 * If this was a group counter with sibling counters then
 	 * upgrade the siblings to singleton counters by adding them
@@ -381,9 +386,11 @@ static int is_software_only_group(struct perf_counter *leader)
 
 	if (!is_software_counter(leader))
 		return 0;
+
 	list_for_each_entry(counter, &leader->sibling_list, list_entry)
 		if (!is_software_counter(counter))
 			return 0;
+
 	return 1;
 }
 
@@ -1480,6 +1487,9 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	handle->offset = offset;
 }
 
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
 	if (handle->wakeup) {
@@ -1514,34 +1524,53 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 static void perf_output_simple(struct perf_counter *counter,
 			       int nmi, struct pt_regs *regs)
 {
-	u64 entry;
+	struct {
+		struct perf_event_header header;
+		u64 ip;
+	} event;
 
-	entry = instruction_pointer(regs);
+	event.header.type = PERF_EVENT_IP;
+	event.header.size = sizeof(event);
+	event.ip = instruction_pointer(regs);
 
-	perf_output_write(counter, nmi, &entry, sizeof(entry));
+	perf_output_write(counter, nmi, &event, sizeof(event));
 }
 
-struct group_entry {
-	u64 event;
-	u64 counter;
-};
-
 static void perf_output_group(struct perf_counter *counter, int nmi)
 {
+	struct perf_output_handle handle;
+	struct perf_event_header header;
 	struct perf_counter *leader, *sub;
+	unsigned int size;
+	struct {
+		u64 event;
+		u64 counter;
+	} entry;
+	int ret;
+
+	size = sizeof(header) + counter->nr_siblings * sizeof(entry);
+
+	ret = perf_output_begin(&handle, counter, size);
+	if (ret)
+		return;
+
+	header.type = PERF_EVENT_GROUP;
+	header.size = size;
+
+	perf_output_put(&handle, header);
 
 	leader = counter->group_leader;
 	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		struct group_entry entry;
-
 		if (sub != counter)
 			sub->hw_ops->read(sub);
 
 		entry.event = sub->hw_event.config;
 		entry.counter = atomic64_read(&sub->count);
 
-		perf_output_write(counter, nmi, &entry, sizeof(entry));
+		perf_output_put(&handle, entry);
 	}
+
+	perf_output_end(&handle, nmi);
 }
 
 void perf_counter_output(struct perf_counter *counter,
-- 
GitLab


From 63e35b25d6b5c3136d22ef249dbbf96716aa08bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:24 +0100
Subject: [PATCH 0207/2198] perf_counter: sanity check on the output API

Ensure we never write more than we said we would.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113316.921433024@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d76e3112d386b..7669afe82cc7c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1422,6 +1422,7 @@ struct perf_output_handle {
 	struct perf_counter	*counter;
 	struct perf_mmap_data	*data;
 	unsigned int		offset;
+	unsigned int		head;
 	int			wakeup;
 };
 
@@ -1447,6 +1448,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	handle->counter	= counter;
 	handle->data	= data;
 	handle->offset	= offset;
+	handle->head	= head;
 	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
 
 	return 0;
@@ -1485,6 +1487,8 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	} while (len);
 
 	handle->offset = offset;
+
+	WARN_ON_ONCE(handle->offset > handle->head);
 }
 
 #define perf_output_put(handle, x) \
-- 
GitLab


From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:25 +0100
Subject: [PATCH 0208/2198] perf_counter: optionally provide the pid/tid of the
 sampled task

Allow cpu wide counters to profile userspace by providing what process
the sample belongs to.

This raises the first issue with the output type, lots of these
options: group, tid, callchain, etc.. are non-exclusive and could be
combined, suggesting a bitfield.

However, things like the mmap() data stream doesn't fit in that.

How to split the type field...

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113317.013775235@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  5 ++++-
 kernel/perf_counter.c        | 18 ++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c256635377d43..7fdbdf8be7753 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -127,8 +127,9 @@ struct perf_counter_hw_event {
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
+				include_tid    :  1, /* include the tid */
 
-				__reserved_1   : 55;
+				__reserved_1   : 54;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -164,6 +165,8 @@ struct perf_event_header {
 enum perf_event_type {
 	PERF_EVENT_IP		= 0,
 	PERF_EVENT_GROUP	= 1,
+
+	__PERF_EVENT_TID	= 0x100,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7669afe82cc7c..f3e1b27bc1b85 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1528,16 +1528,30 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 static void perf_output_simple(struct perf_counter *counter,
 			       int nmi, struct pt_regs *regs)
 {
+	unsigned int size;
 	struct {
 		struct perf_event_header header;
 		u64 ip;
+		u32 pid, tid;
 	} event;
 
 	event.header.type = PERF_EVENT_IP;
-	event.header.size = sizeof(event);
 	event.ip = instruction_pointer(regs);
 
-	perf_output_write(counter, nmi, &event, sizeof(event));
+	size = sizeof(event);
+
+	if (counter->hw_event.include_tid) {
+		/* namespace issues */
+		event.pid = current->group_leader->pid;
+		event.tid = current->pid;
+
+		event.header.type |= __PERF_EVENT_TID;
+	} else
+		size -= sizeof(u64);
+
+	event.header.size = size;
+
+	perf_output_write(counter, nmi, &event, size);
 }
 
 static void perf_output_group(struct perf_counter *counter, int nmi)
-- 
GitLab


From 4c4ba21d2c3659e4c0421533939b58a8fd9f06c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:26 +0100
Subject: [PATCH 0209/2198] perf_counter: kerneltop: mmap_pages argument

provide a knob to set the number of mmap data pages.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113317.104545398@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 31 +++++++++++++++++++++-----
 1 file changed, 25 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 7ebde7a336c62..3e45bf6591b2e 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -178,6 +178,7 @@ static int			nr_cpus				=  0;
 static int			nmi				=  1;
 static int			group				=  0;
 static unsigned int		page_size;
+static unsigned int		mmap_pages			=  4;
 
 static char			*vmlinux;
 
@@ -326,6 +327,7 @@ static void display_help(void)
 	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
 	" -z        --zero             # zero counts after display\n"
 	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
 	);
 
 	exit(0);
@@ -732,7 +734,9 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 	/* Tag events to be skipped. */
 	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
 		s->skip = 1;
-	if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+		s->skip = 1;
+	else if (!strcmp("mwait_idle", s->sym))
 		s->skip = 1;
 
 	if (filter_match == 1) {
@@ -1042,9 +1046,10 @@ static void process_options(int argc, char *argv[])
 			{"symbol",	required_argument,	NULL, 's'},
 			{"stat",	no_argument,		NULL, 'S'},
 			{"zero",	no_argument,		NULL, 'z'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1081,6 +1086,7 @@ static void process_options(int argc, char *argv[])
 		case 'S': run_perfstat			=	       1; break;
 		case 'x': vmlinux			= strdup(optarg); break;
 		case 'z': zero				=              1; break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
 		default: error = 1; break;
 		}
 	}
@@ -1134,17 +1140,30 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 	return head;
 }
 
+struct timeval last_read, this_read;
+
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
 
+	gettimeofday(&this_read, NULL);
+
 	if (head - old > md->mask) {
-		printf("ERROR: failed to keep up with mmap data\n");
-		exit(-1);
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data.  Last read %lu msecs ago.\n", msecs);
+
+		old = head;
 	}
 
+	last_read = this_read;
+
 	for (; old != head;) {
 		__u64 *ptr = (__u64 *)&data[old & md->mask];
 		old += sizeof(__u64);
@@ -1220,8 +1239,8 @@ int main(int argc, char *argv[])
 
 			mmap_array[i][counter].counter = counter;
 			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = 2*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, 3*page_size,
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
 					PROT_READ, MAP_SHARED, fd[i][counter], 0);
 			if (mmap_array[i][counter].base == MAP_FAILED) {
 				printf("kerneltop error: failed to mmap with %d (%s)\n",
-- 
GitLab


From 00f0ad73ac90e3fba8b4cbe4cf21b2fb9a56cb72 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:30:27 +0100
Subject: [PATCH 0210/2198] perf_counter: kerneltop: output event support

Teach kerneltop about the new output ABI.

XXX: anybody fancy integrating the PID/TID data into the output?

Bump the mmap_data pages a little because we bloated the output and
have to be more careful about overruns with structured data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arjan van de Ven <arjan@infradead.org>
Cc: Wu Fengguang <fengguang.wu@intel.com>
Orig-LKML-Reference: <20090325113317.192910290@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 65 +++++++++++++++++++++++---
 1 file changed, 59 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 3e45bf6591b2e..fda1438365dc0 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -134,6 +134,11 @@
 #endif
 
 #define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
 
 asmlinkage int sys_perf_counter_open(
         struct perf_counter_hw_event    *hw_event_uptr          __user,
@@ -178,7 +183,7 @@ static int			nr_cpus				=  0;
 static int			nmi				=  1;
 static int			group				=  0;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			=  4;
+static unsigned int		mmap_pages			=  16;
 
 static char			*vmlinux;
 
@@ -1147,28 +1152,75 @@ static void mmap_read(struct mmap_data *md)
 	unsigned int head = mmap_read_head(md);
 	unsigned int old = md->prev;
 	unsigned char *data = md->base + page_size;
+	int diff;
 
 	gettimeofday(&this_read, NULL);
 
-	if (head - old > md->mask) {
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
 		struct timeval iv;
 		unsigned long msecs;
 
 		timersub(&this_read, &last_read, &iv);
 		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
 
-		fprintf(stderr, "WARNING: failed to keep up with mmap data.  Last read %lu msecs ago.\n", msecs);
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
 
+		/*
+		 * head points to a known good entry, start there.
+		 */
 		old = head;
 	}
 
 	last_read = this_read;
 
 	for (; old != head;) {
-		__u64 *ptr = (__u64 *)&data[old & md->mask];
-		old += sizeof(__u64);
+		struct event_struct {
+			struct perf_event_header header;
+			__u64 ip;
+			__u32 pid, tid;
+		} *event = (struct event_struct *)&data[old & md->mask];
+		struct event_struct event_copy;
+
+		unsigned int size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = sizeof(*event), cpy;
+			void *dst = &event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &event_copy;
+		}
 
-		process_event(*ptr, md->counter);
+		old += size;
+
+		switch (event->header.type) {
+		case PERF_EVENT_IP:
+		case PERF_EVENT_IP | __PERF_EVENT_TID:
+			process_event(event->ip, md->counter);
+			break;
+		}
 	}
 
 	md->prev = old;
@@ -1214,6 +1266,7 @@ int main(int argc, char *argv[])
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
+			hw_event.include_tid	= 1;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
-- 
GitLab


From 7730d8655880f41f2ea519aca2ca6a1413dfd2c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Mar 2009 12:48:31 +0100
Subject: [PATCH 0211/2198] perf_counter: allow and require one-page mmap on
 counting counters

A brainfart stopped single page mmap()s working. The rest of the code
should be perfectly fine with not having any data pages.

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <1237981712.7972.812.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f3e1b27bc1b85..95e02575546b0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1369,7 +1369,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	vma_size = vma->vm_end - vma->vm_start;
 	nr_pages = (vma_size / PAGE_SIZE) - 1;
 
-	if (nr_pages == 0 || !is_power_of_2(nr_pages))
+	/*
+	 * If we have data pages ensure they're a power-of-two number, so we
+	 * can do bitmasks instead of modulo.
+	 */
+	if (nr_pages != 0 && !is_power_of_2(nr_pages))
 		return -EINVAL;
 
 	if (vma_size != PAGE_SIZE * (1 + nr_pages))
-- 
GitLab


From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 25 Mar 2009 22:46:58 +1100
Subject: [PATCH 0212/2198] perf_counter: record time running and time enabled
 for each counter

Impact: new functionality

Currently, if there are more counters enabled than can fit on the CPU,
the kernel will multiplex the counters on to the hardware using
round-robin scheduling.  That isn't too bad for sampling counters, but
for counting counters it means that the value read from a counter
represents some unknown fraction of the true count of events that
occurred while the counter was enabled.

This remedies the situation by keeping track of how long each counter
is enabled for, and how long it is actually on the cpu and counting
events.  These times are recorded in nanoseconds using the task clock
for per-task counters and the cpu clock for per-cpu counters.

These values can be supplied to userspace on a read from the counter.
Userspace requests that they be supplied after the counter value by
setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or
PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field
when creating the counter.  (There is no way to change the read format
after the counter is created, though it would be possible to add some
way to do that.)

Using this information it is possible for userspace to scale the count
it reads from the counter to get an estimate of the true count:

true_count_estimate = count * total_time_enabled / total_time_running

This also lets userspace detect the situation where the counter never
got to go on the cpu: total_time_running == 0.

This functionality has been requested by the PAPI developers, and will
be generally needed for interpreting the count values from counting
counters correctly.

In the implementation, this keeps 5 time values (in nanoseconds) for
each counter: total_time_enabled and total_time_running are used when
the counter is in state OFF or ERROR and for reporting back to
userspace.  When the counter is in state INACTIVE or ACTIVE, it is the
tstamp_enabled, tstamp_running and tstamp_stopped values that are
relevant, and total_time_enabled and total_time_running are determined
from them.  (tstamp_stopped is only used in INACTIVE state.)  The
reason for doing it like this is that it means that only counters
being enabled or disabled at sched-in and sched-out time need to be
updated.  There are no new loops that iterate over all counters to
update total_time_enabled or total_time_running.

This also keeps separate child_total_time_running and
child_total_time_enabled fields that get added in when reporting the
totals to userspace.  They are separate fields so that they can be
atomic.  We don't want to use atomics for total_time_running,
total_time_enabled etc., because then we would have to use atomic
sequences to update them, which are slower than regular arithmetic and
memory accesses.

It is possible to measure total_time_running by adding a task_clock
counter to each group of counters, and total_time_enabled can be
measured approximately with a top-level task_clock counter (though
inaccuracies will creep in if you need to disable and enable groups
since it is not possible in general to disable/enable the top-level
task_clock counter simultaneously with another group).  However, that
adds extra overhead - I measured around 15% increase in the context
switch latency reported by lat_ctx (from lmbench) when a task_clock
counter was added to each of 2 groups, and around 25% increase when a
task_clock counter was added to each of 4 groups.  (In both cases a
top-level task-clock counter was also added.)

In contrast, the code added in this commit gives better information
with no overhead that I could measure (in fact in some cases I
measured lower times with this code, but the differences were all less
than one standard deviation).

[ v2: address review comments by Andrew Morton. ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |   2 +
 include/linux/perf_counter.h       |  53 ++++++++++
 kernel/perf_counter.c              | 157 +++++++++++++++++++++++++----
 3 files changed, 191 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d48596ab65577..df007fe0cc0bb 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;
+	counter->tstamp_running += counter->ctx->time_now -
+		counter->tstamp_stopped;
 	if (is_software_counter(counter))
 		counter->hw_ops->enable(counter);
 }
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7fdbdf8be7753..6bf67ce176259 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -102,6 +102,16 @@ enum perf_counter_record_type {
 #define PERF_COUNTER_EVENT_SHIFT	0
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1,
+	PERF_FORMAT_TOTAL_TIME_RUNNING	=  2,
+};
+
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
@@ -281,6 +291,32 @@ struct perf_counter {
 	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
+	/*
+	 * These are the total time in nanoseconds that the counter
+	 * has been enabled (i.e. eligible to run, and the task has
+	 * been scheduled in, if this is a per-task counter)
+	 * and running (scheduled onto the CPU), respectively.
+	 *
+	 * They are computed from tstamp_enabled, tstamp_running and
+	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
+	 */
+	u64				total_time_enabled;
+	u64				total_time_running;
+
+	/*
+	 * These are timestamps used for computing total_time_enabled
+	 * and total_time_running when the counter is in INACTIVE or
+	 * ACTIVE state, measured in nanoseconds from an arbitrary point
+	 * in time.
+	 * tstamp_enabled: the notional time when the counter was enabled
+	 * tstamp_running: the notional time when the counter was scheduled on
+	 * tstamp_stopped: in INACTIVE state, the notional time when the
+	 *	counter was scheduled off.
+	 */
+	u64				tstamp_enabled;
+	u64				tstamp_running;
+	u64				tstamp_stopped;
+
 	struct perf_counter_hw_event	hw_event;
 	struct hw_perf_counter		hw;
 
@@ -291,6 +327,13 @@ struct perf_counter {
 	struct perf_counter		*parent;
 	struct list_head		child_list;
 
+	/*
+	 * These accumulate total time (in nanoseconds) that children
+	 * counters have been enabled and running, respectively.
+	 */
+	atomic64_t			child_total_time_enabled;
+	atomic64_t			child_total_time_running;
+
 	/*
 	 * Protect attach/detach and child_list:
 	 */
@@ -339,6 +382,16 @@ struct perf_counter_context {
 	int			nr_active;
 	int			is_active;
 	struct task_struct	*task;
+
+	/*
+	 * time_now is the current time in nanoseconds since an arbitrary
+	 * point in the past.  For per-task counters, this is based on the
+	 * task clock, and for per-cpu counters it is based on the cpu clock.
+	 * time_lost is an offset from the task/cpu clock, used to make it
+	 * appear that time only passes while the context is scheduled in.
+	 */
+	u64			time_now;
+	u64			time_lost;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 95e02575546b0..3b862a7988cda 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_stopped = ctx->time_now;
 	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
@@ -251,6 +252,60 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Get the current time for this context.
+ * If this is a task context, we use the task's task clock,
+ * or for a per-cpu context, we use the cpu clock.
+ */
+static u64 get_context_time(struct perf_counter_context *ctx, int update)
+{
+	struct task_struct *curr = ctx->task;
+
+	if (!curr)
+		return cpu_clock(smp_processor_id());
+
+	return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+}
+
+/*
+ * Update the record of the current time in a context.
+ */
+static void update_context_time(struct perf_counter_context *ctx, int update)
+{
+	ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	u64 run_end;
+
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		counter->total_time_enabled = ctx->time_now -
+			counter->tstamp_enabled;
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+			run_end = counter->tstamp_stopped;
+		else
+			run_end = ctx->time_now;
+		counter->total_time_running = run_end - counter->tstamp_running;
+	}
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+	struct perf_counter *counter;
+
+	update_counter_times(leader);
+	list_for_each_entry(counter, &leader->sibling_list, list_entry)
+		update_counter_times(counter);
+}
+
 /*
  * Cross CPU call to disable a performance counter
  */
@@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info)
 	 * If it is in error state, leave it in error state.
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+		update_context_time(ctx, 1);
+		update_counter_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
 		else
@@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
+	}
 
 	spin_unlock_irq(&ctx->lock);
 }
@@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
+	counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
@@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter,
 	return can_add_hw;
 }
 
+static void add_counter_to_ctx(struct perf_counter *counter,
+			       struct perf_counter_context *ctx)
+{
+	list_add_counter(counter, ctx);
+	ctx->nr_counters++;
+	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	counter->tstamp_enabled = ctx->time_now;
+	counter->tstamp_running = ctx->time_now;
+	counter->tstamp_stopped = ctx->time_now;
+}
+
 /*
  * Cross CPU call to install and enable a performance counter
  */
@@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info)
 	 */
 	perf_flags = hw_perf_save_disable();
 
-	list_add_counter(counter, ctx);
-	ctx->nr_counters++;
-	counter->prev_state = PERF_COUNTER_STATE_OFF;
+	add_counter_to_ctx(counter, ctx);
 
 	/*
 	 * Don't put the counter on if it is disabled or if
@@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	if (!err && !ctx->task && cpuctx->max_pertask)
@@ -548,10 +621,8 @@ perf_install_in_context(struct perf_counter_context *ctx,
 	 * can add the counter safely, if it the call above did not
 	 * succeed.
 	 */
-	if (list_empty(&counter->list_entry)) {
-		list_add_counter(counter, ctx);
-		ctx->nr_counters++;
-	}
+	if (list_empty(&counter->list_entry))
+		add_counter_to_ctx(counter, ctx);
 	spin_unlock_irq(&ctx->lock);
 }
 
@@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
+	update_context_time(ctx, 1);
 
 	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
+	counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned)
+		if (leader->hw_event.pinned) {
+			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
  unlock:
@@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 * Since we have the lock this context can't be scheduled
 	 * in, so we can change the state safely.
 	 */
-	if (counter->state == PERF_COUNTER_STATE_OFF)
+	if (counter->state == PERF_COUNTER_STATE_OFF) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
+	}
  out:
 	spin_unlock_irq(&ctx->lock);
 }
@@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
 		goto out;
+	update_context_time(ctx, 0);
 
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
@@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	if (likely(!ctx->nr_counters))
 		goto out;
 
+	/*
+	 * Add any time since the last sched_out to the lost time
+	 * so it doesn't get included in the total_time_enabled and
+	 * total_time_running measures for counters in the context.
+	 */
+	ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+
 	flags = hw_perf_save_disable();
 
 	/*
@@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		 * If this pinned group hasn't been scheduled,
 		 * put it in error state.
 		 */
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_ERROR;
+		}
 	}
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
@@ -902,8 +990,10 @@ int perf_counter_task_disable(void)
 	perf_flags = hw_perf_save_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ERROR)
+		if (counter->state != PERF_COUNTER_STATE_ERROR) {
+			update_group_times(counter);
 			counter->state = PERF_COUNTER_STATE_OFF;
+		}
 	}
 
 	hw_perf_restore(perf_flags);
@@ -946,6 +1036,8 @@ int perf_counter_task_enable(void)
 		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
+		counter->tstamp_enabled = ctx->time_now -
+			counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
@@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 static void __read(void *info)
 {
 	struct perf_counter *counter = info;
+	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
 	curr_rq_lock_irq_save(&flags);
+	if (ctx->is_active)
+		update_context_time(ctx, 1);
 	counter->hw_ops->read(counter);
+	update_counter_times(counter);
 	curr_rq_unlock_irq_restore(&flags);
 }
 
@@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
 		smp_call_function_single(counter->oncpu,
 					 __read, counter, 1);
+	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+		update_counter_times(counter);
 	}
 
 	return atomic64_read(&counter->count);
@@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file)
 static ssize_t
 perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 {
-	u64 cntval;
-
-	if (count < sizeof(cntval))
-		return -EINVAL;
+	u64 values[3];
+	int n;
 
 	/*
 	 * Return end-of-file for a read on a counter that is in
@@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		return 0;
 
 	mutex_lock(&counter->mutex);
-	cntval = perf_counter_read(counter);
+	values[0] = perf_counter_read(counter);
+	n = 1;
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+		values[n++] = counter->total_time_enabled +
+			atomic64_read(&counter->child_total_time_enabled);
+	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+		values[n++] = counter->total_time_running +
+			atomic64_read(&counter->child_total_time_running);
 	mutex_unlock(&counter->mutex);
 
-	return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+	if (count < n * sizeof(u64))
+		return -EINVAL;
+	count = n * sizeof(u64);
+
+	if (copy_to_user(buf, values, count))
+		return -EFAULT;
+
+	return count;
 }
 
 static ssize_t
@@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	 * Link it up in the child's context:
 	 */
 	child_counter->task = child;
-	list_add_counter(child_counter, child_ctx);
-	child_ctx->nr_counters++;
+	add_counter_to_ctx(child_counter, child_ctx);
 
 	child_counter->parent = parent_counter;
 	/*
@@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	 * Add back the child's count to the parent's count:
 	 */
 	atomic64_add(child_val, &parent_counter->count);
+	atomic64_add(child_counter->total_time_enabled,
+		     &parent_counter->child_total_time_enabled);
+	atomic64_add(child_counter->total_time_running,
+		     &parent_counter->child_total_time_running);
 
 	/*
 	 * Remove this counter from the parent's list
@@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	if (child != current) {
 		wait_task_inactive(child, 0);
 		list_del_init(&child_counter->list_entry);
+		update_counter_times(child_counter);
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
@@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
 		group_sched_out(child_counter, cpuctx, child_ctx);
+		update_counter_times(child_counter);
 
 		list_del_init(&child_counter->list_entry);
 
-- 
GitLab


From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:02 +0200
Subject: [PATCH 0213/2198] perf_counter: unify and fix delayed counter wakeup

While going over the wakeup code I noticed delayed wakeups only work
for hardware counters but basically all software counters rely on
them.

This patch unifies and generalizes the delayed wakeup to fix this
issue.

Since we're dealing with NMI context bits here, use a cmpxchg() based
single link list implementation to track counters that have pending
wakeups.

[ This should really be generic code for delayed wakeups, but since we
  cannot use cmpxchg()/xchg() in generic code, I've let it live in the
  perf_counter code. -- Eric Dumazet could use it to aggregate the
  network wakeups. ]

Furthermore, the x86 method of using TIF flags was flawed in that its
quite possible to end up setting the bit on the idle task, loosing the
wakeup.

The powerpc method uses per-cpu storage and does appear to be
sufficient.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.153932974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/hw_irq.h   |   4 +-
 arch/powerpc/kernel/irq.c           |   2 +-
 arch/powerpc/kernel/perf_counter.c  |  22 +----
 arch/x86/include/asm/perf_counter.h |   5 +-
 arch/x86/include/asm/thread_info.h  |   4 +-
 arch/x86/kernel/cpu/perf_counter.c  |  29 -------
 arch/x86/kernel/signal.c            |   6 --
 include/linux/perf_counter.h        |  15 ++--
 kernel/perf_counter.c               | 128 ++++++++++++++++++++++++++--
 kernel/timer.c                      |   3 +
 10 files changed, 142 insertions(+), 76 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index cb32d571c9c7f..20a44d0c9fdde 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags)
 struct irq_chip;
 
 #ifdef CONFIG_PERF_COUNTERS
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	unsigned long x;
 
@@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void);
 
 #else
 
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
 	return 0;
 }
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 469e9635ff04b..2cd471f92fe65 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en)
 			iseries_handle_interrupts();
 	}
 
-	if (get_perf_counter_pending()) {
+	if (test_perf_counter_pending()) {
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
 	}
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df007fe0cc0bb..cde720fc495cd 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter)
 	return &power_perf_ops;
 }
 
-/*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
-	int i;
-	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-	struct perf_counter *counter;
-
-	for (i = 0; i < cpuhw->n_counters; ++i) {
-		counter = cpuhw->counter[i];
-		if (counter && counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-}
-
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
 	struct perf_counter *counter;
 	long val;
-	int need_wakeup = 0, found = 0;
+	int found = 0;
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * immediately; otherwise we'll have do the wakeup when interrupts
 	 * get soft-enabled.
 	 */
-	if (get_perf_counter_pending() && regs->softe) {
+	if (test_perf_counter_pending() && regs->softe) {
 		irq_enter();
 		clear_perf_counter_pending();
 		perf_counter_do_pending();
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 1662043b340fb..e2b0e66b23535 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,8 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()	\
-		set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+#define set_perf_counter_pending()	do { } while (0)
+#define clear_perf_counter_pending()	do { } while (0)
+#define test_perf_counter_pending()	(0)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ffd5d2a36769..8820a73ae090a 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,7 +83,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
-#define TIF_PERF_COUNTERS	11	/* notify perf counter work */
 #define TIF_NOTSC		16	/* TSC is not accessible in userland */
 #define TIF_IA32		17	/* 32bit process */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -107,7 +106,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT	(1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY		(1 << TIF_MCE_NOTIFY)
-#define _TIF_PERF_COUNTERS	(1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC		(1 << TIF_NOTSC)
 #define _TIF_IA32		(1 << TIF_IA32)
 #define _TIF_FORK		(1 << TIF_FORK)
@@ -141,7 +139,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
+	(_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f95b0cdc550e..7aab177fb5663 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		 */
 		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
-	counter->wakeup_pending = 0;
 
 	return 0;
 }
@@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
-/*
- * This handler is triggered by NMI contexts:
- */
-void perf_counter_notify(struct pt_regs *regs)
-{
-	struct cpu_hw_counters *cpuc;
-	unsigned long flags;
-	int bit, cpu;
-
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-	for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
-		struct perf_counter *counter = cpuc->counters[bit];
-
-		if (!counter)
-			continue;
-
-		if (counter->wakeup_pending) {
-			counter->wakeup_pending = 0;
-			wake_up(&counter->waitq);
-		}
-	}
-
-	local_irq_restore(flags);
-}
-
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 611615a92c902..0a813b17b1724 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 		tracehook_notify_resume(regs);
 	}
 
-	if (thread_info_flags & _TIF_PERF_COUNTERS) {
-		clear_thread_flag(TIF_PERF_COUNTERS);
-		perf_counter_notify(regs);
-	}
-
 #ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bf67ce176259..0d833228eee5a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -275,6 +275,10 @@ struct perf_mmap_data {
 	void 				*data_pages[0];
 };
 
+struct perf_wakeup_entry {
+	struct perf_wakeup_entry *next;
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -350,7 +354,7 @@ struct perf_counter {
 	/* poll related */
 	wait_queue_head_t		waitq;
 	/* optional: for NMIs */
-	int				wakeup_pending;
+	struct perf_wakeup_entry	wakeup;
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
@@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
-extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void perf_counter_unthrottle(void);
 extern u64 hw_perf_save_disable(void);
@@ -461,7 +465,7 @@ static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
-static inline void perf_counter_notify(struct pt_regs *regs)		{ }
+static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
 static inline void hw_perf_restore(u64 ctrl)				{ }
@@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void)		      { return 0; }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
-static inline void perf_swcounter_event(u32 event, u64 nr,
-					int nmi, struct pt_regs *regs)	{ }
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b862a7988cda..f70ff80e79d7a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head)
 	kfree(counter);
 }
 
+static void perf_pending_sync(struct perf_counter *counter);
+
 static void free_counter(struct perf_counter *counter)
 {
+	perf_pending_sync(counter);
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = {
 	.mmap			= perf_mmap,
 };
 
+/*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+	struct perf_mmap_data *data;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (data) {
+		(void)atomic_xchg(&data->wakeup, POLL_IN);
+		__perf_counter_update_userpage(counter, data);
+	}
+	rcu_read_unlock();
+
+	wake_up_all(&counter->waitq);
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wakeup up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+	PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_counter *counter)
+{
+	struct perf_wakeup_entry **head;
+	struct perf_wakeup_entry *prev, *next;
+
+	if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+		return;
+
+	head = &get_cpu_var(perf_wakeup_head);
+
+	do {
+		prev = counter->wakeup.next = *head;
+		next = &counter->wakeup;
+	} while (cmpxchg(head, prev, next) != prev);
+
+	set_perf_counter_pending();
+
+	put_cpu_var(perf_wakeup_head);
+}
+
+static int __perf_pending_run(void)
+{
+	struct perf_wakeup_entry *list;
+	int nr = 0;
+
+	list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+	while (list != PENDING_TAIL) {
+		struct perf_counter *counter = container_of(list,
+				struct perf_counter, wakeup);
+
+		list = list->next;
+
+		counter->wakeup.next = NULL;
+		/*
+		 * Ensure we observe the unqueue before we issue the wakeup,
+		 * so that we won't be waiting forever.
+		 * -- see perf_not_pending().
+		 */
+		smp_wmb();
+
+		perf_counter_wakeup(counter);
+		nr++;
+	}
+
+	return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+	/*
+	 * If we flush on whatever cpu we run, there is a chance we don't
+	 * need to wait.
+	 */
+	get_cpu();
+	__perf_pending_run();
+	put_cpu();
+
+	/*
+	 * Ensure we see the proper queue state before going to sleep
+	 * so that we do not miss the wakeup. -- see perf_pending_handle()
+	 */
+	smp_rmb();
+	return counter->wakeup.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+	wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+	__perf_pending_run();
+}
+
 /*
  * Output
  */
@@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
 	if (handle->wakeup) {
-		(void)atomic_xchg(&handle->data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(handle->counter, handle->data);
-		if (nmi) {
-			handle->counter->wakeup_pending = 1;
-			set_perf_counter_pending();
-		} else
-			wake_up(&handle->counter->waitq);
+		if (nmi)
+			perf_pending_queue(handle->counter);
+		else
+			perf_counter_wakeup(handle->counter);
 	}
 	rcu_read_unlock();
 }
@@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
-	counter->wakeup_pending		= 0;
 	counter->group_leader		= group_leader;
 	counter->hw_ops			= NULL;
 	counter->ctx			= ctx;
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4a..672ca25fbc43a 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+	perf_counter_do_pending();
+
 	hrtimer_run_pending();
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
-- 
GitLab


From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:03 +0200
Subject: [PATCH 0214/2198] perf_counter: fix update_userpage()

It just occured to me it is possible to have multiple contending
updates of the userpage (mmap information vs overflow vs counter).
This would break the seqlock logic.

It appear the arch code uses this from NMI context, so we cannot
possibly serialize its use, therefore separate the data_head update
from it and let it return to its original use.

The arch code needs to make sure there are no contending callers by
disabling the counter before using it -- powerpc appears to do this
nicely.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.241410660@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 35 +++++++++++++++++++++++++++++++++
 kernel/perf_counter.c        | 38 ++++++++++++++++++++++--------------
 2 files changed, 58 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0d833228eee5a..8ac18852dcfe8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -160,10 +160,45 @@ struct perf_counter_hw_event {
 struct perf_counter_mmap_page {
 	__u32	version;		/* version number of this structure */
 	__u32	compat_version;		/* lowest version this is compat with */
+
+	/*
+	 * Bits needed to read the hw counters in user-space.
+	 *
+	 * The index and offset should be read atomically using the seqlock:
+	 *
+	 *   __u32 seq, index;
+	 *   __s64 offset;
+	 *
+	 * again:
+	 *   rmb();
+	 *   seq = pc->lock;
+	 *
+	 *   if (unlikely(seq & 1)) {
+	 *     cpu_relax();
+	 *     goto again;
+	 *   }
+	 *
+	 *   index = pc->index;
+	 *   offset = pc->offset;
+	 *
+	 *   rmb();
+	 *   if (pc->lock != seq)
+	 *     goto again;
+	 *
+	 * After this, index contains architecture specific counter index + 1,
+	 * so that 0 means unavailable, offset contains the value to be added
+	 * to the result of the raw timer read to obtain this counter's value.
+	 */
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
 	__s64	offset;			/* add to hardware counter value */
 
+	/*
+	 * Control data for the mmap() data buffer.
+	 *
+	 * User-space reading this value should issue an rmb(), on SMP capable
+	 * platforms, after reading this value -- see perf_counter_wakeup().
+	 */
 	__u32   data_head;		/* head in the data section */
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f70ff80e79d7a..c95e92329b978 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1316,10 +1316,22 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
-static void __perf_counter_update_userpage(struct perf_counter *counter,
-					   struct perf_mmap_data *data)
+/*
+ * Callers need to ensure there can be no nesting of this function, otherwise
+ * the seqlock logic goes bad. We can not serialize this because the arch
+ * code calls this from NMI context.
+ */
+void perf_counter_update_userpage(struct perf_counter *counter)
 {
-	struct perf_counter_mmap_page *userpg = data->user_page;
+	struct perf_mmap_data *data;
+	struct perf_counter_mmap_page *userpg;
+
+	rcu_read_lock();
+	data = rcu_dereference(counter->data);
+	if (!data)
+		goto unlock;
+
+	userpg = data->user_page;
 
 	/*
 	 * Disable preemption so as to not let the corresponding user-space
@@ -1333,20 +1345,10 @@ static void __perf_counter_update_userpage(struct perf_counter *counter,
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
 
-	userpg->data_head = atomic_read(&data->head);
 	smp_wmb();
 	++userpg->lock;
 	preempt_enable();
-}
-
-void perf_counter_update_userpage(struct perf_counter *counter)
-{
-	struct perf_mmap_data *data;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (data)
-		__perf_counter_update_userpage(counter, data);
+unlock:
 	rcu_read_unlock();
 }
 
@@ -1547,7 +1549,13 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	data = rcu_dereference(counter->data);
 	if (data) {
 		(void)atomic_xchg(&data->wakeup, POLL_IN);
-		__perf_counter_update_userpage(counter, data);
+		/*
+		 * Ensure all data writes are issued before updating the
+		 * user-space data head information. The matching rmb()
+		 * will be in userspace after reading this value.
+		 */
+		smp_wmb();
+		data->user_page->data_head = atomic_read(&data->head);
 	}
 	rcu_read_unlock();
 
-- 
GitLab


From 195564390210977954fe4ef45b39cdee34f41b59 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:04 +0200
Subject: [PATCH 0215/2198] perf_counter: kerneltop: simplify data_head read

Now that the kernel side changed, match up again.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.327144324@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index fda1438365dc0..2779c57ad4baf 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1125,22 +1125,10 @@ struct mmap_data {
 static unsigned int mmap_read_head(struct mmap_data *md)
 {
 	struct perf_counter_mmap_page *pc = md->base;
-	unsigned int seq, head;
-
-repeat:
-	rmb();
-	seq = pc->lock;
-
-	if (unlikely(seq & 1)) {
-		cpu_relax();
-		goto repeat;
-	}
+	int head;
 
 	head = pc->data_head;
-
 	rmb();
-	if (pc->lock != seq)
-		goto repeat;
 
 	return head;
 }
-- 
GitLab


From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:05 +0200
Subject: [PATCH 0216/2198] perf_counter: executable mmap() information

Currently the profiling information returns userspace IPs but no way
to correlate them to userspace code. Userspace could look into
/proc/$pid/maps but that might not be current or even present anymore
at the time of analyzing the IPs.

Therefore provide means to track the mmap information and provide it
in the output stream.

XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Orig-LKML-Reference: <20090330171023.417259499@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  24 +++++-
 kernel/perf_counter.c        | 145 +++++++++++++++++++++++++++++++++++
 mm/mmap.c                    |  10 +++
 3 files changed, 177 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8ac18852dcfe8..037a81145acaa 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -137,9 +137,11 @@ struct perf_counter_hw_event {
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
-				include_tid    :  1, /* include the tid */
+				include_tid    :  1, /* include the tid       */
+				mmap           :  1, /* include mmap data     */
+				munmap         :  1, /* include munmap data   */
 
-				__reserved_1   : 54;
+				__reserved_1   : 52;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -211,6 +213,9 @@ enum perf_event_type {
 	PERF_EVENT_IP		= 0,
 	PERF_EVENT_GROUP	= 1,
 
+	PERF_EVENT_MMAP		= 2,
+	PERF_EVENT_MUNMAP	= 3,
+
 	__PERF_EVENT_TID	= 0x100,
 };
 
@@ -491,6 +496,12 @@ static inline int is_software_counter(struct perf_counter *counter)
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
 
+extern void perf_counter_mmap(unsigned long addr, unsigned long len,
+			      unsigned long pgoff, struct file *file);
+
+extern void perf_counter_munmap(unsigned long addr, unsigned long len,
+				unsigned long pgoff, struct file *file);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -511,6 +522,15 @@ static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 static inline void
 perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
 
+
+static inline void
+perf_counter_mmap(unsigned long addr, unsigned long len,
+		  unsigned long pgoff, struct file *file)		{ }
+
+static inline void
+perf_counter_munmap(unsigned long addr, unsigned long len,
+		    unsigned long pgoff, struct file *file) 		{ }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c95e92329b978..f35e89e3d6a41 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -25,6 +25,7 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
+#include <linux/dcache.h>
 
 #include <asm/irq_regs.h>
 
@@ -1843,6 +1844,150 @@ void perf_counter_output(struct perf_counter *counter,
 	}
 }
 
+/*
+ * mmap tracking
+ */
+
+struct perf_mmap_event {
+	struct file	*file;
+	char		*file_name;
+	int		file_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+		u64				start;
+		u64				len;
+		u64				pgoff;
+	} event;
+};
+
+static void perf_counter_mmap_output(struct perf_counter *counter,
+				     struct perf_mmap_event *mmap_event)
+{
+	struct perf_output_handle handle;
+	int size = mmap_event->event.header.size;
+	int ret = perf_output_begin(&handle, counter, size);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, mmap_event->event);
+	perf_output_copy(&handle, mmap_event->file_name,
+				   mmap_event->file_size);
+	perf_output_end(&handle, 0);
+}
+
+static int perf_counter_mmap_match(struct perf_counter *counter,
+				   struct perf_mmap_event *mmap_event)
+{
+	if (counter->hw_event.mmap &&
+	    mmap_event->event.header.type == PERF_EVENT_MMAP)
+		return 1;
+
+	if (counter->hw_event.munmap &&
+	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
+				  struct perf_mmap_event *mmap_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_mmap_match(counter, mmap_event))
+			perf_counter_mmap_output(counter, mmap_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct file *file = mmap_event->file;
+	unsigned int size;
+	char tmp[16];
+	char *buf = NULL;
+	char *name;
+
+	if (file) {
+		buf = kzalloc(PATH_MAX, GFP_KERNEL);
+		if (!buf) {
+			name = strncpy(tmp, "//enomem", sizeof(tmp));
+			goto got_name;
+		}
+		name = dentry_path(file->f_dentry, buf, PATH_MAX);
+		if (IS_ERR(name)) {
+			name = strncpy(tmp, "//toolong", sizeof(tmp));
+			goto got_name;
+		}
+	} else {
+		name = strncpy(tmp, "//anon", sizeof(tmp));
+		goto got_name;
+	}
+
+got_name:
+	size = ALIGN(strlen(name), sizeof(u64));
+
+	mmap_event->file_name = name;
+	mmap_event->file_size = size;
+
+	mmap_event->event.header.size = sizeof(mmap_event->event) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+	put_cpu_var(perf_cpu_context);
+
+	perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
+
+	kfree(buf);
+}
+
+void perf_counter_mmap(unsigned long addr, unsigned long len,
+		       unsigned long pgoff, struct file *file)
+{
+	struct perf_mmap_event mmap_event = {
+		.file   = file,
+		.event  = {
+			.header = { .type = PERF_EVENT_MMAP, },
+			.pid	= current->group_leader->pid,
+			.tid	= current->pid,
+			.start  = addr,
+			.len    = len,
+			.pgoff  = pgoff,
+		},
+	};
+
+	perf_counter_mmap_event(&mmap_event);
+}
+
+void perf_counter_munmap(unsigned long addr, unsigned long len,
+			 unsigned long pgoff, struct file *file)
+{
+	struct perf_mmap_event mmap_event = {
+		.file   = file,
+		.event  = {
+			.header = { .type = PERF_EVENT_MUNMAP, },
+			.pid	= current->group_leader->pid,
+			.tid	= current->pid,
+			.start  = addr,
+			.len    = len,
+			.pgoff  = pgoff,
+		},
+	};
+
+	perf_counter_mmap_event(&mmap_event);
+}
+
 /*
  * Generic software counter infrastructure
  */
diff --git a/mm/mmap.c b/mm/mmap.c
index 4a3841186c111..1df63f614f97f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -28,6 +28,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -1223,6 +1224,9 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
+	if (vm_flags & VM_EXEC)
+		perf_counter_mmap(addr, len, pgoff, file);
+
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
 	if (vm_flags & VM_LOCKED) {
@@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 	do {
 		long nrpages = vma_pages(vma);
 
+		if (vma->vm_flags & VM_EXEC) {
+			perf_counter_munmap(vma->vm_start,
+					nrpages << PAGE_SHIFT,
+					vma->vm_pgoff, vma->vm_file);
+		}
+
 		mm->total_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
-- 
GitLab


From 3c1ba6fafecaed295017881f8863a18602f32c1d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:06 +0200
Subject: [PATCH 0217/2198] perf_counter: kerneltop: parse the mmap data stream

frob the kerneltop code to print the mmap data in the stream

Better use would be collecting the IPs per PID and mapping them onto
the provided userspace code.. TODO

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.501902515@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 50 ++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 2779c57ad4baf..995111dee7fba 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -184,6 +184,8 @@ static int			nmi				=  1;
 static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
+static int			use_mmap			= 0;
+static int			use_munmap			= 0;
 
 static char			*vmlinux;
 
@@ -333,6 +335,8 @@ static void display_help(void)
 	" -z        --zero             # zero counts after display\n"
 	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
 	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -M        --mmap_info        # print mmap info stream\n"
+	" -U        --munmap_info      # print munmap info stream\n"
 	);
 
 	exit(0);
@@ -1052,9 +1056,11 @@ static void process_options(int argc, char *argv[])
 			{"stat",	no_argument,		NULL, 'S'},
 			{"zero",	no_argument,		NULL, 'z'},
 			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"mmap_info",	no_argument,		NULL, 'M'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1092,6 +1098,8 @@ static void process_options(int argc, char *argv[])
 		case 'x': vmlinux			= strdup(optarg); break;
 		case 'z': zero				=              1; break;
 		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'M': use_mmap			=              1; break;
+		case 'U': use_munmap			=              1; break;
 		default: error = 1; break;
 		}
 	}
@@ -1172,12 +1180,29 @@ static void mmap_read(struct mmap_data *md)
 	last_read = this_read;
 
 	for (; old != head;) {
-		struct event_struct {
+		struct ip_event {
 			struct perf_event_header header;
 			__u64 ip;
 			__u32 pid, tid;
-		} *event = (struct event_struct *)&data[old & md->mask];
-		struct event_struct event_copy;
+		};
+		struct mmap_event {
+			struct perf_event_header header;
+			__u32 pid, tid;
+			__u64 start;
+			__u64 len;
+			__u64 pgoff;
+			char filename[PATH_MAX];
+		};
+
+		typedef union event_union {
+			struct perf_event_header header;
+			struct ip_event ip;
+			struct mmap_event mmap;
+		} event_t;
+
+		event_t *event = (event_t *)&data[old & md->mask];
+
+		event_t event_copy;
 
 		unsigned int size = event->header.size;
 
@@ -1187,7 +1212,7 @@ static void mmap_read(struct mmap_data *md)
 		 */
 		if ((old & md->mask) + size != ((old + size) & md->mask)) {
 			unsigned int offset = old;
-			unsigned int len = sizeof(*event), cpy;
+			unsigned int len = min(sizeof(*event), size), cpy;
 			void *dst = &event_copy;
 
 			do {
@@ -1206,7 +1231,18 @@ static void mmap_read(struct mmap_data *md)
 		switch (event->header.type) {
 		case PERF_EVENT_IP:
 		case PERF_EVENT_IP | __PERF_EVENT_TID:
-			process_event(event->ip, md->counter);
+			process_event(event->ip.ip, md->counter);
+			break;
+
+		case PERF_EVENT_MMAP:
+		case PERF_EVENT_MUNMAP:
+			printf("%s: %Lu %Lu %Lu %s\n",
+					event->header.type == PERF_EVENT_MMAP
+					  ? "mmap" : "munmap",
+					event->mmap.start,
+					event->mmap.len,
+					event->mmap.pgoff,
+					event->mmap.filename);
 			break;
 		}
 	}
@@ -1255,6 +1291,8 @@ int main(int argc, char *argv[])
 			hw_event.record_type	= PERF_RECORD_IRQ;
 			hw_event.nmi		= nmi;
 			hw_event.include_tid	= 1;
+			hw_event.mmap		= use_mmap;
+			hw_event.munmap		= use_munmap;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
-- 
GitLab


From 7595d63b3a9ce65d14c4fbd0e7de448a343d7215 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:07 +0200
Subject: [PATCH 0218/2198] perf_counter: powerpc: only reserve PMU hardware
 when we need it

Impact: cooperate with oprofile

At present, on PowerPC, if you have perf_counters compiled in, oprofile
doesn't work.  There is code to allow the PMU to be shared between
competing subsystems, such as perf_counters and oprofile, but currently
the perf_counter subsystem reserves the PMU for itself at boot time,
and never releases it.

This makes perf_counter play nicely with oprofile.  Now we keep a count
of how many perf_counter instances are counting hardware events, and
reserve the PMU when that count becomes non-zero, and release the PMU
when that count becomes zero.  This means that it is possible to have
perf_counters compiled in and still use oprofile, as long as there are
no hardware perf_counters active.  This also means that if oprofile is
active, sys_perf_counter_open will fail if the hw_event specifies a
hardware event.

To avoid races with other tasks creating and destroying perf_counters,
we use a mutex.  We use atomic_inc_not_zero and atomic_add_unless to
avoid having to take the mutex unless there is a possibility of the
count going between 0 and 1.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.627912475@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 47 ++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index cde720fc495cd..560dd1e7b5244 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -41,6 +41,8 @@ struct power_pmu *ppmu;
  */
 static unsigned int freeze_counters_kernel = MMCR0_FCS;
 
+static void perf_counter_interrupt(struct pt_regs *regs);
+
 void perf_counter_print_debug(void)
 {
 }
@@ -594,6 +596,24 @@ struct hw_perf_counter_ops power_perf_ops = {
 	.read = power_perf_read
 };
 
+/* Number of perf_counters counting hardware events */
+static atomic_t num_counters;
+/* Used to avoid races in calling reserve/release_pmc_hardware */
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+/*
+ * Release the PMU if this is the last perf_counter.
+ */
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (!atomic_add_unless(&num_counters, -1, 1)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_dec_return(&num_counters) == 0)
+			release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter)
 {
@@ -601,6 +621,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
 	int n;
+	int err;
 
 	if (!ppmu)
 		return NULL;
@@ -646,6 +667,27 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+
+	/*
+	 * See if we need to reserve the PMU.
+	 * If no counters are currently in use, then we have to take a
+	 * mutex to ensure that we don't race with another task doing
+	 * reserve_pmc_hardware or release_pmc_hardware.
+	 */
+	err = 0;
+	if (!atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 &&
+		    reserve_pmc_hardware(perf_counter_interrupt))
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	counter->destroy = hw_perf_counter_destroy;
+
+	if (err)
+		return NULL;
 	return &power_perf_ops;
 }
 
@@ -769,11 +811,6 @@ static int init_perf_counters(void)
 {
 	unsigned long pvr;
 
-	if (reserve_pmc_hardware(perf_counter_interrupt)) {
-		printk(KERN_ERR "Couldn't init performance monitor subsystem\n");
-		return -EBUSY;
-	}
-
 	/* XXX should get this from cputable */
 	pvr = mfspr(SPRN_PVR);
 	switch (PVR_VER(pvr)) {
-- 
GitLab


From d5d2bc0dd0379deddb9ede66fec90a3083eaec57 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:08 +0200
Subject: [PATCH 0219/2198] perf_counter: make it possible for
 hw_perf_counter_init to return error codes

Impact: better error reporting

At present, if hw_perf_counter_init encounters an error, all it can do
is return NULL, which causes sys_perf_counter_open to return an EINVAL
error to userspace.  This isn't very informative for userspace; it means
that userspace can't tell the difference between "sorry, oprofile is
already using the PMU" and "we don't support this CPU" and "this CPU
doesn't support the requested generic hardware event".

This commit uses the PTR_ERR/ERR_PTR/IS_ERR set of macros to let
hw_perf_counter_init return an error code on error rather than just NULL
if it wishes.  If it does so, that error code will be returned from
sys_perf_counter_open to userspace.  If it returns NULL, an EINVAL
error will be returned to userspace, as before.

This also adapts the powerpc hw_perf_counter_init to make use of this
to return ENXIO, EINVAL, EBUSY, or EOPNOTSUPP as appropriate.  It would
be good to add extra error numbers in future to allow userspace to
distinguish the various errors that are currently reported as EINVAL,
i.e. irq_period < 0, too many events in a group, conflict between
exclude_* settings in a group, and PMU resource conflict in a group.

[ v2: fix a bug pointed out by Corey Ashford where error returns from
      hw_perf_counter_init were not handled correctly in the case of
      raw hardware events.]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.682428180@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 14 ++++++------
 kernel/perf_counter.c              | 35 +++++++++++++++++++-----------
 2 files changed, 29 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 560dd1e7b5244..0a4d14f279ae6 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -624,13 +624,13 @@ hw_perf_counter_init(struct perf_counter *counter)
 	int err;
 
 	if (!ppmu)
-		return NULL;
+		return ERR_PTR(-ENXIO);
 	if ((s64)counter->hw_event.irq_period < 0)
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	if (!perf_event_raw(&counter->hw_event)) {
 		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
-			return NULL;
+			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
 		ev = perf_event_config(&counter->hw_event);
@@ -656,14 +656,14 @@ hw_perf_counter_init(struct perf_counter *counter)
 		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
 				   ctrs, events);
 		if (n < 0)
-			return NULL;
+			return ERR_PTR(-EINVAL);
 	}
 	events[n] = ev;
 	ctrs[n] = counter;
 	if (check_excludes(ctrs, n, 1))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 	if (power_check_constraints(events, n + 1))
-		return NULL;
+		return ERR_PTR(-EINVAL);
 
 	counter->hw.config = events[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
@@ -687,7 +687,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	counter->destroy = hw_perf_counter_destroy;
 
 	if (err)
-		return NULL;
+		return ERR_PTR(err);
 	return &power_perf_ops;
 }
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f35e89e3d6a41..d07b45278b4f5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2453,10 +2453,11 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 {
 	const struct hw_perf_counter_ops *hw_ops;
 	struct perf_counter *counter;
+	long err;
 
 	counter = kzalloc(sizeof(*counter), gfpflags);
 	if (!counter)
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * Single counters are their own group leaders, with an
@@ -2505,12 +2506,18 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		hw_ops = tp_perf_counter_init(counter);
 		break;
 	}
+done:
+	err = 0;
+	if (!hw_ops)
+		err = -EINVAL;
+	else if (IS_ERR(hw_ops))
+		err = PTR_ERR(hw_ops);
 
-	if (!hw_ops) {
+	if (err) {
 		kfree(counter);
-		return NULL;
+		return ERR_PTR(err);
 	}
-done:
+
 	counter->hw_ops = hw_ops;
 
 	return counter;
@@ -2583,10 +2590,10 @@ SYSCALL_DEFINE5(perf_counter_open,
 			goto err_put_context;
 	}
 
-	ret = -EINVAL;
 	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
 				     GFP_KERNEL);
-	if (!counter)
+	ret = PTR_ERR(counter);
+	if (IS_ERR(counter))
 		goto err_put_context;
 
 	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
@@ -2658,8 +2665,8 @@ inherit_counter(struct perf_counter *parent_counter,
 	child_counter = perf_counter_alloc(&parent_counter->hw_event,
 					   parent_counter->cpu, child_ctx,
 					   group_leader, GFP_KERNEL);
-	if (!child_counter)
-		return NULL;
+	if (IS_ERR(child_counter))
+		return child_counter;
 
 	/*
 	 * Link it up in the child's context:
@@ -2710,15 +2717,17 @@ static int inherit_group(struct perf_counter *parent_counter,
 {
 	struct perf_counter *leader;
 	struct perf_counter *sub;
+	struct perf_counter *child_ctr;
 
 	leader = inherit_counter(parent_counter, parent, parent_ctx,
 				 child, NULL, child_ctx);
-	if (!leader)
-		return -ENOMEM;
+	if (IS_ERR(leader))
+		return PTR_ERR(leader);
 	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) {
-		if (!inherit_counter(sub, parent, parent_ctx,
-				     child, leader, child_ctx))
-			return -ENOMEM;
+		child_ctr = inherit_counter(sub, parent, parent_ctx,
+					    child, leader, child_ctx);
+		if (IS_ERR(child_ctr))
+			return PTR_ERR(child_ctr);
 	}
 	return 0;
 }
-- 
GitLab


From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:09 +0200
Subject: [PATCH 0220/2198] perf_counter: x86: proper error propagation for the
 x86 hw_perf_counter_init()

Now that Paul cleaned up the error propagation paths, pass down the
x86 error as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.792822360@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7aab177fb5663..b8885ccd8049e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	err = __hw_perf_counter_init(counter);
 	if (err)
-		return NULL;
+		return ERR_PTR(err);
 
 	return &x86_perf_counter_ops;
 }
-- 
GitLab


From 31f004df8d14212f0a8a2fb12a8ed44a3d80e2fb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 30 Mar 2009 19:07:10 +0200
Subject: [PATCH 0221/2198] perf_counter tools: optionally scale counter values
 in perfstat mode

Impact: new functionality

This adds add an option to the perfstat mode of kerneltop to scale the
reported counter values according to the fraction of time that each
counter gets to count.  This is invoked with the -l option (I used 'l'
because s, c, a and e were all taken already.)  This uses the new
PERF_RECORD_TOTAL_TIME_{ENABLED,RUNNING} read format options.

With this, we get output like this:

$ ./perfstat -l -e 0:0,0:1,0:2,0:3,0:4,0:5 ./spin

 Performance counter stats for './spin':

     4016072055  CPU cycles           (events)  (scaled from 66.53%)
     2005887318  instructions         (events)  (scaled from 66.53%)
        1762849  cache references     (events)  (scaled from 66.69%)
         165229  cache misses         (events)  (scaled from 66.85%)
     1001298009  branches             (events)  (scaled from 66.78%)
          41566  branch misses        (events)  (scaled from 66.61%)

 Wall-clock time elapsed:  2438.227446 msecs

This also lets us detect when a counter is zero because the counter
never got to go on the CPU at all.  In that case we print <not counted>
rather than 0.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <20090330171023.871484899@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 56 +++++++++++++++++++++-----
 1 file changed, 45 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 995111dee7fba..c0ca01504ff36 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -197,6 +197,8 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
+static int			scale;
+
 struct source_line {
 	uint64_t		EIP;
 	unsigned long		count;
@@ -305,6 +307,7 @@ static void display_perfstat_help(void)
 	display_events_help();
 
 	printf(
+	" -l                           # scale counter values\n"
 	" -a                           # system-wide collection\n");
 	exit(0);
 }
@@ -328,6 +331,7 @@ static void display_help(void)
 	" -c CNT    --count=CNT        # event period to sample\n\n"
 	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
 	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
+	" -l                           # show scale factor for RR events\n"
 	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
 	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
 	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
@@ -436,6 +440,9 @@ static void create_perfstat_counter(int counter)
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= PERF_RECORD_SIMPLE;
 	hw_event.nmi		= 0;
+	if (scale)
+		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+					  PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide) {
 		int cpu;
@@ -507,28 +514,53 @@ int do_perfstat(int argc, char *argv[])
 	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu;
-		__u64 count, single_count;
+		int cpu, nv;
+		__u64 count[3], single_count[3];
+		int scaled;
 
-		count = 0;
+		count[0] = count[1] = count[2] = 0;
+		nv = scale ? 3 : 1;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
 			res = read(fd[cpu][counter],
-					(char *) &single_count, sizeof(single_count));
-			assert(res == sizeof(single_count));
-			count += single_count;
+				   single_count, nv * sizeof(__u64));
+			assert(res == nv * sizeof(__u64));
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
+		}
+
+		scaled = 0;
+		if (scale) {
+			if (count[2] == 0) {
+				fprintf(stderr, " %14s  %-20s\n",
+					"<not counted>", event_name(counter));
+				continue;
+			}
+			if (count[2] < count[1]) {
+				scaled = 1;
+				count[0] = (unsigned long long)
+					((double)count[0] * count[1] / count[2] + 0.5);
+			}
 		}
 
 		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
 		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
 
-			double msecs = (double)count / 1000000;
+			double msecs = (double)count[0] / 1000000;
 
-			fprintf(stderr, " %14.6f  %-20s (msecs)\n",
+			fprintf(stderr, " %14.6f  %-20s (msecs)",
 				msecs, event_name(counter));
 		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)\n",
-				count, event_name(counter));
+			fprintf(stderr, " %14Ld  %-20s (events)",
+				count[0], event_name(counter));
 		}
+		if (scaled)
+			fprintf(stderr, "  (scaled from %.2f%%)",
+				(double) count[2] / count[1] * 100);
+		fprintf(stderr, "\n");
 	}
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
@@ -1049,6 +1081,7 @@ static void process_options(int argc, char *argv[])
 			{"filter",	required_argument,	NULL, 'f'},
 			{"group",	required_argument,	NULL, 'g'},
 			{"help",	no_argument,		NULL, 'h'},
+			{"scale",	no_argument,		NULL, 'l'},
 			{"nmi",		required_argument,	NULL, 'n'},
 			{"pid",		required_argument,	NULL, 'p'},
 			{"vmlinux",	required_argument,	NULL, 'x'},
@@ -1060,7 +1093,7 @@ static void process_options(int argc, char *argv[])
 			{"munmap_info",	no_argument,		NULL, 'U'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:s:Sx:zMU",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1084,6 +1117,7 @@ static void process_options(int argc, char *argv[])
 		case 'f': count_filter			=   atoi(optarg); break;
 		case 'g': group				=   atoi(optarg); break;
 		case 'h':      				  display_help(); break;
+		case 'l': scale				=	       1; break;
 		case 'n': nmi				=   atoi(optarg); break;
 		case 'p':
 			/* CPU and PID are mutually exclusive */
-- 
GitLab


From 78d613eb129fc4edf0e2cabfcc6a4c5285482d21 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:11 +0200
Subject: [PATCH 0222/2198] perf_counter: small cleanup of the output routines

Move the nmi argument to the _begin() function, so that _end() only needs the
handle. This allows the _begin() function to generate a wakeup on event loss.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.959404268@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 42 ++++++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 16 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d07b45278b4f5..4471e7e2c109d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1663,10 +1663,20 @@ struct perf_output_handle {
 	unsigned int		offset;
 	unsigned int		head;
 	int			wakeup;
+	int			nmi;
 };
 
+static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+{
+	if (handle->nmi)
+		perf_pending_queue(handle->counter);
+	else
+		perf_counter_wakeup(handle->counter);
+}
+
 static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size)
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
@@ -1676,15 +1686,17 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
+	handle->counter	= counter;
+	handle->nmi	= nmi;
+
 	if (!data->nr_pages)
-		goto out;
+		goto fail;
 
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->counter	= counter;
 	handle->data	= data;
 	handle->offset	= offset;
 	handle->head	= head;
@@ -1692,6 +1704,8 @@ static int perf_output_begin(struct perf_output_handle *handle,
 
 	return 0;
 
+fail:
+	__perf_output_wakeup(handle);
 out:
 	rcu_read_unlock();
 
@@ -1733,14 +1747,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 #define perf_output_put(handle, x) \
 	perf_output_copy((handle), &(x), sizeof(x))
 
-static void perf_output_end(struct perf_output_handle *handle, int nmi)
+static void perf_output_end(struct perf_output_handle *handle)
 {
-	if (handle->wakeup) {
-		if (nmi)
-			perf_pending_queue(handle->counter);
-		else
-			perf_counter_wakeup(handle->counter);
-	}
+	if (handle->wakeup)
+		__perf_output_wakeup(handle);
 	rcu_read_unlock();
 }
 
@@ -1750,12 +1760,12 @@ static int perf_output_write(struct perf_counter *counter, int nmi,
 	struct perf_output_handle handle;
 	int ret;
 
-	ret = perf_output_begin(&handle, counter, size);
+	ret = perf_output_begin(&handle, counter, size, nmi);
 	if (ret)
 		goto out;
 
 	perf_output_copy(&handle, buf, size);
-	perf_output_end(&handle, nmi);
+	perf_output_end(&handle);
 
 out:
 	return ret;
@@ -1804,7 +1814,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi)
 
 	size = sizeof(header) + counter->nr_siblings * sizeof(entry);
 
-	ret = perf_output_begin(&handle, counter, size);
+	ret = perf_output_begin(&handle, counter, size, nmi);
 	if (ret)
 		return;
 
@@ -1824,7 +1834,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi)
 		perf_output_put(&handle, entry);
 	}
 
-	perf_output_end(&handle, nmi);
+	perf_output_end(&handle);
 }
 
 void perf_counter_output(struct perf_counter *counter,
@@ -1869,7 +1879,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 {
 	struct perf_output_handle handle;
 	int size = mmap_event->event.header.size;
-	int ret = perf_output_begin(&handle, counter, size);
+	int ret = perf_output_begin(&handle, counter, size, 0);
 
 	if (ret)
 		return;
@@ -1877,7 +1887,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 	perf_output_put(&handle, mmap_event->event);
 	perf_output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
-	perf_output_end(&handle, 0);
+	perf_output_end(&handle);
 }
 
 static int perf_counter_mmap_match(struct perf_counter *counter,
-- 
GitLab


From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:12 +0200
Subject: [PATCH 0223/2198] perf_counter: re-arrange the perf_event_type

Breaks ABI yet again :-)

Change the event type so that [0, 2^31-1] are regular event types, but
[2^31, 2^32-1] forms a bitmask for overflow events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.047961770@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  6 ++--
 kernel/perf_counter.c        | 56 ++++++++++++++++--------------------
 2 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 037a81145acaa..edf5bfb7ff515 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -210,13 +210,15 @@ struct perf_event_header {
 };
 
 enum perf_event_type {
-	PERF_EVENT_IP		= 0,
+
 	PERF_EVENT_GROUP	= 1,
 
 	PERF_EVENT_MMAP		= 2,
 	PERF_EVENT_MUNMAP	= 3,
 
-	__PERF_EVENT_TID	= 0x100,
+	PERF_EVENT_OVERFLOW	= 1UL << 31,
+	__PERF_EVENT_IP		= 1UL << 30,
+	__PERF_EVENT_TID	= 1UL << 29,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4471e7e2c109d..d93e9ddf78484 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1754,50 +1754,44 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static int perf_output_write(struct perf_counter *counter, int nmi,
-			     void *buf, ssize_t size)
-{
-	struct perf_output_handle handle;
-	int ret;
-
-	ret = perf_output_begin(&handle, counter, size, nmi);
-	if (ret)
-		goto out;
-
-	perf_output_copy(&handle, buf, size);
-	perf_output_end(&handle);
-
-out:
-	return ret;
-}
-
 static void perf_output_simple(struct perf_counter *counter,
 			       int nmi, struct pt_regs *regs)
 {
-	unsigned int size;
+	int ret;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	u64 ip;
 	struct {
-		struct perf_event_header header;
-		u64 ip;
 		u32 pid, tid;
-	} event;
+	} tid_entry;
 
-	event.header.type = PERF_EVENT_IP;
-	event.ip = instruction_pointer(regs);
+	header.type = PERF_EVENT_OVERFLOW;
+	header.size = sizeof(header);
 
-	size = sizeof(event);
+	ip = instruction_pointer(regs);
+	header.type |= __PERF_EVENT_IP;
+	header.size += sizeof(ip);
 
 	if (counter->hw_event.include_tid) {
 		/* namespace issues */
-		event.pid = current->group_leader->pid;
-		event.tid = current->pid;
+		tid_entry.pid = current->group_leader->pid;
+		tid_entry.tid = current->pid;
 
-		event.header.type |= __PERF_EVENT_TID;
-	} else
-		size -= sizeof(u64);
+		header.type |= __PERF_EVENT_TID;
+		header.size += sizeof(tid_entry);
+	}
 
-	event.header.size = size;
+	ret = perf_output_begin(&handle, counter, header.size, nmi);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, header);
+	perf_output_put(&handle, ip);
+
+	if (counter->hw_event.include_tid)
+		perf_output_put(&handle, tid_entry);
 
-	perf_output_write(counter, nmi, &event, size);
+	perf_output_end(&handle);
 }
 
 static void perf_output_group(struct perf_counter *counter, int nmi)
-- 
GitLab


From 023c54c42288416b4f43c67bfd5049a76926fad6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:13 +0200
Subject: [PATCH 0224/2198] perf_counter tools: kerneltop: update event_types

Go along with the new perf_event_type ABI.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.133985461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index c0ca01504ff36..430810dae1fec 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1263,8 +1263,8 @@ static void mmap_read(struct mmap_data *md)
 		old += size;
 
 		switch (event->header.type) {
-		case PERF_EVENT_IP:
-		case PERF_EVENT_IP | __PERF_EVENT_TID:
+		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP:
+		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
 			process_event(event->ip.ip, md->counter);
 			break;
 
-- 
GitLab


From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:14 +0200
Subject: [PATCH 0225/2198] perf_counter: provide generic callchain bits

Provide the generic callchain support bits. If hw_event->callchain is
set the arch specific perf_callchain() function is called upon to
provide a perf_callchain_entry structure filled with the current
callchain.

If it does so, it is added to the overflow output event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.254266860@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 13 ++++++++++++-
 kernel/perf_counter.c        | 27 +++++++++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index edf5bfb7ff515..43083afffe0fb 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -140,8 +140,9 @@ struct perf_counter_hw_event {
 				include_tid    :  1, /* include the tid       */
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
+				callchain      :  1, /* add callchain data    */
 
-				__reserved_1   : 52;
+				__reserved_1   : 51;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -219,6 +220,7 @@ enum perf_event_type {
 	PERF_EVENT_OVERFLOW	= 1UL << 31,
 	__PERF_EVENT_IP		= 1UL << 30,
 	__PERF_EVENT_TID	= 1UL << 29,
+	__PERF_EVENT_CALLCHAIN  = 1UL << 28,
 };
 
 #ifdef __KERNEL__
@@ -504,6 +506,15 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
+#define MAX_STACK_DEPTH		255
+
+struct perf_callchain_entry {
+	u64	nr;
+	u64	ip[MAX_STACK_DEPTH];
+};
+
+extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d93e9ddf78484..860cdc26bd7a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1653,6 +1653,17 @@ void perf_counter_do_pending(void)
 	__perf_pending_run();
 }
 
+/*
+ * Callchain support -- arch specific
+ */
+
+struct perf_callchain_entry *
+__attribute__((weak))
+perf_callchain(struct pt_regs *regs)
+{
+	return NULL;
+}
+
 /*
  * Output
  */
@@ -1764,6 +1775,8 @@ static void perf_output_simple(struct perf_counter *counter,
 	struct {
 		u32 pid, tid;
 	} tid_entry;
+	struct perf_callchain_entry *callchain = NULL;
+	int callchain_size = 0;
 
 	header.type = PERF_EVENT_OVERFLOW;
 	header.size = sizeof(header);
@@ -1781,6 +1794,17 @@ static void perf_output_simple(struct perf_counter *counter,
 		header.size += sizeof(tid_entry);
 	}
 
+	if (counter->hw_event.callchain) {
+		callchain = perf_callchain(regs);
+
+		if (callchain) {
+			callchain_size = (1 + callchain->nr) * sizeof(u64);
+
+			header.type |= __PERF_EVENT_CALLCHAIN;
+			header.size += callchain_size;
+		}
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi);
 	if (ret)
 		return;
@@ -1791,6 +1815,9 @@ static void perf_output_simple(struct perf_counter *counter,
 	if (counter->hw_event.include_tid)
 		perf_output_put(&handle, tid_entry);
 
+	if (callchain)
+		perf_output_copy(&handle, callchain, callchain_size);
+
 	perf_output_end(&handle);
 }
 
-- 
GitLab


From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:15 +0200
Subject: [PATCH 0226/2198] perf_counter: x86: callchain support

Provide the x86 perf_callchain() implementation.

Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Cc: Soeren Sandmann Pedersen <sandmann@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Orig-LKML-Reference: <20090330171024.341993293@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++
 1 file changed, 154 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b8885ccd8049e..e16dfafc6d72e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -16,8 +16,10 @@
 #include <linux/module.h>
 #include <linux/kdebug.h>
 #include <linux/sched.h>
+#include <linux/uaccess.h>
 
 #include <asm/apic.h>
+#include <asm/stacktrace.h>
 
 static bool perf_counters_initialized __read_mostly;
 
@@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	return &x86_perf_counter_ops;
 }
+
+/*
+ * callchain support
+ */
+
+static inline
+void callchain_store(struct perf_callchain_entry *entry, unsigned long ip)
+{
+	if (entry->nr < MAX_STACK_DEPTH)
+		entry->ip[entry->nr++] = ip;
+}
+
+static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry);
+static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry);
+
+
+static void
+backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
+{
+	/* Ignore warnings */
+}
+
+static void backtrace_warning(void *data, char *msg)
+{
+	/* Ignore warnings */
+}
+
+static int backtrace_stack(void *data, char *name)
+{
+	/* Don't bother with IRQ stacks for now */
+	return -1;
+}
+
+static void backtrace_address(void *data, unsigned long addr, int reliable)
+{
+	struct perf_callchain_entry *entry = data;
+
+	if (reliable)
+		callchain_store(entry, addr);
+}
+
+static const struct stacktrace_ops backtrace_ops = {
+	.warning		= backtrace_warning,
+	.warning_symbol		= backtrace_warning_symbol,
+	.stack			= backtrace_stack,
+	.address		= backtrace_address,
+};
+
+static void
+perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	unsigned long bp;
+	char *stack;
+
+	callchain_store(entry, instruction_pointer(regs));
+
+	stack = ((char *)regs + sizeof(struct pt_regs));
+#ifdef CONFIG_FRAME_POINTER
+	bp = frame_pointer(regs);
+#else
+	bp = 0;
+#endif
+
+	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+}
+
+
+struct stack_frame {
+	const void __user	*next_fp;
+	unsigned long		return_address;
+};
+
+static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
+		return 0;
+
+	ret = 1;
+	pagefault_disable();
+	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
+		ret = 0;
+	pagefault_enable();
+
+	return ret;
+}
+
+static void
+perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	struct stack_frame frame;
+	const void __user *fp;
+
+	regs = (struct pt_regs *)current->thread.sp0 - 1;
+	fp   = (void __user *)regs->bp;
+
+	callchain_store(entry, regs->ip);
+
+	while (entry->nr < MAX_STACK_DEPTH) {
+		frame.next_fp	     = NULL;
+		frame.return_address = 0;
+
+		if (!copy_stack_frame(fp, &frame))
+			break;
+
+		if ((unsigned long)fp < user_stack_pointer(regs))
+			break;
+
+		callchain_store(entry, frame.return_address);
+		fp = frame.next_fp;
+	}
+}
+
+static void
+perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
+{
+	int is_user;
+
+	if (!regs)
+		return;
+
+	is_user = user_mode(regs);
+
+	if (!current || current->pid == 0)
+		return;
+
+	if (is_user && current->state != TASK_RUNNING)
+		return;
+
+	if (!is_user)
+		perf_callchain_kernel(regs, entry);
+
+	if (current->mm)
+		perf_callchain_user(regs, entry);
+}
+
+struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+	struct perf_callchain_entry *entry;
+
+	if (in_nmi())
+		entry = &__get_cpu_var(nmi_entry);
+	else
+		entry = &__get_cpu_var(irq_entry);
+
+	entry->nr = 0;
+
+	perf_do_callchain(regs, entry);
+
+	return entry;
+}
-- 
GitLab


From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 30 Mar 2009 19:07:16 +0200
Subject: [PATCH 0227/2198] perf_counter: pmc arbitration

Follow the example set by powerpc and try to play nice with oprofile
and the nmi watchdog.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171024.459968444@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e16dfafc6d72e..2a946a160cac7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -20,6 +20,7 @@
 
 #include <asm/apic.h>
 #include <asm/stacktrace.h>
+#include <asm/nmi.h>
 
 static bool perf_counters_initialized __read_mostly;
 
@@ -172,6 +173,65 @@ x86_perf_counter_update(struct perf_counter *counter,
 	atomic64_sub(delta, &hwc->period_left);
 }
 
+static atomic_t num_counters;
+static DEFINE_MUTEX(pmc_reserve_mutex);
+
+static bool reserve_pmc_hardware(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		disable_lapic_nmi_watchdog();
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
+			goto perfctr_fail;
+	}
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
+			goto eventsel_fail;
+	}
+
+	return true;
+
+eventsel_fail:
+	for (i--; i >= 0; i--)
+		release_evntsel_nmi(pmc_ops->eventsel + i);
+
+	i = nr_counters_generic;
+
+perfctr_fail:
+	for (i--; i >= 0; i--)
+		release_perfctr_nmi(pmc_ops->perfctr + i);
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+
+	return false;
+}
+
+static void release_pmc_hardware(void)
+{
+	int i;
+
+	for (i = 0; i < nr_counters_generic; i++) {
+		release_perfctr_nmi(pmc_ops->perfctr + i);
+		release_evntsel_nmi(pmc_ops->eventsel + i);
+	}
+
+	if (nmi_watchdog == NMI_LOCAL_APIC)
+		enable_lapic_nmi_watchdog();
+}
+
+static void hw_perf_counter_destroy(struct perf_counter *counter)
+{
+	if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+		release_pmc_hardware();
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	struct hw_perf_counter *hwc = &counter->hw;
+	int err;
 
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
 
+	err = 0;
+	if (atomic_inc_not_zero(&num_counters)) {
+		mutex_lock(&pmc_reserve_mutex);
+		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+			err = -EBUSY;
+		else
+			atomic_inc(&num_counters);
+		mutex_unlock(&pmc_reserve_mutex);
+	}
+	if (err)
+		return err;
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
 	}
 
+	counter->destroy = hw_perf_counter_destroy;
+
 	return 0;
 }
 
-- 
GitLab


From 9dd499889bdb12ac0e412ccdd718fe0d348258f2 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 27 Mar 2009 12:13:43 +0100
Subject: [PATCH 0228/2198] perf_counter tools: kerneltop: add real-time data
 acquisition thread

Decouple kerneltop display from event acquisition by introducing
a separate data acquisition thread. This fixes annnoying kerneltop
display refresh jitter and missed events.

Also add a -r <prio> option, to switch the data acquisition thread
to real-time priority.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Orig-LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 57 ++++++++++++++++++--------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 430810dae1fec..33b4fcf6e489f 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -77,6 +77,8 @@
 #include <errno.h>
 #include <ctype.h>
 #include <time.h>
+#include <sched.h>
+#include <pthread.h>
 
 #include <sys/syscall.h>
 #include <sys/ioctl.h>
@@ -181,6 +183,7 @@ static int			tid				= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static int			nmi				=  1;
+static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
@@ -334,6 +337,7 @@ static void display_help(void)
 	" -l                           # show scale factor for RR events\n"
 	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
 	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
+	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
 	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
 	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
 	" -z        --zero             # zero counts after display\n"
@@ -620,7 +624,6 @@ static int compare(const void *__sym1, const void *__sym2)
 	return sym_weight(sym1) < sym_weight(sym2);
 }
 
-static time_t			last_refresh;
 static long			events;
 static long			userspace_events;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
@@ -634,6 +637,7 @@ static void print_sym_table(void)
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
 
+	events = userspace_events = 0;
 	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
 	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
 
@@ -714,8 +718,6 @@ static void print_sym_table(void)
 	if (sym_filter_entry)
 		show_details(sym_filter_entry);
 
-	last_refresh = time(NULL);
-
 	{
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
 
@@ -726,6 +728,16 @@ static void print_sym_table(void)
 	}
 }
 
+static void *display_thread(void *arg)
+{
+	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+
+	while (!sleep(delay_secs))
+		print_sym_table();
+
+	return NULL;
+}
+
 static int read_symbol(FILE *in, struct sym_entry *s)
 {
 	static int filter_match = 0;
@@ -1081,19 +1093,20 @@ static void process_options(int argc, char *argv[])
 			{"filter",	required_argument,	NULL, 'f'},
 			{"group",	required_argument,	NULL, 'g'},
 			{"help",	no_argument,		NULL, 'h'},
-			{"scale",	no_argument,		NULL, 'l'},
 			{"nmi",		required_argument,	NULL, 'n'},
+			{"mmap_info",	no_argument,		NULL, 'M'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
 			{"pid",		required_argument,	NULL, 'p'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{"scale",	no_argument,		NULL, 'l'},
 			{"symbol",	required_argument,	NULL, 's'},
 			{"stat",	no_argument,		NULL, 'S'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
 			{"zero",	no_argument,		NULL, 'z'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"mmap_info",	no_argument,		NULL, 'M'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:s:Sx:zMU",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -1127,6 +1140,7 @@ static void process_options(int argc, char *argv[])
 				profile_cpu = -1;
 			}
 			tid				=   atoi(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
 		case 's': sym_filter			= strdup(optarg); break;
 		case 'S': run_perfstat			=	       1; break;
 		case 'x': vmlinux			= strdup(optarg); break;
@@ -1289,6 +1303,7 @@ int main(int argc, char *argv[])
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
+	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
 	int ret;
@@ -1363,8 +1378,20 @@ int main(int argc, char *argv[])
 		}
 	}
 
-	printf("KernelTop refresh period: %d seconds\n", delay_secs);
-	last_refresh = time(NULL);
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
 
 	while (1) {
 		int hits = events;
@@ -1374,14 +1401,8 @@ int main(int argc, char *argv[])
 				mmap_read(&mmap_array[i][counter]);
 		}
 
-		if (time(NULL) >= last_refresh + delay_secs) {
-			print_sym_table();
-			events = userspace_events = 0;
-		}
-
 		if (hits == events)
-			ret = poll(event_array, nr_poll, 1000);
-		hits = events;
+			ret = poll(event_array, nr_poll, 100);
 	}
 
 	return 0;
-- 
GitLab


From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:11:59 +0200
Subject: [PATCH 0229/2198] perf_counter: move the event overflow output bits
 to record_type

Per suggestion from Paul, move the event overflow bits to record_type
and sanitize the enums a bit.

Breaks the ABI -- again ;-)

Suggested-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.151921176@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  50 +++++++++--------
 kernel/perf_counter.c        | 101 ++++++++++++++---------------------
 2 files changed, 68 insertions(+), 83 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 43083afffe0fb..06a6fba9f5318 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,15 +73,6 @@ enum sw_event_ids {
 	PERF_SW_EVENTS_MAX		= 7,
 };
 
-/*
- * IRQ-notification data record type:
- */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		= 0,
-	PERF_RECORD_IRQ			= 1,
-	PERF_RECORD_GROUP		= 2,
-};
-
 #define __PERF_COUNTER_MASK(name) 			\
 	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
 	 PERF_COUNTER_##name##_SHIFT)
@@ -102,6 +93,17 @@ enum perf_counter_record_type {
 #define PERF_COUNTER_EVENT_SHIFT	0
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
+/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+	PERF_RECORD_IP		= 1U << 0,
+	PERF_RECORD_TID		= 1U << 1,
+	PERF_RECORD_GROUP	= 1U << 2,
+	PERF_RECORD_CALLCHAIN	= 1U << 3,
+};
+
 /*
  * Bits that can be set in hw_event.read_format to request that
  * reads on the counter should return the indicated quantities,
@@ -125,8 +127,8 @@ struct perf_counter_hw_event {
 	__u64			config;
 
 	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
+	__u32			record_type;
+	__u32			read_format;
 
 	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
@@ -137,12 +139,10 @@ struct perf_counter_hw_event {
 				exclude_kernel :  1, /* ditto kernel          */
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
-				include_tid    :  1, /* include the tid       */
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
-				callchain      :  1, /* add callchain data    */
 
-				__reserved_1   : 51;
+				__reserved_1   : 53;
 
 	__u32			extra_config_len;
 	__u32			__reserved_4;
@@ -212,15 +212,21 @@ struct perf_event_header {
 
 enum perf_event_type {
 
-	PERF_EVENT_GROUP	= 1,
-
-	PERF_EVENT_MMAP		= 2,
-	PERF_EVENT_MUNMAP	= 3,
+	PERF_EVENT_MMAP			= 1,
+	PERF_EVENT_MUNMAP		= 2,
 
-	PERF_EVENT_OVERFLOW	= 1UL << 31,
-	__PERF_EVENT_IP		= 1UL << 30,
-	__PERF_EVENT_TID	= 1UL << 29,
-	__PERF_EVENT_CALLCHAIN  = 1UL << 28,
+	/*
+	 * Half the event type space is reserved for the counter overflow
+	 * bitfields, as found in hw_event.record_type.
+	 *
+	 * These events will have types of the form:
+	 *   PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+	 */
+	PERF_EVENT_COUNTER_OVERFLOW	= 1UL << 31,
+	__PERF_EVENT_IP			= PERF_RECORD_IP,
+	__PERF_EVENT_TID		= PERF_RECORD_TID,
+	__PERF_EVENT_GROUP		= PERF_RECORD_GROUP,
+	__PERF_EVENT_CALLCHAIN		= PERF_RECORD_CALLCHAIN,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 860cdc26bd7a8..995063df910fc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-static void perf_output_simple(struct perf_counter *counter,
-			       int nmi, struct pt_regs *regs)
+void perf_counter_output(struct perf_counter *counter,
+			 int nmi, struct pt_regs *regs)
 {
 	int ret;
+	u64 record_type = counter->hw_event.record_type;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 	u64 ip;
 	struct {
 		u32 pid, tid;
 	} tid_entry;
+	struct {
+		u64 event;
+		u64 counter;
+	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 
-	header.type = PERF_EVENT_OVERFLOW;
+	header.type = PERF_EVENT_COUNTER_OVERFLOW;
 	header.size = sizeof(header);
 
-	ip = instruction_pointer(regs);
-	header.type |= __PERF_EVENT_IP;
-	header.size += sizeof(ip);
+	if (record_type & PERF_RECORD_IP) {
+		ip = instruction_pointer(regs);
+		header.type |= __PERF_EVENT_IP;
+		header.size += sizeof(ip);
+	}
 
-	if (counter->hw_event.include_tid) {
+	if (record_type & PERF_RECORD_TID) {
 		/* namespace issues */
 		tid_entry.pid = current->group_leader->pid;
 		tid_entry.tid = current->pid;
@@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter,
 		header.size += sizeof(tid_entry);
 	}
 
-	if (counter->hw_event.callchain) {
+	if (record_type & PERF_RECORD_GROUP) {
+		header.type |= __PERF_EVENT_GROUP;
+		header.size += sizeof(u64) +
+			counter->nr_siblings * sizeof(group_entry);
+	}
+
+	if (record_type & PERF_RECORD_CALLCHAIN) {
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
@@ -1810,69 +1823,35 @@ static void perf_output_simple(struct perf_counter *counter,
 		return;
 
 	perf_output_put(&handle, header);
-	perf_output_put(&handle, ip);
 
-	if (counter->hw_event.include_tid)
-		perf_output_put(&handle, tid_entry);
+	if (record_type & PERF_RECORD_IP)
+		perf_output_put(&handle, ip);
 
-	if (callchain)
-		perf_output_copy(&handle, callchain, callchain_size);
-
-	perf_output_end(&handle);
-}
-
-static void perf_output_group(struct perf_counter *counter, int nmi)
-{
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	struct perf_counter *leader, *sub;
-	unsigned int size;
-	struct {
-		u64 event;
-		u64 counter;
-	} entry;
-	int ret;
-
-	size = sizeof(header) + counter->nr_siblings * sizeof(entry);
+	if (record_type & PERF_RECORD_TID)
+		perf_output_put(&handle, tid_entry);
 
-	ret = perf_output_begin(&handle, counter, size, nmi);
-	if (ret)
-		return;
+	if (record_type & PERF_RECORD_GROUP) {
+		struct perf_counter *leader, *sub;
+		u64 nr = counter->nr_siblings;
 
-	header.type = PERF_EVENT_GROUP;
-	header.size = size;
+		perf_output_put(&handle, nr);
 
-	perf_output_put(&handle, header);
+		leader = counter->group_leader;
+		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
+			if (sub != counter)
+				sub->hw_ops->read(sub);
 
-	leader = counter->group_leader;
-	list_for_each_entry(sub, &leader->sibling_list, list_entry) {
-		if (sub != counter)
-			sub->hw_ops->read(sub);
+			group_entry.event = sub->hw_event.config;
+			group_entry.counter = atomic64_read(&sub->count);
 
-		entry.event = sub->hw_event.config;
-		entry.counter = atomic64_read(&sub->count);
-
-		perf_output_put(&handle, entry);
+			perf_output_put(&handle, group_entry);
+		}
 	}
 
-	perf_output_end(&handle);
-}
-
-void perf_counter_output(struct perf_counter *counter,
-			 int nmi, struct pt_regs *regs)
-{
-	switch (counter->hw_event.record_type) {
-	case PERF_RECORD_SIMPLE:
-		return;
-
-	case PERF_RECORD_IRQ:
-		perf_output_simple(counter, nmi, regs);
-		break;
+	if (callchain)
+		perf_output_copy(&handle, callchain, callchain_size);
 
-	case PERF_RECORD_GROUP:
-		perf_output_group(counter, nmi);
-		break;
-	}
+	perf_output_end(&handle);
 }
 
 /*
-- 
GitLab


From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:01 +0200
Subject: [PATCH 0230/2198] perf_counter: per event wakeups

By request, provide a way to request a wakeup every 'n' events instead
of every page of output.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.323309784@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  3 ++-
 kernel/perf_counter.c        | 10 +++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 06a6fba9f5318..5428ba120d78c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -145,7 +145,7 @@ struct perf_counter_hw_event {
 				__reserved_1   : 53;
 
 	__u32			extra_config_len;
-	__u32			__reserved_4;
+	__u32			wakeup_events;	/* wakeup every n events */
 
 	__u64			__reserved_2;
 	__u64			__reserved_3;
@@ -321,6 +321,7 @@ struct perf_mmap_data {
 	int				nr_pages;
 	atomic_t			wakeup;
 	atomic_t			head;
+	atomic_t			events;
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 995063df910fc..9bcab10e735df 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 static void perf_output_end(struct perf_output_handle *handle)
 {
-	if (handle->wakeup)
+	int wakeup_events = handle->counter->hw_event.wakeup_events;
+
+	if (wakeup_events) {
+		int events = atomic_inc_return(&handle->data->events);
+		if (events >= wakeup_events) {
+			atomic_sub(wakeup_events, &handle->data->events);
+			__perf_output_wakeup(handle);
+		}
+	} else if (handle->wakeup)
 		__perf_output_wakeup(handle);
 	rcu_read_unlock();
 }
-- 
GitLab


From 3df70fd623bb109e0079e697c0276d220a4b7908 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:02 +0200
Subject: [PATCH 0231/2198] perf_counter: kerneltop: update to new ABI

Update to reflect the new record_type ABI changes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.407283141@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 33b4fcf6e489f..4f8d7917aba12 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -442,7 +442,7 @@ static void create_perfstat_counter(int counter)
 
 	memset(&hw_event, 0, sizeof(hw_event));
 	hw_event.config		= event_id[counter];
-	hw_event.record_type	= PERF_RECORD_SIMPLE;
+	hw_event.record_type	= 0;
 	hw_event.nmi		= 0;
 	if (scale)
 		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
@@ -1277,8 +1277,8 @@ static void mmap_read(struct mmap_data *md)
 		old += size;
 
 		switch (event->header.type) {
-		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP:
-		case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
+		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
+		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
 			process_event(event->ip.ip, md->counter);
 			break;
 
@@ -1337,9 +1337,8 @@ int main(int argc, char *argv[])
 			memset(&hw_event, 0, sizeof(hw_event));
 			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IRQ;
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
 			hw_event.nmi		= nmi;
-			hw_event.include_tid	= 1;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
 
-- 
GitLab


From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:03 +0200
Subject: [PATCH 0232/2198] perf_counter: add more context information

Put in counts to tell which ips belong to what context.

  -----
   | |  hv
   | --
nr | |  kernel
   | --
   | |  user
  -----

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.493101305@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++
 include/linux/perf_counter.h       | 4 ++--
 kernel/perf_counter.c              | 2 +-
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2a946a160cac7..c74e20d593a77 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	unsigned long bp;
 	char *stack;
+	int nr = entry->nr;
 
 	callchain_store(entry, instruction_pointer(regs));
 
@@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
 #endif
 
 	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry);
+
+	entry->kernel = entry->nr - nr;
 }
 
 
@@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 {
 	struct stack_frame frame;
 	const void __user *fp;
+	int nr = entry->nr;
 
 	regs = (struct pt_regs *)current->thread.sp0 - 1;
 	fp   = (void __user *)regs->bp;
@@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
 		callchain_store(entry, frame.return_address);
 		fp = frame.next_fp;
 	}
+
+	entry->user = entry->nr - nr;
 }
 
 static void
@@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 		entry = &__get_cpu_var(irq_entry);
 
 	entry->nr = 0;
+	entry->hv = 0;
+	entry->kernel = 0;
+	entry->user = 0;
 
 	perf_do_callchain(regs, entry);
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5428ba120d78c..90cce0c74a034 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
-#define MAX_STACK_DEPTH		255
+#define MAX_STACK_DEPTH		254
 
 struct perf_callchain_entry {
-	u64	nr;
+	u32	nr, hv, kernel, user;
 	u64	ip[MAX_STACK_DEPTH];
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 9bcab10e735df..f105a6e696c20 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter,
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
-			callchain_size = (1 + callchain->nr) * sizeof(u64);
+			callchain_size = (2 + callchain->nr) * sizeof(u64);
 
 			header.type |= __PERF_EVENT_CALLCHAIN;
 			header.size += callchain_size;
-- 
GitLab


From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 2 Apr 2009 11:12:04 +0200
Subject: [PATCH 0233/2198] perf_counter: update mmap() counter read

Paul noted that we don't need SMP barriers for the mmap() counter read
because its always on the same cpu (otherwise you can't access the hw
counter anyway).

So remove the SMP barriers and replace them with regular compiler
barriers.

Further, update the comment to include a race free method of reading
said hardware counter. The primary change is putting the pmc_read
inside the seq-loop, otherwise we can still race and read rubbish.

Noticed-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Orig-LKML-Reference: <20090402091319.577951445@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 22 ++++++++++------------
 kernel/perf_counter.c        |  4 ++--
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 90cce0c74a034..f2b914de3f0c7 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -167,30 +167,28 @@ struct perf_counter_mmap_page {
 	/*
 	 * Bits needed to read the hw counters in user-space.
 	 *
-	 * The index and offset should be read atomically using the seqlock:
-	 *
-	 *   __u32 seq, index;
-	 *   __s64 offset;
+	 *   u32 seq;
+	 *   s64 count;
 	 *
 	 * again:
-	 *   rmb();
 	 *   seq = pc->lock;
-	 *
 	 *   if (unlikely(seq & 1)) {
 	 *     cpu_relax();
 	 *     goto again;
 	 *   }
 	 *
-	 *   index = pc->index;
-	 *   offset = pc->offset;
+	 *   if (pc->index) {
+	 *     count = pmc_read(pc->index - 1);
+	 *     count += pc->offset;
+	 *   } else
+	 *     goto regular_read;
 	 *
-	 *   rmb();
+	 *   barrier();
 	 *   if (pc->lock != seq)
 	 *     goto again;
 	 *
-	 * After this, index contains architecture specific counter index + 1,
-	 * so that 0 means unavailable, offset contains the value to be added
-	 * to the result of the raw timer read to obtain this counter's value.
+	 * NOTE: for obvious reason this only works on self-monitoring
+	 *       processes.
 	 */
 	__u32	lock;			/* seqlock for synchronization */
 	__u32	index;			/* hardware counter identifier */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f105a6e696c20..2a5d4f5255678 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter)
 	 */
 	preempt_disable();
 	++userpg->lock;
-	smp_wmb();
+	barrier();
 	userpg->index = counter->hw.idx;
 	userpg->offset = atomic64_read(&counter->count);
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		userpg->offset -= atomic64_read(&counter->hw.prev_count);
 
-	smp_wmb();
+	barrier();
 	++userpg->lock;
 	preempt_enable();
 unlock:
-- 
GitLab


From ca5f9524d61f54b1f618293ab92fc6b49cac864d Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:39:33 -0700
Subject: [PATCH 0234/2198] futex: separate futex_wait_queue_me() logic from
 futex_wait()

Refactor futex_wait() in preparation for futex_wait_requeue_pi(). In
order to reuse a good chunk of the futex_wait() code for the upcoming
futex_wait_requeue_pi() function, this patch breaks out the
queue-to-wakeup section of futex_wait() into futex_wait_queue_me().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 138 +++++++++++++++++++++++++++----------------------
 1 file changed, 76 insertions(+), 62 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 6b50a024bca22..ebb48d6d1a87e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1115,24 +1115,87 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
 static long futex_wait_restart(struct restart_block *restart);
 
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb:		the futex hash bucket, must be locked by the caller
+ * @q:		the futex_q to queue up on
+ * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
+ * @wait:	the wait_queue to add to the futex_q after queueing in the hb
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+				struct hrtimer_sleeper *timeout,
+				wait_queue_t *wait)
+{
+	queue_me(q, hb);
+
+	/*
+	 * There might have been scheduling since the queue_me(), as we
+	 * cannot hold a spinlock across the get_user() in case it
+	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+	 * queueing ourselves into the futex hash.  This code thus has to
+	 * rely on the futex_wake() code removing us from hash when it
+	 * wakes us up.
+	 */
+
+	/* add_wait_queue is the barrier after __set_current_state. */
+	__set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * Add current as the futex_q waiter.  We don't remove ourselves from
+	 * the wait_queue because we are the only user of it.
+	 */
+	add_wait_queue(&q->waiter, wait);
+
+	/* Arm the timer */
+	if (timeout) {
+		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&timeout->timer))
+			timeout->task = NULL;
+	}
+
+	/*
+	 * !plist_node_empty() is safe here without any lock.
+	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+	 */
+	if (likely(!plist_node_empty(&q->list))) {
+		/*
+		 * If the timer has already expired, current will already be
+		 * flagged for rescheduling. Only call schedule if there
+		 * is no timeout, or if it has yet to expire.
+		 */
+		if (!timeout || timeout->task)
+			schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
 static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
-	struct task_struct *curr = current;
+	struct hrtimer_sleeper timeout, *to = NULL;
+	DECLARE_WAITQUEUE(wait, current);
 	struct restart_block *restart;
-	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
 	u32 uval;
 	int ret;
-	struct hrtimer_sleeper t;
-	int rem = 0;
 
 	if (!bitset)
 		return -EINVAL;
 
 	q.pi_state = NULL;
 	q.bitset = bitset;
+
+	if (abs_time) {
+		to = &timeout;
+
+		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(to, current);
+		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+					     current->timer_slack_ns);
+	}
+
 retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
@@ -1178,75 +1241,22 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		goto retry;
 	}
 	ret = -EWOULDBLOCK;
+
+	/* Only actually queue if *uaddr contained val.  */
 	if (unlikely(uval != val)) {
 		queue_unlock(&q, hb);
 		goto out_put_key;
 	}
 
-	/* Only actually queue if *uaddr contained val.  */
-	queue_me(&q, hb);
-
-	/*
-	 * There might have been scheduling since the queue_me(), as we
-	 * cannot hold a spinlock across the get_user() in case it
-	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-	 * queueing ourselves into the futex hash.  This code thus has to
-	 * rely on the futex_wake() code removing us from hash when it
-	 * wakes us up.
-	 */
-
-	/* add_wait_queue is the barrier after __set_current_state. */
-	__set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&q.waiter, &wait);
-	/*
-	 * !plist_node_empty() is safe here without any lock.
-	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
-	 */
-	if (likely(!plist_node_empty(&q.list))) {
-		if (!abs_time)
-			schedule();
-		else {
-			hrtimer_init_on_stack(&t.timer,
-					      clockrt ? CLOCK_REALTIME :
-					      CLOCK_MONOTONIC,
-					      HRTIMER_MODE_ABS);
-			hrtimer_init_sleeper(&t, current);
-			hrtimer_set_expires_range_ns(&t.timer, *abs_time,
-						     current->timer_slack_ns);
-
-			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
-			if (!hrtimer_active(&t.timer))
-				t.task = NULL;
-
-			/*
-			 * the timer could have already expired, in which
-			 * case current would be flagged for rescheduling.
-			 * Don't bother calling schedule.
-			 */
-			if (likely(t.task))
-				schedule();
-
-			hrtimer_cancel(&t.timer);
-
-			/* Flag if a timeout occured */
-			rem = (t.task == NULL);
-
-			destroy_hrtimer_on_stack(&t.timer);
-		}
-	}
-	__set_current_state(TASK_RUNNING);
-
-	/*
-	 * NOTE: we don't remove ourselves from the waitqueue because
-	 * we are the only user of it.
-	 */
+	/* queue_me and wait for wakeup, timeout, or a signal. */
+	futex_wait_queue_me(hb, &q, to, &wait);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	ret = 0;
 	if (!unqueue_me(&q))
 		goto out_put_key;
 	ret = -ETIMEDOUT;
-	if (rem)
+	if (to && !to->task)
 		goto out_put_key;
 
 	/*
@@ -1275,6 +1285,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 out_put_key:
 	put_futex_key(fshared, &q.key);
 out:
+	if (to) {
+		hrtimer_cancel(&to->timer);
+		destroy_hrtimer_on_stack(&to->timer);
+	}
 	return ret;
 }
 
-- 
GitLab


From 4b1c486b3587d2abf50bee4a05eb488cd4045f2c Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:39:42 -0700
Subject: [PATCH 0235/2198] futex: add helper to find the top prio waiter of a
 futex

Improve legibility by wrapping finding the top waiter in a function.
This will be used by the follow-on patches for enabling requeue pi.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/kernel/futex.c b/kernel/futex.c
index ebb48d6d1a87e..421fb5e42a106 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -276,6 +276,25 @@ void put_futex_key(int fshared, union futex_key *key)
 	drop_futex_key_refs(key);
 }
 
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb:     the hash bucket the futex_q's reside in
+ * @key:    the futex key (to distinguish it from other futex futex_q's)
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+					union futex_key *key)
+{
+	struct futex_q *this;
+
+	plist_for_each_entry(this, &hb->chain, list) {
+		if (match_futex(&this->key, key))
+			return this;
+	}
+	return NULL;
+}
+
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
 {
 	u32 curval;
-- 
GitLab


From 1a52084d0919c2799258737c21fb328a9de159b5 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:39:52 -0700
Subject: [PATCH 0236/2198] futex: split out atomic logic from futex_lock_pi()

Refactor the atomic portion of futex_lock_pi() into futex_lock_pi_atomic().

This logic will be needed by requeue_pi, so modularize it to reduce
code duplication.  The only significant change is passing of the task
to try and take the lock for.  This simplifies the -EDEADLK test as if
the lock is owned by task t, it's a deadlock, regardless of if we are
doing requeue pi or not.  This patch updates the corresponding comment
accordingly.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 224 ++++++++++++++++++++++++++++---------------------
 1 file changed, 130 insertions(+), 94 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 421fb5e42a106..986b16e44534b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -556,6 +556,127 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 	return 0;
 }
 
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr:	the pi futex user address
+ * @hb:		the pi futex hash bucket
+ * @key:	the futex key associated with uaddr and hb
+ * @ps:		the pi_state pointer where we store the result of the lookup
+ * @task:	the task to perform the atomic lock work for.  This will be
+ * 		"current" except in the case of requeue pi.
+ *
+ * Returns:
+ *  0 - ready to wait
+ *  1 - acquired the lock
+ * <0 - error
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+				union futex_key *key,
+				struct futex_pi_state **ps,
+				struct task_struct *task)
+{
+	int lock_taken, ret, ownerdied = 0;
+	u32 uval, newval, curval;
+
+retry:
+	ret = lock_taken = 0;
+
+	/*
+	 * To avoid races, we attempt to take the lock here again
+	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
+	 * the locks. It will most likely not succeed.
+	 */
+	newval = task_pid_vnr(task);
+
+	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+	if (unlikely(curval == -EFAULT))
+		return -EFAULT;
+
+	/*
+	 * Detect deadlocks.
+	 */
+	if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+		return -EDEADLK;
+
+	/*
+	 * Surprise - we got the lock. Just return to userspace:
+	 */
+	if (unlikely(!curval))
+		return 1;
+
+	uval = curval;
+
+	/*
+	 * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+	 * to wake at the next unlock.
+	 */
+	newval = curval | FUTEX_WAITERS;
+
+	/*
+	 * There are two cases, where a futex might have no owner (the
+	 * owner TID is 0): OWNER_DIED. We take over the futex in this
+	 * case. We also do an unconditional take over, when the owner
+	 * of the futex died.
+	 *
+	 * This is safe as we are protected by the hash bucket lock !
+	 */
+	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+		/* Keep the OWNER_DIED bit */
+		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+		ownerdied = 0;
+		lock_taken = 1;
+	}
+
+	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+	if (unlikely(curval == -EFAULT))
+		return -EFAULT;
+	if (unlikely(curval != uval))
+		goto retry;
+
+	/*
+	 * We took the lock due to owner died take over.
+	 */
+	if (unlikely(lock_taken))
+		return 1;
+
+	/*
+	 * We dont have the lock. Look up the PI state (or create it if
+	 * we are the first waiter):
+	 */
+	ret = lookup_pi_state(uval, hb, key, ps);
+
+	if (unlikely(ret)) {
+		switch (ret) {
+		case -ESRCH:
+			/*
+			 * No owner found for this futex. Check if the
+			 * OWNER_DIED bit is set to figure out whether
+			 * this is a robust futex or not.
+			 */
+			if (get_futex_value_locked(&curval, uaddr))
+				return -EFAULT;
+
+			/*
+			 * We simply start over in case of a robust
+			 * futex. The code above will take the futex
+			 * and return happy.
+			 */
+			if (curval & FUTEX_OWNER_DIED) {
+				ownerdied = 1;
+				goto retry;
+			}
+		default:
+			break;
+		}
+	}
+
+	return ret;
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
@@ -1340,9 +1461,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct task_struct *curr = current;
 	struct futex_hash_bucket *hb;
-	u32 uval, newval, curval;
+	u32 uval;
 	struct futex_q q;
-	int ret, lock_taken, ownerdied = 0;
+	int ret;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1365,81 +1486,15 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 retry_private:
 	hb = queue_lock(&q);
 
-retry_locked:
-	ret = lock_taken = 0;
-
-	/*
-	 * To avoid races, we attempt to take the lock here again
-	 * (by doing a 0 -> TID atomic cmpxchg), while holding all
-	 * the locks. It will most likely not succeed.
-	 */
-	newval = task_pid_vnr(current);
-
-	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-	if (unlikely(curval == -EFAULT))
-		goto uaddr_faulted;
-
-	/*
-	 * Detect deadlocks. In case of REQUEUE_PI this is a valid
-	 * situation and we return success to user space.
-	 */
-	if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
-		ret = -EDEADLK;
-		goto out_unlock_put_key;
-	}
-
-	/*
-	 * Surprise - we got the lock. Just return to userspace:
-	 */
-	if (unlikely(!curval))
-		goto out_unlock_put_key;
-
-	uval = curval;
-
-	/*
-	 * Set the WAITERS flag, so the owner will know it has someone
-	 * to wake at next unlock
-	 */
-	newval = curval | FUTEX_WAITERS;
-
-	/*
-	 * There are two cases, where a futex might have no owner (the
-	 * owner TID is 0): OWNER_DIED. We take over the futex in this
-	 * case. We also do an unconditional take over, when the owner
-	 * of the futex died.
-	 *
-	 * This is safe as we are protected by the hash bucket lock !
-	 */
-	if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-		/* Keep the OWNER_DIED bit */
-		newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
-		ownerdied = 0;
-		lock_taken = 1;
-	}
-
-	curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-	if (unlikely(curval == -EFAULT))
-		goto uaddr_faulted;
-	if (unlikely(curval != uval))
-		goto retry_locked;
-
-	/*
-	 * We took the lock due to owner died take over.
-	 */
-	if (unlikely(lock_taken))
-		goto out_unlock_put_key;
-
-	/*
-	 * We dont have the lock. Look up the PI state (or create it if
-	 * we are the first waiter):
-	 */
-	ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
-
+	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current);
 	if (unlikely(ret)) {
 		switch (ret) {
-
+		case 1:
+			/* We got the lock. */
+			ret = 0;
+			goto out_unlock_put_key;
+		case -EFAULT:
+			goto uaddr_faulted;
 		case -EAGAIN:
 			/*
 			 * Task is exiting and we just wait for the
@@ -1449,25 +1504,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 			put_futex_key(fshared, &q.key);
 			cond_resched();
 			goto retry;
-
-		case -ESRCH:
-			/*
-			 * No owner found for this futex. Check if the
-			 * OWNER_DIED bit is set to figure out whether
-			 * this is a robust futex or not.
-			 */
-			if (get_futex_value_locked(&curval, uaddr))
-				goto uaddr_faulted;
-
-			/*
-			 * We simply start over in case of a robust
-			 * futex. The code above will take the futex
-			 * and return happy.
-			 */
-			if (curval & FUTEX_OWNER_DIED) {
-				ownerdied = 1;
-				goto retry_locked;
-			}
 		default:
 			goto out_unlock_put_key;
 		}
-- 
GitLab


From dd9739980b50c8cde33e1f8eb08b7e0140bcd61e Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:02 -0700
Subject: [PATCH 0237/2198] futex: split out fixup owner logic from
 futex_lock_pi()

Refactor the post lock acquisition logic from futex_lock_pi(). This
code will be reused in futex_wait_requeue_pi().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 158 ++++++++++++++++++++++++++++---------------------
 1 file changed, 89 insertions(+), 69 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 986b16e44534b..af831fbb7fb43 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1255,6 +1255,79 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 
 static long futex_wait_restart(struct restart_block *restart);
 
+/**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr:	user address of the futex
+ * @fshared:	whether the futex is shared (1) or not (0)
+ * @q:		futex_q (contains pi_state and access to the rt_mutex)
+ * @locked:	if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ *  1 - success, lock taken
+ *  0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+		       int locked)
+{
+	struct task_struct *owner;
+	int ret = 0;
+
+	if (locked) {
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case:
+		 */
+		if (q->pi_state->owner != current)
+			ret = fixup_pi_state_owner(uaddr, q, current, fshared);
+		goto out;
+	}
+
+	/*
+	 * Catch the rare case, where the lock was released when we were on the
+	 * way back before we locked the hash bucket.
+	 */
+	if (q->pi_state->owner == current) {
+		/*
+		 * Try to get the rt_mutex now. This might fail as some other
+		 * task acquired the rt_mutex after we removed ourself from the
+		 * rt_mutex waiters list.
+		 */
+		if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+			locked = 1;
+			goto out;
+		}
+
+		/*
+		 * pi_state is incorrect, some other task did a lock steal and
+		 * we returned due to timeout or signal without taking the
+		 * rt_mutex. Too late. We can access the rt_mutex_owner without
+		 * locking, as the other task is now blocked on the hash bucket
+		 * lock. Fix the state up.
+		 */
+		owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+		ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+		goto out;
+	}
+
+	/*
+	 * Paranoia check. If we did not take the lock, then we should not be
+	 * the owner, nor the pending owner, of the rt_mutex.
+	 */
+	if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+		printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+				"pi-state %p\n", ret,
+				q->pi_state->pi_mutex.owner,
+				q->pi_state->owner);
+
+out:
+	return ret ? ret : locked;
+}
+
 /**
  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
  * @hb:		the futex hash bucket, must be locked by the caller
@@ -1459,11 +1532,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 			 int detect, ktime_t *time, int trylock)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
-	struct task_struct *curr = current;
 	struct futex_hash_bucket *hb;
 	u32 uval;
 	struct futex_q q;
-	int ret;
+	int res, ret;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1527,71 +1599,21 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	}
 
 	spin_lock(q.lock_ptr);
-
-	if (!ret) {
-		/*
-		 * Got the lock. We might not be the anticipated owner
-		 * if we did a lock-steal - fix up the PI-state in
-		 * that case:
-		 */
-		if (q.pi_state->owner != curr)
-			ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
-	} else {
-		/*
-		 * Catch the rare case, where the lock was released
-		 * when we were on the way back before we locked the
-		 * hash bucket.
-		 */
-		if (q.pi_state->owner == curr) {
-			/*
-			 * Try to get the rt_mutex now. This might
-			 * fail as some other task acquired the
-			 * rt_mutex after we removed ourself from the
-			 * rt_mutex waiters list.
-			 */
-			if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-				ret = 0;
-			else {
-				/*
-				 * pi_state is incorrect, some other
-				 * task did a lock steal and we
-				 * returned due to timeout or signal
-				 * without taking the rt_mutex. Too
-				 * late. We can access the
-				 * rt_mutex_owner without locking, as
-				 * the other task is now blocked on
-				 * the hash bucket lock. Fix the state
-				 * up.
-				 */
-				struct task_struct *owner;
-				int res;
-
-				owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-				res = fixup_pi_state_owner(uaddr, &q, owner,
-							   fshared);
-
-				/* propagate -EFAULT, if the fixup failed */
-				if (res)
-					ret = res;
-			}
-		} else {
-			/*
-			 * Paranoia check. If we did not take the lock
-			 * in the trylock above, then we should not be
-			 * the owner of the rtmutex, neither the real
-			 * nor the pending one:
-			 */
-			if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
-				printk(KERN_ERR "futex_lock_pi: ret = %d "
-				       "pi-mutex: %p pi-state %p\n", ret,
-				       q.pi_state->pi_mutex.owner,
-				       q.pi_state->owner);
-		}
-	}
+	/*
+	 * Fixup the pi_state owner and possibly acquire the lock if we
+	 * haven't already.
+	 */
+	res = fixup_owner(uaddr, fshared, &q, !ret);
+	/*
+	 * If fixup_owner() returned an error, proprogate that.  If it acquired
+	 * the lock, clear our -ETIMEDOUT or -EINTR.
+	 */
+	if (res)
+		ret = (res < 0) ? res : 0;
 
 	/*
-	 * If fixup_pi_state_owner() faulted and was unable to handle the
-	 * fault, unlock it and return the fault to userspace.
+	 * If fixup_owner() faulted and was unable to handle the fault, unlock
+	 * it and return the fault to userspace.
 	 */
 	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
 		rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1599,9 +1621,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
 
-	if (to)
-		destroy_hrtimer_on_stack(&to->timer);
-	return ret != -EINTR ? ret : -ERESTARTNOINTR;
+	goto out;
 
 out_unlock_put_key:
 	queue_unlock(&q, hb);
@@ -1611,7 +1631,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 out:
 	if (to)
 		destroy_hrtimer_on_stack(&to->timer);
-	return ret;
+	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
 	/*
-- 
GitLab


From 8dac456a681bd94272ff50ecb31be6b669382c2b Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:12 -0700
Subject: [PATCH 0238/2198] rt_mutex: add proxy lock routines

This patch is a prerequisite for futex requeue_pi. It basically splits
rt_mutex_slowlock() right down the middle, just before the first call
to schedule(). It further adds helper functions which make use of the
split and provide the rt-mutex preliminaries for futex requeue_pi.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/rtmutex.c        | 240 +++++++++++++++++++++++++++++++---------
 kernel/rtmutex_common.h |   8 ++
 2 files changed, 195 insertions(+), 53 deletions(-)

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa6..fec77e7e0562f 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
  * assigned pending owner [which might not have taken the
  * lock yet]:
  */
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+				    struct task_struct *task)
 {
 	struct task_struct *pendowner = rt_mutex_owner(lock);
 	struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
 	if (!rt_mutex_owner_pending(lock))
 		return 0;
 
-	if (pendowner == current)
+	if (pendowner == task)
 		return 1;
 
 	spin_lock_irqsave(&pendowner->pi_lock, flags);
-	if (current->prio >= pendowner->prio) {
+	if (task->prio >= pendowner->prio) {
 		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
 		return 0;
 	}
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
 	 * We are going to steal the lock and a waiter was
 	 * enqueued on the pending owners pi_waiters queue. So
 	 * we have to enqueue this waiter into
-	 * current->pi_waiters list. This covers the case,
-	 * where current is boosted because it holds another
+	 * task->pi_waiters list. This covers the case,
+	 * where task is boosted because it holds another
 	 * lock and gets unboosted because the booster is
 	 * interrupted, so we would delay a waiter with higher
-	 * priority as current->normal_prio.
+	 * priority as task->normal_prio.
 	 *
 	 * Note: in the rare case of a SCHED_OTHER task changing
 	 * its priority and thus stealing the lock, next->task
-	 * might be current:
+	 * might be task:
 	 */
-	if (likely(next->task != current)) {
-		spin_lock_irqsave(&current->pi_lock, flags);
-		plist_add(&next->pi_list_entry, &current->pi_waiters);
-		__rt_mutex_adjust_prio(current);
-		spin_unlock_irqrestore(&current->pi_lock, flags);
+	if (likely(next->task != task)) {
+		spin_lock_irqsave(&task->pi_lock, flags);
+		plist_add(&next->pi_list_entry, &task->pi_waiters);
+		__rt_mutex_adjust_prio(task);
+		spin_unlock_irqrestore(&task->pi_lock, flags);
 	}
 	return 1;
 }
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
 	 */
 	mark_rt_mutex_waiters(lock);
 
-	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
 		return 0;
 
 	/* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
  */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 				   struct rt_mutex_waiter *waiter,
+				   struct task_struct *task,
 				   int detect_deadlock)
 {
 	struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	unsigned long flags;
 	int chain_walk = 0, res;
 
-	spin_lock_irqsave(&current->pi_lock, flags);
-	__rt_mutex_adjust_prio(current);
-	waiter->task = current;
+	spin_lock_irqsave(&task->pi_lock, flags);
+	__rt_mutex_adjust_prio(task);
+	waiter->task = task;
 	waiter->lock = lock;
-	plist_node_init(&waiter->list_entry, current->prio);
-	plist_node_init(&waiter->pi_list_entry, current->prio);
+	plist_node_init(&waiter->list_entry, task->prio);
+	plist_node_init(&waiter->pi_list_entry, task->prio);
 
 	/* Get the top priority waiter on the lock */
 	if (rt_mutex_has_waiters(lock))
 		top_waiter = rt_mutex_top_waiter(lock);
 	plist_add(&waiter->list_entry, &lock->wait_list);
 
-	current->pi_blocked_on = waiter;
+	task->pi_blocked_on = waiter;
 
-	spin_unlock_irqrestore(&current->pi_lock, flags);
+	spin_unlock_irqrestore(&task->pi_lock, flags);
 
 	if (waiter == rt_mutex_top_waiter(lock)) {
 		spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
 	spin_unlock(&lock->wait_lock);
 
 	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
-					 current);
+					 task);
 
 	spin_lock(&lock->wait_lock);
 
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
 	rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
 }
 
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock:		 the rt_mutex to take
+ * @state:		 the state the task should block in (TASK_INTERRUPTIBLE
+ * 			 or TASK_UNINTERRUPTIBLE)
+ * @timeout:		 the pre-initialized and started timer, or NULL for none
+ * @waiter:		 the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:	 passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
  */
 static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
-		  struct hrtimer_sleeper *timeout,
-		  int detect_deadlock)
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		    struct hrtimer_sleeper *timeout,
+		    struct rt_mutex_waiter *waiter,
+		    int detect_deadlock)
 {
-	struct rt_mutex_waiter waiter;
 	int ret = 0;
 
-	debug_rt_mutex_init_waiter(&waiter);
-	waiter.task = NULL;
-
-	spin_lock(&lock->wait_lock);
-
-	/* Try to acquire the lock again: */
-	if (try_to_take_rt_mutex(lock)) {
-		spin_unlock(&lock->wait_lock);
-		return 0;
-	}
-
-	set_current_state(state);
-
-	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout)) {
-		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
-
 	for (;;) {
 		/* Try to acquire the lock: */
 		if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 		}
 
 		/*
-		 * waiter.task is NULL the first time we come here and
+		 * waiter->task is NULL the first time we come here and
 		 * when we have been woken up by the previous owner
 		 * but the lock got stolen by a higher prio task.
 		 */
-		if (!waiter.task) {
-			ret = task_blocks_on_rt_mutex(lock, &waiter,
+		if (!waiter->task) {
+			ret = task_blocks_on_rt_mutex(lock, waiter, current,
 						      detect_deadlock);
 			/*
 			 * If we got woken up by the owner then start loop
 			 * all over without going into schedule to try
 			 * to get the lock now:
 			 */
-			if (unlikely(!waiter.task)) {
+			if (unlikely(!waiter->task)) {
 				/*
 				 * Reset the return value. We might
 				 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
 		spin_unlock(&lock->wait_lock);
 
-		debug_rt_mutex_print_deadlock(&waiter);
+		debug_rt_mutex_print_deadlock(waiter);
 
-		if (waiter.task)
+		if (waiter->task)
 			schedule_rt_mutex(lock);
 
 		spin_lock(&lock->wait_lock);
 		set_current_state(state);
 	}
 
+	return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+		  struct hrtimer_sleeper *timeout,
+		  int detect_deadlock)
+{
+	struct rt_mutex_waiter waiter;
+	int ret = 0;
+
+	debug_rt_mutex_init_waiter(&waiter);
+	waiter.task = NULL;
+
+	spin_lock(&lock->wait_lock);
+
+	/* Try to acquire the lock again: */
+	if (try_to_take_rt_mutex(lock)) {
+		spin_unlock(&lock->wait_lock);
+		return 0;
+	}
+
+	set_current_state(state);
+
+	/* Setup the timer, when timeout != NULL */
+	if (unlikely(timeout)) {
+		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+		if (!hrtimer_active(&timeout->timer))
+			timeout->task = NULL;
+	}
+
+	ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+				  detect_deadlock);
+
 	set_current_state(TASK_RUNNING);
 
 	if (unlikely(waiter.task))
@@ -985,6 +1012,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 	rt_mutex_deadlock_account_unlock(proxy_owner);
 }
 
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:		the rt_mutex to take
+ * @waiter:		the pre-initialized rt_mutex_waiter
+ * @task:		the task to prepare
+ * @detect_deadlock:	perform deadlock detection (1) or not (0)
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+			      struct rt_mutex_waiter *waiter,
+			      struct task_struct *task, int detect_deadlock)
+{
+	int ret;
+
+	spin_lock(&lock->wait_lock);
+
+	mark_rt_mutex_waiters(lock);
+
+	if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+		/* We got the lock for task. */
+		debug_rt_mutex_lock(lock);
+
+		rt_mutex_set_owner(lock, task, 0);
+
+		rt_mutex_deadlock_account_lock(lock, task);
+		return 1;
+	}
+
+	ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+
+	if (ret && !waiter->task) {
+		/*
+		 * Reset the return value. We might have
+		 * returned with -EDEADLK and the owner
+		 * released the lock while we were walking the
+		 * pi chain.  Let the waiter sort it out.
+		 */
+		ret = 0;
+	}
+	spin_unlock(&lock->wait_lock);
+
+	debug_rt_mutex_print_deadlock(waiter);
+
+	return ret;
+}
+
 /**
  * rt_mutex_next_owner - return the next owner of the lock
  *
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
 
 	return rt_mutex_top_waiter(lock)->task;
 }
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock:		the rt_mutex we were woken on
+ * @to:			the timeout, null if none. hrtimer should already have
+ * 			been started.
+ * @waiter:		the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:	perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started our behalf by another thread.
+ *
+ * Returns:
+ *  0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+			       struct hrtimer_sleeper *to,
+			       struct rt_mutex_waiter *waiter,
+			       int detect_deadlock)
+{
+	int ret;
+
+	spin_lock(&lock->wait_lock);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+				  detect_deadlock);
+
+	set_current_state(TASK_RUNNING);
+
+	if (unlikely(waiter->task))
+		remove_waiter(lock, waiter);
+
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+	 * have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
+
+	spin_unlock(&lock->wait_lock);
+
+	/*
+	 * Readjust priority, when we did not get the lock. We might have been
+	 * the pending owner and boosted. Since we did not take the lock, the
+	 * PI boost has to go.
+	 */
+	if (unlikely(ret))
+		rt_mutex_adjust_prio(current);
+
+	return ret;
+}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index e124bf5800ea1..97a2f81866afd 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
 				       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 				  struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+				     struct rt_mutex_waiter *waiter,
+				     struct task_struct *task,
+				     int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+				      struct hrtimer_sleeper *to,
+				      struct rt_mutex_waiter *waiter,
+				      int detect_deadlock);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
-- 
GitLab


From a72188d8a64ebe74722f1cf7ffac41b41ffdba21 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:22 -0700
Subject: [PATCH 0239/2198] futex: add FUTEX_HAS_TIMEOUT flag to
 restart.futex.flags

Currently restart is only used if there is a timeout. The requeue_pi
functionality requires restarting to futex_lock_pi() on signal after
wakeup in futex_wait_requeue_pi() regardless of if there was a timeout
or not. Using 0 for the timeout value is confusing as that could
indicate an expired timer. The flag makes this explicit. While the
check is not technically needed in futex_wait_restart(), doing so
makes the code consistent with and will avoid confusion should the
need arise to restart wait without a timeout.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index af831fbb7fb43..6b597cf33b02f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1252,6 +1252,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
  */
 #define FLAGS_SHARED		0x01
 #define FLAGS_CLOCKRT		0x02
+#define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
 
@@ -1486,7 +1487,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 	restart->futex.val = val;
 	restart->futex.time = abs_time->tv64;
 	restart->futex.bitset = bitset;
-	restart->futex.flags = 0;
+	restart->futex.flags = FLAGS_HAS_TIMEOUT;
 
 	if (fshared)
 		restart->futex.flags |= FLAGS_SHARED;
@@ -1510,13 +1511,16 @@ static long futex_wait_restart(struct restart_block *restart)
 {
 	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
 	int fshared = 0;
-	ktime_t t;
+	ktime_t t, *tp = NULL;
 
-	t.tv64 = restart->futex.time;
+	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+		t.tv64 = restart->futex.time;
+		tp = &t;
+	}
 	restart->fn = do_no_restart_syscall;
 	if (restart->futex.flags & FLAGS_SHARED)
 		fshared = 1;
-	return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+	return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
 				restart->futex.bitset,
 				restart->futex.flags & FLAGS_CLOCKRT);
 }
-- 
GitLab


From 9121e4783cd5c7e2a407763f3b61c2d573891133 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:31 -0700
Subject: [PATCH 0240/2198] futex: distangle futex_requeue()

futex_requeue() is getting a bit long-winded, and will be getting more
so after the requeue_pi patch. Factor out the actual requeueing into a
nicely contained inline function to reduce function length and improve
legibility.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 6b597cf33b02f..e76942e2a79fa 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -940,6 +940,34 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	return ret;
 }
 
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q:		the futex_q to requeue
+ * @hb1:	the source hash_bucket
+ * @hb2:	the target hash_bucket
+ * @key2:	the new key for the requeued futex_q
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+		   struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+	/*
+	 * If key1 and key2 hash to the same bucket, no need to
+	 * requeue.
+	 */
+	if (likely(&hb1->chain != &hb2->chain)) {
+		plist_del(&q->list, &hb1->chain);
+		plist_add(&q->list, &hb2->chain);
+		q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+		q->list.plist.lock = &hb2->lock;
+#endif
+	}
+	get_futex_key_refs(key2);
+	q->key = *key2;
+}
+
 /*
  * Requeue all waiters hashed on one physical page to another
  * physical page.
@@ -999,20 +1027,7 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 		if (++ret <= nr_wake) {
 			wake_futex(this);
 		} else {
-			/*
-			 * If key1 and key2 hash to the same bucket, no need to
-			 * requeue.
-			 */
-			if (likely(head1 != &hb2->chain)) {
-				plist_del(&this->list, &hb1->chain);
-				plist_add(&this->list, &hb2->chain);
-				this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-				this->list.plist.lock = &hb2->lock;
-#endif
-			}
-			this->key = key2;
-			get_futex_key_refs(&key2);
+			requeue_futex(this, hb1, hb2, &key2);
 			drop_count++;
 
 			if (ret - nr_wake >= nr_requeue)
-- 
GitLab


From f801073f87aa22ddf0e9146355fec3993163790f Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:40 -0700
Subject: [PATCH 0241/2198] futex: split out futex value validation code

Refactor the code to validate the expected futex value in order to
reuse it with the requeue_pi code.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 116 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 72 insertions(+), 44 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index e76942e2a79fa..dbe857aa4381b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1398,42 +1398,29 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	__set_current_state(TASK_RUNNING);
 }
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr:	the futex userspace address
+ * @val:	the expected value
+ * @fshared:	whether the futex is shared (1) or not (0)
+ * @q:		the associated futex_q
+ * @hb:		storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket.  Get the futex value and
+ * compare it with the expected value.  Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ *  0 - uaddr contains val and hb has been locked
+ * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+			   struct futex_q *q, struct futex_hash_bucket **hb)
 {
-	struct hrtimer_sleeper timeout, *to = NULL;
-	DECLARE_WAITQUEUE(wait, current);
-	struct restart_block *restart;
-	struct futex_hash_bucket *hb;
-	struct futex_q q;
 	u32 uval;
 	int ret;
 
-	if (!bitset)
-		return -EINVAL;
-
-	q.pi_state = NULL;
-	q.bitset = bitset;
-
-	if (abs_time) {
-		to = &timeout;
-
-		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
-				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-		hrtimer_init_sleeper(to, current);
-		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
-					     current->timer_slack_ns);
-	}
-
-retry:
-	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key);
-	if (unlikely(ret != 0))
-		goto out;
-
-retry_private:
-	hb = queue_lock(&q);
-
 	/*
 	 * Access the page AFTER the hash-bucket is locked.
 	 * Order is important:
@@ -1450,33 +1437,74 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 	 * A consequence is that futex_wait() can return zero and absorb
 	 * a wakeup when *uaddr != val on entry to the syscall.  This is
 	 * rare, but normal.
-	 *
-	 * For shared futexes, we hold the mmap semaphore, so the mapping
-	 * cannot have changed since we looked it up in get_futex_key.
 	 */
+retry:
+	q->key = FUTEX_KEY_INIT;
+	ret = get_futex_key(uaddr, fshared, &q->key);
+	if (unlikely(ret != 0))
+		goto out;
+
+retry_private:
+	*hb = queue_lock(q);
+
 	ret = get_futex_value_locked(&uval, uaddr);
 
-	if (unlikely(ret)) {
-		queue_unlock(&q, hb);
+	if (ret) {
+		queue_unlock(q, *hb);
 
 		ret = get_user(uval, uaddr);
 		if (ret)
-			goto out_put_key;
+			goto out;
 
 		if (!fshared)
 			goto retry_private;
 
-		put_futex_key(fshared, &q.key);
+		put_futex_key(fshared, &q->key);
 		goto retry;
 	}
-	ret = -EWOULDBLOCK;
 
-	/* Only actually queue if *uaddr contained val.  */
-	if (unlikely(uval != val)) {
-		queue_unlock(&q, hb);
-		goto out_put_key;
+	if (uval != val) {
+		queue_unlock(q, *hb);
+		ret = -EWOULDBLOCK;
 	}
 
+out:
+	if (ret)
+		put_futex_key(fshared, &q->key);
+	return ret;
+}
+
+static int futex_wait(u32 __user *uaddr, int fshared,
+		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	DECLARE_WAITQUEUE(wait, current);
+	struct restart_block *restart;
+	struct futex_hash_bucket *hb;
+	struct futex_q q;
+	int ret;
+
+	if (!bitset)
+		return -EINVAL;
+
+	q.pi_state = NULL;
+	q.bitset = bitset;
+
+	if (abs_time) {
+		to = &timeout;
+
+		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(to, current);
+		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+					     current->timer_slack_ns);
+	}
+
+	/* Prepare to wait on uaddr. */
+	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	if (ret)
+		goto out;
+
 	/* queue_me and wait for wakeup, timeout, or a signal. */
 	futex_wait_queue_me(hb, &q, to, &wait);
 
-- 
GitLab


From 52400ba946759af28442dee6265c5c0180ac7122 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 3 Apr 2009 13:40:49 -0700
Subject: [PATCH 0242/2198] futex: add requeue_pi functionality

PI Futexes and their underlying rt_mutex cannot be left ownerless if
there are pending waiters as this will break the PI boosting logic, so
the standard requeue commands aren't sufficient.  The new commands
properly manage pi futex ownership by ensuring a futex with waiters
has an owner at all times.  This will allow glibc to properly handle
pi mutexes with pthread_condvars.

The approach taken here is to create two new futex op codes:

FUTEX_WAIT_REQUEUE_PI:
Tasks will use this op code to wait on a futex (such as a non-pi waitqueue)
and wake after they have been requeued to a pi futex.  Prior to returning to
userspace, they will acquire this pi futex (and the underlying rt_mutex).

futex_wait_requeue_pi() is the result of a high speed collision between
futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being
done by futex_proxy_trylock_atomic() on behalf of the top_waiter).

FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI):
This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI,
regardless of how many tasks the caller intends to wake or requeue.
pthread_cond_broadcast() should call this with nr_wake=1 and
nr_requeue=INT_MAX.  pthread_cond_signal() should call this with nr_wake=1 and
nr_requeue=0.  The reason being we need both callers to get the benefit of the
futex_proxy_trylock_atomic() routine.  futex_requeue() also enqueues the
top_waiter on the rt_mutex via rt_mutex_start_proxy_lock().

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/futex.h       |   8 +
 include/linux/thread_info.h |   3 +-
 kernel/futex.c              | 519 ++++++++++++++++++++++++++++++++++--
 3 files changed, 510 insertions(+), 20 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 3bf5bb5a34f9f..b05519ca9e574 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -23,6 +23,9 @@ union ktime;
 #define FUTEX_TRYLOCK_PI	8
 #define FUTEX_WAIT_BITSET	9
 #define FUTEX_WAKE_BITSET	10
+#define FUTEX_WAIT_REQUEUE_PI	11
+#define FUTEX_REQUEUE_PI	12
+#define FUTEX_CMP_REQUEUE_PI	13
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
@@ -38,6 +41,11 @@ union ktime;
 #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAIT_BITSET_PRIVATE	(FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_BITSET_PRIVATE	(FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAIT_REQUEUE_PI_PRIVATE	(FUTEX_WAIT_REQUEUE_PI | \
+					 FUTEX_PRIVATE_FLAG)
+#define FUTEX_REQUEUE_PI_PRIVATE	(FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PI_PRIVATE	(FUTEX_CMP_REQUEUE_PI | \
+					 FUTEX_PRIVATE_FLAG)
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index e6b820f8b56b8..a8cc4e13434c4 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -21,13 +21,14 @@ struct restart_block {
 		struct {
 			unsigned long arg0, arg1, arg2, arg3;
 		};
-		/* For futex_wait */
+		/* For futex_wait and futex_wait_requeue_pi */
 		struct {
 			u32 *uaddr;
 			u32 val;
 			u32 flags;
 			u32 bitset;
 			u64 time;
+			u32 *uaddr2;
 		} futex;
 		/* For nanosleep */
 		struct {
diff --git a/kernel/futex.c b/kernel/futex.c
index dbe857aa4381b..185c981d89e34 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -19,6 +19,10 @@
  *  PRIVATE futexes by Eric Dumazet
  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  *
+ *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ *  Copyright (C) IBM Corporation, 2009
+ *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -109,6 +113,9 @@ struct futex_q {
 	struct futex_pi_state *pi_state;
 	struct task_struct *task;
 
+	/* rt_waiter storage for requeue_pi: */
+	struct rt_mutex_waiter *rt_waiter;
+
 	/* Bitset for the optional bitmasked wakeup */
 	u32 bitset;
 };
@@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 
 	plist_for_each_entry_safe(this, next, head, list) {
 		if (match_futex (&this->key, &key)) {
-			if (this->pi_state) {
+			if (this->pi_state || this->rt_waiter) {
 				ret = -EINVAL;
 				break;
 			}
@@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	q->key = *key2;
 }
 
-/*
- * Requeue all waiters hashed on one physical page to another
- * physical page.
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * q:	the futex_q
+ * key:	the key of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.  Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+{
+	drop_futex_key_refs(&q->key);
+	get_futex_key_refs(key);
+	q->key = *key;
+
+	WARN_ON(plist_node_empty(&q->list));
+	plist_del(&q->list, &q->list.plist);
+
+	WARN_ON(!q->rt_waiter);
+	q->rt_waiter = NULL;
+
+	wake_up(&q->waiter);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex:	the user address of the to futex
+ * @hb1:	the from futex hash bucket, must be locked by the caller
+ * @hb2:	the to futex hash bucket, must be locked by the caller
+ * @key1:	the from futex key
+ * @key2:	the to futex key
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed.  hb1 and hb2 must be held by the caller.
+ *
+ * Returns:
+ *  0 - failed to acquire the lock atomicly
+ *  1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+				 struct futex_hash_bucket *hb1,
+				 struct futex_hash_bucket *hb2,
+				 union futex_key *key1, union futex_key *key2,
+				 struct futex_pi_state **ps)
+{
+	struct futex_q *top_waiter;
+	u32 curval;
+	int ret;
+
+	if (get_futex_value_locked(&curval, pifutex))
+		return -EFAULT;
+
+	top_waiter = futex_top_waiter(hb1, key1);
+
+	/* There are no waiters, nothing for us to do. */
+	if (!top_waiter)
+		return 0;
+
+	/*
+	 * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
+	 * The pi_state is returned in ps in contended cases.
+	 */
+	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
+	if (ret == 1)
+		requeue_pi_wake_futex(top_waiter, key2);
+
+	return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * uaddr1:	source futex user address
+ * uaddr2:	target futex user address
+ * nr_wake:	number of waiters to wake (must be 1 for requeue_pi)
+ * nr_requeue:	number of waiters to requeue (0-INT_MAX)
+ * requeue_pi:	if we are attempting to requeue from a non-pi futex to a
+ * 		pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ *  <0 - on error
  */
 static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-			 int nr_wake, int nr_requeue, u32 *cmpval)
+			 int nr_wake, int nr_requeue, u32 *cmpval,
+			 int requeue_pi)
 {
 	union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+	int drop_count = 0, task_count = 0, ret;
+	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head1;
 	struct futex_q *this, *next;
-	int ret, drop_count = 0;
+	u32 curval2;
+
+	if (requeue_pi) {
+		/*
+		 * requeue_pi requires a pi_state, try to allocate it now
+		 * without any locks in case it fails.
+		 */
+		if (refill_pi_state_cache())
+			return -ENOMEM;
+		/*
+		 * requeue_pi must wake as many tasks as it can, up to nr_wake
+		 * + nr_requeue, since it acquires the rt_mutex prior to
+		 * returning to userspace, so as to not leave the rt_mutex with
+		 * waiters and no owner.  However, second and third wake-ups
+		 * cannot be predicted as they involve race conditions with the
+		 * first wake and a fault while looking up the pi_state.  Both
+		 * pthread_cond_signal() and pthread_cond_broadcast() should
+		 * use nr_wake=1.
+		 */
+		if (nr_wake != 1)
+			return -EINVAL;
+	}
 
 retry:
+	if (pi_state != NULL) {
+		/*
+		 * We will have to lookup the pi_state again, so free this one
+		 * to keep the accounting correct.
+		 */
+		free_pi_state(pi_state);
+		pi_state = NULL;
+	}
+
 	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
@@ -1020,19 +1145,94 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 		}
 	}
 
+	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+		/* Attempt to acquire uaddr2 and wake the top_waiter. */
+		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+						 &key2, &pi_state);
+
+		/*
+		 * At this point the top_waiter has either taken uaddr2 or is
+		 * waiting on it.  If the former, then the pi_state will not
+		 * exist yet, look it up one more time to ensure we have a
+		 * reference to it.
+		 */
+		if (ret == 1) {
+			WARN_ON(pi_state);
+			task_count++;
+			ret = get_futex_value_locked(&curval2, uaddr2);
+			if (!ret)
+				ret = lookup_pi_state(curval2, hb2, &key2,
+						      &pi_state);
+		}
+
+		switch (ret) {
+		case 0:
+			break;
+		case -EFAULT:
+			double_unlock_hb(hb1, hb2);
+			put_futex_key(fshared, &key2);
+			put_futex_key(fshared, &key1);
+			ret = get_user(curval2, uaddr2);
+			if (!ret)
+				goto retry;
+			goto out;
+		case -EAGAIN:
+			/* The owner was exiting, try again. */
+			double_unlock_hb(hb1, hb2);
+			put_futex_key(fshared, &key2);
+			put_futex_key(fshared, &key1);
+			cond_resched();
+			goto retry;
+		default:
+			goto out_unlock;
+		}
+	}
+
 	head1 = &hb1->chain;
 	plist_for_each_entry_safe(this, next, head1, list) {
-		if (!match_futex (&this->key, &key1))
+		if (task_count - nr_wake >= nr_requeue)
+			break;
+
+		if (!match_futex(&this->key, &key1))
 			continue;
-		if (++ret <= nr_wake) {
+
+		WARN_ON(!requeue_pi && this->rt_waiter);
+		WARN_ON(requeue_pi && !this->rt_waiter);
+
+		/*
+		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
+		 * lock, we already woke the top_waiter.  If not, it will be
+		 * woken by futex_unlock_pi().
+		 */
+		if (++task_count <= nr_wake && !requeue_pi) {
 			wake_futex(this);
-		} else {
-			requeue_futex(this, hb1, hb2, &key2);
-			drop_count++;
+			continue;
+		}
 
-			if (ret - nr_wake >= nr_requeue)
-				break;
+		/*
+		 * Requeue nr_requeue waiters and possibly one more in the case
+		 * of requeue_pi if we couldn't acquire the lock atomically.
+		 */
+		if (requeue_pi) {
+			/* Prepare the waiter to take the rt_mutex. */
+			atomic_inc(&pi_state->refcount);
+			this->pi_state = pi_state;
+			ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+							this->rt_waiter,
+							this->task, 1);
+			if (ret == 1) {
+				/* We got the lock. */
+				requeue_pi_wake_futex(this, &key2);
+				continue;
+			} else if (ret) {
+				/* -EDEADLK */
+				this->pi_state = NULL;
+				free_pi_state(pi_state);
+				goto out_unlock;
+			}
 		}
+		requeue_futex(this, hb1, hb2, &key2);
+		drop_count++;
 	}
 
 out_unlock:
@@ -1047,7 +1247,9 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 out_put_key1:
 	put_futex_key(fshared, &key1);
 out:
-	return ret;
+	if (pi_state != NULL)
+		free_pi_state(pi_state);
+	return ret ? ret : task_count;
 }
 
 /* The key must be already stored in q->key. */
@@ -1270,6 +1472,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 #define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
+static long futex_lock_pi_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
@@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 
 	q.pi_state = NULL;
 	q.bitset = bitset;
+	q.rt_waiter = NULL;
 
 	if (abs_time) {
 		to = &timeout;
@@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	}
 
 	q.pi_state = NULL;
+	q.rt_waiter = NULL;
 retry:
 	q.key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q.key);
@@ -1701,6 +1906,20 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	goto retry;
 }
 
+static long futex_lock_pi_restart(struct restart_block *restart)
+{
+	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
+	ktime_t t, *tp = NULL;
+	int fshared = restart->futex.flags & FLAGS_SHARED;
+
+	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+		t.tv64 = restart->futex.time;
+		tp = &t;
+	}
+	restart->fn = do_no_restart_syscall;
+
+	return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
+}
 
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
@@ -1803,6 +2022,253 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
 	return ret;
 }
 
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb:		the hash_bucket futex_q was original enqueued on
+ * @q:		the futex_q woken while waiting to be requeued
+ * @key2:	the futex_key of the requeue target futex
+ * @timeout:	the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex.  If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller.  Must be
+ * called with the hb lock held.
+ *
+ * Returns
+ *  0 - no early wakeup detected
+ * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+				   struct futex_q *q, union futex_key *key2,
+				   struct hrtimer_sleeper *timeout)
+{
+	int ret = 0;
+
+	/*
+	 * With the hb lock held, we avoid races while we process the wakeup.
+	 * We only need to hold hb (and not hb2) to ensure atomicity as the
+	 * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+	 * It can't be requeued from uaddr2 to something else since we don't
+	 * support a PI aware source futex for requeue.
+	 */
+	if (!match_futex(&q->key, key2)) {
+		WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+		/*
+		 * We were woken prior to requeue by a timeout or a signal.
+		 * Unqueue the futex_q and determine which it was.
+		 */
+		plist_del(&q->list, &q->list.plist);
+		drop_futex_key_refs(&q->key);
+
+		if (timeout && !timeout->task)
+			ret = -ETIMEDOUT;
+		else {
+			/*
+			 * We expect signal_pending(current), but another
+			 * thread may have handled it for us already.
+			 */
+			/* FIXME: ERESTARTSYS or ERESTARTNOINTR?  Do we care if
+			 * the user specified SA_RESTART or not? */
+			ret = -ERESTARTSYS;
+		}
+	}
+	return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr:	the futex we initialyl wait on (non-pi)
+ * @fshared:	whether the futexes are shared (1) or not (0).  They must be
+ * 		the same type, no requeueing from private to shared, etc.
+ * @val:	the expected value of uaddr
+ * @abs_time:	absolute timeout
+ * @bitset:	32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2:	the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue and subsequent unlock
+ * 3) signal (before or after requeue)
+ * 4) timeout (before or after requeue)
+ *
+ * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ *  0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+				 u32 val, ktime_t *abs_time, u32 bitset,
+				 int clockrt, u32 __user *uaddr2)
+{
+	struct hrtimer_sleeper timeout, *to = NULL;
+	struct rt_mutex_waiter rt_waiter;
+	struct rt_mutex *pi_mutex = NULL;
+	DECLARE_WAITQUEUE(wait, current);
+	struct restart_block *restart;
+	struct futex_hash_bucket *hb;
+	union futex_key key2;
+	struct futex_q q;
+	int res, ret;
+	u32 uval;
+
+	if (!bitset)
+		return -EINVAL;
+
+	if (abs_time) {
+		to = &timeout;
+		hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+				      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+		hrtimer_init_sleeper(to, current);
+		hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+					     current->timer_slack_ns);
+	}
+
+	/*
+	 * The waiter is allocated on our stack, manipulated by the requeue
+	 * code while we sleep on uaddr.
+	 */
+	debug_rt_mutex_init_waiter(&rt_waiter);
+	rt_waiter.task = NULL;
+
+	q.pi_state = NULL;
+	q.bitset = bitset;
+	q.rt_waiter = &rt_waiter;
+
+	key2 = FUTEX_KEY_INIT;
+	ret = get_futex_key(uaddr2, fshared, &key2);
+	if (unlikely(ret != 0))
+		goto out;
+
+	/* Prepare to wait on uaddr. */
+	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+	if (ret) {
+		put_futex_key(fshared, &key2);
+		goto out;
+	}
+
+	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
+	futex_wait_queue_me(hb, &q, to, &wait);
+
+	spin_lock(&hb->lock);
+	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+	spin_unlock(&hb->lock);
+	if (ret)
+		goto out_put_keys;
+
+	/*
+	 * In order for us to be here, we know our q.key == key2, and since
+	 * we took the hb->lock above, we also know that futex_requeue() has
+	 * completed and we no longer have to concern ourselves with a wakeup
+	 * race with the atomic proxy lock acquition by the requeue code.
+	 */
+
+	/* Check if the requeue code acquired the second futex for us. */
+	if (!q.rt_waiter) {
+		/*
+		 * Got the lock. We might not be the anticipated owner if we
+		 * did a lock-steal - fix up the PI-state in that case.
+		 */
+		if (q.pi_state && (q.pi_state->owner != current)) {
+			spin_lock(q.lock_ptr);
+			ret = fixup_pi_state_owner(uaddr2, &q, current,
+						   fshared);
+			spin_unlock(q.lock_ptr);
+		}
+	} else {
+		/*
+		 * We have been woken up by futex_unlock_pi(), a timeout, or a
+		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
+		 * the pi_state.
+		 */
+		WARN_ON(!&q.pi_state);
+		pi_mutex = &q.pi_state->pi_mutex;
+		ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+		debug_rt_mutex_free_waiter(&rt_waiter);
+
+		spin_lock(q.lock_ptr);
+		/*
+		 * Fixup the pi_state owner and possibly acquire the lock if we
+		 * haven't already.
+		 */
+		res = fixup_owner(uaddr2, fshared, &q, !ret);
+		/*
+		 * If fixup_owner() returned an error, proprogate that.  If it
+		 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+		 */
+		if (res)
+			ret = (res < 0) ? res : 0;
+
+		/* Unqueue and drop the lock. */
+		unqueue_me_pi(&q);
+	}
+
+	/*
+	 * If fixup_pi_state_owner() faulted and was unable to handle the
+	 * fault, unlock the rt_mutex and return the fault to userspace.
+	 */
+	if (ret == -EFAULT) {
+		if (rt_mutex_owner(pi_mutex) == current)
+			rt_mutex_unlock(pi_mutex);
+	} else if (ret == -EINTR) {
+		ret = -EFAULT;
+		if (get_user(uval, uaddr2))
+			goto out_put_keys;
+
+		/*
+		 * We've already been requeued, so restart by calling
+		 * futex_lock_pi() directly, rather then returning to this
+		 * function.
+		 */
+		ret = -ERESTART_RESTARTBLOCK;
+		restart = &current_thread_info()->restart_block;
+		restart->fn = futex_lock_pi_restart;
+		restart->futex.uaddr = (u32 *)uaddr2;
+		restart->futex.val = uval;
+		restart->futex.flags = 0;
+		if (abs_time) {
+			restart->futex.flags |= FLAGS_HAS_TIMEOUT;
+			restart->futex.time = abs_time->tv64;
+		}
+
+		if (fshared)
+			restart->futex.flags |= FLAGS_SHARED;
+		if (clockrt)
+			restart->futex.flags |= FLAGS_CLOCKRT;
+	}
+
+out_put_keys:
+	put_futex_key(fshared, &q.key);
+	put_futex_key(fshared, &key2);
+
+out:
+	if (to) {
+		hrtimer_cancel(&to->timer);
+		destroy_hrtimer_on_stack(&to->timer);
+	}
+	return ret;
+}
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
@@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		fshared = 1;
 
 	clockrt = op & FUTEX_CLOCK_REALTIME;
-	if (clockrt && cmd != FUTEX_WAIT_BITSET)
+	if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
 		return -ENOSYS;
 
 	switch (cmd) {
@@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		ret = futex_wake(uaddr, fshared, val, val3);
 		break;
 	case FUTEX_REQUEUE:
-		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
 		break;
 	case FUTEX_CMP_REQUEUE:
-		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+				    0);
 		break;
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		if (futex_cmpxchg_enabled)
 			ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
 		break;
+	case FUTEX_WAIT_REQUEUE_PI:
+		val3 = FUTEX_BITSET_MATCH_ANY;
+		ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+					    clockrt, uaddr2);
+		break;
+	case FUTEX_REQUEUE_PI:
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
+		break;
+	case FUTEX_CMP_REQUEUE_PI:
+		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+				    1);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	int cmd = op & FUTEX_CMD_MASK;
 
 	if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-		      cmd == FUTEX_WAIT_BITSET)) {
+		      cmd == FUTEX_WAIT_BITSET ||
+		      cmd == FUTEX_WAIT_REQUEUE_PI)) {
 		if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
 			return -EFAULT;
 		if (!timespec_valid(&ts))
@@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 		tp = &t;
 	}
 	/*
-	 * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+	 * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
 	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
 	 */
 	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+	    cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
 	    cmd == FUTEX_WAKE_OP)
 		val2 = (u32) (unsigned long) utime;
 
-- 
GitLab


From 7ba5779533819fc061b4afafcb4a609d55f37057 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Mon, 6 Apr 2009 20:49:14 +0900
Subject: [PATCH 0243/2198] tomoyo: remove "undelete domain" command.

Since TOMOYO's policy management tools does not use the "undelete domain"
command, we decided to remove that command.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/common.c |  7 +---
 security/tomoyo/common.h |  8 +---
 security/tomoyo/domain.c | 90 ++--------------------------------------
 3 files changed, 5 insertions(+), 100 deletions(-)

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index 92cea656ad216..a0affd9cfca8b 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -1252,15 +1252,12 @@ static int tomoyo_write_domain_policy(struct tomoyo_io_buffer *head)
 	struct tomoyo_domain_info *domain = head->write_var1;
 	bool is_delete = false;
 	bool is_select = false;
-	bool is_undelete = false;
 	unsigned int profile;
 
 	if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_DELETE))
 		is_delete = true;
 	else if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_SELECT))
 		is_select = true;
-	else if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_UNDELETE))
-		is_undelete = true;
 	if (is_select && tomoyo_is_select_one(head, data))
 		return 0;
 	/* Don't allow updating policies by non manager programs. */
@@ -1274,9 +1271,7 @@ static int tomoyo_write_domain_policy(struct tomoyo_io_buffer *head)
 			down_read(&tomoyo_domain_list_lock);
 			domain = tomoyo_find_domain(data);
 			up_read(&tomoyo_domain_list_lock);
-		} else if (is_undelete)
-			domain = tomoyo_undelete_domain(data);
-		else
+		} else
 			domain = tomoyo_find_or_assign_new_domain(data, 0);
 		head->write_var1 = domain;
 		return 0;
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 26a76d67aa1c2..e77e6a6de0f21 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -88,10 +88,7 @@ struct tomoyo_domain_info {
 	/* Name of this domain. Never NULL.          */
 	const struct tomoyo_path_info *domainname;
 	u8 profile;        /* Profile number to use. */
-	u8 is_deleted;     /* Delete flag.
-			      0 = active.
-			      1 = deleted but undeletable.
-			      255 = deleted and no longer undeletable. */
+	bool is_deleted;   /* Delete flag.           */
 	bool quota_warned; /* Quota warnning flag.   */
 	/* DOMAIN_FLAGS_*. Use tomoyo_set_domain_flag() to modify. */
 	u8 flags;
@@ -144,7 +141,6 @@ struct tomoyo_double_path_acl_record {
 #define TOMOYO_KEYWORD_NO_INITIALIZE_DOMAIN      "no_initialize_domain "
 #define TOMOYO_KEYWORD_NO_KEEP_DOMAIN            "no_keep_domain "
 #define TOMOYO_KEYWORD_SELECT                    "select "
-#define TOMOYO_KEYWORD_UNDELETE                  "undelete "
 #define TOMOYO_KEYWORD_USE_PROFILE               "use_profile "
 #define TOMOYO_KEYWORD_IGNORE_GLOBAL_ALLOW_READ  "ignore_global_allow_read"
 /* A domain definition starts with <kernel>. */
@@ -267,8 +263,6 @@ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname);
 struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 							    domainname,
 							    const u8 profile);
-/* Undelete a domain. */
-struct tomoyo_domain_info *tomoyo_undelete_domain(const char *domainname);
 /* Check mode for specified functionality. */
 unsigned int tomoyo_check_flags(const struct tomoyo_domain_info *domain,
 				const u8 index);
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 093a756030bd4..2f2b449ffd2d9 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -551,9 +551,7 @@ int tomoyo_write_alias_policy(char *data, const bool is_delete)
 	return tomoyo_update_alias_entry(data, cp, is_delete);
 }
 
-/* Domain create/delete/undelete handler. */
-
-/* #define TOMOYO_DEBUG_DOMAIN_UNDELETE */
+/* Domain create/delete handler. */
 
 /**
  * tomoyo_delete_domain - Delete a domain.
@@ -571,41 +569,15 @@ int tomoyo_delete_domain(char *domainname)
 	tomoyo_fill_path_info(&name);
 	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_list_lock);
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-	printk(KERN_DEBUG "tomoyo_delete_domain %s\n", domainname);
-	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		if (tomoyo_pathcmp(domain->domainname, &name))
-			continue;
-		printk(KERN_DEBUG "List: %p %u\n", domain, domain->is_deleted);
-	}
-#endif
 	/* Is there an active domain? */
 	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		struct tomoyo_domain_info *domain2;
 		/* Never delete tomoyo_kernel_domain */
 		if (domain == &tomoyo_kernel_domain)
 			continue;
 		if (domain->is_deleted ||
 		    tomoyo_pathcmp(domain->domainname, &name))
 			continue;
-		/* Mark already deleted domains as non undeletable. */
-		list_for_each_entry(domain2, &tomoyo_domain_list, list) {
-			if (!domain2->is_deleted ||
-			    tomoyo_pathcmp(domain2->domainname, &name))
-				continue;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-			if (domain2->is_deleted != 255)
-				printk(KERN_DEBUG
-				       "Marked %p as non undeletable\n",
-				       domain2);
-#endif
-			domain2->is_deleted = 255;
-		}
-		/* Delete and mark active domain as undeletable. */
-		domain->is_deleted = 1;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-		printk(KERN_DEBUG "Marked %p as undeletable\n", domain);
-#endif
+		domain->is_deleted = true;
 		break;
 	}
 	up_write(&tomoyo_domain_list_lock);
@@ -613,58 +585,6 @@ int tomoyo_delete_domain(char *domainname)
 	return 0;
 }
 
-/**
- * tomoyo_undelete_domain - Undelete a domain.
- *
- * @domainname: The name of domain.
- *
- * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise.
- */
-struct tomoyo_domain_info *tomoyo_undelete_domain(const char *domainname)
-{
-	struct tomoyo_domain_info *domain;
-	struct tomoyo_domain_info *candidate_domain = NULL;
-	struct tomoyo_path_info name;
-
-	name.name = domainname;
-	tomoyo_fill_path_info(&name);
-	/***** EXCLUSIVE SECTION START *****/
-	down_write(&tomoyo_domain_list_lock);
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-	printk(KERN_DEBUG "tomoyo_undelete_domain %s\n", domainname);
-	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		if (tomoyo_pathcmp(domain->domainname, &name))
-			continue;
-		printk(KERN_DEBUG "List: %p %u\n", domain, domain->is_deleted);
-	}
-#endif
-	list_for_each_entry(domain, &tomoyo_domain_list, list) {
-		if (tomoyo_pathcmp(&name, domain->domainname))
-			continue;
-		if (!domain->is_deleted) {
-			/* This domain is active. I can't undelete. */
-			candidate_domain = NULL;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-			printk(KERN_DEBUG "%p is active. I can't undelete.\n",
-			       domain);
-#endif
-			break;
-		}
-		/* Is this domain undeletable? */
-		if (domain->is_deleted == 1)
-			candidate_domain = domain;
-	}
-	if (candidate_domain) {
-		candidate_domain->is_deleted = 0;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-		printk(KERN_DEBUG "%p was undeleted.\n", candidate_domain);
-#endif
-	}
-	up_write(&tomoyo_domain_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
-	return candidate_domain;
-}
-
 /**
  * tomoyo_find_or_assign_new_domain - Create a domain.
  *
@@ -711,10 +631,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 		/***** CRITICAL SECTION END *****/
 		if (flag)
 			continue;
-#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE
-		printk(KERN_DEBUG "Reusing %p %s\n", domain,
-		       domain->domainname->name);
-#endif
 		list_for_each_entry(ptr, &domain->acl_info_list, list) {
 			ptr->type |= TOMOYO_ACL_DELETED;
 		}
@@ -722,7 +638,7 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 		domain->profile = profile;
 		domain->quota_warned = false;
 		mb(); /* Avoid out-of-order execution. */
-		domain->is_deleted = 0;
+		domain->is_deleted = false;
 		goto out;
 	}
 	/* No memory reusable. Create using new memory. */
-- 
GitLab


From a2e87d06ddbe6e6fdb8d6d2e5e985efe4efb07dd Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:44:59 +0200
Subject: [PATCH 0244/2198] perf_counter: update mmap() counter read, take 2

Update the userspace read method.

Paul noted that:
 - userspace cannot observe ->lock & 1 on the same cpu.
 - we need a barrier() between reading ->lock and ->index
   to ensure we read them in that prticular order.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.368446033@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f2b914de3f0c7..e22ab47a2f417 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -170,22 +170,18 @@ struct perf_counter_mmap_page {
 	 *   u32 seq;
 	 *   s64 count;
 	 *
-	 * again:
-	 *   seq = pc->lock;
-	 *   if (unlikely(seq & 1)) {
-	 *     cpu_relax();
-	 *     goto again;
-	 *   }
+	 *   do {
+	 *     seq = pc->lock;
 	 *
-	 *   if (pc->index) {
-	 *     count = pmc_read(pc->index - 1);
-	 *     count += pc->offset;
-	 *   } else
-	 *     goto regular_read;
+	 *     barrier()
+	 *     if (pc->index) {
+	 *       count = pmc_read(pc->index - 1);
+	 *       count += pc->offset;
+	 *     } else
+	 *       goto regular_read;
 	 *
-	 *   barrier();
-	 *   if (pc->lock != seq)
-	 *     goto again;
+	 *     barrier();
+	 *   } while (pc->lock != seq);
 	 *
 	 * NOTE: for obvious reason this only works on self-monitoring
 	 *       processes.
-- 
GitLab


From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:00 +0200
Subject: [PATCH 0245/2198] perf_counter: add more context information

Change the callchain context entries to u16, so as to gain some space.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.457320003@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 4 ++--
 kernel/perf_counter.c        | 6 ++----
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e22ab47a2f417..f9d5cf0bfbdd0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -507,10 +507,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
-#define MAX_STACK_DEPTH		254
+#define MAX_STACK_DEPTH		255
 
 struct perf_callchain_entry {
-	u32	nr, hv, kernel, user;
+	u16	nr, hv, kernel, user;
 	u64	ip[MAX_STACK_DEPTH];
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2a5d4f5255678..727624db5078b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1657,9 +1657,7 @@ void perf_counter_do_pending(void)
  * Callchain support -- arch specific
  */
 
-struct perf_callchain_entry *
-__attribute__((weak))
-perf_callchain(struct pt_regs *regs)
+__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 {
 	return NULL;
 }
@@ -1819,7 +1817,7 @@ void perf_counter_output(struct perf_counter *counter,
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
-			callchain_size = (2 + callchain->nr) * sizeof(u64);
+			callchain_size = (1 + callchain->nr) * sizeof(u64);
 
 			header.type |= __PERF_EVENT_CALLCHAIN;
 			header.size += callchain_size;
-- 
GitLab


From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:01 +0200
Subject: [PATCH 0246/2198] perf_counter: SIGIO support

Provide support for fcntl() I/O availability signals.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.579788800@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 20 +++++++++++++++++++-
 2 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f9d5cf0bfbdd0..8d5d11b8d0115 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -238,6 +238,7 @@ enum perf_event_type {
 #include <linux/rcupdate.h>
 #include <linux/spinlock.h>
 #include <linux/hrtimer.h>
+#include <linux/fs.h>
 #include <asm/atomic.h>
 
 struct task_struct;
@@ -398,6 +399,7 @@ struct perf_counter {
 
 	/* poll related */
 	wait_queue_head_t		waitq;
+	struct fasync_struct		*fasync;
 	/* optional: for NMIs */
 	struct perf_wakeup_entry	wakeup;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 727624db5078b..c58cc64319e16 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1526,6 +1526,22 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	return ret;
 }
 
+static int perf_fasync(int fd, struct file *filp, int on)
+{
+	struct perf_counter *counter = filp->private_data;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	int retval;
+
+	mutex_lock(&inode->i_mutex);
+	retval = fasync_helper(fd, filp, on, &counter->fasync);
+	mutex_unlock(&inode->i_mutex);
+
+	if (retval < 0)
+		return retval;
+
+	return 0;
+}
+
 static const struct file_operations perf_fops = {
 	.release		= perf_release,
 	.read			= perf_read,
@@ -1533,6 +1549,7 @@ static const struct file_operations perf_fops = {
 	.unlocked_ioctl		= perf_ioctl,
 	.compat_ioctl		= perf_ioctl,
 	.mmap			= perf_mmap,
+	.fasync			= perf_fasync,
 };
 
 /*
@@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (data) {
-		(void)atomic_xchg(&data->wakeup, POLL_IN);
+		atomic_set(&data->wakeup, POLL_IN);
 		/*
 		 * Ensure all data writes are issued before updating the
 		 * user-space data head information. The matching rmb()
@@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	rcu_read_unlock();
 
 	wake_up_all(&counter->waitq);
+	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
 }
 
 /*
-- 
GitLab


From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:02 +0200
Subject: [PATCH 0247/2198] perf_counter: generalize pending infrastructure

Prepare the pending infrastructure to do more than wakeups.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.634732847@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  7 +++--
 kernel/perf_counter.c        | 53 ++++++++++++++++++++++--------------
 2 files changed, 36 insertions(+), 24 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8d5d11b8d0115..977fb15a53f37 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -321,8 +321,9 @@ struct perf_mmap_data {
 	void 				*data_pages[0];
 };
 
-struct perf_wakeup_entry {
-	struct perf_wakeup_entry *next;
+struct perf_pending_entry {
+	struct perf_pending_entry *next;
+	void (*func)(struct perf_pending_entry *);
 };
 
 /**
@@ -401,7 +402,7 @@ struct perf_counter {
 	wait_queue_head_t		waitq;
 	struct fasync_struct		*fasync;
 	/* optional: for NMIs */
-	struct perf_wakeup_entry	wakeup;
+	struct perf_pending_entry	pending;
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c58cc64319e16..0a2ade2e4f11f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1581,6 +1581,14 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
 }
 
+static void perf_pending_wakeup(struct perf_pending_entry *entry)
+{
+	struct perf_counter *counter = container_of(entry,
+			struct perf_counter, pending);
+
+	perf_counter_wakeup(counter);
+}
+
 /*
  * Pending wakeups
  *
@@ -1590,45 +1598,47 @@ void perf_counter_wakeup(struct perf_counter *counter)
  * single linked list and use cmpxchg() to add entries lockless.
  */
 
-#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+#define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
 
-static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
 	PENDING_TAIL,
 };
 
-static void perf_pending_queue(struct perf_counter *counter)
+static void perf_pending_queue(struct perf_pending_entry *entry,
+			       void (*func)(struct perf_pending_entry *))
 {
-	struct perf_wakeup_entry **head;
-	struct perf_wakeup_entry *prev, *next;
+	struct perf_pending_entry **head;
 
-	if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL)
 		return;
 
-	head = &get_cpu_var(perf_wakeup_head);
+	entry->func = func;
+
+	head = &get_cpu_var(perf_pending_head);
 
 	do {
-		prev = counter->wakeup.next = *head;
-		next = &counter->wakeup;
-	} while (cmpxchg(head, prev, next) != prev);
+		entry->next = *head;
+	} while (cmpxchg(head, entry->next, entry) != entry->next);
 
 	set_perf_counter_pending();
 
-	put_cpu_var(perf_wakeup_head);
+	put_cpu_var(perf_pending_head);
 }
 
 static int __perf_pending_run(void)
 {
-	struct perf_wakeup_entry *list;
+	struct perf_pending_entry *list;
 	int nr = 0;
 
-	list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL);
 	while (list != PENDING_TAIL) {
-		struct perf_counter *counter = container_of(list,
-				struct perf_counter, wakeup);
+		void (*func)(struct perf_pending_entry *);
+		struct perf_pending_entry *entry = list;
 
 		list = list->next;
 
-		counter->wakeup.next = NULL;
+		func = entry->func;
+		entry->next = NULL;
 		/*
 		 * Ensure we observe the unqueue before we issue the wakeup,
 		 * so that we won't be waiting forever.
@@ -1636,7 +1646,7 @@ static int __perf_pending_run(void)
 		 */
 		smp_wmb();
 
-		perf_counter_wakeup(counter);
+		func(entry);
 		nr++;
 	}
 
@@ -1658,7 +1668,7 @@ static inline int perf_not_pending(struct perf_counter *counter)
 	 * so that we do not miss the wakeup. -- see perf_pending_handle()
 	 */
 	smp_rmb();
-	return counter->wakeup.next == NULL;
+	return counter->pending.next == NULL;
 }
 
 static void perf_pending_sync(struct perf_counter *counter)
@@ -1695,9 +1705,10 @@ struct perf_output_handle {
 
 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 {
-	if (handle->nmi)
-		perf_pending_queue(handle->counter);
-	else
+	if (handle->nmi) {
+		perf_pending_queue(&handle->counter->pending,
+				   perf_pending_wakeup);
+	} else
 		perf_counter_wakeup(handle->counter);
 }
 
-- 
GitLab


From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:03 +0200
Subject: [PATCH 0248/2198] perf_counter: x86: self-IPI for pending work

Implement set_perf_counter_pending() with a self-IPI so that it will
run ASAP in a usable context.

For now use a second IRQ vector, because the primary vector pokes
the apic in funny ways that seem to confuse things.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.724626696@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/entry_arch.h   |  1 +
 arch/x86/include/asm/hardirq.h      |  1 +
 arch/x86/include/asm/hw_irq.h       |  1 +
 arch/x86/include/asm/irq_vectors.h  |  5 +++++
 arch/x86/include/asm/perf_counter.h |  3 ++-
 arch/x86/kernel/cpu/perf_counter.c  | 14 ++++++++++++++
 arch/x86/kernel/entry_64.S          |  2 ++
 arch/x86/kernel/irq.c               |  5 +++++
 arch/x86/kernel/irqinit_32.c        |  1 +
 arch/x86/kernel/irqinit_64.c        |  1 +
 10 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index c2e6bedaf2587..fe24d28024900 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
 BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
+BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
 #ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 25454427ceea9..f5ebe2aaca4b6 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,6 +14,7 @@ typedef struct {
 #endif
 	unsigned int generic_irqs;	/* arch dependent */
 	unsigned int apic_perf_irqs;
+	unsigned int apic_pending_irqs;
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index ae80f64973e01..7309c0ad69023 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -30,6 +30,7 @@ extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
 extern void perf_counter_interrupt(void);
+extern void perf_pending_interrupt(void);
 
 extern void spurious_interrupt(void);
 extern void thermal_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c8..545bb811ccb52 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -116,6 +116,11 @@
  */
 #define GENERIC_INTERRUPT_VECTOR	0xed
 
+/*
+ * Performance monitoring pending work vector:
+ */
+#define LOCAL_PENDING_VECTOR		0xec
+
 /*
  * First APIC vector available to drivers: (vectors 0x30-0xee) we
  * start at 0x31(0x41) to spread out vectors evenly between priority
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index e2b0e66b23535..d08dd52cb8ff1 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,7 +84,8 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()	do { } while (0)
+extern void set_perf_counter_pending(void);
+
 #define clear_perf_counter_pending()	do { } while (0)
 #define test_perf_counter_pending()	(0)
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c74e20d593a77..438415866fe47 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_exit();
 }
 
+void smp_perf_pending_interrupt(struct pt_regs *regs)
+{
+	irq_enter();
+	ack_APIC_irq();
+	inc_irq_stat(apic_pending_irqs);
+	perf_counter_do_pending();
+	irq_exit();
+}
+
+void set_perf_counter_pending(void)
+{
+	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
+}
+
 void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3f129d963a05b..1d46cba56fd8e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \
 #ifdef CONFIG_PERF_COUNTERS
 apicinterrupt LOCAL_PERF_VECTOR \
 	perf_counter_interrupt smp_perf_counter_interrupt
+apicinterrupt LOCAL_PENDING_VECTOR \
+	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
 
 /*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 9c2754302ecce..d465487da5871 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
+	seq_printf(p, "PND: ");
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
+	seq_printf(p, "  Performance pending work\n");
 #endif
 	if (generic_interrupt_extension) {
 		seq_printf(p, "PLT: ");
@@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->apic_timer_irqs;
 	sum += irq_stats(cpu)->irq_spurious_count;
 	sum += irq_stats(cpu)->apic_perf_irqs;
+	sum += irq_stats(cpu)->apic_pending_irqs;
 #endif
 	if (generic_interrupt_extension)
 		sum += irq_stats(cpu)->generic_irqs;
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 925d87cfc5513..3190a6b961e64 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -166,6 +166,7 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 # ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
 # ifdef CONFIG_X86_MCE_P4THERMAL
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 665e2ab48abd1..53ceb26f80ff3 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -156,6 +156,7 @@ static void __init apic_intr_init(void)
 	/* Performance monitoring interrupt: */
 #ifdef CONFIG_PERF_COUNTERS
 	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 #endif
 }
 
-- 
GitLab


From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:04 +0200
Subject: [PATCH 0249/2198] perf_counter: theres more to overflow than writing
 events

Prepare for more generic overflow handling. The new perf_counter_overflow()
method will handle the generic bits of the counter overflow, and can return
a !0 return value, in which case the counter should be (soft) disabled, so
that it won't count until it's properly disabled.

XXX: do powerpc and swcounter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.812109629@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  2 +-
 arch/x86/kernel/cpu/perf_counter.c |  3 ++-
 include/linux/perf_counter.h       |  4 ++--
 kernel/perf_counter.c              | 29 +++++++++++++++++++++++------
 4 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0a4d14f279ae6..f88c35d0710a5 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_output(counter, 1, regs);
+		perf_counter_overflow(counter, 1, regs);
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 438415866fe47..1116a41bc7b5d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,8 @@ static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 			continue;
 
 		perf_save_and_restart(counter);
-		perf_counter_output(counter, nmi, regs);
+		if (perf_counter_overflow(counter, nmi, regs))
+			__pmc_generic_disable(counter, &counter->hw, bit);
 	}
 
 	hw_perf_ack_status(ack);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 977fb15a53f37..ca2d4df29e0c8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_counter_context *ctx, int cpu);
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
-extern void perf_counter_output(struct perf_counter *counter,
-				int nmi, struct pt_regs *regs);
+extern int perf_counter_overflow(struct perf_counter *counter,
+				 int nmi, struct pt_regs *regs);
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0a2ade2e4f11f..195e976eb07dd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
-void perf_counter_output(struct perf_counter *counter,
-			 int nmi, struct pt_regs *regs)
+static void perf_counter_output(struct perf_counter *counter,
+				int nmi, struct pt_regs *regs)
 {
 	int ret;
 	u64 record_type = counter->hw_event.record_type;
@@ -2033,6 +2033,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 	perf_counter_mmap_event(&mmap_event);
 }
 
+/*
+ * Generic counter overflow handling.
+ */
+
+int perf_counter_overflow(struct perf_counter *counter,
+			  int nmi, struct pt_regs *regs)
+{
+	perf_counter_output(counter, nmi, regs);
+	return 0;
+}
+
 /*
  * Generic software counter infrastructure
  */
@@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 
 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
+	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_counter *counter;
 	struct pt_regs *regs;
 
@@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 			!counter->hw_event.exclude_user)
 		regs = task_pt_regs(current);
 
-	if (regs)
-		perf_counter_output(counter, 0, regs);
+	if (regs) {
+		if (perf_counter_overflow(counter, 0, regs))
+			ret = HRTIMER_NORESTART;
+	}
 
 	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
 
-	return HRTIMER_RESTART;
+	return ret;
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
@@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 {
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	perf_counter_output(counter, nmi, regs);
+	if (perf_counter_overflow(counter, nmi, regs))
+		/* soft-disable the counter */
+		;
+
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-- 
GitLab


From ebb3c4c4cb81d64cc041356915ec015e2c57092a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:05 +0200
Subject: [PATCH 0250/2198] perf_counter: fix the mlock accounting

Reading through the code I saw I forgot the finish the mlock accounting.
Do so now.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094517.899767331@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 195e976eb07dd..c841563de0433 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1461,13 +1461,14 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
+		vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
 		perf_mmap_data_free(counter);
 		mutex_unlock(&counter->mmap_mutex);
 	}
 }
 
 static struct vm_operations_struct perf_mmap_vmops = {
-	.open = perf_mmap_open,
+	.open  = perf_mmap_open,
 	.close = perf_mmap_close,
 	.fault = perf_mmap_fault,
 };
@@ -1499,24 +1500,32 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
 
-	locked = vma_size >>  PAGE_SHIFT;
-	locked += vma->vm_mm->locked_vm;
+	mutex_lock(&counter->mmap_mutex);
+	if (atomic_inc_not_zero(&counter->mmap_count)) {
+		if (nr_pages != counter->data->nr_pages)
+			ret = -EINVAL;
+		goto unlock;
+	}
+
+	locked = vma->vm_mm->locked_vm;
+	locked += nr_pages + 1;
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 	lock_limit >>= PAGE_SHIFT;
 
-	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK))
-		return -EPERM;
-
-	mutex_lock(&counter->mmap_mutex);
-	if (atomic_inc_not_zero(&counter->mmap_count))
-		goto out;
+	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+		ret = -EPERM;
+		goto unlock;
+	}
 
 	WARN_ON(counter->data);
 	ret = perf_mmap_data_alloc(counter, nr_pages);
-	if (!ret)
-		atomic_set(&counter->mmap_count, 1);
-out:
+	if (ret)
+		goto unlock;
+
+	atomic_set(&counter->mmap_count, 1);
+	vma->vm_mm->locked_vm += nr_pages + 1;
+unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
 	vma->vm_flags &= ~VM_MAYWRITE;
-- 
GitLab


From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:06 +0200
Subject: [PATCH 0251/2198] perf_counter: PERF_RECORD_TIME

By popular request, provide means to log a timestamp along with the
counter overflow event.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.024173282@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 14 ++++++++++++++
 2 files changed, 16 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index ca2d4df29e0c8..928a7fae0961f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -102,6 +102,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_TID		= 1U << 1,
 	PERF_RECORD_GROUP	= 1U << 2,
 	PERF_RECORD_CALLCHAIN	= 1U << 3,
+	PERF_RECORD_TIME	= 1U << 4,
 };
 
 /*
@@ -221,6 +222,7 @@ enum perf_event_type {
 	__PERF_EVENT_TID		= PERF_RECORD_TID,
 	__PERF_EVENT_GROUP		= PERF_RECORD_GROUP,
 	__PERF_EVENT_CALLCHAIN		= PERF_RECORD_CALLCHAIN,
+	__PERF_EVENT_TIME		= PERF_RECORD_TIME,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c841563de0433..19990d1f02154 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1826,6 +1826,7 @@ static void perf_counter_output(struct perf_counter *counter,
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
+	u64 time;
 
 	header.type = PERF_EVENT_COUNTER_OVERFLOW;
 	header.size = sizeof(header);
@@ -1862,6 +1863,16 @@ static void perf_counter_output(struct perf_counter *counter,
 		}
 	}
 
+	if (record_type & PERF_RECORD_TIME) {
+		/*
+		 * Maybe do better on x86 and provide cpu_clock_nmi()
+		 */
+		time = sched_clock();
+
+		header.type |= __PERF_EVENT_TIME;
+		header.size += sizeof(u64);
+	}
+
 	ret = perf_output_begin(&handle, counter, header.size, nmi);
 	if (ret)
 		return;
@@ -1895,6 +1906,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (callchain)
 		perf_output_copy(&handle, callchain, callchain_size);
 
+	if (record_type & PERF_RECORD_TIME)
+		perf_output_put(&handle, time);
+
 	perf_output_end(&handle);
 }
 
-- 
GitLab


From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:07 +0200
Subject: [PATCH 0252/2198] perf_counter: counter overflow limit

Provide means to auto-disable the counter after 'n' overflow events.

Create the counter with hw_event.disabled = 1, and then issue an
ioctl(fd, PREF_COUNTER_IOC_REFRESH, n); to set the limit and enable
the counter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.083139737@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 12 ++++++---
 kernel/perf_counter.c        | 51 +++++++++++++++++++++++++++++-------
 2 files changed, 50 insertions(+), 13 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 928a7fae0961f..ef4dcbff75ab6 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -155,8 +155,9 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IO('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO('$', 1)
+#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
 
 /*
  * Structure of the page that can be mapped via mmap
@@ -403,9 +404,14 @@ struct perf_counter {
 	/* poll related */
 	wait_queue_head_t		waitq;
 	struct fasync_struct		*fasync;
-	/* optional: for NMIs */
+
+	/* delayed work for NMIs and such */
+	int				pending_wakeup;
+	int				pending_disable;
 	struct perf_pending_entry	pending;
 
+	atomic_t			event_limit;
+
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 19990d1f02154..c05e10354bc95 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -744,6 +744,12 @@ static void perf_counter_enable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
+static void perf_counter_refresh(struct perf_counter *counter, int refresh)
+{
+	atomic_add(refresh, &counter->event_limit);
+	perf_counter_enable(counter);
+}
+
 /*
  * Enable a counter and all its children.
  */
@@ -1311,6 +1317,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_DISABLE:
 		perf_counter_disable_family(counter);
 		break;
+	case PERF_COUNTER_IOC_REFRESH:
+		perf_counter_refresh(counter, arg);
+		break;
 	default:
 		err = -ENOTTY;
 	}
@@ -1590,14 +1599,6 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
 }
 
-static void perf_pending_wakeup(struct perf_pending_entry *entry)
-{
-	struct perf_counter *counter = container_of(entry,
-			struct perf_counter, pending);
-
-	perf_counter_wakeup(counter);
-}
-
 /*
  * Pending wakeups
  *
@@ -1607,6 +1608,22 @@ static void perf_pending_wakeup(struct perf_pending_entry *entry)
  * single linked list and use cmpxchg() to add entries lockless.
  */
 
+static void perf_pending_counter(struct perf_pending_entry *entry)
+{
+	struct perf_counter *counter = container_of(entry,
+			struct perf_counter, pending);
+
+	if (counter->pending_disable) {
+		counter->pending_disable = 0;
+		perf_counter_disable(counter);
+	}
+
+	if (counter->pending_wakeup) {
+		counter->pending_wakeup = 0;
+		perf_counter_wakeup(counter);
+	}
+}
+
 #define PENDING_TAIL ((struct perf_pending_entry *)-1UL)
 
 static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = {
@@ -1715,8 +1732,9 @@ struct perf_output_handle {
 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 {
 	if (handle->nmi) {
+		handle->counter->pending_wakeup = 1;
 		perf_pending_queue(&handle->counter->pending,
-				   perf_pending_wakeup);
+				   perf_pending_counter);
 	} else
 		perf_counter_wakeup(handle->counter);
 }
@@ -2063,8 +2081,21 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 int perf_counter_overflow(struct perf_counter *counter,
 			  int nmi, struct pt_regs *regs)
 {
+	int events = atomic_read(&counter->event_limit);
+	int ret = 0;
+
+	if (events && atomic_dec_and_test(&counter->event_limit)) {
+		ret = 1;
+		if (nmi) {
+			counter->pending_disable = 1;
+			perf_pending_queue(&counter->pending,
+					   perf_pending_counter);
+		} else
+			perf_counter_disable(counter);
+	}
+
 	perf_counter_output(counter, nmi, regs);
-	return 0;
+	return ret;
 }
 
 /*
-- 
GitLab


From 0c593b3411341e3a05a61f5527df36ab02bd11e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:08 +0200
Subject: [PATCH 0253/2198] perf_counter: comment the perf_event_type stuff

Describe the event format.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.211174347@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index ef4dcbff75ab6..81220188d0583 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -208,6 +208,20 @@ struct perf_event_header {
 
 enum perf_event_type {
 
+	/*
+	 * The MMAP events record the PROT_EXEC mappings so that we can
+	 * correlate userspace IPs to code. They have the following structure:
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u32				pid, tid;
+	 * 	u64				addr;
+	 * 	u64				len;
+	 * 	u64				pgoff;
+	 * 	char				filename[];
+	 * };
+	 */
 	PERF_EVENT_MMAP			= 1,
 	PERF_EVENT_MUNMAP		= 2,
 
@@ -217,6 +231,24 @@ enum perf_event_type {
 	 *
 	 * These events will have types of the form:
 	 *   PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+	 *
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	{ u64			ip;	  } && __PERF_EVENT_IP
+	 * 	{ u32			pid, tid; } && __PERF_EVENT_TID
+	 *
+	 * 	{ u64			nr;
+	 * 	  { u64 event, val; } 	cnt[nr];  } && __PERF_EVENT_GROUP
+	 *
+	 * 	{ u16			nr,
+	 * 				hv,
+	 * 				kernel,
+	 * 				user;
+	 * 	  u64			ips[nr];  } && __PERF_EVENT_CALLCHAIN
+	 *
+	 * 	{ u64			time;     } && __PERF_EVENT_TIME
+	 * };
 	 */
 	PERF_EVENT_COUNTER_OVERFLOW	= 1UL << 31,
 	__PERF_EVENT_IP			= PERF_RECORD_IP,
-- 
GitLab


From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:09 +0200
Subject: [PATCH 0254/2198] perf_counter: change event definition

Currently the definition of an event is slightly ambiguous. We have
wakeup events, for poll() and SIGIO, which are either generated
when a record crosses a page boundary (hw_events.wakeup_events == 0),
or every wakeup_events new records.

Now a record can be either a counter overflow record, or a number of
different things, like the mmap PROT_EXEC region notifications.

Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only
considers counter overflows.

This patch changes then wakeup_events and SIGIO notification to only
consider overflow events. Furthermore it changes the SIGIO notification
to report SIGHUP when the event limit is reached and the counter will
be disabled.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.266679874@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  1 +
 kernel/perf_counter.c        | 22 +++++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 81220188d0583..0f5a4005048f5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -439,6 +439,7 @@ struct perf_counter {
 
 	/* delayed work for NMIs and such */
 	int				pending_wakeup;
+	int				pending_kill;
 	int				pending_disable;
 	struct perf_pending_entry	pending;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c05e10354bc95..8c8eaf0625f9a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1596,7 +1596,11 @@ void perf_counter_wakeup(struct perf_counter *counter)
 	rcu_read_unlock();
 
 	wake_up_all(&counter->waitq);
-	kill_fasync(&counter->fasync, SIGIO, POLL_IN);
+
+	if (counter->pending_kill) {
+		kill_fasync(&counter->fasync, SIGIO, counter->pending_kill);
+		counter->pending_kill = 0;
+	}
 }
 
 /*
@@ -1727,6 +1731,7 @@ struct perf_output_handle {
 	unsigned int		head;
 	int			wakeup;
 	int			nmi;
+	int			overflow;
 };
 
 static inline void __perf_output_wakeup(struct perf_output_handle *handle)
@@ -1741,7 +1746,7 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
-			     int nmi)
+			     int nmi, int overflow)
 {
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
@@ -1751,8 +1756,9 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
-	handle->counter	= counter;
-	handle->nmi	= nmi;
+	handle->counter	 = counter;
+	handle->nmi	 = nmi;
+	handle->overflow = overflow;
 
 	if (!data->nr_pages)
 		goto fail;
@@ -1816,7 +1822,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 {
 	int wakeup_events = handle->counter->hw_event.wakeup_events;
 
-	if (wakeup_events) {
+	if (handle->overflow && wakeup_events) {
 		int events = atomic_inc_return(&handle->data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &handle->data->events);
@@ -1891,7 +1897,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
-	ret = perf_output_begin(&handle, counter, header.size, nmi);
+	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
 
@@ -1955,7 +1961,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 {
 	struct perf_output_handle handle;
 	int size = mmap_event->event.header.size;
-	int ret = perf_output_begin(&handle, counter, size, 0);
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
 
 	if (ret)
 		return;
@@ -2084,8 +2090,10 @@ int perf_counter_overflow(struct perf_counter *counter,
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
 
+	counter->pending_kill = POLL_IN;
 	if (events && atomic_dec_and_test(&counter->event_limit)) {
 		ret = 1;
+		counter->pending_kill = POLL_HUP;
 		if (nmi) {
 			counter->pending_disable = 1;
 			perf_pending_queue(&counter->pending,
-- 
GitLab


From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:10 +0200
Subject: [PATCH 0255/2198] perf_counter: rework context time

Since perf_counter_context is switched along with tasks, we can
maintain the context time without using the task runtime clock.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.353552838@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 10 ++---
 kernel/perf_counter.c        | 78 ++++++++++++++++--------------------
 2 files changed, 37 insertions(+), 51 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0f5a4005048f5..7f5d353d78ac5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -477,14 +477,10 @@ struct perf_counter_context {
 	struct task_struct	*task;
 
 	/*
-	 * time_now is the current time in nanoseconds since an arbitrary
-	 * point in the past.  For per-task counters, this is based on the
-	 * task clock, and for per-cpu counters it is based on the cpu clock.
-	 * time_lost is an offset from the task/cpu clock, used to make it
-	 * appear that time only passes while the context is scheduled in.
+	 * Context clock, runs when context enabled.
 	 */
-	u64			time_now;
-	u64			time_lost;
+	u64			time;
+	u64			timestamp;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8c8eaf0625f9a..84d85ab4e1611 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -117,7 +117,7 @@ counter_sched_out(struct perf_counter *counter,
 		return;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_stopped = ctx->time_now;
+	counter->tstamp_stopped = ctx->time;
 	counter->hw_ops->disable(counter);
 	counter->oncpu = -1;
 
@@ -253,27 +253,20 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Get the current time for this context.
- * If this is a task context, we use the task's task clock,
- * or for a per-cpu context, we use the cpu clock.
- */
-static u64 get_context_time(struct perf_counter_context *ctx, int update)
+static inline u64 perf_clock(void)
 {
-	struct task_struct *curr = ctx->task;
-
-	if (!curr)
-		return cpu_clock(smp_processor_id());
-
-	return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+	return cpu_clock(smp_processor_id());
 }
 
 /*
  * Update the record of the current time in a context.
  */
-static void update_context_time(struct perf_counter_context *ctx, int update)
+static void update_context_time(struct perf_counter_context *ctx)
 {
-	ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+	u64 now = perf_clock();
+
+	ctx->time += now - ctx->timestamp;
+	ctx->timestamp = now;
 }
 
 /*
@@ -284,15 +277,17 @@ static void update_counter_times(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	u64 run_end;
 
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
-		counter->total_time_enabled = ctx->time_now -
-			counter->tstamp_enabled;
-		if (counter->state == PERF_COUNTER_STATE_INACTIVE)
-			run_end = counter->tstamp_stopped;
-		else
-			run_end = ctx->time_now;
-		counter->total_time_running = run_end - counter->tstamp_running;
-	}
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE)
+		return;
+
+	counter->total_time_enabled = ctx->time - counter->tstamp_enabled;
+
+	if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+		run_end = counter->tstamp_stopped;
+	else
+		run_end = ctx->time;
+
+	counter->total_time_running = run_end - counter->tstamp_running;
 }
 
 /*
@@ -332,7 +327,7 @@ static void __perf_counter_disable(void *info)
 	 * If it is in error state, leave it in error state.
 	 */
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
-		update_context_time(ctx, 1);
+		update_context_time(ctx);
 		update_counter_times(counter);
 		if (counter == counter->group_leader)
 			group_sched_out(counter, cpuctx, ctx);
@@ -426,7 +421,7 @@ counter_sched_in(struct perf_counter *counter,
 		return -EAGAIN;
 	}
 
-	counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+	counter->tstamp_running += ctx->time - counter->tstamp_stopped;
 
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu++;
@@ -493,9 +488,9 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 	list_add_counter(counter, ctx);
 	ctx->nr_counters++;
 	counter->prev_state = PERF_COUNTER_STATE_OFF;
-	counter->tstamp_enabled = ctx->time_now;
-	counter->tstamp_running = ctx->time_now;
-	counter->tstamp_stopped = ctx->time_now;
+	counter->tstamp_enabled = ctx->time;
+	counter->tstamp_running = ctx->time;
+	counter->tstamp_stopped = ctx->time;
 }
 
 /*
@@ -522,7 +517,7 @@ static void __perf_install_in_context(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
-	update_context_time(ctx, 1);
+	update_context_time(ctx);
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -648,13 +643,13 @@ static void __perf_counter_enable(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	spin_lock(&ctx->lock);
-	update_context_time(ctx, 1);
+	update_context_time(ctx);
 
 	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
+	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -737,8 +732,8 @@ static void perf_counter_enable(struct perf_counter *counter)
 	 */
 	if (counter->state == PERF_COUNTER_STATE_OFF) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled = ctx->time_now -
-			counter->total_time_enabled;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
 	}
  out:
 	spin_unlock_irq(&ctx->lock);
@@ -778,7 +773,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	ctx->is_active = 0;
 	if (likely(!ctx->nr_counters))
 		goto out;
-	update_context_time(ctx, 0);
+	update_context_time(ctx);
 
 	flags = hw_perf_save_disable();
 	if (ctx->nr_active) {
@@ -883,12 +878,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	if (likely(!ctx->nr_counters))
 		goto out;
 
-	/*
-	 * Add any time since the last sched_out to the lost time
-	 * so it doesn't get included in the total_time_enabled and
-	 * total_time_running measures for counters in the context.
-	 */
-	ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+	ctx->timestamp = perf_clock();
 
 	flags = hw_perf_save_disable();
 
@@ -1043,8 +1033,8 @@ int perf_counter_task_enable(void)
 		if (counter->state > PERF_COUNTER_STATE_OFF)
 			continue;
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled = ctx->time_now -
-			counter->total_time_enabled;
+		counter->tstamp_enabled =
+			ctx->time - counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
 	hw_perf_restore(perf_flags);
@@ -1113,7 +1103,7 @@ static void __read(void *info)
 
 	curr_rq_lock_irq_save(&flags);
 	if (ctx->is_active)
-		update_context_time(ctx, 1);
+		update_context_time(ctx);
 	counter->hw_ops->read(counter);
 	update_counter_times(counter);
 	curr_rq_unlock_irq_restore(&flags);
-- 
GitLab


From a39d6f2556c4a19f58f538c6aa28bf8faca4fcb8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:11 +0200
Subject: [PATCH 0256/2198] perf_counter: rework the task clock software
 counter

Rework the task clock software counter to use the context time instead
of the task runtime clock, this removes the last such user.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.445450972@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 42 ++++++++++++------------------------------
 1 file changed, 12 insertions(+), 30 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 84d85ab4e1611..56b7eb53d673e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -974,9 +974,6 @@ int perf_counter_task_disable(void)
 	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
-	/* force the update of the task clock: */
-	__task_delta_exec(curr, 1);
-
 	perf_counter_task_sched_out(curr, cpu);
 
 	spin_lock(&ctx->lock);
@@ -1017,9 +1014,6 @@ int perf_counter_task_enable(void)
 	curr_rq_lock_irq_save(&flags);
 	cpu = smp_processor_id();
 
-	/* force the update of the task clock: */
-	__task_delta_exec(curr, 1);
-
 	perf_counter_task_sched_out(curr, cpu);
 
 	spin_lock(&ctx->lock);
@@ -2347,38 +2341,28 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
  * Software counter: task time clock
  */
 
-/*
- * Called from within the scheduler:
- */
-static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
-{
-	struct task_struct *curr = counter->task;
-	u64 delta;
-
-	delta = __task_delta_exec(curr, update);
-
-	return curr->se.sum_exec_runtime + delta;
-}
-
-static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
+static void task_clock_perf_counter_update(struct perf_counter *counter)
 {
-	u64 prev;
+	u64 prev, now;
 	s64 delta;
 
-	prev = atomic64_read(&counter->hw.prev_count);
-
-	atomic64_set(&counter->hw.prev_count, now);
+	update_context_time(counter->ctx);
+	now = counter->ctx->time;
 
+	prev = atomic64_xchg(&counter->hw.prev_count, now);
 	delta = now - prev;
-
 	atomic64_add(delta, &counter->count);
 }
 
 static int task_clock_perf_counter_enable(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	u64 now;
+
+	update_context_time(counter->ctx);
+	now = counter->ctx->time;
 
-	atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0));
+	atomic64_set(&hwc->prev_count, now);
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
@@ -2393,14 +2377,12 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
 	hrtimer_cancel(&counter->hw.hrtimer);
-	task_clock_perf_counter_update(counter,
-			task_clock_perf_counter_val(counter, 0));
+	task_clock_perf_counter_update(counter);
 }
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	task_clock_perf_counter_update(counter,
-			task_clock_perf_counter_val(counter, 1));
+	task_clock_perf_counter_update(counter);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
GitLab


From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:12 +0200
Subject: [PATCH 0257/2198] perf_counter: remove rq->lock usage

Now that all the task runtime clock users are gone, remove the ugly
rq->lock usage from perf counters, which solves the nasty deadlock
seen when a software task clock counter was read from an NMI overflow
context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.531137582@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |  2 --
 kernel/perf_counter.c       | 42 ++++++++++++++-----------------------
 kernel/sched.c              | 20 ------------------
 3 files changed, 16 insertions(+), 48 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index b6d2887a5d889..080d1fd461d74 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -85,8 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 /*
  * Lock/unlock the current runqueue - to extract task statistics:
  */
-extern void curr_rq_lock_irq_save(unsigned long *flags);
-extern void curr_rq_unlock_irq_restore(unsigned long *flags);
 extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 56b7eb53d673e..f4f7596f78414 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -172,8 +172,7 @@ static void __perf_counter_remove_from_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	counter_sched_out(counter, cpuctx, ctx);
 
@@ -198,8 +197,7 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 
@@ -319,8 +317,7 @@ static void __perf_counter_disable(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 
 	/*
 	 * If the counter is on, turn it off.
@@ -336,8 +333,7 @@ static void __perf_counter_disable(void *info)
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -515,8 +511,7 @@ static void __perf_install_in_context(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 	update_context_time(ctx);
 
 	/*
@@ -565,8 +560,7 @@ static void __perf_install_in_context(void *info)
  unlock:
 	hw_perf_restore(perf_flags);
 
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -641,8 +635,7 @@ static void __perf_counter_enable(void *info)
 	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
 
-	curr_rq_lock_irq_save(&flags);
-	spin_lock(&ctx->lock);
+	spin_lock_irqsave(&ctx->lock, flags);
 	update_context_time(ctx);
 
 	counter->prev_state = counter->state;
@@ -678,8 +671,7 @@ static void __perf_counter_enable(void *info)
 	}
 
  unlock:
-	spin_unlock(&ctx->lock);
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
 /*
@@ -971,7 +963,7 @@ int perf_counter_task_disable(void)
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	curr_rq_lock_irq_save(&flags);
+	local_irq_save(flags);
 	cpu = smp_processor_id();
 
 	perf_counter_task_sched_out(curr, cpu);
@@ -992,9 +984,7 @@ int perf_counter_task_disable(void)
 
 	hw_perf_restore(perf_flags);
 
-	spin_unlock(&ctx->lock);
-
-	curr_rq_unlock_irq_restore(&flags);
+	spin_unlock_irqrestore(&ctx->lock, flags);
 
 	return 0;
 }
@@ -1011,7 +1001,7 @@ int perf_counter_task_enable(void)
 	if (likely(!ctx->nr_counters))
 		return 0;
 
-	curr_rq_lock_irq_save(&flags);
+	local_irq_save(flags);
 	cpu = smp_processor_id();
 
 	perf_counter_task_sched_out(curr, cpu);
@@ -1037,7 +1027,7 @@ int perf_counter_task_enable(void)
 
 	perf_counter_task_sched_in(curr, cpu);
 
-	curr_rq_unlock_irq_restore(&flags);
+	local_irq_restore(flags);
 
 	return 0;
 }
@@ -1095,12 +1085,12 @@ static void __read(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
-	curr_rq_lock_irq_save(&flags);
+	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
 	counter->hw_ops->read(counter);
 	update_counter_times(counter);
-	curr_rq_unlock_irq_restore(&flags);
+	local_irq_restore(flags);
 }
 
 static u64 perf_counter_read(struct perf_counter *counter)
@@ -2890,7 +2880,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		 * Be careful about zapping the list - IRQ/NMI context
 		 * could still be processing it:
 		 */
-		curr_rq_lock_irq_save(&flags);
+		local_irq_save(flags);
 		perf_flags = hw_perf_save_disable();
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -2903,7 +2893,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		child_ctx->nr_counters--;
 
 		hw_perf_restore(perf_flags);
-		curr_rq_unlock_irq_restore(&flags);
+		local_irq_restore(flags);
 	}
 
 	parent_counter = child_counter->parent;
diff --git a/kernel/sched.c b/kernel/sched.c
index f76e3c0188a2e..0de2f814fb18c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	}
 }
 
-void curr_rq_lock_irq_save(unsigned long *flags)
-	__acquires(rq->lock)
-{
-	struct rq *rq;
-
-	local_irq_save(*flags);
-	rq = cpu_rq(smp_processor_id());
-	spin_lock(&rq->lock);
-}
-
-void curr_rq_unlock_irq_restore(unsigned long *flags)
-	__releases(rq->lock)
-{
-	struct rq *rq;
-
-	rq = cpu_rq(smp_processor_id());
-	spin_unlock(&rq->lock);
-	local_irq_restore(*flags);
-}
-
 void task_rq_unlock_wait(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
-- 
GitLab


From bce379bf358738ab8efc8cda2594a299ac685085 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 6 Apr 2009 11:45:13 +0200
Subject: [PATCH 0258/2198] perf_counter: minimize context time updates

Push the update_context_time() calls up the stack so that we get less
invokations and thereby a less noisy output:

before:

 # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null

 Performance counter stats for 'ls':

      10.163691  cpu clock ticks      (msecs)  (scaled from 98.94%)
      10.215360  task clock ticks     (msecs)  (scaled from 98.18%)
      10.185549  task clock ticks     (msecs)  (scaled from 98.53%)
      10.183581  task clock ticks     (msecs)  (scaled from 98.71%)

 Wall-clock time elapsed:    11.912858 msecs

after:

 # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null

 Performance counter stats for 'ls':

       9.316630  cpu clock ticks      (msecs)
       9.280789  task clock ticks     (msecs)
       9.280789  task clock ticks     (msecs)
       9.280789  task clock ticks     (msecs)

 Wall-clock time elapsed:     9.574872 msecs

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090406094518.618876874@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f4f7596f78414..863703b3158f8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -319,6 +319,8 @@ static void __perf_counter_disable(void *info)
 
 	spin_lock_irqsave(&ctx->lock, flags);
 
+	update_context_time(ctx);
+
 	/*
 	 * If the counter is on, turn it off.
 	 * If it is in error state, leave it in error state.
@@ -797,6 +799,8 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	if (likely(!cpuctx->task_ctx))
 		return;
 
+	update_context_time(ctx);
+
 	regs = task_pt_regs(task);
 	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
 	__perf_counter_sched_out(ctx, cpuctx);
@@ -2336,7 +2340,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter)
 	u64 prev, now;
 	s64 delta;
 
-	update_context_time(counter->ctx);
 	now = counter->ctx->time;
 
 	prev = atomic64_xchg(&counter->hw.prev_count, now);
@@ -2349,7 +2352,6 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	u64 now;
 
-	update_context_time(counter->ctx);
 	now = counter->ctx->time;
 
 	atomic64_set(&hwc->prev_count, now);
@@ -2372,6 +2374,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
+	update_context_time(counter->ctx);
 	task_clock_perf_counter_update(counter);
 }
 
-- 
GitLab


From 6278af660ff83fbafb18e53fc2747eb2ee6780fa Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 2 Apr 2009 10:40:28 +0200
Subject: [PATCH 0259/2198] perf_counter tools: kerneltop: display per function
 percentage along with event count

------------------------------------------------------------------------------
 KernelTop:   90551 irqs/sec  kernel:15.0% [NMI, 100000 CPU cycles],  (all, 4 CPUs)
------------------------------------------------------------------------------

             events    pcnt         RIP          kernel function
  ______     ______   _____   ________________   _______________

           16871.00 - 19.1% - ffffffff80328e20 : clear_page_c
            8810.00 -  9.9% - ffffffff8048ce80 : page_fault
            4746.00 -  5.4% - ffffffff8048cae2 : _spin_lock
            4428.00 -  5.0% - ffffffff80328e70 : copy_page_c
            3340.00 -  3.8% - ffffffff80329090 : copy_user_generic_string!
            2679.00 -  3.0% - ffffffff8028a16b : get_page_from_freelist
            2254.00 -  2.5% - ffffffff80296f19 : unmap_vmas
            2082.00 -  2.4% - ffffffff80297e19 : handle_mm_fault
            1754.00 -  2.0% - ffffffff80288dc8 : __rmqueue_smallest
            1553.00 -  1.8% - ffffffff8048ca58 : _spin_lock_irqsave
            1400.00 -  1.6% - ffffffff8028cdc8 : release_pages
            1337.00 -  1.5% - ffffffff80285400 : find_get_page
            1335.00 -  1.5% - ffffffff80225a23 : do_page_fault
            1299.00 -  1.5% - ffffffff802ba8e7 : __d_lookup
            1174.00 -  1.3% - ffffffff802b38f3 : __link_path_walk
            1155.00 -  1.3% - ffffffff802843e1 : perf_swcounter_ctx_event!
            1137.00 -  1.3% - ffffffff8028d118 : ____pagevec_lru_add
             963.00 -  1.1% - ffffffff802a670b : kmem_cache_alloc
             885.00 -  1.0% - ffffffff8024bc61 : __wake_up_bit

Display per function percentage along with event count.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 45 +++++++++++++-------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 4f8d7917aba12..15f3a5f901989 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -636,16 +636,20 @@ static void print_sym_table(void)
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
+	float sum_kevents = 0.0;
 
 	events = userspace_events = 0;
 	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
 	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
 
+	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
+		sum_kevents += tmp[i].count[0];
+
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
 
 	printf(
 "------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%3.1f%% [%s, ",
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
 		events_per_sec,
 		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
 		nmi ? "NMI" : "IRQ");
@@ -679,34 +683,31 @@ static void print_sym_table(void)
 	printf("------------------------------------------------------------------------------\n\n");
 
 	if (nr_counters == 1)
-		printf("             events");
+		printf("             events    pcnt");
 	else
-		printf("  weight     events");
+		printf("  weight     events    pcnt");
 
 	printf("         RIP          kernel function\n"
-	       	       "  ______     ______   ________________   _______________\n\n"
+	       	       "  ______     ______   _____   ________________   _______________\n\n"
 	);
 
-	printed = 0;
-	for (i = 0; i < sym_table_count; i++) {
+	for (i = 0, printed = 0; i < sym_table_count; i++) {
+		float pcnt;
 		int count;
 
-		if (nr_counters == 1) {
-			if (printed <= 18 &&
-					tmp[i].count[0] >= count_filter) {
-				printf("%19.2f - %016llx : %s\n",
-				  sym_weight(tmp + i), tmp[i].addr, tmp[i].sym);
-				printed++;
-			}
-		} else {
-			if (printed <= 18 &&
-					tmp[i].count[0] >= count_filter) {
-				printf("%8.1f %10ld - %016llx : %s\n",
-				  sym_weight(tmp + i),
-				  tmp[i].count[0],
-				  tmp[i].addr, tmp[i].sym);
-				printed++;
-			}
+		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
+			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+
+			if (nr_counters == 1)
+				printf("%19.2f - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					pcnt, tmp[i].addr, tmp[i].sym);
+			else
+				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					tmp[i].count[0],
+					pcnt, tmp[i].addr, tmp[i].sym);
+			printed++;
 		}
 		/*
 		 * Add decay to the counts:
-- 
GitLab


From 98c2aaf8be5baf7193be37fb28bce8e7327158bc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 7 Apr 2009 11:30:17 +0200
Subject: [PATCH 0260/2198] x86, perfcounters: add atomic64_xchg()

Complete atomic64_t support on the 32-bit side by adding atomic64_xch().

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090406094518.445450972@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/atomic_32.h | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 977250ed8b898..aff9f1fcdcd7e 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -291,19 +291,37 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
 }
 
 /**
- * atomic64_set - set atomic64 variable
+ * atomic64_xchg - xchg atomic64 variable
  * @ptr:      pointer to type atomic64_t
  * @new_val:  value to assign
+ * @old_val:  old value that was there
  *
- * Atomically sets the value of @ptr to @new_val.
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
  */
-static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
 {
 	unsigned long long old_val;
 
 	do {
 		old_val = atomic_read(ptr);
 	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	atomic64_xchg(ptr, new_val);
 }
 
 /**
-- 
GitLab


From cac94f979326212831c0ea44ed9ea1622b4f4e93 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:33 +0200
Subject: [PATCH 0261/2198] x86, bts: fix race when bts tracer is removed

When the bts tracer is removed while the traced task is running,
the write to clear the bts tracer pointer races with context switch code.

Read the tracer once during a context switch.

When a new tracer is installed, the bts tracer is set in the ds context
before the tracer is initialized in order to claim the context for that
tracer.

This may result in write accesses using an uninitialized trace configuration
when scheduling timestamps have been requested.

Store active tracing flags separately and only set active flags after
the tracing configuration has been initialized.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144548.881338000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 58 ++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index b1d6e1f502fa3..c730155bf54d3 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -89,6 +89,9 @@ struct bts_tracer {
 
 	/* Buffer overflow notification function: */
 	bts_ovfl_callback_t	ovfl;
+
+	/* Active flags affecting trace collection. */
+	unsigned int		flags;
 };
 
 struct pebs_tracer {
@@ -799,6 +802,8 @@ void ds_suspend_bts(struct bts_tracer *tracer)
 	if (!tracer)
 		return;
 
+	tracer->flags = 0;
+
 	task = tracer->ds.context->task;
 
 	if (!task || (task == current))
@@ -820,6 +825,8 @@ void ds_resume_bts(struct bts_tracer *tracer)
 	if (!tracer)
 		return;
 
+	tracer->flags = tracer->trace.ds.flags;
+
 	task = tracer->ds.context->task;
 
 	control = ds_cfg.ctl[dsf_bts];
@@ -1037,43 +1044,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 	}
 }
 
+static inline void ds_take_timestamp(struct ds_context *context,
+				     enum bts_qualifier qualifier,
+				     struct task_struct *task)
+{
+	struct bts_tracer *tracer = context->bts_master;
+	struct bts_struct ts;
+
+	/* Prevent compilers from reading the tracer pointer twice. */
+	barrier();
+
+	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+		return;
+
+	memset(&ts, 0, sizeof(ts));
+	ts.qualifier			= qualifier;
+	ts.variant.timestamp.jiffies	= jiffies_64;
+	ts.variant.timestamp.pid	= task->pid;
+
+	bts_write(tracer, &ts);
+}
+
 /*
  * Change the DS configuration from tracing prev to tracing next.
  */
 void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 {
-	struct ds_context *prev_ctx = prev->thread.ds_ctx;
-	struct ds_context *next_ctx = next->thread.ds_ctx;
+	struct ds_context *prev_ctx	= prev->thread.ds_ctx;
+	struct ds_context *next_ctx	= next->thread.ds_ctx;
+	unsigned long debugctlmsr	= next->thread.debugctlmsr;
+
+	/* Make sure all data is read before we start. */
+	barrier();
 
 	if (prev_ctx) {
 		update_debugctlmsr(0);
 
-		if (prev_ctx->bts_master &&
-		    (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-			struct bts_struct ts = {
-				.qualifier = bts_task_departs,
-				.variant.timestamp.jiffies = jiffies_64,
-				.variant.timestamp.pid = prev->pid
-			};
-			bts_write(prev_ctx->bts_master, &ts);
-		}
+		ds_take_timestamp(prev_ctx, bts_task_departs, prev);
 	}
 
 	if (next_ctx) {
-		if (next_ctx->bts_master &&
-		    (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-			struct bts_struct ts = {
-				.qualifier = bts_task_arrives,
-				.variant.timestamp.jiffies = jiffies_64,
-				.variant.timestamp.pid = next->pid
-			};
-			bts_write(next_ctx->bts_master, &ts);
-		}
+		ds_take_timestamp(next_ctx, bts_task_arrives, next);
 
 		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
 	}
 
-	update_debugctlmsr(next->thread.debugctlmsr);
+	update_debugctlmsr(debugctlmsr);
 }
 
 void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
-- 
GitLab


From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:34 +0200
Subject: [PATCH 0262/2198] sched, hw-branch-tracer: add
 wait_task_context_switch() function to sched.h

Add a function to wait until some other task has been
switched out at least once.

This differs from wait_task_inactive() subtly, in that the
latter will wait until the task has left the CPU.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: markus.t.metzger@gmail.com
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144549.794157000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/sched.c        | 43 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b94f3541f67be..a5b9a83065fae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
+extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
+static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
 					       long match_state)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 6cc1fd5d5072b..f91bc8141dc34 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 	return 1;
 }
 
+/*
+ * wait_task_context_switch -	wait for a thread to complete at least one
+ *				context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+	unsigned long nvcsw, nivcsw, flags;
+	int running;
+	struct rq *rq;
+
+	nvcsw	= p->nvcsw;
+	nivcsw	= p->nivcsw;
+	for (;;) {
+		/*
+		 * The runqueue is assigned before the actual context
+		 * switch. We need to take the runqueue lock.
+		 *
+		 * We could check initially without the lock but it is
+		 * very likely that we need to take the lock in every
+		 * iteration.
+		 */
+		rq = task_rq_lock(p, &flags);
+		running = task_running(rq, p);
+		task_rq_unlock(rq, &flags);
+
+		if (likely(!running))
+			break;
+		/*
+		 * The switch count is incremented before the actual
+		 * context switch. We thus wait for two switches to be
+		 * sure at least one completed.
+		 */
+		if ((p->nvcsw - nvcsw) > 1)
+			break;
+		if ((p->nivcsw - nivcsw) > 1)
+			break;
+
+		cpu_relax();
+	}
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
-- 
GitLab


From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:35 +0200
Subject: [PATCH 0263/2198] mm, x86, ptrace, bts: defer branch trace stopping

When a ptraced task is unlinked, we need to stop branch tracing for
that task.

Since the unlink is called with interrupts disabled, and we need
interrupts enabled to stop branch tracing, we defer the work.

Collect all branch tracing related stuff in a branch tracing context.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144550.712401000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h |   4 -
 arch/x86/kernel/ptrace.c         | 254 ++++++++++++++++++++-----------
 include/linux/mm.h               |   3 +-
 include/linux/sched.h            |   9 +-
 mm/mlock.c                       |  13 +-
 5 files changed, 179 insertions(+), 104 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 34c52370f2fe8..2483807e06e7f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -458,10 +458,6 @@ struct thread_struct {
 /* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
 	struct ds_context	*ds_ctx;
 #endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
-	unsigned int	bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
 };
 
 static inline unsigned long native_get_debugreg(int regno)
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index fe9345c967de2..7c21d1e8cae79 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -22,6 +22,7 @@
 #include <linux/seccomp.h>
 #include <linux/signal.h>
 #include <linux/ftrace.h>
+#include <linux/workqueue.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target,
 }
 
 #ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct for the duration of ptrace operations,
+ * but since destruction is deferred, it may be executed after both
+ * tracer and tracee exited.
+ */
+struct bts_context {
+	/* The branch trace handle. */
+	struct bts_tracer	*tracer;
+
+	/* The buffer used to store the branch trace and its size. */
+	void			*buffer;
+	unsigned int		size;
+
+	/* The mm that paid for the above buffer. */
+	struct mm_struct	*mm;
+
+	/* The task this context belongs to. */
+	struct task_struct	*task;
+
+	/* The signal to send on a bts buffer overflow. */
+	unsigned int		bts_ovfl_signal;
+
+	/* The work struct to destroy a context. */
+	struct work_struct	work;
+};
+
+static inline void alloc_bts_buffer(struct bts_context *context,
+				    unsigned int size)
+{
+	void *buffer;
+
+	buffer = alloc_locked_buffer(size);
+	if (buffer) {
+		context->buffer = buffer;
+		context->size = size;
+		context->mm = get_task_mm(current);
+	}
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+	if (!context->buffer)
+		return;
+
+	kfree(context->buffer);
+	context->buffer = NULL;
+
+	refund_locked_buffer_memory(context->mm, context->size);
+	context->size = 0;
+
+	mmput(context->mm);
+	context->mm = NULL;
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+	struct bts_context *context;
+
+	context = container_of(w, struct bts_context, work);
+
+	ds_release_bts(context->tracer);
+	put_task_struct(context->task);
+	free_bts_buffer(context);
+	kfree(context);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+	INIT_WORK(&context->work, free_bts_context_work);
+	schedule_work(&context->work);
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+	struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+	if (context) {
+		context->task = task;
+		task->bts = context;
+
+		get_task_struct(task);
+	}
+
+	return context;
+}
+
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 				  struct bts_struct __user *out)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	struct bts_struct bts;
 	const unsigned char *at;
 	int error;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	at = trace->ds.top - ((index + 1) * trace->ds.size);
 	if ((void *)at < trace->ds.begin)
@@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
 	if (!trace->read)
 		return -EOPNOTSUPP;
 
-	error = trace->read(child->bts, at, &bts);
+	error = trace->read(context->tracer, at, &bts);
 	if (error < 0)
 		return error;
 
@@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 			    long size,
 			    struct bts_struct __user *out)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	const unsigned char *at;
 	int error, drained = 0;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	if (!trace->read)
 		return -EOPNOTSUPP;
@@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child,
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     out++, drained++, at += trace->ds.size) {
 		struct bts_struct bts;
-		int error;
 
-		error = trace->read(child->bts, at, &bts);
+		error = trace->read(context->tracer, at, &bts);
 		if (error < 0)
 			return error;
 
@@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-	error = ds_reset_bts(child->bts);
+	error = ds_reset_bts(context->tracer);
 	if (error < 0)
 		return error;
 
 	return drained;
 }
 
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
-	child->bts_buffer = alloc_locked_buffer(size);
-	if (!child->bts_buffer)
-		return -ENOMEM;
-
-	child->bts_size = size;
-
-	return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
-	free_locked_buffer(child->bts_buffer, child->bts_size);
-	child->bts_buffer = NULL;
-	child->bts_size = 0;
-}
-
 static int ptrace_bts_config(struct task_struct *child,
 			     long cfg_size,
 			     const struct ptrace_bts_config __user *ucfg)
 {
+	struct bts_context *context;
 	struct ptrace_bts_config cfg;
 	unsigned int flags = 0;
 
@@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
 		return -EFAULT;
 
-	if (child->bts) {
-		ds_release_bts(child->bts);
-		child->bts = NULL;
-	}
+	context = child->bts;
+	if (!context)
+		context = alloc_bts_context(child);
+	if (!context)
+		return -ENOMEM;
 
 	if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
 		if (!cfg.signal)
 			return -EINVAL;
 
-		child->thread.bts_ovfl_signal = cfg.signal;
 		return -EOPNOTSUPP;
+		context->bts_ovfl_signal = cfg.signal;
 	}
 
-	if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-	    (cfg.size != child->bts_size)) {
-		int error;
+	ds_release_bts(context->tracer);
+	context->tracer = NULL;
 
-		ptrace_bts_free_buffer(child);
+	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+		free_bts_buffer(context);
+		if (!cfg.size)
+			return 0;
 
-		error = ptrace_bts_allocate_buffer(child, cfg.size);
-		if (error < 0)
-			return error;
+		alloc_bts_buffer(context, cfg.size);
+		if (!context->buffer)
+			return -ENOMEM;
 	}
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 		flags |= BTS_TIMESTAMPS;
 
-	child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
-				    /* ovfl = */ NULL, /* th = */ (size_t)-1,
-				    flags);
-	if (IS_ERR(child->bts)) {
-		int error = PTR_ERR(child->bts);
-
-		ptrace_bts_free_buffer(child);
-		child->bts = NULL;
+	context->tracer = ds_request_bts(child, context->buffer, context->size,
+					 NULL, (size_t)-1, flags);
+	if (unlikely(IS_ERR(context->tracer))) {
+		int error = PTR_ERR(context->tracer);
 
+		free_bts_buffer(context);
+		context->tracer = NULL;
 		return error;
 	}
 
@@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child,
 			     long cfg_size,
 			     struct ptrace_bts_config __user *ucfg)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 	struct ptrace_bts_config cfg;
 
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
 	if (cfg_size < sizeof(cfg))
 		return -EIO;
 
-	trace = ds_read_bts(child->bts);
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	memset(&cfg, 0, sizeof(cfg));
-	cfg.size = trace->ds.end - trace->ds.begin;
-	cfg.signal = child->thread.bts_ovfl_signal;
-	cfg.bts_size = sizeof(struct bts_struct);
+	cfg.size	= trace->ds.end - trace->ds.begin;
+	cfg.signal	= context->bts_ovfl_signal;
+	cfg.bts_size	= sizeof(struct bts_struct);
 
 	if (cfg.signal)
 		cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child,
 
 static int ptrace_bts_clear(struct task_struct *child)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-	return ds_reset_bts(child->bts);
+	return ds_reset_bts(context->tracer);
 }
 
 static int ptrace_bts_size(struct task_struct *child)
 {
+	struct bts_context *context;
 	const struct bts_trace *trace;
 
-	trace = ds_read_bts(child->bts);
+	context = child->bts;
+	if (!context)
+		return -ESRCH;
+
+	trace = ds_read_bts(context->tracer);
 	if (!trace)
-		return -EPERM;
+		return -ESRCH;
 
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-static void ptrace_bts_fork(struct task_struct *tsk)
+static inline void ptrace_bts_fork(struct task_struct *tsk)
 {
 	tsk->bts = NULL;
-	tsk->bts_buffer = NULL;
-	tsk->bts_size = 0;
-	tsk->thread.bts_ovfl_signal = 0;
 }
 
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+static inline void ptrace_bts_untrace(struct task_struct *child)
 {
 	if (unlikely(child->bts)) {
-		ds_release_bts(child->bts);
+		free_bts_context(child->bts);
 		child->bts = NULL;
-
-		/* We cannot update total_vm and locked_vm since
-		   child's mm is already gone. But we can reclaim the
-		   memory. */
-		kfree(child->bts_buffer);
-		child->bts_buffer = NULL;
-		child->bts_size = 0;
 	}
 }
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
-	/*
-	 * Ptrace_detach() races with ptrace_untrace() in case
-	 * the child dies and is reaped by another thread.
-	 *
-	 * We only do the memory accounting at this point and
-	 * leave the buffer deallocation and the bts tracer
-	 * release to ptrace_bts_untrace() which will be called
-	 * later on with tasklist_lock held.
-	 */
-	release_locked_buffer(child->bts_buffer, child->bts_size);
-}
 #else
 static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
 static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
@@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-	ptrace_bts_detach(child);
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d475c75..64d8ed2538aef 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,6 +13,7 @@
 #include <linux/prio_tree.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
+#include <linux/sched.h>
 
 struct mempolicy;
 struct anon_vma;
@@ -1321,6 +1322,6 @@ void vmemmap_populate_print_last(void);
 
 extern void *alloc_locked_buffer(size_t size);
 extern void free_locked_buffer(void *buffer, size_t size);
-extern void release_locked_buffer(void *buffer, size_t size);
+extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a5b9a83065fae..52b8cd049c2e0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -96,8 +96,8 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
-struct bts_tracer;
 struct fs_struct;
+struct bts_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1210,12 +1210,7 @@ struct task_struct {
 	 * This is the tracer handle for the ptrace BTS extension.
 	 * This field actually belongs to the ptracer task.
 	 */
-	struct bts_tracer *bts;
-	/*
-	 * The buffer to hold the BTS data.
-	 */
-	void *bts_buffer;
-	size_t bts_size;
+	struct bts_context *bts;
 #endif /* CONFIG_X86_PTRACE_BTS */
 
 	/* PID/PID hash table linkage. */
diff --git a/mm/mlock.c b/mm/mlock.c
index cbe9e0581b75d..749383b442c7e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -660,21 +660,20 @@ void *alloc_locked_buffer(size_t size)
 	return buffer;
 }
 
-void release_locked_buffer(void *buffer, size_t size)
+void refund_locked_buffer_memory(struct mm_struct *mm, size_t size)
 {
 	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
+	down_write(&mm->mmap_sem);
 
-	current->mm->total_vm  -= pgsz;
-	current->mm->locked_vm -= pgsz;
+	mm->total_vm  -= pgsz;
+	mm->locked_vm -= pgsz;
 
-	up_write(&current->mm->mmap_sem);
+	up_write(&mm->mmap_sem);
 }
 
 void free_locked_buffer(void *buffer, size_t size)
 {
-	release_locked_buffer(buffer, size);
-
+	refund_locked_buffer_memory(current->mm, size);
 	kfree(buffer);
 }
-- 
GitLab


From 8d99b3ac2726e5edd97ad147fa5c1f2acb63a745 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:36 +0200
Subject: [PATCH 0264/2198] x86, bts: wait until traced task has been scheduled
 out

In order to stop branch tracing for a running task, we need to first
clear the branch tracing control bits before we may free the tracing
buffer.

If the traced task is running, the cpu might still trace that task
after the branch trace control bits have cleared.

Wait until the traced task has been scheduled out before proceeding.

A similar problem affects the task debug store context. We first remove
the context, then we need to wait until the task has been scheduled
out before we can free the context memory.

Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144551.919636000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index c730155bf54d3..5cd137ab26723 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -299,6 +299,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 
 static inline void ds_put_context(struct ds_context *context)
 {
+	struct task_struct *task;
 	unsigned long irq;
 
 	if (!context)
@@ -313,14 +314,20 @@ static inline void ds_put_context(struct ds_context *context)
 
 	*(context->this) = NULL;
 
-	if (context->task)
-		clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+	task = context->task;
+
+	if (task)
+		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-	if (!context->task || (context->task == current))
+	if (!task || (task == current))
 		wrmsrl(MSR_IA32_DS_AREA, 0);
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
+	/* The context might still be in use for context switching. */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
 	kfree(context);
 }
 
@@ -781,15 +788,23 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 void ds_release_bts(struct bts_tracer *tracer)
 {
+	struct task_struct *task;
+
 	if (!tracer)
 		return;
 
+	task = tracer->ds.context->task;
+
 	ds_suspend_bts(tracer);
 
 	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
 	tracer->ds.context->bts_master = NULL;
 
-	put_tracer(tracer->ds.context->task);
+	/* Make sure tracing stopped and the tracer is not in use. */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
+	put_tracer(task);
 	ds_put_context(tracer->ds.context);
 
 	kfree(tracer);
-- 
GitLab


From 38f801129ad07b9afa7f9bd3779f61b805416d8c Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:37 +0200
Subject: [PATCH 0265/2198] x86, bts: fix race between per-task and per-cpu
 branch tracing

Per-task branch tracing installs a debug store context with the traced
task. This immediately results in the branch trace control bits to be
cleared for the next context switch of that task, if not set before.

Either per-cpu or per-task tracing are allowed at the same time.

An active per-cpu tracing would be disabled even if the per-task tracing
request is rejected and the task debug store context removed.

Check the tracing type (per-cpu or per-task) before installing a task
debug store context.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144552.856000000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 72 +++++++++++++++++++++++++-------------------
 1 file changed, 41 insertions(+), 31 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 5cd137ab26723..f03f117eff8cd 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -193,12 +193,28 @@ static DEFINE_SPINLOCK(ds_lock);
  */
 static atomic_t tracers = ATOMIC_INIT(0);
 
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
 {
-	if (task)
+	int error;
+
+	spin_lock_irq(&ds_lock);
+
+	if (task) {
+		error = -EPERM;
+		if (atomic_read(&tracers) < 0)
+			goto out;
 		atomic_inc(&tracers);
-	else
+	} else {
+		error = -EPERM;
+		if (atomic_read(&tracers) > 0)
+			goto out;
 		atomic_dec(&tracers);
+	}
+
+	error = 0;
+out:
+	spin_unlock_irq(&ds_lock);
+	return error;
 }
 
 static inline void put_tracer(struct task_struct *task)
@@ -209,14 +225,6 @@ static inline void put_tracer(struct task_struct *task)
 		atomic_inc(&tracers);
 }
 
-static inline int check_tracer(struct task_struct *task)
-{
-	return task ?
-		(atomic_read(&tracers) >= 0) :
-		(atomic_read(&tracers) <= 0);
-}
-
-
 /*
  * The DS context is either attached to a thread or to a cpu:
  * - in the former case, the thread_struct contains a pointer to the
@@ -677,6 +685,10 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	error = get_tracer(task);
+	if (error < 0)
+		goto out;
+
 	/*
 	 * Per-cpu tracing is typically requested using smp_call_function().
 	 * We must not sleep.
@@ -684,7 +696,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	error = -ENOMEM;
 	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
-		goto out;
+		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
@@ -695,14 +707,9 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 
 	spin_lock_irqsave(&ds_lock, irq);
 
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
-
 	error = -EPERM;
 	if (tracer->ds.context->bts_master)
-		goto out_put_tracer;
+		goto out_unlock;
 	tracer->ds.context->bts_master = tracer;
 
 	spin_unlock_irqrestore(&ds_lock, irq);
@@ -716,13 +723,13 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 
 	return tracer;
 
- out_put_tracer:
-	put_tracer(task);
  out_unlock:
 	spin_unlock_irqrestore(&ds_lock, irq);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
+ out_put_tracer:
+	put_tracer(task);
  out:
 	return ERR_PTR(error);
 }
@@ -741,6 +748,10 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	if (ovfl)
 		goto out;
 
+	error = get_tracer(task);
+	if (error < 0)
+		goto out;
+
 	/*
 	 * Per-cpu tracing is typically requested using smp_call_function().
 	 * We must not sleep.
@@ -748,7 +759,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	error = -ENOMEM;
 	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
 	if (!tracer)
-		goto out;
+		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
@@ -758,14 +769,9 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 	spin_lock_irqsave(&ds_lock, irq);
 
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
-
 	error = -EPERM;
 	if (tracer->ds.context->pebs_master)
-		goto out_put_tracer;
+		goto out_unlock;
 	tracer->ds.context->pebs_master = tracer;
 
 	spin_unlock_irqrestore(&ds_lock, irq);
@@ -775,13 +781,13 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 
 	return tracer;
 
- out_put_tracer:
-	put_tracer(task);
  out_unlock:
 	spin_unlock_irqrestore(&ds_lock, irq);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
+ out_put_tracer:
+	put_tracer(task);
  out:
 	return ERR_PTR(error);
 }
@@ -804,8 +810,8 @@ void ds_release_bts(struct bts_tracer *tracer)
 	if (task && (task != current))
 		wait_task_context_switch(task);
 
-	put_tracer(task);
 	ds_put_context(tracer->ds.context);
+	put_tracer(task);
 
 	kfree(tracer);
 }
@@ -861,16 +867,20 @@ void ds_resume_bts(struct bts_tracer *tracer)
 
 void ds_release_pebs(struct pebs_tracer *tracer)
 {
+	struct task_struct *task;
+
 	if (!tracer)
 		return;
 
+	task = tracer->ds.context->task;
+
 	ds_suspend_pebs(tracer);
 
 	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
 	tracer->ds.context->pebs_master = NULL;
 
-	put_tracer(tracer->ds.context->task);
 	ds_put_context(tracer->ds.context);
+	put_tracer(task);
 
 	kfree(tracer);
 }
-- 
GitLab


From 15879d042164650b93d83281ad5f87ad323bfbfe Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:38 +0200
Subject: [PATCH 0266/2198] x86, bts: use trace_clock_global() for timestamps

Rename the bts_struct timestamp field to event.

Use trace_clock_global() for time measurement.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144553.773216000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h |  4 ++--
 arch/x86/kernel/ds.c      | 17 +++++++++--------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index a8f672ba100c5..772f141afb9a1 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -170,9 +170,9 @@ struct bts_struct {
 		} lbr;
 		/* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
 		struct {
-			__u64 jiffies;
+			__u64 clock;
 			pid_t pid;
-		} timestamp;
+		} event;
 	} variant;
 };
 
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index f03f117eff8cd..2071b992c35ce 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -25,6 +25,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
+#include <linux/trace_clock.h>
 
 #include <asm/ds.h>
 
@@ -471,7 +472,7 @@ enum bts_field {
 	bts_flags,
 
 	bts_qual		= bts_from,
-	bts_jiffies		= bts_to,
+	bts_clock		= bts_to,
 	bts_pid			= bts_flags,
 
 	bts_qual_mask		= (bts_qual_max - 1),
@@ -517,8 +518,8 @@ bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 	memset(out, 0, sizeof(*out));
 	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
 		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-		out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
-		out->variant.timestamp.pid = bts_get(at, bts_pid);
+		out->variant.event.clock = bts_get(at, bts_clock);
+		out->variant.event.pid = bts_get(at, bts_pid);
 	} else {
 		out->qualifier = bts_branch;
 		out->variant.lbr.from = bts_get(at, bts_from);
@@ -555,8 +556,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
 	case bts_task_arrives:
 	case bts_task_departs:
 		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-		bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
-		bts_set(raw, bts_pid, in->variant.timestamp.pid);
+		bts_set(raw, bts_clock, in->variant.event.clock);
+		bts_set(raw, bts_pid, in->variant.event.pid);
 		break;
 	default:
 		return -EINVAL;
@@ -1083,9 +1084,9 @@ static inline void ds_take_timestamp(struct ds_context *context,
 		return;
 
 	memset(&ts, 0, sizeof(ts));
-	ts.qualifier			= qualifier;
-	ts.variant.timestamp.jiffies	= jiffies_64;
-	ts.variant.timestamp.pid	= task->pid;
+	ts.qualifier		= qualifier;
+	ts.variant.event.clock	= trace_clock_global();
+	ts.variant.event.pid	= task->pid;
 
 	bts_write(tracer, &ts);
 }
-- 
GitLab


From 35bb7600c17762bb129588c1877d2717fe325289 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:39 +0200
Subject: [PATCH 0267/2198] x86, debugctlmsr: add _on_cpu variants to
 debugctlmsr functions

Add functions to get and set the debugctlmsr on different cpus.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144554.738772000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2483807e06e7f..1efeb497f1f90 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -785,6 +785,21 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
 }
 
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+	u64 debugctlmsr = 0;
+	u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return 0;
+#endif
+	rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+	debugctlmsr = val1 | ((u64)val2 << 32);
+
+	return debugctlmsr;
+}
+
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -794,6 +809,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 
+static inline void update_debugctlmsr_on_cpu(int cpu,
+					     unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+	if (boot_cpu_data.x86 < 6)
+		return;
+#endif
+	wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+		     (u32)((u64)debugctlmsr),
+		     (u32)((u64)debugctlmsr >> 32));
+}
+
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
-- 
GitLab


From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:40 +0200
Subject: [PATCH 0268/2198] x86, bts, hw-branch-tracer: add _noirq variants to
 the debug store interface

The hw-branch-tracer uses debug store functions from an on_each_cpu()
context, which is simply wrong since the functions may sleep.

Add _noirq variants for most functions, which  may be called with
interrupts disabled.

Separate per-cpu and per-task tracing and allow per-cpu tracing to be
controlled from any cpu.

Make the hw-branch-tracer use the new debug store interface, synchronize
with hotplug cpu event using get/put_online_cpus(), and remove the
unnecessary spinlock.

Make the ptrace bts and the ds selftest code use the new interface.

Defer the ds selftest.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144555.658136000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h        |  57 +++-
 arch/x86/kernel/ds.c             | 474 ++++++++++++++++++++++++-------
 arch/x86/kernel/ds_selftest.c    |   9 +-
 arch/x86/kernel/ptrace.c         |   5 +-
 kernel/trace/trace_hw_branches.c | 193 +++++--------
 5 files changed, 492 insertions(+), 246 deletions(-)

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 772f141afb9a1..413e127e567d7 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -15,8 +15,8 @@
  * - buffer allocation (memory accounting)
  *
  *
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
 
 #ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
  * The interrupt threshold is independent from the overflow callback
  * to allow users to use their own overflow interrupt handling mechanism.
  *
- * task: the task to request recording for;
- *       NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu:  the cpu to request recording for
  * base: the base pointer for the (non-pageable) buffer;
  * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
  *     -1 if no interrupt threshold is requested.
  * flags: a bit-mask of the above flags
  */
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
-					 void *base, size_t size,
-					 bts_ovfl_callback_t ovfl,
-					 size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-					   void *base, size_t size,
-					   pebs_ovfl_callback_t ovfl,
-					   size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+					      void *base, size_t size,
+					      bts_ovfl_callback_t ovfl,
+					      size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+					     bts_ovfl_callback_t ovfl,
+					     size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+						void *base, size_t size,
+						pebs_ovfl_callback_t ovfl,
+						size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+					       void *base, size_t size,
+					       pebs_ovfl_callback_t ovfl,
+					       size_t th, unsigned int flags);
 
 /*
  * Release BTS or PEBS resources
  * Suspend and resume BTS or PEBS tracing
  *
+ * Must be called with irq's enabled.
+ *
  * tracer: the tracer handle returned from ds_request_~()
  */
 extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
 extern void ds_suspend_pebs(struct pebs_tracer *tracer);
 extern void ds_resume_pebs(struct pebs_tracer *tracer);
 
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
 
 /*
  * The raw DS buffer state as it is used for BTS and PEBS recording.
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 2071b992c35ce..21a3852abf685 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -245,60 +245,50 @@ struct ds_context {
 	struct pebs_tracer	*pebs_master;
 
 	/* Use count: */
-	unsigned long count;
+	unsigned long		count;
 
 	/* Pointer to the context pointer field: */
 	struct ds_context	**this;
 
-	/* The traced task; NULL for current cpu: */
+	/* The traced task; NULL for cpu tracing: */
 	struct task_struct	*task;
-};
 
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+	/* The traced cpu; only valid if task is NULL: */
+	int			cpu;
+};
 
-#define system_context per_cpu(system_context_array, smp_processor_id())
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
 
 
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
 {
 	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &system_context);
+		(task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
 	struct ds_context *context = NULL;
 	struct ds_context *new_context = NULL;
-	unsigned long irq;
 
-	/*
-	 * Chances are small that we already have a context.
-	 *
-	 * Contexts for per-cpu tracing are allocated using
-	 * smp_call_function(). We must not sleep.
-	 */
-	new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC);
+	/* Chances are small that we already have a context. */
+	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
 	if (!new_context)
 		return NULL;
 
-	spin_lock_irqsave(&ds_lock, irq);
+	spin_lock_irq(&ds_lock);
 
 	context = *p_context;
-	if (!context) {
+	if (likely(!context)) {
 		context = new_context;
 
 		context->this = p_context;
 		context->task = task;
+		context->cpu = cpu;
 		context->count = 0;
 
-		if (task)
-			set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-		if (!task || (task == current))
-			wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
 		*p_context = context;
 	}
 
 	context->count++;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
 	if (context != new_context)
 		kfree(new_context);
@@ -306,7 +296,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 	return context;
 }
 
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
 {
 	struct task_struct *task;
 	unsigned long irq;
@@ -328,8 +318,15 @@ static inline void ds_put_context(struct ds_context *context)
 	if (task)
 		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-	if (!task || (task == current))
-		wrmsrl(MSR_IA32_DS_AREA, 0);
+	/*
+	 * We leave the (now dangling) pointer to the DS configuration in
+	 * the DS_AREA msr. This is as good or as bad as replacing it with
+	 * NULL - the hardware would crash if we enabled tracing.
+	 *
+	 * This saves us some problems with having to write an msr on a
+	 * different cpu while preventing others from doing the same for the
+	 * next context for that same cpu.
+	 */
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
@@ -340,6 +337,31 @@ static inline void ds_put_context(struct ds_context *context)
 	kfree(context);
 }
 
+static void ds_install_ds_area(struct ds_context *context)
+{
+	unsigned long ds;
+
+	ds = (unsigned long)context->ds;
+
+	/*
+	 * There is a race between the bts master and the pebs master.
+	 *
+	 * The thread/cpu access is synchronized via get/put_cpu() for
+	 * task tracing and via wrmsr_on_cpu for cpu tracing.
+	 *
+	 * If bts and pebs are collected for the same task or same cpu,
+	 * the same confiuration is written twice.
+	 */
+	if (context->task) {
+		get_cpu();
+		if (context->task == current)
+			wrmsrl(MSR_IA32_DS_AREA, ds);
+		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+		put_cpu();
+	} else
+		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+			     (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
 
 /*
  * Call the tracer's callback on a buffer overflow.
@@ -622,6 +644,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 	 * The value for 'no threshold' is -1, which will set the
 	 * threshold outside of the buffer, just like we want it.
 	 */
+	ith *= ds_cfg.sizeof_rec[qual];
 	trace->ith = (void *)(buffer + size - ith);
 
 	trace->flags = flags;
@@ -630,7 +653,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 
 static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 		      enum ds_qualifier qual, struct task_struct *task,
-		      void *base, size_t size, size_t th, unsigned int flags)
+		      int cpu, void *base, size_t size, size_t th)
 {
 	struct ds_context *context;
 	int error;
@@ -643,7 +666,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	if (!base)
 		goto out;
 
-	/* We require some space to do alignment adjustments below. */
+	/* We need space for alignment adjustments in ds_init_ds_trace(). */
 	error = -EINVAL;
 	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
 		goto out;
@@ -660,25 +683,27 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	tracer->size = size;
 
 	error = -ENOMEM;
-	context = ds_get_context(task);
+	context = ds_get_context(task, cpu);
 	if (!context)
 		goto out;
 	tracer->context = context;
 
-	ds_init_ds_trace(trace, qual, base, size, th, flags);
+	/*
+	 * Defer any tracer-specific initialization work for the context until
+	 * context ownership has been clarified.
+	 */
 
 	error = 0;
  out:
 	return error;
 }
 
-struct bts_tracer *ds_request_bts(struct task_struct *task,
-				  void *base, size_t size,
-				  bts_ovfl_callback_t ovfl, size_t th,
-				  unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+					 void *base, size_t size,
+					 bts_ovfl_callback_t ovfl, size_t th,
+					 unsigned int flags)
 {
 	struct bts_tracer *tracer;
-	unsigned long irq;
 	int error;
 
 	/* Buffer overflow notification is not yet implemented. */
@@ -690,42 +715,46 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	if (error < 0)
 		goto out;
 
-	/*
-	 * Per-cpu tracing is typically requested using smp_call_function().
-	 * We must not sleep.
-	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 	if (!tracer)
 		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
+	/* Do some more error checking and acquire a tracing context. */
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_bts, task, base, size, th, flags);
+			   ds_bts, task, cpu, base, size, th);
 	if (error < 0)
 		goto out_tracer;
 
-
-	spin_lock_irqsave(&ds_lock, irq);
+	/* Claim the bts part of the tracing context we acquired above. */
+	spin_lock_irq(&ds_lock);
 
 	error = -EPERM;
 	if (tracer->ds.context->bts_master)
 		goto out_unlock;
 	tracer->ds.context->bts_master = tracer;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
+	/*
+	 * Now that we own the bts part of the context, let's complete the
+	 * initialization for that part.
+	 */
+	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_install_ds_area(tracer->ds.context);
 
 	tracer->trace.read  = bts_read;
 	tracer->trace.write = bts_write;
 
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	/* Start tracing. */
 	ds_resume_bts(tracer);
 
 	return tracer;
 
  out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
@@ -735,13 +764,27 @@ struct bts_tracer *ds_request_bts(struct task_struct *task,
 	return ERR_PTR(error);
 }
 
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-				    void *base, size_t size,
-				    pebs_ovfl_callback_t ovfl, size_t th,
-				    unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+				       void *base, size_t size,
+				       bts_ovfl_callback_t ovfl,
+				       size_t th, unsigned int flags)
+{
+	return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+				      bts_ovfl_callback_t ovfl,
+				      size_t th, unsigned int flags)
+{
+	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+					   void *base, size_t size,
+					   pebs_ovfl_callback_t ovfl, size_t th,
+					   unsigned int flags)
 {
 	struct pebs_tracer *tracer;
-	unsigned long irq;
 	int error;
 
 	/* Buffer overflow notification is not yet implemented. */
@@ -753,37 +796,43 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	if (error < 0)
 		goto out;
 
-	/*
-	 * Per-cpu tracing is typically requested using smp_call_function().
-	 * We must not sleep.
-	 */
 	error = -ENOMEM;
-	tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC);
+	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 	if (!tracer)
 		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
+	/* Do some more error checking and acquire a tracing context. */
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_pebs, task, base, size, th, flags);
+			   ds_pebs, task, cpu, base, size, th);
 	if (error < 0)
 		goto out_tracer;
 
-	spin_lock_irqsave(&ds_lock, irq);
+	/* Claim the pebs part of the tracing context we acquired above. */
+	spin_lock_irq(&ds_lock);
 
 	error = -EPERM;
 	if (tracer->ds.context->pebs_master)
 		goto out_unlock;
 	tracer->ds.context->pebs_master = tracer;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
+	/*
+	 * Now that we own the pebs part of the context, let's complete the
+	 * initialization for that part.
+	 */
+	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
 	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+	ds_install_ds_area(tracer->ds.context);
+
+	/* Start tracing. */
 	ds_resume_pebs(tracer);
 
 	return tracer;
 
  out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
@@ -793,16 +842,26 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task,
 	return ERR_PTR(error);
 }
 
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+					 void *base, size_t size,
+					 pebs_ovfl_callback_t ovfl,
+					 size_t th, unsigned int flags)
 {
-	struct task_struct *task;
+	return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
 
-	if (!tracer)
-		return;
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+					pebs_ovfl_callback_t ovfl,
+					size_t th, unsigned int flags)
+{
+	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
 
-	task = tracer->ds.context->task;
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
 
-	ds_suspend_bts(tracer);
+	task = tracer->ds.context->task;
 
 	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
 	tracer->ds.context->bts_master = NULL;
@@ -817,9 +876,69 @@ void ds_release_bts(struct bts_tracer *tracer)
 	kfree(tracer);
 }
 
+void ds_release_bts(struct bts_tracer *tracer)
+{
+	might_sleep();
+
+	if (!tracer)
+		return;
+
+	ds_suspend_bts(tracer);
+	ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long irq;
+	int error;
+
+	if (!tracer)
+		return 0;
+
+	task = tracer->ds.context->task;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task &&
+	    (tracer->ds.context->cpu != smp_processor_id()))
+		goto out;
+
+	error = -EPERM;
+	if (task && (task != current))
+		goto out;
+
+	ds_suspend_bts_noirq(tracer);
+	ds_free_bts(tracer);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+				    unsigned long debugctlmsr)
+{
+	task->thread.debugctlmsr = debugctlmsr;
+
+	get_cpu();
+	if (task == current)
+		update_debugctlmsr(debugctlmsr);
+
+	if (task->thread.debugctlmsr)
+		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	else
+		clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
+	put_cpu();
+}
+
 void ds_suspend_bts(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
+	unsigned long debugctlmsr;
+	int cpu;
 
 	if (!tracer)
 		return;
@@ -827,29 +946,60 @@ void ds_suspend_bts(struct bts_tracer *tracer)
 	tracer->flags = 0;
 
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
 
-	if (!task || (task == current))
-		update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+	WARN_ON(!task && irqs_disabled());
 
-	if (task) {
-		task->thread.debugctlmsr &= ~BTS_CONTROL;
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr_on_cpu(cpu));
+	debugctlmsr &= ~BTS_CONTROL;
 
-		if (!task->thread.debugctlmsr)
-			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	}
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
 }
 
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
-	unsigned long control;
+	unsigned long debugctlmsr, irq;
+	int cpu, error = 0;
 
 	if (!tracer)
-		return;
+		return 0;
 
-	tracer->flags = tracer->trace.ds.flags;
+	tracer->flags = 0;
 
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task && (cpu != smp_processor_id()))
+		goto out;
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr());
+	debugctlmsr &= ~BTS_CONTROL;
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr(debugctlmsr);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+	unsigned long control;
 
 	control = ds_cfg.ctl[dsf_bts];
 	if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -857,25 +1007,77 @@ void ds_resume_bts(struct bts_tracer *tracer)
 	if (!(tracer->trace.ds.flags & BTS_USER))
 		control |= ds_cfg.ctl[dsf_bts_user];
 
-	if (task) {
-		task->thread.debugctlmsr |= control;
-		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	}
-
-	if (!task || (task == current))
-		update_debugctlmsr(get_debugctlmsr() | control);
+	return control;
 }
 
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
+	unsigned long debugctlmsr;
+	int cpu;
 
 	if (!tracer)
 		return;
 
+	tracer->flags = tracer->trace.ds.flags;
+
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
 
-	ds_suspend_pebs(tracer);
+	WARN_ON(!task && irqs_disabled());
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr_on_cpu(cpu));
+	debugctlmsr |= ds_bts_control(tracer);
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long debugctlmsr, irq;
+	int cpu, error = 0;
+
+	if (!tracer)
+		return 0;
+
+	tracer->flags = tracer->trace.ds.flags;
+
+	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task && (cpu != smp_processor_id()))
+		goto out;
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr());
+	debugctlmsr |= ds_bts_control(tracer);
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr(debugctlmsr);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+	struct task_struct *task;
+
+	task = tracer->ds.context->task;
 
 	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
 	tracer->ds.context->pebs_master = NULL;
@@ -886,16 +1088,68 @@ void ds_release_pebs(struct pebs_tracer *tracer)
 	kfree(tracer);
 }
 
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+	might_sleep();
+
+	if (!tracer)
+		return;
+
+	ds_suspend_pebs(tracer);
+	ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long irq;
+	int error;
+
+	if (!tracer)
+		return 0;
+
+	task = tracer->ds.context->task;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task &&
+	    (tracer->ds.context->cpu != smp_processor_id()))
+		goto out;
+
+	error = -EPERM;
+	if (task && (task != current))
+		goto out;
+
+	ds_suspend_pebs_noirq(tracer);
+	ds_free_pebs(tracer);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
 void ds_suspend_pebs(struct pebs_tracer *tracer)
 {
 
 }
 
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+	return 0;
+}
+
 void ds_resume_pebs(struct pebs_tracer *tracer)
 {
 
 }
 
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+	return 0;
+}
+
 const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
 {
 	if (!tracer)
@@ -1004,26 +1258,6 @@ ds_configure(const struct ds_configuration *cfg,
 		printk(KERN_INFO "[ds] pebs not available\n");
 	}
 
-	if (ds_cfg.sizeof_rec[ds_bts]) {
-		int error;
-
-		error = ds_selftest_bts();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling bts.\n");
-			ds_cfg.sizeof_rec[ds_bts] = 0;
-		}
-	}
-
-	if (ds_cfg.sizeof_rec[ds_pebs]) {
-		int error;
-
-		error = ds_selftest_pebs();
-		if (error) {
-			WARN(1, "[ds] selftest failed. disabling pebs.\n");
-			ds_cfg.sizeof_rec[ds_pebs] = 0;
-		}
-	}
-
 	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
 	       8 * ds_cfg.sizeof_ptr_field);
 	printk("bts/pebs record: %u/%u bytes\n",
@@ -1127,3 +1361,29 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
 void ds_exit_thread(struct task_struct *tsk)
 {
 }
+
+static __init int ds_selftest(void)
+{
+	if (ds_cfg.sizeof_rec[ds_bts]) {
+		int error;
+
+		error = ds_selftest_bts();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling bts.\n");
+			ds_cfg.sizeof_rec[ds_bts] = 0;
+		}
+	}
+
+	if (ds_cfg.sizeof_rec[ds_pebs]) {
+		int error;
+
+		error = ds_selftest_pebs();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling pebs.\n");
+			ds_cfg.sizeof_rec[ds_pebs] = 0;
+		}
+	}
+
+	return 0;
+}
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index 8c46fbf38c461..e5a263c8a14cd 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -10,11 +10,12 @@
 
 #include <linux/kernel.h>
 #include <linux/string.h>
+#include <linux/smp.h>
 
 #include <asm/ds.h>
 
 
-#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
+#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
 
 
 static int ds_selftest_bts_consistency(const struct bts_trace *trace)
@@ -125,12 +126,12 @@ int ds_selftest_bts(void)
 	struct bts_tracer *tracer;
 	int error = 0;
 	void *top;
-	unsigned char buffer[DS_SELFTEST_BUFFER_SIZE];
+	unsigned char buffer[BUFFER_SIZE];
 
 	printk(KERN_INFO "[ds] bts selftest...");
 
-	tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE,
-				NULL, (size_t)-1, BTS_KERNEL);
+	tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE,
+				    NULL, (size_t)-1, BTS_KERNEL);
 	if (IS_ERR(tracer)) {
 		error = PTR_ERR(tracer);
 		tracer = NULL;
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7c21d1e8cae79..adbb24322d8f2 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -800,8 +800,9 @@ static int ptrace_bts_config(struct task_struct *child,
 	if (cfg.flags & PTRACE_BTS_O_SCHED)
 		flags |= BTS_TIMESTAMPS;
 
-	context->tracer = ds_request_bts(child, context->buffer, context->size,
-					 NULL, (size_t)-1, flags);
+	context->tracer =
+		ds_request_bts_task(child, context->buffer, context->size,
+				    NULL, (size_t)-1, flags);
 	if (unlikely(IS_ERR(context->tracer))) {
 		int error = PTR_ERR(context->tracer);
 
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 8b2109a6c61cc..50565d8cd2ed9 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -4,7 +4,6 @@
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  */
-#include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
@@ -21,168 +20,113 @@
 
 #define BTS_BUFFER_SIZE (1 << 13)
 
-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
 
 static int trace_hw_branches_enabled __read_mostly;
 static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
-/*
- * Initialize the tracer for the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_init_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
 {
-	if (this_tracer)
-		ds_release_bts(this_tracer);
+	per_cpu(tracer, cpu) =
+		ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
 
-	this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE,
-				     NULL, (size_t)-1, BTS_KERNEL);
-	if (IS_ERR(this_tracer)) {
-		this_tracer = NULL;
-		return;
-	}
+	if (IS_ERR(per_cpu(tracer, cpu)))
+		per_cpu(tracer, cpu) = NULL;
 }
 
 static int bts_trace_init(struct trace_array *tr)
 {
-	int cpu, avail;
-
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
 	hw_branch_trace = tr;
+	trace_hw_branches_enabled = 0;
 
-	on_each_cpu(bts_trace_init_cpu, NULL, 1);
-
-	/* Check on how many cpus we could enable tracing */
-	avail = 0;
-	for_each_online_cpu(cpu)
-		if (per_cpu(tracer, cpu))
-			avail++;
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		bts_trace_init_cpu(cpu);
 
-	trace_hw_branches_enabled = (avail ? 1 : 0);
+		if (likely(per_cpu(tracer, cpu)))
+			trace_hw_branches_enabled = 1;
+	}
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-
+	put_online_cpus();
 
 	/* If we could not enable tracing on a single cpu, we fail. */
-	return avail ? 0 : -EOPNOTSUPP;
-}
-
-/*
- * Release the tracer for the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_release_cpu(void *arg)
-{
-	if (this_tracer) {
-		ds_release_bts(this_tracer);
-		this_tracer = NULL;
-	}
+	return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
 }
 
 static void bts_trace_reset(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_release_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
+	}
 	trace_hw_branches_enabled = 0;
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-}
-
-/*
- * Resume tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_resume_cpu(void *arg)
-{
-	if (this_tracer)
-		ds_resume_bts(this_tracer);
+	put_online_cpus();
 }
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_resume_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
 	trace_hw_branches_suspended = 0;
-
-	spin_unlock(&bts_tracer_lock);
-}
-
-/*
- * Suspend tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_suspend_cpu(void *arg)
-{
-	if (this_tracer)
-		ds_suspend_bts(this_tracer);
+	put_online_cpus();
 }
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
-	on_each_cpu(bts_trace_suspend_cpu, NULL, 1);
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
 	trace_hw_branches_suspended = 1;
-
-	spin_unlock(&bts_tracer_lock);
+	put_online_cpus();
 }
 
 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 				     unsigned long action, void *hcpu)
 {
-	unsigned int cpu = (unsigned long)hcpu;
-
-	spin_lock(&bts_tracer_lock);
-
-	if (!trace_hw_branches_enabled)
-		goto out;
+	int cpu = (long)hcpu;
 
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_DOWN_FAILED:
-		smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1);
-
-		if (trace_hw_branches_suspended)
-			smp_call_function_single(cpu, bts_trace_suspend_cpu,
-						 NULL, 1);
+		/* The notification is sent with interrupts enabled. */
+		if (trace_hw_branches_enabled) {
+			bts_trace_init_cpu(cpu);
+
+			if (trace_hw_branches_suspended &&
+			    likely(per_cpu(tracer, cpu)))
+				ds_suspend_bts(per_cpu(tracer, cpu));
+		}
 		break;
+
 	case CPU_DOWN_PREPARE:
-		smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1);
-		break;
+		/* The notification is sent with interrupts enabled. */
+		if (likely(per_cpu(tracer, cpu))) {
+			ds_release_bts(per_cpu(tracer, cpu));
+			per_cpu(tracer, cpu) = NULL;
+		}
 	}
 
- out:
-	spin_unlock(&bts_tracer_lock);
 	return NOTIFY_DONE;
 }
 
@@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
  */
 static void trace_bts_cpu(void *arg)
 {
@@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg)
 	if (unlikely(!this_tracer))
 		return;
 
-	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
-		goto out;
+		return;
 
 	for (at = trace->ds.top; (void *)at < trace->ds.end;
 	     at += trace->ds.size)
@@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg)
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     at += trace->ds.size)
 		trace_bts_at(trace, at);
-
-out:
-	ds_resume_bts(this_tracer);
 }
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	spin_lock(&bts_tracer_lock);
+	int cpu;
 
+	get_online_cpus();
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_suspend_bts(per_cpu(tracer, cpu));
+	/*
+	 * We need to collect the trace on the respective cpu since ftrace
+	 * implicitly adds the record for the current cpu.
+	 * Once that is more flexible, we could collect the data from any cpu.
+	 */
 	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	spin_unlock(&bts_tracer_lock);
+	for_each_online_cpu(cpu)
+		if (likely(per_cpu(tracer, cpu)))
+			ds_resume_bts(per_cpu(tracer, cpu));
+	put_online_cpus();
 }
 
 static void trace_bts_close(struct trace_iterator *iter)
@@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 void trace_hw_branch_oops(void)
 {
-	spin_lock(&bts_tracer_lock);
-
-	if (trace_hw_branches_enabled)
+	if (this_tracer) {
+		ds_suspend_bts_noirq(this_tracer);
 		trace_bts_cpu(hw_branch_trace);
-
-	spin_unlock(&bts_tracer_lock);
+		ds_resume_bts_noirq(this_tracer);
+	}
 }
 
 struct tracer bts_tracer __read_mostly =
-- 
GitLab


From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:41 +0200
Subject: [PATCH 0269/2198] x86, hw-branch-tracer: allocate selftest iterator
 on heap

Allocate the trace_iterator for the hw-branch-tracer selftest on the heap.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144556.578777000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 499d01c44cd17..00dd6485bdd7e 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -757,7 +757,7 @@ int
 trace_selftest_startup_hw_branches(struct tracer *trace,
 				   struct trace_array *tr)
 {
-	struct trace_iterator iter;
+	struct trace_iterator *iter;
 	struct tracer tracer;
 	unsigned long count;
 	int ret;
@@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace,
 	 * The hw-branch tracer needs to collect the trace from the various
 	 * cpu trace buffers - before tracing is stopped.
 	 */
-	memset(&iter, 0, sizeof(iter));
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (!iter)
+		return -ENOMEM;
+
 	memcpy(&tracer, trace, sizeof(tracer));
 
-	iter.trace = &tracer;
-	iter.tr = tr;
-	iter.pos = -1;
-	mutex_init(&iter.mutex);
+	iter->trace = &tracer;
+	iter->tr = tr;
+	iter->pos = -1;
+	mutex_init(&iter->mutex);
 
-	trace->open(&iter);
+	trace->open(iter);
 
-	mutex_destroy(&iter.mutex);
+	mutex_destroy(&iter->mutex);
+	kfree(iter);
 
 	tracing_stop();
 
-- 
GitLab


From 353afeea24cc51aafc0ff21a72ec740b6f0af50c Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:42 +0200
Subject: [PATCH 0270/2198] x86, ds: fix compiler warning

Size_t is defined differently on i386 and x86_64.
Change type to avoid compiler warning.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144557.523964000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index e5a263c8a14cd..e1ba5101b5765 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -87,7 +87,7 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer,
 	/* Now to the test itself. */
 	for (at = from; (void *)at < to; at += trace->ds.size) {
 		struct bts_struct bts;
-		size_t index;
+		unsigned long index;
 		int error;
 
 		if (((void *)at - trace->ds.begin) % trace->ds.size) {
-- 
GitLab


From 84f201139245c30777ff858e71b8d7e134b8c3ed Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:43 +0200
Subject: [PATCH 0271/2198] x86, ds: fix bounds check in ds selftest

Fix a bad bounds check in the debug store selftest.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144558.450027000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds_selftest.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index e1ba5101b5765..cccc19a38f6d0 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -47,8 +47,13 @@ static int ds_selftest_bts_consistency(const struct bts_trace *trace)
 		printk(KERN_CONT "bad bts buffer setup...");
 		error = -1;
 	}
+	/*
+	 * We allow top in [begin; end], since its not clear when the
+	 * overflow adjustment happens: after the increment or before the
+	 * write.
+	 */
 	if ((trace->ds.top < trace->ds.begin) ||
-	    (trace->ds.end <= trace->ds.top)) {
+	    (trace->ds.end < trace->ds.top)) {
 		printk(KERN_CONT "bts top out of bounds...");
 		error = -1;
 	}
-- 
GitLab


From 01f6569ece6915616f6cae1d7d8b46ab8da9c1bd Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:44 +0200
Subject: [PATCH 0272/2198] x86, ds: selftest each cpu

Perform debug store selftests on each cpu.

Cover both the normal and the _noirq variant of the debug store interface.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144559.394583000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds_selftest.c | 182 +++++++++++++++++++++++++---------
 1 file changed, 135 insertions(+), 47 deletions(-)

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index cccc19a38f6d0..599a96300628a 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -11,13 +11,21 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/smp.h>
+#include <linux/cpu.h>
 
 #include <asm/ds.h>
 
 
-#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */
+#define BUFFER_SIZE	521 /* Intentionally chose an odd size. */
 
 
+struct ds_selftest_bts_conf {
+	struct bts_tracer *tracer;
+	int error;
+	int (*suspend)(struct bts_tracer *);
+	int (*resume)(struct bts_tracer *);
+};
+
 static int ds_selftest_bts_consistency(const struct bts_trace *trace)
 {
 	int error = 0;
@@ -125,36 +133,32 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer,
 	return 0;
 }
 
-int ds_selftest_bts(void)
+static void ds_selftest_bts_cpu(void *arg)
 {
+	struct ds_selftest_bts_conf *conf = arg;
 	const struct bts_trace *trace;
-	struct bts_tracer *tracer;
-	int error = 0;
 	void *top;
-	unsigned char buffer[BUFFER_SIZE];
 
-	printk(KERN_INFO "[ds] bts selftest...");
-
-	tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE,
-				    NULL, (size_t)-1, BTS_KERNEL);
-	if (IS_ERR(tracer)) {
-		error = PTR_ERR(tracer);
-		tracer = NULL;
+	if (IS_ERR(conf->tracer)) {
+		conf->error = PTR_ERR(conf->tracer);
+		conf->tracer = NULL;
 
 		printk(KERN_CONT
-		       "initialization failed (err: %d)...", error);
-		goto out;
+		       "initialization failed (err: %d)...", conf->error);
+		return;
 	}
 
-	/* The return should already give us enough trace. */
-	ds_suspend_bts(tracer);
+	/* We should meanwhile have enough trace. */
+	conf->error = conf->suspend(conf->tracer);
+	if (conf->error < 0)
+		return;
 
 	/* Let's see if we can access the trace. */
-	trace = ds_read_bts(tracer);
+	trace = ds_read_bts(conf->tracer);
 
-	error = ds_selftest_bts_consistency(trace);
-	if (error < 0)
-		goto out;
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
 
 	/* If everything went well, we should have a few trace entries. */
 	if (trace->ds.top == trace->ds.begin) {
@@ -168,10 +172,11 @@ int ds_selftest_bts(void)
 	}
 
 	/* Let's try to read the trace we collected. */
-	error = ds_selftest_bts_read(tracer, trace,
+	conf->error =
+		ds_selftest_bts_read(conf->tracer, trace,
 				     trace->ds.begin, trace->ds.top);
-	if (error < 0)
-		goto out;
+	if (conf->error < 0)
+		return;
 
 	/*
 	 * Let's read the trace again.
@@ -179,26 +184,31 @@ int ds_selftest_bts(void)
 	 */
 	top = trace->ds.top;
 
-	trace = ds_read_bts(tracer);
-	error = ds_selftest_bts_consistency(trace);
-	if (error < 0)
-		goto out;
+	trace = ds_read_bts(conf->tracer);
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
 
 	if (top != trace->ds.top) {
 		printk(KERN_CONT "suspend not working...");
-		error = -1;
-		goto out;
+		conf->error = -1;
+		return;
 	}
 
 	/* Let's collect some more trace - see if resume is working. */
-	ds_resume_bts(tracer);
-	ds_suspend_bts(tracer);
+	conf->error = conf->resume(conf->tracer);
+	if (conf->error < 0)
+		return;
+
+	conf->error = conf->suspend(conf->tracer);
+	if (conf->error < 0)
+		return;
 
-	trace = ds_read_bts(tracer);
+	trace = ds_read_bts(conf->tracer);
 
-	error = ds_selftest_bts_consistency(trace);
-	if (error < 0)
-		goto out;
+	conf->error = ds_selftest_bts_consistency(trace);
+	if (conf->error < 0)
+		return;
 
 	if (trace->ds.top == top) {
 		/*
@@ -210,35 +220,113 @@ int ds_selftest_bts(void)
 		printk(KERN_CONT
 		       "no resume progress/overflow...");
 
-		error = ds_selftest_bts_read(tracer, trace,
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace,
 					     trace->ds.begin, trace->ds.end);
 	} else if (trace->ds.top < top) {
 		/*
 		 * We had a buffer overflow - the entire buffer should
 		 * contain trace records.
 		 */
-		error = ds_selftest_bts_read(tracer, trace,
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace,
 					     trace->ds.begin, trace->ds.end);
 	} else {
 		/*
 		 * It is quite likely that the buffer did not overflow.
 		 * Let's just check the delta trace.
 		 */
-		error = ds_selftest_bts_read(tracer, trace,
-					     top, trace->ds.top);
+		conf->error =
+			ds_selftest_bts_read(conf->tracer, trace, top,
+					     trace->ds.top);
 	}
-	if (error < 0)
-		goto out;
+	if (conf->error < 0)
+		return;
 
-	error = 0;
+	conf->error = 0;
+}
 
-	/* The final test: release the tracer while tracing is suspended. */
- out:
-	ds_release_bts(tracer);
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+	ds_suspend_bts(tracer);
+	return 0;
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+	ds_resume_bts(tracer);
+	return 0;
+}
 
-	printk(KERN_CONT "%s.\n", (error ? "failed" : "passed"));
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+	(void)ds_release_bts_noirq(tracer);
+}
 
-	return error;
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+					     struct bts_tracer *tracer)
+{
+	int error = -EPERM;
+
+	/* Try to release the tracer on the wrong cpu. */
+	get_cpu();
+	if (cpu != smp_processor_id()) {
+		error = ds_release_bts_noirq(tracer);
+		if (error != -EPERM)
+			printk(KERN_CONT "release on wrong cpu...");
+	}
+	put_cpu();
+
+	return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+	struct ds_selftest_bts_conf conf;
+	unsigned char buffer[BUFFER_SIZE];
+	int cpu;
+
+	printk(KERN_INFO "[ds] bts selftest...");
+	conf.error = 0;
+
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		conf.suspend = ds_suspend_bts_wrap;
+		conf.resume = ds_resume_bts_wrap;
+		conf.tracer =
+			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+					   NULL, (size_t)-1, BTS_KERNEL);
+		ds_selftest_bts_cpu(&conf);
+		ds_release_bts(conf.tracer);
+		if (conf.error < 0)
+			goto out;
+
+		conf.suspend = ds_suspend_bts_noirq;
+		conf.resume = ds_resume_bts_noirq;
+		conf.tracer =
+			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+					   NULL, (size_t)-1, BTS_KERNEL);
+		smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+		if (conf.error >= 0) {
+			conf.error =
+				ds_selftest_bts_bad_release_noirq(cpu,
+								  conf.tracer);
+			/* We must not release the tracer twice. */
+			if (conf.error < 0)
+				conf.tracer = NULL;
+		}
+		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+					 conf.tracer, 1);
+		if (conf.error < 0)
+			goto out;
+	}
+
+	conf.error = 0;
+ out:
+	put_online_cpus();
+	printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+	return conf.error;
 }
 
 int ds_selftest_pebs(void)
-- 
GitLab


From 3a68eef945b234f286406d96dc690fe17863c203 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:45 +0200
Subject: [PATCH 0273/2198] x86, ds: add task tracing selftest

Add selftests to cover per-task branch tracing.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144600.329346000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds_selftest.c | 71 +++++++++++++++++++++++++++++++++++
 1 file changed, 71 insertions(+)

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index 599a96300628a..a40b2533c71e6 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -280,10 +280,51 @@ static int ds_selftest_bts_bad_release_noirq(int cpu,
 	return error ? 0 : -1;
 }
 
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+	struct bts_tracer *tracer;
+	int error;
+
+	/* Try to request cpu tracing while task tracing is active. */
+	tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+				    (size_t)-1, BTS_KERNEL);
+	error = PTR_ERR(tracer);
+	if (!IS_ERR(tracer)) {
+		ds_release_bts(tracer);
+		error = 0;
+	}
+
+	if (error != -EPERM)
+		printk(KERN_CONT "cpu/task tracing overlap...");
+
+	return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+	struct bts_tracer *tracer;
+	int error;
+
+	/* Try to request cpu tracing while task tracing is active. */
+	tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+				    (size_t)-1, BTS_KERNEL);
+	error = PTR_ERR(tracer);
+	if (!IS_ERR(tracer)) {
+		error = 0;
+		ds_release_bts(tracer);
+	}
+
+	if (error != -EPERM)
+		printk(KERN_CONT "task/cpu tracing overlap...");
+
+	return error ? 0 : -1;
+}
+
 int ds_selftest_bts(void)
 {
 	struct ds_selftest_bts_conf conf;
 	unsigned char buffer[BUFFER_SIZE];
+	unsigned long irq;
 	int cpu;
 
 	printk(KERN_INFO "[ds] bts selftest...");
@@ -297,6 +338,8 @@ int ds_selftest_bts(void)
 			ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
 					   NULL, (size_t)-1, BTS_KERNEL);
 		ds_selftest_bts_cpu(&conf);
+		if (conf.error >= 0)
+			conf.error = ds_selftest_bts_bad_request_task(buffer);
 		ds_release_bts(conf.tracer);
 		if (conf.error < 0)
 			goto out;
@@ -315,12 +358,40 @@ int ds_selftest_bts(void)
 			if (conf.error < 0)
 				conf.tracer = NULL;
 		}
+		if (conf.error >= 0)
+			conf.error = ds_selftest_bts_bad_request_task(buffer);
 		smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
 					 conf.tracer, 1);
 		if (conf.error < 0)
 			goto out;
 	}
 
+	conf.suspend = ds_suspend_bts_wrap;
+	conf.resume = ds_resume_bts_wrap;
+	conf.tracer =
+		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+				    NULL, (size_t)-1, BTS_KERNEL);
+	ds_selftest_bts_cpu(&conf);
+	if (conf.error >= 0)
+		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+	ds_release_bts(conf.tracer);
+	if (conf.error < 0)
+		goto out;
+
+	conf.suspend = ds_suspend_bts_noirq;
+	conf.resume = ds_resume_bts_noirq;
+	conf.tracer =
+		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+				   NULL, (size_t)-1, BTS_KERNEL);
+	local_irq_save(irq);
+	ds_selftest_bts_cpu(&conf);
+	if (conf.error >= 0)
+		conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+	ds_release_bts_noirq(conf.tracer);
+	local_irq_restore(irq);
+	if (conf.error < 0)
+		goto out;
+
 	conf.error = 0;
  out:
 	put_online_cpus();
-- 
GitLab


From 2311f0de21c17b2a8b960677a9cccfbfa52beb35 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:46 +0200
Subject: [PATCH 0274/2198] x86, ds: add leakage warning

Add a warning in case a debug store context is not removed before
the task it is attached to is freed.

Remove the old warning at thread exit. It is too early.

Declare the debug store context field in thread_struct unconditionally.

Remove ds_copy_thread() and ds_exit_thread() and do the work directly
in process*.c.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144601.254472000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h        |  9 ---------
 arch/x86/include/asm/processor.h |  4 +---
 arch/x86/kernel/ds.c             | 10 ----------
 arch/x86/kernel/process.c        |  5 +++--
 arch/x86/kernel/process_32.c     |  3 ++-
 arch/x86/kernel/process_64.c     |  3 ++-
 6 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 413e127e567d7..149e5208e9670 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -285,21 +285,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
  */
 extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
 #else /* CONFIG_X86_DS */
 
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void ds_switch_to(struct task_struct *prev,
 				struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
-				  struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
 
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 1efeb497f1f90..7c39de7e709a9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -454,10 +454,8 @@ struct thread_struct {
 	unsigned		io_bitmap_max;
 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
 	unsigned long	debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+	/* Debug Store context; see asm/ds.h */
 	struct ds_context	*ds_ctx;
-#endif /* CONFIG_X86_DS */
 };
 
 static inline unsigned long native_get_debugreg(int regno)
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 21a3852abf685..71cab3b62dce1 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -1352,16 +1352,6 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 	update_debugctlmsr(debugctlmsr);
 }
 
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
-{
-	clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
-	tsk->thread.ds_ctx = NULL;
-}
-
-void ds_exit_thread(struct task_struct *tsk)
-{
-}
-
 static __init int ds_selftest(void)
 {
 	if (ds_cfg.sizeof_rec[ds_bts]) {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e8479..fb5dfb891f0fb 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/ds.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -45,6 +46,8 @@ void free_thread_xstate(struct task_struct *tsk)
 		kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
 		tsk->thread.xstate = NULL;
 	}
+
+	WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -83,8 +86,6 @@ void exit_thread(void)
 		put_cpu();
 		kfree(bp);
 	}
-
-	ds_exit_thread(current);
 }
 
 void flush_thread(void)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2a..b5e4bfef44722 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -290,7 +290,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 		p->thread.io_bitmap_max = 0;
 	}
 
-	ds_copy_thread(p, current);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b1b..5a1a1de292ec8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -335,7 +335,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 			goto out;
 	}
 
-	ds_copy_thread(p, me);
+	clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+	p->thread.ds_ctx = NULL;
 
 	clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
 	p->thread.debugctlmsr = 0;
-- 
GitLab


From ee811517a5604aa63fae803b7c044712699e1303 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:47 +0200
Subject: [PATCH 0275/2198] x86, ds: use single debug store cpu configuration

Use a single configuration for all cpus.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144602.191165000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 71cab3b62dce1..443f415441dae 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -47,9 +47,8 @@ struct ds_configuration {
 	/* Control bit-masks indexed by enum ds_feature: */
 	unsigned long		ctl[dsf_ctl_max];
 };
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
 
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
 
 /* Maximal size of a DS configuration: */
 #define MAX_SIZEOF_DS		(12 * 8)
@@ -1268,6 +1267,10 @@ ds_configure(const struct ds_configuration *cfg,
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 {
+	/* Only configure the first cpu. Others are identical. */
+	if (ds_cfg.name)
+		return;
+
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
-- 
GitLab


From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:48 +0200
Subject: [PATCH 0276/2198] x86, ptrace: add bts context unconditionally

Add the ptrace bts context field to task_struct unconditionally.

Initialize the field directly in copy_process().
Remove all the unneeded functionality used to initialize that field.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144603.292754000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ptrace.h |  9 ++++-----
 arch/x86/kernel/ptrace.c      | 20 +-------------------
 include/linux/ptrace.h        | 10 ----------
 include/linux/sched.h         |  2 --
 kernel/fork.c                 |  4 ++--
 kernel/ptrace.c               | 10 ----------
 6 files changed, 7 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index e304b66abeea6..5cdd19f20b5b7 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -235,12 +235,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
 			      struct user_desc __user *info, int can_allocate);
 
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
-			    unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
 
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk)	ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 #endif /* __KERNEL__ */
 
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index adbb24322d8f2..b32a8ee533816 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child)
 	return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-static inline void ptrace_bts_fork(struct task_struct *tsk)
-{
-	tsk->bts = NULL;
-}
-
 /*
  * Called from __ptrace_unlink() after the child has been moved back
  * to its original parent.
  */
-static inline void ptrace_bts_untrace(struct task_struct *child)
+void ptrace_bts_untrace(struct task_struct *child)
 {
 	if (unlikely(child->bts)) {
 		free_bts_context(child->bts);
 		child->bts = NULL;
 	}
 }
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-	ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
-	ptrace_bts_untrace(child);
-}
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 67c15653fc230..59e133d39d509 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void exit_ptrace(struct task_struct *tracer);
-extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
@@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_untrace(task)		do { } while (0)
 #endif
 
-#ifndef arch_ptrace_fork
-/*
- * Do machine-specific work to initialize a new task.
- *
- * This is called from copy_process().
- */
-#define arch_ptrace_fork(child, clone_flags)	do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
 				unsigned long args[6], unsigned int maxargs,
 				unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 52b8cd049c2e0..451186a22ef55 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1205,13 +1205,11 @@ struct task_struct {
 	struct list_head ptraced;
 	struct list_head ptrace_entry;
 
-#ifdef CONFIG_X86_PTRACE_BTS
 	/*
 	 * This is the tracer handle for the ptrace BTS extension.
 	 * This field actually belongs to the ptracer task.
 	 */
 	struct bts_context *bts;
-#endif /* CONFIG_X86_PTRACE_BTS */
 
 	/* PID/PID hash table linkage. */
 	struct pid_link pids[PIDTYPE_MAX];
diff --git a/kernel/fork.c b/kernel/fork.c
index 660c2b8765bce..69bde7a22e9bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-	if (unlikely(current->ptrace))
-		ptrace_fork(p, clone_flags);
+
+	p->bts = NULL;
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index aaad0ec341948..321127d965c29 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -26,16 +26,6 @@
 #include <asm/uaccess.h>
 
 
-/*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-	arch_ptrace_fork(child, clone_flags);
-}
-
 /*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
-- 
GitLab


From 6047550d3d26fed88b18a208b31f8b90b5ef3e9b Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:49 +0200
Subject: [PATCH 0277/2198] x86, ds: dont use TIF_DEBUGCTLMSR

Debug store already uses TIF_DS_AREA_MSR to trigger debug store context
switch handling. No need to use TIF_DEBUGCTLMSR, as well.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144604.256645000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 443f415441dae..cab28320dac73 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -925,11 +925,6 @@ static void update_task_debugctlmsr(struct task_struct *task,
 	get_cpu();
 	if (task == current)
 		update_debugctlmsr(debugctlmsr);
-
-	if (task->thread.debugctlmsr)
-		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	else
-		clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
 	put_cpu();
 }
 
-- 
GitLab


From 608780a9048efa3e85fbc4d8649b26805cc588aa Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:50 +0200
Subject: [PATCH 0278/2198] x86, ds: fix bad ds_reset_pebs()

Ds_reset_pebs() passed the wrong qualifier to a shared function resulting
in a reset of bts, rather than pebs.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144605.206510000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index cab28320dac73..ebfb0fde8e6f9 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -1186,7 +1186,7 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
 
 	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
 	       (unsigned long)tracer->trace.ds.top);
 
 	return 0;
-- 
GitLab


From 150f5164c1258e05b7dea16f29e592f354c48f34 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:51 +0200
Subject: [PATCH 0279/2198] x86, ds: allow small debug store buffers

Check the buffer size more precisely to allow buffers for exactly
one element provided the base address is already properly aligned.

Add a debug store selftest.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144606.139137000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds.c          | 9 +++++++--
 arch/x86/kernel/ds_selftest.c | 6 +++---
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ebfb0fde8e6f9..4e05157506aa8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -656,6 +656,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 {
 	struct ds_context *context;
 	int error;
+	size_t req_size;
 
 	error = -EOPNOTSUPP;
 	if (!ds_cfg.sizeof_rec[qual])
@@ -665,9 +666,13 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	if (!base)
 		goto out;
 
-	/* We need space for alignment adjustments in ds_init_ds_trace(). */
+	req_size = ds_cfg.sizeof_rec[qual];
+	/* We might need space for alignment adjustments. */
+	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+		req_size += DS_ALIGNMENT;
+
 	error = -EINVAL;
-	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+	if (size < req_size)
 		goto out;
 
 	if (th != (size_t)-1) {
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index a40b2533c71e6..5f104a0ace666 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -16,8 +16,8 @@
 #include <asm/ds.h>
 
 
-#define BUFFER_SIZE	521 /* Intentionally chose an odd size. */
-
+#define BUFFER_SIZE		521	/* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE	24	/* A single bts entry. */
 
 struct ds_selftest_bts_conf {
 	struct bts_tracer *tracer;
@@ -381,7 +381,7 @@ int ds_selftest_bts(void)
 	conf.suspend = ds_suspend_bts_noirq;
 	conf.resume = ds_resume_bts_noirq;
 	conf.tracer =
-		ds_request_bts_task(current, buffer, BUFFER_SIZE,
+		ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE,
 				   NULL, (size_t)-1, BTS_KERNEL);
 	local_irq_save(irq);
 	ds_selftest_bts_cpu(&conf);
-- 
GitLab


From 017bc617657c928cb9a0c45a7a7e9f4e66695347 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 3 Apr 2009 16:43:52 +0200
Subject: [PATCH 0280/2198] x86, ds: support Core i7

Add debug store support for Core i7.

Core i7 adds a reset value for each performance counter and a new
PEBS record format.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: roland@redhat.com
Cc: eranian@googlemail.com
Cc: oleg@redhat.com
Cc: juan.villacis@intel.com
Cc: ak@linux.jf.intel.com
LKML-Reference: <20090403144607.088997000@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/ds.h | 12 +++++--
 arch/x86/kernel/ds.c      | 69 +++++++++++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
index 149e5208e9670..70dac199b093d 100644
--- a/arch/x86/include/asm/ds.h
+++ b/arch/x86/include/asm/ds.h
@@ -234,8 +234,12 @@ struct bts_trace {
 struct pebs_trace {
 	struct ds_trace ds;
 
-	/* the PEBS reset value */
-	unsigned long long reset_value;
+	/* the number of valid counters in the below array */
+	unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+	/* the counter reset value */
+	unsigned long long counter_reset[MAX_PEBS_COUNTERS];
 };
 
 
@@ -270,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
  * Returns 0 on success; -Eerrno on error
  *
  * tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+			     unsigned int counter, u64 value);
 
 /*
  * Initialization
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 4e05157506aa8..48bfe1386038c 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -44,6 +44,9 @@ struct ds_configuration {
 	/* The size of a BTS/PEBS record in bytes: */
 	unsigned char		sizeof_rec[2];
 
+	/* The number of pebs counter reset values in the DS structure. */
+	unsigned char		nr_counter_reset;
+
 	/* Control bit-masks indexed by enum ds_feature: */
 	unsigned long		ctl[dsf_ctl_max];
 };
@@ -51,7 +54,7 @@ static struct ds_configuration ds_cfg __read_mostly;
 
 
 /* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS		(12 * 8)
+#define MAX_SIZEOF_DS		0x80
 
 /* Maximal size of a BTS record: */
 #define MAX_SIZEOF_BTS		(3 * 8)
@@ -59,6 +62,12 @@ static struct ds_configuration ds_cfg __read_mostly;
 /* BTS and PEBS buffer alignment: */
 #define DS_ALIGNMENT		(1 << 3)
 
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS	8
+
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE	8
+
 /* Mask of control bits in the DS MSR register: */
 #define BTS_CONTROL				  \
 	( ds_cfg.ctl[dsf_bts]			| \
@@ -1164,9 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 		return NULL;
 
 	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-	tracer->trace.reset_value =
-		*(u64 *)(tracer->ds.context->ds +
-			 (ds_cfg.sizeof_ptr_field * 8));
+
+	tracer->trace.counters = ds_cfg.nr_counter_reset;
+	memcpy(tracer->trace.counter_reset,
+	       tracer->ds.context->ds +
+	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
 
 	return &tracer->trace;
 }
@@ -1197,13 +1209,18 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
 	return 0;
 }
 
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+		      unsigned int counter, u64 value)
 {
 	if (!tracer)
 		return -EINVAL;
 
+	if (ds_cfg.nr_counter_reset < counter)
+		return -EINVAL;
+
 	*(u64 *)(tracer->ds.context->ds +
-		 (ds_cfg.sizeof_ptr_field * 8)) = value;
+		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+		 (counter * PEBS_RESET_FIELD_SIZE)) = value;
 
 	return 0;
 }
@@ -1213,16 +1230,26 @@ static const struct ds_configuration ds_cfg_netburst = {
 	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
 	.ctl[dsf_bts_kernel]	= (1 << 5),
 	.ctl[dsf_bts_user]	= (1 << 6),
+	.nr_counter_reset	= 1,
 };
 static const struct ds_configuration ds_cfg_pentium_m = {
 	.name = "Pentium M",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.nr_counter_reset	= 1,
 };
 static const struct ds_configuration ds_cfg_core2_atom = {
 	.name = "Core 2/Atom",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
 	.ctl[dsf_bts_kernel]	= (1 << 9),
 	.ctl[dsf_bts_user]	= (1 << 10),
+	.nr_counter_reset	= 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+	.name = "Core i7",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.ctl[dsf_bts_kernel]	= (1 << 9),
+	.ctl[dsf_bts_user]	= (1 << 10),
+	.nr_counter_reset	= 4,
 };
 
 static void
@@ -1239,6 +1266,32 @@ ds_configure(const struct ds_configuration *cfg,
 	nr_pebs_fields = 18;
 #endif
 
+	/*
+	 * Starting with version 2, architectural performance
+	 * monitoring supports a format specifier.
+	 */
+	if ((cpuid_eax(0xa) & 0xff) > 1) {
+		unsigned long perf_capabilities, format;
+
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+		format = (perf_capabilities >> 8) & 0xf;
+
+		switch (format) {
+		case 0:
+			nr_pebs_fields = 18;
+			break;
+		case 1:
+			nr_pebs_fields = 22;
+			break;
+		default:
+			printk(KERN_INFO
+			       "[ds] unknown PEBS format: %lu\n", format);
+			nr_pebs_fields = 0;
+			break;
+		}
+	}
+
 	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
 
@@ -1262,7 +1315,7 @@ ds_configure(const struct ds_configuration *cfg,
 	printk("bts/pebs record: %u/%u bytes\n",
 	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
-	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field));
+	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
@@ -1284,6 +1337,8 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 			ds_configure(&ds_cfg_core2_atom, c);
 			break;
 		case 0x1a: /* Core i7 */
+			ds_configure(&ds_cfg_core_i7, c);
+			break;
 		default:
 			/* Sorry, don't know about them. */
 			break;
-- 
GitLab


From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 14:55:44 +0800
Subject: [PATCH 0281/2198] tracing: use macros to denote usec and nsec per
 second

Impact: cleanup

Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <49CC7870.9000309@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_boot.c      | 5 +++--
 kernel/trace/trace_mmiotrace.c | 6 ++++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7a30fc4c36423..a29ef23ffb470 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
+#include <linux/time.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	call = &field->boot_call;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
 			(unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
 	trace_assign_type(field, entry);
 	init_ret = &field->boot_ret;
 	ts = iter->ts;
-	nsec_rem = do_div(ts, 1000000000);
+	nsec_rem = do_div(ts, NSEC_PER_SEC);
 
 	ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
 			"returned %d after %llu msecs\n",
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 8e37fcddd8b49..d53b45ed08062 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,8 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <linux/time.h>
+
 #include <asm/atomic.h>
 
 #include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
 	struct mmiotrace_rw *rw;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret = 1;
 
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
 	struct mmiotrace_map *m;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret;
 
-- 
GitLab


From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Mar 2009 00:25:38 +0100
Subject: [PATCH 0282/2198] tracing/ftrace: factorize the tracing files
 creation

Impact: cleanup

Most of the tracing files creation follow the same pattern:

ret = debugfs_create_file(...)
if (!ret)
	pr_warning("Couldn't create ... entry\n")

Unify it!

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c              |  39 ++-----
 kernel/trace/ring_buffer.c         |   7 +-
 kernel/trace/trace.c               | 159 +++++++++++------------------
 kernel/trace/trace.h               |   6 ++
 kernel/trace/trace_event_profile.c |   1 -
 kernel/trace/trace_printk.c        |   6 +-
 kernel/trace/trace_stack.c         |  13 +--
 kernel/trace/trace_sysprof.c       |   6 +-
 8 files changed, 86 insertions(+), 151 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 678e3d6caf85a..6ea5a1ae6a987 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = {
 
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("available_filter_functions", 0444,
-				    d_tracer, NULL, &ftrace_avail_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'available_filter_functions' entry\n");
+	trace_create_file("available_filter_functions", 0444,
+			d_tracer, NULL, &ftrace_avail_fops);
 
-	entry = debugfs_create_file("failures", 0444,
-				    d_tracer, NULL, &ftrace_failures_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'failures' entry\n");
+	trace_create_file("failures", 0444,
+			d_tracer, NULL, &ftrace_failures_fops);
 
-	entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
-				    NULL, &ftrace_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_filter' entry\n");
+	trace_create_file("set_ftrace_filter", 0644, d_tracer,
+			NULL, &ftrace_filter_fops);
 
-	entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+	trace_create_file("set_ftrace_notrace", 0644, d_tracer,
 				    NULL, &ftrace_notrace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_notrace' entry\n");
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-	entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+	trace_create_file("set_graph_function", 0444, d_tracer,
 				    NULL,
 				    &ftrace_graph_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
 	return 0;
@@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = {
 static __init int ftrace_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void)
 
 	ftrace_init_dyn_debugfs(d_tracer);
 
-	entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
-				    NULL, &ftrace_pid_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'set_ftrace_pid' entry\n");
+	trace_create_file("set_ftrace_pid", 0644, d_tracer,
+			    NULL, &ftrace_pid_fops);
 
 	ftrace_profile_debugfs(d_tracer);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 960cbf44c844a..74a11808c282e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = {
 static __init int rb_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-				    &ring_buffer_flags, &rb_simple_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_on' entry\n");
+	trace_create_file("tracing_on", 0644, d_tracer,
+			    &ring_buffer_flags, &rb_simple_fops);
 
 	return 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 32653c8c6e261..0615751a3ed7a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
-	struct dentry *entry, *d_cpu;
+	struct dentry *d_cpu;
 	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
 	char cpu_dir[7];
 
@@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu)
 	}
 
 	/* per cpu trace_pipe */
-	entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
-				(void *) cpu, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+	trace_create_file("trace_pipe", 0444, d_cpu,
+			(void *) cpu, &tracing_pipe_fops);
 
 	/* per cpu trace */
-	entry = debugfs_create_file("trace", 0644, d_cpu,
-				(void *) cpu, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
+	trace_create_file("trace", 0644, d_cpu,
+			(void *) cpu, &tracing_fops);
 
-	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
-				    (void *) cpu, &tracing_buffers_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+	trace_create_file("trace_pipe_raw", 0444, d_cpu,
+			(void *) cpu, &tracing_buffers_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = {
 	.write = trace_options_core_write,
 };
 
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops)
+{
+	struct dentry *ret;
+
+	ret = debugfs_create_file(name, mode, parent, data, fops);
+	if (!ret)
+		pr_warning("Could not create debugfs '%s' entry\n", name);
+
+	return ret;
+}
+
+
 static struct dentry *trace_options_init_dentry(void)
 {
 	struct dentry *d_tracer;
@@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
 			 struct tracer_opt *opt)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
@@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
 	topt->flags = flags;
 	topt->opt = opt;
 
-	entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+	topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
 				    &trace_options_fops);
 
-	topt->entry = entry;
-
 }
 
 static struct trace_option_dentry *
@@ -3861,123 +3868,81 @@ static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 		return NULL;
 
-	entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+	return trace_create_file(option, 0644, t_options, (void *)index,
 				    &trace_options_core_fops);
-
-	return entry;
 }
 
 static __init void create_trace_options_dir(void)
 {
 	struct dentry *t_options;
-	struct dentry *entry;
 	int i;
 
 	t_options = trace_options_init_dentry();
 	if (!t_options)
 		return;
 
-	for (i = 0; trace_options[i]; i++) {
-		entry = create_trace_option_core_file(trace_options[i], i);
-		if (!entry)
-			pr_warning("Could not create debugfs %s entry\n",
-				   trace_options[i]);
-	}
+	for (i = 0; trace_options[i]; i++)
+		create_trace_option_core_file(trace_options[i], i);
 }
 
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 	int cpu;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
-				    &global_trace, &tracing_ctrl_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+	trace_create_file("tracing_enabled", 0644, d_tracer,
+			&global_trace, &tracing_ctrl_fops);
 
-	entry = debugfs_create_file("trace_options", 0644, d_tracer,
-				    NULL, &tracing_iter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace_options' entry\n");
+	trace_create_file("trace_options", 0644, d_tracer,
+			NULL, &tracing_iter_fops);
 
-	create_trace_options_dir();
+	trace_create_file("tracing_cpumask", 0644, d_tracer,
+			NULL, &tracing_cpumask_fops);
+
+	trace_create_file("trace", 0644, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+
+	trace_create_file("available_tracers", 0444, d_tracer,
+			&global_trace, &show_traces_fops);
+
+	trace_create_file("current_tracer", 0444, d_tracer,
+			&global_trace, &set_tracer_fops);
+
+	trace_create_file("tracing_max_latency", 0644, d_tracer,
+			&tracing_max_latency, &tracing_max_lat_fops);
+
+	trace_create_file("tracing_thresh", 0644, d_tracer,
+			&tracing_thresh, &tracing_max_lat_fops);
 
-	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
-				    NULL, &tracing_cpumask_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
-	entry = debugfs_create_file("trace", 0644, d_tracer,
-				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'trace' entry\n");
-
-	entry = debugfs_create_file("available_tracers", 0444, d_tracer,
-				    &global_trace, &show_traces_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
-	entry = debugfs_create_file("current_tracer", 0444, d_tracer,
-				    &global_trace, &set_tracer_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
-	entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
-				    &tracing_max_latency,
-				    &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_max_latency' entry\n");
-
-	entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
-				    &tracing_thresh, &tracing_max_lat_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'tracing_thresh' entry\n");
-	entry = debugfs_create_file("README", 0644, d_tracer,
-				    NULL, &tracing_readme_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'README' entry\n");
-
-	entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+	trace_create_file("README", 0644, d_tracer,
+			NULL, &tracing_readme_fops);
+
+	trace_create_file("trace_pipe", 0444, d_tracer,
 			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_pipe' entry\n");
-
-	entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
-				    &global_trace, &tracing_entries_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'buffer_size_kb' entry\n");
-
-	entry = debugfs_create_file("trace_marker", 0220, d_tracer,
-				    NULL, &tracing_mark_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'trace_marker' entry\n");
+
+	trace_create_file("buffer_size_kb", 0644, d_tracer,
+			&global_trace, &tracing_entries_fops);
+
+	trace_create_file("trace_marker", 0220, d_tracer,
+			NULL, &tracing_mark_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-				    &ftrace_update_tot_cnt,
-				    &tracing_dyn_info_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'dyn_ftrace_total_info' entry\n");
+	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
 #ifdef CONFIG_SYSPROF_TRACER
 	init_tracer_sysprof_debugfs(d_tracer);
 #endif
 
+	create_trace_options_dir();
+
 	for_each_tracing_cpu(cpu)
 		tracing_init_debugfs_percpu(cpu);
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 47aa6d0c97a0f..f76a8f8689d45 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -470,6 +470,12 @@ void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+				 mode_t mode,
+				 struct dentry *parent,
+				 void *data,
+				 const struct file_operations *fops);
+
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 22cba9970776c..199de9c742296 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id)
 			return event->profile_disable(event);
 	}
 }
-
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index eb81556107fec..9bece9687b62a 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
 static __init int init_trace_printk_function_export(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
 		return 0;
 
-	entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+	trace_create_file("printk_formats", 0444, d_tracer,
 				    NULL, &ftrace_formats_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'printk_formats' entry\n");
 
 	return 0;
 }
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index c750f65f96615..1796f00524e19 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
 static __init int stack_trace_init(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *entry;
 
 	d_tracer = tracing_init_dentry();
 
-	entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
-				    &max_stack_size, &stack_max_size_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+	trace_create_file("stack_max_size", 0644, d_tracer,
+			&max_stack_size, &stack_max_size_fops);
 
-	entry = debugfs_create_file("stack_trace", 0444, d_tracer,
-				    NULL, &stack_trace_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'stack_trace' entry\n");
+	trace_create_file("stack_trace", 0444, d_tracer,
+			NULL, &stack_trace_fops);
 
 	if (stack_tracer_enabled)
 		register_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 91fd19c2149f5..e04b76cc238a6 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {
 
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
 {
-	struct dentry *entry;
 
-	entry = debugfs_create_file("sysprof_sample_period", 0644,
+	trace_create_file("sysprof_sample_period", 0644,
 			d_tracer, NULL, &sysprof_sample_fops);
-	if (entry)
-		return;
-	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
 }
-- 
GitLab


From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 3 Apr 2009 15:24:12 -0400
Subject: [PATCH 0283/2198] function-graph: use int instead of atomic for
 ftrace_graph_active

Impact: cleanup

The variable ftrace_graph_active is only modified under the
ftrace_lock mutex, thus an atomic is not necessary for modification.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6ea5a1ae6a987..8e6a0b5c99405 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
 static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	mutex_lock(&ftrace_lock);
 
 	/* we currently allow only one tracer registered at a time */
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		ret = -EBUSY;
 		goto out;
 	}
@@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
 
-	atomic_inc(&ftrace_graph_active);
+	ftrace_graph_active++;
 	ret = start_graph_tracing();
 	if (ret) {
-		atomic_dec(&ftrace_graph_active);
+		ftrace_graph_active--;
 		goto out;
 	}
 
@@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_lock);
 
-	if (!unlikely(atomic_read(&ftrace_graph_active)))
+	if (unlikely(!ftrace_graph_active))
 		goto out;
 
-	atomic_dec(&ftrace_graph_active);
+	ftrace_graph_active--;
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-	if (atomic_read(&ftrace_graph_active)) {
+	if (ftrace_graph_active) {
 		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-- 
GitLab


From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:26:14 +0800
Subject: [PATCH 0284/2198] ftrace: clean up enable logic for sched_switch

Unify sched_switch and sched_wakeup's action to following logic:
Do record_cmdline when start_cmdline_record() is called.
Start tracing events when the tracer is started.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D1C596.5050203@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_sched_switch.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9117cea6f1ae7..9d8cccdfaa06c 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!sched_ref || sched_stopped)
+	if (unlikely(!sched_ref))
 		return;
 
 	tracing_record_cmdline(prev);
 	tracing_record_cmdline(next);
 
-	if (!tracer_enabled)
+	if (!tracer_enabled || sched_stopped)
 		return;
 
 	pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	unsigned long flags;
 	int cpu, pc;
 
-	if (!likely(tracer_enabled))
+	if (unlikely(!sched_ref))
 		return;
 
-	pc = preempt_count();
 	tracing_record_cmdline(current);
 
-	if (sched_stopped)
+	if (!tracer_enabled || sched_stopped)
 		return;
 
+	pc = preempt_count();
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-- 
GitLab


From 9c83633ad38138855181af6936e8ac570ef7e2cb Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Tue, 7 Apr 2009 14:48:16 +0300
Subject: [PATCH 0285/2198] missing unlock in jfs_quota_write()

We should unlock &inode->i_mutex on the error path.  This bug was
in ext2_quota_write().  I sent a patch to them today as well.

Found by smatch (http://repo.or.cz/w/smatch.git).  Compile tested.

regards,
dan carpenter

Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
---
 fs/jfs/super.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 6f21adf9479af..d9b0e92b36021 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 		blk++;
 	}
 out:
-	if (len == towrite)
+	if (len == towrite) {
+		mutex_unlock(&inode->i_mutex);
 		return err;
+	}
 	if (inode->i_size < off+len-towrite)
 		i_size_write(inode, off+len-towrite);
 	inode->i_version++;
-- 
GitLab


From 169aafbc8d3f05431b5cfeb60294a12b8ef2bcee Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Tue, 7 Apr 2009 13:37:26 -0700
Subject: [PATCH 0286/2198] lguest: update lazy mmu changes to match lguest's
 use of kvm hypercalls

Duplicate hcall -> kvm_hypercall0 convertion from "lguest: use KVM
hypercalls".

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Matias Zabaljauregui <zabaljauregui at gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/boot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 5ab239711cc21..cfb2d68dc7959 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -168,7 +168,7 @@ static void lazy_hcall3(unsigned long call,
  * issue the do-nothing hypercall to flush any stored calls. */
 static void lguest_leave_lazy_mmu_mode(void)
 {
-	hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0);
+	kvm_hypercall0(LHCALL_FLUSH_ASYNC);
 	paravirt_leave_lazy_mmu();
 }
 
-- 
GitLab


From 44bc9dc729e33a4ec6ebed4d0b6c08e8d20b42cf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 10:47:17 +0200
Subject: [PATCH 0287/2198] mm, x86, ptrace, bts: defer branch trace stopping,
 cleanup

Andrew Morton noticed that mm.h needlessly includes sched.h - remove it.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mm.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64d8ed2538aef..776b641f37e3a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -13,7 +13,6 @@
 #include <linux/prio_tree.h>
 #include <linux/debug_locks.h>
 #include <linux/mm_types.h>
-#include <linux/sched.h>
 
 struct mempolicy;
 struct anon_vma;
-- 
GitLab


From a34b50ddc265bae058c66661b096ef6384c5a8b1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 8 Apr 2009 10:56:54 +0200
Subject: [PATCH 0288/2198] mm, x86, ptrace, bts: defer branch trace stopping,
 remove dead code

Remove the unused free_locked_buffer() API.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mm.h | 1 -
 mm/mlock.c         | 6 ------
 2 files changed, 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 776b641f37e3a..a3963ba23a6d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1320,7 +1320,6 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
 extern void *alloc_locked_buffer(size_t size);
-extern void free_locked_buffer(void *buffer, size_t size);
 extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 749383b442c7e..28be15ead9c13 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -671,9 +671,3 @@ void refund_locked_buffer_memory(struct mm_struct *mm, size_t size)
 
 	up_write(&mm->mmap_sem);
 }
-
-void free_locked_buffer(void *buffer, size_t size)
-{
-	refund_locked_buffer_memory(current->mm, size);
-	kfree(buffer);
-}
-- 
GitLab


From dc66270b51a62b1a6888d5309229e638a305c47b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Apr 2009 20:30:10 +1000
Subject: [PATCH 0289/2198] perf_counter: fix powerpc build

Commit 4af4998b ("perf_counter: rework context time") changed struct
perf_counter_context to have a 'time' field instead of a 'time_now'
field, but neglected to fix the place in the powerpc perf_counter.c
where the time_now field was accessed.  This fixes it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18908.31922.411398.147810@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index f88c35d0710a5..0e5651385ddc6 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -457,8 +457,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 {
 	counter->state = PERF_COUNTER_STATE_ACTIVE;
 	counter->oncpu = cpu;
-	counter->tstamp_running += counter->ctx->time_now -
-		counter->tstamp_stopped;
+	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 	if (is_software_counter(counter))
 		counter->hw_ops->enable(counter);
 }
-- 
GitLab


From f708223d49ac39f5af1643985056206c98033f5b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 8 Apr 2009 20:30:18 +1000
Subject: [PATCH 0290/2198] perf_counter: powerpc: set sample enable bit for
 marked instruction events

Impact: enable access to hardware feature

POWER processors have the ability to "mark" a subset of the instructions
and provide more detailed information on what happens to the marked
instructions as they flow through the pipeline.  This marking is
enabled by the "sample enable" bit in MMCRA, and there are
synchronization requirements around setting and clearing the bit.

This adds logic to the processor-specific back-ends so that they know
which events relate to marked instructions and set the sampling enable
bit if any event that we want to put on the PMU is a marked instruction
event.  It also adds logic to the generic powerpc code to do the
necessary synchronization if that bit is set.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18908.31930.1024.228867@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  28 +++++--
 arch/powerpc/kernel/power5+-pmu.c  | 103 ++++++++++++++++++++++-
 arch/powerpc/kernel/power5-pmu.c   |  96 +++++++++++++++++++++-
 arch/powerpc/kernel/power6-pmu.c   | 126 ++++++++++++++++++++++++++++-
 arch/powerpc/kernel/ppc970-pmu.c   |  72 ++++++++++++++++-
 5 files changed, 413 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0e5651385ddc6..0697ade84dd3a 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -306,6 +306,15 @@ u64 hw_perf_save_disable(void)
 			cpuhw->pmcs_enabled = 1;
 		}
 
+		/*
+		 * Disable instruction sampling if it was enabled
+		 */
+		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+			mtspr(SPRN_MMCRA,
+			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
+			mb();
+		}
+
 		/*
 		 * Set the 'freeze counters' bit.
 		 * The barrier is to make sure the mtspr has been
@@ -347,12 +356,11 @@ void hw_perf_restore(u64 disable)
 	 * (possibly updated for removal of counters).
 	 */
 	if (!cpuhw->n_added) {
-		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
-		mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 		if (cpuhw->n_counters == 0)
 			get_lppaca()->pmcregs_in_use = 0;
-		goto out;
+		goto out_enable;
 	}
 
 	/*
@@ -385,7 +393,7 @@ void hw_perf_restore(u64 disable)
 	 * Then unfreeze the counters.
 	 */
 	get_lppaca()->pmcregs_in_use = 1;
-	mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE);
 	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]);
 	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE))
 				| MMCR0_FC);
@@ -421,10 +429,20 @@ void hw_perf_restore(u64 disable)
 		write_pmc(counter->hw.idx, val);
 		perf_counter_update_userpage(counter);
 	}
-	mb();
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
+
+ out_enable:
+	mb();
 	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 
+	/*
+	 * Enable instruction sampling if necessary
+	 */
+	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) {
+		mb();
+		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]);
+	}
+
  out:
 	local_irq_restore(flags);
 }
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index cec21ea65b0e2..1222c8ea3c26e 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -1,5 +1,5 @@
 /*
- * Performance counter support for POWER5 (not POWER5++) processors.
+ * Performance counter support for POWER5+/++ (not POWER5) processors.
  *
  * Copyright 2009 Paul Mackerras, IBM Corporation.
  *
@@ -281,10 +281,107 @@ static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
 	return nalt;
 }
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5p_marked_instr_event(unsigned int event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x48) == 0x40) {
+		bit = psel & 7;
+	} else if (psel == 0x28) {
+		bit = pmc - 1;
+	} else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) {
+		bit = 4;
+	}
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */
+		mask = 0x5f11c000;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 				unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm;
 	int i, isbus, bit, grsel;
@@ -404,6 +501,8 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
 			mmcr1 |= (u64)grsel << grsel_shift[bit];
 		}
+		if (power5p_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1))
 			/* select alternate byte lane */
 			psel |= 0x10;
@@ -419,7 +518,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0x3e)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 379ed1087ccaa..116c4bb1809ea 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -290,10 +290,102 @@ static int power5_get_alternatives(unsigned int event, unsigned int alt[])
 	return nalt;
 }
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event.
+ * Bit 0 is set if it is marked for all PMCs.
+ * The 0x80 bit indicates a byte decode PMCSEL value.
+ */
+static unsigned char direct_event_is_marked[0x28] = {
+	0,	/* 00 */
+	0x1f,	/* 01 PM_IOPS_CMPL */
+	0x2,	/* 02 PM_MRK_GRP_DISP */
+	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0,	/* 04 */
+	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */
+	0x80,	/* 06 */
+	0x80,	/* 07 */
+	0, 0, 0,/* 08 - 0a */
+	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */
+	0,	/* 0c */
+	0x80,	/* 0d */
+	0x80,	/* 0e */
+	0,	/* 0f */
+	0,	/* 10 */
+	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */
+	0,	/* 12 */
+	0x10,	/* 13 PM_MRK_GRP_CMPL */
+	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x2,	/* 15 PM_MRK_GRP_ISSUED */
+	0x80,	/* 16 */
+	0x80,	/* 17 */
+	0, 0, 0, 0, 0,
+	0x80,	/* 1d */
+	0x80,	/* 1e */
+	0,	/* 1f */
+	0x80,	/* 20 */
+	0x80,	/* 21 */
+	0x80,	/* 22 */
+	0x80,	/* 23 */
+	0x80,	/* 24 */
+	0x80,	/* 25 */
+	0x80,	/* 26 */
+	0x80,	/* 27 */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power5_marked_instr_event(unsigned int event)
+{
+	int pmc, psel;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		if (direct_event_is_marked[psel] & (1 << pmc))
+			return 1;
+		if (direct_event_is_marked[psel] & 0x80)
+			bit = 4;
+		else if (psel == 0x08)
+			bit = pmc - 1;
+		else if (psel == 0x10)
+			bit = 4 - pmc;
+		else if (psel == 0x1b && (pmc == 1 || pmc == 3))
+			bit = 4;
+	} else if ((psel & 0x58) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK))
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit == PM_LSU0) {
+		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */
+		mask = 0x5dff00;
+	} else if (unit == PM_LSU1 && byte >= 4) {
+		byte -= 4;
+		/* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */
+		mask = 0x5f00c0aa;
+	} else
+		return 0;
+
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 static int power5_compute_mmcr(unsigned int event[], int n_ev,
 			       unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	unsigned int pmc, unit, byte, psel;
 	unsigned int ttm, grp;
 	int i, isbus, bit, grsel;
@@ -430,6 +522,8 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev,
 			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK;
 			mmcr1 |= (u64)grsel << grsel_shift[bit];
 		}
+		if (power5_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		if (pmc <= 3)
 			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
 		hwc[i] = pmc;
@@ -442,7 +536,7 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0x3e)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index b1f61f3c97bb9..fce1fc290a1d0 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -48,6 +48,127 @@
 #define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
 #define MMCR1_PMCSEL_MSK	0xff
 
+/*
+ * Map of which direct events on which PMCs are marked instruction events.
+ * Indexed by PMCSEL value >> 1.
+ * Bottom 4 bits are a map of which PMCs are interesting,
+ * top 4 bits say what sort of event:
+ *   0 = direct marked event,
+ *   1 = byte decode event,
+ *   4 = add/and event (PMC1 -> bits 0 & 4),
+ *   5 = add/and event (PMC1 -> bits 1 & 5),
+ *   6 = add/and event (PMC1 -> bits 2 & 6),
+ *   7 = add/and event (PMC1 -> bits 3 & 7).
+ */
+static unsigned char direct_event_is_marked[0x60 >> 1] = {
+	0,	/* 00 */
+	0,	/* 02 */
+	0,	/* 04 */
+	0x07,	/* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */
+	0x04,	/* 08 PM_MRK_DFU_FIN */
+	0x06,	/* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */
+	0,	/* 0c */
+	0,	/* 0e */
+	0x02,	/* 10 PM_MRK_INST_DISP */
+	0x08,	/* 12 PM_MRK_LSU_DERAT_MISS */
+	0,	/* 14 */
+	0,	/* 16 */
+	0x0c,	/* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */
+	0x0f,	/* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */
+	0x01,	/* 1c PM_MRK_INST_ISSUED */
+	0,	/* 1e */
+	0,	/* 20 */
+	0,	/* 22 */
+	0,	/* 24 */
+	0,	/* 26 */
+	0x15,	/* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */
+	0,	/* 2a */
+	0,	/* 2c */
+	0,	/* 2e */
+	0x4f,	/* 30 */
+	0x7f,	/* 32 */
+	0x4f,	/* 34 */
+	0x5f,	/* 36 */
+	0x6f,	/* 38 */
+	0x4f,	/* 3a */
+	0,	/* 3c */
+	0x08,	/* 3e PM_MRK_INST_TIMEO */
+	0x1f,	/* 40 */
+	0x1f,	/* 42 */
+	0x1f,	/* 44 */
+	0x1f,	/* 46 */
+	0x1f,	/* 48 */
+	0x1f,	/* 4a */
+	0x1f,	/* 4c */
+	0x1f,	/* 4e */
+	0,	/* 50 */
+	0x05,	/* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */
+	0x1c,	/* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */
+	0x02,	/* 56 PM_MRK_LD_MISS_L1 */
+	0,	/* 58 */
+	0,	/* 5a */
+	0,	/* 5c */
+	0,	/* 5e */
+};
+
+/*
+ * Masks showing for each unit which bits are marked events.
+ * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0.
+ */
+static u32 marked_bus_events[16] = {
+	0x01000000,	/* direct events set 1: byte 3 bit 0 */
+	0x00010000,	/* direct events set 2: byte 2 bit 0 */
+	0, 0, 0, 0,	/* IDU, IFU, nest: nothing */
+	0x00000088,	/* VMX set 1: byte 0 bits 3, 7 */
+	0x000000c0,	/* VMX set 2: byte 0 bits 4-7 */
+	0x04010000,	/* LSU set 1: byte 2 bit 0, byte 3 bit 2 */
+	0xff010000u,	/* LSU set 2: byte 2 bit 0, all of byte 3 */
+	0,		/* LSU set 3 */
+	0x00000010,	/* VMX set 3: byte 0 bit 4 */
+	0,		/* BFP set 1 */
+	0x00000022,	/* BFP set 2: byte 0 bits 1, 5 */
+	0, 0
+};
+	
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power6_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, ptype;
+	int bit, byte, unit;
+	u32 mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = (event & PM_PMCSEL_MSK) >> 1;	/* drop edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	bit = -1;
+	if (psel < sizeof(direct_event_is_marked)) {
+		ptype = direct_event_is_marked[psel];
+		if (pmc == 0 || !(ptype & (1 << (pmc - 1))))
+			return 0;
+		ptype >>= 4;
+		if (ptype == 0)
+			return 1;
+		if (ptype == 1)
+			bit = 0;
+		else
+			bit = ptype ^ (pmc - 1);
+	} else if ((psel & 0x48) == 0x40)
+		bit = psel & 7;
+
+	if (!(event & PM_BUSEVENT_MSK) || bit == -1)
+		return 0;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = marked_bus_events[unit];
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 /*
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
@@ -55,6 +176,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
+	u64 mmcra = 0;
 	int i;
 	unsigned int pmc, ev, b, u, s, psel;
 	unsigned int ttmset = 0;
@@ -116,6 +238,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			if (ev & PM_LLAV)
 				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc;
 		}
+		if (power6_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
@@ -124,7 +248,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 	if (pmc_inuse & 0xe)
 		mmcr[0] |= MMCR0_PMCjCE;
 	mmcr[1] = mmcr1;
-	mmcr[2] = 0;
+	mmcr[2] = mmcra;
 	return 0;
 }
 
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index c3256580be1ac..aed8ccd7c0779 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -19,6 +19,8 @@
 #define PM_PMC_MSK	0xf
 #define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */
 #define PM_UNIT_MSK	0xf
+#define PM_SPCSEL_SH	6
+#define PM_SPCSEL_MSK	3
 #define PM_BYTE_SH	4	/* Byte number of event bus to use */
 #define PM_BYTE_MSK	3
 #define PM_PMCSEL_MSK	0xf
@@ -88,8 +90,11 @@ static short mmcr1_adder_bits[8] = {
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
  * 3210987654321098765432109876543210987654321098765432109876543210
- *                 <><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
- *                 T0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *               <><><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><>
+ *               SPT0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8
+ *
+ * SP - SPCSEL constraint
+ *     48-49: SPCSEL value 0x3_0000_0000_0000
  *
  * T0 - TTM0 constraint
  *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000
@@ -126,6 +131,57 @@ static short mmcr1_adder_bits[8] = {
  *     0-13: Count of events needing PMC2..PMC8
  */
 
+static unsigned char direct_marked_event[8] = {
+	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */
+	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */
+	(1<<3) | (1<<5),	/* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */
+	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4)			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(unsigned int event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 7 || psel == 13)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_VPU:
+		mask = 0x4c;		/* byte 0 bits 2,3,6 */
+	case PM_LSU0:
+		/* byte 2 bits 0,2,3,4,6; all of byte 1 */
+		mask = 0x085dff00;
+	case PM_LSU1L:
+		mask = 0x50 << 24;	/* byte 3 bits 4,6 */
+		break;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
 /* Masks and values for using events from the various units */
 static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
@@ -138,7 +194,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 
 static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 {
-	int pmc, byte, unit, sh;
+	int pmc, byte, unit, sh, spcsel;
 	u64 mask = 0, value = 0;
 	int grp = -1;
 
@@ -177,6 +233,11 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 		mask  |= 0x800000000ull;
 		value |= 0x100000000ull;
 	}
+	spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+	if (spcsel) {
+		mask  |= 3ull << 48;
+		value |= (u64)spcsel << 48;
+	}
 	*maskp = mask;
 	*valp = value;
 	return 0;
@@ -209,6 +270,7 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned char ttmuse[2];
 	unsigned char pmcsel[8];
 	int i;
+	int spcsel;
 
 	if (n_ev > 8)
 		return -1;
@@ -316,6 +378,10 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev,
 		}
 		pmcsel[pmc] = psel;
 		hwc[i] = pmc;
+		spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK;
+		mmcr1 |= spcsel;
+		if (p970_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
 	}
 	for (pmc = 0; pmc < 2; ++pmc)
 		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc);
-- 
GitLab


From bab5bc9e857638880facef76e4b4c3fa807f8c73 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Tue, 7 Apr 2009 23:23:50 -0700
Subject: [PATCH 0291/2198] futex: fixup unlocked requeue pi case

Thomas's testing caught a problem when the requeue target futex is
unowned and multiple tasks are requeued to it. This patch ensures
the FUTEX_WAITERS bit gets set if futex_requeue() will requeue one
or more tasks in addition to the one acquiring the lock.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 65 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 44 insertions(+), 21 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 185c981d89e34..041bf3ac4be95 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -565,12 +565,14 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 
 /**
  * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
- * @uaddr:	the pi futex user address
- * @hb:		the pi futex hash bucket
- * @key:	the futex key associated with uaddr and hb
- * @ps:		the pi_state pointer where we store the result of the lookup
- * @task:	the task to perform the atomic lock work for.  This will be
- * 		"current" except in the case of requeue pi.
+ * @uaddr:		the pi futex user address
+ * @hb:			the pi futex hash bucket
+ * @key:		the futex key associated with uaddr and hb
+ * @ps:			the pi_state pointer where we store the result of the
+ *			lookup
+ * @task:		the task to perform the atomic lock work for.  This will
+ *			be "current" except in the case of requeue pi.
+ * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Returns:
  *  0 - ready to wait
@@ -582,7 +584,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 				union futex_key *key,
 				struct futex_pi_state **ps,
-				struct task_struct *task)
+				struct task_struct *task, int set_waiters)
 {
 	int lock_taken, ret, ownerdied = 0;
 	u32 uval, newval, curval;
@@ -596,6 +598,8 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
 	 * the locks. It will most likely not succeed.
 	 */
 	newval = task_pid_vnr(task);
+	if (set_waiters)
+		newval |= FUTEX_WAITERS;
 
 	curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
 
@@ -1004,14 +1008,18 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
 
 /**
  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex:	the user address of the to futex
- * @hb1:	the from futex hash bucket, must be locked by the caller
- * @hb2:	the to futex hash bucket, must be locked by the caller
- * @key1:	the from futex key
- * @key2:	the to futex key
+ * @pifutex:		the user address of the to futex
+ * @hb1:		the from futex hash bucket, must be locked by the caller
+ * @hb2:		the to futex hash bucket, must be locked by the caller
+ * @key1:		the from futex key
+ * @key2:		the to futex key
+ * @ps:			address to store the pi_state pointer
+ * @set_waiters:	force setting the FUTEX_WAITERS bit (1) or not (0)
  *
  * Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed.  hb1 and hb2 must be held by the caller.
+ * Wake the top waiter if we succeed.  If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
  *
  * Returns:
  *  0 - failed to acquire the lock atomicly
@@ -1022,15 +1030,23 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 				 struct futex_hash_bucket *hb1,
 				 struct futex_hash_bucket *hb2,
 				 union futex_key *key1, union futex_key *key2,
-				 struct futex_pi_state **ps)
+				 struct futex_pi_state **ps, int set_waiters)
 {
-	struct futex_q *top_waiter;
+	struct futex_q *top_waiter = NULL;
 	u32 curval;
 	int ret;
 
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
 
+	/*
+	 * Find the top_waiter and determine if there are additional waiters.
+	 * If the caller intends to requeue more than 1 waiter to pifutex,
+	 * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+	 * as we have means to handle the possible fault.  If not, don't set
+	 * the bit unecessarily as it will force the subsequent unlock to enter
+	 * the kernel.
+	 */
 	top_waiter = futex_top_waiter(hb1, key1);
 
 	/* There are no waiters, nothing for us to do. */
@@ -1038,10 +1054,12 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 		return 0;
 
 	/*
-	 * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
-	 * The pi_state is returned in ps in contended cases.
+	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
+	 * the contended case or if set_waiters is 1.  The pi_state is returned
+	 * in ps in contended cases.
 	 */
-	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
+	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+				   set_waiters);
 	if (ret == 1)
 		requeue_pi_wake_futex(top_waiter, key2);
 
@@ -1146,9 +1164,14 @@ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	}
 
 	if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
-		/* Attempt to acquire uaddr2 and wake the top_waiter. */
+		/*
+		 * Attempt to acquire uaddr2 and wake the top waiter. If we
+		 * intend to requeue waiters, force setting the FUTEX_WAITERS
+		 * bit.  We force this here where we are able to easily handle
+		 * faults rather in the requeue loop below.
+		 */
 		ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
-						 &key2, &pi_state);
+						 &key2, &pi_state, nr_requeue);
 
 		/*
 		 * At this point the top_waiter has either taken uaddr2 or is
@@ -1810,7 +1833,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 retry_private:
 	hb = queue_lock(&q);
 
-	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current);
+	ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
 	if (unlikely(ret)) {
 		switch (ret) {
 		case 1:
-- 
GitLab


From fcb2ac5bdfa3a7a04fb9749b916f64400f4c35a8 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 8 Apr 2009 13:31:58 +0200
Subject: [PATCH 0292/2198] x86_32: introduce restore_fpu_checking()

Impact: cleanup, prepare FPU code unificaton

Like on x86_64, return an error from restore_fpu and kill the task
if it fails.

Also rename restore_fpu to restore_fpu_checking which allows ifdefs
to be removed in math_state_restore().

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1239190320-23952-1-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/i387.h | 9 ++++-----
 arch/x86/kernel/traps.c     | 5 +----
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e51839827..09a2d6dfd85b4 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -185,12 +185,10 @@ static inline void tolerant_fwait(void)
 	asm volatile("fnclex ; fwait");
 }
 
-static inline void restore_fpu(struct task_struct *tsk)
+static inline int restore_fpu_checking(struct task_struct *tsk)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE) {
-		xrstor_checking(&tsk->thread.xstate->xsave);
-		return;
-	}
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		return xrstor_checking(&tsk->thread.xstate->xsave);
 	/*
 	 * The "nop" is needed to make the instructions the same
 	 * length.
@@ -200,6 +198,7 @@ static inline void restore_fpu(struct task_struct *tsk)
 		"fxrstor %1",
 		X86_FEATURE_FXSR,
 		"m" (tsk->thread.xstate->fxsave));
+	return 0;
 }
 
 /* We need a safe address that is cheap to find and that is already
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0f..d696145855b51 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
 	}
 
 	clts();				/* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
-	restore_fpu(tsk);
-#else
 	/*
 	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
 	 */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
 		force_sig(SIGSEGV, tsk);
 		return;
 	}
-#endif
+
 	thread->status |= TS_USEDFPU;	/* So we fnsave on switch_to() */
 	tsk->fpu_counter++;
 }
-- 
GitLab


From 34ba476a01e128aad51e02f9be854584e9ec73cf Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 8 Apr 2009 13:31:59 +0200
Subject: [PATCH 0293/2198] x86: unify restore_fpu_checking

Impact: cleanup

On x86_32, separate f*rstor to an inline function which makes
restore_fpu_checking the same on both platforms -> move it
outside the ifdefs.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1239190320-23952-2-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/i387.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 09a2d6dfd85b4..7a6f21d95cf42 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 	return err;
 }
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-	if (task_thread_info(tsk)->status & TS_XSAVE)
-		return xrstor_checking(&tsk->thread.xstate->xsave);
-	else
-		return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
 /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
    is pending. Clear the x87 state here by setting it to fixed
    values. The kernel data segment can be sometimes 0 and sometimes
@@ -185,10 +177,9 @@ static inline void tolerant_fwait(void)
 	asm volatile("fnclex ; fwait");
 }
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 {
-	if (task_thread_info(tsk)->status & TS_XSAVE)
-		return xrstor_checking(&tsk->thread.xstate->xsave);
 	/*
 	 * The "nop" is needed to make the instructions the same
 	 * length.
@@ -197,7 +188,8 @@ static inline int restore_fpu_checking(struct task_struct *tsk)
 		"nop ; frstor %1",
 		"fxrstor %1",
 		X86_FEATURE_FXSR,
-		"m" (tsk->thread.xstate->fxsave));
+		"m" (*fx));
+
 	return 0;
 }
 
@@ -261,6 +253,14 @@ static inline void __save_init_fpu(struct task_struct *tsk)
 
 #endif	/* CONFIG_X86_64 */
 
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+	if (task_thread_info(tsk)->status & TS_XSAVE)
+		return xrstor_checking(&tsk->thread.xstate->xsave);
+	else
+		return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
 /*
  * Signal frame handlers...
  */
-- 
GitLab


From 4ecf458492c2d97b3f9d850a5f92d79792e0a7e7 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 8 Apr 2009 13:32:00 +0200
Subject: [PATCH 0294/2198] x86_64: fix incorrect comments

Impact: cleanup

The comments which fxrstor_checking and fxsave_uset refer to is
now in fxsave. Change the comments appropriately.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Jiri Slaby <jirislaby@gmail.com>
LKML-Reference: <1239190320-23952-3-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/i387.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 7a6f21d95cf42..63d185087d91f 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 		     ".previous\n"
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err)
-#if 0 /* See comment in __save_init_fpu() below. */
+#if 0 /* See comment in fxsave() below. */
 		     : [fx] "r" (fx), "m" (*fx), "0" (0));
 #else
 		     : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -112,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
 		     ".previous\n"
 		     _ASM_EXTABLE(1b, 3b)
 		     : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
+#if 0 /* See comment in fxsave() below. */
 		     : [fx] "r" (fx), "0" (0));
 #else
 		     : [fx] "cdaSDb" (fx), "0" (0));
-- 
GitLab


From a59dacfdc9ba06903652fa4883bf1106278b18ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 17 Oct 2008 14:38:08 +0200
Subject: [PATCH 0295/2198] x86 early quirks: eliminate unused function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

this warning:

  arch/x86/kernel/early-quirks.c:99: warning: ‘ati_ixp4x0_rev’ defined but not used

triggers because ati_ixp4x0_rev() is only used in the
ACPI && X86_IO_APIC case.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/early-quirks.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 76b8cd953deed..ebdb85cf2686f 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -96,6 +96,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
 
 }
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
 #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
 static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 {
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 	d &= 0xff;
 	return d;
 }
+#endif
 
 static void __init ati_bugs(int num, int slot, int func)
 {
-- 
GitLab


From cdc1cb0d4445f39561a65204d26f89365f917550 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 3 Apr 2009 17:15:14 -0700
Subject: [PATCH 0296/2198] x86: make wakeup_secondary_cpu_via_init static

Impact: cleanup

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49D6A692.6040400@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 58d24ef917d8b..bddf2ccaf3257 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
-int __devinit
+static int __devinit
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
-- 
GitLab


From 02421f98ec55c3ff118f358740ff640f096c7ad6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 3 Apr 2009 17:15:53 -0700
Subject: [PATCH 0297/2198] x86: consistent about warm_reset_vector for
 UN_NON_UNIQUE_APIC

Impact: cleanup

didn't set it for UV_NON_UNIQUE_APIC, so don't restore it

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49D6A6B9.6060501@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bddf2ccaf3257..bf8ad6344b18e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -822,10 +822,12 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
 	/* mark "stuck" area as not stuck */
 	*((volatile unsigned long *)trampoline_base) = 0;
 
-	/*
-	 * Cleanup possible dangling ends...
-	 */
-	smpboot_restore_warm_reset_vector();
+	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+		/*
+		 * Cleanup possible dangling ends...
+		 */
+		smpboot_restore_warm_reset_vector();
+	}
 
 	return boot_error;
 }
-- 
GitLab


From ceb5ac3264686e75e6951de6a18d4baa9bdecb92 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:15 -0500
Subject: [PATCH 0298/2198] swiotlb: comment corrections

Impact: cleanup

swiotlb_map/unmap_single are now swiotlb_map/unmap_page;
trivially change all the comments to reference new names.

Also, there were some comments that should have been
referring to just plain old map_single, not swiotlb_map_single;
fix those as well.

Also change a use of the word "pointer", when what is
referred to is actually a dma/physical address.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-2-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 2b0b5a7d2ced1..170cf56af6a99 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -60,8 +60,8 @@ enum dma_sync_target {
 int swiotlb_force;
 
 /*
- * Used to do a quick range check in swiotlb_unmap_single and
- * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * Used to do a quick range check in unmap_single and
+ * sync_single_*, to see if the memory was in fact allocated by this
  * API.
  */
 static char *io_tlb_start, *io_tlb_end;
@@ -560,7 +560,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 				   size)) {
 		/*
 		 * The allocated memory isn't reachable by the device.
-		 * Fall back on swiotlb_map_single().
 		 */
 		free_pages((unsigned long) ret, order);
 		ret = NULL;
@@ -568,9 +567,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 	if (!ret) {
 		/*
 		 * We are either out of memory or the device can't DMA
-		 * to GFP_DMA memory; fall back on
-		 * swiotlb_map_single(), which will grab memory from
-		 * the lowest available address range.
+		 * to GFP_DMA memory; fall back on map_single(), which
+		 * will grab memory from the lowest available address range.
 		 */
 		ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
 		if (!ret)
@@ -634,7 +632,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
  * physical address to use is returned.
  *
  * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
  */
 dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 			    unsigned long offset, size_t size,
@@ -648,7 +646,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 
 	BUG_ON(dir == DMA_NONE);
 	/*
-	 * If the pointer passed in happens to be in the device's DMA window,
+	 * If the address happens to be in the device's DMA window,
 	 * we can safely return the device addr and not worry about bounce
 	 * buffering it.
 	 */
@@ -679,7 +677,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
 /*
  * Unmap a single streaming mode DMA translation.  The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_single call.  All
+ * match what was provided for in a previous swiotlb_map_page call.  All
  * other usages are undefined.
  *
  * After this call, reads by the cpu to the buffer are guaranteed to see
@@ -703,7 +701,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  * Make physical memory consistent for a single streaming mode DMA translation
  * after a transfer.
  *
- * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * If you perform a swiotlb_map_page() but wish to interrogate the buffer
  * using the cpu, yet do not wish to teardown the dma mapping, you must
  * call this function before doing so.  At the next point you give the dma
  * address back to the card, you must first perform a
@@ -777,7 +775,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
 
 /*
  * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_single
+ * This is the scatter-gather version of the above swiotlb_map_page
  * interface.  Here the scatter gather list elements are each tagged with the
  * appropriate dma address and length.  They are obtained via
  * sg_dma_{address,length}(SG).
@@ -788,7 +786,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
  *       The routine returns the number of addr/length pairs actually
  *       used, at most nents.
  *
- * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * Device ownership issues as mentioned above for swiotlb_map_page are the
  * same here.
  */
 int
@@ -836,7 +834,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
 
 /*
  * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_single() above.
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
  */
 void
 swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
-- 
GitLab


From 67131ad0514d7105b55003a0506209cf1bba3f00 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:16 -0500
Subject: [PATCH 0299/2198] swiotlb: fix compile warning

Squash a build warning seen on 32-bit powerpc caused by
calling min() with 2 different types. Use min_t() instead.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-3-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 170cf56af6a99..4fd6a76e728c1 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -341,7 +341,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
 		unsigned long flags;
 
 		while (size) {
-			sz = min(PAGE_SIZE - offset, size);
+			sz = min_t(size_t, PAGE_SIZE - offset, size);
 
 			local_irq_save(flags);
 			buffer = kmap_atomic(pfn_to_page(pfn),
-- 
GitLab


From dd6b02fe427f30520d0adc94aa52352367227873 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:17 -0500
Subject: [PATCH 0300/2198] swiotlb: map_page fix for highmem systems

The current code calls virt_to_phys() on address that might
be in highmem, which is bad.  This wasn't needed, anyway, because
we already have the physical address we need.

Get rid of the now-unused virtual address as well.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-4-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 4fd6a76e728c1..e8a47c8cf77ef 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -640,7 +640,6 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 			    struct dma_attrs *attrs)
 {
 	phys_addr_t phys = page_to_phys(page) + offset;
-	void *ptr = page_address(page) + offset;
 	dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys);
 	void *map;
 
@@ -651,7 +650,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
 	 * buffering it.
 	 */
 	if (!address_needs_mapping(dev, dev_addr, size) &&
-	    !range_needs_mapping(virt_to_phys(ptr), size))
+	    !range_needs_mapping(phys, size))
 		return dev_addr;
 
 	/*
-- 
GitLab


From ef5722f698bde01cfec2b98fff733a48663ebf55 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:18 -0500
Subject: [PATCH 0301/2198] swiotlb: allow arch override of
 address_needs_mapping

Some architectures require additional checking to determine
if a device can dma to an address and need to provide their
own address_needs_mapping..

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-5-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index e8a47c8cf77ef..d81afab85167e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -145,6 +145,12 @@ static void *swiotlb_bus_to_virt(dma_addr_t address)
 	return phys_to_virt(swiotlb_bus_to_phys(address));
 }
 
+int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
+					       dma_addr_t addr, size_t size)
+{
+	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+}
+
 int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
 {
 	return 0;
@@ -309,10 +315,10 @@ swiotlb_late_init_with_default_size(size_t default_size)
 	return -ENOMEM;
 }
 
-static int
+static inline int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 {
-	return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+	return swiotlb_arch_address_needs_mapping(hwdev, addr, size);
 }
 
 static inline int range_needs_mapping(phys_addr_t paddr, size_t size)
-- 
GitLab


From 7fcebbd2d984eac3fdd6da2f4453e7c43d32de89 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:19 -0500
Subject: [PATCH 0302/2198] swiotlb: rename unmap_single to do_unmap_single

Previously, swiotlb_unmap_page and swiotlb_unmap_sg were
duplicating very similar code.  Refactor that code into a
new unmap_single and unmap_single use do_unmap_single.

Note that the swiotlb_unmap_sg code was previously doing
a complicated comparison to determine if an addresses needed
to be unmapped where a simple is_swiotlb_buffer() call
would have sufficed.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-6-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 42 +++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d81afab85167e..2bde54a40d86e 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -482,7 +482,7 @@ map_single(struct device *hwdev, phys_addr_t phys, size_t size, int dir)
  * dma_addr is the kernel virtual address of the bounce buffer to unmap.
  */
 static void
-unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 {
 	unsigned long flags;
 	int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -591,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
 		       (unsigned long long)dev_addr);
 
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-		unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
+		do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
 		return NULL;
 	}
 	*dma_handle = dev_addr;
@@ -608,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
 		free_pages((unsigned long) vaddr, get_order(size));
 	else
 		/* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-		unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+		do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
 }
 EXPORT_SYMBOL(swiotlb_free_coherent);
 
@@ -688,17 +688,29 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
  * After this call, reads by the cpu to the buffer are guaranteed to see
  * whatever the device wrote there.
  */
-void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
-			size_t size, enum dma_data_direction dir,
-			struct dma_attrs *attrs)
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+			 size_t size, int dir)
 {
 	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
-	if (is_swiotlb_buffer(dma_addr))
-		unmap_single(hwdev, dma_addr, size, dir);
-	else if (dir == DMA_FROM_DEVICE)
-		dma_mark_clean(dma_addr, size);
+
+	if (is_swiotlb_buffer(dma_addr)) {
+		do_unmap_single(hwdev, dma_addr, size, dir);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	dma_mark_clean(dma_addr, size);
+}
+
+void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
+			size_t size, enum dma_data_direction dir,
+			struct dma_attrs *attrs)
+{
+	unmap_single(hwdev, dev_addr, size, dir);
 }
 EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
 
@@ -850,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 
 	BUG_ON(dir == DMA_NONE);
 
-	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-			unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
-				     sg->dma_length, dir);
-		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-	}
+	for_each_sg(sgl, sg, nelems, i)
+		unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
 
-- 
GitLab


From 380d687833aee098c4a2c3b35beaefe1c1f48d01 Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:20 -0500
Subject: [PATCH 0303/2198] swiotlb: use swiotlb_sync_single instead of
 duplicating code

Right now both swiotlb_sync_single_range and swiotlb_sync_sg
were duplicating the code in swiotlb_sync_single.  Just call it
instead.  Also rearrange the sync_single code for readability.

Note that the swiotlb_sync_sg code was previously doing
a complicated comparison to determine if an addresses needed
to be unmapped where a simple is_swiotlb_buffer() call
would have sufficed.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-7-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/swiotlb.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index 2bde54a40d86e..d912f068145cd 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -731,10 +731,16 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
-	if (is_swiotlb_buffer(dma_addr))
+
+	if (is_swiotlb_buffer(dma_addr)) {
 		sync_single(hwdev, dma_addr, size, dir, target);
-	else if (dir == DMA_FROM_DEVICE)
-		dma_mark_clean(dma_addr, size);
+		return;
+	}
+
+	if (dir != DMA_FROM_DEVICE)
+		return;
+
+	dma_mark_clean(dma_addr, size);
 }
 
 void
@@ -761,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
 			  unsigned long offset, size_t size,
 			  int dir, int target)
 {
-	char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
-
-	BUG_ON(dir == DMA_NONE);
-	if (is_swiotlb_buffer(dma_addr))
-		sync_single(hwdev, dma_addr, size, dir, target);
-	else if (dir == DMA_FROM_DEVICE)
-		dma_mark_clean(dma_addr, size);
+	swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
 }
 
 void
@@ -890,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
 	struct scatterlist *sg;
 	int i;
 
-	BUG_ON(dir == DMA_NONE);
-
-	for_each_sg(sgl, sg, nelems, i) {
-		if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-			sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
+	for_each_sg(sgl, sg, nelems, i)
+		swiotlb_sync_single(hwdev, sg->dma_address,
 				    sg->dma_length, dir, target);
-		else if (dir == DMA_FROM_DEVICE)
-			dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-	}
 }
 
 void
-- 
GitLab


From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001
From: Becky Bruce <beckyb@kernel.crashing.org>
Date: Wed, 8 Apr 2009 09:09:21 -0500
Subject: [PATCH 0304/2198] swiotlb: change swiotlb_bus_to[phys,virt]
 prototypes

Add a hwdev argument that is needed on some architectures
in order to access a per-device offset that is taken into
account when producing a physical address (also needed to
get from bus address to virtual address because the physical
address is an intermediate step).

Also make swiotlb_bus_to_virt weak so architectures can
override it.

Signed-off-by: Becky Bruce <beckyb@kernel.crashing.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
Cc: jeremy@goop.org
Cc: ian.campbell@citrix.com
LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-swiotlb.c |  2 +-
 include/linux/swiotlb.h       |  3 ++-
 lib/swiotlb.c                 | 10 +++++-----
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 34f12e9996ed8..887388a1c57d6 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 	return paddr;
 }
 
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
 	return baddr;
 }
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index ac9ff54f7cb30..cb1a6631b8f44 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 
 extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
 				      phys_addr_t address);
-extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
+extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
+				       dma_addr_t address);
 
 extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
 
diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index d912f068145cd..bffe6d7ef9d9a 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
 	return paddr;
 }
 
-phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
 	return baddr;
 }
@@ -140,9 +140,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
 	return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
 }
 
-static void *swiotlb_bus_to_virt(dma_addr_t address)
+void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address)
 {
-	return phys_to_virt(swiotlb_bus_to_phys(address));
+	return phys_to_virt(swiotlb_bus_to_phys(hwdev, address));
 }
 
 int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
@@ -691,7 +691,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
 static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
 			 size_t size, int dir)
 {
-	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 
@@ -728,7 +728,7 @@ static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
 		    size_t size, int dir, int target)
 {
-	char *dma_addr = swiotlb_bus_to_virt(dev_addr);
+	char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
 
 	BUG_ON(dir == DMA_NONE);
 
-- 
GitLab


From a4e94ef0dd391eae05bdeacd12b8da3510957a97 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 17:07:05 +0800
Subject: [PATCH 0305/2198] printk: add support of hh length modifier for
 printk

Impact: new feature, extend vsprintf format strings

hh is used as length modifier for signed char or unsigned char.
It is supported by glibc, we add kernel support now.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: torvalds@linux-foundation.org
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49CC9739.30107@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/vsprintf.c | 37 ++++++++++++++++++++++++++++++++-----
 1 file changed, 32 insertions(+), 5 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index 7536acea135ba..b56f6d039d211 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -408,6 +408,8 @@ enum format_type {
 	FORMAT_TYPE_LONG_LONG,
 	FORMAT_TYPE_ULONG,
 	FORMAT_TYPE_LONG,
+	FORMAT_TYPE_UBYTE,
+	FORMAT_TYPE_BYTE,
 	FORMAT_TYPE_USHORT,
 	FORMAT_TYPE_SHORT,
 	FORMAT_TYPE_UINT,
@@ -853,11 +855,15 @@ static int format_decode(const char *fmt, struct printf_spec *spec)
 	spec->qualifier = -1;
 	if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
 	    *fmt == 'Z' || *fmt == 'z' || *fmt == 't') {
-		spec->qualifier = *fmt;
-		++fmt;
-		if (spec->qualifier == 'l' && *fmt == 'l') {
-			spec->qualifier = 'L';
-			++fmt;
+		spec->qualifier = *fmt++;
+		if (unlikely(spec->qualifier == *fmt)) {
+			if (spec->qualifier == 'l') {
+				spec->qualifier = 'L';
+				++fmt;
+			} else if (spec->qualifier == 'h') {
+				spec->qualifier = 'H';
+				++fmt;
+			}
 		}
 	}
 
@@ -919,6 +925,11 @@ static int format_decode(const char *fmt, struct printf_spec *spec)
 		spec->type = FORMAT_TYPE_SIZE_T;
 	} else if (spec->qualifier == 't') {
 		spec->type = FORMAT_TYPE_PTRDIFF;
+	} else if (spec->qualifier == 'H') {
+		if (spec->flags & SIGN)
+			spec->type = FORMAT_TYPE_BYTE;
+		else
+			spec->type = FORMAT_TYPE_UBYTE;
 	} else if (spec->qualifier == 'h') {
 		if (spec->flags & SIGN)
 			spec->type = FORMAT_TYPE_SHORT;
@@ -1087,6 +1098,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
 			case FORMAT_TYPE_PTRDIFF:
 				num = va_arg(args, ptrdiff_t);
 				break;
+			case FORMAT_TYPE_UBYTE:
+				num = (unsigned char) va_arg(args, int);
+				break;
+			case FORMAT_TYPE_BYTE:
+				num = (signed char) va_arg(args, int);
+				break;
 			case FORMAT_TYPE_USHORT:
 				num = (unsigned short) va_arg(args, int);
 				break;
@@ -1363,6 +1380,10 @@ do {									\
 			case FORMAT_TYPE_PTRDIFF:
 				save_arg(ptrdiff_t);
 				break;
+			case FORMAT_TYPE_UBYTE:
+			case FORMAT_TYPE_BYTE:
+				save_arg(char);
+				break;
 			case FORMAT_TYPE_USHORT:
 			case FORMAT_TYPE_SHORT:
 				save_arg(short);
@@ -1538,6 +1559,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
 			case FORMAT_TYPE_PTRDIFF:
 				num = get_arg(ptrdiff_t);
 				break;
+			case FORMAT_TYPE_UBYTE:
+				num = get_arg(unsigned char);
+				break;
+			case FORMAT_TYPE_BYTE:
+				num = get_arg(signed char);
+				break;
 			case FORMAT_TYPE_USHORT:
 				num = get_arg(unsigned short);
 				break;
-- 
GitLab


From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001
From: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Date: Wed, 25 Mar 2009 10:50:34 +0900
Subject: [PATCH 0306/2198] x86: smarten /proc/interrupts output for new
 counters

Now /proc/interrupts of tip tree has new counters:

  CNT: Performance counter interrupts

Format change of output, as like that by commit:

  commit 7a81d9a7da03d2f27840d659f97ef140d032f609
  x86: smarten /proc/interrupts output

should be applied to these new counters too.

Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com>
Cc: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index d465487da5871..dccaaa8557897 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
 	seq_printf(p, "  Spurious interrupts\n");
-	seq_printf(p, "CNT: ");
+	seq_printf(p, "%*s: ", prec, "CNT");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
-- 
GitLab


From ae9e6bc9f74f8247cbca50a6a93c80e0d686fa19 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 2 Apr 2009 13:19:54 +0900
Subject: [PATCH 0307/2198] percpu: don't put the first chunk in reverse-map
 rbtree

Impact: both first chunks don't use rbtree, no functional change

There can be two first chunks - reserved and dynamic with the former
one being optional.  Dynamic first chunk was linked on reverse-mapping
rbtree while the reserved one was mapped manually using the start
address and reserved offset limit.

This patch makes both first chunks to be looked up manually without
using the rbtree.  This is to help getting rid of the rbtree.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: rusty@rustcorp.com.au
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: rmk@arm.linux.org.uk
Cc: starvik@axis.com
Cc: ralf@linux-mips.org
Cc: davem@davemloft.net
Cc: cooloney@kernel.org
Cc: kyle@mcmartin.ca
Cc: matthew@wil.cx
Cc: grundler@parisc-linux.org
Cc: takata@linux-m32r.org
Cc: benh@kernel.crashing.org
Cc: rth@twiddle.net
Cc: ink@jurassic.park.msu.ru
Cc: heiko.carstens@de.ibm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Christoph Lameter <cl@linux.com>
LKML-Reference: <49D43CEA.3040609@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/percpu.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 1aa5d8fbca121..bf1bf1f4a7299 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -110,9 +110,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* optional reserved chunk, only accessible for reserved allocations */
+/*
+ * The first chunk which always exists.  Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk.  This chunk reserves part of the first
+ * chunk and serves it for reserved allocations.  The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
 static struct pcpu_chunk *pcpu_reserved_chunk;
-/* offset limit of the reserved chunk */
 static int pcpu_reserved_chunk_limit;
 
 /*
@@ -297,15 +309,16 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  */
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
+	void *first_start = pcpu_first_chunk->vm->addr;
 	struct rb_node *n, *parent;
 	struct pcpu_chunk *chunk;
 
-	/* is it in the reserved chunk? */
-	if (pcpu_reserved_chunk) {
-		void *start = pcpu_reserved_chunk->vm->addr;
-
-		if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+	/* is it in the first chunk? */
+	if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+		/* is it in the reserved area? */
+		if (addr < first_start + pcpu_reserved_chunk_limit)
 			return pcpu_reserved_chunk;
+		return pcpu_first_chunk;
 	}
 
 	/* nah... search the regular ones */
@@ -1147,7 +1160,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
 	if (reserved_size) {
 		schunk->free_size = reserved_size;
-		pcpu_reserved_chunk = schunk;	/* not for dynamic alloc */
+		pcpu_reserved_chunk = schunk;
+		pcpu_reserved_chunk_limit = static_size + reserved_size;
 	} else {
 		schunk->free_size = dyn_size;
 		dyn_size = 0;			/* dynamic area covered */
@@ -1158,8 +1172,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
-	pcpu_reserved_chunk_limit = static_size + schunk->free_size;
-
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
 		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1238,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	}
 
 	/* link the first chunk in */
-	if (!dchunk) {
-		pcpu_chunk_relocate(schunk, -1);
-		pcpu_chunk_addr_insert(schunk);
-	} else {
-		pcpu_chunk_relocate(dchunk, -1);
-		pcpu_chunk_addr_insert(dchunk);
-	}
+	pcpu_first_chunk = dchunk ?: schunk;
+	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
 	/* we're done */
 	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
-- 
GitLab


From e1b9aa3f47242e757c776a3771bb6613e675bf9c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 2 Apr 2009 13:21:44 +0900
Subject: [PATCH 0308/2198] percpu: remove rbtree and use page->index instead

Impact: use page->index for addr to chunk mapping instead of dedicated rbtree

The rbtree is used to determine the chunk from the virtual address.
However, we can already determine the page struct from a virtual
address and there are several unused fields in page struct used by
vmalloc.  Use the index field to store a pointer to the chunk. Then
there is no need anymore for an rbtree.

tj: * s/(set|get)_chunk/pcpu_\1_page_chunk/

    * Drop inline from the above two functions and moved them upwards
      so that they are with other simple helpers.

    * Initial pages might not (actually most of the time don't) live
      in the vmalloc area.  With the previous patch to manually
      reverse-map both first chunks, this is no longer an issue.
      Removed pcpu_set_chunk() call on initial pages.

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: rusty@rustcorp.com.au
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: rmk@arm.linux.org.uk
Cc: starvik@axis.com
Cc: ralf@linux-mips.org
Cc: davem@davemloft.net
Cc: cooloney@kernel.org
Cc: kyle@mcmartin.ca
Cc: matthew@wil.cx
Cc: grundler@parisc-linux.org
Cc: takata@linux-m32r.org
Cc: benh@kernel.crashing.org
Cc: rth@twiddle.net
Cc: ink@jurassic.park.msu.ru
Cc: heiko.carstens@de.ibm.com
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
LKML-Reference: <49D43D58.4050102@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/percpu.c | 100 +++++++++++-----------------------------------------
 1 file changed, 20 insertions(+), 80 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index bf1bf1f4a7299..c0b2c1a76e81c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -23,7 +23,7 @@
  * Allocation is done in offset-size areas of single unit space.  Ie,
  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
- * percpu base registers UNIT_SIZE apart.
+ * percpu base registers pcpu_unit_size apart.
  *
  * There are usually many small percpu allocations many of them as
  * small as 4 bytes.  The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
  * region and negative allocated.  Allocation inside a chunk is done
  * by scanning this map sequentially and serving the first matching
  * entry.  This is mostly copied from the percpu_modalloc() allocator.
- * Chunks are also linked into a rb tree to ease address to chunk
- * mapping during free.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
  *
  * To use this allocator, arch code should do the followings.
  *
@@ -61,7 +61,6 @@
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/pfn.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
 
 struct pcpu_chunk {
 	struct list_head	list;		/* linked to pcpu_slot lists */
-	struct rb_node		rb_node;	/* key is chunk->vm->addr */
 	int			free_size;	/* free bytes in the chunk */
 	int			contig_hint;	/* max contiguous size hint */
 	struct vm_struct	*vm;		/* mapped vmalloc region */
@@ -133,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
  * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
  * protects allocation/reclaim paths, chunks and chunk->page arrays.
  * The latter is a spinlock and protects the index data structures -
- * chunk slots, rbtree, chunks and area maps in chunks.
+ * chunk slots, chunks and area maps in chunks.
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
@@ -152,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
 static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
-static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
 
 /* reclaim work to release fully free chunks, scheduled from free path */
 static void pcpu_reclaim(struct work_struct *work);
@@ -203,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 }
 
+/* set the pointer to a chunk in a page struct */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+	page->index = (unsigned long)pcpu;
+}
+
+/* obtain pointer to a chunk from a page struct */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+	return (struct pcpu_chunk *)page->index;
+}
+
 /**
  * pcpu_mem_alloc - allocate memory
  * @size: bytes to allocate
@@ -269,40 +278,9 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 	}
 }
 
-static struct rb_node **pcpu_chunk_rb_search(void *addr,
-					     struct rb_node **parentp)
-{
-	struct rb_node **p = &pcpu_addr_root.rb_node;
-	struct rb_node *parent = NULL;
-	struct pcpu_chunk *chunk;
-
-	while (*p) {
-		parent = *p;
-		chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
-
-		if (addr < chunk->vm->addr)
-			p = &(*p)->rb_left;
-		else if (addr > chunk->vm->addr)
-			p = &(*p)->rb_right;
-		else
-			break;
-	}
-
-	if (parentp)
-		*parentp = parent;
-	return p;
-}
-
 /**
- * pcpu_chunk_addr_search - search for chunk containing specified address
- * @addr: address to search for
- *
- * Look for chunk which might contain @addr.  More specifically, it
- * searchs for the chunk with the highest start address which isn't
- * beyond @addr.
- *
- * CONTEXT:
- * pcpu_lock.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
  *
  * RETURNS:
  * The address of the found chunk.
@@ -310,8 +288,6 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
 	void *first_start = pcpu_first_chunk->vm->addr;
-	struct rb_node *n, *parent;
-	struct pcpu_chunk *chunk;
 
 	/* is it in the first chunk? */
 	if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
@@ -321,42 +297,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 		return pcpu_first_chunk;
 	}
 
-	/* nah... search the regular ones */
-	n = *pcpu_chunk_rb_search(addr, &parent);
-	if (!n) {
-		/* no exactly matching chunk, the parent is the closest */
-		n = parent;
-		BUG_ON(!n);
-	}
-	chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-
-	if (addr < chunk->vm->addr) {
-		/* the parent was the next one, look for the previous one */
-		n = rb_prev(n);
-		BUG_ON(!n);
-		chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-	}
-
-	return chunk;
-}
-
-/**
- * pcpu_chunk_addr_insert - insert chunk into address rb tree
- * @new: chunk to insert
- *
- * Insert @new into address rb tree.
- *
- * CONTEXT:
- * pcpu_lock.
- */
-static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
-{
-	struct rb_node **p, *parent;
-
-	p = pcpu_chunk_rb_search(new->vm->addr, &parent);
-	BUG_ON(*p);
-	rb_link_node(&new->rb_node, parent, p);
-	rb_insert_color(&new->rb_node, &pcpu_addr_root);
+	return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
 /**
@@ -768,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 						  alloc_mask, 0);
 			if (!*pagep)
 				goto err;
+			pcpu_set_page_chunk(*pagep, chunk);
 		}
 	}
 
@@ -892,7 +834,6 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 
 	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
-	pcpu_chunk_addr_insert(chunk);
 	goto restart;
 
 area_found:
@@ -981,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
 		if (chunk == list_first_entry(head, struct pcpu_chunk, list))
 			continue;
 
-		rb_erase(&chunk->rb_node, &pcpu_addr_root);
 		list_move(&chunk->list, &todo);
 	}
 
-- 
GitLab


From e30e08f65c7ef6c230424264f09c3d53f117f58b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:25 +0200
Subject: [PATCH 0309/2198] perf_counter: fix NMI race in task clock

We should not be updating ctx->time from NMI context, work around that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.681326666@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 863703b3158f8..84a39081344c8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -319,8 +319,6 @@ static void __perf_counter_disable(void *info)
 
 	spin_lock_irqsave(&ctx->lock, flags);
 
-	update_context_time(ctx);
-
 	/*
 	 * If the counter is on, turn it off.
 	 * If it is in error state, leave it in error state.
@@ -2335,13 +2333,11 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
  * Software counter: task time clock
  */
 
-static void task_clock_perf_counter_update(struct perf_counter *counter)
+static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now)
 {
-	u64 prev, now;
+	u64 prev;
 	s64 delta;
 
-	now = counter->ctx->time;
-
 	prev = atomic64_xchg(&counter->hw.prev_count, now);
 	delta = now - prev;
 	atomic64_add(delta, &counter->count);
@@ -2369,13 +2365,24 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
 	hrtimer_cancel(&counter->hw.hrtimer);
-	task_clock_perf_counter_update(counter);
+	task_clock_perf_counter_update(counter, counter->ctx->time);
+
 }
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-	update_context_time(counter->ctx);
-	task_clock_perf_counter_update(counter);
+	u64 time;
+
+	if (!in_nmi()) {
+		update_context_time(counter->ctx);
+		time = counter->ctx->time;
+	} else {
+		u64 now = perf_clock();
+		u64 delta = now - counter->ctx->timestamp;
+		time = counter->ctx->time + delta;
+	}
+
+	task_clock_perf_counter_update(counter, time);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {
-- 
GitLab


From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:26 +0200
Subject: [PATCH 0310/2198] perf_counter: provide misc bits in the event header

Limit the size of each record to 64k (or should we count in multiples
of u64 and have a 512K limit?), this gives 16 bits or spare room in the
header, which we can use for misc bits, so as to not have to grow the
record with u64 every time we have a few bits to report.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.769271806@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 6 +++++-
 kernel/perf_counter.c        | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 7f5d353d78ac5..5bd8817b12d4a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -201,9 +201,13 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
+#define PERF_EVENT_MISC_KERNEL	(1 << 0)
+#define PERF_EVENT_MISC_USER	(1 << 1)
+
 struct perf_event_header {
 	__u32	type;
-	__u32	size;
+	__u16	misc;
+	__u16	size;
 };
 
 enum perf_event_type {
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 84a39081344c8..4af98f943d3b5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1831,6 +1831,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.type = PERF_EVENT_COUNTER_OVERFLOW;
 	header.size = sizeof(header);
 
+	header.misc = user_mode(regs) ?
+		PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+
 	if (record_type & PERF_RECORD_IP) {
 		ip = instruction_pointer(regs);
 		header.type |= __PERF_EVENT_IP;
-- 
GitLab


From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:27 +0200
Subject: [PATCH 0311/2198] perf_counter: use misc field to widen type

Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that
we can have the full 32bit for PERF_RECORD_ bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.891867663@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 28 ++++++++++------------------
 kernel/perf_counter.c        | 15 ++++++++-------
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5bd8817b12d4a..4809ae18a9401 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -201,8 +201,9 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
-#define PERF_EVENT_MISC_KERNEL	(1 << 0)
-#define PERF_EVENT_MISC_USER	(1 << 1)
+#define PERF_EVENT_MISC_KERNEL		(1 << 0)
+#define PERF_EVENT_MISC_USER		(1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW	(1 << 2)
 
 struct perf_event_header {
 	__u32	type;
@@ -230,36 +231,27 @@ enum perf_event_type {
 	PERF_EVENT_MUNMAP		= 2,
 
 	/*
-	 * Half the event type space is reserved for the counter overflow
-	 * bitfields, as found in hw_event.record_type.
-	 *
-	 * These events will have types of the form:
-	 *   PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } *
+	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+	 * will be PERF_RECORD_*
 	 *
 	 * struct {
 	 * 	struct perf_event_header	header;
 	 *
-	 * 	{ u64			ip;	  } && __PERF_EVENT_IP
-	 * 	{ u32			pid, tid; } && __PERF_EVENT_TID
+	 * 	{ u64			ip;	  } && PERF_RECORD_IP
+	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 *
 	 * 	{ u64			nr;
-	 * 	  { u64 event, val; } 	cnt[nr];  } && __PERF_EVENT_GROUP
+	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
 	 *
 	 * 	{ u16			nr,
 	 * 				hv,
 	 * 				kernel,
 	 * 				user;
-	 * 	  u64			ips[nr];  } && __PERF_EVENT_CALLCHAIN
+	 * 	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
 	 *
-	 * 	{ u64			time;     } && __PERF_EVENT_TIME
+	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * };
 	 */
-	PERF_EVENT_COUNTER_OVERFLOW	= 1UL << 31,
-	__PERF_EVENT_IP			= PERF_RECORD_IP,
-	__PERF_EVENT_TID		= PERF_RECORD_TID,
-	__PERF_EVENT_GROUP		= PERF_RECORD_GROUP,
-	__PERF_EVENT_CALLCHAIN		= PERF_RECORD_CALLCHAIN,
-	__PERF_EVENT_TIME		= PERF_RECORD_TIME,
 };
 
 #ifdef __KERNEL__
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4af98f943d3b5..bf12df6f3538c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1828,15 +1828,16 @@ static void perf_counter_output(struct perf_counter *counter,
 	int callchain_size = 0;
 	u64 time;
 
-	header.type = PERF_EVENT_COUNTER_OVERFLOW;
+	header.type = 0;
 	header.size = sizeof(header);
 
-	header.misc = user_mode(regs) ?
+	header.misc = PERF_EVENT_MISC_OVERFLOW;
+	header.misc |= user_mode(regs) ?
 		PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
 
 	if (record_type & PERF_RECORD_IP) {
 		ip = instruction_pointer(regs);
-		header.type |= __PERF_EVENT_IP;
+		header.type |= PERF_RECORD_IP;
 		header.size += sizeof(ip);
 	}
 
@@ -1845,12 +1846,12 @@ static void perf_counter_output(struct perf_counter *counter,
 		tid_entry.pid = current->group_leader->pid;
 		tid_entry.tid = current->pid;
 
-		header.type |= __PERF_EVENT_TID;
+		header.type |= PERF_RECORD_TID;
 		header.size += sizeof(tid_entry);
 	}
 
 	if (record_type & PERF_RECORD_GROUP) {
-		header.type |= __PERF_EVENT_GROUP;
+		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
 			counter->nr_siblings * sizeof(group_entry);
 	}
@@ -1861,7 +1862,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
 
-			header.type |= __PERF_EVENT_CALLCHAIN;
+			header.type |= PERF_RECORD_CALLCHAIN;
 			header.size += callchain_size;
 		}
 	}
@@ -1872,7 +1873,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		 */
 		time = sched_clock();
 
-		header.type |= __PERF_EVENT_TIME;
+		header.type |= PERF_RECORD_TIME;
 		header.size += sizeof(u64);
 	}
 
-- 
GitLab


From 808382b33bb4c60df6379ec2db39f332cc56b82a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:28 +0200
Subject: [PATCH 0312/2198] perf_counter: kerneltop: keep up with ABI changes

Update kerneltop to use PERF_EVENT_MISC_OVERFLOW

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130408.947197470@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/kerneltop.c | 32 +++++++++++++-------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
index 15f3a5f901989..042c1b83a8726 100644
--- a/Documentation/perf_counter/kerneltop.c
+++ b/Documentation/perf_counter/kerneltop.c
@@ -1277,22 +1277,22 @@ static void mmap_read(struct mmap_data *md)
 
 		old += size;
 
-		switch (event->header.type) {
-		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP:
-		case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID:
-			process_event(event->ip.ip, md->counter);
-			break;
-
-		case PERF_EVENT_MMAP:
-		case PERF_EVENT_MUNMAP:
-			printf("%s: %Lu %Lu %Lu %s\n",
-					event->header.type == PERF_EVENT_MMAP
-					  ? "mmap" : "munmap",
-					event->mmap.start,
-					event->mmap.len,
-					event->mmap.pgoff,
-					event->mmap.filename);
-			break;
+		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+			if (event->header.type & PERF_RECORD_IP)
+				process_event(event->ip.ip, md->counter);
+		} else {
+			switch (event->header.type) {
+				case PERF_EVENT_MMAP:
+				case PERF_EVENT_MUNMAP:
+					printf("%s: %Lu %Lu %Lu %s\n",
+							event->header.type == PERF_EVENT_MMAP
+							? "mmap" : "munmap",
+							event->mmap.start,
+							event->mmap.len,
+							event->mmap.pgoff,
+							event->mmap.filename);
+					break;
+			}
 		}
 	}
 
-- 
GitLab


From 8740f9418c78dcad694b46ab25d1645d5aef1f5e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:29 +0200
Subject: [PATCH 0313/2198] perf_counter: add some comments

Add a few comments because I was forgetting what field what for what
functionality.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.036984214@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4809ae18a9401..8bf764fc6220a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -344,10 +344,12 @@ struct file;
 
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
-	int				nr_pages;
-	atomic_t			wakeup;
-	atomic_t			head;
-	atomic_t			events;
+	int				nr_pages;	/* nr of data pages  */
+
+	atomic_t			wakeup;		/* POLL_ for wakeups */
+	atomic_t			head;		/* write position    */
+	atomic_t			events;		/* event limit       */
+
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
-- 
GitLab


From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:30 +0200
Subject: [PATCH 0314/2198] perf_counter: track task-comm data

Similar to the mmap data stream, add one that tracks the task COMM field,
so that the userspace reporting knows what to call a task.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.127422406@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 fs/exec.c                    |  1 +
 include/linux/perf_counter.h | 16 ++++++-
 kernel/perf_counter.c        | 93 ++++++++++++++++++++++++++++++++++++
 3 files changed, 109 insertions(+), 1 deletion(-)

diff --git a/fs/exec.c b/fs/exec.c
index e015c0b5a0826..bf47ed0278ffb 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)
 	task_lock(tsk);
 	strlcpy(tsk->comm, buf, sizeof(tsk->comm));
 	task_unlock(tsk);
+	perf_counter_comm(tsk);
 }
 
 int flush_old_exec(struct linux_binprm * bprm)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8bf764fc6220a..a70a55f27598d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -142,8 +142,9 @@ struct perf_counter_hw_event {
 				exclude_idle   :  1, /* don't count when idle */
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
+				comm	       :  1, /* include comm data     */
 
-				__reserved_1   : 53;
+				__reserved_1   : 52;
 
 	__u32			extra_config_len;
 	__u32			wakeup_events;	/* wakeup every n events */
@@ -230,6 +231,16 @@ enum perf_event_type {
 	PERF_EVENT_MMAP			= 1,
 	PERF_EVENT_MUNMAP		= 2,
 
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 *
+	 * 	u32				pid, tid;
+	 * 	char				comm[];
+	 * };
+	 */
+	PERF_EVENT_COMM			= 3,
+
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
@@ -545,6 +556,8 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
+extern void perf_counter_comm(struct task_struct *tsk);
+
 #define MAX_STACK_DEPTH		255
 
 struct perf_callchain_entry {
@@ -583,6 +596,7 @@ static inline void
 perf_counter_munmap(unsigned long addr, unsigned long len,
 		    unsigned long pgoff, struct file *file) 		{ }
 
+static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index bf12df6f3538c..2d4aebb2982bb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1916,6 +1916,99 @@ static void perf_counter_output(struct perf_counter *counter,
 	perf_output_end(&handle);
 }
 
+/*
+ * comm tracking
+ */
+
+struct perf_comm_event {
+	struct task_struct 	*task;
+	char 			*comm;
+	int			comm_size;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				tid;
+	} event;
+};
+
+static void perf_counter_comm_output(struct perf_counter *counter,
+				     struct perf_comm_event *comm_event)
+{
+	struct perf_output_handle handle;
+	int size = comm_event->event.header.size;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	perf_output_put(&handle, comm_event->event);
+	perf_output_copy(&handle, comm_event->comm,
+				   comm_event->comm_size);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_comm_match(struct perf_counter *counter,
+				   struct perf_comm_event *comm_event)
+{
+	if (counter->hw_event.comm &&
+	    comm_event->event.header.type == PERF_EVENT_COMM)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
+				  struct perf_comm_event *comm_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_comm_match(counter, comm_event))
+			perf_counter_comm_output(counter, comm_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_comm_event(struct perf_comm_event *comm_event)
+{
+	struct perf_cpu_context *cpuctx;
+	unsigned int size;
+	char *comm = comm_event->task->comm;
+
+	size = ALIGN(strlen(comm), sizeof(u64));
+
+	comm_event->comm = comm;
+	comm_event->comm_size = size;
+
+	comm_event->event.header.size = sizeof(comm_event->event) + size;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+	put_cpu_var(perf_cpu_context);
+
+	perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
+}
+
+void perf_counter_comm(struct task_struct *task)
+{
+	struct perf_comm_event comm_event = {
+		.task	= task,
+		.event  = {
+			.header = { .type = PERF_EVENT_COMM, },
+			.pid	= task->group_leader->pid,
+			.tid	= task->pid,
+		},
+	};
+
+	perf_counter_comm_event(&comm_event);
+}
+
 /*
  * mmap tracking
  */
-- 
GitLab


From de9ac07bbf8f51e0ce40e5428c3a8f627bd237c2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:31 +0200
Subject: [PATCH 0315/2198] perf_counter: some simple userspace profiling

# perf-record make -j4 kernel/
 # perf-report | tail -15

  0.39              cc1 [kernel] lock_acquired
  0.42              cc1 [kernel] lock_acquire
  0.51              cc1 [ user ] /lib64/libc-2.8.90.so: _int_free
  0.51               as [kernel] clear_page_c
  0.53              cc1 [ user ] /lib64/libc-2.8.90.so: memcpy
  0.56              cc1 [ user ] /lib64/libc-2.8.90.so: _IO_vfprintf
  0.63              cc1 [kernel] lock_release
  0.67              cc1 [ user ] /lib64/libc-2.8.90.so: strlen
  0.68              cc1 [kernel] debug_smp_processor_id
  1.38              cc1 [ user ] /lib64/libc-2.8.90.so: _int_malloc
  1.55              cc1 [ user ] /lib64/libc-2.8.90.so: memset
  1.77              cc1 [kernel] __lock_acquire
  1.88              cc1 [kernel] clear_page_c
  3.61               as [ user ] /usr/bin/as: <unknown>
 59.16              cc1 [ user ] /usr/libexec/gcc/x86_64-redhat-linux/4.3.2/cc1: <unknown>

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
LKML-Reference: <20090408130409.220518450@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile       |   8 +-
 Documentation/perf_counter/perf-record.c  | 530 ++++++++++++++++++++++
 Documentation/perf_counter/perf-report.cc | 472 +++++++++++++++++++
 3 files changed, 1009 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/perf_counter/perf-record.c
 create mode 100644 Documentation/perf_counter/perf-report.cc

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 194b662155886..1dd37ee7dbdc3 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -1,10 +1,16 @@
-BINS = kerneltop perfstat
+BINS = kerneltop perfstat perf-record perf-report
 
 all: $(BINS)
 
 kerneltop: kerneltop.c ../../include/linux/perf_counter.h
 	cc -O6 -Wall -lrt -o $@ $<
 
+perf-record: perf-record.c ../../include/linux/perf_counter.h
+	cc -O6 -Wall -lrt -o $@ $<
+
+perf-report: perf-report.cc ../../include/linux/perf_counter.h
+	g++ -O6 -Wall -lrt -o $@ $<
+
 perfstat: kerneltop
 	ln -sf kerneltop perfstat
 
diff --git a/Documentation/perf_counter/perf-record.c b/Documentation/perf_counter/perf-record.c
new file mode 100644
index 0000000000000..614de7c468b21
--- /dev/null
+++ b/Documentation/perf_counter/perf-record.c
@@ -0,0 +1,530 @@
+
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags)
+{
+        return syscall(
+                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+}
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= { };
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			nr_cpus				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			= 16;
+static int			output;
+static char 			*output_name			= "output.perf";
+static int			group				= 0;
+static unsigned int		realtime_prio			=  0;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-record [<options>]\n"
+	"perf-record Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -c CNT    --count=CNT          # event period to sample\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -o file   --output=<file>      # output file\n"
+	" -r prio   --realtime=<prio>    # use RT prio\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"output",	required_argument,	NULL, 'o'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'e': error				= parse_events(optarg); break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'o': output_name			= strdup(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+static long events;
+static struct timeval last_read, this_read;
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	unsigned long size;
+	void *buf;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	if (old != head)
+		events++;
+
+	size = head - old;
+
+	if ((old & md->mask) + size != (head & md->mask)) {
+		buf = &data[old & md->mask];
+		size = md->mask + 1 - (old & md->mask);
+		old += size;
+		while (size) {
+			int ret = write(output, buf, size);
+			if (ret < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+			size -= ret;
+			buf += ret;
+		}
+	}
+
+	buf = &data[old & md->mask];
+	size = head - old;
+	old += size;
+	while (size) {
+		int ret = write(output, buf, size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		size -= ret;
+		buf += ret;
+	}
+
+	md->prev = old;
+}
+
+static volatile int done = 0;
+
+static void sigchld_handler(int sig)
+{
+	if (sig == SIGCHLD)
+		done = 1;
+}
+
+int main(int argc, char *argv[])
+{
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	int i, counter, group_fd, nr_poll = 0;
+	pid_t pid;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
+	if (output < 0) {
+		perror("failed to create output file");
+		exit(-1);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.config		= event_id[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+			hw_event.nmi		= 1;
+			hw_event.mmap		= 1;
+			hw_event.comm		= 1;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
+			if (fd[i][counter] < 0) {
+				int err = errno;
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	signal(SIGCHLD, sigchld_handler);
+
+	pid = fork();
+	if (pid < 0)
+		perror("failed to fork");
+
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	/*
+	 * TODO: store the current /proc/$/maps information somewhere
+	 */
+
+	while (!done) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == events)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
new file mode 100644
index 0000000000000..09da0ba482cde
--- /dev/null
+++ b/Documentation/perf_counter/perf-report.cc
@@ -0,0 +1,472 @@
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <getopt.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+#include <set>
+#include <map>
+#include <string>
+
+
+static char 		const *input_name = "output.perf";
+static int		input;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid,tid;
+	char comm[16];
+};
+
+typedef union event_union {
+	struct perf_event_header header;
+	struct ip_event ip;
+	struct mmap_event mmap;
+	struct comm_event comm;
+} event_t;
+
+struct section {
+	uint64_t start;
+	uint64_t end;
+
+	uint64_t offset;
+
+	std::string name;
+
+	section() { };
+
+	section(uint64_t stab) : end(stab) { };
+
+	section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
+		start(start), end(start + size), offset(offset), name(name)
+	{ };
+
+	bool operator < (const struct section &s) const {
+		return end < s.end;
+	};
+};
+
+typedef std::set<struct section> sections_t;
+
+struct symbol {
+	uint64_t start;
+	uint64_t end;
+
+	std::string name;
+
+	symbol() { };
+
+	symbol(uint64_t ip) : start(ip) { }
+
+	symbol(uint64_t start, uint64_t len, std::string name) :
+		start(start), end(start + len), name(name)
+	{ };
+
+	bool operator < (const struct symbol &s) const {
+		return start < s.start;
+	};
+};
+
+typedef std::set<struct symbol> symbols_t;
+
+struct dso {
+	sections_t sections;
+	symbols_t syms;
+};
+
+static std::map<std::string, struct dso> dsos;
+
+static void load_dso_sections(std::string dso_name)
+{
+	struct dso &dso = dsos[dso_name];
+
+	std::string cmd = "readelf -DSW " + dso_name;
+
+	FILE *file = popen(cmd.c_str(), "r");
+	if (!file) {
+		perror("failed to open pipe");
+		exit(-1);
+	}
+
+	char *line = NULL;
+	size_t n = 0;
+
+	while (!feof(file)) {
+		uint64_t addr, off, size;
+		char name[32];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+		if (!line)
+			break;
+
+		if (sscanf(line, "  [%*2d] %16s %*14s %Lx %Lx %Lx",
+					name, &addr, &off, &size) == 4) {
+
+			dso.sections.insert(section(addr, size, addr - off, name));
+		}
+#if 0
+		/*
+		 * for reading readelf symbols (-s), however these don't seem
+		 * to include nearly everything, so use nm for that.
+		 */
+		if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
+			   &start, &size, &section, sym) == 4) {
+
+			start -= dso.section_offsets[section];
+
+			dso.syms.insert(symbol(start, size, std::string(sym)));
+		}
+#endif
+	}
+	pclose(file);
+}
+
+static void load_dso_symbols(std::string dso_name, std::string args)
+{
+	struct dso &dso = dsos[dso_name];
+
+	std::string cmd = "nm -nSC " + args + " " + dso_name;
+
+	FILE *file = popen(cmd.c_str(), "r");
+	if (!file) {
+		perror("failed to open pipe");
+		exit(-1);
+	}
+
+	char *line = NULL;
+	size_t n = 0;
+
+	while (!feof(file)) {
+		uint64_t start, size;
+		char c;
+		char sym[1024];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+		if (!line)
+			break;
+
+
+		if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
+			sections_t::const_iterator si =
+				dso.sections.upper_bound(section(start));
+			if (si == dso.sections.end()) {
+				printf("symbol in unknown section: %s\n", sym);
+				continue;
+			}
+
+			start -= si->offset;
+
+			dso.syms.insert(symbol(start, size, sym));
+		}
+	}
+	pclose(file);
+}
+
+static void load_dso(std::string dso_name)
+{
+	load_dso_sections(dso_name);
+	load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
+	load_dso_symbols(dso_name, "");   /* regular ones */
+}
+
+void load_kallsyms(void)
+{
+	struct dso &dso = dsos["[kernel]"];
+
+	FILE *file = fopen("/proc/kallsyms", "r");
+	if (!file) {
+		perror("failed to open kallsyms");
+		exit(-1);
+	}
+
+	char *line;
+	size_t n;
+
+	while (!feof(file)) {
+		uint64_t start;
+		char c;
+		char sym[1024];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+		if (!line)
+			break;
+
+		if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
+			dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
+	}
+	fclose(file);
+}
+
+struct map {
+	uint64_t start;
+	uint64_t end;
+	uint64_t pgoff;
+
+	std::string dso;
+
+	map() { };
+
+	map(uint64_t ip) : end(ip) { }
+
+	map(mmap_event *mmap) {
+		start = mmap->start;
+		end = mmap->start + mmap->len;
+		pgoff = mmap->pgoff;
+
+		dso = std::string(mmap->filename);
+
+		if (dsos.find(dso) == dsos.end())
+			load_dso(dso);
+	};
+
+	bool operator < (const struct map &m) const {
+		return end < m.end;
+	};
+};
+
+typedef std::set<struct map> maps_t;
+
+static std::map<int, maps_t> maps;
+
+static std::map<int, std::string> comms;
+
+static std::map<std::string, int> hist;
+static std::multimap<int, std::string> rev_hist;
+
+static std::string resolve_comm(int pid)
+{
+	std::string comm = "<unknown>";
+	std::map<int, std::string>::const_iterator ci = comms.find(pid);
+	if (ci != comms.end())
+		comm = ci->second;
+
+	return comm;
+}
+
+static std::string resolve_user_symbol(int pid, uint64_t ip)
+{
+	std::string sym = "<unknown>";
+
+	maps_t &m = maps[pid];
+	maps_t::const_iterator mi = m.upper_bound(map(ip));
+	if (mi == m.end())
+		return sym;
+
+	ip -= mi->start + mi->pgoff;
+
+	symbols_t &s = dsos[mi->dso].syms;
+	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
+
+	sym = mi->dso + ": <unknown>";
+
+	if (si == s.begin())
+		return sym;
+	si--;
+
+	if (si->start <= ip && ip < si->end)
+		sym = mi->dso + ": " + si->name;
+#if 0
+	else if (si->start <= ip)
+		sym = mi->dso + ": ?" + si->name;
+#endif
+
+	return sym;
+}
+
+static std::string resolve_kernel_symbol(uint64_t ip)
+{
+	std::string sym = "<unknown>";
+
+	symbols_t &s = dsos["[kernel]"].syms;
+	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
+
+	if (si == s.begin())
+		return sym;
+	si--;
+
+	if (si->start <= ip && ip < si->end)
+		sym = si->name;
+
+	return sym;
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-report [<options>]\n"
+	" -i file   --input=<file>      # input file\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"input",	required_argument,	NULL, 'i'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:i:",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'i': input_name			= strdup(optarg); break;
+		default: error = 1; break;
+		}
+	}
+
+	if (error)
+		display_help();
+}
+
+int main(int argc, char *argv[])
+{
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	char *buf;
+	event_t *event;
+	int ret;
+	unsigned long total = 0;
+
+	page_size = getpagesize();
+
+	process_options(argc, argv);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	load_kallsyms();
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+
+		munmap(buf, page_size * mmap_window);
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+	head += event->header.size;
+
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+		std::string comm, sym, level;
+		char output[1024];
+
+		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+			level = "[kernel]";
+			sym = resolve_kernel_symbol(event->ip.ip);
+		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			level = "[ user ]";
+			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
+		} else {
+			level = "[  hv  ]";
+		}
+		comm = resolve_comm(event->ip.pid);
+
+		snprintf(output, sizeof(output), "%16s %s %s",
+				comm.c_str(), level.c_str(), sym.c_str());
+		hist[output]++;
+
+		total++;
+
+	} else switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		maps[event->mmap.pid].insert(map(&event->mmap));
+		break;
+
+	case PERF_EVENT_COMM:
+		comms[event->comm.pid] = std::string(event->comm.comm);
+		break;
+	}
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	close(input);
+
+	std::map<std::string, int>::iterator hi = hist.begin();
+
+	while (hi != hist.end()) {
+		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
+		hist.erase(hi++);
+	}
+
+	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
+
+	while (ri != rev_hist.end()) {
+		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
+		ri++;
+	}
+
+	return 0;
+}
+
-- 
GitLab


From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:32 +0200
Subject: [PATCH 0316/2198] perf_counter: move PERF_RECORD_TIME

Move PERF_RECORD_TIME so that all the fixed length items come before
the variable length ones.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.307926436@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  9 ++++-----
 kernel/perf_counter.c        | 26 +++++++++++++-------------
 2 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a70a55f27598d..8bd1be58c938c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -100,9 +100,9 @@ enum sw_event_ids {
 enum perf_counter_record_format {
 	PERF_RECORD_IP		= 1U << 0,
 	PERF_RECORD_TID		= 1U << 1,
-	PERF_RECORD_GROUP	= 1U << 2,
-	PERF_RECORD_CALLCHAIN	= 1U << 3,
-	PERF_RECORD_TIME	= 1U << 4,
+	PERF_RECORD_TIME	= 1U << 2,
+	PERF_RECORD_GROUP	= 1U << 3,
+	PERF_RECORD_CALLCHAIN	= 1U << 4,
 };
 
 /*
@@ -250,6 +250,7 @@ enum perf_event_type {
 	 *
 	 * 	{ u64			ip;	  } && PERF_RECORD_IP
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
+	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
@@ -259,8 +260,6 @@ enum perf_event_type {
 	 * 				kernel,
 	 * 				user;
 	 * 	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
-	 *
-	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * };
 	 */
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2d4aebb2982bb..4dc8600d2825a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1850,6 +1850,16 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(tid_entry);
 	}
 
+	if (record_type & PERF_RECORD_TIME) {
+		/*
+		 * Maybe do better on x86 and provide cpu_clock_nmi()
+		 */
+		time = sched_clock();
+
+		header.type |= PERF_RECORD_TIME;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -1867,16 +1877,6 @@ static void perf_counter_output(struct perf_counter *counter,
 		}
 	}
 
-	if (record_type & PERF_RECORD_TIME) {
-		/*
-		 * Maybe do better on x86 and provide cpu_clock_nmi()
-		 */
-		time = sched_clock();
-
-		header.type |= PERF_RECORD_TIME;
-		header.size += sizeof(u64);
-	}
-
 	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
 	if (ret)
 		return;
@@ -1889,6 +1889,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_TID)
 		perf_output_put(&handle, tid_entry);
 
+	if (record_type & PERF_RECORD_TIME)
+		perf_output_put(&handle, time);
+
 	if (record_type & PERF_RECORD_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
@@ -1910,9 +1913,6 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (callchain)
 		perf_output_copy(&handle, callchain, callchain_size);
 
-	if (record_type & PERF_RECORD_TIME)
-		perf_output_put(&handle, time);
-
 	perf_output_end(&handle);
 }
 
-- 
GitLab


From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Apr 2009 15:01:33 +0200
Subject: [PATCH 0317/2198] perf_counter: allow for data addresses to be
 recorded

Paul suggested we allow for data addresses to be recorded along with
the traditional IPs as power can provide these.

For now, only the software pagefault events provide data addresses,
but in the future power might as well for some events.

x86 doesn't seem capable of providing this atm.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090408130409.394816925@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  2 +-
 arch/powerpc/mm/fault.c            |  8 ++++--
 arch/x86/kernel/cpu/perf_counter.c |  2 +-
 arch/x86/mm/fault.c                |  8 ++++--
 include/linux/perf_counter.h       | 14 +++++----
 kernel/perf_counter.c              | 46 +++++++++++++++++++-----------
 6 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 0697ade84dd3a..c9d019f190742 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_overflow(counter, 1, regs);
+		perf_counter_overflow(counter, 1, regs, 0);
 }
 
 /*
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 17bbf6f91fbe6..ac0e112031b29 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
 			preempt_disable();
@@ -322,7 +323,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1116a41bc7b5d..0fcbaab83f9bf 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -800,7 +800,7 @@ static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 			continue;
 
 		perf_save_and_restart(counter);
-		if (perf_counter_overflow(counter, nmi, regs))
+		if (perf_counter_overflow(counter, nmi, regs, 0))
 			__pmc_generic_disable(counter, &counter->hw, bit);
 	}
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f2d3324d92152..6f9df2babe487 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs);
+	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,10 +1142,12 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs);
+		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+				     regs, address);
 	}
 
 	check_v8086_mode(regs, address, tsk);
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8bd1be58c938c..c22363a4f7466 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -101,8 +101,9 @@ enum perf_counter_record_format {
 	PERF_RECORD_IP		= 1U << 0,
 	PERF_RECORD_TID		= 1U << 1,
 	PERF_RECORD_TIME	= 1U << 2,
-	PERF_RECORD_GROUP	= 1U << 3,
-	PERF_RECORD_CALLCHAIN	= 1U << 4,
+	PERF_RECORD_ADDR	= 1U << 3,
+	PERF_RECORD_GROUP	= 1U << 4,
+	PERF_RECORD_CALLCHAIN	= 1U << 5,
 };
 
 /*
@@ -251,6 +252,7 @@ enum perf_event_type {
 	 * 	{ u64			ip;	  } && PERF_RECORD_IP
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
+	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
@@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 extern int perf_counter_overflow(struct perf_counter *counter,
-				 int nmi, struct pt_regs *regs);
+				 int nmi, struct pt_regs *regs, u64 addr);
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
@@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter)
 		perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
 }
 
-extern void perf_swcounter_event(u32, u64, int, struct pt_regs *);
+extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
 
 extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 			      unsigned long pgoff, struct file *file);
@@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
 static inline void
-perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)	{ }
-
+perf_swcounter_event(u32 event, u64 nr, int nmi,
+		     struct pt_regs *regs, u64 addr)			{ }
 
 static inline void
 perf_counter_mmap(unsigned long addr, unsigned long len,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4dc8600d2825a..321c57e3556f5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	update_context_time(ctx);
 
 	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs);
+	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
 	__perf_counter_sched_out(ctx, cpuctx);
 
 	cpuctx->task_ctx = NULL;
@@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 }
 
 static void perf_counter_output(struct perf_counter *counter,
-				int nmi, struct pt_regs *regs)
+				int nmi, struct pt_regs *regs, u64 addr)
 {
 	int ret;
 	u64 record_type = counter->hw_event.record_type;
@@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_ADDR) {
+		header.type |= PERF_RECORD_ADDR;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_TIME)
 		perf_output_put(&handle, time);
 
+	if (record_type & PERF_RECORD_ADDR)
+		perf_output_put(&handle, addr);
+
 	if (record_type & PERF_RECORD_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
@@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
  */
 
 int perf_counter_overflow(struct perf_counter *counter,
-			  int nmi, struct pt_regs *regs)
+			  int nmi, struct pt_regs *regs, u64 addr)
 {
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
@@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct perf_counter *counter,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, regs);
+	perf_counter_output(counter, nmi, regs, addr);
 	return ret;
 }
 
@@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 		regs = task_pt_regs(current);
 
 	if (regs) {
-		if (perf_counter_overflow(counter, 0, regs))
+		if (perf_counter_overflow(counter, 0, regs, 0))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct pt_regs *regs)
+				    int nmi, struct pt_regs *regs, u64 addr)
 {
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, regs))
+	if (perf_counter_overflow(counter, nmi, regs, addr))
 		/* soft-disable the counter */
 		;
 
@@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter,
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct pt_regs *regs)
+			       int nmi, struct pt_regs *regs, u64 addr)
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 	if (counter->hw.irq_period && !neg)
-		perf_swcounter_overflow(counter, nmi, regs);
+		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_event_types type, u32 event,
-				     u64 nr, int nmi, struct pt_regs *regs)
+				     u64 nr, int nmi, struct pt_regs *regs,
+				     u64 addr)
 {
 	struct perf_counter *counter;
 
@@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
 		if (perf_swcounter_match(counter, type, event, regs))
-			perf_swcounter_add(counter, nr, nmi, regs);
+			perf_swcounter_add(counter, nr, nmi, regs, addr);
 	}
 	rcu_read_unlock();
 }
@@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 }
 
 static void __perf_swcounter_event(enum perf_event_types type, u32 event,
-				   u64 nr, int nmi, struct pt_regs *regs)
+				   u64 nr, int nmi, struct pt_regs *regs,
+				   u64 addr)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 	(*recursion)++;
 	barrier();
 
-	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs);
+	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
+				 nr, nmi, regs, addr);
 	if (cpuctx->task_ctx) {
 		perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
-				nr, nmi, regs);
+					 nr, nmi, regs, addr);
 	}
 
 	barrier();
@@ -2349,9 +2360,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 	put_cpu_var(perf_cpu_context);
 }
 
-void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs)
+void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr)
 {
-	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs);
+	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id)
 	if (!regs)
 		regs = task_pt_regs(current);
 
-	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs);
+	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
 }
 
 extern int ftrace_profile_enable(int);
-- 
GitLab


From e85abf8f432bb2a13733ab7609fbb8e1500af51d Mon Sep 17 00:00:00 2001
From: Gary Hade <garyhade@us.ibm.com>
Date: Wed, 8 Apr 2009 14:07:25 -0700
Subject: [PATCH 0318/2198] x86: consolidate SMP code in io_apic.c

Impact: Cleanup

Reorganizes the code in arch/x86/kernel/io_apic.c by
combining two '#ifdef CONFIG_SMP' regions.  In addition
to making the code easier to understand the first
'#ifdef CONFIG_SMP' region is moved to a location later
in the file which will reduce the need for function
forward declarations when the code subsequently revised.

The only changes other than relocating code to a different
position in the file were the removal of the assign_irq_vector()
forward declaration which was no longer needed and some line
length reduction formatting changes.

Signed-off-by: Gary Hade <garyhade@us.ibm.com>
Cc: lcm@us.ibm.com
LKML-Reference: <20090408210725.GC11159@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 223 ++++++++++++++++-----------------
 1 file changed, 109 insertions(+), 114 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 767fe7e46d689..7c9d045ac8341 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -518,120 +518,6 @@ static void ioapic_mask_entry(int apic, int pin)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
-	cpumask_var_t cleanup_mask;
-
-	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-		unsigned int i;
-		cfg->move_cleanup_count = 0;
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-			cfg->move_cleanup_count++;
-		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
-	} else {
-		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
-		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-		free_cpumask_var(cleanup_mask);
-	}
-	cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-	u8 vector = cfg->vector;
-
-	entry = cfg->irq_2_pin;
-	for (;;) {
-		unsigned int reg;
-
-		if (!entry)
-			break;
-
-		apic = entry->apic;
-		pin = entry->pin;
-		/*
-		 * With interrupt-remapping, destination information comes
-		 * from interrupt-remapping table entry.
-		 */
-		if (!irq_remapped(irq))
-			io_apic_write(apic, 0x11 + pin*2, dest);
-		reg = io_apic_read(apic, 0x10 + pin*2);
-		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-		reg |= vector;
-		io_apic_modify(apic, 0x10 + pin*2, reg);
-		if (!entry->next)
-			break;
-		entry = entry->next;
-	}
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
-	struct irq_cfg *cfg;
-	unsigned int irq;
-
-	if (!cpumask_intersects(mask, cpu_online_mask))
-		return BAD_APICID;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-	if (assign_irq_vector(irq, cfg, mask))
-		return BAD_APICID;
-
-	/* check that before desc->addinity get updated */
-	set_extra_move_desc(desc, mask);
-
-	cpumask_copy(desc->affinity, mask);
-
-	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	unsigned int dest;
-	unsigned int irq;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	dest = set_desc_affinity(desc, mask);
-	if (dest != BAD_APICID) {
-		/* Only the high 8 bits are valid. */
-		dest = SET_APIC_LOGICAL_ID(dest);
-		__target_IO_APIC_irq(irq, dest, cfg);
-	}
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
-	struct irq_desc *desc;
-
-	desc = irq_to_desc(irq);
-
-	set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -2360,6 +2246,115 @@ static int ioapic_retrigger_irq(unsigned int irq)
  */
 
 #ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+	cpumask_var_t cleanup_mask;
+
+	if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+		unsigned int i;
+		cfg->move_cleanup_count = 0;
+		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+			cfg->move_cleanup_count++;
+		for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+			apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+	} else {
+		cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+		cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+		apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+		free_cpumask_var(cleanup_mask);
+	}
+	cfg->move_in_progress = 0;
+}
+
+static void
+__target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+	u8 vector = cfg->vector;
+
+	entry = cfg->irq_2_pin;
+	for (;;) {
+		unsigned int reg;
+
+		if (!entry)
+			break;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		/*
+		 * With interrupt-remapping, destination information comes
+		 * from interrupt-remapping table entry.
+		 */
+		if (!irq_remapped(irq))
+			io_apic_write(apic, 0x11 + pin*2, dest);
+		reg = io_apic_read(apic, 0x10 + pin*2);
+		reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+		reg |= vector;
+		io_apic_modify(apic, 0x10 + pin*2, reg);
+		if (!entry->next)
+			break;
+		entry = entry->next;
+	}
+}
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+	struct irq_cfg *cfg;
+	unsigned int irq;
+
+	if (!cpumask_intersects(mask, cpu_online_mask))
+		return BAD_APICID;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+	if (assign_irq_vector(irq, cfg, mask))
+		return BAD_APICID;
+
+	/* check that before desc->addinity get updated */
+	set_extra_move_desc(desc, mask);
+
+	cpumask_copy(desc->affinity, mask);
+
+	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static void
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int dest;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	dest = set_desc_affinity(desc, mask);
+	if (dest != BAD_APICID) {
+		/* Only the high 8 bits are valid. */
+		dest = SET_APIC_LOGICAL_ID(dest);
+		__target_IO_APIC_irq(irq, dest, cfg);
+	}
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
+static void
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+	struct irq_desc *desc;
+
+	desc = irq_to_desc(irq);
+
+	set_ioapic_affinity_irq_desc(desc, mask);
+}
 
 #ifdef CONFIG_INTR_REMAP
 
-- 
GitLab


From 6c0b324435ff49fb3c68fe808a93853d81c7fb97 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 9 Apr 2009 09:27:37 +1000
Subject: [PATCH 0319/2198] perf_counter: add MAINTAINERS entry

This adds an entry in MAINTAINERS for the perf_counter subsystem.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18909.13033.345975.434902@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 MAINTAINERS | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index c3b215970f7b4..16fb45adb53e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3483,6 +3483,16 @@ M:	balbir@linux.vnet.ibm.com
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 
+PERFORMANCE COUNTER SUBSYSTEM
+P:	Peter Zijlstra
+M:	a.p.zijlstra@chello.nl
+P:	Paul Mackerras
+M:	paulus@samba.org
+P:	Ingo Molnar
+M:	mingo@elte.hu
+L:	linux-kernel@vger.kernel.org
+S:	Supported
+
 PERSONALITY HANDLING
 P:	Christoph Hellwig
 M:	hch@infradead.org
-- 
GitLab


From ca8f2d7f019a8547f39ddb9ed0144932f12807f2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 9 Apr 2009 14:42:56 +1000
Subject: [PATCH 0320/2198] perf_counter: powerpc: add nmi_enter/nmi_exit calls

Impact: fix potential deadlocks on powerpc

Now that the core is using in_nmi() (added in e30e08f6, "perf_counter:
fix NMI race in task clock"), we need the powerpc perf_counter_interrupt
to call nmi_enter() and nmi_exit() in those cases where the interrupt
happens when interrupts are soft-disabled.

If interrupts were soft-enabled, we can treat it as a regular interrupt
and do irq_enter/irq_exit around the whole routine. This lets us get rid
of the test_perf_counter_pending() call at the end of
perf_counter_interrupt, thus simplifying things a little.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18909.31952.873098.336615@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 31 ++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c9d019f190742..bd76d0fa2c357 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -714,7 +714,7 @@ hw_perf_counter_init(struct perf_counter *counter)
  * here so there is no possibility of being interrupted.
  */
 static void record_and_restart(struct perf_counter *counter, long val,
-			       struct pt_regs *regs)
+			       struct pt_regs *regs, int nmi)
 {
 	s64 prev, delta, left;
 	int record = 0;
@@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record)
-		perf_counter_overflow(counter, 1, regs, 0);
+		perf_counter_overflow(counter, nmi, regs, 0);
 }
 
 /*
@@ -762,6 +762,17 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	struct perf_counter *counter;
 	long val;
 	int found = 0;
+	int nmi;
+
+	/*
+	 * If interrupts were soft-disabled when this PMU interrupt
+	 * occurred, treat it as an NMI.
+	 */
+	nmi = !regs->softe;
+	if (nmi)
+		nmi_enter();
+	else
+		irq_enter();
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
@@ -769,7 +780,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 		if ((int)val < 0) {
 			/* counter has overflowed */
 			found = 1;
-			record_and_restart(counter, val, regs);
+			record_and_restart(counter, val, regs, nmi);
 		}
 	}
 
@@ -796,18 +807,10 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 */
 	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
 
-	/*
-	 * If we need a wakeup, check whether interrupts were soft-enabled
-	 * when we took the interrupt.  If they were, we can wake stuff up
-	 * immediately; otherwise we'll have do the wakeup when interrupts
-	 * get soft-enabled.
-	 */
-	if (test_perf_counter_pending() && regs->softe) {
-		irq_enter();
-		clear_perf_counter_pending();
-		perf_counter_do_pending();
+	if (nmi)
+		nmi_exit();
+	else
 		irq_exit();
-	}
 }
 
 void hw_perf_counter_setup(int cpu)
-- 
GitLab


From d2de688891909b148efe83a6fc9520a9cd6015f0 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Thu, 9 Apr 2009 15:17:22 +1000
Subject: [PATCH 0321/2198] sparc64: extend TI_RESTART_BLOCK space by 8 bytes

Impact: build fix

Today's linux-next build (sparc64 defconfig) failed like this:

  arch/sparc/kernel/built-in.o: In function `trap_init':
  (.init.text+0x4): undefined reference to `thread_info_offsets_are_bolixed_dave'

Caused by commit 52400ba946759af28442dee6265c5c0180ac7122 ("futex: add
requeue_pi functionality") (from the tip-core tree) which changed the
size of struct restart_block.

Shift TI_KUNA_REGS and TI_KUNA_INSN up by 8 bytes to make space for the
larger restart block.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: "David S. Miller" <davem@davemloft.net>
Cc: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <20090409151722.c8eabb56.sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/include/asm/thread_info_64.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h
index 639ac805448ab..65865726b2831 100644
--- a/arch/sparc/include/asm/thread_info_64.h
+++ b/arch/sparc/include/asm/thread_info_64.h
@@ -102,8 +102,8 @@ struct thread_info {
 #define TI_KERN_CNTD1	0x00000488
 #define TI_PCR		0x00000490
 #define TI_RESTART_BLOCK 0x00000498
-#define TI_KUNA_REGS	0x000004c0
-#define TI_KUNA_INSN	0x000004c8
+#define TI_KUNA_REGS	0x000004c8
+#define TI_KUNA_INSN	0x000004d0
 #define TI_FPREGS	0x00000500
 
 /* We embed this in the uppermost byte of thread_info->flags */
-- 
GitLab


From 002f128b473fb82f454654be5081b0919ee01ab2 Mon Sep 17 00:00:00 2001
From: Paul Turner <pjt@google.com>
Date: Wed, 8 Apr 2009 15:29:43 -0700
Subject: [PATCH 0322/2198] sched: remove redundant hierarchy walk in
 check_preempt_wakeup

Impact: micro-optimization

Under group scheduling we traverse up until we are at common siblings
to make the wakeup comparison on.

At this point however, they should have the same parent so continuing
to check up the tree is redundant.

Signed-off-by: Paul Turner <pjt@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <alpine.DEB.1.00.0904081520320.30317@kitami.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 3816f217f119e..5f9650e8fe758 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 
 	find_matching_se(&se, &pse);
 
-	while (se) {
-		BUG_ON(!pse);
+	BUG_ON(!pse);
 
-		if (wakeup_preempt_entity(se, pse) == 1) {
-			resched_task(curr);
-			break;
-		}
-
-		se = parent_entity(se);
-		pse = parent_entity(pse);
-	}
+	if (wakeup_preempt_entity(se, pse) == 1)
+		resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
-- 
GitLab


From 888fcee066a2f4abd0d0bc9418c0535f9b01e6e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 9 Apr 2009 09:48:22 +0200
Subject: [PATCH 0323/2198] perf_counter: fix off task->comm by one

strlen() does not include the \0.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 321c57e3556f5..b07195bbd228c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1989,7 +1989,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 	unsigned int size;
 	char *comm = comm_event->task->comm;
 
-	size = ALIGN(strlen(comm), sizeof(u64));
+	size = ALIGN(strlen(comm)+1, sizeof(u64));
 
 	comm_event->comm = comm;
 	comm_event->comm_size = size;
@@ -2109,7 +2109,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 	}
 
 got_name:
-	size = ALIGN(strlen(name), sizeof(u64));
+	size = ALIGN(strlen(name)+1, sizeof(u64));
 
 	mmap_event->file_name = name;
 	mmap_event->file_size = size;
-- 
GitLab


From b3828ebb3901adfe989d8d4157ed28247aeec132 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 9 Apr 2009 09:50:04 +0200
Subject: [PATCH 0324/2198] perf_counter tools: include PID in perf-report
 output, tweak user/kernel printut

It's handier than an <unknown> entry.
Also replace the kernel/user column with a more compact version:

  0.52              cc1  [k]  page_fault
  0.57               :0  [k]   _spin_lock
  0.59            :7506  [.]  <unknown>
  0.69               as  [.]  /usr/bin/as: <unknown>
  0.76              cc1  [.]  /lib64/libc-2.8.so: _int_free
  0.92              cc1  [k]  clear_page_c
  1.00            :7465  [.]  <unknown>
  1.43              cc1  [.]  /lib64/libc-2.8.so: memset
  1.86              cc1  [.]  /lib64/libc-2.8.so: _int_malloc
 70.33              cc1  [.]  /usr/libexec/gcc/x86_64-redhat-linux/4.3.2/cc1: <unknown>

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf-report.cc | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 09da0ba482cde..1727317352bf7 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -277,10 +277,17 @@ static std::multimap<int, std::string> rev_hist;
 
 static std::string resolve_comm(int pid)
 {
-	std::string comm = "<unknown>";
+	std::string comm;
+
 	std::map<int, std::string>::const_iterator ci = comms.find(pid);
-	if (ci != comms.end())
+	if (ci != comms.end()) {
 		comm = ci->second;
+	} else {
+		char pid_str[30];
+
+		sprintf(pid_str, ":%d", pid);
+		comm = pid_str;
+	}
 
 	return comm;
 }
@@ -422,13 +429,13 @@ more:
 		char output[1024];
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-			level = "[kernel]";
+			level = " [k] ";
 			sym = resolve_kernel_symbol(event->ip.ip);
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-			level = "[ user ]";
+			level = " [.] ";
 			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
 		} else {
-			level = "[  hv  ]";
+			level = " [H] ";
 		}
 		comm = resolve_comm(event->ip.pid);
 
-- 
GitLab


From 9ee318a7825929bc3734110b83ae8e20e53d9de3 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Apr 2009 10:53:44 +0200
Subject: [PATCH 0325/2198] perf_counter: optimize mmap/comm tracking

Impact: performance optimization

The mmap/comm tracking code does quite a lot of work before it discovers
there's no interest in it, avoid that by keeping a counter.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090409085524.427173196@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 39 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b07195bbd228c..76376ecb23b5e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -38,6 +38,10 @@ int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
+static atomic_t nr_mmap_tracking __read_mostly;
+static atomic_t nr_munmap_tracking __read_mostly;
+static atomic_t nr_comm_tracking __read_mostly;
+
 /*
  * Mutex for (sysadmin-configurable) counter reservations:
  */
@@ -1186,6 +1190,13 @@ static void free_counter(struct perf_counter *counter)
 {
 	perf_pending_sync(counter);
 
+	if (counter->hw_event.mmap)
+		atomic_dec(&nr_mmap_tracking);
+	if (counter->hw_event.munmap)
+		atomic_dec(&nr_munmap_tracking);
+	if (counter->hw_event.comm)
+		atomic_dec(&nr_comm_tracking);
+
 	if (counter->destroy)
 		counter->destroy(counter);
 
@@ -2005,7 +2016,12 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 
 void perf_counter_comm(struct task_struct *task)
 {
-	struct perf_comm_event comm_event = {
+	struct perf_comm_event comm_event;
+
+	if (!atomic_read(&nr_comm_tracking))
+		return;
+       
+	comm_event = (struct perf_comm_event){
 		.task	= task,
 		.event  = {
 			.header = { .type = PERF_EVENT_COMM, },
@@ -2128,7 +2144,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 void perf_counter_mmap(unsigned long addr, unsigned long len,
 		       unsigned long pgoff, struct file *file)
 {
-	struct perf_mmap_event mmap_event = {
+	struct perf_mmap_event mmap_event;
+
+	if (!atomic_read(&nr_mmap_tracking))
+		return;
+
+	mmap_event = (struct perf_mmap_event){
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MMAP, },
@@ -2146,7 +2167,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 void perf_counter_munmap(unsigned long addr, unsigned long len,
 			 unsigned long pgoff, struct file *file)
 {
-	struct perf_mmap_event mmap_event = {
+	struct perf_mmap_event mmap_event;
+
+	if (!atomic_read(&nr_munmap_tracking))
+		return;
+
+	mmap_event = (struct perf_mmap_event){
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MUNMAP, },
@@ -2725,6 +2751,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	counter->hw_ops = hw_ops;
 
+	if (counter->hw_event.mmap)
+		atomic_inc(&nr_mmap_tracking);
+	if (counter->hw_event.munmap)
+		atomic_inc(&nr_munmap_tracking);
+	if (counter->hw_event.comm)
+		atomic_inc(&nr_comm_tracking);
+
 	return counter;
 }
 
-- 
GitLab


From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Apr 2009 10:53:45 +0200
Subject: [PATCH 0326/2198] perf_counter: sysctl for system wide perf counters

Impact: add sysctl for paranoid/relaxed perfcounters policy

Allow the use of system wide perf counters to everybody, but provide
a sysctl to disable it for the paranoid security minded.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090409085524.514046352@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        |  4 +++-
 kernel/sysctl.c              | 11 +++++++++++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c22363a4f7466..981432885301d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -568,6 +568,8 @@ struct perf_callchain_entry {
 
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
+extern int sysctl_perf_counter_priv;
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 76376ecb23b5e..7efb7ebaaae04 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -42,6 +42,8 @@ static atomic_t nr_mmap_tracking __read_mostly;
 static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
+int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+
 /*
  * Mutex for (sysadmin-configurable) counter reservations:
  */
@@ -1132,7 +1134,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	 */
 	if (cpu != -1) {
 		/* Must be root to operate on a CPU counter: */
-		if (!capable(CAP_SYS_ADMIN))
+		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
 		if (cpu < 0 || cpu > num_possible_cpus())
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4286b62b34a0a..8ba457838d951 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -49,6 +49,7 @@
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
 #include <linux/slow-work.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -920,6 +921,16 @@ static struct ctl_table kern_table[] = {
 		.child		= slow_work_sysctls,
 	},
 #endif
+#ifdef CONFIG_PERF_COUNTERS
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_privileged",
+		.data		= &sysctl_perf_counter_priv,
+		.maxlen		= sizeof(sysctl_perf_counter_priv),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
GitLab


From d3d21c412d8525eb2e208d990ab5eee5fb0fe03d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 9 Apr 2009 10:53:46 +0200
Subject: [PATCH 0327/2198] perf_counter: log full path names

Impact: fix perf-report output for /home mounted binaries, etc.

dentry_path() only provide path-names up to the mount root, which is
unsuited for out purpose, use d_path() instead.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090409085524.601794134@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7efb7ebaaae04..7f9521c3c01ba 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2116,7 +2116,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 			name = strncpy(tmp, "//enomem", sizeof(tmp));
 			goto got_name;
 		}
-		name = dentry_path(file->f_dentry, buf, PATH_MAX);
+		name = d_path(&file->f_path, buf, PATH_MAX);
 		if (IS_ERR(name)) {
 			name = strncpy(tmp, "//toolong", sizeof(tmp));
 			goto got_name;
-- 
GitLab


From e7c064889606aab3569669078c69b87b2c527e72 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sat, 7 Mar 2009 23:48:41 -0800
Subject: [PATCH 0328/2198] xen: add FIX_TEXT_POKE to fixmap

FIX_TEXT_POKE[01] are used to map kernel addresses, so they're mapping
pfns, not mfns.

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/mmu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 77b242c9a11ed..a96f5b9393ea9 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1812,6 +1812,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
 #ifdef CONFIG_X86_LOCAL_APIC
 	case FIX_APIC_BASE:	/* maps dummy local APIC */
 #endif
+	case FIX_TEXT_POKE0:
+	case FIX_TEXT_POKE1:
+		/* All local page mappings */
 		pte = pfn_pte(phys, prot);
 		break;
 
-- 
GitLab


From 7a734e7dd93b9aea08ed51036a9a0e2c9dfd8dac Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:08:28 -0700
Subject: [PATCH 0329/2198] x86, setup: "glove box" BIOS calls --
 infrastructure

Impact: new interfaces (not yet used)

For all the platforms out there, there is an infinite number of buggy
BIOSes.  This adds infrastructure to treat BIOS interrupts more like
toxic waste and "glove box" them -- we switch out the register set,
perform the BIOS interrupt, and then restore the previous state.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/boot/Makefile                   |  5 +-
 arch/x86/boot/bioscall.S                 | 82 ++++++++++++++++++++++++
 arch/x86/boot/boot.h                     | 48 ++++++++++++++
 arch/x86/boot/header.S                   |  2 +-
 arch/x86/boot/regs.c                     | 29 +++++++++
 arch/x86/boot/setup.ld                   |  6 ++
 arch/x86/kernel/acpi/realmode/Makefile   |  2 +-
 arch/x86/kernel/acpi/realmode/bioscall.S |  1 +
 arch/x86/kernel/acpi/realmode/regs.c     |  1 +
 9 files changed, 172 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/boot/bioscall.S
 create mode 100644 arch/x86/boot/regs.c
 create mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S
 create mode 100644 arch/x86/kernel/acpi/realmode/regs.c

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505a6..658bc525cac78 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -26,9 +26,10 @@ targets		:= vmlinux.bin setup.bin setup.elf bzImage
 targets		+= fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-		:= compressed
 
-setup-y		+= a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y		+= a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
 setup-y		+= header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y		+= printf.o string.o tty.o video.o video-mode.o version.o
+setup-y		+= printf.o regs.o string.o tty.o video.o video-mode.o
+setup-y		+= version.o
 setup-$(CONFIG_X86_APM_BOOT) += apm.o
 
 # The link order of the video-*.o modules can matter.  In particular,
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644
index 0000000000000..22b4b3efb9f90
--- /dev/null
+++ b/arch/x86/boot/bioscall.S
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls.  Avoids the constant problems with BIOSes
+ * touching memory they shouldn't be.
+ */
+
+	.code16
+	.text
+	.globl	intcall
+	.type	intcall, @function
+intcall:
+	/* Self-modify the INT instruction.  Ugly, but works. */
+	cmpb	%al, 3f
+	je	1f
+	movb	%al, 3f
+	jmp	1f		/* Synchronize pipeline */
+1:
+	/* Save state */
+	pushfl
+	pushw	%fs
+	pushw	%gs
+	pushal
+
+	/* Copy input state to stack frame */
+	subw	$44, %sp
+	movw	%dx, %si
+	movw	%sp, %di
+	movw	$11, %cx
+	rep; movsd
+
+	/* Pop full state from the stack */
+	popal
+	popw	%gs
+	popw	%fs
+	popw	%es
+	popw	%ds
+	popfl
+
+	/* Actual INT */
+	.byte	0xcd		/* INT opcode */
+3:	.byte	0
+
+	/* Push full state to the stack */
+	pushfl
+	pushw	%ds
+	pushw	%es
+	pushw	%fs
+	pushw	%gs
+	pushal
+
+	/* Re-establish C environment invariants */
+	cld
+	movzwl	%sp, %esp
+	movw	%cs, %ax
+	movw	%ax, %ds
+	movw	%ax, %es
+
+	/* Copy output state from stack frame */
+	movw	68(%esp), %di	/* Original %cx == 3rd argument */
+	andw	%di, %di
+	jz	4f
+	movw	%sp, %si
+	movw	$11, %cx
+	rep; movsd
+4:	addw	$44, %sp
+
+	/* Restore state and return */
+	popal
+	popw	%gs
+	popw	%fs
+	popfl
+	retl
+	.size	intcall, .-intcall
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index 7b2692e897e5f..98239d2658f27 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
 #include <asm/setup.h>
 #include "bitops.h"
 #include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
 /* apm.c */
 int query_apm_bios(void);
 
+/* bioscall.c */
+struct biosregs {
+	union {
+		struct {
+			u32 edi;
+			u32 esi;
+			u32 ebp;
+			u32 _esp;
+			u32 ebx;
+			u32 edx;
+			u32 ecx;
+			u32 eax;
+			u32 _fsgs;
+			u32 _dses;
+			u32 eflags;
+		};
+		struct {
+			u16 di, hdi;
+			u16 si, hsi;
+			u16 bp, hbp;
+			u16 _sp, _hsp;
+			u16 bx, hbx;
+			u16 dx, hdx;
+			u16 cx, hcx;
+			u16 ax, hax;
+			u16 gs, fs;
+			u16 es, ds;
+			u16 flags, hflags;
+		};
+		struct {
+			u8 dil, dih, edi2, edi3;
+			u8 sil, sih, esi2, esi3;
+			u8 bpl, bph, ebp2, ebp3;
+			u8 _spl, _sph, _esp2, _esp3;
+			u8 bl, bh, ebx2, ebx3;
+			u8 dl, dh, edx2, edx3;
+			u8 cl, ch, ecx2, ecx3;
+			u8 al, ah, eax2, eax3;
+		};
+	};
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
 int vsprintf(char *buf, const char *fmt, va_list args);
 int printf(const char *fmt, ...);
 
+/* regs.c */
+void initregs(struct biosregs *regs);
+
 /* string.c */
 int strcmp(const char *str1, const char *str2);
 size_t strnlen(const char *s, size_t maxlen);
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4c6..486d97fa7f4d3 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -221,7 +221,7 @@ setup_data:		.quad 0			# 64-bit physical pointer to
 
 # End of setup header #####################################################
 
-	.section ".inittext", "ax"
+	.section ".entrytext", "ax"
 start_of_setup:
 #ifdef SAFE_RESET_DISK_CONTROLLER
 # Reset the disk controller.
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644
index 0000000000000..958019b1cfa5b
--- /dev/null
+++ b/arch/x86/boot/regs.c
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+
+void initregs(struct biosregs *reg)
+{
+	memset(reg, 0, sizeof *reg);
+	reg->eflags |= X86_EFLAGS_CF;
+	reg->ds = ds();
+	reg->es = ds();
+	reg->fs = fs();
+	reg->gs = gs();
+}
diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld
index bb8dc2de79693..0f6ec455a2b13 100644
--- a/arch/x86/boot/setup.ld
+++ b/arch/x86/boot/setup.ld
@@ -15,8 +15,11 @@ SECTIONS
 
 	. = 497;
 	.header		: { *(.header) }
+	.entrytext	: { *(.entrytext) }
 	.inittext	: { *(.inittext) }
 	.initdata	: { *(.initdata) }
+	__end_init = .;
+
 	.text		: { *(.text) }
 	.text32		: { *(.text32) }
 
@@ -52,4 +55,7 @@ SECTIONS
 
 	. = ASSERT(_end <= 0x8000, "Setup too big!");
 	. = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+	/* Necessary for the very-old-loader check to work... */
+	. = ASSERT(__end_init <= 5*512, "init sections too big!");
+
 }
diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile
index 1c31cc0e9def0..167bc16ce0e59 100644
--- a/arch/x86/kernel/acpi/realmode/Makefile
+++ b/arch/x86/kernel/acpi/realmode/Makefile
@@ -9,7 +9,7 @@
 always		:= wakeup.bin
 targets		:= wakeup.elf wakeup.lds
 
-wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y	+= wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
 
 # The link order of the video-*.o modules can matter.  In particular,
 # video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644
index 0000000000000..f51eb0bb56cec
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/bioscall.S
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644
index 0000000000000..6206033ba2027
--- /dev/null
+++ b/arch/x86/kernel/acpi/realmode/regs.c
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
-- 
GitLab


From df7699c56421c0476704f24a43409ac8c505f3d2 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:13:46 -0700
Subject: [PATCH 0330/2198] x86, setup: "glove box" BIOS interrupts in the core
 boot code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the core boot code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/a20.c    |  9 +++--
 arch/x86/boot/main.c   | 39 ++++++++++----------
 arch/x86/boot/memory.c | 81 +++++++++++++++++++++---------------------
 arch/x86/boot/tty.c    | 52 ++++++++++++++-------------
 4 files changed, 95 insertions(+), 86 deletions(-)

diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c
index 7c19ce8c2442c..64a31a6d751a0 100644
--- a/arch/x86/boot/a20.c
+++ b/arch/x86/boot/a20.c
@@ -2,7 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- *   Copyright 2009 Intel Corporation
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
 
 static void enable_a20_bios(void)
 {
-	asm volatile("pushfl; int $0x15; popfl"
-		     : : "a" ((u16)0x2401));
+	struct biosregs ireg;
+
+	initregs(&ireg);
+	ireg.ax = 0x2401;
+	intcall(0x15, &ireg, NULL);
 }
 
 static void enable_a20_kbc(void)
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 58f0415d3ae09..140172b895bd3 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
  */
 static void keyboard_set_repeat(void)
 {
-	u16 ax = 0x0305;
-	u16 bx = 0;
-	asm volatile("int $0x16"
-		     : "+a" (ax), "+b" (bx)
-		     : : "ecx", "edx", "esi", "edi");
+	struct biosregs ireg;
+	initregs(&ireg);
+	ireg.ax = 0x0305;
+	intcall(0x16, &ireg, NULL);
 }
 
 /*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
+	struct biosregs ireg, oreg;
+
 	/* Some older BIOSes apparently crash on this call, so filter
 	   it from machines too old to have SpeedStep at all. */
 	if (cpu.level < 6)
 		return;
 
-	asm("int $0x15"
-	    : "=a" (boot_params.ist_info.signature),
-	      "=b" (boot_params.ist_info.command),
-	      "=c" (boot_params.ist_info.event),
-	      "=d" (boot_params.ist_info.perf_level)
-	    : "a" (0x0000e980),	 /* IST Support */
-	      "d" (0x47534943)); /* Request value */
+	initregs(&ireg);
+	ireg.ax  = 0xe980;	 /* IST Support */
+	ireg.edx = 0x47534943;	 /* Request value */
+	intcall(0x15, &ireg, &oreg);
+
+	boot_params.ist_info.signature  = oreg.eax;
+	boot_params.ist_info.command    = oreg.ebx;
+	boot_params.ist_info.event      = oreg.ecx;
+	boot_params.ist_info.perf_level = oreg.edx;
 }
 
 /*
@@ -93,13 +97,12 @@ static void query_ist(void)
 static void set_bios_mode(void)
 {
 #ifdef CONFIG_X86_64
-	u32 eax, ebx;
+	struct biosregs ireg;
 
-	eax = 0xec00;
-	ebx = 2;
-	asm volatile("int $0x15"
-		     : "+a" (eax), "+b" (ebx)
-		     : : "ecx", "edx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ax = 0xec00;
+	ireg.bx = 2;
+	intcall(0x15, &ireg, NULL);
 #endif
 }
 
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 5054c2ddd1a03..d989de810cacc 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -25,12 +25,16 @@ struct e820_ext_entry {
 static int detect_memory_e820(void)
 {
 	int count = 0;
-	u32 next = 0;
-	u32 size, id, edi;
-	u8 err;
+	struct biosregs ireg, oreg;
 	struct e820entry *desc = boot_params.e820_map;
 	static struct e820_ext_entry buf; /* static so it is zeroed */
 
+	initregs(&ireg);
+	ireg.ax  = 0xe820;
+	ireg.cx  = sizeof buf;
+	ireg.edx = SMAP;
+	ireg.di  = (size_t)&buf;
+
 	/*
 	 * Set this here so that if the BIOS doesn't change this field
 	 * but still doesn't change %ecx, we're still okay...
@@ -38,22 +42,13 @@ static int detect_memory_e820(void)
 	buf.ext_flags = 1;
 
 	do {
-		size = sizeof buf;
-
-		/* Important: %edx and %esi are clobbered by some BIOSes,
-		   so they must be either used for the error output
-		   or explicitly marked clobbered.  Given that, assume there
-		   is something out there clobbering %ebp and %edi, too. */
-		asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
-		    : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-		      "=D" (edi), "+m" (buf)
-		    : "D" (&buf), "d" (SMAP), "a" (0xe820)
-		    : "esi");
+		intcall(0x15, &ireg, &oreg);
+		ireg.ebx = oreg.ebx; /* for next iteration... */
 
 		/* BIOSes which terminate the chain with CF = 1 as opposed
 		   to %ebx = 0 don't always report the SMAP signature on
 		   the final, failing, probe. */
-		if (err)
+		if (oreg.eflags & X86_EFLAGS_CF)
 			break;
 
 		/* Some BIOSes stop returning SMAP in the middle of
@@ -61,7 +56,7 @@ static int detect_memory_e820(void)
 		   screwed up the map at that point, we might have a
 		   partial map, the full map, or complete garbage, so
 		   just return failure. */
-		if (id != SMAP) {
+		if (oreg.eax != SMAP) {
 			count = 0;
 			break;
 		}
@@ -69,58 +64,62 @@ static int detect_memory_e820(void)
 		/* ACPI 3.0 added the extended flags support.  If bit 0
 		   in the extended flags is zero, we're supposed to simply
 		   ignore the entry -- a backwards incompatible change! */
-		if (size > 20 && !(buf.ext_flags & 1))
+		if (oreg.cx > 20 && !(buf.ext_flags & 1))
 			continue;
 
 		*desc++ = buf.std;
 		count++;
-	} while (next && count < ARRAY_SIZE(boot_params.e820_map));
+	} while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
 
 	return boot_params.e820_entries = count;
 }
 
 static int detect_memory_e801(void)
 {
-	u16 ax, bx, cx, dx;
-	u8 err;
+	struct biosregs ireg, oreg;
 
-	bx = cx = dx = 0;
-	ax = 0xe801;
-	asm("stc; int $0x15; setc %0"
-	    : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+	initregs(&ireg);
+	ireg.ax = 0xe801;
+	intcall(0x15, &ireg, &oreg);
 
-	if (err)
+	if (oreg.eflags & X86_EFLAGS_CF)
 		return -1;
 
 	/* Do we really need to do this? */
-	if (cx || dx) {
-		ax = cx;
-		bx = dx;
+	if (oreg.cx || oreg.dx) {
+		oreg.ax = oreg.cx;
+		oreg.bx = oreg.dx;
 	}
 
-	if (ax > 15*1024)
+	if (oreg.ax > 15*1024) {
 		return -1;	/* Bogus! */
-
-	/* This ignores memory above 16MB if we have a memory hole
-	   there.  If someone actually finds a machine with a memory
-	   hole at 16MB and no support for 0E820h they should probably
-	   generate a fake e820 map. */
-	boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+	} else if (oreg.ax == 15*1024) {
+		boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+	} else {
+		/*
+		 * This ignores memory above 16MB if we have a memory
+		 * hole there.  If someone actually finds a machine
+		 * with a memory hole at 16MB and no support for
+		 * 0E820h they should probably generate a fake e820
+		 * map.
+		 */
+		boot_params.alt_mem_k = oreg.ax;
+	}
 
 	return 0;
 }
 
 static int detect_memory_88(void)
 {
-	u16 ax;
-	u8 err;
+	struct biosregs ireg, oreg;
 
-	ax = 0x8800;
-	asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+	initregs(&ireg);
+	ireg.ah = 0x88;
+	intcall(0x15, &ireg, &oreg);
 
-	boot_params.screen_info.ext_mem_k = ax;
+	boot_params.screen_info.ext_mem_k = oreg.ax;
 
-	return -err;
+	return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
 }
 
 int detect_memory(void)
diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c
index 7e8e8b25f5f6c..01ec69c901c7a 100644
--- a/arch/x86/boot/tty.c
+++ b/arch/x86/boot/tty.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -22,24 +23,23 @@
 
 void __attribute__((section(".inittext"))) putchar(int ch)
 {
-	unsigned char c = ch;
+	struct biosregs ireg;
 
-	if (c == '\n')
+	if (ch == '\n')
 		putchar('\r');	/* \n -> \r\n */
 
-	/* int $0x10 is known to have bugs involving touching registers
-	   it shouldn't.  Be extra conservative... */
-	asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
-		     : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+	initregs(&ireg);
+	ireg.bx = 0x0007;
+	ireg.cx = 0x0001;
+	ireg.ah = 0x0e;
+	ireg.al = ch;
+	intcall(0x10, &ireg, NULL);
 }
 
 void __attribute__((section(".inittext"))) puts(const char *str)
 {
-	int n = 0;
-	while (*str) {
+	while (*str)
 		putchar(*str++);
-		n++;
-	}
 }
 
 /*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
 
 static u8 gettime(void)
 {
-	u16 ax = 0x0200;
-	u16 cx, dx;
+	struct biosregs ireg, oreg;
 
-	asm volatile("int $0x1a"
-		     : "+a" (ax), "=c" (cx), "=d" (dx)
-		     : : "ebx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x02;
+	intcall(0x1a, &ireg, &oreg);
 
-	return dx >> 8;
+	return oreg.dh;
 }
 
 /*
@@ -64,19 +63,24 @@ static u8 gettime(void)
  */
 int getchar(void)
 {
-	u16 ax = 0;
-	asm volatile("int $0x16" : "+a" (ax));
+	struct biosregs ireg, oreg;
+
+	initregs(&ireg);
+	/* ireg.ah = 0x00; */
+	intcall(0x16, &ireg, &oreg);
 
-	return ax & 0xff;
+	return oreg.al;
 }
 
 static int kbd_pending(void)
 {
-	u8 pending;
-	asm volatile("int $0x16; setnz %0"
-		     : "=qm" (pending)
-		     : "a" (0x0100));
-	return pending;
+	struct biosregs ireg, oreg;
+
+	initregs(&ireg);
+	ireg.ah = 0x01;
+	intcall(0x16, &ireg, &oreg);
+
+	return !(oreg.eflags & X86_EFLAGS_ZF);
 }
 
 void kbd_flush(void)
-- 
GitLab


From d54ea252e4c92357226992cf65d94616a96e6fce Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:14:26 -0700
Subject: [PATCH 0331/2198] x86, setup: "glove box" BIOS interrupts in the APM
 code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the APM code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
---
 arch/x86/boot/apm.c | 76 +++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 47 deletions(-)

diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c
index 7aa6033001f9a..ee274834ea8ba 100644
--- a/arch/x86/boot/apm.c
+++ b/arch/x86/boot/apm.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   Original APM BIOS checking by Stephen Rothwell, May 1994
  *   (sfr@canb.auug.org.au)
@@ -19,75 +20,56 @@
 
 int query_apm_bios(void)
 {
-	u16 ax, bx, cx, dx, di;
-	u32 ebx, esi;
-	u8 err;
+	struct biosregs ireg, oreg;
 
 	/* APM BIOS installation check */
-	ax = 0x5300;
-	bx = cx = 0;
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-		     : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-		     : : "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x53;
+	intcall(0x15, &ireg, &oreg);
 
-	if (err)
+	if (oreg.flags & X86_EFLAGS_CF)
 		return -1;		/* No APM BIOS */
 
-	if (bx != 0x504d)	/* "PM" signature */
+	if (oreg.bx != 0x504d)		/* "PM" signature */
 		return -1;
 
-	if (!(cx & 0x02))		/* 32 bits supported? */
+	if (!(oreg.cx & 0x02))		/* 32 bits supported? */
 		return -1;
 
 	/* Disconnect first, just in case */
-	ax = 0x5304;
-	bx = 0;
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-		     : "+a" (ax), "+b" (bx)
-		     : : "ecx", "edx", "esi", "edi");
-
-	/* Paranoia */
-	ebx = esi = 0;
-	cx = dx = di = 0;
+	ireg.al = 0x04;
+	intcall(0x15, &ireg, NULL);
 
 	/* 32-bit connect */
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
-		     : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
-		       "+S" (esi), "+D" (di), "=m" (err)
-		     : "a" (0x5303));
-
-	boot_params.apm_bios_info.cseg = ax;
-	boot_params.apm_bios_info.offset = ebx;
-	boot_params.apm_bios_info.cseg_16 = cx;
-	boot_params.apm_bios_info.dseg = dx;
-	boot_params.apm_bios_info.cseg_len = (u16)esi;
-	boot_params.apm_bios_info.cseg_16_len = esi >> 16;
-	boot_params.apm_bios_info.dseg_len = di;
-
-	if (err)
+	ireg.al = 0x03;
+	intcall(0x15, &ireg, &oreg);
+
+	boot_params.apm_bios_info.cseg        = oreg.ax;
+	boot_params.apm_bios_info.offset      = oreg.ebx;
+	boot_params.apm_bios_info.cseg_16     = oreg.cx;
+	boot_params.apm_bios_info.dseg        = oreg.dx;
+	boot_params.apm_bios_info.cseg_len    = oreg.si;
+	boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+	boot_params.apm_bios_info.dseg_len    = oreg.di;
+
+	if (oreg.flags & X86_EFLAGS_CF)
 		return -1;
 
 	/* Redo the installation check as the 32-bit connect;
 	   some BIOSes return different flags this way... */
 
-	ax = 0x5300;
-	bx = cx = 0;
-	asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-		     : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-		     : : "esi", "edi");
+	ireg.al = 0x00;
+	intcall(0x15, &ireg, &oreg);
 
-	if (err || bx != 0x504d) {
+	if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
 		/* Failure with 32-bit connect, try to disconect and ignore */
-		ax = 0x5304;
-		bx = 0;
-		asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-			     : "+a" (ax), "+b" (bx)
-			     : : "ecx", "edx", "esi", "edi");
+		ireg.al = 0x04;
+		intcall(0x15, &ireg, NULL);
 		return -1;
 	}
 
-	boot_params.apm_bios_info.version = ax;
-	boot_params.apm_bios_info.flags = cx;
+	boot_params.apm_bios_info.version = oreg.ax;
+	boot_params.apm_bios_info.flags   = oreg.cx;
 	return 0;
 }
 
-- 
GitLab


From 3435d3476c5ed955d56a6216ed2d156847b3a575 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:17:17 -0700
Subject: [PATCH 0332/2198] x86, setup: "glove box" BIOS interrupts in the EDD
 code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the EDD code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/boot/edd.c | 71 ++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 40 deletions(-)

diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 1aae8f3e5ca19..c501a5b466f84 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -22,17 +23,17 @@
  */
 static int read_mbr(u8 devno, void *buf)
 {
-	u16 ax, bx, cx, dx;
+	struct biosregs ireg, oreg;
 
-	ax = 0x0201;		/* Legacy Read, one sector */
-	cx = 0x0001;		/* Sector 0-0-1 */
-	dx = devno;
-	bx = (size_t)buf;
-	asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
-		     : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
-		     : : "esi", "edi", "memory");
+	initregs(&ireg);
+	ireg.ax = 0x0201;		/* Legacy Read, one sector */
+	ireg.cx = 0x0001;		/* Sector 0-0-1 */
+	ireg.dl = devno;
+	ireg.bx = (size_t)buf;
 
-	return -(u8)ax;		/* 0 or -1 */
+	intcall(0x13, &ireg, &oreg);
+
+	return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
 }
 
 static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
 
 static int get_edd_info(u8 devno, struct edd_info *ei)
 {
-	u16 ax, bx, cx, dx, di;
+	struct biosregs ireg, oreg;
 
 	memset(ei, 0, sizeof *ei);
 
 	/* Check Extensions Present */
 
-	ax = 0x4100;
-	bx = EDDMAGIC1;
-	dx = devno;
-	asm("pushfl; stc; int $0x13; setc %%al; popfl"
-	    : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
-	    : : "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x41;
+	ireg.bx = EDDMAGIC1;
+	ireg.dl = devno;
+	intcall(0x13, &ireg, &oreg);
 
-	if ((u8)ax)
+	if (oreg.eflags & X86_EFLAGS_CF)
 		return -1;	/* No extended information */
 
-	if (bx != EDDMAGIC2)
+	if (oreg.bx != EDDMAGIC2)
 		return -1;
 
 	ei->device  = devno;
-	ei->version = ax >> 8;	/* EDD version number */
-	ei->interface_support = cx; /* EDD functionality subsets */
+	ei->version = oreg.ah;		 /* EDD version number */
+	ei->interface_support = oreg.cx; /* EDD functionality subsets */
 
 	/* Extended Get Device Parameters */
 
 	ei->params.length = sizeof(ei->params);
-	ax = 0x4800;
-	dx = devno;
-	asm("pushfl; int $0x13; popfl"
-	    : "+a" (ax), "+d" (dx), "=m" (ei->params)
-	    : "S" (&ei->params)
-	    : "ebx", "ecx", "edi");
+	ireg.ah = 0x48;
+	ireg.si = (size_t)&ei->params;
+	intcall(0x13, &ireg, &oreg);
 
 	/* Get legacy CHS parameters */
 
 	/* Ralf Brown recommends setting ES:DI to 0:0 */
-	ax = 0x0800;
-	dx = devno;
-	di = 0;
-	asm("pushw %%es; "
-	    "movw %%di,%%es; "
-	    "pushfl; stc; int $0x13; setc %%al; popfl; "
-	    "popw %%es"
-	    : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
-	    : : "esi");
-
-	if ((u8)ax == 0) {
-		ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
-		ei->legacy_max_head = dx >> 8;
-		ei->legacy_sectors_per_track = cx & 0x3f;
+	ireg.ah = 0x08;
+	ireg.es = 0;
+	intcall(0x13, &ireg, &oreg);
+
+	if (!(oreg.eflags & X86_EFLAGS_CF)) {
+		ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+		ei->legacy_max_head = oreg.dh;
+		ei->legacy_sectors_per_track = oreg.cl & 0x3f;
 	}
 
 	return 0;
-- 
GitLab


From 0a706db320768f8f6e43bbf73b58d2aabdc93354 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:19:00 -0700
Subject: [PATCH 0333/2198] x86, setup: "glove box" BIOS interrupts in the MCA
 code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the MCA code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 arch/x86/boot/mca.c | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c
index 911eaae5d6964..a95a531148ef9 100644
--- a/arch/x86/boot/mca.c
+++ b/arch/x86/boot/mca.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -16,26 +17,22 @@
 
 int query_mca(void)
 {
-	u8 err;
-	u16 es, bx, len;
-
-	asm("pushw %%es ; "
-	    "int $0x15 ; "
-	    "setc %0 ; "
-	    "movw %%es, %1 ; "
-	    "popw %%es"
-	    : "=acd" (err), "=acdSD" (es), "=b" (bx)
-	    : "a" (0xc000));
-
-	if (err)
+	struct biosregs ireg, oreg;
+	u16 len;
+
+	initregs(&ireg);
+	ireg.ah = 0xc0;
+	intcall(0x15, &ireg, &oreg);
+
+	if (oreg.eflags & X86_EFLAGS_CF)
 		return -1;	/* No MCA present */
 
-	set_fs(es);
-	len = rdfs16(bx);
+	set_fs(oreg.es);
+	len = rdfs16(oreg.bx);
 
 	if (len > sizeof(boot_params.sys_desc_table))
 		len = sizeof(boot_params.sys_desc_table);
 
-	copy_from_fs(&boot_params.sys_desc_table, bx, len);
+	copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
 	return 0;
 }
-- 
GitLab


From cf06de7b9cdd3efee7a59dced1977b3c21d43732 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@linux.intel.com>
Date: Wed, 1 Apr 2009 18:20:11 -0700
Subject: [PATCH 0334/2198] x86, setup: "glove box" BIOS interrupts in the
 video code

Impact: BIOS proofing

"Glove box" off BIOS interrupts in the video code.

LKML-Reference: <49DE7F79.4030106@zytor.com>
Signed-off-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Rafael J. Wysocki <rjw@sisk.pl>
---
 arch/x86/boot/video-bios.c |  27 ++++----
 arch/x86/boot/video-vesa.c | 137 ++++++++++++++++---------------------
 arch/x86/boot/video-vga.c  |  95 +++++++++++++++----------
 arch/x86/boot/video.c      |  42 +++++-------
 arch/x86/boot/video.h      |  14 ----
 5 files changed, 151 insertions(+), 164 deletions(-)

diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 3fa979c9c363a..d660be4923634 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
 
 static int set_bios_mode(u8 mode)
 {
-	u16 ax;
+	struct biosregs ireg, oreg;
 	u8 new_mode;
 
-	ax = mode;		/* AH=0x00 Set Video Mode */
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
+	initregs(&ireg);
+	ireg.al = mode;		/* AH=0x00 Set Video Mode */
+	intcall(0x10, &ireg, NULL);
 
-	ax = 0x0f00;		/* Get Current Video Mode */
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
+
+	ireg.ah = 0x0f;		/* Get Current Video Mode */
+	intcall(0x10, &ireg, &oreg);
 
 	do_restore = 1;		/* Assume video contents were lost */
-	new_mode = ax & 0x7f;	/* Not all BIOSes are clean with the top bit */
+
+	/* Not all BIOSes are clean with the top bit */
+	new_mode = ireg.al & 0x7f;
 
 	if (new_mode == mode)
 		return 0;	/* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
 		/* Mode setting failed, but we didn't end up where we
 		   started.  That's bad.  Try to revert to the original
 		   video mode. */
-		ax = boot_params.screen_info.orig_video_mode;
-		asm volatile(INT10
-			     : "+a" (ax)
-			     : : "ebx", "ecx", "edx", "esi", "edi");
+		ireg.ax = boot_params.screen_info.orig_video_mode;
+		intcall(0x10, &ireg, NULL);
 	}
 #endif
 	return -1;
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 4a58c8ce3f696..c700147d6ffb2 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
 static int vesa_probe(void)
 {
 #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
-	u16 ax, cx, di;
+	struct biosregs ireg, oreg;
 	u16 mode;
 	addr_t mode_ptr;
 	struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
 
 	video_vesa.modes = GET_HEAP(struct mode_info, 0);
 
-	ax = 0x4f00;
-	di = (size_t)&vginfo;
-	asm(INT10
-	    : "+a" (ax), "+D" (di), "=m" (vginfo)
-	    : : "ebx", "ecx", "edx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f00;
+	ireg.di = (size_t)&vginfo;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f ||
+	if (ireg.ax != 0x004f ||
 	    vginfo.signature != VESA_MAGIC ||
 	    vginfo.version < 0x0102)
 		return 0;	/* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
 
 		memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
 
-		ax = 0x4f01;
-		cx = mode;
-		di = (size_t)&vminfo;
-		asm(INT10
-		    : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-		    : : "ebx", "edx", "esi");
+		ireg.ax = 0x4f01;
+		ireg.cx = mode;
+		ireg.di = (size_t)&vminfo;
+		intcall(0x10, &ireg, &oreg);
 
-		if (ax != 0x004f)
+		if (ireg.ax != 0x004f)
 			continue;
 
 		if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
 
 static int vesa_set_mode(struct mode_info *mode)
 {
-	u16 ax, bx, cx, di;
+	struct biosregs ireg, oreg;
 	int is_graphic;
 	u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
 
 	memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
 
-	ax = 0x4f01;
-	cx = vesa_mode;
-	di = (size_t)&vminfo;
-	asm(INT10
-	    : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-	    : : "ebx", "edx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f01;
+	ireg.cx = vesa_mode;
+	ireg.di = (size_t)&vminfo;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return -1;
 
 	if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
 	}
 
 
-	ax = 0x4f02;
-	bx = vesa_mode;
-	di = 0;
-	asm volatile(INT10
-		     : "+a" (ax), "+b" (bx), "+D" (di)
-		     : : "ecx", "edx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f02;
+	ireg.bx = vesa_mode;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return -1;
 
 	graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
 /* Switch DAC to 8-bit mode */
 static void vesa_dac_set_8bits(void)
 {
+	struct biosregs ireg, oreg;
 	u8 dac_size = 6;
 
 	/* If possible, switch the DAC to 8-bit mode */
 	if (vginfo.capabilities & 1) {
-		u16 ax, bx;
-
-		ax = 0x4f08;
-		bx = 0x0800;
-		asm volatile(INT10
-			     : "+a" (ax), "+b" (bx)
-			     : : "ecx", "edx", "esi", "edi");
-
-		if (ax == 0x004f)
-			dac_size = bx >> 8;
+		initregs(&ireg);
+		ireg.ax = 0x4f08;
+		ireg.bh = 0x08;
+		intcall(0x10, &ireg, &oreg);
+		if (oreg.ax == 0x004f)
+			dac_size = oreg.bh;
 	}
 
 	/* Set the color sizes to the DAC size, and offsets to 0 */
-	boot_params.screen_info.red_size = dac_size;
+	boot_params.screen_info.red_size   = dac_size;
 	boot_params.screen_info.green_size = dac_size;
-	boot_params.screen_info.blue_size = dac_size;
-	boot_params.screen_info.rsvd_size = dac_size;
+	boot_params.screen_info.blue_size  = dac_size;
+	boot_params.screen_info.rsvd_size  = dac_size;
 
-	boot_params.screen_info.red_pos = 0;
-	boot_params.screen_info.green_pos = 0;
-	boot_params.screen_info.blue_pos = 0;
-	boot_params.screen_info.rsvd_pos = 0;
+	boot_params.screen_info.red_pos    = 0;
+	boot_params.screen_info.green_pos  = 0;
+	boot_params.screen_info.blue_pos   = 0;
+	boot_params.screen_info.rsvd_pos   = 0;
 }
 
 /* Save the VESA protected mode info */
 static void vesa_store_pm_info(void)
 {
-	u16 ax, bx, di, es;
+	struct biosregs ireg, oreg;
 
-	ax = 0x4f0a;
-	bx = di = 0;
-	asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
-	    : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
-	    : : "ecx", "esi");
+	initregs(&ireg);
+	ireg.ax = 0x4f0a;
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return;
 
-	boot_params.screen_info.vesapm_seg = es;
-	boot_params.screen_info.vesapm_off = di;
+	boot_params.screen_info.vesapm_seg = oreg.es;
+	boot_params.screen_info.vesapm_off = oreg.di;
 }
 
 /*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
 void vesa_store_edid(void)
 {
 #ifdef CONFIG_FIRMWARE_EDID
-	u16 ax, bx, cx, dx, di;
+	struct biosregs ireg, oreg;
 
 	/* Apparently used as a nonsense token... */
 	memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
 	if (vginfo.version < 0x0200)
 		return;		/* EDID requires VBE 2.0+ */
 
-	ax = 0x4f15;		/* VBE DDC */
-	bx = 0x0000;		/* Report DDC capabilities */
-	cx = 0;			/* Controller 0 */
-	di = 0;			/* ES:DI must be 0 by spec */
-
-	/* Note: The VBE DDC spec is different from the main VESA spec;
-	   we genuinely have to assume all registers are destroyed here. */
-
-	asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
-	    : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
-	    : : "esi", "edx");
+	initregs(&ireg);
+	ireg.ax = 0x4f15;		/* VBE DDC */
+	/* ireg.bx = 0x0000; */		/* Report DDC capabilities */
+	/* ireg.cx = 0;	*/		/* Controller 0 */
+	ireg.es = 0;			/* ES:DI must be 0 by spec */
+	intcall(0x10, &ireg, &oreg);
 
-	if (ax != 0x004f)
+	if (oreg.ax != 0x004f)
 		return;		/* No EDID */
 
 	/* BH = time in seconds to transfer EDD information */
 	/* BL = DDC level supported */
 
-	ax = 0x4f15;		/* VBE DDC */
-	bx = 0x0001;		/* Read EDID */
-	cx = 0;			/* Controller 0 */
-	dx = 0;			/* EDID block number */
-	di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
-	asm(INT10
-	    : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
-	      "+c" (cx), "+D" (di)
-	    : : "esi");
+	ireg.ax = 0x4f15;		/* VBE DDC */
+	ireg.bx = 0x0001;		/* Read EDID */
+	/* ireg.cx = 0; */		/* Controller 0 */
+	/* ireg.dx = 0;	*/		/* EDID block number */
+	ireg.es = ds();
+	ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+	intcall(0x10, &ireg, &oreg);
 #endif /* CONFIG_FIRMWARE_EDID */
 }
 
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 9e0587a377686..8f8d827e254d0 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
 /* Set basic 80x25 mode */
 static u8 vga_set_basic_mode(void)
 {
+	struct biosregs ireg, oreg;
 	u16 ax;
 	u8 rows;
 	u8 mode;
 
+	initregs(&ireg);
+
 #ifdef CONFIG_VIDEO_400_HACK
 	if (adapter >= ADAPTER_VGA) {
-		asm volatile(INT10
-			     : : "a" (0x1202), "b" (0x0030)
-			     : "ecx", "edx", "esi", "edi");
+		ireg.ax = 0x1202;
+		ireg.bx = 0x0030;
+		intcall(0x10, &ireg, NULL);
 	}
 #endif
 
 	ax = 0x0f00;
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
-
-	mode = (u8)ax;
+	intcall(0x10, &ireg, &oreg);
+	mode = oreg.al;
 
 	set_fs(0);
 	rows = rdfs8(0x484);	/* rows minus one */
 
 #ifndef CONFIG_VIDEO_400_HACK
-	if ((ax == 0x5003 || ax == 0x5007) &&
+	if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
 	    (rows == 0 || rows == 24))
 		return mode;
 #endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
 		mode = 3;
 
 	/* Set the mode */
-	ax = mode;
-	asm volatile(INT10
-		     : "+a" (ax)
-		     : : "ebx", "ecx", "edx", "esi", "edi");
+	ireg.ax = mode;		/* AH=0: set mode */
+	intcall(0x10, &ireg, NULL);
 	do_restore = 1;
 	return mode;
 }
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
 static void vga_set_8font(void)
 {
 	/* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+	struct biosregs ireg;
+
+	initregs(&ireg);
 
 	/* Set 8x8 font */
-	asm volatile(INT10 : : "a" (0x1112), "b" (0));
+	ireg.ax = 0x1112;
+	/* ireg.bl = 0; */
+	intcall(0x10, &ireg, NULL);
 
 	/* Use alternate print screen */
-	asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+	ireg.ax = 0x1200;
+	ireg.bl = 0x20;
+	intcall(0x10, &ireg, NULL);
 
 	/* Turn off cursor emulation */
-	asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+	ireg.ax = 0x1201;
+	ireg.bl = 0x34;
+	intcall(0x10, &ireg, NULL);
 
 	/* Cursor is scan lines 6-7 */
-	asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+	ireg.ax = 0x0100;
+	ireg.cx = 0x0607;
+	intcall(0x10, &ireg, NULL);
 }
 
 static void vga_set_14font(void)
 {
 	/* Set 9x14 font - 80x28 on VGA */
+	struct biosregs ireg;
+
+	initregs(&ireg);
 
 	/* Set 9x14 font */
-	asm volatile(INT10 : : "a" (0x1111), "b" (0));
+	ireg.ax = 0x1111;
+	/* ireg.bl = 0; */
+	intcall(0x10, &ireg, NULL);
 
 	/* Turn off cursor emulation */
-	asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+	ireg.ax = 0x1201;
+	ireg.bl = 0x34;
+	intcall(0x10, &ireg, NULL);
 
 	/* Cursor is scan lines 11-12 */
-	asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+	ireg.ax = 0x0100;
+	ireg.cx = 0x0b0c;
+	intcall(0x10, &ireg, NULL);
 }
 
 static void vga_set_80x43(void)
 {
 	/* Set 80x43 mode on VGA (not EGA) */
+	struct biosregs ireg;
+
+	initregs(&ireg);
 
 	/* Set 350 scans */
-	asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+	ireg.ax = 0x1201;
+	ireg.bl = 0x30;
+	intcall(0x10, &ireg, NULL);
 
 	/* Reset video mode */
-	asm volatile(INT10 : : "a" (0x0003));
+	ireg.ax = 0x0003;
+	intcall(0x10, &ireg, NULL);
 
 	vga_set_8font();
 }
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
  */
 static int vga_probe(void)
 {
-	u16 ega_bx;
-
 	static const char *card_name[] = {
 		"CGA/MDA/HGC", "EGA", "VGA"
 	};
@@ -240,26 +263,26 @@ static int vga_probe(void)
 		sizeof(ega_modes)/sizeof(struct mode_info),
 		sizeof(vga_modes)/sizeof(struct mode_info),
 	};
-	u8 vga_flag;
 
-	asm(INT10
-	    : "=b" (ega_bx)
-	    : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
-	    : "ecx", "edx", "esi", "edi");
+	struct biosregs ireg, oreg;
+
+	initregs(&ireg);
+
+	ireg.ax = 0x1200;
+	ireg.bl = 0x10;		/* Check EGA/VGA */
+	intcall(0x10, &ireg, &oreg);
 
 #ifndef _WAKEUP
-	boot_params.screen_info.orig_video_ega_bx = ega_bx;
+	boot_params.screen_info.orig_video_ega_bx = oreg.bx;
 #endif
 
 	/* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
-	if ((u8)ega_bx != 0x10) {
+	if (oreg.bl != 0x10) {
 		/* EGA/VGA */
-		asm(INT10
-		    : "=a" (vga_flag)
-		    : "a" (0x1a00)
-		    : "ebx", "ecx", "edx", "esi", "edi");
+		ireg.ax = 0x1a00;
+		intcall(0x10, &ireg, &oreg);
 
-		if (vga_flag == 0x1a) {
+		if (oreg.al == 0x1a) {
 			adapter = ADAPTER_VGA;
 #ifndef _WAKEUP
 			boot_params.screen_info.orig_video_isVGA = 1;
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 3bef2c1febe93..bad728b76fc2a 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -18,33 +19,29 @@
 
 static void store_cursor_position(void)
 {
-	u16 curpos;
-	u16 ax, bx;
+	struct biosregs ireg, oreg;
 
-	ax = 0x0300;
-	bx = 0;
-	asm(INT10
-	    : "=d" (curpos), "+a" (ax), "+b" (bx)
-	    : : "ecx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x03;
+	intcall(0x10, &ireg, &oreg);
 
-	boot_params.screen_info.orig_x = curpos;
-	boot_params.screen_info.orig_y = curpos >> 8;
+	boot_params.screen_info.orig_x = oreg.dl;
+	boot_params.screen_info.orig_y = oreg.dh;
 }
 
 static void store_video_mode(void)
 {
-	u16 ax, page;
+	struct biosregs ireg, oreg;
 
 	/* N.B.: the saving of the video page here is a bit silly,
 	   since we pretty much assume page 0 everywhere. */
-	ax = 0x0f00;
-	asm(INT10
-	    : "+a" (ax), "=b" (page)
-	    : : "ecx", "edx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x0f;
+	intcall(0x10, &ireg, &oreg);
 
 	/* Not all BIOSes are clean with respect to the top bit */
-	boot_params.screen_info.orig_video_mode = ax & 0x7f;
-	boot_params.screen_info.orig_video_page = page >> 8;
+	boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+	boot_params.screen_info.orig_video_page = oreg.bh;
 }
 
 /*
@@ -257,7 +254,7 @@ static void restore_screen(void)
 	int y;
 	addr_t dst = 0;
 	u16 *src = saved.data;
-	u16 ax, bx, dx;
+	struct biosregs ireg;
 
 	if (graphic_mode)
 		return;		/* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
 	}
 
 	/* Restore cursor position */
-	ax = 0x0200;		/* Set cursor position */
-	bx = 0;			/* Page number (<< 8) */
-	dx = (saved.cury << 8)+saved.curx;
-	asm volatile(INT10
-		     : "+a" (ax), "+b" (bx), "+d" (dx)
-		     : : "ecx", "esi", "edi");
+	initregs(&ireg);
+	ireg.ah = 0x02;		/* Set cursor position */
+	ireg.dh = saved.cury;
+	ireg.dl = saved.curx;
+	intcall(0x10, &ireg, NULL);
 }
 #else
 #define save_screen()		((void)0)
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index ee63f5d144615..5bb174a997fc6 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -112,20 +112,6 @@ extern int force_x, force_y;	/* Don't query the BIOS for cols/rows */
 extern int do_restore;		/* Restore screen contents */
 extern int graphic_mode;	/* Graphics mode with linear frame buffer */
 
-/*
- * int $0x10 is notorious for touching registers it shouldn't.
- * gcc doesn't like %ebp being clobbered, so define it as a push/pop
- * sequence here.
- *
- * A number of systems, including the original PC can clobber %bp in
- * certain circumstances, like when scrolling.  There exists at least
- * one Trident video card which could clobber DS under a set of
- * circumstances that we are unlikely to encounter (scrolling when
- * using an extended graphics mode of more than 800x600 pixels), but
- * it's cheap insurance to deal with that here.
- */
-#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
-
 /* Accessing VGA indexed registers */
 static inline u8 in_idx(u16 port, u8 index)
 {
-- 
GitLab


From 2062501ae6505dbc5bff3a792246c2661d114050 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 6 Apr 2009 01:49:33 +0200
Subject: [PATCH 0335/2198] tracing/lockdep: report the time waited for a lock

While trying to optimize the new lock on reiserfs to replace
the bkl, I find the lock tracing very useful though it lacks
something important for performance (and latency) instrumentation:
the time a task waits for a lock.

That's what this patch implements:

  bash-4816  [000]   202.652815: lock_contended: lock_contended: &sb->s_type->i_mutex_key
  bash-4816  [000]   202.652819: lock_acquired: &rq->lock (0.000 us)
 <...>-4787  [000]   202.652825: lock_acquired: &rq->lock (0.000 us)
 <...>-4787  [000]   202.652829: lock_acquired: &rq->lock (0.000 us)
  bash-4816  [000]   202.652833: lock_acquired: &sb->s_type->i_mutex_key (16.005 us)

As shown above, the "lock acquired" field is followed by the time
it has been waiting for the lock. Usually, a lock contended entry
is followed by a near lock_acquired entry with a non-zero time waited.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1238975373-15739-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/lockdep_event_types.h | 23 ++++++++++++++++++-----
 kernel/lockdep.c                    |  8 ++++----
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
index adccfcd2ec8fc..863f1e4583a66 100644
--- a/include/trace/lockdep_event_types.h
+++ b/include/trace/lockdep_event_types.h
@@ -32,11 +32,24 @@ TRACE_FORMAT(lock_contended,
 	TP_FMT("%s", lock->name)
 	);
 
-TRACE_FORMAT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__entry->name = lock->name;
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
 
 #endif
 #endif
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b0f011866969a..c4582a6ea9534 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3061,6 +3061,8 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 	put_lock_stats(stats);
 }
 
+DEFINE_TRACE(lock_acquired);
+
 static void
 __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
@@ -3099,6 +3101,8 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 		hlock->holdtime_stamp = now;
 	}
 
+	trace_lock_acquired(lock, ip, waittime);
+
 	stats = get_lock_stats(hlock_class(hlock));
 	if (waittime) {
 		if (hlock->read)
@@ -3137,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-DEFINE_TRACE(lock_acquired);
-
 void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
-	trace_lock_acquired(lock, ip);
-
 	if (unlikely(!lock_stat))
 		return;
 
-- 
GitLab


From e71e99c294058a61b7a8b9bb6da2f745ac51aa4f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Mar 2009 14:30:04 -0400
Subject: [PATCH 0336/2198] x86, function-graph: only save return values on
 x86_64

Impact: speed up

The return to handler portion of the function graph tracer should only
need to save the return values. The caller already saved off the
registers that the callee can modify. The returning function already
saved the registers it modified. When we call our own trace function
it too will save the registers that the callee must restore.

There's no reason to save off anything more that the registers used
to return the values.

Note, I did a complete kernel build with this modification and the
function graph tracer running on x86_64.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/entry_64.S | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a331ec38af9eb..1ac99865591cb 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
 GLOBAL(return_to_handler)
 	subq  $80, %rsp
 
+	/* Save the return values */
 	movq %rax, (%rsp)
-	movq %rcx, 8(%rsp)
-	movq %rdx, 16(%rsp)
-	movq %rsi, 24(%rsp)
-	movq %rdi, 32(%rsp)
-	movq %r8, 40(%rsp)
-	movq %r9, 48(%rsp)
-	movq %r10, 56(%rsp)
-	movq %r11, 64(%rsp)
+	movq %rdx, 8(%rsp)
 
 	call ftrace_return_to_handler
 
 	movq %rax, 72(%rsp)
-	movq 64(%rsp), %r11
-	movq 56(%rsp), %r10
-	movq 48(%rsp), %r9
-	movq 40(%rsp), %r8
-	movq 32(%rsp), %rdi
-	movq 24(%rsp), %rsi
-	movq 16(%rsp), %rdx
-	movq 8(%rsp), %rcx
+	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $72, %rsp
 	retq
-- 
GitLab


From 5cb3d1d9d34ac04bcaa2034139345b2a5fea54c1 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Thu, 9 Apr 2009 14:08:18 +0800
Subject: [PATCH 0337/2198] tracing, net, skb tracepoint: make skb tracepoint
 use the TRACE_EVENT() macro

TRACE_EVENT is a more generic way to define a tracepoint.
Doing so adds these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Neil Horman <nhorman@tuxdriver.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Steven Rostedt ;" <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DD90D2.5020604@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/skb.h               |  4 +---
 include/trace/skb_event_types.h   | 38 +++++++++++++++++++++++++++++++
 include/trace/trace_event_types.h |  1 +
 include/trace/trace_events.h      |  1 +
 4 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 include/trace/skb_event_types.h

diff --git a/include/trace/skb.h b/include/trace/skb.h
index b66206d9be729..d2de7174a6e8b 100644
--- a/include/trace/skb.h
+++ b/include/trace/skb.h
@@ -4,8 +4,6 @@
 #include <linux/skbuff.h>
 #include <linux/tracepoint.h>
 
-DECLARE_TRACE(kfree_skb,
-	TP_PROTO(struct sk_buff *skb, void *location),
-	TP_ARGS(skb, location));
+#include <trace/skb_event_types.h>
 
 #endif
diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h
new file mode 100644
index 0000000000000..4a1c504c0e167
--- /dev/null
+++ b/include/trace/skb_event_types.h
@@ -0,0 +1,38 @@
+
+/* use <trace/skb.h> instead */
+#ifndef TRACE_EVENT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
+
+/*
+ * Tracepoint for free an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+	TP_PROTO(struct sk_buff *skb, void *location),
+
+	TP_ARGS(skb, location),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned short,	protocol	)
+		__field(	void *,		location	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
+		__entry->location = location;
+	),
+
+	TP_printk("skbaddr=%p protocol=%u location=%p",
+		__entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index df56f5694be64..33b6bfcba93bd 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -3,3 +3,4 @@
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
+#include <trace/skb_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index fd13750ca4ba3..0e2aa80076d94 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -3,3 +3,4 @@
 #include <trace/sched.h>
 #include <trace/irq.h>
 #include <trace/lockdep.h>
+#include <trace/skb.h>
-- 
GitLab


From bda869c614c937c318547c3ee1d65a316b693c21 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:05:10 +0200
Subject: [PATCH 0338/2198] x86: cacheinfo: use L3 cache index disable feature
 only for CPUs that support it

AMD family 0x11 CPU doesn't support the feature.

Some AMD family 0x10 CPUs do not support it or have an erratum, see
erratum #382 in "Revision Guide for AMD Family 10h Processors, 41322
Rev. 3.40 February 2009".

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
CC: Mark Langsdorf <mark.langsdorf@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409130510.GG31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 483eda96e1020..72401264912c3 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -291,6 +291,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
 	if (index < 3)
 		return;
+
+	if (boot_cpu_data.x86 == 0x11)
+		return;
+
+	/* see erratum #382 */
+	if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+		return;
+
 	this_leaf->can_disable = 1;
 }
 
-- 
GitLab


From 845d8c761ec763871936c62b837c4a9ea6d0fbdb Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:07:29 +0200
Subject: [PATCH 0339/2198] x86: cacheinfo: correct return value when
 cache_disable feature is not active

Impact: bug fix

If user writes to "cache_disable" attribute on a CPU that does not support
this feature, the process hangs due to an invalid return value in
store_cache_disable().

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20090409130729.GH31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 72401264912c3..1ab46e05adf0c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -771,7 +771,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
 	unsigned int ret, index, val;
 
 	if (!this_leaf->can_disable)
-		return 0;
+		return -EINVAL;
 
 	if (strlen(buf) > 15)
 		return -EINVAL;
-- 
GitLab


From afd9fceec55225d33be878927056a548c2eef26c Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:16:17 +0200
Subject: [PATCH 0340/2198] x86: cacheinfo: use cached K8 NB_MISC devices
 instead of scanning for it

Impact: avoid code duplication

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20090409131617.GI31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/k8.h             |  8 ++++++
 arch/x86/kernel/cpu/intel_cacheinfo.c | 37 +++------------------------
 2 files changed, 11 insertions(+), 34 deletions(-)

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index 54c8cc53b24dd..c23b3d171be5f 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -12,4 +12,12 @@ extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
 extern int k8_scan_nodes(unsigned long start, unsigned long end);
 
+#ifdef CONFIG_K8_NB
+#define node_to_k8_nb_misc(node) \
+	(node < num_k8_northbridges) ? k8_northbridges[node] : NULL
+#else
+#define node_to_k8_nb_misc(node) NULL
+#endif
+
+
 #endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ab46e05adf0c..0cde071536972 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,6 +17,7 @@
 
 #include <asm/processor.h>
 #include <asm/smp.h>
+#include <asm/k8.h>
 
 #define LVL_1_INST	1
 #define LVL_1_DATA	2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
 	unsigned long can_disable;
 };
 
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
-	{}
-};
-#endif
-
 unsigned short			num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -704,30 +697,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
 #define to_object(k)	container_of(k, struct _index_kobject, kobj)
 #define to_attr(a)	container_of(a, struct _cache_attr, attr)
 
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
-	struct pci_dev *dev = NULL;
-	int i;
-
-	for (i = 0; i <= node; i++) {
-		do {
-			dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
-			if (!dev)
-				break;
-		} while (!pci_match_id(&k8_nb_id[0], dev));
-		if (!dev)
-			break;
-	}
-	return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
-	return NULL;
-}
-#endif
-
 static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 {
 	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
@@ -739,7 +708,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
 	if (!this_leaf->can_disable)
 		return sprintf(buf, "Feature not enabled\n");
 
-	dev = get_k8_northbridge(node);
+	dev = node_to_k8_nb_misc(node);
 	if (!dev) {
 		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
 		return -EINVAL;
@@ -783,7 +752,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
 		return -EINVAL;
 
 	val |= 0xc0000000;
-	dev = get_k8_northbridge(node);
+	dev = node_to_k8_nb_misc(node);
 	if (!dev) {
 		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
 		return -EINVAL;
-- 
GitLab


From f8b201fc7110c3673437254e8ba02451461ece0b Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 9 Apr 2009 15:18:49 +0200
Subject: [PATCH 0341/2198] x86: cacheinfo: replace sysfs interface for
 cache_disable feature

Impact: replace sysfs attribute

Current interface violates against "one-value-per-sysfs-attribute
rule". This patch replaces current attribute with two attributes --
one for each L3 Cache Index Disable register.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409131849.GJ31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 90 +++++++++++++--------------
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 0cde071536972..fc28291e40bab 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -697,73 +697,69 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
 #define to_object(k)	container_of(k, struct _index_kobject, kobj)
 #define to_attr(a)	container_of(a, struct _cache_attr, attr)
 
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+				  unsigned int index)
 {
-	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-	int node = cpu_to_node(cpumask_first(mask));
-	struct pci_dev *dev = NULL;
-	ssize_t ret = 0;
-	int i;
+	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+	int node = cpu_to_node(cpu);
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	unsigned int reg = 0;
 
 	if (!this_leaf->can_disable)
-		return sprintf(buf, "Feature not enabled\n");
-
-	dev = node_to_k8_nb_misc(node);
-	if (!dev) {
-		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
 		return -EINVAL;
-	}
 
-	for (i = 0; i < 2; i++) {
-		unsigned int reg;
+	if (!dev)
+		return -EINVAL;
 
-		pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+	pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+	return sprintf(buf, "%x\n", reg);
+}
 
-		ret += sprintf(buf, "%sEntry: %d\n", buf, i);
-		ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",  
-			buf,
-			reg & 0x80000000 ? "Disabled" : "Allowed",
-			reg & 0x40000000 ? "Disabled" : "Allowed");
-		ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
-			buf, (reg & 0x30000) >> 16, reg & 0xfff);
-	}
-	return ret;
+#define SHOW_CACHE_DISABLE(index)					\
+static ssize_t								\
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)  	\
+{									\
+	return show_cache_disable(this_leaf, buf, index);		\
 }
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
 
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
-		    size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+	const char *buf, size_t count, unsigned int index)
 {
-	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-	int node = cpu_to_node(cpumask_first(mask));
-	struct pci_dev *dev = NULL;
-	unsigned int ret, index, val;
+	int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+	int node = cpu_to_node(cpu);
+	struct pci_dev *dev = node_to_k8_nb_misc(node);
+	unsigned long val = 0;
 
 	if (!this_leaf->can_disable)
 		return -EINVAL;
 
-	if (strlen(buf) > 15)
-		return -EINVAL;
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
 
-	ret = sscanf(buf, "%x %x", &index, &val);
-	if (ret != 2)
-		return -EINVAL;
-	if (index > 1)
+	if (!dev)
 		return -EINVAL;
 
-	val |= 0xc0000000;
-	dev = node_to_k8_nb_misc(node);
-	if (!dev) {
-		printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
+	if (strict_strtoul(buf, 10, &val) < 0)
 		return -EINVAL;
-	}
 
+	val |= 0xc0000000;
 	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
 	wbinvd();
 	pci_write_config_dword(dev, 0x1BC + index * 4, val);
+	return count;
+}
 
-	return 1;
+#define STORE_CACHE_DISABLE(index)					\
+static ssize_t								\
+store_cache_disable_##index(struct _cpuid4_info *this_leaf,	     	\
+			    const char *buf, size_t count)		\
+{									\
+	return store_cache_disable(this_leaf, buf, count, index);	\
 }
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
 
 struct _cache_attr {
 	struct attribute attr;
@@ -785,7 +781,10 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+		show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+		show_cache_disable_1, store_cache_disable_1);
 
 static struct attribute * default_attrs[] = {
 	&type.attr,
@@ -797,7 +796,8 @@ static struct attribute * default_attrs[] = {
 	&size.attr,
 	&shared_cpu_map.attr,
 	&shared_cpu_list.attr,
-	&cache_disable.attr,
+	&cache_disable_0.attr,
+	&cache_disable_1.attr,
 	NULL
 };
 
-- 
GitLab


From ba518bea2db21c72d44a6cbfd825b026ef9cdcb6 Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 9 Apr 2009 15:24:06 +0200
Subject: [PATCH 0342/2198] x86: cacheinfo: disable L3 ECC scrubbing when L3
 cache index is disabled

(Use correct mask to zero out bits 24-28 by Andreas)

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409132406.GK31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index fc28291e40bab..d46a849f44a84 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -731,6 +731,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 	int node = cpu_to_node(cpu);
 	struct pci_dev *dev = node_to_k8_nb_misc(node);
 	unsigned long val = 0;
+	unsigned int scrubber = 0;
 
 	if (!this_leaf->can_disable)
 		return -EINVAL;
@@ -745,6 +746,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 		return -EINVAL;
 
 	val |= 0xc0000000;
+
+	pci_read_config_dword(dev, 0x58, &scrubber);
+	scrubber &= ~0x1f000000;
+	pci_write_config_dword(dev, 0x58, scrubber);
+
 	pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
 	wbinvd();
 	pci_write_config_dword(dev, 0x1BC + index * 4, val);
-- 
GitLab


From 2fad2d9bb8310889f3261035b594b4e068b6eb8b Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Thu, 9 Apr 2009 15:31:53 +0200
Subject: [PATCH 0343/2198] x86/docs: add description for cache_disable sysfs
 interface

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090409133153.GL31527@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../ABI/testing/sysfs-devices-cache_disable    | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 Documentation/ABI/testing/sysfs-devices-cache_disable

diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable
new file mode 100644
index 0000000000000..175bb4f705124
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-devices-cache_disable
@@ -0,0 +1,18 @@
+What:      /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
+Date:      August 2008
+KernelVersion:	2.6.27
+Contact:	mark.langsdorf@amd.com
+Description:	These files exist in every cpu's cache index directories.
+		There are currently 2 cache_disable_# files in each
+		directory.  Reading from these files on a supported
+		processor will return that cache disable index value
+		for that processor and node.  Writing to one of these
+		files will cause the specificed cache index to be disabled.
+
+		Currently, only AMD Family 10h Processors support cache index
+		disable, and only for their L3 caches.  See the BIOS and
+		Kernel Developer's Guide at
+		http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
+		for formatting information and other details on the
+		cache index disable.
+Users:    joachim.deguara@amd.com
-- 
GitLab


From f465145235313c451164bdfa9037ac254bf00c9a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:18 +0300
Subject: [PATCH 0344/2198] x86: move x86_quirk_pre_intr_init() to irqinit_32.c

Impact: cleanup

In preparation for unifying irqinit_{32,64}.c, make
x86_quirk_pre_intr_init() local to irqinit_32.c.

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/i8259.h |  4 ----
 arch/x86/include/asm/setup.h |  1 -
 arch/x86/kernel/irqinit_32.c | 20 +++++++++++++++++++-
 arch/x86/kernel/setup.c      | 18 ------------------
 4 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1a99e6c092afc..58d7091eeb1fc 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
 extern void mask_8259A(void);
 extern void unmask_8259A(void);
 
-#ifdef CONFIG_X86_32
-extern void init_ISA_irqs(void);
-#endif
-
 #endif /* _ASM_X86_I8259_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index bdc2ada05ae06..4093d1ed6db2a 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -33,7 +33,6 @@ struct x86_quirks {
 	int (*setup_ioapic_ids)(void);
 };
 
-extern void x86_quirk_pre_intr_init(void);
 extern void x86_quirk_intr_init(void);
 
 extern void x86_quirk_trap_init(void);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 368b0a8836f90..0c0dedccd036d 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -53,7 +53,7 @@ static struct irqaction fpu_irq = {
 	.name = "fpu",
 };
 
-void __init init_ISA_irqs(void)
+static void __init init_ISA_irqs(void)
 {
 	int i;
 
@@ -121,6 +121,24 @@ int vector_used_by_percpu_irq(unsigned int vector)
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+
 void __init native_init_IRQ(void)
 {
 	int i;
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf634..523bb697120d1 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -996,24 +996,6 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_X86_32
 
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-
 /**
  * x86_quirk_intr_init - post gate setup interrupt initialisation
  *
-- 
GitLab


From 7371d9fcb88dc9185be9719f64744a339c537a92 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:19 +0300
Subject: [PATCH 0345/2198] x86: move init_ISA_irqs() in irqinit_32.c to match
 ordering in irqinit_64.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 48 ++++++++++++++++++------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 0c0dedccd036d..c5cb769db7b07 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -53,30 +53,6 @@ static struct irqaction fpu_irq = {
 	.name = "fpu",
 };
 
-static void __init init_ISA_irqs(void)
-{
-	int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
-		struct irq_desc *desc = irq_to_desc(i);
-
-		desc->status = IRQ_DISABLED;
-		desc->action = NULL;
-		desc->depth = 1;
-
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
@@ -118,6 +94,30 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	return 0;
 }
 
+static void __init init_ISA_irqs(void)
+{
+	int i;
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	init_bsp_APIC();
+#endif
+	init_8259A(0);
+
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
+	for (i = 0; i < NR_IRQS_LEGACY; i++) {
+		struct irq_desc *desc = irq_to_desc(i);
+
+		desc->status = IRQ_DISABLED;
+		desc->action = NULL;
+		desc->depth = 1;
+
+		set_irq_chip_and_handler_name(i, &i8259A_chip,
+					      handle_level_irq, "XT");
+	}
+}
+
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
-- 
GitLab


From 36290d87f5abf260a543e5b711be4ceed03e6b1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:20 +0300
Subject: [PATCH 0346/2198] x86: introduce smp_intr_init() in irqinit_32.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 61 +++++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index c5cb769db7b07..df0aad5a06245 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -121,6 +121,38 @@ static void __init init_ISA_irqs(void)
 /* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
+static void __init smp_intr_init(void)
+{
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+	/*
+	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+	 * IPI, driven by wakeup.
+	 */
+	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+	/* IPIs for invalidation */
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+	/* IPI for generic function call */
+	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+	/* IPI for single call function */
+	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+				 call_function_single_interrupt);
+
+	/* Low priority IPI to cleanup after moving an irq */
+	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+#endif
+}
+
 /**
  * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
  *
@@ -158,34 +190,7 @@ void __init native_init_IRQ(void)
 	}
 
 
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for single call function */
-	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-				 call_function_single_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
+	smp_intr_init();
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/* self generated IPI for local APIC timer */
-- 
GitLab


From 22813c45228160b07244a7c4ed7580388ac0f33d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:21 +0300
Subject: [PATCH 0347/2198] x86: introduce apic_intr_init() in irqinit_32.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 40 ++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index df0aad5a06245..9ba68c4557b16 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -171,25 +171,8 @@ static void __init x86_quirk_pre_intr_init(void)
 	init_ISA_irqs();
 }
 
-void __init native_init_IRQ(void)
+static void __init apic_intr_init(void)
 {
-	int i;
-
-	/* Execute any quirks before the call gates are initialised: */
-	x86_quirk_pre_intr_init();
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-	}
-
-
 	smp_intr_init();
 
 #ifdef CONFIG_X86_LOCAL_APIC
@@ -208,6 +191,27 @@ void __init native_init_IRQ(void)
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
+}
+
+void __init native_init_IRQ(void)
+{
+	int i;
+
+	/* Execute any quirks before the call gates are initialised: */
+	x86_quirk_pre_intr_init();
+
+	/*
+	 * Cover the whole vector space, no vector can escape
+	 * us. (some of these will be overridden and become
+	 * 'special' SMP interrupts)
+	 */
+	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+	}
+
+	apic_intr_init();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
-- 
GitLab


From d3496c85cae22fb7713af6ed542a6aeae8ee4210 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:22 +0300
Subject: [PATCH 0348/2198] x86: use identical loop constructs in 32-bit and
 64-bit native_init_IRQ()

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 2 +-
 arch/x86/kernel/irqinit_64.c | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 9ba68c4557b16..1029a1855f988 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -205,7 +205,7 @@ void __init native_init_IRQ(void)
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
 		/* SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 8cd10537fd46f..1c8858bb27f2f 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -159,15 +159,16 @@ void __init native_init_IRQ(void)
 	int i;
 
 	init_ISA_irqs();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
-	for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-		int vector = FIRST_EXTERNAL_VECTOR + i;
-		if (vector != IA32_SYSCALL_VECTOR)
-			set_intr_gate(vector, interrupt[i]);
+	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != IA32_SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 	}
 
 	apic_intr_init();
-- 
GitLab


From b0096bb0b640d0a7713618b3472fd0f4adf30a96 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:23 +0300
Subject: [PATCH 0349/2198] x86: unify smp_intr_init() in irqinit_{32,64}.h

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 8 +++++---
 arch/x86/kernel/irqinit_64.c | 2 ++
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 1029a1855f988..ef2528d298b66 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -123,7 +123,8 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
 static void __init smp_intr_init(void)
 {
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
@@ -143,14 +144,15 @@ static void __init smp_intr_init(void)
 	/* IPI for generic function call */
 	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
 
-	/* IPI for single call function */
+	/* IPI for generic single function call */
 	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-				 call_function_single_interrupt);
+			call_function_single_interrupt);
 
 	/* Low priority IPI to cleanup after moving an irq */
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+#endif /* CONFIG_SMP */
 }
 
 /**
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1c8858bb27f2f..9e7c57dc79e6f 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -107,6 +107,7 @@ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 static void __init smp_intr_init(void)
 {
 #ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/*
 	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
 	 * IPI, driven by wakeup.
@@ -134,6 +135,7 @@ static void __init smp_intr_init(void)
 	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
 	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
 #endif
+#endif /* CONFIG_SMP */
 }
 
 static void __init apic_intr_init(void)
-- 
GitLab


From 598c73d250ffb112715aa48fb325d79e255be23b Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:24 +0300
Subject: [PATCH 0350/2198] x86: unify init_ISA_irqs() in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c |  2 +-
 arch/x86/kernel/irqinit_64.c | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index ef2528d298b66..4488b713396e1 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -98,7 +98,7 @@ static void __init init_ISA_irqs(void)
 {
 	int i;
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	init_bsp_APIC();
 #endif
 	init_8259A(0);
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 9e7c57dc79e6f..61c9a922e80cf 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -84,9 +84,14 @@ static void __init init_ISA_irqs(void)
 {
 	int i;
 
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	init_bsp_APIC();
+#endif
 	init_8259A(0);
 
+	/*
+	 * 16 old-style INTA-cycle interrupts:
+	 */
 	for (i = 0; i < NR_IRQS_LEGACY; i++) {
 		struct irq_desc *desc = irq_to_desc(i);
 
@@ -94,11 +99,8 @@ static void __init init_ISA_irqs(void)
 		desc->action = NULL;
 		desc->depth = 1;
 
-		/*
-		 * 16 old-style INTA-cycle interrupts:
-		 */
 		set_irq_chip_and_handler_name(i, &i8259A_chip,
-						      handle_level_irq, "XT");
+					      handle_level_irq, "XT");
 	}
 }
 
-- 
GitLab


From 320fd99672a44ece6d1cd0d838ba31c8ebbf5979 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:25 +0300
Subject: [PATCH 0351/2198] x86: unify native_init_IRQ() in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 53 ++++++++++++++---------
 arch/x86/kernel/irqinit_64.c | 82 +++++++++++++++++++++++++++++++++++-
 2 files changed, 115 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 4488b713396e1..a780de3ad5da8 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -22,7 +22,7 @@
 #include <asm/i8259.h>
 #include <asm/traps.h>
 
-
+#ifdef CONFIG_X86_32
 /*
  * Note that on a 486, we don't want to do a SIGFPE on an irq13
  * as the irq is unreliable, and exception 16 works correctly
@@ -52,6 +52,7 @@ static struct irqaction fpu_irq = {
 	.handler = math_error_irq,
 	.name = "fpu",
 };
+#endif
 
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
@@ -155,24 +156,6 @@ static void __init smp_intr_init(void)
 #endif /* CONFIG_SMP */
 }
 
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-
 static void __init apic_intr_init(void)
 {
 	smp_intr_init();
@@ -195,12 +178,36 @@ static void __init apic_intr_init(void)
 #endif
 }
 
+#ifdef CONFIG_X86_32
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+#endif
+
 void __init native_init_IRQ(void)
 {
 	int i;
 
+#ifdef CONFIG_X86_32
 	/* Execute any quirks before the call gates are initialised: */
 	x86_quirk_pre_intr_init();
+#else
+	init_ISA_irqs();
+#endif
 
 	/*
 	 * Cover the whole vector space, no vector can escape
@@ -208,9 +215,15 @@ void __init native_init_IRQ(void)
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+#ifdef CONFIG_X86_32
 		/* SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#else
+		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != IA32_SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#endif
 	}
 
 	apic_intr_init();
@@ -218,6 +231,7 @@ void __init native_init_IRQ(void)
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 
+#ifdef CONFIG_X86_32
 	/*
 	 * Call quirks after call gates are initialised (usually add in
 	 * the architecture specific gates):
@@ -232,4 +246,5 @@ void __init native_init_IRQ(void)
 		setup_irq(FPU_IRQ, &fpu_irq);
 
 	irq_ctx_init(smp_processor_id());
+#endif
 }
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 61c9a922e80cf..ed50e35ce97e1 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -39,14 +39,46 @@
  * (these are usually mapped into the 0x30-0xff vector range)
  */
 
+#ifdef CONFIG_X86_32
 /*
- * IRQ2 is cascade interrupt to second interrupt controller
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it is also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+	outb(0, 0xF0);
+	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+		return IRQ_NONE;
+	math_error((void __user *)get_irq_regs()->ip);
+	return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
  */
+static struct irqaction fpu_irq = {
+	.handler = math_error_irq,
+	.name = "fpu",
+};
+#endif
 
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
 static struct irqaction irq2 = {
 	.handler = no_action,
 	.name = "cascade",
 };
+
 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
 	[0 ... IRQ0_VECTOR - 1] = -1,
 	[IRQ0_VECTOR] = 0,
@@ -158,11 +190,36 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 }
 
+#ifdef CONFIG_X86_32
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *	Perform any necessary interrupt initialisation prior to setting up
+ *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *	interrupts should be initialised here if the machine emulates a PC
+ *	in any way.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+	if (x86_quirks->arch_pre_intr_init) {
+		if (x86_quirks->arch_pre_intr_init())
+			return;
+	}
+	init_ISA_irqs();
+}
+#endif
+
 void __init native_init_IRQ(void)
 {
 	int i;
 
+#ifdef CONFIG_X86_32
+	/* Execute any quirks before the call gates are initialised: */
+	x86_quirk_pre_intr_init();
+#else
 	init_ISA_irqs();
+#endif
 
 	/*
 	 * Cover the whole vector space, no vector can escape
@@ -170,13 +227,36 @@ void __init native_init_IRQ(void)
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+#ifdef CONFIG_X86_32
+		/* SYSCALL_VECTOR was reserved in trap_init. */
+		if (i != SYSCALL_VECTOR)
+			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#else
 		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+#endif
 	}
 
 	apic_intr_init();
 
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
+
+#ifdef CONFIG_X86_32
+	/*
+	 * Call quirks after call gates are initialised (usually add in
+	 * the architecture specific gates):
+	 */
+	x86_quirk_intr_init();
+
+	/*
+	 * External FPU? Set up irq13 if so, for
+	 * original braindamaged IBM FERR coupling.
+	 */
+	if (boot_cpu_data.hard_math && !cpu_has_fpu)
+		setup_irq(FPU_IRQ, &fpu_irq);
+
+	irq_ctx_init(smp_processor_id());
+#endif
 }
-- 
GitLab


From 778838600eb6973bdb6fd11e7f91b43cea4d6f45 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:26 +0300
Subject: [PATCH 0352/2198] x86: unify trivial differences in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c | 20 ++++++++++++++++++++
 arch/x86/kernel/irqinit_64.c |  4 ++++
 2 files changed, 24 insertions(+)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index a780de3ad5da8..72ce94268d318 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -1,20 +1,24 @@
+#include <linux/linkage.h>
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/sched.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
+#include <linux/timex.h>
 #include <linux/slab.h>
 #include <linux/random.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/bitops.h>
+#include <linux/acpi.h>
 #include <linux/io.h>
 #include <linux/delay.h>
 
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <asm/timer.h>
+#include <asm/hw_irq.h>
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
@@ -22,6 +26,22 @@
 #include <asm/i8259.h>
 #include <asm/traps.h>
 
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+
 #ifdef CONFIG_X86_32
 /*
  * Note that on a 486, we don't want to do a SIGFPE on an irq13
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index ed50e35ce97e1..687b6c33cd750 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -17,11 +17,14 @@
 
 #include <asm/atomic.h>
 #include <asm/system.h>
+#include <asm/timer.h>
 #include <asm/hw_irq.h>
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 #include <asm/apic.h>
+#include <asm/setup.h>
 #include <asm/i8259.h>
+#include <asm/traps.h>
 
 /*
  * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -136,6 +139,7 @@ static void __init init_ISA_irqs(void)
 	}
 }
 
+/* Overridden in paravirt.c */
 void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
 
 static void __init smp_intr_init(void)
-- 
GitLab


From ab19c25abd14db28d7454f00805ea59f22ed6057 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:27 +0300
Subject: [PATCH 0353/2198] x86: unify apic_intr_init() in irqinit_{32,64}.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit_32.c |  9 ++++++++-
 arch/x86/kernel/irqinit_64.c | 11 +++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 72ce94268d318..f3be5e974275c 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -180,7 +180,12 @@ static void __init apic_intr_init(void)
 {
 	smp_intr_init();
 
-#ifdef CONFIG_X86_LOCAL_APIC
+#ifdef CONFIG_X86_64
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -192,10 +197,12 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 #endif
 
+#ifdef CONFIG_X86_32
 #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
 	/* thermal monitor LVT interrupt */
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 #endif
+#endif
 }
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 687b6c33cd750..f3be5e974275c 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -180,9 +180,12 @@ static void __init apic_intr_init(void)
 {
 	smp_intr_init();
 
+#ifdef CONFIG_X86_64
 	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
 	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
 
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
 	/* self generated IPI for local APIC timer */
 	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
 
@@ -192,6 +195,14 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+#endif
+
+#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+	/* thermal monitor LVT interrupt */
+	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#endif
 }
 
 #ifdef CONFIG_X86_32
-- 
GitLab


From 31cb45ef2600d47191d51253ec94b5e3f689260d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:28 +0300
Subject: [PATCH 0354/2198] x86: unify irqinit_{32,64}.c into irqinit.c

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/Makefile                    |   2 +-
 arch/x86/kernel/{irqinit_32.c => irqinit.c} |   0
 arch/x86/kernel/irqinit_64.c                | 277 --------------------
 3 files changed, 1 insertion(+), 278 deletions(-)
 rename arch/x86/kernel/{irqinit_32.c => irqinit.c} (100%)
 delete mode 100644 arch/x86/kernel/irqinit_64.c

diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 145cce75cda70..16e3acfe19e62 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o	:= $(nostackp)
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y			+= setup.o i8259.o irqinit_$(BITS).o
+obj-y			+= setup.o i8259.o irqinit.o
 obj-$(CONFIG_X86_VISWS)	+= visws_quirks.o
 obj-$(CONFIG_X86_32)	+= probe_roms_32.o
 obj-$(CONFIG_X86_32)	+= sys_i386_32.o i386_ksyms_32.o
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit.c
similarity index 100%
rename from arch/x86/kernel/irqinit_32.c
rename to arch/x86/kernel/irqinit.c
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644
index f3be5e974275c..0000000000000
--- a/arch/x86/kernel/irqinit_64.c
+++ /dev/null
@@ -1,277 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/timer.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-#include <asm/i8259.h>
-#include <asm/traps.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-#ifdef CONFIG_X86_32
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-	outb(0, 0xF0);
-	if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-		return IRQ_NONE;
-	math_error((void __user *)get_irq_regs()->ip);
-	return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-	.handler = math_error_irq,
-	.name = "fpu",
-};
-#endif
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
-	.handler = no_action,
-	.name = "cascade",
-};
-
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... IRQ0_VECTOR - 1] = -1,
-	[IRQ0_VECTOR] = 0,
-	[IRQ1_VECTOR] = 1,
-	[IRQ2_VECTOR] = 2,
-	[IRQ3_VECTOR] = 3,
-	[IRQ4_VECTOR] = 4,
-	[IRQ5_VECTOR] = 5,
-	[IRQ6_VECTOR] = 6,
-	[IRQ7_VECTOR] = 7,
-	[IRQ8_VECTOR] = 8,
-	[IRQ9_VECTOR] = 9,
-	[IRQ10_VECTOR] = 10,
-	[IRQ11_VECTOR] = 11,
-	[IRQ12_VECTOR] = 12,
-	[IRQ13_VECTOR] = 13,
-	[IRQ14_VECTOR] = 14,
-	[IRQ15_VECTOR] = 15,
-	[IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-	int cpu;
-
-	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] != -1)
-			return 1;
-	}
-
-	return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
-	int i;
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	init_bsp_APIC();
-#endif
-	init_8259A(0);
-
-	/*
-	 * 16 old-style INTA-cycle interrupts:
-	 */
-	for (i = 0; i < NR_IRQS_LEGACY; i++) {
-		struct irq_desc *desc = irq_to_desc(i);
-
-		desc->status = IRQ_DISABLED;
-		desc->action = NULL;
-		desc->depth = 1;
-
-		set_irq_chip_and_handler_name(i, &i8259A_chip,
-					      handle_level_irq, "XT");
-	}
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	/*
-	 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-	 * IPI, driven by wakeup.
-	 */
-	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-	/* IPIs for invalidation */
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-	alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-	/* IPI for generic function call */
-	alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-	/* IPI for generic single function call */
-	alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-			call_function_single_interrupt);
-
-	/* Low priority IPI to cleanup after moving an irq */
-	set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-	set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-#endif /* CONFIG_SMP */
-}
-
-static void __init apic_intr_init(void)
-{
-	smp_intr_init();
-
-#ifdef CONFIG_X86_64
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-#endif
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
-	/* self generated IPI for local APIC timer */
-	alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-	/* generic IPI for platform specific use */
-	alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
-	/* IPI vectors for APIC spurious and error interrupts */
-	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
-
-#ifdef CONFIG_X86_32
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
-	/* thermal monitor LVT interrupt */
-	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-#endif
-}
-
-#ifdef CONFIG_X86_32
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *	Perform any necessary interrupt initialisation prior to setting up
- *	the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *	interrupts should be initialised here if the machine emulates a PC
- *	in any way.
- **/
-static void __init x86_quirk_pre_intr_init(void)
-{
-	if (x86_quirks->arch_pre_intr_init) {
-		if (x86_quirks->arch_pre_intr_init())
-			return;
-	}
-	init_ISA_irqs();
-}
-#endif
-
-void __init native_init_IRQ(void)
-{
-	int i;
-
-#ifdef CONFIG_X86_32
-	/* Execute any quirks before the call gates are initialised: */
-	x86_quirk_pre_intr_init();
-#else
-	init_ISA_irqs();
-#endif
-
-	/*
-	 * Cover the whole vector space, no vector can escape
-	 * us. (some of these will be overridden and become
-	 * 'special' SMP interrupts)
-	 */
-	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-#ifdef CONFIG_X86_32
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#else
-		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != IA32_SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#endif
-	}
-
-	apic_intr_init();
-
-	if (!acpi_ioapic)
-		setup_irq(2, &irq2);
-
-#ifdef CONFIG_X86_32
-	/*
-	 * Call quirks after call gates are initialised (usually add in
-	 * the architecture specific gates):
-	 */
-	x86_quirk_intr_init();
-
-	/*
-	 * External FPU? Set up irq13 if so, for
-	 * original braindamaged IBM FERR coupling.
-	 */
-	if (boot_cpu_data.hard_math && !cpu_has_fpu)
-		setup_irq(FPU_IRQ, &fpu_irq);
-
-	irq_ctx_init(smp_processor_id());
-#endif
-}
-- 
GitLab


From ac3048dfd4740becf8d768844cf47ebee363c9f8 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:29 +0300
Subject: [PATCH 0355/2198] x86: define IA32_SYSCALL_VECTOR on 32-bit to reduce
 ifdefs

Impact: cleanup

We can remove some #ifdefs if we define IA32_SYSCALL_VECTOR on 32-bit.

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/irq_vectors.h | 1 +
 arch/x86/kernel/irqinit.c          | 6 ------
 arch/x86/kernel/traps.c            | 5 +----
 3 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 3cbd79bbb47c8..910b5a3d6751f 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -34,6 +34,7 @@
 
 #ifdef CONFIG_X86_32
 # define SYSCALL_VECTOR			0x80
+# define IA32_SYSCALL_VECTOR		0x80
 #else
 # define IA32_SYSCALL_VECTOR		0x80
 #endif
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f3be5e974275c..f2c60a59f474e 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -242,15 +242,9 @@ void __init native_init_IRQ(void)
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-#ifdef CONFIG_X86_32
-		/* SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#else
 		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
 		if (i != IA32_SYSCALL_VECTOR)
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-#endif
 	}
 
 	apic_intr_init();
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a1d288327ff0f..2310700faca5f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -969,11 +969,8 @@ void __init trap_init(void)
 	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
 		set_bit(i, used_vectors);
 
-#ifdef CONFIG_X86_64
 	set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
-	set_bit(SYSCALL_VECTOR, used_vectors);
-#endif
+
 	/*
 	 * Should be a barrier for any external CPU state:
 	 */
-- 
GitLab


From abdb5a5713330e17dfe91ab0d3e29c4744d95162 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 9 Apr 2009 11:52:30 +0300
Subject: [PATCH 0356/2198] x86: remove some ifdefs from native_init_IRQ()

Impact: cleanup

Reviewed-by Cyrill Gorcunov <gorcunov@openvz.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f2c60a59f474e..626977200a584 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -205,7 +205,6 @@ static void __init apic_intr_init(void)
 #endif
 }
 
-#ifdef CONFIG_X86_32
 /**
  * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
  *
@@ -217,24 +216,21 @@ static void __init apic_intr_init(void)
  **/
 static void __init x86_quirk_pre_intr_init(void)
 {
+#ifdef CONFIG_X86_32
 	if (x86_quirks->arch_pre_intr_init) {
 		if (x86_quirks->arch_pre_intr_init())
 			return;
 	}
+#endif
 	init_ISA_irqs();
 }
-#endif
 
 void __init native_init_IRQ(void)
 {
 	int i;
 
-#ifdef CONFIG_X86_32
 	/* Execute any quirks before the call gates are initialised: */
 	x86_quirk_pre_intr_init();
-#else
-	init_ISA_irqs();
-#endif
 
 	/*
 	 * Cover the whole vector space, no vector can escape
-- 
GitLab


From 6265ff19ca08df0d96c859ae5e4dc2d9ad07070e Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Thu, 9 Apr 2009 15:47:10 +0200
Subject: [PATCH 0357/2198] x86: cacheinfo: complete L2/L3 Cache and TLB
 associativity field definitions

See "CPUID Specification" (AMD Publication #: 25481, Rev. 2.28, April 2008)

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <20090409134710.GA8026@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index d46a849f44a84..789efe217e1ab 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -200,10 +200,17 @@ union l3_cache {
 };
 
 static const unsigned short __cpuinitconst assocs[] = {
-	[1] = 1, [2] = 2, [4] = 4, [6] = 8,
-	[8] = 16, [0xa] = 32, [0xb] = 48,
+	[1] = 1,
+	[2] = 2,
+	[4] = 4,
+	[6] = 8,
+	[8] = 16,
+	[0xa] = 32,
+	[0xb] = 48,
 	[0xc] = 64,
-	[0xf] = 0xffff // ??
+	[0xd] = 96,
+	[0xe] = 128,
+	[0xf] = 0xffff /* fully associative - no way to show this currently */
 };
 
 static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -264,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 	eax->split.type = types[leaf];
 	eax->split.level = levels[leaf];
 	if (leaf == 3)
-		eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+		eax->split.num_threads_sharing =
+			current_cpu_data.x86_max_cores - 1;
 	else
 		eax->split.num_threads_sharing = 0;
 	eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
-- 
GitLab


From 47f16ca7631f9c6bad8e6d968cfb1433029b09ec Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 10 Apr 2009 14:58:05 +0200
Subject: [PATCH 0358/2198] x86, irqinit: preempt merge conflicts

To make the topic merge life easier for tip:perfcounters/core,
include two (inactive in this topic) IRQ vector initializations
here.

Also fix build bug - missing kprobes.h inclusion.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 626977200a584..b424c32c4a0c3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -7,6 +7,7 @@
 #include <linux/timex.h>
 #include <linux/slab.h>
 #include <linux/random.h>
+#include <linux/kprobes.h>
 #include <linux/init.h>
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
@@ -195,6 +196,13 @@ static void __init apic_intr_init(void)
 	/* IPI vectors for APIC spurious and error interrupts */
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+	/* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
 #endif
 
 #ifdef CONFIG_X86_32
-- 
GitLab


From a5a2a0c7fa039c59619bc908b3b1ed24734d442a Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Fri, 10 Apr 2009 09:50:05 -0700
Subject: [PATCH 0359/2198] futex: fix futex_wait_setup key handling

If the get_futex_key() call were to fail, the existing code would
try and put_futex_key() prior to returning.  This patch makes sure
we only put_futex_key() if get_futex_key() succeeded.

Reported-by: Clark Williams <williams@redhat.com>
Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <20090410165005.14342.16973.stgit@Aeon>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 041bf3ac4be95..6d2daa46f9ffb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1668,7 +1668,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
 	q->key = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr, fshared, &q->key);
 	if (unlikely(ret != 0))
-		goto out;
+		return ret;
 
 retry_private:
 	*hb = queue_lock(q);
-- 
GitLab


From cf9972a921470b0a2da7906104bcd540b20e33bf Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Sat, 11 Apr 2009 22:24:05 -0700
Subject: [PATCH 0360/2198] x86, setup: fix comment in the "glove box" code

Impact: Comment change only

The glove box is about avoiding problems with *registers* being
touched, not *memory*.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/bioscall.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
index 22b4b3efb9f90..507793739ea58 100644
--- a/arch/x86/boot/bioscall.S
+++ b/arch/x86/boot/bioscall.S
@@ -10,7 +10,7 @@
 
 /*
  * "Glove box" for BIOS calls.  Avoids the constant problems with BIOSes
- * touching memory they shouldn't be.
+ * touching registers they shouldn't be.
  */
 
 	.code16
-- 
GitLab


From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 11 Apr 2009 12:55:26 +0530
Subject: [PATCH 0361/2198] x86: apic/x2apic_cluster.c
 x86_cpu_to_logical_apicid should be static

Impact: reduce kernel size a bit, address sparse warning

Addresses the problem pointed out by this sparse warning:
  arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 4a903e2f0d179..8e4cbb255c38d 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -10,7 +10,7 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-- 
GitLab


From 56c49951747f250d8398582509e02ae5ce1d36d1 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 11 Apr 2009 15:51:19 -0400
Subject: [PATCH 0362/2198] tracing: Add documentation for the power tracer

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Acked-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <1239479479-2603-4-git-send-email-tytso@mit.edu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/power.txt | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)
 create mode 100644 Documentation/trace/power.txt

diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
new file mode 100644
index 0000000000000..cd805e16dc271
--- /dev/null
+++ b/Documentation/trace/power.txt
@@ -0,0 +1,17 @@
+The power tracer collects detailed information about C-state and P-state
+transitions, instead of just looking at the high-level "average"
+information.
+
+There is a helper script found in scrips/tracing/power.pl in the kernel
+sources which can be used to parse this information and create a
+Scalable Vector Graphics (SVG) picture from the trace data.
+
+To use this tracer:
+
+	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+	echo power > /sys/kernel/debug/tracing/current_tracer
+	echo 1 > /sys/kernel/debug/tracing/tracing_enabled
+	sleep 1
+	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+	cat /sys/kernel/debug/tracing/trace | \
+		perl scripts/tracing/power.pl > out.sv
-- 
GitLab


From abd41443ac76d3e9c29a8c1d9e9a3312306cc55e Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 11 Apr 2009 15:51:18 -0400
Subject: [PATCH 0363/2198] tracing: Document the event tracing system

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Theodore Ts'o <tytso@mit.edu>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1239479479-2603-3-git-send-email-tytso@mit.edu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/events.txt | 135 +++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 Documentation/trace/events.txt

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
new file mode 100644
index 0000000000000..abdee664c0f66
--- /dev/null
+++ b/Documentation/trace/events.txt
@@ -0,0 +1,135 @@
+			     Event Tracing
+
+		Documentation written by Theodore Ts'o
+
+Introduction
+============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used
+without creating custom kernel modules to register probe functions
+using the event tracing infrastructure.
+
+Not all tracepoints can be traced using the event tracing system;
+the kernel developer must provide code snippets which define how the
+tracing information is saved into the tracing buffer, and how the
+the tracing information should be printed.
+
+Using Event Tracing
+===================
+
+The events which are available for tracing can be found in the file
+/sys/kernel/debug/tracing/available_events.
+
+To enable a particular event, such as 'sched_wakeup', simply echo it
+to /sys/debug/tracing/set_event. For example:
+
+	# echo sched_wakeup > /sys/kernel/debug/tracing/set_event
+
+[ Note: events can also be enabled/disabled via the 'enabled' toggle
+  found in the /sys/kernel/tracing/events/ hierarchy of directories. ]
+
+To disable an event, echo the event name to the set_event file prefixed
+with an exclamation point:
+
+	# echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event
+
+To disable events, echo an empty line to the set_event file:
+
+	# echo > /sys/kernel/debug/tracing/set_event
+
+The events are organized into subsystems, such as ext4, irq, sched,
+etc., and a full event name looks like this: <subsystem>:<event>.  The
+subsystem name is optional, but it is displayed in the available_events
+file.  All of the events in a subsystem can be specified via the syntax
+"<subsystem>:*"; for example, to enable all irq events, you can use the
+command:
+
+	# echo 'irq:*' > /sys/kernel/debug/tracing/set_event
+
+Defining an event-enabled tracepoint
+------------------------------------
+
+A kernel developer which wishes to define an event-enabled tracepoint
+must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE.
+This is done via two header files in include/trace.  For example, to
+event-enable the jbd2 subsystem, we must create two files,
+include/trace/jbd2.h and include/trace/jbd2_event_types.h.  The
+include/trace/jbd2.h file should be included by kernel source files that
+will have a tracepoint inserted, and might look like this:
+
+#ifndef _TRACE_JBD2_H
+#define _TRACE_JBD2_H
+
+#include <linux/jbd2.h>
+#include <linux/tracepoint.h>
+
+#include <trace/jbd2_event_types.h>
+
+#endif
+
+In a file that utilizes a jbd2 tracepoint, this header file would be
+included.  Note that you still have to use DEFINE_TRACE().  So for
+example, if fs/jbd2/commit.c planned to use the jbd2_start_commit
+tracepoint, it would have the following near the beginning of the file:
+
+#include <trace/jbd2.h>
+
+DEFINE_TRACE(jbd2_start_commit);
+
+Then in the function that would call the tracepoint, it would call the
+tracepoint function.  (For more information, please see the tracepoint
+documentation in Documentation/trace/tracepoints.txt):
+
+	trace_jbd2_start_commit(journal, commit_transaction);
+
+The code snippets which allow jbd2_start_commit to be an event-enabled
+tracepoint are placed in the file include/trace/jbd2_event_types.h:
+
+/* use <trace/jbd2.h> instead */
+#ifndef TRACE_EVENT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM jbd2
+
+#include <linux/jbd2.h>
+
+TRACE_EVENT(jbd2_start_commit,
+	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
+	TP_ARGS(journal, commit_transaction),
+	TP_STRUCT__entry(
+		__array(	char,	devname, BDEVNAME_SIZE+24 )
+		__field(	int,	transaction		  )
+	),
+	TP_fast_assign(
+		memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24);
+		__entry->transaction	= commit_transaction->t_tid;
+	),
+	TP_printk("dev %s transaction %d",
+		  __entry->devname, __entry->transaction)
+);
+
+The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE.  The new
+arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and
+TP_printk.
+
+TP_STRUCT__entry defines the data structure which will be stored in the
+trace buffer.  Normally, fields in __entry will be arrays or simple
+types.  It is possible to place data structures in __entry --- however,
+pointers in the data structure can not be trusted, since they will be
+accessed sometime later by TP_printk, and if the data structure contains
+fields that will not or cannot be used by TP_printk, this will waste
+space in the trace buffer.  In general, data structures should be
+avoided, unless they do only contain non-pointer types and all of the
+fields will be used by TP_printk.
+
+TP_fast_assign defines the code snippet which saves information into the
+__entry data structure, using the passed-in arguments defined in
+TP_PROTO and TP_ARGS.
+
+Finally, TP_printk will print the __entry data structure.  At the time
+when the code snippet defined by TP_printk is executed, it will not have
+access to the TP_ARGS arguments; it can only use the information saved
+in the __entry data structure.
-- 
GitLab


From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sat, 11 Apr 2009 00:03:10 +0530
Subject: [PATCH 0364/2198] x86: clean up declarations and variables

Impact: cleanup, no code changed

 - syscalls.h       update declarations due to unifications
 - irq.c            declare smp_generic_interrupt() before it gets used
 - process.c        declare sys_fork() and sys_vfork() before they get used
 - tsc.c            rename tsc_khz shadowed variable
 - apic/probe_32.c  declare apic_default before it gets used
 - apic/nmi.c       prev_nmi_count should be unsigned
 - apic/io_apic.c   declare smp_irq_move_cleanup_interrupt() before it gets used
 - mm/init.c        declare direct_gbpages and free_initrd_mem before they get used

Signed-off-by: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h       |  3 +++
 arch/x86/include/asm/hw_irq.h     |  4 +++
 arch/x86/include/asm/pgtable.h    |  2 ++
 arch/x86/include/asm/pgtable_64.h |  6 -----
 arch/x86/include/asm/syscalls.h   | 45 ++++++++++++++++---------------
 arch/x86/kernel/apic/io_apic.c    |  1 +
 arch/x86/kernel/apic/nmi.c        |  2 +-
 arch/x86/kernel/apic/probe_32.c   |  1 -
 arch/x86/kernel/irq.c             |  1 +
 arch/x86/kernel/process.c         |  1 +
 arch/x86/kernel/tsc.c             |  8 +++---
 arch/x86/mm/init.c                |  1 +
 12 files changed, 42 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f83774224..5773660c8cd51 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -478,6 +478,9 @@ static inline unsigned int read_apic_id(void)
 extern void default_setup_apic_routing(void);
 
 #ifdef CONFIG_X86_32
+
+extern struct apic apic_default;
+
 /*
  * Set up the logical destination ID.
  *
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd703..be9ae4111c94e 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -78,7 +78,11 @@ extern void eisa_set_level_irq(unsigned int irq);
 /* SMP */
 extern void smp_apic_timer_interrupt(struct pt_regs *);
 extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_generic_interrupt(struct pt_regs *);
 extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_IO_APIC
+extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+#endif
 #ifdef CONFIG_SMP
 extern void smp_reschedule_interrupt(struct pt_regs *);
 extern void smp_call_function_interrupt(struct pt_regs *);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 29d96d168bc09..3f8d09d94eb36 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -503,6 +503,8 @@ static inline int pgd_none(pgd_t pgd)
 
 #ifndef __ASSEMBLY__
 
+extern int direct_gbpages;
+
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
 {
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 6b87bc6d50189..abde308fdb0f5 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
 
 extern void paging_init(void);
 
-#endif /* !__ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
 #define pte_ERROR(e)					\
 	printk("%s:%d: bad pte %p(%016lx).\n",		\
 	       __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 #define update_mmu_cache(vma, address, pte) do { } while (0)
 
-extern int direct_gbpages;
-
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 7043408f6904a..372b76edd63f6 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -1,7 +1,7 @@
 /*
  * syscalls.h - Linux syscall interfaces (arch-specific)
  *
- * Copyright (c) 2008 Jaswinder Singh
+ * Copyright (c) 2008 Jaswinder Singh Rajput
  *
  * This file is released under the GPLv2.
  * See the file COPYING for more details.
@@ -12,50 +12,55 @@
 
 #include <linux/compiler.h>
 #include <linux/linkage.h>
-#include <linux/types.h>
 #include <linux/signal.h>
+#include <linux/types.h>
 
 /* Common in X86_32 and X86_64 */
 /* kernel/ioport.c */
 asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
 
+/* kernel/process.c */
+int sys_fork(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+
 /* kernel/ldt.c */
 asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
 
+/* kernel/signal.c */
+long sys_rt_sigreturn(struct pt_regs *);
+
 /* kernel/tls.c */
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
 /* X86_32 only */
 #ifdef CONFIG_X86_32
+/* kernel/ioport.c */
+long sys_iopl(struct pt_regs *);
+
 /* kernel/process_32.c */
-int sys_fork(struct pt_regs *);
 int sys_clone(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
 int sys_execve(struct pt_regs *);
 
-/* kernel/signal_32.c */
+/* kernel/signal.c */
 asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
 asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
 			     struct old_sigaction __user *);
 int sys_sigaltstack(struct pt_regs *);
 unsigned long sys_sigreturn(struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
 
 /* kernel/sys_i386_32.c */
+struct mmap_arg_struct;
+struct sel_arg_struct;
+struct oldold_utsname;
+struct old_utsname;
+
 asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
 			  unsigned long, unsigned long, unsigned long);
-struct mmap_arg_struct;
 asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-struct sel_arg_struct;
 asmlinkage int old_select(struct sel_arg_struct __user *);
 asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-struct old_utsname;
 asmlinkage int sys_uname(struct old_utsname __user *);
-struct oldold_utsname;
 asmlinkage int sys_olduname(struct oldold_utsname __user *);
 
 /* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
 #else /* CONFIG_X86_32 */
 
 /* X86_64 only */
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
 /* kernel/process_64.c */
-asmlinkage long sys_fork(struct pt_regs *);
 asmlinkage long sys_clone(unsigned long, unsigned long,
 			  void __user *, void __user *,
 			  struct pt_regs *);
-asmlinkage long sys_vfork(struct pt_regs *);
 asmlinkage long sys_execve(char __user *, char __user * __user *,
 			   char __user * __user *,
 			   struct pt_regs *);
 long sys_arch_prctl(int, unsigned long);
 
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/signal_64.c */
+/* kernel/signal.c */
 asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
 				struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
 
 /* kernel/sys_x86_64.c */
+struct new_utsname;
+
 asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
 			 unsigned long, unsigned long, unsigned long);
-struct new_utsname;
 asmlinkage long sys_uname(struct new_utsname __user *);
 
 #endif /* CONFIG_X86_32 */
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 767fe7e46d689..870c92ddaf9cb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -59,6 +59,7 @@
 #include <asm/setup.h>
 #include <asm/irq_remapping.h>
 #include <asm/hpet.h>
+#include <asm/hw_irq.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_irq.h>
 
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index d6bd624071525..02056310f2f55 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
 {
 	printk(KERN_CONT "\n");
 
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 01eda2ac65e4d..440a8bccd91ad 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
 extern struct apic apic_bigsmp;
 extern struct apic apic_es7000;
 extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
 
 struct apic *apic = &apic_default;
 EXPORT_SYMBOL_GPL(apic);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3aaf7b9e3a8bd..2188267f523b2 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -12,6 +12,7 @@
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
+#include <asm/hw_irq.h>
 
 atomic_t irq_err_count;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e8479..3e21e38d7e37f 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -11,6 +11,7 @@
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
+#include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7a567ebe63614..a8dc0d00b8306 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
 {
 	u64 tsc1, tsc2, delta, ref1, ref2;
 	unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-	unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+	unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
 	int hpet = is_hpet_enabled(), i, loopmin;
 
-	tsc_khz = get_hypervisor_tsc_freq();
-	if (tsc_khz) {
+	hv_tsc_khz = get_hypervisor_tsc_freq();
+	if (hv_tsc_khz) {
 		printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
-		return tsc_khz;
+		return hv_tsc_khz;
 	}
 
 	local_irq_save(flags);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fd3da1dda1c9e..40924e445f570 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
+#include <linux/initrd.h>
 #include <linux/ioport.h>
 #include <linux/swap.h>
 
-- 
GitLab


From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 10 Apr 2009 14:26:18 +0800
Subject: [PATCH 0365/2198] tracing, kmemtrace: Separate
 include/trace/kmemtrace.h to kmemtrace part and tracepoint part

Impact: refactor code for future changes

Current kmemtrace.h is used both as header file of kmemtrace and kmem's
tracepoints definition.

Tracepoints' definition file may be used by other code, and should only have
definition of tracepoint.

We can separate include/trace/kmemtrace.h into 2 files:

  include/linux/kmemtrace.h: header file for kmemtrace
  include/trace/kmem.h:      definition of kmem tracepoints

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kmemtrace.h             | 25 +++++++++++++++++++++++++
 include/linux/slab_def.h              |  2 +-
 include/linux/slub_def.h              |  2 +-
 include/trace/{kmemtrace.h => kmem.h} | 25 +++----------------------
 init/main.c                           |  2 +-
 kernel/trace/kmemtrace.c              |  2 +-
 kernel/trace/trace.h                  |  2 +-
 mm/slab.c                             |  2 +-
 mm/slob.c                             |  2 +-
 mm/slub.c                             |  2 +-
 10 files changed, 36 insertions(+), 30 deletions(-)
 create mode 100644 include/linux/kmemtrace.h
 rename include/trace/{kmemtrace.h => kmem.h} (78%)

diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
new file mode 100644
index 0000000000000..15c45a27a925c
--- /dev/null
+++ b/include/linux/kmemtrace.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#ifndef _LINUX_KMEMTRACE_H
+#define _LINUX_KMEMTRACE_H
+
+#ifdef __KERNEL__
+
+#include <trace/kmem.h>
+
+#ifdef CONFIG_KMEMTRACE
+extern void kmemtrace_init(void);
+#else
+static inline void kmemtrace_init(void)
+{
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_KMEMTRACE_H */
+
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 5ac9b0bcaf9ad..713f841ecaa91 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,7 @@
 #include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 /* Size description struct for general caches. */
 struct cache_sizes {
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 5046f90c11710..be5d40c43bd2e 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,7 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
diff --git a/include/trace/kmemtrace.h b/include/trace/kmem.h
similarity index 78%
rename from include/trace/kmemtrace.h
rename to include/trace/kmem.h
index 28ee69f9cd46d..24d251928182c 100644
--- a/include/trace/kmemtrace.h
+++ b/include/trace/kmem.h
@@ -1,25 +1,9 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
+#ifndef _TRACE_KMEM_H
+#define _TRACE_KMEM_H
 
 #include <linux/tracepoint.h>
 #include <linux/types.h>
 
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
 DECLARE_TRACE(kmalloc,
 	      TP_PROTO(unsigned long call_site,
 		      const void *ptr,
@@ -57,7 +41,4 @@ DECLARE_TRACE(kmem_cache_free,
 	      TP_PROTO(unsigned long call_site, const void *ptr),
 	      TP_ARGS(call_site, ptr));
 
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
+#endif /* _TRACE_KMEM_H */
diff --git a/init/main.c b/init/main.c
index 3585f073d6364..eece40cd8a645 100644
--- a/init/main.c
+++ b/init/main.c
@@ -64,6 +64,7 @@
 #include <linux/idr.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/kmemtrace.h>
 #include <trace/boot.h>
 
 #include <asm/io.h>
@@ -71,7 +72,6 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
-#include <trace/kmemtrace.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 5011f4d91e375..7a0aa0e260db8 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -12,7 +12,7 @@
 #include <linux/dcache.h>
 #include <linux/fs.h>
 
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 #include "trace_output.h"
 #include "trace.h"
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f76a8f8689d45..34b94c3f40adb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,7 +9,7 @@
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <trace/power.h>
 
 enum trace_type {
diff --git a/mm/slab.c b/mm/slab.c
index 9a90b00d2f914..f85831da9080d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,7 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
-#include	<trace/kmemtrace.h>
+#include	<linux/kmemtrace.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
diff --git a/mm/slob.c b/mm/slob.c
index a2d4ab32198d8..494f05f19417a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -65,7 +65,7 @@
 #include <linux/module.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <asm/atomic.h>
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index 7ab54ecbd3f3a..ea9e7160e2e7b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,7 +16,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
-- 
GitLab


From fc182a4330fc22ea1b68fa3d5064dd85a73a4c4a Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 10 Apr 2009 14:27:38 +0800
Subject: [PATCH 0366/2198] tracing, kmemtrace: Make kmem tracepoints use
 TRACE_EVENT macro

TRACE_EVENT is a more generic way to define tracepoints.
Doing so adds these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49DEE6DA.80600@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/kmem.h              |  39 +-----
 include/trace/kmem_event_types.h  | 193 ++++++++++++++++++++++++++++++
 include/trace/trace_event_types.h |   1 +
 include/trace/trace_events.h      |   1 +
 4 files changed, 197 insertions(+), 37 deletions(-)
 create mode 100644 include/trace/kmem_event_types.h

diff --git a/include/trace/kmem.h b/include/trace/kmem.h
index 24d251928182c..46efc2423f03f 100644
--- a/include/trace/kmem.h
+++ b/include/trace/kmem.h
@@ -1,44 +1,9 @@
 #ifndef _TRACE_KMEM_H
 #define _TRACE_KMEM_H
 
-#include <linux/tracepoint.h>
 #include <linux/types.h>
+#include <linux/tracepoint.h>
 
-DECLARE_TRACE(kmalloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmem_cache_alloc,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmalloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kmem_cache_alloc_node,
-	      TP_PROTO(unsigned long call_site,
-		      const void *ptr,
-		      size_t bytes_req,
-		      size_t bytes_alloc,
-		      gfp_t gfp_flags,
-		      int node),
-	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kfree,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
-DECLARE_TRACE(kmem_cache_free,
-	      TP_PROTO(unsigned long call_site, const void *ptr),
-	      TP_ARGS(call_site, ptr));
+#include <trace/kmem_event_types.h>
 
 #endif /* _TRACE_KMEM_H */
diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h
new file mode 100644
index 0000000000000..4ff420fe46756
--- /dev/null
+++ b/include/trace/kmem_event_types.h
@@ -0,0 +1,193 @@
+
+/* use <trace/kmem.h> instead */
+#ifndef TRACE_EVENT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
+
+TRACE_EVENT(kmalloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmalloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index 33b6bfcba93bd..552a50e169a66 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -4,3 +4,4 @@
 #include <trace/irq_event_types.h>
 #include <trace/lockdep_event_types.h>
 #include <trace/skb_event_types.h>
+#include <trace/kmem_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index 0e2aa80076d94..13d6b85668cfe 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -4,3 +4,4 @@
 #include <trace/irq.h>
 #include <trace/lockdep.h>
 #include <trace/skb.h>
+#include <trace/kmem.h>
-- 
GitLab


From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Apr 2009 16:18:53 +0800
Subject: [PATCH 0367/2198] blktrace: fix output of unknown events

Not all events are pc (packet command) events. An event is a pc
event only if it has BLK_TC_PC bit set.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49D3236D.3090705@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0ba9..e45e1af13563e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	}
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+		ret = trace_seq_printf(s, "Unknown action %x\n", what);
 	else {
 		ret = log_action(iter, what2act[what].act[long_act]);
 		if (ret)
-- 
GitLab


From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 1 Apr 2009 16:19:19 +0800
Subject: [PATCH 0368/2198] blktrace: fix output of BLK_TC_PC events

BLK_TC_PC events should be treated differently with BLK_TC_FS events.

Before this patch:

 # echo 1 > /sys/block/sda/sda1/trace/enable
 # echo pc > /sys/block/sda/sda1/trace/act_mask
 # echo blk > /debugfs/tracing/current_tracer
 # (generate some BLK_TC_PC events)
 # cat trace
        bash-2184  [000]  1774.275413:   8,7    I   N [bash]
        bash-2184  [000]  1774.275435:   8,7    D   N [bash]
        bash-2184  [000]  1774.275540:   8,7    I   R [bash]
        bash-2184  [000]  1774.275547:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275580:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275648:   8,7    I   R [bash]
        bash-2184  [000]  1774.275653:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275682:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275739:   8,7    I   R [bash]
        bash-2184  [000]  1774.275744:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275771:   8,7    C   N 0 [0]
        bash-2184  [000]  1774.275804:   8,7    I   R [bash]
        bash-2184  [000]  1774.275808:   8,7    D   R [bash]
 ksoftirqd/0-4     [000]  1774.275836:   8,7    C   N 0 [0]

After this patch:

 # cat trace
        bash-2263  [000]   366.782149:   8,7    I   N 0 (00 ..) [bash]
        bash-2263  [000]   366.782323:   8,7    D   N 0 (00 ..) [bash]
        bash-2263  [000]   366.782557:   8,7    I   R 8 (25 00 ..) [bash]
        bash-2263  [000]   366.782560:   8,7    D   R 8 (25 00 ..) [bash]
 ksoftirqd/0-4     [000]   366.782582:   8,7    C   N (25 00 ..) [0]
        bash-2263  [000]   366.782648:   8,7    I   R 8 (5a 00 3f 00) [bash]
        bash-2263  [000]   366.782650:   8,7    D   R 8 (5a 00 3f 00) [bash]
 ksoftirqd/0-4     [000]   366.782669:   8,7    C   N (5a 00 3f 00) [0]
        bash-2263  [000]   366.782710:   8,7    I   R 8 (5a 00 08 00) [bash]
        bash-2263  [000]   366.782713:   8,7    D   R 8 (5a 00 08 00) [bash]
 ksoftirqd/0-4     [000]   366.782730:   8,7    C   N (5a 00 08 00) [0]
        bash-2263  [000]   366.783375:   8,7    I   R 36 (5a 00 08 00) [bash]
        bash-2263  [000]   366.783379:   8,7    D   R 36 (5a 00 08 00) [bash]
 ksoftirqd/0-4     [000]   366.783404:   8,7    C   N (5a 00 08 00) [0]

This is what we do with PC events in user-space blktrace.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49D32387.9040106@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 88 +++++++++++++++++++++++++++++++++++++----
 1 file changed, 80 insertions(+), 8 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e45e1af13563e..2b98195b338b2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
 	return te_blk_io_trace(ent) + 1;
 }
 
+static inline u32 t_action(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->action;
+}
+
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes;
+}
+
 static inline u32 t_sec(const struct trace_entry *ent)
 {
 	return te_blk_io_trace(ent)->bytes >> 9;
@@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const char *pdu_buf;
+	int pdu_len;
+	int i, end, ret;
+
+	pdu_buf = pdu_start(ent);
+	pdu_len = te_blk_io_trace(ent)->pdu_len;
+
+	if (!pdu_len)
+		return 1;
+
+	/* find the last zero that needs to be printed */
+	for (end = pdu_len - 1; end >= 0; end--)
+		if (pdu_buf[end])
+			break;
+	end++;
+
+	if (!trace_seq_putc(s, '('))
+		return 0;
+
+	for (i = 0; i < pdu_len; i++) {
+
+		ret = trace_seq_printf(s, "%s%02x",
+				       i == 0 ? "" : " ", pdu_buf[i]);
+		if (!ret)
+			return ret;
+
+		/*
+		 * stop when the rest is just zeroes and indicate so
+		 * with a ".." appended
+		 */
+		if (i == end && end != pdu_len - 1)
+			return trace_seq_puts(s, " ..) ");
+	}
+
+	return trace_seq_puts(s, ") ");
+}
+
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
 	char cmd[TASK_COMM_LEN];
 
 	trace_find_cmdline(ent->pid, cmd);
 
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%s]\n",
-					t_sector(ent), t_sec(ent), cmd);
-	return trace_seq_printf(s, "[%s]\n", cmd);
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+		if (!ret)
+			return 0;
+		ret = blk_log_dump_pdu(s, ent);
+		if (!ret)
+			return 0;
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%s]\n",
+						t_sector(ent), t_sec(ent), cmd);
+		return trace_seq_printf(s, "[%s]\n", cmd);
+	}
 }
 
 static int blk_log_with_error(struct trace_seq *s,
 			      const struct trace_entry *ent)
 {
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-					t_sec(ent), t_error(ent));
-	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+	if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+		int ret;
+
+		ret = blk_log_dump_pdu(s, ent);
+		if (ret)
+			return trace_seq_printf(s, "[%d]\n", t_error(ent));
+		return 0;
+	} else {
+		if (t_sec(ent))
+			return trace_seq_printf(s, "%llu + %u [%d]\n",
+						t_sector(ent),
+						t_sec(ent), t_error(ent));
+		return trace_seq_printf(s, "%llu [%d]\n",
+					t_sector(ent), t_error(ent));
+	}
 }
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
-- 
GitLab


From 3fa89ca7ba5ba50b3924a11f6604b4bdce5f7842 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 12 Apr 2009 20:37:25 +0530
Subject: [PATCH 0369/2198] x86: vdso/vma.c declare vdso_enabled and
 arch_setup_additional_pages before they get used

Impact: cleanup, address sparse warnings

Addresses the problem pointed out by these sparse warning:
  arch/x86/vdso/vma.c:19:28: warning: symbol 'vdso_enabled' was not declared. Should it be static?
  arch/x86/vdso/vma.c:101:5: warning: symbol 'arch_setup_additional_pages' was not declared. Should it be static?

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1239548845.4170.2.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/vdso/vma.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098b7..cac083386e033 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/random.h>
+#include <linux/elf.h>
 #include <asm/vsyscall.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
-- 
GitLab


From edea7148a87c099e5d5d4838285cc27e459588b7 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:39 +0400
Subject: [PATCH 0370/2198] x86: irq.c - tiny cleanup

Impact: cleanup, robustization

 1) guard ack_bad_irq with printk_ratelimit since there is no
    guarantee we will not be flooded one day

 2) use pr_emerg() helper

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.277579847@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 3aaf7b9e3a8bd..6603492e8b717 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -24,7 +24,8 @@ void (*generic_interrupt_extension)(void) = NULL;
  */
 void ack_bad_irq(unsigned int irq)
 {
-	printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+	if (printk_ratelimit())
+		pr_err("unexpected IRQ trap at vector %02x\n", irq);
 
 #ifdef CONFIG_X86_LOCAL_APIC
 	/*
@@ -178,7 +179,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
 	sum += irq_stats(cpu)->irq_thermal_count;
 # ifdef CONFIG_X86_64
 	sum += irq_stats(cpu)->irq_threshold_count;
-#endif
+# endif
 #endif
 	return sum;
 }
@@ -219,8 +220,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 #endif
 
 		if (printk_ratelimit())
-			printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
-			       __func__, smp_processor_id(), vector, irq);
+			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+				__func__, smp_processor_id(), vector, irq);
 	}
 
 	irq_exit();
-- 
GitLab


From c0eaa4536f08b98fbcfa7fce5b7b0de1bebcb0e1 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:40 +0400
Subject: [PATCH 0371/2198] x86: apic - introduce imcr_ helpers

Impact: cleanup

Distinguish port writting magic into helpers with comments.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.535921550@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 098ec84b8c005..c3be10f5773e8 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -98,6 +98,29 @@ early_param("lapic", parse_lapic);
 /* Local APIC was disabled by the BIOS and enabled by the kernel */
 static int enabled_via_apicbase;
 
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ */
+static inline imcr_pic_to_apic(void)
+{
+	/* select IMCR register */
+	outb(0x70, 0x22);
+	/* NMI and 8259 INTR go through APIC */
+	outb(0x01, 0x23);
+}
+
+static inline imcr_apic_to_pic(void)
+{
+	/* select IMCR register */
+	outb(0x70, 0x22);
+	/* NMI and 8259 INTR go directly to BSP */
+	outb(0x00, 0x23);
+}
 #endif
 
 #ifdef CONFIG_X86_64
@@ -1727,8 +1750,7 @@ void __init connect_bsp_APIC(void)
 		 */
 		apic_printk(APIC_VERBOSE, "leaving PIC mode, "
 				"enabling APIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x01, 0x23);
+		imcr_pic_to_apic();
 	}
 #endif
 	if (apic->enable_apic_mode)
@@ -1756,8 +1778,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
 		 */
 		apic_printk(APIC_VERBOSE, "disabling APIC mode, "
 				"entering PIC mode.\n");
-		outb(0x70, 0x22);
-		outb(0x00, 0x23);
+		imcr_apic_to_pic();
 		return;
 	}
 #endif
-- 
GitLab


From 08306ce61d6848e6fbf74fa4cc693c3fb29e943f Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:41 +0400
Subject: [PATCH 0372/2198] x86: apic - introduce dummy apic operations

Impact: refactor, speed up and robustize code

In case if apic was disabled by kernel option
or by hardware limits we can use dummy operations
in apic->write to simplify the ack_APIC_irq() code.

At the lame time the patch fixes the missed EOI in
do_IRQ function (which has place if kernel is compiled
as X86-32 and interrupt without handler happens where
apic was not asked to be disabled via kernel option).

Note that native_apic_write_dummy() consists of
WARN_ON_ONCE to catch any buggy writes on enabled
APICs. Could be removed after some time of testing.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.724788431@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h |  3 ++-
 arch/x86/kernel/apic/apic.c | 24 ++++++++++++++++++++++++
 arch/x86/kernel/irq.c       | 10 ++--------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f83774224..2bd5a463fd1f7 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -212,6 +212,7 @@ static inline void ack_x2APIC_irq(void)
 }
 #endif
 
+extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC(void);
@@ -252,7 +253,7 @@ static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok		1
 static inline void init_apic_mappings(void) { }
 static inline void disable_local_APIC(void) { }
-
+static inline void apic_disable(void) { }
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index c3be10f5773e8..9b849d4957dc2 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -232,6 +232,24 @@ static int modern_apic(void)
 	return lapic_get_version() >= 0x14;
 }
 
+/*
+ * bare function to substitute write operation
+ * and it's _that_ fast :)
+ */
+void native_apic_write_dummy(u32 reg, u32 v)
+{
+	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+}
+
+/*
+ * right after this call apic->write doesn't do anything
+ * note that there is no restore operation it works one way
+ */
+void apic_disable(void)
+{
+	apic->write = native_apic_write_dummy;
+}
+
 void native_apic_wait_icr_idle(void)
 {
 	while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -1582,6 +1600,12 @@ void __init init_apic_mappings(void)
 	 */
 	if (boot_cpu_physical_apicid == -1U)
 		boot_cpu_physical_apicid = read_apic_id();
+
+	/* lets check if we may to NOP'ify apic operations */
+	if (!cpu_has_apic) {
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+	}
 }
 
 /*
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 6603492e8b717..fd57bf35d0fc7 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -27,7 +27,6 @@ void ack_bad_irq(unsigned int irq)
 	if (printk_ratelimit())
 		pr_err("unexpected IRQ trap at vector %02x\n", irq);
 
-#ifdef CONFIG_X86_LOCAL_APIC
 	/*
 	 * Currently unexpected vectors happen only on SMP and APIC.
 	 * We _must_ ack these because every local APIC has only N
@@ -37,9 +36,7 @@ void ack_bad_irq(unsigned int irq)
 	 * completely.
 	 * But only ack when the APIC is enabled -AK
 	 */
-	if (cpu_has_apic)
-		ack_APIC_irq();
-#endif
+	ack_APIC_irq();
 }
 
 #define irq_stats(x)		(&per_cpu(irq_stat, x))
@@ -214,10 +211,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	irq = __get_cpu_var(vector_irq)[vector];
 
 	if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
-		if (!disable_apic)
-			ack_APIC_irq();
-#endif
+		ack_APIC_irq();
 
 		if (printk_ratelimit())
 			pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
-- 
GitLab


From b9b34f24b23ba9e79e07c0980e7fff16af2a67d1 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 12 Apr 2009 20:47:42 +0400
Subject: [PATCH 0373/2198] x86: smp.c - align smp_ops assignments

Impact: cleanup

It's a bit hard to parse by eyes without
them being aligned.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090412165058.924175574@openvz.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smp.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8ccaaf..f6db48c405b81 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -193,19 +193,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 }
 
 struct smp_ops smp_ops = {
-	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
-	.smp_prepare_cpus = native_smp_prepare_cpus,
-	.smp_cpus_done = native_smp_cpus_done,
+	.smp_prepare_boot_cpu	= native_smp_prepare_boot_cpu,
+	.smp_prepare_cpus	= native_smp_prepare_cpus,
+	.smp_cpus_done		= native_smp_cpus_done,
 
-	.smp_send_stop = native_smp_send_stop,
-	.smp_send_reschedule = native_smp_send_reschedule,
+	.smp_send_stop		= native_smp_send_stop,
+	.smp_send_reschedule	= native_smp_send_reschedule,
 
-	.cpu_up = native_cpu_up,
-	.cpu_die = native_cpu_die,
-	.cpu_disable = native_cpu_disable,
-	.play_dead = native_play_dead,
+	.cpu_up			= native_cpu_up,
+	.cpu_die		= native_cpu_die,
+	.cpu_disable		= native_cpu_disable,
+	.play_dead		= native_play_dead,
 
-	.send_call_func_ipi = native_send_call_func_ipi,
+	.send_call_func_ipi	= native_send_call_func_ipi,
 	.send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
 EXPORT_SYMBOL_GPL(smp_ops);
-- 
GitLab


From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Mon, 13 Apr 2009 20:24:50 +0100
Subject: [PATCH 0374/2198] perf_counter: fix alignment in /proc/interrupts

Trivial fix on columns alignment in /proc/interrupts file.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090413192449.GA3920@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irq.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index dccaaa8557897..849cfabb1fdce 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec)
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
 	seq_printf(p, "  Performance counter interrupts\n");
-	seq_printf(p, "PND: ");
+	seq_printf(p, "%*s: ", prec, "PND");
 	for_each_online_cpu(j)
 		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
 	seq_printf(p, "  Performance pending work\n");
-- 
GitLab


From 5cda395f4a262788d8ed79ac8a26a2b821e5f751 Mon Sep 17 00:00:00 2001
From: Alexander van Heukelum <heukelum@mailshack.com>
Date: Mon, 13 Apr 2009 17:39:24 +0200
Subject: [PATCH 0375/2198] x86: fix function definitions after: x86: apic -
 introduce imcr_ helpers

The patch "introduce imcr_ helpers" introduced good comments, but
also a few new compile warnings. This fixes the function definitions
to have a 'void' return type.

Signed-off-by: Alexander van Heukelum <heukelum@fastmail.fm>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090413153924.GA20287@mailshack.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 9b849d4957dc2..4b48ff9163caf 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -106,7 +106,7 @@ static int enabled_via_apicbase;
  * the BIOS or the operating system must switch out of
  * PIC Mode by changing the IMCR.
  */
-static inline imcr_pic_to_apic(void)
+static inline void imcr_pic_to_apic(void)
 {
 	/* select IMCR register */
 	outb(0x70, 0x22);
@@ -114,7 +114,7 @@ static inline imcr_pic_to_apic(void)
 	outb(0x01, 0x23);
 }
 
-static inline imcr_apic_to_pic(void)
+static inline void imcr_apic_to_pic(void)
 {
 	/* select IMCR register */
 	outb(0x70, 0x22);
-- 
GitLab


From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 31 Mar 2009 00:48:49 -0500
Subject: [PATCH 0376/2198] tracing/filters: add run-time field descriptions to
 TRACE_EVENT_FORMAT events

This patch adds run-time field descriptions to all the event formats
exported using TRACE_EVENT_FORMAT.  It also hooks up all the tracers
that use them (i.e. the tracers in the 'ftrace subsystem') so they can
also have their output filtered by the event-filtering mechanism.

When I was testing this, there were a couple of things that fooled me
into thinking the filters weren't working, when actually they were -
I'll mention them here so others don't make the same mistakes (and file
bug reports. ;-)

One is that some of the tracers trace multiple events e.g. the
sched_switch tracer uses the context_switch and wakeup events, and if
you don't set filters on all of the traced events, the unfiltered output
from the events without filters on them can make it look like the
filtering as a whole isn't working properly, when actually it is doing
what it was asked to do - it just wasn't asked to do the right thing.

The other is that for the really high-volume tracers e.g. the function
tracer, the volume of filtered events can be so high that it pushes the
unfiltered events out of the ring buffer before they can be read so e.g.
cat'ing the trace file repeatedly shows either no output, or once in
awhile some output but that isn't there the next time you read the
trace, which isn't what you normally expect when reading the trace file.
If you read from the trace_pipe file though, you can catch them before
they disappear.

Changes from v1:

As suggested by Frederic Weisbecker:

- get rid of externs in functions
- added unlikely() to filter_check_discard()

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c            |  6 +++
 kernel/trace/trace.c                | 25 +++++++++++++
 kernel/trace/trace.h                | 20 ++++++++++
 kernel/trace/trace_branch.c         |  3 ++
 kernel/trace/trace_event_types.h    |  6 ++-
 kernel/trace/trace_events.c         |  7 ++++
 kernel/trace/trace_events_filter.c  |  4 +-
 kernel/trace/trace_events_stage_2.h |  7 ----
 kernel/trace/trace_export.c         | 57 +++++++++++++++++++++++++++--
 kernel/trace/trace_hw_branches.c    |  2 +
 kernel/trace/trace_power.c          |  4 ++
 11 files changed, 127 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 7a0aa0e260db8..9419ad10541b6 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 				   gfp_t gfp_flags,
 				   int node)
 {
+	struct ftrace_event_call *call = &event_kmem_alloc;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_alloc_entry *entry;
 	struct ring_buffer_event *event;
@@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
@@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 				  unsigned long call_site,
 				  const void *ptr)
 {
+	struct ftrace_event_call *call = &event_kmem_free;
 	struct trace_array *tr = kmemtrace_array;
 	struct kmemtrace_free_entry *entry;
 	struct ring_buffer_event *event;
@@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4865459f609f9..962e6179994ae 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -898,6 +898,7 @@ trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
+	struct ftrace_event_call *call = &event_function;
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
 
@@ -912,6 +913,9 @@ trace_function(struct trace_array *tr,
 	entry	= ring_buffer_event_data(event);
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
+
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 }
 
@@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
+	struct ftrace_event_call *call = &event_funcgraph_entry;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent_entry *entry;
 
@@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 
 	return 1;
@@ -943,6 +949,7 @@ static void __trace_graph_return(struct trace_array *tr,
 				unsigned long flags,
 				int pc)
 {
+	struct ftrace_event_call *call = &event_funcgraph_exit;
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ret_entry *entry;
 
@@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
@@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 				 int skip, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_kernel_stack;
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
@@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
@@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 				   unsigned long flags, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+	struct ftrace_event_call *call = &event_user_stack;
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
@@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
@@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
+	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
@@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
@@ -1080,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
 {
+	struct ftrace_event_call *call = &event_context_switch;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
+
+	filter_check_discard(call, entry, event);
+
 	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
@@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
 {
+	struct ftrace_event_call *call = &event_wakeup;
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
@@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
@@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static u32 trace_buf[TRACE_BUF_SIZE];
 
+	struct ftrace_event_call *call = &event_bprint;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
@@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 
+	struct ftrace_event_call *call = &event_print;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
@@ -1314,6 +1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
+	filter_check_discard(call, entry, event);
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 34b94c3f40adb..e7737281953fa 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
 
+static inline void
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer_event *event)
+{
+	if (unlikely(call->preds) && !filter_match_preds(call, rec))
+		ring_buffer_event_discard(event);
+}
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
@@ -897,4 +912,9 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+	extern struct ftrace_event_call event_##call;
+#include "trace_event_types.h"
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index e6e32912ffb81..c95c25d838ef0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
+	struct ftrace_event_call *call = &event_branch;
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
@@ -73,6 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
+	filter_check_discard(call, entry, event);
+
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fd78bee71dd75..95b147aac2299 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned int, line, line)
-		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
-		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+				    TRACE_FUNC_SIZE+1, func)
+		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+				    TRACE_FUNC_SIZE+1, file)
 		TRACE_FIELD(char, correct, correct)
 	),
 	TP_RAW_FMT("%u:%s:%s (%u)")
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64ec4d278ffbc..be9299a53e2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -680,6 +680,7 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
+	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
+	entry = debugfs_create_file("filter", 0644, system->entry, system,
+				    &ftrace_subsystem_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", name);
+
 	return system->entry;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 026be412f356d..470ad9487eccc 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 	}
 
 	events_for_each(call) {
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;
 
 		if (!strcmp(call->system, system->name))
@@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	events_for_each(call) {
 		int err;
 
-		if (!call->name || !call->regfunc)
+		if (!call->define_fields)
 			continue;
 
 		if (strcmp(call->system, system->name))
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 30743f7d41108..1c94b87c7180d 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 int									\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 07a22c33ebf3c..f4e46616c48ea 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -30,7 +30,7 @@
 
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\
 	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
@@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_ENTRY	entry
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\
 	cmd;
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int ftrace_define_fields_##call(void);					\
+static int ftrace_raw_init_event_##call(void);				\
 									\
-static struct ftrace_event_call __used					\
+struct ftrace_event_call __used						\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
 	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
+};									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	INIT_LIST_HEAD(&event_##call.fields);				\
+	return 0;							\
+}									\
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	struct args field;						\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
 }
+
 #include "trace_event_types.h"
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 7bfdf4c2347f3..e6b275b22ac08 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 
 void trace_hw_branch(u64 from, u64 to)
 {
+	struct ftrace_event_call *call = &event_hw_branch;
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
@@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bae791ebcc516..8ce7d7d62c076 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
 
 static void probe_power_end(struct power_trace *it)
 {
+	struct ftrace_event_call *call = &event_power;
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
@@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
 static void probe_power_mark(struct power_trace *it, unsigned int type,
 				unsigned int level)
 {
+	struct ftrace_event_call *call = &event_power;
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
@@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
+	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
-- 
GitLab


From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 31 Mar 2009 00:49:16 -0500
Subject: [PATCH 0377/2198] tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER
 event macro

Frederic Weisbecker suggested that the trace_special event shouldn't be
filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro
that allows an event format to be exported without having a filter
attached, and removes filtering from the trace_special event.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c             |  2 --
 kernel/trace/trace.h             |  2 ++
 kernel/trace/trace_event_types.h |  2 +-
 kernel/trace/trace_export.c      | 33 ++++++++++++++++++++++++++++++++
 4 files changed, 36 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 962e6179994ae..c209d214169c1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
-	struct ftrace_event_call *call = &event_special;
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
@@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	filter_check_discard(call, entry, event);
 	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e7737281953fa..3cf856fa597bb 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -915,6 +915,8 @@ do {									\
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 	extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
 #include "trace_event_types.h"
 
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 95b147aac2299..cfcecc4fd86d4 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
 	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, arg1, arg1)
 		TRACE_FIELD(unsigned long, arg2, arg2)
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index f4e46616c48ea..77c494f5e1d6a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct args field;						\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
 #include "trace_event_types.h"
 
 #undef TRACE_ZERO_CHAR
@@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void)				\
 	return 0;							\
 }									\
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)				\
+									\
+struct ftrace_event_call __used						\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name			= #call,				\
+	.id			= proto,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.show_format		= ftrace_format_##call,			\
+};
+
 #include "trace_event_types.h"
 
 #undef TRACE_FIELD
@@ -150,4 +179,8 @@ ftrace_define_fields_##call(void)					\
 	return ret;							\
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\
+				    tpfmt)
+
 #include "trace_event_types.h"
-- 
GitLab


From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Apr 2009 00:09:41 -0400
Subject: [PATCH 0378/2198] ring-buffer: add ring_buffer_discard_commit

The ring_buffer_discard_commit is similar to ring_buffer_event_discard
but it can only be done on an event that has yet to be commited.
Unpredictable results can happen otherwise.

The main difference between ring_buffer_discard_commit and
ring_buffer_event_discard is that ring_buffer_discard_commit will try
to free the data in the ring buffer if nothing has addded data
after the reserved event. If something did, then it acts almost the
same as ring_buffer_event_discard followed by a
ring_buffer_unlock_commit.

Note, either ring_buffer_commit_discard and ring_buffer_unlock_commit
can be called on an event, not both.

This commit also exports both discard functions to be usable by
GPL modules.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h |  29 +++++++++
 kernel/trace/ring_buffer.c  | 125 ++++++++++++++++++++++++++++++------
 2 files changed, 133 insertions(+), 21 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index e1b7b2173885f..f0aa486d131c6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -68,8 +68,37 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 	return event->time_delta;
 }
 
+/*
+ * ring_buffer_event_discard can discard any event in the ring buffer.
+ *   it is up to the caller to protect against a reader from
+ *   consuming it or a writer from wrapping and replacing it.
+ *
+ * No external protection is needed if this is called before
+ * the event is commited. But in that case it would be better to
+ * use ring_buffer_discard_commit.
+ *
+ * Note, if an event that has not been committed is discarded
+ * with ring_buffer_event_discard, it must still be committed.
+ */
 void ring_buffer_event_discard(struct ring_buffer_event *event);
 
+/*
+ * ring_buffer_discard_commit will remove an event that has not
+ *   ben committed yet. If this is used, then ring_buffer_unlock_commit
+ *   must not be called on the discarded event. This function
+ *   will try to remove the event from the ring buffer completely
+ *   if another event has not been written after it.
+ *
+ * Example use:
+ *
+ *  if (some_condition)
+ *    ring_buffer_discard_commit(buffer, event);
+ *  else
+ *    ring_buffer_unlock_commit(buffer, event);
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event);
+
 /*
  * size is in bytes for each per CPU buffer.
  */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 74a11808c282e..f935bd5ec3e83 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event)
 	event->time_delta = 0;
 }
 
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
-	event->type = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
-}
-
 static unsigned
 rb_event_data_length(struct ring_buffer_event *event)
 {
@@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+/**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_commit_discard - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+				struct ring_buffer_event *event)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+	int cpu;
+
+	/* The event is discarded regardless */
+	ring_buffer_event_discard(event);
+
+	/*
+	 * This must only be called if the event has not been
+	 * committed yet. Thus we can assume that preemption
+	 * is still disabled.
+	 */
+	RB_WARN_ON(buffer, !preempt_count());
+
+	cpu = smp_processor_id();
+	cpu_buffer = buffer->buffers[cpu];
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			goto out;
+	}
+
+	/*
+	 * The commit is still visible by the reader, so we
+	 * must increment entries.
+	 */
+	cpu_buffer->entries++;
+ out:
+	/*
+	 * If a write came in and pushed the tail page
+	 * we still need to update the commit pointer
+	 * if we were the commit.
+	 */
+	if (rb_is_commit(cpu_buffer, event))
+		rb_set_commit_to_write(cpu_buffer);
+
+	/*
+	 * Only the last preempt count needs to restore preemption.
+	 */
+	if (preempt_count() == 1)
+		ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+	else
+		preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
 /**
  * ring_buffer_write - write data to the buffer without reserving
  * @buffer: The ring buffer to write to.
-- 
GitLab


From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 2 Apr 2009 01:16:59 -0400
Subject: [PATCH 0379/2198] tracing/filters: use ring_buffer_discard_commit for
 discarded events

The ring_buffer_discard_commit makes better usage of the ring_buffer
when an event has been discarded. It tries to remove it completely if
possible.

This patch converts the trace event filtering to use
ring_buffer_discard_commit instead of the ring_buffer_event_discard.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                | 9 +++++++--
 kernel/trace/trace.h                | 1 +
 kernel/trace/trace_events_stage_3.h | 6 +++---
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c209d214169c1..d880ab2772ce7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+	ring_buffer_discard_commit(global_trace.buffer, event);
 }
 
 void
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3cf856fa597bb..dfefffd7ae39f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 9d2fa78ceccae..d2f34bf30e597 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto)				\
 	assign;								\
 									\
 	if (call->preds && !filter_match_preds(call, entry))		\
-		ring_buffer_event_discard(event);			\
-									\
-	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
+		trace_current_buffer_discard_commit(event);		\
+	else								\
+		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
 									\
 }									\
 									\
-- 
GitLab


From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 8 Apr 2009 03:14:01 -0500
Subject: [PATCH 0380/2198] tracing/infrastructure: separate event tracer from
 event support

Add a new config option, CONFIG_EVENT_TRACING that gets selected
when CONFIG_TRACING is selected and adds everything needed by the stuff
in trace_export - basically all the event tracing support needed by e.g.
bprint, minus the actual events, which are only included if
CONFIG_EVENT_TRACER is selected.

So CONFIG_EVENT_TRACER can be used to turn on or off the generated events
(what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns
on or off the base event tracing support used by both the event tracer and
the other things such as bprint that can't be configured out.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <1239178441.10295.34.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h | 2 +-
 kernel/trace/Kconfig              | 4 ++++
 kernel/trace/Makefile             | 6 +++---
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 7fa660fd449ca..7e9b1e9f711c0 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -61,7 +61,7 @@
 #define BRANCH_PROFILE()
 #endif
 
-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS()	VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
 			*(_ftrace_events)				\
 			VMLINUX_SYMBOL(__stop_ftrace_events) = .;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 23b96ebbf8930..644606e899fa0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER
        depends on HAVE_FTRACE_NMI_ENTER
        default y
 
+config EVENT_TRACING
+	bool
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -56,6 +59,7 @@ config TRACING
 	select TRACEPOINTS
 	select NOP_TRACER
 	select BINARY_PRINTF
+	select EVENT_TRACING
 
 #
 # Minimum requirements an architecture has to meet for us to
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec12..3ad367e7c97f2 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 
 libftrace-y := ftrace.o
-- 
GitLab


From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Wed, 8 Apr 2009 03:15:54 -0500
Subject: [PATCH 0381/2198] tracing/filters: use ring_buffer_discard_commit()
 in filter_check_discard()

This patch changes filter_check_discard() to make use of the new
ring_buffer_discard_commit() function and modifies the current users to
call the old commit function in the non-discard case.

It also introduces a version of filter_check_discard() that uses the
global trace buffer (filter_current_check_discard()) for those cases.

v2 changes:

- fix compile error noticed by Ingo Molnar

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <1239178554.10295.36.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c            | 10 +++----
 kernel/trace/trace.c                | 45 +++++++++++++++--------------
 kernel/trace/trace.h                | 14 +++++++--
 kernel/trace/trace_branch.c         |  5 ++--
 kernel/trace/trace_events_stage_3.h |  5 +---
 kernel/trace/trace_hw_branches.c    |  4 +--
 kernel/trace/trace_power.c          |  8 ++---
 7 files changed, 48 insertions(+), 43 deletions(-)

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 9419ad10541b6..86cdf671d7e28 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	entry->gfp_flags	= gfp_flags;
 	entry->node		= node;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry->call_site	= call_site;
 	entry->ptr		= ptr;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d880ab2772ce7..c0047fcf70764 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -171,6 +171,12 @@ static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+				 struct ring_buffer_event *event)
+{
+	return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+
 cycle_t ftrace_now(int cpu)
 {
 	u64 ts;
@@ -919,9 +925,8 @@ trace_function(struct trace_array *tr,
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr,
 		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 
 	return 1;
 }
@@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr,
 		return;
 	entry	= ring_buffer_event_data(event);
 	entry->ret				= *trace;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(global_trace.buffer, event);
+	if (!filter_current_check_discard(call, entry, event))
+		ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
 
@@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
 
-	filter_check_discard(call, entry, event);
-
-	trace_buffer_unlock_commit(tr, event, flags, pc);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 }
@@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
@@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
-	filter_check_discard(call, entry, event);
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
 	__raw_spin_unlock(&trace_buf_lock);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dfefffd7ae39f..9729d14767d8c 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+					void *rec,
+					struct ring_buffer_event *event);
 
-static inline void
+static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
+		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->preds) && !filter_match_preds(call, rec))
-		ring_buffer_event_discard(event);
+	if (unlikely(call->preds) && !filter_match_preds(call, rec)) {
+		ring_buffer_discard_commit(buffer, event);
+		return 1;
+	}
+
+	return 0;
 }
 
 #define __common_field(type, item)					\
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c95c25d838ef0..8e64e604f5a7f 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
-	filter_check_discard(call, entry, event);
-
-	ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index d2f34bf30e597..b2b298269eb0e 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	if (call->preds && !filter_match_preds(call, entry))		\
-		trace_current_buffer_discard_commit(event);		\
-	else								\
+	if (!filter_current_check_discard(call, entry, event))		\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-									\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e6b275b22ac08..8683d50a753a4 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 8ce7d7d62c076..810a5b7cf1c5e 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	entry->state_data = *it;
-	filter_check_discard(call, entry, event);
-	trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, tr->buffer, event))
+		trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
-- 
GitLab


From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 13 Apr 2009 03:17:50 -0500
Subject: [PATCH 0382/2198] tracing/filters: allow on-the-fly filter switching

This patch allows event filters to be safely removed or switched
on-the-fly while avoiding the use of rcu or the suspension of tracing of
previous versions.

It does it by adding a new filter_pred_none() predicate function which
does nothing and by never deallocating either the predicates or any of
the filter_pred members used in matching; the predicate lists are
allocated and initialized during ftrace_event_calls initialization.

Whenever a filter is removed or replaced, the filter_pred_* functions
currently in use by the affected ftrace_event_call are immediately
switched over to to the filter_pred_none() function, while the rest of
the filter_pred members are left intact, allowing any currently
executing filter_pred_* functions to finish up, using the values they're
currently using.

In the case of filter replacement, the new predicate values are copied
into the old predicates after the above step, and the filter_pred_none()
functions are replaced by the filter_pred_* functions for the new
filter.  In this case, it is possible though very unlikely that a
previous filter_pred_* is still running even after the
filter_pred_none() switch and the switch to the new filter_pred_*.  In
that case, however, because nothing has been deallocated in the
filter_pred, the worst that can happen is that the old filter_pred_*
function sees the new values and as a result produces either a false
positive or a false negative, depending on the values it finds.

So one downside to this method is that rarely, it can produce a bad
match during the filter switch, but it should be possible to live with
that, IMHO.

The other downside is that at least in this patch the predicate lists
are always pre-allocated, taking up memory from the start.  They could
probably be allocated on first-use, and de-allocated when tracing is
completely stopped - if this patch makes sense, I could create another
one to do that later on.

Oh, and it also places a restriction on the size of __arrays in events,
currently set to 128, since they can't be larger than the now embedded
str_val arrays in the filter_pred struct.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239610670.6660.49.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h                |  14 +-
 kernel/trace/trace_events.c         |   9 +-
 kernel/trace/trace_events_filter.c  | 252 +++++++++++++++-------------
 kernel/trace/trace_events_stage_2.h |   1 +
 kernel/trace/trace_events_stage_3.h |   1 +
 kernel/trace/trace_export.c         |   1 +
 6 files changed, 150 insertions(+), 128 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9729d14767d8c..b05b6ac982a1a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -813,6 +813,7 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
+	int			n_preds;
 	struct filter_pred	**preds;
 
 #ifdef CONFIG_EVENT_PROFILE
@@ -826,6 +827,7 @@ struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
 	struct dentry		*entry;
+	int			n_preds;
 	struct filter_pred	**preds;
 };
 
@@ -834,7 +836,8 @@ struct event_subsystem {
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-#define MAX_FILTER_PRED 8
+#define MAX_FILTER_PRED		8
+#define MAX_FILTER_STR_VAL	128
 
 struct filter_pred;
 
@@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
 struct filter_pred {
 	filter_pred_fn_t fn;
 	u64 val;
-	char *str_val;
+	char str_val[MAX_FILTER_STR_VAL];
 	int str_len;
 	char *field_name;
 	int offset;
@@ -855,13 +858,14 @@ struct filter_pred {
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
+extern int init_preds(struct ftrace_event_call *call);
 extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void filter_print_preds(struct filter_pred **preds, int n_preds,
 			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
+extern void filter_disable_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
@@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->preds) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 789e14eb09a5c..ead68ac991913 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call->preds, s);
+	filter_print_preds(call->preds, call->n_preds, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	}
 
 	if (pred->clear) {
-		filter_free_preds(call);
+		filter_disable_preds(call);
 		filter_free_pred(pred);
 		return cnt;
 	}
@@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return err;
 	}
 
+	filter_free_pred(pred);
+
 	*ppos += cnt;
 
 	return cnt;
@@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(system->preds, s);
+	filter_print_preds(system->preds, system->n_preds, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 	list_add(&system->list, &event_subsystems);
 
 	system->preds = NULL;
+	system->n_preds = 0;
 
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9f8ecca34a595..de42dad42a884 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
 	return match;
 }
 
+static int filter_pred_none(struct filter_pred *pred, void *event)
+{
+	return 0;
+}
+
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 	int i, matched, and_failed = 0;
 	struct filter_pred *pred;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i]) {
-			pred = call->preds[i];
-			if (and_failed && !pred->or)
-				continue;
-			matched = pred->fn(pred, rec);
-			if (!matched && !pred->or) {
-				and_failed = 1;
-				continue;
-			} else if (matched && pred->or)
-				return 1;
-		} else
-			break;
+	for (i = 0; i < call->n_preds; i++) {
+		pred = call->preds[i];
+		if (and_failed && !pred->or)
+			continue;
+		matched = pred->fn(pred, rec);
+		if (!matched && !pred->or) {
+			and_failed = 1;
+			continue;
+		} else if (matched && pred->or)
+			return 1;
 	}
 
 	if (and_failed)
@@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 	return 1;
 }
 
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void filter_print_preds(struct filter_pred **preds, int n_preds,
+			struct trace_seq *s)
 {
 	char *field_name;
 	struct filter_pred *pred;
 	int i;
 
-	if (!preds) {
+	if (!n_preds) {
 		trace_seq_printf(s, "none\n");
 		return;
 	}
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (preds[i]) {
-			pred = preds[i];
-			field_name = pred->field_name;
-			if (i)
-				trace_seq_printf(s, pred->or ? "|| " : "&& ");
-			trace_seq_printf(s, "%s ", field_name);
-			trace_seq_printf(s, pred->not ? "!= " : "== ");
-			if (pred->str_val)
-				trace_seq_printf(s, "%s\n", pred->str_val);
-			else
-				trace_seq_printf(s, "%llu\n", pred->val);
-		} else
-			break;
+	for (i = 0; i < n_preds; i++) {
+		pred = preds[i];
+		field_name = pred->field_name;
+		if (i)
+			trace_seq_printf(s, pred->or ? "|| " : "&& ");
+		trace_seq_printf(s, "%s ", field_name);
+		trace_seq_printf(s, pred->not ? "!= " : "== ");
+		if (pred->str_len)
+			trace_seq_printf(s, "%s\n", pred->str_val);
+		else
+			trace_seq_printf(s, "%llu\n", pred->val);
 	}
 }
 
@@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred)
 		return;
 
 	kfree(pred->field_name);
-	kfree(pred->str_val);
 	kfree(pred);
 }
 
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
+{
+	kfree(pred->field_name);
+	pred->field_name = NULL;
+	pred->str_len = 0;
+}
+
+static int filter_set_pred(struct filter_pred *dest,
+			   struct filter_pred *src,
+			   filter_pred_fn_t fn)
+{
+	*dest = *src;
+	dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+	if (!dest->field_name)
+		return -ENOMEM;
+	dest->fn = fn;
+
+	return 0;
+}
+
+void filter_disable_preds(struct ftrace_event_call *call)
 {
 	int i;
 
-	if (call->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
+	call->n_preds = 0;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++)
+		call->preds[i]->fn = filter_pred_none;
+}
+
+int init_preds(struct ftrace_event_call *call)
+{
+	struct filter_pred *pred;
+	int i;
+
+	call->n_preds = 0;
+
+	call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!call->preds)
+		return -ENOMEM;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+		if (!pred)
+			goto oom;
+		pred->fn = filter_pred_none;
+		call->preds[i] = pred;
+	}
+
+	return 0;
+
+oom:
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (call->preds[i])
 			filter_free_pred(call->preds[i]);
-		kfree(call->preds);
-		call->preds = NULL;
 	}
+	kfree(call->preds);
+	call->preds = NULL;
+
+	return -ENOMEM;
 }
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
@@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 	struct ftrace_event_call *call = __start_ftrace_events;
 	int i;
 
-	if (system->preds) {
-		for (i = 0; i < MAX_FILTER_PRED; i++)
+	if (system->n_preds) {
+		for (i = 0; i < system->n_preds; i++)
 			filter_free_pred(system->preds[i]);
 		kfree(system->preds);
 		system->preds = NULL;
+		system->n_preds = 0;
 	}
 
 	events_for_each(call) {
@@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 			continue;
 
 		if (!strcmp(call->system, system->name))
-			filter_free_preds(call);
+			filter_disable_preds(call);
 	}
 }
 
 static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+			     struct filter_pred *pred,
+			     filter_pred_fn_t fn)
 {
-	int i;
+	int idx, err;
 
-	if (call->preds && !pred->compound)
-		filter_free_preds(call);
+	if (call->n_preds && !pred->compound)
+		filter_disable_preds(call);
 
-	if (!call->preds) {
-		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-				      GFP_KERNEL);
-		if (!call->preds)
-			return -ENOMEM;
-	}
+	if (call->n_preds == MAX_FILTER_PRED)
+		return -ENOSPC;
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!call->preds[i]) {
-			call->preds[i] = pred;
-			return 0;
-		}
-	}
+	idx = call->n_preds;
+	filter_clear_pred(call->preds[idx]);
+	err = filter_set_pred(call->preds[idx], pred, fn);
+	if (err)
+		return err;
+
+	call->n_preds++;
 
-	return -ENOSPC;
+	return 0;
 }
 
 static int is_string_field(const char *type)
@@ -229,98 +277,66 @@ static int is_string_field(const char *type)
 int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
+	filter_pred_fn_t fn;
 
 	field = find_event_field(call, pred->field_name);
 	if (!field)
 		return -EINVAL;
 
+	pred->fn = filter_pred_none;
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
-		if (!pred->str_val)
+		if (!pred->str_len)
 			return -EINVAL;
-		pred->fn = filter_pred_string;
+		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return __filter_add_pred(call, pred);
+		return __filter_add_pred(call, pred, fn);
 	} else {
-		if (pred->str_val)
+		if (pred->str_len)
 			return -EINVAL;
 	}
 
 	switch (field->size) {
 	case 8:
-		pred->fn = filter_pred_64;
+		fn = filter_pred_64;
 		break;
 	case 4:
-		pred->fn = filter_pred_32;
+		fn = filter_pred_32;
 		break;
 	case 2:
-		pred->fn = filter_pred_16;
+		fn = filter_pred_16;
 		break;
 	case 1:
-		pred->fn = filter_pred_8;
+		fn = filter_pred_8;
 		break;
 	default:
 		return -EINVAL;
 	}
 
-	return __filter_add_pred(call, pred);
-}
-
-static struct filter_pred *copy_pred(struct filter_pred *pred)
-{
-	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
-	if (!new_pred)
-		return NULL;
-
-	memcpy(new_pred, pred, sizeof(*pred));
-
-	if (pred->field_name) {
-		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-		if (!new_pred->field_name) {
-			kfree(new_pred);
-			return NULL;
-		}
-	}
-
-	if (pred->str_val) {
-		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-		if (!new_pred->str_val) {
-			filter_free_pred(new_pred);
-			return NULL;
-		}
-	}
-
-	return new_pred;
+	return __filter_add_pred(call, pred, fn);
 }
 
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
-	struct filter_pred *event_pred;
-	int i;
 
-	if (system->preds && !pred->compound)
+	if (system->n_preds && !pred->compound)
 		filter_free_subsystem_preds(system);
 
-	if (!system->preds) {
+	if (!system->n_preds) {
 		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
 		if (!system->preds)
 			return -ENOMEM;
 	}
 
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (!system->preds[i]) {
-			system->preds[i] = pred;
-			break;
-		}
-	}
-
-	if (i == MAX_FILTER_PRED)
+	if (system->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
 
+	system->preds[system->n_preds] = pred;
+
 	events_for_each(call) {
 		int err;
 
@@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (!find_event_field(call, pred->field_name))
 			continue;
 
-		event_pred = copy_pred(pred);
-		if (!event_pred)
-			goto oom;
-
-		err = filter_add_pred(call, event_pred);
-		if (err)
-			filter_free_pred(event_pred);
-		if (err == -ENOMEM)
-			goto oom;
+		err = filter_add_pred(call, pred);
+		if (err == -ENOMEM) {
+			system->preds[system->n_preds] = NULL;
+			return err;
+		}
 	}
 
-	return 0;
+	system->n_preds++;
 
-oom:
-	system->preds[i] = NULL;
-	return -ENOMEM;
+	return 0;
 }
 
 int filter_parse(char **pbuf, struct filter_pred *pred)
@@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		}
 	}
 
-	if (!val_str) {
+	if (!val_str || !strlen(val_str)
+	    || strlen(val_str) >= MAX_FILTER_STR_VAL) {
 		pred->field_name = NULL;
 		return -EINVAL;
 	}
@@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 	if (!pred->field_name)
 		return -ENOMEM;
 
+	pred->str_len = 0;
 	pred->val = simple_strtoull(val_str, &tmp, 0);
 	if (tmp == val_str) {
-		pred->str_val = kstrdup(val_str, GFP_KERNEL);
-		if (!pred->str_val)
-			return -ENOMEM;
+		strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL);
+		pred->str_len = strlen(val_str);
+		pred->str_val[pred->str_len] = '\0';
 	} else if (*tmp != '\0')
 		return -EINVAL;
 
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 02fb710193ed4..59cfd7dfe68d4 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -140,6 +140,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 
 #undef __array
 #define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
 				 sizeof(field.item));			\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index b2b298269eb0e..5bb1b7ffbdb6e 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void)				\
 		return -ENODEV;						\
 	event_##call.id = id;						\
 	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
 	return 0;							\
 }									\
 									\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 77c494f5e1d6a..48fc02fe73a09 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 static int ftrace_raw_init_event_##call(void)				\
 {									\
 	INIT_LIST_HEAD(&event_##call.fields);				\
+	init_preds(&event_##call);					\
 	return 0;							\
 }									\
 
-- 
GitLab


From 6e837fb152410e571a81aaadbd9884f0bc46a55e Mon Sep 17 00:00:00 2001
From: Etienne Basset <etienne.basset@numericable.fr>
Date: Wed, 8 Apr 2009 20:39:40 +0200
Subject: [PATCH 0383/2198] smack: implement logging V3

This patch creates auditing functions usable by LSM to audit security
events. It provides standard dumping of FS, NET, task etc ... events
(code borrowed from SELinux)
and provides 2 callbacks to define LSM specific auditing, which should be
flexible enough to convert SELinux too.

Signed-off-by: Etienne Basset <etienne.basset@numericable.fr>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
cked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/lsm_audit.h | 111 +++++++++++
 security/lsm_audit.c      | 386 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 497 insertions(+)
 create mode 100644 include/linux/lsm_audit.h
 create mode 100644 security/lsm_audit.c

diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
new file mode 100644
index 0000000000000..e461b2c3d711a
--- /dev/null
+++ b/include/linux/lsm_audit.h
@@ -0,0 +1,111 @@
+/*
+ * Common LSM logging functions
+ * Heavily borrowed from selinux/avc.h
+ *
+ * Author : Etienne BASSET  <etienne.basset@ensta.org>
+ *
+ * All credits to : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * All BUGS to : Etienne BASSET  <etienne.basset@ensta.org>
+ */
+#ifndef _LSM_COMMON_LOGGING_
+#define _LSM_COMMON_LOGGING_
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kdev_t.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/audit.h>
+#include <linux/in6.h>
+#include <linux/path.h>
+#include <linux/key.h>
+#include <linux/skbuff.h>
+#include <asm/system.h>
+
+
+/* Auxiliary data to use in generating the audit record. */
+struct common_audit_data {
+	char    type;
+#define LSM_AUDIT_DATA_FS      1
+#define LSM_AUDIT_DATA_NET     2
+#define LSM_AUDIT_DATA_CAP     3
+#define LSM_AUDIT_DATA_IPC     4
+#define LSM_AUDIT_DATA_TASK    5
+#define LSM_AUDIT_DATA_KEY     6
+	struct task_struct *tsk;
+	union 	{
+		struct {
+			struct path path;
+			struct inode *inode;
+		} fs;
+		struct {
+			int netif;
+			struct sock *sk;
+			u16 family;
+			__be16 dport;
+			__be16 sport;
+			union {
+				struct {
+					__be32 daddr;
+					__be32 saddr;
+				} v4;
+				struct {
+					struct in6_addr daddr;
+					struct in6_addr saddr;
+				} v6;
+			} fam;
+		} net;
+		int cap;
+		int ipc_id;
+		struct task_struct *tsk;
+#ifdef CONFIG_KEYS
+		struct {
+			key_serial_t key;
+			char *key_desc;
+		} key_struct;
+#endif
+	} u;
+	const char *function;
+	/* this union contains LSM specific data */
+	union {
+		/* SMACK data */
+		struct smack_audit_data {
+			char *subject;
+			char *object;
+			char *request;
+			int result;
+		} smack_audit_data;
+		/* SELinux data */
+		struct {
+			u32 ssid;
+			u32 tsid;
+			u16 tclass;
+			u32 requested;
+			u32 audited;
+			struct av_decision *avd;
+			int result;
+		} selinux_audit_data;
+	} lsm_priv;
+	/* these callback will be implemented by a specific LSM */
+	void (*lsm_pre_audit)(struct audit_buffer *, void *);
+	void (*lsm_post_audit)(struct audit_buffer *, void *);
+};
+
+#define v4info fam.v4
+#define v6info fam.v6
+
+int ipv4_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto);
+
+int ipv6_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto);
+
+/* Initialize an LSM audit data structure. */
+#define COMMON_AUDIT_DATA_INIT(_d, _t) \
+	{ memset((_d), 0, sizeof(struct common_audit_data)); \
+	 (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; }
+
+void common_lsm_audit(struct common_audit_data *a);
+
+#endif
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
new file mode 100644
index 0000000000000..94b868494b316
--- /dev/null
+++ b/security/lsm_audit.c
@@ -0,0 +1,386 @@
+/*
+ * common LSM auditing functions
+ *
+ * Based on code written for SELinux by :
+ *			Stephen Smalley, <sds@epoch.ncsc.mil>
+ * 			James Morris <jmorris@redhat.com>
+ * Author : Etienne Basset, <etienne.basset@ensta.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <linux/un.h>
+#include <net/af_unix.h>
+#include <linux/audit.h>
+#include <linux/ipv6.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/dccp.h>
+#include <linux/sctp.h>
+#include <linux/lsm_audit.h>
+
+/**
+ * ipv4_skb_to_auditdata : fill auditdata from skb
+ * @skb : the skb
+ * @ad : the audit data to fill
+ * @proto : the layer 4 protocol
+ *
+ * return  0 on success
+ */
+int ipv4_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto)
+{
+	int ret = 0;
+	struct iphdr *ih;
+
+	ih = ip_hdr(skb);
+	if (ih == NULL)
+		return -EINVAL;
+
+	ad->u.net.v4info.saddr = ih->saddr;
+	ad->u.net.v4info.daddr = ih->daddr;
+
+	if (proto)
+		*proto = ih->protocol;
+	/* non initial fragment */
+	if (ntohs(ih->frag_off) & IP_OFFSET)
+		return 0;
+
+	switch (ih->protocol) {
+	case IPPROTO_TCP: {
+		struct tcphdr *th = tcp_hdr(skb);
+		if (th == NULL)
+			break;
+
+		ad->u.net.sport = th->source;
+		ad->u.net.dport = th->dest;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr *uh = udp_hdr(skb);
+		if (uh == NULL)
+			break;
+
+		ad->u.net.sport = uh->source;
+		ad->u.net.dport = uh->dest;
+		break;
+	}
+	case IPPROTO_DCCP: {
+		struct dccp_hdr *dh = dccp_hdr(skb);
+		if (dh == NULL)
+			break;
+
+		ad->u.net.sport = dh->dccph_sport;
+		ad->u.net.dport = dh->dccph_dport;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		struct sctphdr *sh = sctp_hdr(skb);
+		if (sh == NULL)
+			break;
+		ad->u.net.sport = sh->source;
+		ad->u.net.dport = sh->dest;
+		break;
+	}
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * ipv6_skb_to_auditdata : fill auditdata from skb
+ * @skb : the skb
+ * @ad : the audit data to fill
+ * @proto : the layer 4 protocol
+ *
+ * return  0 on success
+ */
+int ipv6_skb_to_auditdata(struct sk_buff *skb,
+		struct common_audit_data *ad, u8 *proto)
+{
+	int offset, ret = 0;
+	struct ipv6hdr *ip6;
+	u8 nexthdr;
+
+	ip6 = ipv6_hdr(skb);
+	if (ip6 == NULL)
+		return -EINVAL;
+	ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr);
+	ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr);
+	ret = 0;
+	/* IPv6 can have several extension header before the Transport header
+	 * skip them */
+	offset = skb_network_offset(skb);
+	offset += sizeof(*ip6);
+	nexthdr = ip6->nexthdr;
+	offset = ipv6_skip_exthdr(skb, offset, &nexthdr);
+	if (offset < 0)
+		return 0;
+	if (proto)
+		*proto = nexthdr;
+	switch (nexthdr) {
+	case IPPROTO_TCP: {
+		struct tcphdr _tcph, *th;
+
+		th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+		if (th == NULL)
+			break;
+
+		ad->u.net.sport = th->source;
+		ad->u.net.dport = th->dest;
+		break;
+	}
+	case IPPROTO_UDP: {
+		struct udphdr _udph, *uh;
+
+		uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+		if (uh == NULL)
+			break;
+
+		ad->u.net.sport = uh->source;
+		ad->u.net.dport = uh->dest;
+		break;
+	}
+	case IPPROTO_DCCP: {
+		struct dccp_hdr _dccph, *dh;
+
+		dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph);
+		if (dh == NULL)
+			break;
+
+		ad->u.net.sport = dh->dccph_sport;
+		ad->u.net.dport = dh->dccph_dport;
+		break;
+	}
+	case IPPROTO_SCTP: {
+		struct sctphdr _sctph, *sh;
+
+		sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph);
+		if (sh == NULL)
+			break;
+		ad->u.net.sport = sh->source;
+		ad->u.net.dport = sh->dest;
+		break;
+	}
+	default:
+		ret = -EINVAL;
+	}
+	return ret;
+}
+#endif
+
+
+static inline void print_ipv6_addr(struct audit_buffer *ab,
+				   struct in6_addr *addr, __be16 port,
+				   char *name1, char *name2)
+{
+	if (!ipv6_addr_any(addr))
+		audit_log_format(ab, " %s=%pI6", name1, addr);
+	if (port)
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
+}
+
+static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr,
+				   __be16 port, char *name1, char *name2)
+{
+	if (addr)
+		audit_log_format(ab, " %s=%pI4", name1, &addr);
+	if (port)
+		audit_log_format(ab, " %s=%d", name2, ntohs(port));
+}
+
+/**
+ * dump_common_audit_data - helper to dump common audit data
+ * @a : common audit data
+ *
+ */
+static void dump_common_audit_data(struct audit_buffer *ab,
+				   struct common_audit_data *a)
+{
+	struct inode *inode = NULL;
+	struct task_struct *tsk = current;
+
+	if (a->tsk)
+		tsk = a->tsk;
+	if (tsk && tsk->pid) {
+		audit_log_format(ab, " pid=%d comm=", tsk->pid);
+		audit_log_untrustedstring(ab, tsk->comm);
+	}
+
+	switch (a->type) {
+	case LSM_AUDIT_DATA_IPC:
+		audit_log_format(ab, " key=%d ", a->u.ipc_id);
+		break;
+	case LSM_AUDIT_DATA_CAP:
+		audit_log_format(ab, " capability=%d ", a->u.cap);
+		break;
+	case LSM_AUDIT_DATA_FS:
+		if (a->u.fs.path.dentry) {
+			struct dentry *dentry = a->u.fs.path.dentry;
+			if (a->u.fs.path.mnt) {
+				audit_log_d_path(ab, "path=", &a->u.fs.path);
+			} else {
+				audit_log_format(ab, " name=");
+				audit_log_untrustedstring(ab,
+						 dentry->d_name.name);
+			}
+			inode = dentry->d_inode;
+		} else if (a->u.fs.inode) {
+			struct dentry *dentry;
+			inode = a->u.fs.inode;
+			dentry = d_find_alias(inode);
+			if (dentry) {
+				audit_log_format(ab, " name=");
+				audit_log_untrustedstring(ab,
+						 dentry->d_name.name);
+				dput(dentry);
+			}
+		}
+		if (inode)
+			audit_log_format(ab, " dev=%s ino=%lu",
+					inode->i_sb->s_id,
+					inode->i_ino);
+		break;
+	case LSM_AUDIT_DATA_TASK:
+		tsk = a->u.tsk;
+		if (tsk && tsk->pid) {
+			audit_log_format(ab, " pid=%d comm=", tsk->pid);
+			audit_log_untrustedstring(ab, tsk->comm);
+		}
+		break;
+	case LSM_AUDIT_DATA_NET:
+		if (a->u.net.sk) {
+			struct sock *sk = a->u.net.sk;
+			struct unix_sock *u;
+			int len = 0;
+			char *p = NULL;
+
+			switch (sk->sk_family) {
+			case AF_INET: {
+				struct inet_sock *inet = inet_sk(sk);
+
+				print_ipv4_addr(ab, inet->rcv_saddr,
+						inet->sport,
+						"laddr", "lport");
+				print_ipv4_addr(ab, inet->daddr,
+						inet->dport,
+						"faddr", "fport");
+				break;
+			}
+			case AF_INET6: {
+				struct inet_sock *inet = inet_sk(sk);
+				struct ipv6_pinfo *inet6 = inet6_sk(sk);
+
+				print_ipv6_addr(ab, &inet6->rcv_saddr,
+						inet->sport,
+						"laddr", "lport");
+				print_ipv6_addr(ab, &inet6->daddr,
+						inet->dport,
+						"faddr", "fport");
+				break;
+			}
+			case AF_UNIX:
+				u = unix_sk(sk);
+				if (u->dentry) {
+					struct path path = {
+						.dentry = u->dentry,
+						.mnt = u->mnt
+					};
+					audit_log_d_path(ab, "path=", &path);
+					break;
+				}
+				if (!u->addr)
+					break;
+				len = u->addr->len-sizeof(short);
+				p = &u->addr->name->sun_path[0];
+				audit_log_format(ab, " path=");
+				if (*p)
+					audit_log_untrustedstring(ab, p);
+				else
+					audit_log_n_hex(ab, p, len);
+				break;
+			}
+		}
+
+		switch (a->u.net.family) {
+		case AF_INET:
+			print_ipv4_addr(ab, a->u.net.v4info.saddr,
+					a->u.net.sport,
+					"saddr", "src");
+			print_ipv4_addr(ab, a->u.net.v4info.daddr,
+					a->u.net.dport,
+					"daddr", "dest");
+			break;
+		case AF_INET6:
+			print_ipv6_addr(ab, &a->u.net.v6info.saddr,
+					a->u.net.sport,
+					"saddr", "src");
+			print_ipv6_addr(ab, &a->u.net.v6info.daddr,
+					a->u.net.dport,
+					"daddr", "dest");
+			break;
+		}
+		if (a->u.net.netif > 0) {
+			struct net_device *dev;
+
+			/* NOTE: we always use init's namespace */
+			dev = dev_get_by_index(&init_net, a->u.net.netif);
+			if (dev) {
+				audit_log_format(ab, " netif=%s", dev->name);
+				dev_put(dev);
+			}
+		}
+		break;
+#ifdef CONFIG_KEYS
+	case LSM_AUDIT_DATA_KEY:
+		audit_log_format(ab, " key_serial=%u", a->u.key_struct.key);
+		if (a->u.key_struct.key_desc) {
+			audit_log_format(ab, " key_desc=");
+			audit_log_untrustedstring(ab, a->u.key_struct.key_desc);
+		}
+		break;
+#endif
+	} /* switch (a->type) */
+}
+
+/**
+ * common_lsm_audit - generic LSM auditing function
+ * @a:  auxiliary audit data
+ *
+ * setup the audit buffer for common security information
+ * uses callback to print LSM specific information
+ */
+void common_lsm_audit(struct common_audit_data *a)
+{
+	struct audit_buffer *ab;
+
+	if (a == NULL)
+		return;
+	/* we use GFP_ATOMIC so we won't sleep */
+	ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC);
+
+	if (ab == NULL)
+		return;
+
+	if (a->lsm_pre_audit)
+		a->lsm_pre_audit(ab, a);
+
+	dump_common_audit_data(ab, a);
+
+	if (a->lsm_post_audit)
+		a->lsm_post_audit(ab, a);
+
+	audit_log_end(ab);
+}
-- 
GitLab


From ecfcc53fef3c357574bb6143dce6631e6d56295c Mon Sep 17 00:00:00 2001
From: Etienne Basset <etienne.basset@numericable.fr>
Date: Wed, 8 Apr 2009 20:40:06 +0200
Subject: [PATCH 0384/2198] smack: implement logging V3

the following patch, add logging of Smack security decisions.
This is of course very useful to understand what your current smack policy does.
As suggested by Casey, it also now forbids labels with ', " or \

It introduces a '/smack/logging' switch :
0: no logging
1: log denied (default)
2: log accepted
3: log denied&accepted

Signed-off-by: Etienne Basset <etienne.basset@numericable.fr>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/Smack.txt       |  20 +-
 security/Makefile             |   3 +
 security/smack/smack.h        | 108 +++++++++-
 security/smack/smack_access.c | 143 +++++++++++--
 security/smack/smack_lsm.c    | 390 ++++++++++++++++++++++++++--------
 security/smack/smackfs.c      |  66 ++++++
 6 files changed, 618 insertions(+), 112 deletions(-)

diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt
index 629c92e99783e..34614b4c708eb 100644
--- a/Documentation/Smack.txt
+++ b/Documentation/Smack.txt
@@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything
 other than a letter or digit, are reserved for use by the Smack development
 team. Smack labels are unstructured, case sensitive, and the only operation
 ever performed on them is comparison for equality. Smack labels cannot
-contain unprintable characters or the "/" (slash) character. Smack labels
-cannot begin with a '-', which is reserved for special options.
+contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
+(quote) and '"' (double-quote) characters.
+Smack labels cannot begin with a '-', which is reserved for special options.
 
 There are some predefined labels:
 
@@ -523,3 +524,18 @@ Smack supports some mount options:
 
 These mount options apply to all file system types.
 
+Smack auditing
+
+If you want Smack auditing of security events, you need to set CONFIG_AUDIT
+in your kernel configuration.
+By default, all denied events will be audited. You can change this behavior by
+writing a single character to the /smack/logging file :
+0 : no logging
+1 : log denied (default)
+2 : log accepted
+3 : log denied & accepted
+
+Events are logged as 'key=value' pairs, for each event you at least will get
+the subjet, the object, the rights requested, the action, the kernel function
+that triggered the event, plus other pairs depending on the type of event
+audited.
diff --git a/security/Makefile b/security/Makefile
index fa77021d9778a..c67557cdaa857 100644
--- a/security/Makefile
+++ b/security/Makefile
@@ -16,6 +16,9 @@ obj-$(CONFIG_SECURITYFS)		+= inode.o
 # Must precede capability.o in order to stack properly.
 obj-$(CONFIG_SECURITY_SELINUX)		+= selinux/built-in.o
 obj-$(CONFIG_SECURITY_SMACK)		+= smack/built-in.o
+ifeq ($(CONFIG_AUDIT),y)
+obj-$(CONFIG_SECURITY_SMACK)		+= lsm_audit.o
+endif
 obj-$(CONFIG_SECURITY_TOMOYO)		+= tomoyo/built-in.o
 obj-$(CONFIG_SECURITY_ROOTPLUG)		+= root_plug.o
 obj-$(CONFIG_CGROUP_DEVICE)		+= device_cgroup.o
diff --git a/security/smack/smack.h b/security/smack/smack.h
index 42ef313f98560..243bec175be05 100644
--- a/security/smack/smack.h
+++ b/security/smack/smack.h
@@ -20,6 +20,7 @@
 #include <net/netlabel.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
+#include <linux/lsm_audit.h>
 
 /*
  * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is
@@ -178,6 +179,20 @@ struct smack_known {
 #define MAY_READWRITE	(MAY_READ | MAY_WRITE)
 #define MAY_NOT		0
 
+/*
+ * Number of access types used by Smack (rwxa)
+ */
+#define SMK_NUM_ACCESS_TYPE 4
+
+/*
+ * Smack audit data; is empty if CONFIG_AUDIT not set
+ * to save some stack
+ */
+struct smk_audit_info {
+#ifdef CONFIG_AUDIT
+	struct common_audit_data a;
+#endif
+};
 /*
  * These functions are in smack_lsm.c
  */
@@ -186,8 +201,8 @@ struct inode_smack *new_inode_smack(char *);
 /*
  * These functions are in smack_access.c
  */
-int smk_access(char *, char *, int);
-int smk_curacc(char *, u32);
+int smk_access(char *, char *, int, struct smk_audit_info *);
+int smk_curacc(char *, u32, struct smk_audit_info *);
 int smack_to_cipso(const char *, struct smack_cipso *);
 void smack_from_cipso(u32, char *, char *);
 char *smack_from_secid(const u32);
@@ -237,4 +252,93 @@ static inline char *smk_of_inode(const struct inode *isp)
 	return sip->smk_inode;
 }
 
+/*
+ * logging functions
+ */
+#define SMACK_AUDIT_DENIED 0x1
+#define SMACK_AUDIT_ACCEPT 0x2
+extern int log_policy;
+
+void smack_log(char *subject_label, char *object_label,
+		int request,
+		int result, struct smk_audit_info *auditdata);
+
+#ifdef CONFIG_AUDIT
+
+/*
+ * some inline functions to set up audit data
+ * they do nothing if CONFIG_AUDIT is not set
+ *
+ */
+static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
+			       char type)
+{
+	memset(a, 0, sizeof(*a));
+	a->a.type = type;
+	a->a.function = func;
+}
+
+static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
+					 struct task_struct *t)
+{
+	a->a.u.tsk = t;
+}
+static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
+						    struct dentry *d)
+{
+	a->a.u.fs.path.dentry = d;
+}
+static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a,
+						 struct vfsmount *m)
+{
+	a->a.u.fs.path.mnt = m;
+}
+static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
+					      struct inode *i)
+{
+	a->a.u.fs.inode = i;
+}
+static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
+					     struct path p)
+{
+	a->a.u.fs.path = p;
+}
+static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
+					    struct sock *sk)
+{
+	a->a.u.net.sk = sk;
+}
+
+#else /* no AUDIT */
+
+static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
+			       char type)
+{
+}
+static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
+					 struct task_struct *t)
+{
+}
+static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
+						    struct dentry *d)
+{
+}
+static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a,
+						 struct vfsmount *m)
+{
+}
+static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
+					      struct inode *i)
+{
+}
+static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
+					     struct path p)
+{
+}
+static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
+					    struct sock *sk)
+{
+}
+#endif
+
 #endif  /* _SECURITY_SMACK_H */
diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c
index ac0a2707f6d41..513dc1aa16dd1 100644
--- a/security/smack/smack_access.c
+++ b/security/smack/smack_access.c
@@ -59,11 +59,18 @@ LIST_HEAD(smack_known_list);
  */
 static u32 smack_next_secid = 10;
 
+/*
+ * what events do we log
+ * can be overwritten at run-time by /smack/logging
+ */
+int log_policy = SMACK_AUDIT_DENIED;
+
 /**
  * smk_access - determine if a subject has a specific access to an object
  * @subject_label: a pointer to the subject's Smack label
  * @object_label: a pointer to the object's Smack label
  * @request: the access requested, in "MAY" format
+ * @a : a pointer to the audit data
  *
  * This function looks up the subject/object pair in the
  * access rule list and returns 0 if the access is permitted,
@@ -78,10 +85,12 @@ static u32 smack_next_secid = 10;
  * will be on the list, so checking the pointers may be a worthwhile
  * optimization.
  */
-int smk_access(char *subject_label, char *object_label, int request)
+int smk_access(char *subject_label, char *object_label, int request,
+	       struct smk_audit_info *a)
 {
 	u32 may = MAY_NOT;
 	struct smack_rule *srp;
+	int rc = 0;
 
 	/*
 	 * Hardcoded comparisons.
@@ -89,8 +98,10 @@ int smk_access(char *subject_label, char *object_label, int request)
 	 * A star subject can't access any object.
 	 */
 	if (subject_label == smack_known_star.smk_known ||
-	    strcmp(subject_label, smack_known_star.smk_known) == 0)
-		return -EACCES;
+	    strcmp(subject_label, smack_known_star.smk_known) == 0) {
+		rc = -EACCES;
+		goto out_audit;
+	}
 	/*
 	 * An internet object can be accessed by any subject.
 	 * Tasks cannot be assigned the internet label.
@@ -100,20 +111,20 @@ int smk_access(char *subject_label, char *object_label, int request)
 	    subject_label == smack_known_web.smk_known ||
 	    strcmp(object_label, smack_known_web.smk_known) == 0 ||
 	    strcmp(subject_label, smack_known_web.smk_known) == 0)
-		return 0;
+		goto out_audit;
 	/*
 	 * A star object can be accessed by any subject.
 	 */
 	if (object_label == smack_known_star.smk_known ||
 	    strcmp(object_label, smack_known_star.smk_known) == 0)
-		return 0;
+		goto out_audit;
 	/*
 	 * An object can be accessed in any way by a subject
 	 * with the same label.
 	 */
 	if (subject_label == object_label ||
 	    strcmp(subject_label, object_label) == 0)
-		return 0;
+		goto out_audit;
 	/*
 	 * A hat subject can read any object.
 	 * A floor object can be read by any subject.
@@ -121,10 +132,10 @@ int smk_access(char *subject_label, char *object_label, int request)
 	if ((request & MAY_ANYREAD) == request) {
 		if (object_label == smack_known_floor.smk_known ||
 		    strcmp(object_label, smack_known_floor.smk_known) == 0)
-			return 0;
+			goto out_audit;
 		if (subject_label == smack_known_hat.smk_known ||
 		    strcmp(subject_label, smack_known_hat.smk_known) == 0)
-			return 0;
+			goto out_audit;
 	}
 	/*
 	 * Beyond here an explicit relationship is required.
@@ -148,28 +159,36 @@ int smk_access(char *subject_label, char *object_label, int request)
 	 * This is a bit map operation.
 	 */
 	if ((request & may) == request)
-		return 0;
-
-	return -EACCES;
+		goto out_audit;
+
+	rc = -EACCES;
+out_audit:
+#ifdef CONFIG_AUDIT
+	if (a)
+		smack_log(subject_label, object_label, request, rc, a);
+#endif
+	return rc;
 }
 
 /**
  * smk_curacc - determine if current has a specific access to an object
  * @obj_label: a pointer to the object's Smack label
  * @mode: the access requested, in "MAY" format
+ * @a : common audit data
  *
  * This function checks the current subject label/object label pair
  * in the access rule list and returns 0 if the access is permitted,
  * non zero otherwise. It allows that current may have the capability
  * to override the rules.
  */
-int smk_curacc(char *obj_label, u32 mode)
+int smk_curacc(char *obj_label, u32 mode, struct smk_audit_info *a)
 {
 	int rc;
+	char *sp = current_security();
 
-	rc = smk_access(current_security(), obj_label, mode);
+	rc = smk_access(sp, obj_label, mode, NULL);
 	if (rc == 0)
-		return 0;
+		goto out_audit;
 
 	/*
 	 * Return if a specific label has been designated as the
@@ -177,14 +196,105 @@ int smk_curacc(char *obj_label, u32 mode)
 	 * have that label.
 	 */
 	if (smack_onlycap != NULL && smack_onlycap != current->cred->security)
-		return rc;
+		goto out_audit;
 
 	if (capable(CAP_MAC_OVERRIDE))
 		return 0;
 
+out_audit:
+#ifdef CONFIG_AUDIT
+	if (a)
+		smack_log(sp, obj_label, mode, rc, a);
+#endif
 	return rc;
 }
 
+#ifdef CONFIG_AUDIT
+/**
+ * smack_str_from_perm : helper to transalate an int to a
+ * readable string
+ * @string : the string to fill
+ * @access : the int
+ *
+ */
+static inline void smack_str_from_perm(char *string, int access)
+{
+	int i = 0;
+	if (access & MAY_READ)
+		string[i++] = 'r';
+	if (access & MAY_WRITE)
+		string[i++] = 'w';
+	if (access & MAY_EXEC)
+		string[i++] = 'x';
+	if (access & MAY_APPEND)
+		string[i++] = 'a';
+	string[i] = '\0';
+}
+/**
+ * smack_log_callback - SMACK specific information
+ * will be called by generic audit code
+ * @ab : the audit_buffer
+ * @a  : audit_data
+ *
+ */
+static void smack_log_callback(struct audit_buffer *ab, void *a)
+{
+	struct common_audit_data *ad = a;
+	struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data;
+	audit_log_format(ab, "lsm=SMACK fn=%s action=%s", ad->function,
+			 sad->result ? "denied" : "granted");
+	audit_log_format(ab, " subject=");
+	audit_log_untrustedstring(ab, sad->subject);
+	audit_log_format(ab, " object=");
+	audit_log_untrustedstring(ab, sad->object);
+	audit_log_format(ab, " requested=%s", sad->request);
+}
+
+/**
+ *  smack_log - Audit the granting or denial of permissions.
+ *  @subject_label : smack label of the requester
+ *  @object_label  : smack label of the object being accessed
+ *  @request: requested permissions
+ *  @result: result from smk_access
+ *  @a:  auxiliary audit data
+ *
+ * Audit the granting or denial of permissions in accordance
+ * with the policy.
+ */
+void smack_log(char *subject_label, char *object_label, int request,
+	       int result, struct smk_audit_info *ad)
+{
+	char request_buffer[SMK_NUM_ACCESS_TYPE + 1];
+	struct smack_audit_data *sad;
+	struct common_audit_data *a = &ad->a;
+
+	/* check if we have to log the current event */
+	if (result != 0 && (log_policy & SMACK_AUDIT_DENIED) == 0)
+		return;
+	if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0)
+		return;
+
+	if (a->function == NULL)
+		a->function = "unknown";
+
+	/* end preparing the audit data */
+	sad = &a->lsm_priv.smack_audit_data;
+	smack_str_from_perm(request_buffer, request);
+	sad->subject = subject_label;
+	sad->object  = object_label;
+	sad->request = request_buffer;
+	sad->result  = result;
+	a->lsm_pre_audit = smack_log_callback;
+
+	common_lsm_audit(a);
+}
+#else /* #ifdef CONFIG_AUDIT */
+void smack_log(char *subject_label, char *object_label, int request,
+               int result, struct smk_audit_info *ad)
+{
+}
+#endif
+
 static DEFINE_MUTEX(smack_known_lock);
 
 /**
@@ -209,7 +319,8 @@ struct smack_known *smk_import_entry(const char *string, int len)
 		if (found)
 			smack[i] = '\0';
 		else if (i >= len || string[i] > '~' || string[i] <= ' ' ||
-			 string[i] == '/') {
+			 string[i] == '/' || string[i] == '"' ||
+			 string[i] == '\\' || string[i] == '\'') {
 			smack[i] = '\0';
 			found = 1;
 		} else
diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 921514902eca4..f557767911c93 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -30,7 +30,6 @@
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
 #include <linux/audit.h>
-
 #include "smack.h"
 
 #define task_security(task)	(task_cred_xxx((task), security))
@@ -103,14 +102,24 @@ struct inode_smack *new_inode_smack(char *smack)
 static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 {
 	int rc;
+	struct smk_audit_info ad;
+	char *sp, *tsp;
 
 	rc = cap_ptrace_may_access(ctp, mode);
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE);
+	sp = current_security();
+	tsp = task_security(ctp);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, ctp);
+
+	/* we won't log here, because rc can be overriden */
+	rc = smk_access(sp, tsp, MAY_READWRITE, NULL);
 	if (rc != 0 && capable(CAP_MAC_OVERRIDE))
-		return 0;
+		rc = 0;
+
+	smack_log(sp, tsp, MAY_READWRITE, rc, &ad);
 	return rc;
 }
 
@@ -125,14 +134,24 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 static int smack_ptrace_traceme(struct task_struct *ptp)
 {
 	int rc;
+	struct smk_audit_info ad;
+	char *sp, *tsp;
 
 	rc = cap_ptrace_traceme(ptp);
 	if (rc != 0)
 		return rc;
 
-	rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, ptp);
+
+	sp = current_security();
+	tsp = task_security(ptp);
+	/* we won't log here, because rc can be overriden */
+	rc = smk_access(tsp, sp, MAY_READWRITE, NULL);
 	if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
-		return 0;
+		rc = 0;
+
+	smack_log(tsp, sp, MAY_READWRITE, rc, &ad);
 	return rc;
 }
 
@@ -327,8 +346,14 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 static int smack_sb_statfs(struct dentry *dentry)
 {
 	struct superblock_smack *sbp = dentry->d_sb->s_security;
+	int rc;
+	struct smk_audit_info ad;
 
-	return smk_curacc(sbp->smk_floor, MAY_READ);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
+	rc = smk_curacc(sbp->smk_floor, MAY_READ, &ad);
+	return rc;
 }
 
 /**
@@ -346,8 +371,12 @@ static int smack_sb_mount(char *dev_name, struct path *path,
 			  char *type, unsigned long flags, void *data)
 {
 	struct superblock_smack *sbp = path->mnt->mnt_sb->s_security;
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path(&ad, *path);
 
-	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+	return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
 /**
@@ -361,10 +390,14 @@ static int smack_sb_mount(char *dev_name, struct path *path,
 static int smack_sb_umount(struct vfsmount *mnt, int flags)
 {
 	struct superblock_smack *sbp;
+	struct smk_audit_info ad;
 
-	sbp = mnt->mnt_sb->s_security;
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, mnt->mnt_mountpoint);
+	smk_ad_setfield_u_fs_path_mnt(&ad, mnt);
 
-	return smk_curacc(sbp->smk_floor, MAY_WRITE);
+	sbp = mnt->mnt_sb->s_security;
+	return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
 /*
@@ -441,15 +474,20 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir,
 static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
 			    struct dentry *new_dentry)
 {
-	int rc;
 	char *isp;
+	struct smk_audit_info ad;
+	int rc;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
 	isp = smk_of_inode(old_dentry->d_inode);
-	rc = smk_curacc(isp, MAY_WRITE);
+	rc = smk_curacc(isp, MAY_WRITE, &ad);
 
 	if (rc == 0 && new_dentry->d_inode != NULL) {
 		isp = smk_of_inode(new_dentry->d_inode);
-		rc = smk_curacc(isp, MAY_WRITE);
+		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
+		rc = smk_curacc(isp, MAY_WRITE, &ad);
 	}
 
 	return rc;
@@ -466,18 +504,24 @@ static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
 static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
 {
 	struct inode *ip = dentry->d_inode;
+	struct smk_audit_info ad;
 	int rc;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
 	/*
 	 * You need write access to the thing you're unlinking
 	 */
-	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE);
-	if (rc == 0)
+	rc = smk_curacc(smk_of_inode(ip), MAY_WRITE, &ad);
+	if (rc == 0) {
 		/*
 		 * You also need write access to the containing directory
 		 */
-		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
-
+		smk_ad_setfield_u_fs_path_dentry(&ad, NULL);
+		smk_ad_setfield_u_fs_inode(&ad, dir);
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
+	}
 	return rc;
 }
 
@@ -491,17 +535,24 @@ static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
  */
 static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
 {
+	struct smk_audit_info ad;
 	int rc;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
 	/*
 	 * You need write access to the thing you're removing
 	 */
-	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
-	if (rc == 0)
+	rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
+	if (rc == 0) {
 		/*
 		 * You also need write access to the containing directory
 		 */
-		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+		smk_ad_setfield_u_fs_path_dentry(&ad, NULL);
+		smk_ad_setfield_u_fs_inode(&ad, dir);
+		rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
+	}
 
 	return rc;
 }
@@ -525,15 +576,19 @@ static int smack_inode_rename(struct inode *old_inode,
 {
 	int rc;
 	char *isp;
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
 	isp = smk_of_inode(old_dentry->d_inode);
-	rc = smk_curacc(isp, MAY_READWRITE);
+	rc = smk_curacc(isp, MAY_READWRITE, &ad);
 
 	if (rc == 0 && new_dentry->d_inode != NULL) {
 		isp = smk_of_inode(new_dentry->d_inode);
-		rc = smk_curacc(isp, MAY_READWRITE);
+		smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
+		rc = smk_curacc(isp, MAY_READWRITE, &ad);
 	}
-
 	return rc;
 }
 
@@ -548,13 +603,15 @@ static int smack_inode_rename(struct inode *old_inode,
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
+	struct smk_audit_info ad;
 	/*
 	 * No permission to check. Existence test. Yup, it's there.
 	 */
 	if (mask == 0)
 		return 0;
-
-	return smk_curacc(smk_of_inode(inode), mask);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_inode(&ad, inode);
+	return smk_curacc(smk_of_inode(inode), mask, &ad);
 }
 
 /**
@@ -566,13 +623,16 @@ static int smack_inode_permission(struct inode *inode, int mask)
  */
 static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 {
+	struct smk_audit_info ad;
 	/*
 	 * Need to allow for clearing the setuid bit.
 	 */
 	if (iattr->ia_valid & ATTR_FORCE)
 		return 0;
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
-	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 }
 
 /**
@@ -584,7 +644,12 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
  */
 static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
 {
-	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+	smk_ad_setfield_u_fs_path_mnt(&ad, mnt);
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
 }
 
 /**
@@ -602,6 +667,7 @@ static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
 static int smack_inode_setxattr(struct dentry *dentry, const char *name,
 				const void *value, size_t size, int flags)
 {
+	struct smk_audit_info ad;
 	int rc = 0;
 
 	if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
@@ -615,8 +681,11 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name,
 	} else
 		rc = cap_inode_setxattr(dentry, name, value, size, flags);
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
 	if (rc == 0)
-		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 
 	return rc;
 }
@@ -671,7 +740,12 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
  */
 static int smack_inode_getxattr(struct dentry *dentry, const char *name)
 {
-	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
+	return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
 }
 
 /*
@@ -685,6 +759,7 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name)
  */
 static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 {
+	struct smk_audit_info ad;
 	int rc = 0;
 
 	if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
@@ -695,8 +770,10 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 	} else
 		rc = cap_inode_removexattr(dentry, name);
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 	if (rc == 0)
-		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+		rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 
 	return rc;
 }
@@ -855,12 +932,16 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
 	int rc = 0;
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path(&ad, file->f_path);
 
 	if (_IOC_DIR(cmd) & _IOC_WRITE)
-		rc = smk_curacc(file->f_security, MAY_WRITE);
+		rc = smk_curacc(file->f_security, MAY_WRITE, &ad);
 
 	if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ))
-		rc = smk_curacc(file->f_security, MAY_READ);
+		rc = smk_curacc(file->f_security, MAY_READ, &ad);
 
 	return rc;
 }
@@ -874,7 +955,11 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
  */
 static int smack_file_lock(struct file *file, unsigned int cmd)
 {
-	return smk_curacc(file->f_security, MAY_WRITE);
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path_dentry(&ad, file->f_path.dentry);
+	return smk_curacc(file->f_security, MAY_WRITE, &ad);
 }
 
 /**
@@ -888,8 +973,12 @@ static int smack_file_lock(struct file *file, unsigned int cmd)
 static int smack_file_fcntl(struct file *file, unsigned int cmd,
 			    unsigned long arg)
 {
+	struct smk_audit_info ad;
 	int rc;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+	smk_ad_setfield_u_fs_path(&ad, file->f_path);
+
 	switch (cmd) {
 	case F_DUPFD:
 	case F_GETFD:
@@ -897,7 +986,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
 	case F_GETLK:
 	case F_GETOWN:
 	case F_GETSIG:
-		rc = smk_curacc(file->f_security, MAY_READ);
+		rc = smk_curacc(file->f_security, MAY_READ, &ad);
 		break;
 	case F_SETFD:
 	case F_SETFL:
@@ -905,10 +994,10 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
 	case F_SETLKW:
 	case F_SETOWN:
 	case F_SETSIG:
-		rc = smk_curacc(file->f_security, MAY_WRITE);
+		rc = smk_curacc(file->f_security, MAY_WRITE, &ad);
 		break;
 	default:
-		rc = smk_curacc(file->f_security, MAY_READWRITE);
+		rc = smk_curacc(file->f_security, MAY_READWRITE, &ad);
 	}
 
 	return rc;
@@ -943,14 +1032,21 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 {
 	struct file *file;
 	int rc;
+	char *tsp = tsk->cred->security;
+	struct smk_audit_info ad;
 
 	/*
 	 * struct fown_struct is never outside the context of a struct file
 	 */
 	file = container_of(fown, struct file, f_owner);
-	rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE);
+	/* we don't log here as rc can be overriden */
+	rc = smk_access(file->f_security, tsp, MAY_WRITE, NULL);
 	if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE))
-		return 0;
+		rc = 0;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, tsk);
+	smack_log(file->f_security, tsp, MAY_WRITE, rc, &ad);
 	return rc;
 }
 
@@ -963,7 +1059,10 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 static int smack_file_receive(struct file *file)
 {
 	int may = 0;
+	struct smk_audit_info ad;
 
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_fs_path(&ad, file->f_path);
 	/*
 	 * This code relies on bitmasks.
 	 */
@@ -972,7 +1071,7 @@ static int smack_file_receive(struct file *file)
 	if (file->f_mode & FMODE_WRITE)
 		may |= MAY_WRITE;
 
-	return smk_curacc(file->f_security, may);
+	return smk_curacc(file->f_security, may, &ad);
 }
 
 /*
@@ -1051,6 +1150,22 @@ static int smack_kernel_create_files_as(struct cred *new,
 	return 0;
 }
 
+/**
+ * smk_curacc_on_task - helper to log task related access
+ * @p: the task object
+ * @access : the access requested
+ *
+ * Return 0 if access is permitted
+ */
+static int smk_curacc_on_task(struct task_struct *p, int access)
+{
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, p);
+	return smk_curacc(task_security(p), access, &ad);
+}
+
 /**
  * smack_task_setpgid - Smack check on setting pgid
  * @p: the task object
@@ -1060,7 +1175,7 @@ static int smack_kernel_create_files_as(struct cred *new,
  */
 static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
 {
-	return smk_curacc(task_security(p), MAY_WRITE);
+	return smk_curacc_on_task(p, MAY_WRITE);
 }
 
 /**
@@ -1071,7 +1186,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
  */
 static int smack_task_getpgid(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1082,7 +1197,7 @@ static int smack_task_getpgid(struct task_struct *p)
  */
 static int smack_task_getsid(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1110,7 +1225,7 @@ static int smack_task_setnice(struct task_struct *p, int nice)
 
 	rc = cap_task_setnice(p, nice);
 	if (rc == 0)
-		rc = smk_curacc(task_security(p), MAY_WRITE);
+		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
 }
 
@@ -1127,7 +1242,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
 
 	rc = cap_task_setioprio(p, ioprio);
 	if (rc == 0)
-		rc = smk_curacc(task_security(p), MAY_WRITE);
+		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
 }
 
@@ -1139,7 +1254,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
  */
 static int smack_task_getioprio(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1157,7 +1272,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
 
 	rc = cap_task_setscheduler(p, policy, lp);
 	if (rc == 0)
-		rc = smk_curacc(task_security(p), MAY_WRITE);
+		rc = smk_curacc_on_task(p, MAY_WRITE);
 	return rc;
 }
 
@@ -1169,7 +1284,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
  */
 static int smack_task_getscheduler(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_READ);
+	return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1180,7 +1295,7 @@ static int smack_task_getscheduler(struct task_struct *p)
  */
 static int smack_task_movememory(struct task_struct *p)
 {
-	return smk_curacc(task_security(p), MAY_WRITE);
+	return smk_curacc_on_task(p, MAY_WRITE);
 }
 
 /**
@@ -1198,18 +1313,23 @@ static int smack_task_movememory(struct task_struct *p)
 static int smack_task_kill(struct task_struct *p, struct siginfo *info,
 			   int sig, u32 secid)
 {
+	struct smk_audit_info ad;
+
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, p);
 	/*
 	 * Sending a signal requires that the sender
 	 * can write the receiver.
 	 */
 	if (secid == 0)
-		return smk_curacc(task_security(p), MAY_WRITE);
+		return smk_curacc(task_security(p), MAY_WRITE, &ad);
 	/*
 	 * If the secid isn't 0 we're dealing with some USB IO
 	 * specific behavior. This is not clean. For one thing
 	 * we can't take privilege into account.
 	 */
-	return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE);
+	return smk_access(smack_from_secid(secid), task_security(p),
+			  MAY_WRITE, &ad);
 }
 
 /**
@@ -1220,11 +1340,15 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info,
  */
 static int smack_task_wait(struct task_struct *p)
 {
+	struct smk_audit_info ad;
+	char *sp = current_security();
+	char *tsp = task_security(p);
 	int rc;
 
-	rc = smk_access(current_security(), task_security(p), MAY_WRITE);
+	/* we don't log here, we can be overriden */
+	rc = smk_access(sp, tsp, MAY_WRITE, NULL);
 	if (rc == 0)
-		return 0;
+		goto out_log;
 
 	/*
 	 * Allow the operation to succeed if either task
@@ -1238,8 +1362,12 @@ static int smack_task_wait(struct task_struct *p)
 	 * the smack value.
 	 */
 	if (capable(CAP_MAC_OVERRIDE) || has_capability(p, CAP_MAC_OVERRIDE))
-		return 0;
-
+		rc = 0;
+	/* we log only if we didn't get overriden */
+ out_log:
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+	smk_ad_setfield_u_tsk(&ad, p);
+	smack_log(sp, tsp, MAY_WRITE, rc, &ad);
 	return rc;
 }
 
@@ -1455,12 +1583,19 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
 	int sk_lbl;
 	char *hostsp;
 	struct socket_smack *ssp = sk->sk_security;
+	struct smk_audit_info ad;
 
 	rcu_read_lock();
 	hostsp = smack_host_label(sap);
 	if (hostsp != NULL) {
 		sk_lbl = SMACK_UNLABELED_SOCKET;
-		rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE);
+#ifdef CONFIG_AUDIT
+		smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+		ad.a.u.net.family = sap->sin_family;
+		ad.a.u.net.dport = sap->sin_port;
+		ad.a.u.net.v4info.daddr = sap->sin_addr.s_addr;
+#endif
+		rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE, &ad);
 	} else {
 		sk_lbl = SMACK_CIPSO_SOCKET;
 		rc = 0;
@@ -1655,6 +1790,25 @@ static void smack_shm_free_security(struct shmid_kernel *shp)
 	isp->security = NULL;
 }
 
+/**
+ * smk_curacc_shm : check if current has access on shm
+ * @shp : the object
+ * @access : access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smk_curacc_shm(struct shmid_kernel *shp, int access)
+{
+	char *ssp = smack_of_shm(shp);
+	struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = shp->shm_perm.id;
+#endif
+	return smk_curacc(ssp, access, &ad);
+}
+
 /**
  * smack_shm_associate - Smack access check for shm
  * @shp: the object
@@ -1664,11 +1818,10 @@ static void smack_shm_free_security(struct shmid_kernel *shp)
  */
 static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
 {
-	char *ssp = smack_of_shm(shp);
 	int may;
 
 	may = smack_flags_to_may(shmflg);
-	return smk_curacc(ssp, may);
+	return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1680,7 +1833,6 @@ static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
  */
 static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 {
-	char *ssp;
 	int may;
 
 	switch (cmd) {
@@ -1703,9 +1855,7 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 	default:
 		return -EINVAL;
 	}
-
-	ssp = smack_of_shm(shp);
-	return smk_curacc(ssp, may);
+	return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1719,11 +1869,10 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 static int smack_shm_shmat(struct shmid_kernel *shp, char __user *shmaddr,
 			   int shmflg)
 {
-	char *ssp = smack_of_shm(shp);
 	int may;
 
 	may = smack_flags_to_may(shmflg);
-	return smk_curacc(ssp, may);
+	return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1764,6 +1913,25 @@ static void smack_sem_free_security(struct sem_array *sma)
 	isp->security = NULL;
 }
 
+/**
+ * smk_curacc_sem : check if current has access on sem
+ * @sma : the object
+ * @access : access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smk_curacc_sem(struct sem_array *sma, int access)
+{
+	char *ssp = smack_of_sem(sma);
+	struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = sma->sem_perm.id;
+#endif
+	return smk_curacc(ssp, access, &ad);
+}
+
 /**
  * smack_sem_associate - Smack access check for sem
  * @sma: the object
@@ -1773,11 +1941,10 @@ static void smack_sem_free_security(struct sem_array *sma)
  */
 static int smack_sem_associate(struct sem_array *sma, int semflg)
 {
-	char *ssp = smack_of_sem(sma);
 	int may;
 
 	may = smack_flags_to_may(semflg);
-	return smk_curacc(ssp, may);
+	return smk_curacc_sem(sma, may);
 }
 
 /**
@@ -1789,7 +1956,6 @@ static int smack_sem_associate(struct sem_array *sma, int semflg)
  */
 static int smack_sem_semctl(struct sem_array *sma, int cmd)
 {
-	char *ssp;
 	int may;
 
 	switch (cmd) {
@@ -1818,8 +1984,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd)
 		return -EINVAL;
 	}
 
-	ssp = smack_of_sem(sma);
-	return smk_curacc(ssp, may);
+	return smk_curacc_sem(sma, may);
 }
 
 /**
@@ -1836,9 +2001,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd)
 static int smack_sem_semop(struct sem_array *sma, struct sembuf *sops,
 			   unsigned nsops, int alter)
 {
-	char *ssp = smack_of_sem(sma);
-
-	return smk_curacc(ssp, MAY_READWRITE);
+	return smk_curacc_sem(sma, MAY_READWRITE);
 }
 
 /**
@@ -1879,6 +2042,25 @@ static char *smack_of_msq(struct msg_queue *msq)
 	return (char *)msq->q_perm.security;
 }
 
+/**
+ * smk_curacc_msq : helper to check if current has access on msq
+ * @msq : the msq
+ * @access : access requested
+ *
+ * return 0 if current has access, error otherwise
+ */
+static int smk_curacc_msq(struct msg_queue *msq, int access)
+{
+	char *msp = smack_of_msq(msq);
+	struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = msq->q_perm.id;
+#endif
+	return smk_curacc(msp, access, &ad);
+}
+
 /**
  * smack_msg_queue_associate - Smack access check for msg_queue
  * @msq: the object
@@ -1888,11 +2070,10 @@ static char *smack_of_msq(struct msg_queue *msq)
  */
 static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
 {
-	char *msp = smack_of_msq(msq);
 	int may;
 
 	may = smack_flags_to_may(msqflg);
-	return smk_curacc(msp, may);
+	return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1904,7 +2085,6 @@ static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
  */
 static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 {
-	char *msp;
 	int may;
 
 	switch (cmd) {
@@ -1926,8 +2106,7 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 		return -EINVAL;
 	}
 
-	msp = smack_of_msq(msq);
-	return smk_curacc(msp, may);
+	return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1941,11 +2120,10 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
 				  int msqflg)
 {
-	char *msp = smack_of_msq(msq);
-	int rc;
+	int may;
 
-	rc = smack_flags_to_may(msqflg);
-	return smk_curacc(msp, rc);
+	may = smack_flags_to_may(msqflg);
+	return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1961,9 +2139,7 @@ static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
 static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
 			struct task_struct *target, long type, int mode)
 {
-	char *msp = smack_of_msq(msq);
-
-	return smk_curacc(msp, MAY_READWRITE);
+	return smk_curacc_msq(msq, MAY_READWRITE);
 }
 
 /**
@@ -1976,10 +2152,14 @@ static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
 static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
 {
 	char *isp = ipp->security;
-	int may;
+	int may = smack_flags_to_may(flag);
+	struct smk_audit_info ad;
 
-	may = smack_flags_to_may(flag);
-	return smk_curacc(isp, may);
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+	ad.a.u.ipc_id = ipp->id;
+#endif
+	return smk_curacc(isp, may, &ad);
 }
 
 /**
@@ -2238,8 +2418,12 @@ static int smack_unix_stream_connect(struct socket *sock,
 {
 	struct inode *sp = SOCK_INODE(sock);
 	struct inode *op = SOCK_INODE(other);
+	struct smk_audit_info ad;
 
-	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	smk_ad_setfield_u_net_sk(&ad, other->sk);
+	return smk_access(smk_of_inode(sp), smk_of_inode(op),
+				 MAY_READWRITE, &ad);
 }
 
 /**
@@ -2254,8 +2438,11 @@ static int smack_unix_may_send(struct socket *sock, struct socket *other)
 {
 	struct inode *sp = SOCK_INODE(sock);
 	struct inode *op = SOCK_INODE(other);
+	struct smk_audit_info ad;
 
-	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE);
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	smk_ad_setfield_u_net_sk(&ad, other->sk);
+	return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE, &ad);
 }
 
 /**
@@ -2370,7 +2557,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	char smack[SMK_LABELLEN];
 	char *csp;
 	int rc;
-
+	struct smk_audit_info ad;
 	if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
 		return 0;
 
@@ -2388,13 +2575,19 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	netlbl_secattr_destroy(&secattr);
 
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	ad.a.u.net.family = sk->sk_family;
+	ad.a.u.net.netif = skb->iif;
+	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
+#endif
 	/*
 	 * Receiving a packet requires that the other end
 	 * be able to write here. Read access is not required.
 	 * This is the simplist possible security model
 	 * for networking.
 	 */
-	rc = smk_access(csp, ssp->smk_in, MAY_WRITE);
+	rc = smk_access(csp, ssp->smk_in, MAY_WRITE, &ad);
 	if (rc != 0)
 		netlbl_skbuff_err(skb, rc, 0);
 	return rc;
@@ -2523,6 +2716,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 	struct iphdr *hdr;
 	char smack[SMK_LABELLEN];
 	int rc;
+	struct smk_audit_info ad;
 
 	/* handle mapped IPv4 packets arriving via IPv6 sockets */
 	if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
@@ -2536,11 +2730,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
 		strncpy(smack, smack_known_huh.smk_known, SMK_MAXLEN);
 	netlbl_secattr_destroy(&secattr);
 
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+	ad.a.u.net.family = family;
+	ad.a.u.net.netif = skb->iif;
+	ipv4_skb_to_auditdata(skb, &ad.a, NULL);
+#endif
 	/*
 	 * Receiving a packet requires that the other end be able to write
 	 * here. Read access is not required.
 	 */
-	rc = smk_access(smack, ssp->smk_in, MAY_WRITE);
+	rc = smk_access(smack, ssp->smk_in, MAY_WRITE, &ad);
 	if (rc != 0)
 		return rc;
 
@@ -2642,6 +2842,7 @@ static int smack_key_permission(key_ref_t key_ref,
 				const struct cred *cred, key_perm_t perm)
 {
 	struct key *keyp;
+	struct smk_audit_info ad;
 
 	keyp = key_ref_to_ptr(key_ref);
 	if (keyp == NULL)
@@ -2657,8 +2858,13 @@ static int smack_key_permission(key_ref_t key_ref,
 	 */
 	if (cred->security == NULL)
 		return -EACCES;
-
-	return smk_access(cred->security, keyp->security, MAY_READWRITE);
+#ifdef CONFIG_AUDIT
+	smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
+	ad.a.u.key_struct.key = keyp->serial;
+	ad.a.u.key_struct.key_desc = keyp->description;
+#endif
+	return smk_access(cred->security, keyp->security,
+				 MAY_READWRITE, &ad);
 }
 #endif /* CONFIG_KEYS */
 
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index e03a7e19c73b3..904af3483286f 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -41,6 +41,7 @@ enum smk_inos {
 	SMK_AMBIENT	= 7,	/* internet ambient label */
 	SMK_NETLBLADDR	= 8,	/* single label hosts */
 	SMK_ONLYCAP	= 9,	/* the only "capable" label */
+	SMK_LOGGING	= 10,	/* logging */
 };
 
 /*
@@ -1191,6 +1192,69 @@ static const struct file_operations smk_onlycap_ops = {
 	.write		= smk_write_onlycap,
 };
 
+/**
+ * smk_read_logging - read() for /smack/logging
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @cn: maximum to send along
+ * @ppos: where to start
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_logging(struct file *filp, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char temp[32];
+	ssize_t rc;
+
+	if (*ppos != 0)
+		return 0;
+
+	sprintf(temp, "%d\n", log_policy);
+	rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+	return rc;
+}
+
+/**
+ * smk_write_logging - write() for /smack/logging
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: where to start
+ *
+ * Returns number of bytes written or error code, as appropriate
+ */
+static ssize_t smk_write_logging(struct file *file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char temp[32];
+	int i;
+
+	if (!capable(CAP_MAC_ADMIN))
+		return -EPERM;
+
+	if (count >= sizeof(temp) || count == 0)
+		return -EINVAL;
+
+	if (copy_from_user(temp, buf, count) != 0)
+		return -EFAULT;
+
+	temp[count] = '\0';
+
+	if (sscanf(temp, "%d", &i) != 1)
+		return -EINVAL;
+	if (i < 0 || i > 3)
+		return -EINVAL;
+	log_policy = i;
+	return count;
+}
+
+
+
+static const struct file_operations smk_logging_ops = {
+	.read		= smk_read_logging,
+	.write		= smk_write_logging,
+};
 /**
  * smk_fill_super - fill the /smackfs superblock
  * @sb: the empty superblock
@@ -1221,6 +1285,8 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
 			{"netlabel", &smk_netlbladdr_ops, S_IRUGO|S_IWUSR},
 		[SMK_ONLYCAP]	=
 			{"onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR},
+		[SMK_LOGGING]	=
+			{"logging", &smk_logging_ops, S_IRUGO|S_IWUSR},
 		/* last one */ {""}
 	};
 
-- 
GitLab


From 7ba5c840e64d4a967379f1ae3eca73278180b11d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 Apr 2009 21:31:17 -0700
Subject: [PATCH 0385/2198] rcu: Add __rcu_pending tracing to hierarchical RCU

Add tracing to __rcu_pending() to provide information on why RCU
processing was kicked off.  This is helpful for debugging hierarchical
RCU, and might also be helpful in learning how hierarchical RCU operates.

Located-by: Anton Blanchard <anton@au1.ibm.com>
Tested-by: Anton Blanchard <anton@au1.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: anton@samba.org
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: manfred@colorfullife.com
Cc: cl@linux-foundation.org
Cc: josht@linux.vnet.ibm.com
Cc: schamp@sgi.com
Cc: niv@us.ibm.com
Cc: dvhltc@us.ibm.com
Cc: ego@in.ibm.com
Cc: laijs@cn.fujitsu.com
Cc: rostedt@goodmis.org
Cc: peterz@infradead.org
Cc: penberg@cs.helsinki.fi
Cc: andi@firstfloor.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <1239683479943-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutree.h |  9 +++++-
 kernel/rcutree.c        | 25 ++++++++++++----
 kernel/rcutree_trace.c  | 64 ++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 58b2aa5312b9c..5a5153806c425 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -161,8 +161,15 @@ struct rcu_data {
 	unsigned long offline_fqs;	/* Kicked due to being offline. */
 	unsigned long resched_ipi;	/* Sent a resched IPI. */
 
-	/* 5) For future __rcu_pending statistics. */
+	/* 5) __rcu_pending() statistics. */
 	long n_rcu_pending;		/* rcu_pending() calls since boot. */
+	long n_rp_qs_pending;
+	long n_rp_cb_ready;
+	long n_rp_cpu_needs_gp;
+	long n_rp_gp_completed;
+	long n_rp_gp_started;
+	long n_rp_need_fqs;
+	long n_rp_need_nothing;
 
 	int cpu;
 };
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d2a372fb0b9b5..0dccfbba6d267 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	check_cpu_stall(rsp, rdp);
 
 	/* Is the RCU core waiting for a quiescent state from this CPU? */
-	if (rdp->qs_pending)
+	if (rdp->qs_pending) {
+		rdp->n_rp_qs_pending++;
 		return 1;
+	}
 
 	/* Does this CPU have callbacks ready to invoke? */
-	if (cpu_has_callbacks_ready_to_invoke(rdp))
+	if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+		rdp->n_rp_cb_ready++;
 		return 1;
+	}
 
 	/* Has RCU gone idle with this CPU needing another grace period? */
-	if (cpu_needs_another_gp(rsp, rdp))
+	if (cpu_needs_another_gp(rsp, rdp)) {
+		rdp->n_rp_cpu_needs_gp++;
 		return 1;
+	}
 
 	/* Has another RCU grace period completed?  */
-	if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+	if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+		rdp->n_rp_gp_completed++;
 		return 1;
+	}
 
 	/* Has a new RCU grace period started? */
-	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+	if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+		rdp->n_rp_gp_started++;
 		return 1;
+	}
 
 	/* Has an RCU GP gone long enough to send resched IPIs &c? */
 	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
-	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
+	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+		rdp->n_rp_need_fqs++;
 		return 1;
+	}
 
 	/* nothing to do */
+	rdp->n_rp_need_nothing++;
 	return 0;
 }
 
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 4b1875ba94044..fe1dcdbf1ca34 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
 	.release = single_release,
 };
 
-static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+	seq_printf(m, "%3d%cnp=%ld "
+		   "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+		   rdp->cpu,
+		   cpu_is_offline(rdp->cpu) ? '!' : ' ',
+		   rdp->n_rcu_pending,
+		   rdp->n_rp_qs_pending,
+		   rdp->n_rp_cb_ready,
+		   rdp->n_rp_cpu_needs_gp,
+		   rdp->n_rp_gp_completed,
+		   rdp->n_rp_gp_started,
+		   rdp->n_rp_need_fqs,
+		   rdp->n_rp_need_nothing);
+}
+
+static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+{
+	int cpu;
+	struct rcu_data *rdp;
+
+	for_each_possible_cpu(cpu) {
+		rdp = rsp->rda[cpu];
+		if (rdp->beenonline)
+			print_one_rcu_pending(m, rdp);
+	}
+}
+
+static int show_rcu_pending(struct seq_file *m, void *unused)
+{
+	seq_puts(m, "rcu:\n");
+	print_rcu_pendings(m, &rcu_state);
+	seq_puts(m, "rcu_bh:\n");
+	print_rcu_pendings(m, &rcu_bh_state);
+	return 0;
+}
+
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, show_rcu_pending, NULL);
+}
+
+static struct file_operations rcu_pending_fops = {
+	.owner = THIS_MODULE,
+	.open = rcu_pending_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+static struct dentry *rcudir;
+static struct dentry *datadir;
+static struct dentry *datadir_csv;
+static struct dentry *gpdir;
+static struct dentry *hierdir;
+static struct dentry *rcu_pendingdir;
+
 static int __init rcuclassic_trace_init(void)
 {
 	rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
 						NULL, &rcuhier_fops);
 	if (!hierdir)
 		goto free_out;
+
+	rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
+						NULL, &rcu_pending_fops);
+	if (!rcu_pendingdir)
+		goto free_out;
 	return 0;
 free_out:
 	if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
 	debugfs_remove(datadir_csv);
 	debugfs_remove(gpdir);
 	debugfs_remove(hierdir);
+	debugfs_remove(rcu_pendingdir);
 	debugfs_remove(rcudir);
 }
 
-- 
GitLab


From 6fd9b3a40b82081d9e6490b0d7cd656e9a78a134 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 13 Apr 2009 21:31:18 -0700
Subject: [PATCH 0386/2198] rcu: Update RCU tracing documentation for
 __rcu_pending

This patch updates the RCU documentation to reflect the changes in
tracing made in the previous patch in the set.

Located-by: Anton Blanchard <anton@au1.ibm.com>
Tested-by: Anton Blanchard <anton@au1.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: anton@samba.org
Cc: akpm@linux-foundation.org
Cc: dipankar@in.ibm.com
Cc: manfred@colorfullife.com
Cc: cl@linux-foundation.org
Cc: josht@linux.vnet.ibm.com
Cc: schamp@sgi.com
Cc: niv@us.ibm.com
Cc: dvhltc@us.ibm.com
Cc: ego@in.ibm.com
Cc: laijs@cn.fujitsu.com
Cc: rostedt@goodmis.org
Cc: peterz@infradead.org
Cc: penberg@cs.helsinki.fi
Cc: andi@firstfloor.org
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
LKML-Reference: <12396834792865-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/RCU/trace.txt | 102 ++++++++++++++++++++++++++++--------
 1 file changed, 80 insertions(+), 22 deletions(-)

diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt
index 068848240a8bd..02cced183b2d6 100644
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
 The output of "cat rcu/rcudata" looks as follows:
 
 rcu:
-  0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
-  1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
-  2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
-  3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
-  4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
-  5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
-  6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
-  7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
+rcu:
+  0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
+  1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
+  2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
+  3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
+  4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
+  5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
+  6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
+  7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
 rcu_bh:
-  0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
-  1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
-  2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
-  3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
-  4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
-  6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+  0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
+  1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
+  2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
+  4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
 
 The first section lists the rcu_data structures for rcu, the second for
 rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
@@ -253,12 +254,6 @@ o	"pqc" indicates which grace period the last-observed quiescent
 o	"qp" indicates that RCU still expects a quiescent state from
 	this CPU.
 
-o	"rpfq" is the number of rcu_pending() calls on this CPU required
-	to induce this CPU to invoke force_quiescent_state().
-
-o	"rp" is low-order four hex digits of the count of how many times
-	rcu_pending() has been invoked on this CPU.
-
 o	"dt" is the current value of the dyntick counter that is incremented
 	when entering or leaving dynticks idle state, either by the
 	scheduler or by irq.  The number after the "/" is the interrupt
@@ -305,6 +300,9 @@ o	"b" is the batch limit for this CPU.  If more than this number
 	of RCU callbacks is ready to invoke, then the remainder will
 	be deferred.
 
+There is also an rcu/rcudata.csv file with the same information in
+comma-separated-variable spreadsheet format.
+
 
 The output of "cat rcu/rcugp" looks as follows:
 
@@ -411,3 +409,63 @@ o	Each element of the form "1/1 0:127 ^0" represents one struct
 		For example, the first entry at the lowest level shows
 		"^0", indicating that it corresponds to bit zero in
 		the first entry at the middle level.
+
+
+The output of "cat rcu/rcu_pending" looks as follows:
+
+rcu:
+  0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
+  1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
+  2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
+  3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
+  4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
+  5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
+  6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
+  7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
+rcu_bh:
+  0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
+  1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
+  2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
+  3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
+  4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
+  5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
+  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
+  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
+
+As always, this is once again split into "rcu" and "rcu_bh" portions.
+The fields are as follows:
+
+o	"np" is the number of times that __rcu_pending() has been invoked
+	for the corresponding flavor of RCU.
+
+o	"qsp" is the number of times that the RCU was waiting for a
+	quiescent state from this CPU.
+
+o	"cbr" is the number of times that this CPU had RCU callbacks
+	that had passed through a grace period, and were thus ready
+	to be invoked.
+
+o	"cng" is the number of times that this CPU needed another
+	grace period while RCU was idle.
+
+o	"gpc" is the number of times that an old grace period had
+	completed, but this CPU was not yet aware of it.
+
+o	"gps" is the number of times that a new grace period had started,
+	but this CPU was not yet aware of it.
+
+o	"nf" is the number of times that this CPU suspected that the
+	current grace period had run for too long, and thus needed to
+	be forced.
+
+	Please note that "forcing" consists of sending resched IPIs
+	to holdout CPUs.  If that CPU really still is in an old RCU
+	read-side critical section, then we really do have to wait for it.
+	The assumption behing "forcing" is that the CPU is not still in
+	an old RCU read-side critical section, but has not yet responded
+	for some other reason.
+
+o	"nn" is the number of times that this CPU needed nothing.  Alert
+	readers will note that the rcu "nn" number for a given CPU very
+	closely matches the rcu_bh "np" number for that same CPU.  This
+	is due to short-circuit evaluation in rcu_pending().
-- 
GitLab


From 66aa230e437d89ca56224135f617e2d8e391a3ef Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 14 Apr 2009 12:54:29 +0530
Subject: [PATCH 0387/2198] x86: page_types.h unification of declarations

Impact: unification of declarations, cleanup

Unification of declarations:
 moved init_memory_mapping, initmem_init and free_initmem from
page_XX_types.h to page_types.h

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1239693869.3033.31.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/page_32_types.h | 4 ----
 arch/x86/include/asm/page_64_types.h | 6 ------
 arch/x86/include/asm/page_types.h    | 6 ++++++
 3 files changed, 6 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 0f915ae649a71..6f1b7331313f1 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
 extern int sysctl_legacy_va_layout;
 
 extern void find_low_pfn_range(void);
-extern unsigned long init_memory_mapping(unsigned long start,
-					 unsigned long end);
-extern void initmem_init(unsigned long, unsigned long);
-extern void free_initmem(void);
 extern void setup_bootmem_allocator(void);
 
 #endif	/* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b702483..3f587188ae68a 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -71,12 +71,6 @@ extern unsigned long __phys_addr(unsigned long);
 
 #define vmemmap ((struct page *)VMEMMAP_START)
 
-extern unsigned long init_memory_mapping(unsigned long start,
-					 unsigned long end);
-
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
-
 extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
 extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
 
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 826ad37006ab1..6473f5ccff859 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+extern unsigned long init_memory_mapping(unsigned long start,
+					 unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
 #endif	/* !__ASSEMBLY__ */
 
 #endif	/* _ASM_X86_PAGE_DEFS_H */
-- 
GitLab


From e7d43a74cb07cbc4b8e9b5e4a914816b33fb0719 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 14 Apr 2009 13:18:28 +0530
Subject: [PATCH 0388/2198] x86: avoid multiple declaration of
 kstack_depth_to_print

Impact: cleanup

asm/stacktrace.h is more appropriate so removing other 2 declarations.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
LKML-Reference: <1239695308.3033.34.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/traps.h | 1 -
 arch/x86/kernel/dumpstack.h  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b869..9aa3ab2620554 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -74,7 +74,6 @@ static inline int get_si_code(unsigned long condition)
 }
 
 extern int panic_on_unrecovered_nmi;
-extern int kstack_depth_to_print;
 
 void math_error(void __user *);
 void math_emulate(struct math_emu_info *);
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index da87590b8698a..81086c227ab7c 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		unsigned long *sp, unsigned long bp, char *log_lvl);
 
 extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
 
 /* The form of the top of the frame on the stack */
 struct stack_frame {
-- 
GitLab


From f711f6090a81cbd396b63de90f415d33f563af9b Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 14 Apr 2009 10:25:30 +0530
Subject: [PATCH 0389/2198] sched: Nominate idle load balancer from a semi-idle
 package.

Currently the nomination of idle-load balancer is done by choosing the first
idle cpu in the nohz.cpu_mask. This may not be power-efficient, since
such an idle cpu could come from a completely idle core/package thereby
preventing the whole core/package from being in a low-power state.

For eg, consider a quad-core dual package system. The cpu numbering need
not be sequential and can something like [0, 2, 4, 6] and [1, 3, 5, 7].
With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep
as fewer Packages/Cores active. But the current idle load balancer logic
goes against this by choosing the first_cpu in the nohz.cpu_mask and not
taking the system topology into consideration.

Improve the algorithm to nominate the idle load balancer from a semi idle
cores/packages thereby increasing the probability of the cores/packages being
in deeper sleep states for longer duration.

The algorithm is activated only when sched_mc/smt_power_savings != 0.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 127 +++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 118 insertions(+), 9 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 5724508c3b66b..b0fefa300b401 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
 	atomic_t load_balancer;
 	cpumask_var_t cpu_mask;
+	cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
 	.load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:	The cpu whose lowest level of sched domain is to
+ *		be returned.
+ * @flag:	The flag to check for the lowest sched_domain
+ *		for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+	struct sched_domain *sd;
+
+	for_each_domain(cpu, sd)
+		if (sd && (sd->flags & flag))
+			break;
+
+	return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:	The cpu whose domains we're iterating over.
+ * @sd:		variable holding the value of the power_savings_sd
+ *		for cpu.
+ * @flag:	The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+	for (sd = lowest_flag_domain(cpu, flag); \
+		(sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:	group to be checked for semi-idleness
+ *
+ * Returns:	1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+	cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+					sched_group_cpus(ilb_group));
+
+	/*
+	 * A sched_group is semi-idle when it has atleast one busy cpu
+	 * and atleast one idle cpu.
+	 */
+	if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+		return 0;
+
+	if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+		return 0;
+
+	return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:	The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:	Returns the id of the idle load balancer if it exists,
+ *		Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+	struct sched_domain *sd;
+	struct sched_group *ilb_group;
+
+	/*
+	 * Have idle load balancer selection from semi-idle packages only
+	 * when power-aware load balancing is enabled
+	 */
+	if (!(sched_smt_power_savings || sched_mc_power_savings))
+		goto out_done;
+
+	/*
+	 * Optimize for the case when we have no idle CPUs or only one
+	 * idle CPU. Don't walk the sched_domain hierarchy in such cases
+	 */
+	if (cpumask_weight(nohz.cpu_mask) < 2)
+		goto out_done;
+
+	for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+		ilb_group = sd->groups;
+
+		do {
+			if (is_semi_idle_group(ilb_group))
+				return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+			ilb_group = ilb_group->next;
+
+		} while (ilb_group != sd->groups);
+	}
+
+out_done:
+	return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+	return first_cpu(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 		}
 
 		if (atomic_read(&nohz.load_balancer) == -1) {
-			/*
-			 * simple selection for now: Nominate the
-			 * first cpu in the nohz list to be the next
-			 * ilb owner.
-			 *
-			 * TBD: Traverse the sched domains and nominate
-			 * the nearest cpu in the nohz.cpu_mask.
-			 */
-			int ilb = cpumask_first(nohz.cpu_mask);
+			int ilb = find_new_ilb(cpu);
 
 			if (ilb < nr_cpu_ids)
 				resched_cpu(ilb);
@@ -9051,6 +9159,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
 	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
-- 
GitLab


From e790fb0ba64bfec158e1219d899cb588275d12ab Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 14 Apr 2009 10:25:35 +0530
Subject: [PATCH 0390/2198] sched: Nominate a power-efficient ilb in
 select_nohz_balancer()

The CPU that first goes idle becomes the idle-load-balancer and remains
that until either it picks up a task or till all the CPUs of the system
goes idle.

Optimize this further to allow it to relinquish it's post
once all it's siblings in the power-aware sched_domain go idle, thereby
allowing the whole package-core to go idle. While relinquising the post,
nominate another an idle-load balancer from a semi-idle core/package.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090414045535.7645.31641.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index b0fefa300b401..36d213bca473f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4414,8 +4414,24 @@ int select_nohz_load_balancer(int stop_tick)
 			/* make me the ilb owner */
 			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
 				return 1;
-		} else if (atomic_read(&nohz.load_balancer) == cpu)
+		} else if (atomic_read(&nohz.load_balancer) == cpu) {
+			int new_ilb;
+
+			if (!(sched_smt_power_savings ||
+						sched_mc_power_savings))
+				return 1;
+			/*
+			 * Check to see if there is a more power-efficient
+			 * ilb.
+			 */
+			new_ilb = find_new_ilb(cpu);
+			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+				atomic_set(&nohz.load_balancer, -1);
+				resched_cpu(new_ilb);
+				return 0;
+			}
 			return 1;
+		}
 	} else {
 		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
 			return 0;
-- 
GitLab


From 6424fb38667fffbbb1b90be0ffd9a0c540db6a4b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 13 Apr 2009 23:51:46 -0700
Subject: [PATCH 0391/2198] x86: remove (null) in /sys kernel_page_tables

Impact: cleanup

%p prints out 0x000000000000000 as (null)
so use %lx instead.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49E43282.1090607@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/dump_pagetables.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index e7277cbcfb40e..a725b7f760ae4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 		   st->current_address >= st->marker[1].start_address) {
 		const char *unit = units;
 		unsigned long delta;
+		int width = sizeof(unsigned long) * 2;
 
 		/*
 		 * Now print the actual finished series
 		 */
-		seq_printf(m, "0x%p-0x%p   ",
-			   (void *)st->start_address,
-			   (void *)st->current_address);
+		seq_printf(m, "0x%0*lx-0x%0*lx   ",
+			   width, st->start_address,
+			   width, st->current_address);
 
 		delta = (st->current_address - st->start_address) >> 10;
 		while (!(delta & 1023) && unit[1]) {
-- 
GitLab


From 7e05575c422d45f393c2d9b5900e97a30bf69bea Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 14 Apr 2009 12:12:29 +0900
Subject: [PATCH 0392/2198] x86: calgary: remove IOMMU_DEBUG

CONFIG_IOMMU_DEBUG has depends on CONFIG_GART_IOMMU:

config IOMMU_DEBUG
	bool "Enable IOMMU debugging"
	depends on GART_IOMMU && DEBUG_KERNEL
	depends on X86_64

So it's not useful to have CONFIG_IOMMU_DEBUG in Calgary IOMMU code,
which does the extra checking of the bitmap space management.

And Calgary uses the iommu helper for the bitmap space management now
so it would be better to have the extra checking feature in the iommu
helper rather than Calgary code (if necessary).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Muli Ben-Yehuda <muli@il.ibm.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: alexisb@us.ibm.com
LKML-Reference: <20090414120827G.fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-calgary_64.c | 54 ++------------------------------
 1 file changed, 2 insertions(+), 52 deletions(-)

diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 755c21e906f3f..971a3bec47a86 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
 
 static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
 
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-	int expected, unsigned long start, unsigned long end)
-{
-	unsigned long idx = start;
-
-	BUG_ON(start >= end);
-
-	while (idx < end) {
-		if (!!test_bit(idx, bitmap) != expected)
-			return idx;
-		++idx;
-	}
-
-	/* all bits have the expected value */
-	return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-	int expected, unsigned long start, unsigned long end)
-{
-	return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
 static inline int translation_enabled(struct iommu_table *tbl)
 {
 	/* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 {
 	unsigned long index;
 	unsigned long end;
-	unsigned long badbit;
 	unsigned long flags;
 
 	index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	badbit = verify_bit_range(tbl->it_map, 0, index, end);
-	if (badbit != ~0UL) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "Calgary: entry already allocated at "
-			       "0x%lx tbl %p dma 0x%lx npages %u\n",
-			       badbit, tbl, start_addr, npages);
-	}
-
 	iommu_area_reserve(tbl->it_map, index, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	unsigned int npages)
 {
 	unsigned long entry;
-	unsigned long badbit;
 	unsigned long badend;
 	unsigned long flags;
 
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 
 	spin_lock_irqsave(&tbl->it_lock, flags);
 
-	badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
-	if (badbit != ~0UL) {
-		if (printk_ratelimit())
-			printk(KERN_ERR "Calgary: bit is off at 0x%lx "
-			       "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
-			       badbit, tbl, dma_addr, entry, npages);
-	}
-
 	iommu_area_free(tbl->it_map, entry, npages);
 
 	spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
 		iommu_detected = 1;
 		calgary_detected = 1;
 		printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
-		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
-		       "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
-		       debugging ? "enabled" : "disabled");
+		printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+		       specified_table_size);
 
 		/* swiotlb for devices that aren't behind the Calgary. */
 		if (max_pfn > MAX_DMA32_PFN)
-- 
GitLab


From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 08:54:16 -0400
Subject: [PATCH 0393/2198] tracing: consolidate trace and trace_event headers

Impact: clean up

Neil Horman (et. al.) criticized the way the trace events were broken up
into two files. The reason for that was that ftrace needed to separate out
the declarations from where the #include <linux/tracepoint.h> was used.
It then dawned on me that the tracepoint.h header only needs to define the
TRACE_EVENT macro if it is not already defined.

The solution is simply to test if TRACE_EVENT is defined, and if it is not
then the linux/tracepoint.h header can define it. This change consolidates
all the <traces>.h and <traces>_event_types.h into the <traces>.h file.

Reported-by: Neil Horman <nhorman@tuxdriver.com>
Reported-by: Theodore Tso <tytso@mit.edu>
Reported-by: Jiaying Zhang <jiayingz@google.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h          |   9 +-
 include/trace/irq.h                 |  51 ++++-
 include/trace/irq_event_types.h     |  55 -----
 include/trace/kmem.h                | 189 +++++++++++++++-
 include/trace/lockdep.h             |  52 ++++-
 include/trace/lockdep_event_types.h |  57 -----
 include/trace/sched.h               | 333 ++++++++++++++++++++++++++-
 include/trace/sched_event_types.h   | 337 ----------------------------
 include/trace/skb.h                 |  36 ++-
 include/trace/skb_event_types.h     |  38 ----
 include/trace/trace_event_types.h   |   7 -
 kernel/trace/events.c               |   1 +
 kernel/trace/trace_events_stage_1.h |   4 +-
 kernel/trace/trace_events_stage_2.h |   8 +-
 kernel/trace/trace_events_stage_3.h |   4 +-
 15 files changed, 663 insertions(+), 518 deletions(-)
 delete mode 100644 include/trace/irq_event_types.h
 delete mode 100644 include/trace/lockdep_event_types.h
 delete mode 100644 include/trace/sched_event_types.h
 delete mode 100644 include/trace/skb_event_types.h
 delete mode 100644 include/trace/trace_event_types.h

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index d35a7ee7611fe..4353f3f7e624a 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -31,6 +31,8 @@ struct tracepoint {
 					 * Keep in sync with vmlinux.lds.h.
 					 */
 
+#ifndef DECLARE_TRACE
+
 #define TP_PROTO(args...)	args
 #define TP_ARGS(args...)		args
 
@@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
 	struct tracepoint *end)
 { }
 #endif /* CONFIG_TRACEPOINTS */
+#endif /* DECLARE_TRACE */
 
 /*
  * Connect a probe to a tracepoint.
@@ -154,10 +157,13 @@ static inline void tracepoint_synchronize_unregister(void)
 }
 
 #define PARAMS(args...) args
+
+#ifndef TRACE_FORMAT
 #define TRACE_FORMAT(name, proto, args, fmt)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
 
-
+#ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
  *
@@ -262,5 +268,6 @@ static inline void tracepoint_synchronize_unregister(void)
 
 #define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
 
 #endif
diff --git a/include/trace/irq.h b/include/trace/irq.h
index ff5d4495dc37c..04ab4c6522256 100644
--- a/include/trace/irq.h
+++ b/include/trace/irq.h
@@ -1,9 +1,54 @@
-#ifndef _TRACE_IRQ_H
+#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_IRQ_H
 
-#include <linux/interrupt.h>
 #include <linux/tracepoint.h>
+#include <linux/interrupt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq
+
+/*
+ * Tracepoint for entry of interrupt handler:
+ */
+TRACE_FORMAT(irq_handler_entry,
+	TP_PROTO(int irq, struct irqaction *action),
+	TP_ARGS(irq, action),
+	TP_FMT("irq=%d handler=%s", irq, action->name)
+	);
+
+/*
+ * Tracepoint for return of an interrupt handler:
+ */
+TRACE_EVENT(irq_handler_exit,
+
+	TP_PROTO(int irq, struct irqaction *action, int ret),
+
+	TP_ARGS(irq, action, ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	irq	)
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->irq	= irq;
+		__entry->ret	= ret;
+	),
+
+	TP_printk("irq=%d return=%s",
+		  __entry->irq, __entry->ret ? "handled" : "unhandled")
+);
+
+TRACE_FORMAT(softirq_entry,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
 
-#include <trace/irq_event_types.h>
+TRACE_FORMAT(softirq_exit,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
 
 #endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
deleted file mode 100644
index 85964ebd47ec5..0000000000000
--- a/include/trace/irq_event_types.h
+++ /dev/null
@@ -1,55 +0,0 @@
-
-/* use <trace/irq.h> instead */
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM irq
-
-/*
- * Tracepoint for entry of interrupt handler:
- */
-TRACE_FORMAT(irq_handler_entry,
-	TP_PROTO(int irq, struct irqaction *action),
-	TP_ARGS(irq, action),
-	TP_FMT("irq=%d handler=%s", irq, action->name)
-	);
-
-/*
- * Tracepoint for return of an interrupt handler:
- */
-TRACE_EVENT(irq_handler_exit,
-
-	TP_PROTO(int irq, struct irqaction *action, int ret),
-
-	TP_ARGS(irq, action, ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	irq	)
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->irq	= irq;
-		__entry->ret	= ret;
-	),
-
-	TP_printk("irq=%d return=%s",
-		  __entry->irq, __entry->ret ? "handled" : "unhandled")
-);
-
-TRACE_FORMAT(softirq_entry,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-TRACE_FORMAT(softirq_exit,
-	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/kmem.h b/include/trace/kmem.h
index 46efc2423f03f..d7d12189e5c8c 100644
--- a/include/trace/kmem.h
+++ b/include/trace/kmem.h
@@ -1,9 +1,192 @@
-#ifndef _TRACE_KMEM_H
+#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_KMEM_H
 
 #include <linux/types.h>
 #include <linux/tracepoint.h>
 
-#include <trace/kmem_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
 
-#endif /* _TRACE_KMEM_H */
+TRACE_EVENT(kmalloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmem_cache_alloc,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags)
+);
+
+TRACE_EVENT(kmalloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kmem_cache_alloc_node,
+
+	TP_PROTO(unsigned long call_site,
+		 const void *ptr,
+		 size_t bytes_req,
+		 size_t bytes_alloc,
+		 gfp_t gfp_flags,
+		 int node),
+
+	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+		__field(	size_t,		bytes_req	)
+		__field(	size_t,		bytes_alloc	)
+		__field(	gfp_t,		gfp_flags	)
+		__field(	int,		node		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+		__entry->bytes_req	= bytes_req;
+		__entry->bytes_alloc	= bytes_alloc;
+		__entry->gfp_flags	= gfp_flags;
+		__entry->node		= node;
+	),
+
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+		__entry->call_site,
+		__entry->ptr,
+		__entry->bytes_req,
+		__entry->bytes_alloc,
+		__entry->gfp_flags,
+		__entry->node)
+);
+
+TRACE_EVENT(kfree,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+TRACE_EVENT(kmem_cache_free,
+
+	TP_PROTO(unsigned long call_site, const void *ptr),
+
+	TP_ARGS(call_site, ptr),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	call_site	)
+		__field(	const void *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->call_site	= call_site;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+#endif
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
index 5ca67df87f2ac..8ee7900b38c41 100644
--- a/include/trace/lockdep.h
+++ b/include/trace/lockdep.h
@@ -1,9 +1,57 @@
-#ifndef _TRACE_LOCKDEP_H
+#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_LOCKDEP_H
 
 #include <linux/lockdep.h>
 #include <linux/tracepoint.h>
 
-#include <trace/lockdep_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lock
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_FORMAT(lock_acquire,
+	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+		int trylock, int read, int check,
+		struct lockdep_map *next_lock, unsigned long ip),
+	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+	TP_FMT("%s%s%s", trylock ? "try " : "",
+		read ? "read " : "", lock->name)
+	);
+
+TRACE_FORMAT(lock_release,
+	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TP_ARGS(lock, nested, ip),
+	TP_FMT("%s", lock->name)
+	);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_FORMAT(lock_contended,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+	TP_ARGS(lock, ip),
+	TP_FMT("%s", lock->name)
+	);
+
+TRACE_EVENT(lock_acquired,
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+	TP_ARGS(lock, ip, waittime),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(unsigned long, wait_usec)
+		__field(unsigned long, wait_nsec_rem)
+	),
+	TP_fast_assign(
+		__entry->name = lock->name;
+		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+		__entry->wait_usec = (unsigned long) waittime;
+	),
+	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+				       __entry->wait_nsec_rem)
+);
 
 #endif
+#endif
+
+#endif /* _TRACE_LOCKDEP_H */
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
deleted file mode 100644
index 863f1e4583a66..0000000000000
--- a/include/trace/lockdep_event_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_FORMAT(lock_acquire,
-	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-		int trylock, int read, int check,
-		struct lockdep_map *next_lock, unsigned long ip),
-	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TP_FMT("%s%s%s", trylock ? "try " : "",
-		read ? "read " : "", lock->name)
-	);
-
-TRACE_FORMAT(lock_release,
-	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-	TP_ARGS(lock, nested, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_FORMAT(lock_contended,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
-
-TRACE_EVENT(lock_acquired,
-	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
-
-	TP_ARGS(lock, ip, waittime),
-
-	TP_STRUCT__entry(
-		__field(const char *, name)
-		__field(unsigned long, wait_usec)
-		__field(unsigned long, wait_nsec_rem)
-	),
-	TP_fast_assign(
-		__entry->name = lock->name;
-		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
-		__entry->wait_usec = (unsigned long) waittime;
-	),
-	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
-				       __entry->wait_nsec_rem)
-);
-
-#endif
-#endif
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/sched.h b/include/trace/sched.h
index 4e372a1a29bfe..5b1cf4a28463c 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -1,9 +1,336 @@
-#ifndef _TRACE_SCHED_H
+#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
 #define _TRACE_SCHED_H
 
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 
-#include <trace/sched_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sched
 
-#endif
+/*
+ * Tracepoint for calling kthread_stop, performed to end a kthread:
+ */
+TRACE_EVENT(sched_kthread_stop,
+
+	TP_PROTO(struct task_struct *t),
+
+	TP_ARGS(t),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+		__entry->pid	= t->pid;
+	),
+
+	TP_printk("task %s:%d", __entry->comm, __entry->pid)
+);
+
+/*
+ * Tracepoint for the return value of the kthread stopping:
+ */
+TRACE_EVENT(sched_kthread_stop_ret,
+
+	TP_PROTO(int ret),
+
+	TP_ARGS(ret),
+
+	TP_STRUCT__entry(
+		__field(	int,	ret	)
+	),
+
+	TP_fast_assign(
+		__entry->ret	= ret;
+	),
+
+	TP_printk("ret %d", __entry->ret)
+);
+
+/*
+ * Tracepoint for waiting on task to unschedule:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wait_task,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p),
+
+	TP_ARGS(rq, p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	= p->pid;
+		__entry->prio	= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for waking up a task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+	TP_ARGS(rq, p, success),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	success			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->success	= success;
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
+);
+
+/*
+ * Tracepoint for waking up a new task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup_new,
+
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+	TP_ARGS(rq, p, success),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	success			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->success	= success;
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
+);
+
+/*
+ * Tracepoint for task switches, performed by the scheduler:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_switch,
+
+	TP_PROTO(struct rq *rq, struct task_struct *prev,
+		 struct task_struct *next),
+
+	TP_ARGS(rq, prev, next),
+
+	TP_STRUCT__entry(
+		__array(	char,	prev_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	prev_pid			)
+		__field(	int,	prev_prio			)
+		__array(	char,	next_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	next_pid			)
+		__field(	int,	next_prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+		__entry->prev_pid	= prev->pid;
+		__entry->prev_prio	= prev->prio;
+		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
+		__entry->next_pid	= next->pid;
+		__entry->next_prio	= next->prio;
+	),
+
+	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->next_comm, __entry->next_pid, __entry->next_prio)
+);
+
+/*
+ * Tracepoint for a task being migrated:
+ */
+TRACE_EVENT(sched_migrate_task,
+
+	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+
+	TP_ARGS(p, orig_cpu, dest_cpu),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+		__field(	int,	orig_cpu		)
+		__field(	int,	dest_cpu		)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+		__entry->orig_cpu	= orig_cpu;
+		__entry->dest_cpu	= dest_cpu;
+	),
+
+	TP_printk("task %s:%d [%d] from: %d  to: %d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->orig_cpu, __entry->dest_cpu)
+);
+
+/*
+ * Tracepoint for freeing a task:
+ */
+TRACE_EVENT(sched_process_free,
+
+	TP_PROTO(struct task_struct *p),
+
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a task exiting:
+ */
+TRACE_EVENT(sched_process_exit,
+
+	TP_PROTO(struct task_struct *p),
+
+	TP_ARGS(p),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid		= p->pid;
+		__entry->prio		= p->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a waiting task:
+ */
+TRACE_EVENT(sched_process_wait,
+
+	TP_PROTO(struct pid *pid),
+
+	TP_ARGS(pid),
+
+	TP_STRUCT__entry(
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+		__field(	int,	prio			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+		__entry->pid		= pid_nr(pid);
+		__entry->prio		= current->prio;
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for do_fork:
+ */
+TRACE_EVENT(sched_process_fork,
+
+	TP_PROTO(struct task_struct *parent, struct task_struct *child),
+
+	TP_ARGS(parent, child),
+
+	TP_STRUCT__entry(
+		__array(	char,	parent_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	parent_pid			)
+		__array(	char,	child_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	child_pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
+		__entry->parent_pid	= parent->pid;
+		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
+		__entry->child_pid	= child->pid;
+	),
+
+	TP_printk("parent %s:%d  child %s:%d",
+		__entry->parent_comm, __entry->parent_pid,
+		__entry->child_comm, __entry->child_pid)
+);
+
+/*
+ * Tracepoint for sending a signal:
+ */
+TRACE_EVENT(sched_signal_send,
+
+	TP_PROTO(int sig, struct task_struct *p),
+
+	TP_ARGS(sig, p),
+
+	TP_STRUCT__entry(
+		__field(	int,	sig			)
+		__array(	char,	comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	pid			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+		__entry->pid	= p->pid;
+		__entry->sig	= sig;
+	),
+
+	TP_printk("sig: %d  task %s:%d",
+		  __entry->sig, __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_SCHED_H */
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
deleted file mode 100644
index 63547dc1125f7..0000000000000
--- a/include/trace/sched_event_types.h
+++ /dev/null
@@ -1,337 +0,0 @@
-
-/* use <trace/sched.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM sched
-
-/*
- * Tracepoint for calling kthread_stop, performed to end a kthread:
- */
-TRACE_EVENT(sched_kthread_stop,
-
-	TP_PROTO(struct task_struct *t),
-
-	TP_ARGS(t),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
-		__entry->pid	= t->pid;
-	),
-
-	TP_printk("task %s:%d", __entry->comm, __entry->pid)
-);
-
-/*
- * Tracepoint for the return value of the kthread stopping:
- */
-TRACE_EVENT(sched_kthread_stop_ret,
-
-	TP_PROTO(int ret),
-
-	TP_ARGS(ret),
-
-	TP_STRUCT__entry(
-		__field(	int,	ret	)
-	),
-
-	TP_fast_assign(
-		__entry->ret	= ret;
-	),
-
-	TP_printk("ret %d", __entry->ret)
-);
-
-/*
- * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wait_task,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p),
-
-	TP_ARGS(rq, p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->prio	= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-	TP_ARGS(rq, p, success),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	success			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->success	= success;
-	),
-
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
-);
-
-/*
- * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup_new,
-
-	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-	TP_ARGS(rq, p, success),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	success			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->success	= success;
-	),
-
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success)
-);
-
-/*
- * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_switch,
-
-	TP_PROTO(struct rq *rq, struct task_struct *prev,
-		 struct task_struct *next),
-
-	TP_ARGS(rq, prev, next),
-
-	TP_STRUCT__entry(
-		__array(	char,	prev_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	prev_pid			)
-		__field(	int,	prev_prio			)
-		__array(	char,	next_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	next_pid			)
-		__field(	int,	next_prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
-		__entry->prev_pid	= prev->pid;
-		__entry->prev_prio	= prev->prio;
-		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
-		__entry->next_pid	= next->pid;
-		__entry->next_prio	= next->prio;
-	),
-
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
-		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->next_comm, __entry->next_pid, __entry->next_prio)
-);
-
-/*
- * Tracepoint for a task being migrated:
- */
-TRACE_EVENT(sched_migrate_task,
-
-	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
-
-	TP_ARGS(p, orig_cpu, dest_cpu),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-		__field(	int,	orig_cpu		)
-		__field(	int,	dest_cpu		)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-		__entry->orig_cpu	= orig_cpu;
-		__entry->dest_cpu	= dest_cpu;
-	),
-
-	TP_printk("task %s:%d [%d] from: %d  to: %d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->orig_cpu, __entry->dest_cpu)
-);
-
-/*
- * Tracepoint for freeing a task:
- */
-TRACE_EVENT(sched_process_free,
-
-	TP_PROTO(struct task_struct *p),
-
-	TP_ARGS(p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a task exiting:
- */
-TRACE_EVENT(sched_process_exit,
-
-	TP_PROTO(struct task_struct *p),
-
-	TP_ARGS(p),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid		= p->pid;
-		__entry->prio		= p->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a waiting task:
- */
-TRACE_EVENT(sched_process_wait,
-
-	TP_PROTO(struct pid *pid),
-
-	TP_ARGS(pid),
-
-	TP_STRUCT__entry(
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-		__field(	int,	prio			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-		__entry->pid		= pid_nr(pid);
-		__entry->prio		= current->prio;
-	),
-
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for do_fork:
- */
-TRACE_EVENT(sched_process_fork,
-
-	TP_PROTO(struct task_struct *parent, struct task_struct *child),
-
-	TP_ARGS(parent, child),
-
-	TP_STRUCT__entry(
-		__array(	char,	parent_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	parent_pid			)
-		__array(	char,	child_comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	child_pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
-		__entry->parent_pid	= parent->pid;
-		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
-		__entry->child_pid	= child->pid;
-	),
-
-	TP_printk("parent %s:%d  child %s:%d",
-		__entry->parent_comm, __entry->parent_pid,
-		__entry->child_comm, __entry->child_pid)
-);
-
-/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
-	TP_PROTO(int sig, struct task_struct *p),
-
-	TP_ARGS(sig, p),
-
-	TP_STRUCT__entry(
-		__field(	int,	sig			)
-		__array(	char,	comm,	TASK_COMM_LEN	)
-		__field(	pid_t,	pid			)
-	),
-
-	TP_fast_assign(
-		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-		__entry->pid	= p->pid;
-		__entry->sig	= sig;
-	),
-
-	TP_printk("sig: %d  task %s:%d",
-		  __entry->sig, __entry->comm, __entry->pid)
-);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/skb.h b/include/trace/skb.h
index d2de7174a6e8b..e6fd281f7f81b 100644
--- a/include/trace/skb.h
+++ b/include/trace/skb.h
@@ -1,9 +1,37 @@
-#ifndef _TRACE_SKB_H_
-#define _TRACE_SKB_H_
+#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SKB_H
 
 #include <linux/skbuff.h>
 #include <linux/tracepoint.h>
 
-#include <trace/skb_event_types.h>
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
 
-#endif
+/*
+ * Tracepoint for free an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+	TP_PROTO(struct sk_buff *skb, void *location),
+
+	TP_ARGS(skb, location),
+
+	TP_STRUCT__entry(
+		__field(	void *,		skbaddr		)
+		__field(	unsigned short,	protocol	)
+		__field(	void *,		location	)
+	),
+
+	TP_fast_assign(
+		__entry->skbaddr = skb;
+		if (skb) {
+			__entry->protocol = ntohs(skb->protocol);
+		}
+		__entry->location = location;
+	),
+
+	TP_printk("skbaddr=%p protocol=%u location=%p",
+		__entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#endif /* _TRACE_SKB_H */
diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h
deleted file mode 100644
index 4a1c504c0e167..0000000000000
--- a/include/trace/skb_event_types.h
+++ /dev/null
@@ -1,38 +0,0 @@
-
-/* use <trace/skb.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM skb
-
-/*
- * Tracepoint for free an sk_buff:
- */
-TRACE_EVENT(kfree_skb,
-
-	TP_PROTO(struct sk_buff *skb, void *location),
-
-	TP_ARGS(skb, location),
-
-	TP_STRUCT__entry(
-		__field(	void *,		skbaddr		)
-		__field(	unsigned short,	protocol	)
-		__field(	void *,		location	)
-	),
-
-	TP_fast_assign(
-		__entry->skbaddr = skb;
-		if (skb) {
-			__entry->protocol = ntohs(skb->protocol);
-		}
-		__entry->location = location;
-	),
-
-	TP_printk("skbaddr=%p protocol=%u location=%p",
-		__entry->skbaddr, __entry->protocol, __entry->location)
-);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
deleted file mode 100644
index 552a50e169a66..0000000000000
--- a/include/trace/trace_event_types.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* trace/<type>_event_types.h here */
-
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
-#include <trace/lockdep_event_types.h>
-#include <trace/skb_event_types.h>
-#include <trace/kmem_event_types.h>
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 246f2aa6dc468..5a35a914f0e2c 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -8,6 +8,7 @@
 
 #include "trace_output.h"
 
+#define TRACE_HEADER_MULTI_READ
 #include "trace_events_stage_1.h"
 #include "trace_events_stage_2.h"
 #include "trace_events_stage_3.h"
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 38985f9b379c4..475f46a047ae5 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -1,7 +1,7 @@
 /*
  * Stage 1 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * struct ftrace_raw_<call> {
  *	struct trace_entry		ent;
@@ -36,4 +36,4 @@
 	};							\
 	static struct ftrace_event_call event_##name
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 59cfd7dfe68d4..aa4a67a0656fc 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -1,7 +1,7 @@
 /*
  * Stage 2 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * enum print_line_t
  * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
@@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	return TRACE_TYPE_HANDLED;					\
 }
 	
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 /*
  * Setup the showing format of trace point.
@@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 	return ret;							\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 #undef __field
 #define __field(type, item)						\
@@ -167,4 +167,4 @@ ftrace_define_fields_##call(void)					\
 	return ret;							\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 5bb1b7ffbdb6e..45c04e1f38dbf 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -1,7 +1,7 @@
 /*
  * Stage 3 of the trace events.
  *
- * Override the macros in <trace/trace_event_types.h> to include the following:
+ * Override the macros in <trace/trace_events.h> to include the following:
  *
  * static void ftrace_event_<call>(proto)
  * {
@@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
-#include <trace/trace_event_types.h>
+#include <trace/trace_events.h>
 
 #undef _TRACE_PROFILE
 #undef _TRACE_PROFILE_INIT
-- 
GitLab


From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Tue, 14 Apr 2009 16:53:05 +0200
Subject: [PATCH 0394/2198] wait: don't use __wake_up_common()

'777c6c5 wait: prevent exclusive waiter starvation' made
__wake_up_common() global to be used from abort_exclusive_wait().

It was needed to do a wake-up with the waitqueue lock held while
passing down a key to the wake-up function.

Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and
__wake_up_sync_key()' there is an appropriate wrapper for this case:
__wake_up_locked_key().

Use it here and make __wake_up_common() private to the scheduler
again.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/wait.h | 2 --
 kernel/sched.c       | 2 +-
 kernel/wait.c        | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/linux/wait.h b/include/linux/wait.h
index 5d631c17eaee3..67d4e89b62d63 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 	list_del(&old->task_list);
 }
 
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
diff --git a/kernel/sched.c b/kernel/sched.c
index 36d213bca473f..92b4b56ad093c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
diff --git a/kernel/wait.c b/kernel/wait.c
index 42a2dbc181c89..ea7c3b4275cf3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
 	if (!list_empty(&wait->task_list))
 		list_del_init(&wait->task_list);
 	else if (waitqueue_active(q))
-		__wake_up_common(q, mode, 1, 0, key);
+		__wake_up_locked_key(q, mode, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
-- 
GitLab


From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 14 Apr 2009 11:24:36 +0200
Subject: [PATCH 0395/2198] tracing: make the trace clocks available generally

Jeremy Fitzhardinge reported this build failure:

 LD	 .tmp_vmlinux1
 arch/x86/kernel/built-in.o: In function `ds_take_timestamp':
 git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global'
 git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global'

Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y.

Expose the trace clock code to CONFIG_X86_DS as well.

[ Unfortunately librarizing doesnt work well - ancient architectures
  with no raw_local_irq_save() primitive break the build. ]

Reported-by: Jeremy Fitzhardinge <jeremy@goop.org>
LKML-Reference: <49E4413F.7070700@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/Makefile       | 1 +
 kernel/trace/Makefile | 7 ++++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/Makefile b/kernel/Makefile
index bab1dffe37e94..c8e1be5f0b001 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -92,6 +92,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 2630f5121ec12..ecc671e9f142e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
-- 
GitLab


From 72c6a9870f901045f2464c3dc6ee8914bfdc07aa Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 14 Apr 2009 17:33:57 +0200
Subject: [PATCH 0396/2198] rculist.h: introduce list_entry_rcu() and
 list_first_entry_rcu()

I've run into the situation where I need to use list_first_entry with
rcu-guarded list. This patch introduces this.

Also simplify list_for_each_entry_rcu() to use new list_entry_rcu()
instead of list_entry().

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: dipankar@in.ibm.com
LKML-Reference: <20090414153356.GC3999@psychotron.englab.brq.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rculist.h | 30 ++++++++++++++++++++++++++++--
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/linux/rculist.h b/include/linux/rculist.h
index e649bd3f2c976..5710f43bbc9ec 100644
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list,
 	at->prev = last;
 }
 
+/**
+ * list_entry_rcu - get the struct for this entry
+ * @ptr:        the &struct list_head pointer.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_entry_rcu(ptr, type, member) \
+	container_of(rcu_dereference(ptr), type, member)
+
+/**
+ * list_first_entry_rcu - get the first element from a list
+ * @ptr:        the list head to take the element from.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_first_entry_rcu(ptr, type, member) \
+	list_entry_rcu((ptr)->next, type, member)
+
 #define __list_for_each_rcu(pos, head) \
 	for (pos = rcu_dereference((head)->next); \
 		pos != (head); \
@@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_entry_rcu(pos, head, member) \
-	for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \
+	for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
 		prefetch(pos->member.next), &pos->member != (head); \
-		pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member))
+		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 
 /**
-- 
GitLab


From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 09:36:00 -0400
Subject: [PATCH 0397/2198] tracing: create automated trace defines

This patch lowers the number of places a developer must modify to add
new tracepoints. The current method to add a new tracepoint
into an existing system is to write the trace point macro in the
trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or
DECLARE_TRACE, then they must add the same named item into the C file
with the macro DEFINE_TRACE(name) and then add the trace point.

This change cuts out the needing to add the DEFINE_TRACE(name).
Every file that uses the tracepoint must still include the trace/<type>.h
file, but the one C file must also add a define before the including
of that file.

 #define CREATE_TRACE_POINTS
 #include <trace/mytrace.h>

This will cause the trace/mytrace.h file to also produce the C code
necessary to implement the trace point.

Note, if more than one trace/<type>.h is used to create the C code
it is best to list them all together.

 #define CREATE_TRACE_POINTS
 #include <trace/foo.h>
 #include <trace/bar.h>
 #include <trace/fido.h>

Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with
the cleaner solution of the define above the includes over my first
design to have the C code include a "special" header.

This patch converts sched, irq and lockdep and skb to use this new
method.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/define_trace.h | 75 ++++++++++++++++++++++++++++++++++++
 include/trace/irq.h          |  5 ++-
 include/trace/kmem.h         |  4 +-
 include/trace/lockdep.h      |  3 ++
 include/trace/sched.h        |  3 ++
 include/trace/skb.h          |  3 ++
 kernel/exit.c                |  4 --
 kernel/fork.c                |  2 -
 kernel/irq/handle.c          |  7 ++--
 kernel/kthread.c             |  3 --
 kernel/lockdep.c             | 12 ++----
 kernel/sched.c               | 10 ++---
 kernel/signal.c              |  2 -
 kernel/softirq.c             |  3 --
 mm/util.c                    | 11 ++----
 net/core/net-traces.c        |  4 +-
 16 files changed, 105 insertions(+), 46 deletions(-)
 create mode 100644 include/trace/define_trace.h

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
new file mode 100644
index 0000000000000..de9dc7d8508bb
--- /dev/null
+++ b/include/trace/define_trace.h
@@ -0,0 +1,75 @@
+/*
+ * Trace files that want to automate creationg of all tracepoints defined
+ * in their file should include this file. The following are macros that the
+ * trace file may define:
+ *
+ * TRACE_SYSTEM defines the system the tracepoint is for
+ *
+ * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h
+ *     This macro may be defined to tell define_trace.h what file to include.
+ *     Note, leave off the ".h".
+ *
+ * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace
+ *     then this macro can define the path to use. Note, the path is relative to
+ *     define_trace.h, not the file including it. Full path names for out of tree
+ *     modules must be used.
+ */
+
+#ifdef CREATE_TRACE_POINTS
+
+/* Prevent recursion */
+#undef CREATE_TRACE_POINTS
+
+#include <linux/stringify.h>
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
+	DEFINE_TRACE(name)
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(name, proto, args, print)	\
+	DEFINE_TRACE(name)
+
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)	\
+	DEFINE_TRACE(name)
+
+#undef TRACE_INCLUDE
+#undef __TRACE_INCLUDE
+
+#ifndef TRACE_INCLUDE_FILE
+# define TRACE_INCLUDE_FILE TRACE_SYSTEM
+# define UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifndef TRACE_INCLUDE_PATH
+# define __TRACE_INCLUDE(system) <trace/system.h>
+# define UNDEF_TRACE_INCLUDE_FILE
+#else
+# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
+#endif
+
+# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system)
+
+/* Let the trace headers be reread */
+#define TRACE_HEADER_MULTI_READ
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef TRACE_HEADER_MULTI_READ
+
+/* Only undef what we defined in this file */
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+/* We may be processing more files */
+#define CREATE_TRACE_POINTS
+
+#endif /* CREATE_TRACE_POINTS */
diff --git a/include/trace/irq.h b/include/trace/irq.h
index 04ab4c6522256..75e3468e44933 100644
--- a/include/trace/irq.h
+++ b/include/trace/irq.h
@@ -51,4 +51,7 @@ TRACE_FORMAT(softirq_exit,
 	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
 	);
 
-#endif
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/kmem.h b/include/trace/kmem.h
index d7d12189e5c8c..c22c42f980b5e 100644
--- a/include/trace/kmem.h
+++ b/include/trace/kmem.h
@@ -188,5 +188,7 @@ TRACE_EVENT(kmem_cache_free,
 
 	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
 );
+#endif /* _TRACE_KMEM_H */
 
-#endif
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
index 8ee7900b38c41..4d301e758de3e 100644
--- a/include/trace/lockdep.h
+++ b/include/trace/lockdep.h
@@ -55,3 +55,6 @@ TRACE_EVENT(lock_acquired,
 #endif
 
 #endif /* _TRACE_LOCKDEP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/sched.h b/include/trace/sched.h
index 5b1cf4a28463c..ffa1cab586b9b 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -334,3 +334,6 @@ TRACE_EVENT(sched_signal_send,
 );
 
 #endif /* _TRACE_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/skb.h b/include/trace/skb.h
index e6fd281f7f81b..1e8fabb57c06e 100644
--- a/include/trace/skb.h
+++ b/include/trace/skb.h
@@ -35,3 +35,6 @@ TRACE_EVENT(kfree_skb,
 );
 
 #endif /* _TRACE_SKB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index abf9cf3b95c60..2fe9d2c7eeeec 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -56,10 +56,6 @@
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd007265..4bebf26392358 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -83,8 +83,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
-DEFINE_TRACE(sched_process_fork);
-
 int nr_processes(void)
 {
 	int cpu;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d82142be8dd2c..983d8be8dff71 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,9 +17,11 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <trace/irq.h>
 #include <linux/bootmem.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/irq.h>
+
 #include "internals.h"
 
 /*
@@ -348,9 +350,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
 	       "but no thread function available.", irq, action->name);
 }
 
-DEFINE_TRACE(irq_handler_entry);
-DEFINE_TRACE(irq_handler_exit);
-
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ebaf8519abf6..e1c76924545b1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
 struct kthread_create_info
 {
 	/* Information passed to kthread() from kthreadd. */
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index c4582a6ea9534..257f21a76c52e 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,12 +42,14 @@
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
-#include <trace/lockdep.h>
 
 #include <asm/sections.h>
 
 #include "lockdep_internals.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/lockdep.h>
+
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -2929,8 +2931,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
-DEFINE_TRACE(lock_acquire);
-
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2957,8 +2957,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-DEFINE_TRACE(lock_release);
-
 void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
 {
@@ -3061,8 +3059,6 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
 	put_lock_stats(stats);
 }
 
-DEFINE_TRACE(lock_acquired);
-
 static void
 __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
@@ -3118,8 +3114,6 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 	lock->ip = ip;
 }
 
-DEFINE_TRACE(lock_contended);
-
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
diff --git a/kernel/sched.c b/kernel/sched.c
index 5724508c3b66b..e6d4518d47e00 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -72,13 +72,15 @@
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF	((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4cb..1d5703ff003c3 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -41,8 +41,6 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-DEFINE_TRACE(sched_signal_send);
-
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
 	return t->sighand->action[sig - 1].sa.sa_handler;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2fecefacdc5bc..a2d9b458ac2b5 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -186,9 +186,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-DEFINE_TRACE(softirq_entry);
-DEFINE_TRACE(softirq_exit);
-
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;
diff --git a/mm/util.c b/mm/util.c
index 2599e83eea17a..0e74a22791cb0 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,9 +4,11 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
-#include <linux/tracepoint.h>
 #include <asm/uaccess.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/kmem.h>
+
 /**
  * kstrdup - allocate space for and copy an existing string
  * @s: the string to duplicate
@@ -239,13 +241,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 /* Tracepoints definitions. */
-DEFINE_TRACE(kmalloc);
-DEFINE_TRACE(kmem_cache_alloc);
-DEFINE_TRACE(kmalloc_node);
-DEFINE_TRACE(kmem_cache_alloc_node);
-DEFINE_TRACE(kfree);
-DEFINE_TRACE(kmem_cache_free);
-
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index c8fb45665e4f4..8017720594741 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -19,11 +19,11 @@
 #include <linux/workqueue.h>
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
-#include <trace/skb.h>
 
 #include <asm/unaligned.h>
 #include <asm/bitops.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/skb.h>
 
-DEFINE_TRACE(kfree_skb);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
-- 
GitLab


From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 11 Apr 2009 12:59:57 -0400
Subject: [PATCH 0398/2198] tracing: make trace_seq operations available for
 core kernel

In the process to make TRACE_EVENT macro work for modules, the trace_seq
operations must be available for core kernel code.

These operations are quite useful and can be used for other implementations.

The main idea is that we create a trace_seq handle that acts very much
like the seq_file handle.

	struct trace_seq *s = kmalloc(sizeof(*s, GFP_KERNEL);

	trace_seq_init(s);
	trace_seq_printf(s, "some data %d\n", variable);

	printk("%s", s->buffer);

The main use is to allow a top level function call several other functions
that may store printf like data into the buffer. Then at the end, the top
level function can process all the data with any method it would like to.
It could be passed to userspace, output via printk or even use seq_file:

	trace_seq_to_user(s, ubuf, cnt);
	seq_puts(m, s->buffer);

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_seq.h   | 89 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h        | 15 +------
 kernel/trace/trace_output.h | 16 +------
 3 files changed, 92 insertions(+), 28 deletions(-)
 create mode 100644 include/linux/trace_seq.h

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
new file mode 100644
index 0000000000000..28051da876ddc
--- /dev/null
+++ b/include/linux/trace_seq.h
@@ -0,0 +1,89 @@
+#ifndef _LINUX_TRACE_SEQ_H
+#define _LINUX_TRACE_SEQ_H
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE.
+ */
+
+struct trace_seq {
+	unsigned char		buffer[PAGE_SIZE];
+	unsigned int		len;
+	unsigned int		readpos;
+};
+
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+/*
+ * Currently only defined when tracing is enabled.
+ */
+#ifdef CONFIG_TRACING
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+
+#else /* CONFIG_TRACING */
+static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)))
+{
+	return 0;
+}
+static inline int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+	return 0;
+}
+
+static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+}
+static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt)
+{
+	return 0;
+}
+static inline int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	return 0;
+}
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c);
+{
+	return 0;
+}
+static inline int
+trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
+{
+	return 0;
+}
+static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				       size_t len)
+{
+	return 0;
+}
+static inline void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+	return NULL;
+}
+static inline int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+	return 0;
+}
+#endif /* CONFIG_TRACING */
+
+#endif /* _LINUX_TRACE_SEQ_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b05b6ac982a1a..1882846b73894 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -12,6 +12,8 @@
 #include <linux/kmemtrace.h>
 #include <trace/power.h>
 
+#include <linux/trace_seq.h>
+
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
 
@@ -423,19 +425,6 @@ struct tracer {
 	struct tracer_stat	*stats;
 };
 
-struct trace_seq {
-	unsigned char		buffer[PAGE_SIZE];
-	unsigned int		len;
-	unsigned int		readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
-	s->len = 0;
-	s->readpos = 0;
-}
-
 
 #define TRACE_PIPE_ALL_CPU	-1
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 91630217fb466..5c7cbfb65c713 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -1,6 +1,7 @@
 #ifndef __TRACE_EVENTS_H
 #define __TRACE_EVENTS_H
 
+#include <linux/trace_seq.h>
 #include "trace.h"
 
 typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
@@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
-extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
-
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3)));
-extern int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-				 size_t cnt);
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
-				size_t len);
-extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
-extern int trace_seq_path(struct trace_seq *s, struct path *path);
 extern int seq_print_userip_objs(const struct userstack_entry *entry,
 				 struct trace_seq *s, unsigned long sym_flags);
 extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-- 
GitLab


From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Apr 2009 11:20:49 -0400
Subject: [PATCH 0399/2198] tracing/events: move declarations from trace
 directory to core include

In preparation to allowing trace events to happen in modules, we need
to move some of the local declarations in the kernel/trace directory
into include/linux.

This patch simply moves the declarations and performs no context changes.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 146 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h         | 120 +---------------------------
 kernel/trace/trace_output.h  |  14 ----
 3 files changed, 147 insertions(+), 133 deletions(-)
 create mode 100644 include/linux/ftrace_event.h

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
new file mode 100644
index 0000000000000..496b76d9f9d83
--- /dev/null
+++ b/include/linux/ftrace_event.h
@@ -0,0 +1,146 @@
+#ifndef _LINUX_FTRACE_EVENT_H
+#define _LINUX_FTRACE_EVENT_H
+
+#include <linux/trace_seq.h>
+#include <linux/ring_buffer.h>
+
+
+struct trace_array;
+struct tracer;
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+	unsigned char		type;
+	unsigned char		flags;
+	unsigned char		preempt_count;
+	int			pid;
+	int			tgid;
+};
+
+/*
+ * Trace iterator - used by printout routines who present trace
+ * results to users and which routines might sleep, etc:
+ */
+struct trace_iterator {
+	struct trace_array	*tr;
+	struct tracer		*trace;
+	void			*private;
+	int			cpu_file;
+	struct mutex		mutex;
+	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+
+	/* The below is zeroed out in pipe_read */
+	struct trace_seq	seq;
+	struct trace_entry	*ent;
+	int			cpu;
+	u64			ts;
+
+	unsigned long		iter_flags;
+	loff_t			pos;
+	long			idx;
+
+	cpumask_var_t		started;
+};
+
+
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+					      int flags);
+struct trace_event {
+	struct hlist_node	node;
+	int			type;
+	trace_print_func	trace;
+	trace_print_func	raw;
+	trace_print_func	hex;
+	trace_print_func	binary;
+};
+
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
+
+/* Return values for print_line callback */
+enum print_line_t {
+	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
+	TRACE_TYPE_HANDLED	= 1,
+	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
+	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
+};
+
+
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+				  unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
+
+void tracing_record_cmdline(struct task_struct *tsk);
+
+struct ftrace_event_call {
+	char			*name;
+	char			*system;
+	struct dentry		*dir;
+	int			enabled;
+	int			(*regfunc)(void);
+	void			(*unregfunc)(void);
+	int			id;
+	int			(*raw_init)(void);
+	int			(*show_format)(struct trace_seq *s);
+	int			(*define_fields)(void);
+	struct list_head	fields;
+	int			n_preds;
+	struct filter_pred	**preds;
+
+#ifdef CONFIG_EVENT_PROFILE
+	atomic_t	profile_count;
+	int		(*profile_enable)(struct ftrace_event_call *);
+	void		(*profile_disable)(struct ftrace_event_call *);
+#endif
+};
+
+#define MAX_FILTER_PRED		8
+#define MAX_FILTER_STR_VAL	128
+
+extern int init_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+					void *rec,
+					struct ring_buffer_event *event);
+
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+			      char *name, int offset, int size);
+
+
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
+#define event_trace_printk(ip, fmt, args...)				\
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
+#endif /* _LINUX_FTRACE_EVENT_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1882846b73894..6bcdf4af9b2d1 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,7 @@
 #include <trace/power.h>
 
 #include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -43,20 +44,6 @@ enum trace_type {
 	__TRACE_LAST_TYPE,
 };
 
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-	unsigned char		type;
-	unsigned char		flags;
-	unsigned char		preempt_count;
-	int			pid;
-	int			tgid;
-};
-
 /*
  * Function trace entry - function address and parent function addres:
  */
@@ -265,8 +252,6 @@ struct trace_array_cpu {
 	char			comm[TASK_COMM_LEN];
 };
 
-struct trace_iterator;
-
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
@@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void);
 		__ftrace_bad_type();					\
 	} while (0)
 
-/* Return values for print_line callback */
-enum print_line_t {
-	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
-	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
-	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
-};
-
-
 /*
  * An option specific to a tracer. This is a boolean value.
  * The bit is the bit index that sets its value on the
@@ -428,31 +404,6 @@ struct tracer {
 
 #define TRACE_PIPE_ALL_CPU	-1
 
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-	struct trace_array	*tr;
-	struct tracer		*trace;
-	void			*private;
-	int			cpu_file;
-	struct mutex		mutex;
-	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
-
-	/* The below is zeroed out in pipe_read */
-	struct trace_seq	seq;
-	struct trace_entry	*ent;
-	int			cpu;
-	u64			ts;
-
-	unsigned long		iter_flags;
-	loff_t			pos;
-	long			idx;
-
-	cpumask_var_t		started;
-};
-
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
@@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 				struct ring_buffer_event *event,
 				unsigned long flags, int pc);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
-				  unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
-					unsigned long flags, int pc);
-void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
-
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
@@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct task_struct *prev,
 				struct task_struct *next,
 				unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct task_struct *wakee,
@@ -790,28 +731,6 @@ struct ftrace_event_field {
 	int			size;
 };
 
-struct ftrace_event_call {
-	char			*name;
-	char			*system;
-	struct dentry		*dir;
-	int			enabled;
-	int			(*regfunc)(void);
-	void			(*unregfunc)(void);
-	int			id;
-	int			(*raw_init)(void);
-	int			(*show_format)(struct trace_seq *s);
-	int			(*define_fields)(void);
-	struct list_head	fields;
-	int			n_preds;
-	struct filter_pred	**preds;
-
-#ifdef CONFIG_EVENT_PROFILE
-	atomic_t	profile_count;
-	int		(*profile_enable)(struct ftrace_event_call *);
-	void		(*profile_disable)(struct ftrace_event_call *);
-#endif
-};
-
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
@@ -825,9 +744,6 @@ struct event_subsystem {
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-#define MAX_FILTER_PRED		8
-#define MAX_FILTER_STR_VAL	128
-
 struct filter_pred;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
@@ -845,9 +761,6 @@ struct filter_pred {
 	int clear;
 };
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size);
-extern int init_preds(struct ftrace_event_call *call);
 extern void filter_free_pred(struct filter_pred *pred);
 extern void filter_print_preds(struct filter_pred **preds, int n_preds,
 			       struct trace_seq *s);
@@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_disable_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
-extern int filter_current_check_discard(struct ftrace_event_call *call,
-					void *rec,
-					struct ring_buffer_event *event);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-#define __common_field(type, item)					\
-	ret = trace_define_field(event_call, #type, "common_" #item,	\
-				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
-	if (ret)							\
-		return ret;
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
@@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[];
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)				\
-do {									\
-	__trace_printk_check_format(fmt, ##args);			\
-	tracing_record_cmdline(current);				\
-	if (__builtin_constant_p(fmt)) {				\
-		static const char *trace_printk_fmt			\
-		  __attribute__((section("__trace_printk_fmt"))) =	\
-			__builtin_constant_p(fmt) ? fmt : NULL;		\
-									\
-		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
-	} else								\
-		__trace_printk(ip, fmt, ##args);			\
-} while (0)
-
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 	extern struct ftrace_event_call event_##call;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 5c7cbfb65c713..6e220a8e5706a 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -4,18 +4,6 @@
 #include <linux/trace_seq.h>
 #include "trace.h"
 
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-					      int flags);
-
-struct trace_event {
-	struct hlist_node	node;
-	int			type;
-	trace_print_func	trace;
-	trace_print_func	raw;
-	trace_print_func	hex;
-	trace_print_func	binary;
-};
-
 extern enum print_line_t
 trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
@@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
 extern struct trace_event *ftrace_find_event(int type);
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
-- 
GitLab


From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 13 Apr 2009 12:25:37 -0400
Subject: [PATCH 0400/2198] tracing/events: move the ftrace event tracing code
 to core

This patch moves the ftrace creation into include/trace/ftrace.h and
simplifies the work of developers in adding new tracepoints.
Just the act of creating the trace points in include/trace and including
define_trace.h will create the events in the debugfs/tracing/events
directory.

This patch removes the need of include/trace/trace_events.h

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/define_trace.h                  |   4 +
 .../trace/ftrace.h                            | 215 +++++++++++++++++-
 include/trace/trace_events.h                  |   7 -
 kernel/trace/Makefile                         |   1 -
 kernel/trace/events.c                         |  15 --
 kernel/trace/trace_events_stage_1.h           |  39 ----
 kernel/trace/trace_events_stage_2.h           | 170 --------------
 7 files changed, 218 insertions(+), 233 deletions(-)
 rename kernel/trace/trace_events_stage_3.h => include/trace/ftrace.h (58%)
 delete mode 100644 include/trace/trace_events.h
 delete mode 100644 kernel/trace/events.c
 delete mode 100644 kernel/trace/trace_events_stage_1.h
 delete mode 100644 kernel/trace/trace_events_stage_2.h

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index de9dc7d8508bb..980eb66a6e386 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -56,6 +56,10 @@
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+#ifdef CONFIG_EVENT_TRACER
+#include <trace/ftrace.h>
+#endif
+
 #undef TRACE_HEADER_MULTI_READ
 
 /* Only undef what we defined in this file */
diff --git a/kernel/trace/trace_events_stage_3.h b/include/trace/ftrace.h
similarity index 58%
rename from kernel/trace/trace_events_stage_3.h
rename to include/trace/ftrace.h
index 45c04e1f38dbf..955b967acd748 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/include/trace/ftrace.h
@@ -1,3 +1,216 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *	struct trace_entry		ent;
+ *	<type>				<item>;
+ *	<type2>				<item2>[<len>];
+ *	[...]
+ * };
+ *
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#include <linux/ftrace_event.h>
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)
+
+#undef __array
+#define __array(type, item, len)	type	item[len];
+
+#undef __field
+#define __field(type, item)		type	item;
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
+	struct ftrace_raw_##name {				\
+		struct trace_entry	ent;			\
+		tstruct						\
+	};							\
+	static struct ftrace_event_call event_##name
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 2 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *	struct trace_seq *s = &iter->seq;
+ *	struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *	struct trace_entry *entry;
+ *	int ret;
+ *
+ *	entry = iter->ent;
+ *
+ *	if (entry->type != event_<call>.id) {
+ *		WARN_ON_ONCE(1);
+ *		return TRACE_TYPE_UNHANDLED;
+ *	}
+ *
+ *	field = (typeof(field))entry;
+ *
+ *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	if (!ret)
+ *		return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *	return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef __entry
+#define __entry field
+
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+enum print_line_t							\
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+{									\
+	struct trace_seq *s = &iter->seq;				\
+	struct ftrace_raw_##call *field;				\
+	struct trace_entry *entry;					\
+	int ret;							\
+									\
+	entry = iter->ent;						\
+									\
+	if (entry->type != event_##call.id) {				\
+		WARN_ON_ONCE(1);					\
+		return TRACE_TYPE_UNHANDLED;				\
+	}								\
+									\
+	field = (typeof(field))entry;					\
+									\
+	ret = trace_seq_printf(s, #call ": " print);			\
+	if (!ret)							\
+		return TRACE_TYPE_PARTIAL_LINE;				\
+									\
+	return TRACE_TYPE_HANDLED;					\
+}
+	
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __entry
+#define __entry REC
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_raw_##call field;					\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
 /*
  * Stage 3 of the trace events.
  *
@@ -272,7 +485,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
-#include <trace/trace_events.h>
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 #undef _TRACE_PROFILE
 #undef _TRACE_PROFILE_INIT
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
deleted file mode 100644
index 13d6b85668cfe..0000000000000
--- a/include/trace/trace_events.h
+++ /dev/null
@@ -1,7 +0,0 @@
-/* trace/<type>.h here */
-
-#include <trace/sched.h>
-#include <trace/irq.h>
-#include <trace/lockdep.h>
-#include <trace/skb.h>
-#include <trace/kmem.h>
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 3ad367e7c97f2..fb9d7f964898b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644
index 5a35a914f0e2c..0000000000000
--- a/kernel/trace/events.c
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#define TRACE_HEADER_MULTI_READ
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644
index 475f46a047ae5..0000000000000
--- a/kernel/trace/trace_events_stage_1.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- *	struct trace_entry		ent;
- *	<type>				<item>;
- *	<type2>				<item2>[<len>];
- *	[...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len)	type	item[len];
-
-#undef __field
-#define __field(type, item)		type	item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
-	struct ftrace_raw_##name {				\
-		struct trace_entry	ent;			\
-		tstruct						\
-	};							\
-	static struct ftrace_event_call event_##name
-
-#include <trace/trace_events.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644
index aa4a67a0656fc..0000000000000
--- a/kernel/trace/trace_events_stage_2.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_events.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- *	struct trace_seq *s = &iter->seq;
- *	struct ftrace_raw_<call> *field; <-- defined in stage 1
- *	struct trace_entry *entry;
- *	int ret;
- *
- *	entry = iter->ent;
- *
- *	if (entry->type != event_<call>.id) {
- *		WARN_ON_ONCE(1);
- *		return TRACE_TYPE_UNHANDLED;
- *	}
- *
- *	field = (typeof(field))entry;
- *
- *	ret = trace_seq_printf(s, <TP_printk> "\n");
- *	if (!ret)
- *		return TRACE_TYPE_PARTIAL_LINE;
- *
- *	return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-enum print_line_t							\
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
-{									\
-	struct trace_seq *s = &iter->seq;				\
-	struct ftrace_raw_##call *field;				\
-	struct trace_entry *entry;					\
-	int ret;							\
-									\
-	entry = iter->ent;						\
-									\
-	if (entry->type != event_##call.id) {				\
-		WARN_ON_ONCE(1);					\
-		return TRACE_TYPE_UNHANDLED;				\
-	}								\
-									\
-	field = (typeof(field))entry;					\
-									\
-	ret = trace_seq_printf(s, #call ": " print);			\
-	if (!ret)							\
-		return TRACE_TYPE_PARTIAL_LINE;				\
-									\
-	return TRACE_TYPE_HANDLED;					\
-}
-	
-#include <trace/trace_events.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " offset:%u; size:%u;\n",
- *			       offsetof(struct ftrace_raw_##call, item),
- *			       sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __array
-#define __array(type, item, len)						\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_events.h>
-
-#undef __field
-#define __field(type, item)						\
-	ret = trace_define_field(event_call, #type, #item,		\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef __array
-#define __array(type, item, len)					\
-	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
-	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
-				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
-	if (ret)							\
-		return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-int									\
-ftrace_define_fields_##call(void)					\
-{									\
-	struct ftrace_raw_##call field;					\
-	struct ftrace_event_call *event_call = &event_##call;		\
-	int ret;							\
-									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
-									\
-	tstruct;							\
-									\
-	return ret;							\
-}
-
-#include <trace/trace_events.h>
-- 
GitLab


From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 13:52:20 -0400
Subject: [PATCH 0401/2198] tracing/events: convert event call sites to use a
 link list

Impact: makes it possible to define events in modules

The events are created by reading down the section that they are linked
in by the macros. But this is not scalable to modules. This patch converts
the manipulations to use a global link list, and on boot up it adds
the items in the section to the list.

This change will allow modules to add their tracing events to the list as
well.

Note, this change alone does not permit modules to use the TRACE_EVENT macros,
but the change is needed for them to eventually do so.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h       |  1 +
 kernel/trace/trace.h               | 13 +-------
 kernel/trace/trace_event_profile.c |  4 +--
 kernel/trace/trace_events.c        | 51 ++++++++++++++++++------------
 kernel/trace/trace_events_filter.c |  8 ++---
 5 files changed, 39 insertions(+), 38 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 496b76d9f9d83..17810853b4f80 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -83,6 +83,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
 void tracing_record_cmdline(struct task_struct *tsk);
 
 struct ftrace_event_call {
+	struct list_head	list;
 	char			*name;
 	char			*system;
 	struct dentry		*dir;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6bcdf4af9b2d1..8817c18ef97a9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -739,11 +739,6 @@ struct event_subsystem {
 	struct filter_pred	**preds;
 };
 
-#define events_for_each(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 struct filter_pred;
 
 typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
@@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
+extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 199de9c742296..7bf2ad65eee58 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
 
-	for_each_event(event) {
+	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id)
 			return event->profile_enable(event);
 	}
@@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
-	for_each_event(event) {
+	list_for_each_entry(event, &ftrace_events, list) {
 		if (event->id == event_id)
 			return event->profile_disable(event);
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ead68ac991913..5c66aaff07c12 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,6 +19,8 @@
 
 static DEFINE_MUTEX(event_mutex);
 
+LIST_HEAD(ftrace_events);
+
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size)
 {
@@ -54,16 +56,14 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
 static void ftrace_clear_events(void)
 {
-	struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+	struct ftrace_event_call *call;
 
-	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (call->enabled) {
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		call++;
 	}
 }
 
@@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 
 static int ftrace_set_clr_event(char *buf, int set)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	char *event = NULL, *sub = NULL, *match;
 	int ret = -EINVAL;
 
@@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	}
 
 	mutex_lock(&event_mutex);
-	for_each_event(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (!call->name || !call->regfunc)
 			continue;
@@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next = call;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 	(*pos)++;
 
 	for (;;) {
-		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		if (list == &ftrace_events)
 			return NULL;
 
+		call = list_entry(list, struct ftrace_event_call, list);
+
 		/*
 		 * The ftrace subsystem is for showing formats only.
 		 * They can not be enabled or disabled via the event files.
@@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		if (call->regfunc)
 			break;
 
-		call++;
-		next = call;
+		list = list->next;
 	}
 
-	m->private = ++next;
+	m->private = list->next;
 
 	return call;
 }
@@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-	struct ftrace_event_call *call = m->private;
-	struct ftrace_event_call *next;
+	struct list_head *list = m->private;
+	struct ftrace_event_call *call;
 
 	(*pos)++;
 
  retry:
-	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+	if (list == &ftrace_events)
 		return NULL;
 
+	call = list_entry(list, struct ftrace_event_call, list);
+
 	if (!call->enabled) {
-		call++;
+		list = list->next;
 		goto retry;
 	}
 
-	next = call;
-	m->private = ++next;
+	m->private = list->next;
 
 	return call;
 }
@@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 	if (!ret) {
 		struct seq_file *m = file->private_data;
 
-		m->private = __start_ftrace_events;
+		m->private = ftrace_events.next;
 	}
 	return ret;
 }
@@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	return 0;
 }
 
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
+#define for_each_event(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 static __init int event_trace_init(void)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
@@ -830,6 +840,7 @@ static __init int event_trace_init(void)
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
+		list_add(&call->list, &ftrace_events);
 		event_create_dir(call, d_events);
 	}
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index de42dad42a884..d30b06b02b4d0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -223,7 +223,7 @@ int init_preds(struct ftrace_event_call *call)
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 	int i;
 
 	if (system->n_preds) {
@@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 		system->n_preds = 0;
 	}
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
 
@@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
-	struct ftrace_event_call *call = __start_ftrace_events;
+	struct ftrace_event_call *call;
 
 	if (system->n_preds && !pred->compound)
 		filter_free_subsystem_preds(system);
@@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 	system->preds[system->n_preds] = pred;
 
-	events_for_each(call) {
+	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
 
 		if (!call->define_fields)
-- 
GitLab


From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 18:12:50 -0400
Subject: [PATCH 0402/2198] tracing/events: add export symbols for trace events
 in modules

Impact: let modules add trace events

The trace event code requires some functions to be exported to allow
modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the
necessary functions.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c               | 3 +++
 kernel/trace/trace_events.c        | 1 +
 kernel/trace/trace_events_filter.c | 2 ++
 kernel/trace/trace_output.c        | 3 +++
 4 files changed, 9 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0047fcf70764..2d69b26b3cc9f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
 {
 	return filter_check_discard(call, rec, global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
 
 cycle_t ftrace_now(int cpu)
 {
@@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 }
+EXPORT_SYMBOL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
@@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
 	ring_buffer_discard_commit(global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
 void
 trace_function(struct trace_array *tr,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5c66aaff07c12..8b9e621b80b49 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -53,6 +53,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(trace_define_field);
 
 static void ftrace_clear_events(void)
 {
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d30b06b02b4d0..f8e5eab0424c6 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 
 	return 1;
 }
+EXPORT_SYMBOL_GPL(filter_match_preds);
 
 void filter_print_preds(struct filter_pred **preds, int n_preds,
 			struct trace_seq *s)
@@ -220,6 +221,7 @@ int init_preds(struct ftrace_event_call *call)
 
 	return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(init_preds);
 
 void filter_free_subsystem_preds(struct event_subsystem *system)
 {
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0e70fb07ca785..83a8abb9640f9 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(trace_seq_printf);
 
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
@@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event)
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_ftrace_event);
 
 /**
  * unregister_ftrace_event - remove a no longer used event
@@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event)
 
 	return 0;
 }
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
 
 /*
  * Standard events
-- 
GitLab


From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 10 Apr 2009 14:53:50 -0400
Subject: [PATCH 0403/2198] tracing/events: add support for modules to
 TRACE_EVENT

Impact: allow modules to add TRACE_EVENTS on load

This patch adds the final hooks to allow modules to use the TRACE_EVENT
macro. A notifier and a data structure are used to link the TRACE_EVENTs
defined in the module to connect them with the ftrace event tracing system.

It also adds the necessary automated clean ups to the trace events when a
module is removed.

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |   3 +
 include/linux/module.h       |   4 ++
 include/linux/trace_seq.h    |   2 +
 include/trace/ftrace.h       |   1 +
 kernel/module.c              |   7 ++
 kernel/trace/trace_events.c  | 128 ++++++++++++++++++++++++++---------
 6 files changed, 113 insertions(+), 32 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 17810853b4f80..75f3ac01a87c6 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -7,6 +7,7 @@
 
 struct trace_array;
 struct tracer;
+struct dentry;
 
 /*
  * The trace entry - the most basic unit of tracing. This is what
@@ -87,6 +88,7 @@ struct ftrace_event_call {
 	char			*name;
 	char			*system;
 	struct dentry		*dir;
+	struct trace_event	*event;
 	int			enabled;
 	int			(*regfunc)(void);
 	void			(*unregfunc)(void);
@@ -97,6 +99,7 @@ struct ftrace_event_call {
 	struct list_head	fields;
 	int			n_preds;
 	struct filter_pred	**preds;
+	void			*mod;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
diff --git a/include/linux/module.h b/include/linux/module.h
index 627ac082e2a64..6155fa44168ba 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -337,6 +337,10 @@ struct module
 	const char **trace_bprintk_fmt_start;
 	unsigned int num_trace_bprintk_fmt;
 #endif
+#ifdef CONFIG_EVENT_TRACING
+	struct ftrace_event_call *trace_events;
+	unsigned int num_trace_events;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 28051da876ddc..15ca2c71af132 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_TRACE_SEQ_H
 #define _LINUX_TRACE_SEQ_H
 
+#include <linux/fs.h>
+
 /*
  * Trace sequences are used to allow a function to call several other functions
  * to create a string of data to use (up to a max of PAGE_SIZE.
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 955b967acd748..60c5323bee64b 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -477,6 +477,7 @@ __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.event			= &ftrace_event_type_##call,		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
diff --git a/kernel/module.c b/kernel/module.c
index e797812a4d95f..a0394706f10c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -18,6 +18,7 @@
 */
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
@@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod,
 					sizeof(*mod->tracepoints),
 					&mod->num_tracepoints);
 #endif
+#ifdef CONFIG_EVENT_TRACING
+	mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+					 "_ftrace_events",
+					 sizeof(*mod->trace_events),
+					 &mod->num_trace_events);
+#endif
 
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8b9e621b80b49..a4b177720a6ce 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 		return d_events;
 	}
 
-	system->name = name;
+	system->name = kstrdup(name, GFP_KERNEL);
+	if (!system->name) {
+		debugfs_remove(system->entry);
+		kfree(system);
+		return d_events;
+	}
+
 	list_add(&system->list, &event_subsystems);
 
 	system->preds = NULL;
@@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	 * If the trace point header did not define TRACE_SYSTEM
 	 * then the system would be called "TRACE_SYSTEM".
 	 */
-	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+	if (strcmp(call->system, TRACE_SYSTEM) != 0)
 		d_events = event_subsystem_dir(call->system, d_events);
 
 	if (call->raw_init) {
@@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return -1;
 	}
 
-	if (call->regfunc) {
-		entry = debugfs_create_file("enable", 0644, call->dir, call,
-					    &ftrace_enable_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/enable' entry\n", call->name);
-	}
+	if (call->regfunc)
+		entry = trace_create_file("enable", 0644, call->dir, call,
+					  &ftrace_enable_fops);
 
-	if (call->id) {
-		entry = debugfs_create_file("id", 0444, call->dir, call,
-				&ftrace_event_id_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs '%s/id' entry\n",
-					call->name);
-	}
+	if (call->id)
+		entry = trace_create_file("id", 0444, call->dir, call,
+					  &ftrace_event_id_fops);
 
 	if (call->define_fields) {
 		ret = call->define_fields();
@@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   " events/%s\n", call->name);
 			return ret;
 		}
-		entry = debugfs_create_file("filter", 0644, call->dir, call,
-					    &ftrace_event_filter_fops);
-		if (!entry)
-			pr_warning("Could not create debugfs "
-				   "'%s/filter' entry\n", call->name);
+		entry = trace_create_file("filter", 0644, call->dir, call,
+					  &ftrace_event_filter_fops);
 	}
 
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
 
-	entry = debugfs_create_file("format", 0444, call->dir, call,
-				    &ftrace_event_format_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/format' entry\n", call->name);
+	entry = trace_create_file("format", 0444, call->dir, call,
+				  &ftrace_event_format_fops);
+
+	return 0;
+}
+
+#define for_each_event(event, start, end)			\
+	for (event = start;					\
+	     (unsigned long)event < (unsigned long)end;		\
+	     event++)
+
+static void trace_module_add_events(struct module *mod)
+{
+	struct ftrace_event_call *call, *start, *end;
+	struct dentry *d_events;
+
+	start = mod->trace_events;
+	end = mod->trace_events + mod->num_trace_events;
+
+	if (start == end)
+		return;
+
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return;
+
+	for_each_event(call, start, end) {
+		/* The linker may leave blanks */
+		if (!call->name)
+			continue;
+		call->mod = mod;
+		list_add(&call->list, &ftrace_events);
+		event_create_dir(call, d_events);
+	}
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+	struct ftrace_event_call *call, *p;
+
+	list_for_each_entry_safe(call, p, &ftrace_events, list) {
+		if (call->mod == mod) {
+			if (call->enabled) {
+				call->enabled = 0;
+				call->unregfunc();
+			}
+			if (call->event)
+				unregister_ftrace_event(call->event);
+			debugfs_remove_recursive(call->dir);
+			list_del(&call->list);
+		}
+	}
+}
+
+int trace_module_notify(struct notifier_block *self,
+			unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	mutex_lock(&event_mutex);
+	switch (val) {
+	case MODULE_STATE_COMING:
+		trace_module_add_events(mod);
+		break;
+	case MODULE_STATE_GOING:
+		trace_module_remove_events(mod);
+		break;
+	}
+	mutex_unlock(&event_mutex);
 
 	return 0;
 }
 
+struct notifier_block trace_module_nb = {
+	.notifier_call = trace_module_notify,
+	.priority = 0,
+};
+
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
-#define for_each_event(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 static __init int event_trace_init(void)
 {
 	struct ftrace_event_call *call;
 	struct dentry *d_tracer;
 	struct dentry *entry;
 	struct dentry *d_events;
+	int ret;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -837,7 +897,7 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
-	for_each_event(call) {
+	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
@@ -845,6 +905,10 @@ static __init int event_trace_init(void)
 		event_create_dir(call, d_events);
 	}
 
+	ret = register_module_notifier(&trace_module_nb);
+	if (!ret)
+		pr_warning("Failed to register trace events module notifier\n");
+
 	return 0;
 }
 fs_initcall(event_trace_init);
-- 
GitLab


From 19e4529ee7345079eeacc8e40cf69a304a64dc23 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Tue, 14 Apr 2009 17:27:18 +1000
Subject: [PATCH 0404/2198] modules: Fix up build when CONFIG_MODULE_UNLOAD=n.

Commit 3d43321b7015387cfebbe26436d0e9d299162ea1 ("modules: sysctl to
block module loading") introduces a modules_disabled variable that is
only defined if CONFIG_MODULE_UNLOAD is enabled, despite being used in
other places. This moves it up and fixes up the build.

  CC      kernel/module.o
kernel/module.c: In function 'sys_init_module':
kernel/module.c:2401: error: 'modules_disabled' undeclared (first use in this function)
kernel/module.c:2401: error: (Each undeclared identifier is reported only once
kernel/module.c:2401: error: for each function it appears in.)
make[1]: *** [kernel/module.o] Error 1
make: *** [kernel/module.o] Error 2

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/module.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/module.c b/kernel/module.c
index eeb3f7b1383c9..ee7ab612dafa0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -71,6 +71,9 @@
 static DEFINE_MUTEX(module_mutex);
 static LIST_HEAD(modules);
 
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
 /* Waiting for a module to finish initializing? */
 static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
@@ -778,9 +781,6 @@ static void wait_for_zero_refcount(struct module *mod)
 	mutex_lock(&module_mutex);
 }
 
-/* Block module loading/unloading? */
-int modules_disabled = 0;
-
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
-- 
GitLab


From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 18:22:32 -0400
Subject: [PATCH 0405/2198] tracing/events: fix compile for modules disabled

Impact: compile fix

The addition of TRACE_EVENT for modules breaks the build for when
modules are disabled. This code fixes that.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a4b177720a6ce..6591d83e1e7ad 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	     (unsigned long)event < (unsigned long)end;		\
 	     event++)
 
+#ifdef CONFIG_MODULES
 static void trace_module_add_events(struct module *mod)
 {
 	struct ftrace_event_call *call, *start, *end;
@@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod)
 	}
 }
 
-int trace_module_notify(struct notifier_block *self,
-			unsigned long val, void *data)
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
 {
 	struct module *mod = data;
 
@@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self,
 
 	return 0;
 }
+#else
+static int trace_module_notify(struct notifier_block *self,
+			       unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
 
 struct notifier_block trace_module_nb = {
 	.notifier_call = trace_module_notify,
-- 
GitLab


From ecda8ae02a08ef065ff387f5cb2a2d4999da2408 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 18:49:38 -0400
Subject: [PATCH 0406/2198] tracing/events: fix lockdep system name

Impact: fix compile error of lockdep event tracer

Ingo Molnar pointed out that the system name for the lockdep tracer was "lock"
which is used to include the event trace file name. It should be "lockdep"

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/lockdep.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
index 4d301e758de3e..45e326b5c7f3e 100644
--- a/include/trace/lockdep.h
+++ b/include/trace/lockdep.h
@@ -5,7 +5,7 @@
 #include <linux/tracepoint.h>
 
 #undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
+#define TRACE_SYSTEM lockdep
 
 #ifdef CONFIG_LOCKDEP
 
-- 
GitLab


From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 19:39:12 -0400
Subject: [PATCH 0407/2198] tracing/events: move trace point headers into
 include/trace/events

Impact: clean up

Create a sub directory in include/trace called events to keep the
trace point headers in their own separate directory. Only headers that
declare trace points should be defined in this directory.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/kmemtrace.h            | 2 +-
 include/trace/define_trace.h         | 2 +-
 include/trace/{ => events}/irq.h     | 0
 include/trace/{ => events}/kmem.h    | 0
 include/trace/{ => events}/lockdep.h | 0
 include/trace/{ => events}/sched.h   | 0
 include/trace/{ => events}/skb.h     | 0
 kernel/exit.c                        | 2 +-
 kernel/fork.c                        | 3 ++-
 kernel/irq/handle.c                  | 2 +-
 kernel/kthread.c                     | 2 +-
 kernel/lockdep.c                     | 2 +-
 kernel/sched.c                       | 2 +-
 kernel/signal.c                      | 2 +-
 kernel/softirq.c                     | 2 +-
 kernel/trace/ftrace.c                | 2 +-
 kernel/trace/trace_sched_switch.c    | 2 +-
 kernel/trace/trace_sched_wakeup.c    | 2 +-
 mm/util.c                            | 2 +-
 net/core/drop_monitor.c              | 2 +-
 net/core/net-traces.c                | 2 +-
 net/core/skbuff.c                    | 2 +-
 22 files changed, 18 insertions(+), 17 deletions(-)
 rename include/trace/{ => events}/irq.h (100%)
 rename include/trace/{ => events}/kmem.h (100%)
 rename include/trace/{ => events}/lockdep.h (100%)
 rename include/trace/{ => events}/sched.h (100%)
 rename include/trace/{ => events}/skb.h (100%)

diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
index 15c45a27a925c..b616d3930c3b7 100644
--- a/include/linux/kmemtrace.h
+++ b/include/linux/kmemtrace.h
@@ -9,7 +9,7 @@
 
 #ifdef __KERNEL__
 
-#include <trace/kmem.h>
+#include <trace/events/kmem.h>
 
 #ifdef CONFIG_KMEMTRACE
 extern void kmemtrace_init(void);
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 980eb66a6e386..18869417109c0 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -43,7 +43,7 @@
 #endif
 
 #ifndef TRACE_INCLUDE_PATH
-# define __TRACE_INCLUDE(system) <trace/system.h>
+# define __TRACE_INCLUDE(system) <trace/events/system.h>
 # define UNDEF_TRACE_INCLUDE_FILE
 #else
 # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
diff --git a/include/trace/irq.h b/include/trace/events/irq.h
similarity index 100%
rename from include/trace/irq.h
rename to include/trace/events/irq.h
diff --git a/include/trace/kmem.h b/include/trace/events/kmem.h
similarity index 100%
rename from include/trace/kmem.h
rename to include/trace/events/kmem.h
diff --git a/include/trace/lockdep.h b/include/trace/events/lockdep.h
similarity index 100%
rename from include/trace/lockdep.h
rename to include/trace/events/lockdep.h
diff --git a/include/trace/sched.h b/include/trace/events/sched.h
similarity index 100%
rename from include/trace/sched.h
rename to include/trace/events/sched.h
diff --git a/include/trace/skb.h b/include/trace/events/skb.h
similarity index 100%
rename from include/trace/skb.h
rename to include/trace/events/skb.h
diff --git a/kernel/exit.c b/kernel/exit.c
index 2fe9d2c7eeeec..cab535c427b8f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,7 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
diff --git a/kernel/fork.c b/kernel/fork.c
index 4bebf26392358..085f73ebcea63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -61,7 +61,6 @@
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
-#include <trace/sched.h>
 #include <linux/magic.h>
 
 #include <asm/pgtable.h>
@@ -71,6 +70,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/sched.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 983d8be8dff71..37c63633e78b4 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -20,7 +20,7 @@
 #include <linux/bootmem.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/irq.h>
+#include <trace/events/irq.h>
 
 #include "internals.h"
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e1c76924545b1..41c88fe405003 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,7 +13,7 @@
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #define KTHREAD_NICE_LEVEL (-5)
 
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 257f21a76c52e..47b201ecc6df5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -48,7 +48,7 @@
 #include "lockdep_internals.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/lockdep.h>
+#include <trace/events/lockdep.h>
 
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
diff --git a/kernel/sched.c b/kernel/sched.c
index e6d4518d47e00..9f7ffd00b6ea4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -79,7 +79,7 @@
 #include "sched_cpupri.h"
 
 #define CREATE_TRACE_POINTS
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
diff --git a/kernel/signal.c b/kernel/signal.c
index 1d5703ff003c3..94ec0a4dde0f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -27,7 +27,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/param.h>
 #include <asm/uaccess.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index a2d9b458ac2b5..7ab9dfd8d0820 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,7 +24,7 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
-#include <trace/irq.h>
+#include <trace/events/irq.h>
 
 #include <asm/irq.h>
 /*
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8e6a0b5c99405..a23488988581a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,7 +29,7 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
 
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 9d8cccdfaa06c..a98106dd979cf 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -10,7 +10,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 5bc00e8f153eb..b8b13c5540fdb 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
diff --git a/mm/util.c b/mm/util.c
index 0e74a22791cb0..6794a336e9af4 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,7 +7,7 @@
 #include <asm/uaccess.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/kmem.h>
+#include <trace/events/kmem.h>
 
 /**
  * kstrdup - allocate space for and copy an existing string
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 9fd0dc3cca99f..b75b6cea49dab 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -23,7 +23,7 @@
 #include <linux/bitops.h>
 #include <net/genetlink.h>
 
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 #include <asm/unaligned.h>
 
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 8017720594741..499a67eaf3ae2 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -24,6 +24,6 @@
 #include <asm/bitops.h>
 
 #define CREATE_TRACE_POINTS
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index ce6356cd9f71c..12806b8444562 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -65,7 +65,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 #include "kmap_skb.h"
 
-- 
GitLab


From 9cfe06f8cd5c8c3ad6ab323973e87dde670642b8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 14 Apr 2009 21:37:03 -0400
Subject: [PATCH 0408/2198] tracing/events: add trace-events-sample

This patch adds a sample to the samples directory on how to create
and use TRACE_EVENT trace points.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 samples/Kconfig                            |   7 ++
 samples/Makefile                           |   2 +-
 samples/trace_events/Makefile              |   8 ++
 samples/trace_events/trace-events-sample.c |  56 ++++++++++
 samples/trace_events/trace-events-sample.h | 124 +++++++++++++++++++++
 5 files changed, 196 insertions(+), 1 deletion(-)
 create mode 100644 samples/trace_events/Makefile
 create mode 100644 samples/trace_events/trace-events-sample.c
 create mode 100644 samples/trace_events/trace-events-sample.h

diff --git a/samples/Kconfig b/samples/Kconfig
index 4b02f5a0e6560..93f41c0510920 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -19,6 +19,13 @@ config SAMPLE_TRACEPOINTS
 	help
 	  This build tracepoints example modules.
 
+config SAMPLE_TRACE_EVENTS
+	tristate "Build trace_events examples"
+	depends on EVENT_TRACING
+	default m
+	help
+	  This build trace event example modules.
+
 config SAMPLE_KOBJECT
 	tristate "Build kobject examples"
 	help
diff --git a/samples/Makefile b/samples/Makefile
index 10eaca89fe179..13e4b470b5399 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -1,3 +1,3 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)	+= markers/ kobject/ kprobes/ tracepoints/ trace_events/
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
new file mode 100644
index 0000000000000..06c6dea1eb84c
--- /dev/null
+++ b/samples/trace_events/Makefile
@@ -0,0 +1,8 @@
+# builds the trace events example kernel modules;
+# then to use one (as root):  insmod <module_name.ko>
+
+PWD := $(shell pwd)
+
+CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+
+obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
new file mode 100644
index 0000000000000..f33b3ba744ac8
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.c
@@ -0,0 +1,56 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+
+/*
+ * Any file that uses trace points, must include the header.
+ * But only one file, must include the header by defining
+ * CREATE_TRACE_POINTS first.  This will make the C code that
+ * creates the handles for the trace points.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace-events-sample.h"
+
+
+static void simple_thread_func(int cnt)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	schedule_timeout(HZ);
+	trace_foo_bar("hello", cnt);
+
+	if (!(cnt % 10))
+		/* It is really important that I say "hi!" */
+		printk(KERN_EMERG "hi!\n");
+}
+
+static int simple_thread(void *arg)
+{
+	int cnt = 0;
+
+	while (!kthread_should_stop())
+		simple_thread_func(cnt++);
+
+	return 0;
+}
+
+static struct task_struct *simple_tsk;
+
+static int __init trace_event_init(void)
+{
+	simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
+	if (IS_ERR(simple_tsk))
+		return -1;
+
+	return 0;
+}
+
+static void __exit trace_event_exit(void)
+{
+	kthread_stop(simple_tsk);
+}
+
+module_init(trace_event_init);
+module_exit(trace_event_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-events-sample");
+MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
new file mode 100644
index 0000000000000..eab46443e610e
--- /dev/null
+++ b/samples/trace_events/trace-events-sample.h
@@ -0,0 +1,124 @@
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ *  || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENT_SAMPLE_H
+
+/*
+ * All trace headers should include tracepoint.h, until we finally
+ * make it into a standard header.
+ */
+#include <linux/tracepoint.h>
+
+/*
+ * If TRACE_SYSTEM is defined, that will be the directory created
+ * in the ftrace directory under /debugfs/tracing/events/<system>
+ *
+ * The define_trace.h belowe will also look for a file name of
+ * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
+ *
+ * If you want a different system than file name, you can override
+ * the header name by defining TRACE_INCLUDE_FILE
+ *
+ * If this file was called, goofy.h, then we would define:
+ *
+ * #define TRACE_INCLUDE_FILE goofy
+ *
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM trace-events-sample
+
+/*
+ * The TRACE_EVENT macro is broken up into 5 parts.
+ *
+ * name: name of the trace point. This is also how to enable the tracepoint.
+ *   A function called trace_foo_bar() will be created.
+ *
+ * proto: the prototype of the function trace_foo_bar()
+ *   Here it is trace_foo_bar(char *foo, int bar).
+ *
+ * args:  must match the arguments in the prototype.
+ *    Here it is simply "foo, bar".
+ *
+ * struct:  This defines the way the data will be stored in the ring buffer.
+ *    There are currently two types of elements. __field and __array.
+ *    a __field is broken up into (type, name). Where type can be any
+ *    type but an array.
+ *    For an array. there are three fields. (type, name, size). The
+ *    type of elements in the array, the name of the field and the size
+ *    of the array.
+ *
+ *    __array( char, foo, 10) is the same as saying   char foo[10].
+ *
+ * fast_assign: This is a C like function that is used to store the items
+ *    into the ring buffer.
+ *
+ * printk: This is a way to print out the data in pretty print. This is
+ *    useful if the system crashes and you are logging via a serial line,
+ *    the data can be printed to the console using this "printk" method.
+ *
+ * Note, that for both the assign and the printk, __entry is the handler
+ * to the data structure in the ring buffer, and is defined by the
+ * TP_STRUCT__entry.
+ */
+TRACE_EVENT(foo_bar,
+
+	TP_PROTO(char *foo, int bar),
+
+	TP_ARGS(foo, bar),
+
+	TP_STRUCT__entry(
+		__array(	char,	foo,    10		)
+		__field(	int,	bar			)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->foo, foo, 10);
+		__entry->bar	= bar;
+	),
+
+	TP_printk("foo %s %d", __entry->foo, __entry->bar)
+);
+#endif
+
+/***** NOTICE! The #if protection ends here. *****/
+
+
+/*
+ * There are several ways I could have done this. If I left out the
+ * TRACE_INCLUDE_PATH, then it would default to the kernel source
+ * include/trace/events directory.
+ *
+ * I could specify a path from the define_trace.h file back to this
+ * file.
+ *
+ * #define TRACE_INCLUDE_PATH ../../samples/trace_events
+ *
+ * But I chose to simply make it use the current directory and then in
+ * the Makefile I added:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+ *
+ * This will make sure the current path is part of the include
+ * structure for our file so that we can find it.
+ *
+ * I could have made only the top level directory the include:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)
+ *
+ * And then let the path to this directory be the TRACE_INCLUDE_PATH:
+ *
+ * #define TRACE_INCLUDE_PATH samples/trace_events
+ *
+ * But then if something defines "samples" or "trace_events" then we
+ * could risk that being converted too, and give us an unexpected
+ * result.
+ */
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#include <trace/define_trace.h>
-- 
GitLab


From 05725f7eb4b8acb147c5fc7b91397b1f6bcab00d Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Tue, 14 Apr 2009 20:17:16 +0200
Subject: [PATCH 0409/2198] rculist: use list_entry_rcu in places where it's
 appropriate

Use previously introduced list_entry_rcu instead of an open-coded
list_entry + rcu_dereference combination.

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: dipankar@in.ibm.com
LKML-Reference: <20090414181715.GA3634@psychotron.englab.brq.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h           | 8 +++++---
 ipc/sem.c                       | 4 ++--
 security/integrity/ima/ima_fs.c | 4 ++--
 security/smack/smackfs.c        | 8 ++++----
 4 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049cb..886df41e74528 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -77,6 +77,7 @@ struct sched_param {
 #include <linux/proportions.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
 #include <linux/time.h>
@@ -2010,7 +2011,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
-#define next_task(p)	list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
+#define next_task(p) \
+	list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
 #define for_each_process(p) \
 	for (p = &init_task ; (p = next_task(p)) != &init_task ; )
@@ -2049,8 +2051,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
 
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
-	return list_entry(rcu_dereference(p->thread_group.next),
-			  struct task_struct, thread_group);
+	return list_entry_rcu(p->thread_group.next,
+			      struct task_struct, thread_group);
 }
 
 static inline int thread_group_empty(struct task_struct *p)
diff --git a/ipc/sem.c b/ipc/sem.c
index 16a2189e96f94..87c2b641fd7b4 100644
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk)
 		int i;
 
 		rcu_read_lock();
-		un = list_entry(rcu_dereference(ulp->list_proc.next),
-					struct sem_undo, list_proc);
+		un = list_entry_rcu(ulp->list_proc.next,
+				    struct sem_undo, list_proc);
 		if (&un->list_proc == &ulp->list_proc)
 			semid = -1;
 		 else
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index ffbe259700b10..510186f0b72e4 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -84,8 +84,8 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos)
 	 * against concurrent list-extension
 	 */
 	rcu_read_lock();
-	qe = list_entry(rcu_dereference(qe->later.next),
-			struct ima_queue_entry, later);
+	qe = list_entry_rcu(qe->later.next,
+			    struct ima_queue_entry, later);
 	rcu_read_unlock();
 	(*pos)++;
 
diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index e03a7e19c73b3..11d2cb19d7a6b 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -734,8 +734,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
 		return;
 	}
 
-	m = list_entry(rcu_dereference(smk_netlbladdr_list.next),
-			 struct smk_netlbladdr, list);
+	m = list_entry_rcu(smk_netlbladdr_list.next,
+			   struct smk_netlbladdr, list);
 
 	/* the comparison '>' is a bit hacky, but works */
 	if (new->smk_mask.s_addr > m->smk_mask.s_addr) {
@@ -748,8 +748,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
 			list_add_rcu(&new->list, &m->list);
 			return;
 		}
-		m_next = list_entry(rcu_dereference(m->list.next),
-				 struct smk_netlbladdr, list);
+		m_next = list_entry_rcu(m->list.next,
+					struct smk_netlbladdr, list);
 		if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) {
 			list_add_rcu(&new->list, &m->list);
 			return;
-- 
GitLab


From b206525ad1f653b7da35f5827be93770d28eae11 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 14 Apr 2009 23:04:37 +0530
Subject: [PATCH 0410/2198] x86: k8 convert node_to_k8_nb_misc() from a macro
 to an inline function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Converting node_to_k8_nb_misc() from a macro to an inline function
makes compiler see the 'node' parameter in the !CONFIG_K8_NB too,
which eliminates these compiler warnings:

  arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘show_cache_disable’:
  arch/x86/kernel/cpu/intel_cacheinfo.c:712: warning: unused variable ‘node’
  arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘store_cache_disable’:
  arch/x86/kernel/cpu/intel_cacheinfo.c:739: warning: unused variable ‘node’

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Mark Langsdorf <mark.langsdorf@amd.com>
LKML-Reference: <1239730477.2966.26.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/k8.h | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c23b3d171be5f..c2d1f3b58e5f1 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -13,10 +13,15 @@ extern void k8_flush_garts(void);
 extern int k8_scan_nodes(unsigned long start, unsigned long end);
 
 #ifdef CONFIG_K8_NB
-#define node_to_k8_nb_misc(node) \
-	(node < num_k8_northbridges) ? k8_northbridges[node] : NULL
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+	return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+}
 #else
-#define node_to_k8_nb_misc(node) NULL
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+	return NULL;
+}
 #endif
 
 
-- 
GitLab


From e6a1a89d572c31b62d6dcf11a371c7323852d9b2 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 15 Apr 2009 18:22:41 +0900
Subject: [PATCH 0411/2198] dma-debug: add dma_debug_resize_entries() to adjust
 the number of dma_debug_entries

We use a static value for the number of dma_debug_entries. It can be
overwritten by a kernel command line option.

Some IOMMUs (e.g. GART) can't set an appropriate value by a kernel
command line option because they can't know such value until they
finish initializing up their hardware.

This patch adds dma_debug_resize_entries() enables IOMMUs to adjust
the number of dma_debug_entries anytime.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: fujita.tomonori@lab.ntt.co.jp
Cc: akpm@linux-foundation.org
LKML-Reference: <20090415182234R.fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/dma-debug.h |  7 ++++
 lib/dma-debug.c           | 72 +++++++++++++++++++++++++++++++++++----
 2 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h
index 28d53cb7b5a22..171ad8aedc835 100644
--- a/include/linux/dma-debug.h
+++ b/include/linux/dma-debug.h
@@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus);
 
 extern void dma_debug_init(u32 num_entries);
 
+extern int dma_debug_resize_entries(u32 num_entries);
+
 extern void debug_dma_map_page(struct device *dev, struct page *page,
 			       size_t offset, size_t size,
 			       int direction, dma_addr_t dma_addr,
@@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries)
 {
 }
 
+static inline int dma_debug_resize_entries(u32 num_entries)
+{
+	return 0;
+}
+
 static inline void debug_dma_map_page(struct device *dev, struct page *page,
 				      size_t offset, size_t size,
 				      int direction, dma_addr_t dma_addr,
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index d3da7edc034f7..5d61019330cd8 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -85,6 +85,7 @@ static u32 show_num_errors = 1;
 
 static u32 num_free_entries;
 static u32 min_free_entries;
+static u32 nr_total_entries;
 
 /* number of preallocated entries requested by kernel cmdline */
 static u32 req_entries;
@@ -257,6 +258,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
 	put_hash_bucket(bucket, &flags);
 }
 
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+	struct dma_debug_entry *entry;
+
+	entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+	list_del(&entry->list);
+	memset(entry, 0, sizeof(*entry));
+
+	num_free_entries -= 1;
+	if (num_free_entries < min_free_entries)
+		min_free_entries = num_free_entries;
+
+	return entry;
+}
+
 /* struct dma_entry allocator
  *
  * The next two functions implement the allocator for
@@ -276,9 +292,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 		goto out;
 	}
 
-	entry = list_entry(free_entries.next, struct dma_debug_entry, list);
-	list_del(&entry->list);
-	memset(entry, 0, sizeof(*entry));
+	entry = __dma_entry_alloc();
 
 #ifdef CONFIG_STACKTRACE
 	entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
@@ -286,9 +300,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	entry->stacktrace.skip = 2;
 	save_stack_trace(&entry->stacktrace);
 #endif
-	num_free_entries -= 1;
-	if (num_free_entries < min_free_entries)
-		min_free_entries = num_free_entries;
 
 out:
 	spin_unlock_irqrestore(&free_entries_lock, flags);
@@ -310,6 +321,53 @@ static void dma_entry_free(struct dma_debug_entry *entry)
 	spin_unlock_irqrestore(&free_entries_lock, flags);
 }
 
+int dma_debug_resize_entries(u32 num_entries)
+{
+	int i, delta, ret = 0;
+	unsigned long flags;
+	struct dma_debug_entry *entry;
+	LIST_HEAD(tmp);
+
+	spin_lock_irqsave(&free_entries_lock, flags);
+
+	if (nr_total_entries < num_entries) {
+		delta = num_entries - nr_total_entries;
+
+		spin_unlock_irqrestore(&free_entries_lock, flags);
+
+		for (i = 0; i < delta; i++) {
+			entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+			if (!entry)
+				break;
+
+			list_add_tail(&entry->list, &tmp);
+		}
+
+		spin_lock_irqsave(&free_entries_lock, flags);
+
+		list_splice(&tmp, &free_entries);
+		nr_total_entries += i;
+		num_free_entries += i;
+	} else {
+		delta = nr_total_entries - num_entries;
+
+		for (i = 0; i < delta && !list_empty(&free_entries); i++) {
+			entry = __dma_entry_alloc();
+			kfree(entry);
+		}
+
+		nr_total_entries -= i;
+	}
+
+	if (nr_total_entries != num_entries)
+		ret = 1;
+
+	spin_unlock_irqrestore(&free_entries_lock, flags);
+
+	return ret;
+}
+EXPORT_SYMBOL(dma_debug_resize_entries);
+
 /*
  * DMA-API debugging init code
  *
@@ -490,6 +548,8 @@ void dma_debug_init(u32 num_entries)
 		return;
 	}
 
+	nr_total_entries = num_free_entries;
+
 	printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n");
 }
 
-- 
GitLab


From 19c1a6f5764d787113fa323ffb18be7991208f82 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 14 Apr 2009 09:43:19 +0900
Subject: [PATCH 0412/2198] x86 gart: reimplement IOMMU_LEAK feature by using
 DMA_API_DEBUG

IOMMU_LEAK, GART's own feature, dumps the used IOMMU entries when
IOMMU entries is full, which might be useful to find a bad driver that
eats IOMMU entries.

DMA_API_DEBUG provides the similar feature, debug_dma_dump_mappings,
and it's better than GART's IOMMU_LEAK feature. GART's IOMMU_LEAK
feature doesn't say who uses IOMMU entries so it's hard to find a bad
driver.

This patch reimplements the GART's IOMMU_LEAK feature by using
DMA_API_DEBUG.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1239669799-23579-2-git-send-email-fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug        |  3 +--
 arch/x86/kernel/pci-gart_64.c | 45 +++++++----------------------------
 2 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d8359e73317f8..5865712d105d5 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -161,8 +161,7 @@ config IOMMU_DEBUG
 
 config IOMMU_LEAK
 	bool "IOMMU leak tracing"
-	depends on DEBUG_KERNEL
-	depends on IOMMU_DEBUG
+	depends on IOMMU_DEBUG && DMA_API_DEBUG
 	---help---
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index b284b58c035cc..1e8920d98f7ca 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -144,48 +144,21 @@ static void flush_gart(void)
 }
 
 #ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x)							\
-	do {								\
-		if (iommu_leak_tab)					\
-			iommu_leak_tab[x] = __builtin_return_address(0);\
-	} while (0)
-
-#define CLEAR_LEAK(x)							\
-	do {								\
-		if (iommu_leak_tab)					\
-			iommu_leak_tab[x] = NULL;			\
-	} while (0)
-
 /* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
 static int leak_trace;
 static int iommu_leak_pages = 20;
 
 static void dump_leak(void)
 {
-	int i;
 	static int dump;
 
-	if (dump || !iommu_leak_tab)
+	if (dump)
 		return;
 	dump = 1;
-	show_stack(NULL, NULL);
 
-	/* Very crude. dump some from the end of the table too */
-	printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
-	       iommu_leak_pages);
-	for (i = 0; i < iommu_leak_pages; i += 2) {
-		printk(KERN_DEBUG "%lu: ", iommu_pages-i);
-		printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
-				0);
-		printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
-	}
-	printk(KERN_DEBUG "\n");
+	show_stack(NULL, NULL);
+	debug_dma_dump_mappings(NULL);
 }
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
 #endif
 
 static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
-		SET_LEAK(iommu_page + i);
 		phys_mem += PAGE_SIZE;
 	}
 	return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
 	npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	for (i = 0; i < npages; i++) {
 		iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
-		CLEAR_LEAK(iommu_page + i);
 	}
 	free_iommu(iommu_page, npages);
 }
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
 		pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
 		while (pages--) {
 			iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
-			SET_LEAK(iommu_page);
 			addr += PAGE_SIZE;
 			iommu_page++;
 		}
@@ -801,11 +771,12 @@ void __init gart_iommu_init(void)
 
 #ifdef CONFIG_IOMMU_LEAK
 	if (leak_trace) {
-		iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-				  get_order(iommu_pages*sizeof(void *)));
-		if (!iommu_leak_tab)
+		int ret;
+
+		ret = dma_debug_resize_entries(iommu_pages);
+		if (ret)
 			printk(KERN_DEBUG
-			       "PCI-DMA: Cannot allocate leak trace area\n");
+			       "PCI-DMA: Cannot trace all the entries\n");
 	}
 #endif
 
-- 
GitLab


From 13318a7186d8e0ae08c996ea4111a945e7789772 Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Wed, 15 Apr 2009 09:59:10 +0800
Subject: [PATCH 0413/2198] sched: use group_first_cpu() instead of
 cpumask_first(sched_group_cpus())

Impact: cleanup

This patch changes cpumask_first(sched_group_cpus()) to group_first_cpu()
for maintainability.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 92b4b56ad093c..7601ceebf7cee 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7995,7 +7995,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
 			struct sched_domain *sd;
 
 			sd = &per_cpu(phys_domains, j).sd;
-			if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+			if (j != group_first_cpu(sd->groups)) {
 				/*
 				 * Only add "power" once for each
 				 * physical package.
@@ -8073,7 +8073,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
 	WARN_ON(!sd || !sd->groups);
 
-	if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+	if (cpu != group_first_cpu(sd->groups))
 		return;
 
 	child = sd->child;
-- 
GitLab


From 77857dc07247ed5fa700a197c96ef842d8dbebdf Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 15 Apr 2009 11:57:01 -0700
Subject: [PATCH 0414/2198] x86: use used_vectors in init_IRQ()

Impact: fix crash with many devices

I found this crash:

[  552.616646] general protection fault: 0403 [#1] SMP
[  552.620013] last sysfs file:
/sys/devices/pci0000:00/0000:00:02.0/usb1/1-1/1-1:1.0/host13/target13:0:0/13:0:0:0/block/sr0/size
[  552.620013] CPU 0
[  552.620013] Modules linked in:
[  552.620013] Pid: 0, comm: swapper Not tainted 2.6.30-rc1-tip-01931-g8fcafd8-dirty #28 Sun Fire X4440
[  552.620013] RIP: 0010:[<ffffffff8023bada>]  [<ffffffff8023bada>] default_idle+0x7d/0xda
[  552.620013] RSP: 0018:ffffffff81345e68  EFLAGS: 00010246
[  552.620013] RAX: 0000000000000000 RBX: ffffffff8133d870 RCX: ffffc20000000000
[  552.620013] RDX: 00000000001d0620 RSI: ffffffff8023bad8 RDI: ffffffff802a3169
[  552.620013] RBP: ffffffff81345e98 R08: 0000000000000000 R09: ffffffff812244a0
[  552.620013] R10: ffffffff81345dc8 R11: 7ebe1b6fa0bcac50 R12: 4ec4ec4ec4ec4ec5
[  552.620013] R13: ffffffff813a54d0 R14: ffffffff813a7a40 R15: 0000000000000000
[  552.620013] FS:  00000000006d1880(0000) GS:ffffc20000000000(0000) knlGS:0000000000000000
[  552.620013] CS:  0010 DS: 0018 ES: 0018 CR0: 000000008005003b
[  552.620013] CR2: 00007fec9d936a50 CR3: 000000007d1a9000 CR4: 00000000000006e0
[  552.620013] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  552.620013] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400
[  552.620013] Process swapper (pid: 0, threadinfo ffffffff81344000,task ffffffff812244a0)
[  552.620013] Stack:
[  552.620013]  0000000000000000 ffffc20000000000 00000000001d0620 7ebe1b6fa0bcac50
[  552.620013]  ffffffff8133d870 4ec4ec4ec4ec4ec5 ffffffff81345ec8 ffffffff8023bd84
[  552.620013]  4ec4ec4ec4ec4ec5 ffffffff813a54d0 7ebe1b6fa0bcac50 ffffffff8133d870
[  552.620013] Call Trace:
[  552.620013]  [<ffffffff8023bd84>] c1e_idle+0x109/0x124
[  552.620013]  [<ffffffff8023314b>] cpu_idle+0xb8/0x101
[  552.620013]  [<ffffffff80c16d6a>] rest_init+0x7e/0x94
[  552.620013]  [<ffffffff81357efc>] start_kernel+0x3dc/0x3fd
[  552.620013]  [<ffffffff813572a9>] x86_64_start_reservations+0xb9/0xd4
[  552.620013]  [<ffffffff813573b2>] x86_64_start_kernel+0xee/0x109
[  552.620013] Code: 48 8b 04 25 f8 b4 00 00 83 a0 3c e0 ff ff fb 0f ae f0 65 48 8b 04 25 f8 b4 00 00 f6 80 38 e0 ff ff 08 75 09 e8 71 76 06 00 fb f4 <eb> 06 e8 68 76 06 00 fb 65 48 8b 04 25 f8 b4 00 00 83 88 3c e0
[  552.620013] RIP  [<ffffffff8023bada>] default_idle+0x7d/0xda
[  552.620013]  RSP <ffffffff81345e68>
[  552.828646] ---[ end trace 4cbfc5c01382af7f ]---

Joerg Roedel said
	"The 0403 error code means that there was an external interrupt with vector
	0x80. Yinghai, my theory is that the kernel on this machine has no 32bit
	emulation compiled in, right? In this case the selector points to a zero entry
	which may cause the #gpf right after the hlt.
	But I have no idea where the external int 0x80 comes from"

it turns out that we could use 0x80 for external device on 64-bit
when 32-bit emulation is disabled.

But we forgot to set the gate for it.

try to set gate for it by checking used_vectors.

Also move apic_intr_init() early to avoid setting
that gate two times.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Joerg Roedel <joerg.roedel@amd.com>
LKML-Reference: <49E62DFD.6010904@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/irqinit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index b424c32c4a0c3..2e08b10ad51ab 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -240,19 +240,19 @@ void __init native_init_IRQ(void)
 	/* Execute any quirks before the call gates are initialised: */
 	x86_quirk_pre_intr_init();
 
+	apic_intr_init();
+
 	/*
 	 * Cover the whole vector space, no vector can escape
 	 * us. (some of these will be overridden and become
 	 * 'special' SMP interrupts)
 	 */
 	for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-		/* IA32_SYSCALL_VECTOR was reserved in trap_init. */
-		if (i != IA32_SYSCALL_VECTOR)
+		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
+		if (!test_bit(i, used_vectors))
 			set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
 	}
 
-	apic_intr_init();
-
 	if (!acpi_ioapic)
 		setup_irq(2, &irq2);
 
-- 
GitLab


From ff7b1b4f000cea84f071c1b6aa2918b2119d66f1 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 15 Apr 2009 16:55:05 +0100
Subject: [PATCH 0415/2198] perfcounters: export perf_tpcounter_event

Needed for modular tracepoint support.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7f9521c3c01ba..09396098dd0df 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2590,6 +2590,7 @@ void perf_tpcounter_event(int event_id)
 
 	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0);
 }
+EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
 extern int ftrace_profile_enable(int);
 extern void ftrace_profile_disable(int);
-- 
GitLab


From 0207a2efb43d81e29e23662b5d035945688a103f Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Thu, 16 Apr 2009 14:40:56 +0900
Subject: [PATCH 0416/2198] sh: Add support for SH7724 (SH-Mobile R2R) CPU
 subtype.

This implements initial support for the SH-Mobile R2R CPU.
Based on Rev 0.11 of the initial SH7724 hardware manual.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                         |   10 +
 arch/sh/include/asm/processor.h         |    2 +-
 arch/sh/include/cpu-sh4/cpu/freq.h      |   18 +
 arch/sh/include/cpu-sh4/cpu/sh7724.h    |  255 +++
 arch/sh/kernel/cpu/sh4/probe.c          |    6 +
 arch/sh/kernel/cpu/sh4a/Makefile        |    3 +
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c  |  112 +-
 arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c | 2230 +++++++++++++++++++++++
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c  |  371 ++++
 arch/sh/kernel/setup.c                  |    3 +-
 arch/sh/oprofile/common.c               |    1 +
 11 files changed, 2997 insertions(+), 14 deletions(-)
 create mode 100644 arch/sh/include/cpu-sh4/cpu/sh7724.h
 create mode 100644 arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c
 create mode 100644 arch/sh/kernel/cpu/sh4a/setup-sh7724.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e7390dd0283dd..505d1acbd0ad1 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -347,6 +347,15 @@ config CPU_SUBTYPE_SH7723
 	help
 	  Select SH7723 if you have an SH-MobileR2 CPU.
 
+config CPU_SUBTYPE_SH7724
+	bool "Support SH7724 processor"
+	select CPU_SH4A
+	select CPU_SHX2
+	select ARCH_SPARSEMEM_ENABLE
+	select SYS_SUPPORTS_CMT
+	help
+	  Select SH7724 if you have an SH-MobileR2R CPU.
+
 config CPU_SUBTYPE_SH7763
 	bool "Support SH7763 processor"
 	select CPU_SH4A
@@ -495,6 +504,7 @@ config SH_PCLK_FREQ
 			      CPU_SUBTYPE_SH7203 || CPU_SUBTYPE_SH7206 || \
 			      CPU_SUBTYPE_SH7263 || CPU_SUBTYPE_MXG    || \
 			      CPU_SUBTYPE_SH7786
+	default "41666666" if CPU_SUBTYPE_SH7724
 	default "60000000" if CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 	default "66000000" if CPU_SUBTYPE_SH4_202
 	default "50000000"
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index 1fd58b421438e..005c962c8b1c5 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -32,7 +32,7 @@ enum cpu_type {
 
 	/* SH-4A types */
 	CPU_SH7763, CPU_SH7770, CPU_SH7780, CPU_SH7781, CPU_SH7785, CPU_SH7786,
-	CPU_SH7723, CPU_SHX3,
+	CPU_SH7723, CPU_SH7724, CPU_SHX3,
 
 	/* SH4AL-DSP types */
 	CPU_SH7343, CPU_SH7722, CPU_SH7366,
diff --git a/arch/sh/include/cpu-sh4/cpu/freq.h b/arch/sh/include/cpu-sh4/cpu/freq.h
index 749d1c4343375..ccf1d999db6da 100644
--- a/arch/sh/include/cpu-sh4/cpu/freq.h
+++ b/arch/sh/include/cpu-sh4/cpu/freq.h
@@ -25,6 +25,24 @@
 #elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \
       defined(CONFIG_CPU_SUBTYPE_SH7780)
 #define	FRQCR			0xffc80000
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define FRQCRA			0xa4150000
+#define FRQCRB			0xa4150004
+#define VCLKCR			0xa4150048
+
+#define FCLKACR			0xa4150008
+#define FCLKBCR			0xa415000c
+#define FRQCR			FRQCRA
+#define SCLKACR			FCLKACR
+#define SCLKBCR			FCLKBCR
+#define FCLKACR			0xa4150008
+#define FCLKBCR			0xa415000c
+#define IrDACLKCR		0xa4150018
+
+#define MSTPCR0			0xa4150030
+#define MSTPCR1			0xa4150034
+#define MSTPCR2			0xa4150038
+
 #elif defined(CONFIG_CPU_SUBTYPE_SH7785)
 #define FRQCR0			0xffc80000
 #define FRQCR1			0xffc80004
diff --git a/arch/sh/include/cpu-sh4/cpu/sh7724.h b/arch/sh/include/cpu-sh4/cpu/sh7724.h
new file mode 100644
index 0000000000000..34605c9e354db
--- /dev/null
+++ b/arch/sh/include/cpu-sh4/cpu/sh7724.h
@@ -0,0 +1,255 @@
+#ifndef __ASM_SH7724_H__
+#define __ASM_SH7724_H__
+
+enum {
+	/* PTA */
+	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
+	GPIO_PTA3, GPIO_PTA2, GPIO_PTA1, GPIO_PTA0,
+
+	/* PTB */
+	GPIO_PTB7, GPIO_PTB6, GPIO_PTB5, GPIO_PTB4,
+	GPIO_PTB3, GPIO_PTB2, GPIO_PTB1, GPIO_PTB0,
+
+	/* PTC */
+	GPIO_PTC7, GPIO_PTC6, GPIO_PTC5, GPIO_PTC4,
+	GPIO_PTC3, GPIO_PTC2, GPIO_PTC1, GPIO_PTC0,
+
+	/* PTD */
+	GPIO_PTD7, GPIO_PTD6, GPIO_PTD5, GPIO_PTD4,
+	GPIO_PTD3, GPIO_PTD2, GPIO_PTD1, GPIO_PTD0,
+
+	/* PTE */
+	GPIO_PTE7, GPIO_PTE6, GPIO_PTE5, GPIO_PTE4,
+	GPIO_PTE3, GPIO_PTE2, GPIO_PTE1, GPIO_PTE0,
+
+	/* PTF */
+	GPIO_PTF7, GPIO_PTF6, GPIO_PTF5, GPIO_PTF4,
+	GPIO_PTF3, GPIO_PTF2, GPIO_PTF1, GPIO_PTF0,
+
+	/* PTG */
+			      GPIO_PTG5, GPIO_PTG4,
+	GPIO_PTG3, GPIO_PTG2, GPIO_PTG1, GPIO_PTG0,
+
+	/* PTH */
+	GPIO_PTH7, GPIO_PTH6, GPIO_PTH5, GPIO_PTH4,
+	GPIO_PTH3, GPIO_PTH2, GPIO_PTH1, GPIO_PTH0,
+
+	/* PTJ */
+	GPIO_PTJ7, GPIO_PTJ6, GPIO_PTJ5,
+	GPIO_PTJ3, GPIO_PTJ2, GPIO_PTJ1, GPIO_PTJ0,
+
+	/* PTK */
+	GPIO_PTK7, GPIO_PTK6, GPIO_PTK5, GPIO_PTK4,
+	GPIO_PTK3, GPIO_PTK2, GPIO_PTK1, GPIO_PTK0,
+
+	/* PTL */
+	GPIO_PTL7, GPIO_PTL6, GPIO_PTL5, GPIO_PTL4,
+	GPIO_PTL3, GPIO_PTL2, GPIO_PTL1, GPIO_PTL0,
+
+	/* PTM */
+	GPIO_PTM7, GPIO_PTM6, GPIO_PTM5, GPIO_PTM4,
+	GPIO_PTM3, GPIO_PTM2, GPIO_PTM1, GPIO_PTM0,
+
+	/* PTN */
+	GPIO_PTN7, GPIO_PTN6, GPIO_PTN5, GPIO_PTN4,
+	GPIO_PTN3, GPIO_PTN2, GPIO_PTN1, GPIO_PTN0,
+
+	/* PTQ */
+	GPIO_PTQ7, GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4,
+	GPIO_PTQ3, GPIO_PTQ2, GPIO_PTQ1, GPIO_PTQ0,
+
+	/* PTR */
+	GPIO_PTR7, GPIO_PTR6, GPIO_PTR5, GPIO_PTR4,
+	GPIO_PTR3, GPIO_PTR2, GPIO_PTR1, GPIO_PTR0,
+
+	/* PTS */
+		   GPIO_PTS6, GPIO_PTS5, GPIO_PTS4,
+	GPIO_PTS3, GPIO_PTS2, GPIO_PTS1, GPIO_PTS0,
+
+	/* PTT */
+	GPIO_PTT7, GPIO_PTT6, GPIO_PTT5, GPIO_PTT4,
+	GPIO_PTT3, GPIO_PTT2, GPIO_PTT1, GPIO_PTT0,
+
+	/* PTU */
+	GPIO_PTU7, GPIO_PTU6, GPIO_PTU5, GPIO_PTU4,
+	GPIO_PTU3, GPIO_PTU2, GPIO_PTU1, GPIO_PTU0,
+
+	/* PTV */
+	GPIO_PTV7, GPIO_PTV6, GPIO_PTV5, GPIO_PTV4,
+	GPIO_PTV3, GPIO_PTV2, GPIO_PTV1, GPIO_PTV0,
+
+	/* PTW */
+	GPIO_PTW7, GPIO_PTW6, GPIO_PTW5, GPIO_PTW4,
+	GPIO_PTW3, GPIO_PTW2, GPIO_PTW1, GPIO_PTW0,
+
+	/* PTX */
+	GPIO_PTX7, GPIO_PTX6, GPIO_PTX5, GPIO_PTX4,
+	GPIO_PTX3, GPIO_PTX2, GPIO_PTX1, GPIO_PTX0,
+
+	/* PTY */
+	GPIO_PTY7, GPIO_PTY6, GPIO_PTY5, GPIO_PTY4,
+	GPIO_PTY3, GPIO_PTY2, GPIO_PTY1, GPIO_PTY0,
+
+	/* PTZ */
+	GPIO_PTZ7, GPIO_PTZ6, GPIO_PTZ5, GPIO_PTZ4,
+	GPIO_PTZ3, GPIO_PTZ2, GPIO_PTZ1, GPIO_PTZ0,
+
+	/* BSC (PTA/PTB/PTJ/PTQ/PTR/PTT) */
+	GPIO_FN_D31, GPIO_FN_D30, GPIO_FN_D29, GPIO_FN_D28,
+	GPIO_FN_D27, GPIO_FN_D26, GPIO_FN_D25, GPIO_FN_D24,
+	GPIO_FN_D23, GPIO_FN_D22, GPIO_FN_D21, GPIO_FN_D20,
+	GPIO_FN_D19, GPIO_FN_D18, GPIO_FN_D17, GPIO_FN_D16,
+	GPIO_FN_D15, GPIO_FN_D14, GPIO_FN_D13, GPIO_FN_D12,
+	GPIO_FN_D11, GPIO_FN_D10, GPIO_FN_D9,  GPIO_FN_D8,
+	GPIO_FN_D7,  GPIO_FN_D6,  GPIO_FN_D5,  GPIO_FN_D4,
+	GPIO_FN_D3,  GPIO_FN_D2,  GPIO_FN_D1,  GPIO_FN_D0,
+	GPIO_FN_A25, GPIO_FN_A24, GPIO_FN_A23, GPIO_FN_A22,
+	GPIO_FN_CS6B_CE1B,  GPIO_FN_CS6A_CE2B,
+	GPIO_FN_CS5B_CE1A,  GPIO_FN_CS5A_CE2A,
+	GPIO_FN_WE3_ICIOWR, GPIO_FN_WE2_ICIORD,
+	GPIO_FN_IOIS16,     GPIO_FN_WAIT,
+	GPIO_FN_BS,
+
+	/* KEYSC (PTA/PTB)*/
+	GPIO_FN_KEYOUT5_IN5, GPIO_FN_KEYOUT4_IN6, GPIO_FN_KEYIN4,
+	GPIO_FN_KEYIN3,  GPIO_FN_KEYIN2,  GPIO_FN_KEYIN1,  GPIO_FN_KEYIN0,
+	GPIO_FN_KEYOUT3, GPIO_FN_KEYOUT2, GPIO_FN_KEYOUT1, GPIO_FN_KEYOUT0,
+
+	/* ATAPI (PTA/PTB/PTK/PTR/PTS/PTW) */
+	GPIO_FN_IDED15, GPIO_FN_IDED14, GPIO_FN_IDED13, GPIO_FN_IDED12,
+	GPIO_FN_IDED11, GPIO_FN_IDED10, GPIO_FN_IDED9,  GPIO_FN_IDED8,
+	GPIO_FN_IDED7,  GPIO_FN_IDED6,  GPIO_FN_IDED5,  GPIO_FN_IDED4,
+	GPIO_FN_IDED3,  GPIO_FN_IDED2,  GPIO_FN_IDED1,  GPIO_FN_IDED0,
+	GPIO_FN_IDEA2,     GPIO_FN_IDEA1,     GPIO_FN_IDEA0,  GPIO_FN_IDEIOWR,
+	GPIO_FN_IODREQ,    GPIO_FN_IDECS0,    GPIO_FN_IDECS1, GPIO_FN_IDEIORD,
+	GPIO_FN_DIRECTION, GPIO_FN_EXBUF_ENB, GPIO_FN_IDERST, GPIO_FN_IODACK,
+	GPIO_FN_IDEINT,    GPIO_FN_IDEIORDY,
+
+	/* TPU (PTB/PTR/PTS) */
+	GPIO_FN_TPUTO3, GPIO_FN_TPUTO2, GPIO_FN_TPUTO1, GPIO_FN_TPUTO0,
+	GPIO_FN_TPUTI3, GPIO_FN_TPUTI2,
+
+	/* LCDC (PTC/PTD/PTE/PTF/PTM/PTR) */
+	GPIO_FN_LCDD23, GPIO_FN_LCDD22, GPIO_FN_LCDD21, GPIO_FN_LCDD20,
+	GPIO_FN_LCDD19, GPIO_FN_LCDD18, GPIO_FN_LCDD17, GPIO_FN_LCDD16,
+	GPIO_FN_LCDD15, GPIO_FN_LCDD14, GPIO_FN_LCDD13, GPIO_FN_LCDD12,
+	GPIO_FN_LCDD11, GPIO_FN_LCDD10, GPIO_FN_LCDD9,  GPIO_FN_LCDD8,
+	GPIO_FN_LCDD7,  GPIO_FN_LCDD6,  GPIO_FN_LCDD5,  GPIO_FN_LCDD4,
+	GPIO_FN_LCDD3,  GPIO_FN_LCDD2,  GPIO_FN_LCDD1,  GPIO_FN_LCDD0,
+	GPIO_FN_LCDVSYN,  GPIO_FN_LCDDISP,  GPIO_FN_LCDRS,  GPIO_FN_LCDHSYN,
+	GPIO_FN_LCDCS,    GPIO_FN_LCDDON,   GPIO_FN_LCDDCK, GPIO_FN_LCDWR,
+	GPIO_FN_LCDVEPWC, GPIO_FN_LCDVCPWC, GPIO_FN_LCDRD,  GPIO_FN_LCDLCLK,
+
+	/* SCIF0 (PTF/PTM) */
+	GPIO_FN_SCIF0_TXD, GPIO_FN_SCIF0_RXD, GPIO_FN_SCIF0_SCK,
+
+	/* SCIF1 (PTL) */
+	GPIO_FN_SCIF1_SCK, GPIO_FN_SCIF1_RXD, GPIO_FN_SCIF1_TXD,
+
+	/* SCIF2 (PTE/PTF/PTN) with LCDC, VOU */
+	GPIO_FN_SCIF2_L_TXD, GPIO_FN_SCIF2_L_SCK, GPIO_FN_SCIF2_L_RXD,
+	GPIO_FN_SCIF2_V_TXD, GPIO_FN_SCIF2_V_SCK, GPIO_FN_SCIF2_V_RXD,
+
+	/* SCIF3 (PTL/PTN/PTZ) with VOU, IRQ */
+	GPIO_FN_SCIF3_V_SCK, GPIO_FN_SCIF3_V_RXD, GPIO_FN_SCIF3_V_TXD,
+	GPIO_FN_SCIF3_V_CTS, GPIO_FN_SCIF3_V_RTS,
+	GPIO_FN_SCIF3_I_SCK, GPIO_FN_SCIF3_I_RXD, GPIO_FN_SCIF3_I_TXD,
+	GPIO_FN_SCIF3_I_CTS, GPIO_FN_SCIF3_I_RTS,
+
+	/* SCIF4 (PTE) */
+	GPIO_FN_SCIF4_SCK, GPIO_FN_SCIF4_RXD, GPIO_FN_SCIF4_TXD,
+
+	/* SCIF5 (PTS) */
+	GPIO_FN_SCIF5_SCK, GPIO_FN_SCIF5_RXD, GPIO_FN_SCIF5_TXD,
+
+	/* FSI (PTE/PTU/PTV) */
+	GPIO_FN_FSIMCKB,   GPIO_FN_FSIMCKA,    GPIO_FN_FSIOASD,
+	GPIO_FN_FSIIABCK,  GPIO_FN_FSIIALRCK,  GPIO_FN_FSIOABCK,
+	GPIO_FN_FSIOALRCK, GPIO_FN_CLKAUDIOAO, GPIO_FN_FSIIBSD,
+	GPIO_FN_FSIOBSD,   GPIO_FN_FSIIBBCK,   GPIO_FN_FSIIBLRCK,
+	GPIO_FN_FSIOBBCK,  GPIO_FN_FSIOBLRCK,  GPIO_FN_CLKAUDIOBO,
+	GPIO_FN_FSIIASD,
+
+	/* AUD (PTG) */
+	GPIO_FN_AUDCK,   GPIO_FN_AUDSYNC, GPIO_FN_AUDATA3,
+	GPIO_FN_AUDATA2, GPIO_FN_AUDATA1, GPIO_FN_AUDATA0,
+
+	/* VIO (PTS) (common?) */
+	GPIO_FN_VIO_CKO,
+
+	/* VIO0 (PTH/PTK) */
+	GPIO_FN_VIO0_D15, GPIO_FN_VIO0_D14, GPIO_FN_VIO0_D13, GPIO_FN_VIO0_D12,
+	GPIO_FN_VIO0_D11, GPIO_FN_VIO0_D10, GPIO_FN_VIO0_D9,  GPIO_FN_VIO0_D8,
+	GPIO_FN_VIO0_D7,  GPIO_FN_VIO0_D6,  GPIO_FN_VIO0_D5,  GPIO_FN_VIO0_D4,
+	GPIO_FN_VIO0_D3,  GPIO_FN_VIO0_D2,  GPIO_FN_VIO0_D1,  GPIO_FN_VIO0_D0,
+	GPIO_FN_VIO0_VD,  GPIO_FN_VIO0_CLK,
+	GPIO_FN_VIO0_FLD, GPIO_FN_VIO0_HD,
+
+	/* VIO1 (PTK/PTS) */
+	GPIO_FN_VIO1_D7,  GPIO_FN_VIO1_D6, GPIO_FN_VIO1_D5, GPIO_FN_VIO1_D4,
+	GPIO_FN_VIO1_D3,  GPIO_FN_VIO1_D2, GPIO_FN_VIO1_D1, GPIO_FN_VIO1_D0,
+	GPIO_FN_VIO1_FLD, GPIO_FN_VIO1_HD, GPIO_FN_VIO1_VD, GPIO_FN_VIO1_CLK,
+
+	/* Eth  (PTL/PTN/PTX) */
+	GPIO_FN_RMII_RXD0,    GPIO_FN_RMII_RXD1,
+	GPIO_FN_RMII_TXD0,    GPIO_FN_RMII_TXD1,
+	GPIO_FN_RMII_REF_CLK, GPIO_FN_RMII_TX_EN,
+	GPIO_FN_RMII_RX_ER,   GPIO_FN_RMII_CRS_DV,
+	GPIO_FN_LNKSTA,       GPIO_FN_MDIO,
+	GPIO_FN_MDC,
+
+	/* System (PTJ) */
+	GPIO_FN_PDSTATUS, GPIO_FN_STATUS2, GPIO_FN_STATUS0,
+
+	/* VOU (PTL/PTM/PTN*/
+	GPIO_FN_DV_D15,  GPIO_FN_DV_D14, GPIO_FN_DV_D13,   GPIO_FN_DV_D12,
+	GPIO_FN_DV_D11,  GPIO_FN_DV_D10, GPIO_FN_DV_D9,    GPIO_FN_DV_D8,
+	GPIO_FN_DV_D7,   GPIO_FN_DV_D6,  GPIO_FN_DV_D5,    GPIO_FN_DV_D4,
+	GPIO_FN_DV_D3,   GPIO_FN_DV_D2,  GPIO_FN_DV_D1,    GPIO_FN_DV_D0,
+	GPIO_FN_DV_CLKI, GPIO_FN_DV_CLK, GPIO_FN_DV_VSYNC, GPIO_FN_DV_HSYNC,
+
+	/* MSIOF0 (PTL/PTM) */
+	GPIO_FN_MSIOF0_RXD,   GPIO_FN_MSIOF0_TXD,
+	GPIO_FN_MSIOF0_MCK,   GPIO_FN_MSIOF0_TSCK,
+	GPIO_FN_MSIOF0_SS1,   GPIO_FN_MSIOF0_SS2,
+	GPIO_FN_MSIOF0_TSYNC, GPIO_FN_MSIOF0_RSCK,
+	GPIO_FN_MSIOF0_RSYNC,
+
+	/* MSIOF1 (PTV) */
+	GPIO_FN_MSIOF1_RXD,   GPIO_FN_MSIOF1_TXD,
+	GPIO_FN_MSIOF1_MCK,   GPIO_FN_MSIOF1_TSCK,
+	GPIO_FN_MSIOF1_SS1,   GPIO_FN_MSIOF1_SS2,
+	GPIO_FN_MSIOF1_TSYNC, GPIO_FN_MSIOF1_RSCK,
+	GPIO_FN_MSIOF1_RSYNC,
+
+	/* DMAC (PTU/PTX) */
+	GPIO_FN_DMAC_DACK0, GPIO_FN_DMAC_DREQ0,
+	GPIO_FN_DMAC_DACK1, GPIO_FN_DMAC_DREQ1,
+
+	/* SDHI0 (PTY) */
+	GPIO_FN_SDHI0CD, GPIO_FN_SDHI0WP, GPIO_FN_SDHI0CMD, GPIO_FN_SDHI0CLK,
+	GPIO_FN_SDHI0D3, GPIO_FN_SDHI0D2, GPIO_FN_SDHI0D1,  GPIO_FN_SDHI0D0,
+
+	/* SDHI1 (PTW) */
+	GPIO_FN_SDHI1CD, GPIO_FN_SDHI1WP, GPIO_FN_SDHI1CMD, GPIO_FN_SDHI1CLK,
+	GPIO_FN_SDHI1D3, GPIO_FN_SDHI1D2, GPIO_FN_SDHI1D1,  GPIO_FN_SDHI1D0,
+
+	/* MMC (PTW/PTX)*/
+	GPIO_FN_MMC_D7,  GPIO_FN_MMC_D6,  GPIO_FN_MMC_D5, GPIO_FN_MMC_D4,
+	GPIO_FN_MMC_D3,  GPIO_FN_MMC_D2,  GPIO_FN_MMC_D1, GPIO_FN_MMC_D0,
+	GPIO_FN_MMC_CLK, GPIO_FN_MMC_CMD,
+
+	/* IrDA (PTX) */
+	GPIO_FN_IRDA_OUT, GPIO_FN_IRDA_IN,
+
+	/* TSIF (PTX) */
+	GPIO_FN_TSIF_TS0_SDAT, GPIO_FN_TSIF_TS0_SCK,
+	GPIO_FN_TSIF_TS0_SDEN, GPIO_FN_TSIF_TS0_SPSYNC,
+
+	/* IRQ (PTZ) */
+	GPIO_FN_INTC_IRQ7, GPIO_FN_INTC_IRQ6, GPIO_FN_INTC_IRQ5,
+	GPIO_FN_INTC_IRQ4, GPIO_FN_INTC_IRQ3, GPIO_FN_INTC_IRQ2,
+	GPIO_FN_INTC_IRQ1, GPIO_FN_INTC_IRQ0,
+};
+
+#endif /* __ASM_SH7724_H__ */
diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 91e3677ae09da..973ff831c8a8c 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -156,6 +156,12 @@ int __init detect_cpu_and_cache_system(void)
 			break;
 		}
 		break;
+	case 0x300b:
+		boot_cpu_data.type = CPU_SH7724;
+		boot_cpu_data.icache.ways = 4;
+		boot_cpu_data.dcache.ways = 4;
+		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU;
+		break;
 	case 0x4000:	/* 1st cut */
 	case 0x4001:	/* 2nd cut */
 		boot_cpu_data.type = CPU_SHX3;
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 1a92361feeb9d..afd6fba47849c 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -11,6 +11,7 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7786)	+= setup-sh7786.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7343)	+= setup-sh7343.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7722)	+= setup-sh7722.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7723)	+= setup-sh7723.o
+obj-$(CONFIG_CPU_SUBTYPE_SH7724)	+= setup-sh7724.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7366)	+= setup-sh7366.o
 obj-$(CONFIG_CPU_SUBTYPE_SHX3)		+= setup-shx3.o
 
@@ -26,12 +27,14 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
 
 # Pinmux setup
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7722)	:= pinmux-sh7722.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7723)	:= pinmux-sh7723.o
+pinmux-$(CONFIG_CPU_SUBTYPE_SH7724)	:= pinmux-sh7724.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7785)	:= pinmux-sh7785.o
 pinmux-$(CONFIG_CPU_SUBTYPE_SH7786)	:= pinmux-sh7786.o
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 0e174af218741..1ccdfc561fefa 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -130,6 +130,12 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * is quite simple..
  */
 
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define STCPLL(frqcr) ((((frqcr >> 24) & 0x3f) + 1) * 2)
+#else
+#define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1)
+#endif
+
 /*
  * Instead of having two separate multipliers/divisors set, like this:
  *
@@ -139,13 +145,17 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * I created the divisors2 array, which is used to calculate rate like
  *   rate = parent * 2 / divisors2[ divisor ];
 */
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 };
+#else
 static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
+#endif
 
 static void master_clk_recalc(struct clk *clk)
 {
 	unsigned frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = CONFIG_SH_PCLK_FREQ * (((frqcr >> 24) & 0x1f) + 1);
+	clk->rate = CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
 }
 
 static void master_clk_init(struct clk *clk)
@@ -161,13 +171,30 @@ static void module_clk_recalc(struct clk *clk)
 {
 	unsigned long frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = clk->parent->rate / (((frqcr >> 24) & 0x1f) + 1);
+	clk->rate = clk->parent->rate / STCPLL(frqcr);
 }
 
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+#define MASTERDIVS	{ 12, 16, 24, 30, 32, 36, 48 }
+#define STCMASK		0x3f
+#define DIVCALC(div)	(div/2-1)
+#define FRQCRKICK	0x80000000
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#define MASTERDIVS	{ 6, 8, 12, 16 }
+#define STCMASK		0x1f
+#define DIVCALC(div)	(div-1)
+#define FRQCRKICK	0x00000000
+#else
+#define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
+#define STCMASK		0x1f
+#define DIVCALC(div)	(div-1)
+#define FRQCRKICK	0x00000000
+#endif
+
 static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
 {
 	int div = rate / clk->rate;
-	int master_divs[] = { 2, 3, 4, 6, 8, 16 };
+	int master_divs[] = MASTERDIVS;
 	int index;
 	unsigned long frqcr;
 
@@ -180,8 +207,9 @@ static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
 	div = master_divs[index - 1];
 
 	frqcr = ctrl_inl(FRQCR);
-	frqcr &= ~(0xF << 24);
-	frqcr |= ( (div-1) << 24);
+	frqcr &= ~(STCMASK << 24);
+	frqcr |= (DIVCALC(div) << 24);
+	frqcr |= FRQCRKICK;
 	ctrl_outl(frqcr, FRQCR);
 
 	return 0;
@@ -377,6 +405,7 @@ static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate,
 	/* clear FRQCR bits */
 	frqcr &= ~(ctx.mask << ctx.shift);
 	frqcr |= div << ctx.shift;
+	frqcr |= FRQCRKICK;
 
 	/* ...and perform actual change */
 	ctrl_outl(frqcr, FRQCR);
@@ -542,8 +571,8 @@ static struct clk sh7722_r_clock = {
 	.flags = CLK_RATE_PROPAGATES,
 };
 
-#ifndef CONFIG_CPU_SUBTYPE_SH7343
-
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
+    !defined(CONFIG_CPU_SUBTYPE_SH7724)
 /*
  * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
  * methods of clk_ops determine which register they should access by
@@ -560,15 +589,16 @@ static struct clk sh7722_siu_b_clock = {
 	.arch_flags = SCLKBCR,
 	.ops = &sh7722_siu_clk_ops,
 };
+#endif /* CONFIG_CPU_SUBTYPE_SH7343, SH7724 */
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
+#if defined(CONFIG_CPU_SUBTYPE_SH7722) ||\
+    defined(CONFIG_CPU_SUBTYPE_SH7724)
 static struct clk sh7722_irda_clock = {
 	.name = "irda_clk",
 	.arch_flags = IrDACLKCR,
 	.ops = &sh7722_siu_clk_ops,
 };
 #endif
-#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
 static struct clk sh7722_video_clock = {
 	.name = "video_clk",
@@ -715,6 +745,61 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1),
 	MSTPCR("lcdc0", "bus_clk", 2, 0),
 #endif
+#if defined(CONFIG_CPU_SUBTYPE_SH7724)
+	/* See Datasheet : Overview -> Block Diagram */
+	MSTPCR("tlb0", "cpu_clk", 0, 31),
+	MSTPCR("ic0", "cpu_clk", 0, 30),
+	MSTPCR("oc0", "cpu_clk", 0, 29),
+	MSTPCR("rs0", "bus_clk", 0, 28),
+	MSTPCR("ilmem0", "cpu_clk", 0, 27),
+	MSTPCR("l2c0", "sh_clk", 0, 26),
+	MSTPCR("fpu0", "cpu_clk", 0, 24),
+	MSTPCR("intc0", "peripheral_clk", 0, 22),
+	MSTPCR("dmac0", "bus_clk", 0, 21),
+	MSTPCR("sh0", "sh_clk", 0, 20),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19),
+	MSTPCR("ubc0", "cpu_clk", 0, 17),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15),
+	MSTPCR("cmt0", "r_clk", 0, 14),
+	MSTPCR("rwdt0", "r_clk", 0, 13),
+	MSTPCR("dmac1", "bus_clk", 0, 12),
+	MSTPCR("tmu1", "peripheral_clk", 0, 10),
+	MSTPCR("scif0", "peripheral_clk", 0, 9),
+	MSTPCR("scif1", "peripheral_clk", 0, 8),
+	MSTPCR("scif2", "peripheral_clk", 0, 7),
+	MSTPCR("scif3", "bus_clk", 0, 6),
+	MSTPCR("scif4", "bus_clk", 0, 5),
+	MSTPCR("scif5", "bus_clk", 0, 4),
+	MSTPCR("msiof0", "bus_clk", 0, 2),
+	MSTPCR("msiof1", "bus_clk", 0, 1),
+	MSTPCR("keysc0", "r_clk", 1, 12),
+	MSTPCR("rtc0", "r_clk", 1, 11),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9),
+	MSTPCR("i2c1", "peripheral_clk", 1, 8),
+	MSTPCR("mmc0", "bus_clk", 2, 29),
+	MSTPCR("eth0", "bus_clk", 2, 28),
+	MSTPCR("atapi0", "bus_clk", 2, 26),
+	MSTPCR("tpu0", "bus_clk", 2, 25),
+	MSTPCR("irda0", "peripheral_clk", 2, 24),
+	MSTPCR("tsif0", "bus_clk", 2, 22),
+	MSTPCR("usb1", "bus_clk", 2, 21),
+	MSTPCR("usb0", "bus_clk", 2, 20),
+	MSTPCR("2dg0", "bus_clk", 2, 19),
+	MSTPCR("sdhi0", "bus_clk", 2, 18),
+	MSTPCR("sdhi1", "bus_clk", 2, 17),
+	MSTPCR("veu1", "bus_clk", 2, 15),
+	MSTPCR("ceu1", "bus_clk", 2, 13),
+	MSTPCR("beu1", "bus_clk", 2, 12),
+	MSTPCR("2ddmac0", "sh_clk", 2, 10),
+	MSTPCR("spu0", "bus_clk", 2, 9),
+	MSTPCR("jpu0", "bus_clk", 2, 6),
+	MSTPCR("vou0", "bus_clk", 2, 5),
+	MSTPCR("beu0", "bus_clk", 2, 4),
+	MSTPCR("ceu0", "bus_clk", 2, 3),
+	MSTPCR("veu0", "bus_clk", 2, 2),
+	MSTPCR("vpu0", "bus_clk", 2, 1),
+	MSTPCR("lcdc0", "bus_clk", 2, 0),
+#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7343)
 	MSTPCR("uram0", "umem_clk", 0, 28),
 	MSTPCR("xymem0", "bus_clk", 0, 26),
@@ -786,12 +871,15 @@ static struct clk *sh7722_clocks[] = {
 	&sh7722_sh_clock,
 	&sh7722_peripheral_clock,
 	&sh7722_sdram_clock,
-#ifndef CONFIG_CPU_SUBTYPE_SH7343
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
+    !defined(CONFIG_CPU_SUBTYPE_SH7724)
 	&sh7722_siu_a_clock,
 	&sh7722_siu_b_clock,
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	&sh7722_irda_clock,
 #endif
+/* 7724 should support FSI clock */
+#if defined(CONFIG_CPU_SUBTYPE_SH7722) || \
+    defined(CONFIG_CPU_SUBTYPE_SH7724)
+	&sh7722_irda_clock,
 #endif
 	&sh7722_video_clock,
 };
diff --git a/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c b/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c
new file mode 100644
index 0000000000000..1af0f95863796
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c
@@ -0,0 +1,2230 @@
+/*
+ * SH7724 Pinmux
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Based on SH7723 Pinmux
+ *  Copyright (C) 2008  Magnus Damm
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <cpu/sh7724.h>
+
+enum {
+	PINMUX_RESERVED = 0,
+
+	PINMUX_DATA_BEGIN,
+	PTA7_DATA, PTA6_DATA, PTA5_DATA, PTA4_DATA,
+	PTA3_DATA, PTA2_DATA, PTA1_DATA, PTA0_DATA,
+	PTB7_DATA, PTB6_DATA, PTB5_DATA, PTB4_DATA,
+	PTB3_DATA, PTB2_DATA, PTB1_DATA, PTB0_DATA,
+	PTC7_DATA, PTC6_DATA, PTC5_DATA, PTC4_DATA,
+	PTC3_DATA, PTC2_DATA, PTC1_DATA, PTC0_DATA,
+	PTD7_DATA, PTD6_DATA, PTD5_DATA, PTD4_DATA,
+	PTD3_DATA, PTD2_DATA, PTD1_DATA, PTD0_DATA,
+	PTE7_DATA, PTE6_DATA, PTE5_DATA, PTE4_DATA,
+	PTE3_DATA, PTE2_DATA, PTE1_DATA, PTE0_DATA,
+	PTF7_DATA, PTF6_DATA, PTF5_DATA, PTF4_DATA,
+	PTF3_DATA, PTF2_DATA, PTF1_DATA, PTF0_DATA,
+			      PTG5_DATA, PTG4_DATA,
+	PTG3_DATA, PTG2_DATA, PTG1_DATA, PTG0_DATA,
+	PTH7_DATA, PTH6_DATA, PTH5_DATA, PTH4_DATA,
+	PTH3_DATA, PTH2_DATA, PTH1_DATA, PTH0_DATA,
+	PTJ7_DATA, PTJ6_DATA, PTJ5_DATA,
+	PTJ3_DATA, PTJ2_DATA, PTJ1_DATA, PTJ0_DATA,
+	PTK7_DATA, PTK6_DATA, PTK5_DATA, PTK4_DATA,
+	PTK3_DATA, PTK2_DATA, PTK1_DATA, PTK0_DATA,
+	PTL7_DATA, PTL6_DATA, PTL5_DATA, PTL4_DATA,
+	PTL3_DATA, PTL2_DATA, PTL1_DATA, PTL0_DATA,
+	PTM7_DATA, PTM6_DATA, PTM5_DATA, PTM4_DATA,
+	PTM3_DATA, PTM2_DATA, PTM1_DATA, PTM0_DATA,
+	PTN7_DATA, PTN6_DATA, PTN5_DATA, PTN4_DATA,
+	PTN3_DATA, PTN2_DATA, PTN1_DATA, PTN0_DATA,
+	PTQ7_DATA, PTQ6_DATA, PTQ5_DATA, PTQ4_DATA,
+	PTQ3_DATA, PTQ2_DATA, PTQ1_DATA, PTQ0_DATA,
+	PTR7_DATA, PTR6_DATA, PTR5_DATA, PTR4_DATA,
+	PTR3_DATA, PTR2_DATA, PTR1_DATA, PTR0_DATA,
+		   PTS6_DATA, PTS5_DATA, PTS4_DATA,
+	PTS3_DATA, PTS2_DATA, PTS1_DATA, PTS0_DATA,
+	PTT7_DATA, PTT6_DATA, PTT5_DATA, PTT4_DATA,
+	PTT3_DATA, PTT2_DATA, PTT1_DATA, PTT0_DATA,
+	PTU7_DATA, PTU6_DATA, PTU5_DATA, PTU4_DATA,
+	PTU3_DATA, PTU2_DATA, PTU1_DATA, PTU0_DATA,
+	PTV7_DATA, PTV6_DATA, PTV5_DATA, PTV4_DATA,
+	PTV3_DATA, PTV2_DATA, PTV1_DATA, PTV0_DATA,
+	PTW7_DATA, PTW6_DATA, PTW5_DATA, PTW4_DATA,
+	PTW3_DATA, PTW2_DATA, PTW1_DATA, PTW0_DATA,
+	PTX7_DATA, PTX6_DATA, PTX5_DATA, PTX4_DATA,
+	PTX3_DATA, PTX2_DATA, PTX1_DATA, PTX0_DATA,
+	PTY7_DATA, PTY6_DATA, PTY5_DATA, PTY4_DATA,
+	PTY3_DATA, PTY2_DATA, PTY1_DATA, PTY0_DATA,
+	PTZ7_DATA, PTZ6_DATA, PTZ5_DATA, PTZ4_DATA,
+	PTZ3_DATA, PTZ2_DATA, PTZ1_DATA, PTZ0_DATA,
+	PINMUX_DATA_END,
+
+	PINMUX_INPUT_BEGIN,
+	PTA7_IN, PTA6_IN, PTA5_IN, PTA4_IN,
+	PTA3_IN, PTA2_IN, PTA1_IN, PTA0_IN,
+	PTB7_IN, PTB6_IN, PTB5_IN, PTB4_IN,
+	PTB3_IN, PTB2_IN, PTB1_IN, PTB0_IN,
+	PTC7_IN, PTC6_IN, PTC5_IN, PTC4_IN,
+	PTC3_IN, PTC2_IN, PTC1_IN, PTC0_IN,
+	PTD7_IN, PTD6_IN, PTD5_IN, PTD4_IN,
+	PTD3_IN, PTD2_IN, PTD1_IN, PTD0_IN,
+	PTE7_IN, PTE6_IN, PTE5_IN, PTE4_IN,
+	PTE3_IN, PTE2_IN, PTE1_IN, PTE0_IN,
+	PTF7_IN, PTF6_IN, PTF5_IN, PTF4_IN,
+	PTF3_IN, PTF2_IN, PTF1_IN, PTF0_IN,
+	PTH7_IN, PTH6_IN, PTH5_IN, PTH4_IN,
+	PTH3_IN, PTH2_IN, PTH1_IN, PTH0_IN,
+	PTJ3_IN, PTJ2_IN, PTJ1_IN, PTJ0_IN,
+	PTK7_IN, PTK6_IN, PTK5_IN, PTK4_IN,
+	PTK3_IN, PTK2_IN, PTK1_IN, PTK0_IN,
+	PTL7_IN, PTL6_IN, PTL5_IN, PTL4_IN,
+	PTL3_IN, PTL2_IN, PTL1_IN, PTL0_IN,
+	PTM7_IN, PTM6_IN, PTM5_IN, PTM4_IN,
+	PTM3_IN, PTM2_IN, PTM1_IN, PTM0_IN,
+	PTN7_IN, PTN6_IN, PTN5_IN, PTN4_IN,
+	PTN3_IN, PTN2_IN, PTN1_IN, PTN0_IN,
+	PTQ7_IN, PTQ6_IN, PTQ5_IN, PTQ4_IN,
+	PTQ3_IN, PTQ2_IN, PTQ1_IN, PTQ0_IN,
+	PTR7_IN, PTR6_IN, PTR5_IN, PTR4_IN,
+	PTR3_IN, PTR2_IN, PTR1_IN, PTR0_IN,
+		 PTS6_IN, PTS5_IN, PTS4_IN,
+	PTS3_IN, PTS2_IN, PTS1_IN, PTS0_IN,
+	PTT7_IN, PTT6_IN, PTT5_IN, PTT4_IN,
+	PTT3_IN, PTT2_IN, PTT1_IN, PTT0_IN,
+	PTU7_IN, PTU6_IN, PTU5_IN, PTU4_IN,
+	PTU3_IN, PTU2_IN, PTU1_IN, PTU0_IN,
+	PTV7_IN, PTV6_IN, PTV5_IN, PTV4_IN,
+	PTV3_IN, PTV2_IN, PTV1_IN, PTV0_IN,
+	PTW7_IN, PTW6_IN, PTW5_IN, PTW4_IN,
+	PTW3_IN, PTW2_IN, PTW1_IN, PTW0_IN,
+	PTX7_IN, PTX6_IN, PTX5_IN, PTX4_IN,
+	PTX3_IN, PTX2_IN, PTX1_IN, PTX0_IN,
+	PTY7_IN, PTY6_IN, PTY5_IN, PTY4_IN,
+	PTY3_IN, PTY2_IN, PTY1_IN, PTY0_IN,
+	PTZ7_IN, PTZ6_IN, PTZ5_IN, PTZ4_IN,
+	PTZ3_IN, PTZ2_IN, PTZ1_IN, PTZ0_IN,
+	PINMUX_INPUT_END,
+
+	PINMUX_INPUT_PULLUP_BEGIN,
+	PTA7_IN_PU, PTA6_IN_PU, PTA5_IN_PU, PTA4_IN_PU,
+	PTA3_IN_PU, PTA2_IN_PU, PTA1_IN_PU, PTA0_IN_PU,
+	PTB7_IN_PU, PTB6_IN_PU, PTB5_IN_PU, PTB4_IN_PU,
+	PTB3_IN_PU, PTB2_IN_PU, PTB1_IN_PU, PTB0_IN_PU,
+	PTC7_IN_PU, PTC6_IN_PU, PTC5_IN_PU, PTC4_IN_PU,
+	PTC3_IN_PU, PTC2_IN_PU, PTC1_IN_PU, PTC0_IN_PU,
+	PTD7_IN_PU, PTD6_IN_PU, PTD5_IN_PU, PTD4_IN_PU,
+	PTD3_IN_PU, PTD2_IN_PU, PTD1_IN_PU, PTD0_IN_PU,
+	PTE7_IN_PU, PTE6_IN_PU, PTE5_IN_PU, PTE4_IN_PU,
+	PTE3_IN_PU, PTE2_IN_PU, PTE1_IN_PU, PTE0_IN_PU,
+	PTF7_IN_PU, PTF6_IN_PU, PTF5_IN_PU, PTF4_IN_PU,
+	PTF3_IN_PU, PTF2_IN_PU, PTF1_IN_PU, PTF0_IN_PU,
+	PTH7_IN_PU, PTH6_IN_PU, PTH5_IN_PU, PTH4_IN_PU,
+	PTH3_IN_PU, PTH2_IN_PU, PTH1_IN_PU, PTH0_IN_PU,
+	PTJ3_IN_PU, PTJ2_IN_PU, PTJ1_IN_PU, PTJ0_IN_PU,
+	PTK7_IN_PU, PTK6_IN_PU, PTK5_IN_PU, PTK4_IN_PU,
+	PTK3_IN_PU, PTK2_IN_PU, PTK1_IN_PU, PTK0_IN_PU,
+	PTL7_IN_PU, PTL6_IN_PU, PTL5_IN_PU, PTL4_IN_PU,
+	PTL3_IN_PU, PTL2_IN_PU, PTL1_IN_PU, PTL0_IN_PU,
+	PTM7_IN_PU, PTM6_IN_PU, PTM5_IN_PU, PTM4_IN_PU,
+	PTM3_IN_PU, PTM2_IN_PU, PTM1_IN_PU, PTM0_IN_PU,
+	PTN7_IN_PU, PTN6_IN_PU, PTN5_IN_PU, PTN4_IN_PU,
+	PTN3_IN_PU, PTN2_IN_PU, PTN1_IN_PU, PTN0_IN_PU,
+	PTQ7_IN_PU, PTQ6_IN_PU, PTQ5_IN_PU, PTQ4_IN_PU,
+	PTQ3_IN_PU, PTQ2_IN_PU, PTQ1_IN_PU, PTQ0_IN_PU,
+	PTR7_IN_PU, PTR6_IN_PU, PTR5_IN_PU, PTR4_IN_PU,
+	PTR3_IN_PU, PTR2_IN_PU, PTR1_IN_PU, PTR0_IN_PU,
+		    PTS6_IN_PU, PTS5_IN_PU, PTS4_IN_PU,
+	PTS3_IN_PU, PTS2_IN_PU, PTS1_IN_PU, PTS0_IN_PU,
+	PTT7_IN_PU, PTT6_IN_PU, PTT5_IN_PU, PTT4_IN_PU,
+	PTT3_IN_PU, PTT2_IN_PU, PTT1_IN_PU, PTT0_IN_PU,
+	PTU7_IN_PU, PTU6_IN_PU, PTU5_IN_PU, PTU4_IN_PU,
+	PTU3_IN_PU, PTU2_IN_PU, PTU1_IN_PU, PTU0_IN_PU,
+	PTV7_IN_PU, PTV6_IN_PU, PTV5_IN_PU, PTV4_IN_PU,
+	PTV3_IN_PU, PTV2_IN_PU, PTV1_IN_PU, PTV0_IN_PU,
+	PTW7_IN_PU, PTW6_IN_PU, PTW5_IN_PU, PTW4_IN_PU,
+	PTW3_IN_PU, PTW2_IN_PU, PTW1_IN_PU, PTW0_IN_PU,
+	PTX7_IN_PU, PTX6_IN_PU, PTX5_IN_PU, PTX4_IN_PU,
+	PTX3_IN_PU, PTX2_IN_PU, PTX1_IN_PU, PTX0_IN_PU,
+	PTY7_IN_PU, PTY6_IN_PU, PTY5_IN_PU, PTY4_IN_PU,
+	PTY3_IN_PU, PTY2_IN_PU, PTY1_IN_PU, PTY0_IN_PU,
+	PTZ7_IN_PU, PTZ6_IN_PU, PTZ5_IN_PU, PTZ4_IN_PU,
+	PTZ3_IN_PU, PTZ2_IN_PU, PTZ1_IN_PU, PTZ0_IN_PU,
+	PINMUX_INPUT_PULLUP_END,
+
+	PINMUX_OUTPUT_BEGIN,
+	PTA7_OUT, PTA6_OUT, PTA5_OUT, PTA4_OUT,
+	PTA3_OUT, PTA2_OUT, PTA1_OUT, PTA0_OUT,
+	PTB7_OUT, PTB6_OUT, PTB5_OUT, PTB4_OUT,
+	PTB3_OUT, PTB2_OUT, PTB1_OUT, PTB0_OUT,
+	PTC7_OUT, PTC6_OUT, PTC5_OUT, PTC4_OUT,
+	PTC3_OUT, PTC2_OUT, PTC1_OUT, PTC0_OUT,
+	PTD7_OUT, PTD6_OUT, PTD5_OUT, PTD4_OUT,
+	PTD3_OUT, PTD2_OUT, PTD1_OUT, PTD0_OUT,
+	PTE7_OUT, PTE6_OUT, PTE5_OUT, PTE4_OUT,
+	PTE3_OUT, PTE2_OUT, PTE1_OUT, PTE0_OUT,
+	PTF7_OUT, PTF6_OUT, PTF5_OUT, PTF4_OUT,
+	PTF3_OUT, PTF2_OUT, PTF1_OUT, PTF0_OUT,
+			    PTG5_OUT, PTG4_OUT,
+	PTG3_OUT, PTG2_OUT, PTG1_OUT, PTG0_OUT,
+	PTH7_OUT, PTH6_OUT, PTH5_OUT, PTH4_OUT,
+	PTH3_OUT, PTH2_OUT, PTH1_OUT, PTH0_OUT,
+	PTJ7_OUT, PTJ6_OUT, PTJ5_OUT,
+	PTJ3_OUT, PTJ2_OUT, PTJ1_OUT, PTJ0_OUT,
+	PTK7_OUT, PTK6_OUT, PTK5_OUT, PTK4_OUT,
+	PTK3_OUT, PTK2_OUT, PTK1_OUT, PTK0_OUT,
+	PTL7_OUT, PTL6_OUT, PTL5_OUT, PTL4_OUT,
+	PTL3_OUT, PTL2_OUT, PTL1_OUT, PTL0_OUT,
+	PTM7_OUT, PTM6_OUT, PTM5_OUT, PTM4_OUT,
+	PTM3_OUT, PTM2_OUT, PTM1_OUT, PTM0_OUT,
+	PTN7_OUT, PTN6_OUT, PTN5_OUT, PTN4_OUT,
+	PTN3_OUT, PTN2_OUT, PTN1_OUT, PTN0_OUT,
+	PTQ7_OUT, PTQ6_OUT, PTQ5_OUT, PTQ4_OUT,
+	PTQ3_OUT, PTQ2_OUT, PTQ1_OUT, PTQ0_OUT,
+	PTR7_OUT, PTR6_OUT, PTR5_OUT, PTR4_OUT,
+			    PTR1_OUT, PTR0_OUT,
+		  PTS6_OUT, PTS5_OUT, PTS4_OUT,
+	PTS3_OUT, PTS2_OUT, PTS1_OUT, PTS0_OUT,
+	PTT7_OUT, PTT6_OUT, PTT5_OUT, PTT4_OUT,
+	PTT3_OUT, PTT2_OUT, PTT1_OUT, PTT0_OUT,
+	PTU7_OUT, PTU6_OUT, PTU5_OUT, PTU4_OUT,
+	PTU3_OUT, PTU2_OUT, PTU1_OUT, PTU0_OUT,
+	PTV7_OUT, PTV6_OUT, PTV5_OUT, PTV4_OUT,
+	PTV3_OUT, PTV2_OUT, PTV1_OUT, PTV0_OUT,
+	PTW7_OUT, PTW6_OUT, PTW5_OUT, PTW4_OUT,
+	PTW3_OUT, PTW2_OUT, PTW1_OUT, PTW0_OUT,
+	PTX7_OUT, PTX6_OUT, PTX5_OUT, PTX4_OUT,
+	PTX3_OUT, PTX2_OUT, PTX1_OUT, PTX0_OUT,
+	PTY7_OUT, PTY6_OUT, PTY5_OUT, PTY4_OUT,
+	PTY3_OUT, PTY2_OUT, PTY1_OUT, PTY0_OUT,
+	PTZ7_OUT, PTZ6_OUT, PTZ5_OUT, PTZ4_OUT,
+	PTZ3_OUT, PTZ2_OUT, PTZ1_OUT, PTZ0_OUT,
+	PINMUX_OUTPUT_END,
+
+	PINMUX_FUNCTION_BEGIN,
+	PTA7_FN, PTA6_FN, PTA5_FN, PTA4_FN,
+	PTA3_FN, PTA2_FN, PTA1_FN, PTA0_FN,
+	PTB7_FN, PTB6_FN, PTB5_FN, PTB4_FN,
+	PTB3_FN, PTB2_FN, PTB1_FN, PTB0_FN,
+	PTC7_FN, PTC6_FN, PTC5_FN, PTC4_FN,
+	PTC3_FN, PTC2_FN, PTC1_FN, PTC0_FN,
+	PTD7_FN, PTD6_FN, PTD5_FN, PTD4_FN,
+	PTD3_FN, PTD2_FN, PTD1_FN, PTD0_FN,
+	PTE7_FN, PTE6_FN, PTE5_FN, PTE4_FN,
+	PTE3_FN, PTE2_FN, PTE1_FN, PTE0_FN,
+	PTF7_FN, PTF6_FN, PTF5_FN, PTF4_FN,
+	PTF3_FN, PTF2_FN, PTF1_FN, PTF0_FN,
+			  PTG5_FN, PTG4_FN,
+	PTG3_FN, PTG2_FN, PTG1_FN, PTG0_FN,
+	PTH7_FN, PTH6_FN, PTH5_FN, PTH4_FN,
+	PTH3_FN, PTH2_FN, PTH1_FN, PTH0_FN,
+	PTJ7_FN, PTJ6_FN, PTJ5_FN,
+	PTJ3_FN, PTJ2_FN, PTJ1_FN, PTJ0_FN,
+	PTK7_FN, PTK6_FN, PTK5_FN, PTK4_FN,
+	PTK3_FN, PTK2_FN, PTK1_FN, PTK0_FN,
+	PTL7_FN, PTL6_FN, PTL5_FN, PTL4_FN,
+	PTL3_FN, PTL2_FN, PTL1_FN, PTL0_FN,
+	PTM7_FN, PTM6_FN, PTM5_FN, PTM4_FN,
+	PTM3_FN, PTM2_FN, PTM1_FN, PTM0_FN,
+	PTN7_FN, PTN6_FN, PTN5_FN, PTN4_FN,
+	PTN3_FN, PTN2_FN, PTN1_FN, PTN0_FN,
+	PTQ7_FN, PTQ6_FN, PTQ5_FN, PTQ4_FN,
+	PTQ3_FN, PTQ2_FN, PTQ1_FN, PTQ0_FN,
+	PTR7_FN, PTR6_FN, PTR5_FN, PTR4_FN,
+	PTR3_FN, PTR2_FN, PTR1_FN, PTR0_FN,
+		 PTS6_FN, PTS5_FN, PTS4_FN,
+	PTS3_FN, PTS2_FN, PTS1_FN, PTS0_FN,
+	PTT7_FN, PTT6_FN, PTT5_FN, PTT4_FN,
+	PTT3_FN, PTT2_FN, PTT1_FN, PTT0_FN,
+	PTU7_FN, PTU6_FN, PTU5_FN, PTU4_FN,
+	PTU3_FN, PTU2_FN, PTU1_FN, PTU0_FN,
+	PTV7_FN, PTV6_FN, PTV5_FN, PTV4_FN,
+	PTV3_FN, PTV2_FN, PTV1_FN, PTV0_FN,
+	PTW7_FN, PTW6_FN, PTW5_FN, PTW4_FN,
+	PTW3_FN, PTW2_FN, PTW1_FN, PTW0_FN,
+	PTX7_FN, PTX6_FN, PTX5_FN, PTX4_FN,
+	PTX3_FN, PTX2_FN, PTX1_FN, PTX0_FN,
+	PTY7_FN, PTY6_FN, PTY5_FN, PTY4_FN,
+	PTY3_FN, PTY2_FN, PTY1_FN, PTY0_FN,
+	PTZ7_FN, PTZ6_FN, PTZ5_FN, PTZ4_FN,
+	PTZ3_FN, PTZ2_FN, PTZ1_FN, PTZ0_FN,
+
+
+	PSA15_0, PSA15_1,
+	PSA14_0, PSA14_1,
+	PSA13_0, PSA13_1,
+	PSA12_0, PSA12_1,
+	PSA10_0, PSA10_1,
+	PSA9_0,  PSA9_1,
+	PSA8_0,  PSA8_1,
+	PSA7_0,  PSA7_1,
+	PSA6_0,  PSA6_1,
+	PSA5_0,  PSA5_1,
+	PSA3_0,  PSA3_1,
+	PSA2_0,  PSA2_1,
+	PSA1_0,  PSA1_1,
+	PSA0_0,  PSA0_1,
+
+	PSB14_0, PSB14_1,
+	PSB13_0, PSB13_1,
+	PSB12_0, PSB12_1,
+	PSB11_0, PSB11_1,
+	PSB10_0, PSB10_1,
+	PSB9_0,  PSB9_1,
+	PSB8_0,  PSB8_1,
+	PSB7_0,  PSB7_1,
+	PSB6_0,  PSB6_1,
+	PSB5_0,  PSB5_1,
+	PSB4_0,  PSB4_1,
+	PSB3_0,  PSB3_1,
+	PSB2_0,  PSB2_1,
+	PSB1_0,  PSB1_1,
+	PSB0_0,  PSB0_1,
+
+	PSC15_0, PSC15_1,
+	PSC14_0, PSC14_1,
+	PSC13_0, PSC13_1,
+	PSC12_0, PSC12_1,
+	PSC11_0, PSC11_1,
+	PSC10_0, PSC10_1,
+	PSC9_0,  PSC9_1,
+	PSC8_0,  PSC8_1,
+	PSC7_0,  PSC7_1,
+	PSC6_0,  PSC6_1,
+	PSC5_0,  PSC5_1,
+	PSC4_0,  PSC4_1,
+	PSC2_0,  PSC2_1,
+	PSC1_0,  PSC1_1,
+	PSC0_0,  PSC0_1,
+
+	PSD15_0, PSD15_1,
+	PSD14_0, PSD14_1,
+	PSD13_0, PSD13_1,
+	PSD12_0, PSD12_1,
+	PSD11_0, PSD11_1,
+	PSD10_0, PSD10_1,
+	PSD9_0,  PSD9_1,
+	PSD8_0,  PSD8_1,
+	PSD7_0,  PSD7_1,
+	PSD6_0,  PSD6_1,
+	PSD5_0,  PSD5_1,
+	PSD4_0,  PSD4_1,
+	PSD3_0,  PSD3_1,
+	PSD2_0,  PSD2_1,
+	PSD1_0,  PSD1_1,
+	PSD0_0,  PSD0_1,
+
+	PSE15_0, PSE15_1,
+	PSE14_0, PSE14_1,
+	PSE13_0, PSE13_1,
+	PSE12_0, PSE12_1,
+	PSE11_0, PSE11_1,
+	PSE10_0, PSE10_1,
+	PSE9_0,  PSE9_1,
+	PSE8_0,  PSE8_1,
+	PSE7_0,  PSE7_1,
+	PSE6_0,  PSE6_1,
+	PSE5_0,  PSE5_1,
+	PSE4_0,  PSE4_1,
+	PSE3_0,  PSE3_1,
+	PSE2_0,  PSE2_1,
+	PSE1_0,  PSE1_1,
+	PSE0_0,  PSE0_1,
+	PINMUX_FUNCTION_END,
+
+	PINMUX_MARK_BEGIN,
+	/*PTA*/
+	D23_MARK,	KEYOUT2_MARK,		IDED15_MARK,
+	D22_MARK,	KEYOUT1_MARK,		IDED14_MARK,
+	D21_MARK,	KEYOUT0_MARK,		IDED13_MARK,
+	D20_MARK,	KEYIN4_MARK,		IDED12_MARK,
+	D19_MARK,	KEYIN3_MARK,		IDED11_MARK,
+	D18_MARK,	KEYIN2_MARK,		IDED10_MARK,
+	D17_MARK,	KEYIN1_MARK,		IDED9_MARK,
+	D16_MARK,	KEYIN0_MARK,		IDED8_MARK,
+
+	/*PTB*/
+	D31_MARK,	TPUTO1_MARK,		IDEA1_MARK,
+	D30_MARK,	TPUTO0_MARK,		IDEA0_MARK,
+	D29_MARK,				IODREQ_MARK,
+	D28_MARK,				IDECS0_MARK,
+	D27_MARK,				IDECS1_MARK,
+	D26_MARK,	KEYOUT5_IN5_MARK,	IDEIORD_MARK,
+	D25_MARK,	KEYOUT4_IN6_MARK,	IDEIOWR_MARK,
+	D24_MARK,	KEYOUT3_MARK,		IDEINT_MARK,
+
+	/*PTC*/
+	LCDD7_MARK,
+	LCDD6_MARK,
+	LCDD5_MARK,
+	LCDD4_MARK,
+	LCDD3_MARK,
+	LCDD2_MARK,
+	LCDD1_MARK,
+	LCDD0_MARK,
+
+	/*PTD*/
+	LCDD15_MARK,
+	LCDD14_MARK,
+	LCDD13_MARK,
+	LCDD12_MARK,
+	LCDD11_MARK,
+	LCDD10_MARK,
+	LCDD9_MARK,
+	LCDD8_MARK,
+
+	/*PTE*/
+	FSIMCKB_MARK,
+	FSIMCKA_MARK,
+	LCDD21_MARK,	SCIF2_L_TXD_MARK,
+	LCDD20_MARK,	SCIF4_SCK_MARK,
+	LCDD19_MARK,	SCIF4_RXD_MARK,
+	LCDD18_MARK,	SCIF4_TXD_MARK,
+	LCDD17_MARK,
+	LCDD16_MARK,
+
+	/*PTF*/
+	LCDVSYN_MARK,
+	LCDDISP_MARK,	LCDRS_MARK,
+	LCDHSYN_MARK,	LCDCS_MARK,
+	LCDDON_MARK,
+	LCDDCK_MARK,	LCDWR_MARK,
+	LCDVEPWC_MARK,	SCIF0_TXD_MARK,
+	LCDD23_MARK,	SCIF2_L_SCK_MARK,
+	LCDD22_MARK,	SCIF2_L_RXD_MARK,
+
+	/*PTG*/
+	AUDCK_MARK,
+	AUDSYNC_MARK,
+	AUDATA3_MARK,
+	AUDATA2_MARK,
+	AUDATA1_MARK,
+	AUDATA0_MARK,
+
+	/*PTH*/
+	VIO0_VD_MARK,
+	VIO0_CLK_MARK,
+	VIO0_D7_MARK,
+	VIO0_D6_MARK,
+	VIO0_D5_MARK,
+	VIO0_D4_MARK,
+	VIO0_D3_MARK,
+	VIO0_D2_MARK,
+
+	/*PTJ*/
+	PDSTATUS_MARK,
+	STATUS2_MARK,
+	STATUS0_MARK,
+	A25_MARK,		BS_MARK,
+	A24_MARK,
+	A23_MARK,
+	A22_MARK,
+
+	/*PTK*/
+	VIO1_D5_MARK,	VIO0_D13_MARK,	IDED5_MARK,
+	VIO1_D4_MARK,	VIO0_D12_MARK,	IDED4_MARK,
+	VIO1_D3_MARK,	VIO0_D11_MARK,	IDED3_MARK,
+	VIO1_D2_MARK,	VIO0_D10_MARK,	IDED2_MARK,
+	VIO1_D1_MARK,	VIO0_D9_MARK,	IDED1_MARK,
+	VIO1_D0_MARK,	VIO0_D8_MARK,	IDED0_MARK,
+	VIO0_FLD_MARK,
+	VIO0_HD_MARK,
+
+	/*PTL*/
+	DV_D5_MARK,	SCIF3_V_SCK_MARK,	RMII_RXD0_MARK,
+	DV_D4_MARK,	SCIF3_V_RXD_MARK,	RMII_RXD1_MARK,
+	DV_D3_MARK,	SCIF3_V_TXD_MARK,	RMII_REF_CLK_MARK,
+	DV_D2_MARK,	SCIF1_SCK_MARK,		RMII_TX_EN_MARK,
+	DV_D1_MARK,	SCIF1_RXD_MARK,		RMII_TXD0_MARK,
+	DV_D0_MARK,	SCIF1_TXD_MARK,		RMII_TXD1_MARK,
+	DV_D15_MARK,
+	DV_D14_MARK,	MSIOF0_MCK_MARK,
+
+	/*PTM*/
+	DV_D13_MARK,	MSIOF0_TSCK_MARK,
+	DV_D12_MARK,	MSIOF0_RXD_MARK,
+	DV_D11_MARK,	MSIOF0_TXD_MARK,
+	DV_D10_MARK,	MSIOF0_TSYNC_MARK,
+	DV_D9_MARK,	MSIOF0_SS1_MARK,	MSIOF0_RSCK_MARK,
+	DV_D8_MARK,	MSIOF0_SS2_MARK,	MSIOF0_RSYNC_MARK,
+	LCDVCPWC_MARK,	SCIF0_RXD_MARK,
+	LCDRD_MARK,	SCIF0_SCK_MARK,
+
+	/*PTN*/
+	VIO0_D1_MARK,
+	VIO0_D0_MARK,
+	DV_CLKI_MARK,
+	DV_CLK_MARK,	SCIF2_V_SCK_MARK,
+	DV_VSYNC_MARK,	SCIF2_V_RXD_MARK,
+	DV_HSYNC_MARK,	SCIF2_V_TXD_MARK,
+	DV_D7_MARK,	SCIF3_V_CTS_MARK,	RMII_RX_ER_MARK,
+	DV_D6_MARK,	SCIF3_V_RTS_MARK,	RMII_CRS_DV_MARK,
+
+	/*PTQ*/
+	D7_MARK,
+	D6_MARK,
+	D5_MARK,
+	D4_MARK,
+	D3_MARK,
+	D2_MARK,
+	D1_MARK,
+	D0_MARK,
+
+	/*PTR*/
+	CS6B_CE1B_MARK,
+	CS6A_CE2B_MARK,
+	CS5B_CE1A_MARK,
+	CS5A_CE2A_MARK,
+	IOIS16_MARK,		LCDLCLK_MARK,
+	WAIT_MARK,
+	WE3_ICIOWR_MARK,	TPUTO3_MARK,	TPUTI3_MARK,
+	WE2_ICIORD_MARK,	TPUTO2_MARK,	IDEA2_MARK,
+
+	/*PTS*/
+	VIO_CKO_MARK,
+	VIO1_FLD_MARK,	TPUTI2_MARK,		IDEIORDY_MARK,
+	VIO1_HD_MARK,	SCIF5_SCK_MARK,
+	VIO1_VD_MARK,	SCIF5_RXD_MARK,
+	VIO1_CLK_MARK,	SCIF5_TXD_MARK,
+	VIO1_D7_MARK,	VIO0_D15_MARK,		IDED7_MARK,
+	VIO1_D6_MARK,	VIO0_D14_MARK,		IDED6_MARK,
+
+	/*PTT*/
+	D15_MARK,
+	D14_MARK,
+	D13_MARK,
+	D12_MARK,
+	D11_MARK,
+	D10_MARK,
+	D9_MARK,
+	D8_MARK,
+
+	/*PTU*/
+	DMAC_DACK0_MARK,
+	DMAC_DREQ0_MARK,
+	FSIOASD_MARK,
+	FSIIABCK_MARK,
+	FSIIALRCK_MARK,
+	FSIOABCK_MARK,
+	FSIOALRCK_MARK,
+	CLKAUDIOAO_MARK,
+
+	/*PTV*/
+	FSIIBSD_MARK,		MSIOF1_SS2_MARK,	MSIOF1_RSYNC_MARK,
+	FSIOBSD_MARK,		MSIOF1_SS1_MARK,	MSIOF1_RSCK_MARK,
+	FSIIBBCK_MARK,		MSIOF1_RXD_MARK,
+	FSIIBLRCK_MARK,		MSIOF1_TSYNC_MARK,
+	FSIOBBCK_MARK,		MSIOF1_TSCK_MARK,
+	FSIOBLRCK_MARK,		MSIOF1_TXD_MARK,
+	CLKAUDIOBO_MARK,	MSIOF1_MCK_MARK,
+	FSIIASD_MARK,
+
+	/*PTW*/
+	MMC_D7_MARK,		SDHI1CD_MARK,		IODACK_MARK,
+	MMC_D6_MARK,		SDHI1WP_MARK,		IDERST_MARK,
+	MMC_D5_MARK,		SDHI1D3_MARK,		EXBUF_ENB_MARK,
+	MMC_D4_MARK,		SDHI1D2_MARK,		DIRECTION_MARK,
+	MMC_D3_MARK,		SDHI1D1_MARK,
+	MMC_D2_MARK,		SDHI1D0_MARK,
+	MMC_D1_MARK,		SDHI1CMD_MARK,
+	MMC_D0_MARK,		SDHI1CLK_MARK,
+
+	/*PTX*/
+	DMAC_DACK1_MARK,	IRDA_OUT_MARK,
+	DMAC_DREQ1_MARK,	IRDA_IN_MARK,
+	TSIF_TS0_SDAT_MARK,				LNKSTA_MARK,
+	TSIF_TS0_SCK_MARK,				MDIO_MARK,
+	TSIF_TS0_SDEN_MARK,				MDC_MARK,
+	TSIF_TS0_SPSYNC_MARK,
+	MMC_CLK_MARK,
+	MMC_CMD_MARK,
+
+	/*PTY*/
+	SDHI0CD_MARK,
+	SDHI0WP_MARK,
+	SDHI0D3_MARK,
+	SDHI0D2_MARK,
+	SDHI0D1_MARK,
+	SDHI0D0_MARK,
+	SDHI0CMD_MARK,
+	SDHI0CLK_MARK,
+
+	/*PTZ*/
+	INTC_IRQ7_MARK,		SCIF3_I_CTS_MARK,
+	INTC_IRQ6_MARK,		SCIF3_I_RTS_MARK,
+	INTC_IRQ5_MARK,		SCIF3_I_SCK_MARK,
+	INTC_IRQ4_MARK,		SCIF3_I_RXD_MARK,
+	INTC_IRQ3_MARK,		SCIF3_I_TXD_MARK,
+	INTC_IRQ2_MARK,
+	INTC_IRQ1_MARK,
+	INTC_IRQ0_MARK,
+	PINMUX_MARK_END,
+};
+
+static pinmux_enum_t pinmux_data[] = {
+	/* PTA GPIO */
+	PINMUX_DATA(PTA7_DATA, PTA7_IN, PTA7_OUT, PTA7_IN_PU),
+	PINMUX_DATA(PTA6_DATA, PTA6_IN, PTA6_OUT, PTA6_IN_PU),
+	PINMUX_DATA(PTA5_DATA, PTA5_IN, PTA5_OUT, PTA5_IN_PU),
+	PINMUX_DATA(PTA4_DATA, PTA4_IN, PTA4_OUT, PTA4_IN_PU),
+	PINMUX_DATA(PTA3_DATA, PTA3_IN, PTA3_OUT, PTA3_IN_PU),
+	PINMUX_DATA(PTA2_DATA, PTA2_IN, PTA2_OUT, PTA2_IN_PU),
+	PINMUX_DATA(PTA1_DATA, PTA1_IN, PTA1_OUT, PTA1_IN_PU),
+	PINMUX_DATA(PTA0_DATA, PTA0_IN, PTA0_OUT, PTA0_IN_PU),
+
+	/* PTB GPIO */
+	PINMUX_DATA(PTB7_DATA, PTB7_IN, PTB7_OUT, PTB7_IN_PU),
+	PINMUX_DATA(PTB6_DATA, PTB6_IN, PTB6_OUT, PTB6_IN_PU),
+	PINMUX_DATA(PTB5_DATA, PTB5_IN, PTB5_OUT, PTB5_IN_PU),
+	PINMUX_DATA(PTB4_DATA, PTB4_IN, PTB4_OUT, PTB4_IN_PU),
+	PINMUX_DATA(PTB3_DATA, PTB3_IN, PTB3_OUT, PTB3_IN_PU),
+	PINMUX_DATA(PTB2_DATA, PTB2_IN, PTB2_OUT, PTB2_IN_PU),
+	PINMUX_DATA(PTB1_DATA, PTB1_IN, PTB1_OUT, PTB1_IN_PU),
+	PINMUX_DATA(PTB0_DATA, PTB0_IN, PTB0_OUT, PTB0_IN_PU),
+
+	/* PTC GPIO */
+	PINMUX_DATA(PTC7_DATA, PTC7_IN, PTC7_OUT, PTC7_IN_PU),
+	PINMUX_DATA(PTC6_DATA, PTC6_IN, PTC6_OUT, PTC6_IN_PU),
+	PINMUX_DATA(PTC5_DATA, PTC5_IN, PTC5_OUT, PTC5_IN_PU),
+	PINMUX_DATA(PTC4_DATA, PTC4_IN, PTC4_OUT, PTC4_IN_PU),
+	PINMUX_DATA(PTC3_DATA, PTC3_IN, PTC3_OUT, PTC3_IN_PU),
+	PINMUX_DATA(PTC2_DATA, PTC2_IN, PTC2_OUT, PTC2_IN_PU),
+	PINMUX_DATA(PTC1_DATA, PTC1_IN, PTC1_OUT, PTC1_IN_PU),
+	PINMUX_DATA(PTC0_DATA, PTC0_IN, PTC0_OUT, PTC0_IN_PU),
+
+	/* PTD GPIO */
+	PINMUX_DATA(PTD7_DATA, PTD7_IN, PTD7_OUT, PTD7_IN_PU),
+	PINMUX_DATA(PTD6_DATA, PTD6_IN, PTD6_OUT, PTD6_IN_PU),
+	PINMUX_DATA(PTD5_DATA, PTD5_IN, PTD5_OUT, PTD5_IN_PU),
+	PINMUX_DATA(PTD4_DATA, PTD4_IN, PTD4_OUT, PTD4_IN_PU),
+	PINMUX_DATA(PTD3_DATA, PTD3_IN, PTD3_OUT, PTD3_IN_PU),
+	PINMUX_DATA(PTD2_DATA, PTD2_IN, PTD2_OUT, PTD2_IN_PU),
+	PINMUX_DATA(PTD1_DATA, PTD1_IN, PTD1_OUT, PTD1_IN_PU),
+	PINMUX_DATA(PTD0_DATA, PTD0_IN, PTD0_OUT, PTD0_IN_PU),
+
+	/* PTE GPIO */
+	PINMUX_DATA(PTE7_DATA, PTE7_IN, PTE7_OUT, PTE7_IN_PU),
+	PINMUX_DATA(PTE6_DATA, PTE6_IN, PTE6_OUT, PTE6_IN_PU),
+	PINMUX_DATA(PTE5_DATA, PTE5_IN, PTE5_OUT, PTE5_IN_PU),
+	PINMUX_DATA(PTE4_DATA, PTE4_IN, PTE4_OUT, PTE4_IN_PU),
+	PINMUX_DATA(PTE3_DATA, PTE3_IN, PTE3_OUT, PTE3_IN_PU),
+	PINMUX_DATA(PTE2_DATA, PTE2_IN, PTE2_OUT, PTE2_IN_PU),
+	PINMUX_DATA(PTE1_DATA, PTE1_IN, PTE1_OUT, PTE1_IN_PU),
+	PINMUX_DATA(PTE0_DATA, PTE0_IN, PTE0_OUT, PTE0_IN_PU),
+
+	/* PTF GPIO */
+	PINMUX_DATA(PTF7_DATA, PTF7_IN, PTF7_OUT, PTF7_IN_PU),
+	PINMUX_DATA(PTF6_DATA, PTF6_IN, PTF6_OUT, PTF6_IN_PU),
+	PINMUX_DATA(PTF5_DATA, PTF5_IN, PTF5_OUT, PTF5_IN_PU),
+	PINMUX_DATA(PTF4_DATA, PTF4_IN, PTF4_OUT, PTF4_IN_PU),
+	PINMUX_DATA(PTF3_DATA, PTF3_IN, PTF3_OUT, PTF3_IN_PU),
+	PINMUX_DATA(PTF2_DATA, PTF2_IN, PTF2_OUT, PTF2_IN_PU),
+	PINMUX_DATA(PTF1_DATA, PTF1_IN, PTF1_OUT, PTF1_IN_PU),
+	PINMUX_DATA(PTF0_DATA, PTF0_IN, PTF0_OUT, PTF0_IN_PU),
+
+	/* PTG GPIO */
+	PINMUX_DATA(PTG5_DATA, PTG5_OUT),
+	PINMUX_DATA(PTG4_DATA, PTG4_OUT),
+	PINMUX_DATA(PTG3_DATA, PTG3_OUT),
+	PINMUX_DATA(PTG2_DATA, PTG2_OUT),
+	PINMUX_DATA(PTG1_DATA, PTG1_OUT),
+	PINMUX_DATA(PTG0_DATA, PTG0_OUT),
+
+	/* PTH GPIO */
+	PINMUX_DATA(PTH7_DATA, PTH7_IN, PTH7_OUT, PTH7_IN_PU),
+	PINMUX_DATA(PTH6_DATA, PTH6_IN, PTH6_OUT, PTH6_IN_PU),
+	PINMUX_DATA(PTH5_DATA, PTH5_IN, PTH5_OUT, PTH5_IN_PU),
+	PINMUX_DATA(PTH4_DATA, PTH4_IN, PTH4_OUT, PTH4_IN_PU),
+	PINMUX_DATA(PTH3_DATA, PTH3_IN, PTH3_OUT, PTH3_IN_PU),
+	PINMUX_DATA(PTH2_DATA, PTH2_IN, PTH2_OUT, PTH2_IN_PU),
+	PINMUX_DATA(PTH1_DATA, PTH1_IN, PTH1_OUT, PTH1_IN_PU),
+	PINMUX_DATA(PTH0_DATA, PTH0_IN, PTH0_OUT, PTH0_IN_PU),
+
+	/* PTJ GPIO */
+	PINMUX_DATA(PTJ7_DATA, PTJ7_OUT),
+	PINMUX_DATA(PTJ6_DATA, PTJ6_OUT),
+	PINMUX_DATA(PTJ5_DATA, PTJ5_OUT),
+	PINMUX_DATA(PTJ3_DATA, PTJ3_IN, PTJ3_OUT, PTJ3_IN_PU),
+	PINMUX_DATA(PTJ2_DATA, PTJ2_IN, PTJ2_OUT, PTJ2_IN_PU),
+	PINMUX_DATA(PTJ1_DATA, PTJ1_IN, PTJ1_OUT, PTJ1_IN_PU),
+	PINMUX_DATA(PTJ0_DATA, PTJ0_IN, PTJ0_OUT, PTJ0_IN_PU),
+
+	/* PTK GPIO */
+	PINMUX_DATA(PTK7_DATA, PTK7_IN, PTK7_OUT, PTK7_IN_PU),
+	PINMUX_DATA(PTK6_DATA, PTK6_IN, PTK6_OUT, PTK6_IN_PU),
+	PINMUX_DATA(PTK5_DATA, PTK5_IN, PTK5_OUT, PTK5_IN_PU),
+	PINMUX_DATA(PTK4_DATA, PTK4_IN, PTK4_OUT, PTK4_IN_PU),
+	PINMUX_DATA(PTK3_DATA, PTK3_IN, PTK3_OUT, PTK3_IN_PU),
+	PINMUX_DATA(PTK2_DATA, PTK2_IN, PTK2_OUT, PTK2_IN_PU),
+	PINMUX_DATA(PTK1_DATA, PTK1_IN, PTK1_OUT, PTK1_IN_PU),
+	PINMUX_DATA(PTK0_DATA, PTK0_IN, PTK0_OUT, PTK0_IN_PU),
+
+	/* PTL GPIO */
+	PINMUX_DATA(PTL7_DATA, PTL7_IN, PTL7_OUT, PTL7_IN_PU),
+	PINMUX_DATA(PTL6_DATA, PTL6_IN, PTL6_OUT, PTL6_IN_PU),
+	PINMUX_DATA(PTL5_DATA, PTL5_IN, PTL5_OUT, PTL5_IN_PU),
+	PINMUX_DATA(PTL4_DATA, PTL4_IN, PTL4_OUT, PTL4_IN_PU),
+	PINMUX_DATA(PTL3_DATA, PTL3_IN, PTL3_OUT, PTL3_IN_PU),
+	PINMUX_DATA(PTL2_DATA, PTL2_IN, PTL2_OUT, PTL2_IN_PU),
+	PINMUX_DATA(PTL1_DATA, PTL1_IN, PTL1_OUT, PTL1_IN_PU),
+	PINMUX_DATA(PTL0_DATA, PTL0_IN, PTL0_OUT, PTL0_IN_PU),
+
+	/* PTM GPIO */
+	PINMUX_DATA(PTM7_DATA, PTM7_IN, PTM7_OUT, PTM7_IN_PU),
+	PINMUX_DATA(PTM6_DATA, PTM6_IN, PTM6_OUT, PTM6_IN_PU),
+	PINMUX_DATA(PTM5_DATA, PTM5_IN, PTM5_OUT, PTM5_IN_PU),
+	PINMUX_DATA(PTM4_DATA, PTM4_IN, PTM4_OUT, PTM4_IN_PU),
+	PINMUX_DATA(PTM3_DATA, PTM3_IN, PTM3_OUT, PTM3_IN_PU),
+	PINMUX_DATA(PTM2_DATA, PTM2_IN, PTM2_OUT, PTM2_IN_PU),
+	PINMUX_DATA(PTM1_DATA, PTM1_IN, PTM1_OUT, PTM1_IN_PU),
+	PINMUX_DATA(PTM0_DATA, PTM0_IN, PTM0_OUT, PTM0_IN_PU),
+
+	/* PTN GPIO */
+	PINMUX_DATA(PTN7_DATA, PTN7_IN, PTN7_OUT, PTN7_IN_PU),
+	PINMUX_DATA(PTN6_DATA, PTN6_IN, PTN6_OUT, PTN6_IN_PU),
+	PINMUX_DATA(PTN5_DATA, PTN5_IN, PTN5_OUT, PTN5_IN_PU),
+	PINMUX_DATA(PTN4_DATA, PTN4_IN, PTN4_OUT, PTN4_IN_PU),
+	PINMUX_DATA(PTN3_DATA, PTN3_IN, PTN3_OUT, PTN3_IN_PU),
+	PINMUX_DATA(PTN2_DATA, PTN2_IN, PTN2_OUT, PTN2_IN_PU),
+	PINMUX_DATA(PTN1_DATA, PTN1_IN, PTN1_OUT, PTN1_IN_PU),
+	PINMUX_DATA(PTN0_DATA, PTN0_IN, PTN0_OUT, PTN0_IN_PU),
+
+	/* PTQ GPIO */
+	PINMUX_DATA(PTQ7_DATA, PTQ7_IN, PTQ7_OUT, PTQ7_IN_PU),
+	PINMUX_DATA(PTQ6_DATA, PTQ6_IN, PTQ6_OUT, PTQ6_IN_PU),
+	PINMUX_DATA(PTQ5_DATA, PTQ5_IN, PTQ5_OUT, PTQ5_IN_PU),
+	PINMUX_DATA(PTQ4_DATA, PTQ4_IN, PTQ4_OUT, PTQ4_IN_PU),
+	PINMUX_DATA(PTQ3_DATA, PTQ3_IN, PTQ3_OUT, PTQ3_IN_PU),
+	PINMUX_DATA(PTQ2_DATA, PTQ2_IN, PTQ2_OUT, PTQ2_IN_PU),
+	PINMUX_DATA(PTQ1_DATA, PTQ1_IN, PTQ1_OUT, PTQ1_IN_PU),
+	PINMUX_DATA(PTQ0_DATA, PTQ0_IN, PTQ0_OUT, PTQ0_IN_PU),
+
+	/* PTR GPIO */
+	PINMUX_DATA(PTR7_DATA, PTR7_IN, PTR7_OUT, PTR7_IN_PU),
+	PINMUX_DATA(PTR6_DATA, PTR6_IN, PTR6_OUT, PTR6_IN_PU),
+	PINMUX_DATA(PTR5_DATA, PTR5_IN, PTR5_OUT, PTR5_IN_PU),
+	PINMUX_DATA(PTR4_DATA, PTR4_IN, PTR4_OUT, PTR4_IN_PU),
+	PINMUX_DATA(PTR3_DATA, PTR3_IN,           PTR3_IN_PU),
+	PINMUX_DATA(PTR2_DATA, PTR2_IN,           PTR2_IN_PU),
+	PINMUX_DATA(PTR1_DATA, PTR1_IN, PTR1_OUT, PTR1_IN_PU),
+	PINMUX_DATA(PTR0_DATA, PTR0_IN, PTR0_OUT, PTR0_IN_PU),
+
+	/* PTS GPIO */
+	PINMUX_DATA(PTS6_DATA, PTS6_IN, PTS6_OUT, PTS6_IN_PU),
+	PINMUX_DATA(PTS5_DATA, PTS5_IN, PTS5_OUT, PTS5_IN_PU),
+	PINMUX_DATA(PTS4_DATA, PTS4_IN, PTS4_OUT, PTS4_IN_PU),
+	PINMUX_DATA(PTS3_DATA, PTS3_IN, PTS3_OUT, PTS3_IN_PU),
+	PINMUX_DATA(PTS2_DATA, PTS2_IN, PTS2_OUT, PTS2_IN_PU),
+	PINMUX_DATA(PTS1_DATA, PTS1_IN, PTS1_OUT, PTS1_IN_PU),
+	PINMUX_DATA(PTS0_DATA, PTS0_IN, PTS0_OUT, PTS0_IN_PU),
+
+	/* PTT GPIO */
+	PINMUX_DATA(PTT7_DATA, PTT7_IN, PTT7_OUT, PTT7_IN_PU),
+	PINMUX_DATA(PTT6_DATA, PTT6_IN, PTT6_OUT, PTT6_IN_PU),
+	PINMUX_DATA(PTT5_DATA, PTT5_IN, PTT5_OUT, PTT5_IN_PU),
+	PINMUX_DATA(PTT4_DATA, PTT4_IN, PTT4_OUT, PTT4_IN_PU),
+	PINMUX_DATA(PTT3_DATA, PTT3_IN, PTT3_OUT, PTT3_IN_PU),
+	PINMUX_DATA(PTT2_DATA, PTT2_IN, PTT2_OUT, PTT2_IN_PU),
+	PINMUX_DATA(PTT1_DATA, PTT1_IN, PTT1_OUT, PTT1_IN_PU),
+	PINMUX_DATA(PTT0_DATA, PTT0_IN, PTT0_OUT, PTT0_IN_PU),
+
+	/* PTU GPIO */
+	PINMUX_DATA(PTU7_DATA, PTU7_IN, PTU7_OUT, PTU7_IN_PU),
+	PINMUX_DATA(PTU6_DATA, PTU6_IN, PTU6_OUT, PTU6_IN_PU),
+	PINMUX_DATA(PTU5_DATA, PTU5_IN, PTU5_OUT, PTU5_IN_PU),
+	PINMUX_DATA(PTU4_DATA, PTU4_IN, PTU4_OUT, PTU4_IN_PU),
+	PINMUX_DATA(PTU3_DATA, PTU3_IN, PTU3_OUT, PTU3_IN_PU),
+	PINMUX_DATA(PTU2_DATA, PTU2_IN, PTU2_OUT, PTU2_IN_PU),
+	PINMUX_DATA(PTU1_DATA, PTU1_IN, PTU1_OUT, PTU1_IN_PU),
+	PINMUX_DATA(PTU0_DATA, PTU0_IN, PTU0_OUT, PTU0_IN_PU),
+
+	/* PTV GPIO */
+	PINMUX_DATA(PTV7_DATA, PTV7_IN, PTV7_OUT, PTV7_IN_PU),
+	PINMUX_DATA(PTV6_DATA, PTV6_IN, PTV6_OUT, PTV6_IN_PU),
+	PINMUX_DATA(PTV5_DATA, PTV5_IN, PTV5_OUT, PTV5_IN_PU),
+	PINMUX_DATA(PTV4_DATA, PTV4_IN, PTV4_OUT, PTV4_IN_PU),
+	PINMUX_DATA(PTV3_DATA, PTV3_IN, PTV3_OUT, PTV3_IN_PU),
+	PINMUX_DATA(PTV2_DATA, PTV2_IN, PTV2_OUT, PTV2_IN_PU),
+	PINMUX_DATA(PTV1_DATA, PTV1_IN, PTV1_OUT, PTV1_IN_PU),
+	PINMUX_DATA(PTV0_DATA, PTV0_IN, PTV0_OUT, PTV0_IN_PU),
+
+	/* PTW GPIO */
+	PINMUX_DATA(PTW7_DATA, PTW7_IN, PTW7_OUT, PTW7_IN_PU),
+	PINMUX_DATA(PTW6_DATA, PTW6_IN, PTW6_OUT, PTW6_IN_PU),
+	PINMUX_DATA(PTW5_DATA, PTW5_IN, PTW5_OUT, PTW5_IN_PU),
+	PINMUX_DATA(PTW4_DATA, PTW4_IN, PTW4_OUT, PTW4_IN_PU),
+	PINMUX_DATA(PTW3_DATA, PTW3_IN, PTW3_OUT, PTW3_IN_PU),
+	PINMUX_DATA(PTW2_DATA, PTW2_IN, PTW2_OUT, PTW2_IN_PU),
+	PINMUX_DATA(PTW1_DATA, PTW1_IN, PTW1_OUT, PTW1_IN_PU),
+	PINMUX_DATA(PTW0_DATA, PTW0_IN, PTW0_OUT, PTW0_IN_PU),
+
+	/* PTX GPIO */
+	PINMUX_DATA(PTX7_DATA, PTX7_IN, PTX7_OUT, PTX7_IN_PU),
+	PINMUX_DATA(PTX6_DATA, PTX6_IN, PTX6_OUT, PTX6_IN_PU),
+	PINMUX_DATA(PTX5_DATA, PTX5_IN, PTX5_OUT, PTX5_IN_PU),
+	PINMUX_DATA(PTX4_DATA, PTX4_IN, PTX4_OUT, PTX4_IN_PU),
+	PINMUX_DATA(PTX3_DATA, PTX3_IN, PTX3_OUT, PTX3_IN_PU),
+	PINMUX_DATA(PTX2_DATA, PTX2_IN, PTX2_OUT, PTX2_IN_PU),
+	PINMUX_DATA(PTX1_DATA, PTX1_IN, PTX1_OUT, PTX1_IN_PU),
+	PINMUX_DATA(PTX0_DATA, PTX0_IN, PTX0_OUT, PTX0_IN_PU),
+
+	/* PTY GPIO */
+	PINMUX_DATA(PTY7_DATA, PTY7_IN, PTY7_OUT, PTY7_IN_PU),
+	PINMUX_DATA(PTY6_DATA, PTY6_IN, PTY6_OUT, PTY6_IN_PU),
+	PINMUX_DATA(PTY5_DATA, PTY5_IN, PTY5_OUT, PTY5_IN_PU),
+	PINMUX_DATA(PTY4_DATA, PTY4_IN, PTY4_OUT, PTY4_IN_PU),
+	PINMUX_DATA(PTY3_DATA, PTY3_IN, PTY3_OUT, PTY3_IN_PU),
+	PINMUX_DATA(PTY2_DATA, PTY2_IN, PTY2_OUT, PTY2_IN_PU),
+	PINMUX_DATA(PTY1_DATA, PTY1_IN, PTY1_OUT, PTY1_IN_PU),
+	PINMUX_DATA(PTY0_DATA, PTY0_IN, PTY0_OUT, PTY0_IN_PU),
+
+	/* PTZ GPIO */
+	PINMUX_DATA(PTZ7_DATA, PTZ7_IN, PTZ7_OUT, PTZ7_IN_PU),
+	PINMUX_DATA(PTZ6_DATA, PTZ6_IN, PTZ6_OUT, PTZ6_IN_PU),
+	PINMUX_DATA(PTZ5_DATA, PTZ5_IN, PTZ5_OUT, PTZ5_IN_PU),
+	PINMUX_DATA(PTZ4_DATA, PTZ4_IN, PTZ4_OUT, PTZ4_IN_PU),
+	PINMUX_DATA(PTZ3_DATA, PTZ3_IN, PTZ3_OUT, PTZ3_IN_PU),
+	PINMUX_DATA(PTZ2_DATA, PTZ2_IN, PTZ2_OUT, PTZ2_IN_PU),
+	PINMUX_DATA(PTZ1_DATA, PTZ1_IN, PTZ1_OUT, PTZ1_IN_PU),
+	PINMUX_DATA(PTZ0_DATA, PTZ0_IN, PTZ0_OUT, PTZ0_IN_PU),
+
+	/* PTA FN */
+	PINMUX_DATA(D23_MARK,	PSA15_0, PSA14_0, PTA7_FN),
+	PINMUX_DATA(D22_MARK,	PSA15_0, PSA14_0, PTA6_FN),
+	PINMUX_DATA(D21_MARK,	PSA15_0, PSA14_0, PTA5_FN),
+	PINMUX_DATA(D20_MARK,	PSA15_0, PSA14_0, PTA4_FN),
+	PINMUX_DATA(D19_MARK,	PSA15_0, PSA14_0, PTA3_FN),
+	PINMUX_DATA(D18_MARK,	PSA15_0, PSA14_0, PTA2_FN),
+	PINMUX_DATA(D17_MARK,	PSA15_0, PSA14_0, PTA1_FN),
+	PINMUX_DATA(D16_MARK,	PSA15_0, PSA14_0, PTA0_FN),
+
+	PINMUX_DATA(KEYOUT2_MARK,	PSA15_0, PSA14_1, PTA7_FN),
+	PINMUX_DATA(KEYOUT1_MARK,	PSA15_0, PSA14_1, PTA6_FN),
+	PINMUX_DATA(KEYOUT0_MARK,	PSA15_0, PSA14_1, PTA5_FN),
+	PINMUX_DATA(KEYIN4_MARK,	PSA15_0, PSA14_1, PTA4_FN),
+	PINMUX_DATA(KEYIN3_MARK,	PSA15_0, PSA14_1, PTA3_FN),
+	PINMUX_DATA(KEYIN2_MARK,	PSA15_0, PSA14_1, PTA2_FN),
+	PINMUX_DATA(KEYIN1_MARK,	PSA15_0, PSA14_1, PTA1_FN),
+	PINMUX_DATA(KEYIN0_MARK,	PSA15_0, PSA14_1, PTA0_FN),
+
+	PINMUX_DATA(IDED15_MARK,	PSA15_1, PSA14_0, PTA7_FN),
+	PINMUX_DATA(IDED14_MARK,	PSA15_1, PSA14_0, PTA6_FN),
+	PINMUX_DATA(IDED13_MARK,	PSA15_1, PSA14_0, PTA5_FN),
+	PINMUX_DATA(IDED12_MARK,	PSA15_1, PSA14_0, PTA4_FN),
+	PINMUX_DATA(IDED11_MARK,	PSA15_1, PSA14_0, PTA3_FN),
+	PINMUX_DATA(IDED10_MARK,	PSA15_1, PSA14_0, PTA2_FN),
+	PINMUX_DATA(IDED9_MARK,		PSA15_1, PSA14_0, PTA1_FN),
+	PINMUX_DATA(IDED8_MARK,		PSA15_1, PSA14_0, PTA0_FN),
+
+	/* PTB FN */
+	PINMUX_DATA(D31_MARK,		PSE15_0, PSE14_0, PTB7_FN),
+	PINMUX_DATA(D30_MARK,		PSE15_0, PSE14_0, PTB6_FN),
+	PINMUX_DATA(D29_MARK,		PSE11_0,          PTB5_FN),
+	PINMUX_DATA(D28_MARK,		PSE11_0,          PTB4_FN),
+	PINMUX_DATA(D27_MARK,		PSE11_0,          PTB3_FN),
+	PINMUX_DATA(D26_MARK,		PSA15_0, PSA14_0, PTB2_FN),
+	PINMUX_DATA(D25_MARK,		PSA15_0, PSA14_0, PTB1_FN),
+	PINMUX_DATA(D24_MARK,		PSA15_0, PSA14_0, PTB0_FN),
+
+	PINMUX_DATA(IDEA1_MARK,		PSE15_1, PSE14_0, PTB7_FN),
+	PINMUX_DATA(IDEA0_MARK,		PSE15_1, PSE14_0, PTB6_FN),
+	PINMUX_DATA(IODREQ_MARK,	PSE11_1,          PTB5_FN),
+	PINMUX_DATA(IDECS0_MARK,	PSE11_1,          PTB4_FN),
+	PINMUX_DATA(IDECS1_MARK,	PSE11_1,          PTB3_FN),
+	PINMUX_DATA(IDEIORD_MARK,	PSA15_1, PSA14_0, PTB2_FN),
+	PINMUX_DATA(IDEIOWR_MARK,	PSA15_1, PSA14_0, PTB1_FN),
+	PINMUX_DATA(IDEINT_MARK,	PSA15_1, PSA14_0, PTB0_FN),
+
+	PINMUX_DATA(TPUTO1_MARK,	PSE15_0, PSE14_1, PTB7_FN),
+	PINMUX_DATA(TPUTO0_MARK,	PSE15_0, PSE14_1, PTB6_FN),
+
+	PINMUX_DATA(KEYOUT5_IN5_MARK,	PSA15_0, PSA14_1, PTB2_FN),
+	PINMUX_DATA(KEYOUT4_IN6_MARK,	PSA15_0, PSA14_1, PTB1_FN),
+	PINMUX_DATA(KEYOUT3_MARK,	PSA15_0, PSA14_1, PTB0_FN),
+
+	/* PTC FN */
+	PINMUX_DATA(LCDD7_MARK, PSD5_0, PTC7_FN),
+	PINMUX_DATA(LCDD6_MARK, PSD5_0, PTC6_FN),
+	PINMUX_DATA(LCDD5_MARK, PSD5_0, PTC5_FN),
+	PINMUX_DATA(LCDD4_MARK, PSD5_0, PTC4_FN),
+	PINMUX_DATA(LCDD3_MARK, PSD5_0, PTC3_FN),
+	PINMUX_DATA(LCDD2_MARK, PSD5_0, PTC2_FN),
+	PINMUX_DATA(LCDD1_MARK, PSD5_0, PTC1_FN),
+	PINMUX_DATA(LCDD0_MARK, PSD5_0, PTC0_FN),
+
+	/* PTD FN */
+	PINMUX_DATA(LCDD15_MARK, PSD5_0, PTD7_FN),
+	PINMUX_DATA(LCDD14_MARK, PSD5_0, PTD6_FN),
+	PINMUX_DATA(LCDD13_MARK, PSD5_0, PTD5_FN),
+	PINMUX_DATA(LCDD12_MARK, PSD5_0, PTD4_FN),
+	PINMUX_DATA(LCDD11_MARK, PSD5_0, PTD3_FN),
+	PINMUX_DATA(LCDD10_MARK, PSD5_0, PTD2_FN),
+	PINMUX_DATA(LCDD9_MARK,  PSD5_0, PTD1_FN),
+	PINMUX_DATA(LCDD8_MARK,  PSD5_0, PTD0_FN),
+
+	/* PTE FN */
+	PINMUX_DATA(FSIMCKB_MARK, PTE7_FN),
+	PINMUX_DATA(FSIMCKA_MARK, PTE6_FN),
+
+	PINMUX_DATA(LCDD21_MARK,	PSC5_0, PSC4_0, PTE5_FN),
+	PINMUX_DATA(LCDD20_MARK,	PSD3_0, PSD2_0, PTE4_FN),
+	PINMUX_DATA(LCDD19_MARK,	PSA3_0, PSA2_0, PTE3_FN),
+	PINMUX_DATA(LCDD18_MARK,	PSA3_0, PSA2_0, PTE2_FN),
+	PINMUX_DATA(LCDD17_MARK,	PSD5_0,         PTE1_FN),
+	PINMUX_DATA(LCDD16_MARK,	PSD5_0,         PTE0_FN),
+
+	PINMUX_DATA(SCIF2_L_TXD_MARK,	PSC5_0, PSC4_1, PTE5_FN),
+	PINMUX_DATA(SCIF4_SCK_MARK,	PSD3_0, PSD2_1, PTE4_FN),
+	PINMUX_DATA(SCIF4_RXD_MARK,	PSA3_0, PSA2_1, PTE3_FN),
+	PINMUX_DATA(SCIF4_TXD_MARK,	PSA3_0, PSA2_1, PTE2_FN),
+
+	/* PTF FN */
+	PINMUX_DATA(LCDVSYN_MARK,	PSD8_0,          PTF7_FN),
+	PINMUX_DATA(LCDDISP_MARK,	PSD10_0, PSD9_0, PTF6_FN),
+	PINMUX_DATA(LCDHSYN_MARK,	PSD10_0, PSD9_0, PTF5_FN),
+	PINMUX_DATA(LCDDON_MARK,	PSD8_0,          PTF4_FN),
+	PINMUX_DATA(LCDDCK_MARK,	PSD10_0, PSD9_0, PTF3_FN),
+	PINMUX_DATA(LCDVEPWC_MARK,	PSA6_0,          PTF2_FN),
+	PINMUX_DATA(LCDD23_MARK,	PSC7_0,  PSC6_0, PTF1_FN),
+	PINMUX_DATA(LCDD22_MARK,	PSC5_0,  PSC4_0, PTF0_FN),
+
+	PINMUX_DATA(LCDRS_MARK,		PSD10_0, PSD9_1, PTF6_FN),
+	PINMUX_DATA(LCDCS_MARK,		PSD10_0, PSD9_1, PTF5_FN),
+	PINMUX_DATA(LCDWR_MARK,		PSD10_0, PSD9_1, PTF3_FN),
+
+	PINMUX_DATA(SCIF0_TXD_MARK,	PSA6_1,          PTF2_FN),
+	PINMUX_DATA(SCIF2_L_SCK_MARK,	PSC7_0,  PSC6_1, PTF1_FN),
+	PINMUX_DATA(SCIF2_L_RXD_MARK,	PSC5_0,  PSC4_1, PTF0_FN),
+
+	/* PTG FN */
+	PINMUX_DATA(AUDCK_MARK,   PTG5_FN),
+	PINMUX_DATA(AUDSYNC_MARK, PTG4_FN),
+	PINMUX_DATA(AUDATA3_MARK, PTG3_FN),
+	PINMUX_DATA(AUDATA2_MARK, PTG2_FN),
+	PINMUX_DATA(AUDATA1_MARK, PTG1_FN),
+	PINMUX_DATA(AUDATA0_MARK, PTG0_FN),
+
+	/* PTH FN */
+	PINMUX_DATA(VIO0_VD_MARK,  PTH7_FN),
+	PINMUX_DATA(VIO0_CLK_MARK, PTH6_FN),
+	PINMUX_DATA(VIO0_D7_MARK,  PTH5_FN),
+	PINMUX_DATA(VIO0_D6_MARK,  PTH4_FN),
+	PINMUX_DATA(VIO0_D5_MARK,  PTH3_FN),
+	PINMUX_DATA(VIO0_D4_MARK,  PTH2_FN),
+	PINMUX_DATA(VIO0_D3_MARK,  PTH1_FN),
+	PINMUX_DATA(VIO0_D2_MARK,  PTH0_FN),
+
+	/* PTJ FN */
+	PINMUX_DATA(PDSTATUS_MARK,	PTJ7_FN),
+	PINMUX_DATA(STATUS2_MARK,	PTJ6_FN),
+	PINMUX_DATA(STATUS0_MARK,	PTJ5_FN),
+	PINMUX_DATA(A25_MARK,		PSA8_0, PTJ3_FN),
+	PINMUX_DATA(BS_MARK,		PSA8_1, PTJ3_FN),
+	PINMUX_DATA(A24_MARK,		PTJ2_FN),
+	PINMUX_DATA(A23_MARK,		PTJ1_FN),
+	PINMUX_DATA(A22_MARK,		PTJ0_FN),
+
+	/* PTK FN */
+	PINMUX_DATA(VIO1_D5_MARK,	PSB7_0, PSB6_0, PTK7_FN),
+	PINMUX_DATA(VIO1_D4_MARK,	PSB7_0, PSB6_0, PTK6_FN),
+	PINMUX_DATA(VIO1_D3_MARK,	PSB7_0, PSB6_0, PTK5_FN),
+	PINMUX_DATA(VIO1_D2_MARK,	PSB7_0, PSB6_0, PTK4_FN),
+	PINMUX_DATA(VIO1_D1_MARK,	PSB7_0, PSB6_0, PTK3_FN),
+	PINMUX_DATA(VIO1_D0_MARK,	PSB7_0, PSB6_0, PTK2_FN),
+
+	PINMUX_DATA(VIO0_D13_MARK,	PSB7_0, PSB6_1, PTK7_FN),
+	PINMUX_DATA(VIO0_D12_MARK,	PSB7_0, PSB6_1, PTK6_FN),
+	PINMUX_DATA(VIO0_D11_MARK,	PSB7_0, PSB6_1, PTK5_FN),
+	PINMUX_DATA(VIO0_D10_MARK,	PSB7_0, PSB6_1, PTK4_FN),
+	PINMUX_DATA(VIO0_D9_MARK,	PSB7_0, PSB6_1, PTK3_FN),
+	PINMUX_DATA(VIO0_D8_MARK,	PSB7_0, PSB6_1, PTK2_FN),
+
+	PINMUX_DATA(IDED5_MARK,		PSB7_1, PSB6_0, PTK7_FN),
+	PINMUX_DATA(IDED4_MARK,		PSB7_1, PSB6_0, PTK6_FN),
+	PINMUX_DATA(IDED3_MARK,		PSB7_1, PSB6_0, PTK5_FN),
+	PINMUX_DATA(IDED2_MARK,		PSB7_1, PSB6_0, PTK4_FN),
+	PINMUX_DATA(IDED1_MARK,		PSB7_1, PSB6_0, PTK3_FN),
+	PINMUX_DATA(IDED0_MARK,		PSB7_1, PSB6_0, PTK2_FN),
+
+	PINMUX_DATA(VIO0_FLD_MARK,	PTK1_FN),
+	PINMUX_DATA(VIO0_HD_MARK,	PTK0_FN),
+
+	/* PTL FN */
+	PINMUX_DATA(DV_D5_MARK,		PSB9_0, PSB8_0, PTL7_FN),
+	PINMUX_DATA(DV_D4_MARK,		PSB9_0, PSB8_0, PTL6_FN),
+	PINMUX_DATA(DV_D3_MARK,		PSE7_0, PSE6_0, PTL5_FN),
+	PINMUX_DATA(DV_D2_MARK,		PSC9_0, PSC8_0, PTL4_FN),
+	PINMUX_DATA(DV_D1_MARK,		PSC9_0, PSC8_0, PTL3_FN),
+	PINMUX_DATA(DV_D0_MARK,		PSC9_0, PSC8_0, PTL2_FN),
+	PINMUX_DATA(DV_D15_MARK,	PSD4_0,         PTL1_FN),
+	PINMUX_DATA(DV_D14_MARK,	PSE5_0, PSE4_0, PTL0_FN),
+
+	PINMUX_DATA(SCIF3_V_SCK_MARK,	PSB9_0, PSB8_1, PTL7_FN),
+	PINMUX_DATA(SCIF3_V_RXD_MARK,	PSB9_0, PSB8_1, PTL6_FN),
+	PINMUX_DATA(SCIF3_V_TXD_MARK,	PSE7_0, PSE6_1, PTL5_FN),
+	PINMUX_DATA(SCIF1_SCK_MARK,	PSC9_0, PSC8_1, PTL4_FN),
+	PINMUX_DATA(SCIF1_RXD_MARK,	PSC9_0, PSC8_1, PTL3_FN),
+	PINMUX_DATA(SCIF1_TXD_MARK,	PSC9_0, PSC8_1, PTL2_FN),
+
+	PINMUX_DATA(RMII_RXD0_MARK,	PSB9_1, PSB8_0, PTL7_FN),
+	PINMUX_DATA(RMII_RXD1_MARK,	PSB9_1, PSB8_0, PTL6_FN),
+	PINMUX_DATA(RMII_REF_CLK_MARK,	PSE7_1, PSE6_0, PTL5_FN),
+	PINMUX_DATA(RMII_TX_EN_MARK,	PSC9_1, PSC8_0, PTL4_FN),
+	PINMUX_DATA(RMII_TXD0_MARK,	PSC9_1, PSC8_0, PTL3_FN),
+	PINMUX_DATA(RMII_TXD1_MARK,	PSC9_1, PSC8_0, PTL2_FN),
+
+	PINMUX_DATA(MSIOF0_MCK_MARK,	PSE5_0, PSE4_1, PTL0_FN),
+
+	/* PTM FN */
+	PINMUX_DATA(DV_D13_MARK,	PSC13_0, PSC12_0, PTM7_FN),
+	PINMUX_DATA(DV_D12_MARK,	PSC13_0, PSC12_0, PTM6_FN),
+	PINMUX_DATA(DV_D11_MARK,	PSC13_0, PSC12_0, PTM5_FN),
+	PINMUX_DATA(DV_D10_MARK,	PSC13_0, PSC12_0, PTM4_FN),
+	PINMUX_DATA(DV_D9_MARK,		PSC11_0, PSC10_0, PTM3_FN),
+	PINMUX_DATA(DV_D8_MARK,		PSC11_0, PSC10_0, PTM2_FN),
+
+	PINMUX_DATA(MSIOF0_TSCK_MARK,	PSC13_0, PSC12_1, PTM7_FN),
+	PINMUX_DATA(MSIOF0_RXD_MARK,	PSC13_0, PSC12_1, PTM6_FN),
+	PINMUX_DATA(MSIOF0_TXD_MARK,	PSC13_0, PSC12_1, PTM5_FN),
+	PINMUX_DATA(MSIOF0_TSYNC_MARK,	PSC13_0, PSC12_1, PTM4_FN),
+	PINMUX_DATA(MSIOF0_SS1_MARK,	PSC11_0, PSC10_1, PTM3_FN),
+	PINMUX_DATA(MSIOF0_RSCK_MARK,	PSC11_1, PSC10_0, PTM3_FN),
+	PINMUX_DATA(MSIOF0_SS2_MARK,	PSC11_0, PSC10_1, PTM2_FN),
+	PINMUX_DATA(MSIOF0_RSYNC_MARK,	PSC11_1, PSC10_0, PTM2_FN),
+
+	PINMUX_DATA(LCDVCPWC_MARK,	PSA6_0, PTM1_FN),
+	PINMUX_DATA(LCDRD_MARK,		PSA7_0, PTM0_FN),
+
+	PINMUX_DATA(SCIF0_RXD_MARK,	PSA6_1, PTM1_FN),
+	PINMUX_DATA(SCIF0_SCK_MARK,	PSA7_1, PTM0_FN),
+
+	/* PTN FN */
+	PINMUX_DATA(VIO0_D1_MARK,	PTN7_FN),
+	PINMUX_DATA(VIO0_D0_MARK,	PTN6_FN),
+
+	PINMUX_DATA(DV_CLKI_MARK,	PSD11_0,          PTN5_FN),
+	PINMUX_DATA(DV_CLK_MARK,	PSD13_0, PSD12_0, PTN4_FN),
+	PINMUX_DATA(DV_VSYNC_MARK,	PSD15_0, PSD14_0, PTN3_FN),
+	PINMUX_DATA(DV_HSYNC_MARK,	PSB5_0,  PSB4_0,  PTN2_FN),
+	PINMUX_DATA(DV_D7_MARK,		PSB3_0,  PSB2_0,  PTN1_FN),
+	PINMUX_DATA(DV_D6_MARK,		PSB1_0,  PSB0_0,  PTN0_FN),
+
+	PINMUX_DATA(SCIF2_V_SCK_MARK,	PSD13_0, PSD12_1, PTN4_FN),
+	PINMUX_DATA(SCIF2_V_RXD_MARK,	PSD15_0, PSD14_1, PTN3_FN),
+	PINMUX_DATA(SCIF2_V_TXD_MARK,	PSB5_0,  PSB4_1,  PTN2_FN),
+	PINMUX_DATA(SCIF3_V_CTS_MARK,	PSB3_0,  PSB2_1,  PTN1_FN),
+	PINMUX_DATA(SCIF3_V_RTS_MARK,	PSB1_0,  PSB0_1,  PTN0_FN),
+
+	PINMUX_DATA(RMII_RX_ER_MARK,	PSB3_1, PSB2_0, PTN1_FN),
+	PINMUX_DATA(RMII_CRS_DV_MARK,	PSB1_1, PSB0_0, PTN0_FN),
+
+	/* PTQ FN */
+	PINMUX_DATA(D7_MARK, PTQ7_FN),
+	PINMUX_DATA(D6_MARK, PTQ6_FN),
+	PINMUX_DATA(D5_MARK, PTQ5_FN),
+	PINMUX_DATA(D4_MARK, PTQ4_FN),
+	PINMUX_DATA(D3_MARK, PTQ3_FN),
+	PINMUX_DATA(D2_MARK, PTQ2_FN),
+	PINMUX_DATA(D1_MARK, PTQ1_FN),
+	PINMUX_DATA(D0_MARK, PTQ0_FN),
+
+	/* PTR FN */
+	PINMUX_DATA(CS6B_CE1B_MARK,	                PTR7_FN),
+	PINMUX_DATA(CS6A_CE2B_MARK,	                PTR6_FN),
+	PINMUX_DATA(CS5B_CE1A_MARK,	                PTR5_FN),
+	PINMUX_DATA(CS5A_CE2A_MARK,	                PTR4_FN),
+	PINMUX_DATA(IOIS16_MARK,	PSA5_0,         PTR3_FN),
+	PINMUX_DATA(WAIT_MARK,		                PTR2_FN),
+	PINMUX_DATA(WE3_ICIOWR_MARK,	PSA1_0, PSA0_0, PTR1_FN),
+	PINMUX_DATA(WE2_ICIORD_MARK,	PSD1_0, PSD0_0, PTR0_FN),
+
+	PINMUX_DATA(LCDLCLK_MARK,	PSA5_1,         PTR3_FN),
+
+	PINMUX_DATA(IDEA2_MARK,		PSD1_1, PSD0_0, PTR0_FN),
+
+	PINMUX_DATA(TPUTO3_MARK,	PSA1_0, PSA0_1, PTR1_FN),
+	PINMUX_DATA(TPUTI3_MARK,	PSA1_1, PSA0_0, PTR1_FN),
+	PINMUX_DATA(TPUTO2_MARK,	PSD1_0, PSD0_1, PTR0_FN),
+
+	/* PTS FN */
+	PINMUX_DATA(VIO_CKO_MARK,	PTS6_FN),
+
+	PINMUX_DATA(TPUTI2_MARK,	PSE9_0, PSE8_1, PTS5_FN),
+
+	PINMUX_DATA(IDEIORDY_MARK,	PSE9_1, PSE8_0, PTS5_FN),
+
+	PINMUX_DATA(VIO1_FLD_MARK,	PSE9_0, PSE8_0, PTS5_FN),
+	PINMUX_DATA(VIO1_HD_MARK,	PSA10_0,        PTS4_FN),
+	PINMUX_DATA(VIO1_VD_MARK,	PSA9_0,         PTS3_FN),
+	PINMUX_DATA(VIO1_CLK_MARK,	PSA9_0,         PTS2_FN),
+	PINMUX_DATA(VIO1_D7_MARK,	PSB7_0, PSB6_0, PTS1_FN),
+	PINMUX_DATA(VIO1_D6_MARK,	PSB7_0, PSB6_0, PTS0_FN),
+
+	PINMUX_DATA(SCIF5_SCK_MARK,	PSA10_1, PTS4_FN),
+	PINMUX_DATA(SCIF5_RXD_MARK,	PSA9_1,  PTS3_FN),
+	PINMUX_DATA(SCIF5_TXD_MARK,	PSA9_1,  PTS2_FN),
+
+	PINMUX_DATA(VIO0_D15_MARK,	PSB7_0, PSB6_1, PTS1_FN),
+	PINMUX_DATA(VIO0_D14_MARK,	PSB7_0, PSB6_1, PTS0_FN),
+
+	PINMUX_DATA(IDED7_MARK,		PSB7_1, PSB6_0, PTS1_FN),
+	PINMUX_DATA(IDED6_MARK,		PSB7_1, PSB6_0, PTS0_FN),
+
+	/* PTT FN */
+	PINMUX_DATA(D15_MARK, PTT7_FN),
+	PINMUX_DATA(D14_MARK, PTT6_FN),
+	PINMUX_DATA(D13_MARK, PTT5_FN),
+	PINMUX_DATA(D12_MARK, PTT4_FN),
+	PINMUX_DATA(D11_MARK, PTT3_FN),
+	PINMUX_DATA(D10_MARK, PTT2_FN),
+	PINMUX_DATA(D9_MARK,  PTT1_FN),
+	PINMUX_DATA(D8_MARK,  PTT0_FN),
+
+	/* PTU FN */
+	PINMUX_DATA(DMAC_DACK0_MARK, PTU7_FN),
+	PINMUX_DATA(DMAC_DREQ0_MARK, PTU6_FN),
+
+	PINMUX_DATA(FSIOASD_MARK,	PSE1_0, PTU5_FN),
+	PINMUX_DATA(FSIIABCK_MARK,	PSE1_0, PTU4_FN),
+	PINMUX_DATA(FSIIALRCK_MARK,	PSE1_0, PTU3_FN),
+	PINMUX_DATA(FSIOABCK_MARK,	PSE1_0, PTU2_FN),
+	PINMUX_DATA(FSIOALRCK_MARK,	PSE1_0, PTU1_FN),
+	PINMUX_DATA(CLKAUDIOAO_MARK,	PSE0_0, PTU0_FN),
+
+	/* PTV FN */
+	PINMUX_DATA(FSIIBSD_MARK,	PSD7_0,  PSD6_0,  PTV7_FN),
+	PINMUX_DATA(FSIOBSD_MARK,	PSD7_0,  PSD6_0,  PTV6_FN),
+	PINMUX_DATA(FSIIBBCK_MARK,	PSC15_0, PSC14_0, PTV5_FN),
+	PINMUX_DATA(FSIIBLRCK_MARK,	PSC15_0, PSC14_0, PTV4_FN),
+	PINMUX_DATA(FSIOBBCK_MARK,	PSC15_0, PSC14_0, PTV3_FN),
+	PINMUX_DATA(FSIOBLRCK_MARK,	PSC15_0, PSC14_0, PTV2_FN),
+	PINMUX_DATA(CLKAUDIOBO_MARK,	PSE3_0,  PSE2_0,  PTV1_FN),
+	PINMUX_DATA(FSIIASD_MARK,	PSE10_0,          PTV0_FN),
+
+	PINMUX_DATA(MSIOF1_SS2_MARK,	PSD7_0,  PSD6_1,  PTV7_FN),
+	PINMUX_DATA(MSIOF1_RSYNC_MARK,	PSD7_1,  PSD6_0,  PTV7_FN),
+	PINMUX_DATA(MSIOF1_SS1_MARK,	PSD7_0,  PSD6_1,  PTV6_FN),
+	PINMUX_DATA(MSIOF1_RSCK_MARK,	PSD7_1,  PSD6_0,  PTV6_FN),
+	PINMUX_DATA(MSIOF1_RXD_MARK,	PSC15_0, PSC14_1, PTV5_FN),
+	PINMUX_DATA(MSIOF1_TSYNC_MARK,	PSC15_0, PSC14_1, PTV4_FN),
+	PINMUX_DATA(MSIOF1_TSCK_MARK,	PSC15_0, PSC14_1, PTV3_FN),
+	PINMUX_DATA(MSIOF1_TXD_MARK,	PSC15_0, PSC14_1, PTV2_FN),
+	PINMUX_DATA(MSIOF1_MCK_MARK,	PSE3_0,  PSE2_1,  PTV1_FN),
+
+	/* PTW FN */
+	PINMUX_DATA(MMC_D7_MARK,	PSE13_0, PSE12_0, PTW7_FN),
+	PINMUX_DATA(MMC_D6_MARK,	PSE13_0, PSE12_0, PTW6_FN),
+	PINMUX_DATA(MMC_D5_MARK,	PSE13_0, PSE12_0, PTW5_FN),
+	PINMUX_DATA(MMC_D4_MARK,	PSE13_0, PSE12_0, PTW4_FN),
+	PINMUX_DATA(MMC_D3_MARK,	PSA13_0,          PTW3_FN),
+	PINMUX_DATA(MMC_D2_MARK,	PSA13_0,          PTW2_FN),
+	PINMUX_DATA(MMC_D1_MARK,	PSA13_0,          PTW1_FN),
+	PINMUX_DATA(MMC_D0_MARK,	PSA13_0,          PTW0_FN),
+
+	PINMUX_DATA(SDHI1CD_MARK,	PSE13_0, PSE12_1, PTW7_FN),
+	PINMUX_DATA(SDHI1WP_MARK,	PSE13_0, PSE12_1, PTW6_FN),
+	PINMUX_DATA(SDHI1D3_MARK,	PSE13_0, PSE12_1, PTW5_FN),
+	PINMUX_DATA(SDHI1D2_MARK,	PSE13_0, PSE12_1, PTW4_FN),
+	PINMUX_DATA(SDHI1D1_MARK,	PSA13_1,          PTW3_FN),
+	PINMUX_DATA(SDHI1D0_MARK,	PSA13_1,          PTW2_FN),
+	PINMUX_DATA(SDHI1CMD_MARK,	PSA13_1,          PTW1_FN),
+	PINMUX_DATA(SDHI1CLK_MARK,	PSA13_1,          PTW0_FN),
+
+	PINMUX_DATA(IODACK_MARK,	PSE13_1, PSE12_0, PTW7_FN),
+	PINMUX_DATA(IDERST_MARK,	PSE13_1, PSE12_0, PTW6_FN),
+	PINMUX_DATA(EXBUF_ENB_MARK,	PSE13_1, PSE12_0, PTW5_FN),
+	PINMUX_DATA(DIRECTION_MARK,	PSE13_1, PSE12_0, PTW4_FN),
+
+	/* PTX FN */
+	PINMUX_DATA(DMAC_DACK1_MARK,	PSA12_0, PTX7_FN),
+	PINMUX_DATA(DMAC_DREQ1_MARK,	PSA12_0, PTX6_FN),
+
+	PINMUX_DATA(IRDA_OUT_MARK,	PSA12_1, PTX7_FN),
+	PINMUX_DATA(IRDA_IN_MARK,	PSA12_1, PTX6_FN),
+
+	PINMUX_DATA(TSIF_TS0_SDAT_MARK,	PSC0_0, PTX5_FN),
+	PINMUX_DATA(TSIF_TS0_SCK_MARK,	PSC1_0, PTX4_FN),
+	PINMUX_DATA(TSIF_TS0_SDEN_MARK,	PSC2_0, PTX3_FN),
+	PINMUX_DATA(TSIF_TS0_SPSYNC_MARK,       PTX2_FN),
+
+	PINMUX_DATA(LNKSTA_MARK,	PSC0_1, PTX5_FN),
+	PINMUX_DATA(MDIO_MARK,		PSC1_1, PTX4_FN),
+	PINMUX_DATA(MDC_MARK,		PSC2_1, PTX3_FN),
+
+	PINMUX_DATA(MMC_CLK_MARK, PTX1_FN),
+	PINMUX_DATA(MMC_CMD_MARK, PTX0_FN),
+
+	/* PTY FN */
+	PINMUX_DATA(SDHI0CD_MARK,  PTY7_FN),
+	PINMUX_DATA(SDHI0WP_MARK,  PTY6_FN),
+	PINMUX_DATA(SDHI0D3_MARK,  PTY5_FN),
+	PINMUX_DATA(SDHI0D2_MARK,  PTY4_FN),
+	PINMUX_DATA(SDHI0D1_MARK,  PTY3_FN),
+	PINMUX_DATA(SDHI0D0_MARK,  PTY2_FN),
+	PINMUX_DATA(SDHI0CMD_MARK, PTY1_FN),
+	PINMUX_DATA(SDHI0CLK_MARK, PTY0_FN),
+
+	/* PTZ FN */
+	PINMUX_DATA(INTC_IRQ7_MARK,	PSB10_0, PTZ7_FN),
+	PINMUX_DATA(INTC_IRQ6_MARK,	PSB11_0, PTZ6_FN),
+	PINMUX_DATA(INTC_IRQ5_MARK,	PSB12_0, PTZ5_FN),
+	PINMUX_DATA(INTC_IRQ4_MARK,	PSB13_0, PTZ4_FN),
+	PINMUX_DATA(INTC_IRQ3_MARK,	PSB14_0, PTZ3_FN),
+	PINMUX_DATA(INTC_IRQ2_MARK,	         PTZ2_FN),
+	PINMUX_DATA(INTC_IRQ1_MARK,	         PTZ1_FN),
+	PINMUX_DATA(INTC_IRQ0_MARK,	         PTZ0_FN),
+
+	PINMUX_DATA(SCIF3_I_CTS_MARK,	PSB10_1, PTZ7_FN),
+	PINMUX_DATA(SCIF3_I_RTS_MARK,	PSB11_1, PTZ6_FN),
+	PINMUX_DATA(SCIF3_I_SCK_MARK,	PSB12_1, PTZ5_FN),
+	PINMUX_DATA(SCIF3_I_RXD_MARK,	PSB13_1, PTZ4_FN),
+	PINMUX_DATA(SCIF3_I_TXD_MARK,	PSB14_1, PTZ3_FN),
+};
+
+static struct pinmux_gpio pinmux_gpios[] = {
+	/* PTA */
+	PINMUX_GPIO(GPIO_PTA7, PTA7_DATA),
+	PINMUX_GPIO(GPIO_PTA6, PTA6_DATA),
+	PINMUX_GPIO(GPIO_PTA5, PTA5_DATA),
+	PINMUX_GPIO(GPIO_PTA4, PTA4_DATA),
+	PINMUX_GPIO(GPIO_PTA3, PTA3_DATA),
+	PINMUX_GPIO(GPIO_PTA2, PTA2_DATA),
+	PINMUX_GPIO(GPIO_PTA1, PTA1_DATA),
+	PINMUX_GPIO(GPIO_PTA0, PTA0_DATA),
+
+	/* PTB */
+	PINMUX_GPIO(GPIO_PTB7, PTB7_DATA),
+	PINMUX_GPIO(GPIO_PTB6, PTB6_DATA),
+	PINMUX_GPIO(GPIO_PTB5, PTB5_DATA),
+	PINMUX_GPIO(GPIO_PTB4, PTB4_DATA),
+	PINMUX_GPIO(GPIO_PTB3, PTB3_DATA),
+	PINMUX_GPIO(GPIO_PTB2, PTB2_DATA),
+	PINMUX_GPIO(GPIO_PTB1, PTB1_DATA),
+	PINMUX_GPIO(GPIO_PTB0, PTB0_DATA),
+
+	/* PTC */
+	PINMUX_GPIO(GPIO_PTC7, PTC7_DATA),
+	PINMUX_GPIO(GPIO_PTC6, PTC6_DATA),
+	PINMUX_GPIO(GPIO_PTC5, PTC5_DATA),
+	PINMUX_GPIO(GPIO_PTC4, PTC4_DATA),
+	PINMUX_GPIO(GPIO_PTC3, PTC3_DATA),
+	PINMUX_GPIO(GPIO_PTC2, PTC2_DATA),
+	PINMUX_GPIO(GPIO_PTC1, PTC1_DATA),
+	PINMUX_GPIO(GPIO_PTC0, PTC0_DATA),
+
+	/* PTD */
+	PINMUX_GPIO(GPIO_PTD7, PTD7_DATA),
+	PINMUX_GPIO(GPIO_PTD6, PTD6_DATA),
+	PINMUX_GPIO(GPIO_PTD5, PTD5_DATA),
+	PINMUX_GPIO(GPIO_PTD4, PTD4_DATA),
+	PINMUX_GPIO(GPIO_PTD3, PTD3_DATA),
+	PINMUX_GPIO(GPIO_PTD2, PTD2_DATA),
+	PINMUX_GPIO(GPIO_PTD1, PTD1_DATA),
+	PINMUX_GPIO(GPIO_PTD0, PTD0_DATA),
+
+	/* PTE */
+	PINMUX_GPIO(GPIO_PTE7, PTE7_DATA),
+	PINMUX_GPIO(GPIO_PTE6, PTE6_DATA),
+	PINMUX_GPIO(GPIO_PTE5, PTE5_DATA),
+	PINMUX_GPIO(GPIO_PTE4, PTE4_DATA),
+	PINMUX_GPIO(GPIO_PTE3, PTE3_DATA),
+	PINMUX_GPIO(GPIO_PTE2, PTE2_DATA),
+	PINMUX_GPIO(GPIO_PTE1, PTE1_DATA),
+	PINMUX_GPIO(GPIO_PTE0, PTE0_DATA),
+
+	/* PTF */
+	PINMUX_GPIO(GPIO_PTF7, PTF7_DATA),
+	PINMUX_GPIO(GPIO_PTF6, PTF6_DATA),
+	PINMUX_GPIO(GPIO_PTF5, PTF5_DATA),
+	PINMUX_GPIO(GPIO_PTF4, PTF4_DATA),
+	PINMUX_GPIO(GPIO_PTF3, PTF3_DATA),
+	PINMUX_GPIO(GPIO_PTF2, PTF2_DATA),
+	PINMUX_GPIO(GPIO_PTF1, PTF1_DATA),
+	PINMUX_GPIO(GPIO_PTF0, PTF0_DATA),
+
+	/* PTG */
+	PINMUX_GPIO(GPIO_PTG5, PTG5_DATA),
+	PINMUX_GPIO(GPIO_PTG4, PTG4_DATA),
+	PINMUX_GPIO(GPIO_PTG3, PTG3_DATA),
+	PINMUX_GPIO(GPIO_PTG2, PTG2_DATA),
+	PINMUX_GPIO(GPIO_PTG1, PTG1_DATA),
+	PINMUX_GPIO(GPIO_PTG0, PTG0_DATA),
+
+	/* PTH */
+	PINMUX_GPIO(GPIO_PTH7, PTH7_DATA),
+	PINMUX_GPIO(GPIO_PTH6, PTH6_DATA),
+	PINMUX_GPIO(GPIO_PTH5, PTH5_DATA),
+	PINMUX_GPIO(GPIO_PTH4, PTH4_DATA),
+	PINMUX_GPIO(GPIO_PTH3, PTH3_DATA),
+	PINMUX_GPIO(GPIO_PTH2, PTH2_DATA),
+	PINMUX_GPIO(GPIO_PTH1, PTH1_DATA),
+	PINMUX_GPIO(GPIO_PTH0, PTH0_DATA),
+
+	/* PTJ */
+	PINMUX_GPIO(GPIO_PTJ7, PTJ7_DATA),
+	PINMUX_GPIO(GPIO_PTJ6, PTJ6_DATA),
+	PINMUX_GPIO(GPIO_PTJ5, PTJ5_DATA),
+	PINMUX_GPIO(GPIO_PTJ3, PTJ3_DATA),
+	PINMUX_GPIO(GPIO_PTJ2, PTJ2_DATA),
+	PINMUX_GPIO(GPIO_PTJ1, PTJ1_DATA),
+	PINMUX_GPIO(GPIO_PTJ0, PTJ0_DATA),
+
+	/* PTK */
+	PINMUX_GPIO(GPIO_PTK7, PTK7_DATA),
+	PINMUX_GPIO(GPIO_PTK6, PTK6_DATA),
+	PINMUX_GPIO(GPIO_PTK5, PTK5_DATA),
+	PINMUX_GPIO(GPIO_PTK4, PTK4_DATA),
+	PINMUX_GPIO(GPIO_PTK3, PTK3_DATA),
+	PINMUX_GPIO(GPIO_PTK2, PTK2_DATA),
+	PINMUX_GPIO(GPIO_PTK1, PTK1_DATA),
+	PINMUX_GPIO(GPIO_PTK0, PTK0_DATA),
+
+	/* PTL */
+	PINMUX_GPIO(GPIO_PTL7, PTL7_DATA),
+	PINMUX_GPIO(GPIO_PTL6, PTL6_DATA),
+	PINMUX_GPIO(GPIO_PTL5, PTL5_DATA),
+	PINMUX_GPIO(GPIO_PTL4, PTL4_DATA),
+	PINMUX_GPIO(GPIO_PTL3, PTL3_DATA),
+	PINMUX_GPIO(GPIO_PTL2, PTL2_DATA),
+	PINMUX_GPIO(GPIO_PTL1, PTL1_DATA),
+	PINMUX_GPIO(GPIO_PTL0, PTL0_DATA),
+
+	/* PTM */
+	PINMUX_GPIO(GPIO_PTM7, PTM7_DATA),
+	PINMUX_GPIO(GPIO_PTM6, PTM6_DATA),
+	PINMUX_GPIO(GPIO_PTM5, PTM5_DATA),
+	PINMUX_GPIO(GPIO_PTM4, PTM4_DATA),
+	PINMUX_GPIO(GPIO_PTM3, PTM3_DATA),
+	PINMUX_GPIO(GPIO_PTM2, PTM2_DATA),
+	PINMUX_GPIO(GPIO_PTM1, PTM1_DATA),
+	PINMUX_GPIO(GPIO_PTM0, PTM0_DATA),
+
+	/* PTN */
+	PINMUX_GPIO(GPIO_PTN7, PTN7_DATA),
+	PINMUX_GPIO(GPIO_PTN6, PTN6_DATA),
+	PINMUX_GPIO(GPIO_PTN5, PTN5_DATA),
+	PINMUX_GPIO(GPIO_PTN4, PTN4_DATA),
+	PINMUX_GPIO(GPIO_PTN3, PTN3_DATA),
+	PINMUX_GPIO(GPIO_PTN2, PTN2_DATA),
+	PINMUX_GPIO(GPIO_PTN1, PTN1_DATA),
+	PINMUX_GPIO(GPIO_PTN0, PTN0_DATA),
+
+	/* PTQ */
+	PINMUX_GPIO(GPIO_PTQ7, PTQ7_DATA),
+	PINMUX_GPIO(GPIO_PTQ6, PTQ6_DATA),
+	PINMUX_GPIO(GPIO_PTQ5, PTQ5_DATA),
+	PINMUX_GPIO(GPIO_PTQ4, PTQ4_DATA),
+	PINMUX_GPIO(GPIO_PTQ3, PTQ3_DATA),
+	PINMUX_GPIO(GPIO_PTQ2, PTQ2_DATA),
+	PINMUX_GPIO(GPIO_PTQ1, PTQ1_DATA),
+	PINMUX_GPIO(GPIO_PTQ0, PTQ0_DATA),
+
+	/* PTR */
+	PINMUX_GPIO(GPIO_PTR7, PTR7_DATA),
+	PINMUX_GPIO(GPIO_PTR6, PTR6_DATA),
+	PINMUX_GPIO(GPIO_PTR5, PTR5_DATA),
+	PINMUX_GPIO(GPIO_PTR4, PTR4_DATA),
+	PINMUX_GPIO(GPIO_PTR3, PTR3_DATA),
+	PINMUX_GPIO(GPIO_PTR2, PTR2_DATA),
+	PINMUX_GPIO(GPIO_PTR1, PTR1_DATA),
+	PINMUX_GPIO(GPIO_PTR0, PTR0_DATA),
+
+	/* PTS */
+	PINMUX_GPIO(GPIO_PTS6, PTS6_DATA),
+	PINMUX_GPIO(GPIO_PTS5, PTS5_DATA),
+	PINMUX_GPIO(GPIO_PTS4, PTS4_DATA),
+	PINMUX_GPIO(GPIO_PTS3, PTS3_DATA),
+	PINMUX_GPIO(GPIO_PTS2, PTS2_DATA),
+	PINMUX_GPIO(GPIO_PTS1, PTS1_DATA),
+	PINMUX_GPIO(GPIO_PTS0, PTS0_DATA),
+
+	/* PTT */
+	PINMUX_GPIO(GPIO_PTT7, PTT7_DATA),
+	PINMUX_GPIO(GPIO_PTT6, PTT6_DATA),
+	PINMUX_GPIO(GPIO_PTT5, PTT5_DATA),
+	PINMUX_GPIO(GPIO_PTT4, PTT4_DATA),
+	PINMUX_GPIO(GPIO_PTT3, PTT3_DATA),
+	PINMUX_GPIO(GPIO_PTT2, PTT2_DATA),
+	PINMUX_GPIO(GPIO_PTT1, PTT1_DATA),
+	PINMUX_GPIO(GPIO_PTT0, PTT0_DATA),
+
+	/* PTU */
+	PINMUX_GPIO(GPIO_PTU7, PTU7_DATA),
+	PINMUX_GPIO(GPIO_PTU6, PTU6_DATA),
+	PINMUX_GPIO(GPIO_PTU5, PTU5_DATA),
+	PINMUX_GPIO(GPIO_PTU4, PTU4_DATA),
+	PINMUX_GPIO(GPIO_PTU3, PTU3_DATA),
+	PINMUX_GPIO(GPIO_PTU2, PTU2_DATA),
+	PINMUX_GPIO(GPIO_PTU1, PTU1_DATA),
+	PINMUX_GPIO(GPIO_PTU0, PTU0_DATA),
+
+	/* PTV */
+	PINMUX_GPIO(GPIO_PTV7, PTV7_DATA),
+	PINMUX_GPIO(GPIO_PTV6, PTV6_DATA),
+	PINMUX_GPIO(GPIO_PTV5, PTV5_DATA),
+	PINMUX_GPIO(GPIO_PTV4, PTV4_DATA),
+	PINMUX_GPIO(GPIO_PTV3, PTV3_DATA),
+	PINMUX_GPIO(GPIO_PTV2, PTV2_DATA),
+	PINMUX_GPIO(GPIO_PTV1, PTV1_DATA),
+	PINMUX_GPIO(GPIO_PTV0, PTV0_DATA),
+
+	/* PTW */
+	PINMUX_GPIO(GPIO_PTW7, PTW7_DATA),
+	PINMUX_GPIO(GPIO_PTW6, PTW6_DATA),
+	PINMUX_GPIO(GPIO_PTW5, PTW5_DATA),
+	PINMUX_GPIO(GPIO_PTW4, PTW4_DATA),
+	PINMUX_GPIO(GPIO_PTW3, PTW3_DATA),
+	PINMUX_GPIO(GPIO_PTW2, PTW2_DATA),
+	PINMUX_GPIO(GPIO_PTW1, PTW1_DATA),
+	PINMUX_GPIO(GPIO_PTW0, PTW0_DATA),
+
+	/* PTX */
+	PINMUX_GPIO(GPIO_PTX7, PTX7_DATA),
+	PINMUX_GPIO(GPIO_PTX6, PTX6_DATA),
+	PINMUX_GPIO(GPIO_PTX5, PTX5_DATA),
+	PINMUX_GPIO(GPIO_PTX4, PTX4_DATA),
+	PINMUX_GPIO(GPIO_PTX3, PTX3_DATA),
+	PINMUX_GPIO(GPIO_PTX2, PTX2_DATA),
+	PINMUX_GPIO(GPIO_PTX1, PTX1_DATA),
+	PINMUX_GPIO(GPIO_PTX0, PTX0_DATA),
+
+	/* PTY */
+	PINMUX_GPIO(GPIO_PTY7, PTY7_DATA),
+	PINMUX_GPIO(GPIO_PTY6, PTY6_DATA),
+	PINMUX_GPIO(GPIO_PTY5, PTY5_DATA),
+	PINMUX_GPIO(GPIO_PTY4, PTY4_DATA),
+	PINMUX_GPIO(GPIO_PTY3, PTY3_DATA),
+	PINMUX_GPIO(GPIO_PTY2, PTY2_DATA),
+	PINMUX_GPIO(GPIO_PTY1, PTY1_DATA),
+	PINMUX_GPIO(GPIO_PTY0, PTY0_DATA),
+
+	/* PTZ */
+	PINMUX_GPIO(GPIO_PTZ7, PTZ7_DATA),
+	PINMUX_GPIO(GPIO_PTZ6, PTZ6_DATA),
+	PINMUX_GPIO(GPIO_PTZ5, PTZ5_DATA),
+	PINMUX_GPIO(GPIO_PTZ4, PTZ4_DATA),
+	PINMUX_GPIO(GPIO_PTZ3, PTZ3_DATA),
+	PINMUX_GPIO(GPIO_PTZ2, PTZ2_DATA),
+	PINMUX_GPIO(GPIO_PTZ1, PTZ1_DATA),
+	PINMUX_GPIO(GPIO_PTZ0, PTZ0_DATA),
+
+	/* BSC */
+	PINMUX_GPIO(GPIO_FN_D31,	D31_MARK),
+	PINMUX_GPIO(GPIO_FN_D30,	D30_MARK),
+	PINMUX_GPIO(GPIO_FN_D29,	D29_MARK),
+	PINMUX_GPIO(GPIO_FN_D28,	D28_MARK),
+	PINMUX_GPIO(GPIO_FN_D27,	D27_MARK),
+	PINMUX_GPIO(GPIO_FN_D26,	D26_MARK),
+	PINMUX_GPIO(GPIO_FN_D25,	D25_MARK),
+	PINMUX_GPIO(GPIO_FN_D24,	D24_MARK),
+	PINMUX_GPIO(GPIO_FN_D23,	D23_MARK),
+	PINMUX_GPIO(GPIO_FN_D22,	D22_MARK),
+	PINMUX_GPIO(GPIO_FN_D21,	D21_MARK),
+	PINMUX_GPIO(GPIO_FN_D20,	D20_MARK),
+	PINMUX_GPIO(GPIO_FN_D19,	D19_MARK),
+	PINMUX_GPIO(GPIO_FN_D18,	D18_MARK),
+	PINMUX_GPIO(GPIO_FN_D17,	D17_MARK),
+	PINMUX_GPIO(GPIO_FN_D16,	D16_MARK),
+	PINMUX_GPIO(GPIO_FN_D15,	D15_MARK),
+	PINMUX_GPIO(GPIO_FN_D14,	D14_MARK),
+	PINMUX_GPIO(GPIO_FN_D13,	D13_MARK),
+	PINMUX_GPIO(GPIO_FN_D12,	D12_MARK),
+	PINMUX_GPIO(GPIO_FN_D11,	D11_MARK),
+	PINMUX_GPIO(GPIO_FN_D10,	D10_MARK),
+	PINMUX_GPIO(GPIO_FN_D9,		D9_MARK),
+	PINMUX_GPIO(GPIO_FN_D8,		D8_MARK),
+	PINMUX_GPIO(GPIO_FN_D7,		D7_MARK),
+	PINMUX_GPIO(GPIO_FN_D6,		D6_MARK),
+	PINMUX_GPIO(GPIO_FN_D5,		D5_MARK),
+	PINMUX_GPIO(GPIO_FN_D4,		D4_MARK),
+	PINMUX_GPIO(GPIO_FN_D3,		D3_MARK),
+	PINMUX_GPIO(GPIO_FN_D2,		D2_MARK),
+	PINMUX_GPIO(GPIO_FN_D1,		D1_MARK),
+	PINMUX_GPIO(GPIO_FN_D0,		D0_MARK),
+	PINMUX_GPIO(GPIO_FN_A25,	A25_MARK),
+	PINMUX_GPIO(GPIO_FN_A24,	A24_MARK),
+	PINMUX_GPIO(GPIO_FN_A23,	A23_MARK),
+	PINMUX_GPIO(GPIO_FN_A22,	A22_MARK),
+	PINMUX_GPIO(GPIO_FN_CS6B_CE1B,	CS6B_CE1B_MARK),
+	PINMUX_GPIO(GPIO_FN_CS6A_CE2B,	CS6A_CE2B_MARK),
+	PINMUX_GPIO(GPIO_FN_CS5B_CE1A,	CS5B_CE1A_MARK),
+	PINMUX_GPIO(GPIO_FN_CS5A_CE2A,	CS5A_CE2A_MARK),
+	PINMUX_GPIO(GPIO_FN_WE3_ICIOWR,	WE3_ICIOWR_MARK),
+	PINMUX_GPIO(GPIO_FN_WE2_ICIORD,	WE2_ICIORD_MARK),
+	PINMUX_GPIO(GPIO_FN_IOIS16,	IOIS16_MARK),
+	PINMUX_GPIO(GPIO_FN_WAIT,	WAIT_MARK),
+	PINMUX_GPIO(GPIO_FN_BS,		BS_MARK),
+
+	/* KEYSC */
+	PINMUX_GPIO(GPIO_FN_KEYOUT5_IN5,	KEYOUT5_IN5_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT4_IN6,	KEYOUT4_IN6_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN4,		KEYIN4_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN3,		KEYIN3_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN2,		KEYIN2_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN1,		KEYIN1_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYIN0,		KEYIN0_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT3,		KEYOUT3_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT2,		KEYOUT2_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT1,		KEYOUT1_MARK),
+	PINMUX_GPIO(GPIO_FN_KEYOUT0,		KEYOUT0_MARK),
+
+	/* ATAPI */
+	PINMUX_GPIO(GPIO_FN_IDED15,	IDED15_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED14,	IDED14_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED13,	IDED13_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED12,	IDED12_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED11,	IDED11_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED10,	IDED10_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED9,	IDED9_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED8,	IDED8_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED7,	IDED7_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED6,	IDED6_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED5,	IDED5_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED4,	IDED4_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED3,	IDED3_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED2,	IDED2_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED1,	IDED1_MARK),
+	PINMUX_GPIO(GPIO_FN_IDED0,	IDED0_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEA2,	IDEA2_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEA1,	IDEA1_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEA0,	IDEA0_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEIOWR,	IDEIOWR_MARK),
+	PINMUX_GPIO(GPIO_FN_IODREQ,	IODREQ_MARK),
+	PINMUX_GPIO(GPIO_FN_IDECS0,	IDECS0_MARK),
+	PINMUX_GPIO(GPIO_FN_IDECS1,	IDECS1_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEIORD,	IDEIORD_MARK),
+	PINMUX_GPIO(GPIO_FN_DIRECTION,	DIRECTION_MARK),
+	PINMUX_GPIO(GPIO_FN_EXBUF_ENB,	EXBUF_ENB_MARK),
+	PINMUX_GPIO(GPIO_FN_IDERST,	IDERST_MARK),
+	PINMUX_GPIO(GPIO_FN_IODACK,	IODACK_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEINT,	IDEINT_MARK),
+	PINMUX_GPIO(GPIO_FN_IDEIORDY,	IDEIORDY_MARK),
+
+	/* TPU */
+	PINMUX_GPIO(GPIO_FN_TPUTO3,	TPUTO3_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTO2,	TPUTO2_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTO1,	TPUTO1_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTO0,	TPUTO0_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTI3,	TPUTI3_MARK),
+	PINMUX_GPIO(GPIO_FN_TPUTI2,	TPUTI2_MARK),
+
+	/* LCDC */
+	PINMUX_GPIO(GPIO_FN_LCDD23,	LCDD23_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD22,	LCDD22_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD21,	LCDD21_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD20,	LCDD20_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD19,	LCDD19_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD18,	LCDD18_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD17,	LCDD17_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD16,	LCDD16_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD15,	LCDD15_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD14,	LCDD14_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD13,	LCDD13_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD12,	LCDD12_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD11,	LCDD11_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD10,	LCDD10_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD9,	LCDD9_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD8,	LCDD8_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD7,	LCDD7_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD6,	LCDD6_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD5,	LCDD5_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD4,	LCDD4_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD3,	LCDD3_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD2,	LCDD2_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD1,	LCDD1_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDD0,	LCDD0_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDVSYN,	LCDVSYN_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDDISP,	LCDDISP_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDRS,	LCDRS_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDHSYN,	LCDHSYN_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDCS,	LCDCS_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDDON,	LCDDON_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDDCK,	LCDDCK_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDWR,	LCDWR_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDVEPWC,	LCDVEPWC_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDVCPWC,	LCDVCPWC_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDRD,	LCDRD_MARK),
+	PINMUX_GPIO(GPIO_FN_LCDLCLK,	LCDLCLK_MARK),
+
+	/* SCIF0 */
+	PINMUX_GPIO(GPIO_FN_SCIF0_TXD,	SCIF0_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF0_RXD,	SCIF0_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF0_SCK,	SCIF0_SCK_MARK),
+
+	/* SCIF1 */
+	PINMUX_GPIO(GPIO_FN_SCIF1_SCK,	SCIF1_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF1_RXD,	SCIF1_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF1_TXD,	SCIF1_TXD_MARK),
+
+	/* SCIF2 */
+	PINMUX_GPIO(GPIO_FN_SCIF2_L_TXD,	SCIF2_L_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_L_SCK,	SCIF2_L_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_L_RXD,	SCIF2_L_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_V_TXD,	SCIF2_V_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_V_SCK,	SCIF2_V_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF2_V_RXD,	SCIF2_V_RXD_MARK),
+
+	/* SCIF3 */
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_SCK,	SCIF3_V_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_RXD,	SCIF3_V_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_TXD,	SCIF3_V_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_CTS,	SCIF3_V_CTS_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_V_RTS,	SCIF3_V_RTS_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_SCK,	SCIF3_I_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_RXD,	SCIF3_I_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_TXD,	SCIF3_I_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_CTS,	SCIF3_I_CTS_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF3_I_RTS,	SCIF3_I_RTS_MARK),
+
+	/* SCIF4 */
+	PINMUX_GPIO(GPIO_FN_SCIF4_SCK,	SCIF4_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF4_RXD,	SCIF4_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF4_TXD,	SCIF4_TXD_MARK),
+
+	/* SCIF5 */
+	PINMUX_GPIO(GPIO_FN_SCIF5_SCK,	SCIF5_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF5_RXD,	SCIF5_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_SCIF5_TXD,	SCIF5_TXD_MARK),
+
+	/* FSI */
+	PINMUX_GPIO(GPIO_FN_FSIMCKB,	FSIMCKB_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIMCKA,	FSIMCKA_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOASD,	FSIOASD_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIABCK,	FSIIABCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIALRCK,	FSIIALRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOABCK,	FSIOABCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOALRCK,	FSIOALRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_CLKAUDIOAO,	CLKAUDIOAO_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIBSD,	FSIIBSD_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOBSD,	FSIOBSD_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIBBCK,	FSIIBBCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIBLRCK,	FSIIBLRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOBBCK,	FSIOBBCK_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIOBLRCK,	FSIOBLRCK_MARK),
+	PINMUX_GPIO(GPIO_FN_CLKAUDIOBO,	CLKAUDIOBO_MARK),
+	PINMUX_GPIO(GPIO_FN_FSIIASD,	FSIIASD_MARK),
+
+	/* AUD */
+	PINMUX_GPIO(GPIO_FN_AUDCK,	AUDCK_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDSYNC,	AUDSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA3,	AUDATA3_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA2,	AUDATA2_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA1,	AUDATA1_MARK),
+	PINMUX_GPIO(GPIO_FN_AUDATA0,	AUDATA0_MARK),
+
+	/* VIO */
+	PINMUX_GPIO(GPIO_FN_VIO_CKO,	VIO_CKO_MARK),
+
+	/* VIO0 */
+	PINMUX_GPIO(GPIO_FN_VIO0_D15,	VIO0_D15_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D14,	VIO0_D14_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D13,	VIO0_D13_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D12,	VIO0_D12_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D11,	VIO0_D11_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D10,	VIO0_D10_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D9,	VIO0_D9_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D8,	VIO0_D8_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D7,	VIO0_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D6,	VIO0_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D5,	VIO0_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D4,	VIO0_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D3,	VIO0_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D2,	VIO0_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D1,	VIO0_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_D0,	VIO0_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_VD,	VIO0_VD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_CLK,	VIO0_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_FLD,	VIO0_FLD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO0_HD,	VIO0_HD_MARK),
+
+	/* VIO1 */
+	PINMUX_GPIO(GPIO_FN_VIO1_D7,	VIO1_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D6,	VIO1_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D5,	VIO1_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D4,	VIO1_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D3,	VIO1_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D2,	VIO1_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D1,	VIO1_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_D0,	VIO1_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_FLD,	VIO1_FLD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_HD,	VIO1_HD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_VD,	VIO1_VD_MARK),
+	PINMUX_GPIO(GPIO_FN_VIO1_CLK,	VIO1_CLK_MARK),
+
+	/* Eth */
+	PINMUX_GPIO(GPIO_FN_RMII_RXD0,		RMII_RXD0_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_RXD1,		RMII_RXD1_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_TXD0,		RMII_TXD0_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_TXD1,		RMII_TXD1_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_REF_CLK,	RMII_REF_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_TX_EN,		RMII_TX_EN_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_RX_ER,		RMII_RX_ER_MARK),
+	PINMUX_GPIO(GPIO_FN_RMII_CRS_DV,	RMII_CRS_DV_MARK),
+	PINMUX_GPIO(GPIO_FN_LNKSTA,		LNKSTA_MARK),
+	PINMUX_GPIO(GPIO_FN_MDIO,		MDIO_MARK),
+	PINMUX_GPIO(GPIO_FN_MDC,		MDC_MARK),
+
+	/* System */
+	PINMUX_GPIO(GPIO_FN_PDSTATUS,	PDSTATUS_MARK),
+	PINMUX_GPIO(GPIO_FN_STATUS2,	STATUS2_MARK),
+	PINMUX_GPIO(GPIO_FN_STATUS0,	STATUS0_MARK),
+
+	/* VOU */
+	PINMUX_GPIO(GPIO_FN_DV_D15,	DV_D15_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D14,	DV_D14_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D13,	DV_D13_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D12,	DV_D12_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D11,	DV_D11_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D10,	DV_D10_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D9,	DV_D9_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D8,	DV_D8_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D7,	DV_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D6,	DV_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D5,	DV_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D4,	DV_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D3,	DV_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D2,	DV_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D1,	DV_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_D0,	DV_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_CLKI,	DV_CLKI_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_CLK,	DV_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_VSYNC,	DV_VSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_DV_HSYNC,	DV_HSYNC_MARK),
+
+	/* MSIOF0 */
+	PINMUX_GPIO(GPIO_FN_MSIOF0_RXD,		MSIOF0_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_TXD,		MSIOF0_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_MCK,		MSIOF0_MCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_TSCK,	MSIOF0_TSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_SS1,		MSIOF0_SS1_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_SS2,		MSIOF0_SS2_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_TSYNC,	MSIOF0_TSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_RSCK,	MSIOF0_RSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF0_RSYNC,	MSIOF0_RSYNC_MARK),
+
+	/* MSIOF1 */
+	PINMUX_GPIO(GPIO_FN_MSIOF1_RXD,		MSIOF1_RXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_TXD,		MSIOF1_TXD_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_MCK,		MSIOF1_MCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_TSCK,	MSIOF1_TSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_SS1,		MSIOF1_SS1_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_SS2,		MSIOF1_SS2_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_TSYNC,	MSIOF1_TSYNC_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_RSCK,	MSIOF1_RSCK_MARK),
+	PINMUX_GPIO(GPIO_FN_MSIOF1_RSYNC,	MSIOF1_RSYNC_MARK),
+
+	/* DMAC */
+	PINMUX_GPIO(GPIO_FN_DMAC_DACK0,	DMAC_DACK0_MARK),
+	PINMUX_GPIO(GPIO_FN_DMAC_DREQ0,	DMAC_DREQ0_MARK),
+	PINMUX_GPIO(GPIO_FN_DMAC_DACK1,	DMAC_DACK1_MARK),
+	PINMUX_GPIO(GPIO_FN_DMAC_DREQ1,	DMAC_DREQ1_MARK),
+
+	/* SDHI0 */
+	PINMUX_GPIO(GPIO_FN_SDHI0CD,	SDHI0CD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0WP,	SDHI0WP_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0CMD,	SDHI0CMD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0CLK,	SDHI0CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D3,	SDHI0D3_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D2,	SDHI0D2_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D1,	SDHI0D1_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI0D0,	SDHI0D0_MARK),
+
+	/* SDHI1 */
+	PINMUX_GPIO(GPIO_FN_SDHI1CD,	SDHI1CD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1WP,	SDHI1WP_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1CMD,	SDHI1CMD_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1CLK,	SDHI1CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D3,	SDHI1D3_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D2,	SDHI1D2_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D1,	SDHI1D1_MARK),
+	PINMUX_GPIO(GPIO_FN_SDHI1D0,	SDHI1D0_MARK),
+
+	/* MMC */
+	PINMUX_GPIO(GPIO_FN_MMC_D7,	MMC_D7_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D6,	MMC_D6_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D5,	MMC_D5_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D4,	MMC_D4_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D3,	MMC_D3_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D2,	MMC_D2_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D1,	MMC_D1_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_D0,	MMC_D0_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_CLK,	MMC_CLK_MARK),
+	PINMUX_GPIO(GPIO_FN_MMC_CMD,	MMC_CMD_MARK),
+
+	/* IrDA */
+	PINMUX_GPIO(GPIO_FN_IRDA_OUT,	IRDA_OUT_MARK),
+	PINMUX_GPIO(GPIO_FN_IRDA_IN,	IRDA_IN_MARK),
+
+	/* TSIF */
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SDAT,	TSIF_TS0_SDAT_MARK),
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SCK,	TSIF_TS0_SCK_MARK),
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SDEN,	TSIF_TS0_SDEN_MARK),
+	PINMUX_GPIO(GPIO_FN_TSIF_TS0_SPSYNC,	TSIF_TS0_SPSYNC_MARK),
+
+	/* IRQ */
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ7,	INTC_IRQ7_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ6,	INTC_IRQ6_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ5,	INTC_IRQ5_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ4,	INTC_IRQ4_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ3,	INTC_IRQ3_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ2,	INTC_IRQ2_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ1,	INTC_IRQ1_MARK),
+	PINMUX_GPIO(GPIO_FN_INTC_IRQ0,	INTC_IRQ0_MARK),
+ };
+
+static struct pinmux_cfg_reg pinmux_config_regs[] = {
+	{ PINMUX_CFG_REG("PACR", 0xa4050100, 16, 2) {
+		PTA7_FN, PTA7_OUT, PTA7_IN_PU, PTA7_IN,
+		PTA6_FN, PTA6_OUT, PTA6_IN_PU, PTA6_IN,
+		PTA5_FN, PTA5_OUT, PTA5_IN_PU, PTA5_IN,
+		PTA4_FN, PTA4_OUT, PTA4_IN_PU, PTA4_IN,
+		PTA3_FN, PTA3_OUT, PTA3_IN_PU, PTA3_IN,
+		PTA2_FN, PTA2_OUT, PTA2_IN_PU, PTA2_IN,
+		PTA1_FN, PTA1_OUT, PTA1_IN_PU, PTA1_IN,
+		PTA0_FN, PTA0_OUT, PTA0_IN_PU, PTA0_IN }
+	},
+	{ PINMUX_CFG_REG("PBCR", 0xa4050102, 16, 2) {
+		PTB7_FN, PTB7_OUT, PTB7_IN_PU, PTB7_IN,
+		PTB6_FN, PTB6_OUT, PTB6_IN_PU, PTB6_IN,
+		PTB5_FN, PTB5_OUT, PTB5_IN_PU, PTB5_IN,
+		PTB4_FN, PTB4_OUT, PTB4_IN_PU, PTB4_IN,
+		PTB3_FN, PTB3_OUT, PTB3_IN_PU, PTB3_IN,
+		PTB2_FN, PTB2_OUT, PTB2_IN_PU, PTB2_IN,
+		PTB1_FN, PTB1_OUT, PTB1_IN_PU, PTB1_IN,
+		PTB0_FN, PTB0_OUT, PTB0_IN_PU, PTB0_IN }
+	},
+	{ PINMUX_CFG_REG("PCCR", 0xa4050104, 16, 2) {
+		PTC7_FN, PTC7_OUT, PTC7_IN_PU, PTC7_IN,
+		PTC6_FN, PTC6_OUT, PTC6_IN_PU, PTC6_IN,
+		PTC5_FN, PTC5_OUT, PTC5_IN_PU, PTC5_IN,
+		PTC4_FN, PTC4_OUT, PTC4_IN_PU, PTC4_IN,
+		PTC3_FN, PTC3_OUT, PTC3_IN_PU, PTC3_IN,
+		PTC2_FN, PTC2_OUT, PTC2_IN_PU, PTC2_IN,
+		PTC1_FN, PTC1_OUT, PTC1_IN_PU, PTC1_IN,
+		PTC0_FN, PTC0_OUT, PTC0_IN_PU, PTC0_IN }
+	},
+	{ PINMUX_CFG_REG("PDCR", 0xa4050106, 16, 2) {
+		PTD7_FN, PTD7_OUT, PTD7_IN_PU, PTD7_IN,
+		PTD6_FN, PTD6_OUT, PTD6_IN_PU, PTD6_IN,
+		PTD5_FN, PTD5_OUT, PTD5_IN_PU, PTD5_IN,
+		PTD4_FN, PTD4_OUT, PTD4_IN_PU, PTD4_IN,
+		PTD3_FN, PTD3_OUT, PTD3_IN_PU, PTD3_IN,
+		PTD2_FN, PTD2_OUT, PTD2_IN_PU, PTD2_IN,
+		PTD1_FN, PTD1_OUT, PTD1_IN_PU, PTD1_IN,
+		PTD0_FN, PTD0_OUT, PTD0_IN_PU, PTD0_IN }
+	},
+	{ PINMUX_CFG_REG("PECR", 0xa4050108, 16, 2) {
+		PTE7_FN, PTE7_OUT, PTE7_IN_PU, PTE7_IN,
+		PTE6_FN, PTE6_OUT, PTE6_IN_PU, PTE6_IN,
+		PTE5_FN, PTE5_OUT, PTE5_IN_PU, PTE5_IN,
+		PTE4_FN, PTE4_OUT, PTE4_IN_PU, PTE4_IN,
+		PTE3_FN, PTE3_OUT, PTE3_IN_PU, PTE3_IN,
+		PTE2_FN, PTE2_OUT, PTE2_IN_PU, PTE2_IN,
+		PTE1_FN, PTE1_OUT, PTE1_IN_PU, PTE1_IN,
+		PTE0_FN, PTE0_OUT, PTE0_IN_PU, PTE0_IN }
+	},
+	{ PINMUX_CFG_REG("PFCR", 0xa405010a, 16, 2) {
+		PTF7_FN, PTF7_OUT, PTF7_IN_PU, PTF7_IN,
+		PTF6_FN, PTF6_OUT, PTF6_IN_PU, PTF6_IN,
+		PTF5_FN, PTF5_OUT, PTF5_IN_PU, PTF5_IN,
+		PTF4_FN, PTF4_OUT, PTF4_IN_PU, PTF4_IN,
+		PTF3_FN, PTF3_OUT, PTF3_IN_PU, PTF3_IN,
+		PTF2_FN, PTF2_OUT, PTF2_IN_PU, PTF2_IN,
+		PTF1_FN, PTF1_OUT, PTF1_IN_PU, PTF1_IN,
+		PTF0_FN, PTF0_OUT, PTF0_IN_PU, PTF0_IN }
+	},
+	{ PINMUX_CFG_REG("PGCR", 0xa405010c, 16, 2) {
+		0, 0, 0, 0,
+		0, 0, 0, 0,
+		PTG5_FN, PTG5_OUT, 0, 0,
+		PTG4_FN, PTG4_OUT, 0, 0,
+		PTG3_FN, PTG3_OUT, 0, 0,
+		PTG2_FN, PTG2_OUT, 0, 0,
+		PTG1_FN, PTG1_OUT, 0, 0,
+		PTG0_FN, PTG0_OUT, 0, 0 }
+	},
+	{ PINMUX_CFG_REG("PHCR", 0xa405010e, 16, 2) {
+		PTH7_FN, PTH7_OUT, PTH7_IN_PU, PTH7_IN,
+		PTH6_FN, PTH6_OUT, PTH6_IN_PU, PTH6_IN,
+		PTH5_FN, PTH5_OUT, PTH5_IN_PU, PTH5_IN,
+		PTH4_FN, PTH4_OUT, PTH4_IN_PU, PTH4_IN,
+		PTH3_FN, PTH3_OUT, PTH3_IN_PU, PTH3_IN,
+		PTH2_FN, PTH2_OUT, PTH2_IN_PU, PTH2_IN,
+		PTH1_FN, PTH1_OUT, PTH1_IN_PU, PTH1_IN,
+		PTH0_FN, PTH0_OUT, PTH0_IN_PU, PTH0_IN }
+	},
+	{ PINMUX_CFG_REG("PJCR", 0xa4050110, 16, 2) {
+		PTJ7_FN, PTJ7_OUT, 0, 0,
+		PTJ6_FN, PTJ6_OUT, 0, 0,
+		PTJ5_FN, PTJ5_OUT, 0, 0,
+		0, 0, 0, 0,
+		PTJ3_FN, PTJ3_OUT, PTJ3_IN_PU, PTJ3_IN,
+		PTJ2_FN, PTJ2_OUT, PTJ2_IN_PU, PTJ2_IN,
+		PTJ1_FN, PTJ1_OUT, PTJ1_IN_PU, PTJ1_IN,
+		PTJ0_FN, PTJ0_OUT, PTJ0_IN_PU, PTJ0_IN }
+	},
+	{ PINMUX_CFG_REG("PKCR", 0xa4050112, 16, 2) {
+		PTK7_FN, PTK7_OUT, PTK7_IN_PU, PTK7_IN,
+		PTK6_FN, PTK6_OUT, PTK6_IN_PU, PTK6_IN,
+		PTK5_FN, PTK5_OUT, PTK5_IN_PU, PTK5_IN,
+		PTK4_FN, PTK4_OUT, PTK4_IN_PU, PTK4_IN,
+		PTK3_FN, PTK3_OUT, PTK3_IN_PU, PTK3_IN,
+		PTK2_FN, PTK2_OUT, PTK2_IN_PU, PTK2_IN,
+		PTK1_FN, PTK1_OUT, PTK1_IN_PU, PTK1_IN,
+		PTK0_FN, PTK0_OUT, PTK0_IN_PU, PTK0_IN }
+	},
+	{ PINMUX_CFG_REG("PLCR", 0xa4050114, 16, 2) {
+		PTL7_FN, PTL7_OUT, PTL7_IN_PU, PTL7_IN,
+		PTL6_FN, PTL6_OUT, PTL6_IN_PU, PTL6_IN,
+		PTL5_FN, PTL5_OUT, PTL5_IN_PU, PTL5_IN,
+		PTL4_FN, PTL4_OUT, PTL4_IN_PU, PTL4_IN,
+		PTL3_FN, PTL3_OUT, PTL3_IN_PU, PTL3_IN,
+		PTL2_FN, PTL2_OUT, PTL2_IN_PU, PTL2_IN,
+		PTL1_FN, PTL1_OUT, PTL1_IN_PU, PTL1_IN,
+		PTL0_FN, PTL0_OUT, PTL0_IN_PU, PTL0_IN }
+	},
+	{ PINMUX_CFG_REG("PMCR", 0xa4050116, 16, 2) {
+		PTM7_FN, PTM7_OUT, PTM7_IN_PU, PTM7_IN,
+		PTM6_FN, PTM6_OUT, PTM6_IN_PU, PTM6_IN,
+		PTM5_FN, PTM5_OUT, PTM5_IN_PU, PTM5_IN,
+		PTM4_FN, PTM4_OUT, PTM4_IN_PU, PTM4_IN,
+		PTM3_FN, PTM3_OUT, PTM3_IN_PU, PTM3_IN,
+		PTM2_FN, PTM2_OUT, PTM2_IN_PU, PTM2_IN,
+		PTM1_FN, PTM1_OUT, PTM1_IN_PU, PTM1_IN,
+		PTM0_FN, PTM0_OUT, PTM0_IN_PU, PTM0_IN }
+	},
+	{ PINMUX_CFG_REG("PNCR", 0xa4050118, 16, 2) {
+		PTN7_FN, PTN7_OUT, PTN7_IN_PU, PTN7_IN,
+		PTN6_FN, PTN6_OUT, PTN6_IN_PU, PTN6_IN,
+		PTN5_FN, PTN5_OUT, PTN5_IN_PU, PTN5_IN,
+		PTN4_FN, PTN4_OUT, PTN4_IN_PU, PTN4_IN,
+		PTN3_FN, PTN3_OUT, PTN3_IN_PU, PTN3_IN,
+		PTN2_FN, PTN2_OUT, PTN2_IN_PU, PTN2_IN,
+		PTN1_FN, PTN1_OUT, PTN1_IN_PU, PTN1_IN,
+		PTN0_FN, PTN0_OUT, PTN0_IN_PU, PTN0_IN }
+	},
+	{ PINMUX_CFG_REG("PQCR", 0xa405011a, 16, 2) {
+		PTQ7_FN, PTQ7_OUT, PTQ7_IN_PU, PTQ7_IN,
+		PTQ6_FN, PTQ6_OUT, PTQ6_IN_PU, PTQ6_IN,
+		PTQ5_FN, PTQ5_OUT, PTQ5_IN_PU, PTQ5_IN,
+		PTQ4_FN, PTQ4_OUT, PTQ4_IN_PU, PTQ4_IN,
+		PTQ3_FN, PTQ3_OUT, PTQ3_IN_PU, PTQ3_IN,
+		PTQ2_FN, PTQ2_OUT, PTQ2_IN_PU, PTQ2_IN,
+		PTQ1_FN, PTQ1_OUT, PTQ1_IN_PU, PTQ1_IN,
+		PTQ0_FN, PTQ0_OUT, PTQ0_IN_PU, PTQ0_IN }
+	},
+	{ PINMUX_CFG_REG("PRCR", 0xa405011c, 16, 2) {
+		PTR7_FN, PTR7_OUT, PTR7_IN_PU, PTR7_IN,
+		PTR6_FN, PTR6_OUT, PTR6_IN_PU, PTR6_IN,
+		PTR5_FN, PTR5_OUT, PTR5_IN_PU, PTR5_IN,
+		PTR4_FN, PTR4_OUT, PTR4_IN_PU, PTR4_IN,
+		PTR3_FN, 0,        PTR3_IN_PU, PTR3_IN,
+		PTR2_FN, 0,        PTR2_IN_PU, PTR2_IN,
+		PTR1_FN, PTR1_OUT, PTR1_IN_PU, PTR1_IN,
+		PTR0_FN, PTR0_OUT, PTR0_IN_PU, PTR0_IN }
+	},
+	{ PINMUX_CFG_REG("PSCR", 0xa405011e, 16, 2) {
+		0, 0, 0, 0,
+		PTS6_FN, PTS6_OUT, PTS6_IN_PU, PTS6_IN,
+		PTS5_FN, PTS5_OUT, PTS5_IN_PU, PTS5_IN,
+		PTS4_FN, PTS4_OUT, PTS4_IN_PU, PTS4_IN,
+		PTS3_FN, PTS3_OUT, PTS3_IN_PU, PTS3_IN,
+		PTS2_FN, PTS2_OUT, PTS2_IN_PU, PTS2_IN,
+		PTS1_FN, PTS1_OUT, PTS1_IN_PU, PTS1_IN,
+		PTS0_FN, PTS0_OUT, PTS0_IN_PU, PTS0_IN }
+	},
+	{ PINMUX_CFG_REG("PTCR", 0xa4050140, 16, 2) {
+		PTT7_FN, PTT7_OUT, PTT7_IN_PU, PTT7_IN,
+		PTT6_FN, PTT6_OUT, PTT6_IN_PU, PTT6_IN,
+		PTT5_FN, PTT5_OUT, PTT5_IN_PU, PTT5_IN,
+		PTT4_FN, PTT4_OUT, PTT4_IN_PU, PTT4_IN,
+		PTT3_FN, PTT3_OUT, PTT3_IN_PU, PTT3_IN,
+		PTT2_FN, PTT2_OUT, PTT2_IN_PU, PTT2_IN,
+		PTT1_FN, PTT1_OUT, PTT1_IN_PU, PTT1_IN,
+		PTT0_FN, PTT0_OUT, PTT0_IN_PU, PTT0_IN }
+	},
+	{ PINMUX_CFG_REG("PUCR", 0xa4050142, 16, 2) {
+		PTU7_FN, PTU7_OUT, PTU7_IN_PU, PTU7_IN,
+		PTU6_FN, PTU6_OUT, PTU6_IN_PU, PTU6_IN,
+		PTU5_FN, PTU5_OUT, PTU5_IN_PU, PTU5_IN,
+		PTU4_FN, PTU4_OUT, PTU4_IN_PU, PTU4_IN,
+		PTU3_FN, PTU3_OUT, PTU3_IN_PU, PTU3_IN,
+		PTU2_FN, PTU2_OUT, PTU2_IN_PU, PTU2_IN,
+		PTU1_FN, PTU1_OUT, PTU1_IN_PU, PTU1_IN,
+		PTU0_FN, PTU0_OUT, PTU0_IN_PU, PTU0_IN }
+	},
+	{ PINMUX_CFG_REG("PVCR", 0xa4050144, 16, 2) {
+		PTV7_FN, PTV7_OUT, PTV7_IN_PU, PTV7_IN,
+		PTV6_FN, PTV6_OUT, PTV6_IN_PU, PTV6_IN,
+		PTV5_FN, PTV5_OUT, PTV5_IN_PU, PTV5_IN,
+		PTV4_FN, PTV4_OUT, PTV4_IN_PU, PTV4_IN,
+		PTV3_FN, PTV3_OUT, PTV3_IN_PU, PTV3_IN,
+		PTV2_FN, PTV2_OUT, PTV2_IN_PU, PTV2_IN,
+		PTV1_FN, PTV1_OUT, PTV1_IN_PU, PTV1_IN,
+		PTV0_FN, PTV0_OUT, PTV0_IN_PU, PTV0_IN }
+	},
+	{ PINMUX_CFG_REG("PWCR", 0xa4050146, 16, 2) {
+		PTW7_FN, PTW7_OUT, PTW7_IN_PU, PTW7_IN,
+		PTW6_FN, PTW6_OUT, PTW6_IN_PU, PTW6_IN,
+		PTW5_FN, PTW5_OUT, PTW5_IN_PU, PTW5_IN,
+		PTW4_FN, PTW4_OUT, PTW4_IN_PU, PTW4_IN,
+		PTW3_FN, PTW3_OUT, PTW3_IN_PU, PTW3_IN,
+		PTW2_FN, PTW2_OUT, PTW2_IN_PU, PTW2_IN,
+		PTW1_FN, PTW1_OUT, PTW1_IN_PU, PTW1_IN,
+		PTW0_FN, PTW0_OUT, PTW0_IN_PU, PTW0_IN }
+	},
+	{ PINMUX_CFG_REG("PXCR", 0xa4050148, 16, 2) {
+		PTX7_FN, PTX7_OUT, PTX7_IN_PU, PTX7_IN,
+		PTX6_FN, PTX6_OUT, PTX6_IN_PU, PTX6_IN,
+		PTX5_FN, PTX5_OUT, PTX5_IN_PU, PTX5_IN,
+		PTX4_FN, PTX4_OUT, PTX4_IN_PU, PTX4_IN,
+		PTX3_FN, PTX3_OUT, PTX3_IN_PU, PTX3_IN,
+		PTX2_FN, PTX2_OUT, PTX2_IN_PU, PTX2_IN,
+		PTX1_FN, PTX1_OUT, PTX1_IN_PU, PTX1_IN,
+		PTX0_FN, PTX0_OUT, PTX0_IN_PU, PTX0_IN }
+	},
+	{ PINMUX_CFG_REG("PYCR", 0xa405014a, 16, 2) {
+		PTY7_FN, PTY7_OUT, PTY7_IN_PU, PTY7_IN,
+		PTY6_FN, PTY6_OUT, PTY6_IN_PU, PTY6_IN,
+		PTY5_FN, PTY5_OUT, PTY5_IN_PU, PTY5_IN,
+		PTY4_FN, PTY4_OUT, PTY4_IN_PU, PTY4_IN,
+		PTY3_FN, PTY3_OUT, PTY3_IN_PU, PTY3_IN,
+		PTY2_FN, PTY2_OUT, PTY2_IN_PU, PTY2_IN,
+		PTY1_FN, PTY1_OUT, PTY1_IN_PU, PTY1_IN,
+		PTY0_FN, PTY0_OUT, PTY0_IN_PU, PTY0_IN }
+	},
+	{ PINMUX_CFG_REG("PZCR", 0xa405014c, 16, 2) {
+		PTZ7_FN, PTZ7_OUT, PTZ7_IN_PU, PTZ7_IN,
+		PTZ6_FN, PTZ6_OUT, PTZ6_IN_PU, PTZ6_IN,
+		PTZ5_FN, PTZ5_OUT, PTZ5_IN_PU, PTZ5_IN,
+		PTZ4_FN, PTZ4_OUT, PTZ4_IN_PU, PTZ4_IN,
+		PTZ3_FN, PTZ3_OUT, PTZ3_IN_PU, PTZ3_IN,
+		PTZ2_FN, PTZ2_OUT, PTZ2_IN_PU, PTZ2_IN,
+		PTZ1_FN, PTZ1_OUT, PTZ1_IN_PU, PTZ1_IN,
+		PTZ0_FN, PTZ0_OUT, PTZ0_IN_PU, PTZ0_IN }
+	},
+	{ PINMUX_CFG_REG("PSELA", 0xa405014e, 16, 1) {
+		PSA15_0, PSA15_1,
+		PSA14_0, PSA14_1,
+		PSA13_0, PSA13_1,
+		PSA12_0, PSA12_1,
+		0, 0,
+		PSA10_0, PSA10_1,
+		PSA9_0,  PSA9_1,
+		PSA8_0,  PSA8_1,
+		PSA7_0,  PSA7_1,
+		PSA6_0,  PSA6_1,
+		PSA5_0,  PSA5_1,
+		0, 0,
+		PSA3_0,  PSA3_1,
+		PSA2_0,  PSA2_1,
+		PSA1_0,  PSA1_1,
+		PSA0_0,  PSA0_1}
+	},
+	{ PINMUX_CFG_REG("PSELB", 0xa4050150, 16, 1) {
+		0, 0,
+		PSB14_0, PSB14_1,
+		PSB13_0, PSB13_1,
+		PSB12_0, PSB12_1,
+		PSB11_0, PSB11_1,
+		PSB10_0, PSB10_1,
+		PSB9_0,  PSB9_1,
+		PSB8_0,  PSB8_1,
+		PSB7_0,  PSB7_1,
+		PSB6_0,  PSB6_1,
+		PSB5_0,  PSB5_1,
+		PSB4_0,  PSB4_1,
+		PSB3_0,  PSB3_1,
+		PSB2_0,  PSB2_1,
+		PSB1_0,  PSB1_1,
+		PSB0_0,  PSB0_1}
+	},
+	{ PINMUX_CFG_REG("PSELC", 0xa4050152, 16, 1) {
+		PSC15_0, PSC15_1,
+		PSC14_0, PSC14_1,
+		PSC13_0, PSC13_1,
+		PSC12_0, PSC12_1,
+		PSC11_0, PSC11_1,
+		PSC10_0, PSC10_1,
+		PSC9_0,  PSC9_1,
+		PSC8_0,  PSC8_1,
+		PSC7_0,  PSC7_1,
+		PSC6_0,  PSC6_1,
+		PSC5_0,  PSC5_1,
+		PSC4_0,  PSC4_1,
+		0, 0,
+		PSC2_0,  PSC2_1,
+		PSC1_0,  PSC1_1,
+		PSC0_0,  PSC0_1}
+	},
+	{ PINMUX_CFG_REG("PSELD", 0xa4050154, 16, 1) {
+		PSD15_0, PSD15_1,
+		PSD14_0, PSD14_1,
+		PSD13_0, PSD13_1,
+		PSD12_0, PSD12_1,
+		PSD11_0, PSD11_1,
+		PSD10_0, PSD10_1,
+		PSD9_0,  PSD9_1,
+		PSD8_0,  PSD8_1,
+		PSD7_0,  PSD7_1,
+		PSD6_0,  PSD6_1,
+		PSD5_0,  PSD5_1,
+		PSD4_0,  PSD4_1,
+		PSD3_0,  PSD3_1,
+		PSD2_0,  PSD2_1,
+		PSD1_0,  PSD1_1,
+		PSD0_0,  PSD0_1}
+	},
+	{ PINMUX_CFG_REG("PSELE", 0xa4050156, 16, 1) {
+		PSE15_0, PSE15_1,
+		PSE14_0, PSE14_1,
+		PSE13_0, PSE13_1,
+		PSE12_0, PSE12_1,
+		PSE11_0, PSE11_1,
+		PSE10_0, PSE10_1,
+		PSE9_0,  PSE9_1,
+		PSE8_0,  PSE8_1,
+		PSE7_0,  PSE7_1,
+		PSE6_0,  PSE6_1,
+		PSE5_0,  PSE5_1,
+		PSE4_0,  PSE4_1,
+		PSE3_0,  PSE3_1,
+		PSE2_0,  PSE2_1,
+		PSE1_0,  PSE1_1,
+		PSE0_0,  PSE0_1}
+	},
+	{}
+};
+
+static struct pinmux_data_reg pinmux_data_regs[] = {
+	{ PINMUX_DATA_REG("PADR", 0xa4050120, 8) {
+		PTA7_DATA, PTA6_DATA, PTA5_DATA, PTA4_DATA,
+		PTA3_DATA, PTA2_DATA, PTA1_DATA, PTA0_DATA }
+	},
+	{ PINMUX_DATA_REG("PBDR", 0xa4050122, 8) {
+		PTB7_DATA, PTB6_DATA, PTB5_DATA, PTB4_DATA,
+		PTB3_DATA, PTB2_DATA, PTB1_DATA, PTB0_DATA }
+	},
+	{ PINMUX_DATA_REG("PCDR", 0xa4050124, 8) {
+		PTC7_DATA, PTC6_DATA, PTC5_DATA, PTC4_DATA,
+		PTC3_DATA, PTC2_DATA, PTC1_DATA, PTC0_DATA }
+	},
+	{ PINMUX_DATA_REG("PDDR", 0xa4050126, 8) {
+		PTD7_DATA, PTD6_DATA, PTD5_DATA, PTD4_DATA,
+		PTD3_DATA, PTD2_DATA, PTD1_DATA, PTD0_DATA }
+	},
+	{ PINMUX_DATA_REG("PEDR", 0xa4050128, 8) {
+		PTE7_DATA, PTE6_DATA, PTE5_DATA, PTE4_DATA,
+		PTE3_DATA, PTE2_DATA, PTE1_DATA, PTE0_DATA }
+	},
+	{ PINMUX_DATA_REG("PFDR", 0xa405012a, 8) {
+		PTF7_DATA, PTF6_DATA, PTF5_DATA, PTF4_DATA,
+		PTF3_DATA, PTF2_DATA, PTF1_DATA, PTF0_DATA }
+	},
+	{ PINMUX_DATA_REG("PGDR", 0xa405012c, 8) {
+		0,         0,         PTG5_DATA, PTG4_DATA,
+		PTG3_DATA, PTG2_DATA, PTG1_DATA, PTG0_DATA }
+	},
+	{ PINMUX_DATA_REG("PHDR", 0xa405012e, 8) {
+		PTH7_DATA, PTH6_DATA, PTH5_DATA, PTH4_DATA,
+		PTH3_DATA, PTH2_DATA, PTH1_DATA, PTH0_DATA }
+	},
+	{ PINMUX_DATA_REG("PJDR", 0xa4050130, 8) {
+		PTJ7_DATA, PTJ6_DATA, PTJ5_DATA, 0,
+		PTJ3_DATA, PTJ2_DATA, PTJ1_DATA, PTJ0_DATA }
+	},
+	{ PINMUX_DATA_REG("PKDR", 0xa4050132, 8) {
+		PTK7_DATA, PTK6_DATA, PTK5_DATA, PTK4_DATA,
+		PTK3_DATA, PTK2_DATA, PTK1_DATA, PTK0_DATA }
+	},
+	{ PINMUX_DATA_REG("PLDR", 0xa4050134, 8) {
+		PTL7_DATA, PTL6_DATA, PTL5_DATA, PTL4_DATA,
+		PTL3_DATA, PTL2_DATA, PTL1_DATA, PTL0_DATA }
+	},
+	{ PINMUX_DATA_REG("PMDR", 0xa4050136, 8) {
+		PTM7_DATA, PTM6_DATA, PTM5_DATA, PTM4_DATA,
+		PTM3_DATA, PTM2_DATA, PTM1_DATA, PTM0_DATA }
+	},
+	{ PINMUX_DATA_REG("PNDR", 0xa4050138, 8) {
+		PTN7_DATA, PTN6_DATA, PTN5_DATA, PTN4_DATA,
+		PTN3_DATA, PTN2_DATA, PTN1_DATA, PTN0_DATA }
+	},
+	{ PINMUX_DATA_REG("PQDR", 0xa405013a, 8) {
+		PTQ7_DATA, PTQ6_DATA, PTQ5_DATA, PTQ4_DATA,
+		PTQ3_DATA, PTQ2_DATA, PTQ1_DATA, PTQ0_DATA }
+	},
+	{ PINMUX_DATA_REG("PRDR", 0xa405013c, 8) {
+		PTR7_DATA, PTR6_DATA, PTR5_DATA, PTR4_DATA,
+		PTR3_DATA, PTR2_DATA, PTR1_DATA, PTR0_DATA }
+	},
+	{ PINMUX_DATA_REG("PSDR", 0xa405013e, 8) {
+		0,         PTS6_DATA, PTS5_DATA, PTS4_DATA,
+		PTS3_DATA, PTS2_DATA, PTS1_DATA, PTS0_DATA }
+	},
+	{ PINMUX_DATA_REG("PTDR", 0xa4050160, 8) {
+		PTT7_DATA, PTT6_DATA, PTT5_DATA, PTT4_DATA,
+		PTT3_DATA, PTT2_DATA, PTT1_DATA, PTT0_DATA }
+	},
+	{ PINMUX_DATA_REG("PUDR", 0xa4050162, 8) {
+		PTU7_DATA, PTU6_DATA, PTU5_DATA, PTU4_DATA,
+		PTU3_DATA, PTU2_DATA, PTU1_DATA, PTU0_DATA }
+	},
+	{ PINMUX_DATA_REG("PVDR", 0xa4050164, 8) {
+		PTV7_DATA, PTV6_DATA, PTV5_DATA, PTV4_DATA,
+		PTV3_DATA, PTV2_DATA, PTV1_DATA, PTV0_DATA }
+	},
+	{ PINMUX_DATA_REG("PWDR", 0xa4050166, 8) {
+		PTW7_DATA, PTW6_DATA, PTW5_DATA, PTW4_DATA,
+		PTW3_DATA, PTW2_DATA, PTW1_DATA, PTW0_DATA }
+	},
+	{ PINMUX_DATA_REG("PXDR", 0xa4050168, 8) {
+		PTX7_DATA, PTX6_DATA, PTX5_DATA, PTX4_DATA,
+		PTX3_DATA, PTX2_DATA, PTX1_DATA, PTX0_DATA }
+	},
+	{ PINMUX_DATA_REG("PYDR", 0xa405016a, 8) {
+		PTY7_DATA, PTY6_DATA, PTY5_DATA, PTY4_DATA,
+		PTY3_DATA, PTY2_DATA, PTY1_DATA, PTY0_DATA }
+	},
+	{ PINMUX_DATA_REG("PZDR", 0xa405016c, 8) {
+		PTZ7_DATA, PTZ6_DATA, PTZ5_DATA, PTZ4_DATA,
+		PTZ3_DATA, PTZ2_DATA, PTZ1_DATA, PTZ0_DATA }
+	},
+	{ },
+};
+
+static struct pinmux_info sh7724_pinmux_info = {
+	.name = "sh7724_pfc",
+	.reserved_id = PINMUX_RESERVED,
+	.data = { PINMUX_DATA_BEGIN, PINMUX_DATA_END },
+	.input = { PINMUX_INPUT_BEGIN, PINMUX_INPUT_END },
+	.input_pu = { PINMUX_INPUT_PULLUP_BEGIN, PINMUX_INPUT_PULLUP_END },
+	.output = { PINMUX_OUTPUT_BEGIN, PINMUX_OUTPUT_END },
+	.mark = { PINMUX_MARK_BEGIN, PINMUX_MARK_END },
+	.function = { PINMUX_FUNCTION_BEGIN, PINMUX_FUNCTION_END },
+
+	.first_gpio = GPIO_PTA7,
+	.last_gpio = GPIO_FN_INTC_IRQ0,
+
+	.gpios = pinmux_gpios,
+	.cfg_regs = pinmux_config_regs,
+	.data_regs = pinmux_data_regs,
+
+	.gpio_data = pinmux_data,
+	.gpio_data_size = ARRAY_SIZE(pinmux_data),
+};
+
+static int __init plat_pinmux_setup(void)
+{
+	return register_pinmux(&sh7724_pinmux_info);
+}
+arch_initcall(plat_pinmux_setup);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
new file mode 100644
index 0000000000000..4327b1e080b73
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -0,0 +1,371 @@
+/*
+ * SH7724 Setup
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Based on SH7723 Setup
+ * Copyright (C) 2008  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/mm.h>
+#include <linux/serial_sci.h>
+#include <linux/uio_driver.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+#include <asm/mmzone.h>
+
+/* Serial */
+static struct plat_sci_port sci_platform_data[] = {
+	{
+		.mapbase        = 0xffe00000,
+		.flags          = UPF_BOOT_AUTOCONF,
+		.type           = PORT_SCIF,
+		.irqs           = { 80, 80, 80, 80 },
+	}, {
+		.mapbase        = 0xffe10000,
+		.flags          = UPF_BOOT_AUTOCONF,
+		.type           = PORT_SCIF,
+		.irqs           = { 81, 81, 81, 81 },
+	}, {
+		.mapbase        = 0xffe20000,
+		.flags          = UPF_BOOT_AUTOCONF,
+		.type           = PORT_SCIF,
+		.irqs           = { 82, 82, 82, 82 },
+	}, {
+		.mapbase	= 0xa4e30000,
+		.flags		= UPF_BOOT_AUTOCONF,
+		.type		= PORT_SCIFA,
+		.irqs		= { 56, 56, 56, 56 },
+	}, {
+		.mapbase	= 0xa4e40000,
+		.flags		= UPF_BOOT_AUTOCONF,
+		.type		= PORT_SCIFA,
+		.irqs		= { 88, 88, 88, 88 },
+	}, {
+		.mapbase	= 0xa4e50000,
+		.flags		= UPF_BOOT_AUTOCONF,
+		.type		= PORT_SCIFA,
+		.irqs		= { 109, 109, 109, 109 },
+	}, {
+		.flags = 0,
+	}
+};
+
+static struct platform_device sci_device = {
+	.name		= "sh-sci",
+	.id		= -1,
+	.dev		= {
+		.platform_data	= sci_platform_data,
+	},
+};
+
+/* RTC */
+static struct resource rtc_resources[] = {
+	[0] = {
+		.start	= 0xa465fec0,
+		.end	= 0xa465fec0 + 0x58 - 1,
+		.flags	= IORESOURCE_IO,
+	},
+	[1] = {
+		/* Period IRQ */
+		.start	= 69,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* Carry IRQ */
+		.start	= 70,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[3] = {
+		/* Alarm IRQ */
+		.start	= 68,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device rtc_device = {
+	.name		= "sh-rtc",
+	.id		= -1,
+	.num_resources	= ARRAY_SIZE(rtc_resources),
+	.resource	= rtc_resources,
+};
+
+static struct platform_device *sh7724_devices[] __initdata = {
+	&sci_device,
+	&rtc_device,
+};
+
+static int __init sh7724_devices_setup(void)
+{
+	clk_always_enable("rtc0");   /* RTC */
+
+	return platform_add_devices(sh7724_devices,
+				    ARRAY_SIZE(sh7724_devices));
+}
+device_initcall(sh7724_devices_setup);
+
+enum {
+	UNUSED = 0,
+
+	/* interrupt sources */
+	IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7,
+	HUDI,
+	DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3,
+	_2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK,
+	DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3,
+	VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI,
+	SCIFA_SCIFA0,
+	VPU_VPUI,
+	TPU_TPUI,
+	CEU21I,
+	BEU21I,
+	USB_USI0,
+	ATAPI,
+	RTC_ATI, RTC_PRI, RTC_CUI,
+	DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR,
+	DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR,
+	KEYSC_KEYI,
+	SCIF_SCIF0, SCIF_SCIF1, SCIF_SCIF2,
+	VEU3F0I,
+	MSIOF_MSIOFI0, MSIOF_MSIOFI1,
+	SPU_SPUI0, SPU_SPUI1,
+	SCIFA_SCIFA1,
+/*	ICB_ICBI, */
+	ETHI,
+	I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI,
+	I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI,
+	SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2,
+	CMT_CMTI,
+	TSIF_TSIFI,
+/*	ICB_LMBI, */
+	FSI_FSI,
+	SCIFA_SCIFA2,
+	TMU0_TUNI0, TMU0_TUNI1, TMU0_TUNI2,
+	IRDA_IRDAI,
+	SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2,
+	JPU_JPUI,
+	MMC_MMCI0, MMC_MMCI1, MMC_MMCI2,
+	LCDC_LCDCI,
+	TMU1_TUNI0, TMU1_TUNI1, TMU1_TUNI2,
+
+	/* interrupt groups */
+	DMAC1A, _2DG, DMAC0A, VIO, RTC,
+	DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMC,
+};
+
+static struct intc_vect vectors[] __initdata = {
+	INTC_VECT(IRQ0, 0x600), INTC_VECT(IRQ1, 0x620),
+	INTC_VECT(IRQ2, 0x640), INTC_VECT(IRQ3, 0x660),
+	INTC_VECT(IRQ4, 0x680), INTC_VECT(IRQ5, 0x6a0),
+	INTC_VECT(IRQ6, 0x6c0), INTC_VECT(IRQ7, 0x6e0),
+
+	INTC_VECT(DMAC1A_DEI0, 0x700),
+	INTC_VECT(DMAC1A_DEI1, 0x720),
+	INTC_VECT(DMAC1A_DEI2, 0x740),
+	INTC_VECT(DMAC1A_DEI3, 0x760),
+
+	INTC_VECT(_2DG_TRI, 0x780),
+	INTC_VECT(_2DG_INI, 0x7A0),
+	INTC_VECT(_2DG_CEI, 0x7C0),
+	INTC_VECT(_2DG_BRK, 0x7E0),
+
+	INTC_VECT(DMAC0A_DEI0, 0x800),
+	INTC_VECT(DMAC0A_DEI1, 0x820),
+	INTC_VECT(DMAC0A_DEI2, 0x840),
+	INTC_VECT(DMAC0A_DEI3, 0x860),
+
+	INTC_VECT(VIO_CEU20I, 0x880),
+	INTC_VECT(VIO_BEU20I, 0x8A0),
+	INTC_VECT(VIO_VEU3F1, 0x8C0),
+	INTC_VECT(VIO_VOUI, 0x8E0),
+
+	INTC_VECT(SCIFA_SCIFA0, 0x900),
+	INTC_VECT(VPU_VPUI, 0x980),
+	INTC_VECT(TPU_TPUI, 0x9A0),
+	INTC_VECT(CEU21I, 0x9E0),
+	INTC_VECT(BEU21I, 0xA00),
+	INTC_VECT(USB_USI0, 0xA20),
+	INTC_VECT(ATAPI, 0xA60),
+
+	INTC_VECT(RTC_ATI, 0xA80),
+	INTC_VECT(RTC_PRI, 0xAA0),
+	INTC_VECT(RTC_CUI, 0xAC0),
+
+	INTC_VECT(DMAC1B_DEI4, 0xB00),
+	INTC_VECT(DMAC1B_DEI5, 0xB20),
+	INTC_VECT(DMAC1B_DADERR, 0xB40),
+
+	INTC_VECT(DMAC0B_DEI4, 0xB80),
+	INTC_VECT(DMAC0B_DEI5, 0xBA0),
+	INTC_VECT(DMAC0B_DADERR, 0xBC0),
+
+	INTC_VECT(KEYSC_KEYI, 0xBE0),
+	INTC_VECT(SCIF_SCIF0, 0xC00),
+	INTC_VECT(SCIF_SCIF1, 0xC20),
+	INTC_VECT(SCIF_SCIF2, 0xC40),
+	INTC_VECT(VEU3F0I, 0xC60),
+	INTC_VECT(MSIOF_MSIOFI0, 0xC80),
+	INTC_VECT(MSIOF_MSIOFI1, 0xCA0),
+	INTC_VECT(SPU_SPUI0, 0xCC0),
+	INTC_VECT(SPU_SPUI1, 0xCE0),
+	INTC_VECT(SCIFA_SCIFA1, 0xD00),
+
+/*	INTC_VECT(ICB_ICBI, 0xD20), */
+	INTC_VECT(ETHI, 0xD60),
+
+	INTC_VECT(I2C1_ALI, 0xD80),
+	INTC_VECT(I2C1_TACKI, 0xDA0),
+	INTC_VECT(I2C1_WAITI, 0xDC0),
+	INTC_VECT(I2C1_DTEI, 0xDE0),
+
+	INTC_VECT(I2C0_ALI, 0xE00),
+	INTC_VECT(I2C0_TACKI, 0xE20),
+	INTC_VECT(I2C0_WAITI, 0xE40),
+	INTC_VECT(I2C0_DTEI, 0xE60),
+
+	INTC_VECT(SDHI0_SDHII0, 0xE80),
+	INTC_VECT(SDHI0_SDHII1, 0xEA0),
+	INTC_VECT(SDHI0_SDHII2, 0xEC0),
+
+	INTC_VECT(CMT_CMTI, 0xF00),
+	INTC_VECT(TSIF_TSIFI, 0xF20),
+/*	INTC_VECT(ICB_LMBI, 0xF60), */
+	INTC_VECT(FSI_FSI, 0xF80),
+	INTC_VECT(SCIFA_SCIFA2, 0xFA0),
+
+	INTC_VECT(TMU0_TUNI0, 0x400),
+	INTC_VECT(TMU0_TUNI1, 0x420),
+	INTC_VECT(TMU0_TUNI2, 0x440),
+
+	INTC_VECT(IRDA_IRDAI, 0x480),
+
+	INTC_VECT(SDHI1_SDHII0, 0x4E0),
+	INTC_VECT(SDHI1_SDHII1, 0x500),
+	INTC_VECT(SDHI1_SDHII2, 0x520),
+
+	INTC_VECT(JPU_JPUI, 0x560),
+
+	INTC_VECT(MMC_MMCI0, 0x580),
+	INTC_VECT(MMC_MMCI1, 0x5A0),
+	INTC_VECT(MMC_MMCI2, 0x5C0),
+
+	INTC_VECT(LCDC_LCDCI, 0xF40),
+
+	INTC_VECT(TMU1_TUNI0, 0x920),
+	INTC_VECT(TMU1_TUNI1, 0x940),
+	INTC_VECT(TMU1_TUNI2, 0x960),
+};
+
+static struct intc_group groups[] __initdata = {
+	INTC_GROUP(DMAC1A, DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3),
+	INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK),
+	INTC_GROUP(DMAC0A, DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3),
+	INTC_GROUP(VIO, VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI),
+	INTC_GROUP(RTC, RTC_ATI, RTC_PRI, RTC_CUI),
+	INTC_GROUP(DMAC1B, DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR),
+	INTC_GROUP(DMAC0B, DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR),
+	INTC_GROUP(I2C0, I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI),
+	INTC_GROUP(I2C1, I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI),
+	INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2),
+	INTC_GROUP(SDHI1, SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2),
+	INTC_GROUP(SPU, SPU_SPUI0, SPU_SPUI1),
+	INTC_GROUP(MMC, MMC_MMCI0, MMC_MMCI1, MMC_MMCI2),
+};
+
+/* FIXMEEEEEEEEEEEEEEEEEEE !!!!! */
+/* very bad manual !! */
+static struct intc_mask_reg mask_registers[] __initdata = {
+	{ 0xa4080080, 0xa40800c0, 8, /* IMR0 / IMCR0 */
+	  { 0, TMU1_TUNI2, TMU1_TUNI1, TMU1_TUNI0,
+	    /*SDHII3?*/0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } },
+	{ 0xa4080084, 0xa40800c4, 8, /* IMR1 / IMCR1 */
+	  { VIO_VOUI, VIO_VEU3F1, VIO_BEU20I, VIO_CEU20I,
+	    DMAC0A_DEI3, DMAC0A_DEI2, DMAC0A_DEI1, DMAC0A_DEI0 } },
+	{ 0xa4080088, 0xa40800c8, 8, /* IMR2 / IMCR2 */
+	  { 0, 0, 0, VPU_VPUI, ATAPI, ETHI, 0, /*SCIFA3*/SCIFA_SCIFA0 } },
+	{ 0xa408008c, 0xa40800cc, 8, /* IMR3 / IMCR3 */
+	  { DMAC1A_DEI3, DMAC1A_DEI2, DMAC1A_DEI1, DMAC1A_DEI0,
+	    SPU_SPUI1, SPU_SPUI0, BEU21I, IRDA_IRDAI } },
+	{ 0xa4080090, 0xa40800d0, 8, /* IMR4 / IMCR4 */
+	  { 0, TMU0_TUNI2, TMU0_TUNI1, TMU0_TUNI0,
+	    JPU_JPUI, 0, 0, LCDC_LCDCI } },
+	{ 0xa4080094, 0xa40800d4, 8, /* IMR5 / IMCR5 */
+	  { KEYSC_KEYI, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4,
+	    VEU3F0I, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } },
+	{ 0xa4080098, 0xa40800d8, 8, /* IMR6 / IMCR6 */
+	  { 0, 0, /*ICB_ICBI*/0, /*SCIFA4*/SCIFA_SCIFA1,
+	    CEU21I, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } },
+	{ 0xa408009c, 0xa40800dc, 8, /* IMR7 / IMCR7 */
+	  { I2C0_DTEI, I2C0_WAITI, I2C0_TACKI, I2C0_ALI,
+	    I2C1_DTEI, I2C1_WAITI, I2C1_TACKI, I2C1_ALI } },
+	{ 0xa40800a0, 0xa40800e0, 8, /* IMR8 / IMCR8 */
+	  { /*SDHII3*/0, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0,
+	    0, 0, /*SCIFA5*/SCIFA_SCIFA2, FSI_FSI } },
+	{ 0xa40800a4, 0xa40800e4, 8, /* IMR9 / IMCR9 */
+	  { 0, 0, 0, CMT_CMTI, 0, /*USB1*/0, USB_USI0, 0 } },
+	{ 0xa40800a8, 0xa40800e8, 8, /* IMR10 / IMCR10 */
+	  { 0, DMAC1B_DADERR, DMAC1B_DEI5, DMAC1B_DEI4,
+	    0, RTC_ATI, RTC_PRI, RTC_CUI } },
+	{ 0xa40800ac, 0xa40800ec, 8, /* IMR11 / IMCR11 */
+	  { _2DG_BRK, _2DG_CEI, _2DG_INI, _2DG_TRI,
+	    0, TPU_TPUI, /*ICB_LMBI*/0, TSIF_TSIFI } },
+	{ 0xa40800b0, 0xa40800f0, 8, /* IMR12 / IMCR12 */
+	  { 0, 0, 0, 0, 0, 0, 0, 0/*2DDMAC*/ } },
+	{ 0xa4140044, 0xa4140064, 8, /* INTMSK00 / INTMSKCLR00 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static struct intc_prio_reg prio_registers[] __initdata = {
+	{ 0xa4080000, 0, 16, 4, /* IPRA */ { TMU0_TUNI0, TMU0_TUNI1,
+					     TMU0_TUNI2, IRDA_IRDAI } },
+	{ 0xa4080004, 0, 16, 4, /* IPRB */ { JPU_JPUI, LCDC_LCDCI,
+					     DMAC1A, BEU21I } },
+	{ 0xa4080008, 0, 16, 4, /* IPRC */ { TMU1_TUNI0, TMU1_TUNI1,
+					     TMU1_TUNI2, SPU } },
+	{ 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMC, 0, ATAPI } },
+	{ 0xa4080010, 0, 16, 4, /* IPRE */
+	  { DMAC0A, /*BEU?VEU?*/VIO, /*SCIFA3*/SCIFA_SCIFA0, /*VPU5F*/
+	    VPU_VPUI } },
+	{ 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC_KEYI, DMAC0B,
+					     USB_USI0, CMT_CMTI } },
+	{ 0xa4080018, 0, 16, 4, /* IPRG */ { SCIF_SCIF0, SCIF_SCIF1,
+					     SCIF_SCIF2, VEU3F0I } },
+	{ 0xa408001c, 0, 16, 4, /* IPRH */ { MSIOF_MSIOFI0, MSIOF_MSIOFI1,
+					     I2C1, I2C0 } },
+	{ 0xa4080020, 0, 16, 4, /* IPRI */ { /*SCIFA4*/SCIFA_SCIFA1, /*ICB*/0,
+					     TSIF_TSIFI, _2DG/*ICB?*/ } },
+	{ 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU21I, ETHI, FSI_FSI, SDHI1 } },
+	{ 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, /*ICB?*/0, SDHI0 } },
+	{ 0xa408002c, 0, 16, 4, /* IPRL */ { /*SCIFA5*/SCIFA_SCIFA2, 0,
+					     TPU_TPUI, /*2DDMAC*/0 } },
+	{ 0xa4140010, 0, 32, 4, /* INTPRI00 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static struct intc_sense_reg sense_registers[] __initdata = {
+	{ 0xa414001c, 16, 2, /* ICR1 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static struct intc_mask_reg ack_registers[] __initdata = {
+	{ 0xa4140024, 0, 8, /* INTREQ00 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
+};
+
+static DECLARE_INTC_DESC_ACK(intc_desc, "sh7724", vectors, groups,
+			     mask_registers, prio_registers, sense_registers,
+			     ack_registers);
+
+void __init plat_irq_setup(void)
+{
+	register_intc_controller(&intc_desc);
+}
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 04a6004fccc45..0e6ed804546ea 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -435,7 +435,8 @@ static const char *cpu_name[] = {
 	[CPU_SH7722]	= "SH7722",	[CPU_SHX3]	= "SH-X3",
 	[CPU_SH5_101]	= "SH5-101",	[CPU_SH5_103]	= "SH5-103",
 	[CPU_MXG]	= "MX-G",	[CPU_SH7723]	= "SH7723",
-	[CPU_SH7366]	= "SH7366",	[CPU_SH_NONE]	= "Unknown"
+	[CPU_SH7366]	= "SH7366",	[CPU_SH7724]	= "SH7724",
+	[CPU_SH_NONE]	= "Unknown"
 };
 
 const char *get_cpu_subtype(struct sh_cpuinfo *c)
diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c
index 1b9d4304b3bf7..44f4e31c6d636 100644
--- a/arch/sh/oprofile/common.c
+++ b/arch/sh/oprofile/common.c
@@ -109,6 +109,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops)
 	case CPU_SH7785:
 	case CPU_SH7786:
 	case CPU_SH7723:
+	case CPU_SH7724:
 	case CPU_SHX3:
 		lmodel = &op_model_sh4a_ops;
 		break;
-- 
GitLab


From 47948d2bd6d27648a107a27357b3bc5ad054ff64 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 15 Apr 2009 11:42:47 +0900
Subject: [PATCH 0417/2198] serial: sh-sci: SH7724 support.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.h | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index d0aa82d7fce09..84cc6512f0812 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -91,6 +91,9 @@
 # define SCSPTR5                0xa4050128
 # define SCIF_ORER              0x0001  /* overrun error bit */
 # define SCSCR_INIT(port)       0x0038  /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+# define SCIF_ORER              0x0001  /* overrun error bit */
+# define SCSCR_INIT(port)       0x0038  /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */
 #elif defined(CONFIG_CPU_SUBTYPE_SH4_202)
 # define SCSPTR2 0xffe80020 /* 16 bit SCIF */
 # define SCIF_ORER 0x0001   /* overrun error bit */
@@ -361,7 +364,8 @@
                  h8_sci_offset, h8_sci_size) \
   CPU_SCI_FNS(name, h8_sci_offset, h8_sci_size)
 #define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size)
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
+      defined(CONFIG_CPU_SUBTYPE_SH7724)
         #define SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) \
                 CPU_SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size)
         #define SCIF_FNS(name, sh4_scif_offset, sh4_scif_size) \
@@ -390,7 +394,8 @@ SCIF_FNS(SCFDR,  0x1c, 16)
 SCIF_FNS(SCxTDR, 0x20,  8)
 SCIF_FNS(SCxRDR, 0x24,  8)
 SCIF_FNS(SCLSR,  0x24, 16)
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
+      defined(CONFIG_CPU_SUBTYPE_SH7724)
 SCIx_FNS(SCSMR,  0x00, 16, 0x00, 16)
 SCIx_FNS(SCBRR,  0x04,  8, 0x04,  8)
 SCIx_FNS(SCSCR,  0x08, 16, 0x08, 16)
@@ -604,6 +609,17 @@ static inline int sci_rxd_in(struct uart_port *port)
                 return ctrl_inb(SCSPTR5) & 0x0008 ? 1 : 0; /* SCIF5 */
         return 1;
 }
+#elif defined(CONFIG_CPU_SUBTYPE_SH7724)
+#  define SCFSR    0x0010
+#  define SCASSR   0x0014
+static inline int sci_rxd_in(struct uart_port *port)
+{
+	if (port->type == PORT_SCIF)
+		return ctrl_inw((port->mapbase + SCFSR))  & SCIF_BRK ? 1 : 0;
+	if (port->type == PORT_SCIFA)
+		return ctrl_inw((port->mapbase + SCASSR)) & SCIF_BRK ? 1 : 0;
+	return 1;
+}
 #elif defined(CONFIG_CPU_SUBTYPE_SH5_101) || defined(CONFIG_CPU_SUBTYPE_SH5_103)
 static inline int sci_rxd_in(struct uart_port *port)
 {
@@ -757,7 +773,8 @@ static inline int sci_rxd_in(struct uart_port *port)
       defined(CONFIG_CPU_SUBTYPE_SH7720) || \
       defined(CONFIG_CPU_SUBTYPE_SH7721)
 #define SCBRR_VALUE(bps, clk) (((clk*2)+16*bps)/(32*bps)-1)
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
+#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
+      defined(CONFIG_CPU_SUBTYPE_SH7724)
 static inline int scbrr_calc(struct uart_port *port, int bps, int clk)
 {
 	if (port->type == PORT_SCIF)
-- 
GitLab


From 40c7e8be556715079d0a9d7454ceb5371a2f0b39 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Thu, 16 Apr 2009 13:16:07 +0900
Subject: [PATCH 0418/2198] sh: sh7724: Add I2C support.

This adds support for the SH-Mobile I2C controller on the SH7724.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 46 ++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 4327b1e080b73..f17eda6688d04 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -99,9 +99,55 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+/* I2C0 */
+static struct resource iic0_resources[] = {
+	[0] = {
+		.name	= "IIC0",
+		.start  = 0x04470000,
+		.end    = 0x04470018 - 1,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 96,
+		.end    = 99,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device iic0_device = {
+	.name           = "i2c-sh_mobile",
+	.id             = 0, /* "i2c0" clock */
+	.num_resources  = ARRAY_SIZE(iic0_resources),
+	.resource       = iic0_resources,
+};
+
+/* I2C1 */
+static struct resource iic1_resources[] = {
+	[0] = {
+		.name	= "IIC1",
+		.start  = 0x04750000,
+		.end    = 0x04750018 - 1,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 92,
+		.end    = 95,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device iic1_device = {
+	.name           = "i2c-sh_mobile",
+	.id             = 1, /* "i2c1" clock */
+	.num_resources  = ARRAY_SIZE(iic1_resources),
+	.resource       = iic1_resources,
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
+	&iic0_device,
+	&iic1_device,
 };
 
 static int __init sh7724_devices_setup(void)
-- 
GitLab


From cd5b9ef776feff440e7a889d1a565ceabfecbfa1 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 15 Apr 2009 11:43:03 +0900
Subject: [PATCH 0419/2198] sh: sh7724: Add VPU support.

This adds uio_pdrv_genirq support for the VPU.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 33 ++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index f17eda6688d04..499a6fcdf2312 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -143,16 +143,49 @@ static struct platform_device iic1_device = {
 	.resource       = iic1_resources,
 };
 
+/* VPU */
+static struct uio_info vpu_platform_data = {
+	.name = "VPU5F",
+	.version = "0",
+	.irq = 60,
+};
+
+static struct resource vpu_resources[] = {
+	[0] = {
+		.name	= "VPU",
+		.start	= 0xfe900000,
+		.end	= 0xfe902807,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device vpu_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &vpu_platform_data,
+	},
+	.resource	= vpu_resources,
+	.num_resources	= ARRAY_SIZE(vpu_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
 	&iic1_device,
+	&vpu_device,
 };
 
 static int __init sh7724_devices_setup(void)
 {
 	clk_always_enable("rtc0");   /* RTC */
+	clk_always_enable("vpu0");   /* VPU */
+
+	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
-- 
GitLab


From ad95b78c9f735da11ff9ec760e9b038cd82aead6 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 15 Apr 2009 11:43:07 +0900
Subject: [PATCH 0420/2198] sh: sh7724: Add VEU support.

This adds uio_pdrv_genirq support for the VEU.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 64 ++++++++++++++++++++++++++
 1 file changed, 64 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 499a6fcdf2312..65570ed69e6cf 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -172,20 +172,84 @@ static struct platform_device vpu_device = {
 	.num_resources	= ARRAY_SIZE(vpu_resources),
 };
 
+/* VEU0 */
+static struct uio_info veu0_platform_data = {
+	.name = "VEU3F0",
+	.version = "0",
+	.irq = 83,
+};
+
+static struct resource veu0_resources[] = {
+	[0] = {
+		.name	= "VEU3F0",
+		.start	= 0xfe920000,
+		.end	= 0xfe9200cb - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device veu0_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &veu0_platform_data,
+	},
+	.resource	= veu0_resources,
+	.num_resources	= ARRAY_SIZE(veu0_resources),
+};
+
+/* VEU1 */
+static struct uio_info veu1_platform_data = {
+	.name = "VEU3F1",
+	.version = "0",
+	.irq = 54,
+};
+
+static struct resource veu1_resources[] = {
+	[0] = {
+		.name	= "VEU3F1",
+		.start	= 0xfe924000,
+		.end	= 0xfe9240cb - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device veu1_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &veu1_platform_data,
+	},
+	.resource	= veu1_resources,
+	.num_resources	= ARRAY_SIZE(veu1_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
 	&iic1_device,
 	&vpu_device,
+	&veu0_device,
+	&veu1_device,
 };
 
 static int __init sh7724_devices_setup(void)
 {
 	clk_always_enable("rtc0");   /* RTC */
 	clk_always_enable("vpu0");   /* VPU */
+	clk_always_enable("veu1");   /* VEU3F1 */
+	clk_always_enable("veu0");   /* VEU3F0 */
 
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
+	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
+	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
-- 
GitLab


From 6a3395beb99d7ae882ddf701c6fa6005ad7edebf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 15:36:13 +0900
Subject: [PATCH 0421/2198] sh: sh7724: Add CMT clockevents support.

This enables support for the CMT clockevents driver on SH7724.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 33 ++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 65570ed69e6cf..8b87ba8f26bbd 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -230,7 +230,40 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
+static struct sh_cmt_config cmt_platform_data = {
+	.name = "CMT",
+	.channel_offset = 0x60,
+	.timer_bit = 5,
+	.clk = "cmt0",
+	.clockevent_rating = 125,
+	.clocksource_rating = 200,
+};
+
+static struct resource cmt_resources[] = {
+	[0] = {
+		.name	= "CMT",
+		.start	= 0x044a0060,
+		.end	= 0x044a006b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt_platform_data,
+	},
+	.resource	= cmt_resources,
+	.num_resources	= ARRAY_SIZE(cmt_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
+	&cmt_device,
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
-- 
GitLab


From 59fe700dcbf3d6257ae86ca8c0192fc64b2eea1c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 15:43:42 +0900
Subject: [PATCH 0422/2198] sh: Have SH7724 select ARCH_SHMOBILE.

This is an SH-Mobile CPU, so select ARCH_SHMOBILE. This enables all of
the PM functionality, amongst other things.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 505d1acbd0ad1..4c68fdedfa106 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -351,6 +351,7 @@ config CPU_SUBTYPE_SH7724
 	bool "Support SH7724 processor"
 	select CPU_SH4A
 	select CPU_SHX2
+	select ARCH_SHMOBILE
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_CMT
 	help
-- 
GitLab


From bc9f3d4291f8275924a9197a81208a5df0a15095 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 15:44:58 +0900
Subject: [PATCH 0423/2198] sh: Add a generic defconfig for SH7724 platforms.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/configs/sh7724_generic_defconfig | 707 +++++++++++++++++++++++
 1 file changed, 707 insertions(+)
 create mode 100644 arch/sh/configs/sh7724_generic_defconfig

diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
new file mode 100644
index 0000000000000..268d04ed8cdda
--- /dev/null
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -0,0 +1,707 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc2
+# Thu Apr 16 15:42:20 2009
+#
+CONFIG_SUPERH=y
+CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
+CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
+CONFIG_GENERIC_IRQ_PROBE=y
+# CONFIG_GENERIC_GPIO is not set
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+CONFIG_ARCH_SUSPEND_POSSIBLE=y
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_SYS_SUPPORTS_CMT=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_HAVE_LATENCYTOP_SUPPORT=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_ARCH_NO_VIRT_TO_BUS=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+# CONFIG_BSD_PROCESS_ACCT is not set
+
+#
+# RCU Subsystem
+#
+# CONFIG_CLASSIC_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+CONFIG_CGROUPS=y
+# CONFIG_CGROUP_DEBUG is not set
+# CONFIG_CGROUP_NS is not set
+# CONFIG_CGROUP_FREEZER is not set
+# CONFIG_CGROUP_DEVICE is not set
+# CONFIG_CPUSETS is not set
+# CONFIG_CGROUP_CPUACCT is not set
+# CONFIG_RESOURCE_COUNTERS is not set
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+# CONFIG_BLK_DEV_INITRD is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+# CONFIG_UID16 is not set
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+# CONFIG_COMPAT_BRK is not set
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
+# CONFIG_SLOB is not set
+CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
+# CONFIG_MARKERS is not set
+CONFIG_OPROFILE=y
+CONFIG_HAVE_OPROFILE=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+# CONFIG_MODULES is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="anticipatory"
+CONFIG_FREEZER=y
+
+#
+# System type
+#
+CONFIG_CPU_SH4=y
+CONFIG_CPU_SH4A=y
+CONFIG_CPU_SHX2=y
+CONFIG_ARCH_SHMOBILE=y
+# CONFIG_CPU_SUBTYPE_SH7619 is not set
+# CONFIG_CPU_SUBTYPE_SH7201 is not set
+# CONFIG_CPU_SUBTYPE_SH7203 is not set
+# CONFIG_CPU_SUBTYPE_SH7206 is not set
+# CONFIG_CPU_SUBTYPE_SH7263 is not set
+# CONFIG_CPU_SUBTYPE_MXG is not set
+# CONFIG_CPU_SUBTYPE_SH7705 is not set
+# CONFIG_CPU_SUBTYPE_SH7706 is not set
+# CONFIG_CPU_SUBTYPE_SH7707 is not set
+# CONFIG_CPU_SUBTYPE_SH7708 is not set
+# CONFIG_CPU_SUBTYPE_SH7709 is not set
+# CONFIG_CPU_SUBTYPE_SH7710 is not set
+# CONFIG_CPU_SUBTYPE_SH7712 is not set
+# CONFIG_CPU_SUBTYPE_SH7720 is not set
+# CONFIG_CPU_SUBTYPE_SH7721 is not set
+# CONFIG_CPU_SUBTYPE_SH7750 is not set
+# CONFIG_CPU_SUBTYPE_SH7091 is not set
+# CONFIG_CPU_SUBTYPE_SH7750R is not set
+# CONFIG_CPU_SUBTYPE_SH7750S is not set
+# CONFIG_CPU_SUBTYPE_SH7751 is not set
+# CONFIG_CPU_SUBTYPE_SH7751R is not set
+# CONFIG_CPU_SUBTYPE_SH7760 is not set
+# CONFIG_CPU_SUBTYPE_SH4_202 is not set
+# CONFIG_CPU_SUBTYPE_SH7723 is not set
+CONFIG_CPU_SUBTYPE_SH7724=y
+# CONFIG_CPU_SUBTYPE_SH7763 is not set
+# CONFIG_CPU_SUBTYPE_SH7770 is not set
+# CONFIG_CPU_SUBTYPE_SH7780 is not set
+# CONFIG_CPU_SUBTYPE_SH7785 is not set
+# CONFIG_CPU_SUBTYPE_SH7786 is not set
+# CONFIG_CPU_SUBTYPE_SHX3 is not set
+# CONFIG_CPU_SUBTYPE_SH7343 is not set
+# CONFIG_CPU_SUBTYPE_SH7722 is not set
+# CONFIG_CPU_SUBTYPE_SH7366 is not set
+
+#
+# Memory management options
+#
+CONFIG_QUICKLIST=y
+CONFIG_MMU=y
+CONFIG_PAGE_OFFSET=0x80000000
+CONFIG_MEMORY_START=0x08000000
+CONFIG_MEMORY_SIZE=0x04000000
+CONFIG_29BIT=y
+# CONFIG_X2TLB is not set
+CONFIG_VSYSCALL=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_MAX_ACTIVE_REGIONS=1
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
+CONFIG_PAGE_SIZE_4KB=y
+# CONFIG_PAGE_SIZE_8KB is not set
+# CONFIG_PAGE_SIZE_16KB is not set
+# CONFIG_PAGE_SIZE_64KB is not set
+CONFIG_ENTRY_OFFSET=0x00001000
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+CONFIG_SPARSEMEM_STATIC=y
+
+#
+# Memory hotplug is currently incompatible with Software Suspend
+#
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+
+#
+# Cache configuration
+#
+CONFIG_CACHE_WRITEBACK=y
+# CONFIG_CACHE_WRITETHROUGH is not set
+# CONFIG_CACHE_OFF is not set
+
+#
+# Processor features
+#
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_SH_FPU=y
+# CONFIG_SH_STORE_QUEUES is not set
+CONFIG_CPU_HAS_INTEVT=y
+CONFIG_CPU_HAS_SR_RB=y
+CONFIG_CPU_HAS_PTEA=y
+CONFIG_CPU_HAS_FPU=y
+
+#
+# Board support
+#
+
+#
+# Timer and clock configuration
+#
+CONFIG_SH_TMU=y
+CONFIG_SH_TIMER_CMT=y
+CONFIG_SH_TIMER_IRQ=16
+CONFIG_SH_PCLK_FREQ=41666666
+CONFIG_TICK_ONESHOT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_SH_CPU_FREQ=y
+
+#
+# DMA support
+#
+# CONFIG_SH_DMA is not set
+
+#
+# Companion Chips
+#
+
+#
+# Additional SuperH Device Drivers
+#
+# CONFIG_HEARTBEAT is not set
+# CONFIG_PUSH_SWITCH is not set
+
+#
+# Kernel features
+#
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+CONFIG_SCHED_HRTICK=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_KEXEC_JUMP=y
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_GUSA=y
+
+#
+# Boot options
+#
+CONFIG_ZERO_PAGE_OFFSET=0x00001000
+CONFIG_BOOT_LINK_OFFSET=0x00800000
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Power management options (EXPERIMENTAL)
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_PM_SLEEP=y
+CONFIG_SUSPEND=y
+CONFIG_SUSPEND_FREEZER=y
+CONFIG_HIBERNATION=y
+CONFIG_PM_STD_PARTITION=""
+CONFIG_CPU_IDLE=y
+CONFIG_CPU_IDLE_GOV_LADDER=y
+CONFIG_CPU_IDLE_GOV_MENU=y
+# CONFIG_NET is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+# CONFIG_PREVENT_FIRMWARE_BUILD is not set
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_MTD is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+# CONFIG_BLK_DEV_LOOP is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_BLK_DEV_HD is not set
+# CONFIG_MISC_DEVICES is not set
+CONFIG_HAVE_IDE=y
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+# CONFIG_SCSI_DMA is not set
+# CONFIG_SCSI_NETLINK is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_DEVKMEM is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_SERIAL_SH_SCI_NR_UARTS=6
+CONFIG_SERIAL_SH_SCI_CONSOLE=y
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_UNIX98_PTYS is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_IPMI_HANDLER is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_SH_MOBILE=y
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+# CONFIG_SPI is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+# CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+# CONFIG_VIDEO_DEV is not set
+# CONFIG_VIDEO_MEDIA is not set
+
+#
+# Multimedia drivers
+#
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+# CONFIG_FB is not set
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+# CONFIG_SOUND is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_MMC is not set
+# CONFIG_MEMSTICK is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+# CONFIG_RTC_DRV_PCF8563 is not set
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+CONFIG_RTC_DRV_SH=y
+# CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+CONFIG_UIO_PDRV_GENIRQ=y
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+# CONFIG_EXT2_FS is not set
+# CONFIG_EXT3_FS is not set
+# CONFIG_EXT4_FS is not set
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_FS_POSIX_ACL is not set
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_BTRFS_FS is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_MSDOS_FS is not set
+# CONFIG_VFAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+# CONFIG_PROC_FS is not set
+# CONFIG_SYSFS is not set
+# CONFIG_TMPFS is not set
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+# CONFIG_MISC_FILESYSTEMS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+# CONFIG_NLS is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_MAGIC_SYSRQ is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_HEADERS_CHECK is not set
+# CONFIG_DEBUG_KERNEL is not set
+CONFIG_STACKTRACE=y
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_SYSCTL_SYSCALL_CHECK is not set
+CONFIG_NOP_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_SAMPLES is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_SH_STANDARD_BIOS is not set
+# CONFIG_EARLY_SCIF_CONSOLE is not set
+# CONFIG_MORE_COMPILE_OPTIONS is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+# CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
+
+#
+# Library routines
+#
+CONFIG_GENERIC_FIND_LAST_BIT=y
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+# CONFIG_CRC_T10DIF is not set
+# CONFIG_CRC_ITU_T is not set
+# CONFIG_CRC32 is not set
+# CONFIG_CRC7 is not set
+# CONFIG_LIBCRC32C is not set
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
-- 
GitLab


From b8b47bfbe4eb1ae0e6891e49c86a5f4fb00413be Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 11 Mar 2009 15:41:51 +0900
Subject: [PATCH 0424/2198] sh: pass along struct pci_channel

These patches rework the pci code for the sh architecture.

Currently each board implements some kind of ioport to address mapping.
Some boards use generic_io_base others try passing addresses as io ports.
This is the first set of patches that try to unify the pci code as much
as possible to avoid duplicated code. This will in the end lead to fewer
lines board specific code and more generic code.

This patch makes sure a struct pci_channel pointer is passed along to
various pci functions such as pci_read_reg(), pci_write_reg(),
pci_fixup_pcic(), sh7751_pcic_init() and sh7780_pcic_init().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/fixups-lboxre2.c    | 23 +++++----
 arch/sh/drivers/pci/fixups-r7780rp.c    | 33 ++++++------
 arch/sh/drivers/pci/fixups-rts7751r2d.c | 23 +++++----
 arch/sh/drivers/pci/fixups-sdk7780.c    | 46 +++++++++--------
 arch/sh/drivers/pci/fixups-se7780.c     | 32 ++++++------
 arch/sh/drivers/pci/ops-landisk.c       |  2 +-
 arch/sh/drivers/pci/ops-lboxre2.c       |  2 +-
 arch/sh/drivers/pci/ops-r7780rp.c       |  2 +-
 arch/sh/drivers/pci/ops-rts7751r2d.c    |  2 +-
 arch/sh/drivers/pci/ops-sdk7780.c       |  2 +-
 arch/sh/drivers/pci/ops-se7780.c        |  2 +-
 arch/sh/drivers/pci/ops-sh4.c           | 24 ++++-----
 arch/sh/drivers/pci/ops-snapgear.c      |  2 +-
 arch/sh/drivers/pci/ops-titan.c         |  2 +-
 arch/sh/drivers/pci/pci-sh4.h           | 11 ++--
 arch/sh/drivers/pci/pci-sh7751.c        | 68 +++++++++++++------------
 arch/sh/drivers/pci/pci-sh7751.h        |  3 +-
 arch/sh/drivers/pci/pci-sh7780.c        | 31 +++++------
 arch/sh/drivers/pci/pci-sh7780.h        |  3 +-
 19 files changed, 164 insertions(+), 149 deletions(-)

diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
index 1c1d41255ec0f..a82011d03cb0a 100644
--- a/arch/sh/drivers/pci/fixups-lboxre2.c
+++ b/arch/sh/drivers/pci/fixups-lboxre2.c
@@ -9,33 +9,34 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
+#include <linux/pci.h>
 #include "pci-sh4.h"
 
 #define PCIMCR_MRSET_OFF	0xBFFFFFFF
 #define PCIMCR_RFSH_OFF		0xFFFFFFFB
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	unsigned long bcr1, mcr;
 
 	bcr1 = ctrl_inl(SH7751_BCR1);
 	bcr1 |= 0x40080000;	/* Enable Bit 19 BREQEN, set PCIC to slave */
-	pci_write_reg(bcr1, SH4_PCIBCR1);
+	pci_write_reg(chan, bcr1, SH4_PCIBCR1);
 
 	/* Enable all interrupts, so we known what to fix */
-	pci_write_reg(0x0000c3ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
-	pci_write_reg(0xfb900047, SH7751_PCICONF1);
-	pci_write_reg(0xab000001, SH7751_PCICONF4);
+	pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1);
+	pci_write_reg(chan, 0xab000001, SH7751_PCICONF4);
 
 	mcr = ctrl_inl(SH7751_MCR);
 	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-	pci_write_reg(mcr, SH4_PCIMCR);
+	pci_write_reg(chan, mcr, SH4_PCIMCR);
 
-	pci_write_reg(0x0c000000, SH7751_PCICONF5);
-	pci_write_reg(0xd0000000, SH7751_PCICONF6);
-	pci_write_reg(0x0c000000, SH4_PCILAR0);
-	pci_write_reg(0x00000000, SH4_PCILAR1);
+	pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5);
+	pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6);
+	pci_write_reg(chan, 0x0c000000, SH4_PCILAR0);
+	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 3e321df65d22a..5b25021bbd62a 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -14,32 +14,33 @@
 #include "pci-sh4.h"
 #include <asm/io.h>
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
-	pci_write_reg(0x000043ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
 
-	pci_write_reg(0xfbb00047, SH7780_PCICMD);
-	pci_write_reg(0x00000000, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
 
-	pci_write_reg(0x00011912, SH7780_PCISVID);
-	pci_write_reg(0x08000000, SH7780_PCICSCR0);
-	pci_write_reg(0x0000001b, SH7780_PCICSAR0);
-	pci_write_reg(0xfd000000, SH7780_PCICSCR1);
-	pci_write_reg(0x0000000f, SH7780_PCICSAR1);
+	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
+	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
+	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
+	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
 
-	pci_write_reg(0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(0x00fc0000, SH7780_PCIMBMR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
 
 #ifdef CONFIG_32BIT
-	pci_write_reg(0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
+	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
+	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
 #endif
 
 	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg((PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
+	pci_write_reg(chan, (PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
 		      SH7780_PCIIOBR);
-	pci_write_reg(((SH7780_PCI_IO_SIZE-1) & (7<<18)), SH7780_PCIIOBMR);
+	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
+		      SH7780_PCIIOBMR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-rts7751r2d.c b/arch/sh/drivers/pci/fixups-rts7751r2d.c
index 904bce8768d3b..38852334d4791 100644
--- a/arch/sh/drivers/pci/fixups-rts7751r2d.c
+++ b/arch/sh/drivers/pci/fixups-rts7751r2d.c
@@ -10,34 +10,35 @@
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
+#include <linux/pci.h>
 #include "pci-sh4.h"
 
 #define PCIMCR_MRSET_OFF	0xBFFFFFFF
 #define PCIMCR_RFSH_OFF		0xFFFFFFFB
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	unsigned long bcr1, mcr;
 
 	bcr1 = ctrl_inl(SH7751_BCR1);
 	bcr1 |= 0x40080000;	/* Enable Bit 19 BREQEN, set PCIC to slave */
-	pci_write_reg(bcr1, SH4_PCIBCR1);
+	pci_write_reg(chan, bcr1, SH4_PCIBCR1);
 
 	/* Enable all interrupts, so we known what to fix */
-	pci_write_reg(0x0000c3ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
 
-	pci_write_reg(0xfb900047, SH7751_PCICONF1);
-	pci_write_reg(0xab000001, SH7751_PCICONF4);
+	pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1);
+	pci_write_reg(chan, 0xab000001, SH7751_PCICONF4);
 
 	mcr = ctrl_inl(SH7751_MCR);
 	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-	pci_write_reg(mcr, SH4_PCIMCR);
+	pci_write_reg(chan, mcr, SH4_PCIMCR);
 
-	pci_write_reg(0x0c000000, SH7751_PCICONF5);
-	pci_write_reg(0xd0000000, SH7751_PCICONF6);
-	pci_write_reg(0x0c000000, SH4_PCILAR0);
-	pci_write_reg(0x00000000, SH4_PCILAR1);
+	pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5);
+	pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6);
+	pci_write_reg(chan, 0x0c000000, SH4_PCILAR0);
+	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index 2f8863099dd15..3f6754a120ed5 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -14,46 +14,48 @@
 #include "pci-sh4.h"
 #include <asm/io.h>
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
 
 	/* Enable all interrupts, so we know what to fix */
-	pci_write_reg(0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(0x0000380F, SH7780_PCIAINTM);
+	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
+	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
 
 	/* Set up standard PCI config registers */
-	pci_write_reg(0xFB00, SH7780_PCISTATUS);
-	pci_write_reg(0x0047, SH7780_PCICMD);
-	pci_write_reg(0x00, SH7780_PCIPIF);
-	pci_write_reg(0x00, SH7780_PCISUB);
-	pci_write_reg(0x06, SH7780_PCIBCC);
-	pci_write_reg(0x1912, SH7780_PCISVID);
-	pci_write_reg(0x0001, SH7780_PCISID);
+	pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS);
+	pci_write_reg(chan, 0x0047, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00, SH7780_PCIPIF);
+	pci_write_reg(chan, 0x00, SH7780_PCISUB);
+	pci_write_reg(chan, 0x06, SH7780_PCIBCC);
+	pci_write_reg(chan, 0x1912, SH7780_PCISVID);
+	pci_write_reg(chan, 0x0001, SH7780_PCISID);
 
-	pci_write_reg(0x08000000, SH7780_PCIMBAR0);	/* PCI */
-	pci_write_reg(0x08000000, SH7780_PCILAR0);	/* SHwy */
-	pci_write_reg(0x07F00001, SH7780_PCILSR);	/* size 128M w/ MBAR */
+	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);	/* PCI */
+	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);	/* SHwy */
+	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR);	/* size 128M w/ MBAR */
 
-	pci_write_reg(0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(0x00000000, SH7780_PCILAR1);
-	pci_write_reg(0x00000000, SH7780_PCILSR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
 
-	pci_write_reg(0xAB000801, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
 
 	/*
 	 * Set the MBR so PCI address is one-to-one with window,
 	 * meaning all calls go straight through... use ifdef to
 	 * catch erroneous assumption.
 	 */
-	pci_write_reg(0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
+	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR);
-	pci_write_reg((SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR);
+	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+		      SH7780_PCIIOBR);
+	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
+		      SH7780_PCIIOBMR);
 
-	pci_write_reg(0xA5000C01, SH7780_PCICR);
+	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c
index 880cea1c0d89d..b8e735e01c3c7 100644
--- a/arch/sh/drivers/pci/fixups-se7780.c
+++ b/arch/sh/drivers/pci/fixups-se7780.c
@@ -15,13 +15,13 @@
 #include "pci-sh4.h"
 #include <asm/io.h>
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
 
 	/* Enable all interrupts, so we know what to fix */
-	pci_write_reg(0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(0x0000380F, SH7780_PCIAINTM);
+	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
+	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
 
 	/* Set up standard PCI config registers */
 	ctrl_outw(0xFB00, PCI_REG(SH7780_PCISTATUS));
@@ -32,29 +32,31 @@ int pci_fixup_pcic(void)
 	ctrl_outw(0x1912, PCI_REG(SH7780_PCISVID));
 	ctrl_outw(0x0001, PCI_REG(SH7780_PCISID));
 
-	pci_write_reg(0x08000000, SH7780_PCIMBAR0);     /* PCI */
-	pci_write_reg(0x08000000, SH7780_PCILAR0);     /* SHwy */
-	pci_write_reg(0x07F00001, SH7780_PCILSR);      /* size 128M w/ MBAR */
+	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);     /* PCI */
+	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);     /* SHwy */
+	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */
 
-	pci_write_reg(0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(0x00000000, SH7780_PCILAR1);
-	pci_write_reg(0x00000000, SH7780_PCILSR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
+	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
 
-	pci_write_reg(0xAB000801, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
 
 	/*
 	 * Set the MBR so PCI address is one-to-one with window,
 	 * meaning all calls go straight through... use ifdef to
 	 * catch erroneous assumption.
 	 */
-	pci_write_reg(0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
+	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR);
-	pci_write_reg((SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR);
+	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+		      SH7780_PCIIOBR);
+	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
+		      SH7780_PCIIOBMR);
 
-	pci_write_reg(0xA5000C01, SH7780_PCICR);
+	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index bff09ecf34198..343c072a5a796 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -45,7 +45,7 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
 int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 86c0b6fb7375a..8bff32a22101e 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -59,5 +59,5 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index 8555238e63eb8..bf32ee8b1321b 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -64,5 +64,5 @@ static struct sh4_pci_address_map sh7780_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&sh7780_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &sh7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index d6ca74b25d5f4..e4208a697321d 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -69,6 +69,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 int __init pcibios_init_platform(void)
 {
 	__set_io_port_base(SH7751_PCI_IO_BASE);
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 4dcc64184b235..21d59d4a2150d 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -69,5 +69,5 @@ static struct sh4_pci_address_map sdk7780_pci_map = {
 int __init pcibios_init_platform(void)
 {
 	printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n");
-	return sh7780_pcic_init(&sdk7780_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &sdk7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 3145c62484d61..78a6f2bc4f12b 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -92,5 +92,5 @@ int __init pcibios_init_platform(void)
 	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
 	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
 
-	return sh7780_pcic_init(&se7780_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &se7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 710a3b0306e59..92d27f734f2e7 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -34,8 +34,8 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 	 * so we must do byte alignment by hand
 	 */
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(SH4_PCIPDR);
+	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(NULL, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -68,8 +68,8 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 	u32 data;
 
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(SH4_PCIPDR);
+	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(NULL, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -90,7 +90,7 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
 
-	pci_write_reg(data, SH4_PCIPDR);
+	pci_write_reg(NULL, data, SH4_PCIPDR);
 
 	return PCIBIOS_SUCCESSFUL;
 }
@@ -106,25 +106,25 @@ struct pci_ops sh4_pci_ops = {
  */
 static unsigned int pci_probe = PCI_PROBE_CONF1;
 
-int __init sh4_pci_check_direct(void)
+int __init sh4_pci_check_direct(struct pci_channel *chan)
 {
 	/*
 	 * Check if configuration works.
 	 */
 	if (pci_probe & PCI_PROBE_CONF1) {
-		unsigned int tmp = pci_read_reg(SH4_PCIPAR);
+		unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR);
 
-		pci_write_reg(P1SEG, SH4_PCIPAR);
+		pci_write_reg(chan, P1SEG, SH4_PCIPAR);
 
-		if (pci_read_reg(SH4_PCIPAR) == P1SEG) {
-			pci_write_reg(tmp, SH4_PCIPAR);
+		if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
+			pci_write_reg(chan, tmp, SH4_PCIPAR);
 			printk(KERN_INFO "PCI: Using configuration type 1\n");
 			request_region(PCI_REG(SH4_PCIPAR), 8, "PCI conf1");
 
 			return 0;
 		}
 
-		pci_write_reg(tmp, SH4_PCIPAR);
+		pci_write_reg(chan, tmp, SH4_PCIPAR);
 	}
 
 	pr_debug("PCI: pci_check_direct failed\n");
@@ -163,7 +163,7 @@ char * __devinit pcibios_setup(char *str)
 	return str;
 }
 
-int __attribute__((weak)) pci_fixup_pcic(void)
+int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Nothing to do. */
 	return 0;
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index 53dd893d4e549..cba80153dde98 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -66,7 +66,7 @@ static struct sh4_pci_address_map sh7751_pci_map = {
  */
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
 int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index a8f7801a34afb..69fcc5c5d5204 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -73,5 +73,5 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7751_pcic_init(&sh7751_pci_map);
+	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h
index a83dcf70c13b1..62ba35056087a 100644
--- a/arch/sh/drivers/pci/pci-sh4.h
+++ b/arch/sh/drivers/pci/pci-sh4.h
@@ -154,8 +154,8 @@
 
 /* arch/sh/kernel/drivers/pci/ops-sh4.c */
 extern struct pci_ops sh4_pci_ops;
-int sh4_pci_check_direct(void);
-int pci_fixup_pcic(void);
+int sh4_pci_check_direct(struct pci_channel *chan);
+int pci_fixup_pcic(struct pci_channel *chan);
 
 struct sh4_pci_address_space {
 	unsigned long base;
@@ -168,13 +168,16 @@ struct sh4_pci_address_map {
 	unsigned long flags;
 };
 
-static inline void pci_write_reg(unsigned long val, unsigned long reg)
+static inline void pci_write_reg(struct pci_channel *chan,
+				 unsigned long val, unsigned long reg)
 {
 	ctrl_outl(val, PCI_REG(reg));
 }
 
-static inline unsigned long pci_read_reg(unsigned long reg)
+static inline unsigned long pci_read_reg(struct pci_channel *chan,
+					 unsigned long reg)
 {
 	return ctrl_inl(PCI_REG(reg));
 }
+
 #endif /* __PCI_SH4_H */
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 3065eb184f01c..9c2c01490d626 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -40,21 +40,22 @@ static int __init sh7751_pci_init(void)
 	pr_debug("PCI: Starting intialization.\n");
 
 	/* check for SH7751/SH7751R hardware */
-	id = pci_read_reg(SH7751_PCICONF0);
+	id = pci_read_reg(NULL, SH7751_PCICONF0);
 	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
 	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
 		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
 		return -ENODEV;
 	}
 
-	if ((ret = sh4_pci_check_direct()) != 0)
+	if ((ret = sh4_pci_check_direct(NULL)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
 subsys_initcall(sh7751_pci_init);
 
-static int __init __area_sdram_check(unsigned int area)
+static int __init __area_sdram_check(struct pci_channel *chan,
+				     unsigned int area)
 {
 	u32 word;
 
@@ -65,7 +66,7 @@ static int __init __area_sdram_check(unsigned int area)
 		       area, word);
 		return 0;
 	}
-	pci_write_reg(word, SH4_PCIBCR1);
+	pci_write_reg(chan, word, SH4_PCIBCR1);
 
 	word = (u16)ctrl_inw(SH7751_BCR2);
 	/* check BCR2 for 32bit SDRAM interface*/
@@ -74,12 +75,13 @@ static int __init __area_sdram_check(unsigned int area)
 		       area, word);
 		return 0;
 	}
-	pci_write_reg(word, SH4_PCIBCR2);
+	pci_write_reg(chan, word, SH4_PCIBCR2);
 
 	return 1;
 }
 
-int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
+int __init sh7751_pcic_init(struct pci_channel *chan,
+			    struct sh4_pci_address_map *map)
 {
 	u32 reg;
 	u32 word;
@@ -90,10 +92,10 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	ctrl_outl(reg, SH7751_BCR1);
 
 	/* Turn the clocks back on (not done in reset)*/
-	pci_write_reg(0, SH4_PCICLKR);
+	pci_write_reg(chan, 0, SH4_PCICLKR);
 	/* Clear Powerdown IRQ's (not done in reset) */
 	word = SH4_PCIPINT_D3 | SH4_PCIPINT_D0;
-	pci_write_reg(word, SH4_PCIPINT);
+	pci_write_reg(chan, word, SH4_PCIPINT);
 
 	/*
 	 * This code is unused for some boards as it is done in the
@@ -103,11 +105,11 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	if (!(map->flags & SH4_PCIC_NO_RESET)) {
 		/* toggle PCI reset pin */
 		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 		/* Wait for a long time... not 1 sec. but long enough */
 		mdelay(100);
 		word = SH4_PCICR_PREFIX;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 	}
 
 	/* set the command/status bits to:
@@ -116,11 +118,11 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	 */
 	word = SH7751_PCICONF1_WCC | SH7751_PCICONF1_PER |
 	       SH7751_PCICONF1_BUM | SH7751_PCICONF1_MES;
-	pci_write_reg(word, SH7751_PCICONF1);
+	pci_write_reg(chan, word, SH7751_PCICONF1);
 
 	/* define this host as the host bridge */
 	word = PCI_BASE_CLASS_BRIDGE << 24;
-	pci_write_reg(word, SH7751_PCICONF2);
+	pci_write_reg(chan, word, SH7751_PCICONF2);
 
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
@@ -128,24 +130,24 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	 * Window1 = map->window1.size @ cached area base = SDRAM
 	 */
 	word = map->window0.size - 1;
-	pci_write_reg(word, SH4_PCILSR0);
+	pci_write_reg(chan, word, SH4_PCILSR0);
 	word = map->window1.size - 1;
-	pci_write_reg(word, SH4_PCILSR1);
+	pci_write_reg(chan, word, SH4_PCILSR1);
 	/* Set the values on window 0 PCI config registers */
 	word = P2SEGADDR(map->window0.base);
-	pci_write_reg(word, SH4_PCILAR0);
-	pci_write_reg(word, SH7751_PCICONF5);
+	pci_write_reg(chan, word, SH4_PCILAR0);
+	pci_write_reg(chan, word, SH7751_PCICONF5);
 	/* Set the values on window 1 PCI config registers */
 	word =  PHYSADDR(map->window1.base);
-	pci_write_reg(word, SH4_PCILAR1);
-	pci_write_reg(word, SH7751_PCICONF6);
+	pci_write_reg(chan, word, SH4_PCILAR1);
+	pci_write_reg(chan, word, SH7751_PCICONF6);
 
 	/* Set the local 16MB PCI memory space window to
 	 * the lowest PCI mapped address
 	 */
 	word = PCIBIOS_MIN_MEM & SH4_PCIMBR_MASK;
 	pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word);
-	pci_write_reg(word , SH4_PCIMBR);
+	pci_write_reg(chan, word , SH4_PCIMBR);
 
 	/* Map IO space into PCI IO window
 	 * The IO window is 64K-PCIBIOS_MIN_IO in size
@@ -160,19 +162,19 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 	 * correctly */
 	word = PCIBIOS_MIN_IO & SH4_PCIIOBR_MASK;
 	pr_debug("PCI: Setting upper bits of IO window to 0x%x\n", word);
-	pci_write_reg(word, SH4_PCIIOBR);
+	pci_write_reg(chan, word, SH4_PCIIOBR);
 
 	/* Set PCI WCRx, BCRx's, copy from BSC locations */
 
 	/* check BCR for SDRAM in specified area */
 	switch (map->window0.base) {
-	case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(0); break;
-	case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(1); break;
-	case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(2); break;
-	case SH7751_CS3_BASE_ADDR: word = __area_sdram_check(3); break;
-	case SH7751_CS4_BASE_ADDR: word = __area_sdram_check(4); break;
-	case SH7751_CS5_BASE_ADDR: word = __area_sdram_check(5); break;
-	case SH7751_CS6_BASE_ADDR: word = __area_sdram_check(6); break;
+	case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(chan, 0); break;
+	case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(chan, 1); break;
+	case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(chan, 2); break;
+	case SH7751_CS3_BASE_ADDR: word = __area_sdram_check(chan, 3); break;
+	case SH7751_CS4_BASE_ADDR: word = __area_sdram_check(chan, 4); break;
+	case SH7751_CS5_BASE_ADDR: word = __area_sdram_check(chan, 5); break;
+	case SH7751_CS6_BASE_ADDR: word = __area_sdram_check(chan, 6); break;
 	}
 
 	if (!word)
@@ -180,25 +182,25 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map)
 
 	/* configure the wait control registers */
 	word = ctrl_inl(SH7751_WCR1);
-	pci_write_reg(word, SH4_PCIWCR1);
+	pci_write_reg(chan, word, SH4_PCIWCR1);
 	word = ctrl_inl(SH7751_WCR2);
-	pci_write_reg(word, SH4_PCIWCR2);
+	pci_write_reg(chan, word, SH4_PCIWCR2);
 	word = ctrl_inl(SH7751_WCR3);
-	pci_write_reg(word, SH4_PCIWCR3);
+	pci_write_reg(chan, word, SH4_PCIWCR3);
 	word = ctrl_inl(SH7751_MCR);
-	pci_write_reg(word, SH4_PCIMCR);
+	pci_write_reg(chan, word, SH4_PCIMCR);
 
 	/* NOTE: I'm ignoring the PCI error IRQs for now..
 	 * TODO: add support for the internal error interrupts and
 	 * DMA interrupts...
 	 */
 
-	pci_fixup_pcic();
+	pci_fixup_pcic(chan);
 
 	/* SH7751 init done, set central function init complete */
 	/* use round robin mode to stop a device starving/overruning */
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
-	pci_write_reg(word, SH4_PCICR);
+	pci_write_reg(chan, word, SH4_PCICR);
 
 	return 1;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index 68e3cb5e6bec2..6f101e5a6c834 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -130,6 +130,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7751.c */
-int sh7751_pcic_init(struct sh4_pci_address_map *map);
+int sh7751_pcic_init(struct pci_channel *chan,
+		     struct sh4_pci_address_map *map);
 
 #endif /* _PCI_SH7751_H_ */
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index bae6a2cf047dc..56f673f66cb5b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -55,7 +55,7 @@ static int __init sh7780_pci_init(void)
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
 	/* check for SH7780/SH7780R hardware */
-	id = pci_read_reg(SH7780_PCIVID);
+	id = pci_read_reg(NULL, SH7780_PCIVID);
 	if ((id & 0xffff) == SH7780_VENDOR_ID) {
 		switch ((id >> 16) & 0xffff) {
 		case SH7763_DEVICE_ID:
@@ -82,14 +82,15 @@ static int __init sh7780_pci_init(void)
 		ctrl_outl(0x33333333, INTC_INTPRI);
 	}
 
-	if ((ret = sh4_pci_check_direct()) != 0)
+	if ((ret = sh4_pci_check_direct(NULL)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
 core_initcall(sh7780_pci_init);
 
-int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
+int __init sh7780_pcic_init(struct pci_channel *chan,
+			    struct sh4_pci_address_map *map)
 {
 	u32 word;
 
@@ -101,34 +102,34 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 	if (!(map->flags & SH4_PCIC_NO_RESET)) {
 		/* toggle PCI reset pin */
 		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 		/* Wait for a long time... not 1 sec. but long enough */
 		mdelay(100);
 		word = SH4_PCICR_PREFIX;
-		pci_write_reg(word, SH4_PCICR);
+		pci_write_reg(chan, word, SH4_PCICR);
 	}
 
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
 	 */
-	pci_write_reg(0x00000046, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00000046, SH7780_PCICMD);
 
 	/* define this host as the host bridge */
 	word = PCI_BASE_CLASS_BRIDGE << 24;
-	pci_write_reg(word, SH7780_PCIRID);
+	pci_write_reg(chan, word, SH7780_PCIRID);
 
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
-	pci_write_reg(map->window0.size - 0xfffff, SH4_PCILSR0);
-	pci_write_reg(map->window1.size - 0xfffff, SH4_PCILSR1);
+	pci_write_reg(chan, map->window0.size - 0xfffff, SH4_PCILSR0);
+	pci_write_reg(chan, map->window1.size - 0xfffff, SH4_PCILSR1);
 	/* Set the values on window 0 PCI config registers */
-	pci_write_reg(map->window0.base, SH4_PCILAR0);
-	pci_write_reg(map->window0.base, SH7780_PCIMBAR0);
+	pci_write_reg(chan, map->window0.base, SH4_PCILAR0);
+	pci_write_reg(chan, map->window0.base, SH7780_PCIMBAR0);
 	/* Set the values on window 1 PCI config registers */
-	pci_write_reg(map->window1.base, SH4_PCILAR1);
-	pci_write_reg(map->window1.base, SH7780_PCIMBAR1);
+	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
+	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
 
 	/* Map IO space into PCI IO window
 	 * The IO window is 64K-PCIBIOS_MIN_IO in size
@@ -145,12 +146,12 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 	 */
 
 	/* Apply any last-minute PCIC fixups */
-	pci_fixup_pcic();
+	pci_fixup_pcic(chan);
 
 	/* SH7780 init done, set central function init complete */
 	/* use round robin mode to stop a device starving/overruning */
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
-	pci_write_reg(word, SH4_PCICR);
+	pci_write_reg(chan, word, SH4_PCICR);
 
 	return 1;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 93adc7119b790..d34961153d583 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -109,6 +109,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7780.c */
-int sh7780_pcic_init(struct sh4_pci_address_map *map);
+int sh7780_pcic_init(struct pci_channel *chan,
+		     struct sh4_pci_address_map *map);
 
 #endif /* _PCI_SH7780_H_ */
-- 
GitLab


From d0e3db40e2a1352aa2a2f425a7d4631bddc03d51 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 11 Mar 2009 15:46:14 +0900
Subject: [PATCH 0425/2198] sh: add init member to pci_channel data

This patch adds an init callback to struct pci_channel and makes sure
it is initialized properly. Code is added to call this init function
from pcibios_init(). Return values are adjusted and a warning is is
printed if init fails.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-dreamcast/setup.c |  6 ------
 arch/sh/drivers/pci/ops-cayman.c      |  2 +-
 arch/sh/drivers/pci/ops-dreamcast.c   | 18 ++++++++----------
 arch/sh/drivers/pci/ops-landisk.c     |  2 +-
 arch/sh/drivers/pci/ops-lboxre2.c     |  2 +-
 arch/sh/drivers/pci/ops-r7780rp.c     |  2 +-
 arch/sh/drivers/pci/ops-rts7751r2d.c  |  2 +-
 arch/sh/drivers/pci/ops-sdk7780.c     |  2 +-
 arch/sh/drivers/pci/ops-se7780.c      |  2 +-
 arch/sh/drivers/pci/ops-sh03.c        |  2 +-
 arch/sh/drivers/pci/ops-snapgear.c    |  2 +-
 arch/sh/drivers/pci/ops-titan.c       |  2 +-
 arch/sh/drivers/pci/pci-sh5.c         |  6 ++++++
 arch/sh/drivers/pci/pci-sh5.h         |  1 +
 arch/sh/drivers/pci/pci-sh7751.c      | 11 +++++------
 arch/sh/drivers/pci/pci-sh7751.h      |  1 +
 arch/sh/drivers/pci/pci-sh7780.c      |  9 ++++-----
 arch/sh/drivers/pci/pci-sh7780.h      |  1 +
 arch/sh/drivers/pci/pci.c             | 23 ++++++++++++++++++-----
 arch/sh/include/asm/pci.h             |  2 ++
 20 files changed, 56 insertions(+), 42 deletions(-)

diff --git a/arch/sh/boards/mach-dreamcast/setup.c b/arch/sh/boards/mach-dreamcast/setup.c
index d1bee4884cd66..ebe99227d4e66 100644
--- a/arch/sh/boards/mach-dreamcast/setup.c
+++ b/arch/sh/boards/mach-dreamcast/setup.c
@@ -30,7 +30,6 @@
 
 extern struct irq_chip systemasic_int;
 extern void aica_time_init(void);
-extern int gapspci_init(void);
 extern int systemasic_irq_demux(int);
 
 static void __init dreamcast_setup(char **cmdline_p)
@@ -51,11 +50,6 @@ static void __init dreamcast_setup(char **cmdline_p)
 					 handle_level_irq);
 
 	board_time_init = aica_time_init;
-
-#ifdef CONFIG_PCI
-	if (gapspci_init() < 0)
-		printk(KERN_WARNING "GAPSPCI was not detected.\n");
-#endif
 }
 
 static struct sh_machine_vector mv_dreamcast __initmv = {
diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
index 38ef76207af6c..f4a5e14f7e5ac 100644
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ b/arch/sh/drivers/pci/ops-cayman.c
@@ -77,7 +77,7 @@ int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin)
 }
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh5_pci_ops, NULL, NULL, 0, 0xff },
+	{ sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index f5d2a2aa6f3f8..f62063eb64908 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -42,15 +42,6 @@ static struct resource gapspci_mem_resource = {
 	.flags	= IORESOURCE_MEM,
 };
 
-static struct pci_ops gapspci_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ &gapspci_pci_ops, &gapspci_io_resource,
-	  &gapspci_mem_resource, 0, 1 },
-	{ 0, }
-};
-EXPORT_SYMBOL(board_pci_channels);
-
 /*
  * The !gapspci_config_access case really shouldn't happen, ever, unless
  * someone implicitly messes around with the last devfn value.. otherwise we
@@ -116,7 +107,7 @@ static struct pci_ops gapspci_pci_ops = {
  * gapspci init
  */
 
-int __init gapspci_init(void)
+static int __init gapspci_init(struct pci_channel *chan)
 {
 	char idbuf[16];
 	int i;
@@ -168,3 +159,10 @@ char * __devinit pcibios_setup(char *str)
 {
 	return str;
 }
+
+struct pci_channel board_pci_channels[] = {
+	{ gapspci_init, &gapspci_pci_ops, &gapspci_io_resource,
+	  &gapspci_mem_resource, 0, 1 },
+	{ 0, }
+};
+EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index 343c072a5a796..c46911d95eca5 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -30,7 +30,7 @@ static struct resource sh7751_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{&sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff},
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff},
 	{NULL, NULL, NULL, 0, 0},
 };
 
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 8bff32a22101e..f606df2195c1f 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -39,7 +39,7 @@ static struct resource sh7751_mem_resource = {
 extern struct pci_ops sh7751_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index bf32ee8b1321b..b51b7e4078d0b 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -43,7 +43,7 @@ static struct resource sh7780_mem_resource = {
 extern struct pci_ops sh7780_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index e4208a697321d..fe5a231b86698 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -47,7 +47,7 @@ static struct resource sh7751_mem_resource = {
 extern struct pci_ops sh7751_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 21d59d4a2150d..7277cd15ae6ae 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -49,7 +49,7 @@ static struct resource sdk7780_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 78a6f2bc4f12b..76a74fb42fb01 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -58,7 +58,7 @@ static struct resource se7780_mem_resource = {
 extern struct pci_ops se7780_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-sh03.c b/arch/sh/drivers/pci/ops-sh03.c
index e1703ff5a4d23..0218135f0bb8e 100644
--- a/arch/sh/drivers/pci/ops-sh03.c
+++ b/arch/sh/drivers/pci/ops-sh03.c
@@ -39,7 +39,7 @@ static struct resource sh7751_mem_resource = {
 extern struct pci_ops sh4_pci_ops;
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index cba80153dde98..2e254c6cf6c1a 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -40,7 +40,7 @@ static struct resource sh7751_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ 0, }
 };
 
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index 69fcc5c5d5204..ffa79bdf47557 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -52,7 +52,7 @@ static struct resource sh7751_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
+	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 7a97438762c83..008a02ec0d9f8 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -27,6 +27,12 @@
 unsigned long pcicr_virt;
 unsigned long PCI_IO_AREA;
 
+int __init sh5_pci_init(struct pci_channel *chan)
+{
+	pr_debug("PCI: Starting intialization.\n");
+	return pcibios_init_platform();
+}
+
 /* Rounds a number UP to the nearest power of two. Used for
  * sizing the PCI window.
  */
diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h
index 7cff3fc04d30a..af09f384c7d27 100644
--- a/arch/sh/drivers/pci/pci-sh5.h
+++ b/arch/sh/drivers/pci/pci-sh5.h
@@ -108,6 +108,7 @@ extern unsigned long pcicr_virt;
 extern struct pci_ops sh5_pci_ops;
 
 /* arch/sh/drivers/pci/pci-sh5.c */
+int sh5_pci_init(struct pci_channel *chan);
 int sh5pci_init(unsigned long memStart, unsigned long memSize);
 
 #endif /* __PCI_SH5_H */
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 9c2c01490d626..230db8bd97443 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -32,7 +32,7 @@
  * space mapping) will be called via the platform defined function
  * pcibios_init_platform().
  */
-static int __init sh7751_pci_init(void)
+int __init sh7751_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
 	int ret;
@@ -40,19 +40,18 @@ static int __init sh7751_pci_init(void)
 	pr_debug("PCI: Starting intialization.\n");
 
 	/* check for SH7751/SH7751R hardware */
-	id = pci_read_reg(NULL, SH7751_PCICONF0);
+	id = pci_read_reg(chan, SH7751_PCICONF0);
 	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
 	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
 		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
 		return -ENODEV;
 	}
 
-	if ((ret = sh4_pci_check_direct(NULL)) != 0)
+	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
-subsys_initcall(sh7751_pci_init);
 
 static int __init __area_sdram_check(struct pci_channel *chan,
 				     unsigned int area)
@@ -178,7 +177,7 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	}
 
 	if (!word)
-		return 0;
+		return -1;
 
 	/* configure the wait control registers */
 	word = ctrl_inl(SH7751_WCR1);
@@ -202,5 +201,5 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	return 1;
+	return 0;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index 6f101e5a6c834..0ea4387df1361 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -130,6 +130,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7751.c */
+int sh7751_pci_init(struct pci_channel *chan);
 int sh7751_pcic_init(struct pci_channel *chan,
 		     struct sh4_pci_address_map *map);
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 56f673f66cb5b..4706e880b0874 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -45,7 +45,7 @@
  * space mapping) will be called via the platform defined function
  * pcibios_init_platform().
  */
-static int __init sh7780_pci_init(void)
+int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
 	int ret, match = 0;
@@ -55,7 +55,7 @@ static int __init sh7780_pci_init(void)
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
 	/* check for SH7780/SH7780R hardware */
-	id = pci_read_reg(NULL, SH7780_PCIVID);
+	id = pci_read_reg(chan, SH7780_PCIVID);
 	if ((id & 0xffff) == SH7780_VENDOR_ID) {
 		switch ((id >> 16) & 0xffff) {
 		case SH7763_DEVICE_ID:
@@ -82,12 +82,11 @@ static int __init sh7780_pci_init(void)
 		ctrl_outl(0x33333333, INTC_INTPRI);
 	}
 
-	if ((ret = sh4_pci_check_direct(NULL)) != 0)
+	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
 	return pcibios_init_platform();
 }
-core_initcall(sh7780_pci_init);
 
 int __init sh7780_pcic_init(struct pci_channel *chan,
 			    struct sh4_pci_address_map *map)
@@ -153,5 +152,5 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	return 1;
+	return 0;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index d34961153d583..2f3c92065ec34 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -109,6 +109,7 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7780.c */
+int sh7780_pci_init(struct pci_channel *chan);
 int sh7780_pcic_init(struct pci_channel *chan,
 		     struct sh4_pci_address_map *map);
 
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 0d6ac7a1db498..29ec16e69afa2 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -28,18 +28,31 @@ static int __init pcibios_init(void)
 	struct pci_bus *bus;
 	int busno;
 
+	/* init channels */
+	busno = 0;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->init(p) == 0)
+			p->enabled = 1;
+		else
+			pr_err("Unable to init pci channel %d\n", busno);
+		busno++;
+	}
+
 #ifdef CONFIG_PCI_AUTO
 	/* assign resources */
 	busno = 0;
-	for (p = board_pci_channels; p->pci_ops != NULL; p++)
-		busno = pciauto_assign_resources(busno, p) + 1;
+	for (p = board_pci_channels; p->init; p++)
+		if (p->enabled)
+			busno = pciauto_assign_resources(busno, p) + 1;
 #endif
 
 	/* scan the buses */
 	busno = 0;
-	for (p = board_pci_channels; p->pci_ops != NULL; p++) {
-		bus = pci_scan_bus(busno, p->pci_ops, p);
-		busno = bus->subordinate + 1;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->enabled) {
+			bus = pci_scan_bus(busno, p->pci_ops, p);
+			busno = bus->subordinate + 1;
+		}
 	}
 
 	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index df1d383e18a51..5c7a8f1d2d546 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -17,11 +17,13 @@
  * external) PCI controllers.
  */
 struct pci_channel {
+	int (*init)(struct pci_channel *chan);
 	struct pci_ops *pci_ops;
 	struct resource *io_resource;
 	struct resource *mem_resource;
 	int first_devfn;
 	int last_devfn;
+	int enabled;
 };
 
 /*
-- 
GitLab


From 710fa3c81151948ac4d836ef52b57cef91b0ab72 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 11 Mar 2009 15:47:23 +0900
Subject: [PATCH 0426/2198] sh: avoid using PCIBIOS_MIN_xxx

Replaces PCIBIOS_MIN_IO and PCIBIOS_MIN_MEM with direct struct
pci_channel access. This allows us to have more than one pci
channel.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/fixups-r7780rp.c |  2 +-
 arch/sh/drivers/pci/fixups-sdk7780.c |  2 +-
 arch/sh/drivers/pci/fixups-se7780.c  |  2 +-
 arch/sh/drivers/pci/pci-sh5.c        |  9 ++++++---
 arch/sh/drivers/pci/pci-sh7751.c     | 14 ++++++--------
 arch/sh/drivers/pci/pci-sh7780.c     | 10 ++++------
 6 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 5b25021bbd62a..31f8dfbfcbf50 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -37,7 +37,7 @@ int pci_fixup_pcic(struct pci_channel *chan)
 #endif
 
 	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg(chan, (PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
 		      SH7780_PCIIOBR);
 	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
 		      SH7780_PCIIOBMR);
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index 3f6754a120ed5..c2957312b30bd 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -50,7 +50,7 @@ int pci_fixup_pcic(struct pci_channel *chan)
 	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
 		      SH7780_PCIIOBR);
 	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
 		      SH7780_PCIIOBMR);
diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c
index b8e735e01c3c7..a968af7d6f70f 100644
--- a/arch/sh/drivers/pci/fixups-se7780.c
+++ b/arch/sh/drivers/pci/fixups-se7780.c
@@ -51,7 +51,7 @@ int pci_fixup_pcic(struct pci_channel *chan)
 	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
 
 	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
 		      SH7780_PCIIOBR);
 	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
 		      SH7780_PCIIOBMR);
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 008a02ec0d9f8..7750da2762841 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -206,6 +206,9 @@ int __init sh5pci_init(unsigned long memStart, unsigned long memSize)
 	return 0;
 }
 
+#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
+#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
+
 void __devinit pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_dev *dev = bus->self;
@@ -223,9 +226,9 @@ void __devinit pcibios_fixup_bus(struct pci_bus *bus)
 		/* For now, propagate host limits to the bus;
 		 * we'll adjust them later. */
 		bus->resource[0]->end = 64*1024 - 1 ;
-		bus->resource[1]->end = PCIBIOS_MIN_MEM+(256*1024*1024)-1;
-		bus->resource[0]->start = PCIBIOS_MIN_IO;
-		bus->resource[1]->start = PCIBIOS_MIN_MEM;
+		bus->resource[1]->end = xPCIBIOS_MIN_MEM+(256*1024*1024)-1;
+		bus->resource[0]->start = xPCIBIOS_MIN_IO;
+		bus->resource[1]->start = xPCIBIOS_MIN_MEM;
 
 		/* Turn off downstream PF memory address range by default */
 		bus->resource[2]->start = 1024*1024;
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 230db8bd97443..447234c69ab1b 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -144,22 +144,20 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	/* Set the local 16MB PCI memory space window to
 	 * the lowest PCI mapped address
 	 */
-	word = PCIBIOS_MIN_MEM & SH4_PCIMBR_MASK;
+	word = chan->mem_resource->start & SH4_PCIMBR_MASK;
 	pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word);
 	pci_write_reg(chan, word , SH4_PCIMBR);
 
-	/* Map IO space into PCI IO window
-	 * The IO window is 64K-PCIBIOS_MIN_IO in size
-	 * IO addresses will be translated to the
-	 * PCI IO window base address
+	/* Map IO space into PCI IO window:
+	 * IO addresses will be translated to the PCI IO window base address
 	 */
 	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
-		 PCIBIOS_MIN_IO, (64 << 10),
-		 SH7751_PCI_IO_BASE + PCIBIOS_MIN_IO);
+		 chan->io_resource->start, chan->io_resource->end,
+		 SH7751_PCI_IO_BASE + chan->io_resource->start);
 
 	/* Make sure the MSB's of IO window are set to access PCI space
 	 * correctly */
-	word = PCIBIOS_MIN_IO & SH4_PCIIOBR_MASK;
+	word = chan->io_resource->start & SH4_PCIIOBR_MASK;
 	pr_debug("PCI: Setting upper bits of IO window to 0x%x\n", word);
 	pci_write_reg(chan, word, SH4_PCIIOBR);
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 4706e880b0874..e8f3a308c0750 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -130,14 +130,12 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
 	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
 
-	/* Map IO space into PCI IO window
-	 * The IO window is 64K-PCIBIOS_MIN_IO in size
-	 * IO addresses will be translated to the
-	 * PCI IO window base address
+	/* Map IO space into PCI IO window:
+	 * IO addresses will be translated to the PCI IO window base address
 	 */
 	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
-		 PCIBIOS_MIN_IO, (64 << 10),
-		 SH7780_PCI_IO_BASE + PCIBIOS_MIN_IO);
+		 chan->io_resource->start, chan->io_resource->end,
+		 SH7780_PCI_IO_BASE + chan->io_resource->start);
 
 	/* NOTE: I'm ignoring the PCI error IRQs for now..
 	 * TODO: add support for the internal error interrupts and
-- 
GitLab


From b6706ef10f75921733d7275fd45d268f2f6254c8 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:34:55 +0900
Subject: [PATCH 0427/2198] sh: hook in struct pci_channel in sysdata

Store a struct pci_channel pointer in bus->sysdata. This makes whatever
struct pci_channel assigned to a bus available for sh4_pci_read() and
sh4_pci_write(). We also modify PCIBIOS_MIN_IO and PCIBIOS_MIN_MEM to
use bus->sysdata - this to gives us support for multiple pci channels.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-sh4.c  | 12 +++++++-----
 arch/sh/drivers/pci/pci-auto.c |  1 +
 arch/sh/include/asm/pci.h      |  6 ++++--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 92d27f734f2e7..ee62e6de71332 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -26,6 +26,7 @@ static DEFINE_SPINLOCK(sh4_pci_lock);
 static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 			   int where, int size, u32 *val)
 {
+	struct pci_channel *chan = bus->sysdata;
 	unsigned long flags;
 	u32 data;
 
@@ -34,8 +35,8 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 	 * so we must do byte alignment by hand
 	 */
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(NULL, SH4_PCIPDR);
+	pci_write_reg(chan, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(chan, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -63,13 +64,14 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn,
 static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 			 int where, int size, u32 val)
 {
+	struct pci_channel *chan = bus->sysdata;
 	unsigned long flags;
 	int shift;
 	u32 data;
 
 	spin_lock_irqsave(&sh4_pci_lock, flags);
-	pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
-	data = pci_read_reg(NULL, SH4_PCIPDR);
+	pci_write_reg(chan, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR);
+	data = pci_read_reg(chan, SH4_PCIPDR);
 	spin_unlock_irqrestore(&sh4_pci_lock, flags);
 
 	switch (size) {
@@ -90,7 +92,7 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn,
 		return PCIBIOS_FUNC_NOT_SUPPORTED;
 	}
 
-	pci_write_reg(NULL, data, SH4_PCIPDR);
+	pci_write_reg(chan, data, SH4_PCIPDR);
 
 	return PCIBIOS_SUCCESSFUL;
 }
diff --git a/arch/sh/drivers/pci/pci-auto.c b/arch/sh/drivers/pci/pci-auto.c
index cf48b12ee58cb..1d715ec405b20 100644
--- a/arch/sh/drivers/pci/pci-auto.c
+++ b/arch/sh/drivers/pci/pci-auto.c
@@ -67,6 +67,7 @@ static struct pci_dev *fake_pci_dev(struct pci_channel *hose,
 	dev.devfn = devfn;
 	bus.number = busnr;
 	bus.ops = hose->pci_ops;
+	bus.sysdata = hose;
 
 	if(busnr != top_bus)
 		/* Fake a parent bus structure. */
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5c7a8f1d2d546..386587e088306 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -31,8 +31,10 @@ struct pci_channel {
  */
 extern struct pci_channel board_pci_channels[];
 
-#define PCIBIOS_MIN_IO		board_pci_channels->io_resource->start
-#define PCIBIOS_MIN_MEM		board_pci_channels->mem_resource->start
+/* ugly as hell, but makes drivers/pci/setup-res.c compile and work */
+#define __PCI_CHAN(bus)		((struct pci_channel *)bus->sysdata)
+#define PCIBIOS_MIN_IO		__PCI_CHAN(bus)->io_resource->start
+#define PCIBIOS_MIN_MEM		__PCI_CHAN(bus)->mem_resource->start
 
 /*
  * I/O routine helpers
-- 
GitLab


From e4c6a3604e07185046e2ce4be82a201f4447d788 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:04 +0900
Subject: [PATCH 0428/2198] sh: add reg_base member to pci_channel

Store the base address of the pci host controller registers in struct
pci_channel and use the address in pci_read_reg() and pci_write_reg().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-sh4.c    | 4 ++--
 arch/sh/drivers/pci/pci-sh4.h    | 4 ++--
 arch/sh/drivers/pci/pci-sh7751.c | 2 ++
 arch/sh/drivers/pci/pci-sh7751.h | 1 -
 arch/sh/drivers/pci/pci-sh7780.c | 2 ++
 arch/sh/drivers/pci/pci-sh7780.h | 1 -
 arch/sh/include/asm/pci.h        | 1 +
 7 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index ee62e6de71332..540683d07c770 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -121,8 +121,8 @@ int __init sh4_pci_check_direct(struct pci_channel *chan)
 		if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
 			pci_write_reg(chan, tmp, SH4_PCIPAR);
 			printk(KERN_INFO "PCI: Using configuration type 1\n");
-			request_region(PCI_REG(SH4_PCIPAR), 8, "PCI conf1");
-
+			request_region(chan->reg_base + SH4_PCIPAR, 8,
+				       "PCI conf1");
 			return 0;
 		}
 
diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h
index 62ba35056087a..90abfe3d39bb5 100644
--- a/arch/sh/drivers/pci/pci-sh4.h
+++ b/arch/sh/drivers/pci/pci-sh4.h
@@ -171,13 +171,13 @@ struct sh4_pci_address_map {
 static inline void pci_write_reg(struct pci_channel *chan,
 				 unsigned long val, unsigned long reg)
 {
-	ctrl_outl(val, PCI_REG(reg));
+	ctrl_outl(val, chan->reg_base + reg);
 }
 
 static inline unsigned long pci_read_reg(struct pci_channel *chan,
 					 unsigned long reg)
 {
-	return ctrl_inl(PCI_REG(reg));
+	return ctrl_inl(chan->reg_base + reg);
 }
 
 #endif /* __PCI_SH4_H */
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 447234c69ab1b..201266b020f32 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -39,6 +39,8 @@ int __init sh7751_pci_init(struct pci_channel *chan)
 
 	pr_debug("PCI: Starting intialization.\n");
 
+	chan->reg_base = 0xfe200000;
+
 	/* check for SH7751/SH7751R hardware */
 	id = pci_read_reg(chan, SH7751_PCICONF0);
 	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index 0ea4387df1361..c390dd2f5e1a3 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -26,7 +26,6 @@
 #define SH7751_PCI_IO_SIZE           0x40000     /* Size of IO window */
 
 #define SH7751_PCIREG_BASE           0xFE200000  /* PCI regs base address */
-#define PCI_REG(n)                  (SH7751_PCIREG_BASE+ n)
 
 #define SH7751_PCICONF0            0x0           /* PCI Config Reg 0 */
   #define SH7751_PCICONF0_DEVID      0xFFFF0000  /* Device ID */
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index e8f3a308c0750..9d6483a26cf99 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -52,6 +52,8 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 
 	pr_debug("PCI: Starting intialization.\n");
 
+	chan->reg_base = 0xfe040000;
+
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
 	/* check for SH7780/SH7780R hardware */
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 2f3c92065ec34..fffcf1dcfedfa 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -35,7 +35,6 @@
 #define SH7780_PCI_IO_SIZE	0x00400000	/* Size of IO window */
 
 #define SH7780_PCIREG_BASE	0xFE040000	/* PCI regs base address */
-#define PCI_REG(n)		(SH7780_PCIREG_BASE+n)
 
 /* SH7780 PCI Config Registers */
 #define SH7780_PCIVID		0x000		/* Vendor ID */
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 386587e088306..8e9e0edcf36c0 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -24,6 +24,7 @@ struct pci_channel {
 	int first_devfn;
 	int last_devfn;
 	int enabled;
+	unsigned long reg_base;
 };
 
 /*
-- 
GitLab


From ef53fdeb7e0cb139aff33665635b886700137abb Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:14 +0900
Subject: [PATCH 0429/2198] sh: add io_base member to pci_channel

Store the io window base address in struct pci_channel and use that one
instead of SH77xx_PCI_IO_BASE.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7751.c | 5 +++--
 arch/sh/drivers/pci/pci-sh7780.c | 5 +++--
 arch/sh/include/asm/pci.h        | 1 +
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 201266b020f32..2a6c7aab2d754 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -40,6 +40,7 @@ int __init sh7751_pci_init(struct pci_channel *chan)
 	pr_debug("PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe200000;
+	chan->io_base = 0xfe240000;
 
 	/* check for SH7751/SH7751R hardware */
 	id = pci_read_reg(chan, SH7751_PCICONF0);
@@ -153,9 +154,9 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	/* Map IO space into PCI IO window:
 	 * IO addresses will be translated to the PCI IO window base address
 	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
+	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
 		 chan->io_resource->start, chan->io_resource->end,
-		 SH7751_PCI_IO_BASE + chan->io_resource->start);
+		 chan->io_base + chan->io_resource->start);
 
 	/* Make sure the MSB's of IO window are set to access PCI space
 	 * correctly */
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 9d6483a26cf99..87a7f3b7a38f6 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -53,6 +53,7 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 	pr_debug("PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe040000;
+	chan->io_base = 0xfe200000;
 
 	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
 
@@ -135,9 +136,9 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	/* Map IO space into PCI IO window:
 	 * IO addresses will be translated to the PCI IO window base address
 	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n",
+	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
 		 chan->io_resource->start, chan->io_resource->end,
-		 SH7780_PCI_IO_BASE + chan->io_resource->start);
+		 chan->io_base + chan->io_resource->start);
 
 	/* NOTE: I'm ignoring the PCI error IRQs for now..
 	 * TODO: add support for the internal error interrupts and
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 8e9e0edcf36c0..84d12ebef0840 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -25,6 +25,7 @@ struct pci_channel {
 	int last_devfn;
 	int enabled;
 	unsigned long reg_base;
+	unsigned long io_base;
 };
 
 /*
-- 
GitLab


From ef339f241b08a16af58897e6288ba200e0c7a8c7 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:22 +0900
Subject: [PATCH 0430/2198] sh: pci memory range checking code

This patch changes the code to use __is_pci_memory() instead of
is_pci_memaddr(). __is_pci_memory() loops through all the pci
channels on the system to match memory windows.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-titan/io.c |  2 +-
 arch/sh/drivers/pci/pci.c      |  5 ++---
 arch/sh/include/asm/pci.h      | 23 +++++++++++++++++++----
 arch/sh/mm/ioremap_32.c        |  4 ++--
 4 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/arch/sh/boards/mach-titan/io.c b/arch/sh/boards/mach-titan/io.c
index 4badad4c6f303..053b3ed2ed8da 100644
--- a/arch/sh/boards/mach-titan/io.c
+++ b/arch/sh/boards/mach-titan/io.c
@@ -117,7 +117,7 @@ void titan_outsl(unsigned long port, const void *src, unsigned long count)
 
 void __iomem *titan_ioport_map(unsigned long port, unsigned int size)
 {
-	if (PXSEG(port) || is_pci_memaddr(port))
+	if (PXSEG(port))
 		return (void __iomem *)port;
 	else if (is_pci_ioaddr(port))
 		return (void __iomem *)pci_ioaddr(port);
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 29ec16e69afa2..b9aca5478048e 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -167,9 +167,8 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
 	/*
 	 * Presently the IORESOURCE_MEM case is a bit special, most
 	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired
-	 * (typically at 0xfd000000, but is_pci_memaddr() will know
-	 * best). With the IORESOURCE_MEM case more care has to be taken
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
 	 * to inhibit page table mapping for legacy cores, but this is
 	 * punted off to __ioremap().
 	 *					-- PFM.
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 84d12ebef0840..ccf5c5ff62ff9 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -61,12 +61,8 @@ extern unsigned long PCI_IO_AREA;
 #define is_pci_ioaddr(port)		\
 	(((port) >= PCIBIOS_MIN_IO) &&	\
 	 ((port) < (PCIBIOS_MIN_IO + PCI_IO_SIZE)))
-#define is_pci_memaddr(port)		\
-	(((port) >= PCIBIOS_MIN_MEM) &&	\
-	 ((port) < (PCIBIOS_MIN_MEM + PCI_MEM_SIZE)))
 #else
 #define is_pci_ioaddr(port)	(0)
-#define is_pci_memaddr(port)	(0)
 #endif
 
 struct pci_dev;
@@ -127,6 +123,25 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+
+static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
+{
+	struct pci_channel *p;
+	struct resource *res;
+
+	for (p = board_pci_channels; p->init; p++) {
+		res = p->mem_resource;
+		if (p->enabled && (phys_addr >= res->start) &&
+		    (phys_addr + size) <= (res->end + 1))
+			return 1;
+	}
+	return 0;
+}
+#else
+static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
+{
+	return 0;
+}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c
index 60cc486d2c2cc..7e04cc8f3b9b6 100644
--- a/arch/sh/mm/ioremap_32.c
+++ b/arch/sh/mm/ioremap_32.c
@@ -56,7 +56,7 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 	 * P1/P2 space, ioremap() will already do the right thing,
 	 * and we'll never get this far.
 	 */
-	if (is_pci_memaddr(phys_addr) && is_pci_memaddr(last_addr))
+	if (__is_pci_memory(phys_addr, size))
 		return (void __iomem *)phys_addr;
 
 #if !defined(CONFIG_PMB_FIXED)
@@ -121,7 +121,7 @@ void __iounmap(void __iomem *addr)
 	unsigned long seg = PXSEG(vaddr);
 	struct vm_struct *p;
 
-	if (seg < P3SEG || vaddr >= P3_ADDR_MAX || is_pci_memaddr(vaddr))
+	if (seg < P3SEG || vaddr >= P3_ADDR_MAX || __is_pci_memory(vaddr, 0))
 		return;
 
 #ifdef CONFIG_PMB
-- 
GitLab


From 8ce0143b11cdc519b8e1fd94a262b654ef0bc3ab Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:31 +0900
Subject: [PATCH 0431/2198] sh: pci io port base address code

Adds a __get_pci_io_base() function which is used to match a port range
against struct pci_channel. This allows us to detect if a port range is
assigned to pci or happens to be legacy port io. While at it, remove unused
cpu-specific cruft.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-rts7751r2d.c |  1 -
 arch/sh/include/asm/pci.h            | 47 ++++++++++++----------------
 arch/sh/kernel/io.c                  |  5 +++
 3 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index fe5a231b86698..58ed1d58cffb1 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -68,7 +68,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	__set_io_port_base(SH7751_PCI_IO_BASE);
 	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
 }
 
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index ccf5c5ff62ff9..bb2c2fcddc9ea 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -38,33 +38,6 @@ extern struct pci_channel board_pci_channels[];
 #define PCIBIOS_MIN_IO		__PCI_CHAN(bus)->io_resource->start
 #define PCIBIOS_MIN_MEM		__PCI_CHAN(bus)->mem_resource->start
 
-/*
- * I/O routine helpers
- */
-#if defined(CONFIG_CPU_SUBTYPE_SH7780) || defined(CONFIG_CPU_SUBTYPE_SH7785)
-#define PCI_IO_AREA		0xFE400000
-#define PCI_IO_SIZE		0x00400000
-#elif defined(CONFIG_CPU_SH5)
-extern unsigned long PCI_IO_AREA;
-#define PCI_IO_SIZE		0x00010000
-#else
-#define PCI_IO_AREA		0xFE240000
-#define PCI_IO_SIZE		0x00040000
-#endif
-
-#define PCI_MEM_SIZE		0x01000000
-
-#define SH4_PCIIOBR_MASK	0xFFFC0000
-#define pci_ioaddr(addr)	(PCI_IO_AREA + (addr & ~SH4_PCIIOBR_MASK))
-
-#if defined(CONFIG_PCI)
-#define is_pci_ioaddr(port)		\
-	(((port) >= PCIBIOS_MIN_IO) &&	\
-	 ((port) < (PCIBIOS_MIN_IO + PCI_IO_SIZE)))
-#else
-#define is_pci_ioaddr(port)	(0)
-#endif
-
 struct pci_dev;
 
 extern void pcibios_set_master(struct pci_dev *dev);
@@ -137,11 +110,31 @@ static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 	}
 	return 0;
 }
+
+static inline void __iomem *__get_pci_io_base(unsigned long port,
+					      unsigned long size)
+{
+	struct pci_channel *p;
+	struct resource *res;
+
+	for (p = board_pci_channels; p->init; p++) {
+		res = p->io_resource;
+		if (p->enabled && (port >= res->start) &&
+		    (port + size) <= (res->end + 1))
+			return (void __iomem *)(p->io_base + port);
+	}
+	return NULL;
+}
 #else
 static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 {
 	return 0;
 }
+static inline void __iomem *__get_pci_io_base(unsigned long port,
+					      unsigned long size)
+{
+	return NULL;
+}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c
index 29cf4588fc050..59fb020718a34 100644
--- a/arch/sh/kernel/io.c
+++ b/arch/sh/kernel/io.c
@@ -12,6 +12,7 @@
  * for more details.
  */
 #include <linux/module.h>
+#include <linux/pci.h>
 #include <asm/machvec.h>
 #include <asm/io.h>
 
@@ -69,6 +70,10 @@ void __iomem *ioport_map(unsigned long port, unsigned int nr)
 	if (ret)
 		return ret;
 
+	ret = __get_pci_io_base(port, nr);
+	if (ret)
+		return ret;
+
 	return __ioport_map(port, nr);
 }
 EXPORT_SYMBOL(ioport_map);
-- 
GitLab


From aa5d3ff99cc1f3dfe262ab9dd9179131fcfe39fd Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Tue, 19 Feb 2008 21:35:40 +0900
Subject: [PATCH 0432/2198] sh: export board_pci_channels in one place

Instead of sometimes exporting board_pci_channels[] in the board specific
code just export it in one place.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-cayman.c     | 1 -
 arch/sh/drivers/pci/ops-lboxre2.c    | 2 --
 arch/sh/drivers/pci/ops-r7780rp.c    | 1 -
 arch/sh/drivers/pci/ops-rts7751r2d.c | 1 -
 arch/sh/drivers/pci/ops-sdk7780.c    | 1 -
 arch/sh/drivers/pci/ops-se7780.c     | 1 -
 arch/sh/drivers/pci/ops-titan.c      | 1 -
 arch/sh/drivers/pci/pci.c            | 2 ++
 8 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
index f4a5e14f7e5ac..cbaaf0234b72c 100644
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ b/arch/sh/drivers/pci/ops-cayman.c
@@ -80,7 +80,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 int __init pcibios_init_platform(void)
 {
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index f606df2195c1f..781496bfb1f91 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -43,8 +43,6 @@ struct pci_channel board_pci_channels[] = {
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
-EXPORT_SYMBOL(board_pci_channels);
-
 static struct sh4_pci_address_map sh7751_pci_map = {
 	.window0	= {
 		.base	= SH7751_CS3_BASE_ADDR,
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index b51b7e4078d0b..c58f1cff9fbac 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -46,7 +46,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7780_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index 58ed1d58cffb1..d374cd37f455a 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -50,7 +50,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7751_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 7277cd15ae6ae..b34fbc54a7c67 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -52,7 +52,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sdk7780_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 76a74fb42fb01..47302077a0c81 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -61,7 +61,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map se7780_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index ffa79bdf47557..31ed03716a2c8 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -55,7 +55,6 @@ struct pci_channel board_pci_channels[] = {
 	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7751_pci_map = {
 	.window0	= {
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index b9aca5478048e..8aaacc99b7103 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -187,3 +187,5 @@ void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
 	iounmap(addr);
 }
 EXPORT_SYMBOL(pci_iounmap);
+
+EXPORT_SYMBOL(board_pci_channels);
-- 
GitLab


From 10591578c84825a320734e59272f161385edcc41 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 11 Mar 2009 15:58:04 +0900
Subject: [PATCH 0433/2198] sh: drop duplicate symbol export on dreamcast and
 sh7785lcr.

With board_pci_channels now being exported in a single place, update the
boards that duplicated the export.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-dreamcast.c | 1 -
 arch/sh/drivers/pci/ops-sh7785lcr.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index f62063eb64908..5bc81d5b660a3 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -165,4 +165,3 @@ struct pci_channel board_pci_channels[] = {
 	  &gapspci_mem_resource, 0, 1 },
 	{ 0, }
 };
-EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index fb0869f0bef8b..2a9f9a599d663 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -44,7 +44,6 @@ struct pci_channel board_pci_channels[] = {
 	{ &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
-EXPORT_SYMBOL(board_pci_channels);
 
 static struct sh4_pci_address_map sh7785_pci_map = {
 	.window0	= {
-- 
GitLab


From 3aabce8d3d2f9af2c08c2f590ac9acb272ca8c95 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 11 Mar 2009 16:12:39 +0900
Subject: [PATCH 0434/2198] sh: sh7785lcr: Update for recent PCI changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/fixups-sh7785lcr.c | 33 +++++++++++++-------------
 arch/sh/drivers/pci/ops-sh7785lcr.c    |  4 ++--
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/sh/drivers/pci/fixups-sh7785lcr.c b/arch/sh/drivers/pci/fixups-sh7785lcr.c
index 4949e601387a3..9e7dc79037e76 100644
--- a/arch/sh/drivers/pci/fixups-sh7785lcr.c
+++ b/arch/sh/drivers/pci/fixups-sh7785lcr.c
@@ -15,32 +15,33 @@
 #include <linux/pci.h>
 #include "pci-sh4.h"
 
-int pci_fixup_pcic(void)
+int pci_fixup_pcic(struct pci_channel *chan)
 {
-	pci_write_reg(0x000043ff, SH4_PCIINTM);
-	pci_write_reg(0x0000380f, SH4_PCIAINTM);
+	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
 
-	pci_write_reg(0xfbb00047, SH7780_PCICMD);
-	pci_write_reg(0x00000000, SH7780_PCIIBAR);
+	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
+	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
 
-	pci_write_reg(0x00011912, SH7780_PCISVID);
-	pci_write_reg(0x08000000, SH7780_PCICSCR0);
-	pci_write_reg(0x0000001b, SH7780_PCICSAR0);
-	pci_write_reg(0xfd000000, SH7780_PCICSCR1);
-	pci_write_reg(0x0000000f, SH7780_PCICSAR1);
+	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
+	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
+	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
+	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
 
-	pci_write_reg(0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(0x00fc0000, SH7780_PCIMBMR0);
+	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
 
 #ifdef CONFIG_32BIT
-	pci_write_reg(0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
+	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
+	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
 #endif
 
 	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg((PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)),
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE - 1),
 		      SH7780_PCIIOBR);
-	pci_write_reg(((SH7780_PCI_IO_SIZE - 1) & (7 << 18)), SH7780_PCIIOBMR);
+	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE - 1) & (7 << 18)),
+		      SH7780_PCIIOBMR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index 2a9f9a599d663..afbb9bd47513c 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -41,7 +41,7 @@ static struct resource sh7785_mem_resource = {
 };
 
 struct pci_channel board_pci_channels[] = {
-	{ &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
+	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
@@ -61,5 +61,5 @@ static struct sh4_pci_address_map sh7785_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&sh7785_pci_map);
+	return sh7780_pcic_init(&board_pci_channels[0], &sh7785_pci_map);
 }
-- 
GitLab


From f1a9ba8f15f89f3f444985251efaf809cd16da53 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 11 Mar 2009 16:17:53 +0900
Subject: [PATCH 0435/2198] sh: pci: drop duplicate PCIC fixups for SE7780 and
 SH7785LCR.

SE7780 has the same PCIC fixup as SDK7780, and SH7785LCR the same
as R7780RP. Switch to using those, and drop the duplicate code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile           |  4 +-
 arch/sh/drivers/pci/fixups-se7780.c    | 62 --------------------------
 arch/sh/drivers/pci/fixups-sh7785lcr.c | 47 -------------------
 3 files changed, 2 insertions(+), 111 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/fixups-se7780.c
 delete mode 100644 arch/sh/drivers/pci/fixups-sh7785lcr.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 847e90894d1b4..67d710a04de1a 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -21,6 +21,6 @@ obj-$(CONFIG_SH_SDK7780)		+= ops-sdk7780.o fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
 obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-lboxre2.o
-obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= ops-se7780.o fixups-se7780.o
+obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= ops-se7780.o fixups-sdk7780.o
 obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
-obj-$(CONFIG_SH_SH7785LCR)		+= ops-sh7785lcr.o fixups-sh7785lcr.o
+obj-$(CONFIG_SH_SH7785LCR)		+= ops-sh7785lcr.o fixups-r7780rp.o
diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c
deleted file mode 100644
index a968af7d6f70f..0000000000000
--- a/arch/sh/drivers/pci/fixups-se7780.c
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * arch/sh/drivers/pci/fixups-se7780.c
- *
- * HITACHI UL Solution Engine 7780  PCI fixups
- *
- * Copyright (C) 2003  Lineo uSolutions, Inc.
- * Copyright (C) 2004 - 2006  Paul Mundt
- * Copyright (C) 2006  Nobuhiro Iwamatsu
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/pci.h>
-#include "pci-sh4.h"
-#include <asm/io.h>
-
-int pci_fixup_pcic(struct pci_channel *chan)
-{
-	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
-
-	/* Enable all interrupts, so we know what to fix */
-	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
-
-	/* Set up standard PCI config registers */
-	ctrl_outw(0xFB00, PCI_REG(SH7780_PCISTATUS));
-	ctrl_outw(0x0047, PCI_REG(SH7780_PCICMD));
-	ctrl_outb(  0x00, PCI_REG(SH7780_PCIPIF));
-	ctrl_outb(  0x00, PCI_REG(SH7780_PCISUB));
-	ctrl_outb(  0x06, PCI_REG(SH7780_PCIBCC));
-	ctrl_outw(0x1912, PCI_REG(SH7780_PCISVID));
-	ctrl_outw(0x0001, PCI_REG(SH7780_PCISID));
-
-	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);     /* PCI */
-	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);     /* SHwy */
-	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */
-
-	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
-
-	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
-
-	/*
-	 * Set the MBR so PCI address is one-to-one with window,
-	 * meaning all calls go straight through... use ifdef to
-	 * catch erroneous assumption.
-	 */
-	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);    /* 16M */
-
-	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
-		      SH7780_PCIIOBMR);
-
-	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
-
-	return 0;
-}
diff --git a/arch/sh/drivers/pci/fixups-sh7785lcr.c b/arch/sh/drivers/pci/fixups-sh7785lcr.c
deleted file mode 100644
index 9e7dc79037e76..0000000000000
--- a/arch/sh/drivers/pci/fixups-sh7785lcr.c
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * arch/sh/drivers/pci/fixups-sh7785lcr.c
- *
- * R0P7785LC0011RL PCI fixups
- * Copyright (C) 2008  Yoshihiro Shimoda
- *
- * Based on arch/sh/drivers/pci/fixups-r7780rp.c
- * Copyright (C) 2003  Lineo uSolutions, Inc.
- * Copyright (C) 2004 - 2006  Paul Mundt
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-int pci_fixup_pcic(struct pci_channel *chan)
-{
-	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
-	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
-
-	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
-	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
-
-	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
-	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
-	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
-	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
-	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
-
-	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
-
-#ifdef CONFIG_32BIT
-	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
-#endif
-
-	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE - 1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE - 1) & (7 << 18)),
-		      SH7780_PCIIOBMR);
-
-	return 0;
-}
-- 
GitLab


From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001
From: Shawn Du <duyuyang@gmail.com>
Date: Tue, 14 Apr 2009 13:58:56 +0800
Subject: [PATCH 0436/2198] blktrace: support per-partition tracing

Though one can specify '-d /dev/sda1' when using blktrace, it still
traces the whole sda.

To support per-partition tracing, when we start tracing, we initialize
bt->start_lba and bt->end_lba to the start and end sector of that
partition.

Note some actions are per device, thus we don't filter 0-sector events.

The original patch and discussion can be found here:
	http://marc.info/?l=linux-btrace&m=122949374214540&w=2

Signed-off-by: Shawn Du <duyuyang@gmail.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42620.4050701@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/compat_ioctl.c         |  2 +-
 drivers/scsi/sg.c            |  1 +
 include/linux/blktrace_api.h | 24 +++++++++++++-----------
 kernel/trace/blktrace.c      | 29 +++++++++++++++++++++--------
 4 files changed, 36 insertions(+), 20 deletions(-)

diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f87615dea46bb..f8c218cd08e19 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
 	memcpy(&buts.name, &cbuts.name, 32);
 
 	mutex_lock(&bdev->bd_mutex);
-	ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts);
+	ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
 	mutex_unlock(&bdev->bd_mutex);
 	if (ret)
 		return ret;
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 82312df9b0bfb..49c98730bb8db 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 		return blk_trace_setup(sdp->device->request_queue,
 				       sdp->disk->disk_name,
 				       MKDEV(SCSI_GENERIC_MAJOR, sdp->index),
+				       NULL,
 				       (char *)arg);
 	case BLKTRACESTART:
 		return blk_trace_startstop(sdp->device->request_queue, 1);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index d960889e92efa..267edc4017eef 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -165,8 +165,9 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern int do_blk_trace_setup(struct request_queue *q,
-	char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern int do_blk_trace_setup(struct request_queue *q, char *name,
+			      dev_t dev, struct block_device *bdev,
+			      struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 
 /**
@@ -193,6 +194,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
 				void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			   struct block_device *bdev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
@@ -200,15 +202,15 @@ extern int blk_trace_remove(struct request_queue *q);
 extern struct attribute_group blk_trace_attr_group;
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
-#define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
-#define blk_trace_shutdown(q)			do { } while (0)
-#define do_blk_trace_setup(q, name, dev, buts)	(-ENOTTY)
-#define blk_add_driver_data(q, rq, data, len)	do {} while (0)
-#define blk_trace_setup(q, name, dev, arg)	(-ENOTTY)
-#define blk_trace_startstop(q, start)		(-ENOTTY)
-#define blk_trace_remove(q)			(-ENOTTY)
-#define blk_add_trace_msg(q, fmt, ...)		do { } while (0)
-
+# define blk_trace_ioctl(bdev, cmd, arg)		(-ENOTTY)
+# define blk_trace_shutdown(q)				do { } while (0)
+# define do_blk_trace_setup(q, name, dev, bdev, buts)	(-ENOTTY)
+# define blk_add_driver_data(q, rq, data, len)		do {} while (0)
+# define blk_trace_setup(q, name, dev, bdev, arg)	(-ENOTTY)
+# define blk_trace_startstop(q, start)			(-ENOTTY)
+# define blk_trace_remove(q)				(-ENOTTY)
+# define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
+
 #endif /* __KERNEL__ */
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2b98195b338b2..e932654cf5900 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 {
 	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
 		return 1;
-	if (sector < bt->start_lba || sector > bt->end_lba)
+	if (sector && (sector < bt->start_lba || sector > bt->end_lba))
 		return 1;
 	if (bt->pid && pid != bt->pid)
 		return 1;
@@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(rw, DISCARD);
 
 	pid = tsk->pid;
-	if (unlikely(act_log_check(bt, what, sector, pid)))
+	if (act_log_check(bt, what, sector, pid))
 		return;
 	cpu = raw_smp_processor_id();
 
@@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = {
  * Setup everything required to start tracing
  */
 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-			struct blk_user_trace_setup *buts)
+		       struct block_device *bdev,
+		       struct blk_user_trace_setup *buts)
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret, i;
+	struct hd_struct *part = NULL;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 
-	bt->start_lba = buts->start_lba;
-	bt->end_lba = buts->end_lba;
-	if (!bt->end_lba)
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else
 		bt->end_lba = -1ULL;
 
+	/* overwrite with user settings */
+	if (buts->start_lba)
+		bt->start_lba = buts->start_lba;
+	if (buts->end_lba)
+		bt->end_lba = buts->end_lba;
+
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
@@ -505,6 +517,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 }
 
 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    struct block_device *bdev,
 		    char __user *arg)
 {
 	struct blk_user_trace_setup buts;
@@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (ret)
 		return -EFAULT;
 
-	ret = do_blk_trace_setup(q, name, dev, &buts);
+	ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
 	if (ret)
 		return ret;
 
@@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
 	switch (cmd) {
 	case BLKTRACESETUP:
 		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+		ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
 		break;
 	case BLKTRACESTART:
 		start = 1;
-- 
GitLab


From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 14 Apr 2009 13:59:34 +0800
Subject: [PATCH 0437/2198] blktrace: support per-partition tracing for ftrace
 plugin

The previous patch adds support to trace a single partition for
relay+ioctl blktrace, and this patch is for ftrace plugin blktrace:

  # echo 1 > /sys/block/sda/sda7/enable
  # cat start_lba
  102398373
  # cat end_lba
  102703545

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Shawn Du <duyuyang@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42646.4060608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e932654cf5900..d10989880520f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = {
 	.remove_buf_file	= blk_remove_buf_file_callback,
 };
 
+static void blk_trace_setup_lba(struct blk_trace *bt,
+				struct block_device *bdev)
+{
+	struct hd_struct *part = NULL;
+
+	if (bdev)
+		part = bdev->bd_part;
+
+	if (part) {
+		bt->start_lba = part->start_sect;
+		bt->end_lba = part->start_sect + part->nr_sects;
+	} else {
+		bt->start_lba = 0;
+		bt->end_lba = -1ULL;
+	}
+}
+
 /*
  * Setup everything required to start tracing
  */
@@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	struct blk_trace *old_bt, *bt = NULL;
 	struct dentry *dir = NULL;
 	int ret, i;
-	struct hd_struct *part = NULL;
 
 	if (!buts->buf_size || !buts->buf_nr)
 		return -EINVAL;
@@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!bt->act_mask)
 		bt->act_mask = (u16) -1;
 
-	if (bdev)
-		part = bdev->bd_part;
-
-	if (part) {
-		bt->start_lba = part->start_sect;
-		bt->end_lba = part->start_sect + part->nr_sects;
-	} else
-		bt->end_lba = -1ULL;
+	blk_trace_setup_lba(bt, bdev);
 
 	/* overwrite with user settings */
 	if (buts->start_lba)
@@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
 /*
  * Setup everything required to start tracing
  */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+				 struct block_device *bdev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
 	int ret = -ENOMEM;
@@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	if (!bt->msg_data)
 		goto free_bt;
 
-	bt->dev = dev;
+	bt->dev = bdev->bd_dev;
 	bt->act_mask = (u16)-1;
-	bt->end_lba = -1ULL;
+
+	blk_trace_setup_lba(bt, bdev);
 
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
@@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	if (attr == &dev_attr_enable) {
 		if (value)
-			ret = blk_trace_setup_queue(q, bdev->bd_dev);
+			ret = blk_trace_setup_queue(q, bdev);
 		else
 			ret = blk_trace_remove_queue(q);
 		goto out_unlock_bdev;
@@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
 	ret = 0;
 	if (q->blk_trace == NULL)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+		ret = blk_trace_setup_queue(q, bdev);
 
 	if (ret == 0) {
 		if (attr == &dev_attr_act_mask)
-- 
GitLab


From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 14 Apr 2009 14:00:05 +0800
Subject: [PATCH 0438/2198] blktrace: add trace/ to /sys/block/sda

Impact: allow ftrace-plugin blktrace to trace device-mapper devices

To trace a single partition:
  # echo 1 > /sys/block/sda/sda1/enable

To trace the whole sda instead:
  # echo 1 > /sys/block/sda/enable

Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace
can't be used to trace device-mapper devices.

Now:

  # echo 1 > /sys/block/dm-0/trace/enable
  echo: write error: No such device or address
  # mount -t ext4 /dev/dm-0 /mnt
  # echo 1 > /sys/block/dm-0/trace/enable
  # echo blk > /debug/tracing/current_tracer

Reported-by: Theodore Tso <tytso@mit.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Shawn Du <duyuyang@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49E42665.6020506@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blk-sysfs.c            | 7 ++++++-
 include/linux/blktrace_api.h | 6 ++++++
 kernel/trace/blktrace.c      | 5 +++++
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 73f36beff5cd5..8653d710b39e1 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -387,16 +387,21 @@ struct kobj_type blk_queue_ktype = {
 int blk_register_queue(struct gendisk *disk)
 {
 	int ret;
+	struct device *dev = disk_to_dev(disk);
 
 	struct request_queue *q = disk->queue;
 
 	if (WARN_ON(!q))
 		return -ENXIO;
 
+	ret = blk_trace_init_sysfs(dev);
+	if (ret)
+		return ret;
+
 	if (!q->request_fn)
 		return 0;
 
-	ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
+	ret = kobject_add(&q->kobj, kobject_get(&dev->kobj),
 			  "%s", "queue");
 	if (ret < 0)
 		return ret;
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 267edc4017eef..62763c9528546 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
+extern int blk_trace_init_sysfs(struct device *dev);
 
 extern struct attribute_group blk_trace_attr_group;
 
@@ -210,6 +211,11 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_startstop(q, start)			(-ENOTTY)
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
+static inline int blk_trace_init_sysfs(struct device *dev)
+{
+	return 0;
+}
+
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #endif /* __KERNEL__ */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d10989880520f..8e7c5da3a3e6d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1644,3 +1644,8 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	return ret ? ret : count;
 }
 
+int blk_trace_init_sysfs(struct device *dev)
+{
+	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
-- 
GitLab


From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 15 Apr 2009 11:02:56 +0800
Subject: [PATCH 0439/2198] blktrace: fix context-info when mixed-using blk
 tracer and trace events

When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is
unset, but actually context-info is printed:

    pdflush-431   [000]   821.181576:   8,0    P   N [pdflush]

And then if we enable TRACE_ITER_CONTEXT_INFO:

    # echo context-info > trace_options

We'll see context-info printed twice. What's worse, when we use blk
tracer and trace events at the same time, we'll see no context-info
for trace events at all:

    jbd2_commit_logging: dev dm-0:8 transaction 333227
    jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814
      rm-25433 [001]  9578.307485:   8,18   m   N cfq25433 slice expired t=0
      rm-25433 [001]  9578.307486:   8,18   m   N cfq25433 put_queue

This patch adds blk_tracer->set_flags(), and context-info flag is unset
only when we set the output to classic mode.

Note after this patch, one should unset context-info explicitly if he
wants to get binary output that can be parsed by blkparse:

    # echo nocontext-info > trace_options
    # echo bin > trace_options
    # echo blk > current_tracer
    # cat trace_pipe | blkparse -i -

Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E54E60.50408@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8e7c5da3a3e6d..c32062bd10b39 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 static void blk_tracer_start(struct trace_array *tr)
 {
 	blk_tracer_enabled = true;
-	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
 static int blk_tracer_init(struct trace_array *tr)
@@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	blk_tracer_enabled = false;
-	trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
@@ -1289,9 +1287,6 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 					       int flags)
 {
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	return print_one_line(iter, false);
 }
 
@@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 	return print_one_line(iter, true);
 }
 
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+	/* don't output context-info for blk_classic output */
+	if (bit == TRACE_BLK_OPT_CLASSIC) {
+		if (set)
+			trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+		else
+			trace_flags |= TRACE_ITER_CONTEXT_INFO;
+	}
+	return 0;
+}
+
 static struct tracer blk_tracer __read_mostly = {
 	.name		= "blk",
 	.init		= blk_tracer_init,
@@ -1335,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = {
 	.print_header	= blk_tracer_print_header,
 	.print_line	= blk_tracer_print_line,
 	.flags		= &blk_tracer_flags,
+	.set_flag	= blk_tracer_set_flag,
 };
 
 static struct trace_event trace_blk_event = {
-- 
GitLab


From 0232ba9ce031d0fd8f331fa8b3c00e16901f54e6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 16 Apr 2009 18:01:31 +0900
Subject: [PATCH 0440/2198] sh: pci: Kill off unused SH4_PCIC_NO_RESET code.

Nothing ended up using this anymore, so just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-landisk.c    |  2 --
 arch/sh/drivers/pci/ops-lboxre2.c    |  5 -----
 arch/sh/drivers/pci/ops-r7780rp.c    |  2 --
 arch/sh/drivers/pci/ops-rts7751r2d.c |  7 -------
 arch/sh/drivers/pci/ops-sdk7780.c    |  1 -
 arch/sh/drivers/pci/ops-se7780.c     |  1 -
 arch/sh/drivers/pci/ops-sh7785lcr.c  |  2 --
 arch/sh/drivers/pci/ops-snapgear.c   |  2 --
 arch/sh/drivers/pci/ops-titan.c      |  2 --
 arch/sh/drivers/pci/pci-sh4.h        |  4 ----
 arch/sh/drivers/pci/pci-sh7751.c     | 15 ---------------
 arch/sh/drivers/pci/pci-sh7780.c     | 15 ---------------
 12 files changed, 58 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index c46911d95eca5..178b77828aa98 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -39,8 +39,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS3_BASE_ADDR,
 		.size	= (64 << 20),	/* 64MB */
 	},
-
-	.flags = SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 781496bfb1f91..91cabd84f028e 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -48,11 +48,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-	.window1	= {
-		.base	= 0x00000000,	/* Unused */
-		.size	= 0x00000000,	/* Unused */
-	},
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index c58f1cff9fbac..8ec6d225ef9d5 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -57,8 +57,6 @@ static struct sh4_pci_address_map sh7780_pci_map = {
 		.base	= SH7780_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index d374cd37f455a..96b916c0d6c5b 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -56,13 +56,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-
-	.window1	= {
-		.base	= 0x00000000,	/* Unused */
-		.size	= 0x00000000,	/* Unused */
-	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index b34fbc54a7c67..6a0b7c067831f 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -62,7 +62,6 @@ static struct sh4_pci_address_map sdk7780_pci_map = {
 		.base	= SH7780_CS3_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 47302077a0c81..583b8e82ff994 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -67,7 +67,6 @@ static struct sh4_pci_address_map se7780_pci_map = {
 		.base	= SH7780_CS2_BASE_ADDR,
 		.size	= 0x04000000,
 	},
-	.flags  = SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index afbb9bd47513c..ab0d1decf2df8 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -55,8 +55,6 @@ static struct sh4_pci_address_map sh7785_pci_map = {
 		.size	= 0x20000000,
 #endif
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index 2e254c6cf6c1a..dd2c5df283079 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -54,8 +54,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS2_BASE_ADDR,
 		.size	= SNAPGEAR_LSR1_SIZE,
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 /*
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index 31ed03716a2c8..e45bb62bf8ce0 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -66,8 +66,6 @@ static struct sh4_pci_address_map sh7751_pci_map = {
 		.base	= SH7751_CS2_BASE_ADDR,
 		.size	= SH7751_MEM_REGION_SIZE*2,
 	},
-
-	.flags	= SH4_PCIC_NO_RESET,
 };
 
 int __init pcibios_init_platform(void)
diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h
index 90abfe3d39bb5..3d5296cde622d 100644
--- a/arch/sh/drivers/pci/pci-sh4.h
+++ b/arch/sh/drivers/pci/pci-sh4.h
@@ -149,9 +149,6 @@
   #define SH4_PCIPDTR_PB0	  0x000000001	/* Port 0 Enable */
 #define SH4_PCIPDR		0x220		/* Port IO Data Register */
 
-/* Flags */
-#define SH4_PCIC_NO_RESET	0x0001
-
 /* arch/sh/kernel/drivers/pci/ops-sh4.c */
 extern struct pci_ops sh4_pci_ops;
 int sh4_pci_check_direct(struct pci_channel *chan);
@@ -165,7 +162,6 @@ struct sh4_pci_address_space {
 struct sh4_pci_address_map {
 	struct sh4_pci_address_space window0;
 	struct sh4_pci_address_space window1;
-	unsigned long flags;
 };
 
 static inline void pci_write_reg(struct pci_channel *chan,
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 2a6c7aab2d754..af8874436d2f9 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -99,21 +99,6 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	word = SH4_PCIPINT_D3 | SH4_PCIPINT_D0;
 	pci_write_reg(chan, word, SH4_PCIPINT);
 
-	/*
-	 * This code is unused for some boards as it is done in the
-	 * bootloader and doing it here means the MAC addresses loaded
-	 * by the bootloader get lost.
-	 */
-	if (!(map->flags & SH4_PCIC_NO_RESET)) {
-		/* toggle PCI reset pin */
-		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(chan, word, SH4_PCICR);
-		/* Wait for a long time... not 1 sec. but long enough */
-		mdelay(100);
-		word = SH4_PCICR_PREFIX;
-		pci_write_reg(chan, word, SH4_PCICR);
-	}
-
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 87a7f3b7a38f6..282cabe15e369 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -96,21 +96,6 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 {
 	u32 word;
 
-	/*
-	 * This code is unused for some boards as it is done in the
-	 * bootloader and doing it here means the MAC addresses loaded
-	 * by the bootloader get lost.
-	 */
-	if (!(map->flags & SH4_PCIC_NO_RESET)) {
-		/* toggle PCI reset pin */
-		word = SH4_PCICR_PREFIX | SH4_PCICR_PRST;
-		pci_write_reg(chan, word, SH4_PCICR);
-		/* Wait for a long time... not 1 sec. but long enough */
-		mdelay(100);
-		word = SH4_PCICR_PREFIX;
-		pci_write_reg(chan, word, SH4_PCICR);
-	}
-
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
-- 
GitLab


From d1f0ae5e2e45e74cff4c3bdefb0fc77608cdfeec Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 15 Apr 2009 21:34:55 +0200
Subject: [PATCH 0441/2198] x86: standardize Kbuild rules

Introducing this Kbuild file allow us to:

    make arch/x86/

And thus building all the core part of x86.

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kbuild   | 16 ++++++++++++++++
 arch/x86/Makefile | 19 ++-----------------
 2 files changed, 18 insertions(+), 17 deletions(-)
 create mode 100644 arch/x86/Kbuild

diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644
index 0000000000000..ad8ec356fb363
--- /dev/null
+++ b/arch/x86/Kbuild
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f05d8c91d9e51..e81f0b2777618 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -7,8 +7,6 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
-core-$(CONFIG_KVM) += arch/x86/kvm/
-
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/
 
-# Sub architecture files that needs linking first
-core-y += $(fcore-y)
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
-
-core-y += arch/x86/kernel/
-core-y += arch/x86/mm/
-
-core-y += arch/x86/crypto/
-core-y += arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
 
 # drivers-y are linked after core-y
 drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
-- 
GitLab


From 84971bb401866d79c6b353cb1d8861c2b4621867 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 12:44:25 +0900
Subject: [PATCH 0442/2198] sh: pci: Kill off useless debugging printk() in
 pci-sh7780 init.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7780.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 282cabe15e369..b826c7bc62b92 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -118,18 +118,6 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
 	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
 
-	/* Map IO space into PCI IO window:
-	 * IO addresses will be translated to the PCI IO window base address
-	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
-		 chan->io_resource->start, chan->io_resource->end,
-		 chan->io_base + chan->io_resource->start);
-
-	/* NOTE: I'm ignoring the PCI error IRQs for now..
-	 * TODO: add support for the internal error interrupts and
-	 * DMA interrupts...
-	 */
-
 	/* Apply any last-minute PCIC fixups */
 	pci_fixup_pcic(chan);
 
-- 
GitLab


From b627b4ed3d056c5d00e8f3cb32d033b0ee6619a9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 13:00:18 +0900
Subject: [PATCH 0443/2198] sh: pci: Move se7780 INTC fixups out of
 pci-sh7780.c.

These fixups belong in the board INTC setup code, not in the middle of
pci-sh7780.c.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-se/7780/irq.c | 10 ++++++++--
 arch/sh/drivers/pci/pci-sh7780.c  | 24 ------------------------
 2 files changed, 8 insertions(+), 26 deletions(-)

diff --git a/arch/sh/boards/mach-se/7780/irq.c b/arch/sh/boards/mach-se/7780/irq.c
index 66ad292c9fc34..44b61a597a157 100644
--- a/arch/sh/boards/mach-se/7780/irq.c
+++ b/arch/sh/boards/mach-se/7780/irq.c
@@ -12,10 +12,13 @@
 #include <linux/init.h>
 #include <linux/irq.h>
 #include <linux/interrupt.h>
-#include <asm/irq.h>
-#include <asm/io.h>
+#include <linux/irq.h>
+#include <linux/io.h>
 #include <mach-se/mach/se7780.h>
 
+#define INTC_BASE	0xffd00000
+#define INTC_ICR1	(INTC_BASE+0x1c)
+
 /*
  * Initialize IRQ setting
  */
@@ -43,4 +46,7 @@ void __init init_se7780_IRQ(void)
 	ctrl_outw((IRQPIN_PCCPW << IRQPOS_PCCPW), FPGA_INTSEL3);
 
 	plat_irq_setup_pins(IRQ_MODE_IRQ); /* install handlers for IRQ0-7 */
+
+	/* ICR1: detect low level(for 2ndcut) */
+	ctrl_outl(0xAAAA0000, INTC_ICR1);
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index b826c7bc62b92..45fa423f2e53d 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -22,20 +22,6 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-#define INTC_BASE	0xffd00000
-#define INTC_ICR0	(INTC_BASE+0x0)
-#define INTC_ICR1	(INTC_BASE+0x1c)
-#define INTC_INTPRI	(INTC_BASE+0x10)
-#define INTC_INTREQ	(INTC_BASE+0x24)
-#define INTC_INTMSK0	(INTC_BASE+0x44)
-#define INTC_INTMSK1	(INTC_BASE+0x48)
-#define INTC_INTMSK2	(INTC_BASE+0x40080)
-#define INTC_INTMSKCLR0	(INTC_BASE+0x64)
-#define INTC_INTMSKCLR1	(INTC_BASE+0x68)
-#define INTC_INTMSKCLR2	(INTC_BASE+0x40084)
-#define INTC_INT2MSKR	(INTC_BASE+0x40038)
-#define INTC_INT2MSKCR	(INTC_BASE+0x4003c)
-
 /*
  * Initialization. Try all known PCI access methods. Note that we support
  * using both PCI BIOS and direct access: in such cases, we use I/O ports
@@ -75,16 +61,6 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 		return -ENODEV;
 	}
 
-	/* Setup the INTC */
-	if (mach_is_7780se()) {
-		/* ICR0: IRL=use separately */
-		ctrl_outl(0x00C00020, INTC_ICR0);
-		/* ICR1: detect low level(for 2ndcut) */
-		ctrl_outl(0xAAAA0000, INTC_ICR1);
-		/* INTPRI: priority=3(all) */
-		ctrl_outl(0x33333333, INTC_INTPRI);
-	}
-
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
-- 
GitLab


From 7e4ba0d77c96d328ba968ddff4a464d4d2fa7abc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 14:07:57 +0900
Subject: [PATCH 0444/2198] sh: pci: Prefer P1SEG over P1SEGADDR for
 CONFIG_CMD.

P1SEGADDR is obsolete and will be killed off completely in the future,
so transition off of it and reference P1SEG explicitly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-sh4.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 540683d07c770..2a7f7b50ff0a3 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -1,22 +1,22 @@
 /*
  * Generic SH-4 / SH-4A PCIC operations (SH7751, SH7780).
  *
- * Copyright (C) 2002 - 2006  Paul Mundt
+ * Copyright (C) 2002 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License v2. See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/pci.h>
+#include <linux/io.h>
 #include <asm/addrspace.h>
-#include <asm/io.h>
 #include "pci-sh4.h"
 
 /*
  * Direct access to PCI hardware...
  */
 #define CONFIG_CMD(bus, devfn, where) \
-	P1SEGADDR((bus->number << 16) | (devfn << 8) | (where & ~3))
+	(P1SEG | (bus->number << 16) | (devfn << 8) | (where & ~3))
 
 static DEFINE_SPINLOCK(sh4_pci_lock);
 
-- 
GitLab


From 0bbc9bc3189f24de946777af43c9033c8c4871e4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 14:09:09 +0900
Subject: [PATCH 0445/2198] sh: pci: Set class/sub-class code correctly for
 SH7780 PCIC.

The SH7780 PCI host controller implements a configuration header that
requires a fair bit of hand-holding to initialize properly. By default
it appears as a pre-2.0 host controller given the zeroed out class code,
so fix this up properly.

Some boards that happened to be using the R7780RP version of the PCIC
fixups had set this correctly, but this belongs in the standard
initialization, and is by no means board specific.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/fixups-sdk7780.c | 4 ----
 arch/sh/drivers/pci/pci-sh7780.c     | 7 +++----
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index c2957312b30bd..004efd486ee35 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -16,8 +16,6 @@
 
 int pci_fixup_pcic(struct pci_channel *chan)
 {
-	ctrl_outl(0x00000001, SH7780_PCI_VCR2);
-
 	/* Enable all interrupts, so we know what to fix */
 	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
 	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
@@ -26,8 +24,6 @@ int pci_fixup_pcic(struct pci_channel *chan)
 	pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS);
 	pci_write_reg(chan, 0x0047, SH7780_PCICMD);
 	pci_write_reg(chan, 0x00, SH7780_PCIPIF);
-	pci_write_reg(chan, 0x00, SH7780_PCISUB);
-	pci_write_reg(chan, 0x06, SH7780_PCIBCC);
 	pci_write_reg(chan, 0x1912, SH7780_PCISVID);
 	pci_write_reg(chan, 0x0001, SH7780_PCISID);
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 45fa423f2e53d..7f4f590375443 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -72,16 +72,15 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 {
 	u32 word;
 
+	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST >> 8, SH7780_PCIBCC);
+	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST & 0xff, SH7780_PCISUB);
+
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
 	 */
 	pci_write_reg(chan, 0x00000046, SH7780_PCICMD);
 
-	/* define this host as the host bridge */
-	word = PCI_BASE_CLASS_BRIDGE << 24;
-	pci_write_reg(chan, word, SH7780_PCIRID);
-
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
-- 
GitLab


From 4e7b7fdb129995640f144b7de114e109c6b46a2a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 15:05:19 +0900
Subject: [PATCH 0446/2198] sh: pci: Rework SH7780 host controller detection.

This reworks how the host controller is probed, and makes it a bit more
verbose in the event a new type of controller is detected. Additionally,
we also log the revision information.

This now uses the proper access sizes for the vendor/device registers,
rather than relying on a larger access that encapsulated both of them.
Not all devices support 32-bit read cycles for these registers.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7780.c | 42 ++++++++++++++++++--------------
 arch/sh/drivers/pci/pci-sh7780.h |  5 ++--
 2 files changed, 26 insertions(+), 21 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 7f4f590375443..63b5151e9aaad 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -34,33 +34,39 @@
 int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
-	int ret, match = 0;
+	const char *type = NULL;
+	int ret;
 
-	pr_debug("PCI: Starting intialization.\n");
+	printk(KERN_NOTICE "PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe040000;
 	chan->io_base = 0xfe200000;
 
-	ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */
-
-	/* check for SH7780/SH7780R hardware */
-	id = pci_read_reg(chan, SH7780_PCIVID);
-	if ((id & 0xffff) == SH7780_VENDOR_ID) {
-		switch ((id >> 16) & 0xffff) {
-		case SH7763_DEVICE_ID:
-		case SH7780_DEVICE_ID:
-		case SH7781_DEVICE_ID:
-		case SH7785_DEVICE_ID:
-			match = 1;
-			break;
-		}
-	}
+	/* Enable CPU access to the PCIC registers. */
+	__raw_writel(PCIECR_ENBL, PCIECR);
 
-	if (unlikely(!match)) {
-		printk(KERN_ERR "PCI: This is not an SH7780 (%x)\n", id);
+	id = __raw_readw(chan->reg_base + SH7780_PCIVID);
+	if (id != SH7780_VENDOR_ID) {
+		printk(KERN_ERR "PCI: Unknown vendor ID 0x%04x.\n", id);
 		return -ENODEV;
 	}
 
+	id = __raw_readw(chan->reg_base + SH7780_PCIDID);
+	type = (id == SH7763_DEVICE_ID)	? "SH7763" :
+	       (id == SH7780_DEVICE_ID) ? "SH7780" :
+	       (id == SH7781_DEVICE_ID) ? "SH7781" :
+	       (id == SH7785_DEVICE_ID) ? "SH7785" :
+					  NULL;
+	if (unlikely(!type)) {
+		printk(KERN_ERR "PCI: Found an unsupported Renesas host "
+		       "controller, device id 0x%04x.\n", id);
+		return -EINVAL;
+	}
+
+	printk(KERN_NOTICE "PCI: Found a Renesas %s host "
+	       "controller, revision %d.\n", type,
+	       __raw_readb(chan->reg_base + SH7780_PCIRID));
+
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index fffcf1dcfedfa..213f1d8c9ca5e 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -20,9 +20,8 @@
 #define SH7785_DEVICE_ID	0x0007
 
 /* SH7780 Control Registers */
-#define	SH7780_PCI_VCR0		0xFE000000
-#define	SH7780_PCI_VCR1		0xFE000004
-#define	SH7780_PCI_VCR2		0xFE000008
+#define	PCIECR			0xFE000008
+#define PCIECR_ENBL		0x01
 
 /* SH7780 Specific Values */
 #define SH7780_PCI_CONFIG_BASE	0xFD000000	/* Config space base addr */
-- 
GitLab


From ab78cbcf6877334fc20868b7df7887349e2e01c8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 15:08:01 +0900
Subject: [PATCH 0447/2198] sh: pci: Use the proper write size for
 class/sub-class code.

Don't use pci_write_reg() for these, as it defaults to 32-bit. Rather
than using the helper, use __raw_writeb() directly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7780.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 63b5151e9aaad..19bac2168f4ff 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -78,8 +78,10 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 {
 	u32 word;
 
-	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST >> 8, SH7780_PCIBCC);
-	pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST & 0xff, SH7780_PCISUB);
+	__raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8,
+		     chan->reg_base + SH7780_PCIBCC);
+	__raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff,
+		     chan->reg_base + SH7780_PCISUB);
 
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
-- 
GitLab


From c66c1d79a94a7a302e2dc6c93da40902423eac3e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 16:38:00 +0900
Subject: [PATCH 0448/2198] sh: pci: Set pci_cache_line_size on SH7780 via the
 PCICLS register.

The SH7780 PCIC contains a read-only cache line size register that we can
derive pci_cache_line_size from. So, make sure that the software idea of
the cache line size actually matches the host controller's idea.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7780.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 19bac2168f4ff..fa73b0d158888 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -22,15 +22,6 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-/*
- * Initialization. Try all known PCI access methods. Note that we support
- * using both PCI BIOS and direct access: in such cases, we use I/O ports
- * to access config space.
- *
- * Note that the platform specific initialization (BSC registers, and memory
- * space mapping) will be called via the platform defined function
- * pcibios_init_platform().
- */
 int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
@@ -70,19 +61,31 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
+	/*
+	 * Platform specific initialization (BSC registers, and memory space
+	 * mapping) will be called via the platform defined function
+	 * pcibios_init_platform().
+	 */
 	return pcibios_init_platform();
 }
 
+extern u8 pci_cache_line_size;
+
 int __init sh7780_pcic_init(struct pci_channel *chan,
 			    struct sh4_pci_address_map *map)
 {
 	u32 word;
 
+	/*
+	 * Set the class and sub-class codes.
+	 */
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8,
 		     chan->reg_base + SH7780_PCIBCC);
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff,
 		     chan->reg_base + SH7780_PCISUB);
 
+	pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4;
+
 	/* set the command/status bits to:
 	 * Wait Cycle Control + Parity Enable + Bus Master +
 	 * Mem space enable
-- 
GitLab


From f1dcab756687622b658154ded1657538984edcdb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 17:00:27 +0900
Subject: [PATCH 0449/2198] sh: pci: Set the I/O port base to the SH7780 I/O
 window default.

Presently the I/O port base isn't being set anywhere, which allows things
like generic_inl() to blow up. Fix this up to point at the PCI IO window.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7780.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index fa73b0d158888..207b7206fbdae 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -112,5 +112,7 @@ int __init sh7780_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
 	pci_write_reg(chan, word, SH4_PCICR);
 
+	__set_io_port_base(SH7780_PCI_IO_BASE);
+
 	return 0;
 }
-- 
GitLab


From ab1363a8929f32cc163cd3f50ca72f20d901b00c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 17:07:47 +0900
Subject: [PATCH 0450/2198] sh: pci: Consolidate PCI I/O and mem window
 definitions for SH7780.

This consolidates all of the PCI I/O and memory window definitions across
the pci-sh7780 users in pci-sh7780 itself. No functional changes, in that
every platform had exactly the same implementation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-r7780rp.c   | 23 +----------------------
 arch/sh/drivers/pci/ops-sdk7780.c   | 21 +--------------------
 arch/sh/drivers/pci/ops-se7780.c    | 23 +----------------------
 arch/sh/drivers/pci/ops-sh7785lcr.c | 21 +--------------------
 arch/sh/drivers/pci/pci-sh7780.c    | 25 ++++++++++++++++++++++---
 arch/sh/drivers/pci/pci-sh7780.h    |  4 +---
 6 files changed, 27 insertions(+), 90 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index 8ec6d225ef9d5..044525df18c5d 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -26,27 +26,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 	return irq_tab[slot];
 }
 
-static struct resource sh7780_io_resource = {
-	.name	= "SH7780_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7780_mem_resource = {
-	.name	= "SH7780_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7780_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map sh7780_pci_map = {
 	.window0	= {
 		.base	= SH7780_CS2_BASE_ADDR,
@@ -61,5 +40,5 @@ static struct sh4_pci_address_map sh7780_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&board_pci_channels[0], &sh7780_pci_map);
+	return sh7780_pcic_init(&sh7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 6a0b7c067831f..570fd71f6937a 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -34,25 +34,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
        return sdk7780_irq_tab[pin-1][slot];
 }
 
-static struct resource sdk7780_io_resource = {
-	.name	= "SH7780_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sdk7780_mem_resource = {
-	.name	= "SH7780_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map sdk7780_pci_map = {
 	.window0	= {
 		.base	= SH7780_CS2_BASE_ADDR,
@@ -67,5 +48,5 @@ static struct sh4_pci_address_map sdk7780_pci_map = {
 int __init pcibios_init_platform(void)
 {
 	printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n");
-	return sh7780_pcic_init(&board_pci_channels[0], &sdk7780_pci_map);
+	return sh7780_pcic_init(&sdk7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 583b8e82ff994..8b0941515b0f8 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -41,27 +41,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
        return se7780_irq_tab[pin-1][slot];
 }
 
-static struct resource se7780_io_resource = {
-	.name	= "SH7780_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource se7780_mem_resource = {
-	.name	= "SH7780_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops se7780_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map se7780_pci_map = {
 	.window0	= {
 		.base	= SH7780_CS2_BASE_ADDR,
@@ -90,5 +69,5 @@ int __init pcibios_init_platform(void)
 	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
 	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
 
-	return sh7780_pcic_init(&board_pci_channels[0], &se7780_pci_map);
+	return sh7780_pcic_init(&se7780_pci_map);
 }
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index ab0d1decf2df8..bac46b5d1580a 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -26,25 +26,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 	return irq_tab[slot];
 }
 
-static struct resource sh7785_io_resource = {
-	.name	= "SH7785_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7785_mem_resource = {
-	.name	= "SH7785_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
 static struct sh4_pci_address_map sh7785_pci_map = {
 	.window0	= {
 #if defined(CONFIG_32BIT)
@@ -59,5 +40,5 @@ static struct sh4_pci_address_map sh7785_pci_map = {
 
 int __init pcibios_init_platform(void)
 {
-	return sh7780_pcic_init(&board_pci_channels[0], &sh7785_pci_map);
+	return sh7780_pcic_init(&sh7785_pci_map);
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 207b7206fbdae..eb217ddf025fc 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -22,7 +22,7 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-int __init sh7780_pci_init(struct pci_channel *chan)
+static int __init sh7780_pci_init(struct pci_channel *chan)
 {
 	unsigned int id;
 	const char *type = NULL;
@@ -71,9 +71,28 @@ int __init sh7780_pci_init(struct pci_channel *chan)
 
 extern u8 pci_cache_line_size;
 
-int __init sh7780_pcic_init(struct pci_channel *chan,
-			    struct sh4_pci_address_map *map)
+static struct resource sh7785_io_resource = {
+	.name	= "SH7785_IO",
+	.start	= SH7780_PCI_IO_BASE,
+	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
+	.flags	= IORESOURCE_IO
+};
+
+static struct resource sh7785_mem_resource = {
+	.name	= "SH7785_mem",
+	.start	= SH7780_PCI_MEMORY_BASE,
+	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
+	.flags	= IORESOURCE_MEM
+};
+
+struct pci_channel board_pci_channels[] = {
+	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
+	{ NULL, NULL, NULL, 0, 0 },
+};
+
+int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 {
+	struct pci_channel *chan = &board_pci_channels[0];
 	u32 word;
 
 	/*
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 213f1d8c9ca5e..7a4f8a8dd6901 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -107,8 +107,6 @@
 struct sh4_pci_address_map;
 
 /* arch/sh/drivers/pci/pci-sh7780.c */
-int sh7780_pci_init(struct pci_channel *chan);
-int sh7780_pcic_init(struct pci_channel *chan,
-		     struct sh4_pci_address_map *map);
+int sh7780_pcic_init(struct sh4_pci_address_map *map);
 
 #endif /* _PCI_SH7780_H_ */
-- 
GitLab


From 4c7a47de897e89c25a40e228ac5319cbac7257fe Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 17:21:36 +0900
Subject: [PATCH 0451/2198] sh: pci: Kill off platform-specific multi-window
 mappings.

Commit 68b42d1b548be1840aff7122fdebeb804daf0fa3 ("sh: sh7785lcr: Map
whole PCI address space.") changed around the semantics of how various
chip-selects are made accessible to PCI. Now that there is a single
large mapping covering from CS0-CS6, there is no longer any need to
do multi-window mapping. Subsequently, all of the differing
implementations can be consolidated in to pci-sh7780.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-se/7780/irq.c   | 17 ++++++++++++++++
 arch/sh/drivers/pci/ops-r7780rp.c   | 17 ----------------
 arch/sh/drivers/pci/ops-sdk7780.c   | 17 ----------------
 arch/sh/drivers/pci/ops-se7780.c    | 31 -----------------------------
 arch/sh/drivers/pci/ops-sh7785lcr.c | 17 ----------------
 arch/sh/drivers/pci/pci-sh7780.c    | 24 ++++++++++++++--------
 arch/sh/drivers/pci/pci-sh7780.h    |  5 -----
 7 files changed, 33 insertions(+), 95 deletions(-)

diff --git a/arch/sh/boards/mach-se/7780/irq.c b/arch/sh/boards/mach-se/7780/irq.c
index 44b61a597a157..b8d43b638fcff 100644
--- a/arch/sh/boards/mach-se/7780/irq.c
+++ b/arch/sh/boards/mach-se/7780/irq.c
@@ -49,4 +49,21 @@ void __init init_se7780_IRQ(void)
 
 	/* ICR1: detect low level(for 2ndcut) */
 	ctrl_outl(0xAAAA0000, INTC_ICR1);
+
+	/*
+	 * FPGA PCISEL register initialize
+	 *
+	 *  CPU  || SLOT1 | SLOT2 | S-ATA | USB
+	 *  -------------------------------------
+	 *  INTA || INTA  | INTD  |  --   | INTB
+	 *  -------------------------------------
+	 *  INTB || INTB  | INTA  |  --   | INTC
+	 *  -------------------------------------
+	 *  INTC || INTC  | INTB  | INTA  |  --
+	 *  -------------------------------------
+	 *  INTD || INTD  | INTC  |  --   | INTA
+	 *  -------------------------------------
+	 */
+	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
+	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
 }
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
index 044525df18c5d..4ea136e4eacdd 100644
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ b/arch/sh/drivers/pci/ops-r7780rp.c
@@ -25,20 +25,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return irq_tab[slot];
 }
-
-static struct sh4_pci_address_map sh7780_pci_map = {
-	.window0	= {
-		.base	= SH7780_CS2_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-
-	.window1	= {
-		.base	= SH7780_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7780_pcic_init(&sh7780_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
index 570fd71f6937a..1d9a91bcfb7d2 100644
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ b/arch/sh/drivers/pci/ops-sdk7780.c
@@ -33,20 +33,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
        return sdk7780_irq_tab[pin-1][slot];
 }
-
-static struct sh4_pci_address_map sdk7780_pci_map = {
-	.window0	= {
-		.base	= SH7780_CS2_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-	.window1	= {
-		.base	= SH7780_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n");
-	return sh7780_pcic_init(&sdk7780_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
index 8b0941515b0f8..6c088ccfe475e 100644
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ b/arch/sh/drivers/pci/ops-se7780.c
@@ -40,34 +40,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
        return se7780_irq_tab[pin-1][slot];
 }
-
-static struct sh4_pci_address_map se7780_pci_map = {
-	.window0	= {
-		.base	= SH7780_CS2_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	printk("SH7780 PCI: Finished initialization of the PCI controller\n");
-
-	/*
-	 * FPGA PCISEL register initialize
-	 *
-	 *  CPU  || SLOT1 | SLOT2 | S-ATA | USB
-	 *  -------------------------------------
-	 *  INTA || INTA  | INTD  |  --   | INTB
-	 *  -------------------------------------
-	 *  INTB || INTB  | INTA  |  --   | INTC
-	 *  -------------------------------------
-	 *  INTC || INTC  | INTB  | INTA  |  --
-	 *  -------------------------------------
-	 *  INTD || INTD  | INTC  |  --   | INTA
-	 *  -------------------------------------
-	 */
-	ctrl_outw(0x0013, FPGA_PCI_INTSEL1);
-	ctrl_outw(0xE402, FPGA_PCI_INTSEL2);
-
-	return sh7780_pcic_init(&se7780_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
index bac46b5d1580a..0fe423e235083 100644
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ b/arch/sh/drivers/pci/ops-sh7785lcr.c
@@ -25,20 +25,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return irq_tab[slot];
 }
-
-static struct sh4_pci_address_map sh7785_pci_map = {
-	.window0	= {
-#if defined(CONFIG_32BIT)
-		.base	= SH7780_32BIT_DDR_BASE_ADDR,
-		.size	= 0x40000000,
-#else
-		.base	= SH7780_CS0_BASE_ADDR,
-		.size	= 0x20000000,
-#endif
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7780_pcic_init(&sh7785_pci_map);
-}
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index eb217ddf025fc..07c5529a273bb 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -90,7 +90,19 @@ struct pci_channel board_pci_channels[] = {
 	{ NULL, NULL, NULL, 0, 0 },
 };
 
-int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
+static struct sh4_pci_address_map sh7780_pci_map = {
+	.window0	= {
+#if defined(CONFIG_32BIT)
+		.base	= SH7780_32BIT_DDR_BASE_ADDR,
+		.size	= 0x40000000,
+#else
+		.base	= SH7780_CS0_BASE_ADDR,
+		.size	= 0x20000000,
+#endif
+	},
+};
+
+int __init pcibios_init_platform(void)
 {
 	struct pci_channel *chan = &board_pci_channels[0];
 	u32 word;
@@ -114,14 +126,10 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map)
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
-	pci_write_reg(chan, map->window0.size - 0xfffff, SH4_PCILSR0);
-	pci_write_reg(chan, map->window1.size - 0xfffff, SH4_PCILSR1);
+	pci_write_reg(chan, sh7780_pci_map.window0.size - 0xfffff, SH4_PCILSR0);
 	/* Set the values on window 0 PCI config registers */
-	pci_write_reg(chan, map->window0.base, SH4_PCILAR0);
-	pci_write_reg(chan, map->window0.base, SH7780_PCIMBAR0);
-	/* Set the values on window 1 PCI config registers */
-	pci_write_reg(chan, map->window1.base, SH4_PCILAR1);
-	pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1);
+	pci_write_reg(chan, sh7780_pci_map.window0.base, SH4_PCILAR0);
+	pci_write_reg(chan, sh7780_pci_map.window0.base, SH7780_PCIMBAR0);
 
 	/* Apply any last-minute PCIC fixups */
 	pci_fixup_pcic(chan);
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 7a4f8a8dd6901..4b65d4b26f75b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -104,9 +104,4 @@
 
 #define SH7780_32BIT_DDR_BASE_ADDR	0x40000000
 
-struct sh4_pci_address_map;
-
-/* arch/sh/drivers/pci/pci-sh7780.c */
-int sh7780_pcic_init(struct sh4_pci_address_map *map);
-
 #endif /* _PCI_SH7780_H_ */
-- 
GitLab


From a6d377b6969235a3b5a6e87bdcef387d0976b41c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 20:11:44 +0900
Subject: [PATCH 0452/2198] sh: pci: Consolidate SH7780 PCIC IRQ routing.

Now that the platform code is a bit leaner, we can start consolidating
the various IRQ routing implementations. There are effectively only 2
variants, and the others can use those directly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile         |  8 +++---
 arch/sh/drivers/pci/fixups-r7780rp.c | 10 ++++++-
 arch/sh/drivers/pci/fixups-sdk7780.c | 19 ++++++++++++-
 arch/sh/drivers/pci/ops-r7780rp.c    | 27 ------------------
 arch/sh/drivers/pci/ops-sdk7780.c    | 35 -----------------------
 arch/sh/drivers/pci/ops-se7780.c     | 42 ----------------------------
 arch/sh/drivers/pci/ops-sh7785lcr.c  | 27 ------------------
 7 files changed, 31 insertions(+), 137 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/ops-r7780rp.c
 delete mode 100644 arch/sh/drivers/pci/ops-sdk7780.c
 delete mode 100644 arch/sh/drivers/pci/ops-se7780.c
 delete mode 100644 arch/sh/drivers/pci/ops-sh7785lcr.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 67d710a04de1a..5003971476ec5 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -16,11 +16,11 @@ obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= ops-sh03.o fixups-sh03.o
-obj-$(CONFIG_SH_HIGHLANDER)		+= ops-r7780rp.o fixups-r7780rp.o
-obj-$(CONFIG_SH_SDK7780)		+= ops-sdk7780.o fixups-sdk7780.o
+obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
+obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
+obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
+obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
 obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-lboxre2.o
-obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= ops-se7780.o fixups-sdk7780.o
 obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
-obj-$(CONFIG_SH_SH7785LCR)		+= ops-sh7785lcr.o fixups-r7780rp.o
diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 31f8dfbfcbf50..864e92f69702a 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -11,9 +11,17 @@
  * for more details.
  */
 #include <linux/pci.h>
+#include <linux/io.h>
 #include "pci-sh4.h"
-#include <asm/io.h>
 
+static char irq_tab[] __initdata = {
+	65, 66, 67, 68,
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	return irq_tab[slot];
+}
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index 004efd486ee35..da60e99894b85 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -5,15 +5,32 @@
  *
  * Copyright (C) 2003  Lineo uSolutions, Inc.
  * Copyright (C) 2004 - 2006  Paul Mundt
+ * Copyright (C) 2006  Nobuhiro Iwamatsu
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/pci.h>
+#include <linux/io.h>
 #include "pci-sh4.h"
-#include <asm/io.h>
 
+/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */
+static char sdk7780_irq_tab[4][16] __initdata = {
+	/* INTA */
+	{ 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+	/* INTB */
+	{ 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+	/* INTC */
+	{ 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+	/* INTD */
+	{ 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+       return sdk7780_irq_tab[pin-1][slot];
+}
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Enable all interrupts, so we know what to fix */
diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c
deleted file mode 100644
index 4ea136e4eacdd..0000000000000
--- a/arch/sh/drivers/pci/ops-r7780rp.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Renesas SH7780 Highlander R7780RP-1 board
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <mach/highlander.h>
-#include <asm/io.h>
-#include "pci-sh4.h"
-
-static char irq_tab[] __initdata = {
-	65, 66, 67, 68,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return irq_tab[slot];
-}
diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c
deleted file mode 100644
index 1d9a91bcfb7d2..0000000000000
--- a/arch/sh/drivers/pci/ops-sdk7780.c
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-sdk7780.c
- *
- * Copyright (C) 2006  Nobuhiro Iwamatsu
- *
- * PCI initialization for the SDK7780SE03
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <mach/sdk7780.h>
-#include <asm/io.h>
-#include "pci-sh4.h"
-
-/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */
-static char sdk7780_irq_tab[4][16] __initdata = {
-	/* INTA */
-	{ 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTB */
-	{ 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTC */
-	{ 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTD */
-	{ 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-       return sdk7780_irq_tab[pin-1][slot];
-}
diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c
deleted file mode 100644
index 6c088ccfe475e..0000000000000
--- a/arch/sh/drivers/pci/ops-se7780.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-se7780.c
- *
- * Copyright (C) 2006  Nobuhiro Iwamatsu
- *
- * PCI initialization for the Hitachi UL Solution Engine 7780SE03
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <mach-se/mach/se7780.h>
-#include <asm/io.h>
-#include "pci-sh4.h"
-
-/*
- * IDSEL = AD16  PCI slot
- * IDSEL = AD17  PCI slot
- * IDSEL = AD18  Serial ATA Controller (Silicon Image SiL3512A)
- * IDSEL = AD19  USB Host Controller (NEC uPD7210100A)
- */
-
-/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */
-static char se7780_irq_tab[4][16] __initdata = {
-	/* INTA */
-	{ 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTB */
-	{ 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTC */
-	{ 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-	/* INTD */
-	{ 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 },
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-       return se7780_irq_tab[pin-1][slot];
-}
diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c
deleted file mode 100644
index 0fe423e235083..0000000000000
--- a/arch/sh/drivers/pci/ops-sh7785lcr.c
+++ /dev/null
@@ -1,27 +0,0 @@
-/*
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Renesas R0P7785LC0011RL board
- * Based on arch/sh/drivers/pci/ops-r7780rp.c
- *
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-static char irq_tab[] __initdata = {
-	65, 66, 67, 68,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return irq_tab[slot];
-}
-- 
GitLab


From 62c7ae87cb5962d3dfaa6d916a15e4faa9e07363 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 17 Apr 2009 20:37:16 +0900
Subject: [PATCH 0453/2198] sh: pci: Start unifying the SH7780 PCIC
 initialization.

This starts moving out the common initialization bits from the various
fixup paths in to the shared init path.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/fixups-r7780rp.c | 20 +----------
 arch/sh/drivers/pci/fixups-sdk7780.c | 32 +++---------------
 arch/sh/drivers/pci/pci-sh7780.c     | 50 +++++++++++++++++-----------
 arch/sh/drivers/pci/pci-sh7780.h     |  5 ---
 4 files changed, 37 insertions(+), 70 deletions(-)

diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c
index 864e92f69702a..15ca65cb667e7 100644
--- a/arch/sh/drivers/pci/fixups-r7780rp.c
+++ b/arch/sh/drivers/pci/fixups-r7780rp.c
@@ -22,33 +22,15 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return irq_tab[slot];
 }
+
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	pci_write_reg(chan, 0x000043ff, SH4_PCIINTM);
-	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
-
-	pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD);
 	pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR);
-
-	pci_write_reg(chan, 0x00011912, SH7780_PCISVID);
 	pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0);
 	pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0);
 	pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1);
 	pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1);
 
-	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
-
-#ifdef CONFIG_32BIT
-	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
-	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
-#endif
-
-	/* Set IOBR for windows containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
-		      SH7780_PCIIOBMR);
-
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c
index da60e99894b85..250b0edd73653 100644
--- a/arch/sh/drivers/pci/fixups-sdk7780.c
+++ b/arch/sh/drivers/pci/fixups-sdk7780.c
@@ -35,40 +35,18 @@ int pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Enable all interrupts, so we know what to fix */
 	pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR);
-	pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM);
 
 	/* Set up standard PCI config registers */
-	pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS);
-	pci_write_reg(chan, 0x0047, SH7780_PCICMD);
-	pci_write_reg(chan, 0x00, SH7780_PCIPIF);
-	pci_write_reg(chan, 0x1912, SH7780_PCISVID);
-	pci_write_reg(chan, 0x0001, SH7780_PCISID);
-
 	pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0);	/* PCI */
-	pci_write_reg(chan, 0x08000000, SH7780_PCILAR0);	/* SHwy */
-	pci_write_reg(chan, 0x07F00001, SH7780_PCILSR);	/* size 128M w/ MBAR */
+	pci_write_reg(chan, 0x08000000, SH4_PCILAR0);	/* SHwy */
+	pci_write_reg(chan, 0x07F00001, SH4_PCILSR0);	/* size 128M w/ MBAR */
 
 	pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILAR1);
-	pci_write_reg(chan, 0x00000000, SH7780_PCILSR1);
+	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
+	pci_write_reg(chan, 0x00000000, SH4_PCILSR1);
 
 	pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR);
-
-	/*
-	 * Set the MBR so PCI address is one-to-one with window,
-	 * meaning all calls go straight through... use ifdef to
-	 * catch erroneous assumption.
-	 */
-	pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0);
-	pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0);	/* 16M */
-
-	/* Set IOBR for window containing area specified in pci.h */
-	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
-		      SH7780_PCIIOBR);
-	pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18),
-		      SH7780_PCIIOBMR);
-
-	pci_write_reg(chan, 0xA5000C01, SH7780_PCICR);
+	pci_write_reg(chan, 0xA5000C01, SH4_PCICR);
 
 	return 0;
 }
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 07c5529a273bb..f02d9dfcf252f 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -1,19 +1,12 @@
 /*
- *	Low-Level PCI Support for the SH7780
+ * Low-Level PCI Support for the SH7780
  *
- *  Dustin McIntire (dustin@sensoria.com)
- *	Derived from arch/i386/kernel/pci-*.c which bore the message:
- *	(c) 1999--2000 Martin Mares <mj@ucw.cz>
- *
- *  Ported to the new API by Paul Mundt <lethal@linux-sh.org>
- *  With cleanup by Paul van Gool <pvangool@mimotech.com>
- *
- *  May be copied or modified under the terms of the GNU General Public
- *  License.  See linux/COPYING for more information.
+ *  Copyright (C) 2005 - 2009  Paul Mundt
  *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
  */
-#undef DEBUG
-
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
@@ -117,13 +110,8 @@ int __init pcibios_init_platform(void)
 
 	pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4;
 
-	/* set the command/status bits to:
-	 * Wait Cycle Control + Parity Enable + Bus Master +
-	 * Mem space enable
-	 */
-	pci_write_reg(chan, 0x00000046, SH7780_PCICMD);
-
-	/* Set IO and Mem windows to local address
+	/*
+	 * Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
 	 */
 	pci_write_reg(chan, sh7780_pci_map.window0.size - 0xfffff, SH4_PCILSR0);
@@ -131,9 +119,33 @@ int __init pcibios_init_platform(void)
 	pci_write_reg(chan, sh7780_pci_map.window0.base, SH4_PCILAR0);
 	pci_write_reg(chan, sh7780_pci_map.window0.base, SH7780_PCIMBAR0);
 
+	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
+
+	/* Set up standard PCI config registers */
+	__raw_writew(0xFB00, chan->reg_base + SH7780_PCISTATUS);
+	__raw_writew(0x0047, chan->reg_base + SH7780_PCICMD);
+	__raw_writew(0x1912, chan->reg_base + SH7780_PCISVID);
+	__raw_writew(0x0001, chan->reg_base + SH7780_PCISID);
+
+	__raw_writeb(0x00, chan->reg_base + SH7780_PCIPIF);
+
 	/* Apply any last-minute PCIC fixups */
 	pci_fixup_pcic(chan);
 
+	pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0);
+	pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0);
+
+#ifdef CONFIG_32BIT
+	pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2);
+	pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2);
+#endif
+
+	/* Set IOBR for windows containing area specified in pci.h */
+	pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1),
+		      SH7780_PCIIOBR);
+	pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)),
+		      SH7780_PCIIOBMR);
+
 	/* SH7780 init done, set central function init complete */
 	/* use round robin mode to stop a device starving/overruning */
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h
index 4b65d4b26f75b..4a52478c97cf6 100644
--- a/arch/sh/drivers/pci/pci-sh7780.h
+++ b/arch/sh/drivers/pci/pci-sh7780.h
@@ -65,11 +65,6 @@
 #define SH7780_PCIPMCSR_BSE	0x046
 #define SH7780_PCICDD		0x047
 
-#define SH7780_PCICR		0x100		/* PCI Control Register */
-#define SH7780_PCILSR		0x104		/* PCI Local Space Register0 */
-#define SH7780_PCILSR1		0x108		/* PCI Local Space Register1 */
-#define SH7780_PCILAR0		0x10C		/* PCI Local Address Register1 */
-#define SH7780_PCILAR1		0x110		/* PCI Local Address Register1 */
 #define SH7780_PCIIR		0x114		/* PCI Interrupt Register */
 #define SH7780_PCIIMR		0x118		/* PCI Interrupt Mask Register */
 #define SH7780_PCIAIR		0x11C		/* Error Address Register */
-- 
GitLab


From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 13:24:06 -0400
Subject: [PATCH 0454/2198] ftrace: use module notifier for function tracer

The hooks in the module code for the function tracer must be called
before any of that module code runs. The function tracer hooks
modify the module (replacing calls to mcount to nops). If the code
is executed while the change occurs, then the CPU can take a GPF.

To handle the above with a bit of paranoia, I originally implemented
the hooks as calls directly from the module code.

After examining the notifier calls, it looks as though the start up
notify is called before any of the module's code is executed. This makes
the use of the notify safe with ftrace.

Only the startup notify is required to be "safe". The shutdown simply
removes the entries from the ftrace function list, and does not modify
any code.

This change has another benefit. It removes a issue with a reverse dependency
in the mutexes of ftrace_lock and module_mutex.

[ Impact: fix lock dependency bug, cleanup ]

Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  7 ----
 include/linux/module.h |  4 ++
 kernel/module.c        | 19 ++++-----
 kernel/trace/ftrace.c  | 90 ++++++++++++++++++++++++++++++------------
 4 files changed, 75 insertions(+), 45 deletions(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 53869bef61021..97c83e1bc5899 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 
-extern void ftrace_release(void *start, unsigned long size);
-
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else
@@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled)
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(struct module *mod,
-			       unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
-static inline void
-ftrace_init_module(struct module *mod,
-		   unsigned long *start, unsigned long *end) { }
 #endif
 
 /*
diff --git a/include/linux/module.h b/include/linux/module.h
index 6155fa44168ba..a8f2c0aa4c328 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -341,6 +341,10 @@ struct module
 	struct ftrace_event_call *trace_events;
 	unsigned int num_trace_events;
 #endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	unsigned long *ftrace_callsites;
+	unsigned int num_ftrace_callsites;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
diff --git a/kernel/module.c b/kernel/module.c
index a0394706f10c4..2383e60fcf3fc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1490,9 +1490,6 @@ static void free_module(struct module *mod)
 	/* Free any allocated parameters. */
 	destroy_params(mod->kp, mod->num_kp);
 
-	/* release any pointers to mcount in this module */
-	ftrace_release(mod->module_core, mod->core_size);
-
 	/* This may be NULL, but that's OK */
 	module_free(mod, mod->module_init);
 	kfree(mod->args);
@@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int num_mcount;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-	unsigned long *mseg;
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod,
 					 sizeof(*mod->trace_events),
 					 &mod->num_trace_events);
 #endif
-
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+	/* sechdrs[0].sh_size is always zero */
+	mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+					     "__mcount_loc",
+					     sizeof(*mod->ftrace_callsites),
+					     &mod->num_ftrace_callsites);
+#endif
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod,
 			dynamic_debug_setup(debug, num_debug);
 	}
 
-	/* sechdrs[0].sh_size is always zero */
-	mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
-			    sizeof(*mseg), &num_mcount);
-	ftrace_init_module(mod, mseg, mseg + num_mcount);
-
 	err = module_finalize(hdr, sechdrs, mod);
 	if (err < 0)
 		goto cleanup;
@@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod,
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
 	kobject_put(&mod->mkobj.kobj);
-	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index a23488988581a..5b606f45b6c43 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
 	rec->flags |= FTRACE_FL_FREE;
 }
 
-void ftrace_release(void *start, unsigned long size)
-{
-	struct dyn_ftrace *rec;
-	struct ftrace_page *pg;
-	unsigned long s = (unsigned long)start;
-	unsigned long e = s + size;
-
-	if (ftrace_disabled || !start)
-		return;
-
-	mutex_lock(&ftrace_lock);
-	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e)) {
-			/*
-			 * rec->ip is changed in ftrace_free_rec()
-			 * It should not between s and e if record was freed.
-			 */
-			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-			ftrace_free_rec(rec);
-		}
-	} while_for_each_ftrace_rec();
-	mutex_unlock(&ftrace_lock);
-}
-
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
 	struct dyn_ftrace *rec;
@@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod,
 	return 0;
 }
 
-void ftrace_init_module(struct module *mod,
-			unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+void ftrace_release(void *start, void *end)
+{
+	struct dyn_ftrace *rec;
+	struct ftrace_page *pg;
+	unsigned long s = (unsigned long)start;
+	unsigned long e = (unsigned long)end;
+
+	if (ftrace_disabled || !start || start == end)
+		return;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+		if ((rec->ip >= s) && (rec->ip < e)) {
+			/*
+			 * rec->ip is changed in ftrace_free_rec()
+			 * It should not between s and e if record was freed.
+			 */
+			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+			ftrace_free_rec(rec);
+		}
+	} while_for_each_ftrace_rec();
+	mutex_unlock(&ftrace_lock);
+}
+
+static void ftrace_init_module(struct module *mod,
+			       unsigned long *start, unsigned long *end)
 {
 	if (ftrace_disabled || start == end)
 		return;
 	ftrace_convert_nops(mod, start, end);
 }
 
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	struct module *mod = data;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ftrace_init_module(mod, mod->ftrace_callsites,
+				   mod->ftrace_callsites +
+				   mod->num_ftrace_callsites);
+		break;
+	case MODULE_STATE_GOING:
+		ftrace_release(mod->ftrace_callsites,
+			       mod->ftrace_callsites +
+			       mod->num_ftrace_callsites);
+		break;
+	}
+
+	return 0;
+}
+#else
+static int ftrace_module_notify(struct notifier_block *self,
+				unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block ftrace_module_nb = {
+	.notifier_call = ftrace_module_notify,
+	.priority = 0,
+};
+
 extern unsigned long __start_mcount_loc[];
 extern unsigned long __stop_mcount_loc[];
 
@@ -2791,6 +2825,10 @@ void __init ftrace_init(void)
 				  __start_mcount_loc,
 				  __stop_mcount_loc);
 
+	ret = register_module_notifier(&ftrace_module_nb);
+	if (!ret)
+		pr_warning("Failed to register trace ftrace module notifier\n");
+
 	return;
  failed:
 	ftrace_disabled = 1;
-- 
GitLab


From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 13:36:40 -0400
Subject: [PATCH 0455/2198] tracing/events: add startup tests for events
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As events start to become popular, and the new way to add tracing
infrastructure into ftrace, it is important to catch any problems
that might happen with a mistake in the TRACE_EVENT macro.

This patch introduces a startup self test on the registered trace
events. Note, it can only do a generic test, any type of testing that
needs more involement is needed to be implemented by the tracepoint
creators.

The test goes down one by one enabling a trace point and running
some random tasks (random in the sense that I just made them up).
Those tasks are creating threads, grabbing mutexes and spinlocks
and using workqueues.

After testing each event individually, it does the same test after
enabling each system of trace points. Like sched, irq, lockdep.

Then finally it enables all tracepoints and performs the tasks again.
The output to the console on bootup will look like this when everything
works:

Running tests on trace events:
Testing event kfree_skb: OK
Testing event kmalloc: OK
Testing event kmem_cache_alloc: OK
Testing event kmalloc_node: OK
Testing event kmem_cache_alloc_node: OK
Testing event kfree: OK
Testing event kmem_cache_free: OK
Testing event irq_handler_exit: OK
Testing event irq_handler_entry: OK
Testing event softirq_entry: OK
Testing event softirq_exit: OK
Testing event lock_acquire: OK
Testing event lock_release: OK
Testing event sched_kthread_stop: OK
Testing event sched_kthread_stop_ret: OK
Testing event sched_wait_task: OK
Testing event sched_wakeup: OK
Testing event sched_wakeup_new: OK
Testing event sched_switch: OK
Testing event sched_migrate_task: OK
Testing event sched_process_free: OK
Testing event sched_process_exit: OK
Testing event sched_process_wait: OK
Testing event sched_process_fork: OK
Testing event sched_signal_send: OK
Running tests on trace event systems:
Testing event system skb: OK
Testing event system kmem: OK
Testing event system irq: OK
Testing event system lockdep: OK
Testing event system sched: OK
Running tests on all trace events:
Testing all events: OK

[ folded in:

  tracing: add #include <linux/delay.h> to fix build failure in test_work()

  This build failure occured on a few rare configs:

   kernel/trace/trace_events.c: In function ‘test_work’:
   kernel/trace/trace_events.c:975: error: implicit declaration of function ‘udelay’
   kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’

  delay.h is included in way too many other headers, hiding cases
  where new usage is added without header inclusion.

  [ Impact: build fix ]

  Signed-off-by: Ingo Molnar <mingo@elte.hu>
]

[ Impact: add event tracer self-tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 178 ++++++++++++++++++++++++++++++++++++
 1 file changed, 178 insertions(+)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6591d83e1e7ad..f81d6eec4e430 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -8,10 +8,14 @@
  *
  */
 
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/delay.h>
 
 #include "trace_output.h"
 
@@ -920,3 +924,177 @@ static __init int event_trace_init(void)
 	return 0;
 }
 fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+static __init void test_work(struct work_struct *dummy)
+{
+	spin_lock(&test_spinlock);
+	spin_lock_irq(&test_spinlock_irq);
+	udelay(1);
+	spin_unlock_irq(&test_spinlock_irq);
+	spin_unlock(&test_spinlock);
+
+	mutex_lock(&test_mutex);
+	msleep(1);
+	mutex_unlock(&test_mutex);
+}
+
+static __init int event_test_thread(void *unused)
+{
+	void *test_malloc;
+
+	test_malloc = kmalloc(1234, GFP_KERNEL);
+	if (!test_malloc)
+		pr_info("failed to kmalloc\n");
+
+	schedule_on_each_cpu(test_work);
+
+	kfree(test_malloc);
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop())
+		schedule();
+
+	return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ */
+static __init void event_test_stuff(void)
+{
+	struct task_struct *test_thread;
+
+	test_thread = kthread_run(event_test_thread, NULL, "test-events");
+	msleep(1);
+	kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ */
+static __init int event_trace_self_tests(void)
+{
+	struct ftrace_event_call *call;
+	struct event_subsystem *system;
+	char *sysname;
+	int ret;
+
+	pr_info("Running tests on trace events:\n");
+
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		/* Only test those that have a regfunc */
+		if (!call->regfunc)
+			continue;
+
+		pr_info("Testing event %s: ", call->name);
+
+		/*
+		 * If an event is already enabled, someone is using
+		 * it and the self test should not be on.
+		 */
+		if (call->enabled) {
+			pr_warning("Enabled event during self test!\n");
+			WARN_ON_ONCE(1);
+			continue;
+		}
+
+		call->enabled = 1;
+		call->regfunc();
+
+		event_test_stuff();
+
+		call->unregfunc();
+		call->enabled = 0;
+
+		pr_cont("OK\n");
+	}
+
+	/* Now test at the sub system level */
+
+	pr_info("Running tests on trace event systems:\n");
+
+	list_for_each_entry(system, &event_subsystems, list) {
+
+		/* the ftrace system is special, skip it */
+		if (strcmp(system->name, "ftrace") == 0)
+			continue;
+
+		pr_info("Testing event system %s: ", system->name);
+
+		/* ftrace_set_clr_event can modify the name passed in. */
+		sysname = kstrdup(system->name, GFP_KERNEL);
+		if (WARN_ON(!sysname)) {
+			pr_warning("Can't allocate memory, giving up!\n");
+			return 0;
+		}
+		ret = ftrace_set_clr_event(sysname, 1);
+		kfree(sysname);
+		if (WARN_ON_ONCE(ret)) {
+			pr_warning("error enabling system %s\n",
+				   system->name);
+			continue;
+		}
+
+		event_test_stuff();
+
+		sysname = kstrdup(system->name, GFP_KERNEL);
+		if (WARN_ON(!sysname)) {
+			pr_warning("Can't allocate memory, giving up!\n");
+			return 0;
+		}
+		ret = ftrace_set_clr_event(sysname, 0);
+		kfree(sysname);
+
+		if (WARN_ON_ONCE(ret))
+			pr_warning("error disabling system %s\n",
+				   system->name);
+
+		pr_cont("OK\n");
+	}
+
+	/* Test with all events enabled */
+
+	pr_info("Running tests on all trace events:\n");
+	pr_info("Testing all events: ");
+
+	sysname = kmalloc(4, GFP_KERNEL);
+	if (WARN_ON(!sysname)) {
+		pr_warning("Can't allocate memory, giving up!\n");
+		return 0;
+	}
+	memcpy(sysname, "*:*", 4);
+	ret = ftrace_set_clr_event(sysname, 1);
+	if (WARN_ON_ONCE(ret)) {
+		kfree(sysname);
+		pr_warning("error enabling all events\n");
+		return 0;
+	}
+
+	event_test_stuff();
+
+	/* reset sysname */
+	memcpy(sysname, "*:*", 4);
+	ret = ftrace_set_clr_event(sysname, 0);
+	kfree(sysname);
+
+	if (WARN_ON_ONCE(ret)) {
+		pr_warning("error disabling all events\n");
+		return 0;
+	}
+
+	pr_cont("OK\n");
+
+	return 0;
+}
+
+late_initcall(event_trace_self_tests);
+
+#endif
-- 
GitLab


From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 15 Apr 2009 16:53:47 -0400
Subject: [PATCH 0456/2198] tracing/events/ring-buffer: expose format of ring
 buffer headers to users

Currently, every thing needed to read the binary output from the
ring buffers is available, with the exception of the way the ring
buffers handles itself internally.

This patch creates two special files in the debugfs/tracing/events
directory:

 # cat /debug/tracing/events/header_page
        field: u64 timestamp;   offset:0;       size:8;
        field: local_t commit;  offset:8;       size:8;
        field: char data;       offset:16;      size:4080;

 # cat /debug/tracing/events/header_event
        type        :    2 bits
        len         :    3 bits
        time_delta  :   27 bits
        array       :   32 bits

        padding     : type == 0
        time_extend : type == 1
        data        : type == 3

This is to allow a userspace app to see if the ring buffer format changes
or not.

[ Impact: allow userspace apps to know of ringbuffer format changes ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  5 +++++
 kernel/trace/ring_buffer.c  | 44 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f0aa486d131c6..fac8f1ac6f493 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -166,6 +166,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
 int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
 			  size_t len, int cpu, int full);
 
+struct trace_seq;
+
+int ring_buffer_print_entry_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_seq *s);
+
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f935bd5ec3e83..84a6055f37c90 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -21,6 +21,28 @@
 
 #include "trace.h"
 
+/*
+ * The ring buffer header is special. We must manually up keep it.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+	int ret;
+
+	ret = trace_seq_printf(s, "\ttype        :    2 bits\n");
+	ret = trace_seq_printf(s, "\tlen         :    3 bits\n");
+	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
+	ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
+	ret = trace_seq_printf(s, "\n");
+	ret = trace_seq_printf(s, "\tpadding     : type == %d\n",
+			       RINGBUF_TYPE_PADDING);
+	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+			       RINGBUF_TYPE_TIME_EXTEND);
+	ret = trace_seq_printf(s, "\tdata        : type == %d\n",
+			       RINGBUF_TYPE_DATA);
+
+	return ret;
+}
+
 /*
  * The ring buffer is made up of a list of pages. A separate list of pages is
  * allocated for each CPU. A writer may only write to a buffer that is
@@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+	struct buffer_data_page field;
+	int ret;
+
+	ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+			       "offset:0;\tsize:%u;\n",
+			       (unsigned int)sizeof(field.time_stamp));
+
+	ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+			       "offset:%u;\tsize:%u;\n",
+			       (unsigned int)offsetof(typeof(field), commit),
+			       (unsigned int)sizeof(field.commit));
+
+	ret = trace_seq_printf(s, "\tfield: char data;\t"
+			       "offset:%u;\tsize:%u;\n",
+			       (unsigned int)offsetof(typeof(field), data),
+			       (unsigned int)BUF_PAGE_SIZE);
+
+	return ret;
+}
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f81d6eec4e430..7163a2bb021aa 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	int (*func)(struct trace_seq *s) = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	func(s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_show_header_fops = {
+	.open = tracing_open_generic,
+	.read = show_header,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -909,6 +938,15 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
+	/* ring buffer internal formats */
+	trace_create_file("header_page", 0444, d_events,
+			  ring_buffer_print_page_header,
+			  &ftrace_show_header_fops);
+
+	trace_create_file("header_event", 0444, d_events,
+			  ring_buffer_print_entry_header,
+			  &ftrace_show_header_fops);
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
-- 
GitLab


From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001
From: Avadh Patel <avadh4all@gmail.com>
Date: Fri, 10 Apr 2009 16:04:48 -0400
Subject: [PATCH 0457/2198] tracing: add saved_cmdlines file to show cached
 task comms

Export the cached task comms to userspace. This allows user apps to translate
the pids from a trace into their respective task command lines.

[ Impact: let userspace apps reading binary buffer know comm's of pids ]

Signed-off-by: Avadh Patel <avadh4all@gmail.com>
[ added error checking and use of buf pointer to index file_buf ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2d69b26b3cc9f..031c46f11bb96 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2421,6 +2421,56 @@ static const struct file_operations tracing_readme_fops = {
 	.read		= tracing_readme_read,
 };
 
+static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+				size_t cnt, loff_t *ppos)
+{
+	char *buf_comm;
+	char *file_buf;
+	char *buf;
+	int len = 0;
+	int pid;
+	int i;
+
+	file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+	if (!file_buf)
+		return -ENOMEM;
+
+	buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+	if (!buf_comm) {
+		kfree(file_buf);
+		return -ENOMEM;
+	}
+
+	buf = file_buf;
+
+	for (i = 0; i < SAVED_CMDLINES; i++) {
+		int r;
+
+		pid = map_cmdline_to_pid[i];
+		if (pid == -1 || pid == NO_CMDLINE_MAP)
+			continue;
+
+		trace_find_cmdline(pid, buf_comm);
+		r = sprintf(buf, "%d %s\n", pid, buf_comm);
+		buf += r;
+		len += r;
+	}
+
+	len = simple_read_from_buffer(ubuf, cnt, ppos,
+				      file_buf, len);
+
+	kfree(file_buf);
+	kfree(buf_comm);
+
+	return len;
+}
+
+static const struct file_operations tracing_saved_cmdlines_fops = {
+    .open       = tracing_open_generic,
+    .read       = tracing_saved_cmdlines_read,
+};
+
 static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
@@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("trace_marker", 0220, d_tracer,
 			NULL, &tracing_mark_fops);
 
+	trace_create_file("saved_cmdlines", 0444, d_tracer,
+			NULL, &tracing_saved_cmdlines_fops);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 	trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 			&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
-- 
GitLab


From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Apr 2009 12:15:44 -0400
Subject: [PATCH 0458/2198] tracing/events: perform function tracing in event
 selftests

We can find some bugs in the trace events if we stress the writes as well.
The function tracer is a good way to stress the events.

[ Impact: extend scope of event tracer self-tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090416161746.604786131@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 78 ++++++++++++++++++++++++++++++++++---
 1 file changed, 72 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7163a2bb021aa..1137f951be42e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void)
  * For every trace event defined, we will test each trace point separately,
  * and then by groups, and finally all trace points.
  */
-static __init int event_trace_self_tests(void)
+static __init void event_trace_self_tests(void)
 {
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
@@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void)
 		sysname = kstrdup(system->name, GFP_KERNEL);
 		if (WARN_ON(!sysname)) {
 			pr_warning("Can't allocate memory, giving up!\n");
-			return 0;
+			return;
 		}
 		ret = ftrace_set_clr_event(sysname, 1);
 		kfree(sysname);
@@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void)
 		sysname = kstrdup(system->name, GFP_KERNEL);
 		if (WARN_ON(!sysname)) {
 			pr_warning("Can't allocate memory, giving up!\n");
-			return 0;
+			return;
 		}
 		ret = ftrace_set_clr_event(sysname, 0);
 		kfree(sysname);
@@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void)
 	sysname = kmalloc(4, GFP_KERNEL);
 	if (WARN_ON(!sysname)) {
 		pr_warning("Can't allocate memory, giving up!\n");
-		return 0;
+		return;
 	}
 	memcpy(sysname, "*:*", 4);
 	ret = ftrace_set_clr_event(sysname, 1);
 	if (WARN_ON_ONCE(ret)) {
 		kfree(sysname);
 		pr_warning("error enabling all events\n");
-		return 0;
+		return;
 	}
 
 	event_test_stuff();
@@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void)
 
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
-		return 0;
+		return;
 	}
 
 	pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ring_buffer_event *event;
+	struct ftrace_entry *entry;
+	unsigned long flags;
+	long disabled;
+	int resched;
+	int cpu;
+	int pc;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+	if (disabled != 1)
+		goto out;
+
+	local_save_flags(flags);
+
+	event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+						  flags, pc);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->parent_ip		= parent_ip;
+
+	trace_current_buffer_unlock_commit(event, flags, pc);
+
+ out:
+	atomic_dec(&per_cpu(test_event_disable, cpu));
+	ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata  =
+{
+	.func = function_test_events_call,
+};
+
+static __init void event_trace_self_test_with_function(void)
+{
+	register_ftrace_function(&trace_ops);
+	pr_info("Running tests again, along with the function tracer\n");
+	event_trace_self_tests();
+	unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+static __init int event_trace_self_tests_init(void)
+{
+
+	event_trace_self_tests();
+
+	event_trace_self_test_with_function();
 
 	return 0;
 }
-- 
GitLab


From 76aa81118ddfbb3dc31533030cf3ec329dd067a6 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Thu, 16 Apr 2009 23:35:39 -0700
Subject: [PATCH 0459/2198] tracing: avoid warnings from zero-arg tracepoints

Tracepoints with no arguments can issue two warnings:

	"field" defined by not used
	"ret" is uninitialized in this function

Mark field as being OK to leave unused, and initialize ret.

[ Impact: fix false positive compiler warnings. ]

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: mathieu.desnoyers@polymtl.ca
LKML-Reference: <1239950139-1119-5-git-send-email-jeremy@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 60c5323bee64b..39a3351f2e7ff 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -160,8 +160,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 static int								\
 ftrace_format_##call(struct trace_seq *s)				\
 {									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
+	struct ftrace_raw_##call field __attribute__((unused));		\
+	int ret = 0;							\
 									\
 	tstruct;							\
 									\
-- 
GitLab


From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 10:34:30 +0800
Subject: [PATCH 0460/2198] tracing: fix file mode of trace and README

trace is read-write and README is read-only.

[ Impact: fix /debug/tracing/ file permissions. ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 031c46f11bb96..f681f646aa07d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("available_tracers", 0444, d_tracer,
 			&global_trace, &show_traces_fops);
 
-	trace_create_file("current_tracer", 0444, d_tracer,
+	trace_create_file("current_tracer", 0644, d_tracer,
 			&global_trace, &set_tracer_fops);
 
 	trace_create_file("tracing_max_latency", 0644, d_tracer,
@@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void)
 	trace_create_file("tracing_thresh", 0644, d_tracer,
 			&tracing_thresh, &tracing_max_lat_fops);
 
-	trace_create_file("README", 0644, d_tracer,
+	trace_create_file("README", 0444, d_tracer,
 			NULL, &tracing_readme_fops);
 
 	trace_create_file("trace_pipe", 0444, d_tracer,
-- 
GitLab


From 9b94b3a19b13e094c10f65f24bc358f6ffe4eacd Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 17 Apr 2009 12:07:46 +0200
Subject: [PATCH 0461/2198] x86: fixup numa_node information for AMD CPU
 northbridge functions

Currently the numa_node attribute for these PCI devices is 0 (it
corresponds to the numa_node for PCI bus 0). This is not a big issue
but incorrect.

This inconsistency can be fixed by reading the node number from CPU
NB function 0.

[ Impact: fill in dev->numa_node information, to optimize DMA allocations ]

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: jbarnes@virtuousgeek.org
LKML-Reference: <20090417100746.GG16198@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/quirks.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index e95022e4f5d5e..94ad0c029f02b 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -493,5 +493,42 @@ void force_hpet_resume(void)
 		break;
 	}
 }
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+	struct pci_dev *nb_ht;
+	unsigned int devfn;
+	u32 val;
+
+	devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
+	nb_ht = pci_get_slot(dev->bus, devfn);
+	if (!nb_ht)
+		return;
+
+	pci_read_config_dword(nb_ht, 0x60, &val);
+	set_dev_node(&dev->dev, val & 7);
+	pci_dev_put(dev);
+}
 
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+			quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+			quirk_amd_nb_node);
 #endif
-- 
GitLab


From 46de405f25f1d9fa73b657ffbb752aa0cc87a91d Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 10:53:43 +0800
Subject: [PATCH 0462/2198] tracing: Remove include/trace/kmem_event_types.h

kmem_event_types.h is no longer necessary since tracepoint definitions
are put into include/trace/events/kmem.h

[ Impact: remove now-unused file. ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49E7EF37.2080205@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/kmem_event_types.h | 193 -------------------------------
 1 file changed, 193 deletions(-)
 delete mode 100644 include/trace/kmem_event_types.h

diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h
deleted file mode 100644
index 4ff420fe46756..0000000000000
--- a/include/trace/kmem_event_types.h
+++ /dev/null
@@ -1,193 +0,0 @@
-
-/* use <trace/kmem.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM kmem
-
-TRACE_EVENT(kmalloc,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags)
-);
-
-TRACE_EVENT(kmem_cache_alloc,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags)
-);
-
-TRACE_EVENT(kmalloc_node,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags,
-		 int node),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-		__field(	int,		node		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-		__entry->node		= node;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags,
-		__entry->node)
-);
-
-TRACE_EVENT(kmem_cache_alloc_node,
-
-	TP_PROTO(unsigned long call_site,
-		 const void *ptr,
-		 size_t bytes_req,
-		 size_t bytes_alloc,
-		 gfp_t gfp_flags,
-		 int node),
-
-	TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-		__field(	size_t,		bytes_req	)
-		__field(	size_t,		bytes_alloc	)
-		__field(	gfp_t,		gfp_flags	)
-		__field(	int,		node		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-		__entry->bytes_req	= bytes_req;
-		__entry->bytes_alloc	= bytes_alloc;
-		__entry->gfp_flags	= gfp_flags;
-		__entry->node		= node;
-	),
-
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
-		__entry->call_site,
-		__entry->ptr,
-		__entry->bytes_req,
-		__entry->bytes_alloc,
-		__entry->gfp_flags,
-		__entry->node)
-);
-
-TRACE_EVENT(kfree,
-
-	TP_PROTO(unsigned long call_site, const void *ptr),
-
-	TP_ARGS(call_site, ptr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-	),
-
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
-);
-
-TRACE_EVENT(kmem_cache_free,
-
-	TP_PROTO(unsigned long call_site, const void *ptr),
-
-	TP_ARGS(call_site, ptr),
-
-	TP_STRUCT__entry(
-		__field(	unsigned long,	call_site	)
-		__field(	const void *,	ptr		)
-	),
-
-	TP_fast_assign(
-		__entry->call_site	= call_site;
-		__entry->ptr		= ptr;
-	),
-
-	TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
-);
-
-#undef TRACE_SYSTEM
-- 
GitLab


From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Fri, 17 Apr 2009 00:27:08 -0500
Subject: [PATCH 0463/2198] tracing/filters: add filter_mutex to protect filter
 predicates

This patch adds a filter_mutex to prevent the filter predicates from
being accessed concurrently by various external functions.

It's based on a previous patch by Li Zefan:
        "[PATCH 7/7] tracing/filters: make filter preds RCU safe"

v2 changes:

- fixed wrong value returned in a add_subsystem_pred() failure case
  noticed by Li Zefan.

[ Impact: fix trace filter corruption/crashes on parallel access ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Tested-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1239946028.6639.13.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  4 +-
 kernel/trace/trace_events.c        |  4 +-
 kernel/trace/trace_events_filter.c | 90 +++++++++++++++++++++++-------
 3 files changed, 75 insertions(+), 23 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8817c18ef97a9..247948e81b08d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -757,13 +757,15 @@ struct filter_pred {
 };
 
 extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds, int n_preds,
+extern void filter_print_preds(struct ftrace_event_call *call,
 			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_disable_preds(struct ftrace_event_call *call);
 extern void filter_free_subsystem_preds(struct event_subsystem *system);
+extern void filter_print_subsystem_preds(struct event_subsystem *system,
+					 struct trace_seq *s);
 extern int filter_add_subsystem_pred(struct event_subsystem *system,
 				     struct filter_pred *pred);
 
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1137f951be42e..64f9d6d2735b3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call->preds, call->n_preds, s);
+	filter_print_preds(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(system->preds, system->n_preds, s);
+	filter_print_subsystem_preds(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f8e5eab0424c6..e0fcfd2a16d69 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -22,10 +22,13 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/mutex.h>
 
 #include "trace.h"
 #include "trace_output.h"
 
+static DEFINE_MUTEX(filter_mutex);
+
 static int filter_pred_64(struct filter_pred *pred, void *event)
 {
 	u64 *addr = (u64 *)(event + pred->offset);
@@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-void filter_print_preds(struct filter_pred **preds, int n_preds,
-			struct trace_seq *s)
+static void __filter_print_preds(struct filter_pred **preds, int n_preds,
+				 struct trace_seq *s)
 {
 	char *field_name;
 	struct filter_pred *pred;
@@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds,
 	}
 }
 
+void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
+{
+	mutex_lock(&filter_mutex);
+	__filter_print_preds(call->preds, call->n_preds, s);
+	mutex_unlock(&filter_mutex);
+}
+
+void filter_print_subsystem_preds(struct event_subsystem *system,
+				  struct trace_seq *s)
+{
+	mutex_lock(&filter_mutex);
+	__filter_print_preds(system->preds, system->n_preds, s);
+	mutex_unlock(&filter_mutex);
+}
+
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
@@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest,
 	return 0;
 }
 
-void filter_disable_preds(struct ftrace_event_call *call)
+static void __filter_disable_preds(struct ftrace_event_call *call)
 {
 	int i;
 
@@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call)
 		call->preds[i]->fn = filter_pred_none;
 }
 
+void filter_disable_preds(struct ftrace_event_call *call)
+{
+	mutex_lock(&filter_mutex);
+	__filter_disable_preds(call);
+	mutex_unlock(&filter_mutex);
+}
+
 int init_preds(struct ftrace_event_call *call)
 {
 	struct filter_pred *pred;
@@ -223,7 +248,7 @@ int init_preds(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void __filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct ftrace_event_call *call;
 	int i;
@@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system)
 			continue;
 
 		if (!strcmp(call->system, system->name))
-			filter_disable_preds(call);
+			__filter_disable_preds(call);
 	}
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred,
-			     filter_pred_fn_t fn)
+void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	mutex_lock(&filter_mutex);
+	__filter_free_subsystem_preds(system);
+	mutex_unlock(&filter_mutex);
+}
+
+static int filter_add_pred_fn(struct ftrace_event_call *call,
+			      struct filter_pred *pred,
+			      filter_pred_fn_t fn)
 {
 	int idx, err;
 
 	if (call->n_preds && !pred->compound)
-		filter_disable_preds(call);
+		__filter_disable_preds(call);
 
 	if (call->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
@@ -276,7 +308,8 @@ static int is_string_field(const char *type)
 	return 0;
 }
 
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int __filter_add_pred(struct ftrace_event_call *call,
+			     struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
@@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 			return -EINVAL;
 		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return __filter_add_pred(call, pred, fn);
+		return filter_add_pred_fn(call, pred, fn);
 	} else {
 		if (pred->str_len)
 			return -EINVAL;
@@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 		return -EINVAL;
 	}
 
-	return __filter_add_pred(call, pred, fn);
+	return filter_add_pred_fn(call, pred, fn);
+}
+
+int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+{
+	int err;
+
+	mutex_lock(&filter_mutex);
+	err = __filter_add_pred(call, pred);
+	mutex_unlock(&filter_mutex);
+
+	return err;
 }
 
 int filter_add_subsystem_pred(struct event_subsystem *system,
@@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 {
 	struct ftrace_event_call *call;
 
+	mutex_lock(&filter_mutex);
+
 	if (system->n_preds && !pred->compound)
-		filter_free_subsystem_preds(system);
+		__filter_free_subsystem_preds(system);
 
 	if (!system->n_preds) {
 		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds)
+		if (!system->preds) {
+			mutex_unlock(&filter_mutex);
 			return -ENOMEM;
+		}
 	}
 
-	if (system->n_preds == MAX_FILTER_PRED)
+	if (system->n_preds == MAX_FILTER_PRED) {
+		mutex_unlock(&filter_mutex);
 		return -ENOSPC;
+	}
 
 	system->preds[system->n_preds] = pred;
+	system->n_preds++;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
@@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (strcmp(call->system, system->name))
 			continue;
 
-		if (!find_event_field(call, pred->field_name))
-			continue;
-
-		err = filter_add_pred(call, pred);
+		err = __filter_add_pred(call, pred);
 		if (err == -ENOMEM) {
 			system->preds[system->n_preds] = NULL;
+			system->n_preds--;
+			mutex_unlock(&filter_mutex);
 			return err;
 		}
 	}
 
-	system->n_preds++;
+	mutex_unlock(&filter_mutex);
 
 	return 0;
 }
-- 
GitLab


From b0afdc126d0515e76890f0a5f26b28501cfa298e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 13:02:22 -0400
Subject: [PATCH 0464/2198] tracing/events: enable code with EVENT_TRACING not
 EVENT_TRACER

The CONFIG_EVENT_TRACER is the way to turn on event tracing when no
other tracing has been configured. All code to get enabled should
depend on CONFIG_EVENT_TRACING. That is what is enabled when TRACING
(or CONFIG_EVENT_TRACER) is selected.

This patch enables the include/trace/ftrace.h file when
CONFIG_EVENT_TRACING is enabled.

[ Impact: fix warning in event tracer selftest ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/define_trace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 18869417109c0..7f1f23d601ecb 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -56,7 +56,7 @@
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
 #include <trace/ftrace.h>
 #endif
 
-- 
GitLab


From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 16:01:56 -0400
Subject: [PATCH 0465/2198] tracing: add EXPORT_SYMBOL_GPL for trace commits

Not all the necessary symbols were exported to allow for tracing
by modules. This patch adds them in.

[ Impact: allow modules to commit data to the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f681f646aa07d..183d788038e83 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
+EXPORT_SYMBOL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
+EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit);
 
 void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
 	ring_buffer_discard_commit(global_trace.buffer, event);
 }
-EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
 
 void
 trace_function(struct trace_array *tr,
-- 
GitLab


From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 16 Apr 2009 21:41:52 -0400
Subject: [PATCH 0466/2198] tracing: add same level recursion detection

The tracing infrastructure allows for recursion. That is, an interrupt
may interrupt the act of tracing an event, and that interrupt may very well
perform its own trace. This is a recursive trace, and is fine to do.

The problem arises when there is a bug, and the utility doing the trace
calls something that recurses back into the tracer. This recursion is not
caused by an external event like an interrupt, but by code that is not
expected to recurse. The result could be a lockup.

This patch adds a bitmask to the task structure that keeps track
of the trace recursion. To find the interrupt depth, the following
algorithm is used:

  level = hardirq_count() + softirq_count() + in_nmi;

Here, level will be the depth of interrutps and softirqs, and even handles
the nmi. Then the corresponding bit is set in the recursion bitmask.
If the bit was already set, we know we had a recursion at the same level
and we warn about it and fail the writing to the buffer.

After the data has been committed to the buffer, we clear the bit.
No atomics are needed. The only races are with interrupts and they reset
the bitmask before returning anywy.

[ Impact: detect same irq level trace recursion ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h     |  7 +++++++
 include/linux/init_task.h  |  1 +
 include/linux/sched.h      |  4 +++-
 kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 53 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 97c83e1bc5899..39b95c56587e8 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -488,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
 
 extern int ftrace_dump_on_oops;
 
+#ifdef CONFIG_PREEMPT
+#define INIT_TRACE_RECURSION		.trace_recursion = 0,
+#endif
+
 #endif /* CONFIG_TRACING */
 
+#ifndef INIT_TRACE_RECURSION
+#define INIT_TRACE_RECURSION
+#endif
 
 #ifdef CONFIG_HW_BRANCH_TRACER
 
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index dcfb93337e9a2..6fc2185298633 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -187,6 +187,7 @@ extern struct cred init_cred;
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
+	INIT_TRACE_RECURSION						\
 }
 
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049cb..7ede5e4909133 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1428,7 +1428,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
 	/* state flags for use by tracers */
 	unsigned long trace;
-#endif
+	/* bitmask of trace recursion */
+	unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 84a6055f37c90..b421b0ea91126 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static int trace_irq_level(void)
+{
+	return hardirq_count() + softirq_count() + in_nmi();
+}
+
+static int trace_recursive_lock(void)
+{
+	int level;
+
+	level = trace_irq_level();
+
+	if (unlikely(current->trace_recursion & (1 << level))) {
+		/* Disable all tracing before we do anything else */
+		tracing_off_permanent();
+		WARN_ON_ONCE(1);
+		return -1;
+	}
+
+	current->trace_recursion |= 1 << level;
+
+	return 0;
+}
+
+static void trace_recursive_unlock(void)
+{
+	int level;
+
+	level = trace_irq_level();
+
+	WARN_ON_ONCE(!current->trace_recursion & (1 << level));
+
+	current->trace_recursion &= ~(1 << level);
+}
+
 static DEFINE_PER_CPU(int, rb_need_resched);
 
 /**
@@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	/* If we are tracing schedule, we don't want to recurse */
 	resched = ftrace_preempt_disable();
 
+	if (trace_recursive_lock())
+		goto out_nocheck;
+
 	cpu = raw_smp_processor_id();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	return event;
 
  out:
+	trace_recursive_unlock();
+
+ out_nocheck:
 	ftrace_preempt_enable(resched);
 	return NULL;
 }
@@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_commit(cpu_buffer, event);
 
+	trace_recursive_unlock();
+
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-- 
GitLab


From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 17 Apr 2009 16:13:55 -0400
Subject: [PATCH 0467/2198] tracing: protect trace_printk from recursion

trace_printk can be called from any context, including NMIs.
If this happens, then we must test for for recursion before
grabbing any spinlocks.

This patch prevents trace_printk from being called recursively.

[ Impact: prevent hard lockup in lockdep event tracer ]

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 183d788038e83..b9a3adce9221a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	struct trace_array_cpu *data;
 	struct bprint_entry *entry;
 	unsigned long flags;
+	int disable;
 	int resched;
 	int cpu, len = 0, size, pc;
 
@@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 
 	/* Lockdep uses trace_printk for lock tracing */
@@ -1301,6 +1303,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 	local_irq_restore(flags);
 
 out:
+	atomic_dec_return(&data->disabled);
 	ftrace_preempt_enable(resched);
 	unpause_graph_tracing();
 
@@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	int cpu, len = 0, size, pc;
 	struct print_entry *entry;
 	unsigned long irq_flags;
+	int disable;
 
 	if (tracing_disabled || tracing_selftest_running)
 		return 0;
@@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	cpu = raw_smp_processor_id();
 	data = tr->data[cpu];
 
-	if (unlikely(atomic_read(&data->disabled)))
+	disable = atomic_inc_return(&data->disabled);
+	if (unlikely(disable != 1))
 		goto out;
 
 	pause_graph_tracing();
@@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 	raw_local_irq_restore(irq_flags);
 	unpause_graph_tracing();
  out:
+	atomic_dec_return(&data->disabled);
 	preempt_enable_notrace();
 
 	return len;
-- 
GitLab


From 8e668b5b3455207e4540fc7ccab9ecf70142f288 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <rostedt@goodmis.org>
Date: Fri, 17 Apr 2009 17:17:55 -0400
Subject: [PATCH 0468/2198] tracing: remove format attribute of inline function

Due to a cut and paste error, I added the gcc attribute for printf
format to the static inline stub of trace_seq_printf.

This will cause a compile failure.

[ Impact: fix compiler error when CONFIG_TRACING is off ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: =?ISO-8859-15?Q?Fr=E9d=E9ric_Weisbecker?= <fweisbec@gmail.com>
LKML-Reference: <alpine.DEB.2.00.0904171717080.1016@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/trace_seq.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 15ca2c71af132..37db9bdfbc1a0 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -42,7 +42,6 @@ extern int trace_seq_path(struct trace_seq *s, struct path *path);
 
 #else /* CONFIG_TRACING */
 static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-	__attribute__ ((format (printf, 2, 3)))
 {
 	return 0;
 }
-- 
GitLab


From 6f41469c627fe118121dfce2a8134ad24da3df28 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0469/2198] block: clear req->errors on bio completion only for
 fs requests

Impact: subtle behavior change

For fs requests, rq is only carrier of bios and rq error status as a
whole doesn't mean much.  This is the reason why rq->errors is being
cleared on each partial completion of a request as on each partial
completion the error status is transferred to the respective bios.

For pc requests, rq->errors is used to carry error status to the
issuer and thus __end_that_request_first() doesn't clear it on such
cases.

The condition was fine till now as only fs and pc requests have used
bio and thus the bio completion path.  However, future changes will
unify data accesses to bio and all non fs users care about rq error
status.  Clear rq->errors on bio completion only for fs requests.

In general, the implicit clearing is a bit too subtle especially as
the meaning of rq->errors is completely dependent on low level
drivers.  Unifying / cleaning up rq->errors usage and letting llds
manage it would be better.  TODO comment added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 07ab75403e1a5..cce7a88dc612a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1739,10 +1739,14 @@ static int __end_that_request_first(struct request *req, int error,
 	trace_block_rq_complete(req->q, req);
 
 	/*
-	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
-	 * sense key with us all the way through
+	 * For fs requests, rq is just carrier of independent bio's
+	 * and each partial completion should be handled separately.
+	 * Reset per-request error on each partial completion.
+	 *
+	 * TODO: tj: This is too subtle.  It would be better to let
+	 * low level drivers do what they see fit.
 	 */
-	if (!blk_pc_request(req))
+	if (blk_fs_request(req))
 		req->errors = 0;
 
 	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
-- 
GitLab


From 1e75540ec5202cae63cd238c86bd880e3d496546 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0470/2198] ide-tape: remove back-to-back REQUEST_SENSE
 detection

Impact: fix an oops which always triggers

ide_tape_issue_pc() assumed drive->pc isn't NULL on invocation when
checking for back-to-back request sense issues but drive->pc can be
NULL and even when it's not NULL, it's not safe to dereference it once
the previous command is complete because pc could have been freed or
was on stack.  Kill back-to-back REQUEST_SENSE detection.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index cb942a9b580f6..3a53e0834cf79 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -614,12 +614,6 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	if (drive->pc->c[0] == REQUEST_SENSE &&
-	    pc->c[0] == REQUEST_SENSE) {
-		printk(KERN_ERR "ide-tape: possible ide-tape.c bug - "
-			"Two request sense in serial were issued\n");
-	}
-
 	if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE)
 		drive->failed_pc = pc;
 
-- 
GitLab


From 853280a4dc8e3cc97ff10c1c02234d96078f437b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0471/2198] ide: use blk_run_queue() instead of
 blk_start_queueing()

blk_start_queueing() is being phased out in favor of
[__]blk_run_queue().  Switch.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-park.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 310d03f2b5b79..a914023d6d035 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 			start_queue = 1;
 		spin_unlock_irq(&hwif->lock);
 
-		if (start_queue) {
-			spin_lock_irq(q->queue_lock);
-			blk_start_queueing(q);
-			spin_unlock_irq(q->queue_lock);
-		}
+		if (start_queue)
+			blk_run_queue(q);
 		return;
 	}
 	spin_unlock_irq(&hwif->lock);
-- 
GitLab


From 55f3f399422a4a3f6cb84ea4096dfaddf8817399 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0472/2198] ide: don't set REQ_SOFTBARRIER

ide doesn't have to worry about REQ_SOFTBARRIER.  Don't set it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-disk.c   | 1 -
 drivers/ide/ide-ioctls.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a9fbe2c31210c..c2438804d3c4a 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
 	cmd->protocol = ATA_PROT_NODATA;
 
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	rq->special = cmd;
 }
 
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index c1c25ebbaa1fb..5991b23793f20 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive)
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_len = 1;
 	rq->cmd[0] = REQ_DRIVE_RESET;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
 		ret = rq->errors;
 	blk_put_request(rq);
-- 
GitLab


From 46a802e852c2106d755f697819ea80cd3ffdc222 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0473/2198] ide kill unused ide_cmd->special

Impact: removal of unused field

No one uses ide_cmd->special anymore.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ide.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/ide.h b/include/linux/ide.h
index ff65fffb078f6..846a1e132407d 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -324,7 +324,6 @@ struct ide_cmd {
 	unsigned int		cursg_ofs;
 
 	struct request		*rq;		/* copy of request */
-	void			*special;	/* valid_t generally */
 };
 
 /* ATAPI packet command flags */
-- 
GitLab


From 1873b90cdea038715ec7140fccc2116fb930ffb5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0474/2198] ide-cd: clear sense buffer before issuing request
 sense

Impact: code simplification

ide_cd_request_sense_fixup() clears the tail of the sense buffer if
the device didn't completely fill it.  This patch makes
cdrom_queue_request_sense() clear the sense buffer before issuing the
command instead of clearing it afterwards.  This simplifies code and
eases future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-cd.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3aec19d1fdfca..509f1293cae2c 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -217,6 +217,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	if (sense == NULL)
 		sense = &info->sense_data;
 
+	memset(sense, 0, 18);
+
 	/* stuff the sense request in front of our current request */
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_ATA_PC;
@@ -504,14 +506,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 	 * and some drives don't send them.  Sigh.
 	 */
 	if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
-	    cmd->nleft > 0 && cmd->nleft <= 5) {
-		unsigned int ofs = cmd->nbytes - cmd->nleft;
-
-		while (cmd->nleft > 0) {
-			*((u8 *)rq->data + ofs++) = 0;
-			cmd->nleft--;
-		}
-	}
+	    cmd->nleft > 0 && cmd->nleft <= 5)
+		cmd->nleft = 0;
 }
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
-- 
GitLab


From 7f006dc24fae158131116c9472874f12e16cf040 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0475/2198] ide-floppy: block pc always uses bio

Impact: remove unnecessary code path

Block pc requests always use bio and rq->data is always NULL.  No need
to worry about !rq->bio cases in idefloppy_block_pc_cmd().  Note that
ide-atapi uses ide_pio_bytes() for bio PIO transfer which handle sg
fine.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-floppy.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 2b4868d95f8b0..3b22e066287e0 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -216,15 +216,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	if (rq->data_len && rq_data_dir(rq) == WRITE)
-		pc->flags |= PC_FLAG_WRITING;
-	pc->buf = rq->data;
-	if (rq->bio)
+	if (rq->data_len) {
 		pc->flags |= PC_FLAG_DMA_OK;
-	/*
-	 * possibly problematic, doesn't look like ide-floppy correctly
-	 * handled scattered requests if dma fails...
-	 */
+		if (rq_data_dir(rq) == WRITE)
+			pc->flags |= PC_FLAG_WRITING;
+	}
+	/* pio will be performed by ide_pio_bytes() which handles sg fine */
+	pc->buf = NULL;
 	pc->req_xfer = pc->buf_size = rq->data_len;
 }
 
-- 
GitLab


From eace4cb04c0edc9388e987bf9bbdef461f6daca4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0476/2198] ide-taskfile: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide_raw_taskfile() directly uses rq->buffer to carry pointer to the
data buffer.  This complicates both block interface and ide backend
request handling.  Use blk_rq_map_kern() instead and drop special
handling for REQ_TYPE_ATA_TASKFILE from ide_map_sg().

Note that REQ_RW setting is moved upwards as blk_rq_map_kern() uses it
to initialize bio rw flag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c       |  5 +----
 drivers/ide/ide-taskfile.c | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 35dc38d3b2c58..9b9e8b1aae5ee 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,10 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
-		sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE);
-		cmd->sg_nents = 1;
-	} else if (!rq->bio) {
+	if (!rq->bio) {
 		sg_init_one(sg, rq->data, rq->data_len);
 		cmd->sg_nents = 1;
 	} else
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4aa6223c11bea..f400eb4d4aff7 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->buffer = buf;
+
+	if (cmd->tf_flags & IDE_TFLAG_WRITE)
+		rq->cmd_flags |= REQ_RW;
 
 	/*
 	 * (ks) We transfer currently only whole sectors.
@@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	 * if we would find a solution to transfer any size.
 	 * To support special commands like READ LONG.
 	 */
-	rq->hard_nr_sectors = rq->nr_sectors = nsect;
-	rq->hard_cur_sectors = rq->current_nr_sectors = nsect;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		rq->cmd_flags |= REQ_RW;
+	if (nsect) {
+		error = blk_rq_map_kern(drive->queue, rq, buf,
+					nsect * SECTOR_SIZE, __GFP_WAIT);
+		if (error)
+			goto put_req;
+	}
 
 	rq->special = cmd;
 	cmd->rq = rq;
 
 	error = blk_execute_rq(drive->queue, NULL, rq, 0);
-	blk_put_request(rq);
 
+put_req:
+	blk_put_request(rq);
 	return error;
 }
 
-- 
GitLab


From c267cc1c4db4ccb3406d045a8da8660f0bbfe08d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0477/2198] ide-atapi: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-atapi uses rq->buffer as private opaque value for internal special
requests.  rq->special isn't used for these cases (the only case where
rq->special is used is for ide-tape rw requests).  Use rq->special
instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-atapi.c  | 4 ++--
 drivers/ide/ide-floppy.c | 2 +-
 drivers/ide/ide-tape.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 7201b176d75b0..2894577237ba7 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -90,7 +90,7 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 	rq->rq_disk = disk;
 
 	if (pc->req_xfer) {
@@ -119,7 +119,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
 		rq->data = pc->buf;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3b22e066287e0..94600331a2715 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -264,7 +264,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 	} else if (blk_special_request(rq)) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 3a53e0834cf79..aadf53cfac6f0 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -828,7 +828,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		goto out;
 	}
 	if (rq->cmd[13] & REQ_IDETAPE_PC1) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 		rq->cmd[13] &= ~(REQ_IDETAPE_PC1);
 		rq->cmd[13] |= REQ_IDETAPE_PC2;
 		goto out;
-- 
GitLab


From cbfd082abfcbed8c57a12636f36e9bead8d6cfc6 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0478/2198] ide-cd: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-cd uses rq->buffer to carry pointer to the original request when
issuing REQUEST_SENSE.  Use rq->special instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-cd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 509f1293cae2c..eb3c299d95de1 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -232,8 +232,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	rq->cmd_type = REQ_TYPE_SENSE;
 	rq->cmd_flags |= REQ_PREEMPT;
 
-	/* NOTE! Save the failed command in "rq->buffer" */
-	rq->buffer = (void *) failed_command;
+	/* NOTE! Save the failed command in "rq->special" */
+	rq->special = (void *)failed_command;
 
 	if (failed_command)
 		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
@@ -247,10 +247,10 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_SENSE, "rq->buffer" points to the original
+	 * For REQ_TYPE_SENSE, "rq->special" points to the original
 	 * failed request
 	 */
-	struct request *failed = (struct request *)rq->buffer;
+	struct request *failed = (struct request *)rq->special;
 	struct cdrom_info *info = drive->driver_data;
 	void *sense = &info->sense_data;
 
-- 
GitLab


From a1df5169f9bf08f6067029bfb840a05e282b1b97 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0479/2198] ide: add helpers for preparing sense requests

This is in preparation of removing the queueing of a sense request out
of the IRQ handler path.

Use struct request_sense as a general sense buffer for all ATAPI
devices ide-{floppy,tape,cd}.

tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as
      it can cause deadlock.  Converted to use inline struct request
      and blk_rq_init().

    * Added xfer / cdb len selection depending on device type.

    * All sense prep logics folded into ide_prep_sense() which never
      fails.

    * hwif->rq clearing and sense_rq used handling moved into
      ide_queue_sense_rq().

    * blk_rq_map_kern() conversion is moved to later patch.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c | 61 +++++++++++++++++++++++++++++++++++++++++
 include/linux/ide.h     | 11 ++++++++
 2 files changed, 72 insertions(+)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2894577237ba7..c6e03485a63a8 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq)
+{
+	struct request_sense *sense = &drive->sense_data;
+	struct request *sense_rq = &drive->sense_rq;
+	unsigned int cmd_len, sense_len;
+
+	debug_log("%s: enter\n", __func__);
+
+	switch (drive->media) {
+	case ide_floppy:
+		cmd_len = 255;
+		sense_len = 18;
+		break;
+	case ide_tape:
+		cmd_len = 20;
+		sense_len = 20;
+		break;
+	default:
+		cmd_len = 18;
+		sense_len = 18;
+	}
+
+	BUG_ON(sense_len > sizeof(*sense));
+
+	if (blk_sense_request(rq) || drive->sense_rq_armed)
+		return;
+
+	memset(sense, 0, sizeof(*sense));
+
+	blk_rq_init(rq->q, sense_rq);
+	sense_rq->rq_disk = rq->rq_disk;
+
+	sense_rq->data = sense;
+	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
+	sense_rq->cmd[4] = cmd_len;
+	sense_rq->data_len = sense_len;
+
+	sense_rq->cmd_type = REQ_TYPE_SENSE;
+	sense_rq->cmd_flags |= REQ_PREEMPT;
+
+	if (drive->media == ide_tape)
+		sense_rq->cmd[13] = REQ_IDETAPE_PC1;
+
+	drive->sense_rq_armed = true;
+}
+EXPORT_SYMBOL_GPL(ide_prep_sense);
+
+void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+{
+	BUG_ON(!drive->sense_rq_armed);
+
+	drive->sense_rq.special = special;
+	drive->sense_rq_armed = false;
+
+	drive->hwif->rq = NULL;
+
+	elv_add_request(drive->queue, &drive->sense_rq,
+			ELEVATOR_INSERT_FRONT, 0);
+}
+EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
+
 /*
  * Called when an error was detected during the last packet command.
  * We queue a request sense packet command in the head of the request list.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 846a1e132407d..a69ccac564111 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -26,6 +26,9 @@
 #include <asm/io.h>
 #include <asm/mutex.h>
 
+/* for request_sense */
+#include <linux/cdrom.h>
+
 #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300)
 # define SUPPORT_VLB_SYNC 0
 #else
@@ -602,6 +605,11 @@ struct ide_drive_s {
 
 	struct ide_atapi_pc request_sense_pc;
 	struct request request_sense_rq;
+
+	/* current sense rq and buffer */
+	bool sense_rq_armed;
+	struct request sense_rq;
+	struct request_sense sense_data;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *, struct gendisk *);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq);
+void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+
 int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
-- 
GitLab


From 746d5e43274e9ea6cbd58818afc9239d41fb4e1e Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0480/2198] ide-cd: convert to using generic sense request

Preallocate a sense request in the ->do_request method and reinitialize
it only on demand, in case it's been consumed in the IRQ handler path.
The reason for this is that we don't want to be mapping rq to bio in
the IRQ path and introduce all kinds of unnecessary hacks to the block
layer.

tj: * Both user and kernel PC requests expect sense data to be stored
      in separate storage other than drive->sense_data.  Copy sense
      data to rq->sense on completion if rq->sense is not NULL.  This
      fixes bogus sense data on PC requests.

As a result, remove cdrom_queue_request_sense.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-cd.c | 54 ++++++++++----------------------------------
 drivers/ide/ide-cd.h |  4 ----
 2 files changed, 12 insertions(+), 46 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index eb3c299d95de1..7b21c7eac5b05 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -206,44 +206,6 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 	ide_cd_log_error(drive->name, failed_command, sense);
 }
 
-static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
-				      struct request *failed_command)
-{
-	struct cdrom_info *info		= drive->driver_data;
-	struct request *rq		= &drive->request_sense_rq;
-
-	ide_debug_log(IDE_DBG_SENSE, "enter");
-
-	if (sense == NULL)
-		sense = &info->sense_data;
-
-	memset(sense, 0, 18);
-
-	/* stuff the sense request in front of our current request */
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_ATA_PC;
-	rq->rq_disk = info->disk;
-
-	rq->data = sense;
-	rq->cmd[0] = GPCMD_REQUEST_SENSE;
-	rq->cmd[4] = 18;
-	rq->data_len = 18;
-
-	rq->cmd_type = REQ_TYPE_SENSE;
-	rq->cmd_flags |= REQ_PREEMPT;
-
-	/* NOTE! Save the failed command in "rq->special" */
-	rq->special = (void *)failed_command;
-
-	if (failed_command)
-		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
-					     failed_command->cmd[0]);
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
@@ -251,11 +213,16 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 	 * failed request
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct cdrom_info *info = drive->driver_data;
-	void *sense = &info->sense_data;
+	struct request_sense *sense = &drive->sense_data;
 
 	if (failed) {
 		if (failed->sense) {
+			/*
+			 * Sense is always read into drive->sense_data.
+			 * Copy back if the failed request has its
+			 * sense pointer set.
+			 */
+			memcpy(failed->sense, sense, 18);
 			sense = failed->sense;
 			failed->sense_len = rq->sense_len;
 		}
@@ -431,7 +398,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		cdrom_queue_request_sense(drive, NULL, NULL);
+		ide_queue_sense_rq(drive, NULL);
 	return 1;
 
 end_request:
@@ -445,7 +412,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 		hwif->rq = NULL;
 
-		cdrom_queue_request_sense(drive, rq->sense, rq);
+		ide_queue_sense_rq(drive, rq);
 		return 1;
 	} else
 		return 2;
@@ -893,6 +860,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		goto out_end;
 	}
 
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 1d97101099ce2..93a3cf1b0f3f8 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -87,10 +87,6 @@ struct cdrom_info {
 
 	struct atapi_toc *toc;
 
-	/* The result of the last successful request sense command
-	   on this device. */
-	struct request_sense sense_data;
-
 	u8 max_speed;		/* Max speed of the drive. */
 	u8 current_speed;	/* Current speed of the drive. */
 
-- 
GitLab


From 6b544fcc8cd0a04eb42de9d1ecdd345e979d6ada Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0481/2198] ide-atapi: convert ide-{floppy,tape} to using
 preallocated sense buffer

Since we're issuing REQ_TYPE_SENSE now we need to allow those types of
rqs in the ->do_request callbacks. As a future improvement, sense_len
assignment might be unified across all ATAPI devices. Borislav to
check with specs and test.

As a result, get rid of ide_queue_pc_head() and
drive->request_sense_rq.

tj: * Init request sense ide_atapi_pc from sense request.  In the
      longer timer, it would probably better to fold
      ide_create_request_sense_cmd() into its only current user -
      ide_floppy_get_format_progress().

    * ide_retry_pc() no longer takes @disk.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c  | 48 ++++++++++++----------------------------
 drivers/ide/ide-floppy.c |  4 +++-
 drivers/ide/ide-tape.c   |  7 ++++--
 include/linux/ide.h      |  3 +--
 4 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index c6e03485a63a8..972c522516f8f 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_init_pc);
 
-/*
- * Generate a new packet command request in front of the request queue, before
- * the current request, so that it will be processed immediately, on the next
- * pass through the driver.
- */
-static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
-			      struct ide_atapi_pc *pc, struct request *rq)
-{
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = (char *)pc;
-	rq->rq_disk = disk;
-
-	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
-	}
-
-	memcpy(rq->cmd, pc->c, 12);
-	if (drive->media == ide_tape)
-		rq->cmd[13] = REQ_IDETAPE_PC1;
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 /*
  * Add a special packet command request to the tail of the request queue,
  * and wait for it to be serviced.
@@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
 /*
  * Called when an error was detected during the last packet command.
- * We queue a request sense packet command in the head of the request list.
+ * We queue a request sense packet command at the head of the request
+ * queue.
  */
-void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk)
+void ide_retry_pc(ide_drive_t *drive)
 {
-	struct request *rq = &drive->request_sense_rq;
+	struct request *sense_rq = &drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
 	(void)ide_read_error(drive);
-	ide_create_request_sense_cmd(drive, pc);
+
+	/* init pc from sense_rq */
+	ide_init_pc(pc);
+	memcpy(pc->c, sense_rq->cmd, 12);
+	pc->buf = sense_rq->data;
+	pc->req_xfer = sense_rq->data_len;
+
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
-	ide_queue_pc_head(drive, disk, pc, rq);
+
+	ide_queue_sense_rq(drive, pc);
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
 
 			/* Retry operation */
-			ide_retry_pc(drive, rq->rq_disk);
+			ide_retry_pc(drive);
 
 			/* queued, but not started */
 			return ide_stopped;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 94600331a2715..d3302cc891e42 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		}
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-	} else if (blk_special_request(rq)) {
+	} else if (blk_special_request(rq) || blk_sense_request(rq)) {
 		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
@@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		goto out_end;
 	}
 
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index aadf53cfac6f0..8324dfa78a3f9 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
 				printk(KERN_ERR "ide-tape: %s: I/O error, ",
 						tape->name);
 			/* Retry operation */
-			ide_retry_pc(drive, tape->disk);
+			ide_retry_pc(drive);
 			return ide_stopped;
 		}
 		pc->error = 0;
@@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 			(unsigned long long)rq->sector, rq->nr_sectors,
 			rq->current_nr_sectors);
 
-	if (!blk_special_request(rq)) {
+	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
 		printk(KERN_NOTICE "ide-tape: %s: Unsupported request in "
 			"request queue (%d)\n", drive->name, rq->cmd_type);
@@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	BUG();
 
 out:
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a69ccac564111..9e67ccac3c1f1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -604,7 +604,6 @@ struct ide_drive_s {
 	unsigned long atapi_flags;
 
 	struct ide_atapi_pc request_sense_pc;
-	struct request request_sense_rq;
 
 	/* current sense rq and buffer */
 	bool sense_rq_armed;
@@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
 int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
 int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
-void ide_retry_pc(ide_drive_t *, struct gendisk *);
+void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
 void ide_queue_sense_rq(ide_drive_t *drive, void *special);
-- 
GitLab


From 5c4be57249e2e09136446597d2fe2a967c6ffef0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0482/2198] ide-cd,atapi: use bio for internal commands

Impact: unify request data buffer handling

rq->data is used mostly to pass kernel buffer through request queue
without using bio.  There are only a couple of places which still do
this in kernel and converting to bio isn't difficult.

This patch converts ide-cd and atapi to use bio instead of rq->data
for request sense and internal pc commands.  With previous change to
unify sense request handling, this is relatively easily achieved by
adding blk_rq_map_kern() during sense_rq prep and PC issue.

If blk_rq_map_kern() fails for sense, the error is deferred till sense
issue and aborts the failed command which triggered the sense.  Note
that this is a slim possibility as sense prep is done on each command
issue, so for the above condition to actually trigger, all preps since
the last sense issue till the issue of the request which would require
a sense should fail.

* do_request functions might sleep now.  This should be okay as ide
  request_fn - do_ide_request() - is invoked only from make_request
  and plug work.  Make sure this is the case by adding might_sleep()
  to do_ide_request().

* Functions which access the read sense data before the sense request
  is complete now should access bio_data(sense_rq->bio) as the sense
  buffer might have been copied during blk_rq_map_kern().

* ide-tape updated to map sg.

* cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC
  special case.  Simplified.

* tp_ops->output/input_data path dropped from ide_pc_intr().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c | 52 ++++++++++++++++++++++++-----------------
 drivers/ide/ide-cd.c    | 28 +++++++++++-----------
 drivers/ide/ide-io.c    |  3 +++
 drivers/ide/ide-tape.c  |  3 +++
 include/linux/ide.h     |  2 +-
 5 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 972c522516f8f..5cefe12f5622b 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
+		error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer,
+					GFP_NOIO);
+		if (error)
+			goto put_req;
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		rq->cmd[13] = REQ_IDETAPE_PC1;
 	error = blk_execute_rq(drive->queue, disk, rq, 0);
+put_req:
 	blk_put_request(rq);
-
 	return error;
 }
 EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
@@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	struct request_sense *sense = &drive->sense_data;
 	struct request *sense_rq = &drive->sense_rq;
 	unsigned int cmd_len, sense_len;
+	int err;
 
 	debug_log("%s: enter\n", __func__);
 
@@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	memset(sense, 0, sizeof(*sense));
 
 	blk_rq_init(rq->q, sense_rq);
-	sense_rq->rq_disk = rq->rq_disk;
 
-	sense_rq->data = sense;
+	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
+			      GFP_NOIO);
+	if (unlikely(err)) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: failed to map sense buffer\n",
+			       drive->name);
+		return;
+	}
+
+	sense_rq->rq_disk = rq->rq_disk;
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
-	sense_rq->data_len = sense_len;
-
 	sense_rq->cmd_type = REQ_TYPE_SENSE;
 	sense_rq->cmd_flags |= REQ_PREEMPT;
 
@@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(ide_prep_sense);
 
-void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 {
-	BUG_ON(!drive->sense_rq_armed);
+	/* deferred failure from ide_prep_sense() */
+	if (!drive->sense_rq_armed) {
+		printk(KERN_WARNING "%s: failed queue sense request\n",
+		       drive->name);
+		return -ENOMEM;
+	}
 
 	drive->sense_rq.special = special;
 	drive->sense_rq_armed = false;
@@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special)
 
 	elv_add_request(drive->queue, &drive->sense_rq,
 			ELEVATOR_INSERT_FRONT, 0);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
@@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive)
 	/* init pc from sense_rq */
 	ide_init_pc(pc);
 	memcpy(pc->c, sense_rq->cmd, 12);
-	pc->buf = sense_rq->data;
+	pc->buf = bio_data(sense_rq->bio);	/* pointer to mapped address */
 	pc->req_xfer = sense_rq->data_len;
 
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
 
-	ide_queue_sense_rq(drive, pc);
+	if (ide_queue_sense_rq(drive, pc))
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	xfer_func_t *xferfunc;
 	unsigned int timeout, done;
 	u16 bcount;
 	u8 stat, ireason, dsc = 0;
@@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape)
+			if (drive->media == ide_tape && !rq->bio)
 				done = ide_rq_bytes(rq); /* FIXME */
 			else
 				done = blk_rq_bytes(rq);
@@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
-
-	if (drive->media == ide_floppy && pc->buf == NULL) {
+	if (drive->media == ide_tape && pc->bh)
+		done = drive->pc_io_buffers(drive, pc, bcount, write);
+	else {
 		done = min_t(unsigned int, bcount, cmd->nleft);
 		ide_pio_bytes(drive, cmd, write, done);
-	} else if (drive->media == ide_tape && pc->bh) {
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	} else {
-		done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
-		xferfunc(drive, NULL, pc->cur_pos, done);
 	}
 
 	/* Update the current position */
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 7b21c7eac5b05..392a5bdf2fd36 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
 	 * For REQ_TYPE_SENSE, "rq->special" points to the original
-	 * failed request
+	 * failed request.  Also, the sense data should be read
+	 * directly from rq which might be different from the original
+	 * sense buffer if it got copied during mapping.
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct request_sense *sense = &drive->sense_data;
+	void *sense = bio_data(rq->bio);
 
 	if (failed) {
 		if (failed->sense) {
@@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		ide_queue_sense_rq(drive, NULL);
+		return ide_queue_sense_rq(drive, NULL) ? 2 : 1;
 	return 1;
 
 end_request:
@@ -412,8 +414,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 		hwif->rq = NULL;
 
-		ide_queue_sense_rq(drive, rq);
-		return 1;
+		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
 	} else
 		return 2;
 }
@@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		rq->cmd_flags |= cmd_flags;
 		rq->timeout = timeout;
 		if (buffer) {
-			rq->data = buffer;
-			rq->data_len = *bufflen;
+			error = blk_rq_map_kern(drive->queue, rq, buffer,
+						*bufflen, GFP_NOIO);
+			if (error) {
+				blk_put_request(rq);
+				return error;
+			}
 		}
 
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
@@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 	drive->dma = 0;
 
 	/* sg request */
-	if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) {
+	if (rq->bio) {
 		struct request_queue *q = drive->queue;
+		char *buf = bio_data(rq->bio);
 		unsigned int alignment;
-		char *buf;
-
-		if (rq->bio)
-			buf = bio_data(rq->bio);
-		else
-			buf = rq->data;
 
 		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 9b9e8b1aae5ee..3245c2dbda339 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q)
 
 	spin_unlock_irq(q->queue_lock);
 
+	/* HLD do_request() callback might sleep, make sure it's okay */
+	might_sleep();
+
 	if (ide_lock_host(host, hwif))
 		goto plug_device_2;
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8324dfa78a3f9..9b762a2d5d955 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -850,6 +850,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 
 	cmd.rq = rq;
 
+	ide_init_sg_cmd(&cmd, pc->req_xfer);
+	ide_map_sg(drive, &cmd);
+
 	return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9e67ccac3c1f1..1957461ac762c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
-void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+int ide_queue_sense_rq(ide_drive_t *drive, void *special);
 
 int ide_cd_expiry(ide_drive_t *);
 
-- 
GitLab


From fc38b521dcffcb07447cd98fedc56f495c10b90d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:43 +0900
Subject: [PATCH 0483/2198] ide-pm: don't abuse rq->data

Impact: cleanup rq->data usage

ide-pm uses rq->data to carry pointer to struct request_pm_state
through request queue and rq->special is used to carray pointer to
local struct ide_cmd, which isn't necessary.  Use rq->special for
request_pm_state instead and use local ide_cmd in
ide_start_power_step().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c |  2 +-
 drivers/ide/ide-pm.c | 38 +++++++++++++++-----------------------
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 3245c2dbda339..6e3094e227752 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -368,7 +368,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
 			return execute_drive_cmd(drive, rq);
 		else if (blk_pm_request(rq)) {
-			struct request_pm_state *pm = rq->data;
+			struct request_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 0d8a151c0a01d..ba1488bd84307 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int ret;
 
 	/* call ACPI _GTM only once */
@@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		ide_acpi_get_timing(hwif);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
 		mesg.event = PM_EVENT_FREEZE;
@@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int err;
 
 	/* call ACPI _PS0 / _STM only once */
@@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev)
 	ide_acpi_exec_tfs(drive);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_RESUME;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
@@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
-	struct ide_cmd *cmd = rq->special;
-
-	memset(cmd, 0, sizeof(*cmd));
+	struct request_pm_state *pm = rq->special;
+	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
 	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
@@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			return ide_stopped;
 		}
 		if (ata_id_flush_ext_enabled(drive->id))
-			cmd->tf.command = ATA_CMD_FLUSH_EXT;
+			cmd.tf.command = ATA_CMD_FLUSH_EXT;
 		else
-			cmd->tf.command = ATA_CMD_FLUSH;
+			cmd.tf.command = ATA_CMD_FLUSH;
 		goto out_do_tf;
 	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		cmd->tf.command = ATA_CMD_STANDBYNOW1;
+		cmd.tf.command = ATA_CMD_STANDBYNOW1;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
 		ide_set_max_pio(drive);
@@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			ide_complete_power_step(drive, rq);
 		return ide_stopped;
 	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
-		cmd->tf.command = ATA_CMD_IDLEIMMEDIATE;
+		cmd.tf.command = ATA_CMD_IDLEIMMEDIATE;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
 		/*
@@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 	return ide_stopped;
 
 out_do_tf:
-	cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd->valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd->protocol = ATA_PROT_NODATA;
+	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
+	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
+	cmd.protocol = ATA_PROT_NODATA;
 
-	return do_rw_taskfile(drive, cmd);
+	return do_rw_taskfile(drive, &cmd);
 }
 
 /**
@@ -181,7 +173,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 	if (blk_pm_suspend_request(rq) &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
-- 
GitLab


From ea7066afcd590e4663e6dc010f93704164050f48 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0484/2198] ide-tape,floppy: fix failed command completion
 after request sense

Impact: fix infinite retry loop

After a command failed, ide-tape and floppy inserts REQUEST_SENSE in
front of the failed command and according to the result, sets
pc->retries, flags and errors.  After REQUEST_SENSE is complete, the
failed command is again at the front of the queue and if the verdict
was to terminate the request, the issue functions tries to complete it
directly by calling drive->pc_callback() and returning ide_stopped.

However, drive->pc_callback() doesn't complete a request.  It only
prepares for completion of the request.  As a result, this creates an
infinite loop where the failed request is retried perpetually.

Fix it by actually ending the request by calling ide_complete_rq().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-floppy.c | 1 +
 drivers/ide/ide-tape.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d3302cc891e42..d20704ac3183a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -141,6 +141,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 9b762a2d5d955..2b9a13671c5fc 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -643,6 +643,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 		}
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
GitLab


From b3071d190d6757b14af002a9d79832f12de61bce Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0485/2198] ide-atapi,tape,floppy: allow ->pc_callback() to
 change rq->data_len

Impact: allow residual count implementation in ->pc_callback()

rq->data_len has two duties - carrying the number of input bytes on
issue and carrying residual count back to the issuer on completion.
ide-atapi completion callback ->pc_callback() is the right place to do
this but currently ide-atapi depends on rq->data_len carrying the
original request size after calling ->pc_callback() to complete the pc
request.

This patch makes ide_pc_intr(), ide_tape_issue_pc() and
ide_floppy_issue_pc() cache length to complete before calling
->pc_callback() so that it can modify rq->data_len as necessary.

Note: As using rq->data_len for two purposes can make cases like this
      incorrect in subtle ways, future changes will introduce separate
      field for residual count.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-atapi.c  | 16 ++++++++++------
 drivers/ide/ide-floppy.c |  5 ++++-
 drivers/ide/ide-tape.c   |  5 ++++-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 5cefe12f5622b..3df5442de710d 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -409,6 +409,16 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
 			dsc = 1;
 
+		/*
+		 * ->pc_callback() might change rq->data_len for
+		 * residual count, cache total length.
+		 */
+		if (!blk_special_request(rq) &&
+		    (drive->media == ide_tape && !rq->bio))
+			done = ide_rq_bytes(rq);        /* FIXME */
+		else
+			done = blk_rq_bytes(rq);
+
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
 
@@ -417,7 +427,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		if (blk_special_request(rq)) {
 			rq->errors = 0;
-			done = blk_rq_bytes(rq);
 			error = 0;
 		} else {
 
@@ -426,11 +435,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape && !rq->bio)
-				done = ide_rq_bytes(rq); /* FIXME */
-			else
-				done = blk_rq_bytes(rq);
-
 			error = uptodate ? 0 : -EIO;
 		}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d20704ac3183a..537b7c5580339 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -134,14 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 	drive->pc = pc;
 
 	if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR))
 			ide_floppy_report_error(floppy, pc);
+
 		/* Giving up */
 		pc->error = IDE_DRV_ERROR_GENERAL;
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2b9a13671c5fc..8226d52504d08 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -622,6 +622,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 
 	if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
 		(pc->flags & PC_FLAG_ABORT)) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		/*
 		 * We will "abort" retrying a packet command in case legitimate
 		 * error code was received (crossing a filemark, or end of the
@@ -641,9 +643,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 			/* Giving up */
 			pc->error = IDE_DRV_ERROR_GENERAL;
 		}
+
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
GitLab


From 35ab8d3251833e4052aa64b09b08195e949518c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0486/2198] ide-tape: use single continuous buffer

Impact: simpler buffer allocation and handling, kills OOM, fix DMA transfers

ide-tape has its own multiple buffer mechanism using struct
idetape_bh.  It allocates buffer with decreasing order-of-two
allocations so that it results in minimum number of segments.
However, the implementation is quite complex and works in a way that
no other block or ide driver works necessitating a lot of special case
handling.

The benefit this complex allocation scheme brings is questionable as
PIO or DMA the number of segments (16 maximum) doesn't make any
noticeable difference and it also doesn't negate the need for multiple
order allocation which can fail under memory pressure or high
fragmentation although it does lower the highest order necessary by
one when the buffer size isn't power of two.

As the first step to remove the custom buffer management, this patch
makes ide-tape allocate single continous buffer.  The maximum order is
four.  I doubt the change would cause any trouble but if it ever
matters, it should be converted to regular sg mechanism like everyone
else and even in that case dropping custom buffer handling and moving
to standard mechanism first make sense as an intermediate step.

This patch makes the first bh to contain the whole buffer and drops
multi bh handling code.  Following patches will make further changes.

This patch has the side effect of killing OOM triggered by allocation
path and fixing DMA transfers.  Previously, bug in alloc path
triggered OOM on command issue and commands were passed to DMA engine
without DMA-mapping all the segments.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 256 ++++++++++-------------------------------
 1 file changed, 58 insertions(+), 198 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8226d52504d08..b2afbc7bcb6b1 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -134,7 +134,6 @@ enum {
 struct idetape_bh {
 	u32 b_size;
 	atomic_t b_count;
-	struct idetape_bh *b_reqnext;
 	char *b_data;
 };
 
@@ -228,10 +227,6 @@ typedef struct ide_tape_obj {
 	char *b_data;
 	int b_count;
 
-	int pages_per_buffer;
-	/* Wasted space in each stage */
-	int excess_bh_size;
-
 	/* Measures average tape speed */
 	unsigned long avg_time;
 	int avg_size;
@@ -303,9 +298,7 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min(
 			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
 			bcount);
@@ -313,15 +306,10 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 					atomic_read(&bh->b_count), count);
 		bcount -= count;
 		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			pc->bh = NULL;
 	}
 
-	pc->bh = bh;
-
 	return bcount;
 }
 
@@ -331,22 +319,14 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
 		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
 		bcount -= count;
 		pc->b_data += count;
 		pc->b_count -= count;
-		if (!pc->b_count) {
-			bh = bh->b_reqnext;
-			pc->bh = bh;
-			if (bh) {
-				pc->b_data = bh->b_data;
-				pc->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		if (!pc->b_count)
+			pc->bh = NULL;
 	}
 
 	return bcount;
@@ -355,24 +335,20 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
 {
 	struct idetape_bh *bh = pc->bh;
-	int count;
 	unsigned int bcount = pc->xferred;
 
 	if (pc->flags & PC_FLAG_WRITING)
 		return;
-	while (bcount) {
-		if (bh == NULL) {
+	if (bcount) {
+		if (bh == NULL || bcount > bh->b_size) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return;
 		}
-		count = min((unsigned int)bh->b_size, (unsigned int)bcount);
-		atomic_set(&bh->b_count, count);
+		atomic_set(&bh->b_count, bcount);
 		if (atomic_read(&bh->b_count) == bh->b_size)
-			bh = bh->b_reqnext;
-		bcount -= count;
+			pc->bh = NULL;
 	}
-	pc->bh = bh;
 }
 
 /*
@@ -439,24 +415,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 /* Free data buffers completely. */
 static void ide_tape_kfree_buffer(idetape_tape_t *tape)
 {
-	struct idetape_bh *prev_bh, *bh = tape->merge_bh;
-
-	while (bh) {
-		u32 size = bh->b_size;
-
-		while (size) {
-			unsigned int order = fls(size >> PAGE_SHIFT)-1;
-
-			if (bh->b_data)
-				free_pages((unsigned long)bh->b_data, order);
+	struct idetape_bh *bh = tape->merge_bh;
 
-			size &= (order-1);
-			bh->b_data += (1 << order) * PAGE_SIZE;
-		}
-		prev_bh = bh;
-		bh = bh->b_reqnext;
-		kfree(prev_bh);
-	}
+	kfree(bh->b_data);
+	kfree(bh);
 }
 
 static void ide_tape_handle_dsc(ide_drive_t *);
@@ -861,117 +823,50 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 }
 
 /*
- * The function below uses __get_free_pages to allocate a data buffer of size
- * tape->buffer_size (or a bit more). We attempt to combine sequential pages as
- * much as possible.
- *
- * It returns a pointer to the newly allocated buffer, or NULL in case of
- * failure.
+ * It returns a pointer to the newly allocated buffer, or NULL in case
+ * of failure.
  */
 static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full, int clear)
-{
-	struct idetape_bh *prev_bh, *bh, *merge_bh;
-	int pages = tape->pages_per_buffer;
-	unsigned int order, b_allocd;
-	char *b_data = NULL;
-
-	merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	bh = merge_bh;
-	if (bh == NULL)
-		goto abort;
-
-	order = fls(pages) - 1;
-	bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-	if (!bh->b_data)
-		goto abort;
-	b_allocd = (1 << order) * PAGE_SIZE;
-	pages &= (order-1);
-
-	if (clear)
-		memset(bh->b_data, 0, b_allocd);
-	bh->b_reqnext = NULL;
-	bh->b_size = b_allocd;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+						  int full)
+{
+	struct idetape_bh *bh;
 
-	while (pages) {
-		order = fls(pages) - 1;
-		b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-		if (!b_data)
-			goto abort;
-		b_allocd = (1 << order) * PAGE_SIZE;
-
-		if (clear)
-			memset(b_data, 0, b_allocd);
-
-		/* newly allocated page frames below buffer header or ...*/
-		if (bh->b_data == b_data + b_allocd) {
-			bh->b_size += b_allocd;
-			bh->b_data -= b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		/* they are above the header */
-		if (b_data == bh->b_data + bh->b_size) {
-			bh->b_size += b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		prev_bh = bh;
-		bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-		if (!bh) {
-			free_pages((unsigned long) b_data, order);
-			goto abort;
-		}
-		bh->b_reqnext = NULL;
-		bh->b_data = b_data;
-		bh->b_size = b_allocd;
-		atomic_set(&bh->b_count, full ? bh->b_size : 0);
-		prev_bh->b_reqnext = bh;
+	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+	if (!bh)
+		return NULL;
 
-		pages &= (order-1);
+	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!bh->b_data) {
+		kfree(bh);
+		return NULL;
 	}
 
-	bh->b_size -= tape->excess_bh_size;
-	if (full)
-		atomic_sub(tape->excess_bh_size, &bh->b_count);
-	return merge_bh;
-abort:
-	ide_tape_kfree_buffer(tape);
-	return NULL;
+	bh->b_size = tape->buffer_size;
+	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+
+	return bh;
 }
 
 static int idetape_copy_stage_from_user(idetape_tape_t *tape,
 					const char __user *buf, int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min((unsigned int)
-				(bh->b_size - atomic_read(&bh->b_count)),
-				(unsigned int)n);
 		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				count))
+				   n))
 			ret = 1;
-		n -= count;
-		atomic_add(count, &bh->b_count);
-		buf += count;
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		atomic_add(n, &bh->b_count);
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			tape->bh = NULL;
 	}
-	tape->bh = bh;
+
 	return ret;
 }
 
@@ -979,30 +874,20 @@ static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
 				      int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > tape->b_count) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min(tape->b_count, n);
-		if  (copy_to_user(buf, tape->b_data, count))
+		if (copy_to_user(buf, tape->b_data, n))
 			ret = 1;
-		n -= count;
-		tape->b_data += count;
-		tape->b_count -= count;
-		buf += count;
-		if (!tape->b_count) {
-			bh = bh->b_reqnext;
-			tape->bh = bh;
-			if (bh) {
-				tape->b_data = bh->b_data;
-				tape->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		tape->b_data += n;
+		tape->b_count -= n;
+		if (!tape->b_count)
+			tape->bh = NULL;
 	}
 	return ret;
 }
@@ -1254,7 +1139,7 @@ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks, min;
+	int blocks;
 	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
@@ -1269,31 +1154,16 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	if (tape->merge_bh_size) {
 		blocks = tape->merge_bh_size / tape->blk_size;
 		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i;
-
+			unsigned int i = tape->blk_size -
+				tape->merge_bh_size % tape->blk_size;
 			blocks++;
-			i = tape->blk_size - tape->merge_bh_size %
-				tape->blk_size;
-			bh = tape->bh->b_reqnext;
-			while (bh) {
-				atomic_set(&bh->b_count, 0);
-				bh = bh->b_reqnext;
-			}
 			bh = tape->bh;
-			while (i) {
-				if (bh == NULL) {
-					printk(KERN_INFO "ide-tape: bug,"
-							 " bh NULL\n");
-					break;
-				}
-				min = min(i, (unsigned int)(bh->b_size -
-						atomic_read(&bh->b_count)));
+			if (bh) {
 				memset(bh->b_data + atomic_read(&bh->b_count),
-						0, min);
-				atomic_add(min, &bh->b_count);
-				i -= min;
-				bh = bh->b_reqnext;
-			}
+				       0, i);
+				atomic_add(i, &bh->b_count);
+			} else
+				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
 		tape->merge_bh_size = 0;
@@ -1321,7 +1191,7 @@ static int idetape_init_read(ide_drive_t *drive)
 					 " 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
@@ -1368,23 +1238,18 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh;
+	struct idetape_bh *bh = tape->merge_bh;
 	int blocks;
 
 	while (bcount) {
 		unsigned int count;
 
-		bh = tape->merge_bh;
 		count = min(tape->buffer_size, bcount);
 		bcount -= count;
 		blocks = count / tape->blk_size;
-		while (count) {
-			atomic_set(&bh->b_count,
-				   min(count, (unsigned int)bh->b_size));
-			memset(bh->b_data, 0, atomic_read(&bh->b_count));
-			count -= atomic_read(&bh->b_count);
-			bh = bh->b_reqnext;
-		}
+		atomic_set(&bh->b_count, count);
+		memset(bh->b_data, 0, atomic_read(&bh->b_count));
+
 		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
 				      tape->merge_bh);
 	}
@@ -1596,7 +1461,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 				"should be 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
@@ -1970,7 +1835,7 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0);
+	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
 	if (tape->merge_bh != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
@@ -2201,11 +2066,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 		tape->buffer_size = *ctl * tape->blk_size;
 	}
 	buffer_size = tape->buffer_size;
-	tape->pages_per_buffer = buffer_size / PAGE_SIZE;
-	if (buffer_size % PAGE_SIZE) {
-		tape->pages_per_buffer++;
-		tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE;
-	}
 
 	/* select the "best" DSC read/write polling freq */
 	speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]);
-- 
GitLab


From 21d9c5d227593d15630ae83a336d1519653e9b8a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0487/2198] ide-tape: use standard data transfer mechanism

Impact: use standard way to transfer data

ide-tape uses rq in an interesting way.  For r/w requests, rq->special
is used to carry a private buffer management structure idetape_bh and
rq->nr_sectors and current_nr_sectors are initialized to the number of
idetape blocks which isn't necessary 512 bytes.  Also,
rq->current_nr_sectors is used to report back the residual count in
units of idetape blocks.

This peculiarity taxes both block layer and ide.  ide-atapi has
different paths and hooks to accomodate it and what a rq means becomes
quite confusing and making changes at the block layer becomes quite
difficult and error-prone.

This patch makes ide-tape use bio instead.  With the previous patch,
ide-tape currently is using single contiguos buffer so replacing it
isn't difficult.  Data buffer is mapped into bio using
blk_rq_map_kern() in idetape_queue_rw_tail().  idetape_io_buffers()
and idetape_update_buffers() are dropped and pc->bh is set to null to
tell ide-atapi to use standard data transfer mechanism and idetape_bh
byte counts are updated by the issuer on completion using the residual
count.

This change also nicely removes the FIXME in ide_pc_intr() where
ide-tape rqs need to be completed using ide_rq_bytes() instead of
blk_rq_bytes() (although this didn't really matter as the request
didn't have bio).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-atapi.c |   6 +--
 drivers/ide/ide-tape.c  | 112 +++++++++-------------------------------
 2 files changed, 24 insertions(+), 94 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 3df5442de710d..b9dd4503cbc70 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -413,11 +413,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		 * ->pc_callback() might change rq->data_len for
 		 * residual count, cache total length.
 		 */
-		if (!blk_special_request(rq) &&
-		    (drive->media == ide_tape && !rq->bio))
-			done = ide_rq_bytes(rq);        /* FIXME */
-		else
-			done = blk_rq_bytes(rq);
+		done = blk_rq_bytes(rq);
 
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b2afbc7bcb6b1..b373ac876352a 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -292,65 +292,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min(
-			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
-			bcount);
-		drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data +
-					atomic_read(&bh->b_count), count);
-		bcount -= count;
-		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				   unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
-		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
-		bcount -= count;
-		pc->b_data += count;
-		pc->b_count -= count;
-		if (!pc->b_count)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct idetape_bh *bh = pc->bh;
-	unsigned int bcount = pc->xferred;
-
-	if (pc->flags & PC_FLAG_WRITING)
-		return;
-	if (bcount) {
-		if (bh == NULL || bcount > bh->b_size) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return;
-		}
-		atomic_set(&bh->b_count, bcount);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-}
-
 /*
  * called on each failed packet command retry to analyze the request sense. We
  * currently do not utilize this information.
@@ -368,12 +309,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 		 pc->c[0], tape->sense_key, tape->asc, tape->ascq);
 
 	/* Correct pc->xferred by asking the tape.	 */
-	if (pc->flags & PC_FLAG_DMA_ERROR) {
+	if (pc->flags & PC_FLAG_DMA_ERROR)
 		pc->xferred = pc->req_xfer -
 			tape->blk_size *
 			get_unaligned_be32(&sense[3]);
-		idetape_update_buffers(drive, pc);
-	}
 
 	/*
 	 * If error was the result of a zero-length read or write command,
@@ -458,7 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->current_nr_sectors -= blocks;
+		rq->data_len -= blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
@@ -520,19 +459,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive)
 	idetape_postpone_request(drive);
 }
 
-static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				unsigned int bcount, int write)
-{
-	unsigned int bleft;
-
-	if (write)
-		bleft = idetape_output_buffers(drive, pc, bcount);
-	else
-		bleft = idetape_input_buffers(drive, pc, bcount);
-
-	return bcount - bleft;
-}
-
 /*
  * Packet Command Interface
  *
@@ -683,7 +609,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = bh;
+	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
@@ -1063,10 +989,12 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 				 struct idetape_bh *bh)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	size_t size = blocks * tape->blk_size;
 	struct request *rq;
-	int ret, errors;
+	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
+	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -1074,21 +1002,29 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->rq_disk = tape->disk;
 	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
-	rq->nr_sectors = blocks;
-	rq->current_nr_sectors = blocks;
+
+	if (size) {
+		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+				      __GFP_WAIT);
+		if (ret)
+			goto out_put;
+	}
+
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	errors = rq->errors;
-	ret = tape->blk_size * (blocks - rq->current_nr_sectors);
-	blk_put_request(rq);
+	/* calculate the number of transferred bytes and update bh */
+	size -= rq->data_len;
+	if (cmd == REQ_IDETAPE_READ)
+		atomic_add(size, &bh->b_count);
 
-	if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0)
-		return 0;
+	ret = size;
+	if (rq->errors == IDE_DRV_ERROR_GENERAL)
+		ret = -EIO;
 
 	if (tape->merge_bh)
 		idetape_init_merge_buffer(tape);
-	if (errors == IDE_DRV_ERROR_GENERAL)
-		return -EIO;
+out_put:
+	blk_put_request(rq);
 	return ret;
 }
 
@@ -2034,8 +1970,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	u16 *ctl = (u16 *)&tape->caps[12];
 
 	drive->pc_callback	 = ide_tape_callback;
-	drive->pc_update_buffers = idetape_update_buffers;
-	drive->pc_io_buffers	 = ide_tape_io_buffers;
 
 	drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
 
-- 
GitLab


From 963da55c4b9eeeb2085ca74ba927cf77bce966d4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0488/2198] ide-tape: kill idetape_bh

Impact: kill now unnecessary idetape_bh

With everything using standard mechanisms, there is no need for
idetape_bh anymore.  Kill it and use tape->buf, cur and valid to
describe data buffer instead.

Changes worth mentioning are...

* idetape_queue_rq_tail() now always queue tape->buf and and adjusts
  buffer state properly before completion.

* idetape_pad_zeros() clears the buffer only once.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 305 ++++++++++++-----------------------------
 1 file changed, 84 insertions(+), 221 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b373ac876352a..d2e9c09b52159 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -131,12 +131,6 @@ enum {
 	IDETAPE_DIR_WRITE = (1 << 2),
 };
 
-struct idetape_bh {
-	u32 b_size;
-	atomic_t b_count;
-	char *b_data;
-};
-
 /* Tape door status */
 #define DOOR_UNLOCKED			0
 #define DOOR_LOCKED			1
@@ -218,14 +212,12 @@ typedef struct ide_tape_obj {
 
 	/* Data buffer size chosen based on the tape's recommendation */
 	int buffer_size;
-	/* merge buffer */
-	struct idetape_bh *merge_bh;
-	/* size of the merge buffer */
-	int merge_bh_size;
-	/* pointer to current buffer head within the merge buffer */
-	struct idetape_bh *bh;
-	char *b_data;
-	int b_count;
+	/* Staging buffer of buffer_size bytes */
+	void *buf;
+	/* The read/write cursor */
+	void *cur;
+	/* The number of valid bytes in buf */
+	size_t valid;
 
 	/* Measures average tape speed */
 	unsigned long avg_time;
@@ -351,15 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 	}
 }
 
-/* Free data buffers completely. */
-static void ide_tape_kfree_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-
-	kfree(bh->b_data);
-	kfree(bh);
-}
-
 static void ide_tape_handle_dsc(ide_drive_t *);
 
 static int ide_tape_callback(ide_drive_t *drive, int dsc)
@@ -603,8 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 				   struct ide_atapi_pc *pc, struct request *rq,
 				   u8 opcode)
 {
-	struct idetape_bh *bh = (struct idetape_bh *)rq->special;
-	unsigned int length = rq->current_nr_sectors;
+	unsigned int length = rq->nr_sectors;
 
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
@@ -616,14 +598,11 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	if (pc->req_xfer == tape->buffer_size)
 		pc->flags |= PC_FLAG_DMA_OK;
 
-	if (opcode == READ_6) {
+	if (opcode == READ_6)
 		pc->c[0] = READ_6;
-		atomic_set(&bh->b_count, 0);
-	} else if (opcode == WRITE_6) {
+	else if (opcode == WRITE_6) {
 		pc->c[0] = WRITE_6;
 		pc->flags |= PC_FLAG_WRITING;
-		pc->b_data = bh->b_data;
-		pc->b_count = atomic_read(&bh->b_count);
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
@@ -639,10 +618,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu,"
-			" current_nr_sectors: %u\n",
-			(unsigned long long)rq->sector, rq->nr_sectors,
-			rq->current_nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
+		  (unsigned long long)rq->sector, rq->nr_sectors);
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
@@ -748,89 +725,6 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
-/*
- * It returns a pointer to the newly allocated buffer, or NULL in case
- * of failure.
- */
-static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full)
-{
-	struct idetape_bh *bh;
-
-	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	if (!bh)
-		return NULL;
-
-	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
-	if (!bh->b_data) {
-		kfree(bh);
-		return NULL;
-	}
-
-	bh->b_size = tape->buffer_size;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
-
-	return bh;
-}
-
-static int idetape_copy_stage_from_user(idetape_tape_t *tape,
-					const char __user *buf, int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				   n))
-			ret = 1;
-		atomic_add(n, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			tape->bh = NULL;
-	}
-
-	return ret;
-}
-
-static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
-				      int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > tape->b_count) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_to_user(buf, tape->b_data, n))
-			ret = 1;
-		tape->b_data += n;
-		tape->b_count -= n;
-		if (!tape->b_count)
-			tape->bh = NULL;
-	}
-	return ret;
-}
-
-static void idetape_init_merge_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-	tape->bh = tape->merge_bh;
-
-	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
-		atomic_set(&bh->b_count, 0);
-	else {
-		tape->b_data = bh->b_data;
-		tape->b_count = atomic_read(&bh->b_count);
-	}
-}
-
 /*
  * Write a filemark if write_filemark=1. Flush the device buffers without
  * writing a filemark otherwise.
@@ -928,10 +822,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
 		return;
 
 	clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags);
-	tape->merge_bh_size = 0;
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	tape->valid = 0;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
@@ -985,8 +879,7 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
-				 struct idetape_bh *bh)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	size_t size = blocks * tape->blk_size;
@@ -1000,11 +893,10 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
-	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
 
 	if (size) {
-		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
 				      __GFP_WAIT);
 		if (ret)
 			goto out_put;
@@ -1012,17 +904,17 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	/* calculate the number of transferred bytes and update bh */
+	/* calculate the number of transferred bytes and update buffer state */
 	size -= rq->data_len;
+	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
-		atomic_add(size, &bh->b_count);
+		tape->valid = size;
+	else
+		tape->valid = 0;
 
 	ret = size;
 	if (rq->errors == IDE_DRV_ERROR_GENERAL)
 		ret = -EIO;
-
-	if (tape->merge_bh)
-		idetape_init_merge_buffer(tape);
 out_put:
 	blk_put_request(rq);
 	return ret;
@@ -1064,49 +956,33 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 /* Queue up a character device originated write request. */
 static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				     blocks, tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int blocks;
-	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
 				" but we are not writing.\n");
 		return;
 	}
-	if (tape->merge_bh_size > tape->buffer_size) {
-		printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n");
-		tape->merge_bh_size = tape->buffer_size;
-	}
-	if (tape->merge_bh_size) {
-		blocks = tape->merge_bh_size / tape->blk_size;
-		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i = tape->blk_size -
-				tape->merge_bh_size % tape->blk_size;
+	if (tape->buf) {
+		blocks = tape->valid / tape->blk_size;
+		if (tape->valid % tape->blk_size) {
 			blocks++;
-			bh = tape->bh;
-			if (bh) {
-				memset(bh->b_data + atomic_read(&bh->b_count),
-				       0, i);
-				atomic_add(i, &bh->b_count);
-			} else
-				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
+			memset(tape->buf + tape->valid, 0,
+			       tape->blk_size - tape->valid % tape->blk_size);
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
-		tape->merge_bh_size = 0;
 	}
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
@@ -1122,15 +998,15 @@ static int idetape_init_read(ide_drive_t *drive)
 			ide_tape_flush_merge_buffer(drive);
 			idetape_flush_tape_buffers(drive);
 		}
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size should be"
-					 " 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a read 0 command to ensure that DSC handshake is
@@ -1140,11 +1016,10 @@ static int idetape_init_read(ide_drive_t *drive)
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_READ, 0);
 			if (bytes_read < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return bytes_read;
 			}
@@ -1157,8 +1032,6 @@ static int idetape_init_read(ide_drive_t *drive)
 /* called from idetape_chrdev_read() to service a chrdev read request. */
 static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
 
 	/* If we are at a filemark, return a read length of 0 */
@@ -1167,27 +1040,21 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_read(drive);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks,
-				     tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh = tape->merge_bh;
-	int blocks;
+
+	memset(tape->buf, 0, tape->buffer_size);
 
 	while (bcount) {
-		unsigned int count;
+		unsigned int count = min(tape->buffer_size, bcount);
 
-		count = min(tape->buffer_size, bcount);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+				      count / tape->blk_size);
 		bcount -= count;
-		blocks = count / tape->blk_size;
-		atomic_set(&bh->b_count, count);
-		memset(bh->b_data, 0, atomic_read(&bh->b_count));
-
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
-				      tape->merge_bh);
 	}
 }
 
@@ -1267,7 +1134,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
 	}
 
 	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		tape->merge_bh_size = 0;
+		tape->valid = 0;
 		if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 			++count;
 		ide_tape_discard_merge_buffer(drive, 0);
@@ -1333,20 +1200,20 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		return rc;
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		actually_read = min((unsigned int)(tape->merge_bh_size),
-				    (unsigned int)count);
-		if (idetape_copy_stage_to_user(tape, buf, actually_read))
+	if (tape->valid) {
+		actually_read = min_t(unsigned int, tape->valid, count);
+		if (copy_to_user(buf, tape->cur, actually_read))
 			ret = -EFAULT;
 		buf += actually_read;
-		tape->merge_bh_size -= actually_read;
 		count -= actually_read;
+		tape->cur += actually_read;
+		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
 		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
 		if (bytes_read <= 0)
 			goto finish;
-		if (idetape_copy_stage_to_user(tape, buf, bytes_read))
+		if (copy_to_user(buf, tape->cur, bytes_read))
 			ret = -EFAULT;
 		buf += bytes_read;
 		count -= bytes_read;
@@ -1357,10 +1224,11 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (idetape_copy_stage_to_user(tape, buf, temp))
+		if (copy_to_user(buf, tape->cur, temp))
 			ret = -EFAULT;
 		actually_read += temp;
-		tape->merge_bh_size = bytes_read-temp;
+		tape->cur += temp;
+		tape->valid -= temp;
 	}
 finish:
 	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
@@ -1392,16 +1260,15 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		if (tape->chrdev_dir == IDETAPE_DIR_READ)
 			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size "
-				"should be 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		idetape_init_merge_buffer(tape);
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a write 0 command to ensure that DSC handshake is
@@ -1411,11 +1278,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_WRITE, 0);
 			if (retval < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return retval;
 			}
@@ -1423,23 +1289,19 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		if (tape->merge_bh_size >= tape->buffer_size) {
-			printk(KERN_ERR "ide-tape: bug: merge buf too big\n");
-			tape->merge_bh_size = 0;
-		}
-		actually_written = min((unsigned int)
-				(tape->buffer_size - tape->merge_bh_size),
-				(unsigned int)count);
-		if (idetape_copy_stage_from_user(tape, buf, actually_written))
-				ret = -EFAULT;
+	if (tape->valid < tape->buffer_size) {
+		actually_written = min_t(unsigned int,
+					 tape->buffer_size - tape->valid,
+					 count);
+		if (copy_from_user(tape->cur, buf, actually_written))
+			ret = -EFAULT;
 		buf += actually_written;
-		tape->merge_bh_size += actually_written;
 		count -= actually_written;
+		tape->cur += actually_written;
+		tape->valid += actually_written;
 
-		if (tape->merge_bh_size == tape->buffer_size) {
+		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			tape->merge_bh_size = 0;
 			retval = idetape_add_chrdev_write_request(drive, ctl);
 			if (retval <= 0)
 				return (retval);
@@ -1447,7 +1309,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	while (count >= tape->buffer_size) {
 		ssize_t retval;
-		if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size))
+		if (copy_from_user(tape->cur, buf, tape->buffer_size))
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
@@ -1458,9 +1320,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count) {
 		actually_written += count;
-		if (idetape_copy_stage_from_user(tape, buf, count))
+		if (copy_from_user(tape->cur, buf, count))
 			ret = -EFAULT;
-		tape->merge_bh_size += count;
+		tape->cur += count;
+		tape->valid += count;
 	}
 	return ret ? ret : actually_written;
 }
@@ -1623,7 +1486,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 		idetape_flush_tape_buffers(drive);
 	}
 	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-		block_offset = tape->merge_bh_size /
+		block_offset = tape->valid /
 			(tape->blk_size * tape->user_bs_factor);
 		position = idetape_read_position(drive);
 		if (position < 0)
@@ -1771,12 +1634,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
-	if (tape->merge_bh != NULL) {
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (tape->buf != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	idetape_write_filemark(drive);
 	idetape_flush_tape_buffers(drive);
@@ -2042,7 +1905,7 @@ static void ide_tape_release(struct device *dev)
 	ide_drive_t *drive = tape->drive;
 	struct gendisk *g = tape->disk;
 
-	BUG_ON(tape->merge_bh_size);
+	BUG_ON(tape->valid);
 
 	drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
 	drive->driver_data = NULL;
-- 
GitLab


From 88f1b941c5c94016a59144a3c94c9ca31eb16205 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0489/2198] ide-tape: unify r/w init paths

Impact: cleanup

Read and write init paths are almost identical.  Unify them into
idetape_init_rw().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 110 +++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 64 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d2e9c09b52159..4da7fa9bca1ea 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -987,42 +987,50 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
 
-static int idetape_init_read(ide_drive_t *drive)
+static int idetape_init_rw(ide_drive_t *drive, int dir)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int bytes_read;
+	int rc;
 
-	/* Initialize read operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
-		if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-			ide_tape_flush_merge_buffer(drive);
-			idetape_flush_tape_buffers(drive);
-		}
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_READ;
-		tape->cur = tape->buf;
+	BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE);
 
-		/*
-		 * Issue a read 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode.
-		 * No point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0);
-			if (bytes_read < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return bytes_read;
-			}
+	if (tape->chrdev_dir == dir)
+		return 0;
+
+	if (tape->chrdev_dir == IDETAPE_DIR_READ)
+		ide_tape_discard_merge_buffer(drive, 1);
+	else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
+		ide_tape_flush_merge_buffer(drive);
+		idetape_flush_tape_buffers(drive);
+	}
+
+	if (tape->buf || tape->valid) {
+		printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+		tape->valid = 0;
+	}
+
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!tape->buf)
+		return -ENOMEM;
+	tape->chrdev_dir = dir;
+	tape->cur = tape->buf;
+
+	/*
+	 * Issue a 0 rw command to ensure that DSC handshake is
+	 * switched from completion mode to buffer available mode.  No
+	 * point in issuing this if DSC overlap isn't supported, some
+	 * drives (Seagate STT3401A) will return an error.
+	 */
+	if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
+		int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ
+						  : REQ_IDETAPE_WRITE;
+
+		rc = idetape_queue_rw_tail(drive, cmd, 0);
+		if (rc < 0) {
+			kfree(tape->buf);
+			tape->buf = NULL;
+			tape->chrdev_dir = IDETAPE_DIR_NONE;
+			return rc;
 		}
 	}
 
@@ -1038,7 +1046,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 		return 0;
 
-	idetape_init_read(drive);
+	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
 	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
@@ -1195,7 +1203,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
-	rc = idetape_init_read(drive);
+	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
 	if (count == 0)
@@ -1249,6 +1257,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
 	u16 ctl = *(u16 *)&tape->caps[12];
+	int rc;
 
 	/* The drive is write protected. */
 	if (tape->write_prot)
@@ -1257,36 +1266,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	/* Initialize write operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-		if (tape->chrdev_dir == IDETAPE_DIR_READ)
-			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		tape->cur = tape->buf;
-
-		/*
-		 * Issue a write 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode. No
-		 * point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0);
-			if (retval < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return retval;
-			}
-		}
-	}
+	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
+	if (rc < 0)
+		return rc;
 	if (count == 0)
 		return (0);
 	if (tape->valid < tape->buffer_size) {
-- 
GitLab


From 6bb11dd14f70228f8dab25fd25dabeb9bc74926d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0490/2198] ide-tape: use byte size instead of sectors on rw
 issue functions

Impact: cleanup

Byte size is what most issue functions deal with, make
idetape_queue_rw_tail() and its wrappers take byte size instead of
sector counts.  idetape_chrdev_read() and write() functions are
converted to use tape->buffer_size instead of ctl from tape->cap.

This cleans up code a little bit and will ease the next r/w
reimplementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 45 +++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 4da7fa9bca1ea..d5e9bb286e306 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -879,15 +879,15 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	size_t size = blocks * tape->blk_size;
 	struct request *rq;
 	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
+	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -954,17 +954,16 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 }
 
 /* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
 {
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
@@ -972,15 +971,10 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		return;
 	}
 	if (tape->buf) {
-		blocks = tape->valid / tape->blk_size;
-		if (tape->valid % tape->blk_size) {
-			blocks++;
-			memset(tape->buf + tape->valid, 0,
-			       tape->blk_size - tape->valid % tape->blk_size);
-		}
-		(void) idetape_add_chrdev_write_request(drive, blocks);
-	}
-	if (tape->buf != NULL) {
+		size_t aligned = roundup(tape->valid, tape->blk_size);
+
+		memset(tape->cur, 0, aligned - tape->valid);
+		idetape_add_chrdev_write_request(drive, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1038,9 +1032,9 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 }
 
 /* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
 {
-	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
+	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
 
 	/* If we are at a filemark, return a read length of 0 */
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
@@ -1048,7 +1042,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
@@ -1060,8 +1054,7 @@ static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 	while (bcount) {
 		unsigned int count = min(tape->buffer_size, bcount);
 
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				      count / tape->blk_size);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count);
 		bcount -= count;
 	}
 }
@@ -1193,7 +1186,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t bytes_read, temp, actually_read = 0, rc;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1218,7 +1210,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		if (copy_to_user(buf, tape->cur, bytes_read))
@@ -1228,7 +1221,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		actually_read += bytes_read;
 	}
 	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
@@ -1256,7 +1250,6 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 	int rc;
 
 	/* The drive is write protected. */
@@ -1284,7 +1277,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 
 		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive, ctl);
+			retval = idetape_add_chrdev_write_request(drive,
+							tape->buffer_size);
 			if (retval <= 0)
 				return (retval);
 		}
@@ -1295,7 +1289,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive, ctl);
+		retval = idetape_add_chrdev_write_request(drive,
+							  tape->buffer_size);
 		actually_written += tape->buffer_size;
 		if (retval <= 0)
 			return (retval);
-- 
GitLab


From 07bd9686c50c2b1f10e48089d4fb836a971f5177 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0491/2198] ide-tape: simplify read/write functions

Impact: cleanup

idetape_chrdev_read/write() functions are unnecessarily complex when
everything can be handled in a single loop.  Collapse
idetape_add_chrdev_read/write_request() into the rw functions and
simplify the implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 149 ++++++++++++++---------------------------
 1 file changed, 50 insertions(+), 99 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d5e9bb286e306..2599579e4174d 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -953,14 +953,6 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-/* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
-}
-
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -974,7 +966,7 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		size_t aligned = roundup(tape->valid, tape->blk_size);
 
 		memset(tape->cur, 0, aligned - tape->valid);
-		idetape_add_chrdev_write_request(drive, aligned);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1031,20 +1023,6 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 	return 0;
 }
 
-/* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
-
-	/* If we are at a filemark, return a read length of 0 */
-	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
-		return 0;
-
-	idetape_init_rw(drive, IDETAPE_DIR_READ);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
-}
-
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1184,8 +1162,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t bytes_read, temp, actually_read = 0, rc;
+	size_t done = 0;
 	ssize_t ret = 0;
+	int rc;
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1195,52 +1174,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
+
 	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid) {
-		actually_read = min_t(unsigned int, tape->valid, count);
-		if (copy_to_user(buf, tape->cur, actually_read))
-			ret = -EFAULT;
-		buf += actually_read;
-		count -= actually_read;
-		tape->cur += actually_read;
-		tape->valid -= actually_read;
-	}
-	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		if (copy_to_user(buf, tape->cur, bytes_read))
-			ret = -EFAULT;
-		buf += bytes_read;
-		count -= bytes_read;
-		actually_read += bytes_read;
-	}
-	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (copy_to_user(buf, tape->cur, temp))
+
+	while (done < count) {
+		size_t todo;
+
+		/* refill if staging buffer is empty */
+		if (!tape->valid) {
+			/* If we are at a filemark, nothing more to read */
+			if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+				break;
+			/* read */
+			if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
+						  tape->buffer_size) <= 0)
+				break;
+		}
+
+		/* copy out */
+		todo = min_t(size_t, count - done, tape->valid);
+		if (copy_to_user(buf + done, tape->cur, todo))
 			ret = -EFAULT;
-		actually_read += temp;
-		tape->cur += temp;
-		tape->valid -= temp;
+
+		tape->cur += todo;
+		tape->valid -= todo;
+		done += todo;
 	}
-finish:
-	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
+
+	if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
 		debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
 
 		idetape_space_over_filemarks(drive, MTFSF, 1);
 		return 0;
 	}
 
-	return ret ? ret : actually_read;
+	return ret ? ret : done;
 }
 
 static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
@@ -1248,7 +1218,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t actually_written = 0;
+	size_t done = 0;
 	ssize_t ret = 0;
 	int rc;
 
@@ -1262,47 +1232,28 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid < tape->buffer_size) {
-		actually_written = min_t(unsigned int,
-					 tape->buffer_size - tape->valid,
-					 count);
-		if (copy_from_user(tape->cur, buf, actually_written))
-			ret = -EFAULT;
-		buf += actually_written;
-		count -= actually_written;
-		tape->cur += actually_written;
-		tape->valid += actually_written;
-
-		if (tape->valid == tape->buffer_size) {
-			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive,
-							tape->buffer_size);
-			if (retval <= 0)
-				return (retval);
-		}
-	}
-	while (count >= tape->buffer_size) {
-		ssize_t retval;
-		if (copy_from_user(tape->cur, buf, tape->buffer_size))
-			ret = -EFAULT;
-		buf += tape->buffer_size;
-		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive,
-							  tape->buffer_size);
-		actually_written += tape->buffer_size;
-		if (retval <= 0)
-			return (retval);
-	}
-	if (count) {
-		actually_written += count;
-		if (copy_from_user(tape->cur, buf, count))
+
+	while (done < count) {
+		size_t todo;
+
+		/* flush if staging buffer is full */
+		if (tape->valid == tape->buffer_size &&
+		    idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+					  tape->buffer_size) <= 0)
+			return rc;
+
+		/* copy in */
+		todo = min_t(size_t, count - done,
+			     tape->buffer_size - tape->valid);
+		if (copy_from_user(tape->cur, buf + done, todo))
 			ret = -EFAULT;
-		tape->cur += count;
-		tape->valid += count;
+
+		tape->cur += todo;
+		tape->valid += todo;
+		done += todo;
 	}
-	return ret ? ret : actually_written;
+
+	return ret ? ret : done;
 }
 
 static int idetape_write_filemark(ide_drive_t *drive)
-- 
GitLab


From 6d7003877c2f0578f1c08f66d05c3f72ef4ae596 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0492/2198] ide-atapi: kill unused fields and callbacks

Impact: remove fields and code paths which are no longer necessary

Now that ide-tape uses standard mechanisms to transfer data, special
case handling for bh handling can be dropped from ide-atapi.  Drop the
followings.

* pc->cur_pos, b_count, bh and b_data
* drive->pc_update_buffers() and pc_io_buffers().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c | 17 ++++-------------
 drivers/ide/ide-tape.c  |  1 -
 include/linux/ide.h     | 12 ------------
 3 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index b9dd4503cbc70..afe5a43238793 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					drive->name, rq_data_dir(pc->rq)
 						     ? "write" : "read");
 			pc->flags |= PC_FLAG_DMA_ERROR;
-		} else {
+		} else
 			pc->xferred = pc->req_xfer;
-			if (drive->pc_update_buffers)
-				drive->pc_update_buffers(drive, pc);
-		}
 		debug_log("%s: DMA finished\n", drive->name);
 	}
 
@@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	if (drive->media == ide_tape && pc->bh)
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	else {
-		done = min_t(unsigned int, bcount, cmd->nleft);
-		ide_pio_bytes(drive, cmd, write, done);
-	}
+	done = min_t(unsigned int, bcount, cmd->nleft);
+	ide_pio_bytes(drive, cmd, write, done);
 
-	/* Update the current position */
+	/* Update transferred byte count */
 	pc->xferred += done;
-	pc->cur_pos += done;
 
 	bcount -= done;
 
@@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 
 		/* We haven't transferred any data yet */
 		pc->xferred = 0;
-		pc->cur_pos = pc->buf;
 
 		valid_tf = IDE_VALID_DEVICE;
 		bcount = ((drive->media == ide_tape) ?
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2599579e4174d..8dfc68892d6ae 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1957461ac762c..34c128f0a33c1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -362,11 +362,7 @@ struct ide_atapi_pc {
 
 	/* data buffer */
 	u8 *buf;
-	/* current buffer position */
-	u8 *cur_pos;
 	int buf_size;
-	/* missing/available data on the current buffer */
-	int b_count;
 
 	/* the corresponding request */
 	struct request *rq;
@@ -379,10 +375,6 @@ struct ide_atapi_pc {
 	 */
 	u8 pc_buf[IDE_PC_BUFFER_SIZE];
 
-	/* idetape only */
-	struct idetape_bh *bh;
-	char *b_data;
-
 	unsigned long timeout;
 };
 
@@ -595,10 +587,6 @@ struct ide_drive_s {
 	/* callback for packet commands */
 	int  (*pc_callback)(struct ide_drive_s *, int);
 
-	void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *);
-	int  (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *,
-			      unsigned int, int);
-
 	ide_startstop_t (*irq_handler)(struct ide_drive_s *);
 
 	unsigned long atapi_flags;
-- 
GitLab


From 2c316bb57ad4e9f0f3de2d7ef1ae85530c2a7e69 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0493/2198] ide: drop rq->data handling from ide_map_sg()

Impact: remove code path which is no longer necessary

All IDE data transfers now use rq->bio.  Simplify ide_map_sg()
accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 6e3094e227752..a0309ea661ac0 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,11 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (!rq->bio) {
-		sg_init_one(sg, rq->data, rq->data_len);
-		cmd->sg_nents = 1;
-	} else
-		cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
+	cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
 }
 EXPORT_SYMBOL_GPL(ide_map_sg);
 
-- 
GitLab


From e475eedb09ee9a0fd855f3e923aa9af31c17d141 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:04 +0000
Subject: [PATCH 0494/2198] clocksource: sh_cmt earlytimer support

Add Early Platform Driver support to the sh_cmt driver
using the earlytimer class.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_cmt.c | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 1c92c39a53aaa..02bae3994abe9 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -566,9 +566,19 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 static int __devinit sh_cmt_probe(struct platform_device *pdev)
 {
 	struct sh_cmt_priv *p = platform_get_drvdata(pdev);
+	struct sh_cmt_config *cfg = pdev->dev.platform_data;
 	int ret;
 
-	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p) {
+		pr_info("sh_cmt: %s kept as earlytimer\n", cfg->name);
+		return 0;
+	}
+
+	if (is_early_platform_device(pdev))
+		p = alloc_bootmem(sizeof(*p));
+	else
+		p = kmalloc(sizeof(*p), GFP_KERNEL);
+
 	if (p == NULL) {
 		dev_err(&pdev->dev, "failed to allocate driver data\n");
 		return -ENOMEM;
@@ -576,7 +586,10 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 
 	ret = sh_cmt_setup(p, pdev);
 	if (ret) {
-		kfree(p);
+		if (is_early_platform_device(pdev))
+			free_bootmem(__pa(p), sizeof(*p));
+		else
+			kfree(p);
 
 		platform_set_drvdata(pdev, NULL);
 	}
@@ -606,6 +619,7 @@ static void __exit sh_cmt_exit(void)
 	platform_driver_unregister(&sh_cmt_device_driver);
 }
 
+early_platform_init("earlytimer", &sh_cmt_device_driver);
 module_init(sh_cmt_init);
 module_exit(sh_cmt_exit);
 
-- 
GitLab


From eaab89197b733d0f81f07d6c44294b674479fda8 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:12 +0000
Subject: [PATCH 0495/2198] sh: arch earlytimer support

Extend the 32-bit SuperH timer code to register and probe
the earlytimer class of Early Platform Drivers.

This registers the sh_cmt driver if compiled-in.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/time_32.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index c34e1e0f9b025..04b8c6ab16674 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -17,6 +17,7 @@
 #include <linux/sched.h>
 #include <linux/clockchips.h>
 #include <linux/mc146818rtc.h>	/* for rtc_lock */
+#include <linux/platform_device.h>
 #include <linux/smp.h>
 #include <asm/clock.h>
 #include <asm/rtc.h>
@@ -228,6 +229,14 @@ void __init time_init(void)
 	local_timer_setup(smp_processor_id());
 #endif
 
+	/*
+	 * Make sure all compiled-in early timers register themselves.
+	 * Run probe() for one "earlytimer" device.
+	 */
+	early_platform_driver_register_all("earlytimer");
+	if (early_platform_driver_probe("earlytimer", 1, 0))
+		return;
+
 	/*
 	 * Find the timer to use as the system timer, it will be
 	 * initialized for us.
-- 
GitLab


From 87a00dc059e3af46303f1f56b0e8df41af988c7b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:21 +0000
Subject: [PATCH 0496/2198] sh: Add plat_early_device_setup()

Add a plat_early_device_setup() function to allow
processor-specific code to register Early Platform Data.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/device.h | 2 ++
 arch/sh/kernel/setup.c       | 7 +++++++
 2 files changed, 9 insertions(+)

diff --git a/arch/sh/include/asm/device.h b/arch/sh/include/asm/device.h
index efd511d0803ad..8688a88303ee7 100644
--- a/arch/sh/include/asm/device.h
+++ b/arch/sh/include/asm/device.h
@@ -10,3 +10,5 @@ struct platform_device;
 int platform_resource_setup_memory(struct platform_device *pdev,
 				   char *name, unsigned long memsize);
 
+void plat_early_device_setup(void);
+
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 04a6004fccc45..22b976d420142 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -29,6 +29,7 @@
 #include <linux/mmzone.h>
 #include <linux/clk.h>
 #include <linux/delay.h>
+#include <linux/platform_device.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/page.h>
@@ -328,6 +329,10 @@ static int __init parse_elfcorehdr(char *arg)
 early_param("elfcorehdr", parse_elfcorehdr);
 #endif
 
+void __init __attribute__ ((weak)) plat_early_device_setup(void)
+{
+}
+
 void __init setup_arch(char **cmdline_p)
 {
 	enable_mmu();
@@ -381,6 +386,8 @@ void __init setup_arch(char **cmdline_p)
 
 	parse_early_param();
 
+	plat_early_device_setup();
+
 	sh_mv_setup();
 
 	/*
-- 
GitLab


From 28fde6863e6ed1f4bfb7cef080e67bf439c20628 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 15 Apr 2009 10:50:30 +0000
Subject: [PATCH 0497/2198] sh: Early Platform Data for SuperH Mobile

Use plat_early_device_setup() to register Early Platform Data
for SuperH Mobile processors.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 10 ++++++++++
 arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 10 ++++++++++
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 10 ++++++++++
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 10 ++++++++++
 4 files changed, 40 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index c1549382c87c0..cb5b4db1ca259 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -234,6 +234,16 @@ static int __init sh7343_devices_setup(void)
 }
 __initcall(sh7343_devices_setup);
 
+static struct platform_device *sh7343_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7343_early_devices,
+				   ARRAY_SIZE(sh7343_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index 93ecf8ed5c6c3..2a771f48e9e09 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -226,6 +226,16 @@ static int __init sh7366_devices_setup(void)
 }
 __initcall(sh7366_devices_setup);
 
+static struct platform_device *sh7366_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7366_early_devices,
+				   ARRAY_SIZE(sh7366_early_devices));
+}
+
 enum {
 	UNUSED=0,
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 0e5d204bc7928..8f974d0494c97 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -270,6 +270,16 @@ static int __init sh7722_devices_setup(void)
 }
 __initcall(sh7722_devices_setup);
 
+static struct platform_device *sh7722_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7722_early_devices,
+				   ARRAY_SIZE(sh7722_early_devices));
+}
+
 enum {
 	UNUSED=0,
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 5338dacbcfba7..21a58d63a00d0 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -281,6 +281,16 @@ static int __init sh7723_devices_setup(void)
 }
 __initcall(sh7723_devices_setup);
 
+static struct platform_device *sh7723_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7723_early_devices,
+				   ARRAY_SIZE(sh7723_early_devices));
+}
+
 enum {
 	UNUSED=0,
 
-- 
GitLab


From d9aed8b95f062b2f96e37b0e07373f36a6e7066d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 19 Apr 2009 13:12:04 +0900
Subject: [PATCH 0498/2198] sh: sh7724: Don't default enable the RTC clock.

rtc-sh takes care of this now, so no need to have this always enabled.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 8b87ba8f26bbd..195274a74f8d4 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -275,7 +275,6 @@ static struct platform_device *sh7724_devices[] __initdata = {
 
 static int __init sh7724_devices_setup(void)
 {
-	clk_always_enable("rtc0");   /* RTC */
 	clk_always_enable("vpu0");   /* VPU */
 	clk_always_enable("veu1");   /* VEU3F1 */
 	clk_always_enable("veu0");   /* VEU3F0 */
-- 
GitLab


From 8fb2bae4b41eb64f6e233e9bd3f3a789fbb04a06 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 19 Apr 2009 13:14:29 +0900
Subject: [PATCH 0499/2198] sh: sh7724: Register CMT as an early platform
 device here too.

Follows the SH7722/SH7723 changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 195274a74f8d4..8429396acb591 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -288,6 +288,16 @@ static int __init sh7724_devices_setup(void)
 }
 device_initcall(sh7724_devices_setup);
 
+static struct platform_device *sh7724_early_devices[] __initdata = {
+	&cmt_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7724_early_devices,
+				   ARRAY_SIZE(sh7724_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From 2b2fd87a6ef56ba7647a578e81bb8c8efda166b8 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:12 +0800
Subject: [PATCH 0500/2198] docs, x86: add nox2apic back to
 kernel-parameters.txt

"nox2apic" was removed from kernel-parameters.txt by mistake, when
entries were sorted in alpha order (commit 0cb55ad2). But this early
parameter is still there, add it back to kernel-parameters.txt.

[ Impact: add boot parameter description ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-2-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 6172e4360f608..33989d284ff0a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1588,6 +1588,8 @@ and is between 256 and 4096 characters. It is defined in the file
 
 	nowb		[ARM]
 
+	nox2apic	[X86-64,APIC] Do not enable x2APIC mode.
+
 	nptcg=		[IA64] Override max number of concurrent global TLB
 			purges which is reported from either PAL_VM_SUMMARY or
 			SAL PALO.
-- 
GitLab


From 5d0ae2db6deac4f15dac4f42f23bc56448fc8d4d Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:13 +0800
Subject: [PATCH 0501/2198] x86, intr-remap: fix ack for interrupt remapping

Shouldn't call ack_apic_edge() in ir_ack_apic_edge(), because
ack_apic_edge() does more than just ack: it also does irq migration
in the non-interrupt-remapping case. But there is no such need for
interrupt-remapping case, as irq migration is done in the process
context.

Similarly, ir_ack_apic_level() shouldn't call ack_apic_level, and
instead should do the local cpu's EOI + directed EOI to the io-apic.

ack_x2APIC_irq() is not neccessary, because ack_APIC_irq() will use MSR
write for x2apic, and uncached write for non-x2apic.

[ Impact: simplify/standardize intr-remap IRQ acking, fix on !x2apic ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-3-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h    |  8 --------
 arch/x86/kernel/apic/io_apic.c | 32 +++++---------------------------
 2 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2bd5a463fd1f7..d4cb7e590c06d 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -204,14 +204,6 @@ static inline int x2apic_enabled(void)
 
 extern int get_physical_broadcast(void);
 
-#ifdef CONFIG_X86_X2APIC
-static inline void ack_x2APIC_irq(void)
-{
-	/* Docs say use 0 for future compatibility */
-	native_apic_msr_write(APIC_EOI, 0);
-}
-#endif
-
 extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8499000224074..ea22a86e3cda2 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2552,20 +2552,6 @@ eoi_ioapic_irq(struct irq_desc *desc)
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
-	struct irq_desc *desc = irq_to_desc(irq);
-	ack_x2APIC_irq();
-	eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
-	ack_x2APIC_irq();
-}
-#endif
-
 static void ack_apic_edge(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -2629,9 +2615,6 @@ static void ack_apic_level(unsigned int irq)
 	 */
 	ack_APIC_irq();
 
-	if (irq_remapped(irq))
-		eoi_ioapic_irq(desc);
-
 	/* Now we can move and renable the irq */
 	if (unlikely(do_unmask_irq)) {
 		/* Only migrate the irq if the ack has been received.
@@ -2680,20 +2663,15 @@ static void ack_apic_level(unsigned int irq)
 #ifdef CONFIG_INTR_REMAP
 static void ir_ack_apic_edge(unsigned int irq)
 {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_edge(irq);
-#endif
-       return ack_apic_edge(irq);
+	ack_APIC_irq();
 }
 
 static void ir_ack_apic_level(unsigned int irq)
 {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_level(irq);
-#endif
-       return ack_apic_level(irq);
+	struct irq_desc *desc = irq_to_desc(irq);
+
+	ack_APIC_irq();
+	eoi_ioapic_irq(desc);
 }
 #endif /* CONFIG_INTR_REMAP */
 
-- 
GitLab


From 937582382c71b75b29fbb92615629494e1a05ac0 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:14 +0800
Subject: [PATCH 0502/2198] x86, intr-remap: enable interrupt remapping early

Currently, when x2apic is not enabled, interrupt remapping
will be enabled in init_dmars(), where it is too late to remap
ioapic interrupts, that is, ioapic interrupts are really in
compatibility mode, not remappable mode.

This patch always enables interrupt remapping before ioapic
setup, it guarantees all interrupts will be remapped when
interrupt remapping is enabled. Thus it doesn't need to set
the compatibility interrupt bit.

[ Impact: refactor intr-remap init sequence, enable fuller remap mode ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-4-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h  |  7 ++--
 arch/x86/kernel/apic/apic.c  | 76 +++++++++++++++++-------------------
 drivers/pci/intel-iommu.c    |  9 -----
 drivers/pci/intr_remapping.c | 28 ++++++-------
 include/linux/dmar.h         |  1 +
 5 files changed, 54 insertions(+), 67 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index d4cb7e590c06d..fbdd65446c7a8 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -169,7 +169,6 @@ static inline u64 native_x2apic_icr_read(void)
 extern int x2apic, x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
-extern void enable_IR_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
 static inline int x2apic_enabled(void)
 {
@@ -190,18 +189,18 @@ static inline void check_x2apic(void)
 static inline void enable_x2apic(void)
 {
 }
-static inline void enable_IR_x2apic(void)
-{
-}
 static inline int x2apic_enabled(void)
 {
 	return 0;
 }
 
 #define	x2apic	0
+#define	x2apic_preenabled 0
 
 #endif
 
+extern void enable_IR_x2apic(void);
+
 extern int get_physical_broadcast(void);
 
 extern void apic_disable(void);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 83e47febcc893..0cf1eea750cc4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -141,6 +141,8 @@ static int x2apic_preenabled;
 static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
+	if (x2apic_enabled())
+		panic("Bios already enabled x2apic, can't enforce nox2apic");
 	disable_x2apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 	return 0;
@@ -1345,6 +1347,7 @@ void enable_x2apic(void)
 		wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
 	}
 }
+#endif /* CONFIG_X86_X2APIC */
 
 void __init enable_IR_x2apic(void)
 {
@@ -1353,32 +1356,21 @@ void __init enable_IR_x2apic(void)
 	unsigned long flags;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
-	if (!cpu_has_x2apic)
-		return;
-
-	if (!x2apic_preenabled && disable_x2apic) {
-		pr_info("Skipped enabling x2apic and Interrupt-remapping "
-			"because of nox2apic\n");
-		return;
+	ret = dmar_table_init();
+	if (ret) {
+		pr_debug("dmar_table_init() failed with %d:\n", ret);
+		goto ir_failed;
 	}
 
-	if (x2apic_preenabled && disable_x2apic)
-		panic("Bios already enabled x2apic, can't enforce nox2apic");
-
-	if (!x2apic_preenabled && skip_ioapic_setup) {
-		pr_info("Skipped enabling x2apic and Interrupt-remapping "
-			"because of skipping io-apic setup\n");
-		return;
+	if (!intr_remapping_supported()) {
+		pr_debug("intr-remapping not supported\n");
+		goto ir_failed;
 	}
 
-	ret = dmar_table_init();
-	if (ret) {
-		pr_info("dmar_table_init() failed with %d:\n", ret);
 
-		if (x2apic_preenabled)
-			panic("x2apic enabled by bios. But IR enabling failed");
-		else
-			pr_info("Not enabling x2apic,Intr-remapping\n");
+	if (!x2apic_preenabled && skip_ioapic_setup) {
+		pr_info("Skipped enabling intr-remap because of skipping "
+			"io-apic setup\n");
 		return;
 	}
 
@@ -1398,20 +1390,25 @@ void __init enable_IR_x2apic(void)
 	mask_IO_APIC_setup(ioapic_entries);
 	mask_8259A();
 
-	ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
-	if (ret && x2apic_preenabled) {
-		local_irq_restore(flags);
-		panic("x2apic enabled by bios. But IR enabling failed");
-	}
+#ifdef CONFIG_X86_X2APIC
+	if (cpu_has_x2apic)
+		ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
+	else
+#endif
+		ret = enable_intr_remapping(EIM_8BIT_APIC_ID);
 
 	if (ret)
 		goto end_restore;
 
-	if (!x2apic) {
+	pr_info("Enabled Interrupt-remapping\n");
+
+#ifdef CONFIG_X86_X2APIC
+	if (cpu_has_x2apic && !x2apic) {
 		x2apic = 1;
 		enable_x2apic();
+		pr_info("Enabled x2apic\n");
 	}
+#endif
 
 end_restore:
 	if (ret)
@@ -1426,30 +1423,29 @@ void __init enable_IR_x2apic(void)
 	local_irq_restore(flags);
 
 end:
-	if (!ret) {
-		if (!x2apic_preenabled)
-			pr_info("Enabled x2apic and interrupt-remapping\n");
-		else
-			pr_info("Enabled Interrupt-remapping\n");
-	} else
-		pr_err("Failed to enable Interrupt-remapping and x2apic\n");
 	if (ioapic_entries)
 		free_ioapic_entries(ioapic_entries);
+
+	if (!ret)
+		return;
+
+ir_failed:
+	if (x2apic_preenabled)
+		panic("x2apic enabled by bios. But IR enabling failed");
+	else if (cpu_has_x2apic)
+		pr_info("Not enabling x2apic,Intr-remapping\n");
 #else
 	if (!cpu_has_x2apic)
 		return;
 
 	if (x2apic_preenabled)
 		panic("x2apic enabled prior OS handover,"
-		      " enable CONFIG_INTR_REMAP");
-
-	pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
-		" and x2apic\n");
+		      " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
 #endif
 
 	return;
 }
-#endif /* CONFIG_X86_X2APIC */
+
 
 #ifdef CONFIG_X86_64
 /*
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c
index 001b328adf804..9ce8f0764bef8 100644
--- a/drivers/pci/intel-iommu.c
+++ b/drivers/pci/intel-iommu.c
@@ -1968,15 +1968,6 @@ static int __init init_dmars(void)
 		}
 	}
 
-#ifdef CONFIG_INTR_REMAP
-	if (!intr_remapping_enabled) {
-		ret = enable_intr_remapping(0);
-		if (ret)
-			printk(KERN_ERR
-			       "IOMMU: enable interrupt remapping failed\n");
-	}
-#endif
-
 	/*
 	 * For each rmrr
 	 *   for each dev attached to rmrr
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f5e0ea724a6f5..5c2142656e966 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -423,20 +423,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
 		      readl, (sts & DMA_GSTS_IRTPS), sts);
 	spin_unlock_irqrestore(&iommu->register_lock, flags);
 
-	if (mode == 0) {
-		spin_lock_irqsave(&iommu->register_lock, flags);
-
-		/* enable comaptiblity format interrupt pass through */
-		cmd = iommu->gcmd | DMA_GCMD_CFI;
-		iommu->gcmd |= DMA_GCMD_CFI;
-		writel(cmd, iommu->reg + DMAR_GCMD_REG);
-
-		IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-			      readl, (sts & DMA_GSTS_CFIS), sts);
-
-		spin_unlock_irqrestore(&iommu->register_lock, flags);
-	}
-
 	/*
 	 * global invalidation of interrupt entry cache before enabling
 	 * interrupt-remapping.
@@ -516,6 +502,20 @@ static void iommu_disable_intr_remapping(struct intel_iommu *iommu)
 	spin_unlock_irqrestore(&iommu->register_lock, flags);
 }
 
+int __init intr_remapping_supported(void)
+{
+	struct dmar_drhd_unit *drhd;
+
+	for_each_drhd_unit(drhd) {
+		struct intel_iommu *iommu = drhd->iommu;
+
+		if (!ecap_ir_support(iommu->ecap))
+			return 0;
+	}
+
+	return 1;
+}
+
 int __init enable_intr_remapping(int eim)
 {
 	struct dmar_drhd_unit *drhd;
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index e397dc342cdaf..06f592a7f73c8 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -108,6 +108,7 @@ struct irte {
 };
 #ifdef CONFIG_INTR_REMAP
 extern int intr_remapping_enabled;
+extern int intr_remapping_supported(void);
 extern int enable_intr_remapping(int);
 extern void disable_intr_remapping(void);
 extern int reenable_intr_remapping(int);
-- 
GitLab


From 03ea81550676296d94596e4337c771c6ba29f542 Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:15 +0800
Subject: [PATCH 0503/2198] x86, intr-remap: add option to disable interrupt
 remapping

Add option "nointremap" to disable interrupt remapping.

[ Impact: add new boot option ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-5-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/kernel-parameters.txt |  3 +++
 drivers/pci/intr_remapping.c        | 11 +++++++++++
 2 files changed, 14 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 33989d284ff0a..843cb6646d82a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1533,6 +1533,9 @@ and is between 256 and 4096 characters. It is defined in the file
 	noinitrd	[RAM] Tells the kernel not to load any configured
 			initial RAM disk.
 
+	nointremap	[X86-64, Intel-IOMMU] Do not enable interrupt
+			remapping.
+
 	nointroute	[IA-64]
 
 	nojitter	[IA64] Disables jitter checking for ITC timers.
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index 5c2142656e966..842039e4955b1 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static int ir_ioapic_num;
 int intr_remapping_enabled;
 
+static int disable_intremap;
+static __init int setup_nointremap(char *str)
+{
+	disable_intremap = 1;
+	return 0;
+}
+early_param("nointremap", setup_nointremap);
+
 struct irq_2_iommu {
 	struct intel_iommu *iommu;
 	u16 irte_index;
@@ -506,6 +514,9 @@ int __init intr_remapping_supported(void)
 {
 	struct dmar_drhd_unit *drhd;
 
+	if (disable_intremap)
+		return 0;
+
 	for_each_drhd_unit(drhd) {
 		struct intel_iommu *iommu = drhd->iommu;
 
-- 
GitLab


From 9a2755c3569e4db92bd9b1daadeddb4045b0cccd Mon Sep 17 00:00:00 2001
From: Weidong Han <weidong.han@intel.com>
Date: Fri, 17 Apr 2009 16:42:16 +0800
Subject: [PATCH 0504/2198] x86, intr-remap: fix x2apic/intr-remap resume

Interrupt remapping was decoupled from x2apic. Shouldn't check
x2apic before resume interrupt remapping. Otherwise, interrupt
remapping won't be resumed when x2apic is not enabled.

[ Impact: fix potential intr-remap resume hang on !x2apic ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Weidong Han <weidong.han@intel.com>
Acked-by: David Woodhouse <David.Woodhouse@intel.com>
Cc: iommu@lists.linux-foundation.org
Cc: allen.m.kay@intel.com
Cc: fenghua.yu@intel.com
LKML-Reference: <1239957736-6161-6-git-send-email-weidong.han@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0cf1eea750cc4..7b41a32339e0a 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2032,7 +2032,7 @@ static int lapic_resume(struct sys_device *dev)
 		return 0;
 
 	local_irq_save(flags);
-	if (x2apic) {
+	if (intr_remapping_enabled) {
 		ioapic_entries = alloc_ioapic_entries();
 		if (!ioapic_entries) {
 			WARN(1, "Alloc ioapic_entries in lapic resume failed.");
@@ -2048,8 +2048,10 @@ static int lapic_resume(struct sys_device *dev)
 
 		mask_IO_APIC_setup(ioapic_entries);
 		mask_8259A();
-		enable_x2apic();
 	}
+
+	if (x2apic)
+		enable_x2apic();
 #else
 	if (!apic_pm_state.active)
 		return 0;
@@ -2097,10 +2099,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_read(APIC_ESR);
 
 #ifdef CONFIG_INTR_REMAP
-	if (intr_remapping_enabled)
-		reenable_intr_remapping(EIM_32BIT_APIC_ID);
+	if (intr_remapping_enabled) {
+		if (x2apic)
+			reenable_intr_remapping(EIM_32BIT_APIC_ID);
+		else
+			reenable_intr_remapping(EIM_8BIT_APIC_ID);
 
-	if (x2apic) {
 		unmask_8259A();
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
@@ -2109,7 +2113,6 @@ static int lapic_resume(struct sys_device *dev)
 
 	local_irq_restore(flags);
 
-
 	return 0;
 }
 
-- 
GitLab


From cece3155d869a50ba534ed161b5a05e8a29dcad0 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 18 Apr 2009 23:45:28 +0400
Subject: [PATCH 0505/2198] x86: smpboot - wakeup_secondary should be done via
 __cpuinit section

A caller (do_boot_cpu) already has __cpuinit attribute.

Since HOTPLUG_CPU depends on SMP && HOTPLUG it doesn't
lead to panic at moment.

[ Impact: cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090418194528.GD25510@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/smpboot.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index bf8ad6344b18e..d2e8de9581562 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
  */
-int __devinit
+int __cpuinit
 wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 	return (send_status | accept_status);
 }
 
-static int __devinit
+static int __cpuinit
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
 	unsigned long send_status, accept_status = 0;
-- 
GitLab


From 667c5296cc76fefe0abcb79228952b28d9af45e3 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 19 Apr 2009 11:43:11 +0400
Subject: [PATCH 0506/2198] x86: es7000, uv - use __cpuinit for kicking
 secondary cpus

The caller already has __cpuinit attribute.

[ Impact: save memory, address section mismatch warning ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
LKML-Reference: <20090419074311.GA8670@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/es7000_32.c   | 2 +-
 arch/x86/kernel/apic/x2apic_uv_x.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 1c11b819f2453..8e07c1418661d 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
 	return gsi;
 }
 
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
 	unsigned long vect = 0, psaival = 0;
 
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index de1a50af807b8..873bf7121e894 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -91,7 +91,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
 	cpumask_set_cpu(cpu, retmask);
 }
 
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
 #ifdef CONFIG_SMP
 	unsigned long val;
-- 
GitLab


From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 23:38:12 +0200
Subject: [PATCH 0507/2198] tracing/core: Add current context on tracing
 recursion warning

In case of tracing recursion detection, we only get the stacktrace.
But the current context may be very useful to debug the issue.

This patch adds the softirq/hardirq/nmi context with the warning
using lockdep context display to have a familiar output.

v2: Use printk_once()
v3: drop {hardirq,softirq}_context which depend on lockdep,
    only keep what is part of current->trace_recursion,
    sufficient to debug the warning source.

[ Impact: print context necessary to debug recursion ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/ring_buffer.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b421b0ea91126..bffde630c4e24 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void)
 	if (unlikely(current->trace_recursion & (1 << level))) {
 		/* Disable all tracing before we do anything else */
 		tracing_off_permanent();
+
+		printk_once(KERN_WARNING "Tracing recursion: "
+			    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+			    hardirq_count() >> HARDIRQ_SHIFT,
+			    softirq_count() >> SOFTIRQ_SHIFT,
+			    in_nmi());
+
 		WARN_ON_ONCE(1);
 		return -1;
 	}
-- 
GitLab


From 9ade1217c9ba39ad2f004a898ddfbb815fd5fe74 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 15:38:25 +0900
Subject: [PATCH 0508/2198] sh: pci: Drop asm-generic/pci.h, so we can use our
 own fixups.

The new PCI code wants its own bus<->resource mappings instead of the
generic equivalents, so drop the asm-generic include in preparation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci.c | 14 ++++++++++++++
 arch/sh/include/asm/pci.h | 27 ++++++++++++++++++++++++---
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 8aaacc99b7103..6d659cd93c9d4 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -72,6 +72,20 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	pci_read_bridge_bases(bus);
 }
 
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			     struct resource *res)
+{
+	region->start = res->start;
+	region->end = res->end;
+}
+
+void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			     struct pci_bus_region *region)
+{
+	res->start = region->start;
+	res->end = region->end;
+}
+
 void pcibios_align_resource(void *data, struct resource *res,
 			    resource_size_t size, resource_size_t align)
 			    __attribute__ ((weak));
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index bb2c2fcddc9ea..69cb615c39169 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -146,13 +146,34 @@ int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 int pciauto_assign_resources(int busno, struct pci_channel *hose);
 #endif
 
-#endif /* __KERNEL__ */
+extern void pcibios_resource_to_bus(struct pci_dev *dev,
+	struct pci_bus_region *region, struct resource *res);
+
+extern void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+				    struct pci_bus_region *region);
+
+static inline struct resource *
+pcibios_select_root(struct pci_dev *pdev, struct resource *res)
+{
+	struct resource *root = NULL;
+
+	if (res->flags & IORESOURCE_IO)
+		root = &ioport_resource;
+	if (res->flags & IORESOURCE_MEM)
+		root = &iomem_resource;
 
-/* generic pci stuff */
-#include <asm-generic/pci.h>
+	return root;
+}
+
+/* Chances are this interrupt is wired PC-style ...  */
+static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
+{
+	return channel ? 15 : 14;
+}
 
 /* generic DMA-mapping stuff */
 #include <asm-generic/pci-dma-compat.h>
 
+#endif /* __KERNEL__ */
 #endif /* __ASM_SH_PCI_H */
 
-- 
GitLab


From 4c5107e44514a9bde74d0af77982705d602d9e39 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 15:43:36 +0900
Subject: [PATCH 0509/2198] sh: pci: Split out new-style PCI core.

This splits off a 'pci-new.c' which is aimed at gradually replacing the
pci-auto backend and the arch/sh/drivers/pci/pci.c core respectively.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Kconfig   |  10 +-
 arch/sh/drivers/pci/Makefile  |   4 +-
 arch/sh/drivers/pci/pci-new.c | 248 ++++++++++++++++++++++++++++++++++
 3 files changed, 258 insertions(+), 4 deletions(-)
 create mode 100644 arch/sh/drivers/pci/pci-new.c

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index 7e816ededed76..efe8cf965a9bb 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -18,10 +18,17 @@ config SH_PCIDMA_NONCOHERENT
 	  bridge integrated with your SH CPU, refer carefully to the chip specs
 	  to see if you can say 'N' here. Otherwise, leave it as 'Y'.
 
+# Temporary config option for transitioning off of PCI_AUTO
+config PCI_NEW
+	bool
+	depends on PCI
+	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
+		     CPU_SUBTYPE_SH7785
+
 # This is also board-specific
 config PCI_AUTO
 	bool
-	depends on PCI
+	depends on PCI && !PCI_NEW
 	default y
 
 config PCI_AUTO_UPDATE_RESOURCES
@@ -34,4 +41,3 @@ config PCI_AUTO_UPDATE_RESOURCES
 	  for some reason, you have a board that simply refuses to work
 	  with its resources updated beyond what they are when the device
 	  is powered up, set this to N. Everyone else will want this as Y.
-
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 5003971476ec5..362a1eec73a4e 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -2,8 +2,8 @@
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
 
-obj-y					+= pci.o
-obj-$(CONFIG_PCI_AUTO)			+= pci-auto.o
+obj-$(CONFIG_PCI_AUTO)			:= pci.o pci-auto.o
+obj-$(CONFIG_PCI_NEW)			:= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7751R)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
new file mode 100644
index 0000000000000..097eb8811120a
--- /dev/null
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -0,0 +1,248 @@
+/*
+ * New-style PCI core.
+ *
+ * Copyright (c) 2002  M. R. Brown
+ * Copyright (c) 2004 - 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/dma-debug.h>
+#include <linux/io.h>
+
+static int __init pcibios_init(void)
+{
+	struct pci_channel *p;
+	struct pci_bus *bus;
+	int busno;
+
+	/* init channels */
+	busno = 0;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->init(p) == 0)
+			p->enabled = 1;
+		else
+			pr_err("Unable to init pci channel %d\n", busno);
+		busno++;
+	}
+
+	/* scan the buses */
+	busno = 0;
+	for (p = board_pci_channels; p->init; p++) {
+		if (p->enabled) {
+			bus = pci_scan_bus(busno, p->pci_ops, p);
+			busno = bus->subordinate + 1;
+
+			pci_bus_size_bridges(bus);
+			pci_bus_assign_resources(bus);
+			pci_enable_bridges(bus);
+		}
+	}
+
+	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
+
+	dma_debug_add_bus(&pci_bus_type);
+
+	return 0;
+}
+subsys_initcall(pcibios_init);
+
+static void pcibios_fixup_device_resources(struct pci_dev *dev,
+	struct pci_bus *bus)
+{
+	/* Update device resources.  */
+	struct pci_channel *chan = bus->sysdata;
+	unsigned long offset = 0;
+	int i;
+
+	for (i = 0; i < PCI_NUM_RESOURCES; i++) {
+		if (!dev->resource[i].start)
+			continue;
+		if (dev->resource[i].flags & IORESOURCE_PCI_FIXED)
+			continue;
+		if (dev->resource[i].flags & IORESOURCE_IO)
+			offset = chan->io_base;
+		else if (dev->resource[i].flags & IORESOURCE_MEM)
+			offset = 0;
+
+		dev->resource[i].start += offset;
+		dev->resource[i].end += offset;
+	}
+}
+
+
+/*
+ *  Called after each bus is probed, but before its children
+ *  are examined.
+ */
+void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
+{
+	struct pci_dev *dev = bus->self;
+	struct list_head *ln;
+	struct pci_channel *chan = bus->sysdata;
+
+	if (!dev) {
+		bus->resource[0] = chan->io_resource;
+		bus->resource[1] = chan->mem_resource;
+	}
+
+	for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) {
+		dev = pci_dev_b(ln);
+
+		if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI)
+			pcibios_fixup_device_resources(dev, bus);
+	}
+}
+
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *chan = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = chan->io_base;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = 0;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *chan = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = chan->io_base;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = 0;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+			    __attribute__ ((weak));
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	if (res->flags & IORESOURCE_IO) {
+		resource_size_t start = res->start;
+
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	}
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for(idx=0; idx<6; idx++) {
+		if (!(mask & (1 << idx)))
+			continue;
+		r = &dev->resource[idx];
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available because "
+			       "of resource collisions\n", pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (dev->resource[PCI_ROM_RESOURCE].start)
+		cmd |= PCI_COMMAND_MEMORY;
+	if (cmd != old_cmd) {
+		printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_IO)
+		return ioport_map(start, len);
+	if (flags & IORESOURCE_MEM)
+		return ioremap(start, len);
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
+EXPORT_SYMBOL(board_pci_channels);
-- 
GitLab


From 9833385131fc4e8c52f95320ab899051d1c06831 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 15:51:45 +0900
Subject: [PATCH 0510/2198] sh: pci: HAVE_PCI_MMAP support.

Derived from the MIPS version, now uses pgprot_noncached().

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile  |  6 +++---
 arch/sh/drivers/pci/pci-lib.c | 26 ++++++++++++++++++++++++++
 arch/sh/include/asm/pci.h     |  3 +++
 3 files changed, 32 insertions(+), 3 deletions(-)
 create mode 100644 arch/sh/drivers/pci/pci-lib.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 362a1eec73a4e..c8eab14843e5a 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -1,9 +1,9 @@
 #
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
-
-obj-$(CONFIG_PCI_AUTO)			:= pci.o pci-auto.o
-obj-$(CONFIG_PCI_NEW)			:= pci-new.o
+obj-y					+= pci-lib.o
+obj-$(CONFIG_PCI_AUTO)			+= pci.o pci-auto.o
+obj-$(CONFIG_PCI_NEW)			+= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7751R)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
new file mode 100644
index 0000000000000..1a43a350d574f
--- /dev/null
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -0,0 +1,26 @@
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/types.h>
+#include <linux/pci.h>
+
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state, int write_combine)
+{
+	/*
+	 * I/O space can be accessed via normal processor loads and stores on
+	 * this platform but for now we elect not to do this and portable
+	 * drivers should not do this anyway.
+	 */
+	if (mmap_state == pci_mmap_io)
+		return -EINVAL;
+
+	/*
+	 * Ignore write-combine; for now only return uncached mappings.
+	 */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 69cb615c39169..46afd449739d0 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -40,6 +40,9 @@ extern struct pci_channel board_pci_channels[];
 
 struct pci_dev;
 
+#define HAVE_PCI_MMAP
+extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+	enum pci_mmap_state mmap_state, int write_combine);
 extern void pcibios_set_master(struct pci_dev *dev);
 
 static inline void pcibios_penalize_isa_irq(int irq, int active)
-- 
GitLab


From a3c0e0d0032d5bbfd7dc04827a257c717d432a5b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 16:14:29 +0900
Subject: [PATCH 0511/2198] sh: pci: Consolidate pcibios_align_resource()
 definitions.

This introduces a saner pcibios_align_resource() that can be used
regardless of whether pci-auto or pci-new are being used, and
consolidates it in pci-lib.c.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-se/7751/pci.c |  9 ++++---
 arch/sh/drivers/pci/pci-lib.c     | 42 +++++++++++++++++++++++++++++++
 arch/sh/drivers/pci/pci-new.c     | 23 -----------------
 arch/sh/drivers/pci/pci.c         | 23 -----------------
 arch/sh/include/asm/pci.h         |  5 +---
 5 files changed, 49 insertions(+), 53 deletions(-)

diff --git a/arch/sh/boards/mach-se/7751/pci.c b/arch/sh/boards/mach-se/7751/pci.c
index 203b2923fe7f8..9ec64a416b302 100644
--- a/arch/sh/boards/mach-se/7751/pci.c
+++ b/arch/sh/boards/mach-se/7751/pci.c
@@ -30,6 +30,9 @@
 #define PCIC_WRITE(x,v) writel((v), PCI_REG(x))
 #define PCIC_READ(x) readl(PCI_REG(x))
 
+#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
+#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
+
 /*
  * Description:  This function sets up and initializes the pcic, sets
  * up the BARS, maps the DRAM into the address space etc, etc.
@@ -97,12 +100,12 @@ int __init pcibios_init_platform(void)
     * meaning all calls go straight through... use BUG_ON to
     * catch erroneous assumption.
     */
-   BUG_ON(PCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE);
+   BUG_ON(xPCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE);
 
-   PCIC_WRITE(SH7751_PCIMBR, PCIBIOS_MIN_MEM);
+   PCIC_WRITE(SH7751_PCIMBR, xPCIBIOS_MIN_MEM);
 
    /* Set IOBR for window containing area specified in pci.h */
-   PCIC_WRITE(SH7751_PCIIOBR, (PCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK));
+   PCIC_WRITE(SH7751_PCIIOBR, (xPCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK));
 
    /* All done, may as well say so... */
    printk("SH7751 PCI: Finished initialization of the PCI controller\n");
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 1a43a350d574f..8ab1a2d1b4832 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -4,6 +4,41 @@
 #include <linux/types.h>
 #include <linux/pci.h>
 
+unsigned long PCIBIOS_MIN_IO = 0x0000;
+unsigned long PCIBIOS_MIN_MEM = 0;
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	struct pci_dev *dev = data;
+	struct pci_channel *chan = dev->sysdata;
+	resource_size_t start = res->start;
+
+	if (res->flags & IORESOURCE_IO) {
+		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
+			start = PCIBIOS_MIN_IO + chan->io_resource->start;
+
+		/*
+                 * Put everything into 0x00-0xff region modulo 0x400.
+		 */
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	} else if (res->flags & IORESOURCE_MEM) {
+		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
+			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
+	}
+
+	res->start = start;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
@@ -24,3 +59,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       vma->vm_end - vma->vm_start,
 			       vma->vm_page_prot);
 }
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+EXPORT_SYMBOL(pcibios_bus_to_resource);
+EXPORT_SYMBOL(PCIBIOS_MIN_IO);
+EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
+#endif
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 097eb8811120a..4e9251f3d0902 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -129,29 +129,6 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 	res->end = region->end + offset;
 }
 
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-			    __attribute__ ((weak));
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	if (res->flags & IORESOURCE_IO) {
-		resource_size_t start = res->start;
-
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	}
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 6d659cd93c9d4..f670988e033d9 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -86,29 +86,6 @@ void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 	res->end = region->end;
 }
 
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-			    __attribute__ ((weak));
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	if (res->flags & IORESOURCE_IO) {
-		resource_size_t start = res->start;
-
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	}
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 46afd449739d0..5212bf6dd4b16 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -33,10 +33,7 @@ struct pci_channel {
  */
 extern struct pci_channel board_pci_channels[];
 
-/* ugly as hell, but makes drivers/pci/setup-res.c compile and work */
-#define __PCI_CHAN(bus)		((struct pci_channel *)bus->sysdata)
-#define PCIBIOS_MIN_IO		__PCI_CHAN(bus)->io_resource->start
-#define PCIBIOS_MIN_MEM		__PCI_CHAN(bus)->mem_resource->start
+extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
 
 struct pci_dev;
 
-- 
GitLab


From 394b6d2fe624246e258a218dac68d44fe9a8411f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 16:18:46 +0900
Subject: [PATCH 0512/2198] sh: pci: Kill off unused pcibios_fixup().

This is left over cruft that hasn't been used by anything in a long time,
kill off bits that weren't purged previously.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-snapgear.c | 5 -----
 arch/sh/include/asm/pci.h          | 1 -
 2 files changed, 6 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index dd2c5df283079..b64f2b91be8e6 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -85,8 +85,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 
 	return irq;
 }
-
-void __init pcibios_fixup(void)
-{
-	/* Nothing to fixup .. */
-}
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5212bf6dd4b16..e8265fd0bb6f1 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -138,7 +138,6 @@ static inline void __iomem *__get_pci_io_base(unsigned long port,
 #endif
 
 /* Board-specific fixup routines. */
-void pcibios_fixup(void);
 int pcibios_init_platform(void);
 int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 
-- 
GitLab


From 0bb34a6bf1f71d5ad2abfda582a2c2794957bc7b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 16:38:00 +0900
Subject: [PATCH 0513/2198] sh: pci: Consolidate pci_iomap() and use the
 generic I/O base.

This consolidates the pci_iomap() definitions and reworks how the I/O
port base is handled. PCI channels can register their own I/O map base,
or if none is provided, the system-wide generic I/O base is used instead.

Functionally nothing changes, while this allows us to kill off lots of
I/O address special casing and lookups.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-lib.c | 51 +++++++++++++++++++++++++++++++++++
 arch/sh/drivers/pci/pci-new.c | 35 ------------------------
 arch/sh/drivers/pci/pci.c     | 35 ------------------------
 arch/sh/include/asm/pci.h     | 22 ++-------------
 arch/sh/kernel/io.c           |  4 ---
 5 files changed, 53 insertions(+), 94 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 8ab1a2d1b4832..654ffcc67d0ac 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -60,6 +60,57 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			       vma->vm_page_prot);
 }
 
+static void __iomem *ioport_map_pci(struct pci_dev *dev,
+				    unsigned long port, unsigned int nr)
+{
+	struct pci_channel *chan = dev->sysdata;
+
+	if (!chan->io_map_base)
+		chan->io_map_base = generic_io_base;
+
+	return (void __iomem *)(chan->io_map_base + port);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	if (flags & IORESOURCE_IO)
+		return ioport_map_pci(dev, start, len);
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_MEM) {
+		if (flags & IORESOURCE_CACHEABLE)
+			return ioremap(start, len);
+
+		return ioremap_nocache(start, len);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
 #ifdef CONFIG_HOTPLUG
 EXPORT_SYMBOL(pcibios_resource_to_bus);
 EXPORT_SYMBOL(pcibios_bus_to_resource);
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 4e9251f3d0902..c92e65045c681 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -187,39 +187,4 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_IO)
-		return ioport_map(start, len);
-	if (flags & IORESOURCE_MEM)
-		return ioremap(start, len);
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index f670988e033d9..d39f24091adef 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -144,39 +144,4 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_IO)
-		return ioport_map(start, len);
-	if (flags & IORESOURCE_MEM)
-		return ioremap(start, len);
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
 EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index e8265fd0bb6f1..5324282897729 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -26,6 +26,8 @@ struct pci_channel {
 	int enabled;
 	unsigned long reg_base;
 	unsigned long io_base;
+
+	unsigned long io_map_base;
 };
 
 /*
@@ -110,31 +112,11 @@ static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 	}
 	return 0;
 }
-
-static inline void __iomem *__get_pci_io_base(unsigned long port,
-					      unsigned long size)
-{
-	struct pci_channel *p;
-	struct resource *res;
-
-	for (p = board_pci_channels; p->init; p++) {
-		res = p->io_resource;
-		if (p->enabled && (port >= res->start) &&
-		    (port + size) <= (res->end + 1))
-			return (void __iomem *)(p->io_base + port);
-	}
-	return NULL;
-}
 #else
 static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
 {
 	return 0;
 }
-static inline void __iomem *__get_pci_io_base(unsigned long port,
-					      unsigned long size)
-{
-	return NULL;
-}
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c
index 59fb020718a34..4f85fffaa5571 100644
--- a/arch/sh/kernel/io.c
+++ b/arch/sh/kernel/io.c
@@ -70,10 +70,6 @@ void __iomem *ioport_map(unsigned long port, unsigned int nr)
 	if (ret)
 		return ret;
 
-	ret = __get_pci_io_base(port, nr);
-	if (ret)
-		return ret;
-
 	return __ioport_map(port, nr);
 }
 EXPORT_SYMBOL(ioport_map);
-- 
GitLab


From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 23:39:33 +0200
Subject: [PATCH 0514/2198] tracing/ring-buffer: Add unlock recursion
 protection on discard

The pair of helpers trace_recursive_lock() and trace_recursive_unlock()
have been introduced recently to provide generic tracing recursion
protection.

They are used in a symetric way:

 - trace_recursive_lock() on buffer reserve
 - trace_recursive_unlock() on buffer commit

However sometimes, we don't commit but discard on entry
to the buffer, ie: in case of filter checking.

Then we must also unlock the recursion protection on discard time,
otherwise the tracing gets definitely deactivated and a warning
is raised spuriously, such as:

111.119821] ------------[ cut here ]------------
[  111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0()
[  111.119835] Hardware name: AMILO Li 2727
[  111.119839] Modules linked in:
[  111.119846] Pid: 5731, comm: Xorg Tainted: G        W  2.6.30-rc1 #69
[  111.119851] Call Trace:
[  111.119863]  [<ffffffff8025ce68>] warn_slowpath+0xd8/0x130
[  111.119873]  [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0
[  111.119882]  [<ffffffff8028a30f>] ? __lock_acquire+0x19f/0x1ae0
[  111.119891]  [<ffffffff802199b0>] ? native_sched_clock+0x20/0x70
[  111.119899]  [<ffffffff80286dee>] ? put_lock_stats+0xe/0x30
[  111.119906]  [<ffffffff80286eb8>] ? lock_release_holdtime+0xa8/0x150
[  111.119913]  [<ffffffff802c8ae7>] ring_buffer_lock_reserve+0x1b7/0x1d0
[  111.119921]  [<ffffffff802cd110>] trace_buffer_lock_reserve+0x30/0x70
[  111.119930]  [<ffffffff802ce000>] trace_current_buffer_lock_reserve+0x20/0x30
[  111.119939]  [<ffffffff802474e8>] ftrace_raw_event_sched_switch+0x58/0x100
[  111.119948]  [<ffffffff808103b7>] __schedule+0x3a7/0x4cd
[  111.119957]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.119964]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.119971]  [<ffffffff80810c08>] schedule+0x18/0x40
[  111.119977]  [<ffffffff80810e09>] preempt_schedule+0x39/0x60
[  111.119985]  [<ffffffff80813bd3>] _read_unlock+0x53/0x60
[  111.119993]  [<ffffffff807259d2>] sock_def_readable+0x72/0x80
[  111.120002]  [<ffffffff807ad5ed>] unix_stream_sendmsg+0x24d/0x3d0
[  111.120011]  [<ffffffff807219a3>] sock_aio_write+0x143/0x160
[  111.120019]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120026]  [<ffffffff80721860>] ? sock_aio_write+0x0/0x160
[  111.120033]  [<ffffffff80721860>] ? sock_aio_write+0x0/0x160
[  111.120042]  [<ffffffff8031c283>] do_sync_readv_writev+0xf3/0x140
[  111.120049]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120057]  [<ffffffff80276ff0>] ? autoremove_wake_function+0x0/0x40
[  111.120067]  [<ffffffff8045d489>] ? cap_file_permission+0x9/0x10
[  111.120074]  [<ffffffff8045c1e6>] ? security_file_permission+0x16/0x20
[  111.120082]  [<ffffffff8031cab4>] do_readv_writev+0xd4/0x1f0
[  111.120089]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120097]  [<ffffffff80211b56>] ? ftrace_call+0x5/0x2b
[  111.120105]  [<ffffffff8031cc18>] vfs_writev+0x48/0x70
[  111.120111]  [<ffffffff8031cd65>] sys_writev+0x55/0xc0
[  111.120119]  [<ffffffff80211e32>] system_call_fastpath+0x16/0x1b
[  111.120125] ---[ end trace 15605f4e98d5ccb5 ]---

[ Impact: fix spurious warning triggering tracing shutdown ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/ring_buffer.c | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bffde630c4e24..e145969a8edab 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
 /**
  * ring_buffer_event_discard - discard any event in the ring buffer
  * @event: the event to discard
@@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
  */
 void ring_buffer_event_discard(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
-	/* time delta must be non zero */
-	if (!event->time_delta)
-		event->time_delta = 1;
+	rb_event_discard(event);
+	trace_recursive_unlock();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
 
@@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	int cpu;
 
 	/* The event is discarded regardless */
-	ring_buffer_event_discard(event);
+	rb_event_discard(event);
 
 	/*
 	 * This must only be called if the event has not been
@@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	if (rb_is_commit(cpu_buffer, event))
 		rb_set_commit_to_write(cpu_buffer);
 
+	trace_recursive_unlock();
+
 	/*
 	 * Only the last preempt count needs to restore preemption.
 	 */
-- 
GitLab


From 99f95f117848088f2708b45c70be73152e78bb8a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:24:57 +0900
Subject: [PATCH 0515/2198] sh: pci: Rework fixed region checks in ioremap().

Not all PCI channels have non-translatable memory windows, this is a
special property of the on-chip PCIC with its 0xfd00... mapping, handle
this explicitly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/pci.h | 29 +++++++++++------------------
 arch/sh/mm/ioremap_32.c   | 14 +++++++-------
 2 files changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5324282897729..82a9369511b5a 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -90,7 +90,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
 #endif
 
-#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -99,24 +98,18 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strategy_parameter = ~0UL;
 }
 
-static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
-{
-	struct pci_channel *p;
-	struct resource *res;
-
-	for (p = board_pci_channels; p->init; p++) {
-		res = p->mem_resource;
-		if (p->enabled && (phys_addr >= res->start) &&
-		    (phys_addr + size) <= (res->end + 1))
-			return 1;
-	}
-	return 0;
-}
+#ifdef CONFIG_SUPERH32
+/*
+ * If we're on an SH7751 or SH7780 PCI controller, PCI memory is mapped
+ * at the end of the address space in a special non-translatable area.
+ */
+#define PCI_MEM_FIXED_START	0xfd000000
+#define PCI_MEM_FIXED_END	(PCI_MEM_FIXED_START + 0x01000000)
+
+#define is_pci_memory_fixed_range(s, e)	\
+	((s) >= PCI_MEM_FIXED_START && (e) < PCI_MEM_FIXED_END)
 #else
-static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size)
-{
-	return 0;
-}
+#define is_pci_memory_fixed_range(s, e)	(0)
 #endif
 
 /* Board-specific fixup routines. */
diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c
index 7e04cc8f3b9b6..da2f4186f2cd3 100644
--- a/arch/sh/mm/ioremap_32.c
+++ b/arch/sh/mm/ioremap_32.c
@@ -46,17 +46,15 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size,
 		return NULL;
 
 	/*
-	 * If we're on an SH7751 or SH7780 PCI controller, PCI memory is
-	 * mapped at the end of the address space (typically 0xfd000000)
-	 * in a non-translatable area, so mapping through page tables for
-	 * this area is not only pointless, but also fundamentally
-	 * broken. Just return the physical address instead.
+	 * If we're in the fixed PCI memory range, mapping through page
+	 * tables is not only pointless, but also fundamentally broken.
+	 * Just return the physical address instead.
 	 *
 	 * For boards that map a small PCI memory aperture somewhere in
 	 * P1/P2 space, ioremap() will already do the right thing,
 	 * and we'll never get this far.
 	 */
-	if (__is_pci_memory(phys_addr, size))
+	if (is_pci_memory_fixed_range(phys_addr, size))
 		return (void __iomem *)phys_addr;
 
 #if !defined(CONFIG_PMB_FIXED)
@@ -121,7 +119,9 @@ void __iounmap(void __iomem *addr)
 	unsigned long seg = PXSEG(vaddr);
 	struct vm_struct *p;
 
-	if (seg < P3SEG || vaddr >= P3_ADDR_MAX || __is_pci_memory(vaddr, 0))
+	if (seg < P3SEG || vaddr >= P3_ADDR_MAX)
+		return;
+	if (is_pci_memory_fixed_range(vaddr, 0))
 		return;
 
 #ifdef CONFIG_PMB
-- 
GitLab


From e79066a659b893debe19010179d3f3f015d76d1c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:29:22 +0900
Subject: [PATCH 0516/2198] sh: pci: New-style controller registration.

This moves off of the board_pci_channels[] approach for bus registration
and over to a cleaner register_pci_controller(), all derived from the
MIPS code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-new.c    | 97 ++++++++++++++++++++++++--------
 arch/sh/drivers/pci/pci-sh7780.c | 87 +++++++++++++---------------
 arch/sh/include/asm/pci.h        | 29 ++++++----
 3 files changed, 130 insertions(+), 83 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index c92e65045c681..78b7292c6aa87 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -13,40 +13,90 @@
 #include <linux/init.h>
 #include <linux/dma-debug.h>
 #include <linux/io.h>
+#include <linux/mutex.h>
 
-static int __init pcibios_init(void)
+/*
+ * The PCI controller list.
+ */
+static struct pci_channel *hose_head, **hose_tail = &hose_head;
+
+static int pci_initialized;
+
+static void __devinit pcibios_scanbus(struct pci_channel *hose)
 {
-	struct pci_channel *p;
+	static int next_busno;
 	struct pci_bus *bus;
-	int busno;
-
-	/* init channels */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->init(p) == 0)
-			p->enabled = 1;
-		else
-			pr_err("Unable to init pci channel %d\n", busno);
-		busno++;
+
+	/* Catch botched conversion attempts */
+	BUG_ON(hose->init);
+
+	bus = pci_scan_bus(next_busno, hose->pci_ops, hose);
+	if (bus) {
+		next_busno = bus->subordinate + 1;
+		/* Don't allow 8-bit bus number overflow inside the hose -
+		   reserve some space for bridges. */
+		if (next_busno > 224)
+			next_busno = 0;
+
+		pci_bus_size_bridges(bus);
+		pci_bus_assign_resources(bus);
+		pci_enable_bridges(bus);
 	}
+}
 
-	/* scan the buses */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->enabled) {
-			bus = pci_scan_bus(busno, p->pci_ops, p);
-			busno = bus->subordinate + 1;
+static DEFINE_MUTEX(pci_scan_mutex);
 
-			pci_bus_size_bridges(bus);
-			pci_bus_assign_resources(bus);
-			pci_enable_bridges(bus);
-		}
+void __devinit register_pci_controller(struct pci_channel *hose)
+{
+	if (request_resource(&iomem_resource, hose->mem_resource) < 0)
+		goto out;
+	if (request_resource(&ioport_resource, hose->io_resource) < 0) {
+		release_resource(hose->mem_resource);
+		goto out;
+	}
+
+	*hose_tail = hose;
+	hose_tail = &hose->next;
+
+	/*
+	 * Do not panic here but later - this might hapen before console init.
+	 */
+	if (!hose->io_map_base) {
+		printk(KERN_WARNING
+		       "registering PCI controller with io_map_base unset\n");
+	}
+
+	/*
+	 * Scan the bus if it is register after the PCI subsystem
+	 * initialization.
+	 */
+	if (pci_initialized) {
+		mutex_lock(&pci_scan_mutex);
+		pcibios_scanbus(hose);
+		mutex_unlock(&pci_scan_mutex);
 	}
 
+	return;
+
+out:
+	printk(KERN_WARNING
+	       "Skipping PCI bus scan due to resource conflict\n");
+}
+
+static int __init pcibios_init(void)
+{
+	struct pci_channel *hose;
+
+	/* Scan all of the recorded PCI controllers.  */
+	for (hose = hose_head; hose; hose = hose->next)
+		pcibios_scanbus(hose);
+
 	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
 
 	dma_debug_add_bus(&pci_bus_type);
 
+	pci_initialized = 1;
+
 	return 0;
 }
 subsys_initcall(pcibios_init);
@@ -74,7 +124,6 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
 	}
 }
 
-
 /*
  *  Called after each bus is probed, but before its children
  *  are examined.
@@ -186,5 +235,3 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 {
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
-
-EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index f02d9dfcf252f..4dd6e3b94a67d 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -15,11 +15,47 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-static int __init sh7780_pci_init(struct pci_channel *chan)
+extern u8 pci_cache_line_size;
+
+static struct resource sh7785_io_resource = {
+	.name	= "SH7785_IO",
+	.start	= SH7780_PCI_IO_BASE,
+	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
+	.flags	= IORESOURCE_IO
+};
+
+static struct resource sh7785_mem_resource = {
+	.name	= "SH7785_mem",
+	.start	= SH7780_PCI_MEMORY_BASE,
+	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
+	.flags	= IORESOURCE_MEM
+};
+
+static struct pci_channel sh7780_pci_controller = {
+	.pci_ops	= &sh4_pci_ops,
+	.mem_resource	= &sh7785_mem_resource,
+	.io_resource	= &sh7785_io_resource,
+};
+
+static struct sh4_pci_address_map sh7780_pci_map = {
+	.window0	= {
+#if defined(CONFIG_32BIT)
+		.base	= SH7780_32BIT_DDR_BASE_ADDR,
+		.size	= 0x40000000,
+#else
+		.base	= SH7780_CS0_BASE_ADDR,
+		.size	= 0x20000000,
+#endif
+	},
+};
+
+static int __init sh7780_pci_init(void)
 {
+	struct pci_channel *chan = &sh7780_pci_controller;
 	unsigned int id;
 	const char *type = NULL;
 	int ret;
+	u32 word;
 
 	printk(KERN_NOTICE "PCI: Starting intialization.\n");
 
@@ -54,52 +90,6 @@ static int __init sh7780_pci_init(struct pci_channel *chan)
 	if ((ret = sh4_pci_check_direct(chan)) != 0)
 		return ret;
 
-	/*
-	 * Platform specific initialization (BSC registers, and memory space
-	 * mapping) will be called via the platform defined function
-	 * pcibios_init_platform().
-	 */
-	return pcibios_init_platform();
-}
-
-extern u8 pci_cache_line_size;
-
-static struct resource sh7785_io_resource = {
-	.name	= "SH7785_IO",
-	.start	= SH7780_PCI_IO_BASE,
-	.end	= SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7785_mem_resource = {
-	.name	= "SH7785_mem",
-	.start	= SH7780_PCI_MEMORY_BASE,
-	.end	= SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7780_pci_map = {
-	.window0	= {
-#if defined(CONFIG_32BIT)
-		.base	= SH7780_32BIT_DDR_BASE_ADDR,
-		.size	= 0x40000000,
-#else
-		.base	= SH7780_CS0_BASE_ADDR,
-		.size	= 0x20000000,
-#endif
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	struct pci_channel *chan = &board_pci_channels[0];
-	u32 word;
-
 	/*
 	 * Set the class and sub-class codes.
 	 */
@@ -153,5 +143,8 @@ int __init pcibios_init_platform(void)
 
 	__set_io_port_base(SH7780_PCI_IO_BASE);
 
+	register_pci_controller(chan);
+
 	return 0;
 }
+arch_initcall(sh7780_pci_init);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 82a9369511b5a..e057ebdb46189 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -17,17 +17,22 @@
  * external) PCI controllers.
  */
 struct pci_channel {
-	int (*init)(struct pci_channel *chan);
-	struct pci_ops *pci_ops;
-	struct resource *io_resource;
-	struct resource *mem_resource;
-	int first_devfn;
-	int last_devfn;
-	int enabled;
-	unsigned long reg_base;
-	unsigned long io_base;
-
-	unsigned long io_map_base;
+	struct pci_channel	*next;
+
+	int			(*init)(struct pci_channel *chan);
+
+	struct pci_ops		*pci_ops;
+	struct resource		*io_resource;
+	struct resource		*mem_resource;
+
+	int			first_devfn;
+	int			last_devfn;
+	int			enabled;
+
+	unsigned long		reg_base;
+	unsigned long		io_base;
+
+	unsigned long		io_map_base;
 };
 
 /*
@@ -35,6 +40,8 @@ struct pci_channel {
  */
 extern struct pci_channel board_pci_channels[];
 
+extern void register_pci_controller(struct pci_channel *hose);
+
 extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
 
 struct pci_dev;
-- 
GitLab


From 09cfeb133e3cac39b8b9a2cb1d8ab4f77e396248 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:42:00 +0900
Subject: [PATCH 0517/2198] sh: pci: Track io and mem_offset per-channel.

This implements a per-hose offset for I/O and mem resources.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-new.c    | 18 +++++++++---------
 arch/sh/drivers/pci/pci-sh7780.c |  2 ++
 arch/sh/include/asm/pci.h        |  3 +++
 3 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 78b7292c6aa87..e8ac8daafc33b 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -105,7 +105,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
 	struct pci_bus *bus)
 {
 	/* Update device resources.  */
-	struct pci_channel *chan = bus->sysdata;
+	struct pci_channel *hose = bus->sysdata;
 	unsigned long offset = 0;
 	int i;
 
@@ -115,9 +115,9 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
 		if (dev->resource[i].flags & IORESOURCE_PCI_FIXED)
 			continue;
 		if (dev->resource[i].flags & IORESOURCE_IO)
-			offset = chan->io_base;
+			offset = hose->io_offset;
 		else if (dev->resource[i].flags & IORESOURCE_MEM)
-			offset = 0;
+			offset = hose->mem_offset;
 
 		dev->resource[i].start += offset;
 		dev->resource[i].end += offset;
@@ -150,13 +150,13 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
 			 struct resource *res)
 {
-	struct pci_channel *chan = dev->sysdata;
+	struct pci_channel *hose = dev->sysdata;
 	unsigned long offset = 0;
 
 	if (res->flags & IORESOURCE_IO)
-		offset = chan->io_base;
+		offset = hose->io_offset;
 	else if (res->flags & IORESOURCE_MEM)
-		offset = 0;
+		offset = hose->mem_offset;
 
 	region->start = res->start - offset;
 	region->end = res->end - offset;
@@ -166,13 +166,13 @@ void __devinit
 pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 			struct pci_bus_region *region)
 {
-	struct pci_channel *chan = dev->sysdata;
+	struct pci_channel *hose = dev->sysdata;
 	unsigned long offset = 0;
 
 	if (res->flags & IORESOURCE_IO)
-		offset = chan->io_base;
+		offset = hose->io_offset;
 	else if (res->flags & IORESOURCE_MEM)
-		offset = 0;
+		offset = hose->mem_offset;
 
 	res->start = region->start + offset;
 	res->end = region->end + offset;
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 4dd6e3b94a67d..57a3b870a276b 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -34,7 +34,9 @@ static struct resource sh7785_mem_resource = {
 static struct pci_channel sh7780_pci_controller = {
 	.pci_ops	= &sh4_pci_ops,
 	.mem_resource	= &sh7785_mem_resource,
+	.mem_offset	= 0x00000000,
 	.io_resource	= &sh7785_io_resource,
+	.io_offset	= 0x00000000,
 };
 
 static struct sh4_pci_address_map sh7780_pci_map = {
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index e057ebdb46189..0be20521a1fe7 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -25,6 +25,9 @@ struct pci_channel {
 	struct resource		*io_resource;
 	struct resource		*mem_resource;
 
+	unsigned long		io_offset;
+	unsigned long		mem_offset;
+
 	int			first_devfn;
 	int			last_devfn;
 	int			enabled;
-- 
GitLab


From 5160d3f782a5e0cdb3bdaa8a891a1fb9d9ab83ec Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:47:21 +0900
Subject: [PATCH 0518/2198] sh: pci: Consolidate bus<->resource mapping in
 pci-lib.

Now that the io and mem offsets are tracked accordingly, the pci-new
version of the bus<->resource mappers can be used generically. This
moves them in to pci-lib.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-lib.c | 31 +++++++++++++++++++++++++++++++
 arch/sh/drivers/pci/pci-new.c | 31 -------------------------------
 arch/sh/drivers/pci/pci.c     | 14 --------------
 3 files changed, 31 insertions(+), 45 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 654ffcc67d0ac..9fd3af9db462e 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -39,6 +39,37 @@ void pcibios_align_resource(void *data, struct resource *res,
 	res->start = start;
 }
 
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index e8ac8daafc33b..9d426147802bb 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -147,37 +147,6 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	}
 }
 
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			 struct resource *res)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	region->start = res->start - offset;
-	region->end = res->end - offset;
-}
-
-void __devinit
-pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			struct pci_bus_region *region)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	res->start = region->start + offset;
-	res->end = region->end + offset;
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index d39f24091adef..c15a6f0ad5003 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -72,20 +72,6 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	pci_read_bridge_bases(bus);
 }
 
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			     struct resource *res)
-{
-	region->start = res->start;
-	region->end = res->end;
-}
-
-void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			     struct pci_bus_region *region)
-{
-	res->start = region->start;
-	res->end = region->end;
-}
-
 int pcibios_enable_device(struct pci_dev *dev, int mask)
 {
 	u16 cmd, old_cmd;
-- 
GitLab


From 3f8daeacd7ed7a502daf0998e2515cea4f467f21 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 18:53:41 +0900
Subject: [PATCH 0519/2198] sh: pci: Consolidate the remaining common bits.

This moves the remaining common bits in to pci-lib. Thereby reducing
pci.c/pci-new.c to simple bus fixups and controller registration.

As more platforms are moved over, the old code will disappear completely
and the pci-new bits will be rolled in to pci-lib, eventually replacing
pci.c completely.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-lib.c | 64 +++++++++++++++++++++++++++++++++++
 arch/sh/drivers/pci/pci-new.c | 58 -------------------------------
 arch/sh/drivers/pci/pci.c     | 58 -------------------------------
 3 files changed, 64 insertions(+), 116 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index 9fd3af9db462e..ea83629458491 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -70,6 +70,70 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
 	res->end = region->end + offset;
 }
 
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<idx)))
+			continue;
+
+		r = &dev->resource[idx];
+		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if ((idx == PCI_ROM_RESOURCE) &&
+				(!(r->flags & IORESOURCE_ROM_ENABLE)))
+			continue;
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available "
+			       "because of resource collisions\n",
+			       pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (cmd != old_cmd) {
+		printk("PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 9d426147802bb..8c0b136eecb3c 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -146,61 +146,3 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 			pcibios_fixup_device_resources(dev, bus);
 	}
 }
-
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for(idx=0; idx<6; idx++) {
-		if (!(mask & (1 << idx)))
-			continue;
-		r = &dev->resource[idx];
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available because "
-			       "of resource collisions\n", pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (dev->resource[PCI_ROM_RESOURCE].start)
-		cmd |= PCI_COMMAND_MEMORY;
-	if (cmd != old_cmd) {
-		printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index c15a6f0ad5003..8c332b2a4641a 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -72,62 +72,4 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 	pci_read_bridge_bases(bus);
 }
 
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for(idx=0; idx<6; idx++) {
-		if (!(mask & (1 << idx)))
-			continue;
-		r = &dev->resource[idx];
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available because "
-			       "of resource collisions\n", pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (dev->resource[PCI_ROM_RESOURCE].start)
-		cmd |= PCI_COMMAND_MEMORY;
-	if (cmd != old_cmd) {
-		printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
-
 EXPORT_SYMBOL(board_pci_channels);
-- 
GitLab


From 5ba7205fc49ff72e88784c94fb661f93e7ae7d36 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:00:32 +0900
Subject: [PATCH 0520/2198] sh: pci: Kill off the now unused hose->io_base.

Nothing is using this any more, so kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7751.c | 8 --------
 arch/sh/drivers/pci/pci-sh7780.c | 1 -
 arch/sh/include/asm/pci.h        | 1 -
 3 files changed, 10 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index af8874436d2f9..4c08fd7f665da 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -40,7 +40,6 @@ int __init sh7751_pci_init(struct pci_channel *chan)
 	pr_debug("PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe200000;
-	chan->io_base = 0xfe240000;
 
 	/* check for SH7751/SH7751R hardware */
 	id = pci_read_reg(chan, SH7751_PCICONF0);
@@ -136,13 +135,6 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word);
 	pci_write_reg(chan, word , SH4_PCIMBR);
 
-	/* Map IO space into PCI IO window:
-	 * IO addresses will be translated to the PCI IO window base address
-	 */
-	pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n",
-		 chan->io_resource->start, chan->io_resource->end,
-		 chan->io_base + chan->io_resource->start);
-
 	/* Make sure the MSB's of IO window are set to access PCI space
 	 * correctly */
 	word = chan->io_resource->start & SH4_PCIIOBR_MASK;
diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index 57a3b870a276b..ae13ff925c61e 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -62,7 +62,6 @@ static int __init sh7780_pci_init(void)
 	printk(KERN_NOTICE "PCI: Starting intialization.\n");
 
 	chan->reg_base = 0xfe040000;
-	chan->io_base = 0xfe200000;
 
 	/* Enable CPU access to the PCIC registers. */
 	__raw_writel(PCIECR_ENBL, PCIECR);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 0be20521a1fe7..f36c7899295b6 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -33,7 +33,6 @@ struct pci_channel {
 	int			enabled;
 
 	unsigned long		reg_base;
-	unsigned long		io_base;
 
 	unsigned long		io_map_base;
 };
-- 
GitLab


From bb3396477bf46c7cae6bd04b969580277957966e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:26:45 +0900
Subject: [PATCH 0521/2198] sh: pci: Kill off superfluous lboxre2 pci fixups.

This is a verbatim copy of the r2d one, use that instead.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile         |  2 +-
 arch/sh/drivers/pci/fixups-lboxre2.c | 42 ----------------------------
 2 files changed, 1 insertion(+), 43 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/fixups-lboxre2.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index c8eab14843e5a..4cac866d55d47 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -22,5 +22,5 @@ obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
-obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-lboxre2.o
+obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
deleted file mode 100644
index a82011d03cb0a..0000000000000
--- a/arch/sh/drivers/pci/fixups-lboxre2.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * arch/sh/drivers/pci/fixups-lboxre2.c
- *
- * L-BOX RE2 PCI fixups
- *
- * Copyright (C) 2007 Nobuhiro Iwamatsu
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/pci.h>
-#include "pci-sh4.h"
-
-#define PCIMCR_MRSET_OFF	0xBFFFFFFF
-#define PCIMCR_RFSH_OFF		0xFFFFFFFB
-
-int pci_fixup_pcic(struct pci_channel *chan)
-{
-	unsigned long bcr1, mcr;
-
-	bcr1 = ctrl_inl(SH7751_BCR1);
-	bcr1 |= 0x40080000;	/* Enable Bit 19 BREQEN, set PCIC to slave */
-	pci_write_reg(chan, bcr1, SH4_PCIBCR1);
-
-	/* Enable all interrupts, so we known what to fix */
-	pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM);
-	pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM);
-	pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1);
-	pci_write_reg(chan, 0xab000001, SH7751_PCICONF4);
-
-	mcr = ctrl_inl(SH7751_MCR);
-	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-	pci_write_reg(chan, mcr, SH4_PCIMCR);
-
-	pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5);
-	pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6);
-	pci_write_reg(chan, 0x0c000000, SH4_PCILAR0);
-	pci_write_reg(chan, 0x00000000, SH4_PCILAR1);
-
-	return 0;
-}
-- 
GitLab


From d556fcc101c4b0d57ac9742ab806a6bfed78eac1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:31:20 +0900
Subject: [PATCH 0522/2198] sh: pci: Flag the dreamcast BBA as
 IORESOURCE_PCI_FIXED.

This isn't a real BAR, so prevent any attempts to move it, as we don't
wish to encourage a bus luck by overzealous PCI initialization code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/fixups-dreamcast.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c
index 2bf85cf091e13..48c6381fffaaf 100644
--- a/arch/sh/drivers/pci/fixups-dreamcast.c
+++ b/arch/sh/drivers/pci/fixups-dreamcast.c
@@ -41,6 +41,13 @@ static void __init gapspci_fixup_resources(struct pci_dev *dev)
 		 */
 		dev->resource[1].start	= p->io_resource->start  + 0x100;
 		dev->resource[1].end	= dev->resource[1].start + 0x200 - 1;
+
+		/*
+		 * This is not a normal BAR, prevent any attempts to move
+		 * the BAR, as this will result in a bus lock.
+		 */
+		dev->resource[1].flags |= IORESOURCE_PCI_FIXED;
+
 		/*
 		 * Redirect dma memory allocations to special memory window.
 		 */
-- 
GitLab


From c563bf0965c86cc6087b09c34ca063fe96d6deca Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:39:57 +0900
Subject: [PATCH 0523/2198] sh: pci: Kill off dead references to is_pci_ioaddr
 and friends.

Some old boards are still using this in their I/O routines, kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-se/7751/io.c  | 16 ----------------
 arch/sh/boards/mach-snapgear/io.c | 16 ----------------
 arch/sh/boards/mach-systemh/io.c  | 16 ----------------
 arch/sh/boards/mach-titan/io.c    | 18 ------------------
 4 files changed, 66 deletions(-)

diff --git a/arch/sh/boards/mach-se/7751/io.c b/arch/sh/boards/mach-se/7751/io.c
index 6287ae570319c..6e75bd4459e5c 100644
--- a/arch/sh/boards/mach-se/7751/io.c
+++ b/arch/sh/boards/mach-se/7751/io.c
@@ -34,8 +34,6 @@ unsigned char sh7751se_inb(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		return (*port2adr(port)) & 0xff;
 }
@@ -46,8 +44,6 @@ unsigned char sh7751se_inb_p(unsigned long port)
 
         if (PXSEG(port))
                 v = *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-                v = *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		v = (*port2adr(port)) & 0xff;
 	ctrl_delay();
@@ -58,8 +54,6 @@ unsigned short sh7751se_inw(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned short *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned short *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -71,8 +65,6 @@ unsigned int sh7751se_inl(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned long *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned int *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -85,8 +77,6 @@ void sh7751se_outb(unsigned char value, unsigned long port)
 
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 }
@@ -95,8 +85,6 @@ void sh7751se_outb_p(unsigned char value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 	ctrl_delay();
@@ -106,8 +94,6 @@ void sh7751se_outw(unsigned short value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned short *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned short *)pci_ioaddr(port)) = value;
 	else if (port >= 0x2000)
 		*port2adr(port) = value;
 	else
@@ -118,8 +104,6 @@ void sh7751se_outl(unsigned int value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned long *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned long*)pci_ioaddr(port)) = value;
 	else
 		maybebadio(port);
 }
diff --git a/arch/sh/boards/mach-snapgear/io.c b/arch/sh/boards/mach-snapgear/io.c
index 0f4824264557d..476650e42dbc0 100644
--- a/arch/sh/boards/mach-snapgear/io.c
+++ b/arch/sh/boards/mach-snapgear/io.c
@@ -36,8 +36,6 @@ unsigned char snapgear_inb(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		return (*port2adr(port)) & 0xff;
 }
@@ -48,8 +46,6 @@ unsigned char snapgear_inb_p(unsigned long port)
 
 	if (PXSEG(port))
 		v = *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		v = *(volatile unsigned char *)pci_ioaddr(port);
 	else
 		v = (*port2adr(port))&0xff;
 	ctrl_delay();
@@ -60,8 +56,6 @@ unsigned short snapgear_inw(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned short *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned short *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -73,8 +67,6 @@ unsigned int snapgear_inl(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned long *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned int *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else
@@ -87,8 +79,6 @@ void snapgear_outb(unsigned char value, unsigned long port)
 
 	if (PXSEG(port))
 		*(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 }
@@ -97,8 +87,6 @@ void snapgear_outb_p(unsigned char value, unsigned long port)
 {
 	if (PXSEG(port))
 		*(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else
 		*(port2adr(port)) = value;
 	ctrl_delay();
@@ -108,8 +96,6 @@ void snapgear_outw(unsigned short value, unsigned long port)
 {
 	if (PXSEG(port))
 		*(volatile unsigned short *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned short *)pci_ioaddr(port)) = value;
 	else if (port >= 0x2000)
 		*port2adr(port) = value;
 	else
@@ -120,8 +106,6 @@ void snapgear_outl(unsigned int value, unsigned long port)
 {
 	if (PXSEG(port))
 		*(volatile unsigned long *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned long*)pci_ioaddr(port)) = value;
 	else
 		maybebadio(port);
 }
diff --git a/arch/sh/boards/mach-systemh/io.c b/arch/sh/boards/mach-systemh/io.c
index dec3db0ee933b..15577ff1f7153 100644
--- a/arch/sh/boards/mach-systemh/io.c
+++ b/arch/sh/boards/mach-systemh/io.c
@@ -35,8 +35,6 @@ unsigned char sh7751systemh_inb(unsigned long port)
 {
 	if (PXSEG(port))
 		return *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-		return *(volatile unsigned char *)pci_ioaddr(port);
 	else if (port <= 0x3F1)
 		return *(volatile unsigned char *)ETHER_IOMAP(port);
 	else
@@ -49,8 +47,6 @@ unsigned char sh7751systemh_inb_p(unsigned long port)
 
         if (PXSEG(port))
                 v = *(volatile unsigned char *)port;
-	else if (is_pci_ioaddr(port))
-                v = *(volatile unsigned char *)pci_ioaddr(port);
 	else if (port <= 0x3F1)
 		v = *(volatile unsigned char *)ETHER_IOMAP(port);
 	else
@@ -63,8 +59,6 @@ unsigned short sh7751systemh_inw(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned short *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned short *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else if (port <= 0x3F1)
@@ -78,8 +72,6 @@ unsigned int sh7751systemh_inl(unsigned long port)
 {
         if (PXSEG(port))
                 return *(volatile unsigned long *)port;
-	else if (is_pci_ioaddr(port))
-                return *(volatile unsigned int *)pci_ioaddr(port);
 	else if (port >= 0x2000)
 		return *port2adr(port);
 	else if (port <= 0x3F1)
@@ -94,8 +86,6 @@ void sh7751systemh_outb(unsigned char value, unsigned long port)
 
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else if (port <= 0x3F1)
 		*(volatile unsigned char *)ETHER_IOMAP(port) = value;
 	else
@@ -106,8 +96,6 @@ void sh7751systemh_outb_p(unsigned char value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned char *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned char*)pci_ioaddr(port)) = value;
 	else if (port <= 0x3F1)
 		*(volatile unsigned char *)ETHER_IOMAP(port) = value;
 	else
@@ -119,8 +107,6 @@ void sh7751systemh_outw(unsigned short value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned short *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned short *)pci_ioaddr(port)) = value;
 	else if (port >= 0x2000)
 		*port2adr(port) = value;
 	else if (port <= 0x3F1)
@@ -133,8 +119,6 @@ void sh7751systemh_outl(unsigned int value, unsigned long port)
 {
         if (PXSEG(port))
                 *(volatile unsigned long *)port = value;
-	else if (is_pci_ioaddr(port))
-		*((unsigned long*)pci_ioaddr(port)) = value;
 	else
 		maybebadio(port);
 }
diff --git a/arch/sh/boards/mach-titan/io.c b/arch/sh/boards/mach-titan/io.c
index 053b3ed2ed8da..0130e9826aca6 100644
--- a/arch/sh/boards/mach-titan/io.c
+++ b/arch/sh/boards/mach-titan/io.c
@@ -17,8 +17,6 @@ u8 titan_inb(unsigned long port)
 {
         if (PXSEG(port))
                 return ctrl_inb(port);
-        else if (is_pci_ioaddr(port))
-                return ctrl_inb(pci_ioaddr(port));
         return ctrl_inw(port2adr(port)) & 0xff;
 }
 
@@ -28,8 +26,6 @@ u8 titan_inb_p(unsigned long port)
 
         if (PXSEG(port))
                 v = ctrl_inb(port);
-        else if (is_pci_ioaddr(port))
-                v = ctrl_inb(pci_ioaddr(port));
         else
                 v = ctrl_inw(port2adr(port)) & 0xff;
         ctrl_delay();
@@ -40,8 +36,6 @@ u16 titan_inw(unsigned long port)
 {
         if (PXSEG(port))
                 return ctrl_inw(port);
-        else if (is_pci_ioaddr(port))
-                return ctrl_inw(pci_ioaddr(port));
         else if (port >= 0x2000)
                 return ctrl_inw(port2adr(port));
         else
@@ -53,8 +47,6 @@ u32 titan_inl(unsigned long port)
 {
         if (PXSEG(port))
                 return ctrl_inl(port);
-        else if (is_pci_ioaddr(port))
-                return ctrl_inl(pci_ioaddr(port));
         else if (port >= 0x2000)
                 return ctrl_inw(port2adr(port));
         else
@@ -66,8 +58,6 @@ void titan_outb(u8 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outb(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outb(value, pci_ioaddr(port));
         else
                 ctrl_outw(value, port2adr(port));
 }
@@ -76,8 +66,6 @@ void titan_outb_p(u8 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outb(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outb(value, pci_ioaddr(port));
         else
                 ctrl_outw(value, port2adr(port));
         ctrl_delay();
@@ -87,8 +75,6 @@ void titan_outw(u16 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outw(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outw(value, pci_ioaddr(port));
         else if (port >= 0x2000)
                 ctrl_outw(value, port2adr(port));
         else
@@ -99,8 +85,6 @@ void titan_outl(u32 value, unsigned long port)
 {
         if (PXSEG(port))
                 ctrl_outl(value, port);
-        else if (is_pci_ioaddr(port))
-                ctrl_outl(value, pci_ioaddr(port));
         else
                 maybebadio(port);
 }
@@ -119,8 +103,6 @@ void __iomem *titan_ioport_map(unsigned long port, unsigned int size)
 {
 	if (PXSEG(port))
 		return (void __iomem *)port;
-	else if (is_pci_ioaddr(port))
-		return (void __iomem *)pci_ioaddr(port);
 
 	return (void __iomem *)port2adr(port);
 }
-- 
GitLab


From 0e75148108914062cb46ad3dc8f054a628018df7 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:48:48 +0900
Subject: [PATCH 0524/2198] sh: pci: Consolidate pcibios_setup() in pci-lib.

This wasn't really being used for anything useful, so just stub it in
pci-lib.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-dreamcast.c |  6 -----
 arch/sh/drivers/pci/ops-sh4.c       | 35 +++++++++--------------------
 arch/sh/drivers/pci/ops-sh5.c       |  5 -----
 arch/sh/drivers/pci/pci-lib.c       |  5 +++++
 4 files changed, 16 insertions(+), 35 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index 5bc81d5b660a3..529aa4f27a899 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -154,12 +154,6 @@ static int __init gapspci_init(struct pci_channel *chan)
 	return 0;
 }
 
-/* Haven't done anything here as yet */
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
 struct pci_channel board_pci_channels[] = {
 	{ gapspci_init, &gapspci_pci_ops, &gapspci_io_resource,
 	  &gapspci_mem_resource, 0, 1 },
diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 2a7f7b50ff0a3..7cc1fccf1c524 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -106,30 +106,27 @@ struct pci_ops sh4_pci_ops = {
  * Not really related to pci_ops, but it's common and not worth shoving
  * somewhere else for now..
  */
-static unsigned int pci_probe = PCI_PROBE_CONF1;
-
 int __init sh4_pci_check_direct(struct pci_channel *chan)
 {
 	/*
 	 * Check if configuration works.
 	 */
-	if (pci_probe & PCI_PROBE_CONF1) {
-		unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR);
-
-		pci_write_reg(chan, P1SEG, SH4_PCIPAR);
+	unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR);
 
-		if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
-			pci_write_reg(chan, tmp, SH4_PCIPAR);
-			printk(KERN_INFO "PCI: Using configuration type 1\n");
-			request_region(chan->reg_base + SH4_PCIPAR, 8,
-				       "PCI conf1");
-			return 0;
-		}
+	pci_write_reg(chan, P1SEG, SH4_PCIPAR);
 
+	if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) {
 		pci_write_reg(chan, tmp, SH4_PCIPAR);
+		printk(KERN_INFO "PCI: Using configuration type 1\n");
+		request_region(chan->reg_base + SH4_PCIPAR, 8,
+			       "PCI conf1");
+		return 0;
 	}
 
-	pr_debug("PCI: pci_check_direct failed\n");
+	pci_write_reg(chan, tmp, SH4_PCIPAR);
+
+	printk(KERN_ERR "PCI: %s failed\n", __func__);
+
 	return -EINVAL;
 }
 
@@ -155,16 +152,6 @@ static void __init pci_fixup_ide_bases(struct pci_dev *d)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
 
-char * __devinit pcibios_setup(char *str)
-{
-	if (!strcmp(str, "off")) {
-		pci_probe = 0;
-		return NULL;
-	}
-
-	return str;
-}
-
 int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Nothing to do. */
diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c
index 729e38a6fe071..b10e2d5f4251c 100644
--- a/arch/sh/drivers/pci/ops-sh5.c
+++ b/arch/sh/drivers/pci/ops-sh5.c
@@ -42,11 +42,6 @@ static void __init pci_fixup_ide_bases(struct pci_dev *d)
 }
 DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
 
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
 static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where,
 			int size, u32 *val)
 {
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
index ea83629458491..f072acafce6c6 100644
--- a/arch/sh/drivers/pci/pci-lib.c
+++ b/arch/sh/drivers/pci/pci-lib.c
@@ -134,6 +134,11 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
 	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
 }
 
+char * __devinit pcibios_setup(char *str)
+{
+	return str;
+}
+
 int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 			enum pci_mmap_state mmap_state, int write_combine)
 {
-- 
GitLab


From 0db38cea69fc478a5c25b3c915ec680cc5538783 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 19:54:47 +0900
Subject: [PATCH 0525/2198] sh: pci: Kill off legacy ide quirks.

These fixups seem to have bitrotted a bit since their introduction in the
2.4 days. As we never had much use for them in the first place, and
nothing is using them any more, kill them off the rest of the way.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/ops-sh4.c | 22 ----------------------
 arch/sh/drivers/pci/ops-sh5.c | 20 --------------------
 2 files changed, 42 deletions(-)

diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c
index 7cc1fccf1c524..78bebebdc99c3 100644
--- a/arch/sh/drivers/pci/ops-sh4.c
+++ b/arch/sh/drivers/pci/ops-sh4.c
@@ -130,28 +130,6 @@ int __init sh4_pci_check_direct(struct pci_channel *chan)
 	return -EINVAL;
 }
 
-/* Handle generic fixups */
-static void __init pci_fixup_ide_bases(struct pci_dev *d)
-{
-	int i;
-
-	/*
-	 * PCI IDE controllers use non-standard I/O port decoding, respect it.
-	 */
-	if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE)
-		return;
-	pr_debug("PCI: IDE base address fixup for %s\n", pci_name(d));
-	for(i = 0; i < 4; i++) {
-		struct resource *r = &d->resource[i];
-
-		if ((r->start & ~0x80) == 0x374) {
-			r->start |= 2;
-			r->end = r->start;
-		}
-	}
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
-
 int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan)
 {
 	/* Nothing to do. */
diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c
index b10e2d5f4251c..4ce95a001b807 100644
--- a/arch/sh/drivers/pci/ops-sh5.c
+++ b/arch/sh/drivers/pci/ops-sh5.c
@@ -22,26 +22,6 @@
 #include <asm/io.h>
 #include "pci-sh5.h"
 
-static void __init pci_fixup_ide_bases(struct pci_dev *d)
-{
-	int i;
-
-	/*
-	 * PCI IDE controllers use non-standard I/O port decoding, respect it.
-	 */
-	if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE)
-		return;
-	printk("PCI: IDE base address fixup for %s\n", pci_name(d));
-	for(i=0; i<4; i++) {
-		struct resource *r = &d->resource[i];
-		if ((r->start & ~0x80) == 0x374) {
-			r->start |= 2;
-			r->end = r->start;
-		}
-	}
-}
-DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases);
-
 static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where,
 			int size, u32 *val)
 {
-- 
GitLab


From 3444f5ec49bc6cb901ffea38e085db1d76e1189c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 20:22:05 +0900
Subject: [PATCH 0526/2198] sh: pci: Tidy up the dreamcast PCI support.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile              |   3 +-
 arch/sh/drivers/pci/ops-dreamcast.c       |  98 +++-----------------
 arch/sh/drivers/pci/pci-dreamcast.c       | 105 ++++++++++++++++++++++
 arch/sh/include/mach-dreamcast/mach/pci.h |   2 +
 4 files changed, 119 insertions(+), 89 deletions(-)
 create mode 100644 arch/sh/drivers/pci/pci-dreamcast.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 4cac866d55d47..2160a06b6c11a 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -12,7 +12,8 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7780)	+= pci-sh7780.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7785)	+= pci-sh7780.o ops-sh4.o
 obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 
-obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o
+obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
+					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= ops-sh03.o fixups-sh03.o
diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c
index 529aa4f27a899..e83d0d3aabe25 100644
--- a/arch/sh/drivers/pci/ops-dreamcast.c
+++ b/arch/sh/drivers/pci/ops-dreamcast.c
@@ -1,15 +1,9 @@
 /*
- * arch/sh/drivers/pci/ops-dreamcast.c
- *
  * PCI operations for the Sega Dreamcast
  *
  * Copyright (C) 2001, 2002  M. R. Brown
  * Copyright (C) 2002, 2003  Paul Mundt
  *
- * This file originally bore the message (with enclosed-$):
- *	Id: pci.c,v 1.3 2003/05/04 19:29:46 lethal Exp
- *	Dreamcast PCI: Supports SEGA Broadband Adaptor only.
- *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
@@ -23,25 +17,10 @@
 #include <linux/irq.h>
 #include <linux/pci.h>
 #include <linux/module.h>
-
-#include <asm/io.h>
-#include <asm/irq.h>
+#include <linux/io.h>
+#include <linux/irq.h>
 #include <mach/pci.h>
 
-static struct resource gapspci_io_resource = {
-	.name	= "GAPSPCI IO",
-	.start	= GAPSPCI_BBA_CONFIG,
-	.end	= GAPSPCI_BBA_CONFIG + GAPSPCI_BBA_CONFIG_SIZE - 1,
-	.flags	= IORESOURCE_IO,
-};
-
-static struct resource gapspci_mem_resource = {
-	.name	= "GAPSPCI mem",
-	.start	= GAPSPCI_DMA_BASE,
-	.end	= GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1,
-	.flags	= IORESOURCE_MEM,
-};
-
 /*
  * The !gapspci_config_access case really shouldn't happen, ever, unless
  * someone implicitly messes around with the last devfn value.. otherwise we
@@ -76,10 +55,10 @@ static int gapspci_read(struct pci_bus *bus, unsigned int devfn, int where, int
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	switch (size) {
-		case 1: *val = inb(GAPSPCI_BBA_CONFIG+where); break;
-		case 2: *val = inw(GAPSPCI_BBA_CONFIG+where); break;
-		case 4: *val = inl(GAPSPCI_BBA_CONFIG+where); break;
-	}	
+	case 1: *val = inb(GAPSPCI_BBA_CONFIG+where); break;
+	case 2: *val = inw(GAPSPCI_BBA_CONFIG+where); break;
+	case 4: *val = inl(GAPSPCI_BBA_CONFIG+where); break;
+	}
 
         return PCIBIOS_SUCCESSFUL;
 }
@@ -90,72 +69,15 @@ static int gapspci_write(struct pci_bus *bus, unsigned int devfn, int where, int
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	switch (size) {
-		case 1: outb(( u8)val, GAPSPCI_BBA_CONFIG+where); break;
-		case 2: outw((u16)val, GAPSPCI_BBA_CONFIG+where); break;
-		case 4: outl((u32)val, GAPSPCI_BBA_CONFIG+where); break;
+	case 1: outb(( u8)val, GAPSPCI_BBA_CONFIG+where); break;
+	case 2: outw((u16)val, GAPSPCI_BBA_CONFIG+where); break;
+	case 4: outl((u32)val, GAPSPCI_BBA_CONFIG+where); break;
 	}
 
         return PCIBIOS_SUCCESSFUL;
 }
 
-static struct pci_ops gapspci_pci_ops = {
+struct pci_ops gapspci_pci_ops = {
 	.read	= gapspci_read,
 	.write	= gapspci_write,
 };
-
-/*
- * gapspci init
- */
-
-static int __init gapspci_init(struct pci_channel *chan)
-{
-	char idbuf[16];
-	int i;
-
-	/*
-	 * FIXME: All of this wants documenting to some degree,
-	 * even some basic register definitions would be nice.
-	 *
-	 * I haven't seen anything this ugly since.. maple.
-	 */
-
-	for (i=0; i<16; i++)
-		idbuf[i] = inb(GAPSPCI_REGS+i);
-
-	if (strncmp(idbuf, "GAPSPCI_BRIDGE_2", 16))
-		return -ENODEV;
-
-	outl(0x5a14a501, GAPSPCI_REGS+0x18);
-
-	for (i=0; i<1000000; i++)
-		;
-
-	if (inl(GAPSPCI_REGS+0x18) != 1)
-		return -EINVAL;
-
-	outl(0x01000000, GAPSPCI_REGS+0x20);
-	outl(0x01000000, GAPSPCI_REGS+0x24);
-
-	outl(GAPSPCI_DMA_BASE, GAPSPCI_REGS+0x28);
-	outl(GAPSPCI_DMA_BASE+GAPSPCI_DMA_SIZE, GAPSPCI_REGS+0x2c);
-
-	outl(1, GAPSPCI_REGS+0x14);
-	outl(1, GAPSPCI_REGS+0x34);
-
-	/* Setting Broadband Adapter */
-	outw(0xf900, GAPSPCI_BBA_CONFIG+0x06);
-	outl(0x00000000, GAPSPCI_BBA_CONFIG+0x30);
-	outb(0x00, GAPSPCI_BBA_CONFIG+0x3c);
-	outb(0xf0, GAPSPCI_BBA_CONFIG+0x0d);
-	outw(0x0006, GAPSPCI_BBA_CONFIG+0x04);
-	outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10);
-	outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14);
-
-	return 0;
-}
-
-struct pci_channel board_pci_channels[] = {
-	{ gapspci_init, &gapspci_pci_ops, &gapspci_io_resource,
-	  &gapspci_mem_resource, 0, 1 },
-	{ 0, }
-};
diff --git a/arch/sh/drivers/pci/pci-dreamcast.c b/arch/sh/drivers/pci/pci-dreamcast.c
new file mode 100644
index 0000000000000..0897be5053d49
--- /dev/null
+++ b/arch/sh/drivers/pci/pci-dreamcast.c
@@ -0,0 +1,105 @@
+/*
+ * PCI support for the Sega Dreamcast
+ *
+ * Copyright (C) 2001, 2002  M. R. Brown
+ * Copyright (C) 2002, 2003  Paul Mundt
+ *
+ * This file originally bore the message (with enclosed-$):
+ *	Id: pci.c,v 1.3 2003/05/04 19:29:46 lethal Exp
+ *	Dreamcast PCI: Supports SEGA Broadband Adaptor only.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/param.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/pci.h>
+#include <linux/module.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <mach/pci.h>
+
+static struct resource gapspci_io_resource = {
+	.name	= "GAPSPCI IO",
+	.start	= GAPSPCI_BBA_CONFIG,
+	.end	= GAPSPCI_BBA_CONFIG + GAPSPCI_BBA_CONFIG_SIZE - 1,
+	.flags	= IORESOURCE_IO,
+};
+
+static struct resource gapspci_mem_resource = {
+	.name	= "GAPSPCI mem",
+	.start	= GAPSPCI_DMA_BASE,
+	.end	= GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1,
+	.flags	= IORESOURCE_MEM,
+};
+
+/*
+ * gapspci init
+ */
+
+static int __init gapspci_init(struct pci_channel *chan)
+{
+	char idbuf[16];
+	int i;
+
+	/*
+	 * FIXME: All of this wants documenting to some degree,
+	 * even some basic register definitions would be nice.
+	 *
+	 * I haven't seen anything this ugly since.. maple.
+	 */
+
+	for (i=0; i<16; i++)
+		idbuf[i] = inb(GAPSPCI_REGS+i);
+
+	if (strncmp(idbuf, "GAPSPCI_BRIDGE_2", 16))
+		return -ENODEV;
+
+	outl(0x5a14a501, GAPSPCI_REGS+0x18);
+
+	for (i=0; i<1000000; i++)
+		cpu_relax();
+
+	if (inl(GAPSPCI_REGS+0x18) != 1)
+		return -EINVAL;
+
+	outl(0x01000000, GAPSPCI_REGS+0x20);
+	outl(0x01000000, GAPSPCI_REGS+0x24);
+
+	outl(GAPSPCI_DMA_BASE, GAPSPCI_REGS+0x28);
+	outl(GAPSPCI_DMA_BASE+GAPSPCI_DMA_SIZE, GAPSPCI_REGS+0x2c);
+
+	outl(1, GAPSPCI_REGS+0x14);
+	outl(1, GAPSPCI_REGS+0x34);
+
+	/* Setting Broadband Adapter */
+	outw(0xf900, GAPSPCI_BBA_CONFIG+0x06);
+	outl(0x00000000, GAPSPCI_BBA_CONFIG+0x30);
+	outb(0x00, GAPSPCI_BBA_CONFIG+0x3c);
+	outb(0xf0, GAPSPCI_BBA_CONFIG+0x0d);
+	outw(0x0006, GAPSPCI_BBA_CONFIG+0x04);
+	outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10);
+	outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14);
+
+	return 0;
+}
+
+struct pci_channel board_pci_channels[] = {
+	{
+		.init		= gapspci_init,
+		.pci_ops	= &gapspci_pci_ops,
+		.io_resource	= &gapspci_io_resource,
+		.mem_resource	= &gapspci_mem_resource,
+		.first_devfn	= 0,
+		.last_devfn	= 1,
+	}, {
+		.init		= NULL,
+	}
+};
diff --git a/arch/sh/include/mach-dreamcast/mach/pci.h b/arch/sh/include/mach-dreamcast/mach/pci.h
index 75fc9009e0921..0314d975e6269 100644
--- a/arch/sh/include/mach-dreamcast/mach/pci.h
+++ b/arch/sh/include/mach-dreamcast/mach/pci.h
@@ -21,5 +21,7 @@
 
 #define	GAPSPCI_IRQ		HW_EVENT_EXTERNAL
 
+extern struct pci_ops gapspci_pci_ops;
+
 #endif /* __ASM_SH_DREAMCAST_PCI_H */
 
-- 
GitLab


From 48e4237d96fdcb1237b63bcddb37771f97452eec Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 20:40:48 +0900
Subject: [PATCH 0527/2198] sh: pci: Convert the SH-5 code over to the new
 interface.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Kconfig      |  2 +-
 arch/sh/drivers/pci/ops-cayman.c | 11 ------
 arch/sh/drivers/pci/pci-sh5.c    | 58 ++++++++++++--------------------
 arch/sh/drivers/pci/pci-sh5.h    |  4 ---
 4 files changed, 23 insertions(+), 52 deletions(-)

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index efe8cf965a9bb..f9fb1d1b623ee 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -23,7 +23,7 @@ config PCI_NEW
 	bool
 	depends on PCI
 	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
-		     CPU_SUBTYPE_SH7785
+		     CPU_SUBTYPE_SH7785 || CPU_SH5
 
 # This is also board-specific
 config PCI_AUTO
diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c
index cbaaf0234b72c..b68b61d22c6cd 100644
--- a/arch/sh/drivers/pci/ops-cayman.c
+++ b/arch/sh/drivers/pci/ops-cayman.c
@@ -75,14 +75,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin)
 
 	return result;
 }
-
-struct pci_channel board_pci_channels[] = {
-	{ sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh5pci_init(__pa(memory_start),
-			   __pa(memory_end) - __pa(memory_start));
-}
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index 7750da2762841..cf431852213c0 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -27,12 +27,6 @@
 unsigned long pcicr_virt;
 unsigned long PCI_IO_AREA;
 
-int __init sh5_pci_init(struct pci_channel *chan)
-{
-	pr_debug("PCI: Starting intialization.\n");
-	return pcibios_init_platform();
-}
-
 /* Rounds a number UP to the nearest power of two. Used for
  * sizing the PCI window.
  */
@@ -95,8 +89,21 @@ static irqreturn_t pcish5_serr_irq(int irq, void *dev_id)
 	return IRQ_NONE;
 }
 
-int __init sh5pci_init(unsigned long memStart, unsigned long memSize)
+static struct resource sh5_io_resource = { /* place holder */ };
+static struct resource sh5_mem_resource = { /* place holder */ };
+
+static struct pci_channel sh5pci_controller = {
+	.pci_ops		= &sh5_pci_ops,
+	.mem_resource		= &sh5_mem_resource,
+	.mem_offset		= 0x00000000,
+	.io_resource		= &sh5_io_resource,
+	.io_offset		= 0x00000000,
+};
+
+static int __init sh5pci_init(void)
 {
+	unsigned long memStart = __pa(memory_start);
+	unsigned long memSize = __pa(memory_end) - memStart;
 	u32 lsr0;
 	u32 uval;
 
@@ -203,35 +210,14 @@ int __init sh5pci_init(unsigned long memStart, unsigned long memSize)
         SH5PCI_WRITE(AINTM, ~0);
         SH5PCI_WRITE(PINTM, ~0);
 
-	return 0;
-}
+	sh5_io_resource.start = PCI_IO_AREA;
+	sh5_io_resource.end = PCI_IO_AREA + 0x10000;
 
-#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
-#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
+	sh5_mem_resource.start = memStart;
+	sh5_mem_resource.end = memStart + memSize;
 
-void __devinit pcibios_fixup_bus(struct pci_bus *bus)
-{
-	struct pci_dev *dev = bus->self;
-	int i;
-
-	if (dev) {
-		for (i= 0; i < 3; i++) {
-			bus->resource[i] =
-				&dev->resource[PCI_BRIDGE_RESOURCES+i];
-			bus->resource[i]->name = bus->name;
-		}
-		bus->resource[0]->flags |= IORESOURCE_IO;
-		bus->resource[1]->flags |= IORESOURCE_MEM;
-
-		/* For now, propagate host limits to the bus;
-		 * we'll adjust them later. */
-		bus->resource[0]->end = 64*1024 - 1 ;
-		bus->resource[1]->end = xPCIBIOS_MIN_MEM+(256*1024*1024)-1;
-		bus->resource[0]->start = xPCIBIOS_MIN_IO;
-		bus->resource[1]->start = xPCIBIOS_MIN_MEM;
-
-		/* Turn off downstream PF memory address range by default */
-		bus->resource[2]->start = 1024*1024;
-		bus->resource[2]->end = bus->resource[2]->start - 1;
-	}
+	register_pci_controller(&sh5pci_controller);
+
+	return 0;
 }
+arch_initcall(sh5pci_init);
diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h
index af09f384c7d27..f277628221f3c 100644
--- a/arch/sh/drivers/pci/pci-sh5.h
+++ b/arch/sh/drivers/pci/pci-sh5.h
@@ -107,8 +107,4 @@ extern unsigned long pcicr_virt;
 
 extern struct pci_ops sh5_pci_ops;
 
-/* arch/sh/drivers/pci/pci-sh5.c */
-int sh5_pci_init(struct pci_channel *chan);
-int sh5pci_init(unsigned long memStart, unsigned long memSize);
-
 #endif /* __PCI_SH5_H */
-- 
GitLab


From a5b08047129f214af1899bd9088605c7adc21ed5 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 20:41:45 +0900
Subject: [PATCH 0528/2198] sh: pci: Rename ops-cayman -> fixups-cayman.

Now that ops-cayman.c only contains IRQ routing fixups, rename it.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile                          | 2 +-
 arch/sh/drivers/pci/{ops-cayman.c => fixups-cayman.c} | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename arch/sh/drivers/pci/{ops-cayman.c => fixups-cayman.c} (100%)

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index 2160a06b6c11a..cb2190c3e36b4 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -24,4 +24,4 @@ obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
 obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-rts7751r2d.o
-obj-$(CONFIG_SH_CAYMAN)			+= ops-cayman.o
+obj-$(CONFIG_SH_CAYMAN)			+= fixups-cayman.o
diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/fixups-cayman.c
similarity index 100%
rename from arch/sh/drivers/pci/ops-cayman.c
rename to arch/sh/drivers/pci/fixups-cayman.c
-- 
GitLab


From 757e3c16f8bafa2a470aebf9b04671c5d4d18f49 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:11:07 +0900
Subject: [PATCH 0529/2198] sh: pci: Rewrite SH7751 PCI support to follow
 SH7780.

This follows the similar sort of scheme that the refactored SH7780 code
uses, using a 64MB CS3 mapping to handle the window0 case, and simply
discarding window1. This vastly simplifies the code, and allows most of
the board-specific setup to go die.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Kconfig          |   3 +-
 arch/sh/drivers/pci/Makefile         |   2 +-
 arch/sh/drivers/pci/ops-landisk.c    |  31 ------
 arch/sh/drivers/pci/ops-lboxre2.c    |  33 -------
 arch/sh/drivers/pci/ops-rts7751r2d.c |  34 -------
 arch/sh/drivers/pci/ops-sh03.c       |  45 ---------
 arch/sh/drivers/pci/ops-snapgear.c   |  49 ----------
 arch/sh/drivers/pci/ops-titan.c      |  36 -------
 arch/sh/drivers/pci/pci-sh7751.c     | 135 ++++++++++++++-------------
 arch/sh/drivers/pci/pci-sh7751.h     |  13 +--
 10 files changed, 76 insertions(+), 305 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/ops-sh03.c

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index f9fb1d1b623ee..5aaee3c707b08 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -23,7 +23,8 @@ config PCI_NEW
 	bool
 	depends on PCI
 	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
-		     CPU_SUBTYPE_SH7785 || CPU_SH5
+		     CPU_SUBTYPE_SH7785 || CPU_SH5 || \
+		     CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 
 # This is also board-specific
 config PCI_AUTO
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index cb2190c3e36b4..b8667de19ece6 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -16,7 +16,7 @@ obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
-obj-$(CONFIG_SH_SH03)			+= ops-sh03.o fixups-sh03.o
+obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c
index 178b77828aa98..bb1a6bb5149ed 100644
--- a/arch/sh/drivers/pci/ops-landisk.c
+++ b/arch/sh/drivers/pci/ops-landisk.c
@@ -15,37 +15,6 @@
 #include <linux/pci.h>
 #include "pci-sh4.h"
 
-static struct resource sh7751_io_resource = {
-	.name = "SH7751 IO",
-	.start = SH7751_PCI_IO_BASE,
-	.end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags = IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name = "SH7751 mem",
-	.start = SH7751_PCI_MEMORY_BASE,
-	.end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags = IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff},
-	{NULL, NULL, NULL, 0, 0},
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0 = {
-		.base	= SH7751_CS3_BASE_ADDR,
-		.size	= (64 << 20),	/* 64MB */
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
-
 int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	/*
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c
index 91cabd84f028e..6db2c209737f7 100644
--- a/arch/sh/drivers/pci/ops-lboxre2.c
+++ b/arch/sh/drivers/pci/ops-lboxre2.c
@@ -21,36 +21,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return lboxre2_irq_tab[slot];
 }
-
-static struct resource sh7751_io_resource = {
-	.name	= "SH7751_IO",
-	.start	= SH7751_PCI_IO_BASE ,
-	.end	= SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name	= "SH7751_mem",
-	.start	= SH7751_PCI_MEMORY_BASE,
-	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7751_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
index 96b916c0d6c5b..d950b8ab25f1e 100644
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ b/arch/sh/drivers/pci/ops-rts7751r2d.c
@@ -29,37 +29,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	return rts7751r2d_irq_tab[slot];
 }
-
-static struct resource sh7751_io_resource = {
-	.name	= "SH7751_IO",
-	.start	= 0x4000,
-	.end	= SH7751_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name	= "SH7751_mem",
-	.start	= SH7751_PCI_MEMORY_BASE,
-	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7751_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS3_BASE_ADDR,
-		.size	= 0x04000000,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
-
diff --git a/arch/sh/drivers/pci/ops-sh03.c b/arch/sh/drivers/pci/ops-sh03.c
deleted file mode 100644
index 0218135f0bb8e..0000000000000
--- a/arch/sh/drivers/pci/ops-sh03.c
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-sh03.c
- *
- * PCI initialization for the Interface CTP/PCI-SH03 board
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-#include <asm/io.h>
-#include "pci-sh7751.h"
-
-/*
- * Description:  This function sets up and initializes the pcic, sets
- * up the BARS, maps the DRAM into the address space etc, etc.
- */
-int __init pcibios_init_platform(void)
-{
-	__set_io_port_base(SH7751_PCI_IO_BASE);
-	return 1;
-}
-
-static struct resource sh7751_io_resource = {
-	.name   = "SH03 IO",
-	.start  = SH7751_PCI_IO_BASE,
-	.end    = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags  = IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name   = "SH03 mem",
-	.start  = SH7751_PCI_MEMORY_BASE,
-	.end    = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags  = IORESOURCE_MEM
-};
-
-extern struct pci_ops sh4_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c
index b64f2b91be8e6..5a39ecc1adb80 100644
--- a/arch/sh/drivers/pci/ops-snapgear.c
+++ b/arch/sh/drivers/pci/ops-snapgear.c
@@ -18,55 +18,6 @@
 #include <linux/pci.h>
 #include "pci-sh4.h"
 
-#define SNAPGEAR_PCI_IO		0x4000
-#define SNAPGEAR_PCI_MEM	0xfd000000
-
-/* PCI: default LOCAL memory window sizes (seen from PCI bus) */
-#define SNAPGEAR_LSR0_SIZE    (64*(1<<20)) //64MB
-#define SNAPGEAR_LSR1_SIZE    (64*(1<<20)) //64MB
-
-static struct resource sh7751_io_resource = {
-	.name		= "SH7751 IO",
-	.start		= SNAPGEAR_PCI_IO,
-	.end		= SNAPGEAR_PCI_IO + (64*1024) - 1, /* 64KiB I/O */
-	.flags		= IORESOURCE_IO,
-};
-
-static struct resource sh7751_mem_resource = {
-	.name		= "SH7751 mem",
-	.start		= SNAPGEAR_PCI_MEM,
-	.end		= SNAPGEAR_PCI_MEM + (64*1024*1024) - 1, /* 64MiB mem */
-	.flags		= IORESOURCE_MEM,
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ 0, }
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SNAPGEAR_LSR0_SIZE,
-	},
-
-	.window1	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SNAPGEAR_LSR1_SIZE,
-	},
-};
-
-/*
- * Initialize the SnapGear PCI interface
- * Setup hardware to be Central Funtion
- * Copy the BSR regs to the PCI interface
- * Setup PCI windows into local RAM
- */
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
-
 int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 {
 	int irq = -1;
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c
index e45bb62bf8ce0..3a79fa8254a63 100644
--- a/arch/sh/drivers/pci/ops-titan.c
+++ b/arch/sh/drivers/pci/ops-titan.c
@@ -36,39 +36,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
 
 	return irq;
 }
-
-static struct resource sh7751_io_resource = {
-	.name	= "SH7751_IO",
-	.start	= SH7751_PCI_IO_BASE,
-	.end	= SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags	= IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name	= "SH7751_mem",
-	.start	= SH7751_PCI_MEMORY_BASE,
-	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags	= IORESOURCE_MEM
-};
-
-struct pci_channel board_pci_channels[] = {
-	{ sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
-static struct sh4_pci_address_map sh7751_pci_map = {
-	.window0	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SH7751_MEM_REGION_SIZE*2,	/* cs2 and cs3 */
-	},
-
-	.window1	= {
-		.base	= SH7751_CS2_BASE_ADDR,
-		.size	= SH7751_MEM_REGION_SIZE*2,
-	},
-};
-
-int __init pcibios_init_platform(void)
-{
-	return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map);
-}
diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index 4c08fd7f665da..c4fa0bb139761 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -1,78 +1,41 @@
 /*
- *	Low-Level PCI Support for the SH7751
+ * Low-Level PCI Support for the SH7751
  *
- *  Dustin McIntire (dustin@sensoria.com)
- *	Derived from arch/i386/kernel/pci-*.c which bore the message:
- *	(c) 1999--2000 Martin Mares <mj@ucw.cz>
+ *  Copyright (C) 2003 - 2009  Paul Mundt
+ *  Copyright (C) 2001  Dustin McIntire
  *
- *  Ported to the new API by Paul Mundt <lethal@linux-sh.org>
- *  With cleanup by Paul van Gool <pvangool@mimotech.com>
- *
- *  May be copied or modified under the terms of the GNU General Public
- *  License.  See linux/COPYING for more information.
+ *  With cleanup by Paul van Gool <pvangool@mimotech.com>, 2003.
  *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
  */
-#undef DEBUG
-
 #include <linux/init.h>
 #include <linux/pci.h>
 #include <linux/types.h>
 #include <linux/errno.h>
-#include <linux/delay.h>
+#include <linux/io.h>
 #include "pci-sh4.h"
 #include <asm/addrspace.h>
-#include <asm/io.h>
-
-/*
- * Initialization. Try all known PCI access methods. Note that we support
- * using both PCI BIOS and direct access: in such cases, we use I/O ports
- * to access config space.
- *
- * Note that the platform specific initialization (BSC registers, and memory
- * space mapping) will be called via the platform defined function
- * pcibios_init_platform().
- */
-int __init sh7751_pci_init(struct pci_channel *chan)
-{
-	unsigned int id;
-	int ret;
-
-	pr_debug("PCI: Starting intialization.\n");
-
-	chan->reg_base = 0xfe200000;
-
-	/* check for SH7751/SH7751R hardware */
-	id = pci_read_reg(chan, SH7751_PCICONF0);
-	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
-	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
-		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
-		return -ENODEV;
-	}
-
-	if ((ret = sh4_pci_check_direct(chan)) != 0)
-		return ret;
-
-	return pcibios_init_platform();
-}
 
 static int __init __area_sdram_check(struct pci_channel *chan,
 				     unsigned int area)
 {
-	u32 word;
+	unsigned long word;
 
-	word = ctrl_inl(SH7751_BCR1);
+	word = __raw_readl(SH7751_BCR1);
 	/* check BCR for SDRAM in area */
 	if (((word >> area) & 1) == 0) {
-		printk("PCI: Area %d is not configured for SDRAM. BCR1=0x%x\n",
+		printk("PCI: Area %d is not configured for SDRAM. BCR1=0x%lx\n",
 		       area, word);
 		return 0;
 	}
 	pci_write_reg(chan, word, SH4_PCIBCR1);
 
-	word = (u16)ctrl_inw(SH7751_BCR2);
+	word = __raw_readw(SH7751_BCR2);
 	/* check BCR2 for 32bit SDRAM interface*/
 	if (((word >> (area << 1)) & 0x3) != 0x3) {
-		printk("PCI: Area %d is not 32 bit SDRAM. BCR2=0x%x\n",
+		printk("PCI: Area %d is not 32 bit SDRAM. BCR2=0x%lx\n",
 		       area, word);
 		return 0;
 	}
@@ -81,11 +44,56 @@ static int __init __area_sdram_check(struct pci_channel *chan,
 	return 1;
 }
 
-int __init sh7751_pcic_init(struct pci_channel *chan,
-			    struct sh4_pci_address_map *map)
+static struct resource sh7751_io_resource = {
+	.name	= "SH7751_IO",
+	.start	= SH7751_PCI_IO_BASE,
+	.end	= SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
+	.flags	= IORESOURCE_IO
+};
+
+static struct resource sh7751_mem_resource = {
+	.name	= "SH7785_mem",
+	.start	= SH7751_PCI_MEMORY_BASE,
+	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
+	.flags	= IORESOURCE_MEM
+};
+
+static struct pci_channel sh7751_pci_controller = {
+	.pci_ops	= &sh4_pci_ops,
+	.mem_resource	= &sh7751_mem_resource,
+	.mem_offset	= 0x00000000,
+	.io_resource	= &sh7751_io_resource,
+	.io_offset	= 0x00000000,
+};
+
+static struct sh4_pci_address_map sh7751_pci_map = {
+	.window0	= {
+		.base	= SH7751_CS3_BASE_ADDR,
+		.size	= 0x04000000,
+	},
+};
+
+static int __init sh7751_pci_init(void)
 {
-	u32 reg;
-	u32 word;
+	struct pci_channel *chan = &sh7751_pci_controller;
+	unsigned int id;
+	u32 word, reg;
+	int ret;
+
+	printk(KERN_NOTICE "PCI: Starting intialization.\n");
+
+	chan->reg_base = 0xfe200000;
+
+	/* check for SH7751/SH7751R hardware */
+	id = pci_read_reg(chan, SH7751_PCICONF0);
+	if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) &&
+	    id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) {
+		pr_debug("PCI: This is not an SH7751(R) (%x)\n", id);
+		return -ENODEV;
+	}
+
+	if ((ret = sh4_pci_check_direct(chan)) != 0)
+		return ret;
 
 	/* Set the BCR's to enable PCI access */
 	reg = ctrl_inl(SH7751_BCR1);
@@ -112,21 +120,13 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 
 	/* Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
-	 * Window0 = map->window0.size @ non-cached area base = SDRAM
-	 * Window1 = map->window1.size @ cached area base = SDRAM
 	 */
-	word = map->window0.size - 1;
+	word = sh7751_pci_map.window0.size - 1;
 	pci_write_reg(chan, word, SH4_PCILSR0);
-	word = map->window1.size - 1;
-	pci_write_reg(chan, word, SH4_PCILSR1);
 	/* Set the values on window 0 PCI config registers */
-	word = P2SEGADDR(map->window0.base);
+	word = P2SEGADDR(sh7751_pci_map.window0.base);
 	pci_write_reg(chan, word, SH4_PCILAR0);
 	pci_write_reg(chan, word, SH7751_PCICONF5);
-	/* Set the values on window 1 PCI config registers */
-	word =  PHYSADDR(map->window1.base);
-	pci_write_reg(chan, word, SH4_PCILAR1);
-	pci_write_reg(chan, word, SH7751_PCICONF6);
 
 	/* Set the local 16MB PCI memory space window to
 	 * the lowest PCI mapped address
@@ -144,7 +144,7 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	/* Set PCI WCRx, BCRx's, copy from BSC locations */
 
 	/* check BCR for SDRAM in specified area */
-	switch (map->window0.base) {
+	switch (sh7751_pci_map.window0.base) {
 	case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(chan, 0); break;
 	case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(chan, 1); break;
 	case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(chan, 2); break;
@@ -179,5 +179,10 @@ int __init sh7751_pcic_init(struct pci_channel *chan,
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
 	pci_write_reg(chan, word, SH4_PCICR);
 
+	__set_io_port_base(SH7751_PCI_IO_BASE);
+
+	register_pci_controller(chan);
+
 	return 0;
 }
+arch_initcall(sh7751_pci_init);
diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h
index c390dd2f5e1a3..4983a4d203559 100644
--- a/arch/sh/drivers/pci/pci-sh7751.h
+++ b/arch/sh/drivers/pci/pci-sh7751.h
@@ -57,7 +57,7 @@
   #define SH7751_PCICONF2_SCC        0x00FF0000  /* Sub-Class Code */
   #define SH7751_PCICONF2_RLPI       0x0000FF00  /* Programming Interface */
   #define SH7751_PCICONF2_REV        0x000000FF  /* Revision ID */
-#define SH7751_PCICONF3            0xC           /* PCI Config Reg 3 */ 
+#define SH7751_PCICONF3            0xC           /* PCI Config Reg 3 */
   #define SH7751_PCICONF3_BIST7      0x80000000  /* Bist Supported */
   #define SH7751_PCICONF3_BIST6      0x40000000  /* Bist Executing */
   #define SH7751_PCICONF3_BIST3_0    0x0F000000  /* Bist Passed */
@@ -72,12 +72,12 @@
   #define SH7751_PCICONF5_BASE       0xFFFFFFF0  /* Mem Space Base Addr */
   #define SH7751_PCICONF5_LAP        0x00000008  /* Prefetch Enabled */
   #define SH7751_PCICONF5_LAT        0x00000006  /* Local Memory type */
-  #define SH7751_PCICONF5_ASI        0x00000001  /* Address Space Type */  
+  #define SH7751_PCICONF5_ASI        0x00000001  /* Address Space Type */
 #define SH7751_PCICONF6            0x18          /* PCI Config Reg 6 */
   #define SH7751_PCICONF6_BASE       0xFFFFFFF0  /* Mem Space Base Addr */
   #define SH7751_PCICONF6_LAP        0x00000008  /* Prefetch Enabled */
   #define SH7751_PCICONF6_LAT        0x00000006  /* Local Memory type */
-  #define SH7751_PCICONF6_ASI        0x00000001  /* Address Space Type */  
+  #define SH7751_PCICONF6_ASI        0x00000001  /* Address Space Type */
 /* PCICONF7 - PCICONF10 are undefined */
 #define SH7751_PCICONF11           0x2C          /* PCI Config Reg 11 */
   #define SH7751_PCICONF11_SSID      0xFFFF0000  /* Subsystem ID */
@@ -126,11 +126,4 @@
 #define SH7751_CS5_BASE_ADDR       (SH7751_CS4_BASE_ADDR + SH7751_MEM_REGION_SIZE)
 #define SH7751_CS6_BASE_ADDR       (SH7751_CS5_BASE_ADDR + SH7751_MEM_REGION_SIZE)
 
-struct sh4_pci_address_map;
-
-/* arch/sh/drivers/pci/pci-sh7751.c */
-int sh7751_pci_init(struct pci_channel *chan);
-int sh7751_pcic_init(struct pci_channel *chan,
-		     struct sh4_pci_address_map *map);
-
 #endif /* _PCI_SH7751_H_ */
-- 
GitLab


From 84972ec0c206b73ffb74e5114b574186ce6166b8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:17:10 +0900
Subject: [PATCH 0530/2198] sh: pci: Rename SH7751 platform ops files to
 fixups.

None of these contain pci_ops, only IRQ routing bits, rename them
accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile                              | 8 ++++----
 arch/sh/drivers/pci/{ops-landisk.c => fixups-landisk.c}   | 0
 arch/sh/drivers/pci/{ops-lboxre2.c => fixups-lboxre2.c}   | 0
 arch/sh/drivers/pci/{ops-snapgear.c => fixups-snapgear.c} | 0
 arch/sh/drivers/pci/{ops-titan.c => fixups-titan.c}       | 0
 5 files changed, 4 insertions(+), 4 deletions(-)
 rename arch/sh/drivers/pci/{ops-landisk.c => fixups-landisk.c} (100%)
 rename arch/sh/drivers/pci/{ops-lboxre2.c => fixups-lboxre2.c} (100%)
 rename arch/sh/drivers/pci/{ops-snapgear.c => fixups-snapgear.c} (100%)
 rename arch/sh/drivers/pci/{ops-titan.c => fixups-titan.c} (100%)

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index b8667de19ece6..abd931c14965c 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -14,14 +14,14 @@ obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 
 obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
-obj-$(CONFIG_SH_SECUREEDGE5410)		+= ops-snapgear.o
+obj-$(CONFIG_SH_SECUREEDGE5410)		+= fixups-snapgear.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
-obj-$(CONFIG_SH_TITAN)			+= ops-titan.o
-obj-$(CONFIG_SH_LANDISK)		+= ops-landisk.o
-obj-$(CONFIG_SH_LBOX_RE2)		+= ops-lboxre2.o fixups-rts7751r2d.o
+obj-$(CONFIG_SH_TITAN)			+= fixups-titan.o
+obj-$(CONFIG_SH_LANDISK)		+= fixups-landisk.o
+obj-$(CONFIG_SH_LBOX_RE2)		+= fixups-lboxre2.o fixups-rts7751r2d.o
 obj-$(CONFIG_SH_CAYMAN)			+= fixups-cayman.o
diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/fixups-landisk.c
similarity index 100%
rename from arch/sh/drivers/pci/ops-landisk.c
rename to arch/sh/drivers/pci/fixups-landisk.c
diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
similarity index 100%
rename from arch/sh/drivers/pci/ops-lboxre2.c
rename to arch/sh/drivers/pci/fixups-lboxre2.c
diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/fixups-snapgear.c
similarity index 100%
rename from arch/sh/drivers/pci/ops-snapgear.c
rename to arch/sh/drivers/pci/fixups-snapgear.c
diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/fixups-titan.c
similarity index 100%
rename from arch/sh/drivers/pci/ops-titan.c
rename to arch/sh/drivers/pci/fixups-titan.c
-- 
GitLab


From 37c8ac361efdbae969eff1a603bc39412d3e873f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:27:15 +0900
Subject: [PATCH 0531/2198] sh: pci: Consolidate lboxre2 and r2d IRQ fixups.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile            |  4 ++--
 arch/sh/drivers/pci/fixups-lboxre2.c    | 23 ------------------
 arch/sh/drivers/pci/fixups-rts7751r2d.c | 25 +++++++++++++++++++-
 arch/sh/drivers/pci/ops-rts7751r2d.c    | 31 -------------------------
 4 files changed, 26 insertions(+), 57 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/fixups-lboxre2.c
 delete mode 100644 arch/sh/drivers/pci/ops-rts7751r2d.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index abd931c14965c..fbebd73b22f60 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -15,7 +15,7 @@ obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= fixups-snapgear.o
-obj-$(CONFIG_SH_RTS7751R2D)		+= ops-rts7751r2d.o fixups-rts7751r2d.o
+obj-$(CONFIG_SH_RTS7751R2D)		+= fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
 obj-$(CONFIG_SH_SH7785LCR)		+= fixups-r7780rp.o
@@ -23,5 +23,5 @@ obj-$(CONFIG_SH_SDK7780)		+= fixups-sdk7780.o
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= fixups-sdk7780.o
 obj-$(CONFIG_SH_TITAN)			+= fixups-titan.o
 obj-$(CONFIG_SH_LANDISK)		+= fixups-landisk.o
-obj-$(CONFIG_SH_LBOX_RE2)		+= fixups-lboxre2.o fixups-rts7751r2d.o
+obj-$(CONFIG_SH_LBOX_RE2)		+= fixups-rts7751r2d.o
 obj-$(CONFIG_SH_CAYMAN)			+= fixups-cayman.o
diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c
deleted file mode 100644
index 6db2c209737f7..0000000000000
--- a/arch/sh/drivers/pci/fixups-lboxre2.c
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-lboxre2.c
- *
- * Copyright (C) 2007 Nobuhiro Iwamatsu
- *
- * PCI initialization for the NTT COMWARE L-BOX RE2
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <mach/lboxre2.h>
-#include "pci-sh4.h"
-
-static char lboxre2_irq_tab[] __initdata = {
-	IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return lboxre2_irq_tab[slot];
-}
diff --git a/arch/sh/drivers/pci/fixups-rts7751r2d.c b/arch/sh/drivers/pci/fixups-rts7751r2d.c
index 38852334d4791..052b354236dcf 100644
--- a/arch/sh/drivers/pci/fixups-rts7751r2d.c
+++ b/arch/sh/drivers/pci/fixups-rts7751r2d.c
@@ -1,21 +1,44 @@
 /*
  * arch/sh/drivers/pci/fixups-rts7751r2d.c
  *
- * RTS7751R2D PCI fixups
+ * RTS7751R2D / LBOXRE2 PCI fixups
  *
  * Copyright (C) 2003  Lineo uSolutions, Inc.
  * Copyright (C) 2004  Paul Mundt
+ * Copyright (C) 2007  Nobuhiro Iwamatsu
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/pci.h>
+#include <mach/lboxre2.h>
+#include <mach/r2d.h>
 #include "pci-sh4.h"
+#include <asm/machtypes.h>
 
 #define PCIMCR_MRSET_OFF	0xBFFFFFFF
 #define PCIMCR_RFSH_OFF		0xFFFFFFFB
 
+static u8 rts7751r2d_irq_tab[] __initdata = {
+	IRQ_PCI_INTA,
+	IRQ_PCI_INTB,
+	IRQ_PCI_INTC,
+	IRQ_PCI_INTD,
+};
+
+static char lboxre2_irq_tab[] __initdata = {
+	IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD,
+};
+
+int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
+{
+	if (mach_is_lboxre2())
+		return lboxre2_irq_tab[slot];
+	else
+		return rts7751r2d_irq_tab[slot];
+}
+
 int pci_fixup_pcic(struct pci_channel *chan)
 {
 	unsigned long bcr1, mcr;
diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c
deleted file mode 100644
index d950b8ab25f1e..0000000000000
--- a/arch/sh/drivers/pci/ops-rts7751r2d.c
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * linux/arch/sh/drivers/pci/ops-rts7751r2d.c
- *
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Renesas SH7751R RTS7751R2D board
- */
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/io.h>
-#include <mach/r2d.h>
-#include "pci-sh4.h"
-
-static u8 rts7751r2d_irq_tab[] __initdata = {
-	IRQ_PCI_INTA,
-	IRQ_PCI_INTB,
-	IRQ_PCI_INTC,
-	IRQ_PCI_INTD,
-};
-
-int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin)
-{
-	return rts7751r2d_irq_tab[slot];
-}
-- 
GitLab


From 951a681bda844491de8699b3bdc6c3899cbd4c9f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:34:36 +0900
Subject: [PATCH 0532/2198] sh: pci: Convert dreamcast to new-style interface.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Kconfig            |  5 +----
 arch/sh/drivers/pci/fixups-dreamcast.c |  2 +-
 arch/sh/drivers/pci/pci-dreamcast.c    | 27 ++++++++++++--------------
 3 files changed, 14 insertions(+), 20 deletions(-)

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index 5aaee3c707b08..1d53496b14989 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -20,11 +20,8 @@ config SH_PCIDMA_NONCOHERENT
 
 # Temporary config option for transitioning off of PCI_AUTO
 config PCI_NEW
-	bool
+	def_bool y
 	depends on PCI
-	default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \
-		     CPU_SUBTYPE_SH7785 || CPU_SH5 || \
-		     CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 
 # This is also board-specific
 config PCI_AUTO
diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c
index 48c6381fffaaf..ed7f489936f1e 100644
--- a/arch/sh/drivers/pci/fixups-dreamcast.c
+++ b/arch/sh/drivers/pci/fixups-dreamcast.c
@@ -30,7 +30,7 @@
 
 static void __init gapspci_fixup_resources(struct pci_dev *dev)
 {
-	struct pci_channel *p = board_pci_channels;
+	struct pci_channel *p = dev->sysdata;
 
 	printk(KERN_NOTICE "PCI: Fixing up device %s\n", pci_name(dev));
 
diff --git a/arch/sh/drivers/pci/pci-dreamcast.c b/arch/sh/drivers/pci/pci-dreamcast.c
index 0897be5053d49..210f9d4af1411 100644
--- a/arch/sh/drivers/pci/pci-dreamcast.c
+++ b/arch/sh/drivers/pci/pci-dreamcast.c
@@ -21,7 +21,6 @@
 #include <linux/irq.h>
 #include <linux/pci.h>
 #include <linux/module.h>
-
 #include <asm/io.h>
 #include <asm/irq.h>
 #include <mach/pci.h>
@@ -40,11 +39,19 @@ static struct resource gapspci_mem_resource = {
 	.flags	= IORESOURCE_MEM,
 };
 
+static struct pci_channel dreamcast_pci_controller = {
+	.pci_ops	= &gapspci_pci_ops,
+	.io_resource	= &gapspci_io_resource,
+	.io_offset	= 0x00000000,
+	.mem_resource	= &gapspci_mem_resource,
+	.mem_offset	= 0x00000000,
+};
+
 /*
  * gapspci init
  */
 
-static int __init gapspci_init(struct pci_channel *chan)
+static int __init gapspci_init(void)
 {
 	char idbuf[16];
 	int i;
@@ -88,18 +95,8 @@ static int __init gapspci_init(struct pci_channel *chan)
 	outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10);
 	outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14);
 
+	register_pci_controller(&dreamcast_pci_controller);
+
 	return 0;
 }
-
-struct pci_channel board_pci_channels[] = {
-	{
-		.init		= gapspci_init,
-		.pci_ops	= &gapspci_pci_ops,
-		.io_resource	= &gapspci_io_resource,
-		.mem_resource	= &gapspci_mem_resource,
-		.first_devfn	= 0,
-		.last_devfn	= 1,
-	}, {
-		.init		= NULL,
-	}
-};
+arch_initcall(gapspci_init);
-- 
GitLab


From 2d5efc190eb415dbff79ffab4f8ea731ab0288a9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:42:59 +0900
Subject: [PATCH 0533/2198] sh: pci: Move the se7751 fixups in to
 arch/sh/drivers/pci/.

The se7751 was still doing the PCI fixups in its own board directory,
so we move it over to arch/sh/drivers/pci/ with the rest of the board
fixups. It has bitrotted significantly over the years, so will still
likely need a bit of work to bring back up to date.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-se/7751/Makefile |   2 -
 arch/sh/boards/mach-se/7751/pci.c    | 150 ---------------------------
 arch/sh/drivers/pci/Makefile         |   1 +
 arch/sh/drivers/pci/fixups-se7751.c  | 111 ++++++++++++++++++++
 4 files changed, 112 insertions(+), 152 deletions(-)
 delete mode 100644 arch/sh/boards/mach-se/7751/pci.c
 create mode 100644 arch/sh/drivers/pci/fixups-se7751.c

diff --git a/arch/sh/boards/mach-se/7751/Makefile b/arch/sh/boards/mach-se/7751/Makefile
index dbc29f3a9de50..e6f4341bfe6ea 100644
--- a/arch/sh/boards/mach-se/7751/Makefile
+++ b/arch/sh/boards/mach-se/7751/Makefile
@@ -3,5 +3,3 @@
 #
 
 obj-y	 := setup.o io.o irq.o
-
-obj-$(CONFIG_PCI) += pci.o
diff --git a/arch/sh/boards/mach-se/7751/pci.c b/arch/sh/boards/mach-se/7751/pci.c
deleted file mode 100644
index 9ec64a416b302..0000000000000
--- a/arch/sh/boards/mach-se/7751/pci.c
+++ /dev/null
@@ -1,150 +0,0 @@
-/*
- * linux/arch/sh/boards/se/7751/pci.c
- *
- * Author:  Ian DaSilva (idasilva@mvista.com)
- *
- * Highly leveraged from pci-bigsur.c, written by Dustin McIntire.
- *
- * May be copied or modified under the terms of the GNU General Public
- * License.  See linux/COPYING for more information.
- *
- * PCI initialization for the Hitachi SH7751 Solution Engine board (MS7751SE01)
- */
-
-#include <linux/kernel.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/delay.h>
-#include <linux/pci.h>
-
-#include <asm/io.h>
-#include "../../../drivers/pci/pci-sh7751.h"
-
-#define PCIMCR_MRSET_OFF	0xBFFFFFFF
-#define PCIMCR_RFSH_OFF		0xFFFFFFFB
-
-/*
- * Only long word accesses of the PCIC's internal local registers and the
- * configuration registers from the CPU is supported.
- */
-#define PCIC_WRITE(x,v) writel((v), PCI_REG(x))
-#define PCIC_READ(x) readl(PCI_REG(x))
-
-#define xPCIBIOS_MIN_IO		board_pci_channels->io_resource->start
-#define xPCIBIOS_MIN_MEM	board_pci_channels->mem_resource->start
-
-/*
- * Description:  This function sets up and initializes the pcic, sets
- * up the BARS, maps the DRAM into the address space etc, etc.
- */
-int __init pcibios_init_platform(void)
-{
-   unsigned long bcr1, wcr1, wcr2, wcr3, mcr;
-   unsigned short bcr2;
-
-   /*
-    * Initialize the slave bus controller on the pcic.  The values used
-    * here should not be hardcoded, but they should be taken from the bsc
-    * on the processor, to make this function as generic as possible.
-    * (i.e. Another sbc may usr different SDRAM timing settings -- in order
-    * for the pcic to work, its settings need to be exactly the same.)
-    */
-   bcr1 = (*(volatile unsigned long*)(SH7751_BCR1));
-   bcr2 = (*(volatile unsigned short*)(SH7751_BCR2));
-   wcr1 = (*(volatile unsigned long*)(SH7751_WCR1));
-   wcr2 = (*(volatile unsigned long*)(SH7751_WCR2));
-   wcr3 = (*(volatile unsigned long*)(SH7751_WCR3));
-   mcr = (*(volatile unsigned long*)(SH7751_MCR));
-
-   bcr1 = bcr1 | 0x00080000;  /* Enable Bit 19, BREQEN */
-   (*(volatile unsigned long*)(SH7751_BCR1)) = bcr1;   
-
-   bcr1 = bcr1 | 0x40080000;  /* Enable Bit 19 BREQEN, set PCIC to slave */
-   PCIC_WRITE(SH7751_PCIBCR1, bcr1);	 /* PCIC BCR1 */
-   PCIC_WRITE(SH7751_PCIBCR2, bcr2);     /* PCIC BCR2 */
-   PCIC_WRITE(SH7751_PCIWCR1, wcr1);     /* PCIC WCR1 */
-   PCIC_WRITE(SH7751_PCIWCR2, wcr2);     /* PCIC WCR2 */
-   PCIC_WRITE(SH7751_PCIWCR3, wcr3);     /* PCIC WCR3 */
-   mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
-   PCIC_WRITE(SH7751_PCIMCR, mcr);      /* PCIC MCR */
-
-
-   /* Enable all interrupts, so we know what to fix */
-   PCIC_WRITE(SH7751_PCIINTM, 0x0000c3ff);
-   PCIC_WRITE(SH7751_PCIAINTM, 0x0000380f);
-
-   /* Set up standard PCI config registers */
-   PCIC_WRITE(SH7751_PCICONF1, 	0xF39000C7); /* Bus Master, Mem & I/O access */
-   PCIC_WRITE(SH7751_PCICONF2, 	0x00000000); /* PCI Class code & Revision ID */
-   PCIC_WRITE(SH7751_PCICONF4, 	0xab000001); /* PCI I/O address (local regs) */
-   PCIC_WRITE(SH7751_PCICONF5, 	0x0c000000); /* PCI MEM address (local RAM)  */
-   PCIC_WRITE(SH7751_PCICONF6, 	0xd0000000); /* PCI MEM address (unused)     */
-   PCIC_WRITE(SH7751_PCICONF11, 0x35051054); /* PCI Subsystem ID & Vendor ID */
-   PCIC_WRITE(SH7751_PCILSR0, 0x03f00000);   /* MEM (full 64M exposed)       */
-   PCIC_WRITE(SH7751_PCILSR1, 0x00000000);   /* MEM (unused)                 */
-   PCIC_WRITE(SH7751_PCILAR0, 0x0c000000);   /* MEM (direct map from PCI)    */
-   PCIC_WRITE(SH7751_PCILAR1, 0x00000000);   /* MEM (unused)                 */
-
-   /* Now turn it on... */
-   PCIC_WRITE(SH7751_PCICR, 0xa5000001);
-
-   /*
-    * Set PCIMBR and PCIIOBR here, assuming a single window
-    * (16M MEM, 256K IO) is enough.  If a larger space is
-    * needed, the readx/writex and inx/outx functions will
-    * have to do more (e.g. setting registers for each call).
-    */
-
-   /*
-    * Set the MBR so PCI address is one-to-one with window,
-    * meaning all calls go straight through... use BUG_ON to
-    * catch erroneous assumption.
-    */
-   BUG_ON(xPCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE);
-
-   PCIC_WRITE(SH7751_PCIMBR, xPCIBIOS_MIN_MEM);
-
-   /* Set IOBR for window containing area specified in pci.h */
-   PCIC_WRITE(SH7751_PCIIOBR, (xPCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK));
-
-   /* All done, may as well say so... */
-   printk("SH7751 PCI: Finished initialization of the PCI controller\n");
-
-   return 1;
-}
-
-int __init pcibios_map_platform_irq(u8 slot, u8 pin)
-{
-        switch (slot) {
-        case 0: return 13;
-        case 1: return 13; 	/* AMD Ethernet controller */
-        case 2: return -1;
-        case 3: return -1;
-        case 4: return -1;
-        default:
-                printk("PCI: Bad IRQ mapping request for slot %d\n", slot);
-                return -1;
-        }
-}
-
-static struct resource sh7751_io_resource = {
-	.name   = "SH7751 IO",
-	.start  = SH7751_PCI_IO_BASE,
-	.end    = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1,
-	.flags  = IORESOURCE_IO
-};
-
-static struct resource sh7751_mem_resource = {
-	.name   = "SH7751 mem",
-	.start  = SH7751_PCI_MEMORY_BASE,
-	.end    = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
-	.flags  = IORESOURCE_MEM
-};
-
-extern struct pci_ops sh7751_pci_ops;
-
-struct pci_channel board_pci_channels[] = {
-	{ &sh7751_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff },
-	{ NULL, NULL, NULL, 0, 0 },
-};
-
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index fbebd73b22f60..e388a70d14634 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_CPU_SH5)			+= pci-sh5.o ops-sh5.o
 obj-$(CONFIG_SH_DREAMCAST)		+= ops-dreamcast.o fixups-dreamcast.o \
 					   pci-dreamcast.o
 obj-$(CONFIG_SH_SECUREEDGE5410)		+= fixups-snapgear.o
+obj-$(CONFIG_SH_7751_SOLUTION_ENGINE)	+= fixups-se7751.o
 obj-$(CONFIG_SH_RTS7751R2D)		+= fixups-rts7751r2d.o
 obj-$(CONFIG_SH_SH03)			+= fixups-sh03.o
 obj-$(CONFIG_SH_HIGHLANDER)		+= fixups-r7780rp.o
diff --git a/arch/sh/drivers/pci/fixups-se7751.c b/arch/sh/drivers/pci/fixups-se7751.c
new file mode 100644
index 0000000000000..475fa9f0fe2cc
--- /dev/null
+++ b/arch/sh/drivers/pci/fixups-se7751.c
@@ -0,0 +1,111 @@
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include "pci-sh4.h"
+
+int __init pcibios_map_platform_irq(u8 slot, u8 pin)
+{
+        switch (slot) {
+        case 0: return 13;
+        case 1: return 13;	/* AMD Ethernet controller */
+        case 2: return -1;
+        case 3: return -1;
+        case 4: return -1;
+        default:
+                printk("PCI: Bad IRQ mapping request for slot %d\n", slot);
+                return -1;
+        }
+}
+
+#define PCIMCR_MRSET_OFF	0xBFFFFFFF
+#define PCIMCR_RFSH_OFF		0xFFFFFFFB
+
+/*
+ * Only long word accesses of the PCIC's internal local registers and the
+ * configuration registers from the CPU is supported.
+ */
+#define PCIC_WRITE(x,v) writel((v), PCI_REG(x))
+#define PCIC_READ(x) readl(PCI_REG(x))
+
+/*
+ * Description:  This function sets up and initializes the pcic, sets
+ * up the BARS, maps the DRAM into the address space etc, etc.
+ */
+int pci_fixup_pcic(struct pci_channel *chan)
+{
+	unsigned long bcr1, wcr1, wcr2, wcr3, mcr;
+	unsigned short bcr2;
+
+	/*
+	* Initialize the slave bus controller on the pcic.  The values used
+	* here should not be hardcoded, but they should be taken from the bsc
+	* on the processor, to make this function as generic as possible.
+	* (i.e. Another sbc may usr different SDRAM timing settings -- in order
+	* for the pcic to work, its settings need to be exactly the same.)
+	*/
+	bcr1 = (*(volatile unsigned long*)(SH7751_BCR1));
+	bcr2 = (*(volatile unsigned short*)(SH7751_BCR2));
+	wcr1 = (*(volatile unsigned long*)(SH7751_WCR1));
+	wcr2 = (*(volatile unsigned long*)(SH7751_WCR2));
+	wcr3 = (*(volatile unsigned long*)(SH7751_WCR3));
+	mcr = (*(volatile unsigned long*)(SH7751_MCR));
+
+	bcr1 = bcr1 | 0x00080000;  /* Enable Bit 19, BREQEN */
+	(*(volatile unsigned long*)(SH7751_BCR1)) = bcr1;
+
+	bcr1 = bcr1 | 0x40080000;  /* Enable Bit 19 BREQEN, set PCIC to slave */
+	PCIC_WRITE(SH7751_PCIBCR1, bcr1);	 /* PCIC BCR1 */
+	PCIC_WRITE(SH7751_PCIBCR2, bcr2);     /* PCIC BCR2 */
+	PCIC_WRITE(SH7751_PCIWCR1, wcr1);     /* PCIC WCR1 */
+	PCIC_WRITE(SH7751_PCIWCR2, wcr2);     /* PCIC WCR2 */
+	PCIC_WRITE(SH7751_PCIWCR3, wcr3);     /* PCIC WCR3 */
+	mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF;
+	PCIC_WRITE(SH7751_PCIMCR, mcr);      /* PCIC MCR */
+
+
+	/* Enable all interrupts, so we know what to fix */
+	PCIC_WRITE(SH7751_PCIINTM, 0x0000c3ff);
+	PCIC_WRITE(SH7751_PCIAINTM, 0x0000380f);
+
+	/* Set up standard PCI config registers */
+	PCIC_WRITE(SH7751_PCICONF1,	0xF39000C7); /* Bus Master, Mem & I/O access */
+	PCIC_WRITE(SH7751_PCICONF2,	0x00000000); /* PCI Class code & Revision ID */
+	PCIC_WRITE(SH7751_PCICONF4,	0xab000001); /* PCI I/O address (local regs) */
+	PCIC_WRITE(SH7751_PCICONF5,	0x0c000000); /* PCI MEM address (local RAM)  */
+	PCIC_WRITE(SH7751_PCICONF6,	0xd0000000); /* PCI MEM address (unused)     */
+	PCIC_WRITE(SH7751_PCICONF11, 0x35051054); /* PCI Subsystem ID & Vendor ID */
+	PCIC_WRITE(SH7751_PCILSR0, 0x03f00000);   /* MEM (full 64M exposed)       */
+	PCIC_WRITE(SH7751_PCILSR1, 0x00000000);   /* MEM (unused)                 */
+	PCIC_WRITE(SH7751_PCILAR0, 0x0c000000);   /* MEM (direct map from PCI)    */
+	PCIC_WRITE(SH7751_PCILAR1, 0x00000000);   /* MEM (unused)                 */
+
+	/* Now turn it on... */
+	PCIC_WRITE(SH7751_PCICR, 0xa5000001);
+
+	/*
+	* Set PCIMBR and PCIIOBR here, assuming a single window
+	* (16M MEM, 256K IO) is enough.  If a larger space is
+	* needed, the readx/writex and inx/outx functions will
+	* have to do more (e.g. setting registers for each call).
+	*/
+
+	/*
+	* Set the MBR so PCI address is one-to-one with window,
+	* meaning all calls go straight through... use BUG_ON to
+	* catch erroneous assumption.
+	*/
+	BUG_ON(chan->mem_resource->start != SH7751_PCI_MEMORY_BASE);
+
+	PCIC_WRITE(SH7751_PCIMBR, chan->mem_resource->start);
+
+	/* Set IOBR for window containing area specified in pci.h */
+	PCIC_WRITE(SH7751_PCIIOBR, (chan->io_resource->start & SH7751_PCIIOBR_MASK));
+
+	/* All done, may as well say so... */
+	printk("SH7751 PCI: Finished initialization of the PCI controller\n");
+
+	return 1;
+}
-- 
GitLab


From 805fcc88999162b361ef0b0ce25782ef65f147d7 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:46:42 +0900
Subject: [PATCH 0534/2198] sh: pci: Kill off the last remnants of the now
 unused pci-auto code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Kconfig    |  18 --
 arch/sh/drivers/pci/Makefile   |   1 -
 arch/sh/drivers/pci/pci-auto.c | 546 ---------------------------------
 arch/sh/drivers/pci/pci.c      |  75 -----
 arch/sh/include/asm/pci.h      |  16 -
 5 files changed, 656 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/pci-auto.c
 delete mode 100644 arch/sh/drivers/pci/pci.c

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index 1d53496b14989..ea903a984f011 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -18,24 +18,6 @@ config SH_PCIDMA_NONCOHERENT
 	  bridge integrated with your SH CPU, refer carefully to the chip specs
 	  to see if you can say 'N' here. Otherwise, leave it as 'Y'.
 
-# Temporary config option for transitioning off of PCI_AUTO
 config PCI_NEW
 	def_bool y
 	depends on PCI
-
-# This is also board-specific
-config PCI_AUTO
-	bool
-	depends on PCI && !PCI_NEW
-	default y
-
-config PCI_AUTO_UPDATE_RESOURCES
-	bool
-	depends on PCI_AUTO
-	default y if !SH_DREAMCAST
-	help
-	  Selecting this option will cause the PCI auto code to leave your
-	  BAR values alone. Otherwise they will be updated automatically. If
-	  for some reason, you have a board that simply refuses to work
-	  with its resources updated beyond what they are when the device
-	  is powered up, set this to N. Everyone else will want this as Y.
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index e388a70d14634..a8350ccfa9d31 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -2,7 +2,6 @@
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
 obj-y					+= pci-lib.o
-obj-$(CONFIG_PCI_AUTO)			+= pci.o pci-auto.o
 obj-$(CONFIG_PCI_NEW)			+= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-auto.c b/arch/sh/drivers/pci/pci-auto.c
deleted file mode 100644
index 1d715ec405b20..0000000000000
--- a/arch/sh/drivers/pci/pci-auto.c
+++ /dev/null
@@ -1,546 +0,0 @@
-/*
- * PCI autoconfiguration library
- *
- * Author: Matt Porter <mporter@mvista.com>
- *
- * Copyright 2000, 2001 MontaVista Software Inc.
- * Copyright 2001 Bradley D. LaRonde <brad@ltc.com>
- * Copyright 2003 Paul Mundt <lethal@linux-sh.org>
- *
- * This program is free software; you can redistribute  it and/or modify it
- * under  the terms of  the GNU General  Public License as published by the
- * Free Software Foundation;  either version 2 of the  License, or (at your
- * option) any later version.
- */
-
-/*
- * Modified for MIPS by Jun Sun, jsun@mvista.com
- *
- * . Simplify the interface between pci_auto and the rest: a single function.
- * . Assign resources from low address to upper address.
- * . change most int to u32.
- *
- * Further modified to include it as mips generic code, ppopov@mvista.com.
- *
- * 2001-10-26  Bradley D. LaRonde <brad@ltc.com>
- * - Add a top_bus argument to the "early config" functions so that
- *   they can set a fake parent bus pointer to convince the underlying
- *   pci ops to use type 1 configuration for sub busses.
- * - Set bridge base and limit registers correctly.
- * - Align io and memory base properly before and after bridge setup.
- * - Don't fall through to pci_setup_bars for bridge.
- * - Reformat the debug output to look more like lspci's output.
- *
- * Cloned for SuperH by M. R. Brown, mrbrown@0xd6.org
- *
- * 2003-08-05  Paul Mundt <lethal@linux-sh.org>
- * - Don't update the BAR values on systems that already have valid addresses
- *   and don't want these updated for whatever reason, by way of a new config
- *   option check. However, we still read in the old BAR values so that they
- *   can still be reported through the debug output.
- */
-
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-
-#define	DEBUG
-#ifdef	DEBUG
-#define	DBG(x...)	printk(x)
-#else
-#define	DBG(x...)
-#endif
-
-/*
- * These functions are used early on before PCI scanning is done
- * and all of the pci_dev and pci_bus structures have been created.
- */
-static struct pci_dev *fake_pci_dev(struct pci_channel *hose,
-	int top_bus, int busnr, int devfn)
-{
-	static struct pci_dev dev;
-	static struct pci_bus bus;
-
-	dev.bus = &bus;
-	dev.sysdata = hose;
-	dev.devfn = devfn;
-	bus.number = busnr;
-	bus.ops = hose->pci_ops;
-	bus.sysdata = hose;
-
-	if(busnr != top_bus)
-		/* Fake a parent bus structure. */
-		bus.parent = &bus;
-	else
-		bus.parent = NULL;
-
-	return &dev;
-}
-
-#define EARLY_PCI_OP(rw, size, type)					\
-static int early_##rw##_config_##size(struct pci_channel *hose,		\
-	int top_bus, int bus, int devfn, int offset, type value)	\
-{									\
-	return pci_##rw##_config_##size(				\
-		fake_pci_dev(hose, top_bus, bus, devfn),		\
-		offset, value);						\
-}
-
-EARLY_PCI_OP(read, byte, u8 *)
-EARLY_PCI_OP(read, word, u16 *)
-EARLY_PCI_OP(read, dword, u32 *)
-EARLY_PCI_OP(write, byte, u8)
-EARLY_PCI_OP(write, word, u16)
-EARLY_PCI_OP(write, dword, u32)
-
-static struct resource *io_resource_inuse;
-static struct resource *mem_resource_inuse;
-
-static u32 pciauto_lower_iospc;
-static u32 pciauto_upper_iospc;
-
-static u32 pciauto_lower_memspc;
-static u32 pciauto_upper_memspc;
-
-static void __init
-pciauto_setup_bars(struct pci_channel *hose,
-		   int top_bus,
-		   int current_bus,
-		   int pci_devfn,
-		   int bar_limit)
-{
-	u32 bar_response, bar_size, bar_value;
-	u32 bar, addr_mask, bar_nr = 0;
-	u32 * upper_limit;
-	u32 * lower_limit;
-	int found_mem64 = 0;
-
-	for (bar = PCI_BASE_ADDRESS_0; bar <= bar_limit; bar+=4) {
-		u32 bar_addr;
-
-		/* Read the old BAR value */
-		early_read_config_dword(hose, top_bus,
-					current_bus,
-					pci_devfn,
-					bar,
-					&bar_addr);
-
-		/* Tickle the BAR and get the response */
-		early_write_config_dword(hose, top_bus,
-					 current_bus,
-					 pci_devfn,
-					 bar,
-					 0xffffffff);
-
-		early_read_config_dword(hose, top_bus,
-					current_bus,
-					pci_devfn,
-					bar,
-					&bar_response);
-
-		/*
-		 * Write the old BAR value back out, only update the BAR
-		 * if we implicitly want resources to be updated, which
-		 * is done by the generic code further down. -- PFM.
-		 */
-		early_write_config_dword(hose, top_bus,
-					 current_bus,
-					 pci_devfn,
-					 bar,
-					 bar_addr);
-
-		/* If BAR is not implemented go to the next BAR */
-		if (!bar_response)
-			continue;
-
-		/*
-		 * Workaround for a BAR that doesn't use its upper word,
-		 * like the ALi 1535D+ PCI DC-97 Controller Modem (M5457).
-		 * bdl <brad@ltc.com>
-		 */
-		if (!(bar_response & 0xffff0000))
-			bar_response |= 0xffff0000;
-
-retry:
-		/* Check the BAR type and set our address mask */
-		if (bar_response & PCI_BASE_ADDRESS_SPACE) {
-			addr_mask = PCI_BASE_ADDRESS_IO_MASK;
-			upper_limit = &pciauto_upper_iospc;
-			lower_limit = &pciauto_lower_iospc;
-			DBG("        I/O");
-		} else {
-			if ((bar_response & PCI_BASE_ADDRESS_MEM_TYPE_MASK) ==
-			    PCI_BASE_ADDRESS_MEM_TYPE_64)
-				found_mem64 = 1;
-
-			addr_mask = PCI_BASE_ADDRESS_MEM_MASK;
-			upper_limit = &pciauto_upper_memspc;
-			lower_limit = &pciauto_lower_memspc;
-			DBG("        Mem");
-		}
-
-
-		/* Calculate requested size */
-		bar_size = ~(bar_response & addr_mask) + 1;
-
-		/* Allocate a base address */
-		bar_value = ((*lower_limit - 1) & ~(bar_size - 1)) + bar_size;
-
-		if ((bar_value + bar_size) > *upper_limit) {
-			if (bar_response & PCI_BASE_ADDRESS_SPACE) {
-				if (io_resource_inuse->child) {
-					io_resource_inuse =
-						io_resource_inuse->child;
-					pciauto_lower_iospc =
-						io_resource_inuse->start;
-					pciauto_upper_iospc =
-						io_resource_inuse->end + 1;
-					goto retry;
-				}
-
-			} else {
-				if (mem_resource_inuse->child) {
-					mem_resource_inuse =
-						mem_resource_inuse->child;
-					pciauto_lower_memspc =
-						mem_resource_inuse->start;
-					pciauto_upper_memspc =
-						mem_resource_inuse->end + 1;
-					goto retry;
-				}
-			}
-			DBG(" unavailable -- skipping, value %x size %x\n",
-					bar_value, bar_size);
-			continue;
-		}
-
-		if (bar_value < *lower_limit || (bar_value + bar_size) >= *upper_limit) {
-			DBG(" unavailable -- skipping, value %x size %x\n",
-					bar_value, bar_size);
-			continue;
-		}
-
-#ifdef CONFIG_PCI_AUTO_UPDATE_RESOURCES
-		/* Write it out and update our limit */
-		early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-					 bar, bar_value);
-#endif
-
-		*lower_limit = bar_value + bar_size;
-
-		/*
-		 * If we are a 64-bit decoder then increment to the
-		 * upper 32 bits of the bar and force it to locate
-		 * in the lower 4GB of memory.
-		 */
-		if (found_mem64) {
-			bar += 4;
-			early_write_config_dword(hose, top_bus,
-						 current_bus,
-						 pci_devfn,
-						 bar,
-						 0x00000000);
-		}
-
-		DBG(" at 0x%.8x [size=0x%x]\n", bar_value, bar_size);
-
-		bar_nr++;
-	}
-
-}
-
-static void __init
-pciauto_prescan_setup_bridge(struct pci_channel *hose,
-			     int top_bus,
-			     int current_bus,
-			     int pci_devfn,
-			     int sub_bus)
-{
-	/* Configure bus number registers */
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-	                        PCI_PRIMARY_BUS, current_bus);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SECONDARY_BUS, sub_bus + 1);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, 0xff);
-
-	/* Align memory and I/O to 1MB and 4KB boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x100000 - 1))
-		& ~(0x100000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-
-	/* Set base (lower limit) of address range behind bridge. */
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_MEMORY_BASE, pciauto_lower_memspc >> 16);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_BASE, (pciauto_lower_iospc & 0x0000f000) >> 8);
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_BASE_UPPER16, pciauto_lower_iospc >> 16);
-
-	/* We don't support prefetchable memory for now, so disable */
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-				PCI_PREF_MEMORY_BASE, 0);
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-				PCI_PREF_MEMORY_LIMIT, 0);
-}
-
-static void __init
-pciauto_postscan_setup_bridge(struct pci_channel *hose,
-			      int top_bus,
-			      int current_bus,
-			      int pci_devfn,
-			      int sub_bus)
-{
-	u32 temp;
-
-	/*
-	 * [jsun] we always bump up baselines a little, so that if there
-	 * nothing behind P2P bridge, we don't wind up overlapping IO/MEM
-	 * spaces.
-	 */
-	pciauto_lower_memspc += 1;
-	pciauto_lower_iospc += 1;
-
-	/* Configure bus number registers */
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, sub_bus);
-
-	/* Set upper limit of address range behind bridge. */
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_MEMORY_LIMIT, pciauto_lower_memspc >> 16);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_LIMIT, (pciauto_lower_iospc & 0x0000f000) >> 8);
-	early_write_config_word(hose, top_bus, current_bus, pci_devfn,
-		PCI_IO_LIMIT_UPPER16, pciauto_lower_iospc >> 16);
-
-	/* Align memory and I/O to 1MB and 4KB boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x100000 - 1))
-		& ~(0x100000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-
-	/* Enable memory and I/O accesses, enable bus master */
-	early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, &temp);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, temp | PCI_COMMAND_IO | PCI_COMMAND_MEMORY
-		| PCI_COMMAND_MASTER);
-}
-
-static void __init
-pciauto_prescan_setup_cardbus_bridge(struct pci_channel *hose,
-			int top_bus,
-			int current_bus,
-			int pci_devfn,
-			int sub_bus)
-{
-	/* Configure bus number registers */
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_PRIMARY_BUS, current_bus);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SECONDARY_BUS, sub_bus + 1);
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, 0xff);
-
-	/* Align memory and I/O to 4KB and 4 byte boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x4 - 1))
-		& ~(0x4 - 1);
-
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_MEMORY_BASE_0, pciauto_lower_memspc);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_IO_BASE_0, pciauto_lower_iospc);
-}
-
-static void __init
-pciauto_postscan_setup_cardbus_bridge(struct pci_channel *hose,
-			int top_bus,
-			int current_bus,
-			int pci_devfn,
-			int sub_bus)
-{
-	u32 temp;
-
-	/*
-	 * [jsun] we always bump up baselines a little, so that if there
-	 * nothing behind P2P bridge, we don't wind up overlapping IO/MEM
-	 * spaces.
-	 */
-	pciauto_lower_memspc += 1;
-	pciauto_lower_iospc += 1;
-
-	/*
-	 * Configure subordinate bus number.  The PCI subsystem
-	 * bus scan will renumber buses (reserving three additional
-	 * for this PCI<->CardBus bridge for the case where a CardBus
-	 * adapter contains a P2P or CB2CB bridge.
-	 */
-
-	early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-				PCI_SUBORDINATE_BUS, sub_bus);
-
-	/*
-	 * Reserve an additional 4MB for mem space and 16KB for
-	 * I/O space.  This should cover any additional space
-	 * requirement of unusual CardBus devices with
-	 * additional bridges that can consume more address space.
-	 *
-	 * Although pcmcia-cs currently will reprogram bridge
-	 * windows, the goal is to add an option to leave them
-	 * alone and use the bridge window ranges as the regions
-	 * that are searched for free resources upon hot-insertion
-	 * of a device.  This will allow a PCI<->CardBus bridge
-	 * configured by this routine to happily live behind a
-	 * P2P bridge in a system.
-	 */
-	/* Align memory and I/O to 4KB and 4 byte boundaries. */
-	pciauto_lower_memspc = (pciauto_lower_memspc + (0x1000 - 1))
-		& ~(0x1000 - 1);
-	pciauto_lower_iospc = (pciauto_lower_iospc + (0x4 - 1))
-		& ~(0x4 - 1);
-	/* Set up memory and I/O filter limits, assume 32-bit I/O space */
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_MEMORY_LIMIT_0, pciauto_lower_memspc - 1);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_CB_IO_LIMIT_0, pciauto_lower_iospc - 1);
-
-	/* Enable memory and I/O accesses, enable bus master */
-	early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, &temp);
-	early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-		PCI_COMMAND, temp | PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
-		PCI_COMMAND_MASTER);
-}
-
-#define	PCIAUTO_IDE_MODE_MASK		0x05
-
-static int __init
-pciauto_bus_scan(struct pci_channel *hose, int top_bus, int current_bus)
-{
-	int sub_bus;
-	u32 pci_devfn, pci_class, cmdstat, found_multi=0;
-	unsigned short vid, did;
-	unsigned char header_type;
-	int devfn_start = 0;
-	int devfn_stop = 0xff;
-
-	sub_bus = current_bus;
-
-	if (hose->first_devfn)
-		devfn_start = hose->first_devfn;
-	if (hose->last_devfn)
-		devfn_stop = hose->last_devfn;
-
-	for (pci_devfn=devfn_start; pci_devfn<devfn_stop; pci_devfn++) {
-
-		if (PCI_FUNC(pci_devfn) && !found_multi)
-			continue;
-
-		early_read_config_word(hose, top_bus, current_bus, pci_devfn,
-				       PCI_VENDOR_ID, &vid);
-
-		if (vid == 0xffff) continue;
-
-		early_read_config_byte(hose, top_bus, current_bus, pci_devfn,
-				       PCI_HEADER_TYPE, &header_type);
-
-		if (!PCI_FUNC(pci_devfn))
-			found_multi = header_type & 0x80;
-
-		early_read_config_word(hose, top_bus, current_bus, pci_devfn,
-				       PCI_DEVICE_ID, &did);
-
-		early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-					PCI_CLASS_REVISION, &pci_class);
-
-		DBG("%.2x:%.2x.%x Class %.4x: %.4x:%.4x",
-			current_bus, PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn),
-			pci_class >> 16, vid, did);
-		if (pci_class & 0xff)
-			DBG(" (rev %.2x)", pci_class & 0xff);
-		DBG("\n");
-
-		if ((pci_class >> 16) == PCI_CLASS_BRIDGE_PCI) {
-			DBG("        Bridge: primary=%.2x, secondary=%.2x\n",
-				current_bus, sub_bus + 1);
-			pciauto_prescan_setup_bridge(hose, top_bus, current_bus,
-						     pci_devfn, sub_bus);
-			DBG("Scanning sub bus %.2x, I/O 0x%.8x, Mem 0x%.8x\n",
-				sub_bus + 1,
-				pciauto_lower_iospc, pciauto_lower_memspc);
-			sub_bus = pciauto_bus_scan(hose, top_bus, sub_bus+1);
-			DBG("Back to bus %.2x\n", current_bus);
-			pciauto_postscan_setup_bridge(hose, top_bus, current_bus,
-							pci_devfn, sub_bus);
-			continue;
-		} else if ((pci_class >> 16) == PCI_CLASS_BRIDGE_CARDBUS) {
-			DBG("  CARDBUS  Bridge: primary=%.2x, secondary=%.2x\n",
-				current_bus, sub_bus + 1);
-			DBG("PCI Autoconfig: Found CardBus bridge, device %d function %d\n", PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn));
-			/* Place CardBus Socket/ExCA registers */
-			pciauto_setup_bars(hose, top_bus, current_bus, pci_devfn, PCI_BASE_ADDRESS_0);
-
-			pciauto_prescan_setup_cardbus_bridge(hose, top_bus,
-					current_bus, pci_devfn, sub_bus);
-
-			DBG("Scanning sub bus %.2x, I/O 0x%.8x, Mem 0x%.8x\n",
-				sub_bus + 1,
-				pciauto_lower_iospc, pciauto_lower_memspc);
-			sub_bus = pciauto_bus_scan(hose, top_bus, sub_bus+1);
-			DBG("Back to bus %.2x, sub_bus is %x\n", current_bus, sub_bus);
-			pciauto_postscan_setup_cardbus_bridge(hose, top_bus,
-					current_bus, pci_devfn, sub_bus);
-			continue;
-		} else if ((pci_class >> 16) == PCI_CLASS_STORAGE_IDE) {
-
-			unsigned char prg_iface;
-
-			early_read_config_byte(hose, top_bus, current_bus,
-				pci_devfn, PCI_CLASS_PROG, &prg_iface);
-			if (!(prg_iface & PCIAUTO_IDE_MODE_MASK)) {
-				DBG("Skipping legacy mode IDE controller\n");
-				continue;
-			}
-		}
-
-		/*
-		 * Found a peripheral, enable some standard
-		 * settings
-		 */
-		early_read_config_dword(hose, top_bus, current_bus, pci_devfn,
-					PCI_COMMAND, &cmdstat);
-		early_write_config_dword(hose, top_bus, current_bus, pci_devfn,
-					 PCI_COMMAND, cmdstat | PCI_COMMAND_IO |
-					 PCI_COMMAND_MEMORY |
-					 PCI_COMMAND_MASTER);
-		early_write_config_byte(hose, top_bus, current_bus, pci_devfn,
-					PCI_LATENCY_TIMER, 0x80);
-
-		/* Allocate PCI I/O and/or memory space */
-		pciauto_setup_bars(hose, top_bus, current_bus, pci_devfn, PCI_BASE_ADDRESS_5);
-	}
-	return sub_bus;
-}
-
-int __init
-pciauto_assign_resources(int busno, struct pci_channel *hose)
-{
-	/* setup resource limits */
-	io_resource_inuse = hose->io_resource;
-	mem_resource_inuse = hose->mem_resource;
-
-	pciauto_lower_iospc = io_resource_inuse->start;
-	pciauto_upper_iospc = io_resource_inuse->end + 1;
-	pciauto_lower_memspc = mem_resource_inuse->start;
-	pciauto_upper_memspc = mem_resource_inuse->end + 1;
-	DBG("Autoconfig PCI channel 0x%p\n", hose);
-	DBG("Scanning bus %.2x, I/O 0x%.8x:0x%.8x, Mem 0x%.8x:0x%.8x\n",
-		busno, pciauto_lower_iospc, pciauto_upper_iospc, 
-		pciauto_lower_memspc, pciauto_upper_memspc);
-
-	return pciauto_bus_scan(hose, busno, busno);
-}
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
deleted file mode 100644
index 8c332b2a4641a..0000000000000
--- a/arch/sh/drivers/pci/pci.c
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * arch/sh/drivers/pci/pci.c
- *
- * Copyright (c) 2002 M. R. Brown  <mrbrown@linux-sh.org>
- * Copyright (c) 2004 - 2006 Paul Mundt  <lethal@linux-sh.org>
- *
- * These functions are collected here to reduce duplication of common
- * code amongst the many platform-specific PCI support code files.
- *
- * These routines require the following board-specific routines:
- * void pcibios_fixup_irqs();
- *
- * See include/asm-sh/pci.h for more information.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/init.h>
-#include <linux/dma-debug.h>
-#include <asm/io.h>
-
-static int __init pcibios_init(void)
-{
-	struct pci_channel *p;
-	struct pci_bus *bus;
-	int busno;
-
-	/* init channels */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->init(p) == 0)
-			p->enabled = 1;
-		else
-			pr_err("Unable to init pci channel %d\n", busno);
-		busno++;
-	}
-
-#ifdef CONFIG_PCI_AUTO
-	/* assign resources */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++)
-		if (p->enabled)
-			busno = pciauto_assign_resources(busno, p) + 1;
-#endif
-
-	/* scan the buses */
-	busno = 0;
-	for (p = board_pci_channels; p->init; p++) {
-		if (p->enabled) {
-			bus = pci_scan_bus(busno, p->pci_ops, p);
-			busno = bus->subordinate + 1;
-		}
-	}
-
-	pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq);
-
-	dma_debug_add_bus(&pci_bus_type);
-
-	return 0;
-}
-subsys_initcall(pcibios_init);
-
-/*
- *  Called after each bus is probed, but before its children
- *  are examined.
- */
-void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
-{
-	pci_read_bridge_bases(bus);
-}
-
-EXPORT_SYMBOL(board_pci_channels);
diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index f36c7899295b6..f910121559b01 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -19,8 +19,6 @@
 struct pci_channel {
 	struct pci_channel	*next;
 
-	int			(*init)(struct pci_channel *chan);
-
 	struct pci_ops		*pci_ops;
 	struct resource		*io_resource;
 	struct resource		*mem_resource;
@@ -28,20 +26,11 @@ struct pci_channel {
 	unsigned long		io_offset;
 	unsigned long		mem_offset;
 
-	int			first_devfn;
-	int			last_devfn;
-	int			enabled;
-
 	unsigned long		reg_base;
 
 	unsigned long		io_map_base;
 };
 
-/*
- * Each board initializes this array and terminates it with a NULL entry.
- */
-extern struct pci_channel board_pci_channels[];
-
 extern void register_pci_controller(struct pci_channel *hose);
 
 extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM;
@@ -122,13 +111,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 #endif
 
 /* Board-specific fixup routines. */
-int pcibios_init_platform(void);
 int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin);
 
-#ifdef CONFIG_PCI_AUTO
-int pciauto_assign_resources(int busno, struct pci_channel *hose);
-#endif
-
 extern void pcibios_resource_to_bus(struct pci_dev *dev,
 	struct pci_bus_region *region, struct resource *res);
 
-- 
GitLab


From 35bcfffd86aac933205fbf8401e3e7e4dde68264 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:51:19 +0900
Subject: [PATCH 0535/2198] sh: pci: Roll pci-lib in to pci-new.

Now that the pci-auto cruft is gone, pci-lib can go away.
Roll it back in to pci-new.c where it originally split off from.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Makefile  |   1 -
 arch/sh/drivers/pci/pci-lib.c | 219 --------------------------------
 arch/sh/drivers/pci/pci-new.c | 226 +++++++++++++++++++++++++++++++++-
 3 files changed, 221 insertions(+), 225 deletions(-)
 delete mode 100644 arch/sh/drivers/pci/pci-lib.c

diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index a8350ccfa9d31..eacac7f475d94 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -1,7 +1,6 @@
 #
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
-obj-y					+= pci-lib.o
 obj-$(CONFIG_PCI_NEW)			+= pci-new.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c
deleted file mode 100644
index f072acafce6c6..0000000000000
--- a/arch/sh/drivers/pci/pci-lib.c
+++ /dev/null
@@ -1,219 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/pci.h>
-
-unsigned long PCIBIOS_MIN_IO = 0x0000;
-unsigned long PCIBIOS_MIN_MEM = 0;
-
-/*
- * We need to avoid collisions with `mirrored' VGA ports
- * and other strange ISA hardware, so we always want the
- * addresses to be allocated in the 0x000-0x0ff region
- * modulo 0x400.
- */
-void pcibios_align_resource(void *data, struct resource *res,
-			    resource_size_t size, resource_size_t align)
-{
-	struct pci_dev *dev = data;
-	struct pci_channel *chan = dev->sysdata;
-	resource_size_t start = res->start;
-
-	if (res->flags & IORESOURCE_IO) {
-		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
-			start = PCIBIOS_MIN_IO + chan->io_resource->start;
-
-		/*
-                 * Put everything into 0x00-0xff region modulo 0x400.
-		 */
-		if (start & 0x300) {
-			start = (start + 0x3ff) & ~0x3ff;
-			res->start = start;
-		}
-	} else if (res->flags & IORESOURCE_MEM) {
-		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
-			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
-	}
-
-	res->start = start;
-}
-
-void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
-			 struct resource *res)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	region->start = res->start - offset;
-	region->end = res->end - offset;
-}
-
-void __devinit
-pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
-			struct pci_bus_region *region)
-{
-	struct pci_channel *hose = dev->sysdata;
-	unsigned long offset = 0;
-
-	if (res->flags & IORESOURCE_IO)
-		offset = hose->io_offset;
-	else if (res->flags & IORESOURCE_MEM)
-		offset = hose->mem_offset;
-
-	res->start = region->start + offset;
-	res->end = region->end + offset;
-}
-
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
-	u16 cmd, old_cmd;
-	int idx;
-	struct resource *r;
-
-	pci_read_config_word(dev, PCI_COMMAND, &cmd);
-	old_cmd = cmd;
-	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
-		/* Only set up the requested stuff */
-		if (!(mask & (1<<idx)))
-			continue;
-
-		r = &dev->resource[idx];
-		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
-			continue;
-		if ((idx == PCI_ROM_RESOURCE) &&
-				(!(r->flags & IORESOURCE_ROM_ENABLE)))
-			continue;
-		if (!r->start && r->end) {
-			printk(KERN_ERR "PCI: Device %s not available "
-			       "because of resource collisions\n",
-			       pci_name(dev));
-			return -EINVAL;
-		}
-		if (r->flags & IORESOURCE_IO)
-			cmd |= PCI_COMMAND_IO;
-		if (r->flags & IORESOURCE_MEM)
-			cmd |= PCI_COMMAND_MEMORY;
-	}
-	if (cmd != old_cmd) {
-		printk("PCI: Enabling device %s (%04x -> %04x)\n",
-		       pci_name(dev), old_cmd, cmd);
-		pci_write_config_word(dev, PCI_COMMAND, cmd);
-	}
-	return 0;
-}
-
-/*
- *  If we set up a device for bus mastering, we need to check and set
- *  the latency timer as it may not be properly set.
- */
-static unsigned int pcibios_max_latency = 255;
-
-void pcibios_set_master(struct pci_dev *dev)
-{
-	u8 lat;
-	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
-	if (lat < 16)
-		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
-	else if (lat > pcibios_max_latency)
-		lat = pcibios_max_latency;
-	else
-		return;
-	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
-	       pci_name(dev), lat);
-	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
-}
-
-void __init pcibios_update_irq(struct pci_dev *dev, int irq)
-{
-	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
-}
-
-char * __devinit pcibios_setup(char *str)
-{
-	return str;
-}
-
-int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
-			enum pci_mmap_state mmap_state, int write_combine)
-{
-	/*
-	 * I/O space can be accessed via normal processor loads and stores on
-	 * this platform but for now we elect not to do this and portable
-	 * drivers should not do this anyway.
-	 */
-	if (mmap_state == pci_mmap_io)
-		return -EINVAL;
-
-	/*
-	 * Ignore write-combine; for now only return uncached mappings.
-	 */
-	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-
-	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
-			       vma->vm_end - vma->vm_start,
-			       vma->vm_page_prot);
-}
-
-static void __iomem *ioport_map_pci(struct pci_dev *dev,
-				    unsigned long port, unsigned int nr)
-{
-	struct pci_channel *chan = dev->sysdata;
-
-	if (!chan->io_map_base)
-		chan->io_map_base = generic_io_base;
-
-	return (void __iomem *)(chan->io_map_base + port);
-}
-
-void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
-{
-	resource_size_t start = pci_resource_start(dev, bar);
-	resource_size_t len = pci_resource_len(dev, bar);
-	unsigned long flags = pci_resource_flags(dev, bar);
-
-	if (unlikely(!len || !start))
-		return NULL;
-	if (maxlen && len > maxlen)
-		len = maxlen;
-
-	if (flags & IORESOURCE_IO)
-		return ioport_map_pci(dev, start, len);
-
-	/*
-	 * Presently the IORESOURCE_MEM case is a bit special, most
-	 * SH7751 style PCI controllers have PCI memory at a fixed
-	 * location in the address space where no remapping is desired.
-	 * With the IORESOURCE_MEM case more care has to be taken
-	 * to inhibit page table mapping for legacy cores, but this is
-	 * punted off to __ioremap().
-	 *					-- PFM.
-	 */
-	if (flags & IORESOURCE_MEM) {
-		if (flags & IORESOURCE_CACHEABLE)
-			return ioremap(start, len);
-
-		return ioremap_nocache(start, len);
-	}
-
-	return NULL;
-}
-EXPORT_SYMBOL(pci_iomap);
-
-void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
-{
-	iounmap(addr);
-}
-EXPORT_SYMBOL(pci_iounmap);
-
-#ifdef CONFIG_HOTPLUG
-EXPORT_SYMBOL(pcibios_resource_to_bus);
-EXPORT_SYMBOL(pcibios_bus_to_resource);
-EXPORT_SYMBOL(PCIBIOS_MIN_IO);
-EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
-#endif
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c
index 8c0b136eecb3c..54d77cbb8b39e 100644
--- a/arch/sh/drivers/pci/pci-new.c
+++ b/arch/sh/drivers/pci/pci-new.c
@@ -1,20 +1,28 @@
 /*
  * New-style PCI core.
  *
- * Copyright (c) 2002  M. R. Brown
  * Copyright (c) 2004 - 2009  Paul Mundt
+ * Copyright (c) 2002  M. R. Brown
+ *
+ * Modelled after arch/mips/pci/pci.c:
+ *  Copyright (C) 2003, 04 Ralf Baechle (ralf@linux-mips.org)
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
 #include <linux/kernel.h>
+#include <linux/mm.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/types.h>
 #include <linux/dma-debug.h>
 #include <linux/io.h>
 #include <linux/mutex.h>
 
+unsigned long PCIBIOS_MIN_IO = 0x0000;
+unsigned long PCIBIOS_MIN_MEM = 0;
+
 /*
  * The PCI controller list.
  */
@@ -27,9 +35,6 @@ static void __devinit pcibios_scanbus(struct pci_channel *hose)
 	static int next_busno;
 	struct pci_bus *bus;
 
-	/* Catch botched conversion attempts */
-	BUG_ON(hose->init);
-
 	bus = pci_scan_bus(next_busno, hose->pci_ops, hose);
 	if (bus) {
 		next_busno = bus->subordinate + 1;
@@ -128,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev,
  *  Called after each bus is probed, but before its children
  *  are examined.
  */
-void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
+void __devinit pcibios_fixup_bus(struct pci_bus *bus)
 {
 	struct pci_dev *dev = bus->self;
 	struct list_head *ln;
@@ -146,3 +151,214 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus)
 			pcibios_fixup_device_resources(dev, bus);
 	}
 }
+
+/*
+ * We need to avoid collisions with `mirrored' VGA ports
+ * and other strange ISA hardware, so we always want the
+ * addresses to be allocated in the 0x000-0x0ff region
+ * modulo 0x400.
+ */
+void pcibios_align_resource(void *data, struct resource *res,
+			    resource_size_t size, resource_size_t align)
+{
+	struct pci_dev *dev = data;
+	struct pci_channel *chan = dev->sysdata;
+	resource_size_t start = res->start;
+
+	if (res->flags & IORESOURCE_IO) {
+		if (start < PCIBIOS_MIN_IO + chan->io_resource->start)
+			start = PCIBIOS_MIN_IO + chan->io_resource->start;
+
+		/*
+                 * Put everything into 0x00-0xff region modulo 0x400.
+		 */
+		if (start & 0x300) {
+			start = (start + 0x3ff) & ~0x3ff;
+			res->start = start;
+		}
+	} else if (res->flags & IORESOURCE_MEM) {
+		if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start)
+			start = PCIBIOS_MIN_MEM + chan->mem_resource->start;
+	}
+
+	res->start = start;
+}
+
+void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region,
+			 struct resource *res)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	region->start = res->start - offset;
+	region->end = res->end - offset;
+}
+
+void __devinit
+pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res,
+			struct pci_bus_region *region)
+{
+	struct pci_channel *hose = dev->sysdata;
+	unsigned long offset = 0;
+
+	if (res->flags & IORESOURCE_IO)
+		offset = hose->io_offset;
+	else if (res->flags & IORESOURCE_MEM)
+		offset = hose->mem_offset;
+
+	res->start = region->start + offset;
+	res->end = region->end + offset;
+}
+
+int pcibios_enable_device(struct pci_dev *dev, int mask)
+{
+	u16 cmd, old_cmd;
+	int idx;
+	struct resource *r;
+
+	pci_read_config_word(dev, PCI_COMMAND, &cmd);
+	old_cmd = cmd;
+	for (idx=0; idx < PCI_NUM_RESOURCES; idx++) {
+		/* Only set up the requested stuff */
+		if (!(mask & (1<<idx)))
+			continue;
+
+		r = &dev->resource[idx];
+		if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM)))
+			continue;
+		if ((idx == PCI_ROM_RESOURCE) &&
+				(!(r->flags & IORESOURCE_ROM_ENABLE)))
+			continue;
+		if (!r->start && r->end) {
+			printk(KERN_ERR "PCI: Device %s not available "
+			       "because of resource collisions\n",
+			       pci_name(dev));
+			return -EINVAL;
+		}
+		if (r->flags & IORESOURCE_IO)
+			cmd |= PCI_COMMAND_IO;
+		if (r->flags & IORESOURCE_MEM)
+			cmd |= PCI_COMMAND_MEMORY;
+	}
+	if (cmd != old_cmd) {
+		printk("PCI: Enabling device %s (%04x -> %04x)\n",
+		       pci_name(dev), old_cmd, cmd);
+		pci_write_config_word(dev, PCI_COMMAND, cmd);
+	}
+	return 0;
+}
+
+/*
+ *  If we set up a device for bus mastering, we need to check and set
+ *  the latency timer as it may not be properly set.
+ */
+static unsigned int pcibios_max_latency = 255;
+
+void pcibios_set_master(struct pci_dev *dev)
+{
+	u8 lat;
+	pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat);
+	if (lat < 16)
+		lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency;
+	else if (lat > pcibios_max_latency)
+		lat = pcibios_max_latency;
+	else
+		return;
+	printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n",
+	       pci_name(dev), lat);
+	pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
+}
+
+void __init pcibios_update_irq(struct pci_dev *dev, int irq)
+{
+	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
+}
+
+char * __devinit pcibios_setup(char *str)
+{
+	return str;
+}
+
+int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
+			enum pci_mmap_state mmap_state, int write_combine)
+{
+	/*
+	 * I/O space can be accessed via normal processor loads and stores on
+	 * this platform but for now we elect not to do this and portable
+	 * drivers should not do this anyway.
+	 */
+	if (mmap_state == pci_mmap_io)
+		return -EINVAL;
+
+	/*
+	 * Ignore write-combine; for now only return uncached mappings.
+	 */
+	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+	return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+			       vma->vm_end - vma->vm_start,
+			       vma->vm_page_prot);
+}
+
+static void __iomem *ioport_map_pci(struct pci_dev *dev,
+				    unsigned long port, unsigned int nr)
+{
+	struct pci_channel *chan = dev->sysdata;
+
+	if (!chan->io_map_base)
+		chan->io_map_base = generic_io_base;
+
+	return (void __iomem *)(chan->io_map_base + port);
+}
+
+void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen)
+{
+	resource_size_t start = pci_resource_start(dev, bar);
+	resource_size_t len = pci_resource_len(dev, bar);
+	unsigned long flags = pci_resource_flags(dev, bar);
+
+	if (unlikely(!len || !start))
+		return NULL;
+	if (maxlen && len > maxlen)
+		len = maxlen;
+
+	if (flags & IORESOURCE_IO)
+		return ioport_map_pci(dev, start, len);
+
+	/*
+	 * Presently the IORESOURCE_MEM case is a bit special, most
+	 * SH7751 style PCI controllers have PCI memory at a fixed
+	 * location in the address space where no remapping is desired.
+	 * With the IORESOURCE_MEM case more care has to be taken
+	 * to inhibit page table mapping for legacy cores, but this is
+	 * punted off to __ioremap().
+	 *					-- PFM.
+	 */
+	if (flags & IORESOURCE_MEM) {
+		if (flags & IORESOURCE_CACHEABLE)
+			return ioremap(start, len);
+
+		return ioremap_nocache(start, len);
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL(pci_iomap);
+
+void pci_iounmap(struct pci_dev *dev, void __iomem *addr)
+{
+	iounmap(addr);
+}
+EXPORT_SYMBOL(pci_iounmap);
+
+#ifdef CONFIG_HOTPLUG
+EXPORT_SYMBOL(pcibios_resource_to_bus);
+EXPORT_SYMBOL(pcibios_bus_to_resource);
+EXPORT_SYMBOL(PCIBIOS_MIN_IO);
+EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
+#endif
-- 
GitLab


From cf242007a1eb29dcf93d1cb34713ec9b3f4a0e1b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 20 Apr 2009 21:53:33 +0900
Subject: [PATCH 0536/2198] sh: pci: Rename pci-new.c to pci.c.

pci-new.c is now in a state to replace the old pci.c, rename it
accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/Kconfig              | 4 ----
 arch/sh/drivers/pci/Makefile             | 2 +-
 arch/sh/drivers/pci/{pci-new.c => pci.c} | 0
 3 files changed, 1 insertion(+), 5 deletions(-)
 rename arch/sh/drivers/pci/{pci-new.c => pci.c} (100%)

diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig
index ea903a984f011..e8db585a6638a 100644
--- a/arch/sh/drivers/pci/Kconfig
+++ b/arch/sh/drivers/pci/Kconfig
@@ -17,7 +17,3 @@ config SH_PCIDMA_NONCOHERENT
 	  code will not have to flush the CPU's caches. If you have a PCI host
 	  bridge integrated with your SH CPU, refer carefully to the chip specs
 	  to see if you can say 'N' here. Otherwise, leave it as 'Y'.
-
-config PCI_NEW
-	def_bool y
-	depends on PCI
diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile
index eacac7f475d94..d2ffc477549ad 100644
--- a/arch/sh/drivers/pci/Makefile
+++ b/arch/sh/drivers/pci/Makefile
@@ -1,7 +1,7 @@
 #
 # Makefile for the PCI specific kernel interface routines under Linux.
 #
-obj-$(CONFIG_PCI_NEW)			+= pci-new.o
+obj-y					+= pci.o
 
 obj-$(CONFIG_CPU_SUBTYPE_SH7751)	+= pci-sh7751.o ops-sh4.o
 obj-$(CONFIG_CPU_SUBTYPE_SH7751R)	+= pci-sh7751.o ops-sh4.o
diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci.c
similarity index 100%
rename from arch/sh/drivers/pci/pci-new.c
rename to arch/sh/drivers/pci/pci.c
-- 
GitLab


From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:27:58 -0400
Subject: [PATCH 0537/2198] tracing: change branch profiling to a choice
 selection

This patch makes the branch profiling into a choice selection:

  None               - no branch profiling
  likely/unlikely    - only profile likely/unlikely branches
  all                - profile all branches

The all profiler will also enable the likely/unlikely branches.

This does not change the way the profiler works or the dependencies
between the profilers.

What this patch does, is keep the branch profiling from being selected
by an allyesconfig make. The branch profiler is very intrusive and
it is known to break various architecture builds when selected as an
allyesconfig.

[ Impact: prevent branch profiler from being selected in allyesconfig ]

Reported-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 39 +++++++++++++++++++++++++++++++++------
 1 file changed, 33 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 57981d338d1fd..3ee28db69be6c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -212,8 +212,36 @@ config BOOT_TRACER
 	  to enable this on bootup.
 
 config TRACE_BRANCH_PROFILING
-	bool "Trace likely/unlikely profiler"
+	bool
 	select TRACING
+
+choice
+	prompt "Branch Profiling"
+	default BRANCH_PROFILE_NONE
+	help
+	 The branch profiling is a software profiler. It will add hooks
+	 into the C conditionals to test which path a branch takes.
+
+	 The likely/unlikely profiler only looks at the conditions that
+	 are annotated with a likely or unlikely macro.
+
+	 The "all branch" profiler will profile every if statement in the
+	 kernel. This profiler will also enable the likely/unlikely
+	 profiler as well.
+
+	 Either of the above profilers add a bit of overhead to the system.
+	 If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+	bool "No branch profiling"
+	help
+	 No branch profiling. Branch profiling adds a bit of overhead.
+	 Only enable it if you want to analyse the branching behavior.
+	 Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
+	bool "Trace likely/unlikely profiler"
+	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all the the likely and unlikely macros
 	  in the kernel. It will display the results in:
@@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING
 	  Note: this will add a significant overhead, only turn this
 	  on if you need to profile the system's use of these macros.
 
-	  Say N if unsure.
-
 config PROFILE_ALL_BRANCHES
 	bool "Profile all if conditionals"
-	depends on TRACE_BRANCH_PROFILING
+	select TRACE_BRANCH_PROFILING
 	help
 	  This tracer profiles all branch conditions. Every if ()
 	  taken in the kernel is recorded whether it hit or miss.
@@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES
 
 	  /debugfs/tracing/profile_branch
 
+	  This option also enables the likely/unlikely profiler.
+
 	  This configuration, when enabled, will impose a great overhead
 	  on the system. This should only be enabled when the system
 	  is to be analyzed
-
-	  Say N if unsure.
+endchoice
 
 config TRACING_BRANCHES
 	bool
-- 
GitLab


From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:47:36 -0400
Subject: [PATCH 0538/2198] tracing: create menuconfig for tracing
 infrastructure

During testing we often use randconfig to test various kernels.
The current configuration set up does not give an easy way to disable
all tracing with a single config. The case where randconfig would
test all tracing disabled is very unlikely.

This patch adds a config option to enable or disable all tracing.
It is hooked into the tracing menu just like other submenus are done.

[ Impact: allow randconfig to easily produce all traces disabled ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3ee28db69be6c..3fa36d2bc2905 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,7 +77,12 @@ config TRACING_SUPPORT
 
 if TRACING_SUPPORT
 
-menu "Tracers"
+menuconfig FTRACE
+	bool "Tracers"
+	help
+	 Enable the kernel tracing infrastructure.
+
+if FTRACE
 
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
@@ -462,7 +467,7 @@ config MMIOTRACE_TEST
 
 	  Say N, unless you absolutely know what you are doing.
 
-endmenu
+endif # FTRACE
 
 endif # TRACING_SUPPORT
 
-- 
GitLab


From d24e473e5b2ca86d1288b9416227ccc603313d0f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 13:32:07 +0200
Subject: [PATCH 0539/2198] perf_counter: copy in Git's top Makefile

We'd like to have a similar user-space structure as Git has, for the
perfcounter tools - so copy in Git's toplevel makefile as-is.

We'll strip it down in subsequent commits to make it fit the
perfcounters code.

The Git version used: 66996ec: Sync with 1.6.2.4

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 1735 ++++++++++++++++++++++++++-
 1 file changed, 1724 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 1dd37ee7dbdc3..6e0838b03ad1e 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -1,18 +1,1731 @@
-BINS = kerneltop perfstat perf-record perf-report
+# The default target of this Makefile is...
+all::
 
-all: $(BINS)
+# Define V=1 to have a more verbose compile.
+#
+# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
+# or vsnprintf() return -1 instead of number of characters which would
+# have been written to the final string if enough space had been available.
+#
+# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
+# when attempting to read from an fopen'ed directory.
+#
+# Define NO_OPENSSL environment variable if you do not have OpenSSL.
+# This also implies MOZILLA_SHA1.
+#
+# Define NO_CURL if you do not have libcurl installed.  git-http-pull and
+# git-http-push are not built, and you cannot use http:// and https://
+# transports.
+#
+# Define CURLDIR=/foo/bar if your curl header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_EXPAT if you do not have expat installed.  git-http-push is
+# not built, and you cannot push using http:// and https:// transports.
+#
+# Define EXPATDIR=/foo/bar if your expat header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
+#
+# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
+# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
+#
+# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
+# do not support the 'size specifiers' introduced by C99, namely ll, hh,
+# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
+# some C compilers supported these specifiers prior to C99 as an extension.
+#
+# Define NO_STRCASESTR if you don't have strcasestr.
+#
+# Define NO_MEMMEM if you don't have memmem.
+#
+# Define NO_STRLCPY if you don't have strlcpy.
+#
+# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
+# If your compiler also does not support long long or does not have
+# strtoull, define NO_STRTOULL.
+#
+# Define NO_SETENV if you don't have setenv in the C library.
+#
+# Define NO_UNSETENV if you don't have unsetenv in the C library.
+#
+# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
+#
+# Define NO_SYS_SELECT_H if you don't have sys/select.h.
+#
+# Define NO_SYMLINK_HEAD if you never want .git/HEAD to be a symbolic link.
+# Enable it on Windows.  By default, symrefs are still used.
+#
+# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
+# tests.  These tests take up a significant amount of the total test time
+# but are not needed unless you plan to talk to SVN repos.
+#
+# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
+# installed in /sw, but don't want GIT to link against any libraries
+# installed there.  If defined you may specify your own (or Fink's)
+# include directories and library directories by defining CFLAGS
+# and LDFLAGS appropriately.
+#
+# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
+# have DarwinPorts installed in /opt/local, but don't want GIT to
+# link against any libraries installed there.  If defined you may
+# specify your own (or DarwinPort's) include directories and
+# library directories by defining CFLAGS and LDFLAGS appropriately.
+#
+# Define PPC_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for PowerPC.
+#
+# Define ARM_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine optimized for ARM.
+#
+# Define MOZILLA_SHA1 environment variable when running make to make use of
+# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
+# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
+# choice) has very fast version optimized for i586.
+#
+# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
+#
+# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
+#
+# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
+# Patrick Mauritz).
+#
+# Define NO_MMAP if you want to avoid mmap.
+#
+# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
+#
+# Define NO_PREAD if you have a problem with pread() system call (e.g.
+# cygwin.dll before v1.5.22).
+#
+# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
+# generally faster on your platform than accessing the working directory.
+#
+# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
+# the executable mode bit, but doesn't really do so.
+#
+# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
+#
+# Define NO_SOCKADDR_STORAGE if your platform does not have struct
+# sockaddr_storage.
+#
+# Define NO_ICONV if your libc does not properly support iconv.
+#
+# Define OLD_ICONV if your library has an old iconv(), where the second
+# (input buffer pointer) parameter is declared with type (const char **).
+#
+# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
+#
+# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
+# that tells runtime paths to dynamic libraries;
+# "-Wl,-rpath=/path/lib" is used instead.
+#
+# Define USE_NSEC below if you want git to care about sub-second file mtimes
+# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
+# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
+# randomly break unless your underlying filesystem supports those sub-second
+# times (my ext3 doesn't).
+#
+# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
+# "st_ctim"
+#
+# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
+# available.  This automatically turns USE_NSEC off.
+#
+# Define USE_STDEV below if you want git to care about the underlying device
+# change being considered an inode change from the update-index perspective.
+#
+# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
+# field that counts the on-disk footprint in 512-byte blocks.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
+# MakeMaker (e.g. using ActiveState under Cygwin).
+#
+# Define NO_PERL if you do not want Perl scripts or libraries at all.
+#
+# Define NO_TCLTK if you do not want Tcl/Tk GUI.
+#
+# The TCL_PATH variable governs the location of the Tcl interpreter
+# used to optimize git-gui for your system.  Only used if NO_TCLTK
+# is not set.  Defaults to the bare 'tclsh'.
+#
+# The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter.
+# If not set it defaults to the bare 'wish'. If it is set to the empty
+# string then NO_TCLTK will be forced (this is used by configure script).
+#
+# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit
+# parallel delta searching when packing objects.
+#
+# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
+# is a simplified version of the merge sort used in glibc. This is
+# recommended if Git triggers O(n^2) behavior in your platform's qsort().
+#
+# Define NO_EXTERNAL_GREP if you don't want "git grep" to ever call
+# your external grep (e.g., if your system lacks grep, if its grep is
+# broken, or spawning external process is slower than built-in grep git has).
 
-kerneltop: kerneltop.c ../../include/linux/perf_counter.h
-	cc -O6 -Wall -lrt -o $@ $<
+GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE
+	@$(SHELL_PATH) ./GIT-VERSION-GEN
+-include GIT-VERSION-FILE
 
-perf-record: perf-record.c ../../include/linux/perf_counter.h
-	cc -O6 -Wall -lrt -o $@ $<
+uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
+uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
+uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
+uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
+uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
+uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
-perf-report: perf-report.cc ../../include/linux/perf_counter.h
-	g++ -O6 -Wall -lrt -o $@ $<
+# CFLAGS and LDFLAGS are for the users to override from the command line.
 
-perfstat: kerneltop
-	ln -sf kerneltop perfstat
+CFLAGS = -g -O2 -Wall
+LDFLAGS =
+ALL_CFLAGS = $(CFLAGS)
+ALL_LDFLAGS = $(LDFLAGS)
+STRIP ?= strip
+
+# Among the variables below, these:
+#   gitexecdir
+#   template_dir
+#   mandir
+#   infodir
+#   htmldir
+#   ETC_GITCONFIG (but not sysconfdir)
+# can be specified as a relative path some/where/else;
+# this is interpreted as relative to $(prefix) and "git" at
+# runtime figures out where they are based on the path to the executable.
+# This can help installing the suite in a relocatable way.
+
+prefix = $(HOME)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+mandir = share/man
+infodir = share/info
+gitexecdir = libexec/git-core
+sharedir = $(prefix)/share
+template_dir = share/git-core/templates
+htmldir = share/doc/git-doc
+ifeq ($(prefix),/usr)
+sysconfdir = /etc
+ETC_GITCONFIG = $(sysconfdir)/gitconfig
+else
+sysconfdir = $(prefix)/etc
+ETC_GITCONFIG = etc/gitconfig
+endif
+lib = lib
+# DESTDIR=
+
+# default configuration for gitweb
+GITWEB_CONFIG = gitweb_config.perl
+GITWEB_CONFIG_SYSTEM = /etc/gitweb.conf
+GITWEB_HOME_LINK_STR = projects
+GITWEB_SITENAME =
+GITWEB_PROJECTROOT = /pub/git
+GITWEB_PROJECT_MAXDEPTH = 2007
+GITWEB_EXPORT_OK =
+GITWEB_STRICT_EXPORT =
+GITWEB_BASE_URL =
+GITWEB_LIST =
+GITWEB_HOMETEXT = indextext.html
+GITWEB_CSS = gitweb.css
+GITWEB_LOGO = git-logo.png
+GITWEB_FAVICON = git-favicon.png
+GITWEB_SITE_HEADER =
+GITWEB_SITE_FOOTER =
+
+export prefix bindir sharedir sysconfdir
+
+CC = gcc
+AR = ar
+RM = rm -f
+TAR = tar
+FIND = find
+INSTALL = install
+RPMBUILD = rpmbuild
+TCL_PATH = tclsh
+TCLTK_PATH = wish
+PTHREAD_LIBS = -lpthread
+
+export TCL_PATH TCLTK_PATH
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly what architecture to check for. Fix this up for yours..
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+
+
+### --- END CONFIGURATION SECTION ---
+
+# Those must not be GNU-specific; they are shared with perl/ which may
+# be built by a different compiler. (Note that this is an artifact now
+# but it still might be nice to keep that distinction.)
+BASIC_CFLAGS =
+BASIC_LDFLAGS =
+
+# Guard against environment variables
+BUILTIN_OBJS =
+BUILT_INS =
+COMPAT_CFLAGS =
+COMPAT_OBJS =
+LIB_H =
+LIB_OBJS =
+PROGRAMS =
+SCRIPT_PERL =
+SCRIPT_SH =
+TEST_PROGRAMS =
+
+SCRIPT_SH += git-am.sh
+SCRIPT_SH += git-bisect.sh
+SCRIPT_SH += git-difftool--helper.sh
+SCRIPT_SH += git-filter-branch.sh
+SCRIPT_SH += git-lost-found.sh
+SCRIPT_SH += git-merge-octopus.sh
+SCRIPT_SH += git-merge-one-file.sh
+SCRIPT_SH += git-merge-resolve.sh
+SCRIPT_SH += git-mergetool.sh
+SCRIPT_SH += git-mergetool--lib.sh
+SCRIPT_SH += git-parse-remote.sh
+SCRIPT_SH += git-pull.sh
+SCRIPT_SH += git-quiltimport.sh
+SCRIPT_SH += git-rebase--interactive.sh
+SCRIPT_SH += git-rebase.sh
+SCRIPT_SH += git-repack.sh
+SCRIPT_SH += git-request-pull.sh
+SCRIPT_SH += git-sh-setup.sh
+SCRIPT_SH += git-stash.sh
+SCRIPT_SH += git-submodule.sh
+SCRIPT_SH += git-web--browse.sh
+
+SCRIPT_PERL += git-add--interactive.perl
+SCRIPT_PERL += git-difftool.perl
+SCRIPT_PERL += git-archimport.perl
+SCRIPT_PERL += git-cvsexportcommit.perl
+SCRIPT_PERL += git-cvsimport.perl
+SCRIPT_PERL += git-cvsserver.perl
+SCRIPT_PERL += git-relink.perl
+SCRIPT_PERL += git-send-email.perl
+SCRIPT_PERL += git-svn.perl
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
+	  $(patsubst %.perl,%,$(SCRIPT_PERL)) \
+	  git-instaweb
+
+# Empty...
+EXTRA_PROGRAMS =
+
+# ... and all the rest that could be moved out of bindir to gitexecdir
+PROGRAMS += $(EXTRA_PROGRAMS)
+PROGRAMS += git-fast-import$X
+PROGRAMS += git-hash-object$X
+PROGRAMS += git-index-pack$X
+PROGRAMS += git-merge-index$X
+PROGRAMS += git-merge-tree$X
+PROGRAMS += git-mktag$X
+PROGRAMS += git-mktree$X
+PROGRAMS += git-pack-redundant$X
+PROGRAMS += git-patch-id$X
+PROGRAMS += git-shell$X
+PROGRAMS += git-show-index$X
+PROGRAMS += git-unpack-file$X
+PROGRAMS += git-update-server-info$X
+PROGRAMS += git-upload-pack$X
+PROGRAMS += git-var$X
+
+# List built-in command $C whose implementation cmd_$C() is not in
+# builtin-$C.o but is linked in as part of some other command.
+BUILT_INS += $(patsubst builtin-%.o,git-%$X,$(BUILTIN_OBJS))
+
+BUILT_INS += git-cherry$X
+BUILT_INS += git-cherry-pick$X
+BUILT_INS += git-format-patch$X
+BUILT_INS += git-fsck-objects$X
+BUILT_INS += git-get-tar-commit-id$X
+BUILT_INS += git-init$X
+BUILT_INS += git-merge-subtree$X
+BUILT_INS += git-peek-remote$X
+BUILT_INS += git-repo-config$X
+BUILT_INS += git-show$X
+BUILT_INS += git-stage$X
+BUILT_INS += git-status$X
+BUILT_INS += git-whatchanged$X
+
+# what 'all' will build and 'install' will install, in gitexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in gitexecdir
+OTHER_PROGRAMS = git$X
+ifndef NO_PERL
+OTHER_PROGRAMS += gitweb/gitweb.cgi
+endif
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+	SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+LIB_FILE=libgit.a
+XDIFF_LIB=xdiff/lib.a
+
+LIB_H += archive.h
+LIB_H += attr.h
+LIB_H += blob.h
+LIB_H += builtin.h
+LIB_H += cache.h
+LIB_H += cache-tree.h
+LIB_H += commit.h
+LIB_H += compat/cygwin.h
+LIB_H += compat/mingw.h
+LIB_H += csum-file.h
+LIB_H += decorate.h
+LIB_H += delta.h
+LIB_H += diffcore.h
+LIB_H += diff.h
+LIB_H += dir.h
+LIB_H += fsck.h
+LIB_H += git-compat-util.h
+LIB_H += graph.h
+LIB_H += grep.h
+LIB_H += hash.h
+LIB_H += help.h
+LIB_H += levenshtein.h
+LIB_H += list-objects.h
+LIB_H += ll-merge.h
+LIB_H += log-tree.h
+LIB_H += mailmap.h
+LIB_H += merge-recursive.h
+LIB_H += object.h
+LIB_H += pack.h
+LIB_H += pack-refs.h
+LIB_H += pack-revindex.h
+LIB_H += parse-options.h
+LIB_H += patch-ids.h
+LIB_H += pkt-line.h
+LIB_H += progress.h
+LIB_H += quote.h
+LIB_H += reflog-walk.h
+LIB_H += refs.h
+LIB_H += remote.h
+LIB_H += rerere.h
+LIB_H += revision.h
+LIB_H += run-command.h
+LIB_H += sha1-lookup.h
+LIB_H += sideband.h
+LIB_H += sigchain.h
+LIB_H += strbuf.h
+LIB_H += string-list.h
+LIB_H += tag.h
+LIB_H += transport.h
+LIB_H += tree.h
+LIB_H += tree-walk.h
+LIB_H += unpack-trees.h
+LIB_H += userdiff.h
+LIB_H += utf8.h
+LIB_H += wt-status.h
+
+LIB_OBJS += abspath.o
+LIB_OBJS += alias.o
+LIB_OBJS += alloc.o
+LIB_OBJS += archive.o
+LIB_OBJS += archive-tar.o
+LIB_OBJS += archive-zip.o
+LIB_OBJS += attr.o
+LIB_OBJS += base85.o
+LIB_OBJS += bisect.o
+LIB_OBJS += blob.o
+LIB_OBJS += branch.o
+LIB_OBJS += bundle.o
+LIB_OBJS += cache-tree.o
+LIB_OBJS += color.o
+LIB_OBJS += combine-diff.o
+LIB_OBJS += commit.o
+LIB_OBJS += config.o
+LIB_OBJS += connect.o
+LIB_OBJS += convert.o
+LIB_OBJS += copy.o
+LIB_OBJS += csum-file.o
+LIB_OBJS += ctype.o
+LIB_OBJS += date.o
+LIB_OBJS += decorate.o
+LIB_OBJS += diffcore-break.o
+LIB_OBJS += diffcore-delta.o
+LIB_OBJS += diffcore-order.o
+LIB_OBJS += diffcore-pickaxe.o
+LIB_OBJS += diffcore-rename.o
+LIB_OBJS += diff-delta.o
+LIB_OBJS += diff-lib.o
+LIB_OBJS += diff-no-index.o
+LIB_OBJS += diff.o
+LIB_OBJS += dir.o
+LIB_OBJS += editor.o
+LIB_OBJS += entry.o
+LIB_OBJS += environment.o
+LIB_OBJS += exec_cmd.o
+LIB_OBJS += fsck.o
+LIB_OBJS += graph.o
+LIB_OBJS += grep.o
+LIB_OBJS += hash.o
+LIB_OBJS += help.o
+LIB_OBJS += ident.o
+LIB_OBJS += levenshtein.o
+LIB_OBJS += list-objects.o
+LIB_OBJS += ll-merge.o
+LIB_OBJS += lockfile.o
+LIB_OBJS += log-tree.o
+LIB_OBJS += mailmap.o
+LIB_OBJS += match-trees.o
+LIB_OBJS += merge-file.o
+LIB_OBJS += merge-recursive.o
+LIB_OBJS += name-hash.o
+LIB_OBJS += object.o
+LIB_OBJS += pack-check.o
+LIB_OBJS += pack-refs.o
+LIB_OBJS += pack-revindex.o
+LIB_OBJS += pack-write.o
+LIB_OBJS += pager.o
+LIB_OBJS += parse-options.o
+LIB_OBJS += patch-delta.o
+LIB_OBJS += patch-ids.o
+LIB_OBJS += path.o
+LIB_OBJS += pkt-line.o
+LIB_OBJS += preload-index.o
+LIB_OBJS += pretty.o
+LIB_OBJS += progress.o
+LIB_OBJS += quote.o
+LIB_OBJS += reachable.o
+LIB_OBJS += read-cache.o
+LIB_OBJS += reflog-walk.o
+LIB_OBJS += refs.o
+LIB_OBJS += remote.o
+LIB_OBJS += rerere.o
+LIB_OBJS += revision.o
+LIB_OBJS += run-command.o
+LIB_OBJS += server-info.o
+LIB_OBJS += setup.o
+LIB_OBJS += sha1-lookup.o
+LIB_OBJS += sha1_file.o
+LIB_OBJS += sha1_name.o
+LIB_OBJS += shallow.o
+LIB_OBJS += sideband.o
+LIB_OBJS += sigchain.o
+LIB_OBJS += strbuf.o
+LIB_OBJS += string-list.o
+LIB_OBJS += symlinks.o
+LIB_OBJS += tag.o
+LIB_OBJS += trace.o
+LIB_OBJS += transport.o
+LIB_OBJS += tree-diff.o
+LIB_OBJS += tree.o
+LIB_OBJS += tree-walk.o
+LIB_OBJS += unpack-trees.o
+LIB_OBJS += usage.o
+LIB_OBJS += userdiff.o
+LIB_OBJS += utf8.o
+LIB_OBJS += walker.o
+LIB_OBJS += wrapper.o
+LIB_OBJS += write_or_die.o
+LIB_OBJS += ws.o
+LIB_OBJS += wt-status.o
+LIB_OBJS += xdiff-interface.o
+
+BUILTIN_OBJS += builtin-add.o
+BUILTIN_OBJS += builtin-annotate.o
+BUILTIN_OBJS += builtin-apply.o
+BUILTIN_OBJS += builtin-archive.o
+BUILTIN_OBJS += builtin-bisect--helper.o
+BUILTIN_OBJS += builtin-blame.o
+BUILTIN_OBJS += builtin-branch.o
+BUILTIN_OBJS += builtin-bundle.o
+BUILTIN_OBJS += builtin-cat-file.o
+BUILTIN_OBJS += builtin-check-attr.o
+BUILTIN_OBJS += builtin-check-ref-format.o
+BUILTIN_OBJS += builtin-checkout-index.o
+BUILTIN_OBJS += builtin-checkout.o
+BUILTIN_OBJS += builtin-clean.o
+BUILTIN_OBJS += builtin-clone.o
+BUILTIN_OBJS += builtin-commit-tree.o
+BUILTIN_OBJS += builtin-commit.o
+BUILTIN_OBJS += builtin-config.o
+BUILTIN_OBJS += builtin-count-objects.o
+BUILTIN_OBJS += builtin-describe.o
+BUILTIN_OBJS += builtin-diff-files.o
+BUILTIN_OBJS += builtin-diff-index.o
+BUILTIN_OBJS += builtin-diff-tree.o
+BUILTIN_OBJS += builtin-diff.o
+BUILTIN_OBJS += builtin-fast-export.o
+BUILTIN_OBJS += builtin-fetch--tool.o
+BUILTIN_OBJS += builtin-fetch-pack.o
+BUILTIN_OBJS += builtin-fetch.o
+BUILTIN_OBJS += builtin-fmt-merge-msg.o
+BUILTIN_OBJS += builtin-for-each-ref.o
+BUILTIN_OBJS += builtin-fsck.o
+BUILTIN_OBJS += builtin-gc.o
+BUILTIN_OBJS += builtin-grep.o
+BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-init-db.o
+BUILTIN_OBJS += builtin-log.o
+BUILTIN_OBJS += builtin-ls-files.o
+BUILTIN_OBJS += builtin-ls-remote.o
+BUILTIN_OBJS += builtin-ls-tree.o
+BUILTIN_OBJS += builtin-mailinfo.o
+BUILTIN_OBJS += builtin-mailsplit.o
+BUILTIN_OBJS += builtin-merge.o
+BUILTIN_OBJS += builtin-merge-base.o
+BUILTIN_OBJS += builtin-merge-file.o
+BUILTIN_OBJS += builtin-merge-ours.o
+BUILTIN_OBJS += builtin-merge-recursive.o
+BUILTIN_OBJS += builtin-mv.o
+BUILTIN_OBJS += builtin-name-rev.o
+BUILTIN_OBJS += builtin-pack-objects.o
+BUILTIN_OBJS += builtin-pack-refs.o
+BUILTIN_OBJS += builtin-prune-packed.o
+BUILTIN_OBJS += builtin-prune.o
+BUILTIN_OBJS += builtin-push.o
+BUILTIN_OBJS += builtin-read-tree.o
+BUILTIN_OBJS += builtin-receive-pack.o
+BUILTIN_OBJS += builtin-reflog.o
+BUILTIN_OBJS += builtin-remote.o
+BUILTIN_OBJS += builtin-rerere.o
+BUILTIN_OBJS += builtin-reset.o
+BUILTIN_OBJS += builtin-rev-list.o
+BUILTIN_OBJS += builtin-rev-parse.o
+BUILTIN_OBJS += builtin-revert.o
+BUILTIN_OBJS += builtin-rm.o
+BUILTIN_OBJS += builtin-send-pack.o
+BUILTIN_OBJS += builtin-shortlog.o
+BUILTIN_OBJS += builtin-show-branch.o
+BUILTIN_OBJS += builtin-show-ref.o
+BUILTIN_OBJS += builtin-stripspace.o
+BUILTIN_OBJS += builtin-symbolic-ref.o
+BUILTIN_OBJS += builtin-tag.o
+BUILTIN_OBJS += builtin-tar-tree.o
+BUILTIN_OBJS += builtin-unpack-objects.o
+BUILTIN_OBJS += builtin-update-index.o
+BUILTIN_OBJS += builtin-update-ref.o
+BUILTIN_OBJS += builtin-upload-archive.o
+BUILTIN_OBJS += builtin-verify-pack.o
+BUILTIN_OBJS += builtin-verify-tag.o
+BUILTIN_OBJS += builtin-write-tree.o
+
+GITLIBS = $(LIB_FILE) $(XDIFF_LIB)
+EXTLIBS =
+
+#
+# Platform specific tweaks
+#
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain.  If
+# we had "elif" things would have been much nicer...
+
+ifeq ($(uname_S),Linux)
+	NO_STRLCPY = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),GNU/kFreeBSD)
+	NO_STRLCPY = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),UnixWare)
+	CC = cc
+	NEEDS_SOCKET = YesPlease
+	NEEDS_NSL = YesPlease
+	NEEDS_SSL_WITH_CRYPTO = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	SHELL_PATH = /usr/local/bin/bash
+	NO_IPV6 = YesPlease
+	NO_HSTRERROR = YesPlease
+	BASIC_CFLAGS += -Kthread
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	INSTALL = ginstall
+	TAR = gtar
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+endif
+ifeq ($(uname_S),SCO_SV)
+	ifeq ($(uname_R),3.2)
+		CFLAGS = -O2
+	endif
+	ifeq ($(uname_R),5)
+		CC = cc
+		BASIC_CFLAGS += -Kthread
+	endif
+	NEEDS_SOCKET = YesPlease
+	NEEDS_NSL = YesPlease
+	NEEDS_SSL_WITH_CRYPTO = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	SHELL_PATH = /usr/bin/bash
+	NO_IPV6 = YesPlease
+	NO_HSTRERROR = YesPlease
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	INSTALL = ginstall
+	TAR = gtar
+endif
+ifeq ($(uname_S),Darwin)
+	NEEDS_SSL_WITH_CRYPTO = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	ifeq ($(shell expr "$(uname_R)" : '[15678]\.'),2)
+		OLD_ICONV = UnfortunatelyYes
+	endif
+	ifeq ($(shell expr "$(uname_R)" : '[15]\.'),2)
+		NO_STRLCPY = YesPlease
+	endif
+	NO_MEMMEM = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+	USE_ST_TIMESPEC = YesPlease
+endif
+ifeq ($(uname_S),SunOS)
+	NEEDS_SOCKET = YesPlease
+	NEEDS_NSL = YesPlease
+	SHELL_PATH = /bin/bash
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	NO_HSTRERROR = YesPlease
+	NO_MKDTEMP = YesPlease
+	OLD_ICONV = UnfortunatelyYes
+	ifeq ($(uname_R),5.8)
+		NO_UNSETENV = YesPlease
+		NO_SETENV = YesPlease
+		NO_C99_FORMAT = YesPlease
+		NO_STRTOUMAX = YesPlease
+	endif
+	ifeq ($(uname_R),5.9)
+		NO_UNSETENV = YesPlease
+		NO_SETENV = YesPlease
+		NO_C99_FORMAT = YesPlease
+		NO_STRTOUMAX = YesPlease
+	endif
+	INSTALL = ginstall
+	TAR = gtar
+	BASIC_CFLAGS += -D__EXTENSIONS__
+endif
+ifeq ($(uname_O),Cygwin)
+	NO_D_TYPE_IN_DIRENT = YesPlease
+	NO_D_INO_IN_DIRENT = YesPlease
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	NO_SYMLINK_HEAD = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	NO_FAST_WORKING_DIRECTORY = UnfortunatelyYes
+	NO_TRUSTABLE_FILEMODE = UnfortunatelyYes
+	OLD_ICONV = UnfortunatelyYes
+	# There are conflicting reports about this.
+	# On some boxes NO_MMAP is needed, and not so elsewhere.
+	# Try commenting this out if you suspect MMAP is more efficient
+	NO_MMAP = YesPlease
+	NO_IPV6 = YesPlease
+	X = .exe
+endif
+ifeq ($(uname_S),FreeBSD)
+	NEEDS_LIBICONV = YesPlease
+	NO_MEMMEM = YesPlease
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease
+	USE_ST_TIMESPEC = YesPlease
+	THREADED_DELTA_SEARCH = YesPlease
+	ifeq ($(shell expr "$(uname_R)" : '4\.'),2)
+		PTHREAD_LIBS = -pthread
+		NO_UINTMAX_T = YesPlease
+		NO_STRTOUMAX = YesPlease
+	endif
+endif
+ifeq ($(uname_S),OpenBSD)
+	NO_STRCASESTR = YesPlease
+	NO_MEMMEM = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	BASIC_CFLAGS += -I/usr/local/include
+	BASIC_LDFLAGS += -L/usr/local/lib
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),NetBSD)
+	ifeq ($(shell expr "$(uname_R)" : '[01]\.'),2)
+		NEEDS_LIBICONV = YesPlease
+	endif
+	BASIC_CFLAGS += -I/usr/pkg/include
+	BASIC_LDFLAGS += -L/usr/pkg/lib $(CC_LD_DYNPATH)/usr/pkg/lib
+	THREADED_DELTA_SEARCH = YesPlease
+endif
+ifeq ($(uname_S),AIX)
+	NO_STRCASESTR=YesPlease
+	NO_MEMMEM = YesPlease
+	NO_MKDTEMP = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_NSEC = YesPlease
+	FREAD_READS_DIRECTORIES = UnfortunatelyYes
+	INTERNAL_QSORT = UnfortunatelyYes
+	NEEDS_LIBICONV=YesPlease
+	BASIC_CFLAGS += -D_LARGE_FILES
+	ifneq ($(shell expr "$(uname_V)" : '[1234]'),1)
+		THREADED_DELTA_SEARCH = YesPlease
+	else
+		NO_PTHREADS = YesPlease
+	endif
+endif
+ifeq ($(uname_S),GNU)
+	# GNU/Hurd
+	NO_STRLCPY=YesPlease
+endif
+ifeq ($(uname_S),IRIX64)
+	NO_IPV6=YesPlease
+	NO_SETENV=YesPlease
+	NO_STRCASESTR=YesPlease
+	NO_MEMMEM = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_SOCKADDR_STORAGE=YesPlease
+	SHELL_PATH=/usr/gnu/bin/bash
+	BASIC_CFLAGS += -DPATH_MAX=1024
+	# for now, build 32-bit version
+	BASIC_LDFLAGS += -L/usr/lib32
+endif
+ifeq ($(uname_S),HP-UX)
+	NO_IPV6=YesPlease
+	NO_SETENV=YesPlease
+	NO_STRCASESTR=YesPlease
+	NO_MEMMEM = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_MKDTEMP = YesPlease
+	NO_UNSETENV = YesPlease
+	NO_HSTRERROR = YesPlease
+	NO_SYS_SELECT_H = YesPlease
+	SNPRINTF_RETURNS_BOGUS = YesPlease
+endif
+ifneq (,$(findstring CYGWIN,$(uname_S)))
+	COMPAT_OBJS += compat/cygwin.o
+endif
+ifneq (,$(findstring MINGW,$(uname_S)))
+	NO_PREAD = YesPlease
+	NO_OPENSSL = YesPlease
+	NO_CURL = YesPlease
+	NO_SYMLINK_HEAD = YesPlease
+	NO_IPV6 = YesPlease
+	NO_SETENV = YesPlease
+	NO_UNSETENV = YesPlease
+	NO_STRCASESTR = YesPlease
+	NO_STRLCPY = YesPlease
+	NO_MEMMEM = YesPlease
+	NO_PTHREADS = YesPlease
+	NEEDS_LIBICONV = YesPlease
+	OLD_ICONV = YesPlease
+	NO_C99_FORMAT = YesPlease
+	NO_STRTOUMAX = YesPlease
+	NO_MKDTEMP = YesPlease
+	SNPRINTF_RETURNS_BOGUS = YesPlease
+	NO_SVN_TESTS = YesPlease
+	NO_PERL_MAKEMAKER = YesPlease
+	RUNTIME_PREFIX = YesPlease
+	NO_POSIX_ONLY_PROGRAMS = YesPlease
+	NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease
+	NO_NSEC = YesPlease
+	USE_WIN32_MMAP = YesPlease
+	COMPAT_CFLAGS += -D__USE_MINGW_ACCESS -DNOGDI -Icompat -Icompat/regex -Icompat/fnmatch
+	COMPAT_CFLAGS += -DSNPRINTF_SIZE_CORR=1
+	COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\"
+	COMPAT_OBJS += compat/mingw.o compat/fnmatch/fnmatch.o compat/regex/regex.o compat/winansi.o
+	EXTLIBS += -lws2_32
+	X = .exe
+endif
+ifneq (,$(findstring arm,$(uname_M)))
+	ARM_SHA1 = YesPlease
+endif
+
+-include config.mak.autogen
+-include config.mak
+
+ifeq ($(uname_S),Darwin)
+	ifndef NO_FINK
+		ifeq ($(shell test -d /sw/lib && echo y),y)
+			BASIC_CFLAGS += -I/sw/include
+			BASIC_LDFLAGS += -L/sw/lib
+		endif
+	endif
+	ifndef NO_DARWIN_PORTS
+		ifeq ($(shell test -d /opt/local/lib && echo y),y)
+			BASIC_CFLAGS += -I/opt/local/include
+			BASIC_LDFLAGS += -L/opt/local/lib
+		endif
+	endif
+	PTHREAD_LIBS =
+endif
+
+ifndef CC_LD_DYNPATH
+	ifdef NO_R_TO_GCC_LINKER
+		# Some gcc does not accept and pass -R to the linker to specify
+		# the runtime dynamic library path.
+		CC_LD_DYNPATH = -Wl,-rpath,
+	else
+		CC_LD_DYNPATH = -R
+	endif
+endif
+
+ifdef NO_CURL
+	BASIC_CFLAGS += -DNO_CURL
+else
+	ifdef CURLDIR
+		# Try "-Wl,-rpath=$(CURLDIR)/$(lib)" in such a case.
+		BASIC_CFLAGS += -I$(CURLDIR)/include
+		CURL_LIBCURL = -L$(CURLDIR)/$(lib) $(CC_LD_DYNPATH)$(CURLDIR)/$(lib) -lcurl
+	else
+		CURL_LIBCURL = -lcurl
+	endif
+	BUILTIN_OBJS += builtin-http-fetch.o
+	EXTLIBS += $(CURL_LIBCURL)
+	LIB_OBJS += http.o http-walker.o
+	curl_check := $(shell (echo 070908; curl-config --vernum) | sort -r | sed -ne 2p)
+	ifeq "$(curl_check)" "070908"
+		ifndef NO_EXPAT
+			PROGRAMS += git-http-push$X
+		endif
+	endif
+	ifndef NO_EXPAT
+		ifdef EXPATDIR
+			BASIC_CFLAGS += -I$(EXPATDIR)/include
+			EXPAT_LIBEXPAT = -L$(EXPATDIR)/$(lib) $(CC_LD_DYNPATH)$(EXPATDIR)/$(lib) -lexpat
+		else
+			EXPAT_LIBEXPAT = -lexpat
+		endif
+	endif
+endif
+
+ifdef ZLIB_PATH
+	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
+	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
+endif
+EXTLIBS += -lz
+
+ifndef NO_POSIX_ONLY_PROGRAMS
+	PROGRAMS += git-daemon$X
+	PROGRAMS += git-imap-send$X
+endif
+ifndef NO_OPENSSL
+	OPENSSL_LIBSSL = -lssl
+	ifdef OPENSSLDIR
+		BASIC_CFLAGS += -I$(OPENSSLDIR)/include
+		OPENSSL_LINK = -L$(OPENSSLDIR)/$(lib) $(CC_LD_DYNPATH)$(OPENSSLDIR)/$(lib)
+	else
+		OPENSSL_LINK =
+	endif
+else
+	BASIC_CFLAGS += -DNO_OPENSSL
+	MOZILLA_SHA1 = 1
+	OPENSSL_LIBSSL =
+endif
+ifdef NEEDS_SSL_WITH_CRYPTO
+	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -lssl
+else
+	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto
+endif
+ifdef NEEDS_LIBICONV
+	ifdef ICONVDIR
+		BASIC_CFLAGS += -I$(ICONVDIR)/include
+		ICONV_LINK = -L$(ICONVDIR)/$(lib) $(CC_LD_DYNPATH)$(ICONVDIR)/$(lib)
+	else
+		ICONV_LINK =
+	endif
+	EXTLIBS += $(ICONV_LINK) -liconv
+endif
+ifdef NEEDS_SOCKET
+	EXTLIBS += -lsocket
+endif
+ifdef NEEDS_NSL
+	EXTLIBS += -lnsl
+endif
+ifdef NO_D_TYPE_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
+endif
+ifdef NO_D_INO_IN_DIRENT
+	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
+endif
+ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
+endif
+ifdef USE_NSEC
+	BASIC_CFLAGS += -DUSE_NSEC
+endif
+ifdef USE_ST_TIMESPEC
+	BASIC_CFLAGS += -DUSE_ST_TIMESPEC
+endif
+ifdef NO_NSEC
+	BASIC_CFLAGS += -DNO_NSEC
+endif
+ifdef NO_C99_FORMAT
+	BASIC_CFLAGS += -DNO_C99_FORMAT
+endif
+ifdef SNPRINTF_RETURNS_BOGUS
+	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
+	COMPAT_OBJS += compat/snprintf.o
+endif
+ifdef FREAD_READS_DIRECTORIES
+	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
+	COMPAT_OBJS += compat/fopen.o
+endif
+ifdef NO_SYMLINK_HEAD
+	BASIC_CFLAGS += -DNO_SYMLINK_HEAD
+endif
+ifdef NO_STRCASESTR
+	COMPAT_CFLAGS += -DNO_STRCASESTR
+	COMPAT_OBJS += compat/strcasestr.o
+endif
+ifdef NO_STRLCPY
+	COMPAT_CFLAGS += -DNO_STRLCPY
+	COMPAT_OBJS += compat/strlcpy.o
+endif
+ifdef NO_STRTOUMAX
+	COMPAT_CFLAGS += -DNO_STRTOUMAX
+	COMPAT_OBJS += compat/strtoumax.o
+endif
+ifdef NO_STRTOULL
+	COMPAT_CFLAGS += -DNO_STRTOULL
+endif
+ifdef NO_SETENV
+	COMPAT_CFLAGS += -DNO_SETENV
+	COMPAT_OBJS += compat/setenv.o
+endif
+ifdef NO_MKDTEMP
+	COMPAT_CFLAGS += -DNO_MKDTEMP
+	COMPAT_OBJS += compat/mkdtemp.o
+endif
+ifdef NO_UNSETENV
+	COMPAT_CFLAGS += -DNO_UNSETENV
+	COMPAT_OBJS += compat/unsetenv.o
+endif
+ifdef NO_SYS_SELECT_H
+	BASIC_CFLAGS += -DNO_SYS_SELECT_H
+endif
+ifdef NO_MMAP
+	COMPAT_CFLAGS += -DNO_MMAP
+	COMPAT_OBJS += compat/mmap.o
+else
+	ifdef USE_WIN32_MMAP
+		COMPAT_CFLAGS += -DUSE_WIN32_MMAP
+		COMPAT_OBJS += compat/win32mmap.o
+	endif
+endif
+ifdef NO_PREAD
+	COMPAT_CFLAGS += -DNO_PREAD
+	COMPAT_OBJS += compat/pread.o
+endif
+ifdef NO_FAST_WORKING_DIRECTORY
+	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
+endif
+ifdef NO_TRUSTABLE_FILEMODE
+	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
+endif
+ifdef NO_IPV6
+	BASIC_CFLAGS += -DNO_IPV6
+endif
+ifdef NO_UINTMAX_T
+	BASIC_CFLAGS += -Duintmax_t=uint32_t
+endif
+ifdef NO_SOCKADDR_STORAGE
+ifdef NO_IPV6
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
+else
+	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
+endif
+endif
+ifdef NO_INET_NTOP
+	LIB_OBJS += compat/inet_ntop.o
+endif
+ifdef NO_INET_PTON
+	LIB_OBJS += compat/inet_pton.o
+endif
+
+ifdef NO_ICONV
+	BASIC_CFLAGS += -DNO_ICONV
+endif
+
+ifdef OLD_ICONV
+	BASIC_CFLAGS += -DOLD_ICONV
+endif
+
+ifdef NO_DEFLATE_BOUND
+	BASIC_CFLAGS += -DNO_DEFLATE_BOUND
+endif
+
+ifdef PPC_SHA1
+	SHA1_HEADER = "ppc/sha1.h"
+	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
+else
+ifdef ARM_SHA1
+	SHA1_HEADER = "arm/sha1.h"
+	LIB_OBJS += arm/sha1.o arm/sha1_arm.o
+else
+ifdef MOZILLA_SHA1
+	SHA1_HEADER = "mozilla-sha1/sha1.h"
+	LIB_OBJS += mozilla-sha1/sha1.o
+else
+	SHA1_HEADER = <openssl/sha.h>
+	EXTLIBS += $(LIB_4_CRYPTO)
+endif
+endif
+endif
+ifdef NO_PERL_MAKEMAKER
+	export NO_PERL_MAKEMAKER
+endif
+ifdef NO_HSTRERROR
+	COMPAT_CFLAGS += -DNO_HSTRERROR
+	COMPAT_OBJS += compat/hstrerror.o
+endif
+ifdef NO_MEMMEM
+	COMPAT_CFLAGS += -DNO_MEMMEM
+	COMPAT_OBJS += compat/memmem.o
+endif
+ifdef INTERNAL_QSORT
+	COMPAT_CFLAGS += -DINTERNAL_QSORT
+	COMPAT_OBJS += compat/qsort.o
+endif
+ifdef RUNTIME_PREFIX
+	COMPAT_CFLAGS += -DRUNTIME_PREFIX
+endif
+
+ifdef NO_PTHREADS
+	THREADED_DELTA_SEARCH =
+	BASIC_CFLAGS += -DNO_PTHREADS
+else
+	EXTLIBS += $(PTHREAD_LIBS)
+endif
+
+ifdef THREADED_DELTA_SEARCH
+	BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH
+	LIB_OBJS += thread-utils.o
+endif
+ifdef DIR_HAS_BSD_GROUP_SEMANTICS
+	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
+endif
+ifdef NO_EXTERNAL_GREP
+	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
+endif
+
+ifeq ($(TCLTK_PATH),)
+NO_TCLTK=NoThanks
+endif
+
+ifeq ($(PERL_PATH),)
+NO_PERL=NoThanks
+endif
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_CC       = @echo '   ' CC $@;
+	QUIET_AR       = @echo '   ' AR $@;
+	QUIET_LINK     = @echo '   ' LINK $@;
+	QUIET_BUILT_IN = @echo '   ' BUILTIN $@;
+	QUIET_GEN      = @echo '   ' GEN $@;
+	QUIET_SUBDIR0  = +@subdir=
+	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			 $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+	export QUIET_GEN
+	export QUIET_BUILT_IN
+endif
+endif
+
+ifdef ASCIIDOC8
+	export ASCIIDOC8
+endif
+
+# Shell quote (do not use $(call) to accommodate ancient setups);
+
+SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
+ETC_GITCONFIG_SQ = $(subst ','\'',$(ETC_GITCONFIG))
+
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+mandir_SQ = $(subst ','\'',$(mandir))
+infodir_SQ = $(subst ','\'',$(infodir))
+gitexecdir_SQ = $(subst ','\'',$(gitexecdir))
+template_dir_SQ = $(subst ','\'',$(template_dir))
+htmldir_SQ = $(subst ','\'',$(htmldir))
+prefix_SQ = $(subst ','\'',$(prefix))
+
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
+TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH))
+
+LIBS = $(GITLIBS) $(EXTLIBS)
+
+BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
+	$(COMPAT_CFLAGS)
+LIB_OBJS += $(COMPAT_OBJS)
+
+ALL_CFLAGS += $(BASIC_CFLAGS)
+ALL_LDFLAGS += $(BASIC_LDFLAGS)
+
+export TAR INSTALL DESTDIR SHELL_PATH
+
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) GIT-BUILD-OPTIONS
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+endif
+
+all::
+ifndef NO_TCLTK
+	$(QUIET_SUBDIR0)git-gui $(QUIET_SUBDIR1) gitexecdir='$(gitexec_instdir_SQ)' all
+	$(QUIET_SUBDIR0)gitk-git $(QUIET_SUBDIR1) all
+endif
+ifndef NO_PERL
+	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all
+endif
+	$(QUIET_SUBDIR0)templates $(QUIET_SUBDIR1)
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+	@$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) git$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) git$X
+
+git.o: git.c common-cmds.h GIT-CFLAGS
+	$(QUIET_CC)$(CC) -DGIT_VERSION='"$(GIT_VERSION)"' \
+		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
+		$(ALL_CFLAGS) -c $(filter %.c,$^)
+
+git$X: git.o $(BUILTIN_OBJS) $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \
+		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
+
+builtin-help.o: builtin-help.c common-cmds.h GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
+		'-DGIT_MAN_PATH="$(mandir_SQ)"' \
+		'-DGIT_INFO_PATH="$(infodir_SQ)"' $<
+
+$(BUILT_INS): git$X
+	$(QUIET_BUILT_IN)$(RM) $@ && \
+	ln git$X $@ 2>/dev/null || \
+	ln -s git$X $@ 2>/dev/null || \
+	cp git$X $@
+
+common-cmds.h: ./generate-cmdlist.sh command-list.txt
+
+common-cmds.h: $(wildcard Documentation/git-*.txt)
+	$(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
+	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+	    $@.sh >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+ifndef NO_PERL
+$(patsubst %.perl,%,$(SCRIPT_PERL)): perl/perl.mak
+
+perl/perl.mak: GIT-CFLAGS perl/Makefile perl/Makefile.PL
+	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' $(@F)
+
+$(patsubst %.perl,%,$(SCRIPT_PERL)): % : %.perl
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	INSTLIBDIR=`MAKEFLAGS= $(MAKE) -C perl -s --no-print-directory instlibdir` && \
+	sed -e '1{' \
+	    -e '	s|#!.*perl|#!$(PERL_PATH_SQ)|' \
+	    -e '	h' \
+	    -e '	s=.*=use lib (split(/:/, $$ENV{GITPERLLIB} || "@@INSTLIBDIR@@"));=' \
+	    -e '	H' \
+	    -e '	x' \
+	    -e '}' \
+	    -e 's|@@INSTLIBDIR@@|'"$$INSTLIBDIR"'|g' \
+	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    $@.perl >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+gitweb/gitweb.cgi: gitweb/gitweb.perl
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*perl|#!$(PERL_PATH_SQ)|' \
+	    -e 's|++GIT_VERSION++|$(GIT_VERSION)|g' \
+	    -e 's|++GIT_BINDIR++|$(bindir)|g' \
+	    -e 's|++GITWEB_CONFIG++|$(GITWEB_CONFIG)|g' \
+	    -e 's|++GITWEB_CONFIG_SYSTEM++|$(GITWEB_CONFIG_SYSTEM)|g' \
+	    -e 's|++GITWEB_HOME_LINK_STR++|$(GITWEB_HOME_LINK_STR)|g' \
+	    -e 's|++GITWEB_SITENAME++|$(GITWEB_SITENAME)|g' \
+	    -e 's|++GITWEB_PROJECTROOT++|$(GITWEB_PROJECTROOT)|g' \
+	    -e 's|"++GITWEB_PROJECT_MAXDEPTH++"|$(GITWEB_PROJECT_MAXDEPTH)|g' \
+	    -e 's|++GITWEB_EXPORT_OK++|$(GITWEB_EXPORT_OK)|g' \
+	    -e 's|++GITWEB_STRICT_EXPORT++|$(GITWEB_STRICT_EXPORT)|g' \
+	    -e 's|++GITWEB_BASE_URL++|$(GITWEB_BASE_URL)|g' \
+	    -e 's|++GITWEB_LIST++|$(GITWEB_LIST)|g' \
+	    -e 's|++GITWEB_HOMETEXT++|$(GITWEB_HOMETEXT)|g' \
+	    -e 's|++GITWEB_CSS++|$(GITWEB_CSS)|g' \
+	    -e 's|++GITWEB_LOGO++|$(GITWEB_LOGO)|g' \
+	    -e 's|++GITWEB_FAVICON++|$(GITWEB_FAVICON)|g' \
+	    -e 's|++GITWEB_SITE_HEADER++|$(GITWEB_SITE_HEADER)|g' \
+	    -e 's|++GITWEB_SITE_FOOTER++|$(GITWEB_SITE_FOOTER)|g' \
+	    $< >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+
+git-instaweb: git-instaweb.sh gitweb/gitweb.cgi gitweb/gitweb.css
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
+	    -e '/@@GITWEB_CGI@@/r gitweb/gitweb.cgi' \
+	    -e '/@@GITWEB_CGI@@/d' \
+	    -e '/@@GITWEB_CSS@@/r gitweb/gitweb.css' \
+	    -e '/@@GITWEB_CSS@@/d' \
+	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
+	    $@.sh > $@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+else # NO_PERL
+$(patsubst %.perl,%,$(SCRIPT_PERL)) git-instaweb: % : unimplemented.sh
+	$(QUIET_GEN)$(RM) $@ $@+ && \
+	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
+	    -e 's|@@REASON@@|NO_PERL=$(NO_PERL)|g' \
+	    unimplemented.sh >$@+ && \
+	chmod +x $@+ && \
+	mv $@+ $@
+endif # NO_PERL
+
+configure: configure.ac
+	$(QUIET_GEN)$(RM) $@ $<+ && \
+	sed -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    $< > $<+ && \
+	autoconf -o $@ $<+ && \
+	$(RM) $<+
+
+# These can record GIT_VERSION
+git.o git.spec \
+	$(patsubst %.sh,%,$(SCRIPT_SH)) \
+	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
+	: GIT-VERSION-FILE
+
+%.o: %.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+%.s: %.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
+%.o: %.S
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
+
+exec_cmd.o: exec_cmd.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DGIT_EXEC_PATH="$(gitexecdir_SQ)"' \
+		'-DBINDIR="$(bindir_relative_SQ)"' \
+		'-DPREFIX="$(prefix_SQ)"' \
+		$<
+
+builtin-init-db.o: builtin-init-db.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
+
+config.o: config.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_GITCONFIG='"$(ETC_GITCONFIG_SQ)"' $<
+
+http.o: http.c GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DGIT_USER_AGENT='"git/$(GIT_VERSION)"' $<
+
+ifdef NO_EXPAT
+http-walker.o: http-walker.c http.h GIT-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DNO_EXPAT $<
+endif
+
+git-%$X: %.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+git-imap-send$X: imap-send.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
+		$(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL)
+
+http.o http-walker.o http-push.o transport.o: http.h
+
+git-http-push$X: revision.o http.o http-push.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
+		$(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+builtin-revert.o wt-status.o: wt-status.h
+
+$(LIB_FILE): $(LIB_OBJS)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+XDIFF_OBJS=xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \
+	xdiff/xmerge.o xdiff/xpatience.o
+$(XDIFF_OBJS): xdiff/xinclude.h xdiff/xmacros.h xdiff/xdiff.h xdiff/xtypes.h \
+	xdiff/xutils.h xdiff/xprepare.h xdiff/xdiffi.h xdiff/xemit.h
+
+$(XDIFF_LIB): $(XDIFF_OBJS)
+	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(XDIFF_OBJS)
+
+
+doc:
+	$(MAKE) -C Documentation all
+
+man:
+	$(MAKE) -C Documentation man
+
+html:
+	$(MAKE) -C Documentation html
+
+info:
+	$(MAKE) -C Documentation info
+
+pdf:
+	$(MAKE) -C Documentation pdf
+
+TAGS:
+	$(RM) TAGS
+	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
+
+tags:
+	$(RM) tags
+	$(FIND) . -name '*.[hcS]' -print | xargs ctags -a
+
+cscope:
+	$(RM) cscope*
+	$(FIND) . -name '*.[hcS]' -print | xargs cscope -b
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
+             $(bindir_SQ):$(gitexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+
+GIT-CFLAGS: .FORCE-GIT-CFLAGS
+	@FLAGS='$(TRACK_CFLAGS)'; \
+	    if test x"$$FLAGS" != x"`cat GIT-CFLAGS 2>/dev/null`" ; then \
+		echo 1>&2 "    * new build flags or prefix"; \
+		echo "$$FLAGS" >GIT-CFLAGS; \
+            fi
+
+# We need to apply sq twice, once to protect from the shell
+# that runs GIT-BUILD-OPTIONS, and then again to protect it
+# and the first level quoting from the shell that runs "echo".
+GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS
+	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
+	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
+	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
+	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
+
+### Detect Tck/Tk interpreter path changes
+ifndef NO_TCLTK
+TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)')
+
+GIT-GUI-VARS: .FORCE-GIT-GUI-VARS
+	@VARS='$(TRACK_VARS)'; \
+	    if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \
+		echo 1>&2 "    * new Tcl/Tk interpreter location"; \
+		echo "$$VARS" >$@; \
+            fi
+
+.PHONY: .FORCE-GIT-GUI-VARS
+endif
+
+### Testing rules
+
+TEST_PROGRAMS += test-chmtime$X
+TEST_PROGRAMS += test-ctype$X
+TEST_PROGRAMS += test-date$X
+TEST_PROGRAMS += test-delta$X
+TEST_PROGRAMS += test-dump-cache-tree$X
+TEST_PROGRAMS += test-genrandom$X
+TEST_PROGRAMS += test-match-trees$X
+TEST_PROGRAMS += test-parse-options$X
+TEST_PROGRAMS += test-path-utils$X
+TEST_PROGRAMS += test-sha1$X
+TEST_PROGRAMS += test-sigchain$X
+
+all:: $(TEST_PROGRAMS)
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
+
+export NO_SVN_TESTS
+
+test: all
+	$(MAKE) -C t/ all
+
+test-ctype$X: ctype.o
+
+test-date$X: date.o ctype.o
+
+test-delta$X: diff-delta.o patch-delta.o
+
+test-parse-options$X: parse-options.o
+
+.PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS))
+
+test-%$X: test-%.o $(GITLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+check-sha1:: test-sha1$X
+	./test-sha1.sh
+
+check: common-cmds.h
+	if sparse; \
+	then \
+		for i in *.c; \
+		do \
+			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+		done; \
+	else \
+		echo 2>&1 "Did you mean 'make test'?"; \
+		exit 1; \
+	fi
+
+remove-dashes:
+	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
+
+### Installation rules
+
+ifneq ($(filter /%,$(firstword $(template_dir))),)
+template_instdir = $(template_dir)
+else
+template_instdir = $(prefix)/$(template_dir)
+endif
+export template_instdir
+
+ifneq ($(filter /%,$(firstword $(gitexecdir))),)
+gitexec_instdir = $(gitexecdir)
+else
+gitexec_instdir = $(prefix)/$(gitexecdir)
+endif
+gitexec_instdir_SQ = $(subst ','\'',$(gitexec_instdir))
+export gitexec_instdir
+
+install: all
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
+	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
+	$(INSTALL) git$X git-upload-pack$X git-receive-pack$X git-upload-archive$X git-shell$X git-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
+	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
+ifndef NO_TCLTK
+	$(MAKE) -C gitk-git install
+	$(MAKE) -C git-gui gitexecdir='$(gitexec_instdir_SQ)' install
+endif
+ifneq (,$X)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), $(RM) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)/$p';)
+endif
+	bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \
+	execdir=$$(cd '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' && pwd) && \
+	{ $(RM) "$$execdir/git-add$X" && \
+		ln "$$bindir/git$X" "$$execdir/git-add$X" 2>/dev/null || \
+		cp "$$bindir/git$X" "$$execdir/git-add$X"; } && \
+	{ for p in $(filter-out git-add$X,$(BUILT_INS)); do \
+		$(RM) "$$execdir/$$p" && \
+		ln "$$execdir/git-add$X" "$$execdir/$$p" 2>/dev/null || \
+		ln -s "git-add$X" "$$execdir/$$p" 2>/dev/null || \
+		cp "$$execdir/git-add$X" "$$execdir/$$p" || exit; \
+	  done } && \
+	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/git-add$X"
+
+install-doc:
+	$(MAKE) -C Documentation install
+
+install-man:
+	$(MAKE) -C Documentation install-man
+
+install-html:
+	$(MAKE) -C Documentation install-html
+
+install-info:
+	$(MAKE) -C Documentation install-info
+
+install-pdf:
+	$(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+	$(MAKE) -C Documentation quick-install
+
+quick-install-man:
+	$(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+	$(MAKE) -C Documentation quick-install-html
+
+
+
+### Maintainer's dist rules
+
+git.spec: git.spec.in
+	sed -e 's/@@VERSION@@/$(GIT_VERSION)/g' < $< > $@+
+	mv $@+ $@
+
+GIT_TARNAME=git-$(GIT_VERSION)
+dist: git.spec git-archive$(X) configure
+	./git-archive --format=tar \
+		--prefix=$(GIT_TARNAME)/ HEAD^{tree} > $(GIT_TARNAME).tar
+	@mkdir -p $(GIT_TARNAME)
+	@cp git.spec configure $(GIT_TARNAME)
+	@echo $(GIT_VERSION) > $(GIT_TARNAME)/version
+	@$(MAKE) -C git-gui TARDIR=../$(GIT_TARNAME)/git-gui dist-version
+	$(TAR) rf $(GIT_TARNAME).tar \
+		$(GIT_TARNAME)/git.spec \
+		$(GIT_TARNAME)/configure \
+		$(GIT_TARNAME)/version \
+		$(GIT_TARNAME)/git-gui/version
+	@$(RM) -r $(GIT_TARNAME)
+	gzip -f -9 $(GIT_TARNAME).tar
+
+rpm: dist
+	$(RPMBUILD) -ta $(GIT_TARNAME).tar.gz
+
+htmldocs = git-htmldocs-$(GIT_VERSION)
+manpages = git-manpages-$(GIT_VERSION)
+dist-doc:
+	$(RM) -r .doc-tmp-dir
+	mkdir .doc-tmp-dir
+	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+	gzip -n -9 -f $(htmldocs).tar
+	:
+	$(RM) -r .doc-tmp-dir
+	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+	$(MAKE) -C Documentation DESTDIR=./ \
+		man1dir=../.doc-tmp-dir/man1 \
+		man5dir=../.doc-tmp-dir/man5 \
+		man7dir=../.doc-tmp-dir/man7 \
+		install
+	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+	gzip -n -9 -f $(manpages).tar
+	$(RM) -r .doc-tmp-dir
+
+### Cleaning rules
+
+distclean: clean
+	$(RM) configure
 
 clean:
-	rm $(BINS)
+	$(RM) *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \
+		$(LIB_FILE) $(XDIFF_LIB)
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) git$X
+	$(RM) $(TEST_PROGRAMS)
+	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
+	$(RM) -r autom4te.cache
+	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
+	$(RM) -r $(GIT_TARNAME) .doc-tmp-dir
+	$(RM) $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz
+	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+	$(MAKE) -C Documentation/ clean
+ifndef NO_PERL
+	$(RM) gitweb/gitweb.cgi
+	$(MAKE) -C perl clean
+endif
+	$(MAKE) -C templates/ clean
+	$(MAKE) -C t/ clean
+ifndef NO_TCLTK
+	$(MAKE) -C gitk-git clean
+	$(MAKE) -C git-gui clean
+endif
+	$(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS
+
+.PHONY: all install clean strip
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: .FORCE-GIT-VERSION-FILE TAGS tags cscope .FORCE-GIT-CFLAGS
+.PHONY: .FORCE-GIT-BUILD-OPTIONS
+
+### Check documentation
+#
+check-docs::
+	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) git gitk; \
+	do \
+		case "$$v" in \
+		git-merge-octopus | git-merge-ours | git-merge-recursive | \
+		git-merge-resolve | git-merge-subtree | \
+		git-fsck-objects | git-init-db | \
+		git-?*--?* ) continue ;; \
+		esac ; \
+		test -f "Documentation/$$v.txt" || \
+		echo "no doc: $$v"; \
+		sed -e '/^#/d' command-list.txt | \
+		grep -q "^$$v[ 	]" || \
+		case "$$v" in \
+		git) ;; \
+		*) echo "no link: $$v";; \
+		esac ; \
+	done; \
+	( \
+		sed -e '/^#/d' \
+		    -e 's/[ 	].*//' \
+		    -e 's/^/listed /' command-list.txt; \
+		ls -1 Documentation/git*txt | \
+		sed -e 's|Documentation/|documented |' \
+		    -e 's/\.txt//'; \
+	) | while read how cmd; \
+	do \
+		case "$$how,$$cmd" in \
+		*,git-citool | \
+		*,git-gui | \
+		*,git-help | \
+		documented,gitattributes | \
+		documented,gitignore | \
+		documented,gitmodules | \
+		documented,gitcli | \
+		documented,git-tools | \
+		documented,gitcore-tutorial | \
+		documented,gitcvs-migration | \
+		documented,gitdiffcore | \
+		documented,gitglossary | \
+		documented,githooks | \
+		documented,gitrepository-layout | \
+		documented,gittutorial | \
+		documented,gittutorial-2 | \
+		sentinel,not,matching,is,ok ) continue ;; \
+		esac; \
+		case " $(ALL_PROGRAMS) $(BUILT_INS) git gitk " in \
+		*" $$cmd "*)	;; \
+		*) echo "removed but $$how: $$cmd" ;; \
+		esac; \
+	done ) | sort
+
+### Make sure built-ins do not have dups and listed in git.c
+#
+check-builtins::
+	./check-builtins.sh
+
+### Test suite coverage testing
+#
+.PHONY: coverage coverage-clean coverage-build coverage-report
+
+coverage:
+	$(MAKE) coverage-build
+	$(MAKE) coverage-report
+
+coverage-clean:
+	rm -f *.gcda *.gcno
+
+COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+
+coverage-build: coverage-clean
+	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+		-j1 test
+
+coverage-report:
+	gcov -b *.c
+	grep '^function.*called 0 ' *.c.gcov \
+		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+		| tee coverage-untested-functions
-- 
GitLab


From 0780060124011b94af55830939c86cc0916be0f5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:00:56 +0200
Subject: [PATCH 0540/2198] perf_counter tools: add in basic glue from Git

First very raw version at having a central 'perf' command and
a list of subcommands:

  perf top
  perf stat
  perf record
  perf report
  ...

This is done by picking up Git's collection of utility functions,
and hacking them to build fine in this new environment.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/.gitignore         |  179 +++
 Documentation/perf_counter/Makefile           | 1037 +++---------
 Documentation/perf_counter/PERF-BUILD-OPTIONS |    4 +
 Documentation/perf_counter/PERF-CFLAGS        |    1 +
 Documentation/perf_counter/PERF-VERSION-FILE  |    1 +
 Documentation/perf_counter/PERF-VERSION-GEN   |   42 +
 Documentation/perf_counter/abspath.c          |  117 ++
 Documentation/perf_counter/alias.c            |   77 +
 Documentation/perf_counter/builtin-help.c     |  463 ++++++
 Documentation/perf_counter/builtin-top.c      | 1411 +++++++++++++++++
 Documentation/perf_counter/builtin.h          |   18 +
 Documentation/perf_counter/cache.h            |   97 ++
 Documentation/perf_counter/command-list.txt   |    4 +
 Documentation/perf_counter/config.c           |  966 +++++++++++
 Documentation/perf_counter/ctype.c            |   26 +
 Documentation/perf_counter/exec_cmd.c         |  165 ++
 Documentation/perf_counter/exec_cmd.h         |   13 +
 .../perf_counter/generate-cmdlist.sh          |   24 +
 Documentation/perf_counter/help.c             |  366 +++++
 Documentation/perf_counter/help.h             |   29 +
 Documentation/perf_counter/levenshtein.c      |   84 +
 Documentation/perf_counter/levenshtein.h      |    8 +
 Documentation/perf_counter/parse-options.c    |  495 ++++++
 Documentation/perf_counter/parse-options.h    |  172 ++
 Documentation/perf_counter/path.c             |  392 +++++
 Documentation/perf_counter/perf.c             |  411 +++++
 Documentation/perf_counter/quote.c            |  478 ++++++
 Documentation/perf_counter/quote.h            |   68 +
 Documentation/perf_counter/run-command.c      |  395 +++++
 Documentation/perf_counter/run-command.h      |   93 ++
 Documentation/perf_counter/strbuf.c           |  359 +++++
 Documentation/perf_counter/strbuf.h           |  137 ++
 Documentation/perf_counter/usage.c            |   80 +
 Documentation/perf_counter/util.h             |  394 +++++
 Documentation/perf_counter/wrapper.c          |  206 +++
 35 files changed, 7953 insertions(+), 859 deletions(-)
 create mode 100644 Documentation/perf_counter/.gitignore
 create mode 100644 Documentation/perf_counter/PERF-BUILD-OPTIONS
 create mode 100644 Documentation/perf_counter/PERF-CFLAGS
 create mode 100644 Documentation/perf_counter/PERF-VERSION-FILE
 create mode 100755 Documentation/perf_counter/PERF-VERSION-GEN
 create mode 100644 Documentation/perf_counter/abspath.c
 create mode 100644 Documentation/perf_counter/alias.c
 create mode 100644 Documentation/perf_counter/builtin-help.c
 create mode 100644 Documentation/perf_counter/builtin-top.c
 create mode 100644 Documentation/perf_counter/builtin.h
 create mode 100644 Documentation/perf_counter/cache.h
 create mode 100644 Documentation/perf_counter/command-list.txt
 create mode 100644 Documentation/perf_counter/config.c
 create mode 100644 Documentation/perf_counter/ctype.c
 create mode 100644 Documentation/perf_counter/exec_cmd.c
 create mode 100644 Documentation/perf_counter/exec_cmd.h
 create mode 100755 Documentation/perf_counter/generate-cmdlist.sh
 create mode 100644 Documentation/perf_counter/help.c
 create mode 100644 Documentation/perf_counter/help.h
 create mode 100644 Documentation/perf_counter/levenshtein.c
 create mode 100644 Documentation/perf_counter/levenshtein.h
 create mode 100644 Documentation/perf_counter/parse-options.c
 create mode 100644 Documentation/perf_counter/parse-options.h
 create mode 100644 Documentation/perf_counter/path.c
 create mode 100644 Documentation/perf_counter/perf.c
 create mode 100644 Documentation/perf_counter/quote.c
 create mode 100644 Documentation/perf_counter/quote.h
 create mode 100644 Documentation/perf_counter/run-command.c
 create mode 100644 Documentation/perf_counter/run-command.h
 create mode 100644 Documentation/perf_counter/strbuf.c
 create mode 100644 Documentation/perf_counter/strbuf.h
 create mode 100644 Documentation/perf_counter/usage.c
 create mode 100644 Documentation/perf_counter/util.h
 create mode 100644 Documentation/perf_counter/wrapper.c

diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
new file mode 100644
index 0000000000000..41c0b20a76ce0
--- /dev/null
+++ b/Documentation/perf_counter/.gitignore
@@ -0,0 +1,179 @@
+GIT-BUILD-OPTIONS
+GIT-CFLAGS
+GIT-GUI-VARS
+GIT-VERSION-FILE
+git
+git-add
+git-add--interactive
+git-am
+git-annotate
+git-apply
+git-archimport
+git-archive
+git-bisect
+git-bisect--helper
+git-blame
+git-branch
+git-bundle
+git-cat-file
+git-check-attr
+git-check-ref-format
+git-checkout
+git-checkout-index
+git-cherry
+git-cherry-pick
+git-clean
+git-clone
+git-commit
+git-commit-tree
+git-config
+git-count-objects
+git-cvsexportcommit
+git-cvsimport
+git-cvsserver
+git-daemon
+git-diff
+git-diff-files
+git-diff-index
+git-diff-tree
+git-difftool
+git-difftool--helper
+git-describe
+git-fast-export
+git-fast-import
+git-fetch
+git-fetch--tool
+git-fetch-pack
+git-filter-branch
+git-fmt-merge-msg
+git-for-each-ref
+git-format-patch
+git-fsck
+git-fsck-objects
+git-gc
+git-get-tar-commit-id
+git-grep
+git-hash-object
+git-help
+git-http-fetch
+git-http-push
+git-imap-send
+git-index-pack
+git-init
+git-init-db
+git-instaweb
+git-log
+git-lost-found
+git-ls-files
+git-ls-remote
+git-ls-tree
+git-mailinfo
+git-mailsplit
+git-merge
+git-merge-base
+git-merge-index
+git-merge-file
+git-merge-tree
+git-merge-octopus
+git-merge-one-file
+git-merge-ours
+git-merge-recursive
+git-merge-resolve
+git-merge-subtree
+git-mergetool
+git-mergetool--lib
+git-mktag
+git-mktree
+git-name-rev
+git-mv
+git-pack-redundant
+git-pack-objects
+git-pack-refs
+git-parse-remote
+git-patch-id
+git-peek-remote
+git-prune
+git-prune-packed
+git-pull
+git-push
+git-quiltimport
+git-read-tree
+git-rebase
+git-rebase--interactive
+git-receive-pack
+git-reflog
+git-relink
+git-remote
+git-repack
+git-repo-config
+git-request-pull
+git-rerere
+git-reset
+git-rev-list
+git-rev-parse
+git-revert
+git-rm
+git-send-email
+git-send-pack
+git-sh-setup
+git-shell
+git-shortlog
+git-show
+git-show-branch
+git-show-index
+git-show-ref
+git-stage
+git-stash
+git-status
+git-stripspace
+git-submodule
+git-svn
+git-symbolic-ref
+git-tag
+git-tar-tree
+git-unpack-file
+git-unpack-objects
+git-update-index
+git-update-ref
+git-update-server-info
+git-upload-archive
+git-upload-pack
+git-var
+git-verify-pack
+git-verify-tag
+git-web--browse
+git-whatchanged
+git-write-tree
+git-core-*/?*
+gitk-wish
+gitweb/gitweb.cgi
+test-chmtime
+test-ctype
+test-date
+test-delta
+test-dump-cache-tree
+test-genrandom
+test-match-trees
+test-parse-options
+test-path-utils
+test-sha1
+test-sigchain
+common-cmds.h
+*.tar.gz
+*.dsc
+*.deb
+git.spec
+*.exe
+*.[aos]
+*.py[co]
+config.mak
+autom4te.cache
+config.cache
+config.log
+config.status
+config.mak.autogen
+config.mak.append
+configure
+tags
+TAGS
+cscope*
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 6e0838b03ad1e..11809b943fc3c 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -13,16 +13,9 @@ all::
 # Define NO_OPENSSL environment variable if you do not have OpenSSL.
 # This also implies MOZILLA_SHA1.
 #
-# Define NO_CURL if you do not have libcurl installed.  git-http-pull and
-# git-http-push are not built, and you cannot use http:// and https://
-# transports.
-#
 # Define CURLDIR=/foo/bar if your curl header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
-# Define NO_EXPAT if you do not have expat installed.  git-http-push is
-# not built, and you cannot push using http:// and https:// transports.
-#
 # Define EXPATDIR=/foo/bar if your expat header and library files are in
 # /foo/bar/include and /foo/bar/lib directories.
 #
@@ -40,8 +33,6 @@ all::
 #
 # Define NO_MEMMEM if you don't have memmem.
 #
-# Define NO_STRLCPY if you don't have strlcpy.
-#
 # Define NO_STRTOUMAX if you don't have strtoumax in the C library.
 # If your compiler also does not support long long or does not have
 # strtoull, define NO_STRTOULL.
@@ -54,7 +45,7 @@ all::
 #
 # Define NO_SYS_SELECT_H if you don't have sys/select.h.
 #
-# Define NO_SYMLINK_HEAD if you never want .git/HEAD to be a symbolic link.
+# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
 # Enable it on Windows.  By default, symrefs are still used.
 #
 # Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
@@ -62,13 +53,13 @@ all::
 # but are not needed unless you plan to talk to SVN repos.
 #
 # Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
-# installed in /sw, but don't want GIT to link against any libraries
+# installed in /sw, but don't want PERF to link against any libraries
 # installed there.  If defined you may specify your own (or Fink's)
 # include directories and library directories by defining CFLAGS
 # and LDFLAGS appropriately.
 #
 # Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
-# have DarwinPorts installed in /opt/local, but don't want GIT to
+# have DarwinPorts installed in /opt/local, but don't want PERF to
 # link against any libraries installed there.  If defined you may
 # specify your own (or DarwinPort's) include directories and
 # library directories by defining CFLAGS and LDFLAGS appropriately.
@@ -120,7 +111,7 @@ all::
 # that tells runtime paths to dynamic libraries;
 # "-Wl,-rpath=/path/lib" is used instead.
 #
-# Define USE_NSEC below if you want git to care about sub-second file mtimes
+# Define USE_NSEC below if you want perf to care about sub-second file mtimes
 # and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
 # it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
 # randomly break unless your underlying filesystem supports those sub-second
@@ -132,7 +123,7 @@ all::
 # Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
 # available.  This automatically turns USE_NSEC off.
 #
-# Define USE_STDEV below if you want git to care about the underlying device
+# Define USE_STDEV below if you want perf to care about the underlying device
 # change being considered an inode change from the update-index perspective.
 #
 # Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
@@ -150,27 +141,24 @@ all::
 # Define NO_TCLTK if you do not want Tcl/Tk GUI.
 #
 # The TCL_PATH variable governs the location of the Tcl interpreter
-# used to optimize git-gui for your system.  Only used if NO_TCLTK
+# used to optimize perf-gui for your system.  Only used if NO_TCLTK
 # is not set.  Defaults to the bare 'tclsh'.
 #
 # The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter.
 # If not set it defaults to the bare 'wish'. If it is set to the empty
 # string then NO_TCLTK will be forced (this is used by configure script).
 #
-# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit
-# parallel delta searching when packing objects.
-#
 # Define INTERNAL_QSORT to use Git's implementation of qsort(), which
 # is a simplified version of the merge sort used in glibc. This is
 # recommended if Git triggers O(n^2) behavior in your platform's qsort().
 #
-# Define NO_EXTERNAL_GREP if you don't want "git grep" to ever call
+# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
 # your external grep (e.g., if your system lacks grep, if its grep is
-# broken, or spawning external process is slower than built-in grep git has).
+# broken, or spawning external process is slower than built-in grep perf has).
 
-GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE
-	@$(SHELL_PATH) ./GIT-VERSION-GEN
--include GIT-VERSION-FILE
+PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	@$(SHELL_PATH) ./PERF-VERSION-GEN
+-include PERF-VERSION-FILE
 
 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
 uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
@@ -182,20 +170,20 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 CFLAGS = -g -O2 -Wall
-LDFLAGS =
+LDFLAGS = -lpthread -lrt
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
 
 # Among the variables below, these:
-#   gitexecdir
+#   perfexecdir
 #   template_dir
 #   mandir
 #   infodir
 #   htmldir
-#   ETC_GITCONFIG (but not sysconfdir)
+#   ETC_PERFCONFIG (but not sysconfdir)
 # can be specified as a relative path some/where/else;
-# this is interpreted as relative to $(prefix) and "git" at
+# this is interpreted as relative to $(prefix) and "perf" at
 # runtime figures out where they are based on the path to the executable.
 # This can help installing the suite in a relocatable way.
 
@@ -204,38 +192,20 @@ bindir_relative = bin
 bindir = $(prefix)/$(bindir_relative)
 mandir = share/man
 infodir = share/info
-gitexecdir = libexec/git-core
+perfexecdir = libexec/perf-core
 sharedir = $(prefix)/share
-template_dir = share/git-core/templates
-htmldir = share/doc/git-doc
+template_dir = share/perf-core/templates
+htmldir = share/doc/perf-doc
 ifeq ($(prefix),/usr)
 sysconfdir = /etc
-ETC_GITCONFIG = $(sysconfdir)/gitconfig
+ETC_PERFCONFIG = $(sysconfdir)/perfconfig
 else
 sysconfdir = $(prefix)/etc
-ETC_GITCONFIG = etc/gitconfig
+ETC_PERFCONFIG = etc/perfconfig
 endif
 lib = lib
 # DESTDIR=
 
-# default configuration for gitweb
-GITWEB_CONFIG = gitweb_config.perl
-GITWEB_CONFIG_SYSTEM = /etc/gitweb.conf
-GITWEB_HOME_LINK_STR = projects
-GITWEB_SITENAME =
-GITWEB_PROJECTROOT = /pub/git
-GITWEB_PROJECT_MAXDEPTH = 2007
-GITWEB_EXPORT_OK =
-GITWEB_STRICT_EXPORT =
-GITWEB_BASE_URL =
-GITWEB_LIST =
-GITWEB_HOMETEXT = indextext.html
-GITWEB_CSS = gitweb.css
-GITWEB_LOGO = git-logo.png
-GITWEB_FAVICON = git-favicon.png
-GITWEB_SITE_HEADER =
-GITWEB_SITE_FOOTER =
-
 export prefix bindir sharedir sysconfdir
 
 CC = gcc
@@ -277,89 +247,46 @@ SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
 
-SCRIPT_SH += git-am.sh
-SCRIPT_SH += git-bisect.sh
-SCRIPT_SH += git-difftool--helper.sh
-SCRIPT_SH += git-filter-branch.sh
-SCRIPT_SH += git-lost-found.sh
-SCRIPT_SH += git-merge-octopus.sh
-SCRIPT_SH += git-merge-one-file.sh
-SCRIPT_SH += git-merge-resolve.sh
-SCRIPT_SH += git-mergetool.sh
-SCRIPT_SH += git-mergetool--lib.sh
-SCRIPT_SH += git-parse-remote.sh
-SCRIPT_SH += git-pull.sh
-SCRIPT_SH += git-quiltimport.sh
-SCRIPT_SH += git-rebase--interactive.sh
-SCRIPT_SH += git-rebase.sh
-SCRIPT_SH += git-repack.sh
-SCRIPT_SH += git-request-pull.sh
-SCRIPT_SH += git-sh-setup.sh
-SCRIPT_SH += git-stash.sh
-SCRIPT_SH += git-submodule.sh
-SCRIPT_SH += git-web--browse.sh
-
-SCRIPT_PERL += git-add--interactive.perl
-SCRIPT_PERL += git-difftool.perl
-SCRIPT_PERL += git-archimport.perl
-SCRIPT_PERL += git-cvsexportcommit.perl
-SCRIPT_PERL += git-cvsimport.perl
-SCRIPT_PERL += git-cvsserver.perl
-SCRIPT_PERL += git-relink.perl
-SCRIPT_PERL += git-send-email.perl
-SCRIPT_PERL += git-svn.perl
+#
+# No scripts right now:
+#
+
+# SCRIPT_SH += perf-am.sh
+
+#
+# No Perl scripts right now:
+#
+
+# SCRIPT_PERL += perf-add--interactive.perl
 
 SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
-	  $(patsubst %.perl,%,$(SCRIPT_PERL)) \
-	  git-instaweb
+	  $(patsubst %.perl,%,$(SCRIPT_PERL))
 
 # Empty...
 EXTRA_PROGRAMS =
 
-# ... and all the rest that could be moved out of bindir to gitexecdir
+# ... and all the rest that could be moved out of bindir to perfexecdir
 PROGRAMS += $(EXTRA_PROGRAMS)
-PROGRAMS += git-fast-import$X
-PROGRAMS += git-hash-object$X
-PROGRAMS += git-index-pack$X
-PROGRAMS += git-merge-index$X
-PROGRAMS += git-merge-tree$X
-PROGRAMS += git-mktag$X
-PROGRAMS += git-mktree$X
-PROGRAMS += git-pack-redundant$X
-PROGRAMS += git-patch-id$X
-PROGRAMS += git-shell$X
-PROGRAMS += git-show-index$X
-PROGRAMS += git-unpack-file$X
-PROGRAMS += git-update-server-info$X
-PROGRAMS += git-upload-pack$X
-PROGRAMS += git-var$X
+
+#
+# None right now:
+#
+# PROGRAMS += perf-fast-import$X
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
-BUILT_INS += $(patsubst builtin-%.o,git-%$X,$(BUILTIN_OBJS))
-
-BUILT_INS += git-cherry$X
-BUILT_INS += git-cherry-pick$X
-BUILT_INS += git-format-patch$X
-BUILT_INS += git-fsck-objects$X
-BUILT_INS += git-get-tar-commit-id$X
-BUILT_INS += git-init$X
-BUILT_INS += git-merge-subtree$X
-BUILT_INS += git-peek-remote$X
-BUILT_INS += git-repo-config$X
-BUILT_INS += git-show$X
-BUILT_INS += git-stage$X
-BUILT_INS += git-status$X
-BUILT_INS += git-whatchanged$X
-
-# what 'all' will build and 'install' will install, in gitexecdir
+BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
+
+#
+# None right now:
+#
+# BUILT_INS += perf-init $X
+
+# what 'all' will build and 'install' will install, in perfexecdir
 ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
 
-# what 'all' will build but not install in gitexecdir
-OTHER_PROGRAMS = git$X
-ifndef NO_PERL
-OTHER_PROGRAMS += gitweb/gitweb.cgi
-endif
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = perf$X
 
 # Set paths to tools early so that they can be used for version tests.
 ifndef SHELL_PATH
@@ -371,250 +298,34 @@ endif
 
 export PERL_PATH
 
-LIB_FILE=libgit.a
-XDIFF_LIB=xdiff/lib.a
-
-LIB_H += archive.h
-LIB_H += attr.h
-LIB_H += blob.h
-LIB_H += builtin.h
-LIB_H += cache.h
-LIB_H += cache-tree.h
-LIB_H += commit.h
-LIB_H += compat/cygwin.h
-LIB_H += compat/mingw.h
-LIB_H += csum-file.h
-LIB_H += decorate.h
-LIB_H += delta.h
-LIB_H += diffcore.h
-LIB_H += diff.h
-LIB_H += dir.h
-LIB_H += fsck.h
-LIB_H += git-compat-util.h
-LIB_H += graph.h
-LIB_H += grep.h
-LIB_H += hash.h
-LIB_H += help.h
+LIB_FILE=libperf.a
+
+LIB_H += ../../include/linux/perf_counter.h
 LIB_H += levenshtein.h
-LIB_H += list-objects.h
-LIB_H += ll-merge.h
-LIB_H += log-tree.h
-LIB_H += mailmap.h
-LIB_H += merge-recursive.h
-LIB_H += object.h
-LIB_H += pack.h
-LIB_H += pack-refs.h
-LIB_H += pack-revindex.h
 LIB_H += parse-options.h
-LIB_H += patch-ids.h
-LIB_H += pkt-line.h
-LIB_H += progress.h
 LIB_H += quote.h
-LIB_H += reflog-walk.h
-LIB_H += refs.h
-LIB_H += remote.h
-LIB_H += rerere.h
-LIB_H += revision.h
-LIB_H += run-command.h
-LIB_H += sha1-lookup.h
-LIB_H += sideband.h
-LIB_H += sigchain.h
 LIB_H += strbuf.h
-LIB_H += string-list.h
-LIB_H += tag.h
-LIB_H += transport.h
-LIB_H += tree.h
-LIB_H += tree-walk.h
-LIB_H += unpack-trees.h
-LIB_H += userdiff.h
-LIB_H += utf8.h
-LIB_H += wt-status.h
+LIB_H += run-command.h
 
 LIB_OBJS += abspath.o
 LIB_OBJS += alias.o
-LIB_OBJS += alloc.o
-LIB_OBJS += archive.o
-LIB_OBJS += archive-tar.o
-LIB_OBJS += archive-zip.o
-LIB_OBJS += attr.o
-LIB_OBJS += base85.o
-LIB_OBJS += bisect.o
-LIB_OBJS += blob.o
-LIB_OBJS += branch.o
-LIB_OBJS += bundle.o
-LIB_OBJS += cache-tree.o
-LIB_OBJS += color.o
-LIB_OBJS += combine-diff.o
-LIB_OBJS += commit.o
 LIB_OBJS += config.o
-LIB_OBJS += connect.o
-LIB_OBJS += convert.o
-LIB_OBJS += copy.o
-LIB_OBJS += csum-file.o
 LIB_OBJS += ctype.o
-LIB_OBJS += date.o
-LIB_OBJS += decorate.o
-LIB_OBJS += diffcore-break.o
-LIB_OBJS += diffcore-delta.o
-LIB_OBJS += diffcore-order.o
-LIB_OBJS += diffcore-pickaxe.o
-LIB_OBJS += diffcore-rename.o
-LIB_OBJS += diff-delta.o
-LIB_OBJS += diff-lib.o
-LIB_OBJS += diff-no-index.o
-LIB_OBJS += diff.o
-LIB_OBJS += dir.o
-LIB_OBJS += editor.o
-LIB_OBJS += entry.o
-LIB_OBJS += environment.o
 LIB_OBJS += exec_cmd.o
-LIB_OBJS += fsck.o
-LIB_OBJS += graph.o
-LIB_OBJS += grep.o
-LIB_OBJS += hash.o
 LIB_OBJS += help.o
-LIB_OBJS += ident.o
 LIB_OBJS += levenshtein.o
-LIB_OBJS += list-objects.o
-LIB_OBJS += ll-merge.o
-LIB_OBJS += lockfile.o
-LIB_OBJS += log-tree.o
-LIB_OBJS += mailmap.o
-LIB_OBJS += match-trees.o
-LIB_OBJS += merge-file.o
-LIB_OBJS += merge-recursive.o
-LIB_OBJS += name-hash.o
-LIB_OBJS += object.o
-LIB_OBJS += pack-check.o
-LIB_OBJS += pack-refs.o
-LIB_OBJS += pack-revindex.o
-LIB_OBJS += pack-write.o
-LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
-LIB_OBJS += patch-delta.o
-LIB_OBJS += patch-ids.o
 LIB_OBJS += path.o
-LIB_OBJS += pkt-line.o
-LIB_OBJS += preload-index.o
-LIB_OBJS += pretty.o
-LIB_OBJS += progress.o
-LIB_OBJS += quote.o
-LIB_OBJS += reachable.o
-LIB_OBJS += read-cache.o
-LIB_OBJS += reflog-walk.o
-LIB_OBJS += refs.o
-LIB_OBJS += remote.o
-LIB_OBJS += rerere.o
-LIB_OBJS += revision.o
 LIB_OBJS += run-command.o
-LIB_OBJS += server-info.o
-LIB_OBJS += setup.o
-LIB_OBJS += sha1-lookup.o
-LIB_OBJS += sha1_file.o
-LIB_OBJS += sha1_name.o
-LIB_OBJS += shallow.o
-LIB_OBJS += sideband.o
-LIB_OBJS += sigchain.o
+LIB_OBJS += quote.o
 LIB_OBJS += strbuf.o
-LIB_OBJS += string-list.o
-LIB_OBJS += symlinks.o
-LIB_OBJS += tag.o
-LIB_OBJS += trace.o
-LIB_OBJS += transport.o
-LIB_OBJS += tree-diff.o
-LIB_OBJS += tree.o
-LIB_OBJS += tree-walk.o
-LIB_OBJS += unpack-trees.o
 LIB_OBJS += usage.o
-LIB_OBJS += userdiff.o
-LIB_OBJS += utf8.o
-LIB_OBJS += walker.o
 LIB_OBJS += wrapper.o
-LIB_OBJS += write_or_die.o
-LIB_OBJS += ws.o
-LIB_OBJS += wt-status.o
-LIB_OBJS += xdiff-interface.o
-
-BUILTIN_OBJS += builtin-add.o
-BUILTIN_OBJS += builtin-annotate.o
-BUILTIN_OBJS += builtin-apply.o
-BUILTIN_OBJS += builtin-archive.o
-BUILTIN_OBJS += builtin-bisect--helper.o
-BUILTIN_OBJS += builtin-blame.o
-BUILTIN_OBJS += builtin-branch.o
-BUILTIN_OBJS += builtin-bundle.o
-BUILTIN_OBJS += builtin-cat-file.o
-BUILTIN_OBJS += builtin-check-attr.o
-BUILTIN_OBJS += builtin-check-ref-format.o
-BUILTIN_OBJS += builtin-checkout-index.o
-BUILTIN_OBJS += builtin-checkout.o
-BUILTIN_OBJS += builtin-clean.o
-BUILTIN_OBJS += builtin-clone.o
-BUILTIN_OBJS += builtin-commit-tree.o
-BUILTIN_OBJS += builtin-commit.o
-BUILTIN_OBJS += builtin-config.o
-BUILTIN_OBJS += builtin-count-objects.o
-BUILTIN_OBJS += builtin-describe.o
-BUILTIN_OBJS += builtin-diff-files.o
-BUILTIN_OBJS += builtin-diff-index.o
-BUILTIN_OBJS += builtin-diff-tree.o
-BUILTIN_OBJS += builtin-diff.o
-BUILTIN_OBJS += builtin-fast-export.o
-BUILTIN_OBJS += builtin-fetch--tool.o
-BUILTIN_OBJS += builtin-fetch-pack.o
-BUILTIN_OBJS += builtin-fetch.o
-BUILTIN_OBJS += builtin-fmt-merge-msg.o
-BUILTIN_OBJS += builtin-for-each-ref.o
-BUILTIN_OBJS += builtin-fsck.o
-BUILTIN_OBJS += builtin-gc.o
-BUILTIN_OBJS += builtin-grep.o
+
 BUILTIN_OBJS += builtin-help.o
-BUILTIN_OBJS += builtin-init-db.o
-BUILTIN_OBJS += builtin-log.o
-BUILTIN_OBJS += builtin-ls-files.o
-BUILTIN_OBJS += builtin-ls-remote.o
-BUILTIN_OBJS += builtin-ls-tree.o
-BUILTIN_OBJS += builtin-mailinfo.o
-BUILTIN_OBJS += builtin-mailsplit.o
-BUILTIN_OBJS += builtin-merge.o
-BUILTIN_OBJS += builtin-merge-base.o
-BUILTIN_OBJS += builtin-merge-file.o
-BUILTIN_OBJS += builtin-merge-ours.o
-BUILTIN_OBJS += builtin-merge-recursive.o
-BUILTIN_OBJS += builtin-mv.o
-BUILTIN_OBJS += builtin-name-rev.o
-BUILTIN_OBJS += builtin-pack-objects.o
-BUILTIN_OBJS += builtin-pack-refs.o
-BUILTIN_OBJS += builtin-prune-packed.o
-BUILTIN_OBJS += builtin-prune.o
-BUILTIN_OBJS += builtin-push.o
-BUILTIN_OBJS += builtin-read-tree.o
-BUILTIN_OBJS += builtin-receive-pack.o
-BUILTIN_OBJS += builtin-reflog.o
-BUILTIN_OBJS += builtin-remote.o
-BUILTIN_OBJS += builtin-rerere.o
-BUILTIN_OBJS += builtin-reset.o
-BUILTIN_OBJS += builtin-rev-list.o
-BUILTIN_OBJS += builtin-rev-parse.o
-BUILTIN_OBJS += builtin-revert.o
-BUILTIN_OBJS += builtin-rm.o
-BUILTIN_OBJS += builtin-send-pack.o
-BUILTIN_OBJS += builtin-shortlog.o
-BUILTIN_OBJS += builtin-show-branch.o
-BUILTIN_OBJS += builtin-show-ref.o
-BUILTIN_OBJS += builtin-stripspace.o
-BUILTIN_OBJS += builtin-symbolic-ref.o
-BUILTIN_OBJS += builtin-tag.o
-BUILTIN_OBJS += builtin-tar-tree.o
-BUILTIN_OBJS += builtin-unpack-objects.o
-BUILTIN_OBJS += builtin-update-index.o
-BUILTIN_OBJS += builtin-update-ref.o
-BUILTIN_OBJS += builtin-upload-archive.o
-BUILTIN_OBJS += builtin-verify-pack.o
-BUILTIN_OBJS += builtin-verify-tag.o
-BUILTIN_OBJS += builtin-write-tree.o
-
-GITLIBS = $(LIB_FILE) $(XDIFF_LIB)
+BUILTIN_OBJS += builtin-top.o
+
+PERFLIBS = $(LIB_FILE)
 EXTLIBS =
 
 #
@@ -625,221 +336,6 @@ EXTLIBS =
 # because maintaining the nesting to match is a pain.  If
 # we had "elif" things would have been much nicer...
 
-ifeq ($(uname_S),Linux)
-	NO_STRLCPY = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),GNU/kFreeBSD)
-	NO_STRLCPY = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),UnixWare)
-	CC = cc
-	NEEDS_SOCKET = YesPlease
-	NEEDS_NSL = YesPlease
-	NEEDS_SSL_WITH_CRYPTO = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	SHELL_PATH = /usr/local/bin/bash
-	NO_IPV6 = YesPlease
-	NO_HSTRERROR = YesPlease
-	BASIC_CFLAGS += -Kthread
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	INSTALL = ginstall
-	TAR = gtar
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-endif
-ifeq ($(uname_S),SCO_SV)
-	ifeq ($(uname_R),3.2)
-		CFLAGS = -O2
-	endif
-	ifeq ($(uname_R),5)
-		CC = cc
-		BASIC_CFLAGS += -Kthread
-	endif
-	NEEDS_SOCKET = YesPlease
-	NEEDS_NSL = YesPlease
-	NEEDS_SSL_WITH_CRYPTO = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	SHELL_PATH = /usr/bin/bash
-	NO_IPV6 = YesPlease
-	NO_HSTRERROR = YesPlease
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	INSTALL = ginstall
-	TAR = gtar
-endif
-ifeq ($(uname_S),Darwin)
-	NEEDS_SSL_WITH_CRYPTO = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	ifeq ($(shell expr "$(uname_R)" : '[15678]\.'),2)
-		OLD_ICONV = UnfortunatelyYes
-	endif
-	ifeq ($(shell expr "$(uname_R)" : '[15]\.'),2)
-		NO_STRLCPY = YesPlease
-	endif
-	NO_MEMMEM = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-	USE_ST_TIMESPEC = YesPlease
-endif
-ifeq ($(uname_S),SunOS)
-	NEEDS_SOCKET = YesPlease
-	NEEDS_NSL = YesPlease
-	SHELL_PATH = /bin/bash
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	NO_HSTRERROR = YesPlease
-	NO_MKDTEMP = YesPlease
-	OLD_ICONV = UnfortunatelyYes
-	ifeq ($(uname_R),5.8)
-		NO_UNSETENV = YesPlease
-		NO_SETENV = YesPlease
-		NO_C99_FORMAT = YesPlease
-		NO_STRTOUMAX = YesPlease
-	endif
-	ifeq ($(uname_R),5.9)
-		NO_UNSETENV = YesPlease
-		NO_SETENV = YesPlease
-		NO_C99_FORMAT = YesPlease
-		NO_STRTOUMAX = YesPlease
-	endif
-	INSTALL = ginstall
-	TAR = gtar
-	BASIC_CFLAGS += -D__EXTENSIONS__
-endif
-ifeq ($(uname_O),Cygwin)
-	NO_D_TYPE_IN_DIRENT = YesPlease
-	NO_D_INO_IN_DIRENT = YesPlease
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	NO_SYMLINK_HEAD = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	NO_FAST_WORKING_DIRECTORY = UnfortunatelyYes
-	NO_TRUSTABLE_FILEMODE = UnfortunatelyYes
-	OLD_ICONV = UnfortunatelyYes
-	# There are conflicting reports about this.
-	# On some boxes NO_MMAP is needed, and not so elsewhere.
-	# Try commenting this out if you suspect MMAP is more efficient
-	NO_MMAP = YesPlease
-	NO_IPV6 = YesPlease
-	X = .exe
-endif
-ifeq ($(uname_S),FreeBSD)
-	NEEDS_LIBICONV = YesPlease
-	NO_MEMMEM = YesPlease
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease
-	USE_ST_TIMESPEC = YesPlease
-	THREADED_DELTA_SEARCH = YesPlease
-	ifeq ($(shell expr "$(uname_R)" : '4\.'),2)
-		PTHREAD_LIBS = -pthread
-		NO_UINTMAX_T = YesPlease
-		NO_STRTOUMAX = YesPlease
-	endif
-endif
-ifeq ($(uname_S),OpenBSD)
-	NO_STRCASESTR = YesPlease
-	NO_MEMMEM = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	BASIC_CFLAGS += -I/usr/local/include
-	BASIC_LDFLAGS += -L/usr/local/lib
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),NetBSD)
-	ifeq ($(shell expr "$(uname_R)" : '[01]\.'),2)
-		NEEDS_LIBICONV = YesPlease
-	endif
-	BASIC_CFLAGS += -I/usr/pkg/include
-	BASIC_LDFLAGS += -L/usr/pkg/lib $(CC_LD_DYNPATH)/usr/pkg/lib
-	THREADED_DELTA_SEARCH = YesPlease
-endif
-ifeq ($(uname_S),AIX)
-	NO_STRCASESTR=YesPlease
-	NO_MEMMEM = YesPlease
-	NO_MKDTEMP = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_NSEC = YesPlease
-	FREAD_READS_DIRECTORIES = UnfortunatelyYes
-	INTERNAL_QSORT = UnfortunatelyYes
-	NEEDS_LIBICONV=YesPlease
-	BASIC_CFLAGS += -D_LARGE_FILES
-	ifneq ($(shell expr "$(uname_V)" : '[1234]'),1)
-		THREADED_DELTA_SEARCH = YesPlease
-	else
-		NO_PTHREADS = YesPlease
-	endif
-endif
-ifeq ($(uname_S),GNU)
-	# GNU/Hurd
-	NO_STRLCPY=YesPlease
-endif
-ifeq ($(uname_S),IRIX64)
-	NO_IPV6=YesPlease
-	NO_SETENV=YesPlease
-	NO_STRCASESTR=YesPlease
-	NO_MEMMEM = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_SOCKADDR_STORAGE=YesPlease
-	SHELL_PATH=/usr/gnu/bin/bash
-	BASIC_CFLAGS += -DPATH_MAX=1024
-	# for now, build 32-bit version
-	BASIC_LDFLAGS += -L/usr/lib32
-endif
-ifeq ($(uname_S),HP-UX)
-	NO_IPV6=YesPlease
-	NO_SETENV=YesPlease
-	NO_STRCASESTR=YesPlease
-	NO_MEMMEM = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_MKDTEMP = YesPlease
-	NO_UNSETENV = YesPlease
-	NO_HSTRERROR = YesPlease
-	NO_SYS_SELECT_H = YesPlease
-	SNPRINTF_RETURNS_BOGUS = YesPlease
-endif
-ifneq (,$(findstring CYGWIN,$(uname_S)))
-	COMPAT_OBJS += compat/cygwin.o
-endif
-ifneq (,$(findstring MINGW,$(uname_S)))
-	NO_PREAD = YesPlease
-	NO_OPENSSL = YesPlease
-	NO_CURL = YesPlease
-	NO_SYMLINK_HEAD = YesPlease
-	NO_IPV6 = YesPlease
-	NO_SETENV = YesPlease
-	NO_UNSETENV = YesPlease
-	NO_STRCASESTR = YesPlease
-	NO_STRLCPY = YesPlease
-	NO_MEMMEM = YesPlease
-	NO_PTHREADS = YesPlease
-	NEEDS_LIBICONV = YesPlease
-	OLD_ICONV = YesPlease
-	NO_C99_FORMAT = YesPlease
-	NO_STRTOUMAX = YesPlease
-	NO_MKDTEMP = YesPlease
-	SNPRINTF_RETURNS_BOGUS = YesPlease
-	NO_SVN_TESTS = YesPlease
-	NO_PERL_MAKEMAKER = YesPlease
-	RUNTIME_PREFIX = YesPlease
-	NO_POSIX_ONLY_PROGRAMS = YesPlease
-	NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease
-	NO_NSEC = YesPlease
-	USE_WIN32_MMAP = YesPlease
-	COMPAT_CFLAGS += -D__USE_MINGW_ACCESS -DNOGDI -Icompat -Icompat/regex -Icompat/fnmatch
-	COMPAT_CFLAGS += -DSNPRINTF_SIZE_CORR=1
-	COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\"
-	COMPAT_OBJS += compat/mingw.o compat/fnmatch/fnmatch.o compat/regex/regex.o compat/winansi.o
-	EXTLIBS += -lws2_32
-	X = .exe
-endif
-ifneq (,$(findstring arm,$(uname_M)))
-	ARM_SHA1 = YesPlease
-endif
-
 -include config.mak.autogen
 -include config.mak
 
@@ -869,72 +365,12 @@ ifndef CC_LD_DYNPATH
 	endif
 endif
 
-ifdef NO_CURL
-	BASIC_CFLAGS += -DNO_CURL
-else
-	ifdef CURLDIR
-		# Try "-Wl,-rpath=$(CURLDIR)/$(lib)" in such a case.
-		BASIC_CFLAGS += -I$(CURLDIR)/include
-		CURL_LIBCURL = -L$(CURLDIR)/$(lib) $(CC_LD_DYNPATH)$(CURLDIR)/$(lib) -lcurl
-	else
-		CURL_LIBCURL = -lcurl
-	endif
-	BUILTIN_OBJS += builtin-http-fetch.o
-	EXTLIBS += $(CURL_LIBCURL)
-	LIB_OBJS += http.o http-walker.o
-	curl_check := $(shell (echo 070908; curl-config --vernum) | sort -r | sed -ne 2p)
-	ifeq "$(curl_check)" "070908"
-		ifndef NO_EXPAT
-			PROGRAMS += git-http-push$X
-		endif
-	endif
-	ifndef NO_EXPAT
-		ifdef EXPATDIR
-			BASIC_CFLAGS += -I$(EXPATDIR)/include
-			EXPAT_LIBEXPAT = -L$(EXPATDIR)/$(lib) $(CC_LD_DYNPATH)$(EXPATDIR)/$(lib) -lexpat
-		else
-			EXPAT_LIBEXPAT = -lexpat
-		endif
-	endif
-endif
-
 ifdef ZLIB_PATH
 	BASIC_CFLAGS += -I$(ZLIB_PATH)/include
 	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib)
 endif
 EXTLIBS += -lz
 
-ifndef NO_POSIX_ONLY_PROGRAMS
-	PROGRAMS += git-daemon$X
-	PROGRAMS += git-imap-send$X
-endif
-ifndef NO_OPENSSL
-	OPENSSL_LIBSSL = -lssl
-	ifdef OPENSSLDIR
-		BASIC_CFLAGS += -I$(OPENSSLDIR)/include
-		OPENSSL_LINK = -L$(OPENSSLDIR)/$(lib) $(CC_LD_DYNPATH)$(OPENSSLDIR)/$(lib)
-	else
-		OPENSSL_LINK =
-	endif
-else
-	BASIC_CFLAGS += -DNO_OPENSSL
-	MOZILLA_SHA1 = 1
-	OPENSSL_LIBSSL =
-endif
-ifdef NEEDS_SSL_WITH_CRYPTO
-	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -lssl
-else
-	LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto
-endif
-ifdef NEEDS_LIBICONV
-	ifdef ICONVDIR
-		BASIC_CFLAGS += -I$(ICONVDIR)/include
-		ICONV_LINK = -L$(ICONVDIR)/$(lib) $(CC_LD_DYNPATH)$(ICONVDIR)/$(lib)
-	else
-		ICONV_LINK =
-	endif
-	EXTLIBS += $(ICONV_LINK) -liconv
-endif
 ifdef NEEDS_SOCKET
 	EXTLIBS += -lsocket
 endif
@@ -977,10 +413,6 @@ ifdef NO_STRCASESTR
 	COMPAT_CFLAGS += -DNO_STRCASESTR
 	COMPAT_OBJS += compat/strcasestr.o
 endif
-ifdef NO_STRLCPY
-	COMPAT_CFLAGS += -DNO_STRLCPY
-	COMPAT_OBJS += compat/strlcpy.o
-endif
 ifdef NO_STRTOUMAX
 	COMPAT_CFLAGS += -DNO_STRTOUMAX
 	COMPAT_OBJS += compat/strtoumax.o
@@ -1090,17 +522,6 @@ ifdef RUNTIME_PREFIX
 	COMPAT_CFLAGS += -DRUNTIME_PREFIX
 endif
 
-ifdef NO_PTHREADS
-	THREADED_DELTA_SEARCH =
-	BASIC_CFLAGS += -DNO_PTHREADS
-else
-	EXTLIBS += $(PTHREAD_LIBS)
-endif
-
-ifdef THREADED_DELTA_SEARCH
-	BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH
-	LIB_OBJS += thread-utils.o
-endif
 ifdef DIR_HAS_BSD_GROUP_SEMANTICS
 	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
 endif
@@ -1148,14 +569,14 @@ endif
 # Shell quote (do not use $(call) to accommodate ancient setups);
 
 SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
-ETC_GITCONFIG_SQ = $(subst ','\'',$(ETC_GITCONFIG))
+ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
 
 DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
 bindir_SQ = $(subst ','\'',$(bindir))
 bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
 mandir_SQ = $(subst ','\'',$(mandir))
 infodir_SQ = $(subst ','\'',$(infodir))
-gitexecdir_SQ = $(subst ','\'',$(gitexecdir))
+perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
 template_dir_SQ = $(subst ','\'',$(template_dir))
 htmldir_SQ = $(subst ','\'',$(htmldir))
 prefix_SQ = $(subst ','\'',$(prefix))
@@ -1164,7 +585,7 @@ SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
 PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
 TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH))
 
-LIBS = $(GITLIBS) $(EXTLIBS)
+LIBS = $(PERFLIBS) $(EXTLIBS)
 
 BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
 	$(COMPAT_CFLAGS)
@@ -1180,15 +601,15 @@ export TAR INSTALL DESTDIR SHELL_PATH
 
 SHELL = $(SHELL_PATH)
 
-all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) GIT-BUILD-OPTIONS
+all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
 ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
 endif
 
 all::
 ifndef NO_TCLTK
-	$(QUIET_SUBDIR0)git-gui $(QUIET_SUBDIR1) gitexecdir='$(gitexec_instdir_SQ)' all
-	$(QUIET_SUBDIR0)gitk-git $(QUIET_SUBDIR1) all
+	$(QUIET_SUBDIR0)perf-gui $(QUIET_SUBDIR1) perfexecdir='$(perfexec_instdir_SQ)' all
+	$(QUIET_SUBDIR0)perfk-perf $(QUIET_SUBDIR1) all
 endif
 ifndef NO_PERL
 	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all
@@ -1200,33 +621,33 @@ please_set_SHELL_PATH_to_a_more_modern_shell:
 
 shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
 
-strip: $(PROGRAMS) git$X
-	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) git$X
+strip: $(PROGRAMS) perf$X
+	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
 
-git.o: git.c common-cmds.h GIT-CFLAGS
-	$(QUIET_CC)$(CC) -DGIT_VERSION='"$(GIT_VERSION)"' \
-		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
+perf.o: perf.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
 		$(ALL_CFLAGS) -c $(filter %.c,$^)
 
-git$X: git.o $(BUILTIN_OBJS) $(GITLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \
+perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
 		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
 
-builtin-help.o: builtin-help.c common-cmds.h GIT-CFLAGS
+builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
-		'-DGIT_HTML_PATH="$(htmldir_SQ)"' \
-		'-DGIT_MAN_PATH="$(mandir_SQ)"' \
-		'-DGIT_INFO_PATH="$(infodir_SQ)"' $<
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
+		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
 
-$(BUILT_INS): git$X
+$(BUILT_INS): perf$X
 	$(QUIET_BUILT_IN)$(RM) $@ && \
-	ln git$X $@ 2>/dev/null || \
-	ln -s git$X $@ 2>/dev/null || \
-	cp git$X $@
+	ln perf$X $@ 2>/dev/null || \
+	ln -s perf$X $@ 2>/dev/null || \
+	cp perf$X $@
 
 common-cmds.h: ./generate-cmdlist.sh command-list.txt
 
-common-cmds.h: $(wildcard Documentation/git-*.txt)
+common-cmds.h: $(wildcard Documentation/perf-*.txt)
 	$(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@
 
 $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
@@ -1234,152 +655,55 @@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
 	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
 	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
 	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
-	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
 	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
 	    $@.sh >$@+ && \
 	chmod +x $@+ && \
 	mv $@+ $@
 
-ifndef NO_PERL
-$(patsubst %.perl,%,$(SCRIPT_PERL)): perl/perl.mak
-
-perl/perl.mak: GIT-CFLAGS perl/Makefile perl/Makefile.PL
-	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' $(@F)
-
-$(patsubst %.perl,%,$(SCRIPT_PERL)): % : %.perl
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	INSTLIBDIR=`MAKEFLAGS= $(MAKE) -C perl -s --no-print-directory instlibdir` && \
-	sed -e '1{' \
-	    -e '	s|#!.*perl|#!$(PERL_PATH_SQ)|' \
-	    -e '	h' \
-	    -e '	s=.*=use lib (split(/:/, $$ENV{GITPERLLIB} || "@@INSTLIBDIR@@"));=' \
-	    -e '	H' \
-	    -e '	x' \
-	    -e '}' \
-	    -e 's|@@INSTLIBDIR@@|'"$$INSTLIBDIR"'|g' \
-	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
-	    $@.perl >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-
-gitweb/gitweb.cgi: gitweb/gitweb.perl
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*perl|#!$(PERL_PATH_SQ)|' \
-	    -e 's|++GIT_VERSION++|$(GIT_VERSION)|g' \
-	    -e 's|++GIT_BINDIR++|$(bindir)|g' \
-	    -e 's|++GITWEB_CONFIG++|$(GITWEB_CONFIG)|g' \
-	    -e 's|++GITWEB_CONFIG_SYSTEM++|$(GITWEB_CONFIG_SYSTEM)|g' \
-	    -e 's|++GITWEB_HOME_LINK_STR++|$(GITWEB_HOME_LINK_STR)|g' \
-	    -e 's|++GITWEB_SITENAME++|$(GITWEB_SITENAME)|g' \
-	    -e 's|++GITWEB_PROJECTROOT++|$(GITWEB_PROJECTROOT)|g' \
-	    -e 's|"++GITWEB_PROJECT_MAXDEPTH++"|$(GITWEB_PROJECT_MAXDEPTH)|g' \
-	    -e 's|++GITWEB_EXPORT_OK++|$(GITWEB_EXPORT_OK)|g' \
-	    -e 's|++GITWEB_STRICT_EXPORT++|$(GITWEB_STRICT_EXPORT)|g' \
-	    -e 's|++GITWEB_BASE_URL++|$(GITWEB_BASE_URL)|g' \
-	    -e 's|++GITWEB_LIST++|$(GITWEB_LIST)|g' \
-	    -e 's|++GITWEB_HOMETEXT++|$(GITWEB_HOMETEXT)|g' \
-	    -e 's|++GITWEB_CSS++|$(GITWEB_CSS)|g' \
-	    -e 's|++GITWEB_LOGO++|$(GITWEB_LOGO)|g' \
-	    -e 's|++GITWEB_FAVICON++|$(GITWEB_FAVICON)|g' \
-	    -e 's|++GITWEB_SITE_HEADER++|$(GITWEB_SITE_HEADER)|g' \
-	    -e 's|++GITWEB_SITE_FOOTER++|$(GITWEB_SITE_FOOTER)|g' \
-	    $< >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-
-git-instaweb: git-instaweb.sh gitweb/gitweb.cgi gitweb/gitweb.css
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
-	    -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
-	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
-	    -e '/@@GITWEB_CGI@@/r gitweb/gitweb.cgi' \
-	    -e '/@@GITWEB_CGI@@/d' \
-	    -e '/@@GITWEB_CSS@@/r gitweb/gitweb.css' \
-	    -e '/@@GITWEB_CSS@@/d' \
-	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
-	    $@.sh > $@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-else # NO_PERL
-$(patsubst %.perl,%,$(SCRIPT_PERL)) git-instaweb: % : unimplemented.sh
-	$(QUIET_GEN)$(RM) $@ $@+ && \
-	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
-	    -e 's|@@REASON@@|NO_PERL=$(NO_PERL)|g' \
-	    unimplemented.sh >$@+ && \
-	chmod +x $@+ && \
-	mv $@+ $@
-endif # NO_PERL
-
 configure: configure.ac
 	$(QUIET_GEN)$(RM) $@ $<+ && \
-	sed -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \
+	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
 	    $< > $<+ && \
 	autoconf -o $@ $<+ && \
 	$(RM) $<+
 
-# These can record GIT_VERSION
-git.o git.spec \
+# These can record PERF_VERSION
+perf.o perf.spec \
 	$(patsubst %.sh,%,$(SCRIPT_SH)) \
 	$(patsubst %.perl,%,$(SCRIPT_PERL)) \
-	: GIT-VERSION-FILE
+	: PERF-VERSION-FILE
 
-%.o: %.c GIT-CFLAGS
+%.o: %.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-%.s: %.c GIT-CFLAGS
+%.s: %.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
 %.o: %.S
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
 
-exec_cmd.o: exec_cmd.c GIT-CFLAGS
+exec_cmd.o: exec_cmd.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
-		'-DGIT_EXEC_PATH="$(gitexecdir_SQ)"' \
+		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
 		'-DBINDIR="$(bindir_relative_SQ)"' \
 		'-DPREFIX="$(prefix_SQ)"' \
 		$<
 
-builtin-init-db.o: builtin-init-db.c GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
-
-config.o: config.c GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_GITCONFIG='"$(ETC_GITCONFIG_SQ)"' $<
+builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
 
-http.o: http.c GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DGIT_USER_AGENT='"git/$(GIT_VERSION)"' $<
+config.o: config.c PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
-ifdef NO_EXPAT
-http-walker.o: http-walker.c http.h GIT-CFLAGS
-	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DNO_EXPAT $<
-endif
-
-git-%$X: %.o $(GITLIBS)
+perf-%$X: %.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
-git-imap-send$X: imap-send.o $(GITLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
-		$(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL)
-
-http.o http-walker.o http-push.o transport.o: http.h
-
-git-http-push$X: revision.o http.o http-push.o $(GITLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \
-		$(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT)
-
 $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
 builtin-revert.o wt-status.o: wt-status.h
 
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
-XDIFF_OBJS=xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \
-	xdiff/xmerge.o xdiff/xpatience.o
-$(XDIFF_OBJS): xdiff/xinclude.h xdiff/xmacros.h xdiff/xdiff.h xdiff/xtypes.h \
-	xdiff/xutils.h xdiff/xprepare.h xdiff/xdiffi.h xdiff/xemit.h
-
-$(XDIFF_LIB): $(XDIFF_OBJS)
-	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(XDIFF_OBJS)
-
-
 doc:
 	$(MAKE) -C Documentation all
 
@@ -1409,19 +733,19 @@ cscope:
 
 ### Detect prefix changes
 TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
-             $(bindir_SQ):$(gitexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
+             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
 
-GIT-CFLAGS: .FORCE-GIT-CFLAGS
+PERF-CFLAGS: .FORCE-PERF-CFLAGS
 	@FLAGS='$(TRACK_CFLAGS)'; \
-	    if test x"$$FLAGS" != x"`cat GIT-CFLAGS 2>/dev/null`" ; then \
+	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
 		echo 1>&2 "    * new build flags or prefix"; \
-		echo "$$FLAGS" >GIT-CFLAGS; \
+		echo "$$FLAGS" >PERF-CFLAGS; \
             fi
 
 # We need to apply sq twice, once to protect from the shell
-# that runs GIT-BUILD-OPTIONS, and then again to protect it
+# that runs PERF-BUILD-OPTIONS, and then again to protect it
 # and the first level quoting from the shell that runs "echo".
-GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS
+PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
 	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
 	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
 	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
@@ -1431,14 +755,14 @@ GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS
 ifndef NO_TCLTK
 TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)')
 
-GIT-GUI-VARS: .FORCE-GIT-GUI-VARS
+PERF-GUI-VARS: .FORCE-PERF-GUI-VARS
 	@VARS='$(TRACK_VARS)'; \
 	    if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \
 		echo 1>&2 "    * new Tcl/Tk interpreter location"; \
 		echo "$$VARS" >$@; \
             fi
 
-.PHONY: .FORCE-GIT-GUI-VARS
+.PHONY: .FORCE-PERF-GUI-VARS
 endif
 
 ### Testing rules
@@ -1476,7 +800,7 @@ test-parse-options$X: parse-options.o
 
 .PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS))
 
-test-%$X: test-%.o $(GITLIBS)
+test-%$X: test-%.o $(PERFLIBS)
 	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
 
 check-sha1:: test-sha1$X
@@ -1506,40 +830,40 @@ template_instdir = $(prefix)/$(template_dir)
 endif
 export template_instdir
 
-ifneq ($(filter /%,$(firstword $(gitexecdir))),)
-gitexec_instdir = $(gitexecdir)
+ifneq ($(filter /%,$(firstword $(perfexecdir))),)
+perfexec_instdir = $(perfexecdir)
 else
-gitexec_instdir = $(prefix)/$(gitexecdir)
+perfexec_instdir = $(prefix)/$(perfexecdir)
 endif
-gitexec_instdir_SQ = $(subst ','\'',$(gitexec_instdir))
-export gitexec_instdir
+perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
-	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)'
-	$(INSTALL) git$X git-upload-pack$X git-receive-pack$X git-upload-archive$X git-shell$X git-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
 	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
 ifndef NO_TCLTK
-	$(MAKE) -C gitk-git install
-	$(MAKE) -C git-gui gitexecdir='$(gitexec_instdir_SQ)' install
+	$(MAKE) -C perfk-perf install
+	$(MAKE) -C perf-gui perfexecdir='$(perfexec_instdir_SQ)' install
 endif
 ifneq (,$X)
-	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), $(RM) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)/$p';)
+	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
 	bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \
-	execdir=$$(cd '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' && pwd) && \
-	{ $(RM) "$$execdir/git-add$X" && \
-		ln "$$bindir/git$X" "$$execdir/git-add$X" 2>/dev/null || \
-		cp "$$bindir/git$X" "$$execdir/git-add$X"; } && \
-	{ for p in $(filter-out git-add$X,$(BUILT_INS)); do \
+	execdir=$$(cd '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' && pwd) && \
+	{ $(RM) "$$execdir/perf-add$X" && \
+		ln "$$bindir/perf$X" "$$execdir/perf-add$X" 2>/dev/null || \
+		cp "$$bindir/perf$X" "$$execdir/perf-add$X"; } && \
+	{ for p in $(filter-out perf-add$X,$(BUILT_INS)); do \
 		$(RM) "$$execdir/$$p" && \
-		ln "$$execdir/git-add$X" "$$execdir/$$p" 2>/dev/null || \
-		ln -s "git-add$X" "$$execdir/$$p" 2>/dev/null || \
-		cp "$$execdir/git-add$X" "$$execdir/$$p" || exit; \
+		ln "$$execdir/perf-add$X" "$$execdir/$$p" 2>/dev/null || \
+		ln -s "perf-add$X" "$$execdir/$$p" 2>/dev/null || \
+		cp "$$execdir/perf-add$X" "$$execdir/$$p" || exit; \
 	  done } && \
-	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/git-add$X"
+	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X"
 
 install-doc:
 	$(MAKE) -C Documentation install
@@ -1569,31 +893,31 @@ quick-install-html:
 
 ### Maintainer's dist rules
 
-git.spec: git.spec.in
-	sed -e 's/@@VERSION@@/$(GIT_VERSION)/g' < $< > $@+
+perf.spec: perf.spec.in
+	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
 	mv $@+ $@
 
-GIT_TARNAME=git-$(GIT_VERSION)
-dist: git.spec git-archive$(X) configure
-	./git-archive --format=tar \
-		--prefix=$(GIT_TARNAME)/ HEAD^{tree} > $(GIT_TARNAME).tar
-	@mkdir -p $(GIT_TARNAME)
-	@cp git.spec configure $(GIT_TARNAME)
-	@echo $(GIT_VERSION) > $(GIT_TARNAME)/version
-	@$(MAKE) -C git-gui TARDIR=../$(GIT_TARNAME)/git-gui dist-version
-	$(TAR) rf $(GIT_TARNAME).tar \
-		$(GIT_TARNAME)/git.spec \
-		$(GIT_TARNAME)/configure \
-		$(GIT_TARNAME)/version \
-		$(GIT_TARNAME)/git-gui/version
-	@$(RM) -r $(GIT_TARNAME)
-	gzip -f -9 $(GIT_TARNAME).tar
+PERF_TARNAME=perf-$(PERF_VERSION)
+dist: perf.spec perf-archive$(X) configure
+	./perf-archive --format=tar \
+		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+	@mkdir -p $(PERF_TARNAME)
+	@cp perf.spec configure $(PERF_TARNAME)
+	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+	@$(MAKE) -C perf-gui TARDIR=../$(PERF_TARNAME)/perf-gui dist-version
+	$(TAR) rf $(PERF_TARNAME).tar \
+		$(PERF_TARNAME)/perf.spec \
+		$(PERF_TARNAME)/configure \
+		$(PERF_TARNAME)/version \
+		$(PERF_TARNAME)/perf-gui/version
+	@$(RM) -r $(PERF_TARNAME)
+	gzip -f -9 $(PERF_TARNAME).tar
 
 rpm: dist
-	$(RPMBUILD) -ta $(GIT_TARNAME).tar.gz
+	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
-htmldocs = git-htmldocs-$(GIT_VERSION)
-manpages = git-manpages-$(GIT_VERSION)
+htmldocs = perf-htmldocs-$(PERF_VERSION)
+manpages = perf-manpages-$(PERF_VERSION)
 dist-doc:
 	$(RM) -r .doc-tmp-dir
 	mkdir .doc-tmp-dir
@@ -1618,51 +942,46 @@ distclean: clean
 	$(RM) configure
 
 clean:
-	$(RM) *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \
-		$(LIB_FILE) $(XDIFF_LIB)
-	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) git$X
+	$(RM) *.o $(LIB_FILE)
+	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
 	$(RM) $(TEST_PROGRAMS)
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
 	$(RM) -r autom4te.cache
 	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache
-	$(RM) -r $(GIT_TARNAME) .doc-tmp-dir
-	$(RM) $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz
+	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
+	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(MAKE) -C Documentation/ clean
-ifndef NO_PERL
-	$(RM) gitweb/gitweb.cgi
-	$(MAKE) -C perl clean
-endif
 	$(MAKE) -C templates/ clean
 	$(MAKE) -C t/ clean
 ifndef NO_TCLTK
-	$(MAKE) -C gitk-git clean
-	$(MAKE) -C git-gui clean
+	$(MAKE) -C perfk-perf clean
+	$(MAKE) -C perf-gui clean
 endif
-	$(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS
+	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-GUI-VARS PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-.PHONY: .FORCE-GIT-VERSION-FILE TAGS tags cscope .FORCE-GIT-CFLAGS
-.PHONY: .FORCE-GIT-BUILD-OPTIONS
+.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
+.PHONY: .FORCE-PERF-BUILD-OPTIONS
 
 ### Check documentation
 #
 check-docs::
-	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) git gitk; \
+	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) perf perfk; \
 	do \
 		case "$$v" in \
-		git-merge-octopus | git-merge-ours | git-merge-recursive | \
-		git-merge-resolve | git-merge-subtree | \
-		git-fsck-objects | git-init-db | \
-		git-?*--?* ) continue ;; \
+		perf-merge-octopus | perf-merge-ours | perf-merge-recursive | \
+		perf-merge-resolve | perf-merge-subtree | \
+		perf-fsck-objects | perf-init-db | \
+		perf-?*--?* ) continue ;; \
 		esac ; \
 		test -f "Documentation/$$v.txt" || \
 		echo "no doc: $$v"; \
 		sed -e '/^#/d' command-list.txt | \
 		grep -q "^$$v[ 	]" || \
 		case "$$v" in \
-		git) ;; \
+		perf) ;; \
 		*) echo "no link: $$v";; \
 		esac ; \
 	done; \
@@ -1670,37 +989,37 @@ check-docs::
 		sed -e '/^#/d' \
 		    -e 's/[ 	].*//' \
 		    -e 's/^/listed /' command-list.txt; \
-		ls -1 Documentation/git*txt | \
+		ls -1 Documentation/perf*txt | \
 		sed -e 's|Documentation/|documented |' \
 		    -e 's/\.txt//'; \
 	) | while read how cmd; \
 	do \
 		case "$$how,$$cmd" in \
-		*,git-citool | \
-		*,git-gui | \
-		*,git-help | \
-		documented,gitattributes | \
-		documented,gitignore | \
-		documented,gitmodules | \
-		documented,gitcli | \
-		documented,git-tools | \
-		documented,gitcore-tutorial | \
-		documented,gitcvs-migration | \
-		documented,gitdiffcore | \
-		documented,gitglossary | \
-		documented,githooks | \
-		documented,gitrepository-layout | \
-		documented,gittutorial | \
-		documented,gittutorial-2 | \
+		*,perf-citool | \
+		*,perf-gui | \
+		*,perf-help | \
+		documented,perfattributes | \
+		documented,perfignore | \
+		documented,perfmodules | \
+		documented,perfcli | \
+		documented,perf-tools | \
+		documented,perfcore-tutorial | \
+		documented,perfcvs-migration | \
+		documented,perfdiffcore | \
+		documented,perfglossary | \
+		documented,perfhooks | \
+		documented,perfrepository-layout | \
+		documented,perftutorial | \
+		documented,perftutorial-2 | \
 		sentinel,not,matching,is,ok ) continue ;; \
 		esac; \
-		case " $(ALL_PROGRAMS) $(BUILT_INS) git gitk " in \
+		case " $(ALL_PROGRAMS) $(BUILT_INS) perf perfk " in \
 		*" $$cmd "*)	;; \
 		*) echo "removed but $$how: $$cmd" ;; \
 		esac; \
 	done ) | sort
 
-### Make sure built-ins do not have dups and listed in git.c
+### Make sure built-ins do not have dups and listed in perf.c
 #
 check-builtins::
 	./check-builtins.sh
diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS
new file mode 100644
index 0000000000000..46d8d6ceb2f41
--- /dev/null
+++ b/Documentation/perf_counter/PERF-BUILD-OPTIONS
@@ -0,0 +1,4 @@
+SHELL_PATH='/bin/sh'
+TAR='tar'
+NO_CURL=''
+NO_PERL=''
diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS
new file mode 100644
index 0000000000000..f24906ca688d6
--- /dev/null
+++ b/Documentation/perf_counter/PERF-CFLAGS
@@ -0,0 +1 @@
+-g -O2 -Wall  -DSHA1_HEADER='<openssl/sha.h>' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo
diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE
new file mode 100644
index 0000000000000..328e244c0c81c
--- /dev/null
+++ b/Documentation/perf_counter/PERF-VERSION-FILE
@@ -0,0 +1 @@
+PERF_VERSION = 0.0.1.PERF
diff --git a/Documentation/perf_counter/PERF-VERSION-GEN b/Documentation/perf_counter/PERF-VERSION-GEN
new file mode 100755
index 0000000000000..c561d1538c039
--- /dev/null
+++ b/Documentation/perf_counter/PERF-VERSION-GEN
@@ -0,0 +1,42 @@
+#!/bin/sh
+
+GVF=PERF-VERSION-FILE
+DEF_VER=v0.0.1.PERF
+
+LF='
+'
+
+# First see if there is a version file (included in release tarballs),
+# then try git-describe, then default.
+if test -f version
+then
+	VN=$(cat version) || VN="$DEF_VER"
+elif test -d .git -o -f .git &&
+	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) &&
+	case "$VN" in
+	*$LF*) (exit 1) ;;
+	v[0-9]*)
+		git update-index -q --refresh
+		test -z "$(git diff-index --name-only HEAD --)" ||
+		VN="$VN-dirty" ;;
+	esac
+then
+	VN=$(echo "$VN" | sed -e 's/-/./g');
+else
+	VN="$DEF_VER"
+fi
+
+VN=$(expr "$VN" : v*'\(.*\)')
+
+if test -r $GVF
+then
+	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF)
+else
+	VC=unset
+fi
+test "$VN" = "$VC" || {
+	echo >&2 "PERF_VERSION = $VN"
+	echo "PERF_VERSION = $VN" >$GVF
+}
+
+
diff --git a/Documentation/perf_counter/abspath.c b/Documentation/perf_counter/abspath.c
new file mode 100644
index 0000000000000..649f34f83365d
--- /dev/null
+++ b/Documentation/perf_counter/abspath.c
@@ -0,0 +1,117 @@
+#include "cache.h"
+
+/*
+ * Do not use this for inspecting *tracked* content.  When path is a
+ * symlink to a directory, we do not want to say it is a directory when
+ * dealing with tracked content in the working tree.
+ */
+int is_directory(const char *path)
+{
+	struct stat st;
+	return (!stat(path, &st) && S_ISDIR(st.st_mode));
+}
+
+/* We allow "recursive" symbolic links. Only within reason, though. */
+#define MAXDEPTH 5
+
+const char *make_absolute_path(const char *path)
+{
+	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1];
+	char cwd[1024] = "";
+	int buf_index = 1, len;
+
+	int depth = MAXDEPTH;
+	char *last_elem = NULL;
+	struct stat st;
+
+	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+		die ("Too long path: %.*s", 60, path);
+
+	while (depth--) {
+		if (!is_directory(buf)) {
+			char *last_slash = strrchr(buf, '/');
+			if (last_slash) {
+				*last_slash = '\0';
+				last_elem = xstrdup(last_slash + 1);
+			} else {
+				last_elem = xstrdup(buf);
+				*buf = '\0';
+			}
+		}
+
+		if (*buf) {
+			if (!*cwd && !getcwd(cwd, sizeof(cwd)))
+				die ("Could not get current working directory");
+
+			if (chdir(buf))
+				die ("Could not switch to '%s'", buf);
+		}
+		if (!getcwd(buf, PATH_MAX))
+			die ("Could not get current working directory");
+
+		if (last_elem) {
+			int len = strlen(buf);
+			if (len + strlen(last_elem) + 2 > PATH_MAX)
+				die ("Too long path name: '%s/%s'",
+						buf, last_elem);
+			buf[len] = '/';
+			strcpy(buf + len + 1, last_elem);
+			free(last_elem);
+			last_elem = NULL;
+		}
+
+		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) {
+			len = readlink(buf, next_buf, PATH_MAX);
+			if (len < 0)
+				die ("Invalid symlink: %s", buf);
+			if (PATH_MAX <= len)
+				die("symbolic link too long: %s", buf);
+			next_buf[len] = '\0';
+			buf = next_buf;
+			buf_index = 1 - buf_index;
+			next_buf = bufs[buf_index];
+		} else
+			break;
+	}
+
+	if (*cwd && chdir(cwd))
+		die ("Could not change back to '%s'", cwd);
+
+	return buf;
+}
+
+static const char *get_pwd_cwd(void)
+{
+	static char cwd[PATH_MAX + 1];
+	char *pwd;
+	struct stat cwd_stat, pwd_stat;
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return NULL;
+	pwd = getenv("PWD");
+	if (pwd && strcmp(pwd, cwd)) {
+		stat(cwd, &cwd_stat);
+		if (!stat(pwd, &pwd_stat) &&
+		    pwd_stat.st_dev == cwd_stat.st_dev &&
+		    pwd_stat.st_ino == cwd_stat.st_ino) {
+			strlcpy(cwd, pwd, PATH_MAX);
+		}
+	}
+	return cwd;
+}
+
+const char *make_nonrelative_path(const char *path)
+{
+	static char buf[PATH_MAX + 1];
+
+	if (is_absolute_path(path)) {
+		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	} else {
+		const char *cwd = get_pwd_cwd();
+		if (!cwd)
+			die("Cannot determine the current working directory");
+		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX)
+			die("Too long path: %.*s", 60, path);
+	}
+	return buf;
+}
diff --git a/Documentation/perf_counter/alias.c b/Documentation/perf_counter/alias.c
new file mode 100644
index 0000000000000..9b3dd2b428df8
--- /dev/null
+++ b/Documentation/perf_counter/alias.c
@@ -0,0 +1,77 @@
+#include "cache.h"
+
+static const char *alias_key;
+static char *alias_val;
+
+static int alias_lookup_cb(const char *k, const char *v, void *cb)
+{
+	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) {
+		if (!v)
+			return config_error_nonbool(k);
+		alias_val = strdup(v);
+		return 0;
+	}
+	return 0;
+}
+
+char *alias_lookup(const char *alias)
+{
+	alias_key = alias;
+	alias_val = NULL;
+	perf_config(alias_lookup_cb, NULL);
+	return alias_val;
+}
+
+int split_cmdline(char *cmdline, const char ***argv)
+{
+	int src, dst, count = 0, size = 16;
+	char quoted = 0;
+
+	*argv = malloc(sizeof(char*) * size);
+
+	/* split alias_string */
+	(*argv)[count++] = cmdline;
+	for (src = dst = 0; cmdline[src];) {
+		char c = cmdline[src];
+		if (!quoted && isspace(c)) {
+			cmdline[dst++] = 0;
+			while (cmdline[++src]
+					&& isspace(cmdline[src]))
+				; /* skip */
+			if (count >= size) {
+				size += 16;
+				*argv = realloc(*argv, sizeof(char*) * size);
+			}
+			(*argv)[count++] = cmdline + dst;
+		} else if (!quoted && (c == '\'' || c == '"')) {
+			quoted = c;
+			src++;
+		} else if (c == quoted) {
+			quoted = 0;
+			src++;
+		} else {
+			if (c == '\\' && quoted != '\'') {
+				src++;
+				c = cmdline[src];
+				if (!c) {
+					free(*argv);
+					*argv = NULL;
+					return error("cmdline ends with \\");
+				}
+			}
+			cmdline[dst++] = c;
+			src++;
+		}
+	}
+
+	cmdline[dst] = 0;
+
+	if (quoted) {
+		free(*argv);
+		*argv = NULL;
+		return error("unclosed quote");
+	}
+
+	return count;
+}
+
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
new file mode 100644
index 0000000000000..125fcc2f4901d
--- /dev/null
+++ b/Documentation/perf_counter/builtin-help.c
@@ -0,0 +1,463 @@
+/*
+ * builtin-help.c
+ *
+ * Builtin help command
+ */
+#include "cache.h"
+#include "builtin.h"
+#include "exec_cmd.h"
+#include "common-cmds.h"
+#include "parse-options.h"
+#include "run-command.h"
+#include "help.h"
+
+static struct man_viewer_list {
+	struct man_viewer_list *next;
+	char name[FLEX_ARRAY];
+} *man_viewer_list;
+
+static struct man_viewer_info_list {
+	struct man_viewer_info_list *next;
+	const char *info;
+	char name[FLEX_ARRAY];
+} *man_viewer_info_list;
+
+enum help_format {
+	HELP_FORMAT_MAN,
+	HELP_FORMAT_INFO,
+	HELP_FORMAT_WEB,
+};
+
+static int show_all = 0;
+static enum help_format help_format = HELP_FORMAT_MAN;
+static struct option builtin_help_options[] = {
+	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
+	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+	OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
+			HELP_FORMAT_WEB),
+	OPT_SET_INT('i', "info", &help_format, "show info page",
+			HELP_FORMAT_INFO),
+	OPT_END(),
+};
+
+static const char * const builtin_help_usage[] = {
+	"perf help [--all] [--man|--web|--info] [command]",
+	NULL
+};
+
+static enum help_format parse_help_format(const char *format)
+{
+	if (!strcmp(format, "man"))
+		return HELP_FORMAT_MAN;
+	if (!strcmp(format, "info"))
+		return HELP_FORMAT_INFO;
+	if (!strcmp(format, "web") || !strcmp(format, "html"))
+		return HELP_FORMAT_WEB;
+	die("unrecognized help format '%s'", format);
+}
+
+static const char *get_man_viewer_info(const char *name)
+{
+	struct man_viewer_info_list *viewer;
+
+	for (viewer = man_viewer_info_list; viewer; viewer = viewer->next)
+	{
+		if (!strcasecmp(name, viewer->name))
+			return viewer->info;
+	}
+	return NULL;
+}
+
+static int check_emacsclient_version(void)
+{
+	struct strbuf buffer = STRBUF_INIT;
+	struct child_process ec_process;
+	const char *argv_ec[] = { "emacsclient", "--version", NULL };
+	int version;
+
+	/* emacsclient prints its version number on stderr */
+	memset(&ec_process, 0, sizeof(ec_process));
+	ec_process.argv = argv_ec;
+	ec_process.err = -1;
+	ec_process.stdout_to_stderr = 1;
+	if (start_command(&ec_process)) {
+		fprintf(stderr, "Failed to start emacsclient.\n");
+		return -1;
+	}
+	strbuf_read(&buffer, ec_process.err, 20);
+	close(ec_process.err);
+
+	/*
+	 * Don't bother checking return value, because "emacsclient --version"
+	 * seems to always exits with code 1.
+	 */
+	finish_command(&ec_process);
+
+	if (prefixcmp(buffer.buf, "emacsclient")) {
+		fprintf(stderr, "Failed to parse emacsclient version.\n");
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_remove(&buffer, 0, strlen("emacsclient"));
+	version = atoi(buffer.buf);
+
+	if (version < 22) {
+		fprintf(stderr,
+			"emacsclient version '%d' too old (< 22).\n",
+			version);
+		strbuf_release(&buffer);
+		return -1;
+	}
+
+	strbuf_release(&buffer);
+	return 0;
+}
+
+static void exec_woman_emacs(const char* path, const char *page)
+{
+	if (!check_emacsclient_version()) {
+		/* This works only with emacsclient version >= 22. */
+		struct strbuf man_page = STRBUF_INIT;
+
+		if (!path)
+			path = "emacsclient";
+		strbuf_addf(&man_page, "(woman \"%s\")", page);
+		execlp(path, "emacsclient", "-e", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_konqueror(const char* path, const char *page)
+{
+	const char *display = getenv("DISPLAY");
+	if (display && *display) {
+		struct strbuf man_page = STRBUF_INIT;
+		const char *filename = "kfmclient";
+
+		/* It's simpler to launch konqueror using kfmclient. */
+		if (path) {
+			const char *file = strrchr(path, '/');
+			if (file && !strcmp(file + 1, "konqueror")) {
+				char *new = strdup(path);
+				char *dest = strrchr(new, '/');
+
+				/* strlen("konqueror") == strlen("kfmclient") */
+				strcpy(dest + 1, "kfmclient");
+				path = new;
+			}
+			if (file)
+				filename = file;
+		} else
+			path = "kfmclient";
+		strbuf_addf(&man_page, "man:%s(1)", page);
+		execlp(path, filename, "newTab", man_page.buf, NULL);
+		warning("failed to exec '%s': %s", path, strerror(errno));
+	}
+}
+
+static void exec_man_man(const char* path, const char *page)
+{
+	if (!path)
+		path = "man";
+	execlp(path, "man", page, NULL);
+	warning("failed to exec '%s': %s", path, strerror(errno));
+}
+
+static void exec_man_cmd(const char *cmd, const char *page)
+{
+	struct strbuf shell_cmd = STRBUF_INIT;
+	strbuf_addf(&shell_cmd, "%s %s", cmd, page);
+	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL);
+	warning("failed to exec '%s': %s", cmd, strerror(errno));
+}
+
+static void add_man_viewer(const char *name)
+{
+	struct man_viewer_list **p = &man_viewer_list;
+	size_t len = strlen(name);
+
+	while (*p)
+		p = &((*p)->next);
+	*p = calloc(1, (sizeof(**p) + len + 1));
+	strncpy((*p)->name, name, len);
+}
+
+static int supported_man_viewer(const char *name, size_t len)
+{
+	return (!strncasecmp("man", name, len) ||
+		!strncasecmp("woman", name, len) ||
+		!strncasecmp("konqueror", name, len));
+}
+
+static void do_add_man_viewer_info(const char *name,
+				   size_t len,
+				   const char *value)
+{
+	struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1);
+
+	strncpy(new->name, name, len);
+	new->info = strdup(value);
+	new->next = man_viewer_info_list;
+	man_viewer_info_list = new;
+}
+
+static int add_man_viewer_path(const char *name,
+			       size_t len,
+			       const char *value)
+{
+	if (supported_man_viewer(name, len))
+		do_add_man_viewer_info(name, len, value);
+	else
+		warning("'%s': path for unsupported man viewer.\n"
+			"Please consider using 'man.<tool>.cmd' instead.",
+			name);
+
+	return 0;
+}
+
+static int add_man_viewer_cmd(const char *name,
+			      size_t len,
+			      const char *value)
+{
+	if (supported_man_viewer(name, len))
+		warning("'%s': cmd for supported man viewer.\n"
+			"Please consider using 'man.<tool>.path' instead.",
+			name);
+	else
+		do_add_man_viewer_info(name, len, value);
+
+	return 0;
+}
+
+static int add_man_viewer_info(const char *var, const char *value)
+{
+	const char *name = var + 4;
+	const char *subkey = strrchr(name, '.');
+
+	if (!subkey)
+		return error("Config with no key for man viewer: %s", name);
+
+	if (!strcmp(subkey, ".path")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_path(name, subkey - name, value);
+	}
+	if (!strcmp(subkey, ".cmd")) {
+		if (!value)
+			return config_error_nonbool(var);
+		return add_man_viewer_cmd(name, subkey - name, value);
+	}
+
+	warning("'%s': unsupported man viewer sub key.", subkey);
+	return 0;
+}
+
+static int perf_help_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.format")) {
+		if (!value)
+			return config_error_nonbool(var);
+		help_format = parse_help_format(value);
+		return 0;
+	}
+	if (!strcmp(var, "man.viewer")) {
+		if (!value)
+			return config_error_nonbool(var);
+		add_man_viewer(value);
+		return 0;
+	}
+	if (!prefixcmp(var, "man."))
+		return add_man_viewer_info(var, value);
+
+	return perf_default_config(var, value, cb);
+}
+
+static struct cmdnames main_cmds, other_cmds;
+
+void list_common_cmds_help(void)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		if (longest < strlen(common_cmds[i].name))
+			longest = strlen(common_cmds[i].name);
+	}
+
+	puts("The most commonly used perf commands are:");
+	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
+		printf("   %s   ", common_cmds[i].name);
+		mput_char(' ', longest - strlen(common_cmds[i].name));
+		puts(common_cmds[i].help);
+	}
+}
+
+static int is_perf_command(const char *s)
+{
+	return is_in_cmdlist(&main_cmds, s) ||
+		is_in_cmdlist(&other_cmds, s);
+}
+
+static const char *prepend(const char *prefix, const char *cmd)
+{
+	size_t pre_len = strlen(prefix);
+	size_t cmd_len = strlen(cmd);
+	char *p = malloc(pre_len + cmd_len + 1);
+	memcpy(p, prefix, pre_len);
+	strcpy(p + pre_len, cmd);
+	return p;
+}
+
+static const char *cmd_to_page(const char *perf_cmd)
+{
+	if (!perf_cmd)
+		return "perf";
+	else if (!prefixcmp(perf_cmd, "perf"))
+		return perf_cmd;
+	else if (is_perf_command(perf_cmd))
+		return prepend("perf-", perf_cmd);
+	else
+		return prepend("perf", perf_cmd);
+}
+
+static void setup_man_path(void)
+{
+	struct strbuf new_path = STRBUF_INIT;
+	const char *old_path = getenv("MANPATH");
+
+	/* We should always put ':' after our path. If there is no
+	 * old_path, the ':' at the end will let 'man' to try
+	 * system-wide paths after ours to find the manual page. If
+	 * there is old_path, we need ':' as delimiter. */
+	strbuf_addstr(&new_path, system_path(PERF_MAN_PATH));
+	strbuf_addch(&new_path, ':');
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+
+	setenv("MANPATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+static void exec_viewer(const char *name, const char *page)
+{
+	const char *info = get_man_viewer_info(name);
+
+	if (!strcasecmp(name, "man"))
+		exec_man_man(info, page);
+	else if (!strcasecmp(name, "woman"))
+		exec_woman_emacs(info, page);
+	else if (!strcasecmp(name, "konqueror"))
+		exec_man_konqueror(info, page);
+	else if (info)
+		exec_man_cmd(info, page);
+	else
+		warning("'%s': unknown man viewer.", name);
+}
+
+static void show_man_page(const char *perf_cmd)
+{
+	struct man_viewer_list *viewer;
+	const char *page = cmd_to_page(perf_cmd);
+	const char *fallback = getenv("PERF_MAN_VIEWER");
+
+	setup_man_path();
+	for (viewer = man_viewer_list; viewer; viewer = viewer->next)
+	{
+		exec_viewer(viewer->name, page); /* will return when unable */
+	}
+	if (fallback)
+		exec_viewer(fallback, page);
+	exec_viewer("man", page);
+	die("no man viewer handled the request");
+}
+
+static void show_info_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
+	execlp("info", "info", "perfman", page, NULL);
+}
+
+static void get_html_page_path(struct strbuf *page_path, const char *page)
+{
+	struct stat st;
+	const char *html_path = system_path(PERF_HTML_PATH);
+
+	/* Check that we have a perf documentation directory. */
+	if (stat(mkpath("%s/perf.html", html_path), &st)
+	    || !S_ISREG(st.st_mode))
+		die("'%s': not a documentation directory.", html_path);
+
+	strbuf_init(page_path, 0);
+	strbuf_addf(page_path, "%s/%s.html", html_path, page);
+}
+
+/*
+ * If open_html is not defined in a platform-specific way (see for
+ * example compat/mingw.h), we use the script web--browse to display
+ * HTML.
+ */
+#ifndef open_html
+void open_html(const char *path)
+{
+	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
+}
+#endif
+
+static void show_html_page(const char *perf_cmd)
+{
+	const char *page = cmd_to_page(perf_cmd);
+	struct strbuf page_path; /* it leaks but we exec bellow */
+
+	get_html_page_path(&page_path, page);
+
+	open_html(page_path.buf);
+}
+
+int cmd_help(int argc, const char **argv, const char *prefix)
+{
+	int nonperf;
+	const char *alias;
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	/* setup_perf_directory_gently(&nonperf); */
+	perf_config(perf_help_config, NULL);
+
+	argc = parse_options(argc, argv, builtin_help_options,
+			builtin_help_usage, 0);
+
+	if (show_all) {
+		printf("usage: %s\n\n", perf_usage_string);
+		list_commands("perf commands", &main_cmds, &other_cmds);
+		printf("%s\n", perf_more_info_string);
+		return 0;
+	}
+
+	if (!argv[0]) {
+		printf("usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n%s\n", perf_more_info_string);
+		return 0;
+	}
+
+	alias = alias_lookup(argv[0]);
+	if (alias && !is_perf_command(argv[0])) {
+		printf("`perf %s' is aliased to `%s'\n", argv[0], alias);
+		return 0;
+	}
+
+	switch (help_format) {
+	case HELP_FORMAT_MAN:
+		show_man_page(argv[0]);
+		break;
+	case HELP_FORMAT_INFO:
+		show_info_page(argv[0]);
+		break;
+	case HELP_FORMAT_WEB:
+		show_html_page(argv[0]);
+		break;
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
new file mode 100644
index 0000000000000..9d2c769e5f835
--- /dev/null
+++ b/Documentation/perf_counter/builtin-top.c
@@ -0,0 +1,1411 @@
+/*
+ * kerneltop.c: show top kernel functions - performance counters showcase
+
+   Build with:
+
+     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
+
+   Sample output:
+
+------------------------------------------------------------------------------
+ KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+------------------------------------------------------------------------------
+
+             weight         RIP          kernel function
+             ______   ________________   _______________
+
+              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
+              33.00 - ffffffff804cb740 : sock_alloc_send_skb
+              31.26 - ffffffff804ce808 : skb_push
+              22.43 - ffffffff80510004 : tcp_established_options
+              19.00 - ffffffff8027d250 : find_get_page
+              15.76 - ffffffff804e4fc9 : eth_type_trans
+              15.20 - ffffffff804d8baa : dst_release
+              14.86 - ffffffff804cf5d8 : skb_release_head_state
+              14.00 - ffffffff802217d5 : read_hpet
+              12.00 - ffffffff804ffb7f : __ip_local_out
+              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
+               8.54 - ffffffff805001a3 : ip_queue_xmit
+ */
+
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
+
+          It summarizes the counter events of all tasks (and child tasks),
+          covering all CPUs that the command (or workload) executes on.
+          It only counts the per-task events of the workload started,
+          independent of how many other tasks run on those CPUs.
+
+   Sample output:
+
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
+ */
+
+ /*
+  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+  *
+  * Improvements and fixes by:
+  *
+  *   Arjan van de Ven <arjan@linux.intel.com>
+  *   Yanmin Zhang <yanmin.zhang@intel.com>
+  *   Wu Fengguang <fengguang.wu@intel.com>
+  *   Mike Galbraith <efault@gmx.de>
+  *   Paul Mackerras <paulus@samba.org>
+  *
+  * Released under the GPL v2. (and only v2, not any later version)
+  */
+
+#include "util.h"
+
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags)
+{
+        return syscall(
+                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
+}
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			run_perfstat			=  0;
+static int			system_wide			=  0;
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= {
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static __u64			count_filter		       = 100;
+
+static int			tid				= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static int			nmi				=  1;
+static unsigned int		realtime_prio			=  0;
+static int			group				=  0;
+static unsigned int		page_size;
+static unsigned int		mmap_pages			=  16;
+static int			use_mmap			= 0;
+static int			use_munmap			= 0;
+
+static char			*vmlinux;
+
+static char			*sym_filter;
+static unsigned long		filter_start;
+static unsigned long		filter_end;
+
+static int			delay_secs			=  2;
+static int			zero;
+static int			dump_symtab;
+
+static int			scale;
+
+struct source_line {
+	uint64_t		EIP;
+	unsigned long		count;
+	char			*line;
+	struct source_line	*next;
+};
+
+static struct source_line	*lines;
+static struct source_line	**lines_tail;
+
+const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_perfstat_help(void)
+{
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -l                           # scale counter values\n"
+	" -a                           # system-wide collection\n");
+	exit(0);
+}
+
+static void display_help(void)
+{
+	if (run_perfstat)
+		return display_perfstat_help();
+
+	printf(
+	"Usage: kerneltop [<options>]\n"
+	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
+	"KernelTop Options (up to %d event types can be specified at once):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -S        --stat             # perfstat COMMAND\n"
+	" -a                           # system-wide collection (for perfstat)\n\n"
+	" -c CNT    --count=CNT        # event period to sample\n\n"
+	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
+	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
+	" -l                           # show scale factor for RR events\n"
+	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
+	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
+	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
+	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
+	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
+	" -z        --zero             # zero counts after display\n"
+	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
+	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
+	" -M        --mmap_info        # print mmap info stream\n"
+	" -U        --munmap_info      # print munmap info stream\n"
+	);
+
+	exit(0);
+}
+
+static char *event_name(int ctr)
+{
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
+	static char buf[32];
+
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+
+/*
+ * perfstat
+ */
+
+char fault_here[1000000];
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.config		= event_id[counter];
+	hw_event.record_type	= 0;
+	hw_event.nmi		= 0;
+	if (scale)
+		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+					  PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+int do_perfstat(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	if (!argc)
+		display_help();
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		int cpu, nv;
+		__u64 count[3], single_count[3];
+		int scaled;
+
+		count[0] = count[1] = count[2] = 0;
+		nv = scale ? 3 : 1;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+				   single_count, nv * sizeof(__u64));
+			assert(res == nv * sizeof(__u64));
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
+		}
+
+		scaled = 0;
+		if (scale) {
+			if (count[2] == 0) {
+				fprintf(stderr, " %14s  %-20s\n",
+					"<not counted>", event_name(counter));
+				continue;
+			}
+			if (count[2] < count[1]) {
+				scaled = 1;
+				count[0] = (unsigned long long)
+					((double)count[0] * count[1] / count[2] + 0.5);
+			}
+		}
+
+		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
+		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count[0] / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)",
+				count[0], event_name(counter));
+		}
+		if (scaled)
+			fprintf(stderr, "  (scaled from %.2f%%)",
+				(double) count[2] / count[1] * 100);
+		fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
+
+/*
+ * Symbols
+ */
+
+static uint64_t			min_ip;
+static uint64_t			max_ip = -1ll;
+
+struct sym_entry {
+	unsigned long long	addr;
+	char			*sym;
+	unsigned long		count[MAX_COUNTERS];
+	int			skip;
+	struct source_line	*source;
+};
+
+#define MAX_SYMS		100000
+
+static int sym_table_count;
+
+struct sym_entry		*sym_filter_entry;
+
+static struct sym_entry		sym_table[MAX_SYMS];
+
+static void show_details(struct sym_entry *sym);
+
+/*
+ * Ordering weight: count-1 * count-2 * ... / count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight;
+	int counter;
+
+	weight = sym->count[0];
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static int compare(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym_weight(sym1) < sym_weight(sym2);
+}
+
+static long			events;
+static long			userspace_events;
+static const char		CONSOLE_CLEAR[] = "[H[2J";
+
+static struct sym_entry		tmp[MAX_SYMS];
+
+static void print_sym_table(void)
+{
+	int i, printed;
+	int counter;
+	float events_per_sec = events/delay_secs;
+	float kevents_per_sec = (events-userspace_events)/delay_secs;
+	float sum_kevents = 0.0;
+
+	events = userspace_events = 0;
+	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
+	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
+
+	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
+		sum_kevents += tmp[i].count[0];
+
+	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
+
+	printf(
+"------------------------------------------------------------------------------\n");
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
+		events_per_sec,
+		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
+		nmi ? "NMI" : "IRQ");
+
+	if (nr_counters == 1)
+		printf("%d ", event_count[0]);
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (counter)
+			printf("/");
+
+		printf("%s", event_name(counter));
+	}
+
+	printf( "], ");
+
+	if (tid != -1)
+		printf(" (tid: %d", tid);
+	else
+		printf(" (all");
+
+	if (profile_cpu != -1)
+		printf(", cpu: %d)\n", profile_cpu);
+	else {
+		if (tid != -1)
+			printf(")\n");
+		else
+			printf(", %d CPUs)\n", nr_cpus);
+	}
+
+	printf("------------------------------------------------------------------------------\n\n");
+
+	if (nr_counters == 1)
+		printf("             events    pcnt");
+	else
+		printf("  weight     events    pcnt");
+
+	printf("         RIP          kernel function\n"
+	       	       "  ______     ______   _____   ________________   _______________\n\n"
+	);
+
+	for (i = 0, printed = 0; i < sym_table_count; i++) {
+		float pcnt;
+		int count;
+
+		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
+			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+
+			if (nr_counters == 1)
+				printf("%19.2f - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					pcnt, tmp[i].addr, tmp[i].sym);
+			else
+				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
+					sym_weight(tmp + i),
+					tmp[i].count[0],
+					pcnt, tmp[i].addr, tmp[i].sym);
+			printed++;
+		}
+		/*
+		 * Add decay to the counts:
+		 */
+		for (count = 0; count < nr_counters; count++)
+			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
+	}
+
+	if (sym_filter_entry)
+		show_details(sym_filter_entry);
+
+	{
+		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+
+		if (poll(&stdin_poll, 1, 0) == 1) {
+			printf("key pressed - exiting.\n");
+			exit(0);
+		}
+	}
+}
+
+static void *display_thread(void *arg)
+{
+	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+
+	while (!sleep(delay_secs))
+		print_sym_table();
+
+	return NULL;
+}
+
+static int read_symbol(FILE *in, struct sym_entry *s)
+{
+	static int filter_match = 0;
+	char *sym, stype;
+	char str[500];
+	int rc, pos;
+
+	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
+	if (rc == EOF)
+		return -1;
+
+	assert(rc == 3);
+
+	/* skip until end of line: */
+	pos = strlen(str);
+	do {
+		rc = fgetc(in);
+		if (rc == '\n' || rc == EOF || pos >= 499)
+			break;
+		str[pos] = rc;
+		pos++;
+	} while (1);
+	str[pos] = 0;
+
+	sym = str;
+
+	/* Filter out known duplicates and non-text symbols. */
+	if (!strcmp(sym, "_text"))
+		return 1;
+	if (!min_ip && !strcmp(sym, "_stext"))
+		return 1;
+	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
+		return 1;
+	if (stype != 'T' && stype != 't')
+		return 1;
+	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
+		return 1;
+	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
+		return 1;
+
+	s->sym = malloc(strlen(str));
+	assert(s->sym);
+
+	strcpy((char *)s->sym, str);
+	s->skip = 0;
+
+	/* Tag events to be skipped. */
+	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
+		s->skip = 1;
+	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
+		s->skip = 1;
+	else if (!strcmp("mwait_idle", s->sym))
+		s->skip = 1;
+
+	if (filter_match == 1) {
+		filter_end = s->addr;
+		filter_match = -1;
+		if (filter_end - filter_start > 10000) {
+			printf("hm, too large filter symbol <%s> - skipping.\n",
+				sym_filter);
+			printf("symbol filter start: %016lx\n", filter_start);
+			printf("                end: %016lx\n", filter_end);
+			filter_end = filter_start = 0;
+			sym_filter = NULL;
+			sleep(1);
+		}
+	}
+	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
+		filter_match = 1;
+		filter_start = s->addr;
+	}
+
+	return 0;
+}
+
+int compare_addr(const void *__sym1, const void *__sym2)
+{
+	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+
+	return sym1->addr > sym2->addr;
+}
+
+static void sort_symbol_table(void)
+{
+	int i, dups;
+
+	do {
+		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
+		for (i = 0, dups = 0; i < sym_table_count; i++) {
+			if (sym_table[i].addr == sym_table[i+1].addr) {
+				sym_table[i+1].addr = -1ll;
+				dups++;
+			}
+		}
+		sym_table_count -= dups;
+	} while(dups);
+}
+
+static void parse_symbols(void)
+{
+	struct sym_entry *last;
+
+	FILE *kallsyms = fopen("/proc/kallsyms", "r");
+
+	if (!kallsyms) {
+		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
+		exit(-1);
+	}
+
+	while (!feof(kallsyms)) {
+		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
+			sym_table_count++;
+			assert(sym_table_count <= MAX_SYMS);
+		}
+	}
+
+	sort_symbol_table();
+	min_ip = sym_table[0].addr;
+	max_ip = sym_table[sym_table_count-1].addr;
+	last = sym_table + sym_table_count++;
+
+	last->addr = -1ll;
+	last->sym = "<end>";
+
+	if (filter_end) {
+		int count;
+		for (count=0; count < sym_table_count; count ++) {
+			if (!strcmp(sym_table[count].sym, sym_filter)) {
+				sym_filter_entry = &sym_table[count];
+				break;
+			}
+		}
+	}
+	if (dump_symtab) {
+		int i;
+
+		for (i = 0; i < sym_table_count; i++)
+			fprintf(stderr, "%llx %s\n",
+				sym_table[i].addr, sym_table[i].sym);
+	}
+}
+
+/*
+ * Source lines
+ */
+
+static void parse_vmlinux(char *filename)
+{
+	FILE *file;
+	char command[PATH_MAX*2];
+	if (!filename)
+		return;
+
+	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	lines_tail = &lines;
+	while (!feof(file)) {
+		struct source_line *src;
+		size_t dummy = 0;
+		char *c;
+
+		src = malloc(sizeof(struct source_line));
+		assert(src != NULL);
+		memset(src, 0, sizeof(struct source_line));
+
+		if (getline(&src->line, &dummy, file) < 0)
+			break;
+		if (!src->line)
+			break;
+
+		c = strchr(src->line, '\n');
+		if (c)
+			*c = 0;
+
+		src->next = NULL;
+		*lines_tail = src;
+		lines_tail = &src->next;
+
+		if (strlen(src->line)>8 && src->line[8] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+		if (strlen(src->line)>8 && src->line[16] == ':')
+			src->EIP = strtoull(src->line, NULL, 16);
+	}
+	pclose(file);
+}
+
+static void record_precise_ip(uint64_t ip)
+{
+	struct source_line *line;
+
+	for (line = lines; line; line = line->next) {
+		if (line->EIP == ip)
+			line->count++;
+		if (line->EIP > ip)
+			break;
+	}
+}
+
+static void lookup_sym_in_vmlinux(struct sym_entry *sym)
+{
+	struct source_line *line;
+	char pattern[PATH_MAX];
+	sprintf(pattern, "<%s>:", sym->sym);
+
+	for (line = lines; line; line = line->next) {
+		if (strstr(line->line, pattern)) {
+			sym->source = line;
+			break;
+		}
+	}
+}
+
+static void show_lines(struct source_line *line_queue, int line_queue_count)
+{
+	int i;
+	struct source_line *line;
+
+	line = line_queue;
+	for (i = 0; i < line_queue_count; i++) {
+		printf("%8li\t%s\n", line->count, line->line);
+		line = line->next;
+	}
+}
+
+#define TRACE_COUNT     3
+
+static void show_details(struct sym_entry *sym)
+{
+	struct source_line *line;
+	struct source_line *line_queue = NULL;
+	int displayed = 0;
+	int line_queue_count = 0;
+
+	if (!sym->source)
+		lookup_sym_in_vmlinux(sym);
+	if (!sym->source)
+		return;
+
+	printf("Showing details for %s\n", sym->sym);
+
+	line = sym->source;
+	while (line) {
+		if (displayed && strstr(line->line, ">:"))
+			break;
+
+		if (!line_queue_count)
+			line_queue = line;
+		line_queue_count ++;
+
+		if (line->count >= count_filter) {
+			show_lines(line_queue, line_queue_count);
+			line_queue_count = 0;
+			line_queue = NULL;
+		} else if (line_queue_count > TRACE_COUNT) {
+			line_queue = line_queue->next;
+			line_queue_count --;
+		}
+
+		line->count = 0;
+		displayed++;
+		if (displayed > 300)
+			break;
+		line = line->next;
+	}
+}
+
+/*
+ * Binary search in the histogram table and record the hit:
+ */
+static void record_ip(uint64_t ip, int counter)
+{
+	int left_idx, middle_idx, right_idx, idx;
+	unsigned long left, middle, right;
+
+	record_precise_ip(ip);
+
+	left_idx = 0;
+	right_idx = sym_table_count-1;
+	assert(ip <= max_ip && ip >= min_ip);
+
+	while (left_idx + 1 < right_idx) {
+		middle_idx = (left_idx + right_idx) / 2;
+
+		left   = sym_table[  left_idx].addr;
+		middle = sym_table[middle_idx].addr;
+		right  = sym_table[ right_idx].addr;
+
+		if (!(left <= middle && middle <= right)) {
+			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
+			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
+		}
+		assert(left <= middle && middle <= right);
+		if (!(left <= ip && ip <= right)) {
+			printf(" left: %016lx\n", left);
+			printf("   ip: %016lx\n", (unsigned long)ip);
+			printf("right: %016lx\n", right);
+		}
+		assert(left <= ip && ip <= right);
+		/*
+		 * [ left .... target .... middle .... right ]
+		 *   => right := middle
+		 */
+		if (ip < middle) {
+			right_idx = middle_idx;
+			continue;
+		}
+		/*
+		 * [ left .... middle ... target ... right ]
+		 *   => left := middle
+		 */
+		left_idx = middle_idx;
+	}
+
+	idx = left_idx;
+
+	if (!sym_table[idx].skip)
+		sym_table[idx].count[counter]++;
+	else events--;
+}
+
+static void process_event(uint64_t ip, int counter)
+{
+	events++;
+
+	if (ip < min_ip || ip > max_ip) {
+		userspace_events++;
+		return;
+	}
+
+	record_ip(ip, counter);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0, counter;
+
+	if (strstr(argv[0], "perfstat"))
+		run_perfstat = 1;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"mmap_info",	no_argument,		NULL, 'M'},
+			{"mmap_pages",	required_argument,	NULL, 'm'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{"scale",	no_argument,		NULL, 'l'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"stat",	no_argument,		NULL, 'S'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a': system_wide			=	       1; break;
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+		case 'd': delay_secs			=   atoi(optarg); break;
+		case 'D': dump_symtab			=              1; break;
+
+		case 'e': error				= parse_events(optarg); break;
+
+		case 'f': count_filter			=   atoi(optarg); break;
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'l': scale				=	       1; break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 'r': realtime_prio			=   atoi(optarg); break;
+		case 's': sym_filter			= strdup(optarg); break;
+		case 'S': run_perfstat			=	       1; break;
+		case 'x': vmlinux			= strdup(optarg); break;
+		case 'z': zero				=              1; break;
+		case 'm': mmap_pages			=   atoi(optarg); break;
+		case 'M': use_mmap			=              1; break;
+		case 'U': use_munmap			=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		if (run_perfstat)
+			nr_counters = 8;
+		else {
+			nr_counters = 1;
+			event_id[0] = 0;
+		}
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+struct mmap_data {
+	int counter;
+	void *base;
+	unsigned int mask;
+	unsigned int prev;
+};
+
+static unsigned int mmap_read_head(struct mmap_data *md)
+{
+	struct perf_counter_mmap_page *pc = md->base;
+	int head;
+
+	head = pc->data_head;
+	rmb();
+
+	return head;
+}
+
+struct timeval last_read, this_read;
+
+static void mmap_read(struct mmap_data *md)
+{
+	unsigned int head = mmap_read_head(md);
+	unsigned int old = md->prev;
+	unsigned char *data = md->base + page_size;
+	int diff;
+
+	gettimeofday(&this_read, NULL);
+
+	/*
+	 * If we're further behind than half the buffer, there's a chance
+	 * the writer will bite our tail and screw up the events under us.
+	 *
+	 * If we somehow ended up ahead of the head, we got messed up.
+	 *
+	 * In either case, truncate and restart at head.
+	 */
+	diff = head - old;
+	if (diff > md->mask / 2 || diff < 0) {
+		struct timeval iv;
+		unsigned long msecs;
+
+		timersub(&this_read, &last_read, &iv);
+		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
+
+		fprintf(stderr, "WARNING: failed to keep up with mmap data."
+				"  Last read %lu msecs ago.\n", msecs);
+
+		/*
+		 * head points to a known good entry, start there.
+		 */
+		old = head;
+	}
+
+	last_read = this_read;
+
+	for (; old != head;) {
+		struct ip_event {
+			struct perf_event_header header;
+			__u64 ip;
+			__u32 pid, tid;
+		};
+		struct mmap_event {
+			struct perf_event_header header;
+			__u32 pid, tid;
+			__u64 start;
+			__u64 len;
+			__u64 pgoff;
+			char filename[PATH_MAX];
+		};
+
+		typedef union event_union {
+			struct perf_event_header header;
+			struct ip_event ip;
+			struct mmap_event mmap;
+		} event_t;
+
+		event_t *event = (event_t *)&data[old & md->mask];
+
+		event_t event_copy;
+
+		unsigned int size = event->header.size;
+
+		/*
+		 * Event straddles the mmap boundary -- header should always
+		 * be inside due to u64 alignment of output.
+		 */
+		if ((old & md->mask) + size != ((old + size) & md->mask)) {
+			unsigned int offset = old;
+			unsigned int len = min(sizeof(*event), size), cpy;
+			void *dst = &event_copy;
+
+			do {
+				cpy = min(md->mask + 1 - (offset & md->mask), len);
+				memcpy(dst, &data[offset & md->mask], cpy);
+				offset += cpy;
+				dst += cpy;
+				len -= cpy;
+			} while (len);
+
+			event = &event_copy;
+		}
+
+		old += size;
+
+		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+			if (event->header.type & PERF_RECORD_IP)
+				process_event(event->ip.ip, md->counter);
+		} else {
+			switch (event->header.type) {
+				case PERF_EVENT_MMAP:
+				case PERF_EVENT_MUNMAP:
+					printf("%s: %Lu %Lu %Lu %s\n",
+							event->header.type == PERF_EVENT_MMAP
+							? "mmap" : "munmap",
+							event->mmap.start,
+							event->mmap.len,
+							event->mmap.pgoff,
+							event->mmap.filename);
+					break;
+			}
+		}
+	}
+
+	md->prev = old;
+}
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+	struct perf_counter_hw_event hw_event;
+	pthread_t thread;
+	int i, counter, group_fd, nr_poll = 0;
+	unsigned int cpu;
+	int ret;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (run_perfstat)
+		return do_perfstat(argc, argv);
+
+	if (tid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	parse_symbols();
+	if (vmlinux && sym_filter_entry)
+		parse_vmlinux(vmlinux);
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++) {
+
+			cpu	= profile_cpu;
+			if (tid == -1 && profile_cpu == -1)
+				cpu = i;
+
+			memset(&hw_event, 0, sizeof(hw_event));
+			hw_event.config		= event_id[counter];
+			hw_event.irq_period	= event_count[counter];
+			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+			hw_event.nmi		= nmi;
+			hw_event.mmap		= use_mmap;
+			hw_event.munmap		= use_munmap;
+
+			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
+			if (fd[i][counter] < 0) {
+				int err = errno;
+				printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[i][counter], strerror(err));
+				if (err == EPERM)
+					printf("Are you root?\n");
+				exit(-1);
+			}
+			assert(fd[i][counter] >= 0);
+			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+			/*
+			 * First counter acts as the group leader:
+			 */
+			if (group && group_fd == -1)
+				group_fd = fd[i][counter];
+
+			event_array[nr_poll].fd = fd[i][counter];
+			event_array[nr_poll].events = POLLIN;
+			nr_poll++;
+
+			mmap_array[i][counter].counter = counter;
+			mmap_array[i][counter].prev = 0;
+			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+					PROT_READ, MAP_SHARED, fd[i][counter], 0);
+			if (mmap_array[i][counter].base == MAP_FAILED) {
+				printf("kerneltop error: failed to mmap with %d (%s)\n",
+						errno, strerror(errno));
+				exit(-1);
+			}
+		}
+	}
+
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	while (1) {
+		int hits = events;
+
+		for (i = 0; i < nr_cpus; i++) {
+			for (counter = 0; counter < nr_counters; counter++)
+				mmap_read(&mmap_array[i][counter]);
+		}
+
+		if (hits == events)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
new file mode 100644
index 0000000000000..41637444ce2dc
--- /dev/null
+++ b/Documentation/perf_counter/builtin.h
@@ -0,0 +1,18 @@
+#ifndef BUILTIN_H
+#define BUILTIN_H
+
+#include "util.h"
+#include "strbuf.h"
+
+extern const char perf_version_string[];
+extern const char perf_usage_string[];
+extern const char perf_more_info_string[];
+
+extern void list_common_cmds_help(void);
+extern const char *help_unknown_cmd(const char *cmd);
+extern void prune_packed_objects(int);
+extern int read_line_with_nul(char *buf, int size, FILE *file);
+extern int check_pager_config(const char *cmd);
+
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+#endif
diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h
new file mode 100644
index 0000000000000..dc085640a57d4
--- /dev/null
+++ b/Documentation/perf_counter/cache.h
@@ -0,0 +1,97 @@
+#ifndef CACHE_H
+#define CACHE_H
+
+#include "util.h"
+#include "strbuf.h"
+
+#define PERF_DIR_ENVIRONMENT "PERF_DIR"
+#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE"
+#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf"
+#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY"
+#define INDEX_ENVIRONMENT "PERF_INDEX_FILE"
+#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE"
+#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR"
+#define CONFIG_ENVIRONMENT "PERF_CONFIG"
+#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH"
+#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES"
+#define PERFATTRIBUTES_FILE ".perfattributes"
+#define INFOATTRIBUTES_FILE "info/attributes"
+#define ATTRIBUTE_MACRO_PREFIX "[attr]"
+
+typedef int (*config_fn_t)(const char *, const char *, void *);
+extern int perf_default_config(const char *, const char *, void *);
+extern int perf_config_from_file(config_fn_t fn, const char *, void *);
+extern int perf_config(config_fn_t fn, void *);
+extern int perf_parse_ulong(const char *, unsigned long *);
+extern int perf_config_int(const char *, const char *);
+extern unsigned long perf_config_ulong(const char *, const char *);
+extern int perf_config_bool_or_int(const char *, const char *, int *);
+extern int perf_config_bool(const char *, const char *);
+extern int perf_config_string(const char **, const char *, const char *);
+extern int perf_config_set(const char *, const char *);
+extern int perf_config_set_multivar(const char *, const char *, const char *, int);
+extern int perf_config_rename_section(const char *, const char *);
+extern const char *perf_etc_perfconfig(void);
+extern int check_repository_format_version(const char *var, const char *value, void *cb);
+extern int perf_config_system(void);
+extern int perf_config_global(void);
+extern int config_error_nonbool(const char *);
+extern const char *config_exclusive_filename;
+
+#define MAX_PERFNAME (1000)
+extern char perf_default_email[MAX_PERFNAME];
+extern char perf_default_name[MAX_PERFNAME];
+extern int user_ident_explicitly_given;
+
+extern const char *perf_log_output_encoding;
+extern const char *perf_mailmap_file;
+
+/* IO helper functions */
+extern void maybe_flush_or_die(FILE *, const char *);
+extern int copy_fd(int ifd, int ofd);
+extern int copy_file(const char *dst, const char *src, int mode);
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+extern ssize_t write_in_full(int fd, const void *buf, size_t count);
+extern void write_or_die(int fd, const void *buf, size_t count);
+extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg);
+extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg);
+extern void fsync_or_die(int fd, const char *);
+
+/* pager.c */
+extern void setup_pager(void);
+extern const char *pager_program;
+extern int pager_in_use(void);
+extern int pager_use_color;
+
+extern const char *editor_program;
+extern const char *excludes_file;
+
+char *alias_lookup(const char *alias);
+int split_cmdline(char *cmdline, const char ***argv);
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/*
+ * Realloc the buffer pointed at by variable 'x' so that it can hold
+ * at least 'nr' entries; the number of entries currently allocated
+ * is 'alloc', using the standard growing factor alloc_nr() macro.
+ *
+ * DO NOT USE any expression with side-effect for 'x' or 'alloc'.
+ */
+#define ALLOC_GROW(x, nr, alloc) \
+	do { \
+		if ((nr) > alloc) { \
+			if (alloc_nr(alloc) < (nr)) \
+				alloc = (nr); \
+			else \
+				alloc = alloc_nr(alloc); \
+			x = xrealloc((x), alloc * sizeof(*(x))); \
+		} \
+	} while(0)
+
+
+static inline int is_absolute_path(const char *path)
+{
+	return path[0] == '/';
+}
+#endif /* CACHE_H */
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
new file mode 100644
index 0000000000000..1eab3659b2066
--- /dev/null
+++ b/Documentation/perf_counter/command-list.txt
@@ -0,0 +1,4 @@
+# List of known perf commands.
+# command name				category [deprecated] [common]
+perf-top                                mainporcelain common
+
diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c
new file mode 100644
index 0000000000000..672d53959334c
--- /dev/null
+++ b/Documentation/perf_counter/config.c
@@ -0,0 +1,966 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ * Copyright (C) Johannes Schindelin, 2005
+ *
+ */
+#include "util.h"
+#include "cache.h"
+#include "exec_cmd.h"
+
+#define MAXNAME (256)
+
+static FILE *config_file;
+static const char *config_file_name;
+static int config_linenr;
+static int config_file_eof;
+static int zlib_compression_seen;
+
+const char *config_exclusive_filename = NULL;
+
+static int get_next_char(void)
+{
+	int c;
+	FILE *f;
+
+	c = '\n';
+	if ((f = config_file) != NULL) {
+		c = fgetc(f);
+		if (c == '\r') {
+			/* DOS like systems */
+			c = fgetc(f);
+			if (c != '\n') {
+				ungetc(c, f);
+				c = '\r';
+			}
+		}
+		if (c == '\n')
+			config_linenr++;
+		if (c == EOF) {
+			config_file_eof = 1;
+			c = '\n';
+		}
+	}
+	return c;
+}
+
+static char *parse_value(void)
+{
+	static char value[1024];
+	int quote = 0, comment = 0, len = 0, space = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (len >= sizeof(value) - 1)
+			return NULL;
+		if (c == '\n') {
+			if (quote)
+				return NULL;
+			value[len] = 0;
+			return value;
+		}
+		if (comment)
+			continue;
+		if (isspace(c) && !quote) {
+			space = 1;
+			continue;
+		}
+		if (!quote) {
+			if (c == ';' || c == '#') {
+				comment = 1;
+				continue;
+			}
+		}
+		if (space) {
+			if (len)
+				value[len++] = ' ';
+			space = 0;
+		}
+		if (c == '\\') {
+			c = get_next_char();
+			switch (c) {
+			case '\n':
+				continue;
+			case 't':
+				c = '\t';
+				break;
+			case 'b':
+				c = '\b';
+				break;
+			case 'n':
+				c = '\n';
+				break;
+			/* Some characters escape as themselves */
+			case '\\': case '"':
+				break;
+			/* Reject unknown escape sequences */
+			default:
+				return NULL;
+			}
+			value[len++] = c;
+			continue;
+		}
+		if (c == '"') {
+			quote = 1-quote;
+			continue;
+		}
+		value[len++] = c;
+	}
+}
+
+static inline int iskeychar(int c)
+{
+	return isalnum(c) || c == '-';
+}
+
+static int get_value(config_fn_t fn, void *data, char *name, unsigned int len)
+{
+	int c;
+	char *value;
+
+	/* Get the full name */
+	for (;;) {
+		c = get_next_char();
+		if (config_file_eof)
+			break;
+		if (!iskeychar(c))
+			break;
+		name[len++] = tolower(c);
+		if (len >= MAXNAME)
+			return -1;
+	}
+	name[len] = 0;
+	while (c == ' ' || c == '\t')
+		c = get_next_char();
+
+	value = NULL;
+	if (c != '\n') {
+		if (c != '=')
+			return -1;
+		value = parse_value();
+		if (!value)
+			return -1;
+	}
+	return fn(name, value, data);
+}
+
+static int get_extended_base_var(char *name, int baselen, int c)
+{
+	do {
+		if (c == '\n')
+			return -1;
+		c = get_next_char();
+	} while (isspace(c));
+
+	/* We require the format to be '[base "extension"]' */
+	if (c != '"')
+		return -1;
+	name[baselen++] = '.';
+
+	for (;;) {
+		int c = get_next_char();
+		if (c == '\n')
+			return -1;
+		if (c == '"')
+			break;
+		if (c == '\\') {
+			c = get_next_char();
+			if (c == '\n')
+				return -1;
+		}
+		name[baselen++] = c;
+		if (baselen > MAXNAME / 2)
+			return -1;
+	}
+
+	/* Final ']' */
+	if (get_next_char() != ']')
+		return -1;
+	return baselen;
+}
+
+static int get_base_var(char *name)
+{
+	int baselen = 0;
+
+	for (;;) {
+		int c = get_next_char();
+		if (config_file_eof)
+			return -1;
+		if (c == ']')
+			return baselen;
+		if (isspace(c))
+			return get_extended_base_var(name, baselen, c);
+		if (!iskeychar(c) && c != '.')
+			return -1;
+		if (baselen > MAXNAME / 2)
+			return -1;
+		name[baselen++] = tolower(c);
+	}
+}
+
+static int perf_parse_file(config_fn_t fn, void *data)
+{
+	int comment = 0;
+	int baselen = 0;
+	static char var[MAXNAME];
+
+	/* U+FEFF Byte Order Mark in UTF8 */
+	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf";
+	const unsigned char *bomptr = utf8_bom;
+
+	for (;;) {
+		int c = get_next_char();
+		if (bomptr && *bomptr) {
+			/* We are at the file beginning; skip UTF8-encoded BOM
+			 * if present. Sane editors won't put this in on their
+			 * own, but e.g. Windows Notepad will do it happily. */
+			if ((unsigned char) c == *bomptr) {
+				bomptr++;
+				continue;
+			} else {
+				/* Do not tolerate partial BOM. */
+				if (bomptr != utf8_bom)
+					break;
+				/* No BOM at file beginning. Cool. */
+				bomptr = NULL;
+			}
+		}
+		if (c == '\n') {
+			if (config_file_eof)
+				return 0;
+			comment = 0;
+			continue;
+		}
+		if (comment || isspace(c))
+			continue;
+		if (c == '#' || c == ';') {
+			comment = 1;
+			continue;
+		}
+		if (c == '[') {
+			baselen = get_base_var(var);
+			if (baselen <= 0)
+				break;
+			var[baselen++] = '.';
+			var[baselen] = 0;
+			continue;
+		}
+		if (!isalpha(c))
+			break;
+		var[baselen] = tolower(c);
+		if (get_value(fn, data, var, baselen+1) < 0)
+			break;
+	}
+	die("bad config file line %d in %s", config_linenr, config_file_name);
+}
+
+static int parse_unit_factor(const char *end, unsigned long *val)
+{
+	if (!*end)
+		return 1;
+	else if (!strcasecmp(end, "k")) {
+		*val *= 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "m")) {
+		*val *= 1024 * 1024;
+		return 1;
+	}
+	else if (!strcasecmp(end, "g")) {
+		*val *= 1024 * 1024 * 1024;
+		return 1;
+	}
+	return 0;
+}
+
+static int perf_parse_long(const char *value, long *ret)
+{
+	if (value && *value) {
+		char *end;
+		long val = strtol(value, &end, 0);
+		unsigned long factor = 1;
+		if (!parse_unit_factor(end, &factor))
+			return 0;
+		*ret = val * factor;
+		return 1;
+	}
+	return 0;
+}
+
+int perf_parse_ulong(const char *value, unsigned long *ret)
+{
+	if (value && *value) {
+		char *end;
+		unsigned long val = strtoul(value, &end, 0);
+		if (!parse_unit_factor(end, &val))
+			return 0;
+		*ret = val;
+		return 1;
+	}
+	return 0;
+}
+
+static void die_bad_config(const char *name)
+{
+	if (config_file_name)
+		die("bad config value for '%s' in %s", name, config_file_name);
+	die("bad config value for '%s'", name);
+}
+
+int perf_config_int(const char *name, const char *value)
+{
+	long ret = 0;
+	if (!perf_parse_long(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+unsigned long perf_config_ulong(const char *name, const char *value)
+{
+	unsigned long ret;
+	if (!perf_parse_ulong(value, &ret))
+		die_bad_config(name);
+	return ret;
+}
+
+int perf_config_bool_or_int(const char *name, const char *value, int *is_bool)
+{
+	*is_bool = 1;
+	if (!value)
+		return 1;
+	if (!*value)
+		return 0;
+	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on"))
+		return 1;
+	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off"))
+		return 0;
+	*is_bool = 0;
+	return perf_config_int(name, value);
+}
+
+int perf_config_bool(const char *name, const char *value)
+{
+	int discard;
+	return !!perf_config_bool_or_int(name, value, &discard);
+}
+
+int perf_config_string(const char **dest, const char *var, const char *value)
+{
+	if (!value)
+		return config_error_nonbool(var);
+	*dest = strdup(value);
+	return 0;
+}
+
+static int perf_default_core_config(const char *var, const char *value)
+{
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_default_config(const char *var, const char *value, void *dummy)
+{
+	if (!prefixcmp(var, "core."))
+		return perf_default_core_config(var, value);
+
+	/* Add other config variables here and to Documentation/config.txt. */
+	return 0;
+}
+
+int perf_config_from_file(config_fn_t fn, const char *filename, void *data)
+{
+	int ret;
+	FILE *f = fopen(filename, "r");
+
+	ret = -1;
+	if (f) {
+		config_file = f;
+		config_file_name = filename;
+		config_linenr = 1;
+		config_file_eof = 0;
+		ret = perf_parse_file(fn, data);
+		fclose(f);
+		config_file_name = NULL;
+	}
+	return ret;
+}
+
+const char *perf_etc_perfconfig(void)
+{
+	static const char *system_wide;
+	if (!system_wide)
+		system_wide = system_path(ETC_PERFCONFIG);
+	return system_wide;
+}
+
+static int perf_env_bool(const char *k, int def)
+{
+	const char *v = getenv(k);
+	return v ? perf_config_bool(k, v) : def;
+}
+
+int perf_config_system(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0);
+}
+
+int perf_config_global(void)
+{
+	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0);
+}
+
+int perf_config(config_fn_t fn, void *data)
+{
+	int ret = 0, found = 0;
+	char *repo_config = NULL;
+	const char *home = NULL;
+
+	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */
+	if (config_exclusive_filename)
+		return perf_config_from_file(fn, config_exclusive_filename, data);
+	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) {
+		ret += perf_config_from_file(fn, perf_etc_perfconfig(),
+					    data);
+		found += 1;
+	}
+
+	home = getenv("HOME");
+	if (perf_config_global() && home) {
+		char *user_config = strdup(mkpath("%s/.perfconfig", home));
+		if (!access(user_config, R_OK)) {
+			ret += perf_config_from_file(fn, user_config, data);
+			found += 1;
+		}
+		free(user_config);
+	}
+
+	repo_config = perf_pathdup("config");
+	if (!access(repo_config, R_OK)) {
+		ret += perf_config_from_file(fn, repo_config, data);
+		found += 1;
+	}
+	free(repo_config);
+	if (found == 0)
+		return -1;
+	return ret;
+}
+
+/*
+ * Find all the stuff for perf_config_set() below.
+ */
+
+#define MAX_MATCHES 512
+
+static struct {
+	int baselen;
+	char* key;
+	int do_not_match;
+	regex_t* value_regex;
+	int multi_replace;
+	size_t offset[MAX_MATCHES];
+	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state;
+	int seen;
+} store;
+
+static int matches(const char* key, const char* value)
+{
+	return !strcmp(key, store.key) &&
+		(store.value_regex == NULL ||
+		 (store.do_not_match ^
+		  !regexec(store.value_regex, value, 0, NULL, 0)));
+}
+
+static int store_aux(const char* key, const char* value, void *cb)
+{
+	const char *ep;
+	size_t section_len;
+
+	switch (store.state) {
+	case KEY_SEEN:
+		if (matches(key, value)) {
+			if (store.seen == 1 && store.multi_replace == 0) {
+				warning("%s has multiple values", key);
+			} else if (store.seen >= MAX_MATCHES) {
+				error("too many matches for %s", key);
+				return 1;
+			}
+
+			store.offset[store.seen] = ftell(config_file);
+			store.seen++;
+		}
+		break;
+	case SECTION_SEEN:
+		/*
+		 * What we are looking for is in store.key (both
+		 * section and var), and its section part is baselen
+		 * long.  We found key (again, both section and var).
+		 * We would want to know if this key is in the same
+		 * section as what we are looking for.  We already
+		 * know we are in the same section as what should
+		 * hold store.key.
+		 */
+		ep = strrchr(key, '.');
+		section_len = ep - key;
+
+		if ((section_len != store.baselen) ||
+		    memcmp(key, store.key, section_len+1)) {
+			store.state = SECTION_END_SEEN;
+			break;
+		}
+
+		/*
+		 * Do not increment matches: this is no match, but we
+		 * just made sure we are in the desired section.
+		 */
+		store.offset[store.seen] = ftell(config_file);
+		/* fallthru */
+	case SECTION_END_SEEN:
+	case START:
+		if (matches(key, value)) {
+			store.offset[store.seen] = ftell(config_file);
+			store.state = KEY_SEEN;
+			store.seen++;
+		} else {
+			if (strrchr(key, '.') - key == store.baselen &&
+			      !strncmp(key, store.key, store.baselen)) {
+					store.state = SECTION_SEEN;
+					store.offset[store.seen] = ftell(config_file);
+			}
+		}
+	}
+	return 0;
+}
+
+static int write_error(const char *filename)
+{
+	error("failed to write new configuration file %s", filename);
+
+	/* Same error code as "failed to rename". */
+	return 4;
+}
+
+static int store_write_section(int fd, const char* key)
+{
+	const char *dot;
+	int i, success;
+	struct strbuf sb = STRBUF_INIT;
+
+	dot = memchr(key, '.', store.baselen);
+	if (dot) {
+		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key);
+		for (i = dot - key + 1; i < store.baselen; i++) {
+			if (key[i] == '"' || key[i] == '\\')
+				strbuf_addch(&sb, '\\');
+			strbuf_addch(&sb, key[i]);
+		}
+		strbuf_addstr(&sb, "\"]\n");
+	} else {
+		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key);
+	}
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static int store_write_pair(int fd, const char* key, const char* value)
+{
+	int i, success;
+	int length = strlen(key + store.baselen + 1);
+	const char *quote = "";
+	struct strbuf sb = STRBUF_INIT;
+
+	/*
+	 * Check to see if the value needs to be surrounded with a dq pair.
+	 * Note that problematic characters are always backslash-quoted; this
+	 * check is about not losing leading or trailing SP and strings that
+	 * follow beginning-of-comment characters (i.e. ';' and '#') by the
+	 * configuration parser.
+	 */
+	if (value[0] == ' ')
+		quote = "\"";
+	for (i = 0; value[i]; i++)
+		if (value[i] == ';' || value[i] == '#')
+			quote = "\"";
+	if (i && value[i - 1] == ' ')
+		quote = "\"";
+
+	strbuf_addf(&sb, "\t%.*s = %s",
+		    length, key + store.baselen + 1, quote);
+
+	for (i = 0; value[i]; i++)
+		switch (value[i]) {
+		case '\n':
+			strbuf_addstr(&sb, "\\n");
+			break;
+		case '\t':
+			strbuf_addstr(&sb, "\\t");
+			break;
+		case '"':
+		case '\\':
+			strbuf_addch(&sb, '\\');
+		default:
+			strbuf_addch(&sb, value[i]);
+			break;
+		}
+	strbuf_addf(&sb, "%s\n", quote);
+
+	success = write_in_full(fd, sb.buf, sb.len) == sb.len;
+	strbuf_release(&sb);
+
+	return success;
+}
+
+static ssize_t find_beginning_of_line(const char* contents, size_t size,
+	size_t offset_, int* found_bracket)
+{
+	size_t equal_offset = size, bracket_offset = size;
+	ssize_t offset;
+
+contline:
+	for (offset = offset_-2; offset > 0
+			&& contents[offset] != '\n'; offset--)
+		switch (contents[offset]) {
+			case '=': equal_offset = offset; break;
+			case ']': bracket_offset = offset; break;
+		}
+	if (offset > 0 && contents[offset-1] == '\\') {
+		offset_ = offset;
+		goto contline;
+	}
+	if (bracket_offset < equal_offset) {
+		*found_bracket = 1;
+		offset = bracket_offset+1;
+	} else
+		offset++;
+
+	return offset;
+}
+
+int perf_config_set(const char* key, const char* value)
+{
+	return perf_config_set_multivar(key, value, NULL, 0);
+}
+
+/*
+ * If value==NULL, unset in (remove from) config,
+ * if value_regex!=NULL, disregard key/value pairs where value does not match.
+ * if multi_replace==0, nothing, or only one matching key/value is replaced,
+ *     else all matching key/values (regardless how many) are removed,
+ *     before the new pair is written.
+ *
+ * Returns 0 on success.
+ *
+ * This function does this:
+ *
+ * - it locks the config file by creating ".perf/config.lock"
+ *
+ * - it then parses the config using store_aux() as validator to find
+ *   the position on the key/value pair to replace. If it is to be unset,
+ *   it must be found exactly once.
+ *
+ * - the config file is mmap()ed and the part before the match (if any) is
+ *   written to the lock file, then the changed part and the rest.
+ *
+ * - the config file is removed and the lock file rename()d to it.
+ *
+ */
+int perf_config_set_multivar(const char* key, const char* value,
+	const char* value_regex, int multi_replace)
+{
+	int i, dot;
+	int fd = -1, in_fd;
+	int ret;
+	char* config_filename;
+	const char* last_dot = strrchr(key, '.');
+
+	if (config_exclusive_filename)
+		config_filename = strdup(config_exclusive_filename);
+	else
+		config_filename = perf_pathdup("config");
+
+	/*
+	 * Since "key" actually contains the section name and the real
+	 * key name separated by a dot, we have to know where the dot is.
+	 */
+
+	if (last_dot == NULL) {
+		error("key does not contain a section: %s", key);
+		ret = 2;
+		goto out_free;
+	}
+	store.baselen = last_dot - key;
+
+	store.multi_replace = multi_replace;
+
+	/*
+	 * Validate the key and while at it, lower case it for matching.
+	 */
+	store.key = malloc(strlen(key) + 1);
+	dot = 0;
+	for (i = 0; key[i]; i++) {
+		unsigned char c = key[i];
+		if (c == '.')
+			dot = 1;
+		/* Leave the extended basename untouched.. */
+		if (!dot || i > store.baselen) {
+			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) {
+				error("invalid key: %s", key);
+				free(store.key);
+				ret = 1;
+				goto out_free;
+			}
+			c = tolower(c);
+		} else if (c == '\n') {
+			error("invalid key (newline): %s", key);
+			free(store.key);
+			ret = 1;
+			goto out_free;
+		}
+		store.key[i] = c;
+	}
+	store.key[i] = 0;
+
+	/*
+	 * If .perf/config does not exist yet, write a minimal version.
+	 */
+	in_fd = open(config_filename, O_RDONLY);
+	if ( in_fd < 0 ) {
+		free(store.key);
+
+		if ( ENOENT != errno ) {
+			error("opening %s: %s", config_filename,
+			      strerror(errno));
+			ret = 3; /* same as "invalid config file" */
+			goto out_free;
+		}
+		/* if nothing to unset, error out */
+		if (value == NULL) {
+			ret = 5;
+			goto out_free;
+		}
+
+		store.key = (char*)key;
+		if (!store_write_section(fd, key) ||
+		    !store_write_pair(fd, key, value))
+			goto write_err_out;
+	} else {
+		struct stat st;
+		char* contents;
+		size_t contents_sz, copy_begin, copy_end;
+		int i, new_line = 0;
+
+		if (value_regex == NULL)
+			store.value_regex = NULL;
+		else {
+			if (value_regex[0] == '!') {
+				store.do_not_match = 1;
+				value_regex++;
+			} else
+				store.do_not_match = 0;
+
+			store.value_regex = (regex_t*)malloc(sizeof(regex_t));
+			if (regcomp(store.value_regex, value_regex,
+					REG_EXTENDED)) {
+				error("invalid pattern: %s", value_regex);
+				free(store.value_regex);
+				ret = 6;
+				goto out_free;
+			}
+		}
+
+		store.offset[0] = 0;
+		store.state = START;
+		store.seen = 0;
+
+		/*
+		 * After this, store.offset will contain the *end* offset
+		 * of the last match, or remain at 0 if no match was found.
+		 * As a side effect, we make sure to transform only a valid
+		 * existing config file.
+		 */
+		if (perf_config_from_file(store_aux, config_filename, NULL)) {
+			error("invalid config file %s", config_filename);
+			free(store.key);
+			if (store.value_regex != NULL) {
+				regfree(store.value_regex);
+				free(store.value_regex);
+			}
+			ret = 3;
+			goto out_free;
+		}
+
+		free(store.key);
+		if (store.value_regex != NULL) {
+			regfree(store.value_regex);
+			free(store.value_regex);
+		}
+
+		/* if nothing to unset, or too many matches, error out */
+		if ((store.seen == 0 && value == NULL) ||
+				(store.seen > 1 && multi_replace == 0)) {
+			ret = 5;
+			goto out_free;
+		}
+
+		fstat(in_fd, &st);
+		contents_sz = xsize_t(st.st_size);
+		contents = mmap(NULL, contents_sz, PROT_READ,
+			MAP_PRIVATE, in_fd, 0);
+		close(in_fd);
+
+		if (store.seen == 0)
+			store.seen = 1;
+
+		for (i = 0, copy_begin = 0; i < store.seen; i++) {
+			if (store.offset[i] == 0) {
+				store.offset[i] = copy_end = contents_sz;
+			} else if (store.state != KEY_SEEN) {
+				copy_end = store.offset[i];
+			} else
+				copy_end = find_beginning_of_line(
+					contents, contents_sz,
+					store.offset[i]-2, &new_line);
+
+			if (copy_end > 0 && contents[copy_end-1] != '\n')
+				new_line = 1;
+
+			/* write the first part of the config */
+			if (copy_end > copy_begin) {
+				if (write_in_full(fd, contents + copy_begin,
+						  copy_end - copy_begin) <
+				    copy_end - copy_begin)
+					goto write_err_out;
+				if (new_line &&
+				    write_in_full(fd, "\n", 1) != 1)
+					goto write_err_out;
+			}
+			copy_begin = store.offset[i];
+		}
+
+		/* write the pair (value == NULL means unset) */
+		if (value != NULL) {
+			if (store.state == START) {
+				if (!store_write_section(fd, key))
+					goto write_err_out;
+			}
+			if (!store_write_pair(fd, key, value))
+				goto write_err_out;
+		}
+
+		/* write the rest of the config */
+		if (copy_begin < contents_sz)
+			if (write_in_full(fd, contents + copy_begin,
+					  contents_sz - copy_begin) <
+			    contents_sz - copy_begin)
+				goto write_err_out;
+
+		munmap(contents, contents_sz);
+	}
+
+	ret = 0;
+
+out_free:
+	free(config_filename);
+	return ret;
+
+write_err_out:
+	goto out_free;
+
+}
+
+static int section_name_match (const char *buf, const char *name)
+{
+	int i = 0, j = 0, dot = 0;
+	for (; buf[i] && buf[i] != ']'; i++) {
+		if (!dot && isspace(buf[i])) {
+			dot = 1;
+			if (name[j++] != '.')
+				break;
+			for (i++; isspace(buf[i]); i++)
+				; /* do nothing */
+			if (buf[i] != '"')
+				break;
+			continue;
+		}
+		if (buf[i] == '\\' && dot)
+			i++;
+		else if (buf[i] == '"' && dot) {
+			for (i++; isspace(buf[i]); i++)
+				; /* do_nothing */
+			break;
+		}
+		if (buf[i] != name[j++])
+			break;
+	}
+	return (buf[i] == ']' && name[j] == 0);
+}
+
+/* if new_name == NULL, the section is removed instead */
+int perf_config_rename_section(const char *old_name, const char *new_name)
+{
+	int ret = 0, remove = 0;
+	char *config_filename;
+	int out_fd;
+	char buf[1024];
+
+	if (config_exclusive_filename)
+		config_filename = strdup(config_exclusive_filename);
+	else
+		config_filename = perf_pathdup("config");
+	if (out_fd < 0) {
+		ret = error("could not lock config file %s", config_filename);
+		goto out;
+	}
+
+	if (!(config_file = fopen(config_filename, "rb"))) {
+		/* no config file means nothing to rename, no error */
+		goto unlock_and_out;
+	}
+
+	while (fgets(buf, sizeof(buf), config_file)) {
+		int i;
+		int length;
+		for (i = 0; buf[i] && isspace(buf[i]); i++)
+			; /* do nothing */
+		if (buf[i] == '[') {
+			/* it's a section */
+			if (section_name_match (&buf[i+1], old_name)) {
+				ret++;
+				if (new_name == NULL) {
+					remove = 1;
+					continue;
+				}
+				store.baselen = strlen(new_name);
+				if (!store_write_section(out_fd, new_name)) {
+					goto out;
+				}
+				continue;
+			}
+			remove = 0;
+		}
+		if (remove)
+			continue;
+		length = strlen(buf);
+		if (write_in_full(out_fd, buf, length) != length) {
+			goto out;
+		}
+	}
+	fclose(config_file);
+ unlock_and_out:
+ out:
+	free(config_filename);
+	return ret;
+}
+
+/*
+ * Call this to report error for your variable that should not
+ * get a boolean value (i.e. "[my] var" means "true").
+ */
+int config_error_nonbool(const char *var)
+{
+	return error("Missing value for '%s'", var);
+}
diff --git a/Documentation/perf_counter/ctype.c b/Documentation/perf_counter/ctype.c
new file mode 100644
index 0000000000000..b90ec004f29c3
--- /dev/null
+++ b/Documentation/perf_counter/ctype.c
@@ -0,0 +1,26 @@
+/*
+ * Sane locale-independent, ASCII ctype.
+ *
+ * No surprises, and works with signed and unsigned chars.
+ */
+#include "cache.h"
+
+enum {
+	S = GIT_SPACE,
+	A = GIT_ALPHA,
+	D = GIT_DIGIT,
+	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
+	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */
+};
+
+unsigned char sane_ctype[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */
+	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */
+	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
+	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */
+	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
+	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */
+	/* Nothing in the 128.. range */
+};
diff --git a/Documentation/perf_counter/exec_cmd.c b/Documentation/perf_counter/exec_cmd.c
new file mode 100644
index 0000000000000..d392922631530
--- /dev/null
+++ b/Documentation/perf_counter/exec_cmd.c
@@ -0,0 +1,165 @@
+#include "cache.h"
+#include "exec_cmd.h"
+#include "quote.h"
+#define MAX_ARGS	32
+
+extern char **environ;
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+const char *system_path(const char *path)
+{
+#ifdef RUNTIME_PREFIX
+	static const char *prefix;
+#else
+	static const char *prefix = PREFIX;
+#endif
+	struct strbuf d = STRBUF_INIT;
+
+	if (is_absolute_path(path))
+		return path;
+
+#ifdef RUNTIME_PREFIX
+	assert(argv0_path);
+	assert(is_absolute_path(argv0_path));
+
+	if (!prefix &&
+	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
+	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
+	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
+		prefix = PREFIX;
+		fprintf(stderr, "RUNTIME_PREFIX requested, "
+				"but prefix computation failed.  "
+				"Using static fallback '%s'.\n", prefix);
+	}
+#endif
+
+	strbuf_addf(&d, "%s/%s", prefix, path);
+	path = strbuf_detach(&d, NULL);
+	return path;
+}
+
+const char *perf_extract_argv0_path(const char *argv0)
+{
+	const char *slash;
+
+	if (!argv0 || !*argv0)
+		return NULL;
+	slash = argv0 + strlen(argv0);
+
+	while (argv0 <= slash && !is_dir_sep(*slash))
+		slash--;
+
+	if (slash >= argv0) {
+		argv0_path = strndup(argv0, slash - argv0);
+		return slash + 1;
+	}
+
+	return argv0;
+}
+
+void perf_set_argv_exec_path(const char *exec_path)
+{
+	argv_exec_path = exec_path;
+	/*
+	 * Propagate this setting to external programs.
+	 */
+	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
+}
+
+
+/* Returns the highest-priority, location to look for perf programs. */
+const char *perf_exec_path(void)
+{
+	const char *env;
+
+	if (argv_exec_path)
+		return argv_exec_path;
+
+	env = getenv(EXEC_PATH_ENVIRONMENT);
+	if (env && *env) {
+		return env;
+	}
+
+	return system_path(PERF_EXEC_PATH);
+}
+
+static void add_path(struct strbuf *out, const char *path)
+{
+	if (path && *path) {
+		if (is_absolute_path(path))
+			strbuf_addstr(out, path);
+		else
+			strbuf_addstr(out, make_nonrelative_path(path));
+
+		strbuf_addch(out, PATH_SEP);
+	}
+}
+
+void setup_path(void)
+{
+	const char *old_path = getenv("PATH");
+	struct strbuf new_path = STRBUF_INIT;
+
+	add_path(&new_path, perf_exec_path());
+	add_path(&new_path, argv0_path);
+
+	if (old_path)
+		strbuf_addstr(&new_path, old_path);
+	else
+		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin");
+
+	setenv("PATH", new_path.buf, 1);
+
+	strbuf_release(&new_path);
+}
+
+const char **prepare_perf_cmd(const char **argv)
+{
+	int argc;
+	const char **nargv;
+
+	for (argc = 0; argv[argc]; argc++)
+		; /* just counting */
+	nargv = malloc(sizeof(*nargv) * (argc + 2));
+
+	nargv[0] = "perf";
+	for (argc = 0; argv[argc]; argc++)
+		nargv[argc + 1] = argv[argc];
+	nargv[argc + 1] = NULL;
+	return nargv;
+}
+
+int execv_perf_cmd(const char **argv) {
+	const char **nargv = prepare_perf_cmd(argv);
+
+	/* execvp() can only ever return if it fails */
+	execvp("perf", (char **)nargv);
+
+	free(nargv);
+	return -1;
+}
+
+
+int execl_perf_cmd(const char *cmd,...)
+{
+	int argc;
+	const char *argv[MAX_ARGS + 1];
+	const char *arg;
+	va_list param;
+
+	va_start(param, cmd);
+	argv[0] = cmd;
+	argc = 1;
+	while (argc < MAX_ARGS) {
+		arg = argv[argc++] = va_arg(param, char *);
+		if (!arg)
+			break;
+	}
+	va_end(param);
+	if (MAX_ARGS <= argc)
+		return error("too many args to run %s", cmd);
+
+	argv[argc] = NULL;
+	return execv_perf_cmd(argv);
+}
diff --git a/Documentation/perf_counter/exec_cmd.h b/Documentation/perf_counter/exec_cmd.h
new file mode 100644
index 0000000000000..effe25eb15456
--- /dev/null
+++ b/Documentation/perf_counter/exec_cmd.h
@@ -0,0 +1,13 @@
+#ifndef PERF_EXEC_CMD_H
+#define PERF_EXEC_CMD_H
+
+extern void perf_set_argv_exec_path(const char *exec_path);
+extern const char *perf_extract_argv0_path(const char *path);
+extern const char *perf_exec_path(void);
+extern void setup_path(void);
+extern const char **prepare_perf_cmd(const char **argv);
+extern int execv_perf_cmd(const char **argv); /* NULL terminated */
+extern int execl_perf_cmd(const char *cmd, ...);
+extern const char *system_path(const char *path);
+
+#endif /* PERF_EXEC_CMD_H */
diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh
new file mode 100755
index 0000000000000..75c68d948fd31
--- /dev/null
+++ b/Documentation/perf_counter/generate-cmdlist.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+
+echo "/* Automatically generated by $0 */
+struct cmdname_help
+{
+    char name[16];
+    char help[80];
+};
+
+static struct cmdname_help common_cmds[] = {"
+
+sed -n -e 's/^git-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
+sort |
+while read cmd
+do
+     sed -n '
+     /^NAME/,/git-'"$cmd"'/H
+     ${
+            x
+            s/.*git-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+	    p
+     }' "Documentation/git-$cmd.txt"
+done
+echo "};"
diff --git a/Documentation/perf_counter/help.c b/Documentation/perf_counter/help.c
new file mode 100644
index 0000000000000..ec0116721660b
--- /dev/null
+++ b/Documentation/perf_counter/help.c
@@ -0,0 +1,366 @@
+#include "cache.h"
+#include "builtin.h"
+#include "exec_cmd.h"
+#include "levenshtein.h"
+#include "help.h"
+
+/* most GUI terminals set COLUMNS (although some don't export it) */
+static int term_columns(void)
+{
+	char *col_string = getenv("COLUMNS");
+	int n_cols;
+
+	if (col_string && (n_cols = atoi(col_string)) > 0)
+		return n_cols;
+
+#ifdef TIOCGWINSZ
+	{
+		struct winsize ws;
+		if (!ioctl(1, TIOCGWINSZ, &ws)) {
+			if (ws.ws_col)
+				return ws.ws_col;
+		}
+	}
+#endif
+
+	return 80;
+}
+
+void add_cmdname(struct cmdnames *cmds, const char *name, int len)
+{
+	struct cmdname *ent = malloc(sizeof(*ent) + len + 1);
+
+	ent->len = len;
+	memcpy(ent->name, name, len);
+	ent->name[len] = 0;
+
+	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc);
+	cmds->names[cmds->cnt++] = ent;
+}
+
+static void clean_cmdnames(struct cmdnames *cmds)
+{
+	int i;
+	for (i = 0; i < cmds->cnt; ++i)
+		free(cmds->names[i]);
+	free(cmds->names);
+	cmds->cnt = 0;
+	cmds->alloc = 0;
+}
+
+static int cmdname_compare(const void *a_, const void *b_)
+{
+	struct cmdname *a = *(struct cmdname **)a_;
+	struct cmdname *b = *(struct cmdname **)b_;
+	return strcmp(a->name, b->name);
+}
+
+static void uniq(struct cmdnames *cmds)
+{
+	int i, j;
+
+	if (!cmds->cnt)
+		return;
+
+	for (i = j = 1; i < cmds->cnt; i++)
+		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name))
+			cmds->names[j++] = cmds->names[i];
+
+	cmds->cnt = j;
+}
+
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes)
+{
+	int ci, cj, ei;
+	int cmp;
+
+	ci = cj = ei = 0;
+	while (ci < cmds->cnt && ei < excludes->cnt) {
+		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name);
+		if (cmp < 0)
+			cmds->names[cj++] = cmds->names[ci++];
+		else if (cmp == 0)
+			ci++, ei++;
+		else if (cmp > 0)
+			ei++;
+	}
+
+	while (ci < cmds->cnt)
+		cmds->names[cj++] = cmds->names[ci++];
+
+	cmds->cnt = cj;
+}
+
+static void pretty_print_string_list(struct cmdnames *cmds, int longest)
+{
+	int cols = 1, rows;
+	int space = longest + 1; /* min 1 SP between words */
+	int max_cols = term_columns() - 1; /* don't print *on* the edge */
+	int i, j;
+
+	if (space < max_cols)
+		cols = max_cols / space;
+	rows = (cmds->cnt + cols - 1) / cols;
+
+	for (i = 0; i < rows; i++) {
+		printf("  ");
+
+		for (j = 0; j < cols; j++) {
+			int n = j * rows + i;
+			int size = space;
+			if (n >= cmds->cnt)
+				break;
+			if (j == cols-1 || n + rows >= cmds->cnt)
+				size = 1;
+			printf("%-*s", size, cmds->names[n]->name);
+		}
+		putchar('\n');
+	}
+}
+
+static int is_executable(const char *name)
+{
+	struct stat st;
+
+	if (stat(name, &st) || /* stat, not lstat */
+	    !S_ISREG(st.st_mode))
+		return 0;
+
+#ifdef __MINGW32__
+	/* cannot trust the executable bit, peek into the file instead */
+	char buf[3] = { 0 };
+	int n;
+	int fd = open(name, O_RDONLY);
+	st.st_mode &= ~S_IXUSR;
+	if (fd >= 0) {
+		n = read(fd, buf, 2);
+		if (n == 2)
+			/* DOS executables start with "MZ" */
+			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ"))
+				st.st_mode |= S_IXUSR;
+		close(fd);
+	}
+#endif
+	return st.st_mode & S_IXUSR;
+}
+
+static void list_commands_in_dir(struct cmdnames *cmds,
+					 const char *path,
+					 const char *prefix)
+{
+	int prefix_len;
+	DIR *dir = opendir(path);
+	struct dirent *de;
+	struct strbuf buf = STRBUF_INIT;
+	int len;
+
+	if (!dir)
+		return;
+	if (!prefix)
+		prefix = "perf-";
+	prefix_len = strlen(prefix);
+
+	strbuf_addf(&buf, "%s/", path);
+	len = buf.len;
+
+	while ((de = readdir(dir)) != NULL) {
+		int entlen;
+
+		if (prefixcmp(de->d_name, prefix))
+			continue;
+
+		strbuf_setlen(&buf, len);
+		strbuf_addstr(&buf, de->d_name);
+		if (!is_executable(buf.buf))
+			continue;
+
+		entlen = strlen(de->d_name) - prefix_len;
+		if (has_extension(de->d_name, ".exe"))
+			entlen -= 4;
+
+		add_cmdname(cmds, de->d_name + prefix_len, entlen);
+	}
+	closedir(dir);
+	strbuf_release(&buf);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds)
+{
+	const char *env_path = getenv("PATH");
+	const char *exec_path = perf_exec_path();
+
+	if (exec_path) {
+		list_commands_in_dir(main_cmds, exec_path, prefix);
+		qsort(main_cmds->names, main_cmds->cnt,
+		      sizeof(*main_cmds->names), cmdname_compare);
+		uniq(main_cmds);
+	}
+
+	if (env_path) {
+		char *paths, *path, *colon;
+		path = paths = strdup(env_path);
+		while (1) {
+			if ((colon = strchr(path, PATH_SEP)))
+				*colon = 0;
+			if (!exec_path || strcmp(path, exec_path))
+				list_commands_in_dir(other_cmds, path, prefix);
+
+			if (!colon)
+				break;
+			path = colon + 1;
+		}
+		free(paths);
+
+		qsort(other_cmds->names, other_cmds->cnt,
+		      sizeof(*other_cmds->names), cmdname_compare);
+		uniq(other_cmds);
+	}
+	exclude_cmds(other_cmds, main_cmds);
+}
+
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds)
+{
+	int i, longest = 0;
+
+	for (i = 0; i < main_cmds->cnt; i++)
+		if (longest < main_cmds->names[i]->len)
+			longest = main_cmds->names[i]->len;
+	for (i = 0; i < other_cmds->cnt; i++)
+		if (longest < other_cmds->names[i]->len)
+			longest = other_cmds->names[i]->len;
+
+	if (main_cmds->cnt) {
+		const char *exec_path = perf_exec_path();
+		printf("available %s in '%s'\n", title, exec_path);
+		printf("----------------");
+		mput_char('-', strlen(title) + strlen(exec_path));
+		putchar('\n');
+		pretty_print_string_list(main_cmds, longest);
+		putchar('\n');
+	}
+
+	if (other_cmds->cnt) {
+		printf("%s available from elsewhere on your $PATH\n", title);
+		printf("---------------------------------------");
+		mput_char('-', strlen(title));
+		putchar('\n');
+		pretty_print_string_list(other_cmds, longest);
+		putchar('\n');
+	}
+}
+
+int is_in_cmdlist(struct cmdnames *c, const char *s)
+{
+	int i;
+	for (i = 0; i < c->cnt; i++)
+		if (!strcmp(s, c->names[i]->name))
+			return 1;
+	return 0;
+}
+
+static int autocorrect;
+static struct cmdnames aliases;
+
+static int perf_unknown_cmd_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "help.autocorrect"))
+		autocorrect = perf_config_int(var,value);
+	/* Also use aliases for command lookup */
+	if (!prefixcmp(var, "alias."))
+		add_cmdname(&aliases, var + 6, strlen(var + 6));
+
+	return perf_default_config(var, value, cb);
+}
+
+static int levenshtein_compare(const void *p1, const void *p2)
+{
+	const struct cmdname *const *c1 = p1, *const *c2 = p2;
+	const char *s1 = (*c1)->name, *s2 = (*c2)->name;
+	int l1 = (*c1)->len;
+	int l2 = (*c2)->len;
+	return l1 != l2 ? l1 - l2 : strcmp(s1, s2);
+}
+
+static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+{
+	int i;
+	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
+
+	for (i = 0; i < old->cnt; i++)
+		cmds->names[cmds->cnt++] = old->names[i];
+	free(old->names);
+	old->cnt = 0;
+	old->names = NULL;
+}
+
+const char *help_unknown_cmd(const char *cmd)
+{
+	int i, n, best_similarity = 0;
+	struct cmdnames main_cmds, other_cmds;
+
+	memset(&main_cmds, 0, sizeof(main_cmds));
+	memset(&other_cmds, 0, sizeof(main_cmds));
+	memset(&aliases, 0, sizeof(aliases));
+
+	perf_config(perf_unknown_cmd_config, NULL);
+
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	add_cmd_list(&main_cmds, &aliases);
+	add_cmd_list(&main_cmds, &other_cmds);
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(main_cmds.names), cmdname_compare);
+	uniq(&main_cmds);
+
+	/* This reuses cmdname->len for similarity index */
+	for (i = 0; i < main_cmds.cnt; ++i)
+		main_cmds.names[i]->len =
+			levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(*main_cmds.names), levenshtein_compare);
+
+	if (!main_cmds.cnt)
+		die ("Uh oh. Your system reports no Git commands at all.");
+
+	best_similarity = main_cmds.names[0]->len;
+	n = 1;
+	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+		++n;
+	if (autocorrect && n == 1) {
+		const char *assumed = main_cmds.names[0]->name;
+		main_cmds.names[0] = NULL;
+		clean_cmdnames(&main_cmds);
+		fprintf(stderr, "WARNING: You called a Git program named '%s', "
+			"which does not exist.\n"
+			"Continuing under the assumption that you meant '%s'\n",
+			cmd, assumed);
+		if (autocorrect > 0) {
+			fprintf(stderr, "in %0.1f seconds automatically...\n",
+				(float)autocorrect/10.0);
+			poll(NULL, 0, autocorrect * 100);
+		}
+		return assumed;
+	}
+
+	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
+
+	if (best_similarity < 6) {
+		fprintf(stderr, "\nDid you mean %s?\n",
+			n < 2 ? "this": "one of these");
+
+		for (i = 0; i < n; i++)
+			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
+	}
+
+	exit(1);
+}
+
+int cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("perf version %s\n", perf_version_string);
+	return 0;
+}
diff --git a/Documentation/perf_counter/help.h b/Documentation/perf_counter/help.h
new file mode 100644
index 0000000000000..56bc15406ffc5
--- /dev/null
+++ b/Documentation/perf_counter/help.h
@@ -0,0 +1,29 @@
+#ifndef HELP_H
+#define HELP_H
+
+struct cmdnames {
+	int alloc;
+	int cnt;
+	struct cmdname {
+		size_t len; /* also used for similarity index in help.c */
+		char name[FLEX_ARRAY];
+	} **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+	while(num--)
+		putchar(c);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, int len);
+/* Here we require that excludes is a sorted list. */
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds);
+
+#endif /* HELP_H */
diff --git a/Documentation/perf_counter/levenshtein.c b/Documentation/perf_counter/levenshtein.c
new file mode 100644
index 0000000000000..e521d1516df6e
--- /dev/null
+++ b/Documentation/perf_counter/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings.  To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substition, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
+ */
+int levenshtein(const char *string1, const char *string2,
+		int w, int s, int a, int d)
+{
+	int len1 = strlen(string1), len2 = strlen(string2);
+	int *row0 = malloc(sizeof(int) * (len2 + 1));
+	int *row1 = malloc(sizeof(int) * (len2 + 1));
+	int *row2 = malloc(sizeof(int) * (len2 + 1));
+	int i, j;
+
+	for (j = 0; j <= len2; j++)
+		row1[j] = j * a;
+	for (i = 0; i < len1; i++) {
+		int *dummy;
+
+		row2[0] = (i + 1) * d;
+		for (j = 0; j < len2; j++) {
+			/* substitution */
+			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+			/* swap */
+			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+					string1[i] == string2[j - 1] &&
+					row2[j + 1] > row0[j - 1] + w)
+				row2[j + 1] = row0[j - 1] + w;
+			/* deletion */
+			if (row2[j + 1] > row1[j + 1] + d)
+				row2[j + 1] = row1[j + 1] + d;
+			/* insertion */
+			if (row2[j + 1] > row2[j] + a)
+				row2[j + 1] = row2[j] + a;
+		}
+
+		dummy = row0;
+		row0 = row1;
+		row1 = row2;
+		row2 = dummy;
+	}
+
+	i = row1[len2];
+	free(row0);
+	free(row1);
+	free(row2);
+
+	return i;
+}
diff --git a/Documentation/perf_counter/levenshtein.h b/Documentation/perf_counter/levenshtein.h
new file mode 100644
index 0000000000000..0173abeef52c8
--- /dev/null
+++ b/Documentation/perf_counter/levenshtein.h
@@ -0,0 +1,8 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2,
+	int swap_penalty, int substition_penalty,
+	int insertion_penalty, int deletion_penalty);
+
+#endif
diff --git a/Documentation/perf_counter/parse-options.c b/Documentation/perf_counter/parse-options.c
new file mode 100644
index 0000000000000..7464f34e5407e
--- /dev/null
+++ b/Documentation/perf_counter/parse-options.c
@@ -0,0 +1,495 @@
+#include "util.h"
+#include "parse-options.h"
+#include "cache.h"
+
+#define OPT_SHORT 1
+#define OPT_UNSET 2
+
+static int opterror(const struct option *opt, const char *reason, int flags)
+{
+	if (flags & OPT_SHORT)
+		return error("switch `%c' %s", opt->short_name, reason);
+	if (flags & OPT_UNSET)
+		return error("option `no-%s' %s", opt->long_name, reason);
+	return error("option `%s' %s", opt->long_name, reason);
+}
+
+static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
+		   int flags, const char **arg)
+{
+	if (p->opt) {
+		*arg = p->opt;
+		p->opt = NULL;
+	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) {
+		*arg = (const char *)opt->defval;
+	} else if (p->argc > 1) {
+		p->argc--;
+		*arg = *++p->argv;
+	} else
+		return opterror(opt, "requires a value", flags);
+	return 0;
+}
+
+static int get_value(struct parse_opt_ctx_t *p,
+		     const struct option *opt, int flags)
+{
+	const char *s, *arg;
+	const int unset = flags & OPT_UNSET;
+
+	if (unset && p->opt)
+		return opterror(opt, "takes no value", flags);
+	if (unset && (opt->flags & PARSE_OPT_NONEG))
+		return opterror(opt, "isn't available", flags);
+
+	if (!(flags & OPT_SHORT) && p->opt) {
+		switch (opt->type) {
+		case OPTION_CALLBACK:
+			if (!(opt->flags & PARSE_OPT_NOARG))
+				break;
+			/* FALLTHROUGH */
+		case OPTION_BOOLEAN:
+		case OPTION_BIT:
+		case OPTION_SET_INT:
+		case OPTION_SET_PTR:
+			return opterror(opt, "takes no value", flags);
+		default:
+			break;
+		}
+	}
+
+	switch (opt->type) {
+	case OPTION_BIT:
+		if (unset)
+			*(int *)opt->value &= ~opt->defval;
+		else
+			*(int *)opt->value |= opt->defval;
+		return 0;
+
+	case OPTION_BOOLEAN:
+		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1;
+		return 0;
+
+	case OPTION_SET_INT:
+		*(int *)opt->value = unset ? 0 : opt->defval;
+		return 0;
+
+	case OPTION_SET_PTR:
+		*(void **)opt->value = unset ? NULL : (void *)opt->defval;
+		return 0;
+
+	case OPTION_STRING:
+		if (unset)
+			*(const char **)opt->value = NULL;
+		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			*(const char **)opt->value = (const char *)opt->defval;
+		else
+			return get_arg(p, opt, flags, (const char **)opt->value);
+		return 0;
+
+	case OPTION_CALLBACK:
+		if (unset)
+			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_NOARG)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt)
+			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0;
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		return (*opt->callback)(opt, arg, 0) ? (-1) : 0;
+
+	case OPTION_INTEGER:
+		if (unset) {
+			*(int *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(int *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(int *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
+	default:
+		die("should not happen, someone must be hit on the forehead");
+	}
+}
+
+static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options)
+{
+	for (; options->type != OPTION_END; options++) {
+		if (options->short_name == *p->opt) {
+			p->opt = p->opt[1] ? p->opt + 1 : NULL;
+			return get_value(p, options, OPT_SHORT);
+		}
+	}
+	return -2;
+}
+
+static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg,
+                          const struct option *options)
+{
+	const char *arg_end = strchr(arg, '=');
+	const struct option *abbrev_option = NULL, *ambiguous_option = NULL;
+	int abbrev_flags = 0, ambiguous_flags = 0;
+
+	if (!arg_end)
+		arg_end = arg + strlen(arg);
+
+	for (; options->type != OPTION_END; options++) {
+		const char *rest;
+		int flags = 0;
+
+		if (!options->long_name)
+			continue;
+
+		rest = skip_prefix(arg, options->long_name);
+		if (options->type == OPTION_ARGUMENT) {
+			if (!rest)
+				continue;
+			if (*rest == '=')
+				return opterror(options, "takes no value", flags);
+			if (*rest)
+				continue;
+			p->out[p->cpidx++] = arg - 2;
+			return 0;
+		}
+		if (!rest) {
+			/* abbreviated? */
+			if (!strncmp(options->long_name, arg, arg_end - arg)) {
+is_abbreviated:
+				if (abbrev_option) {
+					/*
+					 * If this is abbreviated, it is
+					 * ambiguous. So when there is no
+					 * exact match later, we need to
+					 * error out.
+					 */
+					ambiguous_option = abbrev_option;
+					ambiguous_flags = abbrev_flags;
+				}
+				if (!(flags & OPT_UNSET) && *arg_end)
+					p->opt = arg_end + 1;
+				abbrev_option = options;
+				abbrev_flags = flags;
+				continue;
+			}
+			/* negated and abbreviated very much? */
+			if (!prefixcmp("no-", arg)) {
+				flags |= OPT_UNSET;
+				goto is_abbreviated;
+			}
+			/* negated? */
+			if (strncmp(arg, "no-", 3))
+				continue;
+			flags |= OPT_UNSET;
+			rest = skip_prefix(arg + 3, options->long_name);
+			/* abbreviated and negated? */
+			if (!rest && !prefixcmp(options->long_name, arg + 3))
+				goto is_abbreviated;
+			if (!rest)
+				continue;
+		}
+		if (*rest) {
+			if (*rest != '=')
+				continue;
+			p->opt = rest + 1;
+		}
+		return get_value(p, options, flags);
+	}
+
+	if (ambiguous_option)
+		return error("Ambiguous option: %s "
+			"(could be --%s%s or --%s%s)",
+			arg,
+			(ambiguous_flags & OPT_UNSET) ?  "no-" : "",
+			ambiguous_option->long_name,
+			(abbrev_flags & OPT_UNSET) ?  "no-" : "",
+			abbrev_option->long_name);
+	if (abbrev_option)
+		return get_value(p, abbrev_option, abbrev_flags);
+	return -2;
+}
+
+static void check_typos(const char *arg, const struct option *options)
+{
+	if (strlen(arg) < 3)
+		return;
+
+	if (!prefixcmp(arg, "no-")) {
+		error ("did you mean `--%s` (with two dashes ?)", arg);
+		exit(129);
+	}
+
+	for (; options->type != OPTION_END; options++) {
+		if (!options->long_name)
+			continue;
+		if (!prefixcmp(options->long_name, arg)) {
+			error ("did you mean `--%s` (with two dashes ?)", arg);
+			exit(129);
+		}
+	}
+}
+
+void parse_options_start(struct parse_opt_ctx_t *ctx,
+			 int argc, const char **argv, int flags)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	ctx->argc = argc - 1;
+	ctx->argv = argv + 1;
+	ctx->out  = argv;
+	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0);
+	ctx->flags = flags;
+	if ((flags & PARSE_OPT_KEEP_UNKNOWN) &&
+	    (flags & PARSE_OPT_STOP_AT_NON_OPTION))
+		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together");
+}
+
+static int usage_with_options_internal(const char * const *,
+				       const struct option *, int);
+
+int parse_options_step(struct parse_opt_ctx_t *ctx,
+		       const struct option *options,
+		       const char * const usagestr[])
+{
+	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP);
+
+	/* we must reset ->opt, unknown short option leave it dangling */
+	ctx->opt = NULL;
+
+	for (; ctx->argc; ctx->argc--, ctx->argv++) {
+		const char *arg = ctx->argv[0];
+
+		if (*arg != '-' || !arg[1]) {
+			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION)
+				break;
+			ctx->out[ctx->cpidx++] = ctx->argv[0];
+			continue;
+		}
+
+		if (arg[1] != '-') {
+			ctx->opt = arg + 1;
+			if (internal_help && *ctx->opt == 'h')
+				return parse_options_usage(usagestr, options);
+			switch (parse_short_opt(ctx, options)) {
+			case -1:
+				return parse_options_usage(usagestr, options);
+			case -2:
+				goto unknown;
+			}
+			if (ctx->opt)
+				check_typos(arg + 1, options);
+			while (ctx->opt) {
+				if (internal_help && *ctx->opt == 'h')
+					return parse_options_usage(usagestr, options);
+				switch (parse_short_opt(ctx, options)) {
+				case -1:
+					return parse_options_usage(usagestr, options);
+				case -2:
+					/* fake a short option thing to hide the fact that we may have
+					 * started to parse aggregated stuff
+					 *
+					 * This is leaky, too bad.
+					 */
+					ctx->argv[0] = strdup(ctx->opt - 1);
+					*(char *)ctx->argv[0] = '-';
+					goto unknown;
+				}
+			}
+			continue;
+		}
+
+		if (!arg[2]) { /* "--" */
+			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) {
+				ctx->argc--;
+				ctx->argv++;
+			}
+			break;
+		}
+
+		if (internal_help && !strcmp(arg + 2, "help-all"))
+			return usage_with_options_internal(usagestr, options, 1);
+		if (internal_help && !strcmp(arg + 2, "help"))
+			return parse_options_usage(usagestr, options);
+		switch (parse_long_opt(ctx, arg + 2, options)) {
+		case -1:
+			return parse_options_usage(usagestr, options);
+		case -2:
+			goto unknown;
+		}
+		continue;
+unknown:
+		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN))
+			return PARSE_OPT_UNKNOWN;
+		ctx->out[ctx->cpidx++] = ctx->argv[0];
+		ctx->opt = NULL;
+	}
+	return PARSE_OPT_DONE;
+}
+
+int parse_options_end(struct parse_opt_ctx_t *ctx)
+{
+	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out));
+	ctx->out[ctx->cpidx + ctx->argc] = NULL;
+	return ctx->cpidx + ctx->argc;
+}
+
+int parse_options(int argc, const char **argv, const struct option *options,
+		  const char * const usagestr[], int flags)
+{
+	struct parse_opt_ctx_t ctx;
+
+	parse_options_start(&ctx, argc, argv, flags);
+	switch (parse_options_step(&ctx, options, usagestr)) {
+	case PARSE_OPT_HELP:
+		exit(129);
+	case PARSE_OPT_DONE:
+		break;
+	default: /* PARSE_OPT_UNKNOWN */
+		if (ctx.argv[0][1] == '-') {
+			error("unknown option `%s'", ctx.argv[0] + 2);
+		} else {
+			error("unknown switch `%c'", *ctx.opt);
+		}
+		usage_with_options(usagestr, options);
+	}
+
+	return parse_options_end(&ctx);
+}
+
+#define USAGE_OPTS_WIDTH 24
+#define USAGE_GAP         2
+
+int usage_with_options_internal(const char * const *usagestr,
+				const struct option *opts, int full)
+{
+	if (!usagestr)
+		return PARSE_OPT_HELP;
+
+	fprintf(stderr, "usage: %s\n", *usagestr++);
+	while (*usagestr && **usagestr)
+		fprintf(stderr, "   or: %s\n", *usagestr++);
+	while (*usagestr) {
+		fprintf(stderr, "%s%s\n",
+				**usagestr ? "    " : "",
+				*usagestr);
+		usagestr++;
+	}
+
+	if (opts->type != OPTION_GROUP)
+		fputc('\n', stderr);
+
+	for (; opts->type != OPTION_END; opts++) {
+		size_t pos;
+		int pad;
+
+		if (opts->type == OPTION_GROUP) {
+			fputc('\n', stderr);
+			if (*opts->help)
+				fprintf(stderr, "%s\n", opts->help);
+			continue;
+		}
+		if (!full && (opts->flags & PARSE_OPT_HIDDEN))
+			continue;
+
+		pos = fprintf(stderr, "    ");
+		if (opts->short_name)
+			pos += fprintf(stderr, "-%c", opts->short_name);
+		if (opts->long_name && opts->short_name)
+			pos += fprintf(stderr, ", ");
+		if (opts->long_name)
+			pos += fprintf(stderr, "--%s", opts->long_name);
+
+		switch (opts->type) {
+		case OPTION_ARGUMENT:
+			break;
+		case OPTION_INTEGER:
+			if (opts->flags & PARSE_OPT_OPTARG)
+				if (opts->long_name)
+					pos += fprintf(stderr, "[=<n>]");
+				else
+					pos += fprintf(stderr, "[<n>]");
+			else
+				pos += fprintf(stderr, " <n>");
+			break;
+		case OPTION_CALLBACK:
+			if (opts->flags & PARSE_OPT_NOARG)
+				break;
+			/* FALLTHROUGH */
+		case OPTION_STRING:
+			if (opts->argh) {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=<%s>]", opts->argh);
+					else
+						pos += fprintf(stderr, "[<%s>]", opts->argh);
+				else
+					pos += fprintf(stderr, " <%s>", opts->argh);
+			} else {
+				if (opts->flags & PARSE_OPT_OPTARG)
+					if (opts->long_name)
+						pos += fprintf(stderr, "[=...]");
+					else
+						pos += fprintf(stderr, "[...]");
+				else
+					pos += fprintf(stderr, " ...");
+			}
+			break;
+		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */
+			break;
+		}
+
+		if (pos <= USAGE_OPTS_WIDTH)
+			pad = USAGE_OPTS_WIDTH - pos;
+		else {
+			fputc('\n', stderr);
+			pad = USAGE_OPTS_WIDTH;
+		}
+		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help);
+	}
+	fputc('\n', stderr);
+
+	return PARSE_OPT_HELP;
+}
+
+void usage_with_options(const char * const *usagestr,
+			const struct option *opts)
+{
+	usage_with_options_internal(usagestr, opts, 0);
+	exit(129);
+}
+
+int parse_options_usage(const char * const *usagestr,
+			const struct option *opts)
+{
+	return usage_with_options_internal(usagestr, opts, 0);
+}
+
+
+/*----- some often used options -----*/
+#include "cache.h"
+
+int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
+			   int unset)
+{
+	int *target = opt->value;
+
+	if (unset)
+		/* --no-quiet, --no-verbose */
+		*target = 0;
+	else if (opt->short_name == 'v') {
+		if (*target >= 0)
+			(*target)++;
+		else
+			*target = 1;
+	} else {
+		if (*target <= 0)
+			(*target)--;
+		else
+			*target = -1;
+	}
+	return 0;
+}
diff --git a/Documentation/perf_counter/parse-options.h b/Documentation/perf_counter/parse-options.h
new file mode 100644
index 0000000000000..a81c7faff68e0
--- /dev/null
+++ b/Documentation/perf_counter/parse-options.h
@@ -0,0 +1,172 @@
+#ifndef PARSE_OPTIONS_H
+#define PARSE_OPTIONS_H
+
+enum parse_opt_type {
+	/* special types */
+	OPTION_END,
+	OPTION_ARGUMENT,
+	OPTION_GROUP,
+	/* options with no arguments */
+	OPTION_BIT,
+	OPTION_BOOLEAN, /* _INCR would have been a better name */
+	OPTION_SET_INT,
+	OPTION_SET_PTR,
+	/* options with arguments (usually) */
+	OPTION_STRING,
+	OPTION_INTEGER,
+	OPTION_CALLBACK,
+};
+
+enum parse_opt_flags {
+	PARSE_OPT_KEEP_DASHDASH = 1,
+	PARSE_OPT_STOP_AT_NON_OPTION = 2,
+	PARSE_OPT_KEEP_ARGV0 = 4,
+	PARSE_OPT_KEEP_UNKNOWN = 8,
+	PARSE_OPT_NO_INTERNAL_HELP = 16,
+};
+
+enum parse_opt_option_flags {
+	PARSE_OPT_OPTARG  = 1,
+	PARSE_OPT_NOARG   = 2,
+	PARSE_OPT_NONEG   = 4,
+	PARSE_OPT_HIDDEN  = 8,
+	PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogenous across the repository.
+ *
+ * `help`::
+ *   the short help associated to what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in
+ *                    the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it like they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define OPT_END()                   { OPTION_END }
+#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
+#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
+#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
+#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
+#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
+#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
+#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
+#define OPT_DATE(s, l, v, h) \
+	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
+	  parse_opt_approxidate_cb }
+#define OPT_CALLBACK(s, l, v, a, h, f) \
+	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option argments in argv[].
+ * Returns the number of arguments left in argv[].
+ */
+extern int parse_options(int argc, const char **argv,
+                         const struct option *options,
+                         const char * const usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+                                        const struct option *options);
+
+/*----- incremantal advanced APIs -----*/
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
+ */
+struct parse_opt_ctx_t {
+	const char **argv;
+	const char **out;
+	int argc, cpidx;
+	const char *opt;
+	int flags;
+};
+
+extern int parse_options_usage(const char * const *usagestr,
+			       const struct option *opts);
+
+extern void parse_options_start(struct parse_opt_ctx_t *ctx,
+				int argc, const char **argv, int flags);
+
+extern int parse_options_step(struct parse_opt_ctx_t *ctx,
+			      const struct option *options,
+			      const char * const usagestr[]);
+
+extern int parse_options_end(struct parse_opt_ctx_t *ctx);
+
+
+/*----- some often used options -----*/
+extern int parse_opt_abbrev_cb(const struct option *, const char *, int);
+extern int parse_opt_approxidate_cb(const struct option *, const char *, int);
+extern int parse_opt_verbosity_cb(const struct option *, const char *, int);
+
+#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose")
+#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet")
+#define OPT__VERBOSITY(var) \
+	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \
+	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \
+	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }
+#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run")
+#define OPT__ABBREV(var)  \
+	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \
+	  "use <n> digits to display SHA-1s", \
+	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 }
+
+extern const char *parse_options_fix_filename(const char *prefix, const char *file);
+
+#endif
diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c
new file mode 100644
index 0000000000000..891b612ec1a9d
--- /dev/null
+++ b/Documentation/perf_counter/path.c
@@ -0,0 +1,392 @@
+/*
+ * I'm tired of doing "vsnprintf()" etc just to open a
+ * file, so here's a "return static buffer with printf"
+ * interface for paths.
+ *
+ * It's obviously not thread-safe. Sue me. But it's quite
+ * useful for doing things like
+ *
+ *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY);
+ *
+ * which is what it's designed for.
+ */
+#include "cache.h"
+
+static char bad_path[] = "/bad-path/";
+/*
+ * Two hacks:
+ */
+
+static char *get_perf_dir(void)
+{
+	return ".";
+}
+
+size_t strlcpy(char *dest, const char *src, size_t size)
+{
+	size_t ret = strlen(src);
+
+	if (size) {
+		size_t len = (ret >= size) ? size - 1 : ret;
+		memcpy(dest, src, len);
+		dest[len] = '\0';
+	}
+	return ret;
+}
+
+
+static char *get_pathname(void)
+{
+	static char pathname_array[4][PATH_MAX];
+	static int index;
+	return pathname_array[3 & ++index];
+}
+
+static char *cleanup_path(char *path)
+{
+	/* Clean it up */
+	if (!memcmp(path, "./", 2)) {
+		path += 2;
+		while (*path == '/')
+			path++;
+	}
+	return path;
+}
+
+char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+
+	va_start(args, fmt);
+	len = vsnprintf(buf, n, fmt, args);
+	va_end(args);
+	if (len >= n) {
+		strlcpy(buf, bad_path, n);
+		return buf;
+	}
+	return cleanup_path(buf);
+}
+
+static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args)
+{
+	const char *perf_dir = get_perf_dir();
+	size_t len;
+
+	len = strlen(perf_dir);
+	if (n < len + 1)
+		goto bad;
+	memcpy(buf, perf_dir, len);
+	if (len && !is_dir_sep(perf_dir[len-1]))
+		buf[len++] = '/';
+	len += vsnprintf(buf + len, n - len, fmt, args);
+	if (len >= n)
+		goto bad;
+	return cleanup_path(buf);
+bad:
+	strlcpy(buf, bad_path, n);
+	return buf;
+}
+
+char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+{
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(buf, n, fmt, args);
+	va_end(args);
+	return buf;
+}
+
+char *perf_pathdup(const char *fmt, ...)
+{
+	char path[PATH_MAX];
+	va_list args;
+	va_start(args, fmt);
+	(void)perf_vsnpath(path, sizeof(path), fmt, args);
+	va_end(args);
+	return xstrdup(path);
+}
+
+char *mkpath(const char *fmt, ...)
+{
+	va_list args;
+	unsigned len;
+	char *pathname = get_pathname();
+
+	va_start(args, fmt);
+	len = vsnprintf(pathname, PATH_MAX, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+char *perf_path(const char *fmt, ...)
+{
+	const char *perf_dir = get_perf_dir();
+	char *pathname = get_pathname();
+	va_list args;
+	unsigned len;
+
+	len = strlen(perf_dir);
+	if (len > PATH_MAX-100)
+		return bad_path;
+	memcpy(pathname, perf_dir, len);
+	if (len && perf_dir[len-1] != '/')
+		pathname[len++] = '/';
+	va_start(args, fmt);
+	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args);
+	va_end(args);
+	if (len >= PATH_MAX)
+		return bad_path;
+	return cleanup_path(pathname);
+}
+
+
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+int perf_mkstemp(char *path, size_t len, const char *template)
+{
+	const char *tmp;
+	size_t n;
+
+	tmp = getenv("TMPDIR");
+	if (!tmp)
+		tmp = "/tmp";
+	n = snprintf(path, len, "%s/%s", tmp, template);
+	if (len <= n) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+	return mkstemp(path);
+}
+
+
+static char *user_path(char *buf, char *path, int sz)
+{
+	struct passwd *pw;
+	char *slash;
+	int len, baselen;
+
+	if (!path || path[0] != '~')
+		return NULL;
+	path++;
+	slash = strchr(path, '/');
+	if (path[0] == '/' || !path[0]) {
+		pw = getpwuid(getuid());
+	}
+	else {
+		if (slash) {
+			*slash = 0;
+			pw = getpwnam(path);
+			*slash = '/';
+		}
+		else
+			pw = getpwnam(path);
+	}
+	if (!pw || !pw->pw_dir || sz <= strlen(pw->pw_dir))
+		return NULL;
+	baselen = strlen(pw->pw_dir);
+	memcpy(buf, pw->pw_dir, baselen);
+	while ((1 < baselen) && (buf[baselen-1] == '/')) {
+		buf[baselen-1] = 0;
+		baselen--;
+	}
+	if (slash && slash[1]) {
+		len = strlen(slash);
+		if (sz <= baselen + len)
+			return NULL;
+		memcpy(buf + baselen, slash, len + 1);
+	}
+	return buf;
+}
+
+const char *make_relative_path(const char *abs, const char *base)
+{
+	static char buf[PATH_MAX + 1];
+	int baselen;
+	if (!base)
+		return abs;
+	baselen = strlen(base);
+	if (prefixcmp(abs, base))
+		return abs;
+	if (abs[baselen] == '/')
+		baselen++;
+	else if (base[baselen - 1] != '/')
+		return abs;
+	strcpy(buf, abs + baselen);
+	return buf;
+}
+
+/*
+ * It is okay if dst == src, but they should not overlap otherwise.
+ *
+ * Performs the following normalizations on src, storing the result in dst:
+ * - Ensures that components are separated by '/' (Windows only)
+ * - Squashes sequences of '/'.
+ * - Removes "." components.
+ * - Removes ".." components, and the components the precede them.
+ * Returns failure (non-zero) if a ".." component appears as first path
+ * component anytime during the normalization. Otherwise, returns success (0).
+ *
+ * Note that this function is purely textual.  It does not follow symlinks,
+ * verify the existence of the path, or make any system calls.
+ */
+int normalize_path_copy(char *dst, const char *src)
+{
+	char *dst0;
+
+	if (has_dos_drive_prefix(src)) {
+		*dst++ = *src++;
+		*dst++ = *src++;
+	}
+	dst0 = dst;
+
+	if (is_dir_sep(*src)) {
+		*dst++ = '/';
+		while (is_dir_sep(*src))
+			src++;
+	}
+
+	for (;;) {
+		char c = *src;
+
+		/*
+		 * A path component that begins with . could be
+		 * special:
+		 * (1) "." and ends   -- ignore and terminate.
+		 * (2) "./"           -- ignore them, eat slash and continue.
+		 * (3) ".." and ends  -- strip one and terminate.
+		 * (4) "../"          -- strip one, eat slash and continue.
+		 */
+		if (c == '.') {
+			if (!src[1]) {
+				/* (1) */
+				src++;
+			} else if (is_dir_sep(src[1])) {
+				/* (2) */
+				src += 2;
+				while (is_dir_sep(*src))
+					src++;
+				continue;
+			} else if (src[1] == '.') {
+				if (!src[2]) {
+					/* (3) */
+					src += 2;
+					goto up_one;
+				} else if (is_dir_sep(src[2])) {
+					/* (4) */
+					src += 3;
+					while (is_dir_sep(*src))
+						src++;
+					goto up_one;
+				}
+			}
+		}
+
+		/* copy up to the next '/', and eat all '/' */
+		while ((c = *src++) != '\0' && !is_dir_sep(c))
+			*dst++ = c;
+		if (is_dir_sep(c)) {
+			*dst++ = '/';
+			while (is_dir_sep(c))
+				c = *src++;
+			src--;
+		} else if (!c)
+			break;
+		continue;
+
+	up_one:
+		/*
+		 * dst0..dst is prefix portion, and dst[-1] is '/';
+		 * go up one level.
+		 */
+		dst--;	/* go to trailing '/' */
+		if (dst <= dst0)
+			return -1;
+		/* Windows: dst[-1] cannot be backslash anymore */
+		while (dst0 < dst && dst[-1] != '/')
+			dst--;
+	}
+	*dst = '\0';
+	return 0;
+}
+
+/*
+ * path = Canonical absolute path
+ * prefix_list = Colon-separated list of absolute paths
+ *
+ * Determines, for each path in prefix_list, whether the "prefix" really
+ * is an ancestor directory of path.  Returns the length of the longest
+ * ancestor directory, excluding any trailing slashes, or -1 if no prefix
+ * is an ancestor.  (Note that this means 0 is returned if prefix_list is
+ * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories
+ * are not considered to be their own ancestors.  path must be in a
+ * canonical form: empty components, or "." or ".." components are not
+ * allowed.  prefix_list may be null, which is like "".
+ */
+int longest_ancestor_length(const char *path, const char *prefix_list)
+{
+	char buf[PATH_MAX+1];
+	const char *ceil, *colon;
+	int len, max_len = -1;
+
+	if (prefix_list == NULL || !strcmp(path, "/"))
+		return -1;
+
+	for (colon = ceil = prefix_list; *colon; ceil = colon+1) {
+		for (colon = ceil; *colon && *colon != PATH_SEP; colon++);
+		len = colon - ceil;
+		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil))
+			continue;
+		strlcpy(buf, ceil, len+1);
+		if (normalize_path_copy(buf, buf) < 0)
+			continue;
+		len = strlen(buf);
+		if (len > 0 && buf[len-1] == '/')
+			buf[--len] = '\0';
+
+		if (!strncmp(path, buf, len) &&
+		    path[len] == '/' &&
+		    len > max_len) {
+			max_len = len;
+		}
+	}
+
+	return max_len;
+}
+
+/* strip arbitrary amount of directory separators at end of path */
+static inline int chomp_trailing_dir_sep(const char *path, int len)
+{
+	while (len && is_dir_sep(path[len - 1]))
+		len--;
+	return len;
+}
+
+/*
+ * If path ends with suffix (complete path components), returns the
+ * part before suffix (sans trailing directory separators).
+ * Otherwise returns NULL.
+ */
+char *strip_path_suffix(const char *path, const char *suffix)
+{
+	int path_len = strlen(path), suffix_len = strlen(suffix);
+
+	while (suffix_len) {
+		if (!path_len)
+			return NULL;
+
+		if (is_dir_sep(path[path_len - 1])) {
+			if (!is_dir_sep(suffix[suffix_len - 1]))
+				return NULL;
+			path_len = chomp_trailing_dir_sep(path, path_len);
+			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
+		}
+		else if (path[--path_len] != suffix[--suffix_len])
+			return NULL;
+	}
+
+	if (path_len && !is_dir_sep(path[path_len - 1]))
+		return NULL;
+	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
+}
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
new file mode 100644
index 0000000000000..9256f6a164460
--- /dev/null
+++ b/Documentation/perf_counter/perf.c
@@ -0,0 +1,411 @@
+#include "builtin.h"
+#include "exec_cmd.h"
+#include "cache.h"
+//#include "quote.h"
+#include "run-command.h"
+
+const char perf_usage_string[] =
+	"perf [--version] [--exec-path[=PERF_EXEC_PATH]] [--html-path] [-p|--paginate|--no-pager] [--bare] [--perf-dir=PERF_DIR] [--work-tree=PERF_WORK_TREE] [--help] COMMAND [ARGS]";
+
+const char perf_more_info_string[] =
+	"See 'perf help COMMAND' for more information on a specific command.";
+
+static int use_pager = -1;
+struct pager_config {
+	const char *cmd;
+	int val;
+};
+
+static int pager_command_config(const char *var, const char *value, void *data)
+{
+	struct pager_config *c = data;
+	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+		c->val = perf_config_bool(var, value);
+	return 0;
+}
+
+/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
+int check_pager_config(const char *cmd)
+{
+	struct pager_config c;
+	c.cmd = cmd;
+	c.val = -1;
+	perf_config(pager_command_config, &c);
+	return c.val;
+}
+
+static void commit_pager_choice(void) {
+	switch (use_pager) {
+	case 0:
+		setenv("PERF_PAGER", "cat", 1);
+		break;
+	case 1:
+		/* setup_pager(); */
+		break;
+	default:
+		break;
+	}
+}
+
+static int handle_options(const char*** argv, int* argc, int* envchanged)
+{
+	int handled = 0;
+
+	while (*argc > 0) {
+		const char *cmd = (*argv)[0];
+		if (cmd[0] != '-')
+			break;
+
+		/*
+		 * For legacy reasons, the "version" and "help"
+		 * commands can be written with "--" prepended
+		 * to make them look like flags.
+		 */
+		if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
+			break;
+
+		/*
+		 * Check remaining flags.
+		 */
+		if (!prefixcmp(cmd, "--exec-path")) {
+			cmd += 11;
+			if (*cmd == '=')
+				perf_set_argv_exec_path(cmd + 1);
+			else {
+				puts(perf_exec_path());
+				exit(0);
+			}
+		} else if (!strcmp(cmd, "--html-path")) {
+			puts(system_path(PERF_HTML_PATH));
+			exit(0);
+		} else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
+			use_pager = 1;
+		} else if (!strcmp(cmd, "--no-pager")) {
+			use_pager = 0;
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--perf-dir")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --perf-dir.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+			handled++;
+		} else if (!prefixcmp(cmd, "--perf-dir=")) {
+			setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--work-tree")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --work-tree.\n" );
+				usage(perf_usage_string);
+			}
+			setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+		} else if (!prefixcmp(cmd, "--work-tree=")) {
+			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else {
+			fprintf(stderr, "Unknown option: %s\n", cmd);
+			usage(perf_usage_string);
+		}
+
+		(*argv)++;
+		(*argc)--;
+		handled++;
+	}
+	return handled;
+}
+
+static int handle_alias(int *argcp, const char ***argv)
+{
+	int envchanged = 0, ret = 0, saved_errno = errno;
+	int count, option_count;
+	const char** new_argv;
+	const char *alias_command;
+	char *alias_string;
+	int unused_nonperf;
+
+	alias_command = (*argv)[0];
+	alias_string = alias_lookup(alias_command);
+	if (alias_string) {
+		if (alias_string[0] == '!') {
+			if (*argcp > 1) {
+				struct strbuf buf;
+
+				strbuf_init(&buf, PATH_MAX);
+				strbuf_addstr(&buf, alias_string);
+				sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+				free(alias_string);
+				alias_string = buf.buf;
+			}
+			ret = system(alias_string + 1);
+			if (ret >= 0 && WIFEXITED(ret) &&
+			    WEXITSTATUS(ret) != 127)
+				exit(WEXITSTATUS(ret));
+			die("Failed to run '%s' when expanding alias '%s'",
+			    alias_string + 1, alias_command);
+		}
+		count = split_cmdline(alias_string, &new_argv);
+		if (count < 0)
+			die("Bad alias.%s string", alias_command);
+		option_count = handle_options(&new_argv, &count, &envchanged);
+		if (envchanged)
+			die("alias '%s' changes environment variables\n"
+				 "You can use '!perf' in the alias to do this.",
+				 alias_command);
+		memmove(new_argv - option_count, new_argv,
+				count * sizeof(char *));
+		new_argv -= option_count;
+
+		if (count < 1)
+			die("empty alias for %s", alias_command);
+
+		if (!strcmp(alias_command, new_argv[0]))
+			die("recursive alias: %s", alias_command);
+
+		new_argv = realloc(new_argv, sizeof(char*) *
+				    (count + *argcp + 1));
+		/* insert after command name */
+		memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp);
+		new_argv[count+*argcp] = NULL;
+
+		*argv = new_argv;
+		*argcp += count - 1;
+
+		ret = 1;
+	}
+
+	errno = saved_errno;
+
+	return ret;
+}
+
+const char perf_version_string[] = PERF_VERSION;
+
+#define RUN_SETUP	(1<<0)
+#define USE_PAGER	(1<<1)
+/*
+ * require working tree to be present -- anything uses this needs
+ * RUN_SETUP for reading from the configuration file.
+ */
+#define NEED_WORK_TREE	(1<<2)
+
+struct cmd_struct {
+	const char *cmd;
+	int (*fn)(int, const char **, const char *);
+	int option;
+};
+
+static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
+{
+	int status;
+	struct stat st;
+	const char *prefix;
+
+	prefix = NULL;
+	if (p->option & RUN_SETUP)
+		prefix = NULL; /* setup_perf_directory(); */
+
+	if (use_pager == -1 && p->option & RUN_SETUP)
+		use_pager = check_pager_config(p->cmd);
+	if (use_pager == -1 && p->option & USE_PAGER)
+		use_pager = 1;
+	commit_pager_choice();
+
+	if (p->option & NEED_WORK_TREE)
+		/* setup_work_tree() */;
+
+	status = p->fn(argc, argv, prefix);
+	if (status)
+		return status & 0xff;
+
+	/* Somebody closed stdout? */
+	if (fstat(fileno(stdout), &st))
+		return 0;
+	/* Ignore write errors for pipes and sockets.. */
+	if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
+		return 0;
+
+	/* Check for ENOSPC and EIO errors.. */
+	if (fflush(stdout))
+		die("write failure on standard output: %s", strerror(errno));
+	if (ferror(stdout))
+		die("unknown write failure on standard output");
+	if (fclose(stdout))
+		die("close failed on standard output: %s", strerror(errno));
+	return 0;
+}
+
+static void handle_internal_command(int argc, const char **argv)
+{
+	const char *cmd = argv[0];
+	static struct cmd_struct commands[] = {
+		{ "top", cmd_top, 0 },
+	};
+	int i;
+	static const char ext[] = STRIP_EXTENSION;
+
+	if (sizeof(ext) > 1) {
+		i = strlen(argv[0]) - strlen(ext);
+		if (i > 0 && !strcmp(argv[0] + i, ext)) {
+			char *argv0 = strdup(argv[0]);
+			argv[0] = cmd = argv0;
+			argv0[i] = '\0';
+		}
+	}
+
+	/* Turn "perf cmd --help" into "perf help cmd" */
+	if (argc > 1 && !strcmp(argv[1], "--help")) {
+		argv[1] = argv[0];
+		argv[0] = cmd = "help";
+	}
+
+	for (i = 0; i < ARRAY_SIZE(commands); i++) {
+		struct cmd_struct *p = commands+i;
+		if (strcmp(p->cmd, cmd))
+			continue;
+		exit(run_builtin(p, argc, argv));
+	}
+}
+
+static void execv_dashed_external(const char **argv)
+{
+	struct strbuf cmd = STRBUF_INIT;
+	const char *tmp;
+	int status;
+
+	strbuf_addf(&cmd, "perf-%s", argv[0]);
+
+	/*
+	 * argv[0] must be the perf command, but the argv array
+	 * belongs to the caller, and may be reused in
+	 * subsequent loop iterations. Save argv[0] and
+	 * restore it on error.
+	 */
+	tmp = argv[0];
+	argv[0] = cmd.buf;
+
+	/*
+	 * if we fail because the command is not found, it is
+	 * OK to return. Otherwise, we just pass along the status code.
+	 */
+	status = run_command_v_opt(argv, 0);
+	if (status != -ERR_RUN_COMMAND_EXEC) {
+		if (IS_RUN_COMMAND_ERR(status))
+			die("unable to run '%s'", argv[0]);
+		exit(-status);
+	}
+	errno = ENOENT; /* as if we called execvp */
+
+	argv[0] = tmp;
+
+	strbuf_release(&cmd);
+}
+
+static int run_argv(int *argcp, const char ***argv)
+{
+	int done_alias = 0;
+
+	while (1) {
+		/* See if it's an internal command */
+		handle_internal_command(*argcp, *argv);
+
+		/* .. then try the external ones */
+		execv_dashed_external(*argv);
+
+		/* It could be an alias -- this works around the insanity
+		 * of overriding "perf log" with "perf show" by having
+		 * alias.log = show
+		 */
+		if (done_alias || !handle_alias(argcp, argv))
+			break;
+		done_alias = 1;
+	}
+
+	return done_alias;
+}
+
+
+int main(int argc, const char **argv)
+{
+	const char *cmd;
+
+	cmd = perf_extract_argv0_path(argv[0]);
+	if (!cmd)
+		cmd = "perf-help";
+
+	/*
+	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
+	 *
+	 *  - cannot take flags in between the "perf" and the "xxxx".
+	 *  - cannot execute it externally (since it would just do
+	 *    the same thing over again)
+	 *
+	 * So we just directly call the internal command handler, and
+	 * die if that one cannot handle it.
+	 */
+	if (!prefixcmp(cmd, "perf-")) {
+		cmd += 4;
+		argv[0] = cmd;
+		handle_internal_command(argc, argv);
+		die("cannot handle %s internally", cmd);
+	}
+
+	/* Look for flags.. */
+	argv++;
+	argc--;
+	handle_options(&argv, &argc, NULL);
+	commit_pager_choice();
+	if (argc > 0) {
+		if (!prefixcmp(argv[0], "--"))
+			argv[0] += 2;
+	} else {
+		/* The user didn't specify a command; give them help */
+		printf("usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n%s\n", perf_more_info_string);
+		exit(1);
+	}
+	cmd = argv[0];
+
+	/*
+	 * We use PATH to find perf commands, but we prepend some higher
+	 * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH
+	 * environment, and the $(perfexecdir) from the Makefile at build
+	 * time.
+	 */
+	setup_path();
+
+	while (1) {
+		static int done_help = 0;
+		static int was_alias = 0;
+		was_alias = run_argv(&argc, &argv);
+		if (errno != ENOENT)
+			break;
+		if (was_alias) {
+			fprintf(stderr, "Expansion of alias '%s' failed; "
+				"'%s' is not a perf-command\n",
+				cmd, argv[0]);
+			exit(1);
+		}
+		if (!done_help) {
+			cmd = argv[0] = help_unknown_cmd(cmd);
+			done_help = 1;
+		} else
+			break;
+	}
+
+	fprintf(stderr, "Failed to run command '%s': %s\n",
+		cmd, strerror(errno));
+
+	return 1;
+}
diff --git a/Documentation/perf_counter/quote.c b/Documentation/perf_counter/quote.c
new file mode 100644
index 0000000000000..7a49fcf696716
--- /dev/null
+++ b/Documentation/perf_counter/quote.c
@@ -0,0 +1,478 @@
+#include "cache.h"
+#include "quote.h"
+
+int quote_path_fully = 1;
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ *
+ * E.g.
+ *  original     sq_quote     result
+ *  name     ==> name      ==> 'name'
+ *  a b      ==> a b       ==> 'a b'
+ *  a'b      ==> a'\''b    ==> 'a'\''b'
+ *  a!b      ==> a'\!'b    ==> 'a'\!'b'
+ */
+static inline int need_bs_quote(char c)
+{
+	return (c == '\'' || c == '!');
+}
+
+void sq_quote_buf(struct strbuf *dst, const char *src)
+{
+	char *to_free = NULL;
+
+	if (dst->buf == src)
+		to_free = strbuf_detach(dst, NULL);
+
+	strbuf_addch(dst, '\'');
+	while (*src) {
+		size_t len = strcspn(src, "'!");
+		strbuf_add(dst, src, len);
+		src += len;
+		while (need_bs_quote(*src)) {
+			strbuf_addstr(dst, "'\\");
+			strbuf_addch(dst, *src++);
+			strbuf_addch(dst, '\'');
+		}
+	}
+	strbuf_addch(dst, '\'');
+	free(to_free);
+}
+
+void sq_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('\'', stream);
+	while ((c = *src++)) {
+		if (need_bs_quote(c)) {
+			fputs("'\\", stream);
+			fputc(c, stream);
+			fputc('\'', stream);
+		} else {
+			fputc(c, stream);
+		}
+	}
+	fputc('\'', stream);
+}
+
+void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+{
+	int i;
+
+	/* Copy into destination buffer. */
+	strbuf_grow(dst, 255);
+	for (i = 0; argv[i]; ++i) {
+		strbuf_addch(dst, ' ');
+		sq_quote_buf(dst, argv[i]);
+		if (maxlen && dst->len > maxlen)
+			die("Too many or long arguments");
+	}
+}
+
+char *sq_dequote_step(char *arg, char **next)
+{
+	char *dst = arg;
+	char *src = arg;
+	char c;
+
+	if (*src != '\'')
+		return NULL;
+	for (;;) {
+		c = *++src;
+		if (!c)
+			return NULL;
+		if (c != '\'') {
+			*dst++ = c;
+			continue;
+		}
+		/* We stepped out of sq */
+		switch (*++src) {
+		case '\0':
+			*dst = 0;
+			if (next)
+				*next = NULL;
+			return arg;
+		case '\\':
+			c = *++src;
+			if (need_bs_quote(c) && *++src == '\'') {
+				*dst++ = c;
+				continue;
+			}
+		/* Fallthrough */
+		default:
+			if (!next || !isspace(*src))
+				return NULL;
+			do {
+				c = *++src;
+			} while (isspace(c));
+			*dst = 0;
+			*next = src;
+			return arg;
+		}
+	}
+}
+
+char *sq_dequote(char *arg)
+{
+	return sq_dequote_step(arg, NULL);
+}
+
+int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc)
+{
+	char *next = arg;
+
+	if (!*arg)
+		return 0;
+	do {
+		char *dequoted = sq_dequote_step(next, &next);
+		if (!dequoted)
+			return -1;
+		ALLOC_GROW(*argv, *nr + 1, *alloc);
+		(*argv)[(*nr)++] = dequoted;
+	} while (next);
+
+	return 0;
+}
+
+/* 1 means: quote as octal
+ * 0 means: quote as octal if (quote_path_fully)
+ * -1 means: never quote
+ * c: quote as "\\c"
+ */
+#define X8(x)   x, x, x, x, x, x, x, x
+#define X16(x)  X8(x), X8(x)
+static signed char const sq_lookup[256] = {
+	/*           0    1    2    3    4    5    6    7 */
+	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a',
+	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1,
+	/* 0x10 */ X16(1),
+	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1,
+	/* 0x28 */ X16(-1), X16(-1), X16(-1),
+	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1,
+	/* 0x60 */ X16(-1), X8(-1),
+	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1,
+	/* 0x80 */ /* set to 0 */
+};
+
+static inline int sq_must_quote(char c)
+{
+	return sq_lookup[(unsigned char)c] + quote_path_fully > 0;
+}
+
+/* returns the longest prefix not needing a quote up to maxlen if positive.
+   This stops at the first \0 because it's marked as a character needing an
+   escape */
+static size_t next_quote_pos(const char *s, ssize_t maxlen)
+{
+	size_t len;
+	if (maxlen < 0) {
+		for (len = 0; !sq_must_quote(s[len]); len++);
+	} else {
+		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++);
+	}
+	return len;
+}
+
+/*
+ * C-style name quoting.
+ *
+ * (1) if sb and fp are both NULL, inspect the input name and counts the
+ *     number of bytes that are needed to hold c_style quoted version of name,
+ *     counting the double quotes around it but not terminating NUL, and
+ *     returns it.
+ *     However, if name does not need c_style quoting, it returns 0.
+ *
+ * (2) if sb or fp are not NULL, it emits the c_style quoted version
+ *     of name, enclosed with double quotes if asked and needed only.
+ *     Return value is the same as in (1).
+ */
+static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
+                                    struct strbuf *sb, FILE *fp, int no_dq)
+{
+#undef EMIT
+#define EMIT(c)                                 \
+	do {                                        \
+		if (sb) strbuf_addch(sb, (c));          \
+		if (fp) fputc((c), fp);                 \
+		count++;                                \
+	} while (0)
+#define EMITBUF(s, l)                           \
+	do {                                        \
+		if (sb) strbuf_add(sb, (s), (l));       \
+		if (fp) fwrite((s), (l), 1, fp);        \
+		count += (l);                           \
+	} while (0)
+
+	size_t len, count = 0;
+	const char *p = name;
+
+	for (;;) {
+		int ch;
+
+		len = next_quote_pos(p, maxlen);
+		if (len == maxlen || !p[len])
+			break;
+
+		if (!no_dq && p == name)
+			EMIT('"');
+
+		EMITBUF(p, len);
+		EMIT('\\');
+		p += len;
+		ch = (unsigned char)*p++;
+		if (sq_lookup[ch] >= ' ') {
+			EMIT(sq_lookup[ch]);
+		} else {
+			EMIT(((ch >> 6) & 03) + '0');
+			EMIT(((ch >> 3) & 07) + '0');
+			EMIT(((ch >> 0) & 07) + '0');
+		}
+	}
+
+	EMITBUF(p, len);
+	if (p == name)   /* no ending quote needed */
+		return 0;
+
+	if (!no_dq)
+		EMIT('"');
+	return count;
+}
+
+size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq)
+{
+	return quote_c_style_counted(name, -1, sb, fp, nodq);
+}
+
+void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq)
+{
+	if (quote_c_style(prefix, NULL, NULL, 0) ||
+	    quote_c_style(path, NULL, NULL, 0)) {
+		if (!nodq)
+			strbuf_addch(sb, '"');
+		quote_c_style(prefix, sb, NULL, 1);
+		quote_c_style(path, sb, NULL, 1);
+		if (!nodq)
+			strbuf_addch(sb, '"');
+	} else {
+		strbuf_addstr(sb, prefix);
+		strbuf_addstr(sb, path);
+	}
+}
+
+void write_name_quoted(const char *name, FILE *fp, int terminator)
+{
+	if (terminator) {
+		quote_c_style(name, NULL, fp, 0);
+	} else {
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *fp, int terminator)
+{
+	int needquote = 0;
+
+	if (terminator) {
+		needquote = next_quote_pos(pfx, pfxlen) < pfxlen
+			|| name[next_quote_pos(name, -1)];
+	}
+	if (needquote) {
+		fputc('"', fp);
+		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1);
+		quote_c_style(name, NULL, fp, 1);
+		fputc('"', fp);
+	} else {
+		fwrite(pfx, pfxlen, 1, fp);
+		fputs(name, fp);
+	}
+	fputc(terminator, fp);
+}
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix)
+{
+	int needquote;
+
+	if (len < 0)
+		len = strlen(in);
+
+	/* "../" prefix itself does not need quoting, but "in" might. */
+	needquote = next_quote_pos(in, len) < len;
+	strbuf_setlen(out, 0);
+	strbuf_grow(out, len);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (prefix) {
+		int off = 0;
+		while (prefix[off] && off < len && prefix[off] == in[off])
+			if (prefix[off] == '/') {
+				prefix += off + 1;
+				in += off + 1;
+				len -= off + 1;
+				off = 0;
+			} else
+				off++;
+
+		for (; *prefix; prefix++)
+			if (*prefix == '/')
+				strbuf_addstr(out, "../");
+	}
+
+	quote_c_style_counted (in, len, out, NULL, 1);
+
+	if (needquote)
+		strbuf_addch(out, '"');
+	if (!out->len)
+		strbuf_addstr(out, "./");
+
+	return out->buf;
+}
+
+/*
+ * C-style name unquoting.
+ *
+ * Quoted should point at the opening double quote.
+ * + Returns 0 if it was able to unquote the string properly, and appends the
+ *   result in the strbuf `sb'.
+ * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
+ *   that this function will allocate memory in the strbuf, so calling
+ *   strbuf_release is mandatory whichever result unquote_c_style returns.
+ *
+ * Updates endp pointer to point at one past the ending double quote if given.
+ */
+int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
+{
+	size_t oldlen = sb->len, len;
+	int ch, ac;
+
+	if (*quoted++ != '"')
+		return -1;
+
+	for (;;) {
+		len = strcspn(quoted, "\"\\");
+		strbuf_add(sb, quoted, len);
+		quoted += len;
+
+		switch (*quoted++) {
+		  case '"':
+			if (endp)
+				*endp = quoted;
+			return 0;
+		  case '\\':
+			break;
+		  default:
+			goto error;
+		}
+
+		switch ((ch = *quoted++)) {
+		case 'a': ch = '\a'; break;
+		case 'b': ch = '\b'; break;
+		case 'f': ch = '\f'; break;
+		case 'n': ch = '\n'; break;
+		case 'r': ch = '\r'; break;
+		case 't': ch = '\t'; break;
+		case 'v': ch = '\v'; break;
+
+		case '\\': case '"':
+			break; /* verbatim */
+
+		/* octal values with first digit over 4 overflow */
+		case '0': case '1': case '2': case '3':
+					ac = ((ch - '0') << 6);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= ((ch - '0') << 3);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+					ac |= (ch - '0');
+					ch = ac;
+					break;
+				default:
+			goto error;
+			}
+		strbuf_addch(sb, ch);
+		}
+
+  error:
+	strbuf_setlen(sb, oldlen);
+	return -1;
+}
+
+/* quoting as a string literal for other languages */
+
+void perl_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void python_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	const char nl = '\n';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == nl) {
+			fputc(bq, stream);
+			fputc('n', stream);
+			continue;
+		}
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void tcl_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('"', stream);
+	while ((c = *src++)) {
+		switch (c) {
+		case '[': case ']':
+		case '{': case '}':
+		case '$': case '\\': case '"':
+			fputc('\\', stream);
+		default:
+			fputc(c, stream);
+			break;
+		case '\f':
+			fputs("\\f", stream);
+			break;
+		case '\r':
+			fputs("\\r", stream);
+			break;
+		case '\n':
+			fputs("\\n", stream);
+			break;
+		case '\t':
+			fputs("\\t", stream);
+			break;
+		case '\v':
+			fputs("\\v", stream);
+			break;
+		}
+	}
+	fputc('"', stream);
+}
diff --git a/Documentation/perf_counter/quote.h b/Documentation/perf_counter/quote.h
new file mode 100644
index 0000000000000..66730f2bff3ce
--- /dev/null
+++ b/Documentation/perf_counter/quote.h
@@ -0,0 +1,68 @@
+#ifndef QUOTE_H
+#define QUOTE_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * For example, if you are passing the result to system() as an
+ * argument:
+ *
+ * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
+ *
+ * would be appropriate.  If the system() is going to call ssh to
+ * run the command on the other side:
+ *
+ * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
+ * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
+ *
+ * Note that the above examples leak memory!  Remember to free result from
+ * sq_quote() in a real application.
+ *
+ * sq_quote_buf() writes to an existing buffer of specified size; it
+ * will return the number of characters that would have been written
+ * excluding the final null regardless of the buffer size.
+ */
+
+extern void sq_quote_print(FILE *stream, const char *src);
+
+extern void sq_quote_buf(struct strbuf *, const char *src);
+extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+
+/* This unwraps what sq_quote() produces in place, but returns
+ * NULL if the input does not look like what sq_quote would have
+ * produced.
+ */
+extern char *sq_dequote(char *);
+
+/*
+ * Same as the above, but can be used to unwrap many arguments in the
+ * same string separated by space. "next" is changed to point to the
+ * next argument that should be passed as first parameter. When there
+ * is no more argument to be dequoted, "next" is updated to point to NULL.
+ */
+extern char *sq_dequote_step(char *arg, char **next);
+extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
+
+extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
+extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
+extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
+
+extern void write_name_quoted(const char *name, FILE *, int terminator);
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *, int terminator);
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix);
+
+/* quoting as a string literal for other languages */
+extern void perl_quote_print(FILE *stream, const char *src);
+extern void python_quote_print(FILE *stream, const char *src);
+extern void tcl_quote_print(FILE *stream, const char *src);
+
+#endif
diff --git a/Documentation/perf_counter/run-command.c b/Documentation/perf_counter/run-command.c
new file mode 100644
index 0000000000000..b2f5e854f40ad
--- /dev/null
+++ b/Documentation/perf_counter/run-command.c
@@ -0,0 +1,395 @@
+#include "cache.h"
+#include "run-command.h"
+#include "exec_cmd.h"
+
+static inline void close_pair(int fd[2])
+{
+	close(fd[0]);
+	close(fd[1]);
+}
+
+static inline void dup_devnull(int to)
+{
+	int fd = open("/dev/null", O_RDWR);
+	dup2(fd, to);
+	close(fd);
+}
+
+int start_command(struct child_process *cmd)
+{
+	int need_in, need_out, need_err;
+	int fdin[2], fdout[2], fderr[2];
+
+	/*
+	 * In case of errors we must keep the promise to close FDs
+	 * that have been passed in via ->in and ->out.
+	 */
+
+	need_in = !cmd->no_stdin && cmd->in < 0;
+	if (need_in) {
+		if (pipe(fdin) < 0) {
+			if (cmd->out > 0)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->in = fdin[1];
+	}
+
+	need_out = !cmd->no_stdout
+		&& !cmd->stdout_to_stderr
+		&& cmd->out < 0;
+	if (need_out) {
+		if (pipe(fdout) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->out = fdout[0];
+	}
+
+	need_err = !cmd->no_stderr && cmd->err < 0;
+	if (need_err) {
+		if (pipe(fderr) < 0) {
+			if (need_in)
+				close_pair(fdin);
+			else if (cmd->in)
+				close(cmd->in);
+			if (need_out)
+				close_pair(fdout);
+			else if (cmd->out)
+				close(cmd->out);
+			return -ERR_RUN_COMMAND_PIPE;
+		}
+		cmd->err = fderr[0];
+	}
+
+#ifndef __MINGW32__
+	fflush(NULL);
+	cmd->pid = fork();
+	if (!cmd->pid) {
+		if (cmd->no_stdin)
+			dup_devnull(0);
+		else if (need_in) {
+			dup2(fdin[0], 0);
+			close_pair(fdin);
+		} else if (cmd->in) {
+			dup2(cmd->in, 0);
+			close(cmd->in);
+		}
+
+		if (cmd->no_stderr)
+			dup_devnull(2);
+		else if (need_err) {
+			dup2(fderr[1], 2);
+			close_pair(fderr);
+		}
+
+		if (cmd->no_stdout)
+			dup_devnull(1);
+		else if (cmd->stdout_to_stderr)
+			dup2(2, 1);
+		else if (need_out) {
+			dup2(fdout[1], 1);
+			close_pair(fdout);
+		} else if (cmd->out > 1) {
+			dup2(cmd->out, 1);
+			close(cmd->out);
+		}
+
+		if (cmd->dir && chdir(cmd->dir))
+			die("exec %s: cd to %s failed (%s)", cmd->argv[0],
+			    cmd->dir, strerror(errno));
+		if (cmd->env) {
+			for (; *cmd->env; cmd->env++) {
+				if (strchr(*cmd->env, '='))
+					putenv((char*)*cmd->env);
+				else
+					unsetenv(*cmd->env);
+			}
+		}
+		if (cmd->preexec_cb)
+			cmd->preexec_cb();
+		if (cmd->perf_cmd) {
+			execv_perf_cmd(cmd->argv);
+		} else {
+			execvp(cmd->argv[0], (char *const*) cmd->argv);
+		}
+		exit(127);
+	}
+#else
+	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */
+	const char **sargv = cmd->argv;
+	char **env = environ;
+
+	if (cmd->no_stdin) {
+		s0 = dup(0);
+		dup_devnull(0);
+	} else if (need_in) {
+		s0 = dup(0);
+		dup2(fdin[0], 0);
+	} else if (cmd->in) {
+		s0 = dup(0);
+		dup2(cmd->in, 0);
+	}
+
+	if (cmd->no_stderr) {
+		s2 = dup(2);
+		dup_devnull(2);
+	} else if (need_err) {
+		s2 = dup(2);
+		dup2(fderr[1], 2);
+	}
+
+	if (cmd->no_stdout) {
+		s1 = dup(1);
+		dup_devnull(1);
+	} else if (cmd->stdout_to_stderr) {
+		s1 = dup(1);
+		dup2(2, 1);
+	} else if (need_out) {
+		s1 = dup(1);
+		dup2(fdout[1], 1);
+	} else if (cmd->out > 1) {
+		s1 = dup(1);
+		dup2(cmd->out, 1);
+	}
+
+	if (cmd->dir)
+		die("chdir in start_command() not implemented");
+	if (cmd->env) {
+		env = copy_environ();
+		for (; *cmd->env; cmd->env++)
+			env = env_setenv(env, *cmd->env);
+	}
+
+	if (cmd->perf_cmd) {
+		cmd->argv = prepare_perf_cmd(cmd->argv);
+	}
+
+	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env);
+
+	if (cmd->env)
+		free_environ(env);
+	if (cmd->perf_cmd)
+		free(cmd->argv);
+
+	cmd->argv = sargv;
+	if (s0 >= 0)
+		dup2(s0, 0), close(s0);
+	if (s1 >= 0)
+		dup2(s1, 1), close(s1);
+	if (s2 >= 0)
+		dup2(s2, 2), close(s2);
+#endif
+
+	if (cmd->pid < 0) {
+		int err = errno;
+		if (need_in)
+			close_pair(fdin);
+		else if (cmd->in)
+			close(cmd->in);
+		if (need_out)
+			close_pair(fdout);
+		else if (cmd->out)
+			close(cmd->out);
+		if (need_err)
+			close_pair(fderr);
+		return err == ENOENT ?
+			-ERR_RUN_COMMAND_EXEC :
+			-ERR_RUN_COMMAND_FORK;
+	}
+
+	if (need_in)
+		close(fdin[0]);
+	else if (cmd->in)
+		close(cmd->in);
+
+	if (need_out)
+		close(fdout[1]);
+	else if (cmd->out)
+		close(cmd->out);
+
+	if (need_err)
+		close(fderr[1]);
+
+	return 0;
+}
+
+static int wait_or_whine(pid_t pid)
+{
+	for (;;) {
+		int status, code;
+		pid_t waiting = waitpid(pid, &status, 0);
+
+		if (waiting < 0) {
+			if (errno == EINTR)
+				continue;
+			error("waitpid failed (%s)", strerror(errno));
+			return -ERR_RUN_COMMAND_WAITPID;
+		}
+		if (waiting != pid)
+			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID;
+		if (WIFSIGNALED(status))
+			return -ERR_RUN_COMMAND_WAITPID_SIGNAL;
+
+		if (!WIFEXITED(status))
+			return -ERR_RUN_COMMAND_WAITPID_NOEXIT;
+		code = WEXITSTATUS(status);
+		switch (code) {
+		case 127:
+			return -ERR_RUN_COMMAND_EXEC;
+		case 0:
+			return 0;
+		default:
+			return -code;
+		}
+	}
+}
+
+int finish_command(struct child_process *cmd)
+{
+	return wait_or_whine(cmd->pid);
+}
+
+int run_command(struct child_process *cmd)
+{
+	int code = start_command(cmd);
+	if (code)
+		return code;
+	return finish_command(cmd);
+}
+
+static void prepare_run_command_v_opt(struct child_process *cmd,
+				      const char **argv,
+				      int opt)
+{
+	memset(cmd, 0, sizeof(*cmd));
+	cmd->argv = argv;
+	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0;
+	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0;
+	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0;
+}
+
+int run_command_v_opt(const char **argv, int opt)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	return run_command(&cmd);
+}
+
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env)
+{
+	struct child_process cmd;
+	prepare_run_command_v_opt(&cmd, argv, opt);
+	cmd.dir = dir;
+	cmd.env = env;
+	return run_command(&cmd);
+}
+
+#ifdef __MINGW32__
+static __stdcall unsigned run_thread(void *data)
+{
+	struct async *async = data;
+	return async->proc(async->fd_for_proc, async->data);
+}
+#endif
+
+int start_async(struct async *async)
+{
+	int pipe_out[2];
+
+	if (pipe(pipe_out) < 0)
+		return error("cannot create pipe: %s", strerror(errno));
+	async->out = pipe_out[0];
+
+#ifndef __MINGW32__
+	/* Flush stdio before fork() to avoid cloning buffers */
+	fflush(NULL);
+
+	async->pid = fork();
+	if (async->pid < 0) {
+		error("fork (async) failed: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+	if (!async->pid) {
+		close(pipe_out[0]);
+		exit(!!async->proc(pipe_out[1], async->data));
+	}
+	close(pipe_out[1]);
+#else
+	async->fd_for_proc = pipe_out[1];
+	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL);
+	if (!async->tid) {
+		error("cannot create thread: %s", strerror(errno));
+		close_pair(pipe_out);
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+int finish_async(struct async *async)
+{
+#ifndef __MINGW32__
+	int ret = 0;
+
+	if (wait_or_whine(async->pid))
+		ret = error("waitpid (async) failed");
+#else
+	DWORD ret = 0;
+	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0)
+		ret = error("waiting for thread failed: %lu", GetLastError());
+	else if (!GetExitCodeThread(async->tid, &ret))
+		ret = error("cannot get thread exit code: %lu", GetLastError());
+	CloseHandle(async->tid);
+#endif
+	return ret;
+}
+
+int run_hook(const char *index_file, const char *name, ...)
+{
+	struct child_process hook;
+	const char **argv = NULL, *env[2];
+	char index[PATH_MAX];
+	va_list args;
+	int ret;
+	size_t i = 0, alloc = 0;
+
+	if (access(perf_path("hooks/%s", name), X_OK) < 0)
+		return 0;
+
+	va_start(args, name);
+	ALLOC_GROW(argv, i + 1, alloc);
+	argv[i++] = perf_path("hooks/%s", name);
+	while (argv[i-1]) {
+		ALLOC_GROW(argv, i + 1, alloc);
+		argv[i++] = va_arg(args, const char *);
+	}
+	va_end(args);
+
+	memset(&hook, 0, sizeof(hook));
+	hook.argv = argv;
+	hook.no_stdin = 1;
+	hook.stdout_to_stderr = 1;
+	if (index_file) {
+		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
+		env[0] = index;
+		env[1] = NULL;
+		hook.env = env;
+	}
+
+	ret = start_command(&hook);
+	free(argv);
+	if (ret) {
+		warning("Could not spawn %s", argv[0]);
+		return ret;
+	}
+	ret = finish_command(&hook);
+	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+		warning("%s exited due to uncaught signal", argv[0]);
+
+	return ret;
+}
diff --git a/Documentation/perf_counter/run-command.h b/Documentation/perf_counter/run-command.h
new file mode 100644
index 0000000000000..328289f236695
--- /dev/null
+++ b/Documentation/perf_counter/run-command.h
@@ -0,0 +1,93 @@
+#ifndef RUN_COMMAND_H
+#define RUN_COMMAND_H
+
+enum {
+	ERR_RUN_COMMAND_FORK = 10000,
+	ERR_RUN_COMMAND_EXEC,
+	ERR_RUN_COMMAND_PIPE,
+	ERR_RUN_COMMAND_WAITPID,
+	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+	ERR_RUN_COMMAND_WAITPID_SIGNAL,
+	ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+	const char **argv;
+	pid_t pid;
+	/*
+	 * Using .in, .out, .err:
+	 * - Specify 0 for no redirections (child inherits stdin, stdout,
+	 *   stderr from parent).
+	 * - Specify -1 to have a pipe allocated as follows:
+	 *     .in: returns the writable pipe end; parent writes to it,
+	 *          the readable pipe end becomes child's stdin
+	 *     .out, .err: returns the readable pipe end; parent reads from
+	 *          it, the writable pipe end becomes child's stdout/stderr
+	 *   The caller of start_command() must close the returned FDs
+	 *   after it has completed reading from/writing to it!
+	 * - Specify > 0 to set a channel to a particular FD as follows:
+	 *     .in: a readable FD, becomes child's stdin
+	 *     .out: a writable FD, becomes child's stdout/stderr
+	 *     .err > 0 not supported
+	 *   The specified FD is closed by start_command(), even in case
+	 *   of errors!
+	 */
+	int in;
+	int out;
+	int err;
+	const char *dir;
+	const char *const *env;
+	unsigned no_stdin:1;
+	unsigned no_stdout:1;
+	unsigned no_stderr:1;
+	unsigned perf_cmd:1; /* if this is to be perf sub-command */
+	unsigned stdout_to_stderr:1;
+	void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+extern int run_hook(const char *index_file, const char *name, ...);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_PERF_CMD	     2	/*If this is to be perf sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+/*
+ * env (the environment) is to be formatted like environ: "VAR=VALUE".
+ * To unset an environment variable use just "VAR".
+ */
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
+
+/*
+ * The purpose of the following functions is to feed a pipe by running
+ * a function asynchronously and providing output that the caller reads.
+ *
+ * It is expected that no synchronization and mutual exclusion between
+ * the caller and the feed function is necessary so that the function
+ * can run in a thread without interfering with the caller.
+ */
+struct async {
+	/*
+	 * proc writes to fd and closes it;
+	 * returns 0 on success, non-zero on failure
+	 */
+	int (*proc)(int fd, void *data);
+	void *data;
+	int out;	/* caller reads from here and closes it */
+#ifndef __MINGW32__
+	pid_t pid;
+#else
+	HANDLE tid;
+	int fd_for_proc;
+#endif
+};
+
+int start_async(struct async *async);
+int finish_async(struct async *async);
+
+#endif
diff --git a/Documentation/perf_counter/strbuf.c b/Documentation/perf_counter/strbuf.c
new file mode 100644
index 0000000000000..eaba093068023
--- /dev/null
+++ b/Documentation/perf_counter/strbuf.c
@@ -0,0 +1,359 @@
+#include "cache.h"
+
+int prefixcmp(const char *str, const char *prefix)
+{
+	for (; ; str++, prefix++)
+		if (!*prefix)
+			return 0;
+		else if (*str != *prefix)
+			return (unsigned char)*prefix - (unsigned char)*str;
+}
+
+/*
+ * Used as the default ->buf value, so that people can always assume
+ * buf is non NULL and ->buf is NUL terminated even for a freshly
+ * initialized strbuf.
+ */
+char strbuf_slopbuf[1];
+
+void strbuf_init(struct strbuf *sb, size_t hint)
+{
+	sb->alloc = sb->len = 0;
+	sb->buf = strbuf_slopbuf;
+	if (hint)
+		strbuf_grow(sb, hint);
+}
+
+void strbuf_release(struct strbuf *sb)
+{
+	if (sb->alloc) {
+		free(sb->buf);
+		strbuf_init(sb, 0);
+	}
+}
+
+char *strbuf_detach(struct strbuf *sb, size_t *sz)
+{
+	char *res = sb->alloc ? sb->buf : NULL;
+	if (sz)
+		*sz = sb->len;
+	strbuf_init(sb, 0);
+	return res;
+}
+
+void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc)
+{
+	strbuf_release(sb);
+	sb->buf   = buf;
+	sb->len   = len;
+	sb->alloc = alloc;
+	strbuf_grow(sb, 0);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_grow(struct strbuf *sb, size_t extra)
+{
+	if (sb->len + extra + 1 <= sb->len)
+		die("you want to use way too much memory");
+	if (!sb->alloc)
+		sb->buf = NULL;
+	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc);
+}
+
+void strbuf_trim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+void strbuf_rtrim(struct strbuf *sb)
+{
+	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1]))
+		sb->len--;
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_ltrim(struct strbuf *sb)
+{
+	char *b = sb->buf;
+	while (sb->len > 0 && isspace(*b)) {
+		b++;
+		sb->len--;
+	}
+	memmove(sb->buf, b, sb->len);
+	sb->buf[sb->len] = '\0';
+}
+
+void strbuf_tolower(struct strbuf *sb)
+{
+	int i;
+	for (i = 0; i < sb->len; i++)
+		sb->buf[i] = tolower(sb->buf[i]);
+}
+
+struct strbuf **strbuf_split(const struct strbuf *sb, int delim)
+{
+	int alloc = 2, pos = 0;
+	char *n, *p;
+	struct strbuf **ret;
+	struct strbuf *t;
+
+	ret = calloc(alloc, sizeof(struct strbuf *));
+	p = n = sb->buf;
+	while (n < sb->buf + sb->len) {
+		int len;
+		n = memchr(n, delim, sb->len - (n - sb->buf));
+		if (pos + 1 >= alloc) {
+			alloc = alloc * 2;
+			ret = realloc(ret, sizeof(struct strbuf *) * alloc);
+		}
+		if (!n)
+			n = sb->buf + sb->len - 1;
+		len = n - p + 1;
+		t = malloc(sizeof(struct strbuf));
+		strbuf_init(t, len);
+		strbuf_add(t, p, len);
+		ret[pos] = t;
+		ret[++pos] = NULL;
+		p = ++n;
+	}
+	return ret;
+}
+
+void strbuf_list_free(struct strbuf **sbs)
+{
+	struct strbuf **s = sbs;
+
+	while (*s) {
+		strbuf_release(*s);
+		free(*s++);
+	}
+	free(sbs);
+}
+
+int strbuf_cmp(const struct strbuf *a, const struct strbuf *b)
+{
+	int len = a->len < b->len ? a->len: b->len;
+	int cmp = memcmp(a->buf, b->buf, len);
+	if (cmp)
+		return cmp;
+	return a->len < b->len ? -1: a->len != b->len;
+}
+
+void strbuf_splice(struct strbuf *sb, size_t pos, size_t len,
+				   const void *data, size_t dlen)
+{
+	if (pos + len < pos)
+		die("you want to use way too much memory");
+	if (pos > sb->len)
+		die("`pos' is too far after the end of the buffer");
+	if (pos + len > sb->len)
+		die("`pos + len' is too far after the end of the buffer");
+
+	if (dlen >= len)
+		strbuf_grow(sb, dlen - len);
+	memmove(sb->buf + pos + dlen,
+			sb->buf + pos + len,
+			sb->len - pos - len);
+	memcpy(sb->buf + pos, data, dlen);
+	strbuf_setlen(sb, sb->len + dlen - len);
+}
+
+void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len)
+{
+	strbuf_splice(sb, pos, 0, data, len);
+}
+
+void strbuf_remove(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_splice(sb, pos, len, NULL, 0);
+}
+
+void strbuf_add(struct strbuf *sb, const void *data, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, data, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len)
+{
+	strbuf_grow(sb, len);
+	memcpy(sb->buf + sb->len, sb->buf + pos, len);
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_addf(struct strbuf *sb, const char *fmt, ...)
+{
+	int len;
+	va_list ap;
+
+	if (!strbuf_avail(sb))
+		strbuf_grow(sb, 64);
+	va_start(ap, fmt);
+	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+	va_end(ap);
+	if (len < 0)
+		die("your vsnprintf is broken");
+	if (len > strbuf_avail(sb)) {
+		strbuf_grow(sb, len);
+		va_start(ap, fmt);
+		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap);
+		va_end(ap);
+		if (len > strbuf_avail(sb)) {
+			die("this should not happen, your snprintf is broken");
+		}
+	}
+	strbuf_setlen(sb, sb->len + len);
+}
+
+void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn,
+		   void *context)
+{
+	for (;;) {
+		const char *percent;
+		size_t consumed;
+
+		percent = strchrnul(format, '%');
+		strbuf_add(sb, format, percent - format);
+		if (!*percent)
+			break;
+		format = percent + 1;
+
+		consumed = fn(sb, format, context);
+		if (consumed)
+			format += consumed;
+		else
+			strbuf_addch(sb, '%');
+	}
+}
+
+size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder,
+		void *context)
+{
+	struct strbuf_expand_dict_entry *e = context;
+	size_t len;
+
+	for (; e->placeholder && (len = strlen(e->placeholder)); e++) {
+		if (!strncmp(placeholder, e->placeholder, len)) {
+			if (e->value)
+				strbuf_addstr(sb, e->value);
+			return len;
+		}
+	}
+	return 0;
+}
+
+size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f)
+{
+	size_t res;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, size);
+	res = fread(sb->buf + sb->len, 1, size, f);
+	if (res > 0)
+		strbuf_setlen(sb, sb->len + res);
+	else if (res < 0 && oldalloc == 0)
+		strbuf_release(sb);
+	return res;
+}
+
+ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint)
+{
+	size_t oldlen = sb->len;
+	size_t oldalloc = sb->alloc;
+
+	strbuf_grow(sb, hint ? hint : 8192);
+	for (;;) {
+		ssize_t cnt;
+
+		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1);
+		if (cnt < 0) {
+			if (oldalloc == 0)
+				strbuf_release(sb);
+			else
+				strbuf_setlen(sb, oldlen);
+			return -1;
+		}
+		if (!cnt)
+			break;
+		sb->len += cnt;
+		strbuf_grow(sb, 8192);
+	}
+
+	sb->buf[sb->len] = '\0';
+	return sb->len - oldlen;
+}
+
+#define STRBUF_MAXLINK (2*PATH_MAX)
+
+int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint)
+{
+	size_t oldalloc = sb->alloc;
+
+	if (hint < 32)
+		hint = 32;
+
+	while (hint < STRBUF_MAXLINK) {
+		int len;
+
+		strbuf_grow(sb, hint);
+		len = readlink(path, sb->buf, hint);
+		if (len < 0) {
+			if (errno != ERANGE)
+				break;
+		} else if (len < hint) {
+			strbuf_setlen(sb, len);
+			return 0;
+		}
+
+		/* .. the buffer was too small - try again */
+		hint *= 2;
+	}
+	if (oldalloc == 0)
+		strbuf_release(sb);
+	return -1;
+}
+
+int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
+{
+	int ch;
+
+	strbuf_grow(sb, 0);
+	if (feof(fp))
+		return EOF;
+
+	strbuf_reset(sb);
+	while ((ch = fgetc(fp)) != EOF) {
+		if (ch == term)
+			break;
+		strbuf_grow(sb, 1);
+		sb->buf[sb->len++] = ch;
+	}
+	if (ch == EOF && sb->len == 0)
+		return EOF;
+
+	sb->buf[sb->len] = '\0';
+	return 0;
+}
+
+int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
+{
+	int fd, len;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	len = strbuf_read(sb, fd, hint);
+	close(fd);
+	if (len < 0)
+		return -1;
+
+	return len;
+}
diff --git a/Documentation/perf_counter/strbuf.h b/Documentation/perf_counter/strbuf.h
new file mode 100644
index 0000000000000..9ee908a3ec5d4
--- /dev/null
+++ b/Documentation/perf_counter/strbuf.h
@@ -0,0 +1,137 @@
+#ifndef STRBUF_H
+#define STRBUF_H
+
+/*
+ * Strbuf's can be use in many ways: as a byte array, or to store arbitrary
+ * long, overflow safe strings.
+ *
+ * Strbufs has some invariants that are very important to keep in mind:
+ *
+ * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to
+ *    build complex strings/buffers whose final size isn't easily known.
+ *
+ *    It is NOT legal to copy the ->buf pointer away.
+ *    `strbuf_detach' is the operation that detachs a buffer from its shell
+ *    while keeping the shell valid wrt its invariants.
+ *
+ * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
+ *    allocated. The extra byte is used to store a '\0', allowing the ->buf
+ *    member to be a valid C-string. Every strbuf function ensure this
+ *    invariant is preserved.
+ *
+ *    Note that it is OK to "play" with the buffer directly if you work it
+ *    that way:
+ *
+ *    strbuf_grow(sb, SOME_SIZE);
+ *       ... Here, the memory array starting at sb->buf, and of length
+ *       ... strbuf_avail(sb) is all yours, and you are sure that
+ *       ... strbuf_avail(sb) is at least SOME_SIZE.
+ *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
+ *
+ *    Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb).
+ *
+ *    Doing so is safe, though if it has to be done in many places, adding the
+ *    missing API to the strbuf module is the way to go.
+ *
+ *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
+ *         even if it's true in the current implementation. Alloc is somehow a
+ *         "private" member that should not be messed with.
+ */
+
+#include <assert.h>
+
+extern char strbuf_slopbuf[];
+struct strbuf {
+	size_t alloc;
+	size_t len;
+	char *buf;
+};
+
+#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
+
+/*----- strbuf life cycle -----*/
+extern void strbuf_init(struct strbuf *, size_t);
+extern void strbuf_release(struct strbuf *);
+extern char *strbuf_detach(struct strbuf *, size_t *);
+extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
+static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
+	struct strbuf tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/*----- strbuf size related -----*/
+static inline size_t strbuf_avail(const struct strbuf *sb) {
+	return sb->alloc ? sb->alloc - sb->len - 1 : 0;
+}
+
+extern void strbuf_grow(struct strbuf *, size_t);
+
+static inline void strbuf_setlen(struct strbuf *sb, size_t len) {
+	if (!sb->alloc)
+		strbuf_grow(sb, 0);
+	assert(len < sb->alloc);
+	sb->len = len;
+	sb->buf[len] = '\0';
+}
+#define strbuf_reset(sb)  strbuf_setlen(sb, 0)
+
+/*----- content related -----*/
+extern void strbuf_trim(struct strbuf *);
+extern void strbuf_rtrim(struct strbuf *);
+extern void strbuf_ltrim(struct strbuf *);
+extern int strbuf_cmp(const struct strbuf *, const struct strbuf *);
+extern void strbuf_tolower(struct strbuf *);
+
+extern struct strbuf **strbuf_split(const struct strbuf *, int delim);
+extern void strbuf_list_free(struct strbuf **);
+
+/*----- add data in your buffer -----*/
+static inline void strbuf_addch(struct strbuf *sb, int c) {
+	strbuf_grow(sb, 1);
+	sb->buf[sb->len++] = c;
+	sb->buf[sb->len] = '\0';
+}
+
+extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t);
+extern void strbuf_remove(struct strbuf *, size_t pos, size_t len);
+
+/* splice pos..pos+len with given data */
+extern void strbuf_splice(struct strbuf *, size_t pos, size_t len,
+                          const void *, size_t);
+
+extern void strbuf_add(struct strbuf *, const void *, size_t);
+static inline void strbuf_addstr(struct strbuf *sb, const char *s) {
+	strbuf_add(sb, s, strlen(s));
+}
+static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) {
+	strbuf_add(sb, sb2->buf, sb2->len);
+}
+extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len);
+
+typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context);
+extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context);
+struct strbuf_expand_dict_entry {
+	const char *placeholder;
+	const char *value;
+};
+extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context);
+
+__attribute__((format(printf,2,3)))
+extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...);
+
+extern size_t strbuf_fread(struct strbuf *, size_t, FILE *);
+/* XXX: if read fails, any partial read is undone */
+extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint);
+extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint);
+extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint);
+
+extern int strbuf_getline(struct strbuf *, FILE *, int);
+
+extern void stripspace(struct strbuf *buf, int skip_comments);
+extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env);
+
+extern int strbuf_branchname(struct strbuf *sb, const char *name);
+extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name);
+
+#endif /* STRBUF_H */
diff --git a/Documentation/perf_counter/usage.c b/Documentation/perf_counter/usage.c
new file mode 100644
index 0000000000000..7a10421fe6b42
--- /dev/null
+++ b/Documentation/perf_counter/usage.c
@@ -0,0 +1,80 @@
+/*
+ * GIT - The information manager from hell
+ *
+ * Copyright (C) Linus Torvalds, 2005
+ */
+#include "util.h"
+
+static void report(const char *prefix, const char *err, va_list params)
+{
+	char msg[1024];
+	vsnprintf(msg, sizeof(msg), err, params);
+	fprintf(stderr, "%s%s\n", prefix, msg);
+}
+
+static NORETURN void usage_builtin(const char *err)
+{
+	fprintf(stderr, "usage: %s\n", err);
+	exit(129);
+}
+
+static NORETURN void die_builtin(const char *err, va_list params)
+{
+	report("fatal: ", err, params);
+	exit(128);
+}
+
+static void error_builtin(const char *err, va_list params)
+{
+	report("error: ", err, params);
+}
+
+static void warn_builtin(const char *warn, va_list params)
+{
+	report("warning: ", warn, params);
+}
+
+/* If we are in a dlopen()ed .so write to a global variable would segfault
+ * (ugh), so keep things static. */
+static void (*usage_routine)(const char *err) NORETURN = usage_builtin;
+static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin;
+static void (*error_routine)(const char *err, va_list params) = error_builtin;
+static void (*warn_routine)(const char *err, va_list params) = warn_builtin;
+
+void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN)
+{
+	die_routine = routine;
+}
+
+void usage(const char *err)
+{
+	usage_routine(err);
+}
+
+void die(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	die_routine(err, params);
+	va_end(params);
+}
+
+int error(const char *err, ...)
+{
+	va_list params;
+
+	va_start(params, err);
+	error_routine(err, params);
+	va_end(params);
+	return -1;
+}
+
+void warning(const char *warn, ...)
+{
+	va_list params;
+
+	va_start(params, warn);
+	warn_routine(warn, params);
+	va_end(params);
+}
diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h
new file mode 100644
index 0000000000000..13f8bdce76007
--- /dev/null
+++ b/Documentation/perf_counter/util.h
@@ -0,0 +1,394 @@
+#ifndef GIT_COMPAT_UTIL_H
+#define GIT_COMPAT_UTIL_H
+
+#define _FILE_OFFSET_BITS 64
+
+#ifndef FLEX_ARRAY
+/*
+ * See if our compiler is known to support flexible array members.
+ */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
+# define FLEX_ARRAY /* empty */
+#elif defined(__GNUC__)
+# if (__GNUC__ >= 3)
+#  define FLEX_ARRAY /* empty */
+# else
+#  define FLEX_ARRAY 0 /* older GNU extension */
+# endif
+#endif
+
+/*
+ * Otherwise, default to safer but a bit wasteful traditional style
+ */
+#ifndef FLEX_ARRAY
+# define FLEX_ARRAY 1
+#endif
+#endif
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0]))
+
+#ifdef __GNUC__
+#define TYPEOF(x) (__typeof__(x))
+#else
+#define TYPEOF(x)
+#endif
+
+#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits))))
+#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */
+
+/* Approximation of the length of the decimal representation of this type. */
+#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1)
+
+#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX)
+#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */
+#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */
+#endif
+#define _ALL_SOURCE 1
+#define _GNU_SOURCE 1
+#define _BSD_SOURCE 1
+
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <string.h>
+#include <errno.h>
+#include <limits.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <dirent.h>
+#include <sys/time.h>
+#include <time.h>
+#include <signal.h>
+#include <fnmatch.h>
+#include <assert.h>
+#include <regex.h>
+#include <utime.h>
+#ifndef __MINGW32__
+#include <sys/wait.h>
+#include <sys/poll.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#ifndef NO_SYS_SELECT_H
+#include <sys/select.h>
+#endif
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+#include <pwd.h>
+#include <inttypes.h>
+#if defined(__CYGWIN__)
+#undef _XOPEN_SOURCE
+#include <grp.h>
+#define _XOPEN_SOURCE 600
+#include "compat/cygwin.h"
+#else
+#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */
+#include <grp.h>
+#define _ALL_SOURCE 1
+#endif
+#else 	/* __MINGW32__ */
+/* pull in Windows compatibility stuff */
+#include "compat/mingw.h"
+#endif	/* __MINGW32__ */
+
+#ifndef NO_ICONV
+#include <iconv.h>
+#endif
+
+#ifndef NO_OPENSSL
+#include <openssl/ssl.h>
+#include <openssl/err.h>
+#endif
+
+/* On most systems <limits.h> would have given us this, but
+ * not on some systems (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#ifndef PRIuMAX
+#define PRIuMAX "llu"
+#endif
+
+#ifndef PRIu32
+#define PRIu32 "u"
+#endif
+
+#ifndef PRIx32
+#define PRIx32 "x"
+#endif
+
+#ifndef PATH_SEP
+#define PATH_SEP ':'
+#endif
+
+#ifndef STRIP_EXTENSION
+#define STRIP_EXTENSION ""
+#endif
+
+#ifndef has_dos_drive_prefix
+#define has_dos_drive_prefix(path) 0
+#endif
+
+#ifndef is_dir_sep
+#define is_dir_sep(c) ((c) == '/')
+#endif
+
+#ifdef __GNUC__
+#define NORETURN __attribute__((__noreturn__))
+#else
+#define NORETURN
+#ifndef __attribute__
+#define __attribute__(x)
+#endif
+#endif
+
+/* General helper functions */
+extern void usage(const char *err) NORETURN;
+extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2)));
+extern int error(const char *err, ...) __attribute__((format (printf, 1, 2)));
+extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2)));
+
+extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN);
+
+extern int prefixcmp(const char *str, const char *prefix);
+extern time_t tm_to_time_t(const struct tm *tm);
+
+static inline const char *skip_prefix(const char *str, const char *prefix)
+{
+	size_t len = strlen(prefix);
+	return strncmp(str, prefix, len) ? NULL : str + len;
+}
+
+#if defined(NO_MMAP) || defined(USE_WIN32_MMAP)
+
+#ifndef PROT_READ
+#define PROT_READ 1
+#define PROT_WRITE 2
+#define MAP_PRIVATE 1
+#define MAP_FAILED ((void*)-1)
+#endif
+
+#define mmap git_mmap
+#define munmap git_munmap
+extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern int git_munmap(void *start, size_t length);
+
+#else /* NO_MMAP || USE_WIN32_MMAP */
+
+#include <sys/mman.h>
+
+#endif /* NO_MMAP || USE_WIN32_MMAP */
+
+#ifdef NO_MMAP
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024)
+
+#else /* NO_MMAP */
+
+/* This value must be multiple of (pagesize * 2) */
+#define DEFAULT_PACKED_GIT_WINDOW_SIZE \
+	(sizeof(void*) >= 8 \
+		?  1 * 1024 * 1024 * 1024 \
+		: 32 * 1024 * 1024)
+
+#endif /* NO_MMAP */
+
+#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
+#define on_disk_bytes(st) ((st).st_size)
+#else
+#define on_disk_bytes(st) ((st).st_blocks * 512)
+#endif
+
+#define DEFAULT_PACKED_GIT_LIMIT \
+	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256))
+
+#ifdef NO_PREAD
+#define pread git_pread
+extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset);
+#endif
+/*
+ * Forward decl that will remind us if its twin in cache.h changes.
+ * This function is used in compat/pread.c.  But we can't include
+ * cache.h there.
+ */
+extern ssize_t read_in_full(int fd, void *buf, size_t count);
+
+#ifdef NO_SETENV
+#define setenv gitsetenv
+extern int gitsetenv(const char *, const char *, int);
+#endif
+
+#ifdef NO_MKDTEMP
+#define mkdtemp gitmkdtemp
+extern char *gitmkdtemp(char *);
+#endif
+
+#ifdef NO_UNSETENV
+#define unsetenv gitunsetenv
+extern void gitunsetenv(const char *);
+#endif
+
+#ifdef NO_STRCASESTR
+#define strcasestr gitstrcasestr
+extern char *gitstrcasestr(const char *haystack, const char *needle);
+#endif
+
+#ifdef NO_STRLCPY
+#define strlcpy gitstrlcpy
+extern size_t gitstrlcpy(char *, const char *, size_t);
+#endif
+
+#ifdef NO_STRTOUMAX
+#define strtoumax gitstrtoumax
+extern uintmax_t gitstrtoumax(const char *, char **, int);
+#endif
+
+#ifdef NO_HSTRERROR
+#define hstrerror githstrerror
+extern const char *githstrerror(int herror);
+#endif
+
+#ifdef NO_MEMMEM
+#define memmem gitmemmem
+void *gitmemmem(const void *haystack, size_t haystacklen,
+                const void *needle, size_t needlelen);
+#endif
+
+#ifdef FREAD_READS_DIRECTORIES
+#ifdef fopen
+#undef fopen
+#endif
+#define fopen(a,b) git_fopen(a,b)
+extern FILE *git_fopen(const char*, const char*);
+#endif
+
+#ifdef SNPRINTF_RETURNS_BOGUS
+#define snprintf git_snprintf
+extern int git_snprintf(char *str, size_t maxsize,
+			const char *format, ...);
+#define vsnprintf git_vsnprintf
+extern int git_vsnprintf(char *str, size_t maxsize,
+			 const char *format, va_list ap);
+#endif
+
+#ifdef __GLIBC_PREREQ
+#if __GLIBC_PREREQ(2, 1)
+#define HAVE_STRCHRNUL
+#endif
+#endif
+
+#ifndef HAVE_STRCHRNUL
+#define strchrnul gitstrchrnul
+static inline char *gitstrchrnul(const char *s, int c)
+{
+	while (*s && *s != c)
+		s++;
+	return (char *)s;
+}
+#endif
+
+static inline size_t xsize_t(off_t len)
+{
+	return (size_t)len;
+}
+
+static inline int has_extension(const char *filename, const char *ext)
+{
+	size_t len = strlen(filename);
+	size_t extlen = strlen(ext);
+	return len > extlen && !memcmp(filename + len - extlen, ext, extlen);
+}
+
+/* Sane ctype - no locale, and works with signed chars */
+#undef isascii
+#undef isspace
+#undef isdigit
+#undef isalpha
+#undef isalnum
+#undef tolower
+#undef toupper
+extern unsigned char sane_ctype[256];
+#define GIT_SPACE 0x01
+#define GIT_DIGIT 0x02
+#define GIT_ALPHA 0x04
+#define GIT_GLOB_SPECIAL 0x08
+#define GIT_REGEX_SPECIAL 0x10
+#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
+#define isascii(x) (((x) & ~0x7f) == 0)
+#define isspace(x) sane_istest(x,GIT_SPACE)
+#define isdigit(x) sane_istest(x,GIT_DIGIT)
+#define isalpha(x) sane_istest(x,GIT_ALPHA)
+#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
+#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
+#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL)
+#define tolower(x) sane_case((unsigned char)(x), 0x20)
+#define toupper(x) sane_case((unsigned char)(x), 0)
+
+static inline int sane_case(int x, int high)
+{
+	if (sane_istest(x, GIT_ALPHA))
+		x = (x & ~0x20) | high;
+	return x;
+}
+
+static inline int strtoul_ui(char const *s, int base, unsigned int *result)
+{
+	unsigned long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtoul(s, &p, base);
+	if (errno || *p || p == s || (unsigned int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+static inline int strtol_i(char const *s, int base, int *result)
+{
+	long ul;
+	char *p;
+
+	errno = 0;
+	ul = strtol(s, &p, base);
+	if (errno || *p || p == s || (int) ul != ul)
+		return -1;
+	*result = ul;
+	return 0;
+}
+
+#ifdef INTERNAL_QSORT
+void git_qsort(void *base, size_t nmemb, size_t size,
+	       int(*compar)(const void *, const void *));
+#define qsort git_qsort
+#endif
+
+#ifndef DIR_HAS_BSD_GROUP_SEMANTICS
+# define FORCE_DIR_SET_GID S_ISGID
+#else
+# define FORCE_DIR_SET_GID 0
+#endif
+
+#ifdef NO_NSEC
+#undef USE_NSEC
+#define ST_CTIME_NSEC(st) 0
+#define ST_MTIME_NSEC(st) 0
+#else
+#ifdef USE_ST_TIMESPEC
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec))
+#else
+#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec))
+#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec))
+#endif
+#endif
+
+#endif
diff --git a/Documentation/perf_counter/wrapper.c b/Documentation/perf_counter/wrapper.c
new file mode 100644
index 0000000000000..6350d65f6d9e5
--- /dev/null
+++ b/Documentation/perf_counter/wrapper.c
@@ -0,0 +1,206 @@
+/*
+ * Various trivial helper wrappers around standard functions
+ */
+#include "cache.h"
+
+/*
+ * There's no pack memory to release - but stay close to the Git
+ * version so wrap this away:
+ */
+static inline void release_pack_memory(size_t size, int flag)
+{
+}
+
+char *xstrdup(const char *str)
+{
+	char *ret = strdup(str);
+	if (!ret) {
+		release_pack_memory(strlen(str) + 1, -1);
+		ret = strdup(str);
+		if (!ret)
+			die("Out of memory, strdup failed");
+	}
+	return ret;
+}
+
+void *xmalloc(size_t size)
+{
+	void *ret = malloc(size);
+	if (!ret && !size)
+		ret = malloc(1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = malloc(size);
+		if (!ret && !size)
+			ret = malloc(1);
+		if (!ret)
+			die("Out of memory, malloc failed");
+	}
+#ifdef XMALLOC_POISON
+	memset(ret, 0xA5, size);
+#endif
+	return ret;
+}
+
+/*
+ * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
+ * "data" to the allocated memory, zero terminates the allocated memory,
+ * and returns a pointer to the allocated memory. If the allocation fails,
+ * the program dies.
+ */
+void *xmemdupz(const void *data, size_t len)
+{
+	char *p = xmalloc(len + 1);
+	memcpy(p, data, len);
+	p[len] = '\0';
+	return p;
+}
+
+char *xstrndup(const char *str, size_t len)
+{
+	char *p = memchr(str, '\0', len);
+	return xmemdupz(str, p ? p - str : len);
+}
+
+void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+void *xcalloc(size_t nmemb, size_t size)
+{
+	void *ret = calloc(nmemb, size);
+	if (!ret && (!nmemb || !size))
+		ret = calloc(1, 1);
+	if (!ret) {
+		release_pack_memory(nmemb * size, -1);
+		ret = calloc(nmemb, size);
+		if (!ret && (!nmemb || !size))
+			ret = calloc(1, 1);
+		if (!ret)
+			die("Out of memory, calloc failed");
+	}
+	return ret;
+}
+
+void *xmmap(void *start, size_t length,
+	int prot, int flags, int fd, off_t offset)
+{
+	void *ret = mmap(start, length, prot, flags, fd, offset);
+	if (ret == MAP_FAILED) {
+		if (!length)
+			return NULL;
+		release_pack_memory(length, fd);
+		ret = mmap(start, length, prot, flags, fd, offset);
+		if (ret == MAP_FAILED)
+			die("Out of memory? mmap failed: %s", strerror(errno));
+	}
+	return ret;
+}
+
+/*
+ * xread() is the same a read(), but it automatically restarts read()
+ * operations with a recoverable error (EAGAIN and EINTR). xread()
+ * DOES NOT GUARANTEE that "len" bytes is read even if the data is available.
+ */
+ssize_t xread(int fd, void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = read(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+/*
+ * xwrite() is the same a write(), but it automatically restarts write()
+ * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
+ * GUARANTEE that "len" bytes is written even if the operation is successful.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = write(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t loaded = xread(fd, p, count);
+		if (loaded <= 0)
+			return total ? total : loaded;
+		count -= loaded;
+		p += loaded;
+		total += loaded;
+	}
+
+	return total;
+}
+
+ssize_t write_in_full(int fd, const void *buf, size_t count)
+{
+	const char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t written = xwrite(fd, p, count);
+		if (written < 0)
+			return -1;
+		if (!written) {
+			errno = ENOSPC;
+			return -1;
+		}
+		count -= written;
+		p += written;
+		total += written;
+	}
+
+	return total;
+}
+
+int xdup(int fd)
+{
+	int ret = dup(fd);
+	if (ret < 0)
+		die("dup failed: %s", strerror(errno));
+	return ret;
+}
+
+FILE *xfdopen(int fd, const char *mode)
+{
+	FILE *stream = fdopen(fd, mode);
+	if (stream == NULL)
+		die("Out of memory? fdopen failed: %s", strerror(errno));
+	return stream;
+}
+
+int xmkstemp(char *template)
+{
+	int fd;
+
+	fd = mkstemp(template);
+	if (fd < 0)
+		die("Unable to create temporary file: %s", strerror(errno));
+	return fd;
+}
-- 
GitLab


From 6f06ccbc86f8a02aa32271263249657ce484eb25 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:22:22 +0200
Subject: [PATCH 0541/2198] perf_counter tools: clean up after introduction of
 the Git command framework

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile       | 207 +---------------------
 Documentation/perf_counter/builtin-help.c |   2 -
 Documentation/perf_counter/builtin-top.c  |  17 +-
 Documentation/perf_counter/cache.h        |  20 +++
 Documentation/perf_counter/config.c       |  95 +---------
 Documentation/perf_counter/path.c         |  39 ----
 Documentation/perf_counter/perf.c         |   3 +-
 Documentation/perf_counter/util.h         |  14 ++
 8 files changed, 45 insertions(+), 352 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 11809b943fc3c..1b60265555470 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -138,16 +138,6 @@ all::
 #
 # Define NO_PERL if you do not want Perl scripts or libraries at all.
 #
-# Define NO_TCLTK if you do not want Tcl/Tk GUI.
-#
-# The TCL_PATH variable governs the location of the Tcl interpreter
-# used to optimize perf-gui for your system.  Only used if NO_TCLTK
-# is not set.  Defaults to the bare 'tclsh'.
-#
-# The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter.
-# If not set it defaults to the bare 'wish'. If it is set to the empty
-# string then NO_TCLTK will be forced (this is used by configure script).
-#
 # Define INTERNAL_QSORT to use Git's implementation of qsort(), which
 # is a simplified version of the merge sort used in glibc. This is
 # recommended if Git triggers O(n^2) behavior in your platform's qsort().
@@ -215,12 +205,8 @@ TAR = tar
 FIND = find
 INSTALL = install
 RPMBUILD = rpmbuild
-TCL_PATH = tclsh
-TCLTK_PATH = wish
 PTHREAD_LIBS = -lpthread
 
-export TCL_PATH TCLTK_PATH
-
 # sparse is architecture-neutral, which means that we need to tell it
 # explicitly what architecture to check for. Fix this up for yours..
 SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
@@ -529,10 +515,6 @@ ifdef NO_EXTERNAL_GREP
 	BASIC_CFLAGS += -DNO_EXTERNAL_GREP
 endif
 
-ifeq ($(TCLTK_PATH),)
-NO_TCLTK=NoThanks
-endif
-
 ifeq ($(PERL_PATH),)
 NO_PERL=NoThanks
 endif
@@ -583,7 +565,6 @@ prefix_SQ = $(subst ','\'',$(prefix))
 
 SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
 PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
-TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH))
 
 LIBS = $(PERFLIBS) $(EXTLIBS)
 
@@ -607,14 +588,6 @@ ifneq (,$X)
 endif
 
 all::
-ifndef NO_TCLTK
-	$(QUIET_SUBDIR0)perf-gui $(QUIET_SUBDIR1) perfexecdir='$(perfexec_instdir_SQ)' all
-	$(QUIET_SUBDIR0)perfk-perf $(QUIET_SUBDIR1) all
-endif
-ifndef NO_PERL
-	$(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all
-endif
-	$(QUIET_SUBDIR0)templates $(QUIET_SUBDIR1)
 
 please_set_SHELL_PATH_to_a_more_modern_shell:
 	@$$(:)
@@ -704,21 +677,6 @@ builtin-revert.o wt-status.o: wt-status.h
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
-doc:
-	$(MAKE) -C Documentation all
-
-man:
-	$(MAKE) -C Documentation man
-
-html:
-	$(MAKE) -C Documentation html
-
-info:
-	$(MAKE) -C Documentation info
-
-pdf:
-	$(MAKE) -C Documentation pdf
-
 TAGS:
 	$(RM) TAGS
 	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
@@ -751,33 +709,12 @@ PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
 	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
 	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
 
-### Detect Tck/Tk interpreter path changes
-ifndef NO_TCLTK
-TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)')
-
-PERF-GUI-VARS: .FORCE-PERF-GUI-VARS
-	@VARS='$(TRACK_VARS)'; \
-	    if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \
-		echo 1>&2 "    * new Tcl/Tk interpreter location"; \
-		echo "$$VARS" >$@; \
-            fi
-
-.PHONY: .FORCE-PERF-GUI-VARS
-endif
-
 ### Testing rules
 
-TEST_PROGRAMS += test-chmtime$X
-TEST_PROGRAMS += test-ctype$X
-TEST_PROGRAMS += test-date$X
-TEST_PROGRAMS += test-delta$X
-TEST_PROGRAMS += test-dump-cache-tree$X
-TEST_PROGRAMS += test-genrandom$X
-TEST_PROGRAMS += test-match-trees$X
-TEST_PROGRAMS += test-parse-options$X
-TEST_PROGRAMS += test-path-utils$X
-TEST_PROGRAMS += test-sha1$X
-TEST_PROGRAMS += test-sigchain$X
+#
+# None right now:
+#
+# TEST_PROGRAMS += test-something$X
 
 all:: $(TEST_PROGRAMS)
 
@@ -787,25 +724,6 @@ all:: $(TEST_PROGRAMS)
 
 export NO_SVN_TESTS
 
-test: all
-	$(MAKE) -C t/ all
-
-test-ctype$X: ctype.o
-
-test-date$X: date.o ctype.o
-
-test-delta$X: diff-delta.o patch-delta.o
-
-test-parse-options$X: parse-options.o
-
-.PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS))
-
-test-%$X: test-%.o $(PERFLIBS)
-	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
-
-check-sha1:: test-sha1$X
-	./test-sha1.sh
-
 check: common-cmds.h
 	if sparse; \
 	then \
@@ -845,10 +763,6 @@ install: all
 	$(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
 	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
-ifndef NO_TCLTK
-	$(MAKE) -C perfk-perf install
-	$(MAKE) -C perf-gui perfexecdir='$(perfexec_instdir_SQ)' install
-endif
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
@@ -865,32 +779,6 @@ endif
 	  done } && \
 	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X"
 
-install-doc:
-	$(MAKE) -C Documentation install
-
-install-man:
-	$(MAKE) -C Documentation install-man
-
-install-html:
-	$(MAKE) -C Documentation install-html
-
-install-info:
-	$(MAKE) -C Documentation install-info
-
-install-pdf:
-	$(MAKE) -C Documentation install-pdf
-
-quick-install-doc:
-	$(MAKE) -C Documentation quick-install
-
-quick-install-man:
-	$(MAKE) -C Documentation quick-install-man
-
-quick-install-html:
-	$(MAKE) -C Documentation quick-install-html
-
-
-
 ### Maintainer's dist rules
 
 perf.spec: perf.spec.in
@@ -904,38 +792,16 @@ dist: perf.spec perf-archive$(X) configure
 	@mkdir -p $(PERF_TARNAME)
 	@cp perf.spec configure $(PERF_TARNAME)
 	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-	@$(MAKE) -C perf-gui TARDIR=../$(PERF_TARNAME)/perf-gui dist-version
 	$(TAR) rf $(PERF_TARNAME).tar \
 		$(PERF_TARNAME)/perf.spec \
 		$(PERF_TARNAME)/configure \
-		$(PERF_TARNAME)/version \
-		$(PERF_TARNAME)/perf-gui/version
+		$(PERF_TARNAME)/version
 	@$(RM) -r $(PERF_TARNAME)
 	gzip -f -9 $(PERF_TARNAME).tar
 
 rpm: dist
 	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
-htmldocs = perf-htmldocs-$(PERF_VERSION)
-manpages = perf-manpages-$(PERF_VERSION)
-dist-doc:
-	$(RM) -r .doc-tmp-dir
-	mkdir .doc-tmp-dir
-	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-	gzip -n -9 -f $(htmldocs).tar
-	:
-	$(RM) -r .doc-tmp-dir
-	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-	$(MAKE) -C Documentation DESTDIR=./ \
-		man1dir=../.doc-tmp-dir/man1 \
-		man5dir=../.doc-tmp-dir/man5 \
-		man7dir=../.doc-tmp-dir/man7 \
-		install
-	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-	gzip -n -9 -f $(manpages).tar
-	$(RM) -r .doc-tmp-dir
-
 ### Cleaning rules
 
 distclean: clean
@@ -951,74 +817,13 @@ clean:
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
 	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
-	$(MAKE) -C Documentation/ clean
-	$(MAKE) -C templates/ clean
-	$(MAKE) -C t/ clean
-ifndef NO_TCLTK
-	$(MAKE) -C perfk-perf clean
-	$(MAKE) -C perf-gui clean
-endif
-	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-GUI-VARS PERF-BUILD-OPTIONS
+	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
 .PHONY: .FORCE-PERF-BUILD-OPTIONS
 
-### Check documentation
-#
-check-docs::
-	@(for v in $(ALL_PROGRAMS) $(BUILT_INS) perf perfk; \
-	do \
-		case "$$v" in \
-		perf-merge-octopus | perf-merge-ours | perf-merge-recursive | \
-		perf-merge-resolve | perf-merge-subtree | \
-		perf-fsck-objects | perf-init-db | \
-		perf-?*--?* ) continue ;; \
-		esac ; \
-		test -f "Documentation/$$v.txt" || \
-		echo "no doc: $$v"; \
-		sed -e '/^#/d' command-list.txt | \
-		grep -q "^$$v[ 	]" || \
-		case "$$v" in \
-		perf) ;; \
-		*) echo "no link: $$v";; \
-		esac ; \
-	done; \
-	( \
-		sed -e '/^#/d' \
-		    -e 's/[ 	].*//' \
-		    -e 's/^/listed /' command-list.txt; \
-		ls -1 Documentation/perf*txt | \
-		sed -e 's|Documentation/|documented |' \
-		    -e 's/\.txt//'; \
-	) | while read how cmd; \
-	do \
-		case "$$how,$$cmd" in \
-		*,perf-citool | \
-		*,perf-gui | \
-		*,perf-help | \
-		documented,perfattributes | \
-		documented,perfignore | \
-		documented,perfmodules | \
-		documented,perfcli | \
-		documented,perf-tools | \
-		documented,perfcore-tutorial | \
-		documented,perfcvs-migration | \
-		documented,perfdiffcore | \
-		documented,perfglossary | \
-		documented,perfhooks | \
-		documented,perfrepository-layout | \
-		documented,perftutorial | \
-		documented,perftutorial-2 | \
-		sentinel,not,matching,is,ok ) continue ;; \
-		esac; \
-		case " $(ALL_PROGRAMS) $(BUILT_INS) perf perfk " in \
-		*" $$cmd "*)	;; \
-		*) echo "removed but $$how: $$cmd" ;; \
-		esac; \
-	done ) | sort
-
 ### Make sure built-ins do not have dups and listed in perf.c
 #
 check-builtins::
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index 125fcc2f4901d..a136d619db371 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -417,11 +417,9 @@ static void show_html_page(const char *perf_cmd)
 
 int cmd_help(int argc, const char **argv, const char *prefix)
 {
-	int nonperf;
 	const char *alias;
 	load_command_list("perf-", &main_cmds, &other_cmds);
 
-	/* setup_perf_directory_gently(&nonperf); */
 	perf_config(perf_help_config, NULL);
 
 	argc = parse_options(argc, argv, builtin_help_options,
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 9d2c769e5f835..601bddbc30d5f 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -63,15 +63,6 @@
 
 #include "util.h"
 
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
 #include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
@@ -103,8 +94,6 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE   31
 #define PR_TASK_PERF_COUNTERS_ENABLE    32
 
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
 #define rdclock()                                       \
 ({                                                      \
         struct timespec ts;                             \
@@ -1077,7 +1066,7 @@ static void process_event(uint64_t ip, int counter)
 	record_ip(ip, counter);
 }
 
-static void process_options(int argc, char *argv[])
+static void process_options(int argc, char **argv)
 {
 	int error = 0, counter;
 
@@ -1255,7 +1244,7 @@ static void mmap_read(struct mmap_data *md)
 
 		event_t event_copy;
 
-		unsigned int size = event->header.size;
+		size_t size = event->header.size;
 
 		/*
 		 * Event straddles the mmap boundary -- header should always
@@ -1301,7 +1290,7 @@ static void mmap_read(struct mmap_data *md)
 	md->prev = old;
 }
 
-int cmd_top(int argc, const char **argv, const char *prefix)
+int cmd_top(int argc, char **argv, const char *prefix)
 {
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h
index dc085640a57d4..71080512fa862 100644
--- a/Documentation/perf_counter/cache.h
+++ b/Documentation/perf_counter/cache.h
@@ -94,4 +94,24 @@ static inline int is_absolute_path(const char *path)
 {
 	return path[0] == '/';
 }
+
+const char *make_absolute_path(const char *path);
+const char *make_nonrelative_path(const char *path);
+const char *make_relative_path(const char *abs, const char *base);
+int normalize_path_copy(char *dst, const char *src);
+int longest_ancestor_length(const char *path, const char *prefix_list);
+char *strip_path_suffix(const char *path, const char *suffix);
+
+extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+
+extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...)
+	__attribute__((format (printf, 3, 4)));
+extern char *perf_pathdup(const char *fmt, ...)
+	__attribute__((format (printf, 1, 2)));
+
+extern size_t strlcpy(char *dest, const char *src, size_t size);
+
 #endif /* CACHE_H */
diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c
index 672d53959334c..3dd13faa6a27f 100644
--- a/Documentation/perf_counter/config.c
+++ b/Documentation/perf_counter/config.c
@@ -15,7 +15,6 @@ static FILE *config_file;
 static const char *config_file_name;
 static int config_linenr;
 static int config_file_eof;
-static int zlib_compression_seen;
 
 const char *config_exclusive_filename = NULL;
 
@@ -533,14 +532,6 @@ static int store_aux(const char* key, const char* value, void *cb)
 	return 0;
 }
 
-static int write_error(const char *filename)
-{
-	error("failed to write new configuration file %s", filename);
-
-	/* Same error code as "failed to rename". */
-	return 4;
-}
-
 static int store_write_section(int fd, const char* key)
 {
 	const char *dot;
@@ -673,7 +664,7 @@ int perf_config_set_multivar(const char* key, const char* value,
 {
 	int i, dot;
 	int fd = -1, in_fd;
-	int ret;
+	int ret = 0;
 	char* config_filename;
 	const char* last_dot = strrchr(key, '.');
 
@@ -872,90 +863,6 @@ int perf_config_set_multivar(const char* key, const char* value,
 
 }
 
-static int section_name_match (const char *buf, const char *name)
-{
-	int i = 0, j = 0, dot = 0;
-	for (; buf[i] && buf[i] != ']'; i++) {
-		if (!dot && isspace(buf[i])) {
-			dot = 1;
-			if (name[j++] != '.')
-				break;
-			for (i++; isspace(buf[i]); i++)
-				; /* do nothing */
-			if (buf[i] != '"')
-				break;
-			continue;
-		}
-		if (buf[i] == '\\' && dot)
-			i++;
-		else if (buf[i] == '"' && dot) {
-			for (i++; isspace(buf[i]); i++)
-				; /* do_nothing */
-			break;
-		}
-		if (buf[i] != name[j++])
-			break;
-	}
-	return (buf[i] == ']' && name[j] == 0);
-}
-
-/* if new_name == NULL, the section is removed instead */
-int perf_config_rename_section(const char *old_name, const char *new_name)
-{
-	int ret = 0, remove = 0;
-	char *config_filename;
-	int out_fd;
-	char buf[1024];
-
-	if (config_exclusive_filename)
-		config_filename = strdup(config_exclusive_filename);
-	else
-		config_filename = perf_pathdup("config");
-	if (out_fd < 0) {
-		ret = error("could not lock config file %s", config_filename);
-		goto out;
-	}
-
-	if (!(config_file = fopen(config_filename, "rb"))) {
-		/* no config file means nothing to rename, no error */
-		goto unlock_and_out;
-	}
-
-	while (fgets(buf, sizeof(buf), config_file)) {
-		int i;
-		int length;
-		for (i = 0; buf[i] && isspace(buf[i]); i++)
-			; /* do nothing */
-		if (buf[i] == '[') {
-			/* it's a section */
-			if (section_name_match (&buf[i+1], old_name)) {
-				ret++;
-				if (new_name == NULL) {
-					remove = 1;
-					continue;
-				}
-				store.baselen = strlen(new_name);
-				if (!store_write_section(out_fd, new_name)) {
-					goto out;
-				}
-				continue;
-			}
-			remove = 0;
-		}
-		if (remove)
-			continue;
-		length = strlen(buf);
-		if (write_in_full(out_fd, buf, length) != length) {
-			goto out;
-		}
-	}
-	fclose(config_file);
- unlock_and_out:
- out:
-	free(config_filename);
-	return ret;
-}
-
 /*
  * Call this to report error for your variable that should not
  * get a boolean value (i.e. "[my] var" means "true").
diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c
index 891b612ec1a9d..a501a40dd2cbe 100644
--- a/Documentation/perf_counter/path.c
+++ b/Documentation/perf_counter/path.c
@@ -161,45 +161,6 @@ int perf_mkstemp(char *path, size_t len, const char *template)
 }
 
 
-static char *user_path(char *buf, char *path, int sz)
-{
-	struct passwd *pw;
-	char *slash;
-	int len, baselen;
-
-	if (!path || path[0] != '~')
-		return NULL;
-	path++;
-	slash = strchr(path, '/');
-	if (path[0] == '/' || !path[0]) {
-		pw = getpwuid(getuid());
-	}
-	else {
-		if (slash) {
-			*slash = 0;
-			pw = getpwnam(path);
-			*slash = '/';
-		}
-		else
-			pw = getpwnam(path);
-	}
-	if (!pw || !pw->pw_dir || sz <= strlen(pw->pw_dir))
-		return NULL;
-	baselen = strlen(pw->pw_dir);
-	memcpy(buf, pw->pw_dir, baselen);
-	while ((1 < baselen) && (buf[baselen-1] == '/')) {
-		buf[baselen-1] = 0;
-		baselen--;
-	}
-	if (slash && slash[1]) {
-		len = strlen(slash);
-		if (sz <= baselen + len)
-			return NULL;
-		memcpy(buf + baselen, slash, len + 1);
-	}
-	return buf;
-}
-
 const char *make_relative_path(const char *abs, const char *base)
 {
 	static char buf[PATH_MAX + 1];
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 9256f6a164460..63f8a892c0dff 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -1,7 +1,7 @@
 #include "builtin.h"
 #include "exec_cmd.h"
 #include "cache.h"
-//#include "quote.h"
+#include "quote.h"
 #include "run-command.h"
 
 const char perf_usage_string[] =
@@ -132,7 +132,6 @@ static int handle_alias(int *argcp, const char ***argv)
 	const char** new_argv;
 	const char *alias_command;
 	char *alias_string;
-	int unused_nonperf;
 
 	alias_command = (*argv)[0];
 	alias_string = alias_lookup(alias_command);
diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h
index 13f8bdce76007..36e40c38e0939 100644
--- a/Documentation/perf_counter/util.h
+++ b/Documentation/perf_counter/util.h
@@ -295,6 +295,20 @@ static inline char *gitstrchrnul(const char *s, int c)
 }
 #endif
 
+/*
+ * Wrappers:
+ */
+extern char *xstrdup(const char *str);
+extern void *xmalloc(size_t size);
+extern void *xmemdupz(const void *data, size_t len);
+extern char *xstrndup(const char *str, size_t len);
+extern void *xrealloc(void *ptr, size_t size);
+extern void *xcalloc(size_t nmemb, size_t size);
+extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset);
+extern ssize_t xread(int fd, void *buf, size_t len);
+extern ssize_t xwrite(int fd, const void *buf, size_t len);
+extern int xdup(int fd);
+extern FILE *xfdopen(int fd, const char *mode);
 static inline size_t xsize_t(off_t len)
 {
 	return (size_t)len;
-- 
GitLab


From ddcacfa0febff6454dba6cea1931f3020a9f6c24 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:37:32 +0200
Subject: [PATCH 0542/2198] perf_counter tools: separate kerneltop into 'perf
 top' and 'perf stat'

Lets use the Git framework of built-in commands.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |    1 +
 Documentation/perf_counter/builtin-stat.c   |  592 ++++++++
 Documentation/perf_counter/builtin-top.c    |  204 +--
 Documentation/perf_counter/builtin.h        |    3 +-
 Documentation/perf_counter/command-list.txt |    1 +
 Documentation/perf_counter/kerneltop.c      | 1409 -------------------
 Documentation/perf_counter/perf.c           |    1 +
 7 files changed, 601 insertions(+), 1610 deletions(-)
 create mode 100644 Documentation/perf_counter/builtin-stat.c
 delete mode 100644 Documentation/perf_counter/kerneltop.c

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 1b60265555470..fb8b71744e59f 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -309,6 +309,7 @@ LIB_OBJS += usage.o
 LIB_OBJS += wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
 PERFLIBS = $(LIB_FILE)
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
new file mode 100644
index 0000000000000..169a2d1783fca
--- /dev/null
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -0,0 +1,592 @@
+/*
+ * kerneltop.c: show top kernel functions - performance counters showcase
+
+   Build with:
+
+     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
+
+   Sample output:
+
+------------------------------------------------------------------------------
+ KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+------------------------------------------------------------------------------
+
+             weight         RIP          kernel function
+             ______   ________________   _______________
+
+              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
+              33.00 - ffffffff804cb740 : sock_alloc_send_skb
+              31.26 - ffffffff804ce808 : skb_push
+              22.43 - ffffffff80510004 : tcp_established_options
+              19.00 - ffffffff8027d250 : find_get_page
+              15.76 - ffffffff804e4fc9 : eth_type_trans
+              15.20 - ffffffff804d8baa : dst_release
+              14.86 - ffffffff804cf5d8 : skb_release_head_state
+              14.00 - ffffffff802217d5 : read_hpet
+              12.00 - ffffffff804ffb7f : __ip_local_out
+              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
+               8.54 - ffffffff805001a3 : ip_queue_xmit
+ */
+
+/*
+ * perfstat:  /usr/bin/time -alike performance counter statistics utility
+
+          It summarizes the counter events of all tasks (and child tasks),
+          covering all CPUs that the command (or workload) executes on.
+          It only counts the per-task events of the workload started,
+          independent of how many other tasks run on those CPUs.
+
+   Sample output:
+
+   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+
+   Performance counter stats for 'ls':
+
+           163516953 instructions
+                2295 cache-misses
+             2855182 branch-misses
+ */
+
+ /*
+  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+  *
+  * Improvements and fixes by:
+  *
+  *   Arjan van de Ven <arjan@linux.intel.com>
+  *   Yanmin Zhang <yanmin.zhang@intel.com>
+  *   Wu Fengguang <fengguang.wu@intel.com>
+  *   Mike Galbraith <efault@gmx.de>
+  *   Paul Mackerras <paulus@samba.org>
+  *
+  * Released under the GPL v2. (and only v2, not any later version)
+  */
+
+#include "util.h"
+
+#include <getopt.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <sched.h>
+#include <pthread.h>
+
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/uio.h>
+#include <sys/mman.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define rdclock()                                       \
+({                                                      \
+        struct timespec ts;                             \
+                                                        \
+        clock_gettime(CLOCK_MONOTONIC, &ts);            \
+        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 295
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 333
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb() 		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+extern asmlinkage int sys_perf_counter_open(
+        struct perf_counter_hw_event    *hw_event_uptr          __user,
+        pid_t                           pid,
+        int                             cpu,
+        int                             group_fd,
+        unsigned long                   flags);
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+static int			system_wide			=  0;
+
+static int			nr_counters			=  0;
+static __u64			event_id[MAX_COUNTERS]		= {
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
+	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
+	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+};
+static int			default_interval = 100000;
+static int			event_count[MAX_COUNTERS];
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int			tid				= -1;
+static int			profile_cpu			= -1;
+static int			nr_cpus				=  0;
+static int			nmi				=  1;
+static int			group				=  0;
+static unsigned int		page_size;
+
+static int			zero;
+
+static int			scale;
+
+static const unsigned int default_count[] = {
+	1000000,
+	1000000,
+	  10000,
+	  10000,
+	1000000,
+	  10000,
+};
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static void display_events_help(void)
+{
+	unsigned int i;
+	__u64 e;
+
+	printf(
+	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		printf("\n                             %d:%d: %-20s",
+				type, id, event_symbols[i].symbol);
+	}
+
+	printf("\n"
+	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
+}
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perfstat [<events...>] <cmd...>\n\n"
+	"PerfStat Options (up to %d event types can be specified):\n\n",
+		 MAX_COUNTERS);
+
+	display_events_help();
+
+	printf(
+	" -l                           # scale counter values\n"
+	" -a                           # system-wide collection\n");
+	exit(0);
+}
+
+static char *event_name(int ctr)
+{
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
+	static char buf[32];
+
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(char *str)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+
+/*
+ * perfstat
+ */
+
+char fault_here[1000000];
+
+static void create_perfstat_counter(int counter)
+{
+	struct perf_counter_hw_event hw_event;
+
+	memset(&hw_event, 0, sizeof(hw_event));
+	hw_event.config		= event_id[counter];
+	hw_event.record_type	= 0;
+	hw_event.nmi		= 0;
+	if (scale)
+		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+					  PERF_FORMAT_TOTAL_TIME_RUNNING;
+
+	if (system_wide) {
+		int cpu;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			if (fd[cpu][counter] < 0) {
+				printf("perfstat error: syscall returned with %d (%s)\n",
+						fd[cpu][counter], strerror(errno));
+				exit(-1);
+			}
+		}
+	} else {
+		hw_event.inherit	= 1;
+		hw_event.disabled	= 1;
+
+		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		if (fd[0][counter] < 0) {
+			printf("perfstat error: syscall returned with %d (%s)\n",
+					fd[0][counter], strerror(errno));
+			exit(-1);
+		}
+	}
+}
+
+int do_perfstat(int argc, char *argv[])
+{
+	unsigned long long t0, t1;
+	int counter;
+	ssize_t res;
+	int status;
+	int pid;
+
+	if (!system_wide)
+		nr_cpus = 1;
+
+	for (counter = 0; counter < nr_counters; counter++)
+		create_perfstat_counter(counter);
+
+	argc -= optind;
+	argv += optind;
+
+	if (!argc)
+		display_help();
+
+	/*
+	 * Enable counters and exec the command:
+	 */
+	t0 = rdclock();
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	if ((pid = fork()) < 0)
+		perror("failed to fork");
+	if (!pid) {
+		if (execvp(argv[0], argv)) {
+			perror(argv[0]);
+			exit(-1);
+		}
+	}
+	while (wait(&status) >= 0)
+		;
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+	t1 = rdclock();
+
+	fflush(stdout);
+
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Performance counter stats for \'%s\':\n",
+		argv[0]);
+	fprintf(stderr, "\n");
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		int cpu, nv;
+		__u64 count[3], single_count[3];
+		int scaled;
+
+		count[0] = count[1] = count[2] = 0;
+		nv = scale ? 3 : 1;
+		for (cpu = 0; cpu < nr_cpus; cpu ++) {
+			res = read(fd[cpu][counter],
+				   single_count, nv * sizeof(__u64));
+			assert(res == nv * sizeof(__u64));
+
+			count[0] += single_count[0];
+			if (scale) {
+				count[1] += single_count[1];
+				count[2] += single_count[2];
+			}
+		}
+
+		scaled = 0;
+		if (scale) {
+			if (count[2] == 0) {
+				fprintf(stderr, " %14s  %-20s\n",
+					"<not counted>", event_name(counter));
+				continue;
+			}
+			if (count[2] < count[1]) {
+				scaled = 1;
+				count[0] = (unsigned long long)
+					((double)count[0] * count[1] / count[2] + 0.5);
+			}
+		}
+
+		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
+		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+
+			double msecs = (double)count[0] / 1000000;
+
+			fprintf(stderr, " %14.6f  %-20s (msecs)",
+				msecs, event_name(counter));
+		} else {
+			fprintf(stderr, " %14Ld  %-20s (events)",
+				count[0], event_name(counter));
+		}
+		if (scaled)
+			fprintf(stderr, "  (scaled from %.2f%%)",
+				(double) count[2] / count[1] * 100);
+		fprintf(stderr, "\n");
+	}
+	fprintf(stderr, "\n");
+	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
+			(double)(t1-t0)/1e6);
+	fprintf(stderr, "\n");
+
+	return 0;
+}
+
+static void process_options(int argc, char **argv)
+{
+	int error = 0, counter;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"count",	required_argument,	NULL, 'c'},
+			{"cpu",		required_argument,	NULL, 'C'},
+			{"delay",	required_argument,	NULL, 'd'},
+			{"dump_symtab",	no_argument,		NULL, 'D'},
+			{"event",	required_argument,	NULL, 'e'},
+			{"filter",	required_argument,	NULL, 'f'},
+			{"group",	required_argument,	NULL, 'g'},
+			{"help",	no_argument,		NULL, 'h'},
+			{"nmi",		required_argument,	NULL, 'n'},
+			{"munmap_info",	no_argument,		NULL, 'U'},
+			{"pid",		required_argument,	NULL, 'p'},
+			{"realtime",	required_argument,	NULL, 'r'},
+			{"scale",	no_argument,		NULL, 'l'},
+			{"symbol",	required_argument,	NULL, 's'},
+			{"stat",	no_argument,		NULL, 'S'},
+			{"vmlinux",	required_argument,	NULL, 'x'},
+			{"zero",	no_argument,		NULL, 'z'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'a': system_wide			=	       1; break;
+		case 'c': default_interval		=   atoi(optarg); break;
+		case 'C':
+			/* CPU and PID are mutually exclusive */
+			if (tid != -1) {
+				printf("WARNING: CPU switch overriding PID\n");
+				sleep(1);
+				tid = -1;
+			}
+			profile_cpu			=   atoi(optarg); break;
+
+		case 'e': error				= parse_events(optarg); break;
+
+		case 'g': group				=   atoi(optarg); break;
+		case 'h':      				  display_help(); break;
+		case 'l': scale				=	       1; break;
+		case 'n': nmi				=   atoi(optarg); break;
+		case 'p':
+			/* CPU and PID are mutually exclusive */
+			if (profile_cpu != -1) {
+				printf("WARNING: PID switch overriding CPU\n");
+				sleep(1);
+				profile_cpu = -1;
+			}
+			tid				=   atoi(optarg); break;
+		case 'z': zero				=              1; break;
+		default: error = 1; break;
+		}
+	}
+	if (error)
+		display_help();
+
+	if (!nr_counters) {
+		nr_counters = 8;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+}
+
+int cmd_stat(int argc, char **argv, const char *prefix)
+{
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	process_options(argc, argv);
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	return do_perfstat(argc, argv);
+}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 601bddbc30d5f..98e8690b6bcb0 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -28,25 +28,6 @@
                8.54 - ffffffff805001a3 : ip_queue_xmit
  */
 
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
-
-          It summarizes the counter events of all tasks (and child tasks),
-          covering all CPUs that the command (or workload) executes on.
-          It only counts the per-task events of the workload started,
-          independent of how many other tasks run on those CPUs.
-
-   Sample output:
-
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
-   Performance counter stats for 'ls':
-
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
- */
-
  /*
   * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
   *
@@ -149,7 +130,6 @@ asmlinkage int sys_perf_counter_open(
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-static int			run_perfstat			=  0;
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
@@ -203,7 +183,7 @@ struct source_line {
 static struct source_line	*lines;
 static struct source_line	**lines_tail;
 
-const unsigned int default_count[] = {
+static const unsigned int default_count[] = {
 	1000000,
 	1000000,
 	  10000,
@@ -291,26 +271,8 @@ static void display_events_help(void)
 	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
 }
 
-static void display_perfstat_help(void)
-{
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -l                           # scale counter values\n"
-	" -a                           # system-wide collection\n");
-	exit(0);
-}
-
 static void display_help(void)
 {
-	if (run_perfstat)
-		return display_perfstat_help();
-
 	printf(
 	"Usage: kerneltop [<options>]\n"
 	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
@@ -320,8 +282,6 @@ static void display_help(void)
 	display_events_help();
 
 	printf(
-	" -S        --stat             # perfstat COMMAND\n"
-	" -a                           # system-wide collection (for perfstat)\n\n"
 	" -c CNT    --count=CNT        # event period to sample\n\n"
 	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
 	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
@@ -420,151 +380,6 @@ static int parse_events(char *str)
 	return 0;
 }
 
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
-static void create_perfstat_counter(int counter)
-{
-	struct perf_counter_hw_event hw_event;
-
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.config		= event_id[counter];
-	hw_event.record_type	= 0;
-	hw_event.nmi		= 0;
-	if (scale)
-		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-					  PERF_FORMAT_TOTAL_TIME_RUNNING;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-		}
-	} else {
-		hw_event.inherit	= 1;
-		hw_event.disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-int do_perfstat(int argc, char *argv[])
-{
-	unsigned long long t0, t1;
-	int counter;
-	ssize_t res;
-	int status;
-	int pid;
-
-	if (!system_wide)
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
-
-	argc -= optind;
-	argv += optind;
-
-	if (!argc)
-		display_help();
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-	while (wait(&status) >= 0)
-		;
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu, nv;
-		__u64 count[3], single_count[3];
-		int scaled;
-
-		count[0] = count[1] = count[2] = 0;
-		nv = scale ? 3 : 1;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-				   single_count, nv * sizeof(__u64));
-			assert(res == nv * sizeof(__u64));
-
-			count[0] += single_count[0];
-			if (scale) {
-				count[1] += single_count[1];
-				count[2] += single_count[2];
-			}
-		}
-
-		scaled = 0;
-		if (scale) {
-			if (count[2] == 0) {
-				fprintf(stderr, " %14s  %-20s\n",
-					"<not counted>", event_name(counter));
-				continue;
-			}
-			if (count[2] < count[1]) {
-				scaled = 1;
-				count[0] = (unsigned long long)
-					((double)count[0] * count[1] / count[2] + 0.5);
-			}
-		}
-
-		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
-		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
-
-			double msecs = (double)count[0] / 1000000;
-
-			fprintf(stderr, " %14.6f  %-20s (msecs)",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)",
-				count[0], event_name(counter));
-		}
-		if (scaled)
-			fprintf(stderr, "  (scaled from %.2f%%)",
-				(double) count[2] / count[1] * 100);
-		fprintf(stderr, "\n");
-	}
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-
 /*
  * Symbols
  */
@@ -805,7 +620,7 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 	return 0;
 }
 
-int compare_addr(const void *__sym1, const void *__sym2)
+static int compare_addr(const void *__sym1, const void *__sym2)
 {
 	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
 
@@ -1070,9 +885,6 @@ static void process_options(int argc, char **argv)
 {
 	int error = 0, counter;
 
-	if (strstr(argv[0], "perfstat"))
-		run_perfstat = 1;
-
 	for (;;) {
 		int option_index = 0;
 		/** Options for getopt */
@@ -1134,7 +946,6 @@ static void process_options(int argc, char **argv)
 			tid				=   atoi(optarg); break;
 		case 'r': realtime_prio			=   atoi(optarg); break;
 		case 's': sym_filter			= strdup(optarg); break;
-		case 'S': run_perfstat			=	       1; break;
 		case 'x': vmlinux			= strdup(optarg); break;
 		case 'z': zero				=              1; break;
 		case 'm': mmap_pages			=   atoi(optarg); break;
@@ -1147,12 +958,8 @@ static void process_options(int argc, char **argv)
 		display_help();
 
 	if (!nr_counters) {
-		if (run_perfstat)
-			nr_counters = 8;
-		else {
-			nr_counters = 1;
-			event_id[0] = 0;
-		}
+		nr_counters = 1;
+		event_id[0] = 0;
 	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
@@ -1308,9 +1115,6 @@ int cmd_top(int argc, char **argv, const char *prefix)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	if (run_perfstat)
-		return do_perfstat(argc, argv);
-
 	if (tid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 41637444ce2dc..a3bb6cd6bed37 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,5 +14,6 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, char **argv, const char *prefix);
+extern int cmd_stat(int argc, char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 1eab3659b2066..52455d46bfb54 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,4 +1,5 @@
 # List of known perf commands.
 # command name				category [deprecated] [common]
 perf-top                                mainporcelain common
+perf-stat                               mainporcelain common
 
diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c
deleted file mode 100644
index 042c1b83a8726..0000000000000
--- a/Documentation/perf_counter/kerneltop.c
+++ /dev/null
@@ -1,1409 +0,0 @@
-/*
- * kerneltop.c: show top kernel functions - performance counters showcase
-
-   Build with:
-
-     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
-
-   Sample output:
-
-------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
-------------------------------------------------------------------------------
-
-             weight         RIP          kernel function
-             ______   ________________   _______________
-
-              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
-              33.00 - ffffffff804cb740 : sock_alloc_send_skb
-              31.26 - ffffffff804ce808 : skb_push
-              22.43 - ffffffff80510004 : tcp_established_options
-              19.00 - ffffffff8027d250 : find_get_page
-              15.76 - ffffffff804e4fc9 : eth_type_trans
-              15.20 - ffffffff804d8baa : dst_release
-              14.86 - ffffffff804cf5d8 : skb_release_head_state
-              14.00 - ffffffff802217d5 : read_hpet
-              12.00 - ffffffff804ffb7f : __ip_local_out
-              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
-               8.54 - ffffffff805001a3 : ip_queue_xmit
- */
-
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
-
-          It summarizes the counter events of all tasks (and child tasks),
-          covering all CPUs that the command (or workload) executes on.
-          It only counts the per-task events of the workload started,
-          independent of how many other tasks run on those CPUs.
-
-   Sample output:
-
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
-
-   Performance counter stats for 'ls':
-
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
- */
-
- /*
-  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
-  *
-  * Improvements and fixes by:
-  *
-  *   Arjan van de Ven <arjan@linux.intel.com>
-  *   Yanmin Zhang <yanmin.zhang@intel.com>
-  *   Wu Fengguang <fengguang.wu@intel.com>
-  *   Mike Galbraith <efault@gmx.de>
-  *   Paul Mackerras <paulus@samba.org>
-  *
-  * Released under the GPL v2. (and only v2, not any later version)
-  */
-
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <getopt.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
-
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 295
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 333
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags)
-{
-        return syscall(
-                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-}
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
-
-static int			run_perfstat			=  0;
-static int			system_wide			=  0;
-
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= {
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
-
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
-};
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static __u64			count_filter		       = 100;
-
-static int			tid				= -1;
-static int			profile_cpu			= -1;
-static int			nr_cpus				=  0;
-static int			nmi				=  1;
-static unsigned int		realtime_prio			=  0;
-static int			group				=  0;
-static unsigned int		page_size;
-static unsigned int		mmap_pages			=  16;
-static int			use_mmap			= 0;
-static int			use_munmap			= 0;
-
-static char			*vmlinux;
-
-static char			*sym_filter;
-static unsigned long		filter_start;
-static unsigned long		filter_end;
-
-static int			delay_secs			=  2;
-static int			zero;
-static int			dump_symtab;
-
-static int			scale;
-
-struct source_line {
-	uint64_t		EIP;
-	unsigned long		count;
-	char			*line;
-	struct source_line	*next;
-};
-
-static struct source_line	*lines;
-static struct source_line	**lines_tail;
-
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
-
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_perfstat_help(void)
-{
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -l                           # scale counter values\n"
-	" -a                           # system-wide collection\n");
-	exit(0);
-}
-
-static void display_help(void)
-{
-	if (run_perfstat)
-		return display_perfstat_help();
-
-	printf(
-	"Usage: kerneltop [<options>]\n"
-	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
-	"KernelTop Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -S        --stat             # perfstat COMMAND\n"
-	" -a                           # system-wide collection (for perfstat)\n\n"
-	" -c CNT    --count=CNT        # event period to sample\n\n"
-	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
-	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
-	" -l                           # show scale factor for RR events\n"
-	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
-	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
-	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
-	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
-	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
-	" -z        --zero             # zero counts after display\n"
-	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -M        --mmap_info        # print mmap info stream\n"
-	" -U        --munmap_info      # print munmap info stream\n"
-	);
-
-	exit(0);
-}
-
-static char *event_name(int ctr)
-{
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
-	static char buf[32];
-
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
-		return "unknown-hardware";
-
-	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
-static void create_perfstat_counter(int counter)
-{
-	struct perf_counter_hw_event hw_event;
-
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.config		= event_id[counter];
-	hw_event.record_type	= 0;
-	hw_event.nmi		= 0;
-	if (scale)
-		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-					  PERF_FORMAT_TOTAL_TIME_RUNNING;
-
-	if (system_wide) {
-		int cpu;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
-			}
-		}
-	} else {
-		hw_event.inherit	= 1;
-		hw_event.disabled	= 1;
-
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
-		}
-	}
-}
-
-int do_perfstat(int argc, char *argv[])
-{
-	unsigned long long t0, t1;
-	int counter;
-	ssize_t res;
-	int status;
-	int pid;
-
-	if (!system_wide)
-		nr_cpus = 1;
-
-	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
-
-	argc -= optind;
-	argv += optind;
-
-	if (!argc)
-		display_help();
-
-	/*
-	 * Enable counters and exec the command:
-	 */
-	t0 = rdclock();
-	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
-
-	if ((pid = fork()) < 0)
-		perror("failed to fork");
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
-		}
-	}
-	while (wait(&status) >= 0)
-		;
-	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
-	t1 = rdclock();
-
-	fflush(stdout);
-
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
-	fprintf(stderr, "\n");
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu, nv;
-		__u64 count[3], single_count[3];
-		int scaled;
-
-		count[0] = count[1] = count[2] = 0;
-		nv = scale ? 3 : 1;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-				   single_count, nv * sizeof(__u64));
-			assert(res == nv * sizeof(__u64));
-
-			count[0] += single_count[0];
-			if (scale) {
-				count[1] += single_count[1];
-				count[2] += single_count[2];
-			}
-		}
-
-		scaled = 0;
-		if (scale) {
-			if (count[2] == 0) {
-				fprintf(stderr, " %14s  %-20s\n",
-					"<not counted>", event_name(counter));
-				continue;
-			}
-			if (count[2] < count[1]) {
-				scaled = 1;
-				count[0] = (unsigned long long)
-					((double)count[0] * count[1] / count[2] + 0.5);
-			}
-		}
-
-		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
-		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
-
-			double msecs = (double)count[0] / 1000000;
-
-			fprintf(stderr, " %14.6f  %-20s (msecs)",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)",
-				count[0], event_name(counter));
-		}
-		if (scaled)
-			fprintf(stderr, "  (scaled from %.2f%%)",
-				(double) count[2] / count[1] * 100);
-		fprintf(stderr, "\n");
-	}
-	fprintf(stderr, "\n");
-	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
-			(double)(t1-t0)/1e6);
-	fprintf(stderr, "\n");
-
-	return 0;
-}
-
-/*
- * Symbols
- */
-
-static uint64_t			min_ip;
-static uint64_t			max_ip = -1ll;
-
-struct sym_entry {
-	unsigned long long	addr;
-	char			*sym;
-	unsigned long		count[MAX_COUNTERS];
-	int			skip;
-	struct source_line	*source;
-};
-
-#define MAX_SYMS		100000
-
-static int sym_table_count;
-
-struct sym_entry		*sym_filter_entry;
-
-static struct sym_entry		sym_table[MAX_SYMS];
-
-static void show_details(struct sym_entry *sym);
-
-/*
- * Ordering weight: count-1 * count-2 * ... / count-n
- */
-static double sym_weight(const struct sym_entry *sym)
-{
-	double weight;
-	int counter;
-
-	weight = sym->count[0];
-
-	for (counter = 1; counter < nr_counters-1; counter++)
-		weight *= sym->count[counter];
-
-	weight /= (sym->count[counter] + 1);
-
-	return weight;
-}
-
-static int compare(const void *__sym1, const void *__sym2)
-{
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
-
-	return sym_weight(sym1) < sym_weight(sym2);
-}
-
-static long			events;
-static long			userspace_events;
-static const char		CONSOLE_CLEAR[] = "[H[2J";
-
-static struct sym_entry		tmp[MAX_SYMS];
-
-static void print_sym_table(void)
-{
-	int i, printed;
-	int counter;
-	float events_per_sec = events/delay_secs;
-	float kevents_per_sec = (events-userspace_events)/delay_secs;
-	float sum_kevents = 0.0;
-
-	events = userspace_events = 0;
-	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
-	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
-
-	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
-		sum_kevents += tmp[i].count[0];
-
-	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
-
-	printf(
-"------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
-		events_per_sec,
-		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
-		nmi ? "NMI" : "IRQ");
-
-	if (nr_counters == 1)
-		printf("%d ", event_count[0]);
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (counter)
-			printf("/");
-
-		printf("%s", event_name(counter));
-	}
-
-	printf( "], ");
-
-	if (tid != -1)
-		printf(" (tid: %d", tid);
-	else
-		printf(" (all");
-
-	if (profile_cpu != -1)
-		printf(", cpu: %d)\n", profile_cpu);
-	else {
-		if (tid != -1)
-			printf(")\n");
-		else
-			printf(", %d CPUs)\n", nr_cpus);
-	}
-
-	printf("------------------------------------------------------------------------------\n\n");
-
-	if (nr_counters == 1)
-		printf("             events    pcnt");
-	else
-		printf("  weight     events    pcnt");
-
-	printf("         RIP          kernel function\n"
-	       	       "  ______     ______   _____   ________________   _______________\n\n"
-	);
-
-	for (i = 0, printed = 0; i < sym_table_count; i++) {
-		float pcnt;
-		int count;
-
-		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
-			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
-
-			if (nr_counters == 1)
-				printf("%19.2f - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					pcnt, tmp[i].addr, tmp[i].sym);
-			else
-				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					tmp[i].count[0],
-					pcnt, tmp[i].addr, tmp[i].sym);
-			printed++;
-		}
-		/*
-		 * Add decay to the counts:
-		 */
-		for (count = 0; count < nr_counters; count++)
-			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
-	}
-
-	if (sym_filter_entry)
-		show_details(sym_filter_entry);
-
-	{
-		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-
-		if (poll(&stdin_poll, 1, 0) == 1) {
-			printf("key pressed - exiting.\n");
-			exit(0);
-		}
-	}
-}
-
-static void *display_thread(void *arg)
-{
-	printf("KernelTop refresh period: %d seconds\n", delay_secs);
-
-	while (!sleep(delay_secs))
-		print_sym_table();
-
-	return NULL;
-}
-
-static int read_symbol(FILE *in, struct sym_entry *s)
-{
-	static int filter_match = 0;
-	char *sym, stype;
-	char str[500];
-	int rc, pos;
-
-	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
-	if (rc == EOF)
-		return -1;
-
-	assert(rc == 3);
-
-	/* skip until end of line: */
-	pos = strlen(str);
-	do {
-		rc = fgetc(in);
-		if (rc == '\n' || rc == EOF || pos >= 499)
-			break;
-		str[pos] = rc;
-		pos++;
-	} while (1);
-	str[pos] = 0;
-
-	sym = str;
-
-	/* Filter out known duplicates and non-text symbols. */
-	if (!strcmp(sym, "_text"))
-		return 1;
-	if (!min_ip && !strcmp(sym, "_stext"))
-		return 1;
-	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
-		return 1;
-	if (stype != 'T' && stype != 't')
-		return 1;
-	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
-		return 1;
-	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
-		return 1;
-
-	s->sym = malloc(strlen(str));
-	assert(s->sym);
-
-	strcpy((char *)s->sym, str);
-	s->skip = 0;
-
-	/* Tag events to be skipped. */
-	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("mwait_idle", s->sym))
-		s->skip = 1;
-
-	if (filter_match == 1) {
-		filter_end = s->addr;
-		filter_match = -1;
-		if (filter_end - filter_start > 10000) {
-			printf("hm, too large filter symbol <%s> - skipping.\n",
-				sym_filter);
-			printf("symbol filter start: %016lx\n", filter_start);
-			printf("                end: %016lx\n", filter_end);
-			filter_end = filter_start = 0;
-			sym_filter = NULL;
-			sleep(1);
-		}
-	}
-	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
-		filter_match = 1;
-		filter_start = s->addr;
-	}
-
-	return 0;
-}
-
-int compare_addr(const void *__sym1, const void *__sym2)
-{
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
-
-	return sym1->addr > sym2->addr;
-}
-
-static void sort_symbol_table(void)
-{
-	int i, dups;
-
-	do {
-		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
-		for (i = 0, dups = 0; i < sym_table_count; i++) {
-			if (sym_table[i].addr == sym_table[i+1].addr) {
-				sym_table[i+1].addr = -1ll;
-				dups++;
-			}
-		}
-		sym_table_count -= dups;
-	} while(dups);
-}
-
-static void parse_symbols(void)
-{
-	struct sym_entry *last;
-
-	FILE *kallsyms = fopen("/proc/kallsyms", "r");
-
-	if (!kallsyms) {
-		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
-		exit(-1);
-	}
-
-	while (!feof(kallsyms)) {
-		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
-			sym_table_count++;
-			assert(sym_table_count <= MAX_SYMS);
-		}
-	}
-
-	sort_symbol_table();
-	min_ip = sym_table[0].addr;
-	max_ip = sym_table[sym_table_count-1].addr;
-	last = sym_table + sym_table_count++;
-
-	last->addr = -1ll;
-	last->sym = "<end>";
-
-	if (filter_end) {
-		int count;
-		for (count=0; count < sym_table_count; count ++) {
-			if (!strcmp(sym_table[count].sym, sym_filter)) {
-				sym_filter_entry = &sym_table[count];
-				break;
-			}
-		}
-	}
-	if (dump_symtab) {
-		int i;
-
-		for (i = 0; i < sym_table_count; i++)
-			fprintf(stderr, "%llx %s\n",
-				sym_table[i].addr, sym_table[i].sym);
-	}
-}
-
-/*
- * Source lines
- */
-
-static void parse_vmlinux(char *filename)
-{
-	FILE *file;
-	char command[PATH_MAX*2];
-	if (!filename)
-		return;
-
-	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
-
-	file = popen(command, "r");
-	if (!file)
-		return;
-
-	lines_tail = &lines;
-	while (!feof(file)) {
-		struct source_line *src;
-		size_t dummy = 0;
-		char *c;
-
-		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);
-		memset(src, 0, sizeof(struct source_line));
-
-		if (getline(&src->line, &dummy, file) < 0)
-			break;
-		if (!src->line)
-			break;
-
-		c = strchr(src->line, '\n');
-		if (c)
-			*c = 0;
-
-		src->next = NULL;
-		*lines_tail = src;
-		lines_tail = &src->next;
-
-		if (strlen(src->line)>8 && src->line[8] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-		if (strlen(src->line)>8 && src->line[16] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-	}
-	pclose(file);
-}
-
-static void record_precise_ip(uint64_t ip)
-{
-	struct source_line *line;
-
-	for (line = lines; line; line = line->next) {
-		if (line->EIP == ip)
-			line->count++;
-		if (line->EIP > ip)
-			break;
-	}
-}
-
-static void lookup_sym_in_vmlinux(struct sym_entry *sym)
-{
-	struct source_line *line;
-	char pattern[PATH_MAX];
-	sprintf(pattern, "<%s>:", sym->sym);
-
-	for (line = lines; line; line = line->next) {
-		if (strstr(line->line, pattern)) {
-			sym->source = line;
-			break;
-		}
-	}
-}
-
-static void show_lines(struct source_line *line_queue, int line_queue_count)
-{
-	int i;
-	struct source_line *line;
-
-	line = line_queue;
-	for (i = 0; i < line_queue_count; i++) {
-		printf("%8li\t%s\n", line->count, line->line);
-		line = line->next;
-	}
-}
-
-#define TRACE_COUNT     3
-
-static void show_details(struct sym_entry *sym)
-{
-	struct source_line *line;
-	struct source_line *line_queue = NULL;
-	int displayed = 0;
-	int line_queue_count = 0;
-
-	if (!sym->source)
-		lookup_sym_in_vmlinux(sym);
-	if (!sym->source)
-		return;
-
-	printf("Showing details for %s\n", sym->sym);
-
-	line = sym->source;
-	while (line) {
-		if (displayed && strstr(line->line, ">:"))
-			break;
-
-		if (!line_queue_count)
-			line_queue = line;
-		line_queue_count ++;
-
-		if (line->count >= count_filter) {
-			show_lines(line_queue, line_queue_count);
-			line_queue_count = 0;
-			line_queue = NULL;
-		} else if (line_queue_count > TRACE_COUNT) {
-			line_queue = line_queue->next;
-			line_queue_count --;
-		}
-
-		line->count = 0;
-		displayed++;
-		if (displayed > 300)
-			break;
-		line = line->next;
-	}
-}
-
-/*
- * Binary search in the histogram table and record the hit:
- */
-static void record_ip(uint64_t ip, int counter)
-{
-	int left_idx, middle_idx, right_idx, idx;
-	unsigned long left, middle, right;
-
-	record_precise_ip(ip);
-
-	left_idx = 0;
-	right_idx = sym_table_count-1;
-	assert(ip <= max_ip && ip >= min_ip);
-
-	while (left_idx + 1 < right_idx) {
-		middle_idx = (left_idx + right_idx) / 2;
-
-		left   = sym_table[  left_idx].addr;
-		middle = sym_table[middle_idx].addr;
-		right  = sym_table[ right_idx].addr;
-
-		if (!(left <= middle && middle <= right)) {
-			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
-			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
-		}
-		assert(left <= middle && middle <= right);
-		if (!(left <= ip && ip <= right)) {
-			printf(" left: %016lx\n", left);
-			printf("   ip: %016lx\n", (unsigned long)ip);
-			printf("right: %016lx\n", right);
-		}
-		assert(left <= ip && ip <= right);
-		/*
-		 * [ left .... target .... middle .... right ]
-		 *   => right := middle
-		 */
-		if (ip < middle) {
-			right_idx = middle_idx;
-			continue;
-		}
-		/*
-		 * [ left .... middle ... target ... right ]
-		 *   => left := middle
-		 */
-		left_idx = middle_idx;
-	}
-
-	idx = left_idx;
-
-	if (!sym_table[idx].skip)
-		sym_table[idx].count[counter]++;
-	else events--;
-}
-
-static void process_event(uint64_t ip, int counter)
-{
-	events++;
-
-	if (ip < min_ip || ip > max_ip) {
-		userspace_events++;
-		return;
-	}
-
-	record_ip(ip, counter);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0, counter;
-
-	if (strstr(argv[0], "perfstat"))
-		run_perfstat = 1;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"mmap_info",	no_argument,		NULL, 'M'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"scale",	no_argument,		NULL, 'l'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"stat",	no_argument,		NULL, 'S'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a': system_wide			=	       1; break;
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-		case 'd': delay_secs			=   atoi(optarg); break;
-		case 'D': dump_symtab			=              1; break;
-
-		case 'e': error				= parse_events(optarg); break;
-
-		case 'f': count_filter			=   atoi(optarg); break;
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'l': scale				=	       1; break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
-			}
-			tid				=   atoi(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		case 's': sym_filter			= strdup(optarg); break;
-		case 'S': run_perfstat			=	       1; break;
-		case 'x': vmlinux			= strdup(optarg); break;
-		case 'z': zero				=              1; break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'M': use_mmap			=              1; break;
-		case 'U': use_munmap			=              1; break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		if (run_perfstat)
-			nr_counters = 8;
-		else {
-			nr_counters = 1;
-			event_id[0] = 0;
-		}
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
-struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
-};
-
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
-	struct perf_counter_mmap_page *pc = md->base;
-	int head;
-
-	head = pc->data_head;
-	rmb();
-
-	return head;
-}
-
-struct timeval last_read, this_read;
-
-static void mmap_read(struct mmap_data *md)
-{
-	unsigned int head = mmap_read_head(md);
-	unsigned int old = md->prev;
-	unsigned char *data = md->base + page_size;
-	int diff;
-
-	gettimeofday(&this_read, NULL);
-
-	/*
-	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
-	 *
-	 * If we somehow ended up ahead of the head, we got messed up.
-	 *
-	 * In either case, truncate and restart at head.
-	 */
-	diff = head - old;
-	if (diff > md->mask / 2 || diff < 0) {
-		struct timeval iv;
-		unsigned long msecs;
-
-		timersub(&this_read, &last_read, &iv);
-		msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
-		fprintf(stderr, "WARNING: failed to keep up with mmap data."
-				"  Last read %lu msecs ago.\n", msecs);
-
-		/*
-		 * head points to a known good entry, start there.
-		 */
-		old = head;
-	}
-
-	last_read = this_read;
-
-	for (; old != head;) {
-		struct ip_event {
-			struct perf_event_header header;
-			__u64 ip;
-			__u32 pid, tid;
-		};
-		struct mmap_event {
-			struct perf_event_header header;
-			__u32 pid, tid;
-			__u64 start;
-			__u64 len;
-			__u64 pgoff;
-			char filename[PATH_MAX];
-		};
-
-		typedef union event_union {
-			struct perf_event_header header;
-			struct ip_event ip;
-			struct mmap_event mmap;
-		} event_t;
-
-		event_t *event = (event_t *)&data[old & md->mask];
-
-		event_t event_copy;
-
-		unsigned int size = event->header.size;
-
-		/*
-		 * Event straddles the mmap boundary -- header should always
-		 * be inside due to u64 alignment of output.
-		 */
-		if ((old & md->mask) + size != ((old + size) & md->mask)) {
-			unsigned int offset = old;
-			unsigned int len = min(sizeof(*event), size), cpy;
-			void *dst = &event_copy;
-
-			do {
-				cpy = min(md->mask + 1 - (offset & md->mask), len);
-				memcpy(dst, &data[offset & md->mask], cpy);
-				offset += cpy;
-				dst += cpy;
-				len -= cpy;
-			} while (len);
-
-			event = &event_copy;
-		}
-
-		old += size;
-
-		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-			if (event->header.type & PERF_RECORD_IP)
-				process_event(event->ip.ip, md->counter);
-		} else {
-			switch (event->header.type) {
-				case PERF_EVENT_MMAP:
-				case PERF_EVENT_MUNMAP:
-					printf("%s: %Lu %Lu %Lu %s\n",
-							event->header.type == PERF_EVENT_MMAP
-							? "mmap" : "munmap",
-							event->mmap.start,
-							event->mmap.len,
-							event->mmap.pgoff,
-							event->mmap.filename);
-					break;
-			}
-		}
-	}
-
-	md->prev = old;
-}
-
-int main(int argc, char *argv[])
-{
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-	struct perf_counter_hw_event hw_event;
-	pthread_t thread;
-	int i, counter, group_fd, nr_poll = 0;
-	unsigned int cpu;
-	int ret;
-
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	if (run_perfstat)
-		return do_perfstat(argc, argv);
-
-	if (tid != -1 || profile_cpu != -1)
-		nr_cpus = 1;
-
-	parse_symbols();
-	if (vmlinux && sym_filter_entry)
-		parse_vmlinux(vmlinux);
-
-	for (i = 0; i < nr_cpus; i++) {
-		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			cpu	= profile_cpu;
-			if (tid == -1 && profile_cpu == -1)
-				cpu = i;
-
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= nmi;
-			hw_event.mmap		= use_mmap;
-			hw_event.munmap		= use_munmap;
-
-			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
-		}
-	}
-
-	if (pthread_create(&thread, NULL, display_thread, NULL)) {
-		printf("Could not create display thread.\n");
-		exit(-1);
-	}
-
-	if (realtime_prio) {
-		struct sched_param param;
-
-		param.sched_priority = realtime_prio;
-		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
-			printf("Could not set realtime priority.\n");
-			exit(-1);
-		}
-	}
-
-	while (1) {
-		int hits = events;
-
-		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
-
-		if (hits == events)
-			ret = poll(event_array, nr_poll, 100);
-	}
-
-	return 0;
-}
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 63f8a892c0dff..ff8658f2a2f1b 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -249,6 +249,7 @@ static void handle_internal_command(int argc, const char **argv)
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
 		{ "top", cmd_top, 0 },
+		{ "stat", cmd_stat, 0 },
 	};
 	int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
GitLab


From 1d8c8b209e9351a7de1307d7b9b6df4222b8d742 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:52:29 +0200
Subject: [PATCH 0543/2198] perf_counter tools: add help texts

Add Documentation/perf-stat.txt and Documentation/perf-top.txt.

The template that was used for it: Documentation/git-add.txt from Git.

Fix up small bugs to make these help texts show up both in the 'perf'
common-command summary output screen, and on the individual help screens.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/Documentation/perf-stat.txt  | 76 +++++++++++++++++++
 .../perf_counter/Documentation/perf-top.txt   | 61 +++++++++++++++
 Documentation/perf_counter/builtin.h          |  4 +-
 .../perf_counter/generate-cmdlist.sh          |  8 +-
 4 files changed, 143 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/perf_counter/Documentation/perf-stat.txt
 create mode 100644 Documentation/perf_counter/Documentation/perf-top.txt

diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
new file mode 100644
index 0000000000000..7fcab271e570a
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -0,0 +1,76 @@
+perf-stat(1)
+==========
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+                             0:0: cpu-cycles          
+                             0:0: cycles              
+                             0:1: instructions        
+                             0:2: cache-references    
+                             0:3: cache-misses        
+                             0:4: branch-instructions 
+                             0:4: branches            
+                             0:5: branch-misses       
+                             0:6: bus-cycles          
+                             1:0: cpu-clock           
+                             1:1: task-clock          
+                             1:2: page-faults         
+                             1:2: faults              
+                             1:5: minor-faults        
+                             1:6: major-faults        
+                             1:3: context-switches    
+                             1:3: cs                  
+                             1:4: cpu-migrations      
+                             1:4: migrations          
+                           rNNN: raw PMU events (eventsel+umask)
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+$ perf stat sleep 1
+
+ Performance counter stats for 'sleep':
+
+       0.678356  task clock ticks     (msecs)
+              7  context switches     (events)
+              4  CPU migrations       (events)
+            232  pagefaults           (events)
+        1810403  CPU cycles           (events)
+         946759  instructions         (events)
+          18952  cache references     (events)
+           4885  cache misses         (events)
+
+ Wall-clock time elapsed:  1001.252894 msecs
+
+SEE ALSO
+--------
+linkperf:git-tops[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
new file mode 100644
index 0000000000000..057333b725341
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -0,0 +1,61 @@
+perf-top(1)
+==========
+
+NAME
+----
+perf-top - Run a command and profile it
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+                             0:0: cpu-cycles          
+                             0:0: cycles              
+                             0:1: instructions        
+                             0:2: cache-references    
+                             0:3: cache-misses        
+                             0:4: branch-instructions 
+                             0:4: branches            
+                             0:5: branch-misses       
+                             0:6: bus-cycles          
+                             1:0: cpu-clock           
+                             1:1: task-clock          
+                             1:2: page-faults         
+                             1:2: faults              
+                             1:5: minor-faults        
+                             1:6: major-faults        
+                             1:3: context-switches    
+                             1:3: cs                  
+                             1:4: cpu-migrations      
+                             1:4: migrations          
+                           rNNN: raw PMU events (eventsel+umask)
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+SEE ALSO
+--------
+linkperf:git-stat[1]
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index a3bb6cd6bed37..605323c691f13 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,6 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, char **argv, const char *prefix);
-extern int cmd_stat(int argc, char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_stat(int argc, const char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh
index 75c68d948fd31..f06f6fd148f8e 100755
--- a/Documentation/perf_counter/generate-cmdlist.sh
+++ b/Documentation/perf_counter/generate-cmdlist.sh
@@ -9,16 +9,16 @@ struct cmdname_help
 
 static struct cmdname_help common_cmds[] = {"
 
-sed -n -e 's/^git-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
+sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt |
 sort |
 while read cmd
 do
      sed -n '
-     /^NAME/,/git-'"$cmd"'/H
+     /^NAME/,/perf-'"$cmd"'/H
      ${
             x
-            s/.*git-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
+            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/
 	    p
-     }' "Documentation/git-$cmd.txt"
+     }' "Documentation/perf-$cmd.txt"
 done
 echo "};"
-- 
GitLab


From e33e0a43736307512422e41aee6e24d5a8c39181 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 15:58:01 +0200
Subject: [PATCH 0544/2198] perf_counter tools: add 'perf record' command

Move perf-record.c into the perf suite of commands.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../Documentation/perf-record.txt             | 63 +++++++++++++++++++
 Documentation/perf_counter/Makefile           |  1 +
 .../{perf-record.c => builtin-record.c}       | 30 +--------
 Documentation/perf_counter/builtin.h          |  3 +-
 Documentation/perf_counter/command-list.txt   |  3 +-
 Documentation/perf_counter/perf.c             |  3 +-
 6 files changed, 73 insertions(+), 30 deletions(-)
 create mode 100644 Documentation/perf_counter/Documentation/perf-record.txt
 rename Documentation/perf_counter/{perf-record.c => builtin-record.c} (95%)

diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
new file mode 100644
index 0000000000000..d07700e35eb27
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -0,0 +1,63 @@
+perf-record(1)
+==========
+
+NAME
+----
+perf-record - Run a command and record its profile into output.perf
+
+SYNOPSIS
+--------
+[verse]
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it, into output.perf - without displaying anything.
+
+This file can then be inspected later on, using 'perf report'.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+                             0:0: cpu-cycles          
+                             0:0: cycles              
+                             0:1: instructions        
+                             0:2: cache-references    
+                             0:3: cache-misses        
+                             0:4: branch-instructions 
+                             0:4: branches            
+                             0:5: branch-misses       
+                             0:6: bus-cycles          
+                             1:0: cpu-clock           
+                             1:1: task-clock          
+                             1:2: page-faults         
+                             1:2: faults              
+                             1:5: minor-faults        
+                             1:6: major-faults        
+                             1:3: context-switches    
+                             1:3: cs                  
+                             1:4: cpu-migrations      
+                             1:4: migrations          
+                           rNNN: raw PMU events (eventsel+umask)
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+SEE ALSO
+--------
+linkperf:git-stat[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index fb8b71744e59f..b6c665eb22e4d 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -309,6 +309,7 @@ LIB_OBJS += usage.o
 LIB_OBJS += wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
diff --git a/Documentation/perf_counter/perf-record.c b/Documentation/perf_counter/builtin-record.c
similarity index 95%
rename from Documentation/perf_counter/perf-record.c
rename to Documentation/perf_counter/builtin-record.c
index 614de7c468b21..4a50abf843ee9 100644
--- a/Documentation/perf_counter/perf-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -81,16 +81,12 @@
 	(void) (&_min1 == &_min2);		\
 	_min1 < _min2 ? _min1 : _min2; })
 
-asmlinkage int sys_perf_counter_open(
+extern asmlinkage int sys_perf_counter_open(
         struct perf_counter_hw_event    *hw_event_uptr          __user,
         pid_t                           pid,
         int                             cpu,
         int                             group_fd,
-        unsigned long                   flags)
-{
-        return syscall(
-                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-}
+        unsigned long                   flags);
 
 #define MAX_COUNTERS			64
 #define MAX_NR_CPUS			256
@@ -119,26 +115,6 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
 struct event_symbol {
 	__u64 event;
 	char *symbol;
@@ -414,7 +390,7 @@ static void sigchld_handler(int sig)
 		done = 1;
 }
 
-int main(int argc, char *argv[])
+int cmd_record(int argc, const char **argv)
 {
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 605323c691f13..5854b1715f546 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,7 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
-extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 52455d46bfb54..d15210aa0cae0 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,5 +1,6 @@
 # List of known perf commands.
 # command name				category [deprecated] [common]
-perf-top                                mainporcelain common
+perf-record                             mainporcelain common
 perf-stat                               mainporcelain common
+perf-top                                mainporcelain common
 
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index ff8658f2a2f1b..e849dd66b5ef8 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -248,8 +248,9 @@ static void handle_internal_command(int argc, const char **argv)
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
-		{ "top", cmd_top, 0 },
+		{ "record", cmd_record, 0 },
 		{ "stat", cmd_stat, 0 },
+		{ "top", cmd_top, 0 },
 	};
 	int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
GitLab


From cc13a5913797365b683212f5fca5fb04bb3582eb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 16:01:30 +0200
Subject: [PATCH 0545/2198] perf_counter tools: fix --version

Hook up the 'perf version' built-in command.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin.h | 1 +
 Documentation/perf_counter/perf.c    | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 5854b1715f546..aec5ae388e660 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -17,4 +17,5 @@ extern int check_pager_config(const char *cmd);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
 #endif
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index e849dd66b5ef8..20d508c772934 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -5,7 +5,7 @@
 #include "run-command.h"
 
 const char perf_usage_string[] =
-	"perf [--version] [--exec-path[=PERF_EXEC_PATH]] [--html-path] [-p|--paginate|--no-pager] [--bare] [--perf-dir=PERF_DIR] [--work-tree=PERF_WORK_TREE] [--help] COMMAND [ARGS]";
+	"perf [--version] [--help] COMMAND [ARGS]";
 
 const char perf_more_info_string[] =
 	"See 'perf help COMMAND' for more information on a specific command.";
@@ -251,6 +251,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "record", cmd_record, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
+		{ "version", cmd_version, 0 },
 	};
 	int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
GitLab


From 6142fdd968c76a0f2ee753c39bd5be8d1bb4ef04 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 16:05:55 +0200
Subject: [PATCH 0546/2198] perf_counter tools: add 'perf help'

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/Documentation/perf-help.txt  | 38 +++++++++++++++++++
 Documentation/perf_counter/builtin.h          |  1 +
 Documentation/perf_counter/perf.c             |  1 +
 3 files changed, 40 insertions(+)
 create mode 100644 Documentation/perf_counter/Documentation/perf-help.txt

diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
new file mode 100644
index 0000000000000..f85fed5a7edba
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -0,0 +1,38 @@
+perf-help(1)
+===========
+
+NAME
+----
+perf-help - display help information about perf
+
+SYNOPSIS
+--------
+'perf help' [-a|--all] [COMMAND]
+
+DESCRIPTION
+-----------
+
+With no options and no COMMAND given, the synopsis of the 'perf'
+command and a list of the most commonly used perf commands are printed
+on the standard output.
+
+If the option '--all' or '-a' is given, then all available commands are
+printed on the standard output.
+
+If a perf command is named, a manual page for that command is brought
+up. The 'man' program is used by default for this purpose, but this
+can be overridden by other options or configuration variables.
+
+Note that `perf --help ...` is identical to `perf help ...` because the
+former is internally converted into the latter.
+
+OPTIONS
+-------
+-a::
+--all::
+	Prints all the available commands on the standard output. This
+	option supersedes any other option.
+
+PERF
+----
+Part of the linkperf:perf[1] suite
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index aec5ae388e660..800f86c1d4450 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,7 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
+extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 20d508c772934..8d6faecdc15d6 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -248,6 +248,7 @@ static void handle_internal_command(int argc, const char **argv)
 {
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
+		{ "help", cmd_help, 0 },
 		{ "record", cmd_record, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
-- 
GitLab


From 125e702b09a28a502e145fb434678ee27720fc48 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 16:13:46 +0200
Subject: [PATCH 0547/2198] perf_counter tools: fix 'make install'

Remove Git leftovers from this area.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index b6c665eb22e4d..690045e496924 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -255,9 +255,9 @@ EXTRA_PROGRAMS =
 PROGRAMS += $(EXTRA_PROGRAMS)
 
 #
-# None right now:
+# Single 'perf' binary right now:
 #
-# PROGRAMS += perf-fast-import$X
+PROGRAMS += perf
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
@@ -762,24 +762,9 @@ install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-	$(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install
-	$(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
-	bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \
-	execdir=$$(cd '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' && pwd) && \
-	{ $(RM) "$$execdir/perf-add$X" && \
-		ln "$$bindir/perf$X" "$$execdir/perf-add$X" 2>/dev/null || \
-		cp "$$bindir/perf$X" "$$execdir/perf-add$X"; } && \
-	{ for p in $(filter-out perf-add$X,$(BUILT_INS)); do \
-		$(RM) "$$execdir/$$p" && \
-		ln "$$execdir/perf-add$X" "$$execdir/$$p" 2>/dev/null || \
-		ln -s "perf-add$X" "$$execdir/$$p" 2>/dev/null || \
-		cp "$$execdir/perf-add$X" "$$execdir/$$p" || exit; \
-	  done } && \
-	./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X"
 
 ### Maintainer's dist rules
 
-- 
GitLab


From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 10:59:34 -0400
Subject: [PATCH 0548/2198] tracing: rename EVENT_TRACER config to
 ENABLE_EVENT_TRACING

Currently we have two configs: EVENT_TRACING and EVENT_TRACER.
All tracers enable EVENT_TRACING. The EVENT_TRACER is only a
convenience to enable the EVENT_TRACING when no other tracers
are enabled.

The names EVENT_TRACER and EVENT_TRACING are too similar and confusing.
This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more
appropriate to what it actually does, as well as add a comment in
the help menu to explain the option's purpose.

[ Impact: rename config option to reduce confusion ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3fa36d2bc2905..450d3c2cfbd28 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
-config EVENT_TRACER
+config ENABLE_EVENT_TRACING
 	bool "Trace various events in the kernel"
 	select TRACING
 	help
@@ -191,6 +191,10 @@ config EVENT_TRACER
 	  allowing the user to pick and choose which trace point they
 	  want to trace.
 
+	  Note, all tracers enable event tracing. This option is
+	  only a convenience to enable event tracing when no other
+	  tracers are selected.
+
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-- 
GitLab


From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 12:12:44 -0400
Subject: [PATCH 0549/2198] tracing/events: call the correct event trace
 selftest init function

The late_initcall calls a helper function instead of the proper
init event selftest function.

This update may have been lost due to conflicting merges.

[ Impact: fix compiler warning and call extended event trace self tests ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 64f9d6d2735b3..98daf5dc74a6d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void)
 	return 0;
 }
 
-late_initcall(event_trace_self_tests);
+late_initcall(event_trace_self_tests_init);
 
 #endif
-- 
GitLab


From 23de29de2d8b227943be191d59fb6d983996d55e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 12:59:29 -0400
Subject: [PATCH 0550/2198] tracing: remove dangling semicolon

Due to a cut and paste error, the trace_seq_putc had a semicolon
after the prototype but before the stub function when tracing is
disabled.

[Impact: fix compile error ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_seq.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 37db9bdfbc1a0..ba9627f00d3f0 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -63,7 +63,7 @@ static inline int trace_seq_puts(struct trace_seq *s, const char *str)
 {
 	return 0;
 }
-static inline int trace_seq_putc(struct trace_seq *s, unsigned char c);
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c)
 {
 	return 0;
 }
-- 
GitLab


From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 13:24:21 -0400
Subject: [PATCH 0551/2198] tracing: fix recursive test level calculation

The recursive tests to detect same level recursion in the ring buffers
did not account for the hard/softirq_counts to be shifted. Thus the
numbers could be larger than then mask to be tested.

This patch includes the shift for the calculation of the irq depth.

[ Impact: stop false positives in trace recursion detection ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index e145969a8edab..aa40ae92233b3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 static int trace_irq_level(void)
 {
-	return hardirq_count() + softirq_count() + in_nmi();
+	return (hardirq_count() >> HARDIRQ_SHIFT) +
+		(softirq_count() >> + SOFTIRQ_SHIFT) +
+		!!in_nmi();
 }
 
 static int trace_recursive_lock(void)
-- 
GitLab


From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 13:32:44 -0400
Subject: [PATCH 0552/2198] tracing: remove recursive test from
 ring_buffer_event_discard

The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve.
It can be called inside or outside the reserve/commit. Even if it
is called inside the reserve/commit the commit part must also be called.

Only ring_buffer_discard_commit can be used as a replacement for
ring_buffer_unlock_commit.

This patch removes the trace_recursive_unlock from ring_buffer_event_discard
since it would be the wrong place to do so.

[Impact: prevent breakage in trace recursive testing ]

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index aa40ae92233b3..a6997670cc469 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event)
 void ring_buffer_event_discard(struct ring_buffer_event *event)
 {
 	rb_event_discard(event);
-	trace_recursive_unlock();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
 
-- 
GitLab


From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 20 Apr 2009 20:38:21 +0200
Subject: [PATCH 0553/2198] perfcounters, sched: remove __task_delta_exec()

This function was left orphan by the latest round of sw-counter
cleanups.

[ Impact: remove unused kernel function ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kernel_stat.h |  1 -
 kernel/sched.c              | 23 -----------------------
 2 files changed, 24 deletions(-)

diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 080d1fd461d74..a77c6007dc99a 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -85,7 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq)
 /*
  * Lock/unlock the current runqueue - to extract task statistics:
  */
-extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update);
 extern unsigned long long task_delta_exec(struct task_struct *);
 
 extern void account_user_time(struct task_struct *, cputime_t, cputime_t);
diff --git a/kernel/sched.c b/kernel/sched.c
index b66a08c2480ed..a69278eef425e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4546,29 +4546,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
 
 EXPORT_PER_CPU_SYMBOL(kstat);
 
-/*
- * Return any ns on the sched_clock that have not yet been banked in
- * @p in case that task is currently running.
- */
-unsigned long long __task_delta_exec(struct task_struct *p, int update)
-{
-	s64 delta_exec;
-	struct rq *rq;
-
-	rq = task_rq(p);
-	WARN_ON_ONCE(!runqueue_is_locked());
-	WARN_ON_ONCE(!task_current(rq, p));
-
-	if (update)
-		update_rq_clock(rq);
-
-	delta_exec = rq->clock - p->se.exec_start;
-
-	WARN_ON_ONCE(delta_exec < 0);
-
-	return delta_exec;
-}
-
 /*
  * Return any ns on the sched_clock that have not yet been banked in
  * @p in case that task is currently running.
-- 
GitLab


From ff743345bf7685a207868048a70e23164c4785e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:26 +0100
Subject: [PATCH 0554/2198] sched: remove extra call overhead for schedule()

Lai Jiangshan's patch reminded me that I promised Nick to remove
that extra call overhead in schedule().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090313112300.927414207@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/mutex.c |  4 +++-
 kernel/sched.c | 12 ++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5d79781394a30..e1fb735104090 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -248,7 +248,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
 		/* didnt get the lock, go to sleep: */
 		spin_unlock_mutex(&lock->wait_lock, flags);
-		__schedule();
+		preempt_enable_no_resched();
+		schedule();
+		preempt_disable();
 		spin_lock_mutex(&lock->wait_lock, flags);
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 7601ceebf7cee..797f6fdabadff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5131,13 +5131,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
 	int cpu;
 
+need_resched:
+	preempt_disable();
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
 	rcu_qsctr_inc(cpu);
@@ -5194,15 +5196,9 @@ asmlinkage void __sched __schedule(void)
 
 	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-	preempt_disable();
-	__schedule();
 	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (need_resched())
 		goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
-- 
GitLab


From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 16:16:11 -0400
Subject: [PATCH 0555/2198] tracing: use recursive counter over irq level

Althought using the irq level (hardirq_count, softirq_count and in_nmi)
was nice to detect bad recursion right away, but since the counters are
not atomically updated with respect to the interrupts, the function tracer
might trigger the test from an interrupt handler before the hardirq_count
is updated. This will trigger a false warning.

This patch converts the recursive detection to a simple counter.
If the depth is greater than 16 then the recursive detection will trigger.
16 is more than enough for any nested interrupts.

[ Impact: fix false positive trace recursion detection ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 45 ++++++++++++++------------------------
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a6997670cc469..7bcfd3e605375 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
-static int trace_irq_level(void)
-{
-	return (hardirq_count() >> HARDIRQ_SHIFT) +
-		(softirq_count() >> + SOFTIRQ_SHIFT) +
-		!!in_nmi();
-}
+#define TRACE_RECURSIVE_DEPTH 16
 
 static int trace_recursive_lock(void)
 {
-	int level;
-
-	level = trace_irq_level();
+	current->trace_recursion++;
 
-	if (unlikely(current->trace_recursion & (1 << level))) {
-		/* Disable all tracing before we do anything else */
-		tracing_off_permanent();
+	if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+		return 0;
 
-		printk_once(KERN_WARNING "Tracing recursion: "
-			    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
-			    hardirq_count() >> HARDIRQ_SHIFT,
-			    softirq_count() >> SOFTIRQ_SHIFT,
-			    in_nmi());
+	/* Disable all tracing before we do anything else */
+	tracing_off_permanent();
 
-		WARN_ON_ONCE(1);
-		return -1;
-	}
+	printk_once(KERN_WARNING "Tracing recursion: depth[%d]:"
+		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+		    current->trace_recursion,
+		    hardirq_count() >> HARDIRQ_SHIFT,
+		    softirq_count() >> SOFTIRQ_SHIFT,
+		    in_nmi());
 
-	current->trace_recursion |= 1 << level;
-
-	return 0;
+	WARN_ON_ONCE(1);
+	return -1;
 }
 
 static void trace_recursive_unlock(void)
 {
-	int level;
-
-	level = trace_irq_level();
-
-	WARN_ON_ONCE(!current->trace_recursion & (1 << level));
+	WARN_ON_ONCE(!current->trace_recursion);
 
-	current->trace_recursion &= ~(1 << level);
+	current->trace_recursion--;
 }
 
 static DEFINE_PER_CPU(int, rb_need_resched);
-- 
GitLab


From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 20 Apr 2009 18:16:44 -0400
Subject: [PATCH 0556/2198] tracing: use nowakeup version of commit for
 function event trace tests

The startup tests for the event tracer also runs with the function
tracer enabled. The "wakeup" version of the trace commit was used
which can grab spinlocks. If a task was preempted by an NMI
that called a function being traced, it could deadlock due to the
function tracer trying to grab the same lock.

Thanks to Frederic Weisbecker for pointing out where the bug was.

Reported-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 98daf5dc74a6d..672b195f86c9c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 
-	trace_current_buffer_unlock_commit(event, flags, pc);
+	trace_nowake_buffer_unlock_commit(event, flags, pc);
 
  out:
 	atomic_dec(&per_cpu(test_event_disable, cpu));
-- 
GitLab


From 2ecf0a57c60dcb588f310d94412118e15c510532 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Apr 2009 12:16:56 +0900
Subject: [PATCH 0557/2198] ide-dma: don't reset request fields on
 dma_timeout_retry()

Impact: drop unnecessary code

Now that everything uses bio and block operations, there is no need to
reset request fields manually when retrying a request.  Every field is
guaranteed to be always valid.  Drop unnecessary request field
resetting from ide_dma_timeout_retry().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-dma.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index a0b8cab1d9a68..d9123ecae4a98 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 	/*
 	 * un-busy drive etc and make sure request is sane
 	 */
-
 	rq = hwif->rq;
-	if (!rq)
-		goto out;
-
-	hwif->rq = NULL;
-
-	rq->errors = 0;
-
-	if (!rq->bio)
-		goto out;
-
-	rq->sector = rq->bio->bi_sector;
-	rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9;
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->buffer = bio_data(rq->bio);
-out:
+	if (rq) {
+		hwif->rq = NULL;
+		rq->errors = 0;
+	}
 	return ret;
 }
 
-- 
GitLab


From 6e29ec5701e9d44fa02b96c1c5c45f7516182b65 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Tue, 21 Apr 2009 08:40:49 +0530
Subject: [PATCH 0558/2198] sched: Replace first_cpu() with cpumask_first() in
 ILB nomination code

Stephen Rothwell reported this build warning:

>  kernel/sched.c: In function 'find_new_ilb':
>  kernel/sched.c:4355: warning: passing argument 1 of '__first_cpu' from incompatible pointer type
>
> Possibly caused by commit f711f6090a81cbd396b63de90f415d33f563af9b
> ("sched: Nominate idle load balancer from a semi-idle package") from
> the sched tree.  Should this call to first_cpu be cpumask_first?

For !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT), find_new_ilb() nominates the
Idle load balancer as the first cpu from the nohz.cpu_mask.

This code uses the older API first_cpu(). Replace it with cpumask_first(),
which is the correct API here.

[ Impact: cleanup, address build warning ]

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090421031049.GA4140@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 797f6fdabadff..54d67b94f1a9f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4356,7 +4356,7 @@ static int find_new_ilb(int cpu)
 #else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
 static inline int find_new_ilb(int call_cpu)
 {
-	return first_cpu(nohz.cpu_mask);
+	return cpumask_first(nohz.cpu_mask);
 }
 #endif
 
-- 
GitLab


From fc1edaf9e7cc4d4696f83dee495b8f158d01c4eb Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:27 -0700
Subject: [PATCH 0559/2198] x86: x2apic, IR: Clean up X86_X2APIC and INTR_REMAP
 config checks

Add x2apic_supported() to clean up CONFIG_X86_X2APIC checks.

Fix CONFIG_INTR_REMAP checks.

[ Impact: cleanup ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: dwmw2@infradead.org
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.128993000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h          | 10 +++---
 arch/x86/include/asm/io_apic.h       |  2 --
 arch/x86/include/asm/irq_remapping.h |  2 +-
 arch/x86/kernel/apic/apic.c          | 49 +++++++---------------------
 arch/x86/kernel/apic/io_apic.c       |  2 --
 arch/x86/kernel/apic/probe_64.c      |  2 +-
 include/linux/dmar.h                 |  2 ++
 7 files changed, 21 insertions(+), 48 deletions(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index fbdd65446c7a8..3738438a91f55 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
 extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
 
-#define EIM_8BIT_APIC_ID	0
-#define EIM_32BIT_APIC_ID	1
+extern int x2apic_mode;
 
 #ifdef CONFIG_X86_X2APIC
 /*
@@ -166,7 +165,7 @@ static inline u64 native_x2apic_icr_read(void)
 	return val;
 }
 
-extern int x2apic, x2apic_phys;
+extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
@@ -182,6 +181,8 @@ static inline int x2apic_enabled(void)
 		return 1;
 	return 0;
 }
+
+#define x2apic_supported()	(cpu_has_x2apic)
 #else
 static inline void check_x2apic(void)
 {
@@ -194,9 +195,8 @@ static inline int x2apic_enabled(void)
 	return 0;
 }
 
-#define	x2apic	0
 #define	x2apic_preenabled 0
-
+#define	x2apic_supported()	0
 #endif
 
 extern void enable_IR_x2apic(void);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e4360100..34eaa37f7ad42 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -161,7 +161,6 @@ extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
-#ifdef CONFIG_X86_64
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
 extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
 extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
@@ -169,7 +168,6 @@ extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
 	struct IO_APIC_route_entry **ioapic_entries);
-#endif
 
 extern void probe_nr_irqs_gsi(void);
 
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 0396760fccb85..f275e2244505b 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -1,6 +1,6 @@
 #ifndef _ASM_X86_IRQ_REMAPPING_H
 #define _ASM_X86_IRQ_REMAPPING_H
 
-#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
 
 #endif	/* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 7b41a32339e0a..2b30e520dce37 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -134,8 +134,8 @@ static __init int setup_apicpmtimer(char *s)
 __setup("apicpmtimer", setup_apicpmtimer);
 #endif
 
+int x2apic_mode;
 #ifdef CONFIG_X86_X2APIC
-int x2apic;
 /* x2apic enabled before OS handover */
 static int x2apic_preenabled;
 static int disable_x2apic;
@@ -858,7 +858,7 @@ void clear_local_APIC(void)
 	u32 v;
 
 	/* APIC hasn't been mapped yet */
-	if (!x2apic && !apic_phys)
+	if (!x2apic_mode && !apic_phys)
 		return;
 
 	maxlvt = lapic_get_maxlvt();
@@ -1330,7 +1330,7 @@ void check_x2apic(void)
 {
 	if (x2apic_enabled()) {
 		pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
-		x2apic_preenabled = x2apic = 1;
+		x2apic_preenabled = x2apic_mode = 1;
 	}
 }
 
@@ -1338,7 +1338,7 @@ void enable_x2apic(void)
 {
 	int msr, msr2;
 
-	if (!x2apic)
+	if (!x2apic_mode)
 		return;
 
 	rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1390,25 +1390,17 @@ void __init enable_IR_x2apic(void)
 	mask_IO_APIC_setup(ioapic_entries);
 	mask_8259A();
 
-#ifdef CONFIG_X86_X2APIC
-	if (cpu_has_x2apic)
-		ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-	else
-#endif
-		ret = enable_intr_remapping(EIM_8BIT_APIC_ID);
-
+	ret = enable_intr_remapping(x2apic_supported());
 	if (ret)
 		goto end_restore;
 
 	pr_info("Enabled Interrupt-remapping\n");
 
-#ifdef CONFIG_X86_X2APIC
-	if (cpu_has_x2apic && !x2apic) {
-		x2apic = 1;
+	if (x2apic_supported() && !x2apic_mode) {
+		x2apic_mode = 1;
 		enable_x2apic();
 		pr_info("Enabled x2apic\n");
 	}
-#endif
 
 end_restore:
 	if (ret)
@@ -1576,7 +1568,7 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
-	if (x2apic) {
+	if (x2apic_mode) {
 		boot_cpu_physical_apicid = read_apic_id();
 		return;
 	}
@@ -2010,10 +2002,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 
 	local_irq_save(flags);
 	disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
 	if (intr_remapping_enabled)
 		disable_intr_remapping();
-#endif
+
 	local_irq_restore(flags);
 	return 0;
 }
@@ -2023,8 +2015,6 @@ static int lapic_resume(struct sys_device *dev)
 	unsigned int l, h;
 	unsigned long flags;
 	int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
 	int ret;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
@@ -2050,17 +2040,8 @@ static int lapic_resume(struct sys_device *dev)
 		mask_8259A();
 	}
 
-	if (x2apic)
+	if (x2apic_mode)
 		enable_x2apic();
-#else
-	if (!apic_pm_state.active)
-		return 0;
-
-	local_irq_save(flags);
-	if (x2apic)
-		enable_x2apic();
-#endif
-
 	else {
 		/*
 		 * Make sure the APICBASE points to the right address
@@ -2098,18 +2079,12 @@ static int lapic_resume(struct sys_device *dev)
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
 
-#ifdef CONFIG_INTR_REMAP
 	if (intr_remapping_enabled) {
-		if (x2apic)
-			reenable_intr_remapping(EIM_32BIT_APIC_ID);
-		else
-			reenable_intr_remapping(EIM_8BIT_APIC_ID);
-
+		reenable_intr_remapping(x2apic_mode);
 		unmask_8259A();
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
 	}
-#endif
 
 	local_irq_restore(flags);
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ea22a86e3cda2..3a45d2ec97402 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -736,7 +736,6 @@ static int __init ioapic_pirq_setup(char *str)
 __setup("pirq=", ioapic_pirq_setup);
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_INTR_REMAP
 struct IO_APIC_route_entry **alloc_ioapic_entries(void)
 {
 	int apic;
@@ -857,7 +856,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
 
 	kfree(ioapic_entries);
 }
-#endif
 
 /*
  * Find the IRQ entry number of a certain pin.
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 1783652bb0e56..bc3e880f9b82e 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
 void __init default_setup_apic_routing(void)
 {
 #ifdef CONFIG_X86_X2APIC
-	if (x2apic && (apic != &apic_x2apic_phys &&
+	if (x2apic_mode && (apic != &apic_x2apic_phys &&
 #ifdef CONFIG_X86_UV
 		       apic != &apic_x2apic_uv_x &&
 #endif
diff --git a/include/linux/dmar.h b/include/linux/dmar.h
index 06f592a7f73c8..10ff5c498824b 100644
--- a/include/linux/dmar.h
+++ b/include/linux/dmar.h
@@ -158,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic)
 }
 #define irq_remapped(irq)		(0)
 #define enable_intr_remapping(mode)	(-1)
+#define disable_intr_remapping()	(0)
+#define reenable_intr_remapping(mode)	(0)
 #define intr_remapping_enabled		(0)
 #endif
 
-- 
GitLab


From 25629d810a52176758401184d9b437fbb7f79195 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:28 -0700
Subject: [PATCH 0560/2198] x86: x2apic, IR: Move eoi_ioapic_irq() into a
 CONFIG_INTR_REMAP section

Address the following complier warning:

   arch/x86/kernel/apic/io_apic.c:2543: warning: `eoi_ioapic_irq' defined but not used

By moving that function (and eoi_ioapic_irq()) into an existing
#ifdef CONFIG_INTR_REMAP section of the code.

[ Impact: cleanup ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: dwmw2@infradead.org
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.271099000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: Weidong Han <weidong.han@intel.com>
---
 arch/x86/kernel/apic/io_apic.c | 66 +++++++++++++++++-----------------
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3a45d2ec97402..4baa9cbd630a3 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2517,39 +2517,6 @@ static void irq_complete_move(struct irq_desc **descp)
 static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
-	int apic, pin;
-	struct irq_pin_list *entry;
-
-	entry = cfg->irq_2_pin;
-	for (;;) {
-
-		if (!entry)
-			break;
-
-		apic = entry->apic;
-		pin = entry->pin;
-		io_apic_eoi(apic, pin);
-		entry = entry->next;
-	}
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
-	struct irq_cfg *cfg;
-	unsigned long flags;
-	unsigned int irq;
-
-	irq = desc->irq;
-	cfg = desc->chip_data;
-
-	spin_lock_irqsave(&ioapic_lock, flags);
-	__eoi_ioapic_irq(irq, cfg);
-	spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
 static void ack_apic_edge(unsigned int irq)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -2659,6 +2626,39 @@ static void ack_apic_level(unsigned int irq)
 }
 
 #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+	int apic, pin;
+	struct irq_pin_list *entry;
+
+	entry = cfg->irq_2_pin;
+	for (;;) {
+
+		if (!entry)
+			break;
+
+		apic = entry->apic;
+		pin = entry->pin;
+		io_apic_eoi(apic, pin);
+		entry = entry->next;
+	}
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+	struct irq_cfg *cfg;
+	unsigned long flags;
+	unsigned int irq;
+
+	irq = desc->irq;
+	cfg = desc->chip_data;
+
+	spin_lock_irqsave(&ioapic_lock, flags);
+	__eoi_ioapic_irq(irq, cfg);
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void ir_ack_apic_edge(unsigned int irq)
 {
 	ack_APIC_irq();
-- 
GitLab


From 39d83a5d684a457046aa2a6dac60f105966e78e9 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:29 -0700
Subject: [PATCH 0561/2198] x86: x2apic, IR: Clean up panic() with nox2apic
 boot option

Instead of panic() ignore the "nox2apic" boot option when BIOS
has already enabled x2apic prior to OS handover.

[ Impact: printk warning instead of panic() when BIOS has enabled x2apic already ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: dwmw2@infradead.org
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.425091000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 2b30e520dce37..d32f5589f1ddc 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -141,8 +141,12 @@ static int x2apic_preenabled;
 static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
-	if (x2apic_enabled())
-		panic("Bios already enabled x2apic, can't enforce nox2apic");
+	if (x2apic_enabled()) {
+		pr_warning("Bios already enabled x2apic, "
+			   "can't enforce nox2apic");
+		return 0;
+	}
+
 	disable_x2apic = 1;
 	setup_clear_cpu_cap(X86_FEATURE_X2APIC);
 	return 0;
-- 
GitLab


From 9d6c26e73bd248c286bb3597aaf788716e8fcceb Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:31 -0700
Subject: [PATCH 0562/2198] x86: x2apic, IR: Make config X86_UV dependent on
 X86_X2APIC

Instead of selecting X86_X2APIC, make config X86_UV dependent
on X86_X2APIC.

This will eliminate enabling CONFIG_X86_X2APIC with out
enabling CONFIG_INTR_REMAP.

[ Impact: cleanup ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Jack Steiner <steiner@sgi.com>
Cc: dwmw2@infradead.org
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Weidong Han <weidong.han@intel.com>
LKML-Reference: <20090420200450.694598000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c9086e6307a5e..58fb7b3bcd1a4 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -354,7 +354,7 @@ config X86_UV
 	depends on X86_64
 	depends on X86_EXTENDED_PLATFORM
 	depends on NUMA
-	select X86_X2APIC
+	depends on X86_X2APIC
 	---help---
 	  This option is needed in order to support SGI Ultraviolet systems.
 	  If you don't have one of these, you should say N here.
-- 
GitLab


From 89388913f2c88a2cd15d24abab571b17a2596127 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 21 Apr 2009 11:39:27 +0300
Subject: [PATCH 0563/2198] x86: unify noexec handling

This patch unifies noexec handling on 32-bit and 64-bit.

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
[ mingo@elte.hu: build fix ]
LKML-Reference: <1240303167.771.69.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/pgtable_types.h |  1 -
 arch/x86/mm/init.c                   | 67 ++++++++++++++++++++++++++--
 arch/x86/mm/init_32.c                | 52 ---------------------
 arch/x86/mm/init_64.c                | 33 --------------
 4 files changed, 63 insertions(+), 90 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b8238dc8786d9..4d258ad76a0fc 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
-extern void set_nx(void);
 
 #define pgprot_writecombine	pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fd3da1dda1c9e..fedde5359a049 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -22,6 +22,69 @@ int direct_gbpages
 #endif
 ;
 
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on      Enable
+ * off     Disable
+ */
+static int __init noexec_setup(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (!strncmp(str, "on", 2)) {
+		__supported_pte_mask |= _PAGE_NX;
+		disable_nx = 0;
+	} else if (!strncmp(str, "off", 3)) {
+		disable_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+	}
+	return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+static void __init set_nx(void)
+{
+	unsigned int v[4], l, h;
+
+	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+		if ((v[3] & (1 << 20)) && !disable_nx) {
+			rdmsr(MSR_EFER, l, h);
+			l |= EFER_NX;
+			wrmsr(MSR_EFER, l, h);
+			nx_enabled = 1;
+			__supported_pte_mask |= _PAGE_NX;
+		}
+	}
+}
+#else
+static inline void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+	unsigned long efer;
+
+	rdmsrl(MSR_EFER, efer);
+	if (!(efer & EFER_NX) || disable_nx)
+		__supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
 static void __init find_early_table_space(unsigned long end, int use_pse,
 					  int use_gbpages)
 {
@@ -158,12 +221,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	use_gbpages = direct_gbpages;
 #endif
 
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
 	set_nx();
 	if (nx_enabled)
 		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
 
 	/* Enable PSE if available */
 	if (cpu_has_pse)
@@ -174,7 +234,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 		set_in_cr4(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
-#endif
 
 	if (use_gbpages)
 		page_size_mask |= 1 << PG_LEVEL_1G;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 749559ed80f5d..2b27120665b32 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -587,61 +587,9 @@ void zap_low_mappings(void)
 	flush_tlb_all();
 }
 
-int nx_enabled;
-
 pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on      Enable
- * off     Disable
- */
-static int __init noexec_setup(char *str)
-{
-	if (!str || !strcmp(str, "on")) {
-		if (cpu_has_nx) {
-			__supported_pte_mask |= _PAGE_NX;
-			disable_nx = 0;
-		}
-	} else {
-		if (!strcmp(str, "off")) {
-			disable_nx = 1;
-			__supported_pte_mask &= ~_PAGE_NX;
-		} else {
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-early_param("noexec", noexec_setup);
-
-void __init set_nx(void)
-{
-	unsigned int v[4], l, h;
-
-	if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-		cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
-		if ((v[3] & (1 << 20)) && !disable_nx) {
-			rdmsr(MSR_EFER, l, h);
-			l |= EFER_NX;
-			wrmsr(MSR_EFER, l, h);
-			nx_enabled = 1;
-			__supported_pte_mask |= _PAGE_NX;
-		}
-	}
-}
-#endif
-
 /* user-defined highmem size */
 static unsigned int highmem_pages = -1;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1753e8020df6e..a4e7846efb1a0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -85,39 +85,6 @@ early_param("gbpages", parse_direct_gbpages_on);
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec=on|off
- * Control non-executable mappings for 64-bit processes.
- *
- * on	Enable (default)
- * off	Disable
- */
-static int __init nonx_setup(char *str)
-{
-	if (!str)
-		return -EINVAL;
-	if (!strncmp(str, "on", 2)) {
-		__supported_pte_mask |= _PAGE_NX;
-		disable_nx = 0;
-	} else if (!strncmp(str, "off", 3)) {
-		disable_nx = 1;
-		__supported_pte_mask &= ~_PAGE_NX;
-	}
-	return 0;
-}
-early_param("noexec", nonx_setup);
-
-void __cpuinit check_efer(void)
-{
-	unsigned long efer;
-
-	rdmsrl(MSR_EFER, efer);
-	if (!(efer & EFER_NX) || disable_nx)
-		__supported_pte_mask &= ~_PAGE_NX;
-}
-
 int force_personality32;
 
 /*
-- 
GitLab


From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 17:11:46 +0800
Subject: [PATCH 0564/2198] tracing/filters: don't remove old filters when
 failed to write subsys->filter

If writing subsys->filter returns EINVAL or ENOSPC, the original
filters in subsys/ and subsys/events/ will be removed. This is
definitely wrong.

[ Impact: fix filter setting semantics on error condition ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 672b195f86c9c..9ea55a7dfdec3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	err = filter_add_subsystem_pred(system, pred);
 	if (err < 0) {
-		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
 		return err;
 	}
-- 
GitLab


From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 21 Apr 2009 17:12:11 +0800
Subject: [PATCH 0565/2198] tracing/filters: allow user-input to be
 integer-like string

Suppose we would like to trace all tasks named '123', but this
will fail:

 # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter
 bash: echo: write error: Invalid argument

Don't guess the type of the filter pred in filter_parse(), but instead
we check it in __filter_add_pred().

[ Impact: extend allowed filter field string values ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 26 ++++++++++++--------------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e0fcfd2a16d69..65418288f957f 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
+	unsigned long long val;
 
 	field = find_event_field(call, pred->field_name);
 	if (!field)
@@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call,
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
-		if (!pred->str_len)
-			return -EINVAL;
 		fn = filter_pred_string;
 		pred->str_len = field->size;
 		return filter_add_pred_fn(call, pred, fn);
 	} else {
-		if (pred->str_len)
+		if (strict_strtoull(pred->str_val, 0, &val))
 			return -EINVAL;
+		pred->val = val;
 	}
 
 	switch (field->size) {
@@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 	return 0;
 }
 
+/*
+ * The filter format can be
+ *   - 0, which means remove all filter preds
+ *   - [||/&&] <field> ==/!= <val>
+ */
 int filter_parse(char **pbuf, struct filter_pred *pred)
 {
-	char *tmp, *tok, *val_str = NULL;
+	char *tok, *val_str = NULL;
 	int tok_n = 0;
 
-	/* field ==/!= number, or/and field ==/!= number, number */
 	while ((tok = strsep(pbuf, " \n"))) {
 		if (tok_n == 0) {
 			if (!strcmp(tok, "0")) {
@@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred)
 		return -EINVAL;
 	}
 
+	strcpy(pred->str_val, val_str);
+	pred->str_len = strlen(val_str);
+
 	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 	if (!pred->field_name)
 		return -ENOMEM;
 
-	pred->str_len = 0;
-	pred->val = simple_strtoull(val_str, &tmp, 0);
-	if (tmp == val_str) {
-		strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL);
-		pred->str_len = strlen(val_str);
-		pred->str_val[pred->str_len] = '\0';
-	} else if (*tmp != '\0')
-		return -EINVAL;
-
 	return 0;
 }
 
-- 
GitLab


From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 21 Apr 2009 09:41:26 -0400
Subject: [PATCH 0566/2198] ring-buffer: only warn on wrap if buffer is bigger
 than two pages

On boot up, to save memory, ftrace allocates the minimum buffer
which is two pages. Ftrace also goes through a series of tests
(when configured) on boot up. These tests can fill up a page within
a single interrupt.

The ring buffer also has a WARN_ON when it detects that the buffer was
completely filled within a single commit (other commits are allowed to
be nested).

Combine the small buffer on start up, with the tests that can fill more
than a single page within an interrupt, this can trigger the WARN_ON.

This patch makes the WARN_ON only happen when the ring buffer consists
of more than two pages.

[ Impact: prevent false WARN_ON in ftrace startup tests ]

Reported-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <20090421094616.GA14561@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7bcfd3e605375..61dbdf21cd32d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * about it.
 		 */
 		if (unlikely(next_page == commit_page)) {
-			WARN_ON_ONCE(1);
+			/* This can easily happen on small ring buffers */
+			WARN_ON_ONCE(buffer->pages > 2);
 			goto out_reset;
 		}
 
-- 
GitLab


From 19bdc9d061bcb71efd2b53083d96b59bbe1a1751 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 17 Apr 2009 05:26:31 +0000
Subject: [PATCH 0567/2198] clocksource: sh_cmt clocksource support

Add clocksource support to the sh_cmt driver. With this in
place we can do tickless with a single CMT channel.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_cmt.c | 66 ++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 02bae3994abe9..c24756489612e 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -47,6 +47,7 @@ struct sh_cmt_priv {
 	unsigned long rate;
 	spinlock_t lock;
 	struct clock_event_device ced;
+	struct clocksource cs;
 	unsigned long total_cycles;
 };
 
@@ -376,6 +377,68 @@ static void sh_cmt_stop(struct sh_cmt_priv *p, unsigned long flag)
 	spin_unlock_irqrestore(&p->lock, flags);
 }
 
+static struct sh_cmt_priv *cs_to_sh_cmt(struct clocksource *cs)
+{
+	return container_of(cs, struct sh_cmt_priv, cs);
+}
+
+static cycle_t sh_cmt_clocksource_read(struct clocksource *cs)
+{
+	struct sh_cmt_priv *p = cs_to_sh_cmt(cs);
+	unsigned long flags, raw;
+	unsigned long value;
+	int has_wrapped;
+
+	spin_lock_irqsave(&p->lock, flags);
+	value = p->total_cycles;
+	raw = sh_cmt_get_counter(p, &has_wrapped);
+
+	if (unlikely(has_wrapped))
+		raw = p->match_value;
+	spin_unlock_irqrestore(&p->lock, flags);
+
+	return value + raw;
+}
+
+static int sh_cmt_clocksource_enable(struct clocksource *cs)
+{
+	struct sh_cmt_priv *p = cs_to_sh_cmt(cs);
+	int ret;
+
+	p->total_cycles = 0;
+
+	ret = sh_cmt_start(p, FLAG_CLOCKSOURCE);
+	if (ret)
+		return ret;
+
+	/* TODO: calculate good shift from rate and counter bit width */
+	cs->shift = 0;
+	cs->mult = clocksource_hz2mult(p->rate, cs->shift);
+	return 0;
+}
+
+static void sh_cmt_clocksource_disable(struct clocksource *cs)
+{
+	sh_cmt_stop(cs_to_sh_cmt(cs), FLAG_CLOCKSOURCE);
+}
+
+static int sh_cmt_register_clocksource(struct sh_cmt_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clocksource *cs = &p->cs;
+
+	cs->name = name;
+	cs->rating = rating;
+	cs->read = sh_cmt_clocksource_read;
+	cs->enable = sh_cmt_clocksource_enable;
+	cs->disable = sh_cmt_clocksource_disable;
+	cs->mask = CLOCKSOURCE_MASK(sizeof(unsigned long) * 8);
+	cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	pr_info("sh_cmt: %s used as clock source\n", cs->name);
+	clocksource_register(cs);
+	return 0;
+}
+
 static struct sh_cmt_priv *ced_to_sh_cmt(struct clock_event_device *ced)
 {
 	return container_of(ced, struct sh_cmt_priv, ced);
@@ -483,6 +546,9 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name,
 	if (clockevent_rating)
 		sh_cmt_register_clockevent(p, name, clockevent_rating);
 
+	if (clocksource_rating)
+		sh_cmt_register_clocksource(p, name, clocksource_rating);
+
 	return 0;
 }
 
-- 
GitLab


From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 22 Apr 2009 16:53:34 +0800
Subject: [PATCH 0568/2198] tracing/events: make struct trace_entry->type to be
 int type

struct trace_entry->type is unsigned char, while trace event's id is
int type, thus for a event with id >= 256, it's entry->type is cast
to (id % 256), and then we can't see the trace output of this event.

 # insmod trace-events-sample.ko
 # echo foo_bar > /mnt/tracing/set_event
 # cat /debug/tracing/events/trace-events-sample/foo_bar/id
 256
 # cat /mnt/tracing/trace_pipe
           <...>-3548  [001]   215.091142: Unknown type 0
           <...>-3548  [001]   216.089207: Unknown type 0
           <...>-3548  [001]   217.087271: Unknown type 0
           <...>-3548  [001]   218.085332: Unknown type 0

[ Impact: fix output for trace events with id >= 256 ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h | 4 ++--
 include/trace/ftrace.h       | 2 +-
 kernel/trace/trace.c         | 4 ++--
 kernel/trace/trace.h         | 2 +-
 kernel/trace/trace_events.c  | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 75f3ac01a87c6..2a4a40749911a 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -16,7 +16,7 @@ struct dentry;
  *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
  */
 struct trace_entry {
-	unsigned char		type;
+	int			type;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
@@ -73,7 +73,7 @@ enum print_line_t {
 
 
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc);
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 39a3351f2e7ff..15ef08d9add1c 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -198,7 +198,7 @@ ftrace_define_fields_##call(void)					\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	int ret;							\
 									\
-	__common_field(unsigned char, type);				\
+	__common_field(int, type);					\
 	__common_field(unsigned char, flags);				\
 	__common_field(unsigned char, preempt_count);			\
 	__common_field(int, pid);					\
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b9a3adce9221a..b6183bc9ecae5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long flags, int pc)
 {
@@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 }
 
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
 				  unsigned long flags, int pc)
 {
 	return trace_buffer_lock_reserve(&global_trace,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 247948e81b08d..7d55bcf50e499 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 struct ring_buffer_event;
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-						    unsigned char type,
+						    int type,
 						    unsigned long len,
 						    unsigned long flags,
 						    int pc);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9ea55a7dfdec3..5d6e879cf8757 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
-				FIELD(unsigned char, type),
+				FIELD(int, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
-- 
GitLab


From ff166cb57a17124af75714a9c11f448f56f1a4a3 Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Mon, 20 Apr 2009 13:02:30 -0700
Subject: [PATCH 0569/2198] x86: x2apic, IR: remove
 reinit_intr_remapped_IO_APIC()

When interrupt-remapping is enabled, we are relying on
setup_IO_APIC_irqs() to configure remapped entries in the
IO-APIC, which comes little bit later after enabling
interrupt-remapping.

Meanwhile, restoration of old io-apic entries after enabling
interrupt-remapping will not make the interrupts through
io-apic functional anyway.

So remove the unnecessary reinit_intr_remapped_IO_APIC() step.

The longer story:

When interrupt-remapping is enabled, IO-APIC entries need to be
setup in the re-mappable format (pointing to
interrupt-remapping table entries setup by the OS). This
remapping configuration is happening in the same place where we
traditionally configure IO-APIC (i.e., in
setup_IO_APIC_irqs()).

So when we enable interrupt-remapping successfully, there is no
need to restore old io-apic RTE entries before we actually do a
complete configuration shortly in setup_IO_APIC_irqs(). Old
IO-APIC RTE's may be in traditional format (non re-mappable) or
in re-mappable format pointing to interrupt-remapping table
entries setup by BIOS. Restoring both of these will not make
IO-APIC functional. We have to rely on setup_IO_APIC_irqs() for
proper configuration by OS.

So I am removing this unnecessary and broken step.

[ Impact: remove unnecessary/broken IO-APIC setup step ]

Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Acked-by: Weidong Han <weidong.han@intel.com>
Cc: dwmw2@infradead.org
LKML-Reference: <20090420200450.552359000@linux-os.sc.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/io_apic.h |  2 --
 arch/x86/kernel/apic/apic.c    |  2 --
 arch/x86/kernel/apic/io_apic.c | 14 --------------
 3 files changed, 18 deletions(-)

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 34eaa37f7ad42..1cf145039ee2b 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -166,8 +166,6 @@ extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
 extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
-	struct IO_APIC_route_entry **ioapic_entries);
 
 extern void probe_nr_irqs_gsi(void);
 
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index d32f5589f1ddc..1386dbec5525f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1412,8 +1412,6 @@ void __init enable_IR_x2apic(void)
 		 * IR enabling failed
 		 */
 		restore_IO_APIC_setup(ioapic_entries);
-	else
-		reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
 
 	unmask_8259A();
 	local_irq_restore(flags);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 4baa9cbd630a3..8aef5f9d9479f 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -833,20 +833,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
 	return 0;
 }
 
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
-	struct IO_APIC_route_entry **ioapic_entries)
-
-{
-	/*
-	 * for now plain restore of previous settings.
-	 * TBD: In the case of OS enabling interrupt-remapping,
-	 * IO-APIC RTE's need to be setup to point to interrupt-remapping
-	 * table entries. for now, do a plain restore, and wait for
-	 * the setup_IO_APIC_irqs() to do proper initialization.
-	 */
-	restore_IO_APIC_setup(ioapic_entries);
-}
-
 void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
 {
 	int apic;
-- 
GitLab


From 9cbf117662e24c6d33245666804487f92c21b59d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 04:51:29 +0200
Subject: [PATCH 0570/2198] tracing/events: provide string with undefined size
 support

This patch provides the support for dynamic size strings on
event tracing.

The key concept is to use a structure with an ending char array field of
undefined size and use such ability to allocate the minimal size on the
ring buffer to make one or more string entries fit inside, as opposite
to a fixed length strings with upper bound.

The strings themselves are represented using fields which have an offset
value from the beginning of the entry.

This patch provides three new macros:

__string(item, src)

This one declares a string to the structure inside TP_STRUCT__entry.
You need to provide the name of the string field and the source that will
be copied inside.
This will also add the dynamic size of the string needed for the ring
buffer entry allocation.
A stack allocated structure is used to temporarily store the offset
of each strings, avoiding double calls to strlen() on each event
insertion.

__get_str(field)

This one will give you a pointer to the string you have created. This
is an abstract helper to resolve the absolute address given the field
name which is a relative address from the beginning of the trace_structure.

__assign_str(dst, src)

Use this macro to automatically perform the string copy from src to
dst. src must be a variable to assign and dst is the name of a __string
field.

Example on how to use it:

TRACE_EVENT(my_event,
	TP_PROTO(char *src1, char *src2),

	TP_ARGS(src1, src2),
	TP_STRUCT__entry(
		__string(str1, src1)
		__string(str2, src2)
	),
	TP_fast_assign(
		__assign_str(str1, src1);
		__assign_str(str2, src2);
	),
	TP_printk("%s %s", __get_str(src1), __get_str(src2))
)

Of course you can mix-up any __field or __array inside this
TRACE_EVENT. The position of the __string or __assign_str
doesn't matter.

Changes in v2:

Address the suggestion of Steven Rostedt: drop the opening_string() macro
and redefine __ending_string() to get the size of the string to be copied
instead of overwritting the whole ring buffer allocation.

Changes in v3:

Address other suggestions of Steven Rostedt and Peter Zijlstra with
some changes: drop the __ending_string and the need to have only one
string field.
Use offsets instead of absolute addresses.

[ Impact: allow more compact memory usage for string tracing ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 include/trace/ftrace.h | 88 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 15ef08d9add1c..5a7d18c436349 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -27,6 +27,9 @@
 #undef __field
 #define __field(type, item)		type	item;
 
+#undef __string
+#define __string(item, src)		int	__str_loc_##item;
+
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
 
@@ -35,14 +38,53 @@
 	struct ftrace_raw_##name {				\
 		struct trace_entry	ent;			\
 		tstruct						\
+		char			__str_data[0];		\
 	};							\
 	static struct ftrace_event_call event_##name
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+
 /*
  * Stage 2 of the trace events.
  *
+ * Include the following:
+ *
+ * struct ftrace_str_offsets_<call> {
+ *	int				<str1>;
+ *	int				<str2>;
+ *	[...]
+ * };
+ *
+ * The __string() macro will create each int <str>, this is to
+ * keep the offset of each string from the beggining of the event
+ * once we perform the strlen() of the src strings.
+ *
+ */
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __field
+#define __field(type, item);
+
+#undef __string
+#define __string(item, src)	int item;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+	struct ftrace_str_offsets_##call {				\
+		tstruct;						\
+	};
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 3 of the trace events.
+ *
  * Override the macros in <trace/trace_events.h> to include the following:
  *
  * enum print_line_t
@@ -80,6 +122,9 @@
 #undef TP_printk
 #define TP_printk(fmt, args...) fmt "\n", args
 
+#undef __get_str
+#define __get_str(field)	(char *)__entry + __entry->__str_loc_##field
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
@@ -146,6 +191,16 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	if (!ret)							\
 		return 0;
 
+#undef __string
+#define __string(item, src)						       \
+	ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t"	       \
+			       "offset:%u;tsize:%u;\n",			       \
+			       (unsigned int)offsetof(typeof(field),	       \
+					__str_loc_##item),		       \
+			       (unsigned int)sizeof(field.__str_loc_##item));  \
+	if (!ret)							       \
+		return 0;
+
 #undef __entry
 #define __entry REC
 
@@ -189,6 +244,12 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
+#undef __string
+#define __string(item, src)						       \
+	ret = trace_define_field(event_call, "__str_loc", #item,	       \
+				offsetof(typeof(field), __str_loc_##item),     \
+				sizeof(field.__str_loc_##item));
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 int									\
@@ -212,7 +273,7 @@ ftrace_define_fields_##call(void)					\
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
 /*
- * Stage 3 of the trace events.
+ * Stage 4 of the trace events.
  *
  * Override the macros in <trace/trace_events.h> to include the following:
  *
@@ -409,6 +470,23 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #undef __entry
 #define __entry entry
 
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __string
+#define __string(item, src)						       \
+	__str_offsets.item = __str_size +				       \
+			     offsetof(typeof(*entry), __str_data);	       \
+	__str_size += strlen(src) + 1;
+
+#undef __assign_str
+#define __assign_str(dst, src)						\
+	__entry->__str_loc_##dst = __str_offsets.dst;			\
+	strcpy(__get_str(dst), src);
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 _TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
@@ -417,18 +495,22 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
+	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;  \
 	struct ftrace_event_call *call = &event_##call;			\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
+	int __str_size = 0;						\
 	int pc;								\
 									\
 	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
+	tstruct;							\
+									\
 	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call),	\
-				  irq_flags, pc);			\
+				 sizeof(struct ftrace_raw_##call) + __str_size,\
+				 irq_flags, pc);			\
 	if (!event)							\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
-- 
GitLab


From 7e7ca9a22dbbc5c91763cd16923c7509918709b6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 19 Apr 2009 04:54:49 +0200
Subject: [PATCH 0571/2198] tracing/lock: provide lock_acquired event support
 for dynamic size string

Now that we can support the dynamic sized string, make the lock tracing
able to use it, making it safe against modules removal and consuming
the right amount of memory needed for each lock name

Changes in v2:
adapt to the __ending_string() updates and the opening_string() removal.

[ Impact: protect lock tracer against module removal ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/lockdep.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
index 45e326b5c7f3e..3ca315c1429da 100644
--- a/include/trace/events/lockdep.h
+++ b/include/trace/events/lockdep.h
@@ -38,16 +38,16 @@ TRACE_EVENT(lock_acquired,
 	TP_ARGS(lock, ip, waittime),
 
 	TP_STRUCT__entry(
-		__field(const char *, name)
+		__string(name, lock->name)
 		__field(unsigned long, wait_usec)
 		__field(unsigned long, wait_nsec_rem)
 	),
 	TP_fast_assign(
-		__entry->name = lock->name;
+		__assign_str(name, lock->name);
 		__entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
 		__entry->wait_usec = (unsigned long) waittime;
 	),
-	TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec,
+	TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
 				       __entry->wait_nsec_rem)
 );
 
-- 
GitLab


From 6a74aa40907757ec98d8710ff66cd4cfe064e7d8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 22 Apr 2009 00:41:09 +0200
Subject: [PATCH 0572/2198] tracing/events: protect __get_str()

The __get_str() macro is used in a code part then its content should be
protected with parenthesis.

[ Impact: make macro definition more robust ]

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/ftrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5a7d18c436349..a77f71a46dbec 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -123,7 +123,7 @@
 #define TP_printk(fmt, args...) fmt "\n", args
 
 #undef __get_str
-#define __get_str(field)	(char *)__entry + __entry->__str_loc_##field
+#define __get_str(field)	((char *)__entry + __entry->__str_loc_##field)
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-- 
GitLab


From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 10:25:24 -0400
Subject: [PATCH 0573/2198] tracing/wakeup: move access to wakeup_cpu into
 spinlock

The code had the following outside the lock:

        if (next != wakeup_task)
                return;

        pc = preempt_count();

        /* The task we are waiting for is waking up */
        data = wakeup_trace->data[wakeup_cpu];

On initialization, wakeup_task is NULL and wakeup_cpu -1. This code
is not under a lock. If wakeup_task is set on another CPU as that
task is waking up, we can see the wakeup_task before wakeup_cpu is
set. If we read wakeup_cpu while it is still -1 then we will have
a bad data pointer.

This patch moves the reading of wakeup_cpu within the protection of
the spinlock used to protect the writing of wakeup_cpu and wakeup_task.

[ Impact: remove possible race causing invalid pointer dereference ]

Reported-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_sched_wakeup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index b8b13c5540fdb..eacb272251736 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 
 	pc = preempt_count();
 
-	/* The task we are waiting for is waking up */
-	data = wakeup_trace->data[wakeup_cpu];
-
 	/* disable local data, not wakeup_cpu data */
 	cpu = raw_smp_processor_id();
 	disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
+	/* The task we are waiting for is waking up */
+	data = wakeup_trace->data[wakeup_cpu];
+
 	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
-- 
GitLab


From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 11:03:29 -0400
Subject: [PATCH 0574/2198] tracing: increase size of number of possible events

With the new event tracing registration, we must increase the number
of events that can be registered. Currently the type field is only
one byte, which leaves us only 256 possible events.

Since we do not save the CPU number in the tracer anymore (it is determined
by the per cpu ring buffer that is used) we have an extra byte to use.

This patch increases the size of type from 1 byte (256 events) to
2 bytes (65,536 events).

It also adds a WARN_ON_ONCE if we exceed that limit.

[ Impact: allow more than 255 events ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace_event.h | 5 ++++-
 kernel/trace/trace_events.c  | 2 +-
 kernel/trace/trace_output.c  | 2 ++
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 2a4a40749911a..07e0a6d64a249 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -16,13 +16,16 @@ struct dentry;
  *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
  */
 struct trace_entry {
-	int			type;
+	unsigned short		type;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
 	int			tgid;
 };
 
+#define FTRACE_MAX_EVENT						\
+	((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
+
 /*
  * Trace iterator - used by printout routines who present trace
  * results to users and which routines might sleep, etc:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5d6e879cf8757..9887131afa03a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s)
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
-				FIELD(int, type),
+				FIELD(unsigned short, type),
 				FIELD(unsigned char, flags),
 				FIELD(unsigned char, preempt_count),
 				FIELD(int, pid),
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 83a8abb9640f9..06997e75114b3 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event)
  out:
 	mutex_unlock(&trace_event_mutex);
 
+	WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
-- 
GitLab


From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Mar 2009 11:43:36 -0400
Subject: [PATCH 0575/2198] tracing: add size checks for exported ftrace
 internal structures

The events exported by TRACE_EVENT are automated and are guaranteed
to be correct when used.

The internal ftrace structures on the other hand are more manually
exported. These require the ftrace maintainer to make sure they
are up to date.

This patch adds a size check to help flag when a type changes in
an internal ftrace data structure, and the update needs to be reflected
in the export.

If a export is incorrect, then the only harm is that the user space
tools will not know how to correctly read the internal structures of
ftrace.

[ Impact: help prevent inconsistent ftrace format print outs ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 3 +++
 kernel/trace/trace_export.c | 4 ++++
 2 files changed, 7 insertions(+)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9887131afa03a..b92081588088d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+extern char *__bad_type_size(void);
+
 #undef FIELD
 #define FIELD(type, name)						\
+	sizeof(type) != sizeof(field.name) ? __bad_type_size() :	\
 	#type, "common_" #name, offsetof(typeof(field), name),		\
 		sizeof(field.name)
 
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 48fc02fe73a09..0cb1a142c74fd 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -19,8 +19,12 @@
 #undef TRACE_STRUCT
 #define TRACE_STRUCT(args...) args
 
+extern void __bad_type_size(void);
+
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)					\
+	if (sizeof(type) != sizeof(field.item))				\
+		__bad_type_size();					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
 			       "offset:%u;\tsize:%u;\n",		\
 			       (unsigned int)offsetof(typeof(field), item), \
-- 
GitLab


From d7285c6b5c54397fdf112c2fb98ee43193173aa9 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Thu, 23 Apr 2009 10:21:38 -0700
Subject: [PATCH 0576/2198] x86: use native register access for native tlb
 flushing

currently these are paravirtulaized, doesn't appear any callers rely on
this (no pv_ops backends are using native_tlb and overriding cr3/4
access).

[ Impact: fix lockdep warning with paravirt and function tracer ]

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
LKML-Reference: <20090423172138.GR3036@sequoia.sous-sol.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/include/asm/tlbflush.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index d3539f998f88c..e2927c5f45b18 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -17,7 +17,7 @@
 
 static inline void __native_flush_tlb(void)
 {
-	write_cr3(read_cr3());
+	native_write_cr3(native_read_cr3());
 }
 
 static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
 	 */
 	raw_local_irq_save(flags);
 
-	cr4 = read_cr4();
+	cr4 = native_read_cr4();
 	/* clear PGE */
-	write_cr4(cr4 & ~X86_CR4_PGE);
+	native_write_cr4(cr4 & ~X86_CR4_PGE);
 	/* write old PGE again and flush TLBs */
-	write_cr4(cr4);
+	native_write_cr4(cr4);
 
 	raw_local_irq_restore(flags);
 }
-- 
GitLab


From c2518c4366f087ebc10b3919cb2461bbe4f42d0c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 23 Apr 2009 23:26:18 -0400
Subject: [PATCH 0577/2198] tracing: fix cut and paste macro error

In case a module uses the TRACE_EVENT macro for creating automated
events in ftrace, it may choose to use a different file name
than the defined system name, or choose to use a different path than
the default "include/trace/events" include path.

If this is done, then before including trace/define_trace.h the
header would define either "TRACE_INCLUDE_FILE" for the file
name or "TRACE_INCLUDE_PATH" for the include path.

If it does not define these, then the define_trace.h defines them
instead. If define trace defines them, then define_trace.h should
also undefine them before exiting. To do this a macro is used
to note this:

 #ifndef TRACE_INCLUDE_FILE
 # define TRACE_INCLUDE_FILE TRACE_SYSTEM
 # define UNDEF_TRACE_INCLUDE_FILE
 #endif

[...]

 #ifdef UNDEF_TRACE_INCLUDE_FILE
 # undef TRACE_INCLUDE_FILE
 # undef UNDEF_TRACE_INCLUDE_FILE
 #endif

The UNDEF_TRACE_INCLUDE_FILE acts as a CPP variable to know to undef
the TRACE_INCLUDE_FILE before leaving define_trace.h.

Unfortunately, due to cut and paste errors, the macros between
FILE and PATH got mixed up.

[ Impact: undef TRACE_INCLUDE_FILE and/or TRACE_INCLUDE_PATH when needed ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/define_trace.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index 7f1f23d601ecb..abc611feeb8cb 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -44,7 +44,7 @@
 
 #ifndef TRACE_INCLUDE_PATH
 # define __TRACE_INCLUDE(system) <trace/events/system.h>
-# define UNDEF_TRACE_INCLUDE_FILE
+# define UNDEF_TRACE_INCLUDE_PATH
 #else
 # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
 #endif
@@ -64,13 +64,13 @@
 
 /* Only undef what we defined in this file */
 #ifdef UNDEF_TRACE_INCLUDE_FILE
-# undef TRACE_INCLUDE_PATH
+# undef TRACE_INCLUDE_FILE
 # undef UNDEF_TRACE_INCLUDE_FILE
 #endif
 
-#ifdef UNDEF_TRACE_INCLUDE_FILE
+#ifdef UNDEF_TRACE_INCLUDE_PATH
 # undef TRACE_INCLUDE_PATH
-# undef UNDEF_TRACE_INCLUDE_FILE
+# undef UNDEF_TRACE_INCLUDE_PATH
 #endif
 
 /* We may be processing more files */
-- 
GitLab


From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 24 Apr 2009 11:27:05 +0800
Subject: [PATCH 0578/2198] ring_buffer: compressed event header

RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes
an 'u32' to save the actually length for events which data size > 28.

This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA.

[ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <49F13189.3090000@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 16 +++----
 kernel/trace/ring_buffer.c  | 83 ++++++++++++++++++-------------------
 2 files changed, 50 insertions(+), 49 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index fac8f1ac6f493..1c2f80911fbe2 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -11,7 +11,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-	u32		type:2, len:3, time_delta:27;
+	u32		type_len:5, time_delta:27;
 	u32		array[];
 };
 
@@ -24,7 +24,8 @@ struct ring_buffer_event {
  *				  size is variable depending on how much
  *				  padding is needed
  *				 If time_delta is non zero:
- *				  everything else same as RINGBUF_TYPE_DATA
+ *				  array[0] holds the actual length
+ *				  size = 4 + length (bytes)
  *
  * @RINGBUF_TYPE_TIME_EXTEND:	Extend the time delta
  *				 array[0] = time delta (28 .. 59)
@@ -35,22 +36,23 @@ struct ring_buffer_event {
  *				 array[1..2] = tv_sec
  *				 size = 16 bytes
  *
- * @RINGBUF_TYPE_DATA:		Data record
- *				 If len is zero:
+ * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
+ *				Data record
+ *				 If type_len is zero:
  *				  array[0] holds the actual length
  *				  array[1..(length+3)/4] holds data
- *				  size = 4 + 4 + length (bytes)
+ *				  size = 4 + length (bytes)
  *				 else
- *				  length = len << 2
+ *				  length = type_len << 2
  *				  array[0..(length+3)/4-1] holds data
  *				  size = 4 + length (bytes)
  */
 enum ring_buffer_type {
+	RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
 	RINGBUF_TYPE_PADDING,
 	RINGBUF_TYPE_TIME_EXTEND,
 	/* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
 	RINGBUF_TYPE_TIME_STAMP,
-	RINGBUF_TYPE_DATA,
 };
 
 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 61dbdf21cd32d..9692f100ec1a9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 {
 	int ret;
 
-	ret = trace_seq_printf(s, "\ttype        :    2 bits\n");
-	ret = trace_seq_printf(s, "\tlen         :    3 bits\n");
+	ret = trace_seq_printf(s, "# compressed entry header\n");
+	ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n");
 	ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
 	ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
 	ret = trace_seq_printf(s, "\n");
@@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s)
 			       RINGBUF_TYPE_PADDING);
 	ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
 			       RINGBUF_TYPE_TIME_EXTEND);
-	ret = trace_seq_printf(s, "\tdata        : type == %d\n",
-			       RINGBUF_TYPE_DATA);
+	ret = trace_seq_printf(s, "\tdata max type_len  == %d\n",
+			       RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 
 	return ret;
 }
@@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
-#define RB_MAX_SMALL_DATA	28
+#define RB_MAX_SMALL_DATA	(RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 
 enum {
 	RB_LEN_TIME_EXTEND = 8,
@@ -213,17 +216,18 @@ enum {
 
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
-	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+	return event->type_len == RINGBUF_TYPE_PADDING
+			&& event->time_delta == 0;
 }
 
 static inline int rb_discarded_event(struct ring_buffer_event *event)
 {
-	return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+	return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
 }
 
 static void rb_event_set_padding(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
+	event->type_len = RINGBUF_TYPE_PADDING;
 	event->time_delta = 0;
 }
 
@@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event)
 {
 	unsigned length;
 
-	if (event->len)
-		length = event->len * RB_ALIGNMENT;
+	if (event->type_len)
+		length = event->type_len * RB_ALIGNMENT;
 	else
 		length = event->array[0];
 	return length + RB_EVNT_HDR_SIZE;
@@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event))
 			/* undefined */
 			return -1;
-		return rb_event_data_length(event);
+		return  event->array[0] + RB_EVNT_HDR_SIZE;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		return RB_LEN_TIME_EXTEND;
@@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
 	unsigned length = rb_event_length(event);
-	if (event->type != RINGBUF_TYPE_DATA)
+	if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 		return length;
 	length -= RB_EVNT_HDR_SIZE;
 	if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
-	BUG_ON(event->type != RINGBUF_TYPE_DATA);
+	BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
 	/* If length is in len field, then array[0] has the data */
-	if (event->len)
+	if (event->type_len)
 		return (void *)&event->array[0];
 	/* Otherwise length is in array[0] and array[1] has the data */
 	return (void *)&event->array[1];
@@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
-		if (event->type != RINGBUF_TYPE_DATA)
+		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->overrun++;
 		cpu_buffer->entries--;
@@ -1133,28 +1137,21 @@ static void
 rb_update_event(struct ring_buffer_event *event,
 			 unsigned type, unsigned length)
 {
-	event->type = type;
+	event->type_len = type;
 
 	switch (type) {
 
 	case RINGBUF_TYPE_PADDING:
-		break;
-
 	case RINGBUF_TYPE_TIME_EXTEND:
-		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
-		break;
-
 	case RINGBUF_TYPE_TIME_STAMP:
-		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
 		break;
 
-	case RINGBUF_TYPE_DATA:
+	case 0:
 		length -= RB_EVNT_HDR_SIZE;
-		if (length > RB_MAX_SMALL_DATA) {
-			event->len = 0;
+		if (length > RB_MAX_SMALL_DATA)
 			event->array[0] = length;
-		} else
-			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+		else
+			event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 		break;
 	default:
 		BUG();
@@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
-	event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+	event = rb_reserve_next_event(cpu_buffer, 0, length);
 	if (!event)
 		goto out;
 
@@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
 static inline void rb_event_discard(struct ring_buffer_event *event)
 {
-	event->type = RINGBUF_TYPE_PADDING;
+	/* array[0] holds the actual length for the discarded event */
+	event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+	event->type_len = RINGBUF_TYPE_PADDING;
 	/* time delta must be non zero */
 	if (!event->time_delta)
 		event->time_delta = 1;
@@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer,
-				      RINGBUF_TYPE_DATA, event_length);
+	event = rb_reserve_next_event(cpu_buffer, 0, event_length);
 	if (!event)
 		goto out;
 
@@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 {
 	u64 delta;
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		return;
 
@@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 {
 	u64 delta;
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		return;
 
@@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	event = rb_reader_event(cpu_buffer);
 
-	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
+	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+			|| rb_discarded_event(event))
 		cpu_buffer->entries--;
 
 	rb_update_read_stamp(cpu_buffer, event);
@@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	event = rb_reader_event(cpu_buffer);
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event))
 			RB_WARN_ON(cpu_buffer, 1);
@@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	event = rb_iter_head_event(iter);
 
-	switch (event->type) {
+	switch (event->type_len) {
 	case RINGBUF_TYPE_PADDING:
 		if (rb_null_event(event)) {
 			rb_inc_iter(iter);
@@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	event = rb_iter_peek(iter, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
 	preempt_enable();
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-	if (event && event->type == RINGBUF_TYPE_PADDING) {
+	if (event && event->type_len == RINGBUF_TYPE_PADDING) {
 		cpu_relax();
 		goto again;
 	}
@@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
 			return;
 		/* Only count data entries */
-		if (event->type != RINGBUF_TYPE_DATA)
+		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->entries--;
 	}
-- 
GitLab


From 3e98f9f15e916c48dfc5231d7e6a59be7f122764 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 24 Apr 2009 15:39:39 +0900
Subject: [PATCH 0579/2198] sh: pci: Fix up the build for CONFIG_PCI=n.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/pci.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index f910121559b01..5b2e0fcdfc22b 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -88,6 +88,7 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 #define pci_unmap_len_set(PTR, LEN_NAME, VAL)	do { } while (0)
 #endif
 
+#ifdef CONFIG_PCI
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
@@ -95,6 +96,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 	*strat = PCI_DMA_BURST_INFINITY;
 	*strategy_parameter = ~0UL;
 }
+#endif
 
 #ifdef CONFIG_SUPERH32
 /*
-- 
GitLab


From 782cc5ae6331d63b4febaa312c9d14493aafa9b8 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 24 Apr 2009 09:43:09 +0200
Subject: [PATCH 0580/2198] x86, ds: fix buffer alignment in debug store
 selftest

The debug store selftest code uses a stack-allocated buffer, which is
not necessarily correctly aligned.

For tests using a buffer to hold a single entry, the buffer that is
passed to ds_request must already be suitably aligned.

Pass a suitably aligned portion of the bigger buffer.

[ Impact: fix hw-branch-tracer self-test failure ]

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Cc: markus.t.metzger@gmail.com
LKML-Reference: <20090424094309.A30145@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ds_selftest.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
index 5f104a0ace666..6bc7c199ab99e 100644
--- a/arch/x86/kernel/ds_selftest.c
+++ b/arch/x86/kernel/ds_selftest.c
@@ -323,13 +323,15 @@ static int ds_selftest_bts_bad_request_task(void *buffer)
 int ds_selftest_bts(void)
 {
 	struct ds_selftest_bts_conf conf;
-	unsigned char buffer[BUFFER_SIZE];
+	unsigned char buffer[BUFFER_SIZE], *small_buffer;
 	unsigned long irq;
 	int cpu;
 
 	printk(KERN_INFO "[ds] bts selftest...");
 	conf.error = 0;
 
+	small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		conf.suspend = ds_suspend_bts_wrap;
@@ -381,7 +383,7 @@ int ds_selftest_bts(void)
 	conf.suspend = ds_suspend_bts_noirq;
 	conf.resume = ds_resume_bts_noirq;
 	conf.tracer =
-		ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE,
+		ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
 				   NULL, (size_t)-1, BTS_KERNEL);
 	local_irq_save(irq);
 	ds_selftest_bts_cpu(&conf);
-- 
GitLab


From 7e0bfad24d85de7cf2202a7b0ce51de11a077b21 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 24 Apr 2009 09:44:48 +0200
Subject: [PATCH 0581/2198] x86, bts: reenable ptrace branch trace support

The races found by Oleg Nesterov have been fixed.

Reenable branch trace support.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
LKML-Reference: <20090424094448.A30216@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 8130334329c06..924e156a85abd 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,7 +506,6 @@ config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	default y
 	depends on X86_DEBUGCTLMSR
-	depends on BROKEN
 	---help---
 	  This adds a ptrace interface to the hardware's branch trace store.
 
-- 
GitLab


From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Fri, 24 Apr 2009 09:51:43 +0200
Subject: [PATCH 0582/2198] x86, bts, mm: clean up buffer allocation

The current mm interface is asymetric. One function allocates a locked
buffer, another function only refunds the memory.

Change this to have two functions for accounting and refunding locked
memory, respectively; and do the actual buffer allocation in ptrace.

[ Impact: refactor BTS buffer allocation code ]

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ptrace.c | 39 ++++++++++++++++++++++++++-------------
 include/linux/mm.h       |  6 ++++--
 mm/mlock.c               | 36 +++++++++++++++++-------------------
 3 files changed, 47 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index d5252ae6c5205..09ecbde91c135 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -617,17 +617,28 @@ struct bts_context {
 	struct work_struct	work;
 };
 
-static inline void alloc_bts_buffer(struct bts_context *context,
-				    unsigned int size)
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
 {
-	void *buffer;
+	void *buffer = NULL;
+	int err = -ENOMEM;
 
-	buffer = alloc_locked_buffer(size);
-	if (buffer) {
-		context->buffer = buffer;
-		context->size = size;
-		context->mm = get_task_mm(current);
-	}
+	err = account_locked_memory(current->mm, current->signal->rlim, size);
+	if (err < 0)
+		return err;
+
+	buffer = kzalloc(size, GFP_KERNEL);
+	if (!buffer)
+		goto out_refund;
+
+	context->buffer = buffer;
+	context->size = size;
+	context->mm = get_task_mm(current);
+
+	return 0;
+
+ out_refund:
+	refund_locked_memory(current->mm, size);
+	return err;
 }
 
 static inline void free_bts_buffer(struct bts_context *context)
@@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context)
 	kfree(context->buffer);
 	context->buffer = NULL;
 
-	refund_locked_buffer_memory(context->mm, context->size);
+	refund_locked_memory(context->mm, context->size);
 	context->size = 0;
 
 	mmput(context->mm);
@@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child,
 	context->tracer = NULL;
 
 	if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+		int err;
+
 		free_bts_buffer(context);
 		if (!cfg.size)
 			return 0;
 
-		alloc_bts_buffer(context, cfg.size);
-		if (!context->buffer)
-			return -ENOMEM;
+		err = alloc_bts_buffer(context, cfg.size);
+		if (err < 0)
+			return err;
 	}
 
 	if (cfg.flags & PTRACE_BTS_O_TRACE)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a3963ba23a6d6..009eabd3c21ca 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -19,6 +19,7 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -1319,7 +1320,8 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern void *alloc_locked_buffer(size_t size);
-extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size);
+extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+				 size_t size);
+extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/mm/mlock.c b/mm/mlock.c
index 28be15ead9c13..ac130433c7d35 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -629,38 +629,36 @@ void user_shm_unlock(size_t size, struct user_struct *user)
 	free_uid(user);
 }
 
-void *alloc_locked_buffer(size_t size)
+int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+			  size_t size)
 {
-	unsigned long rlim, vm, pgsz;
-	void *buffer = NULL;
+	unsigned long lim, vm, pgsz;
+	int error = -ENOMEM;
 
 	pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-	down_write(&current->mm->mmap_sem);
-
-	rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->total_vm + pgsz;
-	if (rlim < vm)
-		goto out;
+	down_write(&mm->mmap_sem);
 
-	rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-	vm   = current->mm->locked_vm + pgsz;
-	if (rlim < vm)
+	lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+	vm   = mm->total_vm + pgsz;
+	if (lim < vm)
 		goto out;
 
-	buffer = kzalloc(size, GFP_KERNEL);
-	if (!buffer)
+	lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+	vm   = mm->locked_vm + pgsz;
+	if (lim < vm)
 		goto out;
 
-	current->mm->total_vm  += pgsz;
-	current->mm->locked_vm += pgsz;
+	mm->total_vm  += pgsz;
+	mm->locked_vm += pgsz;
 
+	error = 0;
  out:
-	up_write(&current->mm->mmap_sem);
-	return buffer;
+	up_write(&mm->mmap_sem);
+	return error;
 }
 
-void refund_locked_buffer_memory(struct mm_struct *mm, size_t size)
+void refund_locked_memory(struct mm_struct *mm, size_t size)
 {
 	unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-- 
GitLab


From 39517091f88fae32b52254b561ced78da1eaf0a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 11:05:52 -0400
Subject: [PATCH 0583/2198] tracing/lockdep: convert lockdep to use TRACE_EVENT
 macro

The TRACE_FORMAT will soon be deprecated. This patch converts it to
the TRACE_EVENT macro.

Note, this change should also speed up the tracing.

[ Impact: remove a user of deprecated TRACE_FORMAT ]

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/lockdep.h | 56 ++++++++++++++++++++++++++++------
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
index 3ca315c1429da..0e956c9dfd7ee 100644
--- a/include/trace/events/lockdep.h
+++ b/include/trace/events/lockdep.h
@@ -9,28 +9,64 @@
 
 #ifdef CONFIG_LOCKDEP
 
-TRACE_FORMAT(lock_acquire,
+TRACE_EVENT(lock_acquire,
+
 	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
 		int trylock, int read, int check,
 		struct lockdep_map *next_lock, unsigned long ip),
+
 	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TP_FMT("%s%s%s", trylock ? "try " : "",
-		read ? "read " : "", lock->name)
-	);
 
-TRACE_FORMAT(lock_release,
+	TP_STRUCT__entry(
+		__field(unsigned int, flags)
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
+		  (__entry->flags & 2) ? "read " : "",
+		  __get_str(name))
+);
+
+TRACE_EVENT(lock_release,
+
 	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+
 	TP_ARGS(lock, nested, ip),
-	TP_FMT("%s", lock->name)
-	);
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
 
 #ifdef CONFIG_LOCK_STAT
 
-TRACE_FORMAT(lock_contended,
+TRACE_EVENT(lock_contended,
+
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
 	TP_ARGS(lock, ip),
-	TP_FMT("%s", lock->name)
-	);
+
+	TP_STRUCT__entry(
+		__string(name, lock->name)
+	),
+
+	TP_fast_assign(
+		__assign_str(name, lock->name);
+	),
+
+	TP_printk("%s", __get_str(name))
+);
 
 TRACE_EVENT(lock_acquired,
 	TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
-- 
GitLab


From 160031b556e93590fa8635210d73d93c3d3853a9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 11:26:55 -0400
Subject: [PATCH 0584/2198] tracing/irq: convert irq traces to use TRACE_EVENT
 macro

The TRACE_FORMAT will soon be deprecated. This patch converts it to
the TRACE_EVENT macro.

Note, this change should also speed up the tracing.

[ Impact: remove a user of deprecated TRACE_FORMAT ]

Cc: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/irq.h | 57 ++++++++++++++++++++++++++++++++------
 1 file changed, 48 insertions(+), 9 deletions(-)

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 75e3468e44933..7686864675185 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -10,11 +10,24 @@
 /*
  * Tracepoint for entry of interrupt handler:
  */
-TRACE_FORMAT(irq_handler_entry,
+TRACE_EVENT(irq_handler_entry,
+
 	TP_PROTO(int irq, struct irqaction *action),
+
 	TP_ARGS(irq, action),
-	TP_FMT("irq=%d handler=%s", irq, action->name)
-	);
+
+	TP_STRUCT__entry(
+		__field(	int,	irq		)
+		__string(	name,	action->name	)
+	),
+
+	TP_fast_assign(
+		__entry->irq = irq;
+		__assign_str(name, action->name);
+	),
+
+	TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
+);
 
 /*
  * Tracepoint for return of an interrupt handler:
@@ -39,17 +52,43 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
-TRACE_FORMAT(softirq_entry,
+TRACE_EVENT(softirq_entry,
+
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
 	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
 
-TRACE_FORMAT(softirq_exit,
+	TP_STRUCT__entry(
+		__field(	int,	vec			)
+		__string(	name,	softirq_to_name[h-vec]	)
+	),
+
+	TP_fast_assign(
+		__entry->vec = (int)(h - vec);
+		__assign_str(name, softirq_to_name[h-vec]);
+	),
+
+	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+);
+
+TRACE_EVENT(softirq_exit,
+
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
 	TP_ARGS(h, vec),
-	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-	);
+
+	TP_STRUCT__entry(
+		__field(	int,	vec			)
+		__string(	name,	softirq_to_name[h-vec]	)
+	),
+
+	TP_fast_assign(
+		__entry->vec = (int)(h - vec);
+		__assign_str(name, softirq_to_name[h-vec]);
+	),
+
+	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+);
 
 #endif /*  _TRACE_IRQ_H */
 
-- 
GitLab


From b8e65554d80b4c560d201362d0e8fa02109d89fd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 11:50:39 -0400
Subject: [PATCH 0585/2198] tracing: remove deprecated TRACE_FORMAT

The TRACE_FORMAT macro has been deprecated by the TRACE_EVENT macro.
There are no more users. All new users must use the TRACE_EVENT macro.

[ Impact: remove old functionality ]

Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/tracepoint.h   |  5 ---
 include/trace/define_trace.h |  4 ---
 include/trace/ftrace.h       | 66 ------------------------------------
 3 files changed, 75 deletions(-)

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 4353f3f7e624a..14df7e635d439 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -158,11 +158,6 @@ static inline void tracepoint_synchronize_unregister(void)
 
 #define PARAMS(args...) args
 
-#ifndef TRACE_FORMAT
-#define TRACE_FORMAT(name, proto, args, fmt)		\
-	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-#endif
-
 #ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
index abc611feeb8cb..f7a7ae1e8f90f 100644
--- a/include/trace/define_trace.h
+++ b/include/trace/define_trace.h
@@ -26,10 +26,6 @@
 #define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
 	DEFINE_TRACE(name)
 
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(name, proto, args, print)	\
-	DEFINE_TRACE(name)
-
 #undef DECLARE_TRACE
 #define DECLARE_TRACE(name, proto, args)	\
 	DEFINE_TRACE(name)
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index a77f71a46dbec..1e681142f1dad 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -18,9 +18,6 @@
 
 #include <linux/ftrace_event.h>
 
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
 #undef __array
 #define __array(type, item, len)	type	item[len];
 
@@ -62,9 +59,6 @@
  *
  */
 
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
 #undef __array
 #define __array(type, item, len)
 
@@ -298,16 +292,6 @@ ftrace_define_fields_##call(void)					\
  *	unregister_trace_<call>(ftrace_event_<call>);
  * }
  *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *	.name			= "<call>",
- *	.regfunc		= ftrace_reg_event_<call>,
- *	.unregfunc		= ftrace_unreg_event_<call>,
- * }
- *
  *
  * For those macros defined with TRACE_EVENT:
  *
@@ -417,56 +401,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
 #define _TRACE_PROFILE_INIT(call)
 #endif
 
-#define _TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, #call ": " fmt);			\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call "\n");			\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call event_##call;				\
-									\
-static int ftrace_init_event_##call(void)				\
-{									\
-	int id;								\
-									\
-	id = register_ftrace_event(NULL);				\
-	if (!id)							\
-		return -ENODEV;						\
-	event_##call.id = id;						\
-	return 0;							\
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name			= #call,				\
-	.system			= __stringify(TRACE_SYSTEM),		\
-	.raw_init		= ftrace_init_event_##call,		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-	_TRACE_PROFILE_INIT(call)					\
-}
-
 #undef __entry
 #define __entry entry
 
-- 
GitLab


From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 12:20:52 -0400
Subject: [PATCH 0586/2198] tracing/events: reuse trace event ids after
 overflow

With modules being able to add trace events, and the max trace event
counter is 16 bits (65536) we can overflow the counter easily
with a simple while loop adding and removing modules that contain
trace events.

This patch links together the registered trace events and on overflow
searches for available trace event ids. It will still fail if
over 65536 events are registered, but considering that a typical
kernel only has 22000 functions, 65000 events should be sufficient.

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  1 +
 kernel/trace/trace_output.c  | 71 ++++++++++++++++++++++++++++++------
 2 files changed, 61 insertions(+), 11 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 07e0a6d64a249..78a9ba24cbf67 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -56,6 +56,7 @@ typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
 					      int flags);
 struct trace_event {
 	struct hlist_node	node;
+	struct list_head	list;
 	int			type;
 	trace_print_func	trace;
 	trace_print_func	raw;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 06997e75114b3..5fc51f0f75fc0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type)
 	return NULL;
 }
 
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+	struct trace_event *e;
+	int last = __TRACE_LAST_TYPE;
+
+	if (list_empty(&ftrace_event_list)) {
+		*list = &ftrace_event_list;
+		return last + 1;
+	}
+
+	/*
+	 * We used up all possible max events,
+	 * lets see if somebody freed one.
+	 */
+	list_for_each_entry(e, &ftrace_event_list, list) {
+		if (e->type != last + 1)
+			break;
+		last++;
+	}
+
+	/* Did we used up all 65 thousand events??? */
+	if ((last + 1) > FTRACE_MAX_EVENT)
+		return 0;
+
+	*list = &e->list;
+	return last + 1;
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event)
 
 	mutex_lock(&trace_event_mutex);
 
-	if (!event) {
-		ret = next_event_type++;
+	if (WARN_ON(!event))
 		goto out;
-	}
 
-	if (!event->type)
-		event->type = next_event_type++;
-	else if (event->type > __TRACE_LAST_TYPE) {
+	INIT_LIST_HEAD(&event->list);
+
+	if (!event->type) {
+		struct list_head *list;
+
+		if (next_event_type > FTRACE_MAX_EVENT) {
+
+			event->type = trace_search_list(&list);
+			if (!event->type)
+				goto out;
+
+		} else {
+			
+			event->type = next_event_type++;
+			list = &ftrace_event_list;
+		}
+
+		if (WARN_ON(ftrace_find_event(event->type)))
+			goto out;
+
+		list_add_tail(&event->list, list);
+
+	} else if (event->type > __TRACE_LAST_TYPE) {
 		printk(KERN_WARNING "Need to add type to trace.h\n");
 		WARN_ON(1);
-	}
-
-	if (ftrace_find_event(event->type))
 		goto out;
+	} else {
+		/* Is this event already used */
+		if (ftrace_find_event(event->type))
+			goto out;
+	}
 
 	if (event->trace == NULL)
 		event->trace = trace_nop_print;
@@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event)
  out:
 	mutex_unlock(&trace_event_mutex);
 
-	WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
@@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event)
 {
 	mutex_lock(&trace_event_mutex);
 	hlist_del(&event->node);
+	list_del(&event->list);
 	mutex_unlock(&trace_event_mutex);
 
 	return 0;
-- 
GitLab


From 79ffab34391933ee3b95dac7f25c0478fa2f8f1e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 May 2009 15:13:42 -0400
Subject: [PATCH 0587/2198] ext4: Properly initialize the buffer_head state

These struct buffer_heads are allocated on the stack (and hence are
initialized with stack garbage).  They are only used to call a
get_blocks() function, so that's mostly OK, but b_state must be
initialized to be 0 so we don't have any unexpected BH_* flags set by
accident, such as BH_Unwritten or BH_Delay.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c |  1 +
 fs/ext4/inode.c   | 15 ++++++++++++++-
 fs/mpage.c        |  6 ++++--
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e3a55eb8b26a5..a953214f2829c 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3150,6 +3150,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 			ret = PTR_ERR(handle);
 			break;
 		}
+		map_bh.b_state = 0;
 		ret = ext4_get_blocks_wrap(handle, inode, block,
 					  max_blocks, &map_bh,
 					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2a9ffd528dd12..d7ad0bb73cd5b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2055,7 +2055,20 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if ((mpd->b_state  & (1 << BH_Mapped)) &&
 	    !(mpd->b_state & (1 << BH_Delay)))
 		return 0;
-	new.b_state = mpd->b_state;
+	/*
+	 * We need to make sure the BH_Delay flag is passed down to
+	 * ext4_da_get_block_write(), since it calls
+	 * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag.
+	 * This flag causes ext4_get_blocks_wrap() to call
+	 * ext4_da_update_reserve_space() if the passed buffer head
+	 * has the BH_Delay flag set.  In the future, once we clean up
+	 * the interfaces to ext4_get_blocks_wrap(), we should pass in
+	 * a separate flag which requests that the delayed allocation
+	 * statistics should be updated, instead of depending on the
+	 * state information getting passed down via the map_bh's
+	 * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag.
+	 */
+	new.b_state = mpd->b_state & (1 << BH_Delay);
 	new.b_blocknr = 0;
 	new.b_size = mpd->b_size;
 	next = mpd->b_blocknr;
diff --git a/fs/mpage.c b/fs/mpage.c
index 680ba60863ffb..42381bd6543b1 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
 		struct page *page = list_entry(pages->prev, struct page, lru);
 
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
 	struct buffer_head map_bh;
 	unsigned long first_logical_block = 0;
 
-	clear_buffer_mapped(&map_bh);
+	map_bh.b_state = 0;
+	map_bh.b_size = 0;
 	bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
 			&map_bh, &first_logical_block, get_block);
 	if (bio)
-- 
GitLab


From 8fb0e342481c4d80040670fec915f0b9c7c6499a Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 12 May 2009 16:22:37 -0400
Subject: [PATCH 0588/2198] vfs: Add BUG_ON for delayed and unwritten flags in
 submit_bh()

The BH_Delay and BH_Unwritten flags should never leak out to
submit_bh().  So add some BUG_ON() checks to submit_bh so we can get a
stack trace and determine how and why this might have happened.

(Note that only XFS and ext4 use these buffer head flags, and XFS does
not use submit_bh().  So this patch should only modify behavior for
ext4.)

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org
---
 fs/buffer.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/buffer.c b/fs/buffer.c
index aed297739eb07..ad0112900222c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2933,6 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh)
 	BUG_ON(!buffer_locked(bh));
 	BUG_ON(!buffer_mapped(bh));
 	BUG_ON(!bh->b_end_io);
+	BUG_ON(buffer_delay(bh));
+	BUG_ON(buffer_unwritten(bh));
 
 	/*
 	 * Mask in barrier bit for a write (could be either a WRITE or a
-- 
GitLab


From 29fa89d088941d79765d60f22d5ccdd6b8696e11 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Tue, 12 May 2009 16:30:27 -0400
Subject: [PATCH 0589/2198] ext4: Mark the unwritten buffer_head as mapped
 during write_begin

Setting BH_Unwritten buffer_heads as BH_Mapped avoids multiple
(unnecessary) calls to get_block() during the call to the write(2)
system call.  Setting BH_Unwritten buffer heads as BH_Mapped requires
that the writepages() functions can handle BH_Unwritten buffer_heads.

After this commit, things work as follows:

ext4_ext_get_block() returns unmapped, unwritten, buffer head when
called with create = 0 for prealloc space. This makes sure we handle
the read path and non-delayed allocation case correctly.  Even though
the buffer head is marked unmapped we have valid b_blocknr and b_bdev
values in the buffer_head.

ext4_da_get_block_prep() called for block resrevation will now return
mapped, unwritten, new buffer_head for prealloc space. This avoids
multiple calls to get_block() for write to same offset. By making such
buffers as BH_New, we also assure that sub-block zeroing of buffered
writes happens correctly.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c |  4 +--
 fs/ext4/inode.c   | 82 ++++++++++++++++++++++++++++++-----------------
 2 files changed, 54 insertions(+), 32 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index a953214f2829c..ea5c47608cea7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2872,6 +2872,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
 				goto out;
 			if (!create) {
+				if (allocated > max_blocks)
+					allocated = max_blocks;
 				/*
 				 * We have blocks reserved already.  We
 				 * return allocated blocks so that delalloc
@@ -2879,8 +2881,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 				 * the buffer head will be unmapped so that
 				 * a read from the block returns 0s.
 				 */
-				if (allocated > max_blocks)
-					allocated = max_blocks;
 				set_buffer_unwritten(bh_result);
 				bh_result->b_bdev = inode->i_sb->s_bdev;
 				bh_result->b_blocknr = newblock;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d7ad0bb73cd5b..96f3366f59f6e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1852,7 +1852,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
  * @logical - first logical block to start assignment with
  *
  * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay
+ * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
  */
 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 				 struct buffer_head *exbh)
@@ -1902,16 +1902,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 			do {
 				if (cur_logical >= logical + blocks)
 					break;
-				if (buffer_delay(bh)) {
-					bh->b_blocknr = pblock;
-					clear_buffer_delay(bh);
-					bh->b_bdev = inode->i_sb->s_bdev;
-				} else if (buffer_unwritten(bh)) {
-					bh->b_blocknr = pblock;
-					clear_buffer_unwritten(bh);
-					set_buffer_mapped(bh);
-					set_buffer_new(bh);
-					bh->b_bdev = inode->i_sb->s_bdev;
+
+				if (buffer_delay(bh) ||
+						buffer_unwritten(bh)) {
+
+					BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
+
+					if (buffer_delay(bh)) {
+						clear_buffer_delay(bh);
+						bh->b_blocknr = pblock;
+					} else {
+						/*
+						 * unwritten already should have
+						 * blocknr assigned. Verify that
+						 */
+						clear_buffer_unwritten(bh);
+						BUG_ON(bh->b_blocknr != pblock);
+					}
+
 				} else if (buffer_mapped(bh))
 					BUG_ON(bh->b_blocknr != pblock);
 
@@ -2053,7 +2061,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * We consider only non-mapped and non-allocated blocks
 	 */
 	if ((mpd->b_state  & (1 << BH_Mapped)) &&
-	    !(mpd->b_state & (1 << BH_Delay)))
+		!(mpd->b_state & (1 << BH_Delay)) &&
+		!(mpd->b_state & (1 << BH_Unwritten)))
 		return 0;
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
@@ -2205,6 +2214,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
 	return;
 }
 
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+	/*
+	 * unmapped buffer is possible for holes.
+	 * delay buffer is possible with delayed allocation.
+	 * We also need to consider unwritten buffer as unmapped.
+	 */
+	return (!buffer_mapped(bh) || buffer_delay(bh) ||
+				buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
 /*
  * __mpage_da_writepage - finds extent of pages and blocks
  *
@@ -2289,8 +2309,7 @@ static int __mpage_da_writepage(struct page *page,
 			 * Otherwise we won't make progress
 			 * with the page in ext4_da_writepage
 			 */
-			if (buffer_dirty(bh) &&
-			    (!buffer_mapped(bh) || buffer_delay(bh))) {
+			if (ext4_bh_unmapped_or_delay(NULL, bh)) {
 				mpage_add_bh_to_extent(mpd, logical,
 						       bh->b_size,
 						       bh->b_state);
@@ -2318,6 +2337,14 @@ static int __mpage_da_writepage(struct page *page,
 /*
  * this is a special callback for ->write_begin() only
  * it's intention is to return mapped block or reserve space
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
+ *
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 				  struct buffer_head *bh_result, int create)
@@ -2353,28 +2380,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 		set_buffer_delay(bh_result);
 	} else if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
-		/*
-		 * With sub-block writes into unwritten extents
-		 * we also need to mark the buffer as new so that
-		 * the unwritten parts of the buffer gets correctly zeroed.
-		 */
-		if (buffer_unwritten(bh_result))
+		if (buffer_unwritten(bh_result)) {
+			/* A delayed write to unwritten bh should
+			 * be marked new and mapped.  Mapped ensures
+			 * that we don't do get_block multiple times
+			 * when we write to the same offset and new
+			 * ensures that we do proper zero out for
+			 * partial write.
+			 */
 			set_buffer_new(bh_result);
+			set_buffer_mapped(bh_result);
+		}
 		ret = 0;
 	}
 
 	return ret;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-	/*
-	 * unmapped buffer is possible for holes.
-	 * delay buffer is possible with delayed allocation
-	 */
-	return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
-}
-
 static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
@@ -2828,7 +2850,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
 	for (i = 0; i < idx; i++)
 		bh = bh->b_this_page;
 
-	if (!buffer_mapped(bh) || (buffer_delay(bh)))
+	if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
 		return 0;
 	return 1;
 }
-- 
GitLab


From c5ca7c7636fa689a9746b6032f83aa7fffec31c6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 27 Apr 2009 22:48:48 -0400
Subject: [PATCH 0590/2198] ext4: Fallback to vmalloc if kmalloc can't allocate
 s_flex_groups array

For very large filesystems, the s_flex_groups array can get quite big.
For example, a filesystem that can be resized up to 16TB will have
8192 flex groups (assuming the default flex_bg size of 16), so the
array is 96k, which is *very* marginal for kmalloc().  On the other
hand, a 160GB filesystem without the resize_inode feature will only
require 960 bytes.  So we try to allocate the array first using
kmalloc(), and if that fails, we'll try to use vmalloc() instead.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222a..3f4475daa66d6 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include <linux/jbd2.h>
 #include <linux/slab.h>
 #include <linux/init.h>
@@ -586,7 +587,10 @@ static void ext4_put_super(struct super_block *sb)
 	for (i = 0; i < sbi->s_gdb_count; i++)
 		brelse(sbi->s_group_desc[i]);
 	kfree(sbi->s_group_desc);
-	kfree(sbi->s_flex_groups);
+	if (is_vmalloc_addr(sbi->s_flex_groups))
+		vfree(sbi->s_flex_groups);
+	else
+		kfree(sbi->s_flex_groups);
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -1620,6 +1624,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
+	size_t size;
 	int i;
 
 	if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,8 +1639,13 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
 			((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
 			      EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-	sbi->s_flex_groups = kzalloc(flex_group_count *
-				     sizeof(struct flex_groups), GFP_KERNEL);
+	size = flex_group_count * sizeof(struct flex_groups);
+	sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+	if (sbi->s_flex_groups == NULL) {
+		sbi->s_flex_groups = vmalloc(size);
+		if (sbi->s_flex_groups)
+			memset(sbi->s_flex_groups, 0, size);
+	}
 	if (sbi->s_flex_groups == NULL) {
 		printk(KERN_ERR "EXT4-fs: not enough memory for "
 				"%u flex groups\n", flex_group_count);
@@ -2842,6 +2852,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		sbi->s_journal = NULL;
 	}
 failed_mount3:
+	if (sbi->s_flex_groups) {
+		if (is_vmalloc_addr(sbi->s_flex_groups))
+			vfree(sbi->s_flex_groups);
+		else
+			kfree(sbi->s_flex_groups);
+	}
 	percpu_counter_destroy(&sbi->s_freeblocks_counter);
 	percpu_counter_destroy(&sbi->s_freeinodes_counter);
 	percpu_counter_destroy(&sbi->s_dirs_counter);
-- 
GitLab


From f7c439504ccba0cca43271e651013ab97a221c62 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 24 Apr 2009 23:31:59 -0400
Subject: [PATCH 0591/2198] ext4: Use is_power_of_2() for clarity

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3f4475daa66d6..3e509bc647e3c 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1483,7 +1483,7 @@ static int parse_options(char *options, struct super_block *sb,
 				return 0;
 			if (option < 0 || option > (1 << 30))
 				return 0;
-			if (option & (option - 1)) {
+			if (!is_power_of_2(option)) {
 				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
 				       " must be a power of 2\n");
 				return 0;
@@ -2101,8 +2101,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 	if (parse_strtoul(buf, 0x40000000, &t))
 		return -EINVAL;
 
-	/* inode_readahead_blks must be a power of 2 */
-	if (t & (t-1))
+	if (!is_power_of_2(t))
 		return -EINVAL;
 
 	sbi->s_inode_readahead_blks = t;
-- 
GitLab


From e2d670523c6c4ccb0fca9f3ab1b8f066d9aa57d6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 00:33:44 -0400
Subject: [PATCH 0592/2198] ext4: Simplify ext4_commit_super()'s function
 signature

The ext4_commit_super() function took both a struct super_block * and
a struct ext4_super_block *, but the struct ext4_super_block can be
derived from the struct super_block.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3e509bc647e3c..ad4c9be4abdc5 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -54,8 +54,7 @@ static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
-			      struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
 					struct ext4_super_block *es);
 static void ext4_clear_journal_err(struct super_block *sb,
@@ -306,7 +305,7 @@ static void ext4_handle_error(struct super_block *sb)
 		printk(KERN_CRIT "Remounting filesystem read-only\n");
 		sb->s_flags |= MS_RDONLY;
 	}
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, ERRORS_PANIC))
 		panic("EXT4-fs (device %s): panic forced after error\n",
 			sb->s_id);
@@ -448,7 +447,7 @@ __acquires(bitlock)
 	if (test_opt(sb, ERRORS_CONT)) {
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super(sb, es, 0);
+		ext4_commit_super(sb, 0);
 		return;
 	}
 	ext4_unlock_group(sb, grp);
@@ -577,7 +576,7 @@ static void ext4_put_super(struct super_block *sb)
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 	if (sbi->s_proc) {
 		remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -1596,7 +1595,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	if (sbi->s_journal)
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
-	ext4_commit_super(sb, es, 1);
+	ext4_commit_super(sb, 1);
 	if (test_opt(sb, DEBUG))
 		printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
 				"bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -2655,7 +2654,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			if (test_opt(sb, ERRORS_PANIC)) {
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-				ext4_commit_super(sb, es, 1);
+				ext4_commit_super(sb, 1);
 				goto failed_mount4;
 			}
 		}
@@ -3132,15 +3131,15 @@ static int ext4_load_journal(struct super_block *sb,
 		sb->s_dirt = 1;
 
 		/* Make sure we flush the recovery flag to disk. */
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 
 	return 0;
 }
 
-static int ext4_commit_super(struct super_block *sb,
-			      struct ext4_super_block *es, int sync)
+static int ext4_commit_super(struct super_block *sb, int sync)
 {
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
 	struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
 	int error = 0;
 
@@ -3212,7 +3211,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	    sb->s_flags & MS_RDONLY) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		sb->s_dirt = 0;
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 	}
 	unlock_super(sb);
 
@@ -3253,7 +3252,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
 
 		EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 		es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 
 		jbd2_journal_clear_err(journal);
 	}
@@ -3293,7 +3292,7 @@ static void ext4_write_super(struct super_block *sb)
 			BUG();
 		sb->s_dirt = 0;
 	} else {
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		ext4_commit_super(sb, 1);
 	}
 }
 
@@ -3312,7 +3311,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 						     target);
 		}
 	} else {
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+		ext4_commit_super(sb, wait);
 	}
 	return ret;
 }
@@ -3345,7 +3344,7 @@ static int ext4_freeze(struct super_block *sb)
 
 		/* Journal blocked and flushed, clear needs_recovery flag. */
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		error = ext4_commit_super(sb, 1);
 		if (error)
 			goto out;
 	}
@@ -3365,7 +3364,7 @@ static int ext4_unfreeze(struct super_block *sb)
 		lock_super(sb);
 		/* Reser the needs_recovery flag before the fs is unlocked. */
 		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
+		ext4_commit_super(sb, 1);
 		unlock_super(sb);
 		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	}
@@ -3520,7 +3519,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		}
 	}
 	if (sbi->s_journal == NULL)
-		ext4_commit_super(sb, es, 1);
+		ext4_commit_super(sb, 1);
 
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
-- 
GitLab


From 7234ab2a55e77784b44cf2d862136d9e41b8d98a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 30 Apr 2009 21:24:04 -0400
Subject: [PATCH 0593/2198] ext4: Fix and simplify s_dirt handling

The s_dirt flag wasn't completely handled correctly, but it didn't
really matter when journalling was enabled.  It turns out that when
ext4 runs without a journal, we don't clear s_dirt in places where we
should have, with the result that the high-level write_super()
function was writing the superblock when it wasn't necessary.

So we fix this by making ext4_commit_super() clear the s_dirt flag,
and removing many of the other places where s_dirt is manipulated.
When journalling is enabled, the s_dirt flag might be left set more
often, but s_dirt really doesn't matter when journalling is enabled.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index ad4c9be4abdc5..7c7a08af12008 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3128,7 +3128,6 @@ static int ext4_load_journal(struct super_block *sb,
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
-		sb->s_dirt = 1;
 
 		/* Make sure we flush the recovery flag to disk. */
 		ext4_commit_super(sb, 1);
@@ -3168,7 +3167,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 					&EXT4_SB(sb)->s_freeblocks_counter));
 	es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
 					&EXT4_SB(sb)->s_freeinodes_counter));
-
+	sb->s_dirt = 0;
 	BUFFER_TRACE(sbh, "marking dirty");
 	mark_buffer_dirty(sbh);
 	if (sync) {
@@ -3210,7 +3209,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		sb->s_dirt = 0;
 		ext4_commit_super(sb, 1);
 	}
 	unlock_super(sb);
@@ -3271,10 +3269,8 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT4_SB(sb)->s_journal;
-	if (journal) {
-		sb->s_dirt = 0;
+	if (journal)
 		ret = ext4_journal_force_commit(journal);
-	}
 
 	return ret;
 }
@@ -3282,15 +3278,13 @@ int ext4_force_commit(struct super_block *sb)
 /*
  * Ext4 always journals updates to the superblock itself, so we don't
  * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
+ * point if the journalling is enabled.
  */
 static void ext4_write_super(struct super_block *sb)
 {
 	if (EXT4_SB(sb)->s_journal) {
 		if (mutex_trylock(&sb->s_lock) != 0)
 			BUG();
-		sb->s_dirt = 0;
 	} else {
 		ext4_commit_super(sb, 1);
 	}
@@ -3302,7 +3296,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-	sb->s_dirt = 0;
 	if (EXT4_SB(sb)->s_journal) {
 		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
 					      &target)) {
@@ -3324,7 +3317,6 @@ static int ext4_freeze(struct super_block *sb)
 {
 	int error = 0;
 	journal_t *journal;
-	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal = EXT4_SB(sb)->s_journal;
-- 
GitLab


From 9ca92389c5312a51e819c15c762f0abdc7f3129b Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 12:52:25 -0400
Subject: [PATCH 0594/2198] ext4: Use separate super_operations structure for
 no_journal filesystems

By using a separate super_operations structure for filesystems that
have and don't have journals, we can simply ext4_write_super() ---
which is only needed when no journal is present --- and ext4_freeze(),
ext4_unfreeze(), and ext4_sync_fs(), which are only needed when the
journal is present.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 108 +++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 51 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7c7a08af12008..68c3a44c4a970 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -995,7 +995,6 @@ static const struct super_operations ext4_sops = {
 	.dirty_inode	= ext4_dirty_inode,
 	.delete_inode	= ext4_delete_inode,
 	.put_super	= ext4_put_super,
-	.write_super	= ext4_write_super,
 	.sync_fs	= ext4_sync_fs,
 	.freeze_fs	= ext4_freeze,
 	.unfreeze_fs	= ext4_unfreeze,
@@ -1010,6 +1009,25 @@ static const struct super_operations ext4_sops = {
 	.bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
+static const struct super_operations ext4_nojournal_sops = {
+	.alloc_inode	= ext4_alloc_inode,
+	.destroy_inode	= ext4_destroy_inode,
+	.write_inode	= ext4_write_inode,
+	.dirty_inode	= ext4_dirty_inode,
+	.delete_inode	= ext4_delete_inode,
+	.write_super	= ext4_write_super,
+	.put_super	= ext4_put_super,
+	.statfs		= ext4_statfs,
+	.remount_fs	= ext4_remount,
+	.clear_inode	= ext4_clear_inode,
+	.show_options	= ext4_show_options,
+#ifdef CONFIG_QUOTA
+	.quota_read	= ext4_quota_read,
+	.quota_write	= ext4_quota_write,
+#endif
+	.bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
 static const struct export_operations ext4_export_ops = {
 	.fh_to_dentry = ext4_fh_to_dentry,
 	.fh_to_parent = ext4_fh_to_parent,
@@ -2615,7 +2633,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/*
 	 * set up enough so that it can read an inode
 	 */
-	sb->s_op = &ext4_sops;
+	if (!test_opt(sb, NOLOAD) &&
+	    EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+		sb->s_op = &ext4_sops;
+	else
+		sb->s_op = &ext4_nojournal_sops;
 	sb->s_export_op = &ext4_export_ops;
 	sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
@@ -3275,19 +3297,9 @@ int ext4_force_commit(struct super_block *sb)
 	return ret;
 }
 
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point if the journalling is enabled.
- */
 static void ext4_write_super(struct super_block *sb)
 {
-	if (EXT4_SB(sb)->s_journal) {
-		if (mutex_trylock(&sb->s_lock) != 0)
-			BUG();
-	} else {
-		ext4_commit_super(sb, 1);
-	}
+	ext4_commit_super(sb, 1);
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3296,15 +3308,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
 	tid_t target;
 
 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-	if (EXT4_SB(sb)->s_journal) {
-		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
-					      &target)) {
-			if (wait)
-				jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
-						     target);
-		}
-	} else {
-		ext4_commit_super(sb, wait);
+	if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+		if (wait)
+			jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
 	}
 	return ret;
 }
@@ -3318,32 +3324,31 @@ static int ext4_freeze(struct super_block *sb)
 	int error = 0;
 	journal_t *journal;
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		journal = EXT4_SB(sb)->s_journal;
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
 
-		if (journal) {
-			/* Now we set up the journal barrier. */
-			jbd2_journal_lock_updates(journal);
+	journal = EXT4_SB(sb)->s_journal;
 
-			/*
-			 * We don't want to clear needs_recovery flag when we
-			 * failed to flush the journal.
-			 */
-			error = jbd2_journal_flush(journal);
-			if (error < 0)
-				goto out;
-		}
+	/* Now we set up the journal barrier. */
+	jbd2_journal_lock_updates(journal);
 
-		/* Journal blocked and flushed, clear needs_recovery flag. */
-		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		error = ext4_commit_super(sb, 1);
-		if (error)
-			goto out;
+	/*
+	 * Don't clear the needs_recovery flag if we failed to flush
+	 * the journal.
+	 */
+	error = jbd2_journal_flush(journal);
+	if (error < 0) {
+	out:
+		jbd2_journal_unlock_updates(journal);
+		return error;
 	}
+
+	/* Journal blocked and flushed, clear needs_recovery flag. */
+	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	error = ext4_commit_super(sb, 1);
+	if (error)
+		goto out;
 	return 0;
-out:
-	jbd2_journal_unlock_updates(journal);
-	return error;
 }
 
 /*
@@ -3352,14 +3357,15 @@ static int ext4_freeze(struct super_block *sb)
  */
 static int ext4_unfreeze(struct super_block *sb)
 {
-	if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
-		lock_super(sb);
-		/* Reser the needs_recovery flag before the fs is unlocked. */
-		EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-		ext4_commit_super(sb, 1);
-		unlock_super(sb);
-		jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-	}
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
+	lock_super(sb);
+	/* Reset the needs_recovery flag before the fs is unlocked. */
+	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+	ext4_commit_super(sb, 1);
+	unlock_super(sb);
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }
 
-- 
GitLab


From 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 08:50:38 -0400
Subject: [PATCH 0595/2198] ext4: Avoid races caused by on-line resizing and
 SMP memory reordering

Ext4's on-line resizing adds a new block group and then, only at the
last step adjusts s_groups_count.  However, it's possible on SMP
systems that another CPU could see the updated the s_group_count and
not see the newly initialized data structures for the just-added block
group.  For this reason, it's important to insert a SMP read barrier
after reading s_groups_count and before reading any (for example) the
new block group descriptors allowed by the increased value of
s_groups_count.

Unfortunately, we rather blatently violate this locking protocol
documented in fs/ext4/resize.c.  Fortunately, (1) on-line resizes
happen relatively rarely, and (2) it seems rare that the filesystem
code will immediately try to use just-added block group before any
memory ordering issues resolve themselves.  So apparently problems
here are relatively hard to hit, since ext3 has been vulnerable to the
same issue for years with no one apparently complaining.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 15 +++++++--------
 fs/ext4/ext4.h    | 12 ++++++++++++
 fs/ext4/ialloc.c  | 40 +++++++++++++++++++---------------------
 fs/ext4/inode.c   |  7 ++++---
 fs/ext4/mballoc.c | 45 ++++++++++++++++++++++++---------------------
 fs/ext4/super.c   |  3 +--
 6 files changed, 67 insertions(+), 55 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 53c72ad858773..a5ba039850c5e 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -88,6 +88,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
 	int bit, bit_max;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned free_blocks, group_blocks;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -123,7 +124,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}
 
-	if (block_group == sbi->s_groups_count - 1) {
+	if (block_group == ngroups - 1) {
 		/*
 		 * Even though mke2fs always initialize first and last group
 		 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +132,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 		 */
 		group_blocks = ext4_blocks_count(sbi->s_es) -
 			le32_to_cpu(sbi->s_es->s_first_data_block) -
-			(EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+			(EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
 	} else {
 		group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
 	}
@@ -205,18 +206,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 {
 	unsigned int group_desc;
 	unsigned int offset;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-	if (block_group >= sbi->s_groups_count) {
+	if (block_group >= ngroups) {
 		ext4_error(sb, "ext4_get_group_desc",
 			   "block_group >= groups_count - "
 			   "block_group = %u, groups_count = %u",
-			   block_group, sbi->s_groups_count);
+			   block_group, ngroups);
 
 		return NULL;
 	}
-	smp_rmb();
 
 	group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
 	offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -665,7 +666,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	ext4_fsblk_t desc_count;
 	struct ext4_group_desc *gdp;
 	ext4_group_t i;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	ext4_fsblk_t bitmap_count;
@@ -677,7 +678,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	bitmap_count = 0;
 	gdp = NULL;
 
-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
@@ -700,7 +700,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 	return bitmap_count;
 #else
 	desc_count = 0;
-	smp_rmb();
 	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d0f15ef56de1b..02ec44bf38e60 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1228,6 +1228,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 	 return grp_info[indexv][indexh];
 }
 
+/*
+ * Reading s_groups_count requires using smp_rmb() afterwards.  See
+ * the locking protocol documented in the comments of ext4_group_add()
+ * in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+	ext4_group_t	ngroups = EXT4_SB(sb)->s_groups_count;
+
+	smp_rmb();
+	return ngroups;
+}
 
 static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
 					     ext4_group_t block_group)
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index f18e0a08a6b50..55ba419ca00bc 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -316,7 +316,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 static int find_group_dir(struct super_block *sb, struct inode *parent,
 				ext4_group_t *best_group)
 {
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	unsigned int freei, avefreei;
 	struct ext4_group_desc *desc, *best_desc = NULL;
 	ext4_group_t group;
@@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 	struct flex_groups *flex_group = sbi->s_flex_groups;
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
-	ext4_group_t ngroups = sbi->s_groups_count;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	int flex_size = ext4_flex_bg_size(sbi);
 	ext4_group_t best_flex = parent_fbg_group;
 	int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 	ext4_group_t n_fbg_groups;
 	ext4_group_t i;
 
-	n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+	n_fbg_groups = (ngroups + flex_size - 1) >>
 		sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
@@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_group_t ngroups = sbi->s_groups_count;
+	ext4_group_t real_ngroups = ext4_get_groups_count(sb);
 	int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
 	unsigned int freei, avefreei;
 	ext4_fsblk_t freeb, avefreeb;
 	unsigned int ndirs;
 	int max_dirs, min_inodes;
 	ext4_grpblk_t min_blocks;
-	ext4_group_t i, grp, g;
+	ext4_group_t i, grp, g, ngroups;
 	struct ext4_group_desc *desc;
 	struct orlov_stats stats;
 	int flex_size = ext4_flex_bg_size(sbi);
 
+	ngroups = real_ngroups;
 	if (flex_size > 1) {
-		ngroups = (ngroups + flex_size - 1) >>
+		ngroups = (real_ngroups + flex_size - 1) >>
 			sbi->s_log_groups_per_flex;
 		parent_group >>= sbi->s_log_groups_per_flex;
 	}
@@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 		 */
 		grp *= flex_size;
 		for (i = 0; i < flex_size; i++) {
-			if (grp+i >= sbi->s_groups_count)
+			if (grp+i >= real_ngroups)
 				break;
 			desc = ext4_get_group_desc(sb, grp+i, NULL);
 			if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 	}
 
 fallback:
-	ngroups = sbi->s_groups_count;
+	ngroups = real_ngroups;
 	avefreei = freei / ngroups;
 fallback_retry:
 	parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 			    ext4_group_t *group, int mode)
 {
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-	ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+	ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
 	struct ext4_group_desc *desc;
-	ext4_group_t i, last;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 
 	/*
@@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	struct super_block *sb;
 	struct buffer_head *inode_bitmap_bh = NULL;
 	struct buffer_head *group_desc_bh;
-	ext4_group_t group = 0;
+	ext4_group_t ngroups, group = 0;
 	unsigned long ino = 0;
 	struct inode *inode;
 	struct ext4_group_desc *gdp = NULL;
-	struct ext4_super_block *es;
 	struct ext4_inode_info *ei;
 	struct ext4_sb_info *sbi;
 	int ret2, err = 0;
@@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		return ERR_PTR(-EPERM);
 
 	sb = dir->i_sb;
+	ngroups = ext4_get_groups_count(sb);
 	trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
 		   dir->i_ino, mode);
 	inode = new_inode(sb);
 	if (!inode)
 		return ERR_PTR(-ENOMEM);
 	ei = EXT4_I(inode);
-
 	sbi = EXT4_SB(sb);
-	es = sbi->s_es;
 
 	if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
 		ret2 = find_group_flex(sb, dir, &group);
@@ -856,7 +854,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 	if (ret2 == -1)
 		goto out;
 
-	for (i = 0; i < sbi->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		err = -EIO;
 
 		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -917,7 +915,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		 * group descriptor metadata has not yet been updated.
 		 * So we just go onto the next blockgroup.
 		 */
-		if (++group == sbi->s_groups_count)
+		if (++group == ngroups)
 			group = 0;
 	}
 	err = -ENOSPC;
@@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
 	unsigned long desc_count;
 	struct ext4_group_desc *gdp;
-	ext4_group_t i;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
 	struct ext4_super_block *es;
 	unsigned long bitmap_count, x;
@@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 	desc_count = 0;
 	bitmap_count = 0;
 	gdp = NULL;
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 	return desc_count;
 #else
 	desc_count = 0;
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
@@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 unsigned long ext4_count_dirs(struct super_block * sb)
 {
 	unsigned long count = 0;
-	ext4_group_t i;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
 		if (!gdp)
 			continue;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 96f3366f59f6e..4e7f363e3030a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4965,7 +4965,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  */
 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-	int groups, gdpblocks;
+	ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+	int gdpblocks;
 	int idxblocks;
 	int ret = 0;
 
@@ -4992,8 +4993,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 		groups += nrblocks;
 
 	gdpblocks = groups;
-	if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
-		groups = EXT4_SB(inode->i_sb)->s_groups_count;
+	if (groups > ngroups)
+		groups = ngroups;
 	if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
 		gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index f871677a79849..c3af9e6b66683 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
 {
+	ext4_group_t ngroups;
 	int blocksize;
 	int blocks_per_page;
 	int groups_per_page;
@@ -757,6 +758,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
 	inode = page->mapping->host;
 	sb = inode->i_sb;
+	ngroups = ext4_get_groups_count(sb);
 	blocksize = 1 << inode->i_blkbits;
 	blocks_per_page = PAGE_CACHE_SIZE / blocksize;
 
@@ -780,7 +782,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 	for (i = 0; i < groups_per_page; i++) {
 		struct ext4_group_desc *desc;
 
-		if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+		if (first_group + i >= ngroups)
 			break;
 
 		err = -EIO;
@@ -852,7 +854,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 		struct ext4_group_info *grinfo;
 
 		group = (first_block + i) >> 1;
-		if (group >= EXT4_SB(sb)->s_groups_count)
+		if (group >= ngroups)
 			break;
 
 		/*
@@ -1788,6 +1790,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
 	int block, pnum;
 	int blocks_per_page;
 	int groups_per_page;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t first_group;
 	struct ext4_group_info *grp;
 
@@ -1807,7 +1810,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
 	/* read all groups the page covers into the cache */
 	for (i = 0; i < groups_per_page; i++) {
 
-		if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+		if ((first_group + i) >= ngroups)
 			break;
 		grp = ext4_get_group_info(sb, first_group + i);
 		/* take all groups write allocation
@@ -1945,8 +1948,7 @@ static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-	ext4_group_t group;
-	ext4_group_t i;
+	ext4_group_t ngroups, group, i;
 	int cr;
 	int err = 0;
 	int bsbits;
@@ -1957,6 +1959,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 
 	sb = ac->ac_sb;
 	sbi = EXT4_SB(sb);
+	ngroups = ext4_get_groups_count(sb);
 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
 
 	/* first, try the goal */
@@ -2017,11 +2020,11 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 		 */
 		group = ac->ac_g_ex.fe_group;
 
-		for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+		for (i = 0; i < ngroups; group++, i++) {
 			struct ext4_group_info *grp;
 			struct ext4_group_desc *desc;
 
-			if (group == EXT4_SB(sb)->s_groups_count)
+			if (group == ngroups)
 				group = 0;
 
 			/* quick check to skip empty groups */
@@ -2315,12 +2318,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 {
 	struct super_block *sb = seq->private;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t group;
 
-	if (*pos < 0 || *pos >= sbi->s_groups_count)
+	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
 		return NULL;
-
 	group = *pos + 1;
 	return (void *) ((unsigned long) group);
 }
@@ -2328,11 +2329,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct super_block *sb = seq->private;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t group;
 
 	++*pos;
-	if (*pos < 0 || *pos >= sbi->s_groups_count)
+	if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
 		return NULL;
 	group = *pos + 1;
 	return (void *) ((unsigned long) group);
@@ -2587,6 +2587,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
 
 static int ext4_mb_init_backend(struct super_block *sb)
 {
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
 	int metalen;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2599,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	struct ext4_group_desc *desc;
 
 	/* This is the number of blocks used by GDT */
-	num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+	num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
 				1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
 
 	/*
@@ -2644,7 +2645,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 	for (i = 0; i < num_meta_group_infos; i++) {
 		if ((i + 1) == num_meta_group_infos)
 			metalen = sizeof(*meta_group_info) *
-				(sbi->s_groups_count -
+				(ngroups -
 					(i << EXT4_DESC_PER_BLOCK_BITS(sb)));
 		meta_group_info = kmalloc(metalen, GFP_KERNEL);
 		if (meta_group_info == NULL) {
@@ -2655,7 +2656,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
 		sbi->s_group_info[i] = meta_group_info;
 	}
 
-	for (i = 0; i < sbi->s_groups_count; i++) {
+	for (i = 0; i < ngroups; i++) {
 		desc = ext4_get_group_desc(sb, i, NULL);
 		if (desc == NULL) {
 			printk(KERN_ERR
@@ -2781,13 +2782,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 
 int ext4_mb_release(struct super_block *sb)
 {
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
 	ext4_group_t i;
 	int num_meta_group_infos;
 	struct ext4_group_info *grinfo;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 
 	if (sbi->s_group_info) {
-		for (i = 0; i < sbi->s_groups_count; i++) {
+		for (i = 0; i < ngroups; i++) {
 			grinfo = ext4_get_group_info(sb, i);
 #ifdef DOUBLE_CHECK
 			kfree(grinfo->bb_bitmap);
@@ -2797,7 +2799,7 @@ int ext4_mb_release(struct super_block *sb)
 			ext4_unlock_group(sb, i);
 			kfree(grinfo);
 		}
-		num_meta_group_infos = (sbi->s_groups_count +
+		num_meta_group_infos = (ngroups +
 				EXT4_DESC_PER_BLOCK(sb) - 1) >>
 			EXT4_DESC_PER_BLOCK_BITS(sb);
 		for (i = 0; i < num_meta_group_infos; i++)
@@ -4121,7 +4123,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
 	struct super_block *sb = ac->ac_sb;
-	ext4_group_t i;
+	ext4_group_t ngroups, i;
 
 	printk(KERN_ERR "EXT4-fs: Can't allocate:"
 			" Allocation context details:\n");
@@ -4145,7 +4147,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 	printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
 		ac->ac_found);
 	printk(KERN_ERR "EXT4-fs: groups: \n");
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+	ngroups = ext4_get_groups_count(sb);
+	for (i = 0; i < ngroups; i++) {
 		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
 		struct ext4_prealloc_space *pa;
 		ext4_grpblk_t start;
@@ -4469,13 +4472,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 
 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 {
-	ext4_group_t i;
+	ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 	int ret;
 	int freed = 0;
 
 	trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
 		   sb->s_id, needed);
-	for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+	for (i = 0; i < ngroups && needed > 0; i++) {
 		ret = ext4_mb_discard_group_preallocations(sb, i, needed);
 		freed += ret;
 		needed -= ret;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 68c3a44c4a970..fcd7b24c6df37 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3557,9 +3557,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	if (test_opt(sb, MINIX_DF)) {
 		sbi->s_overhead_last = 0;
 	} else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-		ext4_group_t ngroups = sbi->s_groups_count, i;
+		ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 		ext4_fsblk_t overhead = 0;
-		smp_rmb();
 
 		/*
 		 * Compute the overhead (FS structures).  This is constant
-- 
GitLab


From 114e9fc90703bd6aac0229fb559e97caa6c49770 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Apr 2009 15:48:07 -0400
Subject: [PATCH 0596/2198] ext4: Remove outdated comment about lock_super()

ext4_fill_super() is no longer called by read_super(), and it is no
longer called with the superblock locked.  The
unlock_super()/lock_super() is no longer present, so this comment is
entirely superfluous.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index fcd7b24c6df37..e3b35f26d5fea 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2828,14 +2828,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount4;
 	};
 
-	/*
-	 * akpm: core read_super() calls in here with the superblock locked.
-	 * That deadlocks, because orphan cleanup needs to lock the superblock
-	 * in numerous places.  Here we just pop the lock - it's relatively
-	 * harmless, because we are now ready to accept write_super() requests,
-	 * and aviro says that's the only reason for hanging onto the
-	 * superblock lock.
-	 */
 	EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
-- 
GitLab


From a63c9eb2ce6f5028da90f282798232c4f398ceb8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 01:59:42 -0400
Subject: [PATCH 0597/2198] ext4: ext4_mark_recovery_complete() doesn't need to
 use lock_super

The function ext4_mark_recovery_complete() is called from two call
paths: either (a) while mounting the filesystem, in which case there's
no danger of any other CPU calling write_super() until the mount is
completed, and (b) while remounting the filesystem read-write, in
which case the fs core has already locked the superblock.  This also
allows us to take out a very vile unlock_super()/lock_super() pair in
ext4_remount().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e3b35f26d5fea..45d0ada9bfce9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3219,13 +3219,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
 	if (jbd2_journal_flush(journal) < 0)
 		goto out;
 
-	lock_super(sb);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		ext4_commit_super(sb, 1);
 	}
-	unlock_super(sb);
 
 out:
 	jbd2_journal_unlock_updates(journal);
@@ -3436,15 +3434,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			    (sbi->s_mount_state & EXT4_VALID_FS))
 				es->s_state = cpu_to_le16(sbi->s_mount_state);
 
-			/*
-			 * We have to unlock super so that we can wait for
-			 * transactions.
-			 */
-			if (sbi->s_journal) {
-				unlock_super(sb);
+			if (sbi->s_journal)
 				ext4_mark_recovery_complete(sb, es);
-				lock_super(sb);
-			}
 		} else {
 			int ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
-- 
GitLab


From 3b9d4ed26680771295d904a6b83e88e620780893 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Apr 2009 22:54:04 -0400
Subject: [PATCH 0598/2198] ext4: Replace lock/unlock_super() with an explicit
 lock for the orphan list

Use a separate lock to protect the orphan list, so we can stop
overloading the use of lock_super().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_sb.h |  1 +
 fs/ext4/namei.c   | 20 +++++++++++---------
 fs/ext4/super.c   |  1 +
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 57b71fefbccf7..4bda2f75d4269 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -71,6 +71,7 @@ struct ext4_sb_info {
 	struct inode *s_journal_inode;
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
+	struct mutex s_orphan_lock;
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 22098e1cd0859..8018e49a72875 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
 	if (!list_empty(&EXT4_I(inode)->i_orphan))
 		goto out_unlock;
 
@@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 
 	/* @@@ FIXME: Observation from aviro:
 	 * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
-	 * here (on lock_super()), so race with ext4_link() which might bump
+	 * here (on s_orphan_lock), so race with ext4_link() which might bump
 	 * ->i_nlink. For, say it, character device. Not a regular file,
 	 * not a directory, not a symlink and ->i_nlink > 0.
+	 *
+	 * tytso, 4/25/2009: I'm not sure how that could happen;
+	 * shouldn't the fs core protect us from these sort of
+	 * unlink()/link() races?
 	 */
 	J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
 		  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 	jbd_debug(4, "orphan inode %lu will point to %d\n",
 			inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
 	ext4_std_error(inode->i_sb, err);
 	return err;
 }
@@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 	if (!ext4_handle_valid(handle))
 		return 0;
 
-	lock_super(inode->i_sb);
-	if (list_empty(&ei->i_orphan)) {
-		unlock_super(inode->i_sb);
-		return 0;
-	}
+	mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+	if (list_empty(&ei->i_orphan))
+		goto out;
 
 	ino_next = NEXT_ORPHAN(inode);
 	prev = ei->i_orphan.prev;
@@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
 	ext4_std_error(inode->i_sb, err);
 out:
-	unlock_super(inode->i_sb);
+	mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
 	return err;
 
 out_brelse:
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 45d0ada9bfce9..7f43fde9554b9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2645,6 +2645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sb->dq_op = &ext4_quota_operations;
 #endif
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+	mutex_init(&sbi->s_orphan_lock);
 
 	sb->s_root = NULL;
 
-- 
GitLab


From 32ed5058ce90024efcd811254b4b1de0468099df Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 25 Apr 2009 22:53:39 -0400
Subject: [PATCH 0599/2198] ext4: Replace lock/unlock_super() with an explicit
 lock for resizing

Use a separate lock to protect s_groups_count and the other block
group descriptors which get changed via an on-line resize operation,
so we can stop overloading the use of lock_super().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4_sb.h |  1 +
 fs/ext4/resize.c  | 35 ++++++++++++++++++-----------------
 fs/ext4/super.c   |  1 +
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 4bda2f75d4269..2d36223d5f579 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -72,6 +72,7 @@ struct ext4_sb_info {
 	struct journal_s *s_journal;
 	struct list_head s_orphan;
 	struct mutex s_orphan_lock;
+	struct mutex s_resize_lock;
 	unsigned long s_commit_interval;
 	u32 s_max_batch_time;
 	u32 s_min_batch_time;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 546c7dd869e19..e8ded13b5cb13 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -193,7 +193,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		err = -EBUSY;
 		goto exit_journal;
@@ -302,7 +302,7 @@ static int setup_new_group_blocks(struct super_block *sb,
 	brelse(bh);
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 
@@ -643,11 +643,12 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode,
  * important part is that the new block and inode counts are in the backup
  * superblocks, and the location of the new group metadata in the GDT backups.
  *
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted.  We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time.  The resize which changed s_groups_count will backup again.
+ * We do not need take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
  */
 static void update_backups(struct super_block *sb,
 			   int blk_off, char *data, int size)
@@ -809,7 +810,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&sbi->s_resize_lock);
 	if (input->group != sbi->s_groups_count) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
@@ -840,7 +841,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         /*
          * OK, now we've set up the new group.  Time to make it active.
          *
-         * Current kernels don't lock all allocations via lock_super(),
+         * We do not lock all allocations via s_resize_lock
          * so we have to be safe wrt. concurrent accesses the group
          * data.  So we need to be careful to set all of the relevant
          * group descriptor data etc. *before* we enable the group.
@@ -900,12 +901,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	 *
 	 * The precise rules we use are:
 	 *
-	 * * Writers of s_groups_count *must* hold lock_super
+	 * * Writers of s_groups_count *must* hold s_resize_lock
 	 * AND
 	 * * Writers must perform a smp_wmb() after updating all dependent
 	 *   data and before modifying the groups count
 	 *
-	 * * Readers must hold lock_super() over the access
+	 * * Readers must hold s_resize_lock over the access
 	 * OR
 	 * * Readers must perform an smp_rmb() after reading the groups count
 	 *   and before reading any dependent data.
@@ -948,7 +949,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	sb->s_dirt = 1;
 
 exit_journal:
-	unlock_super(sb);
+	mutex_unlock(&sbi->s_resize_lock);
 	if ((err2 = ext4_journal_stop(handle)) && !err)
 		err = err2;
 	if (!err) {
@@ -986,7 +987,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
 	/* We don't need to worry about locking wrt other resizers just
 	 * yet: we're going to revalidate es->s_blocks_count after
-	 * taking lock_super() below. */
+	 * taking the s_resize_lock below. */
 	o_blocks_count = ext4_blocks_count(es);
 	o_groups_count = EXT4_SB(sb)->s_groups_count;
 
@@ -1056,11 +1057,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 		goto exit_put;
 	}
 
-	lock_super(sb);
+	mutex_lock(&EXT4_SB(sb)->s_resize_lock);
 	if (o_blocks_count != ext4_blocks_count(es)) {
 		ext4_warning(sb, __func__,
 			     "multiple resizers run on filesystem!");
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		err = -EBUSY;
 		goto exit_put;
@@ -1070,14 +1071,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 						 EXT4_SB(sb)->s_sbh))) {
 		ext4_warning(sb, __func__,
 			     "error %d on journal write access", err);
-		unlock_super(sb);
+		mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 		ext4_journal_stop(handle);
 		goto exit_put;
 	}
 	ext4_blocks_count_set(es, o_blocks_count + add);
 	ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
 	sb->s_dirt = 1;
-	unlock_super(sb);
+	mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	/* We add the blocks to the bitmap and set the group need init bit */
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7f43fde9554b9..1fbf0906ae2ef 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2646,6 +2646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #endif
 	INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
 	mutex_init(&sbi->s_orphan_lock);
+	mutex_init(&sbi->s_resize_lock);
 
 	sb->s_root = NULL;
 
-- 
GitLab


From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 24 Apr 2009 23:11:22 -0400
Subject: [PATCH 0600/2198] tracing/events: make modules have their own
 file_operations structure

For proper module reference counting, the file_operations that modules use
must have the "owner" field set to the module. Unfortunately, the trace events
use share file_operations. The same file_operations are used by all both
kernel core and all modules.

This patch makes the modules allocate their own file_operations and
copies the functions from the core kernel. This allows those file
operations to be owned by the module.

Care is taken to free this code on module unload.

Thanks to Greg KH for reminding me that file_operations must be owned
by the module to have reference counting take place.

[ Impact: fix modular tracepoints / potential crash ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 kernel/trace/trace_events.c | 95 ++++++++++++++++++++++++++++++++++---
 1 file changed, 88 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b92081588088d..be4d3a437c17a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 }
 
 static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+		 const struct file_operations *id,
+		 const struct file_operations *enable,
+		 const struct file_operations *filter,
+		 const struct file_operations *format)
 {
 	struct dentry *entry;
 	int ret;
@@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 
 	if (call->regfunc)
 		entry = trace_create_file("enable", 0644, call->dir, call,
-					  &ftrace_enable_fops);
+					  enable);
 
 	if (call->id)
 		entry = trace_create_file("id", 0444, call->dir, call,
-					  &ftrace_event_id_fops);
+					  id);
 
 	if (call->define_fields) {
 		ret = call->define_fields();
@@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 			return ret;
 		}
 		entry = trace_create_file("filter", 0644, call->dir, call,
-					  &ftrace_event_filter_fops);
+					  filter);
 	}
 
 	/* A trace may not want to export its format */
@@ -822,7 +826,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return 0;
 
 	entry = trace_create_file("format", 0444, call->dir, call,
-				  &ftrace_event_format_fops);
+				  format);
 
 	return 0;
 }
@@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	     event++)
 
 #ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+	struct list_head		list;
+	struct module			*mod;
+	struct file_operations		id;
+	struct file_operations		enable;
+	struct file_operations		format;
+	struct file_operations		filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+	struct ftrace_module_file_ops *file_ops;
+
+	/*
+	 * This is a bit of a PITA. To allow for correct reference
+	 * counting, modules must "own" their file_operations.
+	 * To do this, we allocate the file operations that will be
+	 * used in the event directory.
+	 */
+
+	file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+	if (!file_ops)
+		return NULL;
+
+	file_ops->mod = mod;
+
+	file_ops->id = ftrace_event_id_fops;
+	file_ops->id.owner = mod;
+
+	file_ops->enable = ftrace_enable_fops;
+	file_ops->enable.owner = mod;
+
+	file_ops->filter = ftrace_event_filter_fops;
+	file_ops->filter.owner = mod;
+
+	file_ops->format = ftrace_event_format_fops;
+	file_ops->format.owner = mod;
+
+	list_add(&file_ops->list, &ftrace_module_file_list);
+
+	return file_ops;
+}
+
 static void trace_module_add_events(struct module *mod)
 {
+	struct ftrace_module_file_ops *file_ops = NULL;
 	struct ftrace_event_call *call, *start, *end;
 	struct dentry *d_events;
 
@@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod)
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
+
+		/*
+		 * This module has events, create file ops for this module
+		 * if not already done.
+		 */
+		if (!file_ops) {
+			file_ops = trace_create_file_ops(mod);
+			if (!file_ops)
+				return;
+		}
 		call->mod = mod;
 		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events);
+		event_create_dir(call, d_events,
+				 &file_ops->id, &file_ops->enable,
+				 &file_ops->filter, &file_ops->format);
 	}
 }
 
 static void trace_module_remove_events(struct module *mod)
 {
+	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
 
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
@@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod)
 			list_del(&call->list);
 		}
 	}
+
+	/* Now free the file_operations */
+	list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+		if (file_ops->mod == mod)
+			break;
+	}
+	if (&file_ops->list != &ftrace_module_file_list) {
+		list_del(&file_ops->list);
+		kfree(file_ops);
+	}
 }
 
 static int trace_module_notify(struct notifier_block *self,
@@ -954,7 +1033,9 @@ static __init int event_trace_init(void)
 		if (!call->name)
 			continue;
 		list_add(&call->list, &ftrace_events);
-		event_create_dir(call, d_events);
+		event_create_dir(call, d_events, &ftrace_event_id_fops,
+				 &ftrace_enable_fops, &ftrace_event_filter_fops,
+				 &ftrace_event_format_fops);
 	}
 
 	ret = register_module_notifier(&trace_module_nb);
-- 
GitLab


From 0a3ec21fcd311b26ab0f249d62960e127bc20ca8 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Sun, 26 Apr 2009 23:07:42 +0200
Subject: [PATCH 0601/2198] x86: beautify vmlinux_64.lds.S

Beautify vmlinux_64.lds.S:

 - Use tabs for indent
 - Located curly braces like in C code
 - Rearranged a few comments

There is no functional changes in this patch

The beautification is done to prepare a unification
of the _32 and the _64 variants of the linker scripts.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20090426210742.GA3464@uranus.ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux_64.lds.S | 448 +++++++++++++++++--------------
 1 file changed, 243 insertions(+), 205 deletions(-)

diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index c8742507b030b..6d5a5b05eaa20 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -15,69 +15,79 @@ OUTPUT_ARCH(i386:x86-64)
 ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 PHDRS {
-	text PT_LOAD FLAGS(5);	/* R_E */
-	data PT_LOAD FLAGS(7);	/* RWE */
-	user PT_LOAD FLAGS(7);	/* RWE */
+	text PT_LOAD FLAGS(5);		/* R_E */
+	data PT_LOAD FLAGS(7);		/* RWE */
+	user PT_LOAD FLAGS(7);		/* RWE */
 	data.init PT_LOAD FLAGS(7);	/* RWE */
 #ifdef CONFIG_SMP
 	percpu PT_LOAD FLAGS(7);	/* RWE */
 #endif
 	data.init2 PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(0);	/* ___ */
+	note PT_NOTE FLAGS(0);		/* ___ */
 }
 SECTIONS
 {
-  . = __START_KERNEL;
-  phys_startup_64 = startup_64 - LOAD_OFFSET;
-  .text :  AT(ADDR(.text) - LOAD_OFFSET) {
-	_text = .;			/* Text and read-only data */
-	/* First the code that has to be first for bootstrapping */
-	*(.text.head)
-	_stext = .;
-	/* Then the rest */
-	TEXT_TEXT
-	SCHED_TEXT
-	LOCK_TEXT
-	KPROBES_TEXT
-	IRQENTRY_TEXT
-	*(.fixup)
-	*(.gnu.warning)
-	_etext = .;		/* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);		/* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-  	__start___ex_table = .;
-	 *(__ex_table)
-  	__stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  . = ALIGN(PAGE_SIZE);		/* Align data segment to page size boundary */
-				/* Data */
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {
-	DATA_DATA
-	CONSTRUCTORS
-	_edata = .;			/* End of data section */
-	} :data
+	. = __START_KERNEL;
+	phys_startup_64 = startup_64 - LOAD_OFFSET;
+
+	/* Text and read-only data */
+	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;
+		/* First the code that has to be first for bootstrapping */
+		*(.text.head)
+		_stext = .;
+		/* Then the rest */
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		/* End of text section */
+		_etext = .;
+	} :text = 0x9090
+
+	NOTES :text :note
+
+	/* Exception table */
+	. = ALIGN(16);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		 *(__ex_table)
+		__stop___ex_table = .;
+	} :text = 0x9090
 
+	RODATA
 
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+	/* Align data segment to page size boundary */
 	. = ALIGN(PAGE_SIZE);
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	*(.data.cacheline_aligned)
-  }
-  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-  	*(.data.read_mostly)
-  }
+	/* Data */
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		CONSTRUCTORS
+		/* End of data section */
+		_edata = .;
+	} :data
+
+
+	.data.cacheline_aligned :
+		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+		*(.data.cacheline_aligned)
+	}
+
+	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+		*(.data.read_mostly)
+	}
 
 #define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
 
 #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
 #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
@@ -85,37 +95,53 @@ SECTIONS
 #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
 #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
 
-  . = VSYSCALL_ADDR;
-  .vsyscall_0 :	 AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
-  __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+	. = VSYSCALL_ADDR;
+	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+		*(.vsyscall_0)
+	} :user
+
+	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+		*(.vsyscall_fn)
+	}
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+		*(.vsyscall_gtod_data)
+	}
 
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
-		{ *(.vsyscall_gtod_data) }
-  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
-		{ *(.vsyscall_clock) }
-  vsyscall_clock = VVIRT(.vsyscall_clock);
+	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+		*(.vsyscall_clock)
+	}
+	vsyscall_clock = VVIRT(.vsyscall_clock);
 
 
-  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
-		{ *(.vsyscall_1) }
-  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
-		{ *(.vsyscall_2) }
+	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+		*(.vsyscall_1)
+	}
+	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+		*(.vsyscall_2)
+	}
 
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
+	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+		*(.vgetcpu_mode)
+	}
+	vgetcpu_mode = VVIRT(.vgetcpu_mode);
 
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
-  jiffies = VVIRT(.jiffies);
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.jiffies : AT(VLOAD(.jiffies)) {
+		*(.jiffies)
+	}
+	jiffies = VVIRT(.jiffies);
 
-  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
-		{ *(.vsyscall_3) }
+	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+		*(.vsyscall_3)
+	}
 
-  . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
 
 #undef VSYSCALL_ADDR
 #undef VSYSCALL_PHYS_ADDR
@@ -125,156 +151,168 @@ SECTIONS
 #undef VVIRT_OFFSET
 #undef VVIRT
 
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-	. = ALIGN(THREAD_SIZE);	/* init_task */
-	*(.data.init_task)
-  }:data.init
+	/* init_task */
+	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		. = ALIGN(THREAD_SIZE);
+		*(.data.init_task)
+	} :data.init
 
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE);
-	*(.data.page_aligned)
-  }
+	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		*(.data.page_aligned)
+	}
 
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-	/* might get freed after init */
-	. = ALIGN(PAGE_SIZE);
-	__smp_alt_begin = .;
-	__smp_locks = .;
-	*(.smp_locks)
-	__smp_locks_end = .;
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		/* might get freed after init */
+		. = ALIGN(PAGE_SIZE);
+		__smp_alt_begin = .;
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+		__smp_alt_end = .;
+	}
+
+	/* Init code and data */
 	. = ALIGN(PAGE_SIZE);
-	__smp_alt_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);		/* Init code and data */
-  __init_begin = .;	/* paired with __init_end */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-	_sinittext = .;
-	INIT_TEXT
-	_einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-	__initdata_begin = .;
-	INIT_DATA
-	__initdata_end = .;
-   }
-
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-	. = ALIGN(16);
-	__setup_start = .;
-	*(.init.setup)
-	__setup_end = .;
-  }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-	__initcall_start = .;
-	INITCALLS
-	__initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-	__con_initcall_start = .;
-	*(.con_initcall.init)
-	__con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-	__x86_cpu_dev_start = .;
-	*(.x86_cpu_dev.init)
-	__x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-
-  . = ALIGN(8);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-	__parainstructions = .;
-       *(.parainstructions)
-	__parainstructions_end = .;
-  }
-
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+	__init_begin = .;	/* paired with __init_end */
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		__initdata_begin = .;
+		INIT_DATA
+		__initdata_end = .;
+	}
+
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		. = ALIGN(16);
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
+		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
+	}
+
+	SECURITY_INIT
+
 	. = ALIGN(8);
-	__alt_instructions = .;
-	*(.altinstructions)
-	__alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-	*(.altinstr_replacement)
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-	EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-	EXIT_DATA
-  }
+	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+		__parainstructions = .;
+		*(.parainstructions)
+		__parainstructions_end = .;
+	}
+
+	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		. = ALIGN(8);
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
+	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+		*(.altinstr_replacement)
+	}
+
+	/*
+	 * .exit.text is discard at runtime, not link time, to deal with
+	 *  references from .altinstructions and .eh_frame
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
 
 #ifdef CONFIG_BLK_DEV_INITRD
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-	__initramfs_start = .;
-	*(.init.ramfs)
-	__initramfs_end = .;
-  }
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
 
 #ifdef CONFIG_SMP
-  /*
-   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-   * output PHDR, so the next output section - __data_nosave - should
-   * start another section data.init2.  Also, pda should be at the head of
-   * percpu area.  Preallocate it and define the percpu offset symbol
-   * so that it can be accessed as a percpu variable.
-   */
-  . = ALIGN(PAGE_SIZE);
-  PERCPU_VADDR(0, :percpu)
+	/*
+	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - __data_nosave - should
+	 * start another section data.init2.  Also, pda should be at the head of
+	 * percpu area.  Preallocate it and define the percpu offset symbol
+	 * so that it can be accessed as a percpu variable.
+	 */
+	. = ALIGN(PAGE_SIZE);
+	PERCPU_VADDR(0, :percpu)
 #else
-  PERCPU(PAGE_SIZE)
+	PERCPU(PAGE_SIZE)
 #endif
 
-  . = ALIGN(PAGE_SIZE);
-  __init_end = .;
-
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE);
-	__nosave_begin = .;
-	*(.data.nosave)
-	. = ALIGN(PAGE_SIZE);
-	__nosave_end = .;
-  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 	. = ALIGN(PAGE_SIZE);
-	__bss_start = .;		/* BSS */
-	*(.bss.page_aligned)
-	*(.bss)
-	__bss_stop = .;
-  }
+	__init_end = .;
+
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	} :data.init2
+	/* use another section data.init2, see PERCPU_VADDR() above */
+
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__bss_start = .;		/* BSS */
+		*(.bss.page_aligned)
+		*(.bss)
+		__bss_stop = .;
+	}
 
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE);
-	__brk_base = . ;
- 	. += 64 * 1024 ;	/* 64k alignment slop space */
-	*(.brk_reservation)	/* areas brk users have reserved */
-	__brk_limit = . ;
-  }
-
-  _end = . ;
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	*(.exitcall.exit)
-	*(.eh_frame)
-	*(.discard)
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__brk_base = .;
+		. += 64 * 1024;		/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = .;
 	}
 
-  STABS_DEBUG
+	_end = . ;
 
-  DWARF_DEBUG
+	/* Sections to be discarded */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+		*(.eh_frame)
+		*(.discard)
+	}
+
+	STABS_DEBUG
+	DWARF_DEBUG
 }
 
- /*
-  * Per-cpu symbols which need to be offset from __per_cpu_load
-  * for the boot processor.
-  */
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
 #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
 INIT_PER_CPU(gdt_page);
 INIT_PER_CPU(irq_stack_union);
-- 
GitLab


From 51b26ada79b605ed709ddcedbb6012e8f8e0ebed Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 26 Apr 2009 10:12:47 -0700
Subject: [PATCH 0602/2198] x86: unify arch/x86/boot/compressed/vmlinux_*.lds

Look at the:

	diff -u arch/x86/boot/compressed/vmlinux_*.lds

output and realize that they're basially exactly the same except for
trivial naming differences, and the fact that the 64-bit version has a
"pgtable" thing.

So unify them.

There's some trivial cleanup there (make the output format a Kconfig thing
rather than doing #ifdef's for it, and unify both 32-bit and 64-bit BSS
end to "_ebss", where 32-bit used to use the traditional "_end"), but
other than that it's really very mindless and straigt conversion.

For example, I think we should aim to remove "startup_32" vs "startup_64",
and just call it "startup", and get rid of one more difference. I didn't
do that.

Also, notice the comment in the unified vmlinux.lds.S talks about
"head_64" and "startup_32" which is an odd and incorrect mix, but that was
actually what the old 64-bit only lds file had, so the confusion isn't
new, and now that mixing is arguably more accurate thanks to the
vmlinux.lds.S file being shared between the two cases ;)

[ Impact: cleanup, unification ]

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                              |  5 +++
 arch/x86/boot/compressed/Makefile             |  2 +-
 arch/x86/boot/compressed/head_32.S            |  8 ++--
 .../{vmlinux_64.lds => vmlinux.lds.S}         | 11 ++++-
 arch/x86/boot/compressed/vmlinux_32.lds       | 43 -------------------
 5 files changed, 20 insertions(+), 49 deletions(-)
 rename arch/x86/boot/compressed/{vmlinux_64.lds => vmlinux.lds.S} (78%)
 delete mode 100644 arch/x86/boot/compressed/vmlinux_32.lds

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index bc25b9f5e4cd2..039c3f04aac5b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -47,6 +47,11 @@ config X86
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 
+config OUTPUT_FORMAT
+	string
+	default "elf32-i386" if X86_32
+	default "elf64-x86-64" if X86_64
+
 config ARCH_DEFCONFIG
 	string
 	default "arch/x86/configs/i386_defconfig" if X86_32
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 65551c9f85718..0f4b5e2abd3fd 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,7 +19,7 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
 
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 3a8a866fb2e29..85bd3285706da 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -88,9 +88,9 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl %esi
-	leal _end(%ebp), %esi
-	leal _end(%ebx), %edi
-	movl $(_end - startup_32), %ecx
+	leal _ebss(%ebp), %esi
+	leal _ebss(%ebx), %edi
+	movl $(_ebss - startup_32), %ecx
 	std
 	rep
 	movsb
@@ -121,7 +121,7 @@ relocated:
  */
 	xorl %eax,%eax
 	leal _edata(%ebx),%edi
-	leal _end(%ebx), %ecx
+	leal _ebss(%ebx), %ecx
 	subl %edi,%ecx
 	cld
 	rep
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S
similarity index 78%
rename from arch/x86/boot/compressed/vmlinux_64.lds
rename to arch/x86/boot/compressed/vmlinux.lds.S
index bef1ac891bce9..ffcb19134bf71 100644
--- a/arch/x86/boot/compressed/vmlinux_64.lds
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,6 +1,13 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_64
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
 SECTIONS
 {
 	/* Be careful parts of head_64.S assume startup_32 is at
@@ -38,11 +45,13 @@ SECTIONS
 		*(.bss)
 		*(.bss.*)
 		*(COMMON)
+#ifdef CONFIG_X86_64
 		. = ALIGN(8);
 		_end_before_pgt = . ;
 		. = ALIGN(4096);
 		pgtable = . ;
 		. = . + 4096 * 6;
+#endif
 		_ebss = .;
 	}
 }
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644
index bb3c48379c40f..0000000000000
--- a/arch/x86/boot/compressed/vmlinux_32.lds
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
-	/* Be careful parts of head_32.S assume startup_32 is at
-	 * address 0.
-	 */
-	. = 0;
-	.text.head : {
-		_head = . ;
-		*(.text.head)
-		_ehead = . ;
-	}
-	.rodata.compressed : {
-		*(.rodata.compressed)
-	}
-	.text :	{
-		_text = .; 	/* Text */
-		*(.text)
-		*(.text.*)
-		_etext = . ;
-	}
-	.rodata : {
-		_rodata = . ;
-		*(.rodata)	 /* read-only data */
-		*(.rodata.*)
-		_erodata = . ;
-	}
-	.data :	{
-		_data = . ;
-		*(.data)
-		*(.data.*)
-		_edata = . ;
-	}
-	.bss : {
-		_bss = . ;
-		*(.bss)
-		*(.bss.*)
-		*(COMMON)
-		_end = . ;
-	}
-}
-- 
GitLab


From fc4967b8c6a1540ebce9ac48e44b8d44e7fac971 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 27 Apr 2009 14:06:26 +0900
Subject: [PATCH 0603/2198] sh: update defconfigs for PCI changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/configs/ap325rxa_defconfig        | 37 ++++++++++--
 arch/sh/configs/cayman_defconfig          | 71 ++++++++++-------------
 arch/sh/configs/dreamcast_defconfig       | 39 ++++++++++---
 arch/sh/configs/edosk7705_defconfig       | 29 +++++++--
 arch/sh/configs/edosk7760_defconfig       | 32 ++++++++--
 arch/sh/configs/espt_defconfig            | 39 +++++++++++--
 arch/sh/configs/hp6xx_defconfig           | 35 +++++++++--
 arch/sh/configs/landisk_defconfig         | 48 ++++++++++++---
 arch/sh/configs/lboxre2_defconfig         | 44 +++++++++++---
 arch/sh/configs/magicpanelr2_defconfig    | 30 ++++++++--
 arch/sh/configs/microdev_defconfig        | 36 ++++++++++--
 arch/sh/configs/migor_defconfig           | 36 +++++++++---
 arch/sh/configs/polaris_defconfig         | 30 ++++++++--
 arch/sh/configs/r7780mp_defconfig         | 36 +++++++++---
 arch/sh/configs/r7785rp_defconfig         | 36 +++++++++---
 arch/sh/configs/rsk7201_defconfig         | 37 ++++++++++--
 arch/sh/configs/rsk7203_defconfig         | 40 ++++++++++---
 arch/sh/configs/rts7751r2d1_defconfig     | 43 +++++++++++---
 arch/sh/configs/rts7751r2dplus_defconfig  | 43 +++++++++++---
 arch/sh/configs/sdk7780_defconfig         | 38 +++++++++---
 arch/sh/configs/se7206_defconfig          | 35 +++++++++--
 arch/sh/configs/se7343_defconfig          | 42 ++++++++++++--
 arch/sh/configs/se7619_defconfig          | 35 +++++++++--
 arch/sh/configs/se7705_defconfig          | 34 +++++++++--
 arch/sh/configs/se7712_defconfig          | 30 ++++++++--
 arch/sh/configs/se7721_defconfig          | 33 +++++++++--
 arch/sh/configs/se7722_defconfig          | 38 +++++++++---
 arch/sh/configs/se7750_defconfig          | 34 +++++++++--
 arch/sh/configs/se7751_defconfig          | 34 +++++++++--
 arch/sh/configs/se7780_defconfig          | 38 +++++++++---
 arch/sh/configs/sh03_defconfig            | 43 +++++++++++---
 arch/sh/configs/sh7710voipgw_defconfig    | 35 +++++++++--
 arch/sh/configs/sh7724_generic_defconfig  |  4 +-
 arch/sh/configs/sh7763rdp_defconfig       | 35 +++++++++--
 arch/sh/configs/sh7785lcr_32bit_defconfig | 37 +++++++++---
 arch/sh/configs/sh7785lcr_defconfig       |  6 +-
 arch/sh/configs/shmin_defconfig           | 31 ++++++++--
 arch/sh/configs/shx3_defconfig            | 32 ++++++++--
 arch/sh/configs/snapgear_defconfig        | 40 ++++++++++---
 arch/sh/configs/systemh_defconfig         | 39 +++++++++++--
 arch/sh/configs/titan_defconfig           | 40 ++++++++++---
 arch/sh/configs/ul2_defconfig             | 37 ++++++++++--
 arch/sh/configs/urquell_defconfig         | 39 +++++++++++--
 43 files changed, 1247 insertions(+), 303 deletions(-)

diff --git a/arch/sh/configs/ap325rxa_defconfig b/arch/sh/configs/ap325rxa_defconfig
index c8d982a8a2e64..022f70e0ea03e 100644
--- a/arch/sh/configs/ap325rxa_defconfig
+++ b/arch/sh/configs/ap325rxa_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:46:53 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:42:06 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -74,6 +75,7 @@ CONFIG_EMBEDDED=y
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,12 +94,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +115,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +163,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 CONFIG_CPU_SUBTYPE_SH7723=y
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -168,8 +173,6 @@ CONFIG_CPU_SUBTYPE_SH7723=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -573,6 +576,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -695,6 +699,7 @@ CONFIG_DEVKMEM=y
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=6
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -1030,6 +1035,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -1051,6 +1057,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1100,6 +1111,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1191,10 +1203,24 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1303,6 +1329,7 @@ CONFIG_CRYPTO_CBC=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/cayman_defconfig b/arch/sh/configs/cayman_defconfig
index fa5fc1e1e980b..40301f86a45c1 100644
--- a/arch/sh/configs/cayman_defconfig
+++ b/arch/sh/configs/cayman_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:49:14 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:42:53 2009
 #
 CONFIG_SUPERH=y
 # CONFIG_SUPERH32 is not set
@@ -40,6 +40,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SWAP=y
 # CONFIG_SYSVIPC is not set
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -70,6 +71,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -89,10 +91,13 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +110,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -127,39 +131,6 @@ CONFIG_DEFAULT_IOSCHED="cfq"
 # System type
 #
 CONFIG_CPU_SH5=y
-# CONFIG_CPU_SUBTYPE_SH7619 is not set
-# CONFIG_CPU_SUBTYPE_SH7201 is not set
-# CONFIG_CPU_SUBTYPE_SH7203 is not set
-# CONFIG_CPU_SUBTYPE_SH7206 is not set
-# CONFIG_CPU_SUBTYPE_SH7263 is not set
-# CONFIG_CPU_SUBTYPE_MXG is not set
-# CONFIG_CPU_SUBTYPE_SH7705 is not set
-# CONFIG_CPU_SUBTYPE_SH7706 is not set
-# CONFIG_CPU_SUBTYPE_SH7707 is not set
-# CONFIG_CPU_SUBTYPE_SH7708 is not set
-# CONFIG_CPU_SUBTYPE_SH7709 is not set
-# CONFIG_CPU_SUBTYPE_SH7710 is not set
-# CONFIG_CPU_SUBTYPE_SH7712 is not set
-# CONFIG_CPU_SUBTYPE_SH7720 is not set
-# CONFIG_CPU_SUBTYPE_SH7721 is not set
-# CONFIG_CPU_SUBTYPE_SH7750 is not set
-# CONFIG_CPU_SUBTYPE_SH7091 is not set
-# CONFIG_CPU_SUBTYPE_SH7750R is not set
-# CONFIG_CPU_SUBTYPE_SH7750S is not set
-# CONFIG_CPU_SUBTYPE_SH7751 is not set
-# CONFIG_CPU_SUBTYPE_SH7751R is not set
-# CONFIG_CPU_SUBTYPE_SH7760 is not set
-# CONFIG_CPU_SUBTYPE_SH4_202 is not set
-# CONFIG_CPU_SUBTYPE_SH7723 is not set
-# CONFIG_CPU_SUBTYPE_SH7763 is not set
-# CONFIG_CPU_SUBTYPE_SH7770 is not set
-# CONFIG_CPU_SUBTYPE_SH7780 is not set
-# CONFIG_CPU_SUBTYPE_SH7785 is not set
-# CONFIG_CPU_SUBTYPE_SH7786 is not set
-# CONFIG_CPU_SUBTYPE_SHX3 is not set
-# CONFIG_CPU_SUBTYPE_SH7343 is not set
-# CONFIG_CPU_SUBTYPE_SH7722 is not set
-# CONFIG_CPU_SUBTYPE_SH7366 is not set
 CONFIG_CPU_SUBTYPE_SH5_101=y
 # CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
@@ -279,8 +250,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -492,6 +461,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -568,6 +538,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -591,6 +562,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -779,6 +751,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_F71805F is not set
 # CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_IT87 is not set
@@ -1042,7 +1015,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1082,6 +1054,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1104,6 +1077,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1146,8 +1124,13 @@ CONFIG_MINIX_FS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1208,6 +1191,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 CONFIG_SCHEDSTATS=y
 # CONFIG_TIMER_STATS is not set
@@ -1241,15 +1227,21 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_LATENCYTOP is not set
 # CONFIG_SYSCTL_SYSCALL_CHECK is not set
 # CONFIG_PAGE_POISONING is not set
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 # CONFIG_EARLY_SCIF_CONSOLE is not set
 # CONFIG_DEBUG_BOOTMEM is not set
@@ -1354,6 +1346,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig
index 5c1123640142b..1f3cc98330bfd 100644
--- a/arch/sh/configs/dreamcast_defconfig
+++ b/arch/sh/configs/dreamcast_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:51:48 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:44:27 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -71,6 +72,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -90,6 +92,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -98,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +115,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +160,7 @@ CONFIG_CPU_SUBTYPE_SH7091=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +170,6 @@ CONFIG_CPU_SUBTYPE_SH7091=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -271,7 +274,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y
 CONFIG_SH_DMA_API=y
 CONFIG_SH_DMA=y
 CONFIG_SH_DMA_IRQ_MULTI=y
-CONFIG_NR_ONCHIP_DMA_CHANNELS=6
+CONFIG_NR_ONCHIP_DMA_CHANNELS=4
 CONFIG_NR_DMA_CHANNELS_BOOL=y
 CONFIG_NR_DMA_CHANNELS=9
 # CONFIG_PVR2_DMA is not set
@@ -320,7 +323,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 panic=3"
 CONFIG_MAPLE=y
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -602,6 +604,7 @@ CONFIG_INPUT_MOUSE=y
 # CONFIG_MOUSE_APPLETOUCH is not set
 # CONFIG_MOUSE_BCM5974 is not set
 # CONFIG_MOUSE_VSXXXAA is not set
+# CONFIG_MOUSE_MAPLE is not set
 # CONFIG_INPUT_JOYSTICK is not set
 # CONFIG_INPUT_TABLET is not set
 # CONFIG_INPUT_TOUCHSCREEN is not set
@@ -812,7 +815,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -866,6 +868,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -910,6 +917,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -947,10 +955,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1051,6 +1073,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig
index f4c34b039312f..d7092457ddc70 100644
--- a/arch/sh/configs/edosk7705_defconfig
+++ b/arch/sh/configs/edosk7705_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:54:02 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:45:04 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -56,6 +57,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_UID16 is not set
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 # CONFIG_PRINTK is not set
 # CONFIG_BUG is not set
@@ -74,12 +76,15 @@ CONFIG_SHMEM=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_BASE_SMALL=1
 # CONFIG_MODULES is not set
@@ -114,6 +119,7 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -123,8 +129,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -381,6 +385,10 @@ CONFIG_SSB_POSSIBLE=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+
 #
 # Pseudo filesystems
 #
@@ -409,10 +417,22 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -426,6 +446,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig
index 7825c2699f18f..a822b1d8c1160 100644
--- a/arch/sh/configs/edosk7760_defconfig
+++ b/arch/sh/configs/edosk7760_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:54:57 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:45:25 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -67,7 +69,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -77,6 +78,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -96,6 +98,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -103,6 +106,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -115,7 +120,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -161,6 +165,7 @@ CONFIG_CPU_SH4=y
 CONFIG_CPU_SUBTYPE_SH7760=y
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -170,8 +175,6 @@ CONFIG_CPU_SUBTYPE_SH7760=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -815,6 +818,7 @@ CONFIG_EXT2_FS_XATTR=y
 # CONFIG_EXT2_FS_SECURITY is not set
 CONFIG_EXT2_FS_XIP=y
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -838,6 +842,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -883,6 +892,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -964,6 +974,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 CONFIG_TIMER_STATS=y
@@ -1001,6 +1014,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1010,9 +1024,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1126,6 +1145,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig
index ebb4c37abaa6e..c5b50077913dd 100644
--- a/arch/sh/configs/espt_defconfig
+++ b/arch/sh/configs/espt_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 17:58:18 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:46:26 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -78,6 +79,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -164,6 +167,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -173,8 +177,6 @@ CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -548,6 +550,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -919,6 +922,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -942,6 +946,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -985,8 +994,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1075,11 +1089,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1179,6 +1207,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig
index 82b113af08d32..8e13027eecc39 100644
--- a/arch/sh/configs/hp6xx_defconfig
+++ b/arch/sh/configs/hp6xx_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:01:05 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:47:15 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -67,6 +68,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -85,12 +87,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -98,7 +103,6 @@ CONFIG_BASE_SMALL=0
 # CONFIG_MODULES is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -144,6 +148,7 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -153,8 +158,6 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -385,6 +388,7 @@ CONFIG_BLK_DEV_SD=y
 # CONFIG_SCSI_SRP_ATTRS is not set
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
@@ -431,6 +435,7 @@ CONFIG_KEYBOARD_HP6XX=y
 # CONFIG_INPUT_JOYSTICK is not set
 # CONFIG_INPUT_TABLET is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -673,6 +678,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -719,6 +729,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -786,10 +797,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -898,6 +922,7 @@ CONFIG_CRYPTO_MD5=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig
index b6fa4a7599d01..7f549aef0dfd1 100644
--- a/arch/sh/configs/landisk_defconfig
+++ b/arch/sh/configs/landisk_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:02:54 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:47:48 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,6 +70,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -107,7 +112,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -153,6 +157,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -162,8 +167,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -292,8 +295,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -602,6 +603,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -706,6 +708,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -729,6 +732,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -922,6 +926,7 @@ CONFIG_VIDEO_HELPER_CHIPS_AUTO=y
 # CONFIG_SOC_CAMERA is not set
 CONFIG_V4L_USB_DRIVERS=y
 # CONFIG_USB_VIDEO_CLASS is not set
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
 # CONFIG_USB_GSPCA is not set
 # CONFIG_VIDEO_HDPVR is not set
 CONFIG_VIDEO_USBVIDEO=m
@@ -994,15 +999,17 @@ CONFIG_USB_HID=m
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=m
 CONFIG_HID_APPLE=m
 CONFIG_HID_BELKIN=m
 CONFIG_HID_CHERRY=m
 CONFIG_HID_CHICONY=m
 CONFIG_HID_CYPRESS=m
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=m
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=m
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=m
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1197,6 +1204,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1221,6 +1229,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1270,10 +1283,15 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 CONFIG_UFS_FS=m
 # CONFIG_UFS_FS_WRITE is not set
 # CONFIG_UFS_DEBUG is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=m
 CONFIG_NFS_V3=y
@@ -1364,10 +1382,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -1469,6 +1500,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig
index 92c515c4199f0..a7db539f2800d 100644
--- a/arch/sh/configs/lboxre2_defconfig
+++ b/arch/sh/configs/lboxre2_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:06:51 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:48:54 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,6 +70,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -107,7 +112,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -153,6 +157,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -162,8 +167,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -293,8 +296,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -542,6 +543,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -702,6 +704,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -725,6 +728,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -931,7 +935,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1007,6 +1010,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1028,6 +1032,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1073,8 +1082,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1151,10 +1165,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -1256,6 +1283,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig
index 26586c2d64cae..58bec61506fa3 100644
--- a/arch/sh/configs/magicpanelr2_defconfig
+++ b/arch/sh/configs/magicpanelr2_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:07:39 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:49:32 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -39,6 +40,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 # CONFIG_TASKSTATS is not set
@@ -66,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +77,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -94,6 +96,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -101,6 +104,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -113,7 +118,6 @@ CONFIG_MODVERSIONS=y
 CONFIG_MODULE_SRCVERSION_ALL=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +163,7 @@ CONFIG_CPU_SUBTYPE_SH7720=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -168,8 +173,6 @@ CONFIG_CPU_SUBTYPE_SH7720=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -813,6 +816,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
 # CONFIG_EXT4_FS is not set
 CONFIG_JBD=y
@@ -830,6 +834,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -884,6 +893,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -965,6 +975,7 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1000,6 +1011,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1008,9 +1020,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1035,6 +1052,7 @@ CONFIG_DUMP_CODE=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig
index 75178355d69ae..2886fc84bc1cd 100644
--- a/arch/sh/configs/microdev_defconfig
+++ b/arch/sh/configs/microdev_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:11:13 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:50:51 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -74,6 +74,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,12 +93,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +109,6 @@ CONFIG_BASE_SMALL=0
 # CONFIG_MODULES is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -151,6 +154,7 @@ CONFIG_CPU_SH4=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 CONFIG_CPU_SUBTYPE_SH4_202=y
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -160,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH4_202=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -647,6 +649,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -668,6 +671,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -715,6 +723,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -802,10 +811,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -914,6 +937,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig
index a8720f9c60472..8ecceb4bf27ec 100644
--- a/arch/sh/configs/migor_defconfig
+++ b/arch/sh/configs/migor_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:14:03 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:51:34 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -67,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +76,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -104,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -115,7 +118,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -165,6 +167,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -174,8 +177,6 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 CONFIG_CPU_SUBTYPE_SH7722=y
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -577,6 +578,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -873,7 +875,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -1011,6 +1012,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1056,6 +1062,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1105,11 +1112,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1218,6 +1239,7 @@ CONFIG_CRYPTO_WORKQUEUE=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig
index df2d177d5346e..2b95072861827 100644
--- a/arch/sh/configs/polaris_defconfig
+++ b/arch/sh/configs/polaris_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:16:48 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:52:19 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 # CONFIG_TASKSTATS is not set
@@ -76,6 +78,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -94,6 +97,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -101,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -113,7 +119,6 @@ CONFIG_MODVERSIONS=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +164,7 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -168,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7709=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -777,6 +781,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -831,6 +840,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -874,6 +884,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -914,6 +927,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -923,9 +937,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -950,6 +969,7 @@ CONFIG_DUMP_CODE=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index 7def4df46ddb3..68e5eff5d7fe6 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:20:17 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:53:28 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -78,6 +79,7 @@ CONFIG_UID16=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -107,6 +109,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=0
@@ -118,7 +122,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -165,6 +168,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 CONFIG_CPU_SUBTYPE_SH7780=y
@@ -174,8 +178,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -313,8 +315,6 @@ CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -536,6 +536,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -696,6 +697,7 @@ CONFIG_E1000=m
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -719,6 +721,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -920,6 +923,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_F71805F is not set
 # CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_IT87 is not set
@@ -1027,7 +1031,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1120,6 +1123,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1142,6 +1146,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 CONFIG_FUSE_FS=m
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1191,6 +1200,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1279,6 +1289,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1316,6 +1329,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1325,11 +1339,16 @@ CONFIG_TRACING=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1449,6 +1468,7 @@ CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index cb134ffc21185..890fe3cc91422 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:24:35 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:55:10 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -43,6 +44,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -78,6 +80,7 @@ CONFIG_UID16=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -108,6 +111,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -120,7 +125,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -168,6 +172,7 @@ CONFIG_CPU_SHX2=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -177,8 +182,6 @@ CONFIG_CPU_SUBTYPE_SH7785=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -337,8 +340,6 @@ CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 # CONFIG_PCI_LEGACY is not set
@@ -561,6 +562,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -698,6 +700,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -721,6 +724,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -948,6 +952,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_F71805F is not set
 # CONFIG_SENSORS_F71882FG is not set
 # CONFIG_SENSORS_F75375S is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_IT87 is not set
@@ -970,6 +975,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
 # CONFIG_SENSORS_PCF8591 is not set
+# CONFIG_SENSORS_SHT15 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1109,7 +1115,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1202,6 +1207,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1224,6 +1230,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 CONFIG_FUSE_FS=m
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1273,6 +1284,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1359,6 +1371,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1401,6 +1414,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1410,11 +1424,16 @@ CONFIG_TRACING=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1534,6 +1553,7 @@ CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig
index a037c744b7980..fa4395768d192 100644
--- a/arch/sh/configs/rsk7201_defconfig
+++ b/arch/sh/configs/rsk7201_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:29:08 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:56:29 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -74,6 +74,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -100,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -110,7 +113,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -157,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7201=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -166,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7201=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -602,6 +603,11 @@ CONFIG_EXT2_FS=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -651,8 +657,13 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -686,11 +697,24 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -705,6 +729,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig
index 9ae28e88426cf..e3a65f819f0ac 100644
--- a/arch/sh/configs/rsk7203_defconfig
+++ b/arch/sh/configs/rsk7203_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:30:34 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:57:06 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -39,6 +40,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -70,7 +72,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -80,6 +81,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -116,7 +120,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -163,6 +166,7 @@ CONFIG_CPU_SUBTYPE_SH7203=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -172,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7203=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -750,15 +752,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -874,6 +878,7 @@ CONFIG_LEDS_CLASS=y
 # LED drivers
 #
 CONFIG_LEDS_GPIO=y
+CONFIG_LEDS_GPIO_PLATFORM=y
 
 #
 # LED Triggers
@@ -882,6 +887,7 @@ CONFIG_LEDS_TRIGGERS=y
 CONFIG_LEDS_TRIGGER_TIMER=y
 CONFIG_LEDS_TRIGGER_HEARTBEAT=y
 CONFIG_LEDS_TRIGGER_BACKLIGHT=y
+# CONFIG_LEDS_TRIGGER_GPIO is not set
 CONFIG_LEDS_TRIGGER_DEFAULT_ON=y
 
 #
@@ -950,6 +956,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -989,8 +1000,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1033,6 +1049,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1076,6 +1095,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1083,11 +1103,16 @@ CONFIG_TRACING=y
 # CONFIG_FUNCTION_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1111,6 +1136,7 @@ CONFIG_DUMP_CODE=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig
index c0f741af6da81..a4a59f6205abd 100644
--- a/arch/sh/configs/rts7751r2d1_defconfig
+++ b/arch/sh/configs/rts7751r2d1_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:33:25 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:58:13 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -70,6 +71,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +114,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -302,8 +304,6 @@ CONFIG_CMDLINE="console=tty0 console=ttySC0,115200 root=/dev/sda1 earlyprintk=se
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -513,6 +513,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -695,6 +697,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -793,6 +796,7 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=1
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -1079,15 +1083,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1290,6 +1296,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1337,6 +1348,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1417,11 +1429,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1524,6 +1550,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index 8feef629e49c0..2bc1c97d346a7 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:34:12 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:59:01 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -70,6 +71,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +114,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -302,8 +304,6 @@ CONFIG_CMDLINE="console=tty0 console=ttySC0,115200 root=/dev/sda1 earlyprintk=se
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -513,6 +513,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -695,6 +697,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -793,6 +796,7 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=1
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -1079,15 +1083,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1290,6 +1296,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1337,6 +1348,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1417,11 +1429,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1524,6 +1550,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig
index 739e2299ae80d..a629b79a18440 100644
--- a/arch/sh/configs/sdk7780_defconfig
+++ b/arch/sh/configs/sdk7780_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:34:43 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 12:59:32 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -41,6 +42,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -73,6 +75,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -93,6 +96,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -100,6 +104,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -112,7 +118,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 CONFIG_LBD=y
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -159,6 +164,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 CONFIG_CPU_SUBTYPE_SH7780=y
@@ -168,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -310,8 +314,6 @@ CONFIG_CMDLINE="mem=128M console=tty0 console=ttySC0,115200 ip=bootp root=/dev/n
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 # CONFIG_PCI_LEGACY is not set
@@ -646,6 +648,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -1091,15 +1094,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1257,6 +1262,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 # CONFIG_EXT2_FS_SECURITY is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1280,6 +1286,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1331,6 +1342,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1418,6 +1430,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 # CONFIG_SCHED_DEBUG is not set
 # CONFIG_SCHEDSTATS is not set
 CONFIG_TIMER_STATS=y
@@ -1455,6 +1470,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1464,9 +1480,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1580,6 +1601,7 @@ CONFIG_CRYPTO_DES=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig
index d30e0a7ad9f17..5caf85a3312da 100644
--- a/arch/sh/configs/se7206_defconfig
+++ b/arch/sh/configs/se7206_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:39:37 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:01:02 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -63,6 +65,7 @@ CONFIG_CGROUP_DEBUG=y
 CONFIG_CGROUP_NS=y
 # CONFIG_CGROUP_FREEZER is not set
 CONFIG_CGROUP_DEVICE=y
+# CONFIG_CPUSETS is not set
 CONFIG_CGROUP_CPUACCT=y
 CONFIG_RESOURCE_COUNTERS=y
 CONFIG_CGROUP_MEM_RES_CTLR=y
@@ -80,7 +83,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -90,6 +92,7 @@ CONFIG_EMBEDDED=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -116,6 +119,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -127,7 +132,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -174,6 +178,7 @@ CONFIG_CPU_SUBTYPE_SH7206=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -183,8 +188,6 @@ CONFIG_CPU_SUBTYPE_SH7206=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -745,6 +748,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -785,8 +793,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -831,6 +844,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -869,6 +885,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -876,11 +893,16 @@ CONFIG_TRACING=y
 # CONFIG_FUNCTION_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -991,6 +1013,7 @@ CONFIG_CRYPTO_LZO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig
index fbb72d029e685..004d531716dca 100644
--- a/arch/sh/configs/se7343_defconfig
+++ b/arch/sh/configs/se7343_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:42:00 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:01:44 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -73,6 +75,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -91,6 +94,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -98,6 +102,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=0
@@ -109,7 +115,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -158,6 +163,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -167,8 +173,6 @@ CONFIG_ARCH_SHMOBILE=y
 CONFIG_CPU_SUBTYPE_SH7343=y
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -769,6 +773,7 @@ CONFIG_VIDEO_HELPER_CHIPS_AUTO=y
 # CONFIG_SOC_CAMERA is not set
 CONFIG_V4L_USB_DRIVERS=y
 # CONFIG_USB_VIDEO_CLASS is not set
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
 CONFIG_USB_GSPCA=m
 # CONFIG_USB_M5602 is not set
 # CONFIG_USB_STV06XX is not set
@@ -800,6 +805,7 @@ CONFIG_USB_GSPCA=m
 # CONFIG_VIDEO_PVRUSB2 is not set
 # CONFIG_VIDEO_HDPVR is not set
 # CONFIG_VIDEO_EM28XX is not set
+# CONFIG_VIDEO_CX231XX is not set
 # CONFIG_VIDEO_USBVISION is not set
 # CONFIG_USB_VICAM is not set
 # CONFIG_USB_IBMCAM is not set
@@ -813,6 +819,7 @@ CONFIG_USB_GSPCA=m
 # CONFIG_USB_STV680 is not set
 # CONFIG_USB_ZC0301 is not set
 # CONFIG_USB_PWC is not set
+CONFIG_USB_PWC_INPUT_EVDEV=y
 # CONFIG_USB_ZR364XX is not set
 # CONFIG_USB_STKWEBCAM is not set
 # CONFIG_USB_S2255 is not set
@@ -914,15 +921,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1050,6 +1059,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1070,6 +1080,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1125,6 +1140,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1174,10 +1190,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1279,6 +1308,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig
index 125304e80f578..edbece52afc1c 100644
--- a/arch/sh/configs/se7619_defconfig
+++ b/arch/sh/configs/se7619_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:44:53 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:02:32 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -61,6 +62,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_UID16 is not set
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -78,11 +80,14 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=1
@@ -134,6 +139,7 @@ CONFIG_CPU_SUBTYPE_SH7619=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -143,8 +149,6 @@ CONFIG_CPU_SUBTYPE_SH7619=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -513,7 +517,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -563,6 +566,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -601,8 +609,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -630,10 +643,21 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -647,6 +671,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig
index 0308abf523849..bae161c668357 100644
--- a/arch/sh/configs/se7705_defconfig
+++ b/arch/sh/configs/se7705_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:45:56 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:02:52 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -62,7 +63,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -70,6 +70,7 @@ CONFIG_EMBEDDED=y
 CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,12 +89,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -150,6 +154,7 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -159,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -698,7 +701,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -751,6 +753,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -804,6 +811,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -847,10 +855,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -949,6 +970,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig
index a8c24b7034893..330043f3c3164 100644
--- a/arch/sh/configs/se7712_defconfig
+++ b/arch/sh/configs/se7712_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:48:18 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:03:27 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -38,6 +39,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -69,6 +71,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 # CONFIG_BUG is not set
@@ -87,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -94,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +111,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -151,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7712=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -160,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7712=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -585,6 +589,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -812,6 +817,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -832,6 +838,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -887,6 +898,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -927,6 +939,7 @@ CONFIG_FRAME_WARN=1024
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -961,6 +974,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -969,9 +983,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1090,6 +1109,7 @@ CONFIG_CRYPTO_DEFLATE=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig
index 4b79c2567dc84..56478918440da 100644
--- a/arch/sh/configs/se7721_defconfig
+++ b/arch/sh/configs/se7721_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:51:44 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:04:19 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -38,6 +39,7 @@ CONFIG_LOCALVERSION=""
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -73,6 +75,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 # CONFIG_BUG is not set
@@ -91,6 +94,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -98,6 +102,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -109,7 +115,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -155,6 +160,7 @@ CONFIG_CPU_SUBTYPE_SH7721=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -164,8 +170,6 @@ CONFIG_CPU_SUBTYPE_SH7721=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -776,15 +780,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -943,6 +949,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y
 CONFIG_EXT2_FS_SECURITY=y
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -963,6 +970,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1021,6 +1033,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 # CONFIG_NETWORK_FILESYSTEMS is not set
 
 #
@@ -1085,6 +1098,7 @@ CONFIG_FRAME_WARN=1024
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1119,6 +1133,7 @@ CONFIG_FRAME_POINTER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1127,9 +1142,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1248,6 +1268,7 @@ CONFIG_CRYPTO_DEFLATE=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig
index 82bdaac45fb51..726fdbdb2807f 100644
--- a/arch/sh/configs/se7722_defconfig
+++ b/arch/sh/configs/se7722_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:55:10 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:05:29 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,7 +70,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -78,6 +78,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -97,6 +98,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -167,6 +170,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -176,8 +180,6 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 CONFIG_CPU_SUBTYPE_SH7722=y
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -486,6 +488,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -692,7 +695,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -766,6 +768,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -788,6 +791,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -832,6 +840,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -872,11 +881,25 @@ CONFIG_DEBUG_FS=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -977,6 +1000,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig
index ceef6d9138ee2..ed1a1230f6362 100644
--- a/arch/sh/configs/se7750_defconfig
+++ b/arch/sh/configs/se7750_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:57:31 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:06:02 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -70,6 +71,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +90,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -106,7 +111,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -152,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7750=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -161,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7750=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -553,6 +556,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -774,6 +778,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -829,6 +838,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -886,10 +896,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -989,6 +1012,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig
index 67fc26b3a7d09..55f3b788e0cbd 100644
--- a/arch/sh/configs/se7751_defconfig
+++ b/arch/sh/configs/se7751_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 18:59:59 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:06:44 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -74,6 +74,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,6 +93,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -110,7 +114,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -741,6 +743,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -796,6 +803,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -833,10 +841,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -936,6 +957,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/se7780_defconfig b/arch/sh/configs/se7780_defconfig
index ebce23cc2ad83..c4f0af32efa97 100644
--- a/arch/sh/configs/se7780_defconfig
+++ b/arch/sh/configs/se7780_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:02:05 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:07:14 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -67,6 +68,7 @@ CONFIG_EMBEDDED=y
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -86,12 +88,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -103,7 +108,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
 #
@@ -149,6 +153,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 CONFIG_CPU_SUBTYPE_SH7780=y
@@ -158,8 +163,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -285,8 +288,6 @@ CONFIG_CMDLINE="console=ttySC0.115200 root=/dev/sda1"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -558,6 +559,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -957,15 +959,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1121,6 +1125,10 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1245,11 +1253,24 @@ CONFIG_DEBUG_FS=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1345,6 +1366,7 @@ CONFIG_CRYPTO=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig
index 6fcdb090cf322..f9c6e51dc0b0a 100644
--- a/arch/sh/configs/sh03_defconfig
+++ b/arch/sh/configs/sh03_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:04:59 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:07:56 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -41,6 +42,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -67,7 +69,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +77,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODVERSIONS=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -163,6 +166,7 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -172,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7751=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -300,8 +302,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 mem=64M root=/dev/nfs"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -562,6 +562,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -656,6 +657,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -679,6 +681,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -887,7 +890,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -929,6 +931,7 @@ CONFIG_EXT2_FS_XATTR=y
 # CONFIG_EXT2_FS_SECURITY is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -951,6 +954,11 @@ CONFIG_AUTOFS_FS=y
 CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1001,6 +1009,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1112,11 +1121,26 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -1228,6 +1252,7 @@ CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig
index 1ab37c01da6e9..48b58098cf841 100644
--- a/arch/sh/configs/sh7710voipgw_defconfig
+++ b/arch/sh/configs/sh7710voipgw_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:09:01 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:09:16 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -39,6 +40,7 @@ CONFIG_LOCALVERSION_AUTO=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -72,6 +74,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -90,6 +93,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -97,6 +101,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_BASE_SMALL=0
@@ -108,7 +114,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -154,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7710=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -163,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7710=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -728,7 +732,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 # CONFIG_USB_ARCH_HAS_OHCI is not set
@@ -779,6 +782,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -834,6 +842,7 @@ CONFIG_JFFS2_RTIME=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -871,11 +880,24 @@ CONFIG_DEBUG_FS=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -975,6 +997,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig
index 268d04ed8cdda..ec8f18c7684ce 100644
--- a/arch/sh/configs/sh7724_generic_defconfig
+++ b/arch/sh/configs/sh7724_generic_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Thu Apr 16 15:42:20 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:09:47 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig
index c79bb84ec3051..f77bc7998d2fb 100644
--- a/arch/sh/configs/sh7763rdp_defconfig
+++ b/arch/sh/configs/sh7763rdp_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:10:57 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:10:12 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -78,6 +79,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULES=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -164,6 +167,7 @@ CONFIG_CPU_SH4A=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -173,8 +177,6 @@ CONFIG_CPU_SUBTYPE_SH7763=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -555,6 +557,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -941,6 +944,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -964,6 +968,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1012,6 +1021,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1100,11 +1110,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1204,6 +1228,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig
index a6cf4505741ca..1c55b800d1240 100644
--- a/arch/sh/configs/sh7785lcr_32bit_defconfig
+++ b/arch/sh/configs/sh7785lcr_32bit_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:12:18 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:10:53 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -79,6 +80,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -98,6 +100,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -106,6 +109,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -118,7 +123,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -166,6 +170,7 @@ CONFIG_CPU_SHX2=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -175,8 +180,6 @@ CONFIG_CPU_SUBTYPE_SH7785=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -310,8 +313,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1009,15 +1011,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1218,6 +1222,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1239,6 +1244,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1289,6 +1299,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1377,6 +1388,9 @@ CONFIG_DEBUG_KERNEL=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1413,6 +1427,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1422,9 +1437,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1542,6 +1562,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/sh7785lcr_defconfig b/arch/sh/configs/sh7785lcr_defconfig
index e4fac2efc055e..4385fe97a7807 100644
--- a/arch/sh/configs/sh7785lcr_defconfig
+++ b/arch/sh/configs/sh7785lcr_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.30-rc2
-# Wed Apr 22 19:17:56 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:11:48 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
@@ -307,8 +307,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig
index d695e90618748..4e120256ec638 100644
--- a/arch/sh/configs/shmin_defconfig
+++ b/arch/sh/configs/shmin_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:19:03 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:12:41 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_FIND_NEXT_BIT=y
@@ -63,6 +64,7 @@ CONFIG_EMBEDDED=y
 # CONFIG_UID16 is not set
 # CONFIG_SYSCTL_SYSCALL is not set
 # CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 # CONFIG_BUG is not set
@@ -81,12 +83,15 @@ CONFIG_COMPAT_BRK=y
 # CONFIG_SLUB is not set
 CONFIG_SLOB=y
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_BASE_SMALL=1
 # CONFIG_MODULES is not set
@@ -137,6 +142,7 @@ CONFIG_CPU_SUBTYPE_SH7706=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -146,8 +152,6 @@ CONFIG_CPU_SUBTYPE_SH7706=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -674,6 +678,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -718,6 +727,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -762,10 +772,22 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 CONFIG_SH_STANDARD_BIOS=y
@@ -864,6 +886,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig
index e3651f5743996..c088144925fae 100644
--- a/arch/sh/configs/shx3_defconfig
+++ b/arch/sh/configs/shx3_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:20:54 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:13:12 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -42,6 +43,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 # CONFIG_TASKSTATS is not set
@@ -96,6 +98,7 @@ CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -126,6 +129,8 @@ CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_RT_MUTEXES=y
 CONFIG_BASE_SMALL=0
@@ -138,7 +143,6 @@ CONFIG_MODULE_UNLOAD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -186,6 +190,7 @@ CONFIG_CPU_SHX3=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -195,8 +200,6 @@ CONFIG_CPU_SUBTYPE_SHX3=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -553,6 +556,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -646,6 +650,7 @@ CONFIG_DEVKMEM=y
 #
 # Non-8250 serial port support
 #
+# CONFIG_SERIAL_MAX3100 is not set
 CONFIG_SERIAL_SH_SCI=y
 CONFIG_SERIAL_SH_SCI_NR_UARTS=2
 CONFIG_SERIAL_SH_SCI_CONSOLE=y
@@ -985,6 +990,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1007,6 +1013,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1051,6 +1062,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -1085,6 +1097,9 @@ CONFIG_DEBUG_SHIRQ=y
 CONFIG_DETECT_SOFTLOCKUP=y
 # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set
 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0
+CONFIG_DETECT_HUNG_TASK=y
+# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set
+CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1124,6 +1139,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_RING_BUFFER=y
 CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1133,11 +1149,16 @@ CONFIG_TRACING=y
 # CONFIG_PREEMPT_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_FTRACE_STARTUP_TEST is not set
 # CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1245,6 +1266,7 @@ CONFIG_CRYPTO=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
diff --git a/arch/sh/configs/snapgear_defconfig b/arch/sh/configs/snapgear_defconfig
index 6960f60bf52e9..54a7a3c41f34f 100644
--- a/arch/sh/configs/snapgear_defconfig
+++ b/arch/sh/configs/snapgear_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:21:39 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:14:00 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -64,7 +65,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -73,6 +73,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -92,12 +93,15 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -105,7 +109,6 @@ CONFIG_BASE_SMALL=0
 # CONFIG_MODULES is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -151,6 +154,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -160,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -295,8 +297,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -762,6 +762,11 @@ CONFIG_FILE_LOCKING=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -805,8 +810,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 # CONFIG_NFS_FS is not set
 # CONFIG_NFSD is not set
@@ -844,10 +854,23 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -862,6 +885,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/systemh_defconfig b/arch/sh/configs/systemh_defconfig
index 7ea639bc5936e..dbe7e546f0bb6 100644
--- a/arch/sh/configs/systemh_defconfig
+++ b/arch/sh/configs/systemh_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:23:31 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:14:33 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -61,7 +62,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -70,6 +70,7 @@ CONFIG_UID16=y
 # CONFIG_SYSCTL_SYSCALL is not set
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 # CONFIG_HOTPLUG is not set
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -88,6 +89,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -95,6 +97,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -107,7 +111,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -153,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -162,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -505,6 +507,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -547,8 +554,13 @@ CONFIG_CRAMFS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 
 #
 # Partition Types
@@ -577,10 +589,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -595,6 +621,7 @@ CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SECURITYFS is not set
 # CONFIG_SECURITY_FILE_CAPABILITIES is not set
 # CONFIG_CRYPTO is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig
index bbeb4c6ebb955..8ca94ef74278c 100644
--- a/arch/sh/configs/titan_defconfig
+++ b/arch/sh/configs/titan_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:24:55 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:14:55 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -40,6 +41,7 @@ CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 # CONFIG_BSD_PROCESS_ACCT is not set
 # CONFIG_TASKSTATS is not set
 # CONFIG_AUDIT is not set
@@ -66,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -76,6 +77,7 @@ CONFIG_UID16=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_ALL is not set
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -95,6 +97,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 # CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
 CONFIG_HAVE_IOREMAP_PROT=y
@@ -102,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -114,7 +119,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -160,6 +164,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -169,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -305,8 +308,6 @@ CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw"
 #
 CONFIG_PCI=y
 CONFIG_SH_PCIDMA_NONCOHERENT=y
-CONFIG_PCI_AUTO=y
-CONFIG_PCI_AUTO_UPDATE_RESOURCES=y
 # CONFIG_PCIEPORTBUS is not set
 # CONFIG_ARCH_SUPPORTS_MSI is not set
 CONFIG_PCI_LEGACY=y
@@ -789,6 +790,7 @@ CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_SCSI_MPT2SAS is not set
 # CONFIG_SCSI_HPTIOP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_FCOE is not set
 # CONFIG_SCSI_DMX3191D is not set
 # CONFIG_SCSI_FUTURE_DOMAIN is not set
@@ -906,6 +908,7 @@ CONFIG_NETDEV_1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -929,6 +932,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1182,7 +1186,6 @@ CONFIG_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_USB_SUPPORT=y
 CONFIG_USB_ARCH_HAS_HCD=y
 CONFIG_USB_ARCH_HAS_OHCI=y
@@ -1393,6 +1396,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 # CONFIG_EXT3_FS_XATTR is not set
 # CONFIG_EXT4_FS is not set
 CONFIG_JBD=y
@@ -1418,6 +1422,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 CONFIG_FUSE_FS=m
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1467,8 +1476,13 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_HPFS_FS is not set
 # CONFIG_QNX4FS_FS is not set
 CONFIG_ROMFS_FS=y
+CONFIG_ROMFS_BACKED_BY_BLOCK=y
+# CONFIG_ROMFS_BACKED_BY_MTD is not set
+# CONFIG_ROMFS_BACKED_BY_BOTH is not set
+CONFIG_ROMFS_ON_BLOCK=y
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1576,6 +1590,7 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 CONFIG_SCHED_DEBUG=y
 # CONFIG_SCHEDSTATS is not set
 # CONFIG_TIMER_STATS is not set
@@ -1610,6 +1625,7 @@ CONFIG_SCHED_DEBUG=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -1618,9 +1634,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 # CONFIG_IRQSOFF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -1741,6 +1762,7 @@ CONFIG_CRYPTO_DEFLATE=y
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig
index 34f5192a32411..bfb4d9806892e 100644
--- a/arch/sh/configs/ul2_defconfig
+++ b/arch/sh/configs/ul2_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:30:27 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 13:17:05 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -69,7 +70,6 @@ CONFIG_INITRAMFS_SOURCE=""
 CONFIG_RD_GZIP=y
 # CONFIG_RD_BZIP2 is not set
 # CONFIG_RD_LZMA is not set
-CONFIG_INITRAMFS_COMPRESSION_NONE=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
 CONFIG_ANON_INODES=y
@@ -78,6 +78,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -97,6 +98,7 @@ CONFIG_COMPAT_BRK=y
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -117,7 +121,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -167,6 +170,7 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -176,8 +180,6 @@ CONFIG_ARCH_SHMOBILE=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 CONFIG_CPU_SUBTYPE_SH7366=y
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -584,6 +586,7 @@ CONFIG_SCSI_WAIT_SCAN=m
 CONFIG_SCSI_LOWLEVEL=y
 # CONFIG_ISCSI_TCP is not set
 # CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
 # CONFIG_SCSI_DEBUG is not set
 # CONFIG_SCSI_DH is not set
 # CONFIG_SCSI_OSD_INITIATOR is not set
@@ -936,6 +939,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -957,6 +961,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1005,6 +1014,7 @@ CONFIG_CRAMFS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 # CONFIG_NFS_V3 is not set
@@ -1095,10 +1105,24 @@ CONFIG_FRAME_WARN=1024
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1208,6 +1232,7 @@ CONFIG_CRYPTO_ARC4=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig
index d174b1a4d802a..512664fed66c0 100644
--- a/arch/sh/configs/urquell_defconfig
+++ b/arch/sh/configs/urquell_defconfig
@@ -1,10 +1,11 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29
-# Thu Apr  2 19:33:39 2009
+# Linux kernel version: 2.6.30-rc3
+# Mon Apr 27 14:02:55 2009
 #
 CONFIG_SUPERH=y
 CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
 CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
 CONFIG_RWSEM_GENERIC_SPINLOCK=y
 CONFIG_GENERIC_BUG=y
@@ -76,6 +77,7 @@ CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 # CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
@@ -94,6 +96,7 @@ CONFIG_SLAB=y
 # CONFIG_SLUB is not set
 # CONFIG_SLOB is not set
 CONFIG_PROFILING=y
+# CONFIG_MARKERS is not set
 # CONFIG_OPROFILE is not set
 CONFIG_HAVE_OPROFILE=y
 # CONFIG_KPROBES is not set
@@ -102,6 +105,8 @@ CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
 CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -114,7 +119,6 @@ CONFIG_MODULE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-# CONFIG_BLK_DEV_IO_TRACE is not set
 # CONFIG_BLK_DEV_BSG is not set
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -162,6 +166,7 @@ CONFIG_CPU_SHX3=y
 # CONFIG_CPU_SUBTYPE_SH7760 is not set
 # CONFIG_CPU_SUBTYPE_SH4_202 is not set
 # CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
 # CONFIG_CPU_SUBTYPE_SH7763 is not set
 # CONFIG_CPU_SUBTYPE_SH7770 is not set
 # CONFIG_CPU_SUBTYPE_SH7780 is not set
@@ -171,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7786=y
 # CONFIG_CPU_SUBTYPE_SH7343 is not set
 # CONFIG_CPU_SUBTYPE_SH7722 is not set
 # CONFIG_CPU_SUBTYPE_SH7366 is not set
-# CONFIG_CPU_SUBTYPE_SH5_101 is not set
-# CONFIG_CPU_SUBTYPE_SH5_103 is not set
 
 #
 # Memory management options
@@ -900,15 +903,17 @@ CONFIG_USB_HID=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+# CONFIG_HID_KYE is not set
 CONFIG_HID_GYRATION=y
+# CONFIG_HID_KENSINGTON is not set
 CONFIG_HID_LOGITECH=y
 # CONFIG_LOGITECH_FF is not set
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1046,6 +1051,7 @@ CONFIG_EXT2_FS=y
 # CONFIG_EXT2_FS_XATTR is not set
 # CONFIG_EXT2_FS_XIP is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 # CONFIG_EXT3_FS_POSIX_ACL is not set
 # CONFIG_EXT3_FS_SECURITY is not set
@@ -1067,6 +1073,11 @@ CONFIG_INOTIFY_USER=y
 # CONFIG_AUTOFS4_FS is not set
 # CONFIG_FUSE_FS is not set
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -1117,6 +1128,7 @@ CONFIG_MINIX_FS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -1209,10 +1221,24 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
 #
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_SH_STANDARD_BIOS is not set
@@ -1322,6 +1348,7 @@ CONFIG_CRYPTO_DES=y
 #
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 # CONFIG_CRYPTO_HW is not set
+# CONFIG_BINARY_PRINTF is not set
 
 #
 # Library routines
-- 
GitLab


From 5be7c0a4d3dfe25091f2e4e524103e81d9e7e180 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 27 Apr 2009 14:40:47 +0900
Subject: [PATCH 0604/2198] sh: select GENERIC_TIME for new CMT driver.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 4c68fdedfa106..164748945f953 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -476,10 +476,11 @@ config SH_TIMER_CMT
 	bool "CMT clockevents driver"
 	depends on SYS_SUPPORTS_CMT && !SH_CMT
 	select GENERIC_CLOCKEVENTS
+	select GENERIC_TIME
 
 config SH_MTU2
 	bool "MTU2 timer support"
-	depends on CPU_SH2A
+	depends on CPU_SH2A && !GENERIC_TIME
 	default y
 	help
 	  This enables the use of the MTU2 as the system timer.
-- 
GitLab


From 148be2c15d4a866fbc7a8f55342e4fd4de73be61 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 27 Apr 2009 08:02:14 +0200
Subject: [PATCH 0605/2198] perf_counter tools: move helper library to util/*

Clean up the top level directory a bit by moving all the helper libraries
to util/*.[ch].

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile           | 60 ++++++++++---------
 Documentation/perf_counter/builtin-help.c     | 10 ++--
 Documentation/perf_counter/builtin-stat.c     |  2 +-
 Documentation/perf_counter/builtin-top.c      |  2 +-
 Documentation/perf_counter/builtin.h          |  4 +-
 Documentation/perf_counter/perf.c             |  8 +--
 .../perf_counter/{ => util}/PERF-VERSION-GEN  |  0
 .../perf_counter/{ => util}/abspath.c         |  0
 Documentation/perf_counter/{ => util}/alias.c |  0
 Documentation/perf_counter/{ => util}/cache.h |  0
 .../perf_counter/{ => util}/config.c          |  0
 Documentation/perf_counter/{ => util}/ctype.c |  0
 .../perf_counter/{ => util}/exec_cmd.c        |  0
 .../perf_counter/{ => util}/exec_cmd.h        |  0
 .../{ => util}/generate-cmdlist.sh            |  0
 Documentation/perf_counter/{ => util}/help.c  |  2 +-
 Documentation/perf_counter/{ => util}/help.h  |  0
 .../perf_counter/{ => util}/levenshtein.c     |  0
 .../perf_counter/{ => util}/levenshtein.h     |  0
 .../perf_counter/{ => util}/parse-options.c   |  3 -
 .../perf_counter/{ => util}/parse-options.h   |  0
 Documentation/perf_counter/{ => util}/path.c  |  0
 Documentation/perf_counter/{ => util}/quote.c |  0
 Documentation/perf_counter/{ => util}/quote.h |  2 +-
 .../perf_counter/{ => util}/run-command.c     |  0
 .../perf_counter/{ => util}/run-command.h     |  0
 .../perf_counter/{ => util}/strbuf.c          |  0
 .../perf_counter/{ => util}/strbuf.h          |  0
 Documentation/perf_counter/{ => util}/usage.c |  0
 Documentation/perf_counter/{ => util}/util.h  |  0
 .../perf_counter/{ => util}/wrapper.c         |  0
 31 files changed, 46 insertions(+), 47 deletions(-)
 rename Documentation/perf_counter/{ => util}/PERF-VERSION-GEN (100%)
 rename Documentation/perf_counter/{ => util}/abspath.c (100%)
 rename Documentation/perf_counter/{ => util}/alias.c (100%)
 rename Documentation/perf_counter/{ => util}/cache.h (100%)
 rename Documentation/perf_counter/{ => util}/config.c (100%)
 rename Documentation/perf_counter/{ => util}/ctype.c (100%)
 rename Documentation/perf_counter/{ => util}/exec_cmd.c (100%)
 rename Documentation/perf_counter/{ => util}/exec_cmd.h (100%)
 rename Documentation/perf_counter/{ => util}/generate-cmdlist.sh (100%)
 rename Documentation/perf_counter/{ => util}/help.c (99%)
 rename Documentation/perf_counter/{ => util}/help.h (100%)
 rename Documentation/perf_counter/{ => util}/levenshtein.c (100%)
 rename Documentation/perf_counter/{ => util}/levenshtein.h (100%)
 rename Documentation/perf_counter/{ => util}/parse-options.c (99%)
 rename Documentation/perf_counter/{ => util}/parse-options.h (100%)
 rename Documentation/perf_counter/{ => util}/path.c (100%)
 rename Documentation/perf_counter/{ => util}/quote.c (100%)
 rename Documentation/perf_counter/{ => util}/quote.h (97%)
 rename Documentation/perf_counter/{ => util}/run-command.c (100%)
 rename Documentation/perf_counter/{ => util}/run-command.h (100%)
 rename Documentation/perf_counter/{ => util}/strbuf.c (100%)
 rename Documentation/perf_counter/{ => util}/strbuf.h (100%)
 rename Documentation/perf_counter/{ => util}/usage.c (100%)
 rename Documentation/perf_counter/{ => util}/util.h (100%)
 rename Documentation/perf_counter/{ => util}/wrapper.c (100%)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 690045e496924..543ccf28ac4a6 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -147,7 +147,7 @@ all::
 # broken, or spawning external process is slower than built-in grep perf has).
 
 PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
-	@$(SHELL_PATH) ./PERF-VERSION-GEN
+	@$(SHELL_PATH) util/PERF-VERSION-GEN
 -include PERF-VERSION-FILE
 
 uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
@@ -287,26 +287,28 @@ export PERL_PATH
 LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
-LIB_H += levenshtein.h
-LIB_H += parse-options.h
-LIB_H += quote.h
-LIB_H += strbuf.h
-LIB_H += run-command.h
-
-LIB_OBJS += abspath.o
-LIB_OBJS += alias.o
-LIB_OBJS += config.o
-LIB_OBJS += ctype.o
-LIB_OBJS += exec_cmd.o
-LIB_OBJS += help.o
-LIB_OBJS += levenshtein.o
-LIB_OBJS += parse-options.o
-LIB_OBJS += path.o
-LIB_OBJS += run-command.o
-LIB_OBJS += quote.o
-LIB_OBJS += strbuf.o
-LIB_OBJS += usage.o
-LIB_OBJS += wrapper.o
+LIB_H += util/levenshtein.h
+LIB_H += util/parse-options.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/help.h
+LIB_H += util/strbuf.h
+LIB_H += util/run-command.h
+
+LIB_OBJS += util/abspath.o
+LIB_OBJS += util/alias.o
+LIB_OBJS += util/config.o
+LIB_OBJS += util/ctype.o
+LIB_OBJS += util/exec_cmd.o
+LIB_OBJS += util/help.o
+LIB_OBJS += util/levenshtein.o
+LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/path.o
+LIB_OBJS += util/run-command.o
+LIB_OBJS += util/quote.o
+LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/usage.o
+LIB_OBJS += util/wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-record.o
@@ -620,10 +622,10 @@ $(BUILT_INS): perf$X
 	ln -s perf$X $@ 2>/dev/null || \
 	cp perf$X $@
 
-common-cmds.h: ./generate-cmdlist.sh command-list.txt
+common-cmds.h: util/generate-cmdlist.sh command-list.txt
 
 common-cmds.h: $(wildcard Documentation/perf-*.txt)
-	$(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@
+	$(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@
 
 $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
 	$(QUIET_GEN)$(RM) $@ $@+ && \
@@ -656,7 +658,7 @@ perf.o perf.spec \
 %.o: %.S
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
 
-exec_cmd.o: exec_cmd.c PERF-CFLAGS
+util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
 		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
 		'-DBINDIR="$(bindir_relative_SQ)"' \
@@ -666,7 +668,7 @@ exec_cmd.o: exec_cmd.c PERF-CFLAGS
 builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
 
-config.o: config.c PERF-CFLAGS
+util/config.o: util/config.c PERF-CFLAGS
 	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
 
 perf-%$X: %.o $(PERFLIBS)
@@ -729,7 +731,7 @@ export NO_SVN_TESTS
 check: common-cmds.h
 	if sparse; \
 	then \
-		for i in *.c; \
+		for i in *.c */*.c; \
 		do \
 			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
 		done; \
@@ -795,7 +797,7 @@ distclean: clean
 	$(RM) configure
 
 clean:
-	$(RM) *.o $(LIB_FILE)
+	$(RM) *.o */*.o $(LIB_FILE)
 	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
 	$(RM) $(TEST_PROGRAMS)
 	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
@@ -836,7 +838,7 @@ coverage-build: coverage-clean
 		-j1 test
 
 coverage-report:
-	gcov -b *.c
-	grep '^function.*called 0 ' *.c.gcov \
+	gcov -b *.c */*.c
+	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
 		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
 		| tee coverage-untested-functions
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index a136d619db371..6616de0ef0535 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -3,13 +3,13 @@
  *
  * Builtin help command
  */
-#include "cache.h"
+#include "util/cache.h"
 #include "builtin.h"
-#include "exec_cmd.h"
+#include "util/exec_cmd.h"
 #include "common-cmds.h"
-#include "parse-options.h"
-#include "run-command.h"
-#include "help.h"
+#include "util/parse-options.h"
+#include "util/run-command.h"
+#include "util/help.h"
 
 static struct man_viewer_list {
 	struct man_viewer_list *next;
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 169a2d1783fca..d7ace631fc4ff 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -61,7 +61,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
-#include "util.h"
+#include "util/util.h"
 
 #include <getopt.h>
 #include <assert.h>
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 98e8690b6bcb0..dea016fa41e87 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,7 +42,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
-#include "util.h"
+#include "util/util.h"
 
 #include <getopt.h>
 #include <assert.h>
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 800f86c1d4450..d32318aed8cf1 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -1,8 +1,8 @@
 #ifndef BUILTIN_H
 #define BUILTIN_H
 
-#include "util.h"
-#include "strbuf.h"
+#include "util/util.h"
+#include "util/strbuf.h"
 
 extern const char perf_version_string[];
 extern const char perf_usage_string[];
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 8d6faecdc15d6..594d270be390d 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -1,8 +1,8 @@
 #include "builtin.h"
-#include "exec_cmd.h"
-#include "cache.h"
-#include "quote.h"
-#include "run-command.h"
+#include "util/exec_cmd.h"
+#include "util/cache.h"
+#include "util/quote.h"
+#include "util/run-command.h"
 
 const char perf_usage_string[] =
 	"perf [--version] [--help] COMMAND [ARGS]";
diff --git a/Documentation/perf_counter/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN
similarity index 100%
rename from Documentation/perf_counter/PERF-VERSION-GEN
rename to Documentation/perf_counter/util/PERF-VERSION-GEN
diff --git a/Documentation/perf_counter/abspath.c b/Documentation/perf_counter/util/abspath.c
similarity index 100%
rename from Documentation/perf_counter/abspath.c
rename to Documentation/perf_counter/util/abspath.c
diff --git a/Documentation/perf_counter/alias.c b/Documentation/perf_counter/util/alias.c
similarity index 100%
rename from Documentation/perf_counter/alias.c
rename to Documentation/perf_counter/util/alias.c
diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/util/cache.h
similarity index 100%
rename from Documentation/perf_counter/cache.h
rename to Documentation/perf_counter/util/cache.h
diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/util/config.c
similarity index 100%
rename from Documentation/perf_counter/config.c
rename to Documentation/perf_counter/util/config.c
diff --git a/Documentation/perf_counter/ctype.c b/Documentation/perf_counter/util/ctype.c
similarity index 100%
rename from Documentation/perf_counter/ctype.c
rename to Documentation/perf_counter/util/ctype.c
diff --git a/Documentation/perf_counter/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c
similarity index 100%
rename from Documentation/perf_counter/exec_cmd.c
rename to Documentation/perf_counter/util/exec_cmd.c
diff --git a/Documentation/perf_counter/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h
similarity index 100%
rename from Documentation/perf_counter/exec_cmd.h
rename to Documentation/perf_counter/util/exec_cmd.h
diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh
similarity index 100%
rename from Documentation/perf_counter/generate-cmdlist.sh
rename to Documentation/perf_counter/util/generate-cmdlist.sh
diff --git a/Documentation/perf_counter/help.c b/Documentation/perf_counter/util/help.c
similarity index 99%
rename from Documentation/perf_counter/help.c
rename to Documentation/perf_counter/util/help.c
index ec0116721660b..edde541d238d7 100644
--- a/Documentation/perf_counter/help.c
+++ b/Documentation/perf_counter/util/help.c
@@ -1,5 +1,5 @@
 #include "cache.h"
-#include "builtin.h"
+#include "../builtin.h"
 #include "exec_cmd.h"
 #include "levenshtein.h"
 #include "help.h"
diff --git a/Documentation/perf_counter/help.h b/Documentation/perf_counter/util/help.h
similarity index 100%
rename from Documentation/perf_counter/help.h
rename to Documentation/perf_counter/util/help.h
diff --git a/Documentation/perf_counter/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c
similarity index 100%
rename from Documentation/perf_counter/levenshtein.c
rename to Documentation/perf_counter/util/levenshtein.c
diff --git a/Documentation/perf_counter/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h
similarity index 100%
rename from Documentation/perf_counter/levenshtein.h
rename to Documentation/perf_counter/util/levenshtein.h
diff --git a/Documentation/perf_counter/parse-options.c b/Documentation/perf_counter/util/parse-options.c
similarity index 99%
rename from Documentation/perf_counter/parse-options.c
rename to Documentation/perf_counter/util/parse-options.c
index 7464f34e5407e..28b34c1c29cf5 100644
--- a/Documentation/perf_counter/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -469,9 +469,6 @@ int parse_options_usage(const char * const *usagestr,
 }
 
 
-/*----- some often used options -----*/
-#include "cache.h"
-
 int parse_opt_verbosity_cb(const struct option *opt, const char *arg,
 			   int unset)
 {
diff --git a/Documentation/perf_counter/parse-options.h b/Documentation/perf_counter/util/parse-options.h
similarity index 100%
rename from Documentation/perf_counter/parse-options.h
rename to Documentation/perf_counter/util/parse-options.h
diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/util/path.c
similarity index 100%
rename from Documentation/perf_counter/path.c
rename to Documentation/perf_counter/util/path.c
diff --git a/Documentation/perf_counter/quote.c b/Documentation/perf_counter/util/quote.c
similarity index 100%
rename from Documentation/perf_counter/quote.c
rename to Documentation/perf_counter/util/quote.c
diff --git a/Documentation/perf_counter/quote.h b/Documentation/perf_counter/util/quote.h
similarity index 97%
rename from Documentation/perf_counter/quote.h
rename to Documentation/perf_counter/util/quote.h
index 66730f2bff3ce..5dfad89816db1 100644
--- a/Documentation/perf_counter/quote.h
+++ b/Documentation/perf_counter/util/quote.h
@@ -18,7 +18,7 @@
  * run the command on the other side:
  *
  * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
- * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
+ * sprintf(rcmd, "ssh %s %s", sq_util/quote.host), sq_quote(cmd));
  *
  * Note that the above examples leak memory!  Remember to free result from
  * sq_quote() in a real application.
diff --git a/Documentation/perf_counter/run-command.c b/Documentation/perf_counter/util/run-command.c
similarity index 100%
rename from Documentation/perf_counter/run-command.c
rename to Documentation/perf_counter/util/run-command.c
diff --git a/Documentation/perf_counter/run-command.h b/Documentation/perf_counter/util/run-command.h
similarity index 100%
rename from Documentation/perf_counter/run-command.h
rename to Documentation/perf_counter/util/run-command.h
diff --git a/Documentation/perf_counter/strbuf.c b/Documentation/perf_counter/util/strbuf.c
similarity index 100%
rename from Documentation/perf_counter/strbuf.c
rename to Documentation/perf_counter/util/strbuf.c
diff --git a/Documentation/perf_counter/strbuf.h b/Documentation/perf_counter/util/strbuf.h
similarity index 100%
rename from Documentation/perf_counter/strbuf.h
rename to Documentation/perf_counter/util/strbuf.h
diff --git a/Documentation/perf_counter/usage.c b/Documentation/perf_counter/util/usage.c
similarity index 100%
rename from Documentation/perf_counter/usage.c
rename to Documentation/perf_counter/util/usage.c
diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util/util.h
similarity index 100%
rename from Documentation/perf_counter/util.h
rename to Documentation/perf_counter/util/util.h
diff --git a/Documentation/perf_counter/wrapper.c b/Documentation/perf_counter/util/wrapper.c
similarity index 100%
rename from Documentation/perf_counter/wrapper.c
rename to Documentation/perf_counter/util/wrapper.c
-- 
GitLab


From b2ba83ff4f4405cebc10884121ee71338a1a6c94 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 26 Apr 2009 23:38:08 -0700
Subject: [PATCH 0606/2198] x86: apic: Remove duplicated macros

XAPIC_DEST_* is dupliicated to the one in apicdef.h

[ Impact: cleanup ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49F552D0.5050505@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/summit_32.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9cfe1f415d81f..344eee4ac0a48 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
 		rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
 }
 
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT	4
-#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
 #define SUMMIT_APIC_DFR_VALUE	(APIC_DFR_CLUSTER)
 
 static const struct cpumask *summit_target_cpus(void)
-- 
GitLab


From e0e42142bab96404de535cceb85d6533d5ad7942 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 26 Apr 2009 23:39:38 -0700
Subject: [PATCH 0607/2198] x86: Use dmi check in apic_is_clustered() on 64-bit
 to mark the TSC unstable

We will have systems with 2 and more sockets 8cores/2thread,
but we treat them as multi chassis - while they could have
a stable TSC domain.

Use DMI check instead.

[ Impact: do not turn possibly stable TSCs off incorrectly ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ravikiran Thirumalai <kiran@scalex86.org>
LKML-Reference: <49F5532A.5000802@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 86 +++++++++++++++++++++++++------------
 1 file changed, 59 insertions(+), 27 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1386dbec5525f..28f747d61d787 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2138,31 +2138,14 @@ static void apic_pm_activate(void) { }
 #endif	/* CONFIG_PM */
 
 #ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
 {
 	int i, clusters, zeros;
 	unsigned id;
 	u16 *bios_cpu_apicid;
 	DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
 
-	/*
-	 * there is not this kind of box with AMD CPU yet.
-	 * Some AMD box with quadcore cpu and 8 sockets apicid
-	 * will be [4, 0x23] or [8, 0x27] could be thought to
-	 * vsmp box still need checking...
-	 */
-	if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
-		return 0;
-
 	bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
 	bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
@@ -2198,18 +2181,67 @@ __cpuinit int apic_is_clustered_box(void)
 			++zeros;
 	}
 
-	/* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-	 * not guaranteed to be synced between boards
-	 */
-	if (is_vsmp_box() && clusters > 1)
+	return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+	if (multi)
+		return 0;
+	printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident);
+	multi = 1;
+	return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+	{
+		.callback = set_multi,
+		.ident = "IBM System Summit2",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+		},
+	},
+	{}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+	if (multi_checked)
+		return;
+
+	dmi_check_system(multi_dmi_table);
+	multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to check them
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+	dmi_check_multi();
+	if (multi)
 		return 1;
 
+	if (!is_vsmp_box())
+		return 0;
+
 	/*
-	 * If clusters > 2, then should be multi-chassis.
-	 * May have to revisit this when multi-core + hyperthreaded CPUs come
-	 * out, but AFAIK this will work even for them.
+	 * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+	 * not guaranteed to be synced between boards
 	 */
-	return (clusters > 2);
+	if (apic_cluster_num() > 1)
+		return 1;
+
+	return 0;
 }
 #endif
 
-- 
GitLab


From 47c8a08bbe77ad3c06f63919a14b0f0b0cd54390 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 27 Apr 2009 17:34:39 +0900
Subject: [PATCH 0608/2198] sh: rtc-generic support.

This adds rtc-generic support for SUPERH32.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig           |  1 +
 arch/sh/include/asm/rtc.h | 11 +++++++++++
 arch/sh/kernel/time_32.c  | 23 +++++++++++++++++++++++
 drivers/rtc/Kconfig       |  2 +-
 4 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 164748945f953..9db02ced57a5a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -30,6 +30,7 @@ config SUPERH32
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_ARCH_KGDB
 	select ARCH_HIBERNATION_POSSIBLE if MMU
+	select RTC_LIB
 
 config SUPERH64
 	def_bool ARCH = "sh64"
diff --git a/arch/sh/include/asm/rtc.h b/arch/sh/include/asm/rtc.h
index f7b010d48af7b..52b0c2dba979c 100644
--- a/arch/sh/include/asm/rtc.h
+++ b/arch/sh/include/asm/rtc.h
@@ -6,6 +6,17 @@ extern void (*board_time_init)(void);
 extern void (*rtc_sh_get_time)(struct timespec *);
 extern int (*rtc_sh_set_time)(const time_t);
 
+/* some dummy definitions */
+#define RTC_BATT_BAD 0x100	/* battery bad */
+#define RTC_SQWE 0x08		/* enable square-wave output */
+#define RTC_DM_BINARY 0x04	/* all time/date values are BCD if clear */
+#define RTC_24H 0x02		/* 24 hour mode - else hours bit 7 means pm */
+#define RTC_DST_EN 0x01	        /* auto switch DST - works f. USA only */
+
+struct rtc_time;
+unsigned int get_rtc_time(struct rtc_time *);
+int set_rtc_time(struct rtc_time *);
+
 #define RTC_CAP_4_DIGIT_YEAR	(1 << 0)
 
 struct sh_rtc_platform_info {
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index c770413c32130..109409f5ca53f 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -19,6 +19,7 @@
 #include <linux/mc146818rtc.h>	/* for rtc_lock */
 #include <linux/platform_device.h>
 #include <linux/smp.h>
+#include <linux/rtc.h>
 #include <asm/clock.h>
 #include <asm/rtc.h>
 #include <asm/timer.h>
@@ -45,6 +46,28 @@ static int null_rtc_set_time(const time_t secs)
 void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
 int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
 
+unsigned int get_rtc_time(struct rtc_time *tm)
+{
+	if (rtc_sh_get_time != null_rtc_get_time) {
+		struct timespec tv;
+
+		rtc_sh_get_time(&tv);
+		rtc_time_to_tm(tv.tv_sec, tm);
+	}
+
+	return RTC_24H;
+}
+EXPORT_SYMBOL(get_rtc_time);
+
+int set_rtc_time(struct rtc_time *tm)
+{
+	unsigned long secs;
+
+	rtc_tm_to_time(tm, &secs);
+	return rtc_sh_set_time(secs);
+}
+EXPORT_SYMBOL(set_rtc_time);
+
 #ifndef CONFIG_GENERIC_TIME
 void do_gettimeofday(struct timeval *tv)
 {
diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig
index 4e9851fc17463..277d35d232fad 100644
--- a/drivers/rtc/Kconfig
+++ b/drivers/rtc/Kconfig
@@ -692,7 +692,7 @@ config RTC_DRV_GENERIC
 	tristate "Generic RTC support"
 	# Please consider writing a new RTC driver instead of using the generic
 	# RTC abstraction
-	depends on PARISC || M68K || PPC
+	depends on PARISC || M68K || PPC || SUPERH32
 	help
 	  Say Y or M here to enable RTC support on systems using the generic
 	  RTC abstraction. If you do not know what you are doing, you should
-- 
GitLab


From c2e0090c668fc99f5be65fd9907da781cb6a2ef5 Mon Sep 17 00:00:00 2001
From: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Date: Mon, 27 Apr 2009 17:54:41 +0900
Subject: [PATCH 0609/2198] sh: mach-r2d: add physmap-flash support for R2D+
 boards.

The RTS7751R2D_1 boards only support 1MB of socket-mounted MBM29F040
flash, which we just leave alone as it's not terribly interesting.

This adds support for the s29gl256p on the r2d+ boards that makes a bit
more sense to expose to the user.

Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD <plagnioj@jcrosoft.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-r2d/setup.c          | 50 ++++++++++++++
 arch/sh/configs/rts7751r2dplus_defconfig | 87 +++++++++++++++++++++++-
 2 files changed, 136 insertions(+), 1 deletion(-)

diff --git a/arch/sh/boards/mach-r2d/setup.c b/arch/sh/boards/mach-r2d/setup.c
index c585be00956ea..a625ecb93e47e 100644
--- a/arch/sh/boards/mach-r2d/setup.c
+++ b/arch/sh/boards/mach-r2d/setup.c
@@ -10,6 +10,9 @@
  */
 #include <linux/init.h>
 #include <linux/platform_device.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/physmap.h>
 #include <linux/ata_platform.h>
 #include <linux/sm501.h>
 #include <linux/sm501-regs.h>
@@ -181,6 +184,50 @@ static struct platform_device sm501_device = {
 	.resource	= sm501_resources,
 };
 
+static struct mtd_partition r2d_partitions[] = {
+	{
+		.name		= "U-Boot",
+		.offset		= 0x00000000,
+		.size		= 0x00040000,
+		.mask_flags	= MTD_WRITEABLE,
+	}, {
+		.name		= "Environment",
+		.offset		= MTDPART_OFS_NXTBLK,
+		.size		= 0x00040000,
+		.mask_flags	= MTD_WRITEABLE,
+	}, {
+		.name		= "Kernel",
+		.offset		= MTDPART_OFS_NXTBLK,
+		.size		= 0x001c0000,
+	}, {
+		.name		= "Flash_FS",
+		.offset		= MTDPART_OFS_NXTBLK,
+		.size		= MTDPART_SIZ_FULL,
+	}
+};
+
+static struct physmap_flash_data flash_data = {
+	.width		= 2,
+	.nr_parts	= ARRAY_SIZE(r2d_partitions),
+	.parts		= r2d_partitions,
+};
+
+static struct resource flash_resource = {
+	.start		= 0x00000000,
+	.end		= 0x02000000,
+	.flags		= IORESOURCE_MEM,
+};
+
+static struct platform_device flash_device = {
+	.name		= "physmap-flash",
+	.id		= -1,
+	.resource	= &flash_resource,
+	.num_resources	= 1,
+	.dev		= {
+		.platform_data = &flash_data,
+	},
+};
+
 static struct platform_device *rts7751r2d_devices[] __initdata = {
 	&sm501_device,
 	&heartbeat_device,
@@ -203,6 +250,9 @@ static int __init rts7751r2d_devices_setup(void)
 	if (register_trapped_io(&cf_trapped_io) == 0)
 		platform_device_register(&cf_ide_device);
 
+	if (mach_is_r2d_plus())
+		platform_device_register(&flash_device);
+
 	spi_register_board_info(spi_bus, ARRAY_SIZE(spi_bus));
 
 	return platform_add_devices(rts7751r2d_devices,
diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig
index 2bc1c97d346a7..a860435b8847a 100644
--- a/arch/sh/configs/rts7751r2dplus_defconfig
+++ b/arch/sh/configs/rts7751r2dplus_defconfig
@@ -424,7 +424,92 @@ CONFIG_FIRMWARE_IN_KERNEL=y
 CONFIG_EXTRA_FIRMWARE=""
 # CONFIG_SYS_HYPERVISOR is not set
 # CONFIG_CONNECTOR is not set
-# CONFIG_MTD is not set
+CONFIG_MTD=y
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_CONCAT=y
+CONFIG_MTD_PARTITIONS=y
+# CONFIG_MTD_TESTS is not set
+# CONFIG_MTD_REDBOOT_PARTS is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+# CONFIG_MTD_AR7_PARTS is not set
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=y
+# CONFIG_MTD_BLKDEVS is not set
+# CONFIG_MTD_BLOCK is not set
+# CONFIG_MTD_BLOCK_RO is not set
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+# CONFIG_INFTL is not set
+# CONFIG_RFD_FTL is not set
+# CONFIG_SSFDC is not set
+# CONFIG_MTD_OOPS is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=y
+# CONFIG_MTD_JEDECPROBE is not set
+CONFIG_MTD_GEN_PROBE=y
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+# CONFIG_MTD_CFI_INTELEXT is not set
+CONFIG_MTD_CFI_AMDSTD=y
+# CONFIG_MTD_CFI_STAA is not set
+CONFIG_MTD_CFI_UTIL=y
+# CONFIG_MTD_RAM is not set
+# CONFIG_MTD_ROM is not set
+# CONFIG_MTD_ABSENT is not set
+
+#
+# Mapping drivers for chip access
+#
+# CONFIG_MTD_COMPLEX_MAPPINGS is not set
+CONFIG_MTD_PHYSMAP=y
+# CONFIG_MTD_PHYSMAP_COMPAT is not set
+# CONFIG_MTD_INTEL_VR_NOR is not set
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+# CONFIG_MTD_PMC551 is not set
+# CONFIG_MTD_DATAFLASH is not set
+# CONFIG_MTD_M25P80 is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+# CONFIG_MTD_MTDRAM is not set
+# CONFIG_MTD_BLOCK2MTD is not set
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+# CONFIG_MTD_NAND is not set
+# CONFIG_MTD_ONENAND is not set
+
+#
+# LPDDR flash memory drivers
+#
+# CONFIG_MTD_LPDDR is not set
+
+#
+# UBI - Unsorted block images
+#
+# CONFIG_MTD_UBI is not set
 # CONFIG_PARPORT is not set
 CONFIG_BLK_DEV=y
 # CONFIG_BLK_CPQ_CISS_DA is not set
-- 
GitLab


From ac2ff946a53e7bd0ae98f4e5d1d6c1b1dced82e5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 12:38:32 +0900
Subject: [PATCH 0610/2198] mg_disk: fix locking

IRQ and timeout handlers call functions which expect locked queue lock
without locking it.  Fix it.

While at it, convert 0s used as null pointer constant to NULLs.

[ Impact: fix locking, cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
---
 drivers/block/mg_disk.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index fb39d9aa3cdcc..d3e72ad08dbe1 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -160,11 +160,16 @@ static irqreturn_t mg_irq(int irq, void *dev_id)
 	struct mg_host *host = dev_id;
 	void (*handler)(struct mg_host *) = host->mg_do_intr;
 
-	host->mg_do_intr = 0;
+	spin_lock(&host->lock);
+
+	host->mg_do_intr = NULL;
 	del_timer(&host->timer);
 	if (!handler)
 		handler = mg_unexpected_intr;
 	handler(host);
+
+	spin_unlock(&host->lock);
+
 	return IRQ_HANDLED;
 }
 
@@ -319,7 +324,7 @@ static void mg_read(struct request *req)
 
 	remains = req->nr_sectors;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, 0) !=
+	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) !=
 			MG_ERR_NONE)
 		mg_bad_rw_intr(host);
 
@@ -363,7 +368,7 @@ static void mg_write(struct request *req)
 
 	remains = req->nr_sectors;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, 0) !=
+	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) !=
 			MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
@@ -521,9 +526,11 @@ void mg_times_out(unsigned long data)
 	char *name;
 	struct request *req;
 
+	spin_lock_irq(&host->lock);
+
 	req = elv_next_request(host->breq);
 	if (!req)
-		return;
+		goto out_unlock;
 
 	host->mg_do_intr = NULL;
 
@@ -534,6 +541,8 @@ void mg_times_out(unsigned long data)
 	mg_bad_rw_intr(host);
 
 	mg_request(host->breq);
+out_unlock:
+	spin_unlock_irq(&host->lock);
 }
 
 static void mg_request_poll(struct request_queue *q)
-- 
GitLab


From 7090a0a97f55cbf47547a140fcc5a349f32c598c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Apr 2009 12:38:33 +0900
Subject: [PATCH 0611/2198] mg_disk: fix CONFIG_LBD=y warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

drivers/block/mg_disk.c: In function ‘mg_dump_status’:
drivers/block/mg_disk.c:265: warning: format ‘%ld’ expects type ‘long int’, but
argument 2 has type ‘sector_t’

[ Impact: kill build warning ]

Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/block/mg_disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index d3e72ad08dbe1..f3898353d0a8f 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -79,7 +79,7 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 			if (host->breq) {
 				req = elv_next_request(host->breq);
 				if (req)
-					printk(", sector=%ld", req->sector);
+					printk(", sector=%u", (u32)req->sector);
 			}
 
 		}
-- 
GitLab


From e93b9fb7d85da4fd9d5171649e5ddcac1dd572bf Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 12:38:33 +0900
Subject: [PATCH 0612/2198] hd: fix locking

hd dance around local irq and HD_IRQ enable without achieving much.
It ends up transferring data from irq handler with both local irq and
HD_IRQ disabled.  The only place it actually does something is while
transferring the first block of a request which it does with HD_IRQ
disabled but local irq enabled.

Unfortunately, the dancing is horribly broken from locking POV.  IRQ
and timeout handlers access block queue without grabbing the queue
lock and running the driver in SMP configuration crashes the whole
machine pretty quickly.

Remove meaningless irq enable/disable dancing and add proper locking
in issue, irq and timeout paths.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/block/hd.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 3c11f062a18cf..baaa9e486e508 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -509,7 +509,6 @@ static void write_intr(void)
 	if (i > 0) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
-		local_irq_enable();
 	} else {
 #if (HD_DELAY > 0)
 		last_req = read_timer();
@@ -541,8 +540,7 @@ static void hd_times_out(unsigned long dummy)
 	if (!CURRENT)
 		return;
 
-	disable_irq(HD_IRQ);
-	local_irq_enable();
+	spin_lock_irq(hd_queue->queue_lock);
 	reset = 1;
 	name = CURRENT->rq_disk->disk_name;
 	printk("%s: timeout\n", name);
@@ -552,9 +550,8 @@ static void hd_times_out(unsigned long dummy)
 #endif
 		end_request(CURRENT, 0);
 	}
-	local_irq_disable();
 	hd_request();
-	enable_irq(HD_IRQ);
+	spin_unlock_irq(hd_queue->queue_lock);
 }
 
 static int do_special_op(struct hd_i_struct *disk, struct request *req)
@@ -592,7 +589,6 @@ static void hd_request(void)
 		return;
 repeat:
 	del_timer(&device_timer);
-	local_irq_enable();
 
 	req = CURRENT;
 	if (!req) {
@@ -601,7 +597,6 @@ static void hd_request(void)
 	}
 
 	if (reset) {
-		local_irq_disable();
 		reset_hd();
 		return;
 	}
@@ -660,9 +655,7 @@ static void hd_request(void)
 
 static void do_hd_request(struct request_queue *q)
 {
-	disable_irq(HD_IRQ);
 	hd_request();
-	enable_irq(HD_IRQ);
 }
 
 static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -684,12 +677,16 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id)
 {
 	void (*handler)(void) = do_hd;
 
+	spin_lock(hd_queue->queue_lock);
+
 	do_hd = NULL;
 	del_timer(&device_timer);
 	if (!handler)
 		handler = unexpected_hd_interrupt;
 	handler();
-	local_irq_enable();
+
+	spin_unlock(hd_queue->queue_lock);
+
 	return IRQ_HANDLED;
 }
 
-- 
GitLab


From e686307fdc84f249490e6c9da92fcb2424491f14 Mon Sep 17 00:00:00 2001
From: Akinobu Mita <akinobu.mita@gmail.com>
Date: Fri, 17 Apr 2009 08:41:21 +0200
Subject: [PATCH 0613/2198] loop: use BIO list management functions

Now that the bio list management stuff is generic, convert loop to use
bio lists instead of its own private bio list implementation.

Cc:  Jens Axboe <axboe@kernel.dk>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Akinobu Mita <akinobu.mita@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/loop.c | 26 +++++++-------------------
 include/linux/bio.h  |  2 +-
 include/linux/loop.h |  3 +--
 3 files changed, 9 insertions(+), 22 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ddae80825899a..9ca4bb0146571 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -511,11 +511,7 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
  */
 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
 {
-	if (lo->lo_biotail) {
-		lo->lo_biotail->bi_next = bio;
-		lo->lo_biotail = bio;
-	} else
-		lo->lo_bio = lo->lo_biotail = bio;
+	bio_list_add(&lo->lo_bio_list, bio);
 }
 
 /*
@@ -523,16 +519,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio)
  */
 static struct bio *loop_get_bio(struct loop_device *lo)
 {
-	struct bio *bio;
-
-	if ((bio = lo->lo_bio)) {
-		if (bio == lo->lo_biotail)
-			lo->lo_biotail = NULL;
-		lo->lo_bio = bio->bi_next;
-		bio->bi_next = NULL;
-	}
-
-	return bio;
+	return bio_list_pop(&lo->lo_bio_list);
 }
 
 static int loop_make_request(struct request_queue *q, struct bio *old_bio)
@@ -609,12 +596,13 @@ static int loop_thread(void *data)
 
 	set_user_nice(current, -20);
 
-	while (!kthread_should_stop() || lo->lo_bio) {
+	while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
 
 		wait_event_interruptible(lo->lo_event,
-				lo->lo_bio || kthread_should_stop());
+				!bio_list_empty(&lo->lo_bio_list) ||
+				kthread_should_stop());
 
-		if (!lo->lo_bio)
+		if (bio_list_empty(&lo->lo_bio_list))
 			continue;
 		spin_lock_irq(&lo->lo_lock);
 		bio = loop_get_bio(lo);
@@ -841,7 +829,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	lo->old_gfp_mask = mapping_gfp_mask(mapping);
 	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
 
-	lo->lo_bio = lo->lo_biotail = NULL;
+	bio_list_init(&lo->lo_bio_list);
 
 	/*
 	 * set queue make_request_fn, and add limits based on lower level
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7b214fd672a2d..f37ca8c726ba0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -506,7 +506,7 @@ static inline int bio_has_data(struct bio *bio)
 }
 
 /*
- * BIO list managment for use by remapping drivers (e.g. DM or MD).
+ * BIO list management for use by remapping drivers (e.g. DM or MD) and loop.
  *
  * A bio_list anchors a singly-linked list of bios chained through the bi_next
  * member of the bio.  The bio_list also caches the last list member to allow
diff --git a/include/linux/loop.h b/include/linux/loop.h
index 40725447f5e0e..66c194e2d9b9c 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -56,8 +56,7 @@ struct loop_device {
 	gfp_t		old_gfp_mask;
 
 	spinlock_t		lo_lock;
-	struct bio 		*lo_bio;
-	struct bio		*lo_biotail;
+	struct bio_list		lo_bio_list;
 	int			lo_state;
 	struct mutex		lo_ctl_mutex;
 	struct task_struct	*lo_thread;
-- 
GitLab


From 924cec7789f65ab7f022256f6533ecba0747b5f3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0614/2198] block: clear req->errors on bio completion only for
 fs requests

Impact: subtle behavior change

For fs requests, rq is only carrier of bios and rq error status as a
whole doesn't mean much.  This is the reason why rq->errors is being
cleared on each partial completion of a request as on each partial
completion the error status is transferred to the respective bios.

For pc requests, rq->errors is used to carry error status to the
issuer and thus __end_that_request_first() doesn't clear it on such
cases.

The condition was fine till now as only fs and pc requests have used
bio and thus the bio completion path.  However, future changes will
unify data accesses to bio and all non fs users care about rq error
status.  Clear rq->errors on bio completion only for fs requests.

In general, the implicit clearing is a bit too subtle especially as
the meaning of rq->errors is completely dependent on low level
drivers.  Unifying / cleaning up rq->errors usage and letting llds
manage it would be better.  TODO comment added.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Jens Axboe <axboe@kernel.dk>
---
 block/blk-core.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2998fe3a23778..41bc0ff75e28f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1741,10 +1741,14 @@ static int __end_that_request_first(struct request *req, int error,
 	trace_block_rq_complete(req->q, req);
 
 	/*
-	 * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
-	 * sense key with us all the way through
+	 * For fs requests, rq is just carrier of independent bio's
+	 * and each partial completion should be handled separately.
+	 * Reset per-request error on each partial completion.
+	 *
+	 * TODO: tj: This is too subtle.  It would be better to let
+	 * low level drivers do what they see fit.
 	 */
-	if (!blk_pc_request(req))
+	if (blk_fs_request(req))
 		req->errors = 0;
 
 	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
-- 
GitLab


From 0de57fb93b1daaeaecb658a98b3299ae460c02e9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0615/2198] ide-tape: remove back-to-back REQUEST_SENSE
 detection

Impact: fix an oops which always triggers

ide_tape_issue_pc() assumed drive->pc isn't NULL on invocation when
checking for back-to-back request sense issues but drive->pc can be
NULL and even when it's not NULL, it's not safe to dereference it once
the previous command is complete because pc could have been freed or
was on stack.  Kill back-to-back REQUEST_SENSE detection.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index cb942a9b580f6..3a53e0834cf79 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -614,12 +614,6 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 {
 	idetape_tape_t *tape = drive->driver_data;
 
-	if (drive->pc->c[0] == REQUEST_SENSE &&
-	    pc->c[0] == REQUEST_SENSE) {
-		printk(KERN_ERR "ide-tape: possible ide-tape.c bug - "
-			"Two request sense in serial were issued\n");
-	}
-
 	if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE)
 		drive->failed_pc = pc;
 
-- 
GitLab


From 220d06b5531e7b8a6226b2fdfb21198c3ccc4f76 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0616/2198] ide: use blk_run_queue() instead of
 blk_start_queueing()

blk_start_queueing() is being phased out in favor of
[__]blk_run_queue().  Switch.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-park.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c
index 310d03f2b5b79..a914023d6d035 100644
--- a/drivers/ide/ide-park.c
+++ b/drivers/ide/ide-park.c
@@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
 			start_queue = 1;
 		spin_unlock_irq(&hwif->lock);
 
-		if (start_queue) {
-			spin_lock_irq(q->queue_lock);
-			blk_start_queueing(q);
-			spin_unlock_irq(q->queue_lock);
-		}
+		if (start_queue)
+			blk_run_queue(q);
 		return;
 	}
 	spin_unlock_irq(&hwif->lock);
-- 
GitLab


From b2963ac1738542d30305d7e12c8c078a383a425c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0617/2198] ide: don't set REQ_SOFTBARRIER

ide doesn't have to worry about REQ_SOFTBARRIER.  Don't set it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-disk.c   | 1 -
 drivers/ide/ide-ioctls.c | 1 -
 2 files changed, 2 deletions(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index a9fbe2c31210c..c2438804d3c4a 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
 	cmd->protocol = ATA_PROT_NODATA;
 
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	rq->special = cmd;
 }
 
diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c
index c1c25ebbaa1fb..5991b23793f20 100644
--- a/drivers/ide/ide-ioctls.c
+++ b/drivers/ide/ide-ioctls.c
@@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive)
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_len = 1;
 	rq->cmd[0] = REQ_DRIVE_RESET;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 	if (blk_execute_rq(drive->queue, NULL, rq, 1))
 		ret = rq->errors;
 	blk_put_request(rq);
-- 
GitLab


From 214ae19104141404f18ad6651752eb38e3dd584e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0618/2198] ide kill unused ide_cmd->special

Impact: removal of unused field

No one uses ide_cmd->special anymore.  Kill it.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/ide.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/include/linux/ide.h b/include/linux/ide.h
index ff65fffb078f6..846a1e132407d 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -324,7 +324,6 @@ struct ide_cmd {
 	unsigned int		cursg_ofs;
 
 	struct request		*rq;		/* copy of request */
-	void			*special;	/* valid_t generally */
 };
 
 /* ATAPI packet command flags */
-- 
GitLab


From 59a4f6f355fc718581ddcf1bb45a469d4756c035 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:41 +0900
Subject: [PATCH 0619/2198] ide-cd: clear sense buffer before issuing request
 sense

Impact: code simplification

ide_cd_request_sense_fixup() clears the tail of the sense buffer if
the device didn't completely fill it.  This patch makes
cdrom_queue_request_sense() clear the sense buffer before issuing the
command instead of clearing it afterwards.  This simplifies code and
eases future changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-cd.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 3d4e099697634..b6a0d126b5763 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -217,6 +217,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	if (sense == NULL)
 		sense = &info->sense_data;
 
+	memset(sense, 0, 18);
+
 	/* stuff the sense request in front of our current request */
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_ATA_PC;
@@ -504,14 +506,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
 	 * and some drives don't send them.  Sigh.
 	 */
 	if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
-	    cmd->nleft > 0 && cmd->nleft <= 5) {
-		unsigned int ofs = cmd->nbytes - cmd->nleft;
-
-		while (cmd->nleft > 0) {
-			*((u8 *)rq->data + ofs++) = 0;
-			cmd->nleft--;
-		}
-	}
+	    cmd->nleft > 0 && cmd->nleft <= 5)
+		cmd->nleft = 0;
 }
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
-- 
GitLab


From 8968932e54db35cf9d69cfbbd50c26dfaaa586c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0620/2198] ide-floppy: block pc always uses bio

Impact: remove unnecessary code path

Block pc requests always use bio and rq->data is always NULL.  No need
to worry about !rq->bio cases in idefloppy_block_pc_cmd().  Note that
ide-atapi uses ide_pio_bytes() for bio PIO transfer which handle sg
fine.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-floppy.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 2b4868d95f8b0..3b22e066287e0 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -216,15 +216,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	if (rq->data_len && rq_data_dir(rq) == WRITE)
-		pc->flags |= PC_FLAG_WRITING;
-	pc->buf = rq->data;
-	if (rq->bio)
+	if (rq->data_len) {
 		pc->flags |= PC_FLAG_DMA_OK;
-	/*
-	 * possibly problematic, doesn't look like ide-floppy correctly
-	 * handled scattered requests if dma fails...
-	 */
+		if (rq_data_dir(rq) == WRITE)
+			pc->flags |= PC_FLAG_WRITING;
+	}
+	/* pio will be performed by ide_pio_bytes() which handles sg fine */
+	pc->buf = NULL;
 	pc->req_xfer = pc->buf_size = rq->data_len;
 }
 
-- 
GitLab


From d868ca24302e99a0e8a86071ca2c66273edf97d9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0621/2198] ide-taskfile: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide_raw_taskfile() directly uses rq->buffer to carry pointer to the
data buffer.  This complicates both block interface and ide backend
request handling.  Use blk_rq_map_kern() instead and drop special
handling for REQ_TYPE_ATA_TASKFILE from ide_map_sg().

Note that REQ_RW setting is moved upwards as blk_rq_map_kern() uses it
to initialize bio rw flag.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c       |  5 +----
 drivers/ide/ide-taskfile.c | 18 +++++++++++-------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 35dc38d3b2c58..9b9e8b1aae5ee 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,10 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
-		sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE);
-		cmd->sg_nents = 1;
-	} else if (!rq->bio) {
+	if (!rq->bio) {
 		sg_init_one(sg, rq->data, rq->data_len);
 		cmd->sg_nents = 1;
 	} else
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index 4aa6223c11bea..f400eb4d4aff7 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-	rq->buffer = buf;
+
+	if (cmd->tf_flags & IDE_TFLAG_WRITE)
+		rq->cmd_flags |= REQ_RW;
 
 	/*
 	 * (ks) We transfer currently only whole sectors.
@@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 	 * if we would find a solution to transfer any size.
 	 * To support special commands like READ LONG.
 	 */
-	rq->hard_nr_sectors = rq->nr_sectors = nsect;
-	rq->hard_cur_sectors = rq->current_nr_sectors = nsect;
-
-	if (cmd->tf_flags & IDE_TFLAG_WRITE)
-		rq->cmd_flags |= REQ_RW;
+	if (nsect) {
+		error = blk_rq_map_kern(drive->queue, rq, buf,
+					nsect * SECTOR_SIZE, __GFP_WAIT);
+		if (error)
+			goto put_req;
+	}
 
 	rq->special = cmd;
 	cmd->rq = rq;
 
 	error = blk_execute_rq(drive->queue, NULL, rq, 0);
-	blk_put_request(rq);
 
+put_req:
+	blk_put_request(rq);
 	return error;
 }
 
-- 
GitLab


From ac0b0113ddbab3ed2388132d368c97292f9f3c84 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0622/2198] ide-atapi: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-atapi uses rq->buffer as private opaque value for internal special
requests.  rq->special isn't used for these cases (the only case where
rq->special is used is for ide-tape rw requests).  Use rq->special
instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-atapi.c  | 4 ++--
 drivers/ide/ide-floppy.c | 2 +-
 drivers/ide/ide-tape.c   | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 7201b176d75b0..2894577237ba7 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -90,7 +90,7 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
 	blk_rq_init(NULL, rq);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 	rq->rq_disk = disk;
 
 	if (pc->req_xfer) {
@@ -119,7 +119,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->buffer = (char *)pc;
+	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
 		rq->data = pc->buf;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 3b22e066287e0..94600331a2715 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -264,7 +264,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
 	} else if (blk_special_request(rq)) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
 		idefloppy_blockpc_cmd(floppy, pc, rq);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 3a53e0834cf79..aadf53cfac6f0 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -828,7 +828,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 		goto out;
 	}
 	if (rq->cmd[13] & REQ_IDETAPE_PC1) {
-		pc = (struct ide_atapi_pc *) rq->buffer;
+		pc = (struct ide_atapi_pc *)rq->special;
 		rq->cmd[13] &= ~(REQ_IDETAPE_PC1);
 		rq->cmd[13] |= REQ_IDETAPE_PC2;
 		goto out;
-- 
GitLab


From 1f181d2b1569dfb88a584a6e1847e9e1c7645951 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0623/2198] ide-cd: don't abuse rq->buffer

Impact: rq->buffer usage cleanup

ide-cd uses rq->buffer to carry pointer to the original request when
issuing REQUEST_SENSE.  Use rq->special instead.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-cd.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index b6a0d126b5763..bb77c79c1018d 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -232,8 +232,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 	rq->cmd_type = REQ_TYPE_SENSE;
 	rq->cmd_flags |= REQ_PREEMPT;
 
-	/* NOTE! Save the failed command in "rq->buffer" */
-	rq->buffer = (void *) failed_command;
+	/* NOTE! Save the failed command in "rq->special" */
+	rq->special = (void *)failed_command;
 
 	if (failed_command)
 		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
@@ -247,10 +247,10 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
-	 * For REQ_TYPE_SENSE, "rq->buffer" points to the original
+	 * For REQ_TYPE_SENSE, "rq->special" points to the original
 	 * failed request
 	 */
-	struct request *failed = (struct request *)rq->buffer;
+	struct request *failed = (struct request *)rq->special;
 	struct cdrom_info *info = drive->driver_data;
 	void *sense = &info->sense_data;
 
-- 
GitLab


From e69d800f7e8797a8e3423380ee9d8ca1cb90c388 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0624/2198] ide: add helpers for preparing sense requests

This is in preparation of removing the queueing of a sense request out
of the IRQ handler path.

Use struct request_sense as a general sense buffer for all ATAPI
devices ide-{floppy,tape,cd}.

tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as
      it can cause deadlock.  Converted to use inline struct request
      and blk_rq_init().

    * Added xfer / cdb len selection depending on device type.

    * All sense prep logics folded into ide_prep_sense() which never
      fails.

    * hwif->rq clearing and sense_rq used handling moved into
      ide_queue_sense_rq().

    * blk_rq_map_kern() conversion is moved to later patch.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c | 61 +++++++++++++++++++++++++++++++++++++++++
 include/linux/ide.h     | 11 ++++++++
 2 files changed, 72 insertions(+)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2894577237ba7..c6e03485a63a8 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq)
+{
+	struct request_sense *sense = &drive->sense_data;
+	struct request *sense_rq = &drive->sense_rq;
+	unsigned int cmd_len, sense_len;
+
+	debug_log("%s: enter\n", __func__);
+
+	switch (drive->media) {
+	case ide_floppy:
+		cmd_len = 255;
+		sense_len = 18;
+		break;
+	case ide_tape:
+		cmd_len = 20;
+		sense_len = 20;
+		break;
+	default:
+		cmd_len = 18;
+		sense_len = 18;
+	}
+
+	BUG_ON(sense_len > sizeof(*sense));
+
+	if (blk_sense_request(rq) || drive->sense_rq_armed)
+		return;
+
+	memset(sense, 0, sizeof(*sense));
+
+	blk_rq_init(rq->q, sense_rq);
+	sense_rq->rq_disk = rq->rq_disk;
+
+	sense_rq->data = sense;
+	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
+	sense_rq->cmd[4] = cmd_len;
+	sense_rq->data_len = sense_len;
+
+	sense_rq->cmd_type = REQ_TYPE_SENSE;
+	sense_rq->cmd_flags |= REQ_PREEMPT;
+
+	if (drive->media == ide_tape)
+		sense_rq->cmd[13] = REQ_IDETAPE_PC1;
+
+	drive->sense_rq_armed = true;
+}
+EXPORT_SYMBOL_GPL(ide_prep_sense);
+
+void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+{
+	BUG_ON(!drive->sense_rq_armed);
+
+	drive->sense_rq.special = special;
+	drive->sense_rq_armed = false;
+
+	drive->hwif->rq = NULL;
+
+	elv_add_request(drive->queue, &drive->sense_rq,
+			ELEVATOR_INSERT_FRONT, 0);
+}
+EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
+
 /*
  * Called when an error was detected during the last packet command.
  * We queue a request sense packet command in the head of the request list.
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 846a1e132407d..a69ccac564111 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -26,6 +26,9 @@
 #include <asm/io.h>
 #include <asm/mutex.h>
 
+/* for request_sense */
+#include <linux/cdrom.h>
+
 #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300)
 # define SUPPORT_VLB_SYNC 0
 #else
@@ -602,6 +605,11 @@ struct ide_drive_s {
 
 	struct ide_atapi_pc request_sense_pc;
 	struct request request_sense_rq;
+
+	/* current sense rq and buffer */
+	bool sense_rq_armed;
+	struct request sense_rq;
+	struct request_sense sense_data;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *, struct gendisk *);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq);
+void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+
 int ide_cd_expiry(ide_drive_t *);
 
 int ide_cd_get_xferlen(struct request *);
-- 
GitLab


From c457ce874a0f3dfa3d5e9f2309789f6f34e24325 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0625/2198] ide-cd: convert to using generic sense request

Preallocate a sense request in the ->do_request method and reinitialize
it only on demand, in case it's been consumed in the IRQ handler path.
The reason for this is that we don't want to be mapping rq to bio in
the IRQ path and introduce all kinds of unnecessary hacks to the block
layer.

tj: * Both user and kernel PC requests expect sense data to be stored
      in separate storage other than drive->sense_data.  Copy sense
      data to rq->sense on completion if rq->sense is not NULL.  This
      fixes bogus sense data on PC requests.

As a result, remove cdrom_queue_request_sense.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-cd.c | 54 ++++++++++----------------------------------
 drivers/ide/ide-cd.h |  4 ----
 2 files changed, 12 insertions(+), 46 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index bb77c79c1018d..061d7bbcd34a3 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -206,44 +206,6 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 	ide_cd_log_error(drive->name, failed_command, sense);
 }
 
-static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
-				      struct request *failed_command)
-{
-	struct cdrom_info *info		= drive->driver_data;
-	struct request *rq		= &drive->request_sense_rq;
-
-	ide_debug_log(IDE_DBG_SENSE, "enter");
-
-	if (sense == NULL)
-		sense = &info->sense_data;
-
-	memset(sense, 0, 18);
-
-	/* stuff the sense request in front of our current request */
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_ATA_PC;
-	rq->rq_disk = info->disk;
-
-	rq->data = sense;
-	rq->cmd[0] = GPCMD_REQUEST_SENSE;
-	rq->cmd[4] = 18;
-	rq->data_len = 18;
-
-	rq->cmd_type = REQ_TYPE_SENSE;
-	rq->cmd_flags |= REQ_PREEMPT;
-
-	/* NOTE! Save the failed command in "rq->special" */
-	rq->special = (void *)failed_command;
-
-	if (failed_command)
-		ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
-					     failed_command->cmd[0]);
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
@@ -251,11 +213,16 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 	 * failed request
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct cdrom_info *info = drive->driver_data;
-	void *sense = &info->sense_data;
+	struct request_sense *sense = &drive->sense_data;
 
 	if (failed) {
 		if (failed->sense) {
+			/*
+			 * Sense is always read into drive->sense_data.
+			 * Copy back if the failed request has its
+			 * sense pointer set.
+			 */
+			memcpy(failed->sense, sense, 18);
 			sense = failed->sense;
 			failed->sense_len = rq->sense_len;
 		}
@@ -431,7 +398,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		cdrom_queue_request_sense(drive, NULL, NULL);
+		ide_queue_sense_rq(drive, NULL);
 	return 1;
 
 end_request:
@@ -445,7 +412,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 		hwif->rq = NULL;
 
-		cdrom_queue_request_sense(drive, rq->sense, rq);
+		ide_queue_sense_rq(drive, rq);
 		return 1;
 	} else
 		return 2;
@@ -893,6 +860,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 		goto out_end;
 	}
 
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h
index 1d97101099ce2..93a3cf1b0f3f8 100644
--- a/drivers/ide/ide-cd.h
+++ b/drivers/ide/ide-cd.h
@@ -87,10 +87,6 @@ struct cdrom_info {
 
 	struct atapi_toc *toc;
 
-	/* The result of the last successful request sense command
-	   on this device. */
-	struct request_sense sense_data;
-
 	u8 max_speed;		/* Max speed of the drive. */
 	u8 current_speed;	/* Current speed of the drive. */
 
-- 
GitLab


From 068753203e6cd085664a62e0fc0636e19b148a12 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <petkovbb@googlemail.com>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0626/2198] ide-atapi: convert ide-{floppy,tape} to using
 preallocated sense buffer

Since we're issuing REQ_TYPE_SENSE now we need to allow those types of
rqs in the ->do_request callbacks. As a future improvement, sense_len
assignment might be unified across all ATAPI devices. Borislav to
check with specs and test.

As a result, get rid of ide_queue_pc_head() and
drive->request_sense_rq.

tj: * Init request sense ide_atapi_pc from sense request.  In the
      longer timer, it would probably better to fold
      ide_create_request_sense_cmd() into its only current user -
      ide_floppy_get_format_progress().

    * ide_retry_pc() no longer takes @disk.

CC: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c  | 48 ++++++++++++----------------------------
 drivers/ide/ide-floppy.c |  4 +++-
 drivers/ide/ide-tape.c   |  7 ++++--
 include/linux/ide.h      |  3 +--
 4 files changed, 23 insertions(+), 39 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index c6e03485a63a8..972c522516f8f 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_init_pc);
 
-/*
- * Generate a new packet command request in front of the request queue, before
- * the current request, so that it will be processed immediately, on the next
- * pass through the driver.
- */
-static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
-			      struct ide_atapi_pc *pc, struct request *rq)
-{
-	blk_rq_init(NULL, rq);
-	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = (char *)pc;
-	rq->rq_disk = disk;
-
-	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
-	}
-
-	memcpy(rq->cmd, pc->c, 12);
-	if (drive->media == ide_tape)
-		rq->cmd[13] = REQ_IDETAPE_PC1;
-
-	drive->hwif->rq = NULL;
-
-	elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 /*
  * Add a special packet command request to the tail of the request queue,
  * and wait for it to be serviced.
@@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
 /*
  * Called when an error was detected during the last packet command.
- * We queue a request sense packet command in the head of the request list.
+ * We queue a request sense packet command at the head of the request
+ * queue.
  */
-void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk)
+void ide_retry_pc(ide_drive_t *drive)
 {
-	struct request *rq = &drive->request_sense_rq;
+	struct request *sense_rq = &drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
 	(void)ide_read_error(drive);
-	ide_create_request_sense_cmd(drive, pc);
+
+	/* init pc from sense_rq */
+	ide_init_pc(pc);
+	memcpy(pc->c, sense_rq->cmd, 12);
+	pc->buf = sense_rq->data;
+	pc->req_xfer = sense_rq->data_len;
+
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
-	ide_queue_pc_head(drive, disk, pc, rq);
+
+	ide_queue_sense_rq(drive, pc);
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
 
 			/* Retry operation */
-			ide_retry_pc(drive, rq->rq_disk);
+			ide_retry_pc(drive);
 
 			/* queued, but not started */
 			return ide_stopped;
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 94600331a2715..d3302cc891e42 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		}
 		pc = &floppy->queued_pc;
 		idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-	} else if (blk_special_request(rq)) {
+	} else if (blk_special_request(rq) || blk_sense_request(rq)) {
 		pc = (struct ide_atapi_pc *)rq->special;
 	} else if (blk_pc_request(rq)) {
 		pc = &floppy->queued_pc;
@@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 		goto out_end;
 	}
 
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index aadf53cfac6f0..8324dfa78a3f9 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
 				printk(KERN_ERR "ide-tape: %s: I/O error, ",
 						tape->name);
 			/* Retry operation */
-			ide_retry_pc(drive, tape->disk);
+			ide_retry_pc(drive);
 			return ide_stopped;
 		}
 		pc->error = 0;
@@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 			(unsigned long long)rq->sector, rq->nr_sectors,
 			rq->current_nr_sectors);
 
-	if (!blk_special_request(rq)) {
+	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
 		printk(KERN_NOTICE "ide-tape: %s: Unsupported request in "
 			"request queue (%d)\n", drive->name, rq->cmd_type);
@@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	BUG();
 
 out:
+	/* prepare sense request for this command */
+	ide_prep_sense(drive, rq);
+
 	memset(&cmd, 0, sizeof(cmd));
 
 	if (rq_data_dir(rq))
diff --git a/include/linux/ide.h b/include/linux/ide.h
index a69ccac564111..9e67ccac3c1f1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -604,7 +604,6 @@ struct ide_drive_s {
 	unsigned long atapi_flags;
 
 	struct ide_atapi_pc request_sense_pc;
-	struct request request_sense_rq;
 
 	/* current sense rq and buffer */
 	bool sense_rq_armed;
@@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
 int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
 int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
-void ide_retry_pc(ide_drive_t *, struct gendisk *);
+void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
 void ide_queue_sense_rq(ide_drive_t *drive, void *special);
-- 
GitLab


From 02e7cf8f848841ca21864ccd019e480b73c323b7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:42 +0900
Subject: [PATCH 0627/2198] ide-cd,atapi: use bio for internal commands

Impact: unify request data buffer handling

rq->data is used mostly to pass kernel buffer through request queue
without using bio.  There are only a couple of places which still do
this in kernel and converting to bio isn't difficult.

This patch converts ide-cd and atapi to use bio instead of rq->data
for request sense and internal pc commands.  With previous change to
unify sense request handling, this is relatively easily achieved by
adding blk_rq_map_kern() during sense_rq prep and PC issue.

If blk_rq_map_kern() fails for sense, the error is deferred till sense
issue and aborts the failed command which triggered the sense.  Note
that this is a slim possibility as sense prep is done on each command
issue, so for the above condition to actually trigger, all preps since
the last sense issue till the issue of the request which would require
a sense should fail.

* do_request functions might sleep now.  This should be okay as ide
  request_fn - do_ide_request() - is invoked only from make_request
  and plug work.  Make sure this is the case by adding might_sleep()
  to do_ide_request().

* Functions which access the read sense data before the sense request
  is complete now should access bio_data(sense_rq->bio) as the sense
  buffer might have been copied during blk_rq_map_kern().

* ide-tape updated to map sg.

* cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC
  special case.  Simplified.

* tp_ops->output/input_data path dropped from ide_pc_intr().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c | 52 ++++++++++++++++++++++++-----------------
 drivers/ide/ide-cd.c    | 28 +++++++++++-----------
 drivers/ide/ide-io.c    |  3 +++
 drivers/ide/ide-tape.c  |  3 +++
 include/linux/ide.h     |  2 +-
 5 files changed, 52 insertions(+), 36 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 972c522516f8f..5cefe12f5622b 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 	rq->special = (char *)pc;
 
 	if (pc->req_xfer) {
-		rq->data = pc->buf;
-		rq->data_len = pc->req_xfer;
+		error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer,
+					GFP_NOIO);
+		if (error)
+			goto put_req;
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
 	if (drive->media == ide_tape)
 		rq->cmd[13] = REQ_IDETAPE_PC1;
 	error = blk_execute_rq(drive->queue, disk, rq, 0);
+put_req:
 	blk_put_request(rq);
-
 	return error;
 }
 EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
@@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	struct request_sense *sense = &drive->sense_data;
 	struct request *sense_rq = &drive->sense_rq;
 	unsigned int cmd_len, sense_len;
+	int err;
 
 	debug_log("%s: enter\n", __func__);
 
@@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 	memset(sense, 0, sizeof(*sense));
 
 	blk_rq_init(rq->q, sense_rq);
-	sense_rq->rq_disk = rq->rq_disk;
 
-	sense_rq->data = sense;
+	err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
+			      GFP_NOIO);
+	if (unlikely(err)) {
+		if (printk_ratelimit())
+			printk(KERN_WARNING "%s: failed to map sense buffer\n",
+			       drive->name);
+		return;
+	}
+
+	sense_rq->rq_disk = rq->rq_disk;
 	sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
 	sense_rq->cmd[4] = cmd_len;
-	sense_rq->data_len = sense_len;
-
 	sense_rq->cmd_type = REQ_TYPE_SENSE;
 	sense_rq->cmd_flags |= REQ_PREEMPT;
 
@@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(ide_prep_sense);
 
-void ide_queue_sense_rq(ide_drive_t *drive, void *special)
+int ide_queue_sense_rq(ide_drive_t *drive, void *special)
 {
-	BUG_ON(!drive->sense_rq_armed);
+	/* deferred failure from ide_prep_sense() */
+	if (!drive->sense_rq_armed) {
+		printk(KERN_WARNING "%s: failed queue sense request\n",
+		       drive->name);
+		return -ENOMEM;
+	}
 
 	drive->sense_rq.special = special;
 	drive->sense_rq_armed = false;
@@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special)
 
 	elv_add_request(drive->queue, &drive->sense_rq,
 			ELEVATOR_INSERT_FRONT, 0);
+	return 0;
 }
 EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
 
@@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive)
 	/* init pc from sense_rq */
 	ide_init_pc(pc);
 	memcpy(pc->c, sense_rq->cmd, 12);
-	pc->buf = sense_rq->data;
+	pc->buf = bio_data(sense_rq->bio);	/* pointer to mapped address */
 	pc->req_xfer = sense_rq->data_len;
 
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
 
-	ide_queue_sense_rq(drive, pc);
+	if (ide_queue_sense_rq(drive, pc))
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	struct ide_cmd *cmd = &hwif->cmd;
 	struct request *rq = hwif->rq;
 	const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-	xfer_func_t *xferfunc;
 	unsigned int timeout, done;
 	u16 bcount;
 	u8 stat, ireason, dsc = 0;
@@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape)
+			if (drive->media == ide_tape && !rq->bio)
 				done = ide_rq_bytes(rq); /* FIXME */
 			else
 				done = blk_rq_bytes(rq);
@@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
-
-	if (drive->media == ide_floppy && pc->buf == NULL) {
+	if (drive->media == ide_tape && pc->bh)
+		done = drive->pc_io_buffers(drive, pc, bcount, write);
+	else {
 		done = min_t(unsigned int, bcount, cmd->nleft);
 		ide_pio_bytes(drive, cmd, write, done);
-	} else if (drive->media == ide_tape && pc->bh) {
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	} else {
-		done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
-		xferfunc(drive, NULL, pc->cur_pos, done);
 	}
 
 	/* Update the current position */
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 061d7bbcd34a3..673628790f108 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
 	/*
 	 * For REQ_TYPE_SENSE, "rq->special" points to the original
-	 * failed request
+	 * failed request.  Also, the sense data should be read
+	 * directly from rq which might be different from the original
+	 * sense buffer if it got copied during mapping.
 	 */
 	struct request *failed = (struct request *)rq->special;
-	struct request_sense *sense = &drive->sense_data;
+	void *sense = bio_data(rq->bio);
 
 	if (failed) {
 		if (failed->sense) {
@@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 	/* if we got a CHECK_CONDITION status, queue a request sense command */
 	if (stat & ATA_ERR)
-		ide_queue_sense_rq(drive, NULL);
+		return ide_queue_sense_rq(drive, NULL) ? 2 : 1;
 	return 1;
 
 end_request:
@@ -412,8 +414,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 		hwif->rq = NULL;
 
-		ide_queue_sense_rq(drive, rq);
-		return 1;
+		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
 	} else
 		return 2;
 }
@@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		rq->cmd_flags |= cmd_flags;
 		rq->timeout = timeout;
 		if (buffer) {
-			rq->data = buffer;
-			rq->data_len = *bufflen;
+			error = blk_rq_map_kern(drive->queue, rq, buffer,
+						*bufflen, GFP_NOIO);
+			if (error) {
+				blk_put_request(rq);
+				return error;
+			}
 		}
 
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
@@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 	drive->dma = 0;
 
 	/* sg request */
-	if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) {
+	if (rq->bio) {
 		struct request_queue *q = drive->queue;
+		char *buf = bio_data(rq->bio);
 		unsigned int alignment;
-		char *buf;
-
-		if (rq->bio)
-			buf = bio_data(rq->bio);
-		else
-			buf = rq->data;
 
 		drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 9b9e8b1aae5ee..3245c2dbda339 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q)
 
 	spin_unlock_irq(q->queue_lock);
 
+	/* HLD do_request() callback might sleep, make sure it's okay */
+	might_sleep();
+
 	if (ide_lock_host(host, hwif))
 		goto plug_device_2;
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8324dfa78a3f9..9b762a2d5d955 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -850,6 +850,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 
 	cmd.rq = rq;
 
+	ide_init_sg_cmd(&cmd, pc->req_xfer);
+	ide_map_sg(drive, &cmd);
+
 	return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 9e67ccac3c1f1..1957461ac762c 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
 void ide_retry_pc(ide_drive_t *drive);
 
 void ide_prep_sense(ide_drive_t *drive, struct request *rq);
-void ide_queue_sense_rq(ide_drive_t *drive, void *special);
+int ide_queue_sense_rq(ide_drive_t *drive, void *special);
 
 int ide_cd_expiry(ide_drive_t *);
 
-- 
GitLab


From 765139ef5f1a4b1d5cb1f1a7a12de7ee61f6500f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 07:00:43 +0900
Subject: [PATCH 0628/2198] ide-pm: don't abuse rq->data

Impact: cleanup rq->data usage

ide-pm uses rq->data to carry pointer to struct request_pm_state
through request queue and rq->special is used to carray pointer to
local struct ide_cmd, which isn't necessary.  Use rq->special for
request_pm_state instead and use local ide_cmd in
ide_start_power_step().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c |  2 +-
 drivers/ide/ide-pm.c | 38 +++++++++++++++-----------------------
 2 files changed, 16 insertions(+), 24 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 3245c2dbda339..6e3094e227752 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -368,7 +368,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 		if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
 			return execute_drive_cmd(drive, rq);
 		else if (blk_pm_request(rq)) {
-			struct request_pm_state *pm = rq->data;
+			struct request_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
 			printk("%s: start_power_step(step: %d)\n",
 				drive->name, pm->pm_step);
diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c
index 0d8a151c0a01d..ba1488bd84307 100644
--- a/drivers/ide/ide-pm.c
+++ b/drivers/ide/ide-pm.c
@@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int ret;
 
 	/* call ACPI _GTM only once */
@@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
 		ide_acpi_get_timing(hwif);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_SUSPEND;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_SUSPEND;
 	if (mesg.event == PM_EVENT_PRETHAW)
 		mesg.event = PM_EVENT_FREEZE;
@@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev)
 	ide_hwif_t *hwif = drive->hwif;
 	struct request *rq;
 	struct request_pm_state rqpm;
-	struct ide_cmd cmd;
 	int err;
 
 	/* call ACPI _PS0 / _STM only once */
@@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev)
 	ide_acpi_exec_tfs(drive);
 
 	memset(&rqpm, 0, sizeof(rqpm));
-	memset(&cmd, 0, sizeof(cmd));
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_PM_RESUME;
 	rq->cmd_flags |= REQ_PREEMPT;
-	rq->special = &cmd;
-	rq->data = &rqpm;
+	rq->special = &rqpm;
 	rqpm.pm_step = IDE_PM_START_RESUME;
 	rqpm.pm_state = PM_EVENT_ON;
 
@@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
 	printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
-	struct ide_cmd *cmd = rq->special;
-
-	memset(cmd, 0, sizeof(*cmd));
+	struct request_pm_state *pm = rq->special;
+	struct ide_cmd cmd = { };
 
 	switch (pm->pm_step) {
 	case IDE_PM_FLUSH_CACHE:	/* Suspend step 1 (flush cache) */
@@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			return ide_stopped;
 		}
 		if (ata_id_flush_ext_enabled(drive->id))
-			cmd->tf.command = ATA_CMD_FLUSH_EXT;
+			cmd.tf.command = ATA_CMD_FLUSH_EXT;
 		else
-			cmd->tf.command = ATA_CMD_FLUSH;
+			cmd.tf.command = ATA_CMD_FLUSH;
 		goto out_do_tf;
 	case IDE_PM_STANDBY:		/* Suspend step 2 (standby) */
-		cmd->tf.command = ATA_CMD_STANDBYNOW1;
+		cmd.tf.command = ATA_CMD_STANDBYNOW1;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_PIO:	/* Resume step 1 (restore PIO) */
 		ide_set_max_pio(drive);
@@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 			ide_complete_power_step(drive, rq);
 		return ide_stopped;
 	case IDE_PM_IDLE:		/* Resume step 2 (idle) */
-		cmd->tf.command = ATA_CMD_IDLEIMMEDIATE;
+		cmd.tf.command = ATA_CMD_IDLEIMMEDIATE;
 		goto out_do_tf;
 	case IDE_PM_RESTORE_DMA:	/* Resume step 3 (restore DMA) */
 		/*
@@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 	return ide_stopped;
 
 out_do_tf:
-	cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-	cmd->valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-	cmd->protocol = ATA_PROT_NODATA;
+	cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
+	cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
+	cmd.protocol = ATA_PROT_NODATA;
 
-	return do_rw_taskfile(drive, cmd);
+	return do_rw_taskfile(drive, &cmd);
 }
 
 /**
@@ -181,7 +173,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 	unsigned long flags;
 
 	ide_complete_power_step(drive, rq);
@@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-	struct request_pm_state *pm = rq->data;
+	struct request_pm_state *pm = rq->special;
 
 	if (blk_pm_suspend_request(rq) &&
 	    pm->pm_step == IDE_PM_START_SUSPEND)
-- 
GitLab


From 08f370f0a2fb223bf48d0bfa2a173be0393c19dc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0629/2198] ide-tape,floppy: fix failed command completion
 after request sense

Impact: fix infinite retry loop

After a command failed, ide-tape and floppy inserts REQUEST_SENSE in
front of the failed command and according to the result, sets
pc->retries, flags and errors.  After REQUEST_SENSE is complete, the
failed command is again at the front of the queue and if the verdict
was to terminate the request, the issue functions tries to complete it
directly by calling drive->pc_callback() and returning ide_stopped.

However, drive->pc_callback() doesn't complete a request.  It only
prepares for completion of the request.  As a result, this creates an
infinite loop where the failed request is retried perpetually.

Fix it by actually ending the request by calling ide_complete_rq().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-floppy.c | 1 +
 drivers/ide/ide-tape.c   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d3302cc891e42..d20704ac3183a 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -141,6 +141,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 9b762a2d5d955..2b9a13671c5fc 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -643,6 +643,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 		}
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
GitLab


From eb6a61bb9543aa54d62595e27206b3d3c0293bcc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0630/2198] ide-atapi,tape,floppy: allow ->pc_callback() to
 change rq->data_len

Impact: allow residual count implementation in ->pc_callback()

rq->data_len has two duties - carrying the number of input bytes on
issue and carrying residual count back to the issuer on completion.
ide-atapi completion callback ->pc_callback() is the right place to do
this but currently ide-atapi depends on rq->data_len carrying the
original request size after calling ->pc_callback() to complete the pc
request.

This patch makes ide_pc_intr(), ide_tape_issue_pc() and
ide_floppy_issue_pc() cache length to complete before calling
->pc_callback() so that it can modify rq->data_len as necessary.

Note: As using rq->data_len for two purposes can make cases like this
      incorrect in subtle ways, future changes will introduce separate
      field for residual count.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-atapi.c  | 16 ++++++++++------
 drivers/ide/ide-floppy.c |  5 ++++-
 drivers/ide/ide-tape.c   |  5 ++++-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 5cefe12f5622b..3df5442de710d 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -409,6 +409,16 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
 			dsc = 1;
 
+		/*
+		 * ->pc_callback() might change rq->data_len for
+		 * residual count, cache total length.
+		 */
+		if (!blk_special_request(rq) &&
+		    (drive->media == ide_tape && !rq->bio))
+			done = ide_rq_bytes(rq);        /* FIXME */
+		else
+			done = blk_rq_bytes(rq);
+
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
 
@@ -417,7 +427,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
 		if (blk_special_request(rq)) {
 			rq->errors = 0;
-			done = blk_rq_bytes(rq);
 			error = 0;
 		} else {
 
@@ -426,11 +435,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					rq->errors = -EIO;
 			}
 
-			if (drive->media == ide_tape && !rq->bio)
-				done = ide_rq_bytes(rq); /* FIXME */
-			else
-				done = blk_rq_bytes(rq);
-
 			error = uptodate ? 0 : -EIO;
 		}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index d20704ac3183a..537b7c5580339 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -134,14 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
 	drive->pc = pc;
 
 	if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR))
 			ide_floppy_report_error(floppy, pc);
+
 		/* Giving up */
 		pc->error = IDE_DRV_ERROR_GENERAL;
 
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2b9a13671c5fc..8226d52504d08 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -622,6 +622,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 
 	if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
 		(pc->flags & PC_FLAG_ABORT)) {
+		unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
 		/*
 		 * We will "abort" retrying a packet command in case legitimate
 		 * error code was received (crossing a filemark, or end of the
@@ -641,9 +643,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 			/* Giving up */
 			pc->error = IDE_DRV_ERROR_GENERAL;
 		}
+
 		drive->failed_pc = NULL;
 		drive->pc_callback(drive, 0);
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+		ide_complete_rq(drive, -EIO, done);
 		return ide_stopped;
 	}
 	debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
-- 
GitLab


From 7b13354eeaabaf6283b8c669a7d67d104ce7c638 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0631/2198] ide-tape: use single continuous buffer

Impact: simpler buffer allocation and handling, kills OOM, fix DMA transfers

ide-tape has its own multiple buffer mechanism using struct
idetape_bh.  It allocates buffer with decreasing order-of-two
allocations so that it results in minimum number of segments.
However, the implementation is quite complex and works in a way that
no other block or ide driver works necessitating a lot of special case
handling.

The benefit this complex allocation scheme brings is questionable as
PIO or DMA the number of segments (16 maximum) doesn't make any
noticeable difference and it also doesn't negate the need for multiple
order allocation which can fail under memory pressure or high
fragmentation although it does lower the highest order necessary by
one when the buffer size isn't power of two.

As the first step to remove the custom buffer management, this patch
makes ide-tape allocate single continous buffer.  The maximum order is
four.  I doubt the change would cause any trouble but if it ever
matters, it should be converted to regular sg mechanism like everyone
else and even in that case dropping custom buffer handling and moving
to standard mechanism first make sense as an intermediate step.

This patch makes the first bh to contain the whole buffer and drops
multi bh handling code.  Following patches will make further changes.

This patch has the side effect of killing OOM triggered by allocation
path and fixing DMA transfers.  Previously, bug in alloc path
triggered OOM on command issue and commands were passed to DMA engine
without DMA-mapping all the segments.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 256 ++++++++++-------------------------------
 1 file changed, 58 insertions(+), 198 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8226d52504d08..b2afbc7bcb6b1 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -134,7 +134,6 @@ enum {
 struct idetape_bh {
 	u32 b_size;
 	atomic_t b_count;
-	struct idetape_bh *b_reqnext;
 	char *b_data;
 };
 
@@ -228,10 +227,6 @@ typedef struct ide_tape_obj {
 	char *b_data;
 	int b_count;
 
-	int pages_per_buffer;
-	/* Wasted space in each stage */
-	int excess_bh_size;
-
 	/* Measures average tape speed */
 	unsigned long avg_time;
 	int avg_size;
@@ -303,9 +298,7 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min(
 			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
 			bcount);
@@ -313,15 +306,10 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 					atomic_read(&bh->b_count), count);
 		bcount -= count;
 		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			pc->bh = NULL;
 	}
 
-	pc->bh = bh;
-
 	return bcount;
 }
 
@@ -331,22 +319,14 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 	struct idetape_bh *bh = pc->bh;
 	int count;
 
-	while (bcount) {
-		if (bh == NULL)
-			break;
+	if (bcount && bh) {
 		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
 		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
 		bcount -= count;
 		pc->b_data += count;
 		pc->b_count -= count;
-		if (!pc->b_count) {
-			bh = bh->b_reqnext;
-			pc->bh = bh;
-			if (bh) {
-				pc->b_data = bh->b_data;
-				pc->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		if (!pc->b_count)
+			pc->bh = NULL;
 	}
 
 	return bcount;
@@ -355,24 +335,20 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
 static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
 {
 	struct idetape_bh *bh = pc->bh;
-	int count;
 	unsigned int bcount = pc->xferred;
 
 	if (pc->flags & PC_FLAG_WRITING)
 		return;
-	while (bcount) {
-		if (bh == NULL) {
+	if (bcount) {
+		if (bh == NULL || bcount > bh->b_size) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return;
 		}
-		count = min((unsigned int)bh->b_size, (unsigned int)bcount);
-		atomic_set(&bh->b_count, count);
+		atomic_set(&bh->b_count, bcount);
 		if (atomic_read(&bh->b_count) == bh->b_size)
-			bh = bh->b_reqnext;
-		bcount -= count;
+			pc->bh = NULL;
 	}
-	pc->bh = bh;
 }
 
 /*
@@ -439,24 +415,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 /* Free data buffers completely. */
 static void ide_tape_kfree_buffer(idetape_tape_t *tape)
 {
-	struct idetape_bh *prev_bh, *bh = tape->merge_bh;
-
-	while (bh) {
-		u32 size = bh->b_size;
-
-		while (size) {
-			unsigned int order = fls(size >> PAGE_SHIFT)-1;
-
-			if (bh->b_data)
-				free_pages((unsigned long)bh->b_data, order);
+	struct idetape_bh *bh = tape->merge_bh;
 
-			size &= (order-1);
-			bh->b_data += (1 << order) * PAGE_SIZE;
-		}
-		prev_bh = bh;
-		bh = bh->b_reqnext;
-		kfree(prev_bh);
-	}
+	kfree(bh->b_data);
+	kfree(bh);
 }
 
 static void ide_tape_handle_dsc(ide_drive_t *);
@@ -861,117 +823,50 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 }
 
 /*
- * The function below uses __get_free_pages to allocate a data buffer of size
- * tape->buffer_size (or a bit more). We attempt to combine sequential pages as
- * much as possible.
- *
- * It returns a pointer to the newly allocated buffer, or NULL in case of
- * failure.
+ * It returns a pointer to the newly allocated buffer, or NULL in case
+ * of failure.
  */
 static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full, int clear)
-{
-	struct idetape_bh *prev_bh, *bh, *merge_bh;
-	int pages = tape->pages_per_buffer;
-	unsigned int order, b_allocd;
-	char *b_data = NULL;
-
-	merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	bh = merge_bh;
-	if (bh == NULL)
-		goto abort;
-
-	order = fls(pages) - 1;
-	bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-	if (!bh->b_data)
-		goto abort;
-	b_allocd = (1 << order) * PAGE_SIZE;
-	pages &= (order-1);
-
-	if (clear)
-		memset(bh->b_data, 0, b_allocd);
-	bh->b_reqnext = NULL;
-	bh->b_size = b_allocd;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+						  int full)
+{
+	struct idetape_bh *bh;
 
-	while (pages) {
-		order = fls(pages) - 1;
-		b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-		if (!b_data)
-			goto abort;
-		b_allocd = (1 << order) * PAGE_SIZE;
-
-		if (clear)
-			memset(b_data, 0, b_allocd);
-
-		/* newly allocated page frames below buffer header or ...*/
-		if (bh->b_data == b_data + b_allocd) {
-			bh->b_size += b_allocd;
-			bh->b_data -= b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		/* they are above the header */
-		if (b_data == bh->b_data + bh->b_size) {
-			bh->b_size += b_allocd;
-			if (full)
-				atomic_add(b_allocd, &bh->b_count);
-			continue;
-		}
-		prev_bh = bh;
-		bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-		if (!bh) {
-			free_pages((unsigned long) b_data, order);
-			goto abort;
-		}
-		bh->b_reqnext = NULL;
-		bh->b_data = b_data;
-		bh->b_size = b_allocd;
-		atomic_set(&bh->b_count, full ? bh->b_size : 0);
-		prev_bh->b_reqnext = bh;
+	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
+	if (!bh)
+		return NULL;
 
-		pages &= (order-1);
+	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!bh->b_data) {
+		kfree(bh);
+		return NULL;
 	}
 
-	bh->b_size -= tape->excess_bh_size;
-	if (full)
-		atomic_sub(tape->excess_bh_size, &bh->b_count);
-	return merge_bh;
-abort:
-	ide_tape_kfree_buffer(tape);
-	return NULL;
+	bh->b_size = tape->buffer_size;
+	atomic_set(&bh->b_count, full ? bh->b_size : 0);
+
+	return bh;
 }
 
 static int idetape_copy_stage_from_user(idetape_tape_t *tape,
 					const char __user *buf, int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min((unsigned int)
-				(bh->b_size - atomic_read(&bh->b_count)),
-				(unsigned int)n);
 		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				count))
+				   n))
 			ret = 1;
-		n -= count;
-		atomic_add(count, &bh->b_count);
-		buf += count;
-		if (atomic_read(&bh->b_count) == bh->b_size) {
-			bh = bh->b_reqnext;
-			if (bh)
-				atomic_set(&bh->b_count, 0);
-		}
+		atomic_add(n, &bh->b_count);
+		if (atomic_read(&bh->b_count) == bh->b_size)
+			tape->bh = NULL;
 	}
-	tape->bh = bh;
+
 	return ret;
 }
 
@@ -979,30 +874,20 @@ static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
 				      int n)
 {
 	struct idetape_bh *bh = tape->bh;
-	int count;
 	int ret = 0;
 
-	while (n) {
-		if (bh == NULL) {
+	if (n) {
+		if (bh == NULL || n > tape->b_count) {
 			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
 					__func__);
 			return 1;
 		}
-		count = min(tape->b_count, n);
-		if  (copy_to_user(buf, tape->b_data, count))
+		if (copy_to_user(buf, tape->b_data, n))
 			ret = 1;
-		n -= count;
-		tape->b_data += count;
-		tape->b_count -= count;
-		buf += count;
-		if (!tape->b_count) {
-			bh = bh->b_reqnext;
-			tape->bh = bh;
-			if (bh) {
-				tape->b_data = bh->b_data;
-				tape->b_count = atomic_read(&bh->b_count);
-			}
-		}
+		tape->b_data += n;
+		tape->b_count -= n;
+		if (!tape->b_count)
+			tape->bh = NULL;
 	}
 	return ret;
 }
@@ -1254,7 +1139,7 @@ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks, min;
+	int blocks;
 	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
@@ -1269,31 +1154,16 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	if (tape->merge_bh_size) {
 		blocks = tape->merge_bh_size / tape->blk_size;
 		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i;
-
+			unsigned int i = tape->blk_size -
+				tape->merge_bh_size % tape->blk_size;
 			blocks++;
-			i = tape->blk_size - tape->merge_bh_size %
-				tape->blk_size;
-			bh = tape->bh->b_reqnext;
-			while (bh) {
-				atomic_set(&bh->b_count, 0);
-				bh = bh->b_reqnext;
-			}
 			bh = tape->bh;
-			while (i) {
-				if (bh == NULL) {
-					printk(KERN_INFO "ide-tape: bug,"
-							 " bh NULL\n");
-					break;
-				}
-				min = min(i, (unsigned int)(bh->b_size -
-						atomic_read(&bh->b_count)));
+			if (bh) {
 				memset(bh->b_data + atomic_read(&bh->b_count),
-						0, min);
-				atomic_add(min, &bh->b_count);
-				i -= min;
-				bh = bh->b_reqnext;
-			}
+				       0, i);
+				atomic_add(i, &bh->b_count);
+			} else
+				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
 		tape->merge_bh_size = 0;
@@ -1321,7 +1191,7 @@ static int idetape_init_read(ide_drive_t *drive)
 					 " 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
@@ -1368,23 +1238,18 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh;
+	struct idetape_bh *bh = tape->merge_bh;
 	int blocks;
 
 	while (bcount) {
 		unsigned int count;
 
-		bh = tape->merge_bh;
 		count = min(tape->buffer_size, bcount);
 		bcount -= count;
 		blocks = count / tape->blk_size;
-		while (count) {
-			atomic_set(&bh->b_count,
-				   min(count, (unsigned int)bh->b_size));
-			memset(bh->b_data, 0, atomic_read(&bh->b_count));
-			count -= atomic_read(&bh->b_count);
-			bh = bh->b_reqnext;
-		}
+		atomic_set(&bh->b_count, count);
+		memset(bh->b_data, 0, atomic_read(&bh->b_count));
+
 		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
 				      tape->merge_bh);
 	}
@@ -1596,7 +1461,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 				"should be 0 now\n");
 			tape->merge_bh_size = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
+		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
 		if (!tape->merge_bh)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
@@ -1970,7 +1835,7 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0);
+	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
 	if (tape->merge_bh != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
@@ -2201,11 +2066,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 		tape->buffer_size = *ctl * tape->blk_size;
 	}
 	buffer_size = tape->buffer_size;
-	tape->pages_per_buffer = buffer_size / PAGE_SIZE;
-	if (buffer_size % PAGE_SIZE) {
-		tape->pages_per_buffer++;
-		tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE;
-	}
 
 	/* select the "best" DSC read/write polling freq */
 	speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]);
-- 
GitLab


From e998f30b45efb99a3c3ce7b5483f76317a17abed Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0632/2198] ide-tape: use standard data transfer mechanism

Impact: use standard way to transfer data

ide-tape uses rq in an interesting way.  For r/w requests, rq->special
is used to carry a private buffer management structure idetape_bh and
rq->nr_sectors and current_nr_sectors are initialized to the number of
idetape blocks which isn't necessary 512 bytes.  Also,
rq->current_nr_sectors is used to report back the residual count in
units of idetape blocks.

This peculiarity taxes both block layer and ide.  ide-atapi has
different paths and hooks to accomodate it and what a rq means becomes
quite confusing and making changes at the block layer becomes quite
difficult and error-prone.

This patch makes ide-tape use bio instead.  With the previous patch,
ide-tape currently is using single contiguos buffer so replacing it
isn't difficult.  Data buffer is mapped into bio using
blk_rq_map_kern() in idetape_queue_rw_tail().  idetape_io_buffers()
and idetape_update_buffers() are dropped and pc->bh is set to null to
tell ide-atapi to use standard data transfer mechanism and idetape_bh
byte counts are updated by the issuer on completion using the residual
count.

This change also nicely removes the FIXME in ide_pc_intr() where
ide-tape rqs need to be completed using ide_rq_bytes() instead of
blk_rq_bytes() (although this didn't really matter as the request
didn't have bio).

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-atapi.c |   6 +--
 drivers/ide/ide-tape.c  | 112 +++++++++-------------------------------
 2 files changed, 24 insertions(+), 94 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 3df5442de710d..b9dd4503cbc70 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -413,11 +413,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		 * ->pc_callback() might change rq->data_len for
 		 * residual count, cache total length.
 		 */
-		if (!blk_special_request(rq) &&
-		    (drive->media == ide_tape && !rq->bio))
-			done = ide_rq_bytes(rq);        /* FIXME */
-		else
-			done = blk_rq_bytes(rq);
+		done = blk_rq_bytes(rq);
 
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b2afbc7bcb6b1..b373ac876352a 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -292,65 +292,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
 	return tape;
 }
 
-static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				  unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min(
-			(unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
-			bcount);
-		drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data +
-					atomic_read(&bh->b_count), count);
-		bcount -= count;
-		atomic_add(count, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				   unsigned int bcount)
-{
-	struct idetape_bh *bh = pc->bh;
-	int count;
-
-	if (bcount && bh) {
-		count = min((unsigned int)pc->b_count, (unsigned int)bcount);
-		drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
-		bcount -= count;
-		pc->b_data += count;
-		pc->b_count -= count;
-		if (!pc->b_count)
-			pc->bh = NULL;
-	}
-
-	return bcount;
-}
-
-static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-	struct idetape_bh *bh = pc->bh;
-	unsigned int bcount = pc->xferred;
-
-	if (pc->flags & PC_FLAG_WRITING)
-		return;
-	if (bcount) {
-		if (bh == NULL || bcount > bh->b_size) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return;
-		}
-		atomic_set(&bh->b_count, bcount);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			pc->bh = NULL;
-	}
-}
-
 /*
  * called on each failed packet command retry to analyze the request sense. We
  * currently do not utilize this information.
@@ -368,12 +309,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 		 pc->c[0], tape->sense_key, tape->asc, tape->ascq);
 
 	/* Correct pc->xferred by asking the tape.	 */
-	if (pc->flags & PC_FLAG_DMA_ERROR) {
+	if (pc->flags & PC_FLAG_DMA_ERROR)
 		pc->xferred = pc->req_xfer -
 			tape->blk_size *
 			get_unaligned_be32(&sense[3]);
-		idetape_update_buffers(drive, pc);
-	}
 
 	/*
 	 * If error was the result of a zero-length read or write command,
@@ -458,7 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->current_nr_sectors -= blocks;
+		rq->data_len -= blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
@@ -520,19 +459,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive)
 	idetape_postpone_request(drive);
 }
 
-static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-				unsigned int bcount, int write)
-{
-	unsigned int bleft;
-
-	if (write)
-		bleft = idetape_output_buffers(drive, pc, bcount);
-	else
-		bleft = idetape_input_buffers(drive, pc, bcount);
-
-	return bcount - bleft;
-}
-
 /*
  * Packet Command Interface
  *
@@ -683,7 +609,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = bh;
+	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
@@ -1063,10 +989,12 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 				 struct idetape_bh *bh)
 {
 	idetape_tape_t *tape = drive->driver_data;
+	size_t size = blocks * tape->blk_size;
 	struct request *rq;
-	int ret, errors;
+	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
+	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -1074,21 +1002,29 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->rq_disk = tape->disk;
 	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
-	rq->nr_sectors = blocks;
-	rq->current_nr_sectors = blocks;
+
+	if (size) {
+		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+				      __GFP_WAIT);
+		if (ret)
+			goto out_put;
+	}
+
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	errors = rq->errors;
-	ret = tape->blk_size * (blocks - rq->current_nr_sectors);
-	blk_put_request(rq);
+	/* calculate the number of transferred bytes and update bh */
+	size -= rq->data_len;
+	if (cmd == REQ_IDETAPE_READ)
+		atomic_add(size, &bh->b_count);
 
-	if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0)
-		return 0;
+	ret = size;
+	if (rq->errors == IDE_DRV_ERROR_GENERAL)
+		ret = -EIO;
 
 	if (tape->merge_bh)
 		idetape_init_merge_buffer(tape);
-	if (errors == IDE_DRV_ERROR_GENERAL)
-		return -EIO;
+out_put:
+	blk_put_request(rq);
 	return ret;
 }
 
@@ -2034,8 +1970,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
 	u16 *ctl = (u16 *)&tape->caps[12];
 
 	drive->pc_callback	 = ide_tape_callback;
-	drive->pc_update_buffers = idetape_update_buffers;
-	drive->pc_io_buffers	 = ide_tape_io_buffers;
 
 	drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
 
-- 
GitLab


From 6cf3d545f7d71b183e5b89960d4cc850a42c410d Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0633/2198] ide-tape: kill idetape_bh

Impact: kill now unnecessary idetape_bh

With everything using standard mechanisms, there is no need for
idetape_bh anymore.  Kill it and use tape->buf, cur and valid to
describe data buffer instead.

Changes worth mentioning are...

* idetape_queue_rq_tail() now always queue tape->buf and and adjusts
  buffer state properly before completion.

* idetape_pad_zeros() clears the buffer only once.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 305 ++++++++++++-----------------------------
 1 file changed, 84 insertions(+), 221 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index b373ac876352a..d2e9c09b52159 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -131,12 +131,6 @@ enum {
 	IDETAPE_DIR_WRITE = (1 << 2),
 };
 
-struct idetape_bh {
-	u32 b_size;
-	atomic_t b_count;
-	char *b_data;
-};
-
 /* Tape door status */
 #define DOOR_UNLOCKED			0
 #define DOOR_LOCKED			1
@@ -218,14 +212,12 @@ typedef struct ide_tape_obj {
 
 	/* Data buffer size chosen based on the tape's recommendation */
 	int buffer_size;
-	/* merge buffer */
-	struct idetape_bh *merge_bh;
-	/* size of the merge buffer */
-	int merge_bh_size;
-	/* pointer to current buffer head within the merge buffer */
-	struct idetape_bh *bh;
-	char *b_data;
-	int b_count;
+	/* Staging buffer of buffer_size bytes */
+	void *buf;
+	/* The read/write cursor */
+	void *cur;
+	/* The number of valid bytes in buf */
+	size_t valid;
 
 	/* Measures average tape speed */
 	unsigned long avg_time;
@@ -351,15 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
 	}
 }
 
-/* Free data buffers completely. */
-static void ide_tape_kfree_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-
-	kfree(bh->b_data);
-	kfree(bh);
-}
-
 static void ide_tape_handle_dsc(ide_drive_t *);
 
 static int ide_tape_callback(ide_drive_t *drive, int dsc)
@@ -603,8 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 				   struct ide_atapi_pc *pc, struct request *rq,
 				   u8 opcode)
 {
-	struct idetape_bh *bh = (struct idetape_bh *)rq->special;
-	unsigned int length = rq->current_nr_sectors;
+	unsigned int length = rq->nr_sectors;
 
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
@@ -616,14 +598,11 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	if (pc->req_xfer == tape->buffer_size)
 		pc->flags |= PC_FLAG_DMA_OK;
 
-	if (opcode == READ_6) {
+	if (opcode == READ_6)
 		pc->c[0] = READ_6;
-		atomic_set(&bh->b_count, 0);
-	} else if (opcode == WRITE_6) {
+	else if (opcode == WRITE_6) {
 		pc->c[0] = WRITE_6;
 		pc->flags |= PC_FLAG_WRITING;
-		pc->b_data = bh->b_data;
-		pc->b_count = atomic_read(&bh->b_count);
 	}
 
 	memcpy(rq->cmd, pc->c, 12);
@@ -639,10 +618,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu,"
-			" current_nr_sectors: %u\n",
-			(unsigned long long)rq->sector, rq->nr_sectors,
-			rq->current_nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
+		  (unsigned long long)rq->sector, rq->nr_sectors);
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
@@ -748,89 +725,6 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
-/*
- * It returns a pointer to the newly allocated buffer, or NULL in case
- * of failure.
- */
-static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-						  int full)
-{
-	struct idetape_bh *bh;
-
-	bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-	if (!bh)
-		return NULL;
-
-	bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL);
-	if (!bh->b_data) {
-		kfree(bh);
-		return NULL;
-	}
-
-	bh->b_size = tape->buffer_size;
-	atomic_set(&bh->b_count, full ? bh->b_size : 0);
-
-	return bh;
-}
-
-static int idetape_copy_stage_from_user(idetape_tape_t *tape,
-					const char __user *buf, int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-				   n))
-			ret = 1;
-		atomic_add(n, &bh->b_count);
-		if (atomic_read(&bh->b_count) == bh->b_size)
-			tape->bh = NULL;
-	}
-
-	return ret;
-}
-
-static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
-				      int n)
-{
-	struct idetape_bh *bh = tape->bh;
-	int ret = 0;
-
-	if (n) {
-		if (bh == NULL || n > tape->b_count) {
-			printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-					__func__);
-			return 1;
-		}
-		if (copy_to_user(buf, tape->b_data, n))
-			ret = 1;
-		tape->b_data += n;
-		tape->b_count -= n;
-		if (!tape->b_count)
-			tape->bh = NULL;
-	}
-	return ret;
-}
-
-static void idetape_init_merge_buffer(idetape_tape_t *tape)
-{
-	struct idetape_bh *bh = tape->merge_bh;
-	tape->bh = tape->merge_bh;
-
-	if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
-		atomic_set(&bh->b_count, 0);
-	else {
-		tape->b_data = bh->b_data;
-		tape->b_count = atomic_read(&bh->b_count);
-	}
-}
-
 /*
  * Write a filemark if write_filemark=1. Flush the device buffers without
  * writing a filemark otherwise.
@@ -928,10 +822,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
 		return;
 
 	clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags);
-	tape->merge_bh_size = 0;
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	tape->valid = 0;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
@@ -985,8 +879,7 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
-				 struct idetape_bh *bh)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	size_t size = blocks * tape->blk_size;
@@ -1000,11 +893,10 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
-	rq->special = (void *)bh;
 	rq->sector = tape->first_frame;
 
 	if (size) {
-		ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size,
+		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
 				      __GFP_WAIT);
 		if (ret)
 			goto out_put;
@@ -1012,17 +904,17 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
 
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-	/* calculate the number of transferred bytes and update bh */
+	/* calculate the number of transferred bytes and update buffer state */
 	size -= rq->data_len;
+	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
-		atomic_add(size, &bh->b_count);
+		tape->valid = size;
+	else
+		tape->valid = 0;
 
 	ret = size;
 	if (rq->errors == IDE_DRV_ERROR_GENERAL)
 		ret = -EIO;
-
-	if (tape->merge_bh)
-		idetape_init_merge_buffer(tape);
 out_put:
 	blk_put_request(rq);
 	return ret;
@@ -1064,49 +956,33 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 /* Queue up a character device originated write request. */
 static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				     blocks, tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
 	int blocks;
-	struct idetape_bh *bh;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
 				" but we are not writing.\n");
 		return;
 	}
-	if (tape->merge_bh_size > tape->buffer_size) {
-		printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n");
-		tape->merge_bh_size = tape->buffer_size;
-	}
-	if (tape->merge_bh_size) {
-		blocks = tape->merge_bh_size / tape->blk_size;
-		if (tape->merge_bh_size % tape->blk_size) {
-			unsigned int i = tape->blk_size -
-				tape->merge_bh_size % tape->blk_size;
+	if (tape->buf) {
+		blocks = tape->valid / tape->blk_size;
+		if (tape->valid % tape->blk_size) {
 			blocks++;
-			bh = tape->bh;
-			if (bh) {
-				memset(bh->b_data + atomic_read(&bh->b_count),
-				       0, i);
-				atomic_add(i, &bh->b_count);
-			} else
-				printk(KERN_INFO "ide-tape: bug, bh NULL\n");
+			memset(tape->buf + tape->valid, 0,
+			       tape->blk_size - tape->valid % tape->blk_size);
 		}
 		(void) idetape_add_chrdev_write_request(drive, blocks);
-		tape->merge_bh_size = 0;
 	}
-	if (tape->merge_bh != NULL) {
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+	if (tape->buf != NULL) {
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
@@ -1122,15 +998,15 @@ static int idetape_init_read(ide_drive_t *drive)
 			ide_tape_flush_merge_buffer(drive);
 			idetape_flush_tape_buffers(drive);
 		}
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size should be"
-					 " 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_READ;
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a read 0 command to ensure that DSC handshake is
@@ -1140,11 +1016,10 @@ static int idetape_init_read(ide_drive_t *drive)
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_READ, 0);
 			if (bytes_read < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return bytes_read;
 			}
@@ -1157,8 +1032,6 @@ static int idetape_init_read(ide_drive_t *drive)
 /* called from idetape_chrdev_read() to service a chrdev read request. */
 static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 {
-	idetape_tape_t *tape = drive->driver_data;
-
 	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
 
 	/* If we are at a filemark, return a read length of 0 */
@@ -1167,27 +1040,21 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_read(drive);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks,
-				     tape->merge_bh);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	struct idetape_bh *bh = tape->merge_bh;
-	int blocks;
+
+	memset(tape->buf, 0, tape->buffer_size);
 
 	while (bcount) {
-		unsigned int count;
+		unsigned int count = min(tape->buffer_size, bcount);
 
-		count = min(tape->buffer_size, bcount);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+				      count / tape->blk_size);
 		bcount -= count;
-		blocks = count / tape->blk_size;
-		atomic_set(&bh->b_count, count);
-		memset(bh->b_data, 0, atomic_read(&bh->b_count));
-
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
-				      tape->merge_bh);
 	}
 }
 
@@ -1267,7 +1134,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
 	}
 
 	if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-		tape->merge_bh_size = 0;
+		tape->valid = 0;
 		if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 			++count;
 		ide_tape_discard_merge_buffer(drive, 0);
@@ -1333,20 +1200,20 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		return rc;
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		actually_read = min((unsigned int)(tape->merge_bh_size),
-				    (unsigned int)count);
-		if (idetape_copy_stage_to_user(tape, buf, actually_read))
+	if (tape->valid) {
+		actually_read = min_t(unsigned int, tape->valid, count);
+		if (copy_to_user(buf, tape->cur, actually_read))
 			ret = -EFAULT;
 		buf += actually_read;
-		tape->merge_bh_size -= actually_read;
 		count -= actually_read;
+		tape->cur += actually_read;
+		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
 		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
 		if (bytes_read <= 0)
 			goto finish;
-		if (idetape_copy_stage_to_user(tape, buf, bytes_read))
+		if (copy_to_user(buf, tape->cur, bytes_read))
 			ret = -EFAULT;
 		buf += bytes_read;
 		count -= bytes_read;
@@ -1357,10 +1224,11 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (idetape_copy_stage_to_user(tape, buf, temp))
+		if (copy_to_user(buf, tape->cur, temp))
 			ret = -EFAULT;
 		actually_read += temp;
-		tape->merge_bh_size = bytes_read-temp;
+		tape->cur += temp;
+		tape->valid -= temp;
 	}
 finish:
 	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
@@ -1392,16 +1260,15 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		if (tape->chrdev_dir == IDETAPE_DIR_READ)
 			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->merge_bh || tape->merge_bh_size) {
-			printk(KERN_ERR "ide-tape: merge_bh_size "
-				"should be 0 now\n");
-			tape->merge_bh_size = 0;
+		if (tape->buf || tape->valid) {
+			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+			tape->valid = 0;
 		}
-		tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0);
-		if (!tape->merge_bh)
+		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+		if (!tape->buf)
 			return -ENOMEM;
 		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		idetape_init_merge_buffer(tape);
+		tape->cur = tape->buf;
 
 		/*
 		 * Issue a write 0 command to ensure that DSC handshake is
@@ -1411,11 +1278,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 		 */
 		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
 			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0,
-							tape->merge_bh);
+							REQ_IDETAPE_WRITE, 0);
 			if (retval < 0) {
-				ide_tape_kfree_buffer(tape);
-				tape->merge_bh = NULL;
+				kfree(tape->buf);
+				tape->buf = NULL;
 				tape->chrdev_dir = IDETAPE_DIR_NONE;
 				return retval;
 			}
@@ -1423,23 +1289,19 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count == 0)
 		return (0);
-	if (tape->merge_bh_size) {
-		if (tape->merge_bh_size >= tape->buffer_size) {
-			printk(KERN_ERR "ide-tape: bug: merge buf too big\n");
-			tape->merge_bh_size = 0;
-		}
-		actually_written = min((unsigned int)
-				(tape->buffer_size - tape->merge_bh_size),
-				(unsigned int)count);
-		if (idetape_copy_stage_from_user(tape, buf, actually_written))
-				ret = -EFAULT;
+	if (tape->valid < tape->buffer_size) {
+		actually_written = min_t(unsigned int,
+					 tape->buffer_size - tape->valid,
+					 count);
+		if (copy_from_user(tape->cur, buf, actually_written))
+			ret = -EFAULT;
 		buf += actually_written;
-		tape->merge_bh_size += actually_written;
 		count -= actually_written;
+		tape->cur += actually_written;
+		tape->valid += actually_written;
 
-		if (tape->merge_bh_size == tape->buffer_size) {
+		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			tape->merge_bh_size = 0;
 			retval = idetape_add_chrdev_write_request(drive, ctl);
 			if (retval <= 0)
 				return (retval);
@@ -1447,7 +1309,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	while (count >= tape->buffer_size) {
 		ssize_t retval;
-		if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size))
+		if (copy_from_user(tape->cur, buf, tape->buffer_size))
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
@@ -1458,9 +1320,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	}
 	if (count) {
 		actually_written += count;
-		if (idetape_copy_stage_from_user(tape, buf, count))
+		if (copy_from_user(tape->cur, buf, count))
 			ret = -EFAULT;
-		tape->merge_bh_size += count;
+		tape->cur += count;
+		tape->valid += count;
 	}
 	return ret ? ret : actually_written;
 }
@@ -1623,7 +1486,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
 		idetape_flush_tape_buffers(drive);
 	}
 	if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-		block_offset = tape->merge_bh_size /
+		block_offset = tape->valid /
 			(tape->blk_size * tape->user_bs_factor);
 		position = idetape_read_position(drive);
 		if (position < 0)
@@ -1771,12 +1634,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
 	idetape_tape_t *tape = drive->driver_data;
 
 	ide_tape_flush_merge_buffer(drive);
-	tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1);
-	if (tape->merge_bh != NULL) {
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (tape->buf != NULL) {
 		idetape_pad_zeros(drive, tape->blk_size *
 				(tape->user_bs_factor - 1));
-		ide_tape_kfree_buffer(tape);
-		tape->merge_bh = NULL;
+		kfree(tape->buf);
+		tape->buf = NULL;
 	}
 	idetape_write_filemark(drive);
 	idetape_flush_tape_buffers(drive);
@@ -2042,7 +1905,7 @@ static void ide_tape_release(struct device *dev)
 	ide_drive_t *drive = tape->drive;
 	struct gendisk *g = tape->disk;
 
-	BUG_ON(tape->merge_bh_size);
+	BUG_ON(tape->valid);
 
 	drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
 	drive->driver_data = NULL;
-- 
GitLab


From 3596b66452491a3cff26256a5e6e6061a66c4142 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:02 +0900
Subject: [PATCH 0634/2198] ide-tape: unify r/w init paths

Impact: cleanup

Read and write init paths are almost identical.  Unify them into
idetape_init_rw().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 110 +++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 64 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d2e9c09b52159..4da7fa9bca1ea 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -987,42 +987,50 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 	tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
 
-static int idetape_init_read(ide_drive_t *drive)
+static int idetape_init_rw(ide_drive_t *drive, int dir)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int bytes_read;
+	int rc;
 
-	/* Initialize read operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_READ) {
-		if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-			ide_tape_flush_merge_buffer(drive);
-			idetape_flush_tape_buffers(drive);
-		}
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_READ;
-		tape->cur = tape->buf;
+	BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE);
 
-		/*
-		 * Issue a read 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode.
-		 * No point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			bytes_read = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_READ, 0);
-			if (bytes_read < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return bytes_read;
-			}
+	if (tape->chrdev_dir == dir)
+		return 0;
+
+	if (tape->chrdev_dir == IDETAPE_DIR_READ)
+		ide_tape_discard_merge_buffer(drive, 1);
+	else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
+		ide_tape_flush_merge_buffer(drive);
+		idetape_flush_tape_buffers(drive);
+	}
+
+	if (tape->buf || tape->valid) {
+		printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+		tape->valid = 0;
+	}
+
+	tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+	if (!tape->buf)
+		return -ENOMEM;
+	tape->chrdev_dir = dir;
+	tape->cur = tape->buf;
+
+	/*
+	 * Issue a 0 rw command to ensure that DSC handshake is
+	 * switched from completion mode to buffer available mode.  No
+	 * point in issuing this if DSC overlap isn't supported, some
+	 * drives (Seagate STT3401A) will return an error.
+	 */
+	if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
+		int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ
+						  : REQ_IDETAPE_WRITE;
+
+		rc = idetape_queue_rw_tail(drive, cmd, 0);
+		if (rc < 0) {
+			kfree(tape->buf);
+			tape->buf = NULL;
+			tape->chrdev_dir = IDETAPE_DIR_NONE;
+			return rc;
 		}
 	}
 
@@ -1038,7 +1046,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
 		return 0;
 
-	idetape_init_read(drive);
+	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
 	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
 }
@@ -1195,7 +1203,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
-	rc = idetape_init_read(drive);
+	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
 	if (count == 0)
@@ -1249,6 +1257,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
 	u16 ctl = *(u16 *)&tape->caps[12];
+	int rc;
 
 	/* The drive is write protected. */
 	if (tape->write_prot)
@@ -1257,36 +1266,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
 	/* Initialize write operation */
-	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-		if (tape->chrdev_dir == IDETAPE_DIR_READ)
-			ide_tape_discard_merge_buffer(drive, 1);
-		if (tape->buf || tape->valid) {
-			printk(KERN_ERR "ide-tape: valid should be 0 now\n");
-			tape->valid = 0;
-		}
-		tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
-		if (!tape->buf)
-			return -ENOMEM;
-		tape->chrdev_dir = IDETAPE_DIR_WRITE;
-		tape->cur = tape->buf;
-
-		/*
-		 * Issue a write 0 command to ensure that DSC handshake is
-		 * switched from completion mode to buffer available mode. No
-		 * point in issuing this if DSC overlap isn't supported, some
-		 * drives (Seagate STT3401A) will return an error.
-		 */
-		if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-			ssize_t retval = idetape_queue_rw_tail(drive,
-							REQ_IDETAPE_WRITE, 0);
-			if (retval < 0) {
-				kfree(tape->buf);
-				tape->buf = NULL;
-				tape->chrdev_dir = IDETAPE_DIR_NONE;
-				return retval;
-			}
-		}
-	}
+	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
+	if (rc < 0)
+		return rc;
 	if (count == 0)
 		return (0);
 	if (tape->valid < tape->buffer_size) {
-- 
GitLab


From 71294cf93d22ceaa75448cc6ebee2c65897be8c2 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0635/2198] ide-tape: use byte size instead of sectors on rw
 issue functions

Impact: cleanup

Byte size is what most issue functions deal with, make
idetape_queue_rw_tail() and its wrappers take byte size instead of
sector counts.  idetape_chrdev_read() and write() functions are
converted to use tape->buffer_size instead of ctl from tape->cap.

This cleans up code a little bit and will ease the next r/w
reimplementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 45 +++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 4da7fa9bca1ea..d5e9bb286e306 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -879,15 +879,15 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	size_t size = blocks * tape->blk_size;
 	struct request *rq;
 	int ret;
 
 	debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
 	BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
+	BUG_ON(size < 0 || size % tape->blk_size);
 
 	rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_SPECIAL;
@@ -954,17 +954,16 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 }
 
 /* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
 {
 	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
 }
 
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
-	int blocks;
 
 	if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
 		printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
@@ -972,15 +971,10 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		return;
 	}
 	if (tape->buf) {
-		blocks = tape->valid / tape->blk_size;
-		if (tape->valid % tape->blk_size) {
-			blocks++;
-			memset(tape->buf + tape->valid, 0,
-			       tape->blk_size - tape->valid % tape->blk_size);
-		}
-		(void) idetape_add_chrdev_write_request(drive, blocks);
-	}
-	if (tape->buf != NULL) {
+		size_t aligned = roundup(tape->valid, tape->blk_size);
+
+		memset(tape->cur, 0, aligned - tape->valid);
+		idetape_add_chrdev_write_request(drive, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1038,9 +1032,9 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 }
 
 /* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
+static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
 {
-	debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
+	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
 
 	/* If we are at a filemark, return a read length of 0 */
 	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
@@ -1048,7 +1042,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
 
 	idetape_init_rw(drive, IDETAPE_DIR_READ);
 
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks);
+	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
@@ -1060,8 +1054,7 @@ static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 	while (bcount) {
 		unsigned int count = min(tape->buffer_size, bcount);
 
-		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-				      count / tape->blk_size);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count);
 		bcount -= count;
 	}
 }
@@ -1193,7 +1186,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t bytes_read, temp, actually_read = 0, rc;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1218,7 +1210,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		tape->valid -= actually_read;
 	}
 	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		if (copy_to_user(buf, tape->cur, bytes_read))
@@ -1228,7 +1221,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 		actually_read += bytes_read;
 	}
 	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive, ctl);
+		bytes_read = idetape_add_chrdev_read_request(drive,
+							     tape->buffer_size);
 		if (bytes_read <= 0)
 			goto finish;
 		temp = min((unsigned long)count, (unsigned long)bytes_read);
@@ -1256,7 +1250,6 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	ide_drive_t *drive = tape->drive;
 	ssize_t actually_written = 0;
 	ssize_t ret = 0;
-	u16 ctl = *(u16 *)&tape->caps[12];
 	int rc;
 
 	/* The drive is write protected. */
@@ -1284,7 +1277,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 
 		if (tape->valid == tape->buffer_size) {
 			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive, ctl);
+			retval = idetape_add_chrdev_write_request(drive,
+							tape->buffer_size);
 			if (retval <= 0)
 				return (retval);
 		}
@@ -1295,7 +1289,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 			ret = -EFAULT;
 		buf += tape->buffer_size;
 		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive, ctl);
+		retval = idetape_add_chrdev_write_request(drive,
+							  tape->buffer_size);
 		actually_written += tape->buffer_size;
 		if (retval <= 0)
 			return (retval);
-- 
GitLab


From 4344d07fb8dbf0cbfec1f7d7c1afeccaceaaa120 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0636/2198] ide-tape: simplify read/write functions

Impact: cleanup

idetape_chrdev_read/write() functions are unnecessarily complex when
everything can be handled in a single loop.  Collapse
idetape_add_chrdev_read/write_request() into the rw functions and
simplify the implementation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-tape.c | 149 ++++++++++++++---------------------------
 1 file changed, 50 insertions(+), 99 deletions(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index d5e9bb286e306..2599579e4174d 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -953,14 +953,6 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
 	pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-/* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size);
-}
-
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -974,7 +966,7 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 		size_t aligned = roundup(tape->valid, tape->blk_size);
 
 		memset(tape->cur, 0, aligned - tape->valid);
-		idetape_add_chrdev_write_request(drive, aligned);
+		idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned);
 		kfree(tape->buf);
 		tape->buf = NULL;
 	}
@@ -1031,20 +1023,6 @@ static int idetape_init_rw(ide_drive_t *drive, int dir)
 	return 0;
 }
 
-/* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size)
-{
-	debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size);
-
-	/* If we are at a filemark, return a read length of 0 */
-	if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
-		return 0;
-
-	idetape_init_rw(drive, IDETAPE_DIR_READ);
-
-	return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size);
-}
-
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
 	idetape_tape_t *tape = drive->driver_data;
@@ -1184,8 +1162,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t bytes_read, temp, actually_read = 0, rc;
+	size_t done = 0;
 	ssize_t ret = 0;
+	int rc;
 
 	debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1195,52 +1174,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 			    (count % tape->blk_size) == 0)
 				tape->user_bs_factor = count / tape->blk_size;
 	}
+
 	rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid) {
-		actually_read = min_t(unsigned int, tape->valid, count);
-		if (copy_to_user(buf, tape->cur, actually_read))
-			ret = -EFAULT;
-		buf += actually_read;
-		count -= actually_read;
-		tape->cur += actually_read;
-		tape->valid -= actually_read;
-	}
-	while (count >= tape->buffer_size) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		if (copy_to_user(buf, tape->cur, bytes_read))
-			ret = -EFAULT;
-		buf += bytes_read;
-		count -= bytes_read;
-		actually_read += bytes_read;
-	}
-	if (count) {
-		bytes_read = idetape_add_chrdev_read_request(drive,
-							     tape->buffer_size);
-		if (bytes_read <= 0)
-			goto finish;
-		temp = min((unsigned long)count, (unsigned long)bytes_read);
-		if (copy_to_user(buf, tape->cur, temp))
+
+	while (done < count) {
+		size_t todo;
+
+		/* refill if staging buffer is empty */
+		if (!tape->valid) {
+			/* If we are at a filemark, nothing more to read */
+			if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+				break;
+			/* read */
+			if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
+						  tape->buffer_size) <= 0)
+				break;
+		}
+
+		/* copy out */
+		todo = min_t(size_t, count - done, tape->valid);
+		if (copy_to_user(buf + done, tape->cur, todo))
 			ret = -EFAULT;
-		actually_read += temp;
-		tape->cur += temp;
-		tape->valid -= temp;
+
+		tape->cur += todo;
+		tape->valid -= todo;
+		done += todo;
 	}
-finish:
-	if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
+
+	if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
 		debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
 
 		idetape_space_over_filemarks(drive, MTFSF, 1);
 		return 0;
 	}
 
-	return ret ? ret : actually_read;
+	return ret ? ret : done;
 }
 
 static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
@@ -1248,7 +1218,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 {
 	struct ide_tape_obj *tape = file->private_data;
 	ide_drive_t *drive = tape->drive;
-	ssize_t actually_written = 0;
+	size_t done = 0;
 	ssize_t ret = 0;
 	int rc;
 
@@ -1262,47 +1232,28 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 	rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
 	if (rc < 0)
 		return rc;
-	if (count == 0)
-		return (0);
-	if (tape->valid < tape->buffer_size) {
-		actually_written = min_t(unsigned int,
-					 tape->buffer_size - tape->valid,
-					 count);
-		if (copy_from_user(tape->cur, buf, actually_written))
-			ret = -EFAULT;
-		buf += actually_written;
-		count -= actually_written;
-		tape->cur += actually_written;
-		tape->valid += actually_written;
-
-		if (tape->valid == tape->buffer_size) {
-			ssize_t retval;
-			retval = idetape_add_chrdev_write_request(drive,
-							tape->buffer_size);
-			if (retval <= 0)
-				return (retval);
-		}
-	}
-	while (count >= tape->buffer_size) {
-		ssize_t retval;
-		if (copy_from_user(tape->cur, buf, tape->buffer_size))
-			ret = -EFAULT;
-		buf += tape->buffer_size;
-		count -= tape->buffer_size;
-		retval = idetape_add_chrdev_write_request(drive,
-							  tape->buffer_size);
-		actually_written += tape->buffer_size;
-		if (retval <= 0)
-			return (retval);
-	}
-	if (count) {
-		actually_written += count;
-		if (copy_from_user(tape->cur, buf, count))
+
+	while (done < count) {
+		size_t todo;
+
+		/* flush if staging buffer is full */
+		if (tape->valid == tape->buffer_size &&
+		    idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+					  tape->buffer_size) <= 0)
+			return rc;
+
+		/* copy in */
+		todo = min_t(size_t, count - done,
+			     tape->buffer_size - tape->valid);
+		if (copy_from_user(tape->cur, buf + done, todo))
 			ret = -EFAULT;
-		tape->cur += count;
-		tape->valid += count;
+
+		tape->cur += todo;
+		tape->valid += todo;
+		done += todo;
 	}
-	return ret ? ret : actually_written;
+
+	return ret ? ret : done;
 }
 
 static int idetape_write_filemark(ide_drive_t *drive)
-- 
GitLab


From 29d1a4371035e01b0d079bc5aa88b50f5af7a566 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0637/2198] ide-atapi: kill unused fields and callbacks

Impact: remove fields and code paths which are no longer necessary

Now that ide-tape uses standard mechanisms to transfer data, special
case handling for bh handling can be dropped from ide-atapi.  Drop the
followings.

* pc->cur_pos, b_count, bh and b_data
* drive->pc_update_buffers() and pc_io_buffers().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-atapi.c | 17 ++++-------------
 drivers/ide/ide-tape.c  |  1 -
 include/linux/ide.h     | 12 ------------
 3 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index b9dd4503cbc70..afe5a43238793 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 					drive->name, rq_data_dir(pc->rq)
 						     ? "write" : "read");
 			pc->flags |= PC_FLAG_DMA_ERROR;
-		} else {
+		} else
 			pc->xferred = pc->req_xfer;
-			if (drive->pc_update_buffers)
-				drive->pc_update_buffers(drive, pc);
-		}
 		debug_log("%s: DMA finished\n", drive->name);
 	}
 
@@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		return ide_do_reset(drive);
 	}
 
-	if (drive->media == ide_tape && pc->bh)
-		done = drive->pc_io_buffers(drive, pc, bcount, write);
-	else {
-		done = min_t(unsigned int, bcount, cmd->nleft);
-		ide_pio_bytes(drive, cmd, write, done);
-	}
+	done = min_t(unsigned int, bcount, cmd->nleft);
+	ide_pio_bytes(drive, cmd, write, done);
 
-	/* Update the current position */
+	/* Update transferred byte count */
 	pc->xferred += done;
-	pc->cur_pos += done;
 
 	bcount -= done;
 
@@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 
 		/* We haven't transferred any data yet */
 		pc->xferred = 0;
-		pc->cur_pos = pc->buf;
 
 		valid_tf = IDE_VALID_DEVICE;
 		bcount = ((drive->media == ide_tape) ?
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 2599579e4174d..8dfc68892d6ae 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
 	pc->c[1] = 1;
-	pc->bh = NULL;
 	pc->buf = NULL;
 	pc->buf_size = length * tape->blk_size;
 	pc->req_xfer = pc->buf_size;
diff --git a/include/linux/ide.h b/include/linux/ide.h
index 1957461ac762c..34c128f0a33c1 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -362,11 +362,7 @@ struct ide_atapi_pc {
 
 	/* data buffer */
 	u8 *buf;
-	/* current buffer position */
-	u8 *cur_pos;
 	int buf_size;
-	/* missing/available data on the current buffer */
-	int b_count;
 
 	/* the corresponding request */
 	struct request *rq;
@@ -379,10 +375,6 @@ struct ide_atapi_pc {
 	 */
 	u8 pc_buf[IDE_PC_BUFFER_SIZE];
 
-	/* idetape only */
-	struct idetape_bh *bh;
-	char *b_data;
-
 	unsigned long timeout;
 };
 
@@ -595,10 +587,6 @@ struct ide_drive_s {
 	/* callback for packet commands */
 	int  (*pc_callback)(struct ide_drive_s *, int);
 
-	void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *);
-	int  (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *,
-			      unsigned int, int);
-
 	ide_startstop_t (*irq_handler)(struct ide_drive_s *);
 
 	unsigned long atapi_flags;
-- 
GitLab


From 5ad960fe8d0e4f99fe2b8dded45e8251137293c9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Sun, 19 Apr 2009 08:46:03 +0900
Subject: [PATCH 0638/2198] ide: drop rq->data handling from ide_map_sg()

Impact: remove code path which is no longer necessary

All IDE data transfers now use rq->bio.  Simplify ide_map_sg()
accordingly.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jens Axboe <axboe@kernel.dk>
---
 drivers/ide/ide-io.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 6e3094e227752..a0309ea661ac0 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -248,11 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
 	struct scatterlist *sg = hwif->sg_table;
 	struct request *rq = cmd->rq;
 
-	if (!rq->bio) {
-		sg_init_one(sg, rq->data, rq->data_len);
-		cmd->sg_nents = 1;
-	} else
-		cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
+	cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
 }
 EXPORT_SYMBOL_GPL(ide_map_sg);
 
-- 
GitLab


From 586cf2681f527ce8b85b9bd57c8b9f7945fbe051 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 21 Apr 2009 12:16:56 +0900
Subject: [PATCH 0639/2198] ide-dma: don't reset request fields on
 dma_timeout_retry()

Impact: drop unnecessary code

Now that everything uses bio and block operations, there is no need to
reset request fields manually when retrying a request.  Every field is
guaranteed to be always valid.  Drop unnecessary request field
resetting from ide_dma_timeout_retry().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 drivers/ide/ide-dma.c | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index a0b8cab1d9a68..d9123ecae4a98 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
 	/*
 	 * un-busy drive etc and make sure request is sane
 	 */
-
 	rq = hwif->rq;
-	if (!rq)
-		goto out;
-
-	hwif->rq = NULL;
-
-	rq->errors = 0;
-
-	if (!rq->bio)
-		goto out;
-
-	rq->sector = rq->bio->bi_sector;
-	rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9;
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->buffer = bio_data(rq->bio);
-out:
+	if (rq) {
+		hwif->rq = NULL;
+		rq->errors = 0;
+	}
 	return ret;
 }
 
-- 
GitLab


From db29a6b49674085f136331014ba0eee249c16a2c Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 21 Apr 2009 09:27:03 +0200
Subject: [PATCH 0640/2198] block: enable by default support for large devices
 and files on 32-bit archs

Enable by default support for large devices and files (CONFIG_LBD):

- With 1TB disks being a commodity hardware it is quite easy to hit 2TB
  limitation while building RAIDs etc. and many distros have been using
  CONFIG_LBD=y by default already (at least Fedora 10 and openSUSE 11.1).

- This should also prevent a subtle ext4 filesystem compatibility issue:
  mke2fs.ext4 defaults to creating filesystems with huge_files feature
  enabled and such filesystems cannot be later mounted read-write on
  machines with CONFIG_LBD=n (it should be quite easy to hit this issue
  when trying to use filesystem created using distro kernel on system
  running the self-build kernel, think about USB disk enclosures & co.).

While at it:

- Clarify config option help text w.r.t. mounting ext4 filesystems
  (they can be mounted with CONFIG_LBD=n but in the read-only mode).

Cc: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/Kconfig | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/Kconfig b/block/Kconfig
index e7d12782bcfb5..2c39527aa7db0 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,7 @@ if BLOCK
 config LBD
 	bool "Support for large block devices and files"
 	depends on !64BIT
+	default y
 	help
 	  Enable block devices or files of size 2TB and larger.
 
@@ -38,11 +39,13 @@ config LBD
 
 	  The ext4 filesystem requires that this feature be enabled in
 	  order to support filesystems that have the huge_file feature
-	  enabled.    Otherwise, it will refuse to mount any filesystems
-	  that use the huge_file feature, which is enabled by default
-	  by mke2fs.ext4.   The GFS2 filesystem also requires this feature.
+	  enabled.  Otherwise, it will refuse to mount in the read-write
+	  mode any filesystems that use the huge_file feature, which is
+	  enabled by default by mke2fs.ext4.
 
-	  If unsure, say N.
+	  The GFS2 filesystem also requires this feature.
+
+	  If unsure, say Y.
 
 config BLK_DEV_BSG
 	bool "Block layer SG support v4 (EXPERIMENTAL)"
-- 
GitLab


From db2dbb12dc47a50c7a4c5678f526014063e486f6 Mon Sep 17 00:00:00 2001
From: Jeff Moyer <jmoyer@redhat.com>
Date: Wed, 22 Apr 2009 14:08:13 +0200
Subject: [PATCH 0641/2198] block: implement blkdev_readpages

Doing a proper block dev ->readpages() speeds up the crazy dump(8)
approach of using interleaved process IO.

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd175..a85fe310fc6fe 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -331,6 +331,12 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
+static int blkdev_readpages(struct file *file, struct address_space *mapping,
+			struct list_head *pages, unsigned nr_pages)
+{
+	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
+}
+
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1399,6 +1405,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
+	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
-- 
GitLab


From a538cd03be6f363d039daa94199c28cfbd508455 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:17 +0900
Subject: [PATCH 0642/2198] block: merge blk_invoke_request_fn() into
 __blk_run_queue()

__blk_run_queue wraps blk_invoke_request_fn() such that it
additionally removes plug and bails out early if the queue is empty.
Both extra operations have their own pending mechanisms and don't
cause any harm correctness-wise when they are done superflously.

The only user of blk_invoke_request_fn() being blk_start_queue(),
there isn't much reason to keep both functions around.  Merge
blk_invoke_request_fn() into __blk_run_queue() and make
blk_start_queue() use __blk_run_queue() instead.

[ Impact: merge two subtly different internal functions ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 41bc0ff75e28f..02f53bc00e4c6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -333,24 +333,6 @@ void blk_unplug(struct request_queue *q)
 }
 EXPORT_SYMBOL(blk_unplug);
 
-static void blk_invoke_request_fn(struct request_queue *q)
-{
-	if (unlikely(blk_queue_stopped(q)))
-		return;
-
-	/*
-	 * one level of recursion is ok and is much faster than kicking
-	 * the unplug handling
-	 */
-	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
-		q->request_fn(q);
-		queue_flag_clear(QUEUE_FLAG_REENTER, q);
-	} else {
-		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
-		kblockd_schedule_work(q, &q->unplug_work);
-	}
-}
-
 /**
  * blk_start_queue - restart a previously stopped queue
  * @q:    The &struct request_queue in question
@@ -365,7 +347,7 @@ void blk_start_queue(struct request_queue *q)
 	WARN_ON(!irqs_disabled());
 
 	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
-	blk_invoke_request_fn(q);
+	__blk_run_queue(q);
 }
 EXPORT_SYMBOL(blk_start_queue);
 
@@ -425,12 +407,23 @@ void __blk_run_queue(struct request_queue *q)
 {
 	blk_remove_plug(q);
 
+	if (unlikely(blk_queue_stopped(q)))
+		return;
+
+	if (elv_queue_empty(q))
+		return;
+
 	/*
 	 * Only recurse once to avoid overrunning the stack, let the unplug
 	 * handling reinvoke the handler shortly if we already got there.
 	 */
-	if (!elv_queue_empty(q))
-		blk_invoke_request_fn(q);
+	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
+		q->request_fn(q);
+		queue_flag_clear(QUEUE_FLAG_REENTER, q);
+	} else {
+		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
+		kblockd_schedule_work(q, &q->unplug_work);
+	}
 }
 EXPORT_SYMBOL(__blk_run_queue);
 
-- 
GitLab


From a7f557923441186a3cdbabc54f1bcacf42b63bf5 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:17 +0900
Subject: [PATCH 0643/2198] block: kill blk_start_queueing()

blk_start_queueing() is identical to __blk_run_queue() except that it
doesn't check for recursion.  None of the current users depends on
blk_start_queueing() running request_fn directly.  Replace usages of
blk_start_queueing() with [__]blk_run_queue() and kill it.

[ Impact: removal of mostly duplicate interface function ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/as-iosched.c     |  6 +-----
 block/blk-core.c       | 28 ++--------------------------
 block/cfq-iosched.c    |  6 +++---
 block/elevator.c       |  7 +++----
 include/linux/blkdev.h |  1 -
 5 files changed, 9 insertions(+), 39 deletions(-)

diff --git a/block/as-iosched.c b/block/as-iosched.c
index c48fa670d2213..45bd07059c285 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -1312,12 +1312,8 @@ static void as_merged_requests(struct request_queue *q, struct request *req,
 static void as_work_handler(struct work_struct *work)
 {
 	struct as_data *ad = container_of(work, struct as_data, antic_work);
-	struct request_queue *q = ad->q;
-	unsigned long flags;
 
-	spin_lock_irqsave(q->queue_lock, flags);
-	blk_start_queueing(q);
-	spin_unlock_irqrestore(q->queue_lock, flags);
+	blk_run_queue(ad->q);
 }
 
 static int as_may_queue(struct request_queue *q, int rw)
diff --git a/block/blk-core.c b/block/blk-core.c
index 02f53bc00e4c6..8b4a0af7d69fe 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -433,9 +433,7 @@ EXPORT_SYMBOL(__blk_run_queue);
  *
  * Description:
  *    Invoke request handling on this queue, if it has pending work to do.
- *    May be used to restart queueing when a request has completed. Also
- *    See @blk_start_queueing.
- *
+ *    May be used to restart queueing when a request has completed.
  */
 void blk_run_queue(struct request_queue *q)
 {
@@ -894,28 +892,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_get_request);
 
-/**
- * blk_start_queueing - initiate dispatch of requests to device
- * @q:		request queue to kick into gear
- *
- * This is basically a helper to remove the need to know whether a queue
- * is plugged or not if someone just wants to initiate dispatch of requests
- * for this queue. Should be used to start queueing on a device outside
- * of ->request_fn() context. Also see @blk_run_queue.
- *
- * The queue lock must be held with interrupts disabled.
- */
-void blk_start_queueing(struct request_queue *q)
-{
-	if (!blk_queue_plugged(q)) {
-		if (unlikely(blk_queue_stopped(q)))
-			return;
-		q->request_fn(q);
-	} else
-		__generic_unplug_device(q);
-}
-EXPORT_SYMBOL(blk_start_queueing);
-
 /**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
@@ -984,7 +960,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 
 	drive_stat_acct(rq, 1);
 	__elv_add_request(q, rq, where, 0);
-	blk_start_queueing(q);
+	__blk_run_queue(q);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 EXPORT_SYMBOL(blk_insert_request);
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index a55a9bd75bd1b..def0c698a4bc7 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -2088,7 +2088,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
 			    cfqd->busy_queues > 1) {
 				del_timer(&cfqd->idle_slice_timer);
-				blk_start_queueing(cfqd->queue);
+			__blk_run_queue(cfqd->queue);
 			}
 			cfq_mark_cfqq_must_dispatch(cfqq);
 		}
@@ -2100,7 +2100,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		 * this new queue is RT and the current one is BE
 		 */
 		cfq_preempt_queue(cfqd, cfqq);
-		blk_start_queueing(cfqd->queue);
+		__blk_run_queue(cfqd->queue);
 	}
 }
 
@@ -2345,7 +2345,7 @@ static void cfq_kick_queue(struct work_struct *work)
 	struct request_queue *q = cfqd->queue;
 
 	spin_lock_irq(q->queue_lock);
-	blk_start_queueing(q);
+	__blk_run_queue(cfqd->queue);
 	spin_unlock_irq(q->queue_lock);
 }
 
diff --git a/block/elevator.c b/block/elevator.c
index 7073a9072577c..2e0fb21485b76 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -599,7 +599,7 @@ void elv_quiesce_start(struct request_queue *q)
 	 */
 	elv_drain_elevator(q);
 	while (q->rq.elvpriv) {
-		blk_start_queueing(q);
+		__blk_run_queue(q);
 		spin_unlock_irq(q->queue_lock);
 		msleep(10);
 		spin_lock_irq(q->queue_lock);
@@ -643,8 +643,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 		 *   with anything.  There's no point in delaying queue
 		 *   processing.
 		 */
-		blk_remove_plug(q);
-		blk_start_queueing(q);
+		__blk_run_queue(q);
 		break;
 
 	case ELEVATOR_INSERT_SORT:
@@ -971,7 +970,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
 		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
 			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
-			blk_start_queueing(q);
+			__blk_run_queue(q);
 		}
 	}
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2755d5c6da222..12e20de44b602 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -797,7 +797,6 @@ extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(struct request_queue *q);
 extern void __blk_run_queue(struct request_queue *);
 extern void blk_run_queue(struct request_queue *);
-extern void blk_start_queueing(struct request_queue *);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);
-- 
GitLab


From e4025f6c21f1389696c069be2dc647f364925c45 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:17 +0900
Subject: [PATCH 0644/2198] block: don't set REQ_NOMERGE unnecessarily

RQ_NOMERGE_FLAGS already clears defines which REQ flags aren't
mergeable.  There is no reason to specify it superflously.  It only
adds to confusion.  Don't set REQ_NOMERGE for barriers and requests
with specific queueing directive.  REQ_NOMERGE is now exclusively used
by the merging code.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c | 5 +----
 block/blk-exec.c | 1 -
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8b4a0af7d69fe..7e0fab53e930f 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1082,16 +1082,13 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	if (bio_failfast_driver(bio))
 		req->cmd_flags |= REQ_FAILFAST_DRIVER;
 
-	/*
-	 * REQ_BARRIER implies no merging, but lets make it explicit
-	 */
 	if (unlikely(bio_discard(bio))) {
 		req->cmd_flags |= REQ_DISCARD;
 		if (bio_barrier(bio))
 			req->cmd_flags |= REQ_SOFTBARRIER;
 		req->q->prepare_discard_fn(req->q, req);
 	} else if (unlikely(bio_barrier(bio)))
-		req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
+		req->cmd_flags |= REQ_HARDBARRIER;
 
 	if (bio_sync(bio))
 		req->cmd_flags |= REQ_RW_SYNC;
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 6af716d1e54e0..49557e91f0dab 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -51,7 +51,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
 	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
 
 	rq->rq_disk = bd_disk;
-	rq->cmd_flags |= REQ_NOMERGE;
 	rq->end_io = done;
 	WARN_ON(irqs_disabled());
 	spin_lock_irq(q->queue_lock);
-- 
GitLab


From 10732f5661fb7adf62e20733b0030fc0fc93b0c4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0645/2198] block: cleanup REQ_SOFTBARRIER usages

blk_insert_request() doesn't need to worry about REQ_SOFTBARRIER.
Don't set it.  Combined with recent ide updates, REQ_SOFTBARRIER is
now only used in elevator proper and for discard requests.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 7e0fab53e930f..cf10dfcda99d8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -946,7 +946,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq,
 	 * barrier
 	 */
 	rq->cmd_type = REQ_TYPE_SPECIAL;
-	rq->cmd_flags |= REQ_SOFTBARRIER;
 
 	rq->special = data;
 
-- 
GitLab


From 2eef33e439ba9ae387cdc3f1abcef2f3f6c4e7a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0646/2198] block: clean up misc stuff after block layer
 timeout conversion

* In blk_rq_timed_out_timer(), else { if } to else if

* In blk_add_timer(), simplify if/else block

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-timeout.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 1ec0d503cacdc..1ba7e0aca8781 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -122,10 +122,8 @@ void blk_rq_timed_out_timer(unsigned long data)
 			if (blk_mark_rq_complete(rq))
 				continue;
 			blk_rq_timed_out(rq);
-		} else {
-			if (!next || time_after(next, rq->deadline))
-				next = rq->deadline;
-		}
+		} else if (!next || time_after(next, rq->deadline))
+			next = rq->deadline;
 	}
 
 	/*
@@ -176,16 +174,14 @@ void blk_add_timer(struct request *req)
 	BUG_ON(!list_empty(&req->timeout_list));
 	BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
 
-	if (req->timeout)
-		req->deadline = jiffies + req->timeout;
-	else {
-		req->deadline = jiffies + q->rq_timeout;
-		/*
-		 * Some LLDs, like scsi, peek at the timeout to prevent
-		 * a command from being retried forever.
-		 */
+	/*
+	 * Some LLDs, like scsi, peek at the timeout to prevent a
+	 * command from being retried forever.
+	 */
+	if (!req->timeout)
 		req->timeout = q->rq_timeout;
-	}
+
+	req->deadline = jiffies + req->timeout;
 	list_add_tail(&req->timeout_list, &q->timeout_list);
 
 	/*
-- 
GitLab


From 5efccd17ceb0fc43837a331297c2c407969d7201 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0647/2198] block: reorder request completion functions

Reorder request completion functions such that

* All request completion functions are located together.

* Functions which are used by only one caller is put right above the
  caller.

* end_request() is put after other completion functions but before
  blk_update_request().

This change is for completion function cleanup which will follow.

[ Impact: cleanup, code reorganization ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c       | 144 ++++++++++++++++++++---------------------
 include/linux/blkdev.h |  16 ++---
 2 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index cf10dfcda99d8..406a93e526b63 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1683,6 +1683,35 @@ static void blk_account_io_done(struct request *req)
 	}
 }
 
+/**
+ * blk_rq_bytes - Returns bytes left to complete in the entire request
+ * @rq: the request being processed
+ **/
+unsigned int blk_rq_bytes(struct request *rq)
+{
+	if (blk_fs_request(rq))
+		return rq->hard_nr_sectors << 9;
+
+	return rq->data_len;
+}
+EXPORT_SYMBOL_GPL(blk_rq_bytes);
+
+/**
+ * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
+ * @rq: the request being processed
+ **/
+unsigned int blk_rq_cur_bytes(struct request *rq)
+{
+	if (blk_fs_request(rq))
+		return rq->current_nr_sectors << 9;
+
+	if (rq->bio)
+		return rq->bio->bi_size;
+
+	return rq->data_len;
+}
+EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
+
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
@@ -1797,6 +1826,22 @@ static int __end_that_request_first(struct request *req, int error,
 	return 1;
 }
 
+static int end_that_request_data(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
+{
+	if (rq->bio) {
+		if (__end_that_request_first(rq, error, nr_bytes))
+			return 1;
+
+		/* Bidi request must be completed as a whole */
+		if (blk_bidi_rq(rq) &&
+		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * queue lock must be held
  */
@@ -1825,78 +1870,6 @@ static void end_that_request_last(struct request *req, int error)
 	}
 }
 
-/**
- * blk_rq_bytes - Returns bytes left to complete in the entire request
- * @rq: the request being processed
- **/
-unsigned int blk_rq_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return rq->hard_nr_sectors << 9;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_bytes);
-
-/**
- * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
- * @rq: the request being processed
- **/
-unsigned int blk_rq_cur_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return rq->current_nr_sectors << 9;
-
-	if (rq->bio)
-		return rq->bio->bi_size;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
-
-/**
- * end_request - end I/O on the current segment of the request
- * @req:	the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
- *
- * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled I/O completions.
- *     Modern drivers typically end I/O on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end a request.
- **/
-void end_request(struct request *req, int uptodate)
-{
-	int error = 0;
-
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
-
-	__blk_end_request(req, error, req->hard_cur_sectors << 9);
-}
-EXPORT_SYMBOL(end_request);
-
-static int end_that_request_data(struct request *rq, int error,
-				 unsigned int nr_bytes, unsigned int bidi_bytes)
-{
-	if (rq->bio) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
-
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
-
-	return 0;
-}
-
 /**
  * blk_end_io - Generic end_io function to complete a request.
  * @rq:           the request being processed
@@ -2006,6 +1979,33 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 }
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
+/**
+ * end_request - end I/O on the current segment of the request
+ * @req:	the request being processed
+ * @uptodate:	error value or %0/%1 uptodate flag
+ *
+ * Description:
+ *     Ends I/O on the current segment of a request. If that is the only
+ *     remaining segment, the request is also completed and freed.
+ *
+ *     This is a remnant of how older block drivers handled I/O completions.
+ *     Modern drivers typically end I/O on the full request in one go, unless
+ *     they have a residual value to account for. For that case this function
+ *     isn't really useful, unless the residual just happens to be the
+ *     full current segment. In other words, don't use this function in new
+ *     code. Use blk_end_request() or __blk_end_request() to end a request.
+ **/
+void end_request(struct request *req, int uptodate)
+{
+	int error = 0;
+
+	if (uptodate <= 0)
+		error = uptodate ? uptodate : -EIO;
+
+	__blk_end_request(req, error, req->hard_cur_sectors << 9);
+}
+EXPORT_SYMBOL(end_request);
+
 /**
  * blk_update_request - Special helper function for request stacking drivers
  * @rq:           the request being processed
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12e20de44b602..156ffd9de967d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -831,6 +831,14 @@ static inline void blk_run_address_space(struct address_space *mapping)
 
 extern void blkdev_dequeue_request(struct request *req);
 
+/*
+ * blk_end_request() takes bytes instead of sectors as a complete size.
+ * blk_rq_bytes() returns bytes left to complete in the entire request.
+ * blk_rq_cur_bytes() returns bytes left to complete in the current segment.
+ */
+extern unsigned int blk_rq_bytes(struct request *rq);
+extern unsigned int blk_rq_cur_bytes(struct request *rq);
+
 /*
  * blk_end_request() and friends.
  * __blk_end_request() and end_request() must be called with
@@ -857,14 +865,6 @@ extern void blk_abort_queue(struct request_queue *);
 extern void blk_update_request(struct request *rq, int error,
 			       unsigned int nr_bytes);
 
-/*
- * blk_end_request() takes bytes instead of sectors as a complete size.
- * blk_rq_bytes() returns bytes left to complete in the entire request.
- * blk_rq_cur_bytes() returns bytes left to complete in the current segment.
- */
-extern unsigned int blk_rq_bytes(struct request *rq);
-extern unsigned int blk_rq_cur_bytes(struct request *rq);
-
 /*
  * Access functions for manipulating queue properties
  */
-- 
GitLab


From 158dbda0068e63c7cce7bd47c123bd1dfa5a902c Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0648/2198] block: reorganize request fetching functions

Impact: code reorganization

elv_next_request() and elv_dequeue_request() are public block layer
interface than actual elevator implementation.  They mostly deal with
how requests interact with block layer and low level drivers at the
beginning of rqeuest processing whereas __elv_next_request() is the
actual eleveator request fetching interface.

Move the two functions to blk-core.c.  This prepares for further
interface cleanup.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c |  95 +++++++++++++++++++++++++++++++++++
 block/blk.h      |  37 ++++++++++++++
 block/elevator.c | 128 -----------------------------------------------
 3 files changed, 132 insertions(+), 128 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 406a93e526b63..678ede23ed0a8 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1712,6 +1712,101 @@ unsigned int blk_rq_cur_bytes(struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
 
+struct request *elv_next_request(struct request_queue *q)
+{
+	struct request *rq;
+	int ret;
+
+	while ((rq = __elv_next_request(q)) != NULL) {
+		if (!(rq->cmd_flags & REQ_STARTED)) {
+			/*
+			 * This is the first time the device driver
+			 * sees this request (possibly after
+			 * requeueing).  Notify IO scheduler.
+			 */
+			if (blk_sorted_rq(rq))
+				elv_activate_rq(q, rq);
+
+			/*
+			 * just mark as started even if we don't start
+			 * it, a request that has been delayed should
+			 * not be passed by new incoming requests
+			 */
+			rq->cmd_flags |= REQ_STARTED;
+			trace_block_rq_issue(q, rq);
+		}
+
+		if (!q->boundary_rq || q->boundary_rq == rq) {
+			q->end_sector = rq_end_sector(rq);
+			q->boundary_rq = NULL;
+		}
+
+		if (rq->cmd_flags & REQ_DONTPREP)
+			break;
+
+		if (q->dma_drain_size && rq->data_len) {
+			/*
+			 * make sure space for the drain appears we
+			 * know we can do this because max_hw_segments
+			 * has been adjusted to be one fewer than the
+			 * device can handle
+			 */
+			rq->nr_phys_segments++;
+		}
+
+		if (!q->prep_rq_fn)
+			break;
+
+		ret = q->prep_rq_fn(q, rq);
+		if (ret == BLKPREP_OK) {
+			break;
+		} else if (ret == BLKPREP_DEFER) {
+			/*
+			 * the request may have been (partially) prepped.
+			 * we need to keep this request in the front to
+			 * avoid resource deadlock.  REQ_STARTED will
+			 * prevent other fs requests from passing this one.
+			 */
+			if (q->dma_drain_size && rq->data_len &&
+			    !(rq->cmd_flags & REQ_DONTPREP)) {
+				/*
+				 * remove the space for the drain we added
+				 * so that we don't add it again
+				 */
+				--rq->nr_phys_segments;
+			}
+
+			rq = NULL;
+			break;
+		} else if (ret == BLKPREP_KILL) {
+			rq->cmd_flags |= REQ_QUIET;
+			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+		} else {
+			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
+			break;
+		}
+	}
+
+	return rq;
+}
+EXPORT_SYMBOL(elv_next_request);
+
+void elv_dequeue_request(struct request_queue *q, struct request *rq)
+{
+	BUG_ON(list_empty(&rq->queuelist));
+	BUG_ON(ELV_ON_HASH(rq));
+
+	list_del_init(&rq->queuelist);
+
+	/*
+	 * the time frame between a request being removed from the lists
+	 * and to it is freed is accounted as io that is in progress at
+	 * the driver side.
+	 */
+	if (blk_account_rq(rq))
+		q->in_flight++;
+}
+
 /**
  * __end_that_request_first - end I/O on a request
  * @req:      the request being processed
diff --git a/block/blk.h b/block/blk.h
index 79c85f7c9ff50..9b2c324e4407d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -43,6 +43,43 @@ static inline void blk_clear_rq_complete(struct request *rq)
 	clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
 }
 
+/*
+ * Internal elevator interface
+ */
+#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
+
+static inline struct request *__elv_next_request(struct request_queue *q)
+{
+	struct request *rq;
+
+	while (1) {
+		while (!list_empty(&q->queue_head)) {
+			rq = list_entry_rq(q->queue_head.next);
+			if (blk_do_ordered(q, &rq))
+				return rq;
+		}
+
+		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
+			return NULL;
+	}
+}
+
+static inline void elv_activate_rq(struct request_queue *q, struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->ops->elevator_activate_req_fn)
+		e->ops->elevator_activate_req_fn(q, rq);
+}
+
+static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq)
+{
+	struct elevator_queue *e = q->elevator;
+
+	if (e->ops->elevator_deactivate_req_fn)
+		e->ops->elevator_deactivate_req_fn(q, rq);
+}
+
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 int blk_should_fake_timeout(struct request_queue *);
 ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
diff --git a/block/elevator.c b/block/elevator.c
index 2e0fb21485b76..b03b8752e18b9 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -53,7 +53,6 @@ static const int elv_hash_shift = 6;
 		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
 #define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
-#define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
 DEFINE_TRACE(block_rq_insert);
 DEFINE_TRACE(block_rq_issue);
@@ -310,22 +309,6 @@ void elevator_exit(struct elevator_queue *e)
 }
 EXPORT_SYMBOL(elevator_exit);
 
-static void elv_activate_rq(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e->ops->elevator_activate_req_fn)
-		e->ops->elevator_activate_req_fn(q, rq);
-}
-
-static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
-{
-	struct elevator_queue *e = q->elevator;
-
-	if (e->ops->elevator_deactivate_req_fn)
-		e->ops->elevator_deactivate_req_fn(q, rq);
-}
-
 static inline void __elv_rqhash_del(struct request *rq)
 {
 	hlist_del_init(&rq->hash);
@@ -758,117 +741,6 @@ void elv_add_request(struct request_queue *q, struct request *rq, int where,
 }
 EXPORT_SYMBOL(elv_add_request);
 
-static inline struct request *__elv_next_request(struct request_queue *q)
-{
-	struct request *rq;
-
-	while (1) {
-		while (!list_empty(&q->queue_head)) {
-			rq = list_entry_rq(q->queue_head.next);
-			if (blk_do_ordered(q, &rq))
-				return rq;
-		}
-
-		if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
-			return NULL;
-	}
-}
-
-struct request *elv_next_request(struct request_queue *q)
-{
-	struct request *rq;
-	int ret;
-
-	while ((rq = __elv_next_request(q)) != NULL) {
-		if (!(rq->cmd_flags & REQ_STARTED)) {
-			/*
-			 * This is the first time the device driver
-			 * sees this request (possibly after
-			 * requeueing).  Notify IO scheduler.
-			 */
-			if (blk_sorted_rq(rq))
-				elv_activate_rq(q, rq);
-
-			/*
-			 * just mark as started even if we don't start
-			 * it, a request that has been delayed should
-			 * not be passed by new incoming requests
-			 */
-			rq->cmd_flags |= REQ_STARTED;
-			trace_block_rq_issue(q, rq);
-		}
-
-		if (!q->boundary_rq || q->boundary_rq == rq) {
-			q->end_sector = rq_end_sector(rq);
-			q->boundary_rq = NULL;
-		}
-
-		if (rq->cmd_flags & REQ_DONTPREP)
-			break;
-
-		if (q->dma_drain_size && rq->data_len) {
-			/*
-			 * make sure space for the drain appears we
-			 * know we can do this because max_hw_segments
-			 * has been adjusted to be one fewer than the
-			 * device can handle
-			 */
-			rq->nr_phys_segments++;
-		}
-
-		if (!q->prep_rq_fn)
-			break;
-
-		ret = q->prep_rq_fn(q, rq);
-		if (ret == BLKPREP_OK) {
-			break;
-		} else if (ret == BLKPREP_DEFER) {
-			/*
-			 * the request may have been (partially) prepped.
-			 * we need to keep this request in the front to
-			 * avoid resource deadlock.  REQ_STARTED will
-			 * prevent other fs requests from passing this one.
-			 */
-			if (q->dma_drain_size && rq->data_len &&
-			    !(rq->cmd_flags & REQ_DONTPREP)) {
-				/*
-				 * remove the space for the drain we added
-				 * so that we don't add it again
-				 */
-				--rq->nr_phys_segments;
-			}
-
-			rq = NULL;
-			break;
-		} else if (ret == BLKPREP_KILL) {
-			rq->cmd_flags |= REQ_QUIET;
-			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
-		} else {
-			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
-			break;
-		}
-	}
-
-	return rq;
-}
-EXPORT_SYMBOL(elv_next_request);
-
-void elv_dequeue_request(struct request_queue *q, struct request *rq)
-{
-	BUG_ON(list_empty(&rq->queuelist));
-	BUG_ON(ELV_ON_HASH(rq));
-
-	list_del_init(&rq->queuelist);
-
-	/*
-	 * the time frame between a request being removed from the lists
-	 * and to it is freed is accounted as io that is in progress at
-	 * the driver side.
-	 */
-	if (blk_account_rq(rq))
-		q->in_flight++;
-}
-
 int elv_queue_empty(struct request_queue *q)
 {
 	struct elevator_queue *e = q->elevator;
-- 
GitLab


From 0b302d5aa7975006fa2ec3d66386610b9b36c669 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0649/2198] block: kill blk_end_request_callback()

With recent IDE updates, blk_end_request_callback() doesn't have any
user now.  Kill it.

[ Impact: removal of unused convoluted interface ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c       | 48 +++---------------------------------------
 include/linux/blkdev.h |  3 ---
 2 files changed, 3 insertions(+), 48 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 678ede23ed0a8..2f277ea0e599b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1971,10 +1971,6 @@ static void end_that_request_last(struct request *req, int error)
  * @error:        %0 for success, < %0 for error
  * @nr_bytes:     number of bytes to complete @rq
  * @bidi_bytes:   number of bytes to complete @rq->next_rq
- * @drv_callback: function called between completion of bios in the request
- *                and completion of the request.
- *                If the callback returns non %0, this helper returns without
- *                completion of the request.
  *
  * Description:
  *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
@@ -1985,8 +1981,7 @@ static void end_that_request_last(struct request *req, int error)
  *     %1 - this request is not freed yet, it still has pending buffers.
  **/
 static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
-		      unsigned int bidi_bytes,
-		      int (drv_callback)(struct request *))
+		      unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags = 0UL;
@@ -1994,10 +1989,6 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
 	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
 		return 1;
 
-	/* Special feature for tricky drivers */
-	if (drv_callback && drv_callback(rq))
-		return 1;
-
 	add_disk_randomness(rq->rq_disk);
 
 	spin_lock_irqsave(q->queue_lock, flags);
@@ -2023,7 +2014,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
  **/
 int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
 {
-	return blk_end_io(rq, error, nr_bytes, 0, NULL);
+	return blk_end_io(rq, error, nr_bytes, 0);
 }
 EXPORT_SYMBOL_GPL(blk_end_request);
 
@@ -2070,7 +2061,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
 			 unsigned int bidi_bytes)
 {
-	return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL);
+	return blk_end_io(rq, error, nr_bytes, bidi_bytes);
 }
 EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
@@ -2131,39 +2122,6 @@ void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
 
-/**
- * blk_end_request_callback - Special helper function for tricky drivers
- * @rq:           the request being processed
- * @error:        %0 for success, < %0 for error
- * @nr_bytes:     number of bytes to complete
- * @drv_callback: function called between completion of bios in the request
- *                and completion of the request.
- *                If the callback returns non %0, this helper returns without
- *                completion of the request.
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- *     This special helper function is used only for existing tricky drivers.
- *     (e.g. cdrom_newpc_intr() of ide-cd)
- *     This interface will be removed when such drivers are rewritten.
- *     Don't use this interface in other places anymore.
- *
- * Return:
- *     %0 - we are done with this request
- *     %1 - this request is not freed yet.
- *          this request still has pending buffers or
- *          the driver doesn't want to finish this request yet.
- **/
-int blk_end_request_callback(struct request *rq, int error,
-			     unsigned int nr_bytes,
-			     int (drv_callback)(struct request *))
-{
-	return blk_end_io(rq, error, nr_bytes, 0, drv_callback);
-}
-EXPORT_SYMBOL_GPL(blk_end_request_callback);
-
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
 {
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 156ffd9de967d..1fa9dcf9aa6ad 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -855,9 +855,6 @@ extern int __blk_end_request(struct request *rq, int error,
 extern int blk_end_bidi_request(struct request *rq, int error,
 				unsigned int nr_bytes, unsigned int bidi_bytes);
 extern void end_request(struct request *, int);
-extern int blk_end_request_callback(struct request *rq, int error,
-				unsigned int nr_bytes,
-				int (drv_callback)(struct request *));
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
-- 
GitLab


From 2e60e02297cf54e367567f2d85b2ca56b1c4a906 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0650/2198] block: clean up request completion API

Request completion has gone through several changes and became a bit
messy over the time.  Clean it up.

1. end_that_request_data() is a thin wrapper around
   end_that_request_data_first() which checks whether bio is NULL
   before doing anything and handles bidi completion.
   blk_update_request() is a thin wrapper around
   end_that_request_data() which clears nr_sectors on the last
   iteration but doesn't use the bidi completion.

   Clean it up by moving the initial bio NULL check and nr_sectors
   clearing on the last iteration into end_that_request_data() and
   renaming it to blk_update_request(), which makes blk_end_io() the
   only user of end_that_request_data().  Collapse
   end_that_request_data() into blk_end_io().

2. There are four visible completion variants - blk_end_request(),
   __blk_end_request(), blk_end_bidi_request() and end_request().
   blk_end_request() and blk_end_bidi_request() uses blk_end_request()
   as the backend but __blk_end_request() and end_request() use
   separate implementation in __blk_end_request() due to different
   locking rules.

   blk_end_bidi_request() is identical to blk_end_io().  Collapse
   blk_end_io() into blk_end_bidi_request(), separate out request
   update into internal helper blk_update_bidi_request() and add
   __blk_end_bidi_request().  Redefine [__]blk_end_request() as thin
   inline wrappers around [__]blk_end_bidi_request().

3. As the whole request issue/completion usages are about to be
   modified and audited, it's a good chance to convert completion
   functions return bool which better indicates the intended meaning
   of return values.

4. The function name end_that_request_last() is from the days when it
   was a public interface and slighly confusing.  Give it a proper
   internal name - blk_finish_request().

5. Add description explaning that blk_end_bidi_request() can be safely
   used for uni requests as suggested by Boaz Harrosh.

The only visible behavior change is from #1.  nr_sectors counts are
cleared after the final iteration no matter which function is used to
complete the request.  I couldn't find any place where the code
assumes those nr_sectors counters contain the values for the last
segment and this change is good as it makes the API much more
consistent as the end result is now same whether a request is
completed using [__]blk_end_request() alone or in combination with
blk_update_request().

API further cleaned up per Christoph's suggestion.

[ Impact: cleanup, rq->*nr_sectors always updated after req completion ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Christoph Hellwig <hch@infradead.org>
---
 block/blk-core.c       | 226 ++++++++++++++---------------------------
 include/linux/blkdev.h |  94 ++++++++++++++---
 2 files changed, 157 insertions(+), 163 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 2f277ea0e599b..89cc05d9a7a90 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1808,25 +1808,35 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 }
 
 /**
- * __end_that_request_first - end I/O on a request
- * @req:      the request being processed
+ * blk_update_request - Special helper function for request stacking drivers
+ * @rq:	      the request being processed
  * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
+ * @nr_bytes: number of bytes to complete @rq
  *
  * Description:
- *     Ends I/O on a number of bytes attached to @req, and sets it up
- *     for the next range of segments (if any) in the cluster.
+ *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
+ *     the request structure even if @rq doesn't have leftover.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ *     This special helper function is only for request stacking drivers
+ *     (e.g. request-based dm) so that they can handle partial completion.
+ *     Actual device drivers should use blk_end_request instead.
+ *
+ *     Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ *     %false return from this function.
  *
  * Return:
- *     %0 - we are done with this request, call end_that_request_last()
- *     %1 - still buffers pending for this request
+ *     %false - this request doesn't have any more data
+ *     %true  - this request has more data
  **/
-static int __end_that_request_first(struct request *req, int error,
-				    int nr_bytes)
+bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
 	int total_bytes, bio_nbytes, next_idx = 0;
 	struct bio *bio;
 
+	if (!req->bio)
+		return false;
+
 	trace_block_rq_complete(req->q, req);
 
 	/*
@@ -1903,8 +1913,16 @@ static int __end_that_request_first(struct request *req, int error,
 	/*
 	 * completely done
 	 */
-	if (!req->bio)
-		return 0;
+	if (!req->bio) {
+		/*
+		 * Reset counters so that the request stacking driver
+		 * can find how many bytes remain in the request
+		 * later.
+		 */
+		req->nr_sectors = req->hard_nr_sectors = 0;
+		req->current_nr_sectors = req->hard_cur_sectors = 0;
+		return false;
+	}
 
 	/*
 	 * if the request wasn't completed, update state
@@ -1918,29 +1936,31 @@ static int __end_that_request_first(struct request *req, int error,
 
 	blk_recalc_rq_sectors(req, total_bytes >> 9);
 	blk_recalc_rq_segments(req);
-	return 1;
+	return true;
 }
+EXPORT_SYMBOL_GPL(blk_update_request);
 
-static int end_that_request_data(struct request *rq, int error,
-				 unsigned int nr_bytes, unsigned int bidi_bytes)
+static bool blk_update_bidi_request(struct request *rq, int error,
+				    unsigned int nr_bytes,
+				    unsigned int bidi_bytes)
 {
-	if (rq->bio) {
-		if (__end_that_request_first(rq, error, nr_bytes))
-			return 1;
+	if (blk_update_request(rq, error, nr_bytes))
+		return true;
 
-		/* Bidi request must be completed as a whole */
-		if (blk_bidi_rq(rq) &&
-		    __end_that_request_first(rq->next_rq, error, bidi_bytes))
-			return 1;
-	}
+	/* Bidi request must be completed as a whole */
+	if (unlikely(blk_bidi_rq(rq)) &&
+	    blk_update_request(rq->next_rq, error, bidi_bytes))
+		return true;
 
-	return 0;
+	add_disk_randomness(rq->rq_disk);
+
+	return false;
 }
 
 /*
  * queue lock must be held
  */
-static void end_that_request_last(struct request *req, int error)
+static void blk_finish_request(struct request *req, int error)
 {
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
@@ -1966,161 +1986,65 @@ static void end_that_request_last(struct request *req, int error)
 }
 
 /**
- * blk_end_io - Generic end_io function to complete a request.
- * @rq:           the request being processed
- * @error:        %0 for success, < %0 for error
- * @nr_bytes:     number of bytes to complete @rq
- * @bidi_bytes:   number of bytes to complete @rq->next_rq
+ * blk_end_bidi_request - Complete a bidi request
+ * @rq:         the request to complete
+ * @error:      %0 for success, < %0 for error
+ * @nr_bytes:   number of bytes to complete @rq
+ * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
  * Description:
  *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
- *     If @rq has leftover, sets it up for the next range of segments.
+ *     Drivers that supports bidi can safely call this member for any
+ *     type of request, bidi or uni.  In the later case @bidi_bytes is
+ *     just ignored.
  *
  * Return:
- *     %0 - we are done with this request
- *     %1 - this request is not freed yet, it still has pending buffers.
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  **/
-static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes,
-		      unsigned int bidi_bytes)
+bool blk_end_bidi_request(struct request *rq, int error,
+			  unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
-	unsigned long flags = 0UL;
-
-	if (end_that_request_data(rq, error, nr_bytes, bidi_bytes))
-		return 1;
+	unsigned long flags;
 
-	add_disk_randomness(rq->rq_disk);
+	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+		return true;
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	end_that_request_last(rq, error);
+	blk_finish_request(rq, error);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 
-	return 0;
-}
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     %0 - we are done with this request
- *     %1 - still buffers pending for this request
- **/
-int blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
-{
-	return blk_end_io(rq, error, nr_bytes, 0);
-}
-EXPORT_SYMBOL_GPL(blk_end_request);
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     %0 - we are done with this request
- *     %1 - still buffers pending for this request
- **/
-int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
-{
-	if (rq->bio && __end_that_request_first(rq, error, nr_bytes))
-		return 1;
-
-	add_disk_randomness(rq->rq_disk);
-
-	end_that_request_last(rq, error);
-
-	return 0;
+	return false;
 }
-EXPORT_SYMBOL_GPL(__blk_end_request);
+EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
- * blk_end_bidi_request - Helper function for drivers to complete bidi request.
- * @rq:         the bidi request being processed
+ * __blk_end_bidi_request - Complete a bidi request with queue lock held
+ * @rq:         the request to complete
  * @error:      %0 for success, < %0 for error
  * @nr_bytes:   number of bytes to complete @rq
  * @bidi_bytes: number of bytes to complete @rq->next_rq
  *
  * Description:
- *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
+ *     Identical to blk_end_bidi_request() except that queue lock is
+ *     assumed to be locked on entry and remains so on return.
  *
  * Return:
- *     %0 - we are done with this request
- *     %1 - still buffers pending for this request
- **/
-int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes,
-			 unsigned int bidi_bytes)
-{
-	return blk_end_io(rq, error, nr_bytes, bidi_bytes);
-}
-EXPORT_SYMBOL_GPL(blk_end_bidi_request);
-
-/**
- * end_request - end I/O on the current segment of the request
- * @req:	the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
- *
- * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled I/O completions.
- *     Modern drivers typically end I/O on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end a request.
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  **/
-void end_request(struct request *req, int uptodate)
+bool __blk_end_bidi_request(struct request *rq, int error,
+			    unsigned int nr_bytes, unsigned int bidi_bytes)
 {
-	int error = 0;
+	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
+		return true;
 
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
+	blk_finish_request(rq, error);
 
-	__blk_end_request(req, error, req->hard_cur_sectors << 9);
-}
-EXPORT_SYMBOL(end_request);
-
-/**
- * blk_update_request - Special helper function for request stacking drivers
- * @rq:           the request being processed
- * @error:        %0 for success, < %0 for error
- * @nr_bytes:     number of bytes to complete @rq
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
- *     the request structure even if @rq doesn't have leftover.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- *     This special helper function is only for request stacking drivers
- *     (e.g. request-based dm) so that they can handle partial completion.
- *     Actual device drivers should use blk_end_request instead.
- */
-void blk_update_request(struct request *rq, int error, unsigned int nr_bytes)
-{
-	if (!end_that_request_data(rq, error, nr_bytes, 0)) {
-		/*
-		 * These members are not updated in end_that_request_data()
-		 * when all bios are completed.
-		 * Update them so that the request stacking driver can find
-		 * how many bytes remain in the request later.
-		 */
-		rq->nr_sectors = rq->hard_nr_sectors = 0;
-		rq->current_nr_sectors = rq->hard_cur_sectors = 0;
-	}
+	return false;
 }
-EXPORT_SYMBOL_GPL(blk_update_request);
+EXPORT_SYMBOL_GPL(__blk_end_bidi_request);
 
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1fa9dcf9aa6ad..501f6845cc737 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -840,27 +840,97 @@ extern unsigned int blk_rq_bytes(struct request *rq);
 extern unsigned int blk_rq_cur_bytes(struct request *rq);
 
 /*
- * blk_end_request() and friends.
- * __blk_end_request() and end_request() must be called with
- * the request queue spinlock acquired.
+ * Request completion related functions.
+ *
+ * blk_update_request() completes given number of bytes and updates
+ * the request without completing it.
+ *
+ * blk_end_request() and friends.  __blk_end_request() and
+ * end_request() must be called with the request queue spinlock
+ * acquired.
  *
  * Several drivers define their own end_request and call
  * blk_end_request() for parts of the original function.
  * This prevents code duplication in drivers.
  */
-extern int blk_end_request(struct request *rq, int error,
-				unsigned int nr_bytes);
-extern int __blk_end_request(struct request *rq, int error,
-				unsigned int nr_bytes);
-extern int blk_end_bidi_request(struct request *rq, int error,
-				unsigned int nr_bytes, unsigned int bidi_bytes);
-extern void end_request(struct request *, int);
+extern bool blk_update_request(struct request *rq, int error,
+			       unsigned int nr_bytes);
+extern bool blk_end_bidi_request(struct request *rq, int error,
+				 unsigned int nr_bytes,
+				 unsigned int bidi_bytes);
+extern bool __blk_end_bidi_request(struct request *rq, int error,
+				   unsigned int nr_bytes,
+				   unsigned int bidi_bytes);
+
+/**
+ * blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+static inline bool blk_end_request(struct request *rq, int error,
+				   unsigned int nr_bytes)
+{
+	return blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+
+/**
+ * __blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Must be called with queue lock held unlike blk_end_request().
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+static inline bool __blk_end_request(struct request *rq, int error,
+				     unsigned int nr_bytes)
+{
+	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+
+/**
+ * end_request - end I/O on the current segment of the request
+ * @rq:		the request being processed
+ * @uptodate:	error value or %0/%1 uptodate flag
+ *
+ * Description:
+ *     Ends I/O on the current segment of a request. If that is the only
+ *     remaining segment, the request is also completed and freed.
+ *
+ *     This is a remnant of how older block drivers handled I/O completions.
+ *     Modern drivers typically end I/O on the full request in one go, unless
+ *     they have a residual value to account for. For that case this function
+ *     isn't really useful, unless the residual just happens to be the
+ *     full current segment. In other words, don't use this function in new
+ *     code. Use blk_end_request() or __blk_end_request() to end a request.
+ **/
+static inline void end_request(struct request *rq, int uptodate)
+{
+	int error = 0;
+
+	if (uptodate <= 0)
+		error = uptodate ? uptodate : -EIO;
+
+	__blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0);
+}
+
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_abort_queue(struct request_queue *);
-extern void blk_update_request(struct request *rq, int error,
-			       unsigned int nr_bytes);
 
 /*
  * Access functions for manipulating queue properties
-- 
GitLab


From b243ddcbe9be146172baa544dadecebf156eda0e Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:18 +0900
Subject: [PATCH 0651/2198] block: move rq->start_time initialization to
 blk_rq_init()

rq->start_time was initialized in init_request_from_bio() so special
requests didn't have start_time set.  This has been okay as start_time
has been used only for fs requests; however, there is no indication of
this actually is the case or not.  Set rq->start_time in blk_rq_init()
and guarantee that all initialized rq's have its start_time set.  This
improves consistency at virtually no cost and future changes will make
use of the timestamp for !bio requests.

[ Impact: rq->start_time is valid for all requests ]

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 block/blk-core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 89cc05d9a7a90..b84250d3019b7 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -134,6 +134,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	rq->cmd_len = BLK_MAX_CDB;
 	rq->tag = -1;
 	rq->ref_count = 1;
+	rq->start_time = jiffies;
 }
 EXPORT_SYMBOL(blk_rq_init);
 
@@ -1099,7 +1100,6 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 	req->errors = 0;
 	req->hard_sector = req->sector = bio->bi_sector;
 	req->ioprio = bio_prio(bio);
-	req->start_time = jiffies;
 	blk_rq_bio_prep(req->q, req, bio);
 }
 
-- 
GitLab


From 40cbbb781d3eba5d6ac0860db078af490e5c7c6b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:19 +0900
Subject: [PATCH 0652/2198] block: implement and use [__]blk_end_request_all()

There are many [__]blk_end_request() call sites which call it with
full request length and expect full completion.  Many of them ensure
that the request actually completes by doing BUG_ON() the return
value, which is awkward and error-prone.

This patch adds [__]blk_end_request_all() which takes @rq and @error
and fully completes the request.  BUG_ON() is added to to ensure that
this actually happens.

Most conversions are simple but there are a few noteworthy ones.

* cdrom/viocd: viocd_end_request() replaced with direct calls to
  __blk_end_request_all().

* s390/block/dasd: dasd_end_request() replaced with direct calls to
  __blk_end_request_all().

* s390/char/tape_block: tapeblock_end_request() replaced with direct
  calls to blk_end_request_all().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mike Miller <mike.miller@hp.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
---
 arch/arm/plat-omap/mailbox.c        | 11 +++-------
 block/blk-barrier.c                 |  9 ++------
 block/blk-core.c                    |  2 +-
 block/elevator.c                    |  2 +-
 drivers/block/cpqarray.c            |  3 +--
 drivers/block/sx8.c                 |  3 +--
 drivers/block/virtio_blk.c          |  2 +-
 drivers/block/xen-blkfront.c        |  4 +---
 drivers/cdrom/gdrom.c               |  2 +-
 drivers/cdrom/viocd.c               | 25 ++++------------------
 drivers/memstick/core/mspro_block.c |  2 +-
 drivers/s390/block/dasd.c           | 17 ++++-----------
 drivers/s390/char/tape_block.c      | 15 ++++----------
 drivers/scsi/scsi_lib.c             |  2 +-
 include/linux/blkdev.h              | 32 +++++++++++++++++++++++++++++
 15 files changed, 58 insertions(+), 73 deletions(-)

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index 0abfbaa598713..cf81bad8aec21 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -192,8 +192,7 @@ static void mbox_tx_work(struct work_struct *work)
 		}
 
 		spin_lock(q->queue_lock);
-		if (__blk_end_request(rq, 0, 0))
-			BUG();
+		__blk_end_request_all(rq, 0);
 		spin_unlock(q->queue_lock);
 	}
 }
@@ -224,10 +223,7 @@ static void mbox_rx_work(struct work_struct *work)
 			break;
 
 		msg = (mbox_msg_t) rq->data;
-
-		if (blk_end_request(rq, 0, 0))
-			BUG();
-
+		blk_end_request_all(rq, 0);
 		mbox->rxq->callback((void *)msg);
 	}
 }
@@ -337,8 +333,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 
 		*p = (mbox_msg_t) rq->data;
 
-		if (blk_end_request(rq, 0, 0))
-			BUG();
+		blk_end_request_all(rq, 0);
 
 		if (unlikely(mbox_seq_test(mbox, *p))) {
 			pr_info("mbox: Illegal seq bit!(%08x) ignored\n", *p);
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 20b4111fa0507..c8d087655eff8 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -106,10 +106,7 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
 	 */
 	q->ordseq = 0;
 	rq = q->orig_bar_rq;
-
-	if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq)))
-		BUG();
-
+	__blk_end_request_all(rq, q->orderr);
 	return true;
 }
 
@@ -252,9 +249,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 * with prejudice.
 			 */
 			elv_dequeue_request(q, rq);
-			if (__blk_end_request(rq, -EOPNOTSUPP,
-					      blk_rq_bytes(rq)))
-				BUG();
+			__blk_end_request_all(rq, -EOPNOTSUPP);
 			*rqp = NULL;
 			return false;
 		}
diff --git a/block/blk-core.c b/block/blk-core.c
index b84250d3019b7..0520cc7045858 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1780,7 +1780,7 @@ struct request *elv_next_request(struct request_queue *q)
 			break;
 		} else if (ret == BLKPREP_KILL) {
 			rq->cmd_flags |= REQ_QUIET;
-			__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+			__blk_end_request_all(rq, -EIO);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
 			break;
diff --git a/block/elevator.c b/block/elevator.c
index b03b8752e18b9..1af5d9f04affe 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -810,7 +810,7 @@ void elv_abort_queue(struct request_queue *q)
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
 		trace_block_rq_abort(q, rq);
-		__blk_end_request(rq, -EIO, blk_rq_bytes(rq));
+		__blk_end_request_all(rq, -EIO);
 	}
 }
 EXPORT_SYMBOL(elv_abort_queue);
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index ca268ca111598..488a8f4a60aa2 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -1024,8 +1024,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout)
 				cmd->req.sg[i].size, ddir);
 
 	DBGPX(printk("Done with %p\n", rq););
-	if (__blk_end_request(rq, error, blk_rq_bytes(rq)))
-		BUG();
+	__blk_end_request_all(rq, error);
 }
 
 /*
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index ff0448e4bf036..60e85bb6f7902 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -749,8 +749,7 @@ static inline void carm_end_request_queued(struct carm_host *host,
 	struct request *req = crq->rq;
 	int rc;
 
-	rc = __blk_end_request(req, error, blk_rq_bytes(req));
-	assert(rc == 0);
+	__blk_end_request_all(req, error);
 
 	rc = carm_put_request(host, crq);
 	assert(rc == 0);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 5d34764c8a872..50745e64414e5 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -62,7 +62,7 @@ static void blk_done(struct virtqueue *vq)
 			break;
 		}
 
-		__blk_end_request(vbr->req, error, blk_rq_bytes(vbr->req));
+		__blk_end_request_all(vbr->req, error);
 		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
 	}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 8f905089b72b7..cd6cfe3b51e1b 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -551,7 +551,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 	for (i = info->ring.rsp_cons; i != rp; i++) {
 		unsigned long id;
-		int ret;
 
 		bret = RING_GET_RESPONSE(&info->ring, i);
 		id   = bret->id;
@@ -578,8 +577,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
 					"request: %x\n", bret->status);
 
-			ret = __blk_end_request(req, error, blk_rq_bytes(req));
-			BUG_ON(ret);
+			__blk_end_request_all(req, error);
 			break;
 		default:
 			BUG();
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 2eecb779437b8..fee9a9e83fc94 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -632,7 +632,7 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 		* before handling ending the request */
 		spin_lock(&gdrom_lock);
 		list_del_init(&req->queuelist);
-		__blk_end_request(req, err, blk_rq_bytes(req));
+		__blk_end_request_all(req, err);
 	}
 	spin_unlock(&gdrom_lock);
 	kfree(read_command);
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 13929356135c8..cc3efa096e1a0 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -291,23 +291,6 @@ static int send_request(struct request *req)
 	return 0;
 }
 
-static void viocd_end_request(struct request *req, int error)
-{
-	int nsectors = req->hard_nr_sectors;
-
-	/*
-	 * Make sure it's fully ended, and ensure that we process
-	 * at least one sector.
-	 */
-	if (blk_pc_request(req))
-		nsectors = (req->data_len + 511) >> 9;
-	if (!nsectors)
-		nsectors = 1;
-
-	if (__blk_end_request(req, error, nsectors << 9))
-		BUG();
-}
-
 static int rwreq;
 
 static void do_viocd_request(struct request_queue *q)
@@ -316,11 +299,11 @@ static void do_viocd_request(struct request_queue *q)
 
 	while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) {
 		if (!blk_fs_request(req))
-			viocd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
 			printk(VIOCD_KERN_WARNING
 					"unable to send message to OS/400!");
-			viocd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 		} else
 			rwreq++;
 	}
@@ -531,9 +514,9 @@ static void vio_handle_cd_event(struct HvLpEvent *event)
 					"with rc %d:0x%04X: %s\n",
 					req, event->xRc,
 					bevent->sub_result, err->msg);
-			viocd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 		} else
-			viocd_end_request(req, 0);
+			__blk_end_request_all(req, 0);
 
 		/* restart handling of incoming requests */
 		spin_unlock_irqrestore(&viocd_reqlock, flags);
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index de143deb06f0b..a41634699f846 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -826,7 +826,7 @@ static void mspro_block_submit_req(struct request_queue *q)
 
 	if (msb->eject) {
 		while ((req = elv_next_request(q)) != NULL)
-			__blk_end_request(req, -ENODEV, blk_rq_bytes(req));
+			__blk_end_request_all(req, -ENODEV);
 
 		return;
 	}
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index d1815272c4351..fabec95686b0f 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1613,15 +1613,6 @@ void dasd_block_clear_timer(struct dasd_block *block)
 	del_timer(&block->timer);
 }
 
-/*
- * posts the buffer_cache about a finalized request
- */
-static inline void dasd_end_request(struct request *req, int error)
-{
-	if (__blk_end_request(req, error, blk_rq_bytes(req)))
-		BUG();
-}
-
 /*
  * Process finished error recovery ccw.
  */
@@ -1676,7 +1667,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 				      "Rejecting write request %p",
 				      req);
 			blkdev_dequeue_request(req);
-			dasd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 		cqr = basedev->discipline->build_cp(basedev, block, req);
@@ -1705,7 +1696,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 				      "on request %p",
 				      PTR_ERR(cqr), req);
 			blkdev_dequeue_request(req);
-			dasd_end_request(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 		/*
@@ -1731,7 +1722,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr)
 	status = cqr->block->base->discipline->free_cp(cqr, req);
 	if (status <= 0)
 		error = status ? status : -EIO;
-	dasd_end_request(req, error);
+	__blk_end_request_all(req, error);
 }
 
 /*
@@ -2040,7 +2031,7 @@ static void dasd_flush_request_queue(struct dasd_block *block)
 	spin_lock_irq(&block->request_queue_lock);
 	while ((req = elv_next_request(block->request_queue))) {
 		blkdev_dequeue_request(req);
-		dasd_end_request(req, -EIO);
+		__blk_end_request_all(req, -EIO);
 	}
 	spin_unlock_irq(&block->request_queue_lock);
 }
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index f32e89e7c4f2e..86596d3813b5c 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -73,13 +73,6 @@ tapeblock_trigger_requeue(struct tape_device *device)
 /*
  * Post finished request.
  */
-static void
-tapeblock_end_request(struct request *req, int error)
-{
-	if (blk_end_request(req, error, blk_rq_bytes(req)))
-		BUG();
-}
-
 static void
 __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 {
@@ -90,7 +83,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 
 	device = ccw_req->device;
 	req = (struct request *) data;
-	tapeblock_end_request(req, (ccw_req->rc == 0) ? 0 : -EIO);
+	blk_end_request_all(req, (ccw_req->rc == 0) ? 0 : -EIO);
 	if (ccw_req->rc == 0)
 		/* Update position. */
 		device->blk_data.block_position =
@@ -118,7 +111,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req)
 	ccw_req = device->discipline->bread(device, req);
 	if (IS_ERR(ccw_req)) {
 		DBF_EVENT(1, "TBLOCK: bread failed\n");
-		tapeblock_end_request(req, -EIO);
+		blk_end_request_all(req, -EIO);
 		return PTR_ERR(ccw_req);
 	}
 	ccw_req->callback = __tapeblock_end_request;
@@ -131,7 +124,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req)
 		 * Start/enqueueing failed. No retries in
 		 * this case.
 		 */
-		tapeblock_end_request(req, -EIO);
+		blk_end_request_all(req, -EIO);
 		device->discipline->free_bread(ccw_req);
 	}
 
@@ -177,7 +170,7 @@ tapeblock_requeue(struct work_struct *work) {
 			DBF_EVENT(1, "TBLOCK: Rejecting write request\n");
 			blkdev_dequeue_request(req);
 			spin_unlock_irq(&device->blk_data.request_queue_lock);
-			tapeblock_end_request(req, -EIO);
+			blk_end_request_all(req, -EIO);
 			spin_lock_irq(&device->blk_data.request_queue_lock);
 			continue;
 		}
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index d1cb64ad1a3f3..756ac7c93de0d 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -922,7 +922,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			if (driver_byte(result) & DRIVER_SENSE)
 				scsi_print_sense("", cmd);
 		}
-		blk_end_request(req, -EIO, blk_rq_bytes(req));
+		blk_end_request_all(req, -EIO);
 		scsi_next_command(cmd);
 		break;
 	case ACTION_REPREP:
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 501f6845cc737..e33c8356b3da2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -882,6 +882,22 @@ static inline bool blk_end_request(struct request *rq, int error,
 	return blk_end_bidi_request(rq, error, nr_bytes, 0);
 }
 
+/**
+ * blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.
+ */
+static inline void blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+
+	pending = blk_end_request(rq, error, blk_rq_bytes(rq));
+	BUG_ON(pending);
+}
+
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
@@ -901,6 +917,22 @@ static inline bool __blk_end_request(struct request *rq, int error,
 	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
 }
 
+/**
+ * __blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.  Must be called with queue lock held.
+ */
+static inline void __blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+
+	pending = __blk_end_request(rq, error, blk_rq_bytes(rq));
+	BUG_ON(pending);
+}
+
 /**
  * end_request - end I/O on the current segment of the request
  * @rq:		the request being processed
-- 
GitLab


From f06d9a2b52e246a66b606130cea3f0d7b7be17a7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:19 +0900
Subject: [PATCH 0653/2198] block: replace end_request() with
 [__]blk_end_request_cur()

end_request() has been kept around for backward compatibility;
however, it's about time for it to go away.

* There aren't too many users left.

* Its use of @updtodate is pretty confusing.

* In some cases, newer code ends up using mixture of end_request() and
  [__]blk_end_request[_all](), which is way too confusing.

So, add [__]blk_end_request_cur() and replace end_request() with it.
Most conversions are straightforward.  Noteworthy ones are...

* paride/pcd: next_request() updated to take 0/-errno instead of 1/0.

* paride/pf: pf_end_request() and next_request() updated to take
  0/-errno instead of 1/0.

* xd: xd_readwrite() updated to return 0/-errno instead of 1/0.

* mtd/mtd_blkdevs: blktrans_discard_request() updated to return
  0/-errno instead of 1/0.  Unnecessary local variable res
  initialization removed from mtd_blktrans_thread().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Joerg Dorchain <joerg@dorchain.net>
Acked-by: Geert Uytterhoeven <geert@linux-m68k.org>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Laurent Vivier <Laurent@lvivier.info>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: unsik Kim <donari75@gmail.com>
---
 drivers/block/amiflop.c         | 10 +++----
 drivers/block/ataflop.c         | 14 +++++-----
 drivers/block/hd.c              | 14 +++++-----
 drivers/block/mg_disk.c         | 16 ++++++------
 drivers/block/paride/pcd.c      | 12 ++++-----
 drivers/block/paride/pd.c       |  5 ++--
 drivers/block/paride/pf.c       | 28 ++++++++++----------
 drivers/block/ps3disk.c         |  6 ++---
 drivers/block/swim.c            | 14 +++++-----
 drivers/block/swim3.c           | 26 +++++++++----------
 drivers/block/xd.c              | 15 ++++++-----
 drivers/block/xen-blkfront.c    |  2 +-
 drivers/block/xsysace.c         |  4 +--
 drivers/block/z2ram.c           |  4 +--
 drivers/cdrom/gdrom.c           |  6 ++---
 drivers/message/i2o/i2o_block.c |  2 +-
 drivers/mtd/mtd_blkdevs.c       | 22 ++++++++--------
 drivers/sbus/char/jsflash.c     |  8 +++---
 include/linux/blkdev.h          | 46 ++++++++++++++++-----------------
 19 files changed, 128 insertions(+), 126 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8df436ff7068b..b99a2a606d022 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1359,7 +1359,7 @@ static void redo_fd_request(void)
 #endif
 		block = CURRENT->sector + cnt;
 		if ((int)block > floppy->blocks) {
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 
@@ -1373,11 +1373,11 @@ static void redo_fd_request(void)
 
 		if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) {
 			printk(KERN_WARNING "do_fd_request: unknown command\n");
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 		if (get_track(drive, track) == -1) {
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 
@@ -1391,7 +1391,7 @@ static void redo_fd_request(void)
 
 			/* keep the drive spinning while writes are scheduled */
 			if (!fd_motor_on(drive)) {
-				end_request(CURRENT, 0);
+				__blk_end_request_cur(CURRENT, -EIO);
 				goto repeat;
 			}
 			/*
@@ -1410,7 +1410,7 @@ static void redo_fd_request(void)
 	CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
 	CURRENT->sector += CURRENT->current_nr_sectors;
 
-	end_request(CURRENT, 1);
+	__blk_end_request_cur(CURRENT, 0);
 	goto repeat;
 }
 
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 4234c11c1e4cf..44a8702136a9a 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -612,7 +612,7 @@ static void fd_error( void )
 	CURRENT->errors++;
 	if (CURRENT->errors >= MAX_ERRORS) {
 		printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 	}
 	else if (CURRENT->errors == RECALIBRATE_ERRORS) {
 		printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
@@ -734,7 +734,7 @@ static void do_fd_action( int drive )
 			/* all sectors finished */
 			CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
 			CURRENT->sector += CURRENT->current_nr_sectors;
-			end_request(CURRENT, 1);
+			__blk_end_request_cur(CURRENT, 0);
 			redo_fd_request();
 			return;
 		    }
@@ -1141,7 +1141,7 @@ static void fd_rwsec_done1(int status)
 		/* all sectors finished */
 		CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
 		CURRENT->sector += CURRENT->current_nr_sectors;
-		end_request(CURRENT, 1);
+		__blk_end_request_cur(CURRENT, 0);
 		redo_fd_request();
 	}
 	return;
@@ -1414,7 +1414,7 @@ static void redo_fd_request(void)
 	if (!UD.connected) {
 		/* drive not connected */
 		printk(KERN_ERR "Unknown Device: fd%d\n", drive );
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 		goto repeat;
 	}
 		
@@ -1430,12 +1430,12 @@ static void redo_fd_request(void)
 		/* user supplied disk type */
 		if (--type >= NUM_DISK_MINORS) {
 			printk(KERN_WARNING "fd%d: invalid disk format", drive );
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 		if (minor2disktype[type].drive_types > DriveType)  {
 			printk(KERN_WARNING "fd%d: unsupported disk format", drive );
-			end_request(CURRENT, 0);
+			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
 		}
 		type = minor2disktype[type].index;
@@ -1445,7 +1445,7 @@ static void redo_fd_request(void)
 	}
 	
 	if (CURRENT->sector + 1 > UDT->blocks) {
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 		goto repeat;
 	}
 
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index baaa9e486e508..5cb300b81c6ae 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -410,7 +410,7 @@ static void bad_rw_intr(void)
 	if (req != NULL) {
 		struct hd_i_struct *disk = req->rq_disk->private_data;
 		if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			disk->special_op = disk->recalibrate = 1;
 		} else if (req->errors % RESET_FREQ == 0)
 			reset = 1;
@@ -466,7 +466,7 @@ static void read_intr(void)
 		req->buffer+512);
 #endif
 	if (req->current_nr_sectors <= 0)
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	if (i > 0) {
 		SET_HANDLER(&read_intr);
 		return;
@@ -505,7 +505,7 @@ static void write_intr(void)
 	--req->current_nr_sectors;
 	req->buffer += 512;
 	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	if (i > 0) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
@@ -548,7 +548,7 @@ static void hd_times_out(unsigned long dummy)
 #ifdef DEBUG
 		printk("%s: too many errors\n", name);
 #endif
-		end_request(CURRENT, 0);
+		__blk_end_request_cur(CURRENT, -EIO);
 	}
 	hd_request();
 	spin_unlock_irq(hd_queue->queue_lock);
@@ -563,7 +563,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req)
 	}
 	if (disk->head > 16) {
 		printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 	}
 	disk->special_op = 0;
 	return 1;
@@ -607,7 +607,7 @@ static void hd_request(void)
 	    ((block+nsect) > get_capacity(req->rq_disk))) {
 		printk("%s: bad access: block=%d, count=%d\n",
 			req->rq_disk->disk_name, block, nsect);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		goto repeat;
 	}
 
@@ -647,7 +647,7 @@ static void hd_request(void)
 			break;
 		default:
 			printk("unknown hd-command\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			break;
 		}
 	}
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index f3898353d0a8f..408c2bd8a439b 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -285,7 +285,7 @@ static void mg_bad_rw_intr(struct mg_host *host)
 	if (req != NULL)
 		if (++req->errors >= MG_MAX_ERRORS ||
 				host->error == MG_ERR_TIMEOUT)
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 }
 
 static unsigned int mg_out(struct mg_host *host,
@@ -351,7 +351,7 @@ static void mg_read(struct request *req)
 
 		if (req->current_nr_sectors <= 0) {
 			MG_DBG("remain : %d sects\n", remains);
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			if (remains > 0)
 				req = elv_next_request(host->breq);
 		}
@@ -395,7 +395,7 @@ static void mg_write(struct request *req)
 
 		if (req->current_nr_sectors <= 0) {
 			MG_DBG("remain : %d sects\n", remains);
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			if (remains > 0)
 				req = elv_next_request(host->breq);
 		}
@@ -448,7 +448,7 @@ static void mg_read_intr(struct mg_host *host)
 
 	/* let know if current segment done */
 	if (req->current_nr_sectors <= 0)
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 
 	/* set handler if read remains */
 	if (i > 0) {
@@ -497,7 +497,7 @@ static void mg_write_intr(struct mg_host *host)
 
 	/* let know if current segment or all done */
 	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 
 	/* write 1 sector and set handler if remains */
 	if (i > 0) {
@@ -563,7 +563,7 @@ static void mg_request_poll(struct request_queue *q)
 			default:
 				printk(KERN_WARNING "%s:%d unknown command\n",
 						__func__, __LINE__);
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				break;
 			}
 		}
@@ -617,7 +617,7 @@ static unsigned int mg_issue_req(struct request *req,
 	default:
 		printk(KERN_WARNING "%s:%d unknown command\n",
 				__func__, __LINE__);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		break;
 	}
 	return MG_ERR_NONE;
@@ -655,7 +655,7 @@ static void mg_request(struct request_queue *q)
 					"%s: bad access: sector=%d, count=%d\n",
 					req->rq_disk->disk_name,
 					sect_num, sect_cnt);
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index e91d4b4b014fc..9fd57c2aa4636 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -735,16 +735,16 @@ static void do_pcd_request(struct request_queue * q)
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
 			return;
 		} else
-			end_request(pcd_req, 0);
+			__blk_end_request_cur(pcd_req, -EIO);
 	}
 }
 
-static inline void next_request(int success)
+static inline void next_request(int err)
 {
 	unsigned long saved_flags;
 
 	spin_lock_irqsave(&pcd_lock, saved_flags);
-	end_request(pcd_req, success);
+	__blk_end_request_cur(pcd_req, err);
 	pcd_busy = 0;
 	do_pcd_request(pcd_queue);
 	spin_unlock_irqrestore(&pcd_lock, saved_flags);
@@ -781,7 +781,7 @@ static void pcd_start(void)
 
 	if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) {
 		pcd_bufblk = -1;
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 
@@ -796,7 +796,7 @@ static void do_pcd_read(void)
 	pcd_retries = 0;
 	pcd_transfer();
 	if (!pcd_count) {
-		next_request(1);
+		next_request(0);
 		return;
 	}
 
@@ -815,7 +815,7 @@ static void do_pcd_read_drq(void)
 			return;
 		}
 		pcd_bufblk = -1;
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9299455b0af67..0732df4e901a5 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -410,7 +410,8 @@ static void run_fsm(void)
 				pd_claimed = 0;
 				phase = NULL;
 				spin_lock_irqsave(&pd_lock, saved_flags);
-				end_request(pd_req, res);
+				__blk_end_request_cur(pd_req,
+						      res == Ok ? 0 : -EIO);
 				pd_req = elv_next_request(pd_queue);
 				if (!pd_req)
 					stop = 1;
@@ -477,7 +478,7 @@ static int pd_next_buf(void)
 	if (pd_count)
 		return 0;
 	spin_lock_irqsave(&pd_lock, saved_flags);
-	end_request(pd_req, 1);
+	__blk_end_request_cur(pd_req, 0);
 	pd_count = pd_req->current_nr_sectors;
 	pd_buf = pd_req->buffer;
 	spin_unlock_irqrestore(&pd_lock, saved_flags);
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index bef3b997ba3e3..3871e3586d6d2 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -750,10 +750,10 @@ static int pf_ready(void)
 
 static struct request_queue *pf_queue;
 
-static void pf_end_request(int uptodate)
+static void pf_end_request(int err)
 {
 	if (pf_req) {
-		end_request(pf_req, uptodate);
+		__blk_end_request_cur(pf_req, err);
 		pf_req = NULL;
 	}
 }
@@ -773,7 +773,7 @@ static void do_pf_request(struct request_queue * q)
 	pf_count = pf_req->current_nr_sectors;
 
 	if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) {
-		pf_end_request(0);
+		pf_end_request(-EIO);
 		goto repeat;
 	}
 
@@ -788,7 +788,7 @@ static void do_pf_request(struct request_queue * q)
 		pi_do_claimed(pf_current->pi, do_pf_write);
 	else {
 		pf_busy = 0;
-		pf_end_request(0);
+		pf_end_request(-EIO);
 		goto repeat;
 	}
 }
@@ -805,7 +805,7 @@ static int pf_next_buf(void)
 		return 1;
 	if (!pf_count) {
 		spin_lock_irqsave(&pf_spin_lock, saved_flags);
-		pf_end_request(1);
+		pf_end_request(0);
 		pf_req = elv_next_request(pf_queue);
 		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 		if (!pf_req)
@@ -816,12 +816,12 @@ static int pf_next_buf(void)
 	return 0;
 }
 
-static inline void next_request(int success)
+static inline void next_request(int err)
 {
 	unsigned long saved_flags;
 
 	spin_lock_irqsave(&pf_spin_lock, saved_flags);
-	pf_end_request(success);
+	pf_end_request(err);
 	pf_busy = 0;
 	do_pf_request(pf_queue);
 	spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
@@ -844,7 +844,7 @@ static void do_pf_read_start(void)
 			pi_do_claimed(pf_current->pi, do_pf_read_start);
 			return;
 		}
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 	pf_mask = STAT_DRQ;
@@ -863,7 +863,7 @@ static void do_pf_read_drq(void)
 				pi_do_claimed(pf_current->pi, do_pf_read_start);
 				return;
 			}
-			next_request(0);
+			next_request(-EIO);
 			return;
 		}
 		pi_read_block(pf_current->pi, pf_buf, 512);
@@ -871,7 +871,7 @@ static void do_pf_read_drq(void)
 			break;
 	}
 	pi_disconnect(pf_current->pi);
-	next_request(1);
+	next_request(0);
 }
 
 static void do_pf_write(void)
@@ -890,7 +890,7 @@ static void do_pf_write_start(void)
 			pi_do_claimed(pf_current->pi, do_pf_write_start);
 			return;
 		}
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 
@@ -903,7 +903,7 @@ static void do_pf_write_start(void)
 				pi_do_claimed(pf_current->pi, do_pf_write_start);
 				return;
 			}
-			next_request(0);
+			next_request(-EIO);
 			return;
 		}
 		pi_write_block(pf_current->pi, pf_buf, 512);
@@ -923,11 +923,11 @@ static void do_pf_write_done(void)
 			pi_do_claimed(pf_current->pi, do_pf_write_start);
 			return;
 		}
-		next_request(0);
+		next_request(-EIO);
 		return;
 	}
 	pi_disconnect(pf_current->pi);
-	next_request(1);
+	next_request(0);
 }
 
 static int __init pf_init(void)
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index bccc42bb9212d..d23b54bc2f50a 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -158,7 +158,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
 			__LINE__, op, res);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		return 0;
 	}
 
@@ -180,7 +180,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
 			__func__, __LINE__, res);
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 		return 0;
 	}
 
@@ -205,7 +205,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 				break;
 		} else {
 			blk_dump_rq_flags(req, DEVICE_NAME " bad request");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 	}
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index d22cc38569372..6544a7b06bf02 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -532,39 +532,39 @@ static void redo_fd_request(struct request_queue *q)
 
 		fs = req->rq_disk->private_data;
 		if (req->sector < 0 || req->sector >= fs->total_secs) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (req->current_nr_sectors == 0) {
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			continue;
 		}
 		if (!fs->disk_in) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (rq_data_dir(req) == WRITE) {
 			if (fs->write_protected) {
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
 		}
 		switch (rq_data_dir(req)) {
 		case WRITE:
 			/* NOT IMPLEMENTED */
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			break;
 		case READ:
 			if (floppy_read_sectors(fs, req->sector,
 						req->current_nr_sectors,
 						req->buffer)) {
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
 			req->nr_sectors -= req->current_nr_sectors;
 			req->sector += req->current_nr_sectors;
 			req->buffer += req->current_nr_sectors * 512;
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			break;
 		}
 	}
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 612965307ba00..5904f7b73c6ee 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -320,15 +320,15 @@ static void start_request(struct floppy_state *fs)
 #endif
 
 		if (req->sector < 0 || req->sector >= fs->total_secs) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (req->current_nr_sectors == 0) {
-			end_request(req, 1);
+			__blk_end_request_cur(req, 0);
 			continue;
 		}
 		if (fs->ejected) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
@@ -336,7 +336,7 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				end_request(req, 0);
+				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
 		}
@@ -508,7 +508,7 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					end_request(fd_req, 0);
+					__blk_end_request_cur(fd_req, -EIO);
 					fs->state = idle;
 					return;
 				}
@@ -540,7 +540,7 @@ static void scan_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		end_request(fd_req, 0);
+		__blk_end_request_cur(fd_req, -EIO);
 		fs->state = idle;
 		start_request(fs);
 	} else {
@@ -559,7 +559,7 @@ static void seek_timeout(unsigned long data)
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	printk(KERN_ERR "swim3: seek timeout\n");
-	end_request(fd_req, 0);
+	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -583,7 +583,7 @@ static void settle_timeout(unsigned long data)
 		return;
 	}
 	printk(KERN_ERR "swim3: seek settle timeout\n");
-	end_request(fd_req, 0);
+	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -615,7 +615,7 @@ static void xfer_timeout(unsigned long data)
 	fd_req->current_nr_sectors -= s;
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
 	       (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector);
-	end_request(fd_req, 0);
+	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -646,7 +646,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					end_request(fd_req, 0);
+					__blk_end_request_cur(fd_req, -EIO);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -731,7 +731,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk("swim3: error %sing block %ld (err=%x)\n",
 				       rq_data_dir(fd_req) == WRITE? "writ": "read",
 				       (long)fd_req->sector, err);
-				end_request(fd_req, 0);
+				__blk_end_request_cur(fd_req, -EIO);
 				fs->state = idle;
 			}
 		} else {
@@ -740,7 +740,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
 				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
 				       fs->state, rq_data_dir(fd_req), intr, err);
-				end_request(fd_req, 0);
+				__blk_end_request_cur(fd_req, -EIO);
 				fs->state = idle;
 				start_request(fs);
 				break;
@@ -749,7 +749,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			fd_req->current_nr_sectors -= fs->scount;
 			fd_req->buffer += fs->scount * 512;
 			if (fd_req->current_nr_sectors <= 0) {
-				end_request(fd_req, 1);
+				__blk_end_request_cur(fd_req, 0);
 				fs->state = idle;
 			} else {
 				fs->req_sector += fs->scount;
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 64b496fce98bb..6f6ad82ec0c0a 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -314,21 +314,22 @@ static void do_xd_request (struct request_queue * q)
 		int retry;
 
 		if (!blk_fs_request(req)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (block + count > get_capacity(req->rq_disk)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		if (rw != READ && rw != WRITE) {
 			printk("do_xd_request: unknown request\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
 			res = xd_readwrite(rw, disk, req->buffer, block, count);
-		end_request(req, res);	/* wrap up, 0 = fail, 1 = success */
+		/* wrap up, 0 = success, -errno = fail */
+		__blk_end_request_cur(req, res);
 	}
 }
 
@@ -418,7 +419,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_
 				printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write"));
 				xd_recalibrate(drive);
 				spin_lock_irq(&xd_lock);
-				return (0);
+				return -EIO;
 			case 2:
 				if (sense[0] & 0x30) {
 					printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing"));
@@ -439,7 +440,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_
 				else
 					printk(" - no valid disk address\n");
 				spin_lock_irq(&xd_lock);
-				return (0);
+				return -EIO;
 		}
 		if (xd_dma_buffer)
 			for (i=0; i < (temp * 0x200); i++)
@@ -448,7 +449,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_
 		count -= temp, buffer += temp * 0x200, block += temp;
 	}
 	spin_lock_irq(&xd_lock);
-	return (1);
+	return 0;
 }
 
 /* xd_recalibrate: recalibrate a given drive and reset controller if necessary */
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index cd6cfe3b51e1b..b4564479f6416 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -302,7 +302,7 @@ static void do_blkif_request(struct request_queue *rq)
 	while ((req = elv_next_request(rq)) != NULL) {
 		info = req->rq_disk->private_data;
 		if (!blk_fs_request(req)) {
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 4aecf5dc6a93f..b1e1d7e5ab1e8 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -466,7 +466,7 @@ struct request *ace_get_next_request(struct request_queue * q)
 	while ((req = elv_next_request(q)) != NULL) {
 		if (blk_fs_request(req))
 			break;
-		end_request(req, 0);
+		__blk_end_request_cur(req, -EIO);
 	}
 	return req;
 }
@@ -494,7 +494,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 
 		/* Drop all pending requests */
 		while ((req = elv_next_request(ace->queue)) != NULL)
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 
 		/* Drop back to IDLE state and notify waiters */
 		ace->fsm_state = ACE_FSM_STATE_IDLE;
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 80754cdd31190..b66ad58a3c38a 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -77,7 +77,7 @@ static void do_z2_request(struct request_queue *q)
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
 				req->sector, req->current_nr_sectors);
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 		while (len) {
@@ -93,7 +93,7 @@ static void do_z2_request(struct request_queue *q)
 			start += size;
 			len -= size;
 		}
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	}
 }
 
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index fee9a9e83fc94..cab2b1fb2fe7a 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -654,17 +654,17 @@ static void gdrom_request(struct request_queue *rq)
 	while ((req = elv_next_request(rq)) != NULL) {
 		if (!blk_fs_request(req)) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 		}
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_NOTICE "GDROM: Read only device -");
 			printk(" write request ignored\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 		}
 		if (req->nr_sectors)
 			gdrom_request_handler_dma(req);
 		else
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 	}
 }
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index a443e136dc415..221317e6a006d 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -923,7 +923,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 				break;
 			}
 		} else
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 	}
 };
 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index a49a9c8f2cb1f..76c4c8d130736 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -54,33 +54,33 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 
 	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
 	    req->cmd[0] == REQ_LB_OP_DISCARD)
-		return !tr->discard(dev, block, nsect);
+		return tr->discard(dev, block, nsect);
 
 	if (!blk_fs_request(req))
-		return 0;
+		return -EIO;
 
 	if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk))
-		return 0;
+		return -EIO;
 
 	switch(rq_data_dir(req)) {
 	case READ:
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->readsect(dev, block, buf))
-				return 0;
-		return 1;
+				return -EIO;
+		return 0;
 
 	case WRITE:
 		if (!tr->writesect)
-			return 0;
+			return -EIO;
 
 		for (; nsect > 0; nsect--, block++, buf += tr->blksize)
 			if (tr->writesect(dev, block, buf))
-				return 0;
-		return 1;
+				return -EIO;
+		return 0;
 
 	default:
 		printk(KERN_NOTICE "Unknown request %u\n", rq_data_dir(req));
-		return 0;
+		return -EIO;
 	}
 }
 
@@ -96,7 +96,7 @@ static int mtd_blktrans_thread(void *arg)
 	while (!kthread_should_stop()) {
 		struct request *req;
 		struct mtd_blktrans_dev *dev;
-		int res = 0;
+		int res;
 
 		req = elv_next_request(rq);
 
@@ -119,7 +119,7 @@ static int mtd_blktrans_thread(void *arg)
 
 		spin_lock_irq(rq->queue_lock);
 
-		end_request(req, res);
+		__blk_end_request_cur(req, res);
 	}
 	spin_unlock_irq(rq->queue_lock);
 
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index a85ad05e85482..09617884a50b1 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -192,25 +192,25 @@ static void jsfd_do_request(struct request_queue *q)
 		size_t len = req->current_nr_sectors << 9;
 
 		if ((offset + len) > jdp->dsize) {
-               		end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_ERR "jsfd: write\n");
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
 		if ((jdp->dbase & 0xff000000) != 0x20000000) {
 			printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase);
-			end_request(req, 0);
+			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
 
 		jsfd_read(req->buffer, jdp->dbase + offset, len);
 
-		end_request(req, 1);
+		__blk_end_request_cur(req, 0);
 	}
 }
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index e33c8356b3da2..cfeb3c2feb276 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -845,9 +845,8 @@ extern unsigned int blk_rq_cur_bytes(struct request *rq);
  * blk_update_request() completes given number of bytes and updates
  * the request without completing it.
  *
- * blk_end_request() and friends.  __blk_end_request() and
- * end_request() must be called with the request queue spinlock
- * acquired.
+ * blk_end_request() and friends.  __blk_end_request() must be called
+ * with the request queue spinlock acquired.
  *
  * Several drivers define their own end_request and call
  * blk_end_request() for parts of the original function.
@@ -898,6 +897,19 @@ static inline void blk_end_request_all(struct request *rq, int error)
 	BUG_ON(pending);
 }
 
+/**
+ * blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Complete the current consecutively mapped chunk from @rq.
+ */
+static inline void blk_end_request_cur(struct request *rq, int error)
+{
+	blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+}
+
 /**
  * __blk_end_request - Helper function for drivers to complete the request.
  * @rq:       the request being processed
@@ -934,29 +946,17 @@ static inline void __blk_end_request_all(struct request *rq, int error)
 }
 
 /**
- * end_request - end I/O on the current segment of the request
- * @rq:		the request being processed
- * @uptodate:	error value or %0/%1 uptodate flag
+ * __blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
  *
  * Description:
- *     Ends I/O on the current segment of a request. If that is the only
- *     remaining segment, the request is also completed and freed.
- *
- *     This is a remnant of how older block drivers handled I/O completions.
- *     Modern drivers typically end I/O on the full request in one go, unless
- *     they have a residual value to account for. For that case this function
- *     isn't really useful, unless the residual just happens to be the
- *     full current segment. In other words, don't use this function in new
- *     code. Use blk_end_request() or __blk_end_request() to end a request.
- **/
-static inline void end_request(struct request *rq, int uptodate)
+ *     Complete the current consecutively mapped chunk from @rq.  Must
+ *     be called with queue lock held.
+ */
+static inline void __blk_end_request_cur(struct request *rq, int error)
 {
-	int error = 0;
-
-	if (uptodate <= 0)
-		error = uptodate ? uptodate : -EIO;
-
-	__blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0);
+	__blk_end_request(rq, error, rq->hard_cur_sectors << 9);
 }
 
 extern void blk_complete_request(struct request *);
-- 
GitLab


From ec24751a6b57e1373a12361e581b2458bc9bb791 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:20 +0900
Subject: [PATCH 0654/2198] arm-omap: don't abuse rq->data

omap mailbox uses rq->data as the second opaque pointer to carry
mbox_msg_t and rq->special message argument which is needed only for
tx.  Add and use omap_msg_tx_data struct for tx and use rq->special
for mbox_msg_t for rx such that only rq->special is used as opaque
pointer.

[ Impact: cleanup rq->data usage, extra kmalloc in msg_send ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Russell King <rmk@arm.linux.org.uk>
---
 arch/arm/plat-omap/mailbox.c | 43 ++++++++++++++++++++++++++----------
 1 file changed, 31 insertions(+), 12 deletions(-)

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index cf81bad8aec21..538ba7541d3f6 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -147,24 +147,40 @@ static int __mbox_msg_send(struct omap_mbox *mbox, mbox_msg_t msg, void *arg)
 	return ret;
 }
 
+struct omap_msg_tx_data {
+	mbox_msg_t	msg;
+	void		*arg;
+};
+
+static void omap_msg_tx_end_io(struct request *rq, int error)
+{
+	kfree(rq->special);
+	__blk_put_request(rq->q, rq);
+}
+
 int omap_mbox_msg_send(struct omap_mbox *mbox, mbox_msg_t msg, void* arg)
 {
+	struct omap_msg_tx_data *tx_data;
 	struct request *rq;
 	struct request_queue *q = mbox->txq->queue;
-	int ret = 0;
+
+	tx_data = kmalloc(sizeof(*tx_data), GFP_ATOMIC);
+	if (unlikely(!tx_data))
+		return -ENOMEM;
 
 	rq = blk_get_request(q, WRITE, GFP_ATOMIC);
 	if (unlikely(!rq)) {
-		ret = -ENOMEM;
-		goto fail;
+		kfree(tx_data);
+		return -ENOMEM;
 	}
 
-	rq->data = (void *)msg;
-	blk_insert_request(q, rq, 0, arg);
+	tx_data->msg = msg;
+	tx_data->arg = arg;
+	rq->end_io = omap_msg_tx_end_io;
+	blk_insert_request(q, rq, 0, tx_data);
 
 	schedule_work(&mbox->txq->work);
- fail:
-	return ret;
+	return 0;
 }
 EXPORT_SYMBOL(omap_mbox_msg_send);
 
@@ -178,6 +194,8 @@ static void mbox_tx_work(struct work_struct *work)
 	struct request_queue *q = mbox->txq->queue;
 
 	while (1) {
+		struct omap_msg_tx_data *tx_data;
+
 		spin_lock(q->queue_lock);
 		rq = elv_next_request(q);
 		spin_unlock(q->queue_lock);
@@ -185,7 +203,9 @@ static void mbox_tx_work(struct work_struct *work)
 		if (!rq)
 			break;
 
-		ret = __mbox_msg_send(mbox, (mbox_msg_t) rq->data, rq->special);
+		tx_data = rq->special;
+
+		ret = __mbox_msg_send(mbox, tx_data->msg, tx_data->arg);
 		if (ret) {
 			enable_mbox_irq(mbox, IRQ_TX);
 			return;
@@ -222,7 +242,7 @@ static void mbox_rx_work(struct work_struct *work)
 		if (!rq)
 			break;
 
-		msg = (mbox_msg_t) rq->data;
+		msg = (mbox_msg_t)rq->special;
 		blk_end_request_all(rq, 0);
 		mbox->rxq->callback((void *)msg);
 	}
@@ -260,7 +280,6 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 			goto nomem;
 
 		msg = mbox_fifo_read(mbox);
-		rq->data = (void *)msg;
 
 		if (unlikely(mbox_seq_test(mbox, msg))) {
 			pr_info("mbox: Illegal seq bit!(%08x)\n", msg);
@@ -268,7 +287,7 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox)
 				mbox->err_notify();
 		}
 
-		blk_insert_request(q, rq, 0, NULL);
+		blk_insert_request(q, rq, 0, (void *)msg);
 		if (mbox->ops->type == OMAP_MBOX_TYPE1)
 			break;
 	}
@@ -331,7 +350,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 		if (!rq)
 			break;
 
-		*p = (mbox_msg_t) rq->data;
+		*p = (mbox_msg_t)rq->special;
 
 		blk_end_request_all(rq, 0);
 
-- 
GitLab


From 731ec497e5888c6792ad62613ae9be97eebcd7ca Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 23 Apr 2009 11:05:20 +0900
Subject: [PATCH 0655/2198] block: kill rq->data

Now that all block request data transfer is done via bio, rq->data
isn't used.  Kill it.

While at it, make the roles of rq->special and buffer clear.

[ Impact: drop now unncessary field from struct request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Boaz Harrosh <bharrosh@panasas.com>
---
 block/blk-core.c        | 5 ++---
 block/blk-map.c         | 6 +++---
 block/scsi_ioctl.c      | 1 -
 drivers/scsi/scsi_lib.c | 1 -
 include/linux/blkdev.h  | 5 ++---
 5 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 0520cc7045858..6dd180cf15d2b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -189,10 +189,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 						(unsigned long long)rq->sector,
 						rq->nr_sectors,
 						rq->current_nr_sectors);
-	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, data %p, len %u\n",
+	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
 						rq->bio, rq->biotail,
-						rq->buffer, rq->data,
-						rq->data_len);
+						rq->buffer, rq->data_len);
 
 	if (blk_pc_request(rq)) {
 		printk(KERN_INFO "  cdb: ");
diff --git a/block/blk-map.c b/block/blk-map.c
index f103729b462fd..694fefad34e7c 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -156,7 +156,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 	if (!bio_flagged(bio, BIO_USER_MAPPED))
 		rq->cmd_flags |= REQ_COPY_USER;
 
-	rq->buffer = rq->data = NULL;
+	rq->buffer = NULL;
 	return 0;
 unmap_rq:
 	blk_rq_unmap_user(bio);
@@ -235,7 +235,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
 	blk_queue_bounce(q, &bio);
 	bio_get(bio);
 	blk_rq_bio_prep(q, rq, bio);
-	rq->buffer = rq->data = NULL;
+	rq->buffer = NULL;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_user_iov);
@@ -313,7 +313,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 
 	blk_rq_bio_prep(q, rq, bio);
 	blk_queue_bounce(q, &rq->bio);
-	rq->buffer = rq->data = NULL;
+	rq->buffer = NULL;
 	return 0;
 }
 EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 82a0ca2f67290..bb9aa43551b26 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -500,7 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq = blk_get_request(q, WRITE, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->data = NULL;
 	rq->data_len = 0;
 	rq->extra_len = 0;
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 756ac7c93de0d..aa9fc572e45fa 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1088,7 +1088,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 			return ret;
 	} else {
 		BUG_ON(req->data_len);
-		BUG_ON(req->data);
 
 		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
 		req->buffer = NULL;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index cfeb3c2feb276..12c545e2737cf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -211,8 +211,8 @@ struct request {
 
 	unsigned short ioprio;
 
-	void *special;
-	char *buffer;
+	void *special;		/* opaque pointer available for LLD use */
+	char *buffer;		/* kaddr of the current segment if available */
 
 	int tag;
 	int errors;
@@ -229,7 +229,6 @@ struct request {
 	unsigned int data_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
-	void *data;
 	void *sense;
 
 	unsigned long deadline;
-- 
GitLab


From c2553b5844b06910435e40cfab9e6f384840cb97 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 24 Apr 2009 08:10:11 +0200
Subject: [PATCH 0656/2198] block: make blk_do_io_stat() do the full "is this
 rq accountable" checks

We currently check for file system requests outside of blk_do_io_stat(rq),
but we may as well just include it.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 12 +++---------
 block/blk.h      |  9 ++++++++-
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 6dd180cf15d2b..1e3b97f0ae6e6 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -68,7 +68,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 	int rw = rq_data_dir(rq);
 	int cpu;
 
-	if (!blk_fs_request(rq) || !blk_do_io_stat(rq))
+	if (!blk_do_io_stat(rq))
 		return;
 
 	cpu = part_stat_lock();
@@ -1639,10 +1639,7 @@ EXPORT_SYMBOL(blkdev_dequeue_request);
 
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
-	if (!blk_do_io_stat(req))
-		return;
-
-	if (blk_fs_request(req)) {
+	if (blk_do_io_stat(req)) {
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
 		int cpu;
@@ -1656,15 +1653,12 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 
 static void blk_account_io_done(struct request *req)
 {
-	if (!blk_do_io_stat(req))
-		return;
-
 	/*
 	 * Account IO completion.  bar_rq isn't accounted as a normal
 	 * IO on queueing nor completion.  Accounting the containing
 	 * request is enough.
 	 */
-	if (blk_fs_request(req) && req != &req->q->bar_rq) {
+	if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
 		unsigned long duration = jiffies - req->start_time;
 		const int rw = rq_data_dir(req);
 		struct hd_struct *part;
diff --git a/block/blk.h b/block/blk.h
index 9b2c324e4407d..404c10b25ca1d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -149,9 +149,16 @@ static inline int blk_cpu_to_group(int cpu)
 #endif
 }
 
+/*
+ * Contribute to IO statistics IFF:
+ *
+ *	a) it's attached to a gendisk, and
+ *	b) the queue had IO stats enabled when this request was started, and
+ *	c) it's a file system request
+ */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq);
+	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq);
 }
 
 #endif
-- 
GitLab


From c69d48540c201394d08cb4d48b905e001313d9b8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Fri, 24 Apr 2009 08:12:19 +0200
Subject: [PATCH 0657/2198] block: include discard requests in IO accounting

We currently don't do merging on discard requests, but we potentially
could. If we do, then we need to include discard requests in the IO
accounting, or merging would end up decrementing in_flight IO counters
for an IO which never incremented them.

So enable accounting for discard requests.

Problem found by Nikanth Karthikesan <knikanth@suse.de>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/block/blk.h b/block/blk.h
index 404c10b25ca1d..51115599df9b8 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -158,7 +158,8 @@ static inline int blk_cpu_to_group(int cpu)
  */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq);
+	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) &&
+		blk_discard_rq(rq);
 }
 
 #endif
-- 
GitLab


From 9eb55b030c4b3227334ee4482402096cd1d1a6fe Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Mon, 27 Apr 2009 14:53:54 +0200
Subject: [PATCH 0658/2198] block: catch trying to use more bits than
 request->cmd_flags has

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 1e3b97f0ae6e6..394c5bd812718 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2097,6 +2097,9 @@ EXPORT_SYMBOL(kblockd_schedule_work);
 
 int __init blk_dev_init(void)
 {
+	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
+			sizeof(((struct request *)0)->cmd_flags));
+
 	kblockd_workqueue = create_workqueue("kblockd");
 	if (!kblockd_workqueue)
 		panic("Failed to create kblockd\n");
-- 
GitLab


From 9fd8d0e1bcb848257968d9a7d73ca4d890ea8bd1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:04 +0900
Subject: [PATCH 0659/2198] block: make blk_end_request_cur() return bool

In the process of mindlessly copying [__]blk_end_request_all(),
[__]blk_end_request_cur() ended up returning void even though they're
partial completion functions.  Fix it.

[ Impact: fix braindead API ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 12c545e2737cf..3a5b1bd6582c3 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -903,10 +903,14 @@ static inline void blk_end_request_all(struct request *rq, int error)
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  */
-static inline void blk_end_request_cur(struct request *rq, int error)
+static inline bool blk_end_request_cur(struct request *rq, int error)
 {
-	blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return blk_end_request(rq, error, rq->hard_cur_sectors << 9);
 }
 
 /**
@@ -952,10 +956,14 @@ static inline void __blk_end_request_all(struct request *rq, int error)
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.  Must
  *     be called with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
  */
-static inline void __blk_end_request_cur(struct request *rq, int error)
+static inline bool __blk_end_request_cur(struct request *rq, int error)
 {
-	__blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return __blk_end_request(rq, error, rq->hard_cur_sectors << 9);
 }
 
 extern void blk_complete_request(struct request *);
-- 
GitLab


From 4c94dece1baf320d925cedb231489c4e0358ac5a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:05 +0900
Subject: [PATCH 0660/2198] block: don't init rq fields unnecessarily

blk_get_request() always returns properly zeroed requests.  Don't set
fields to zero/NULL unnecessarily.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/scsi_ioctl.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index bb9aa43551b26..58cf4560f742f 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -500,8 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
 
 	rq = blk_get_request(q, WRITE, __GFP_WAIT);
 	rq->cmd_type = REQ_TYPE_BLOCK_PC;
-	rq->data_len = 0;
-	rq->extra_len = 0;
 	rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
 	rq->cmd[0] = cmd;
 	rq->cmd[4] = data;
-- 
GitLab


From 5b5c5d12b91cb6b2a2967f06aef35d59008dc2e7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:06 +0900
Subject: [PATCH 0661/2198] amiflop,ataflop,xd,mg_disk: clean up unnecessary
 stuff from block drivers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

rq_data_dir() can only be READ or WRITE and rq->sector and nr_sectors
are always automatically updated after partial request completion.
Don't worry about rq_data_dir() not being either READ or WRITE or
manually update sector and nr_sectors.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jörg Dorchain <joerg@dorchain.net>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/amiflop.c |  7 -------
 drivers/block/ataflop.c |  4 ----
 drivers/block/mg_disk.c | 10 ----------
 drivers/block/xd.c      |  9 ++-------
 4 files changed, 2 insertions(+), 28 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index b99a2a606d022..8ff95f2c0ede2 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1371,11 +1371,6 @@ static void redo_fd_request(void)
 		       "0x%08lx\n", track, sector, data);
 #endif
 
-		if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) {
-			printk(KERN_WARNING "do_fd_request: unknown command\n");
-			__blk_end_request_cur(CURRENT, -EIO);
-			goto repeat;
-		}
 		if (get_track(drive, track) == -1) {
 			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
@@ -1407,8 +1402,6 @@ static void redo_fd_request(void)
 			break;
 		}
 	}
-	CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
-	CURRENT->sector += CURRENT->current_nr_sectors;
 
 	__blk_end_request_cur(CURRENT, 0);
 	goto repeat;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 44a8702136a9a..25067287211fb 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -732,8 +732,6 @@ static void do_fd_action( int drive )
 		    }
 		    else {
 			/* all sectors finished */
-			CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
-			CURRENT->sector += CURRENT->current_nr_sectors;
 			__blk_end_request_cur(CURRENT, 0);
 			redo_fd_request();
 			return;
@@ -1139,8 +1137,6 @@ static void fd_rwsec_done1(int status)
 	}
 	else {
 		/* all sectors finished */
-		CURRENT->nr_sectors -= CURRENT->current_nr_sectors;
-		CURRENT->sector += CURRENT->current_nr_sectors;
 		__blk_end_request_cur(CURRENT, 0);
 		redo_fd_request();
 	}
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 408c2bd8a439b..2c00ad90cd62e 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -560,11 +560,6 @@ static void mg_request_poll(struct request_queue *q)
 			case WRITE:
 				mg_write(req);
 				break;
-			default:
-				printk(KERN_WARNING "%s:%d unknown command\n",
-						__func__, __LINE__);
-				__blk_end_request_cur(req, -EIO);
-				break;
 			}
 		}
 	}
@@ -614,11 +609,6 @@ static unsigned int mg_issue_req(struct request *req,
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
 		break;
-	default:
-		printk(KERN_WARNING "%s:%d unknown command\n",
-				__func__, __LINE__);
-		__blk_end_request_cur(req, -EIO);
-		break;
 	}
 	return MG_ERR_NONE;
 }
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 6f6ad82ec0c0a..14be4c1ed1aaf 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -308,7 +308,6 @@ static void do_xd_request (struct request_queue * q)
 	while ((req = elv_next_request(q)) != NULL) {
 		unsigned block = req->sector;
 		unsigned count = req->nr_sectors;
-		int rw = rq_data_dir(req);
 		XD_INFO *disk = req->rq_disk->private_data;
 		int res = 0;
 		int retry;
@@ -321,13 +320,9 @@ static void do_xd_request (struct request_queue * q)
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
-		if (rw != READ && rw != WRITE) {
-			printk("do_xd_request: unknown request\n");
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
 		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
-			res = xd_readwrite(rw, disk, req->buffer, block, count);
+			res = xd_readwrite(rq_data_dir(req), disk, req->buffer,
+					   block, count);
 		/* wrap up, 0 = success, -errno = fail */
 		__blk_end_request_cur(req, res);
 	}
-- 
GitLab


From cd4c34ebec155e5c10f897d9bebf618248121e70 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:07 +0900
Subject: [PATCH 0662/2198] ps3disk: simplify request completion

ps3disk_interrupt() always completes requests fully but it uses
rq->hard_cur_sectors for FLUSH requests for some reason.  Drop them
and simply use __blk_end_request_all().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/ps3disk.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index d23b54bc2f50a..f6586e4d351c0 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -231,7 +231,6 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 	struct request *req;
 	int res, read, error;
 	u64 tag, status;
-	unsigned long num_sectors;
 	const char *op;
 
 	res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status);
@@ -261,11 +260,9 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 	if (req->cmd_type == REQ_TYPE_LINUX_BLOCK &&
 	    req->cmd[0] == REQ_LB_OP_FLUSH) {
 		read = 0;
-		num_sectors = req->hard_cur_sectors;
 		op = "flush";
 	} else {
 		read = !rq_data_dir(req);
-		num_sectors = req->nr_sectors;
 		op = read ? "read" : "write";
 	}
 	if (status) {
@@ -281,7 +278,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data)
 	}
 
 	spin_lock(&priv->lock);
-	__blk_end_request(req, error, num_sectors << 9);
+	__blk_end_request_all(req, error);
 	priv->req = NULL;
 	ps3disk_do_request(dev, priv->queue);
 	spin_unlock(&priv->lock);
-- 
GitLab


From 044208506d35bd62396c4673176e2c12393905b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:08 +0900
Subject: [PATCH 0663/2198] sunvdc: kill vdc_end_request()

vdc_end_request() is a thin silly wrapper on top of
__blk_end_request().  Kill it.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/sunvdc.c | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 5861e33efe635..f59887c5ffbdb 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -212,11 +212,6 @@ static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc)
 	vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD);
 }
 
-static void vdc_end_request(struct request *req, int error, int num_sectors)
-{
-	__blk_end_request(req, error, num_sectors << 9);
-}
-
 static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
 			unsigned int index)
 {
@@ -239,7 +234,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
 
 	rqe->req = NULL;
 
-	vdc_end_request(req, (desc->status ? -EIO : 0), desc->size >> 9);
+	__blk_end_request(req, (desc->status ? -EIO : 0), desc->size);
 
 	if (blk_queue_stopped(port->disk->queue))
 		blk_start_queue(port->disk->queue);
@@ -453,7 +448,7 @@ static void do_vdc_request(struct request_queue *q)
 
 		blkdev_dequeue_request(req);
 		if (__send_request(req) < 0)
-			vdc_end_request(req, -EIO, req->hard_nr_sectors);
+			__blk_end_request_all(req, -EIO);
 	}
 }
 
-- 
GitLab


From 4d6c84d91d1a539ebc47d1a36a35e9390ba11fdc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:09 +0900
Subject: [PATCH 0664/2198] ubd: cleanup completion path

ubd had its own block request partial completion mechanism, which is
unnecessary as block layer already does it.  Kill ubd_end_request()
and ubd_finish() and replace them with direct call to
blk_end_request().

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/um/drivers/ubd_kern.c | 23 +----------------------
 1 file changed, 1 insertion(+), 22 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index f934225fd8ef9..36ca9fa89d05d 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -451,23 +451,6 @@ static void do_ubd_request(struct request_queue * q);
 
 /* Only changed by ubd_init, which is an initcall. */
 static int thread_fd = -1;
-
-static void ubd_end_request(struct request *req, int bytes, int error)
-{
-	blk_end_request(req, error, bytes);
-}
-
-/* Callable only from interrupt context - otherwise you need to do
- * spin_lock_irq()/spin_lock_irqsave() */
-static inline void ubd_finish(struct request *req, int bytes)
-{
-	if(bytes < 0){
-		ubd_end_request(req, 0, -EIO);
-		return;
-	}
-	ubd_end_request(req, bytes, 0);
-}
-
 static LIST_HEAD(restart);
 
 /* XXX - move this inside ubd_intr. */
@@ -475,7 +458,6 @@ static LIST_HEAD(restart);
 static void ubd_handler(void)
 {
 	struct io_thread_req *req;
-	struct request *rq;
 	struct ubd *ubd;
 	struct list_head *list, *next_ele;
 	unsigned long flags;
@@ -492,10 +474,7 @@ static void ubd_handler(void)
 			return;
 		}
 
-		rq = req->req;
-		rq->nr_sectors -= req->length >> 9;
-		if(rq->nr_sectors == 0)
-			ubd_finish(rq, rq->hard_nr_sectors << 9);
+		blk_end_request(req->req, 0, req->length);
 		kfree(req);
 	}
 	reactivate_fd(thread_fd, UBD_IRQ);
-- 
GitLab


From f81f2f7c9fee307e371f37424577d46f9eaf8692 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:10 +0900
Subject: [PATCH 0665/2198] ubd: drop unnecessary rq->sector manipulation

ubd curiously updates rq->sector while issuing the request in multiple
pieces.  Don't do it and simply use local copy of sector.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Jeff Dike <jdike@linux.intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/um/drivers/ubd_kern.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 36ca9fa89d05d..433012764a376 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1222,7 +1222,8 @@ static void do_ubd_request(struct request_queue *q)
 {
 	struct io_thread_req *io_req;
 	struct request *req;
-	int n, last_sectors;
+	sector_t sector;
+	int n;
 
 	while(1){
 		struct ubd *dev = q->queuedata;
@@ -1238,11 +1239,10 @@ static void do_ubd_request(struct request_queue *q)
 		}
 
 		req = dev->request;
-		last_sectors = 0;
+		sector = req->sector;
 		while(dev->start_sg < dev->end_sg){
 			struct scatterlist *sg = &dev->sg[dev->start_sg];
 
-			req->sector += last_sectors;
 			io_req = kmalloc(sizeof(struct io_thread_req),
 					 GFP_ATOMIC);
 			if(io_req == NULL){
@@ -1251,10 +1251,10 @@ static void do_ubd_request(struct request_queue *q)
 				return;
 			}
 			prepare_request(req, io_req,
-					(unsigned long long) req->sector << 9,
+					(unsigned long long)sector << 9,
 					sg->offset, sg->length, sg_page(sg));
 
-			last_sectors = sg->length >> 9;
+			sector += sg->length >> 9;
 			n = os_write_file(thread_fd, &io_req,
 					  sizeof(struct io_thread_req *));
 			if(n != sizeof(struct io_thread_req *)){
-- 
GitLab


From e091eb67af957bac4e4f7410c5d1aa263ee483a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:11 +0900
Subject: [PATCH 0666/2198] hd: clean up request completion paths

hd read/write_intr() functions manually manipulate request to
incrementally complete it, which block layer already supports.  Simply
use block layer completion routines instead of manual partial
completion.

While at it, clear unnecessary elv_next_request() check at the tail of
read_intr().  This also makes read and write_intr() more consistent.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/hd.c | 36 ++++++++++++------------------------
 1 file changed, 12 insertions(+), 24 deletions(-)

diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 5cb300b81c6ae..75b9ca95c4eb8 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -452,32 +452,25 @@ static void read_intr(void)
 	bad_rw_intr();
 	hd_request();
 	return;
+
 ok_to_read:
 	req = CURRENT;
 	insw(HD_DATA, req->buffer, 256);
-	req->sector++;
-	req->buffer += 512;
-	req->errors = 0;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
 #ifdef DEBUG
 	printk("%s: read: sector %ld, remaining = %ld, buffer=%p\n",
-		req->rq_disk->disk_name, req->sector, req->nr_sectors,
+		req->rq_disk->disk_name, req->sector + 1, req->nr_sectors - 1,
 		req->buffer+512);
 #endif
-	if (req->current_nr_sectors <= 0)
-		__blk_end_request_cur(req, 0);
-	if (i > 0) {
+	if (__blk_end_request(req, 0, 512)) {
 		SET_HANDLER(&read_intr);
 		return;
 	}
+
 	(void) inb_p(HD_STATUS);
 #if (HD_DELAY > 0)
 	last_req = read_timer();
 #endif
-	if (elv_next_request(QUEUE))
-		hd_request();
-	return;
+	hd_request();
 }
 
 static void write_intr(void)
@@ -499,23 +492,18 @@ static void write_intr(void)
 	bad_rw_intr();
 	hd_request();
 	return;
+
 ok_to_write:
-	req->sector++;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
-	req->buffer += 512;
-	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		__blk_end_request_cur(req, 0);
-	if (i > 0) {
+	if (__blk_end_request(req, 0, 512)) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
-	} else {
+		return;
+	}
+
 #if (HD_DELAY > 0)
-		last_req = read_timer();
+	last_req = read_timer();
 #endif
-		hd_request();
-	}
-	return;
+	hd_request();
 }
 
 static void recal_intr(void)
-- 
GitLab


From 467ca759fc83fc35cb7d15aec0d74c62cffc4481 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:12 +0900
Subject: [PATCH 0667/2198] swim3: clean up request completion paths

swim3 curiously tries to update request parameters before calling
__blk_end_request() when __blk_end_request() will do it anyway, and it
updates request for partial completion manually instead of using
blk_update_request().  Also, it does some spurious checks on rq such
as testing whether rq->sector is negative or current_nr_sectors is
zero right after fetching.

Drop unnecessary stuff and use standard block layer mechanisms.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/swim3.c | 31 +++++--------------------------
 1 file changed, 5 insertions(+), 26 deletions(-)

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 5904f7b73c6ee..424855945b9b3 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -319,14 +319,10 @@ static void start_request(struct floppy_state *fs)
 		       req->errors, req->current_nr_sectors);
 #endif
 
-		if (req->sector < 0 || req->sector >= fs->total_secs) {
+		if (req->sector >= fs->total_secs) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
-		if (req->current_nr_sectors == 0) {
-			__blk_end_request_cur(req, 0);
-			continue;
-		}
 		if (fs->ejected) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
@@ -593,8 +589,6 @@ static void xfer_timeout(unsigned long data)
 	struct floppy_state *fs = (struct floppy_state *) data;
 	struct swim3 __iomem *sw = fs->swim3;
 	struct dbdma_regs __iomem *dr = fs->dma;
-	struct dbdma_cmd *cp = fs->dma_cmd;
-	unsigned long s;
 	int n;
 
 	fs->timeout_pending = 0;
@@ -605,14 +599,6 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
-	if (rq_data_dir(fd_req) == WRITE)
-		++cp;
-	if (ld_le16(&cp->xfer_status) != 0)
-		s = fs->scount - ((ld_le16(&cp->res_count) + 511) >> 9);
-	else
-		s = 0;
-	fd_req->sector += s;
-	fd_req->current_nr_sectors -= s;
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
 	       (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector);
 	__blk_end_request_cur(fd_req, -EIO);
@@ -719,9 +705,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 		if (intr & ERROR_INTR) {
 			n = fs->scount - 1 - resid / 512;
 			if (n > 0) {
-				fd_req->sector += n;
-				fd_req->current_nr_sectors -= n;
-				fd_req->buffer += n * 512;
+				blk_update_request(fd_req, 0, n << 9);
 				fs->req_sector += n;
 			}
 			if (fs->retries < 5) {
@@ -745,13 +729,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				start_request(fs);
 				break;
 			}
-			fd_req->sector += fs->scount;
-			fd_req->current_nr_sectors -= fs->scount;
-			fd_req->buffer += fs->scount * 512;
-			if (fd_req->current_nr_sectors <= 0) {
-				__blk_end_request_cur(fd_req, 0);
-				fs->state = idle;
-			} else {
+			if (__blk_end_request(fd_req, 0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
@@ -761,7 +739,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 					}
 				}
 				act(fs);
-			}
+			} else
+				fs->state = idle;
 		}
 		if (fs->state == idle)
 			start_request(fs);
-- 
GitLab


From e138b4e08ef65771000fbe6d93d67e3960ff862b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:13 +0900
Subject: [PATCH 0668/2198] swim: clean up request completion paths

swim curiously tries to update request parameters before calling
__blk_end_request() when __blk_end_request() will do it anyway and
unnecessarily checks whether current_nr_sectors is zero right after
fetching.

Drop unnecessary stuff and use standard block layer mechanisms.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Laurent Vivier <Laurent@lvivier.info>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/swim.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 6544a7b06bf02..97ef4266c4c7e 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -535,10 +535,6 @@ static void redo_fd_request(struct request_queue *q)
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
-		if (req->current_nr_sectors == 0) {
-			__blk_end_request_cur(req, 0);
-			continue;
-		}
 		if (!fs->disk_in) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
@@ -561,9 +557,6 @@ static void redo_fd_request(struct request_queue *q)
 				__blk_end_request_cur(req, -EIO);
 				continue;
 			}
-			req->nr_sectors -= req->current_nr_sectors;
-			req->sector += req->current_nr_sectors;
-			req->buffer += req->current_nr_sectors * 512;
 			__blk_end_request_cur(req, 0);
 			break;
 		}
-- 
GitLab


From eec9462088a26c046d4db3100796a340a50890b8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:14 +0900
Subject: [PATCH 0669/2198] mg_disk: fold mg_disk.h into mg_disk.c

include/linux/mg_disk.h is used only by drivers/block/mg_disk.c.  No
reason to put it in a separate header.  Fold it into mg_disk.c.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/mg_disk.c | 186 +++++++++++++++++++++++++++++++++++-
 include/linux/mg_disk.h | 206 ----------------------------------------
 2 files changed, 185 insertions(+), 207 deletions(-)
 delete mode 100644 include/linux/mg_disk.h

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 2c00ad90cd62e..2c6127ee83432 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -22,10 +22,194 @@
 #include <linux/delay.h>
 #include <linux/platform_device.h>
 #include <linux/gpio.h>
-#include <linux/mg_disk.h>
 
 #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1)
 
+/* name for block device */
+#define MG_DISK_NAME "mgd"
+/* name for platform device */
+#define MG_DEV_NAME "mg_disk"
+
+#define MG_DISK_MAJ 0
+#define MG_DISK_MAX_PART 16
+#define MG_SECTOR_SIZE 512
+#define MG_MAX_SECTS 256
+
+/* Register offsets */
+#define MG_BUFF_OFFSET			0x8000
+#define MG_STORAGE_BUFFER_SIZE		0x200
+#define MG_REG_OFFSET			0xC000
+#define MG_REG_FEATURE			(MG_REG_OFFSET + 2)	/* write case */
+#define MG_REG_ERROR			(MG_REG_OFFSET + 2)	/* read case */
+#define MG_REG_SECT_CNT			(MG_REG_OFFSET + 4)
+#define MG_REG_SECT_NUM			(MG_REG_OFFSET + 6)
+#define MG_REG_CYL_LOW			(MG_REG_OFFSET + 8)
+#define MG_REG_CYL_HIGH			(MG_REG_OFFSET + 0xA)
+#define MG_REG_DRV_HEAD			(MG_REG_OFFSET + 0xC)
+#define MG_REG_COMMAND			(MG_REG_OFFSET + 0xE)	/* write case */
+#define MG_REG_STATUS			(MG_REG_OFFSET + 0xE)	/* read  case */
+#define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
+#define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
+
+/* "Drive Select/Head Register" bit values */
+#define MG_REG_HEAD_MUST_BE_ON		0xA0 /* These 2 bits are always on */
+#define MG_REG_HEAD_DRIVE_MASTER	(0x00 | MG_REG_HEAD_MUST_BE_ON)
+#define MG_REG_HEAD_DRIVE_SLAVE		(0x10 | MG_REG_HEAD_MUST_BE_ON)
+#define MG_REG_HEAD_LBA_MODE		(0x40 | MG_REG_HEAD_MUST_BE_ON)
+
+
+/* "Device Control Register" bit values */
+#define MG_REG_CTRL_INTR_ENABLE			0x0
+#define MG_REG_CTRL_INTR_DISABLE		(0x1<<1)
+#define MG_REG_CTRL_RESET			(0x1<<2)
+#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH	0x0
+#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW	(0x1<<4)
+#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW		0x0
+#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH	(0x1<<5)
+#define MG_REG_CTRL_DPD_DISABLE			0x0
+#define MG_REG_CTRL_DPD_ENABLE			(0x1<<6)
+
+/* Status register bit */
+/* error bit in status register */
+#define MG_REG_STATUS_BIT_ERROR			0x01
+/* corrected error in status register */
+#define MG_REG_STATUS_BIT_CORRECTED_ERROR	0x04
+/* data request bit in status register */
+#define MG_REG_STATUS_BIT_DATA_REQ		0x08
+/* DSC - Drive Seek Complete */
+#define MG_REG_STATUS_BIT_SEEK_DONE		0x10
+/* DWF - Drive Write Fault */
+#define MG_REG_STATUS_BIT_WRITE_FAULT		0x20
+#define MG_REG_STATUS_BIT_READY			0x40
+#define MG_REG_STATUS_BIT_BUSY			0x80
+
+/* handy status */
+#define MG_STAT_READY	(MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE)
+#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | \
+				(MG_REG_STATUS_BIT_BUSY | \
+				 MG_REG_STATUS_BIT_WRITE_FAULT | \
+				 MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY)
+
+/* Error register */
+#define MG_REG_ERR_AMNF		0x01
+#define MG_REG_ERR_ABRT		0x04
+#define MG_REG_ERR_IDNF		0x10
+#define MG_REG_ERR_UNC		0x40
+#define MG_REG_ERR_BBK		0x80
+
+/* error code for others */
+#define MG_ERR_NONE		0
+#define MG_ERR_TIMEOUT		0x100
+#define MG_ERR_INIT_STAT	0x101
+#define MG_ERR_TRANSLATION	0x102
+#define MG_ERR_CTRL_RST		0x103
+#define MG_ERR_INV_STAT		0x104
+#define MG_ERR_RSTOUT		0x105
+
+#define MG_MAX_ERRORS	6	/* Max read/write errors */
+
+/* command */
+#define MG_CMD_RD 0x20
+#define MG_CMD_WR 0x30
+#define MG_CMD_SLEEP 0x99
+#define MG_CMD_WAKEUP 0xC3
+#define MG_CMD_ID 0xEC
+#define MG_CMD_WR_CONF 0x3C
+#define MG_CMD_RD_CONF 0x40
+
+/* operation mode */
+#define MG_OP_CASCADE (1 << 0)
+#define MG_OP_CASCADE_SYNC_RD (1 << 1)
+#define MG_OP_CASCADE_SYNC_WR (1 << 2)
+#define MG_OP_INTERLEAVE (1 << 3)
+
+/* synchronous */
+#define MG_BURST_LAT_4 (3 << 4)
+#define MG_BURST_LAT_5 (4 << 4)
+#define MG_BURST_LAT_6 (5 << 4)
+#define MG_BURST_LAT_7 (6 << 4)
+#define MG_BURST_LAT_8 (7 << 4)
+#define MG_BURST_LEN_4 (1 << 1)
+#define MG_BURST_LEN_8 (2 << 1)
+#define MG_BURST_LEN_16 (3 << 1)
+#define MG_BURST_LEN_32 (4 << 1)
+#define MG_BURST_LEN_CONT (0 << 1)
+
+/* timeout value (unit: ms) */
+#define MG_TMAX_CONF_TO_CMD	1
+#define MG_TMAX_WAIT_RD_DRQ	10
+#define MG_TMAX_WAIT_WR_DRQ	500
+#define MG_TMAX_RST_TO_BUSY	10
+#define MG_TMAX_HDRST_TO_RDY	500
+#define MG_TMAX_SWRST_TO_RDY	500
+#define MG_TMAX_RSTOUT		3000
+
+/* device attribution */
+/* use mflash as boot device */
+#define MG_BOOT_DEV		(1 << 0)
+/* use mflash as storage device */
+#define MG_STORAGE_DEV		(1 << 1)
+/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
+#define MG_STORAGE_DEV_SKIP_RST	(1 << 2)
+
+#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
+
+/* names of GPIO resource */
+#define MG_RST_PIN	"mg_rst"
+/* except MG_BOOT_DEV, reset-out pin should be assigned */
+#define MG_RSTOUT_PIN	"mg_rstout"
+
+/* private driver data */
+struct mg_drv_data {
+	/* disk resource */
+	u32 use_polling;
+
+	/* device attribution */
+	u32 dev_attr;
+
+	/* internally used */
+	struct mg_host *host;
+};
+
+/* main structure for mflash driver */
+struct mg_host {
+	struct device *dev;
+
+	struct request_queue *breq;
+	spinlock_t lock;
+	struct gendisk *gd;
+
+	struct timer_list timer;
+	void (*mg_do_intr) (struct mg_host *);
+
+	u16 id[ATA_ID_WORDS];
+
+	u16 cyls;
+	u16 heads;
+	u16 sectors;
+	u32 n_sectors;
+	u32 nres_sectors;
+
+	void __iomem *dev_base;
+	unsigned int irq;
+	unsigned int rst;
+	unsigned int rstout;
+
+	u32 major;
+	u32 error;
+};
+
+/*
+ * Debugging macro and defines
+ */
+#undef DO_MG_DEBUG
+#ifdef DO_MG_DEBUG
+#  define MG_DBG(fmt, args...) \
+	printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
+#else /* CONFIG_MG_DEBUG */
+#  define MG_DBG(fmt, args...) do { } while (0)
+#endif /* CONFIG_MG_DEBUG */
+
 static void mg_request(struct request_queue *);
 
 static void mg_dump_status(const char *msg, unsigned int stat,
diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h
deleted file mode 100644
index 1f76b1ebf627a..0000000000000
--- a/include/linux/mg_disk.h
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- *  include/linux/mg_disk.c
- *
- *  Support for the mGine m[g]flash IO mode.
- *  Based on legacy hd.c
- *
- * (c) 2008 mGine Co.,LTD
- * (c) 2008 unsik Kim <donari75@gmail.com>
- *
- *  This program is free software; you can redistribute it and/or modify
- *  it under the terms of the GNU General Public License version 2 as
- *  published by the Free Software Foundation.
- */
-
-#ifndef __MG_DISK_H__
-#define __MG_DISK_H__
-
-#include <linux/blkdev.h>
-#include <linux/ata.h>
-
-/* name for block device */
-#define MG_DISK_NAME "mgd"
-/* name for platform device */
-#define MG_DEV_NAME "mg_disk"
-
-#define MG_DISK_MAJ 0
-#define MG_DISK_MAX_PART 16
-#define MG_SECTOR_SIZE 512
-#define MG_MAX_SECTS 256
-
-/* Register offsets */
-#define MG_BUFF_OFFSET			0x8000
-#define MG_STORAGE_BUFFER_SIZE		0x200
-#define MG_REG_OFFSET			0xC000
-#define MG_REG_FEATURE			(MG_REG_OFFSET + 2)	/* write case */
-#define MG_REG_ERROR			(MG_REG_OFFSET + 2)	/* read case */
-#define MG_REG_SECT_CNT			(MG_REG_OFFSET + 4)
-#define MG_REG_SECT_NUM			(MG_REG_OFFSET + 6)
-#define MG_REG_CYL_LOW			(MG_REG_OFFSET + 8)
-#define MG_REG_CYL_HIGH			(MG_REG_OFFSET + 0xA)
-#define MG_REG_DRV_HEAD			(MG_REG_OFFSET + 0xC)
-#define MG_REG_COMMAND			(MG_REG_OFFSET + 0xE)	/* write case */
-#define MG_REG_STATUS			(MG_REG_OFFSET + 0xE)	/* read  case */
-#define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
-#define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
-
-/* "Drive Select/Head Register" bit values */
-#define MG_REG_HEAD_MUST_BE_ON		0xA0 /* These 2 bits are always on */
-#define MG_REG_HEAD_DRIVE_MASTER	(0x00 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_DRIVE_SLAVE		(0x10 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_LBA_MODE		(0x40 | MG_REG_HEAD_MUST_BE_ON)
-
-
-/* "Device Control Register" bit values */
-#define MG_REG_CTRL_INTR_ENABLE			0x0
-#define MG_REG_CTRL_INTR_DISABLE		(0x1<<1)
-#define MG_REG_CTRL_RESET			(0x1<<2)
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH	0x0
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW	(0x1<<4)
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW		0x0
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH	(0x1<<5)
-#define MG_REG_CTRL_DPD_DISABLE			0x0
-#define MG_REG_CTRL_DPD_ENABLE			(0x1<<6)
-
-/* Status register bit */
-/* error bit in status register */
-#define MG_REG_STATUS_BIT_ERROR			0x01
-/* corrected error in status register */
-#define MG_REG_STATUS_BIT_CORRECTED_ERROR	0x04
-/* data request bit in status register */
-#define MG_REG_STATUS_BIT_DATA_REQ		0x08
-/* DSC - Drive Seek Complete */
-#define MG_REG_STATUS_BIT_SEEK_DONE		0x10
-/* DWF - Drive Write Fault */
-#define MG_REG_STATUS_BIT_WRITE_FAULT		0x20
-#define MG_REG_STATUS_BIT_READY			0x40
-#define MG_REG_STATUS_BIT_BUSY			0x80
-
-/* handy status */
-#define MG_STAT_READY	(MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE)
-#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | \
-				(MG_REG_STATUS_BIT_BUSY | \
-				 MG_REG_STATUS_BIT_WRITE_FAULT | \
-				 MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY)
-
-/* Error register */
-#define MG_REG_ERR_AMNF		0x01
-#define MG_REG_ERR_ABRT		0x04
-#define MG_REG_ERR_IDNF		0x10
-#define MG_REG_ERR_UNC		0x40
-#define MG_REG_ERR_BBK		0x80
-
-/* error code for others */
-#define MG_ERR_NONE		0
-#define MG_ERR_TIMEOUT		0x100
-#define MG_ERR_INIT_STAT	0x101
-#define MG_ERR_TRANSLATION	0x102
-#define MG_ERR_CTRL_RST		0x103
-#define MG_ERR_INV_STAT		0x104
-#define MG_ERR_RSTOUT		0x105
-
-#define MG_MAX_ERRORS	6	/* Max read/write errors */
-
-/* command */
-#define MG_CMD_RD 0x20
-#define MG_CMD_WR 0x30
-#define MG_CMD_SLEEP 0x99
-#define MG_CMD_WAKEUP 0xC3
-#define MG_CMD_ID 0xEC
-#define MG_CMD_WR_CONF 0x3C
-#define MG_CMD_RD_CONF 0x40
-
-/* operation mode */
-#define MG_OP_CASCADE (1 << 0)
-#define MG_OP_CASCADE_SYNC_RD (1 << 1)
-#define MG_OP_CASCADE_SYNC_WR (1 << 2)
-#define MG_OP_INTERLEAVE (1 << 3)
-
-/* synchronous */
-#define MG_BURST_LAT_4 (3 << 4)
-#define MG_BURST_LAT_5 (4 << 4)
-#define MG_BURST_LAT_6 (5 << 4)
-#define MG_BURST_LAT_7 (6 << 4)
-#define MG_BURST_LAT_8 (7 << 4)
-#define MG_BURST_LEN_4 (1 << 1)
-#define MG_BURST_LEN_8 (2 << 1)
-#define MG_BURST_LEN_16 (3 << 1)
-#define MG_BURST_LEN_32 (4 << 1)
-#define MG_BURST_LEN_CONT (0 << 1)
-
-/* timeout value (unit: ms) */
-#define MG_TMAX_CONF_TO_CMD	1
-#define MG_TMAX_WAIT_RD_DRQ	10
-#define MG_TMAX_WAIT_WR_DRQ	500
-#define MG_TMAX_RST_TO_BUSY	10
-#define MG_TMAX_HDRST_TO_RDY	500
-#define MG_TMAX_SWRST_TO_RDY	500
-#define MG_TMAX_RSTOUT		3000
-
-/* device attribution */
-/* use mflash as boot device */
-#define MG_BOOT_DEV		(1 << 0)
-/* use mflash as storage device */
-#define MG_STORAGE_DEV		(1 << 1)
-/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */
-#define MG_STORAGE_DEV_SKIP_RST	(1 << 2)
-
-#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST)
-
-/* names of GPIO resource */
-#define MG_RST_PIN	"mg_rst"
-/* except MG_BOOT_DEV, reset-out pin should be assigned */
-#define MG_RSTOUT_PIN	"mg_rstout"
-
-/* private driver data */
-struct mg_drv_data {
-	/* disk resource */
-	u32 use_polling;
-
-	/* device attribution */
-	u32 dev_attr;
-
-	/* internally used */
-	struct mg_host *host;
-};
-
-/* main structure for mflash driver */
-struct mg_host {
-	struct device *dev;
-
-	struct request_queue *breq;
-	spinlock_t lock;
-	struct gendisk *gd;
-
-	struct timer_list timer;
-	void (*mg_do_intr) (struct mg_host *);
-
-	u16 id[ATA_ID_WORDS];
-
-	u16 cyls;
-	u16 heads;
-	u16 sectors;
-	u32 n_sectors;
-	u32 nres_sectors;
-
-	void __iomem *dev_base;
-	unsigned int irq;
-	unsigned int rst;
-	unsigned int rstout;
-
-	u32 major;
-	u32 error;
-};
-
-/*
- * Debugging macro and defines
- */
-#undef DO_MG_DEBUG
-#ifdef DO_MG_DEBUG
-#  define MG_DBG(fmt, args...) \
-	printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args)
-#else /* CONFIG_MG_DEBUG */
-#  define MG_DBG(fmt, args...) do { } while (0)
-#endif /* CONFIG_MG_DEBUG */
-
-#endif
-- 
GitLab


From a03bb5a32fff4aa23f081a3cff7e98d4084104cd Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Apr 2009 13:06:15 +0900
Subject: [PATCH 0670/2198] mg_disk: clean up request completion paths

mg_disk implements its own partial completion.  Convert to standard
block layer partial completion.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/mg_disk.c | 117 ++++++++++++----------------------------
 1 file changed, 33 insertions(+), 84 deletions(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 2c6127ee83432..1a4cc968cfeed 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -503,95 +503,68 @@ static unsigned int mg_out(struct mg_host *host,
 
 static void mg_read(struct request *req)
 {
-	u32 remains, j;
+	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	remains = req->nr_sectors;
-
 	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) !=
 			MG_ERR_NONE)
 		mg_bad_rw_intr(host);
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-			remains, req->sector, req->buffer);
+	       req->nr_sectors, req->sector, req->buffer);
+
+	do {
+		u16 *buff = (u16 *)req->buffer;
 
-	while (remains) {
 		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
 					MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
-		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) {
-			*(u16 *)req->buffer =
-				inw((unsigned long)host->dev_base +
-						MG_BUFF_OFFSET + (j << 1));
-			req->buffer += 2;
-		}
-
-		req->sector++;
-		req->errors = 0;
-		remains = --req->nr_sectors;
-		--req->current_nr_sectors;
-
-		if (req->current_nr_sectors <= 0) {
-			MG_DBG("remain : %d sects\n", remains);
-			__blk_end_request_cur(req, 0);
-			if (remains > 0)
-				req = elv_next_request(host->breq);
-		}
+		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
+			*buff++ = inw((unsigned long)host->dev_base +
+				      MG_BUFF_OFFSET + (j << 1));
 
 		outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	}
+	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_write(struct request *req)
 {
-	u32 remains, j;
+	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	remains = req->nr_sectors;
-
 	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) !=
 			MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
 	}
 
-
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-			remains, req->sector, req->buffer);
-	while (remains) {
+	       req->nr_sectors, req->sector, req->buffer);
+
+	do {
+		u16 *buff = (u16 *)req->buffer;
+
 		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
 					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
-		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) {
-			outw(*(u16 *)req->buffer,
-					(unsigned long)host->dev_base +
-					MG_BUFF_OFFSET + (j << 1));
-			req->buffer += 2;
-		}
-		req->sector++;
-		remains = --req->nr_sectors;
-		--req->current_nr_sectors;
-
-		if (req->current_nr_sectors <= 0) {
-			MG_DBG("remain : %d sects\n", remains);
-			__blk_end_request_cur(req, 0);
-			if (remains > 0)
-				req = elv_next_request(host->breq);
-		}
+		for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
+			outw(*buff++, (unsigned long)host->dev_base +
+				      MG_BUFF_OFFSET + (j << 1));
 
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	}
+	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_read_intr(struct mg_host *host)
 {
 	u32 i;
+	u16 *buff;
 	struct request *req;
 
 	/* check status */
@@ -612,39 +585,24 @@ static void mg_read_intr(struct mg_host *host)
 ok_to_read:
 	/* get current segment of request */
 	req = elv_next_request(host->breq);
+	buff = (u16 *)req->buffer;
 
 	/* read 1 sector */
-	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) {
-		*(u16 *)req->buffer =
-			inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
-					(i << 1));
-		req->buffer += 2;
-	}
+	for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
+		*buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
+			      (i << 1));
 
-	/* manipulate request */
 	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
 			req->sector, req->nr_sectors - 1, req->buffer);
 
-	req->sector++;
-	req->errors = 0;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
-
-	/* let know if current segment done */
-	if (req->current_nr_sectors <= 0)
-		__blk_end_request_cur(req, 0);
-
-	/* set handler if read remains */
-	if (i > 0) {
-		host->mg_do_intr = mg_read_intr;
-		mod_timer(&host->timer, jiffies + 3 * HZ);
-	}
-
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
 
-	/* goto next request */
-	if (!i)
+	if (__blk_end_request(req, 0, MG_SECTOR_SIZE)) {
+		/* set handler if read remains */
+		host->mg_do_intr = mg_read_intr;
+		mod_timer(&host->timer, jiffies + 3 * HZ);
+	} else /* goto next request */
 		mg_request(host->breq);
 }
 
@@ -653,6 +611,7 @@ static void mg_write_intr(struct mg_host *host)
 	u32 i, j;
 	u16 *buff;
 	struct request *req;
+	bool rem;
 
 	/* get current segment of request */
 	req = elv_next_request(host->breq);
@@ -673,18 +632,8 @@ static void mg_write_intr(struct mg_host *host)
 	return;
 
 ok_to_write:
-	/* manipulate request */
-	req->sector++;
-	i = --req->nr_sectors;
-	--req->current_nr_sectors;
-	req->buffer += MG_SECTOR_SIZE;
-
-	/* let know if current segment or all done */
-	if (!i || (req->bio && req->current_nr_sectors <= 0))
-		__blk_end_request_cur(req, 0);
-
-	/* write 1 sector and set handler if remains */
-	if (i > 0) {
+	if ((rem = __blk_end_request(req, 0, MG_SECTOR_SIZE))) {
+		/* write 1 sector and set handler if remains */
 		buff = (u16 *)req->buffer;
 		for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) {
 			outw(*buff, (unsigned long)host->dev_base +
@@ -700,7 +649,7 @@ static void mg_write_intr(struct mg_host *host)
 	/* send write confirm */
 	outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
 
-	if (!i)
+	if (!rem)
 		mg_request(host->breq);
 }
 
-- 
GitLab


From 8a11a789c39cd6f61de4243234cf44a46c23ffd0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Apr 2009 13:06:16 +0900
Subject: [PATCH 0671/2198] mg_disk: fix dependency on libata

Add local copies of ata_id_string() and ata_id_c_string() to mg_disk
so there is no need for the driver to depend on ATA and SCSI.

[ Impact: break dependency on libata by copying ata id string functions ]

Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/Kconfig   |  2 +-
 drivers/block/mg_disk.c | 44 +++++++++++++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 5 deletions(-)

diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index ddea8e485cc94..f42fa50d35506 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -412,7 +412,7 @@ config ATA_OVER_ETH
 
 config MG_DISK
 	tristate "mGine mflash, gflash support"
-	depends on ARM && ATA && GPIOLIB
+	depends on ARM && GPIOLIB
 	help
 	  mGine mFlash(gFlash) block device driver
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 1a4cc968cfeed..8e53cae9fa904 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -17,7 +17,7 @@
 #include <linux/fs.h>
 #include <linux/blkdev.h>
 #include <linux/hdreg.h>
-#include <linux/libata.h>
+#include <linux/ata.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
 #include <linux/platform_device.h>
@@ -357,6 +357,42 @@ static irqreturn_t mg_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
+/* local copy of ata_id_string() */
+static void mg_id_string(const u16 *id, unsigned char *s,
+			 unsigned int ofs, unsigned int len)
+{
+	unsigned int c;
+
+	BUG_ON(len & 1);
+
+	while (len > 0) {
+		c = id[ofs] >> 8;
+		*s = c;
+		s++;
+
+		c = id[ofs] & 0xff;
+		*s = c;
+		s++;
+
+		ofs++;
+		len -= 2;
+	}
+}
+
+/* local copy of ata_id_c_string() */
+static void mg_id_c_string(const u16 *id, unsigned char *s,
+			   unsigned int ofs, unsigned int len)
+{
+	unsigned char *p;
+
+	mg_id_string(id, s, ofs, len - 1);
+
+	p = s + strnlen(s, len - 1);
+	while (p > s && p[-1] == ' ')
+		p--;
+	*p = '\0';
+}
+
 static int mg_get_disk_id(struct mg_host *host)
 {
 	u32 i;
@@ -403,9 +439,9 @@ static int mg_get_disk_id(struct mg_host *host)
 		host->n_sectors -= host->nres_sectors;
 	}
 
-	ata_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
-	ata_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
-	ata_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
+	mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev));
+	mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model));
+	mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial));
 	printk(KERN_INFO "mg_disk: model: %s\n", model);
 	printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev);
 	printk(KERN_INFO "mg_disk: serial: %s\n", serial);
-- 
GitLab


From f68adec3c7155a8bbf32a90cb4c4d0737df045d9 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Tue, 28 Apr 2009 13:06:17 +0900
Subject: [PATCH 0672/2198] mg_disk: use defines from <linux/ata.h>

While at it:
- remove MG_REG_HEAD_MUST_BE_ON define
- remove MG_REG_CTRL_INTR_ENABLE define
- remove MG_REG_HEAD_LBA_MODE define
- remove unused defines

Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/mg_disk.c | 144 ++++++++++++----------------------------
 1 file changed, 43 insertions(+), 101 deletions(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 8e53cae9fa904..71e56cc28cac3 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -51,51 +51,10 @@
 #define MG_REG_DRV_CTRL			(MG_REG_OFFSET + 0x10)
 #define MG_REG_BURST_CTRL		(MG_REG_OFFSET + 0x12)
 
-/* "Drive Select/Head Register" bit values */
-#define MG_REG_HEAD_MUST_BE_ON		0xA0 /* These 2 bits are always on */
-#define MG_REG_HEAD_DRIVE_MASTER	(0x00 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_DRIVE_SLAVE		(0x10 | MG_REG_HEAD_MUST_BE_ON)
-#define MG_REG_HEAD_LBA_MODE		(0x40 | MG_REG_HEAD_MUST_BE_ON)
-
-
-/* "Device Control Register" bit values */
-#define MG_REG_CTRL_INTR_ENABLE			0x0
-#define MG_REG_CTRL_INTR_DISABLE		(0x1<<1)
-#define MG_REG_CTRL_RESET			(0x1<<2)
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH	0x0
-#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW	(0x1<<4)
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW		0x0
-#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH	(0x1<<5)
-#define MG_REG_CTRL_DPD_DISABLE			0x0
-#define MG_REG_CTRL_DPD_ENABLE			(0x1<<6)
-
-/* Status register bit */
-/* error bit in status register */
-#define MG_REG_STATUS_BIT_ERROR			0x01
-/* corrected error in status register */
-#define MG_REG_STATUS_BIT_CORRECTED_ERROR	0x04
-/* data request bit in status register */
-#define MG_REG_STATUS_BIT_DATA_REQ		0x08
-/* DSC - Drive Seek Complete */
-#define MG_REG_STATUS_BIT_SEEK_DONE		0x10
-/* DWF - Drive Write Fault */
-#define MG_REG_STATUS_BIT_WRITE_FAULT		0x20
-#define MG_REG_STATUS_BIT_READY			0x40
-#define MG_REG_STATUS_BIT_BUSY			0x80
-
 /* handy status */
-#define MG_STAT_READY	(MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE)
-#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | \
-				(MG_REG_STATUS_BIT_BUSY | \
-				 MG_REG_STATUS_BIT_WRITE_FAULT | \
-				 MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY)
-
-/* Error register */
-#define MG_REG_ERR_AMNF		0x01
-#define MG_REG_ERR_ABRT		0x04
-#define MG_REG_ERR_IDNF		0x10
-#define MG_REG_ERR_UNC		0x40
-#define MG_REG_ERR_BBK		0x80
+#define MG_STAT_READY	(ATA_DRDY | ATA_DSC)
+#define MG_READY_OK(s)	(((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \
+				 ATA_ERR))) == MG_STAT_READY)
 
 /* error code for others */
 #define MG_ERR_NONE		0
@@ -225,41 +184,39 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 	}
 
 	printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
-	if (stat & MG_REG_STATUS_BIT_BUSY)
+	if (stat & ATA_BUSY)
 		printk("Busy ");
-	if (stat & MG_REG_STATUS_BIT_READY)
+	if (stat & ATA_DRDY)
 		printk("DriveReady ");
-	if (stat & MG_REG_STATUS_BIT_WRITE_FAULT)
+	if (stat & ATA_DF)
 		printk("WriteFault ");
-	if (stat & MG_REG_STATUS_BIT_SEEK_DONE)
+	if (stat & ATA_DSC)
 		printk("SeekComplete ");
-	if (stat & MG_REG_STATUS_BIT_DATA_REQ)
+	if (stat & ATA_DRQ)
 		printk("DataRequest ");
-	if (stat & MG_REG_STATUS_BIT_CORRECTED_ERROR)
+	if (stat & ATA_CORR)
 		printk("CorrectedError ");
-	if (stat & MG_REG_STATUS_BIT_ERROR)
+	if (stat & ATA_ERR)
 		printk("Error ");
 	printk("}\n");
-	if ((stat & MG_REG_STATUS_BIT_ERROR) == 0) {
+	if ((stat & ATA_ERR) == 0) {
 		host->error = 0;
 	} else {
 		host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR);
 		printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg,
 				host->error & 0xff);
-		if (host->error & MG_REG_ERR_BBK)
+		if (host->error & ATA_BBK)
 			printk("BadSector ");
-		if (host->error & MG_REG_ERR_UNC)
+		if (host->error & ATA_UNC)
 			printk("UncorrectableError ");
-		if (host->error & MG_REG_ERR_IDNF)
+		if (host->error & ATA_IDNF)
 			printk("SectorIdNotFound ");
-		if (host->error & MG_REG_ERR_ABRT)
+		if (host->error & ATA_ABORTED)
 			printk("DriveStatusError ");
-		if (host->error & MG_REG_ERR_AMNF)
+		if (host->error & ATA_AMNF)
 			printk("AddrMarkNotFound ");
 		printk("}");
-		if (host->error &
-				(MG_REG_ERR_BBK | MG_REG_ERR_UNC |
-				 MG_REG_ERR_IDNF | MG_REG_ERR_AMNF)) {
+		if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
 			if (host->breq) {
 				req = elv_next_request(host->breq);
 				if (req)
@@ -284,12 +241,12 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
 
 	do {
 		cur_jiffies = jiffies;
-		if (status & MG_REG_STATUS_BIT_BUSY) {
-			if (expect == MG_REG_STATUS_BIT_BUSY)
+		if (status & ATA_BUSY) {
+			if (expect == ATA_BUSY)
 				break;
 		} else {
 			/* Check the error condition! */
-			if (status & MG_REG_STATUS_BIT_ERROR) {
+			if (status & ATA_ERR) {
 				mg_dump_status("mg_wait", status, host);
 				break;
 			}
@@ -298,8 +255,8 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
 				if (MG_READY_OK(status))
 					break;
 
-			if (expect == MG_REG_STATUS_BIT_DATA_REQ)
-				if (status & MG_REG_STATUS_BIT_DATA_REQ)
+			if (expect == ATA_DRQ)
+				if (status & ATA_DRQ)
 					break;
 		}
 		if (!msec) {
@@ -404,12 +361,10 @@ static int mg_get_disk_id(struct mg_host *host)
 	char serial[ATA_ID_SERNO_LEN + 1];
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_DISABLE,
-				(unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND);
-	err = mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_RD_DRQ);
+	err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ);
 	if (err)
 		return err;
 
@@ -449,8 +404,7 @@ static int mg_get_disk_id(struct mg_host *host)
 			host->n_sectors, host->nres_sectors);
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	return err;
 }
@@ -464,7 +418,7 @@ static int mg_disk_init(struct mg_host *host)
 
 	/* hdd rst low */
 	gpio_set_value(host->rst, 0);
-	err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY);
+	err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
 	if (err)
 		return err;
 
@@ -475,17 +429,14 @@ static int mg_disk_init(struct mg_host *host)
 		return err;
 
 	/* soft reset on */
-	outb(MG_REG_CTRL_RESET |
-			(prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE :
-			 MG_REG_CTRL_INTR_ENABLE),
+	outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0),
 			(unsigned long)host->dev_base + MG_REG_DRV_CTRL);
-	err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY);
+	err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY);
 	if (err)
 		return err;
 
 	/* soft reset off */
-	outb(prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE :
-			MG_REG_CTRL_INTR_ENABLE,
+	outb(prv_data->use_polling ? ATA_NIEN : 0,
 			(unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 	err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY);
 	if (err)
@@ -531,7 +482,7 @@ static unsigned int mg_out(struct mg_host *host,
 			MG_REG_CYL_LOW);
 	outb((u8)(sect_num >> 16), (unsigned long)host->dev_base +
 			MG_REG_CYL_HIGH);
-	outb((u8)((sect_num >> 24) | MG_REG_HEAD_LBA_MODE),
+	outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS),
 			(unsigned long)host->dev_base + MG_REG_DRV_HEAD);
 	outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND);
 	return MG_ERR_NONE;
@@ -552,8 +503,8 @@ static void mg_read(struct request *req)
 	do {
 		u16 *buff = (u16 *)req->buffer;
 
-		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
-					MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
+		if (mg_wait(host, ATA_DRQ,
+			    MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
@@ -583,8 +534,7 @@ static void mg_write(struct request *req)
 	do {
 		u16 *buff = (u16 *)req->buffer;
 
-		if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ,
-					MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
+	if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return;
 		}
@@ -606,11 +556,11 @@ static void mg_read_intr(struct mg_host *host)
 	/* check status */
 	do {
 		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		if (i & MG_REG_STATUS_BIT_BUSY)
+		if (i & ATA_BUSY)
 			break;
 		if (!MG_READY_OK(i))
 			break;
-		if (i & MG_REG_STATUS_BIT_DATA_REQ)
+		if (i & ATA_DRQ)
 			goto ok_to_read;
 	} while (0);
 	mg_dump_status("mg_read_intr", i, host);
@@ -655,11 +605,11 @@ static void mg_write_intr(struct mg_host *host)
 	/* check status */
 	do {
 		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
-		if (i & MG_REG_STATUS_BIT_BUSY)
+		if (i & ATA_BUSY)
 			break;
 		if (!MG_READY_OK(i))
 			break;
-		if ((req->nr_sectors <= 1) || (i & MG_REG_STATUS_BIT_DATA_REQ))
+		if ((req->nr_sectors <= 1) || (i & ATA_DRQ))
 			goto ok_to_write;
 	} while (0);
 	mg_dump_status("mg_write_intr", i, host);
@@ -752,18 +702,15 @@ static unsigned int mg_issue_req(struct request *req,
 		break;
 	case WRITE:
 		/* TODO : handler */
-		outb(MG_REG_CTRL_INTR_DISABLE,
-				(unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr)
 				!= MG_ERR_NONE) {
 			mg_bad_rw_intr(host);
 			return host->error;
 		}
 		del_timer(&host->timer);
-		mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_WR_DRQ);
-		outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ);
+		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		if (host->error) {
 			mg_bad_rw_intr(host);
 			return host->error;
@@ -849,9 +796,7 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
 		return -EIO;
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_DISABLE,
-				(unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND);
 	/* wait until mflash deep sleep */
@@ -859,9 +804,7 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state)
 
 	if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) {
 		if (!prv_data->use_polling)
-			outb(MG_REG_CTRL_INTR_ENABLE,
-					(unsigned long)host->dev_base +
-					MG_REG_DRV_CTRL);
+			outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 		return -EIO;
 	}
 
@@ -884,8 +827,7 @@ static int mg_resume(struct platform_device *plat_dev)
 		return -EIO;
 
 	if (!prv_data->use_polling)
-		outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base +
-				MG_REG_DRV_CTRL);
+		outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL);
 
 	return 0;
 }
-- 
GitLab


From 5b644c7a218702668d7b610994e7dcbc3d4705d3 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 28 Apr 2009 08:17:54 +0000
Subject: [PATCH 0673/2198] clocksource: improve sh_cmt clocksource overflow
 handling

This patch improves the sh_cmt clocksource handling.

Currently the counter value is ignored in the case of
overflow. With this patch the overflow flag is read
before and after reading the counter, removing any
counter value and overflow flag mismatch issues.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_cmt.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index c24756489612e..d607ac2d516bd 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -111,16 +111,21 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p,
 					int *has_wrapped)
 {
 	unsigned long v1, v2, v3;
+	int o1, o2;
+
+	o1 = sh_cmt_read(p, CMCSR) & p->overflow_bit;
 
 	/* Make sure the timer value is stable. Stolen from acpi_pm.c */
 	do {
+		o2 = o1;
 		v1 = sh_cmt_read(p, CMCNT);
 		v2 = sh_cmt_read(p, CMCNT);
 		v3 = sh_cmt_read(p, CMCNT);
-	} while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
-			  || (v3 > v1 && v3 < v2)));
+		o1 = sh_cmt_read(p, CMCSR) & p->overflow_bit;
+	} while (unlikely((o1 != o2) || (v1 > v2 && v1 < v3)
+			  || (v2 > v3 && v2 < v1) || (v3 > v1 && v3 < v2)));
 
-	*has_wrapped = sh_cmt_read(p, CMCSR) & p->overflow_bit;
+	*has_wrapped = o1;
 	return v2;
 }
 
@@ -394,7 +399,7 @@ static cycle_t sh_cmt_clocksource_read(struct clocksource *cs)
 	raw = sh_cmt_get_counter(p, &has_wrapped);
 
 	if (unlikely(has_wrapped))
-		raw = p->match_value;
+		raw += p->match_value;
 	spin_unlock_irqrestore(&p->lock, flags);
 
 	return value + raw;
-- 
GitLab


From 8e0b842948156e3463879caed12b4ce51bed772e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 28 Apr 2009 08:19:50 +0000
Subject: [PATCH 0674/2198] sh: setup timers in late_time_init()

This patch moves the SuperH timer setup code from time_init()
to late_time_init(). Good things about this change:
 - interrupts: they are enabled at late_time_init()
 - mm: regular kmalloc() can be used at late_time_init()

Together with moving to late_time_init() this patch changes
the sh_cmt driver to always allocate with kmalloc(). This
simplifies the code a bit and also fixes section mismatches.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/time_32.c     | 34 ++++++++++++++++++++--------------
 drivers/clocksource/sh_cmt.c | 13 ++-----------
 2 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index 109409f5ca53f..a2458bfdda266 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -237,21 +237,8 @@ unsigned long long sched_clock(void)
 }
 #endif
 
-void __init time_init(void)
+static void __init sh_late_time_init(void)
 {
-	if (board_time_init)
-		board_time_init();
-
-	clk_init();
-
-	rtc_sh_get_time(&xtime);
-	set_normalized_timespec(&wall_to_monotonic,
-				-xtime.tv_sec, -xtime.tv_nsec);
-
-#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
-	local_timer_setup(smp_processor_id());
-#endif
-
 	/*
 	 * Make sure all compiled-in early timers register themselves.
 	 * Run probe() for one "earlytimer" device.
@@ -270,3 +257,22 @@ void __init time_init(void)
 
 	printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
 }
+
+void __init time_init(void)
+{
+	if (board_time_init)
+		board_time_init();
+
+	clk_init();
+
+	rtc_sh_get_time(&xtime);
+	set_normalized_timespec(&wall_to_monotonic,
+				-xtime.tv_sec, -xtime.tv_nsec);
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
+	local_timer_setup(smp_processor_id());
+#endif
+
+	late_time_init = sh_late_time_init;
+}
+
diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index d607ac2d516bd..bf3e4c11fd37a 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -18,7 +18,6 @@
  */
 
 #include <linux/init.h>
-#include <linux/bootmem.h>
 #include <linux/platform_device.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
@@ -645,11 +644,7 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 		return 0;
 	}
 
-	if (is_early_platform_device(pdev))
-		p = alloc_bootmem(sizeof(*p));
-	else
-		p = kmalloc(sizeof(*p), GFP_KERNEL);
-
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
 	if (p == NULL) {
 		dev_err(&pdev->dev, "failed to allocate driver data\n");
 		return -ENOMEM;
@@ -657,11 +652,7 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev)
 
 	ret = sh_cmt_setup(p, pdev);
 	if (ret) {
-		if (is_early_platform_device(pdev))
-			free_bootmem(__pa(p), sizeof(*p));
-		else
-			kfree(p);
-
+		kfree(p);
 		platform_set_drvdata(pdev, NULL);
 	}
 	return ret;
-- 
GitLab


From 9ec4fa271faf2db3b8e1419c998da1ca6b094eb6 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:57:18 -0700
Subject: [PATCH 0675/2198] irq, cpumask: correct CPUMASKS_OFFSTACK typo and
 fix fallout

CPUMASKS_OFFSTACK is not defined anywhere (it is CPUMASK_OFFSTACK).
It is a typo and init_allocate_desc_masks() is called before it set
affinity to all cpus...

Split init_alloc_desc_masks() into all_desc_masks() and init_desc_masks().

Also use CPUMASK_OFFSTACK in alloc_desc_masks().

[ Impact: fix smp_affinity copying/setup when moving irq_desc between CPUs ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
LKML-Reference: <49F6546E.3040406@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h       | 27 ++++++++++++++++++---------
 kernel/irq/handle.c       |  9 ++++++---
 kernel/irq/numa_migrate.c |  2 +-
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/include/linux/irq.h b/include/linux/irq.h
index b7cbeed972e42..c4953cf27e5e2 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -424,27 +424,25 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #ifdef CONFIG_SMP
 /**
- * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * alloc_desc_masks - allocate cpumasks for irq_desc
  * @desc:	pointer to irq_desc struct
  * @cpu:	cpu which will be handling the cpumasks
  * @boot:	true if need bootmem
  *
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
- * Side effect: affinity has all bits set, pending_mask has all bits clear.
  */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
+#ifdef CONFIG_CPUMASK_OFFSTACK
 	int node;
 
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
-		cpumask_setall(desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 		alloc_bootmem_cpumask_var(&desc->pending_mask);
-		cpumask_clear(desc->pending_mask);
 #endif
 		return true;
 	}
@@ -453,18 +451,25 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
-	cpumask_setall(desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
 		free_cpumask_var(desc->affinity);
 		return false;
 	}
-	cpumask_clear(desc->pending_mask);
+#endif
 #endif
 	return true;
 }
 
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+	cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_clear(desc->pending_mask);
+#endif
+}
+
 /**
  * init_copy_desc_masks - copy cpumasks for irq_desc
  * @old_desc:	pointer to old irq_desc struct
@@ -478,7 +483,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 					struct irq_desc *new_desc)
 {
-#ifdef CONFIG_CPUMASKS_OFFSTACK
+#ifdef CONFIG_CPUMASK_OFFSTACK
 	cpumask_copy(new_desc->affinity, old_desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -499,12 +504,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
 	return true;
 }
 
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+}
+
 static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 					struct irq_desc *new_desc)
 {
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d82142be8dd2c..882c798001072 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -115,10 +115,11 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
+	init_desc_masks(desc);
 	arch_init_chip_data(desc, cpu);
 }
 
@@ -169,7 +170,8 @@ int __init early_irq_init(void)
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-		init_alloc_desc_masks(&desc[i], 0, true);
+		alloc_desc_masks(&desc[i], 0, true);
+		init_desc_masks(&desc[i]);
 		irq_desc_ptrs[i] = desc + i;
 	}
 
@@ -256,7 +258,8 @@ int __init early_irq_init(void)
 
 	for (i = 0; i < count; i++) {
 		desc[i].irq = i;
-		init_alloc_desc_masks(&desc[i], 0, true);
+		alloc_desc_masks(&desc[i], 0, true);
+		init_desc_masks(&desc[i]);
 		desc[i].kstat_irqs = kstat_irqs_all[i];
 	}
 	return arch_early_irq_init();
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 44bbdcbaf8d2c..5760d72516264 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -37,7 +37,7 @@ static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 		 struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
-	if (!init_alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
 				"for migration.\n", irq);
 		return false;
-- 
GitLab


From fcef5911c7ea89b80d5bfc727f402f37c9eefd57 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:58:23 -0700
Subject: [PATCH 0676/2198] x86/irq: remove leftover code from
 NUMA_MIGRATE_IRQ_DESC

The original feature of migrating irq_desc dynamic was too fragile
and was causing problems: it caused crashes on systems with lots of
cards with MSI-X when user-space irq-balancer was enabled.

We now have new patches that create irq_desc according to device
numa node. This patch removes the leftover bits of the dynamic balancer.

[ Impact: remove dead code ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F654AF.8000808@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig                  | 10 ------
 arch/x86/configs/x86_64_defconfig |  1 -
 arch/x86/kernel/apic/io_apic.c    | 56 +++----------------------------
 include/linux/irq.h               | 10 ------
 kernel/irq/Makefile               |  2 +-
 kernel/irq/chip.c                 | 12 ++-----
 kernel/irq/handle.c               |  9 ++---
 kernel/irq/numa_migrate.c         |  2 --
 8 files changed, 9 insertions(+), 93 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index df9e885eee143..e1b2543f8ed3f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -274,16 +274,6 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say N.
 
-config NUMA_MIGRATE_IRQ_DESC
-	bool "Move irq desc when changing irq smp_affinity"
-	depends on SPARSE_IRQ && NUMA
-	depends on BROKEN
-	default n
-	---help---
-	  This enables moving irq_desc to cpu/node that irq will use handled.
-
-	  If you don't know what to do here, say N.
-
 config X86_MPPARSE
 	bool "Enable MPS table" if ACPI
 	default y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4cc..27b8ce0f59086 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -195,7 +195,6 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
-# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
 CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
 # CONFIG_X86_ELAN is not set
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 30da617d18e47..9fbf0f7ec7ebe 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -148,9 +148,6 @@ struct irq_cfg {
 	unsigned move_cleanup_count;
 	u8 vector;
 	u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-	u8 move_desc_pending : 1;
-#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -254,8 +251,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
 	return 0;
 }
 
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
 static void
 init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 {
@@ -356,19 +352,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
 		old_desc->chip_data = NULL;
 	}
 }
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-	struct irq_cfg *cfg = desc->chip_data;
-
-	if (!cfg->move_in_progress) {
-		/* it means that domain is not changed */
-		if (!cpumask_intersects(desc->affinity, mask))
-			cfg->move_desc_pending = 1;
-	}
-}
-#endif
+/* end for move_irq_desc */
 
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +362,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 
 #endif
 
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
 struct io_apic {
 	unsigned int index;
 	unsigned int unused[3];
@@ -592,9 +569,6 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	/* check that before desc->addinity get updated */
-	set_extra_move_desc(desc, mask);
-
 	cpumask_copy(desc->affinity, mask);
 
 	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
@@ -2393,8 +2367,6 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return;
 
-	set_extra_move_desc(desc, mask);
-
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
 	irte.vector = cfg->vector;
@@ -2491,34 +2463,14 @@ static void irq_complete_move(struct irq_desc **descp)
 	struct irq_cfg *cfg = desc->chip_data;
 	unsigned vector, me;
 
-	if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-		if (likely(!cfg->move_desc_pending))
-			return;
-
-		/* domain has not changed, but affinity did */
-		me = smp_processor_id();
-		if (cpumask_test_cpu(me, desc->affinity)) {
-			*descp = desc = move_irq_desc(desc, me);
-			/* get the new one */
-			cfg = desc->chip_data;
-			cfg->move_desc_pending = 0;
-		}
-#endif
+	if (likely(!cfg->move_in_progress))
 		return;
-	}
 
 	vector = ~get_irq_regs()->orig_ax;
 	me = smp_processor_id();
 
-	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-		*descp = desc = move_irq_desc(desc, me);
-		/* get the new one */
-		cfg = desc->chip_data;
-#endif
+	if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
 		send_cleanup_vector(cfg);
-	}
 }
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index c4953cf27e5e2..2a34cd6281d7e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -212,16 +212,6 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
 
 extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
 
-static inline struct irq_desc *
-irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
-{
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-	return irq_to_desc(irq);
-#else
-	return desc;
-#endif
-}
-
 /*
  * Migration helpers for obsolete names, they will go away:
  */
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 3394f8f52964b..2f065277f8ee2 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c687ba4363f2b..13c68e71b726c 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
 	spin_lock(&desc->lock);
 	mask_ack_irq(desc, irq);
-	desc = irq_remap_to_desc(irq, desc);
 
 	if (unlikely(desc->status & IRQ_INPROGRESS))
 		goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
 	desc->status &= ~IRQ_INPROGRESS;
 out:
 	desc->chip->eoi(irq);
-	desc = irq_remap_to_desc(irq, desc);
 
 	spin_unlock(&desc->lock);
 }
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 		    !desc->action)) {
 		desc->status |= (IRQ_PENDING | IRQ_MASKED);
 		mask_ack_irq(desc, irq);
-		desc = irq_remap_to_desc(irq, desc);
 		goto out_unlock;
 	}
 	kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	/* Start handling the irq */
 	if (desc->chip->ack)
 		desc->chip->ack(irq);
-	desc = irq_remap_to_desc(irq, desc);
 
 	/* Mark the IRQ currently in progress.*/
 	desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
 	if (!noirqdebug)
 		note_interrupt(irq, desc, action_ret);
 
-	if (desc->chip->eoi) {
+	if (desc->chip->eoi)
 		desc->chip->eoi(irq);
-		desc = irq_remap_to_desc(irq, desc);
-	}
 }
 
 void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 
 	/* Uninstall? */
 	if (handle == handle_bad_irq) {
-		if (desc->chip != &no_irq_chip) {
+		if (desc->chip != &no_irq_chip)
 			mask_ack_irq(desc, irq);
-			desc = irq_remap_to_desc(irq, desc);
-		}
 		desc->status |= IRQ_DISABLED;
 		desc->depth = 1;
 	}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 882c798001072..3e0cbc44bd73a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -458,11 +458,8 @@ unsigned int __do_IRQ(unsigned int irq)
 		/*
 		 * No locking required for CPU-local interrupts:
 		 */
-		if (desc->chip->ack) {
+		if (desc->chip->ack)
 			desc->chip->ack(irq);
-			/* get new one */
-			desc = irq_remap_to_desc(irq, desc);
-		}
 		if (likely(!(desc->status & IRQ_DISABLED))) {
 			action_ret = handle_IRQ_event(irq, desc->action);
 			if (!noirqdebug)
@@ -473,10 +470,8 @@ unsigned int __do_IRQ(unsigned int irq)
 	}
 
 	spin_lock(&desc->lock);
-	if (desc->chip->ack) {
+	if (desc->chip->ack)
 		desc->chip->ack(irq);
-		desc = irq_remap_to_desc(irq, desc);
-	}
 	/*
 	 * REPLAY is when Linux resends an IRQ that was dropped earlier
 	 * WAITING is used by probe to mark irqs that are being tested
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 5760d72516264..ce72bc3f4ced3 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -97,9 +97,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 
 	/* free the old one */
 	free_one_irq_desc(old_desc, desc);
-	spin_unlock(&old_desc->lock);
 	kfree(old_desc);
-	spin_lock(&desc->lock);
 
 	return desc;
 
-- 
GitLab


From d5dedd4507d307eb3f35f21b6e16f336fdc0d82a Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:59:21 -0700
Subject: [PATCH 0677/2198] irq: change ->set_affinity() to return status

according to Ingo, change set_affinity() in irq_chip should return int,
because that way we can handle failure cases in a much cleaner way, in
the genirq layer.

v2: fix two typos

[ Impact: extend API ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-arch@vger.kernel.org
LKML-Reference: <49F654E9.4070809@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/alpha/kernel/sys_dp264.c         |  8 +++-
 arch/alpha/kernel/sys_titan.c         |  4 +-
 arch/arm/common/gic.c                 |  4 +-
 arch/cris/arch-v32/kernel/irq.c       |  4 +-
 arch/ia64/hp/sim/hpsim_irq.c          |  3 +-
 arch/ia64/kernel/iosapic.c            | 10 +++--
 arch/ia64/kernel/msi_ia64.c           | 16 ++++---
 arch/ia64/sn/kernel/irq.c             |  4 +-
 arch/ia64/sn/kernel/msi_sn.c          |  8 ++--
 arch/mips/cavium-octeon/octeon-irq.c  |  8 +++-
 arch/mips/include/asm/irq.h           |  2 +-
 arch/mips/kernel/irq-gic.c            |  5 ++-
 arch/mips/mti-malta/malta-smtc.c      |  4 +-
 arch/mips/sibyte/bcm1480/irq.c        |  8 ++--
 arch/mips/sibyte/sb1250/irq.c         |  8 ++--
 arch/parisc/kernel/irq.c              |  6 ++-
 arch/powerpc/platforms/pseries/xics.c | 12 ++---
 arch/powerpc/sysdev/mpic.c            |  4 +-
 arch/sparc/kernel/irq_64.c            | 12 +++--
 arch/x86/kernel/apic/io_apic.c        | 64 +++++++++++++++++----------
 drivers/parisc/iosapic.c              |  6 ++-
 drivers/xen/events.c                  | 12 ++---
 include/linux/irq.h                   |  2 +-
 23 files changed, 140 insertions(+), 74 deletions(-)

diff --git a/arch/alpha/kernel/sys_dp264.c b/arch/alpha/kernel/sys_dp264.c
index 9c9d1fd4155fc..5bd5259324b7c 100644
--- a/arch/alpha/kernel/sys_dp264.c
+++ b/arch/alpha/kernel/sys_dp264.c
@@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 	}
 }
 
-static void
+static int
 dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&dp264_irq_lock);
 	cpu_set_irq_affinity(irq, *affinity);
 	tsunami_update_irq_hw(cached_irq_mask);
 	spin_unlock(&dp264_irq_lock);
+
+	return 0;
 }
 
-static void
+static int
 clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&dp264_irq_lock);
 	cpu_set_irq_affinity(irq - 16, *affinity);
 	tsunami_update_irq_hw(cached_irq_mask);
 	spin_unlock(&dp264_irq_lock);
+
+	return 0;
 }
 
 static struct hw_interrupt_type dp264_irq_type = {
diff --git a/arch/alpha/kernel/sys_titan.c b/arch/alpha/kernel/sys_titan.c
index 27f840a4ad3d7..8dd239ebdb9e2 100644
--- a/arch/alpha/kernel/sys_titan.c
+++ b/arch/alpha/kernel/sys_titan.c
@@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 
 }
 
-static void
+static int
 titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
 	spin_lock(&titan_irq_lock);
 	titan_cpu_set_irq_affinity(irq - 16, *affinity);
 	titan_update_irq_hw(titan_cached_irq_mask);
 	spin_unlock(&titan_irq_lock);
+
+	return 0;
 }
 
 static void
diff --git a/arch/arm/common/gic.c b/arch/arm/common/gic.c
index c6884ba1d5ed0..90f6b7f52d488 100644
--- a/arch/arm/common/gic.c
+++ b/arch/arm/common/gic.c
@@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
+static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 {
 	void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
 	unsigned int shift = (irq % 4) * 8;
@@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 	val |= 1 << (cpu + shift);
 	writel(val, reg);
 	spin_unlock(&irq_controller_lock);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/cris/arch-v32/kernel/irq.c b/arch/cris/arch-v32/kernel/irq.c
index df3925cb1c7fb..d70b445f4a8f0 100644
--- a/arch/cris/arch-v32/kernel/irq.c
+++ b/arch/cris/arch-v32/kernel/irq.c
@@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq)
 {
 }
 
-void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
+int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&irq_lock, flags);
 	irq_allocations[irq - FIRST_IRQ].mask = *dest;
 	spin_unlock_irqrestore(&irq_lock, flags);
+
+	return 0;
 }
 
 static struct irq_chip crisv32_irq_type = {
diff --git a/arch/ia64/hp/sim/hpsim_irq.c b/arch/ia64/hp/sim/hpsim_irq.c
index cc0a3182db3c0..acb5047ab5734 100644
--- a/arch/ia64/hp/sim/hpsim_irq.c
+++ b/arch/ia64/hp/sim/hpsim_irq.c
@@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq)
 {
 }
 
-static void
+static int
 hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
 {
+	return 0;
 }
 
 static struct hw_interrupt_type irq_type_hp_sim = {
diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c
index 166e0d839fa04..f92cef47bf862 100644
--- a/arch/ia64/kernel/iosapic.c
+++ b/arch/ia64/kernel/iosapic.c
@@ -329,7 +329,7 @@ unmask_irq (unsigned int irq)
 }
 
 
-static void
+static int
 iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
@@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	cpu = cpumask_first_and(cpu_online_mask, mask);
 	if (cpu >= nr_cpu_ids)
-		return;
+		return -1;
 
 	if (irq_prepare_move(irq, cpu))
-		return;
+		return -1;
 
 	dest = cpu_physical_id(cpu);
 
 	if (!iosapic_intr_info[irq].count)
-		return;			/* not an IOSAPIC interrupt */
+		return -1;			/* not an IOSAPIC interrupt */
 
 	set_irq_affinity_info(irq, dest, redir);
 
@@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 		iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
 		iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
 	}
+
 #endif
+	return 0;
 }
 
 /*
diff --git a/arch/ia64/kernel/msi_ia64.c b/arch/ia64/kernel/msi_ia64.c
index 2b15e233f7fef..0f8ade9331bad 100644
--- a/arch/ia64/kernel/msi_ia64.c
+++ b/arch/ia64/kernel/msi_ia64.c
@@ -12,7 +12,7 @@
 static struct irq_chip	ia64_msi_chip;
 
 #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq,
+static int ia64_set_msi_irq_affinity(unsigned int irq,
 				      const cpumask_t *cpu_mask)
 {
 	struct msi_msg msg;
@@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
 	int cpu = first_cpu(*cpu_mask);
 
 	if (!cpu_online(cpu))
-		return;
+		return -1;
 
 	if (irq_prepare_move(irq, cpu))
-		return;
+		return -1;
 
 	read_msi_msg(irq, &msg);
 
@@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
 
 	write_msi_msg(irq, &msg);
 	cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+
+	return 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg = irq_cfg + irq;
 	struct msi_msg msg;
 	int cpu = cpumask_first(mask);
 
 	if (!cpu_online(cpu))
-		return;
+		return -1;
 
 	if (irq_prepare_move(irq, cpu))
-		return;
+		return -1;
 
 	dmar_msi_read(irq, &msg);
 
@@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dmar_msi_write(irq, &msg);
 	cpumask_copy(irq_desc[irq].affinity, mask);
+
+	return 0;
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/ia64/sn/kernel/irq.c b/arch/ia64/sn/kernel/irq.c
index 66fd705e82c09..764f26abac05d 100644
--- a/arch/ia64/sn/kernel/irq.c
+++ b/arch/ia64/sn/kernel/irq.c
@@ -227,7 +227,7 @@ struct sn_irq_info *sn_retarget_vector(struct sn_irq_info *sn_irq_info,
 	return new_irq_info;
 }
 
-static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
+static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
 	struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
 	nasid_t nasid;
@@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 	list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
 				 sn_irq_lh[irq], list)
 		(void)sn_retarget_vector(sn_irq_info, nasid, slice);
+
+	return 0;
 }
 
 #ifdef CONFIG_SMP
diff --git a/arch/ia64/sn/kernel/msi_sn.c b/arch/ia64/sn/kernel/msi_sn.c
index 81e428943d737..fbbfb97012012 100644
--- a/arch/ia64/sn/kernel/msi_sn.c
+++ b/arch/ia64/sn/kernel/msi_sn.c
@@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
 }
 
 #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq,
+static int sn_set_msi_irq_affinity(unsigned int irq,
 				    const struct cpumask *cpu_mask)
 {
 	struct msi_msg msg;
@@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 	cpu = cpumask_first(cpu_mask);
 	sn_irq_info = sn_msi_info[irq].sn_irq_info;
 	if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
-		return;
+		return -1;
 
 	/*
 	 * Release XIO resources for the old MSI PCI address
@@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 	new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
 	sn_msi_info[irq].sn_irq_info = new_irq_info;
 	if (new_irq_info == NULL)
-		return;
+		return -1;
 
 	/*
 	 * Map the xio address into bus space
@@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 
 	write_msi_msg(irq, &msg);
 	cpumask_copy(irq_desc[irq].affinity, cpu_mask);
+
+	return 0;
 }
 #endif /* CONFIG_SMP */
 
diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c
index 1c19af8daa62a..d3a0c8154beca 100644
--- a/arch/mips/cavium-octeon/octeon-irq.c
+++ b/arch/mips/cavium-octeon/octeon-irq.c
@@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
 {
 	int cpu;
 	int bit = irq - OCTEON_IRQ_WORKQ0;	/* Bit 0-63 of EN0 */
@@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask
 	 */
 	cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2));
 	write_unlock(&octeon_irq_ciu0_rwlock);
+
+	return 0;
 }
 #endif
 
@@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
 {
 	int cpu;
 	int bit = irq - OCTEON_IRQ_WDOG0;	/* Bit 0-63 of EN1 */
@@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask
 	 */
 	cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1));
 	write_unlock(&octeon_irq_ciu1_rwlock);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h
index 3214ade02d105..4f1eed107b082 100644
--- a/arch/mips/include/asm/irq.h
+++ b/arch/mips/include/asm/irq.h
@@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq)
 #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
 #include <linux/cpumask.h>
 
-extern void plat_set_irq_affinity(unsigned int irq,
+extern int plat_set_irq_affinity(unsigned int irq,
 				  const struct cpumask *affinity);
 extern void smtc_forward_irq(unsigned int irq);
 
diff --git a/arch/mips/kernel/irq-gic.c b/arch/mips/kernel/irq-gic.c
index 87deb8f6c4588..3f43c2e3aa5a5 100644
--- a/arch/mips/kernel/irq-gic.c
+++ b/arch/mips/kernel/irq-gic.c
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
 
 static DEFINE_SPINLOCK(gic_lock);
 
-static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	cpumask_t	tmp = CPU_MASK_NONE;
 	unsigned long	flags;
@@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
 	cpumask_and(&tmp, cpumask, cpu_online_mask);
 	if (cpus_empty(tmp))
-		return;
+		return -1;
 
 	/* Assumption : cpumask refers to a single CPU */
 	spin_lock_irqsave(&gic_lock, flags);
@@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	cpumask_copy(irq_desc[irq].affinity, cpumask);
 	spin_unlock_irqrestore(&gic_lock, flags);
 
+	return 0;
 }
 #endif
 
diff --git a/arch/mips/mti-malta/malta-smtc.c b/arch/mips/mti-malta/malta-smtc.c
index 5ba31888fefbb..499ffe5475dff 100644
--- a/arch/mips/mti-malta/malta-smtc.c
+++ b/arch/mips/mti-malta/malta-smtc.c
@@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = {
  */
 
 
-void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
+int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
 	cpumask_t tmask;
 	int cpu = 0;
@@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 
 	/* Do any generic SMTC IRQ affinity setup */
 	smtc_set_irq_affinity(irq, tmask);
+
+	return 0;
 }
 #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */
diff --git a/arch/mips/sibyte/bcm1480/irq.c b/arch/mips/sibyte/bcm1480/irq.c
index 352352b3cb2fa..4f256a131bf6e 100644
--- a/arch/mips/sibyte/bcm1480/irq.c
+++ b/arch/mips/sibyte/bcm1480/irq.c
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
 static void disable_bcm1480_irq(unsigned int irq);
 static void ack_bcm1480_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	int i = 0, old_cpu, cpu, int_on, k;
 	u64 cur_ints;
@@ -119,7 +119,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	if (cpumask_weight(mask) != 1) {
 		printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-		return;
+		return -1;
 	}
 	i = cpumask_first(mask);
 
@@ -155,6 +155,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 	}
 	spin_unlock(&bcm1480_imr_lock);
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/mips/sibyte/sb1250/irq.c b/arch/mips/sibyte/sb1250/irq.c
index c08ff582da6f0..e389507f1f966 100644
--- a/arch/mips/sibyte/sb1250/irq.c
+++ b/arch/mips/sibyte/sb1250/irq.c
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
 static void disable_sb1250_irq(unsigned int irq);
 static void ack_sb1250_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	int i = 0, old_cpu, cpu, int_on;
 	u64 cur_ints;
@@ -114,7 +114,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	if (cpumask_weight(mask) > 1) {
 		printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-		return;
+		return -1;
 	}
 
 	/* Convert logical CPU to physical CPU */
@@ -146,6 +146,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 	}
 	spin_unlock(&sb1250_imr_lock);
 	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index 4ea4229d765cc..8007f1e657298 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
 	return cpu_dest;
 }
 
-static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
+static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
 {
 	int cpu_dest;
 
 	cpu_dest = cpu_check_affinity(irq, dest);
 	if (cpu_dest < 0)
-		return;
+		return -1;
 
 	cpumask_copy(&irq_desc[irq].affinity, dest);
+
+	return 0;
 }
 #endif
 
diff --git a/arch/powerpc/platforms/pseries/xics.c b/arch/powerpc/platforms/pseries/xics.c
index 80b513449f4c4..be3581a8c294c 100644
--- a/arch/powerpc/platforms/pseries/xics.c
+++ b/arch/powerpc/platforms/pseries/xics.c
@@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq)
 	lpar_xirr_info_set((0xff << 24) | irq);
 }
 
-static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 {
 	unsigned int irq;
 	int status;
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 
 	irq = (unsigned int)irq_map[virq].hwirq;
 	if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-		return;
+		return -1;
 
 	status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
 
 	if (status) {
 		printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
 			__func__, irq, status);
-		return;
+		return -1;
 	}
 
 	/*
@@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 		printk(KERN_WARNING
 			"%s: No online cpus in the mask %s for irq %d\n",
 			__func__, cpulist, virq);
-		return;
+		return -1;
 	}
 
 	status = rtas_call(ibm_set_xive, 3, 1, NULL,
@@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 	if (status) {
 		printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
 			__func__, irq, status);
-		return;
+		return -1;
 	}
+
+	return 0;
 }
 
 static struct irq_chip xics_pic_direct = {
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 21b9567015963..f4cbd15cf22f6 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq)
 
 #endif /* CONFIG_SMP */
 
-void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
 	struct mpic *mpic = mpic_from_irq(irq);
 	unsigned int src = mpic_irq_to_hw(irq);
@@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 		mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
 			       mpic_physmask(cpus_addr(tmp)[0]));
 	}
+
+	return 0;
 }
 
 static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c
index 5deabe921a47b..e5e78f9cfc95d 100644
--- a/arch/sparc/kernel/irq_64.c
+++ b/arch/sparc/kernel/irq_64.c
@@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq)
 	}
 }
 
-static void sun4u_set_affinity(unsigned int virt_irq,
+static int sun4u_set_affinity(unsigned int virt_irq,
 			       const struct cpumask *mask)
 {
 	sun4u_irq_enable(virt_irq);
+
+	return 0;
 }
 
 /* Don't do anything.  The desc->status check for IRQ_DISABLED in
@@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq)
 		       ino, err);
 }
 
-static void sun4v_set_affinity(unsigned int virt_irq,
+static int sun4v_set_affinity(unsigned int virt_irq,
 			       const struct cpumask *mask)
 {
 	unsigned int ino = virt_irq_table[virt_irq].dev_ino;
@@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq,
 	if (err != HV_EOK)
 		printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
 		       "err(%d)\n", ino, cpuid, err);
+
+	return 0;
 }
 
 static void sun4v_irq_disable(unsigned int virt_irq)
@@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq)
 		       dev_handle, dev_ino, err);
 }
 
-static void sun4v_virt_set_affinity(unsigned int virt_irq,
+static int sun4v_virt_set_affinity(unsigned int virt_irq,
 				    const struct cpumask *mask)
 {
 	unsigned long cpuid, dev_handle, dev_ino;
@@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq,
 		printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
 		       "err(%d)\n",
 		       dev_handle, dev_ino, cpuid, err);
+
+	return 0;
 }
 
 static void sun4v_virq_disable(unsigned int virt_irq)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9fbf0f7ec7ebe..5c7630b40a549 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -574,13 +574,14 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
 }
 
-static void
+static int
 set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	unsigned long flags;
 	unsigned int dest;
 	unsigned int irq;
+	int ret = -1;
 
 	irq = desc->irq;
 	cfg = desc->chip_data;
@@ -591,18 +592,21 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 		/* Only the high 8 bits are valid. */
 		dest = SET_APIC_LOGICAL_ID(dest);
 		__target_IO_APIC_irq(irq, dest, cfg);
+		ret = 0;
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
+
+	return ret;
 }
 
-static void
+static int
 set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc;
 
 	desc = irq_to_desc(irq);
 
-	set_ioapic_affinity_irq_desc(desc, mask);
+	return set_ioapic_affinity_irq_desc(desc, mask);
 }
 #endif /* CONFIG_SMP */
 
@@ -2348,24 +2352,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
  * Real vector that is used for interrupting cpu will be coming from
  * the interrupt-remapping table entry.
  */
-static void
+static int
 migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
 	struct irq_cfg *cfg;
 	struct irte irte;
 	unsigned int dest;
 	unsigned int irq;
+	int ret = -1;
 
 	if (!cpumask_intersects(mask, cpu_online_mask))
-		return;
+		return ret;
 
 	irq = desc->irq;
 	if (get_irte(irq, &irte))
-		return;
+		return ret;
 
 	cfg = desc->chip_data;
 	if (assign_irq_vector(irq, cfg, mask))
-		return;
+		return ret;
 
 	dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
@@ -2381,27 +2386,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 		send_cleanup_vector(cfg);
 
 	cpumask_copy(desc->affinity, mask);
+
+	return 0;
 }
 
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 					    const struct cpumask *mask)
 {
-	migrate_ioapic_irq_desc(desc, mask);
+	return migrate_ioapic_irq_desc(desc, mask);
 }
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
 				       const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 
-	set_ir_ioapic_affinity_irq_desc(desc, mask);
+	return set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 						   const struct cpumask *mask)
 {
+	return 0;
 }
 #endif
 
@@ -3318,7 +3326,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3327,7 +3335,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
@@ -3339,13 +3347,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	write_msi_msg_desc(desc, &msg);
+
+	return 0;
 }
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void
+static int
 ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
@@ -3354,11 +3364,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	struct irte irte;
 
 	if (get_irte(irq, &irte))
-		return;
+		return -1;
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	irte.vector = cfg->vector;
 	irte.dest_id = IRTE_DEST(dest);
@@ -3375,6 +3385,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 	 */
 	if (cfg->move_in_progress)
 		send_cleanup_vector(cfg);
+
+	return 0;
 }
 
 #endif
@@ -3528,7 +3540,7 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3537,7 +3549,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
@@ -3549,6 +3561,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	dmar_msi_write(irq, &msg);
+
+	return 0;
 }
 
 #endif /* CONFIG_SMP */
@@ -3582,7 +3596,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3591,7 +3605,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
@@ -3603,6 +3617,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 	msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
 	hpet_msi_write(irq, &msg);
+
+	return 0;
 }
 
 #endif /* CONFIG_SMP */
@@ -3659,7 +3675,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
 	write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irq_cfg *cfg;
@@ -3667,11 +3683,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
 	dest = set_desc_affinity(desc, mask);
 	if (dest == BAD_APICID)
-		return;
+		return -1;
 
 	cfg = desc->chip_data;
 
 	target_ht_irq(irq, dest, cfg->vector);
+
+	return 0;
 }
 
 #endif
diff --git a/drivers/parisc/iosapic.c b/drivers/parisc/iosapic.c
index 73348c4047e98..4a9cc92d4d186 100644
--- a/drivers/parisc/iosapic.c
+++ b/drivers/parisc/iosapic.c
@@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq,
+static int iosapic_set_affinity_irq(unsigned int irq,
 				     const struct cpumask *dest)
 {
 	struct vector_info *vi = iosapic_get_vector(irq);
@@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
 
 	dest_cpu = cpu_check_affinity(irq, dest);
 	if (dest_cpu < 0)
-		return;
+		return -1;
 
 	cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
 	vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
@@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq,
 	iosapic_set_irt_data(vi, &dummy_d0, &d1);
 	iosapic_wr_irt_entry(vi, d0, d1);
 	spin_unlock_irqrestore(&iosapic_lock, flags);
+
+	return 0;
 }
 #endif
 
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 30963af5dba02..33389880279bf 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -688,13 +688,13 @@ void rebind_evtchn_irq(int evtchn, int irq)
 }
 
 /* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 {
 	struct evtchn_bind_vcpu bind_vcpu;
 	int evtchn = evtchn_from_irq(irq);
 
 	if (!VALID_EVTCHN(evtchn))
-		return;
+		return -1;
 
 	/* Send future instances of this interrupt to other vcpu. */
 	bind_vcpu.port = evtchn;
@@ -707,13 +707,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 	 */
 	if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
 		bind_evtchn_to_cpu(evtchn, tcpu);
-}
 
+	return 0;
+}
 
-static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
 {
 	unsigned tcpu = cpumask_first(dest);
-	rebind_irq_to_cpu(irq, tcpu);
+
+	return rebind_irq_to_cpu(irq, tcpu);
 }
 
 int resend_irq_on_evtchn(unsigned int irq)
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 2a34cd6281d7e..8e4c18b291579 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -117,7 +117,7 @@ struct irq_chip {
 	void		(*eoi)(unsigned int irq);
 
 	void		(*end)(unsigned int irq);
-	void		(*set_affinity)(unsigned int irq,
+	int		(*set_affinity)(unsigned int irq,
 					const struct cpumask *dest);
 	int		(*retrigger)(unsigned int irq);
 	int		(*set_type)(unsigned int irq, unsigned int flow_type);
-- 
GitLab


From 57b150cce8e004ddd36330490a68bfb59b7271e9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 17:59:53 -0700
Subject: [PATCH 0678/2198] irq: only update affinity if ->set_affinity() is
 sucessfull

irq_set_affinity() and move_masked_irq() try to assign affinity
before calling chip set_affinity(). Some archs are assigning it
in ->set_affinity() again.

We do something like:

 cpumask_cpy(desc->affinity, mask);
 desc->chip->set_affinity(mask);

But in the failure path, affinity should not be touched - otherwise
we'll end up with a different affinity mask despite the failure to
migrate the IRQ.

So try to update the afffinity only if set_affinity returns with 0.
Also call irq_set_thread_affinity accordingly.

v2: update after "irq, x86: Remove IRQ_DISABLED check in process context IRQ move"
v3: according to Ingo, change set_affinity() in irq_chip should return int.
v4: update comments by removing moving irq_desc code.

[ Impact: fix /proc/irq/*/smp_affinity setting corner case bug ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F65509.60307@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/internals.h |  3 +++
 kernel/irq/manage.c    | 17 +++++++++++------
 kernel/irq/migration.c | 14 +++++++++-----
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 01ce20eab38fe..de5f412f6a921 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
 
 extern int irq_select_affinity_usr(unsigned int irq);
 
+extern void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
+
 /*
  * Debugging printout:
  */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 2734eca59243b..aaf5c9d057703 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
 	return 1;
 }
 
-static void
+void
 irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
 {
 	struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (desc->status & IRQ_MOVE_PCNTXT)
-		desc->chip->set_affinity(irq, cpumask);
+	if (desc->status & IRQ_MOVE_PCNTXT) {
+		if (!desc->chip->set_affinity(irq, cpumask)) {
+			cpumask_copy(desc->affinity, cpumask);
+			irq_set_thread_affinity(desc, cpumask);
+		}
+	}
 	else {
 		desc->status |= IRQ_MOVE_PENDING;
 		cpumask_copy(desc->pending_mask, cpumask);
 	}
 #else
-	cpumask_copy(desc->affinity, cpumask);
-	desc->chip->set_affinity(irq, cpumask);
+	if (!desc->chip->set_affinity(irq, cpumask)) {
+		cpumask_copy(desc->affinity, cpumask);
+		irq_set_thread_affinity(desc, cpumask);
+	}
 #endif
-	irq_set_thread_affinity(desc, cpumask);
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index e05ad9be43b7a..cfe767ca15450 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -1,5 +1,8 @@
 
 #include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
 
 void move_masked_irq(int irq)
 {
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
 	 * masking the irqs.
 	 */
 	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-		   < nr_cpu_ids)) {
-		cpumask_and(desc->affinity,
-			    desc->pending_mask, cpu_online_mask);
-		desc->chip->set_affinity(irq, desc->affinity);
-	}
+		   < nr_cpu_ids))
+		if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+			cpumask_copy(desc->affinity, desc->pending_mask);
+			irq_set_thread_affinity(desc, desc->pending_mask);
+		}
+
 	cpumask_clear(desc->pending_mask);
 }
 
-- 
GitLab


From 85ac16d033370caf6f48d743c8dc8103700f5cc5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:00:38 -0700
Subject: [PATCH 0679/2198] x86/irq: change irq_desc_alloc() to take node
 instead of cpu

This simplifies the node awareness of the code. All our allocators
only deal with a NUMA node ID locality not with CPU ids anyway - so
there's no need to maintain (and transform) a CPU id all across the
IRq layer.

v2: keep move_irq_desc related

[ Impact: cleanup, prepare IRQ code to be NUMA-aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
LKML-Reference: <49F65536.2020300@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 58 +++++++++++++++-------------------
 arch/x86/lguest/boot.c         |  2 +-
 drivers/pci/intr_remapping.c   | 15 ++++-----
 drivers/xen/events.c           |  2 +-
 include/linux/interrupt.h      |  2 +-
 include/linux/irq.h            | 16 ++++------
 kernel/irq/handle.c            | 28 +++++++---------
 kernel/irq/internals.h         |  2 +-
 kernel/irq/numa_migrate.c      | 36 +++++++--------------
 kernel/softirq.c               |  2 +-
 10 files changed, 66 insertions(+), 97 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c7630b40a549..560b887ba27c6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -129,12 +129,9 @@ struct irq_pin_list {
 	struct irq_pin_list *next;
 };
 
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
 {
 	struct irq_pin_list *pin;
-	int node;
-
-	node = cpu_to_node(cpu);
 
 	pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
 
@@ -209,12 +206,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 	return cfg;
 }
 
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
 {
 	struct irq_cfg *cfg;
-	int node;
-
-	node = cpu_to_node(cpu);
 
 	cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
 	if (cfg) {
@@ -235,13 +229,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
 	return cfg;
 }
 
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
 {
 	struct irq_cfg *cfg;
 
 	cfg = desc->chip_data;
 	if (!cfg) {
-		desc->chip_data = get_one_free_irq_cfg(cpu);
+		desc->chip_data = get_one_free_irq_cfg(node);
 		if (!desc->chip_data) {
 			printk(KERN_ERR "can not alloc irq_cfg\n");
 			BUG_ON(1);
@@ -253,7 +247,7 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
 
 /* for move_irq_desc */
 static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
 {
 	struct irq_pin_list *old_entry, *head, *tail, *entry;
 
@@ -262,7 +256,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 	if (!old_entry)
 		return;
 
-	entry = get_one_free_irq_2_pin(cpu);
+	entry = get_one_free_irq_2_pin(node);
 	if (!entry)
 		return;
 
@@ -272,7 +266,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
 	tail		= entry;
 	old_entry	= old_entry->next;
 	while (old_entry) {
-		entry = get_one_free_irq_2_pin(cpu);
+		entry = get_one_free_irq_2_pin(node);
 		if (!entry) {
 			entry = head;
 			while (entry) {
@@ -312,12 +306,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
 }
 
 void arch_init_copy_chip_data(struct irq_desc *old_desc,
-				 struct irq_desc *desc, int cpu)
+				 struct irq_desc *desc, int node)
 {
 	struct irq_cfg *cfg;
 	struct irq_cfg *old_cfg;
 
-	cfg = get_one_free_irq_cfg(cpu);
+	cfg = get_one_free_irq_cfg(node);
 
 	if (!cfg)
 		return;
@@ -328,7 +322,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
 
 	memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
 
-	init_copy_irq_2_pin(old_cfg, cfg, cpu);
+	init_copy_irq_2_pin(old_cfg, cfg, node);
 }
 
 static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -615,13 +609,13 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
 	struct irq_pin_list *entry;
 
 	entry = cfg->irq_2_pin;
 	if (!entry) {
-		entry = get_one_free_irq_2_pin(cpu);
+		entry = get_one_free_irq_2_pin(node);
 		if (!entry) {
 			printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
 					apic, pin);
@@ -641,7 +635,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 		entry = entry->next;
 	}
 
-	entry->next = get_one_free_irq_2_pin(cpu);
+	entry->next = get_one_free_irq_2_pin(node);
 	entry = entry->next;
 	entry->apic = apic;
 	entry->pin = pin;
@@ -650,7 +644,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
 				      int oldapic, int oldpin,
 				      int newapic, int newpin)
 {
@@ -670,7 +664,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 
 	/* why? call replace before add? */
 	if (!replaced)
-		add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+		add_pin_to_irq_node(cfg, node, newapic, newpin);
 }
 
 static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -1612,7 +1606,7 @@ static void __init setup_IO_APIC_irqs(void)
 	int notcon = 0;
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
@@ -1647,13 +1641,13 @@ static void __init setup_IO_APIC_irqs(void)
 					apic->multi_timer_check(apic_id, irq))
 				continue;
 
-			desc = irq_to_desc_alloc_cpu(irq, cpu);
+			desc = irq_to_desc_alloc_node(irq, node);
 			if (!desc) {
 				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 				continue;
 			}
 			cfg = desc->chip_data;
-			add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+			add_pin_to_irq_node(cfg, node, apic_id, pin);
 
 			setup_IO_APIC_irq(apic_id, pin, irq, desc,
 					irq_trigger(idx), irq_polarity(idx));
@@ -2863,7 +2857,7 @@ static inline void __init check_timer(void)
 {
 	struct irq_desc *desc = irq_to_desc(0);
 	struct irq_cfg *cfg = desc->chip_data;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 	int apic1, pin1, apic2, pin2;
 	unsigned long flags;
 	int no_pin1 = 0;
@@ -2929,7 +2923,7 @@ static inline void __init check_timer(void)
 		 * Ok, does IRQ0 through the IOAPIC work?
 		 */
 		if (no_pin1) {
-			add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+			add_pin_to_irq_node(cfg, node, apic1, pin1);
 			setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
 		} else {
 			/* for edge trigger, setup_IO_APIC_irq already
@@ -2966,7 +2960,7 @@ static inline void __init check_timer(void)
 		/*
 		 * legacy devices should be connected to IO APIC #0
 		 */
-		replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+		replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
 		setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
 		enable_8259A_irq(0);
 		if (timer_irq_works()) {
@@ -3185,7 +3179,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 	unsigned int new;
 	unsigned long flags;
 	struct irq_cfg *cfg_new = NULL;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 	struct irq_desc *desc_new = NULL;
 
 	irq = 0;
@@ -3194,7 +3188,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 	spin_lock_irqsave(&vector_lock, flags);
 	for (new = irq_want; new < nr_irqs; new++) {
-		desc_new = irq_to_desc_alloc_cpu(new, cpu);
+		desc_new = irq_to_desc_alloc_node(new, node);
 		if (!desc_new) {
 			printk(KERN_INFO "can not get irq_desc for %d\n", new);
 			continue;
@@ -3968,7 +3962,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int cpu = boot_cpu_id;
+	int node = cpu_to_node(boot_cpu_id);
 
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -3976,7 +3970,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 		return -EINVAL;
 	}
 
-	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
 		printk(KERN_INFO "can not get irq_desc %d\n", irq);
 		return 0;
@@ -3987,7 +3981,7 @@ int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int p
 	 */
 	if (irq >= NR_IRQS_LEGACY) {
 		cfg = desc->chip_data;
-		add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
+		add_pin_to_irq_node(cfg, node, ioapic, pin);
 	}
 
 	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ca7ec44bafc3b..45acbcf256833 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -636,7 +636,7 @@ static void __init lguest_init_IRQ(void)
 
 void lguest_setup_irq(unsigned int irq)
 {
-	irq_to_desc_alloc_cpu(irq, 0);
+	irq_to_desc_alloc_node(irq, 0);
 	set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
 				      handle_level_irq, "level");
 }
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f5e0ea724a6f5..9eff36a293e06 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -23,15 +23,12 @@ struct irq_2_iommu {
 };
 
 #ifdef CONFIG_GENERIC_HARDIRQS
-static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
 {
 	struct irq_2_iommu *iommu;
-	int node;
-
-	node = cpu_to_node(cpu);
 
 	iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
-	printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+	printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
 
 	return iommu;
 }
@@ -48,7 +45,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
 	return desc->irq_2_iommu;
 }
 
-static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
 {
 	struct irq_desc *desc;
 	struct irq_2_iommu *irq_iommu;
@@ -56,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
 	/*
 	 * alloc irq desc if not allocated already.
 	 */
-	desc = irq_to_desc_alloc_cpu(irq, cpu);
+	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
 		printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 		return NULL;
@@ -65,14 +62,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
 	irq_iommu = desc->irq_2_iommu;
 
 	if (!irq_iommu)
-		desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+		desc->irq_2_iommu = get_one_free_irq_2_iommu(node);
 
 	return desc->irq_2_iommu;
 }
 
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
-	return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+	return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
 }
 
 #else /* !CONFIG_SPARSE_IRQ */
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index 33389880279bf..be437c2bc942e 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -335,7 +335,7 @@ static int find_unbound_irq(void)
 	if (irq == nr_irqs)
 		panic("No available IRQ to bind to: increase nr_irqs!\n");
 
-	desc = irq_to_desc_alloc_cpu(irq, 0);
+	desc = irq_to_desc_alloc_node(irq, 0);
 	if (WARN_ON(desc == NULL))
 		return -1;
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 91bb76f44f14e..ff374ceface08 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -566,6 +566,6 @@ struct irq_desc;
 extern int early_irq_init(void);
 extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
-extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern int arch_init_chip_data(struct irq_desc *desc, int node);
 
 #endif
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 8e4c18b291579..a09baf8f9d999 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -187,7 +187,7 @@ struct irq_desc {
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
 	cpumask_var_t		affinity;
-	unsigned int		cpu;
+	unsigned int		node;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	cpumask_var_t		pending_mask;
 #endif
@@ -201,16 +201,16 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
-					struct irq_desc *desc, int cpu);
+					struct irq_desc *desc, int node);
 extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
 
 #ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
 #else /* CONFIG_SPARSE_IRQ */
-extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
 #endif /* CONFIG_SPARSE_IRQ */
 
-extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -422,12 +422,10 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
  */
-static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
 								bool boot)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
-	int node;
-
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
 
@@ -437,8 +435,6 @@ static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
 		return true;
 	}
 
-	node = cpu_to_node(cpu);
-
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
 
@@ -494,7 +490,7 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
 								bool boot)
 {
 	return true;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3e0cbc44bd73a..a6368db2618be 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -81,12 +81,10 @@ static struct irq_desc irq_desc_init = {
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
 
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
-	int node;
 	void *ptr;
 
-	node = cpu_to_node(cpu);
 	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
 
 	/*
@@ -94,33 +92,32 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 	 * init_copy_kstat_irqs() could still use old one
 	 */
 	if (ptr) {
-		printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
-			 cpu, node);
+		printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
 		desc->kstat_irqs = ptr;
 	}
 }
 
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
 {
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
 	desc->irq = irq;
 #ifdef CONFIG_SMP
-	desc->cpu = cpu;
+	desc->node = node;
 #endif
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-	init_kstat_irqs(desc, cpu, nr_cpu_ids);
+	init_kstat_irqs(desc, node, nr_cpu_ids);
 	if (!desc->kstat_irqs) {
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, node, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
 	init_desc_masks(desc);
-	arch_init_chip_data(desc, cpu);
+	arch_init_chip_data(desc, node);
 }
 
 /*
@@ -189,11 +186,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 {
 	struct irq_desc *desc;
 	unsigned long flags;
-	int node;
 
 	if (irq >= nr_irqs) {
 		WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -212,15 +208,13 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	if (desc)
 		goto out_unlock;
 
-	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-	printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
-		 irq, cpu, node);
+	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
-	init_one_irq_desc(irq, desc, cpu);
+	init_one_irq_desc(irq, desc, node);
 
 	irq_desc_ptrs[irq] = desc;
 
@@ -270,7 +264,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return (irq < NR_IRQS) ? irq_desc + irq : NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 {
 	return irq_to_desc(irq);
 }
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index de5f412f6a921..73468253143ba 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
 extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
 extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 extern void clear_kstat_irqs(struct irq_desc *desc);
 extern spinlock_t sparse_irq_lock;
 
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ce72bc3f4ced3..2f69bee57bf21 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -15,9 +15,9 @@
 
 static void init_copy_kstat_irqs(struct irq_desc *old_desc,
 				 struct irq_desc *desc,
-				 int cpu, int nr)
+				 int node, int nr)
 {
-	init_kstat_irqs(desc, cpu, nr);
+	init_kstat_irqs(desc, node, nr);
 
 	if (desc->kstat_irqs != old_desc->kstat_irqs)
 		memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 
 static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-		 struct irq_desc *desc, int cpu)
+		 struct irq_desc *desc, int node)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
-	if (!alloc_desc_masks(desc, cpu, false)) {
+	if (!alloc_desc_masks(desc, node, false)) {
 		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
 				"for migration.\n", irq);
 		return false;
 	}
 	spin_lock_init(&desc->lock);
-	desc->cpu = cpu;
+	desc->node = node;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
 	init_copy_desc_masks(old_desc, desc);
-	arch_init_copy_chip_data(old_desc, desc, cpu);
+	arch_init_copy_chip_data(old_desc, desc, node);
 	return true;
 }
 
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 
 static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-						int cpu)
+						int node)
 {
 	struct irq_desc *desc;
 	unsigned int irq;
 	unsigned long flags;
-	int node;
 
 	irq = old_desc->irq;
 
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 	if (desc && old_desc != desc)
 		goto out_unlock;
 
-	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 	if (!desc) {
 		printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 		desc = old_desc;
 		goto out_unlock;
 	}
-	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
+	if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
 		/* still use old one */
 		kfree(desc);
 		desc = old_desc;
@@ -107,24 +105,14 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 	return desc;
 }
 
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 {
-	int old_cpu;
-	int node, old_node;
-
 	/* those all static, do move them */
 	if (desc->irq < NR_IRQS_LEGACY)
 		return desc;
 
-	old_cpu = desc->cpu;
-	if (old_cpu != cpu) {
-		node = cpu_to_node(cpu);
-		old_node = cpu_to_node(old_cpu);
-		if (old_node != node)
-			desc = __real_move_irq_desc(desc, cpu);
-		else
-			desc->cpu = cpu;
-	}
+	if (desc->node != node)
+		desc = __real_move_irq_desc(desc, node);
 
 	return desc;
 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b525dd348511b..f674f332a0243 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -828,7 +828,7 @@ int __init __weak arch_early_irq_init(void)
 	return 0;
 }
 
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
 {
 	return 0;
 }
-- 
GitLab


From a2f809b08ae4dddc1015c7dcd8659e5729e45b3e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:01:20 -0700
Subject: [PATCH 0680/2198] irq: change ACPI GSI APIs to also take a device
 argument

We want to use dev_to_node() later on, to be aware of the 'home node'
of the GSI in question.

[ Impact: cleanup, prepare the IRQ code to be more NUMA aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Len Brown <lenb@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Len Brown <lenb@kernel.org>
Cc: Bjorn Helgaas <bjorn.helgaas@hp.com>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-acpi@vger.kernel.org
Cc: linux-ia64@vger.kernel.org
LKML-Reference: <49F65560.20904@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/ia64/kernel/acpi.c        | 5 +++--
 arch/x86/include/asm/io_apic.h | 4 ++--
 arch/x86/include/asm/mpspec.h  | 4 +++-
 arch/x86/kernel/acpi/boot.c    | 8 ++++----
 arch/x86/kernel/apic/io_apic.c | 3 ++-
 drivers/acpi/pci_irq.c         | 5 +++--
 drivers/char/hpet.c            | 4 ++--
 drivers/pnp/pnpacpi/rsparser.c | 2 +-
 include/linux/acpi.h           | 2 +-
 9 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index 5510317db37b2..baec6f00f7f3f 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
 		return gsi;
@@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
 
 	fadt = (struct acpi_table_fadt *)fadt_header;
 
-	acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
+	acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
+				 ACPI_ACTIVE_LOW);
 	return 0;
 }
 
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9d826e4360100..07f2913ba5de8 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,8 +154,8 @@ extern int timer_through_8259;
 extern int io_apic_get_unique_id(int ioapic, int apic_id);
 extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
-				   int edge_level, int active_high_low);
+extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
+				  int irq, int edge_level, int active_high_low);
 #endif /* CONFIG_ACPI */
 
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 642fc7fc8cdc3..3ea1f531f5324 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -72,7 +72,9 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
 extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
 				   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
-extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+struct device;
+extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
+				 int active_high_low);
 extern int acpi_probe_gsi(void);
 #ifdef CONFIG_X86_IO_APIC
 extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 723989d7f8029..6ee96b5530f15 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -522,7 +522,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	unsigned int irq;
 	unsigned int plat_gsi = gsi;
@@ -539,7 +539,7 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-		plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+		plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity);
 	}
 #endif
 	acpi_gsi_to_irq(plat_gsi, &irq);
@@ -1158,7 +1158,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	int ioapic;
 	int ioapic_pin;
@@ -1253,7 +1253,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		}
 	}
 #endif
-	io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
+	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
 				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
 	return gsi;
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 560b887ba27c6..d9346622601bb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3958,7 +3958,8 @@ int __init io_apic_get_version(int ioapic)
 }
 #endif
 
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
+int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index 51b9f8280f887..2faa9e2ac8933 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 		/* Interrupt Line values above 0xF are forbidden */
 		if (dev->irq > 0 && (dev->irq <= 0xF)) {
 			printk(" - using IRQ %d\n", dev->irq);
-			acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE,
+			acpi_register_gsi(&dev->dev, dev->irq,
+					  ACPI_LEVEL_SENSITIVE,
 					  ACPI_ACTIVE_LOW);
 			return 0;
 		} else {
@@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
 		}
 	}
 
-	rc = acpi_register_gsi(gsi, triggering, polarity);
+	rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
 	if (rc < 0) {
 		dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n",
 			 pin_name(pin));
diff --git a/drivers/char/hpet.c b/drivers/char/hpet.c
index 340ba4f9dc548..4a9f3492b9216 100644
--- a/drivers/char/hpet.c
+++ b/drivers/char/hpet.c
@@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp)
 			break;
 		}
 
-		gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE,
+		gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE,
 					ACPI_ACTIVE_LOW);
 		if (gsi > 0)
 			break;
@@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data)
 		irqp = &res->data.extended_irq;
 
 		for (i = 0; i < irqp->interrupt_count; i++) {
-			irq = acpi_register_gsi(irqp->interrupts[i],
+			irq = acpi_register_gsi(NULL, irqp->interrupts[i],
 				      irqp->triggering, irqp->polarity);
 			if (irq < 0)
 				return AE_ERROR;
diff --git a/drivers/pnp/pnpacpi/rsparser.c b/drivers/pnp/pnpacpi/rsparser.c
index adf17856bacc1..7f207f335beca 100644
--- a/drivers/pnp/pnpacpi/rsparser.c
+++ b/drivers/pnp/pnpacpi/rsparser.c
@@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev,
 	}
 
 	flags = irq_flags(triggering, polarity, shareable);
-	irq = acpi_register_gsi(gsi, triggering, polarity);
+	irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
 	if (irq >= 0)
 		pcibios_penalize_isa_irq(irq, 1);
 	else
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index 88be890ee3c7e..51b4b0a5ce8cf 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num;
 extern int sbf_port;
 extern unsigned long acpi_realmode_flags;
 
-int acpi_register_gsi (u32 gsi, int triggering, int polarity);
+int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 
 #ifdef CONFIG_X86_IO_APIC
-- 
GitLab


From 024154cfdd802654cb236a18c78b6e37351e2c49 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:01:50 -0700
Subject: [PATCH 0681/2198] irq: change io_apic_set_pci_routing() to use device
 parameter

Make actual use of the device parameter passed down to
io_apic_set_pci_routing() - to have the IRQ descriptor
on the home node of the device.

If no device has been passed down, we assume it's a platform
device and use the boot node ID for the IRQ descriptor.

[ Impact: optimization, make IO-APIC code more NUMA aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F6557E.3080101@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index d9346622601bb..82376e021b5da 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3963,7 +3963,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
-	int node = cpu_to_node(boot_cpu_id);
+	int node;
 
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
@@ -3971,6 +3971,11 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 		return -EINVAL;
 	}
 
+	if (dev)
+		node = dev_to_node(dev);
+	else
+		node = cpu_to_node(boot_cpu_id);
+
 	desc = irq_to_desc_alloc_node(irq, node);
 	if (!desc) {
 		printk(KERN_INFO "can not get irq_desc %d\n", irq);
-- 
GitLab


From d047f53a2ecce37e3bdf79eac5a326fbaadb3628 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:02:23 -0700
Subject: [PATCH 0682/2198] x86/irq: change MSI irq_desc to be more numa aware

Try to get irq_desc on the home node in create_irq_nr().

v2: don't check if we can move it when sparse_irq is not used
v3: use move_irq_des, if that node is not what we want

[ Impact: optimization, make MSI IRQ descriptors more NUMA aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F6559F.7070005@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 17 +++++++++++++----
 include/linux/irq.h            |  2 +-
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 82376e021b5da..9cd4806cdf5f6 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3172,14 +3172,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
 /*
  * Dynamic irq allocate and deallocation
  */
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
 {
 	/* Allocate an unused irq */
 	unsigned int irq;
 	unsigned int new;
 	unsigned long flags;
 	struct irq_cfg *cfg_new = NULL;
-	int node = cpu_to_node(boot_cpu_id);
 	struct irq_desc *desc_new = NULL;
 
 	irq = 0;
@@ -3197,6 +3196,13 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 		if (cfg_new->vector != 0)
 			continue;
+
+#ifdef CONFIG_NUMA_IRQ_DESC
+		/* different node ?*/
+		if (desc_new->node != node)
+			desc = move_irq_desc(desc, node);
+#endif
+
 		if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
 			irq = new;
 		break;
@@ -3214,11 +3220,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 int create_irq(void)
 {
+	int node = cpu_to_node(boot_cpu_id);
 	unsigned int irq_want;
 	int irq;
 
 	irq_want = nr_irqs_gsi;
-	irq = create_irq_nr(irq_want);
+	irq = create_irq_nr(irq_want, node);
 
 	if (irq == 0)
 		irq = -1;
@@ -3476,15 +3483,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
 	unsigned int irq_want;
 	struct intel_iommu *iommu = NULL;
 	int index = 0;
+	int node;
 
 	/* x86 doesn't support multiple MSI yet */
 	if (type == PCI_CAP_ID_MSI && nvec > 1)
 		return 1;
 
+	node = dev_to_node(&dev->dev);
 	irq_want = nr_irqs_gsi;
 	sub_handle = 0;
 	list_for_each_entry(msidesc, &dev->msi_list, list) {
-		irq = create_irq_nr(irq_want);
+		irq = create_irq_nr(irq_want, node);
 		if (irq == 0)
 			return -1;
 		irq_want = irq + 1;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index a09baf8f9d999..4b95ddb5304b5 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -376,7 +376,7 @@ extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
 /* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want);
+extern unsigned int create_irq_nr(unsigned int irq_want, int node);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
-- 
GitLab


From 56b581ea9591b5767b1e0204c6a06c7d0c49396e Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 27 Apr 2009 18:02:46 -0700
Subject: [PATCH 0683/2198] irq: make ht irq_desc more numa aware

Try to get irq_desc on the same node as create_irq_nr().

[ Impact: optimization, make HT IRQs more NUMA-aware ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F655B6.8020109@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/pci/htirq.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/htirq.c b/drivers/pci/htirq.c
index bf7d6ce9bbb3e..4e9dd0fe27453 100644
--- a/drivers/pci/htirq.c
+++ b/drivers/pci/htirq.c
@@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	int max_irq;
 	int pos;
 	int irq;
+	int node;
 
 	pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
 	if (!pos)
@@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
 	cfg->msg.address_lo = 0xffffffff;
 	cfg->msg.address_hi = 0xffffffff;
 
-	irq = create_irq();
+	node = dev_to_node(&dev->dev);
+	irq = create_irq_nr(0, node);
 
 	if (irq <= 0) {
 		kfree(cfg);
-- 
GitLab


From 4278600644dee621bd50d7498e244b135612e0f6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 28 Apr 2009 23:12:10 +0900
Subject: [PATCH 0684/2198] sh: register the rtc-generic platform device
 properly.

The device registration was accidentally omitted, add it back in. Do some
basic device probing as well, so this doesn't show up for platforms that
tie in to the RTC interface properly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/time_32.c | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index a2458bfdda266..457332116e17d 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -3,7 +3,7 @@
  *
  *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
  *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
- *  Copyright (C) 2002 - 2008  Paul Mundt
+ *  Copyright (C) 2002 - 2009  Paul Mundt
  *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
  *
  *  Some code taken from i386 version.
@@ -68,6 +68,21 @@ int set_rtc_time(struct rtc_time *tm)
 }
 EXPORT_SYMBOL(set_rtc_time);
 
+static int __init rtc_generic_init(void)
+{
+	struct platform_device *pdev;
+
+	if (rtc_sh_get_time == null_rtc_get_time)
+		return -ENODEV;
+
+	pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
+
+	return 0;
+}
+module_init(rtc_generic_init);
+
 #ifndef CONFIG_GENERIC_TIME
 void do_gettimeofday(struct timeval *tv)
 {
-- 
GitLab


From 1130b0296184bc21806225fd06d533515a99d2db Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 28 Apr 2009 14:56:18 +0200
Subject: [PATCH 0685/2198] perf_counter tools: fix Documentation/perf_counter
 build error
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mike Galbraith reported:

> marge:..Documentation/perf_counter # make
>     CC builtin-stat.o
> In file included from builtin-stat.c:71:
> /usr/include/ctype.h:102: error: expected expression before ‘]’ token

Remove the ctype.h include.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 1 -
 Documentation/perf_counter/builtin-top.c  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index d7ace631fc4ff..112b94ed32986 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -68,7 +68,6 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
-#include <ctype.h>
 #include <time.h>
 #include <sched.h>
 #include <pthread.h>
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index dea016fa41e87..6a276d2b2bbc1 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -49,7 +49,6 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
-#include <ctype.h>
 #include <time.h>
 #include <sched.h>
 #include <pthread.h>
-- 
GitLab


From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 28 Apr 2009 11:39:34 -0400
Subject: [PATCH 0686/2198] tracing: convert ftrace_dump spinlocks to raw

ftrace_dump is used for printing out the contents of the ftrace ring buffer
to the console on failure. Currently it uses a spinlock to synchronize
the output from multiple failures on different CPUs. This spin lock
currently is a normal spinlock and can cause issues with lockdep and
lock tracing.

This patch converts it to raw since it is for error handling only.
The lock is local to the ftrace_dump and is not used by any other
infrastructure.

[ Impact: prevent ftrace_dump from locking up by internal tracing ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index b6183bc9ecae5..5d704a41f8366 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s)
 
 static void __ftrace_dump(bool disable_tracing)
 {
-	static DEFINE_SPINLOCK(ftrace_dump_lock);
+	static raw_spinlock_t ftrace_dump_lock =
+		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
 	unsigned int old_userobj;
@@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing)
 	int cnt = 0, cpu;
 
 	/* only one dump */
-	spin_lock_irqsave(&ftrace_dump_lock, flags);
+	local_irq_save(flags);
+	__raw_spin_lock(&ftrace_dump_lock);
 	if (dump_ran)
 		goto out;
 
@@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing)
 	}
 
  out:
-	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+	__raw_spin_unlock(&ftrace_dump_lock);
+	local_irq_restore(flags);
 }
 
 /* By default: disable tracing after the dump */
-- 
GitLab


From edc953fa4ebc0265ef3b1754fe116a9fd4264e15 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 28 Apr 2009 11:13:46 -0400
Subject: [PATCH 0687/2198] x86: clean up alternative.h

Alternative header duplicates assembly that could be merged in
one single macro.  Merging this into this macro also allows to
directly declare ALTERNATIVE() statements within assembly code.

Uses a __stringify() of the feature bits rather than passing a
"i" operand.  Leave the old %0 operand as-is (set to 0), unused
to stay compatible with API.

(v2: tab alignment fixes)

[ Impact: cleanup ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
LKML-Reference: <20090428151346.GA31212@Krystal>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/alternative.h | 59 ++++++++++++------------------
 1 file changed, 23 insertions(+), 36 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index f6aa18eadf717..1a37bcdc8606c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/stddef.h>
+#include <linux/stringify.h>
 #include <asm/asm.h>
 
 /*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
 
 const unsigned char *const *find_nop_table(void);
 
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature)			\
+									\
+      "661:\n\t" oldinstr "\n662:\n"					\
+      ".section .altinstructions,\"a\"\n"				\
+      _ASM_ALIGN "\n"							\
+      _ASM_PTR "661b\n"				/* label           */	\
+      _ASM_PTR "663f\n"				/* new instruction */	\
+      "	 .byte " __stringify(feature) "\n"	/* feature bit     */	\
+      "	 .byte 662b-661b\n"			/* sourcelen       */	\
+      "	 .byte 664f-663f\n"			/* replacementlen  */	\
+      ".previous\n"							\
+      ".section .altinstr_replacement, \"ax\"\n"			\
+      "663:\n\t" newinstr "\n664:\n"		/* replacement     */	\
+      ".previous"
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
  * without volatile and memory clobber.
  */
 #define alternative(oldinstr, newinstr, feature)			\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      _ASM_ALIGN "\n"					\
-		      _ASM_PTR "661b\n"		/* label */		\
-		      _ASM_PTR "663f\n"		/* new instruction */	\
-		      "	 .byte %c0\n"		/* feature bit */	\
-		      "	 .byte 662b-661b\n"	/* sourcelen */		\
-		      "	 .byte 664f-663f\n"	/* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"  /* replacement */	\
-		      ".previous" :: "i" (feature) : "memory")
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
 /*
  * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
  * Best is to use constraints that are fixed size (like (%1) ... "r")
  * If you use variable sized constraints like "m" or "g" in the
  * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
  */
 #define alternative_input(oldinstr, newinstr, feature, input...)	\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      _ASM_ALIGN "\n"					\
-		      _ASM_PTR "661b\n"		/* label */		\
-		      _ASM_PTR "663f\n"		/* new instruction */	\
-		      "	 .byte %c0\n"		/* feature bit */	\
-		      "	 .byte 662b-661b\n"	/* sourcelen */		\
-		      "	 .byte 664f-663f\n"	/* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"  /* replacement */	\
-		      ".previous" :: "i" (feature), ##input)
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
+		: : "i" (0), ## input)
 
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...)	\
-	asm volatile ("661:\n\t" oldinstr "\n662:\n"			\
-		      ".section .altinstructions,\"a\"\n"		\
-		      _ASM_ALIGN "\n"					\
-		      _ASM_PTR "661b\n"		/* label */		\
-		      _ASM_PTR "663f\n"		/* new instruction */	\
-		      "	 .byte %c[feat]\n"	/* feature bit */	\
-		      "	 .byte 662b-661b\n"	/* sourcelen */		\
-		      "	 .byte 664f-663f\n"	/* replacementlen */	\
-		      ".previous\n"					\
-		      ".section .altinstr_replacement,\"ax\"\n"		\
-		      "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-		      ".previous" : output : [feat] "i" (feature), ##input)
+	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)		\
+		: output : "i" (0), ## input)
 
 /*
  * use this macro(s) if you need more than one output parameter
-- 
GitLab


From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:16:21 -0400
Subject: [PATCH 0688/2198] tracing: fix ref count in splice pages

The pages allocated for the splice binary buffer did not initialize
the ref count correctly. This caused pages not to be freed and causes
a drastic memory leak.

Thanks to logdev I was able to trace the tracer to find where the leak
was.

[ Impact: stop memory leak when using splice ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5d704a41f8366..9058240c85ca4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		if (!ref)
 			break;
 
+		ref->ref = 1;
 		ref->buffer = info->tr->buffer;
 		ref->page = ring_buffer_alloc_read_page(ref->buffer);
 		if (!ref->page) {
-- 
GitLab


From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:23:13 -0400
Subject: [PATCH 0689/2198] tracing: only add splice page if entries exist

The splice code allocates a page even when the ring buffer is empty.
It detects the ring buffer being empty when it it fails to copy
anything from the ring buffer into the page.

This patch adds a check to see if there is anything in the ring buffer
before allocating a page.

Thanks to logdev for letting me trace the tracer to find this.

[ Impact: speed up due to removing unnecessary allocation ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9058240c85ca4..0aeb3b93414e7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.spd_release	= buffer_spd_release,
 	};
 	struct buffer_ref *ref;
-	int size, i;
+	int entries, size, i;
 	size_t ret;
 
 	if (*ppos & (PAGE_SIZE - 1)) {
@@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		len &= PAGE_MASK;
 	}
 
-	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+	entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+	for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
 		struct page *page;
 		int r;
 
@@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		spd.partial[i].private = (unsigned long)ref;
 		spd.nr_pages++;
 		*ppos += PAGE_SIZE;
+
+		entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
 	}
 
 	spd.nr_pages = i;
-- 
GitLab


From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 00:26:30 -0400
Subject: [PATCH 0690/2198] tracing: have splice only copy full pages

Splice works with pages, it is much more effecient to use an entire
page than to copy bits over several pages.

Using logdev to trace the internals of the splice mechanism, I was
able to see that splice can be very aggressive. When tracing is
occurring, and the reader caught up to the writer, and the writer
is on the reader page, the reader will copy what is there into the
splice page. Splice may iterate over several pages and if the
writer is still writing to the page, the reader will keep copying
bits to new pages to pass to userspace.

This patch changes it to only pass data to userspace if the page
is full (the writer has left the page). This has a small side effect
that splice can not read a partial page, and must wait for the
page to fill. This should not be an issue. If tracing has stopped,
then a use of "read" will still read all of the page.

[ Impact: better performance for ring buffer splice code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0aeb3b93414e7..f5427e0fc982a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		}
 
 		r = ring_buffer_read_page(ref->buffer, &ref->page,
-					  len, info->cpu, 0);
+					  len, info->cpu, 1);
 		if (r < 0) {
 			ring_buffer_free_read_page(ref->buffer,
 						   ref->page);
-- 
GitLab


From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 27 Apr 2009 12:37:49 -0400
Subject: [PATCH 0691/2198] ring-buffer: fix printk output

The warning output in trace_recursive_lock uses %d for a long when
it should be %ld.

[ Impact: fix compile warning ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9692f100ec1a9..f4cc59040ebf8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void)
 	/* Disable all tracing before we do anything else */
 	tracing_off_permanent();
 
-	printk_once(KERN_WARNING "Tracing recursion: depth[%d]:"
+	printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
 		    "HC[%lu]:SC[%lu]:NMI[%lu]\n",
 		    current->trace_recursion,
 		    hardirq_count() >> HARDIRQ_SHIFT,
-- 
GitLab


From aee6a166a5401dcfcb17fcdc055e5edf2a4f4042 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:17 +0200
Subject: [PATCH 0692/2198] x86: beautify vmlinux_32.lds.S

Beautify vmlinux_32.lds.S:

 - Use tabs for indent
 - Located curly braces like in C code
 - Rearranged a few comments

To see actual differences use "git diff -b" which
ignore 'whitespace' changes.

The beautification is done to prepare a unification
of the _32 and _64 variants of the linker scripts.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-1-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux_32.lds.S | 376 ++++++++++++++++---------------
 1 file changed, 200 insertions(+), 176 deletions(-)

diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 62ad500d55f3e..fffa45a1036f6 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -22,196 +22,220 @@ ENTRY(phys_startup_32)
 jiffies = jiffies_64;
 
 PHDRS {
-	text PT_LOAD FLAGS(5);	/* R_E */
-	data PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(0);	/* ___ */
+	text PT_LOAD FLAGS(5);		/* R_E */
+	data PT_LOAD FLAGS(7);		/* RWE */
+	note PT_NOTE FLAGS(0);		/* ___ */
 }
 SECTIONS
 {
-  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-  phys_startup_32 = startup_32 - LOAD_OFFSET;
-
-  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-  	_text = .;			/* Text and read-only data */
-	*(.text.head)
-  } :text = 0x9090
-
-  /* read-only */
-  .text : AT(ADDR(.text) - LOAD_OFFSET) {
-	. = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
-	*(.text.page_aligned)
-	TEXT_TEXT
-	SCHED_TEXT
-	LOCK_TEXT
-	KPROBES_TEXT
-	IRQENTRY_TEXT
-	*(.fixup)
-	*(.gnu.warning)
-  	_etext = .;			/* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);		/* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-  	__start___ex_table = .;
-	 *(__ex_table)
-  	__stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  /* writeable */
-  . = ALIGN(PAGE_SIZE);
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {	/* Data */
-	DATA_DATA
-	CONSTRUCTORS
+	. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+	phys_startup_32 = startup_32 - LOAD_OFFSET;
+
+	/* Text and read-only data */
+	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+		_text = .;
+		*(.text.head)
+	} :text = 0x9090
+
+	/* read-only */
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		/* not really needed, already page aligned */
+		. = ALIGN(PAGE_SIZE);
+		*(.text.page_aligned)
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		/* End of text section */
+		_etext = .;
+	} :text = 0x9090
+
+	NOTES :text :note
+
+	/* Exception table */
+	. = ALIGN(16);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		 *(__ex_table)
+		__stop___ex_table = .;
+	} :text = 0x9090
+
+	RODATA
+
+	/* writeable */
+	. = ALIGN(PAGE_SIZE);
+	/* Data */
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		CONSTRUCTORS
 	} :data
 
-  . = ALIGN(PAGE_SIZE);
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-  	__nosave_begin = .;
-	*(.data.nosave)
-  	. = ALIGN(PAGE_SIZE);
-  	__nosave_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-	*(.data.page_aligned)
-	*(.data.idt)
-  }
-
-  . = ALIGN(32);
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-	*(.data.cacheline_aligned)
-  }
-
-  /* rarely changed data like cpu maps */
-  . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-	*(.data.read_mostly)
-	_edata = .;		/* End of data section */
-  }
-
-  . = ALIGN(THREAD_SIZE);	/* init_task */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-	*(.data.init_task)
-  }
-
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-  	__smp_locks = .;
-	*(.smp_locks)
-	__smp_locks_end = .;
-  }
-  /* will be freed after init
-   * Following ALIGN() is required to make sure no other data falls on the
-   * same page where __smp_alt_end is pointing as that page might be freed
-   * after boot. Always make sure that ALIGN() directive is present after
-   * the section which contains __smp_alt_end.
-   */
-  . = ALIGN(PAGE_SIZE);
-
-  /* will be freed after init */
-  . = ALIGN(PAGE_SIZE);		/* Init code and data */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-  	__init_begin = .;
-	_sinittext = .;
-	INIT_TEXT
-	_einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-	INIT_DATA
-  }
-  . = ALIGN(16);
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-  	__setup_start = .;
-	*(.init.setup)
-  	__setup_end = .;
-   }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-  	__initcall_start = .;
-	INITCALLS
-  	__initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-  	__con_initcall_start = .;
-	*(.con_initcall.init)
-  	__con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-	__x86_cpu_dev_start = .;
-	*(.x86_cpu_dev.init)
-	__x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-  . = ALIGN(4);
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-  	__alt_instructions = .;
-	*(.altinstructions)
-	__alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-	*(.altinstr_replacement)
-  }
-  . = ALIGN(4);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-  	__parainstructions = .;
-	*(.parainstructions)
-  	__parainstructions_end = .;
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-	EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-	EXIT_DATA
-  }
+	. = ALIGN(PAGE_SIZE);
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		*(.data.page_aligned)
+		*(.data.idt)
+	}
+
+	. = ALIGN(32);
+	.data.cacheline_aligned :
+		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		*(.data.cacheline_aligned)
+	}
+
+	/* rarely changed data like cpu maps */
+	. = ALIGN(32);
+	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+		*(.data.read_mostly)
+
+		/* End of data section */
+		_edata = .;
+	}
+
+	/* init_task */
+	. = ALIGN(THREAD_SIZE);
+	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		*(.data.init_task)
+	}
+
+	. = ALIGN(PAGE_SIZE);
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		/* might get freed after init */
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+	}
+	/* will be freed after init
+	 * Following ALIGN() is required to make sure no other data falls on the
+	 * same page where __smp_alt_end is pointing as that page might be freed
+	 * after boot. Always make sure that ALIGN() directive is present after
+	 * the section which contains __smp_alt_end.
+	 */
+	. = ALIGN(PAGE_SIZE);
+
+	/* Init code and data - will be freed after init */
+	. = ALIGN(PAGE_SIZE);
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .;
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		INIT_DATA
+	}
+
+	. = ALIGN(16);
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
+		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
+	}
+
+	SECURITY_INIT
+
+	. = ALIGN(4);
+	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
+	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+		*(.altinstr_replacement)
+	}
+
+	. = ALIGN(4);
+	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+		__parainstructions = .;
+		*(.parainstructions)
+		__parainstructions_end = .;
+	}
+
+	/*
+	 * .exit.text is discard at runtime, not link time, to deal with
+	 *  references from .altinstructions and .eh_frame
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
+
 #if defined(CONFIG_BLK_DEV_INITRD)
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-	__initramfs_start = .;
-	*(.init.ramfs)
-	__initramfs_end = .;
-  }
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
-  PERCPU(PAGE_SIZE)
-  . = ALIGN(PAGE_SIZE);
-  /* freed after init ends here */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-	__init_end = .;
-	__bss_start = .;		/* BSS */
-	*(.bss.page_aligned)
-	*(.bss)
-	. = ALIGN(4);
-	__bss_stop = .;
-  }
 
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+	PERCPU(PAGE_SIZE)
+
 	. = ALIGN(PAGE_SIZE);
-	__brk_base = . ;
- 	. += 64 * 1024 ;	/* 64k alignment slop space */
-	*(.brk_reservation)	/* areas brk users have reserved */
-	__brk_limit = . ;
-  }
+	/* freed after init ends here */
+
+	/* BSS */
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		__init_end = .;
+		__bss_start = .;
+		*(.bss.page_aligned)
+		*(.bss)
+		. = ALIGN(4);
+		__bss_stop = .;
+	}
 
-  .end : AT(ADDR(.end) - LOAD_OFFSET) {
-	_end = . ;
-  }
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__brk_base = .;
+		. += 64 * 1024;		/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = .;
+	}
 
-  /* Sections to be discarded */
-  /DISCARD/ : {
-	*(.exitcall.exit)
-	*(.discard)
+	.end : AT(ADDR(.end) - LOAD_OFFSET) {
+		_end = . ;
 	}
 
-  STABS_DEBUG
+	/* Sections to be discarded */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+		*(.discard)
+	}
 
-  DWARF_DEBUG
+	STABS_DEBUG
+	DWARF_DEBUG
 }
 
 /*
-- 
GitLab


From 17ce265d6a1789eae5eb739a3bb7fcffdb3e87c5 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:18 +0200
Subject: [PATCH 0693/2198] x86, vmlinux.lds: unify header/footer

Merge everything except PHDRS and SECTIONS into
vmlinux.lds.S.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 77 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 37 ---------------
 arch/x86/kernel/vmlinux_64.lds.S | 42 -----------------
 3 files changed, 77 insertions(+), 79 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 849ee611f0138..d113642c13458 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -1,5 +1,82 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation and unification done by Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value is should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
+#ifdef CONFIG_X86_32
+#define LOAD_OFFSET __PAGE_OFFSET
+#else
+#define LOAD_OFFSET __START_KERNEL_map
+#endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386     /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
 # include "vmlinux_64.lds.S"
 #endif
+
+
+#ifdef CONFIG_X86_32
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+        "kernel image bigger than KERNEL_IMAGE_SIZE")
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+	"kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+        "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index fffa45a1036f6..4c985fcd9ab46 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,26 +1,3 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
 PHDRS {
 	text PT_LOAD FLAGS(5);		/* R_E */
 	data PT_LOAD FLAGS(7);		/* RWE */
@@ -237,17 +214,3 @@ SECTIONS
 	STABS_DEBUG
 	DWARF_DEBUG
 }
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 6d5a5b05eaa20..7f1cc3d5fef28 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,19 +1,3 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386	/* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
 PHDRS {
 	text PT_LOAD FLAGS(5);		/* R_E */
 	data PT_LOAD FLAGS(7);		/* RWE */
@@ -308,29 +292,3 @@ SECTIONS
 	STABS_DEBUG
 	DWARF_DEBUG
 }
-
-/*
- * Per-cpu symbols which need to be offset from __per_cpu_load
- * for the boot processor.
- */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-	"kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
-        "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
-- 
GitLab


From afb8095a7eab32e5760613fa73d2f80a39cc45bf Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:19 +0200
Subject: [PATCH 0694/2198] x86, vmlinux.lds: unify PHDRS

PHDRS are not equal for the two - so
use ifdefs to cover up for that.

On the assumption that they may become equal the ifdef
is inside the PHDRS definiton.

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-3-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 13 +++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S |  5 -----
 arch/x86/kernel/vmlinux_64.lds.S | 11 -----------
 3 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d113642c13458..1a1b303a42726 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -40,6 +40,19 @@ ENTRY(phys_startup_64)
 jiffies_64 = jiffies;
 #endif
 
+PHDRS {
+	text PT_LOAD FLAGS(5);          /* R_E */
+	data PT_LOAD FLAGS(7);          /* RWE */
+#ifdef CONFIG_X86_64
+	user PT_LOAD FLAGS(7);          /* RWE */
+	data.init PT_LOAD FLAGS(7);     /* RWE */
+#ifdef CONFIG_SMP
+	percpu PT_LOAD FLAGS(7);        /* RWE */
+#endif
+	data.init2 PT_LOAD FLAGS(7);    /* RWE */
+#endif
+	note PT_NOTE FLAGS(0);          /* ___ */
+}
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 4c985fcd9ab46..4fd40dc501727 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,8 +1,3 @@
-PHDRS {
-	text PT_LOAD FLAGS(5);		/* R_E */
-	data PT_LOAD FLAGS(7);		/* RWE */
-	note PT_NOTE FLAGS(0);		/* ___ */
-}
 SECTIONS
 {
 	. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 7f1cc3d5fef28..6e7cbee0e87f0 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,14 +1,3 @@
-PHDRS {
-	text PT_LOAD FLAGS(5);		/* R_E */
-	data PT_LOAD FLAGS(7);		/* RWE */
-	user PT_LOAD FLAGS(7);		/* RWE */
-	data.init PT_LOAD FLAGS(7);	/* RWE */
-#ifdef CONFIG_SMP
-	percpu PT_LOAD FLAGS(7);	/* RWE */
-#endif
-	data.init2 PT_LOAD FLAGS(7);	/* RWE */
-	note PT_NOTE FLAGS(0);		/* ___ */
-}
 SECTIONS
 {
 	. = __START_KERNEL;
-- 
GitLab


From 444e0ae4831f99ba25062d9a5ccb7117c62841a0 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:20 +0200
Subject: [PATCH 0695/2198] x86, vmlinux.lds: unify start/end of SECTIONS

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-4-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 14 ++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S |  9 ---------
 arch/x86/kernel/vmlinux_64.lds.S |  9 ---------
 3 files changed, 14 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1a1b303a42726..845776fe52984 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -54,12 +54,26 @@ PHDRS {
 	note PT_NOTE FLAGS(0);          /* ___ */
 }
 
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+        . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+        phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+        . = __START_KERNEL;
+        phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
 # include "vmlinux_64.lds.S"
 #endif
 
+        STABS_DEBUG
+        DWARF_DEBUG
+}
+
 
 #ifdef CONFIG_X86_32
 ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 4fd40dc501727..3d3d49c31b0e3 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,8 +1,3 @@
-SECTIONS
-{
-	. = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-	phys_startup_32 = startup_32 - LOAD_OFFSET;
-
 	/* Text and read-only data */
 	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
 		_text = .;
@@ -205,7 +200,3 @@ SECTIONS
 		*(.exitcall.exit)
 		*(.discard)
 	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 6e7cbee0e87f0..2d7fa2016c316 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,8 +1,3 @@
-SECTIONS
-{
-	. = __START_KERNEL;
-	phys_startup_64 = startup_64 - LOAD_OFFSET;
-
 	/* Text and read-only data */
 	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
 		_text = .;
@@ -277,7 +272,3 @@ SECTIONS
 		*(.eh_frame)
 		*(.discard)
 	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
-- 
GitLab


From dfc20895d944cfa81d8ff00809b68ecb8f72cbb0 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:21 +0200
Subject: [PATCH 0696/2198] x86, vmlinux.lds: unify .text output sections

32 bit x86 had a dedicated .text.head output section,
whereas 64 bit had it all in a single output section.

In the unified version the dedicated .text.head output section
was kept to have full control over the head code.

32 bit:

- Moved definition of _stext to the linker script.
  The definition is located _after_ .text.page_aligned as this
  is what 32 bit did before.

The ALIGN(8) was introduced so we hit the exact same address
(on the tested config) before and after the move.

I assume that it is a bug that _stext did not cover the
.text.page_aligned section - if this is true it can be fixed
in a follow-up patch (and the ugly ALIGN() can be dropped).

[ Impact: 64-bit: cleanup, 32-bit: use the 64-bit linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-5-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/head_32.S        |  7 -------
 arch/x86/kernel/vmlinux.lds.S    | 31 +++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 24 ------------------------
 arch/x86/kernel/vmlinux_64.lds.S | 20 --------------------
 4 files changed, 31 insertions(+), 51 deletions(-)

diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 30683883e0cdc..dc5ed4bdd88d3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -608,13 +608,6 @@ ignore_int:
 ENTRY(initial_code)
 	.long i386_start_kernel
 
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
 /*
  * BSS section
  */
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 845776fe52984..a7c88bb436503 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -64,6 +64,37 @@ SECTIONS
         phys_startup_64 = startup_64 - LOAD_OFFSET;
 #endif
 
+	/* Text and read-only data */
+
+	/* bootstrapping code */
+	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+		_text = .;
+		*(.text.head)
+	} :text = 0x9090
+
+	/* The rest of the text */
+	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+		/* not really needed, already page aligned */
+		. = ALIGN(PAGE_SIZE);
+		*(.text.page_aligned)
+#endif
+		. = ALIGN(8);
+		_stext = .;
+		TEXT_TEXT
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		/* End of text section */
+		_etext = .;
+	} :text = 0x9090
+
+	NOTES :text :note
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 3d3d49c31b0e3..854009288ec4f 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,27 +1,3 @@
-	/* Text and read-only data */
-	.text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-		_text = .;
-		*(.text.head)
-	} :text = 0x9090
-
-	/* read-only */
-	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		/* not really needed, already page aligned */
-		. = ALIGN(PAGE_SIZE);
-		*(.text.page_aligned)
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-		/* End of text section */
-		_etext = .;
-	} :text = 0x9090
-
-	NOTES :text :note
-
 	/* Exception table */
 	. = ALIGN(16);
 	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 2d7fa2016c316..b5d43670d809d 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,23 +1,3 @@
-	/* Text and read-only data */
-	.text :  AT(ADDR(.text) - LOAD_OFFSET) {
-		_text = .;
-		/* First the code that has to be first for bootstrapping */
-		*(.text.head)
-		_stext = .;
-		/* Then the rest */
-		TEXT_TEXT
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-		/* End of text section */
-		_etext = .;
-	} :text = 0x9090
-
-	NOTES :text :note
-
 	/* Exception table */
 	. = ALIGN(16);
 	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-- 
GitLab


From 448bc3ab0d03e77fee8e4264de0d001fc87bc164 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:22 +0200
Subject: [PATCH 0697/2198] x86, vmlinux.lds: unify exception table

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-6-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 10 ++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 10 ----------
 arch/x86/kernel/vmlinux_64.lds.S | 10 ----------
 3 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index a7c88bb436503..67164f6f092f9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -94,6 +94,16 @@ SECTIONS
 
 	NOTES :text :note
 
+	/* Exception table */
+	. = ALIGN(16);
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	} :text = 0x9090
+
+	RODATA
+
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 854009288ec4f..920cc6989cc73 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,13 +1,3 @@
-	/* Exception table */
-	. = ALIGN(16);
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		 *(__ex_table)
-		__stop___ex_table = .;
-	} :text = 0x9090
-
-	RODATA
-
 	/* writeable */
 	. = ALIGN(PAGE_SIZE);
 	/* Data */
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index b5d43670d809d..641f3f991a014 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,13 +1,3 @@
-	/* Exception table */
-	. = ALIGN(16);
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		 *(__ex_table)
-		__stop___ex_table = .;
-	} :text = 0x9090
-
-	RODATA
-
 	/* Align data segment to page size boundary */
 	. = ALIGN(PAGE_SIZE);
 	/* Data */
-- 
GitLab


From 1f6397bac55040cd520d9eaf299e155a7aa01d5f Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:23 +0200
Subject: [PATCH 0698/2198] x86, vmlinux.lds: unify data output sections

For 64 bit the following functional changes are introduced:

 - .data.page_aligned has moved
 - .data.cacheline_aligned has moved
 - .data.read_mostly has moved
 - ALIGN() moved out of output section for .data.cacheline_aligned
 - ALIGN() moved out of output section for .data.page_aligned

Notice that 32 bit and 64 bit has different location of _edata.
.data_nosave is 32 bit only as 64 bit is special due to PERCPU.

[ Impact: 32-bit: cleanup, 64-bit: use 32-bit linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-7-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 55 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 37 ---------------------
 arch/x86/kernel/vmlinux_64.lds.S | 28 ----------------
 3 files changed, 55 insertions(+), 65 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 67164f6f092f9..067bdb012dadd 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -104,6 +104,61 @@ SECTIONS
 
 	RODATA
 
+	/* Data */
+	. = ALIGN(PAGE_SIZE);
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {
+		DATA_DATA
+		CONSTRUCTORS
+
+#ifdef CONFIG_X86_64
+		/* End of data section */
+		_edata = .;
+#endif
+	} :data
+
+#ifdef CONFIG_X86_32
+	/* 32 bit has nosave before _edata */
+	. = ALIGN(PAGE_SIZE);
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	}
+#endif
+
+	. = ALIGN(PAGE_SIZE);
+	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+		*(.data.page_aligned)
+		*(.data.idt)
+	}
+
+#ifdef CONFIG_X86_32
+	. = ALIGN(32);
+#else
+	. = ALIGN(PAGE_SIZE);
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+#endif
+	.data.cacheline_aligned :
+		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+		*(.data.cacheline_aligned)
+	}
+
+	/* rarely changed data like cpu maps */
+#ifdef CONFIG_X86_32
+	. = ALIGN(32);
+#else
+	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+#endif
+	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+		*(.data.read_mostly)
+
+#ifdef CONFIG_X86_32
+		/* End of data section */
+		_edata = .;
+#endif
+	}
+
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 920cc6989cc73..8ade84687b2d0 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,40 +1,3 @@
-	/* writeable */
-	. = ALIGN(PAGE_SIZE);
-	/* Data */
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-		DATA_DATA
-		CONSTRUCTORS
-	} :data
-
-	. = ALIGN(PAGE_SIZE);
-	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	}
-
-	. = ALIGN(PAGE_SIZE);
-	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-		*(.data.page_aligned)
-		*(.data.idt)
-	}
-
-	. = ALIGN(32);
-	.data.cacheline_aligned :
-		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-		*(.data.cacheline_aligned)
-	}
-
-	/* rarely changed data like cpu maps */
-	. = ALIGN(32);
-	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-		*(.data.read_mostly)
-
-		/* End of data section */
-		_edata = .;
-	}
-
 	/* init_task */
 	. = ALIGN(THREAD_SIZE);
 	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 641f3f991a014..826270147b5a0 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,26 +1,3 @@
-	/* Align data segment to page size boundary */
-	. = ALIGN(PAGE_SIZE);
-	/* Data */
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {
-		DATA_DATA
-		CONSTRUCTORS
-		/* End of data section */
-		_edata = .;
-	} :data
-
-
-	.data.cacheline_aligned :
-		AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-		*(.data.cacheline_aligned)
-	}
-
-	. = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-	.data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-		*(.data.read_mostly)
-	}
-
 #define VSYSCALL_ADDR (-10*1024*1024)
 #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
                             SIZEOF(.data.read_mostly) + 4095) & ~(4095))
@@ -95,11 +72,6 @@
 		*(.data.init_task)
 	} :data.init
 
-	.data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		*(.data.page_aligned)
-	}
-
 	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
 		/* might get freed after init */
 		. = ALIGN(PAGE_SIZE);
-- 
GitLab


From ff6f87e1626e10beef675084c9b5384a9477e3d5 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:24 +0200
Subject: [PATCH 0699/2198] x86, vmlinux.lds: move vsyscall output sections

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-8-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 71 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux_64.lds.S | 68 ------------------------------
 2 files changed, 71 insertions(+), 68 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 067bdb012dadd..b3106c2a0373a 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -159,6 +159,77 @@ SECTIONS
 #endif
 	}
 
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+	. = VSYSCALL_ADDR;
+	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+		*(.vsyscall_0)
+	} :user
+
+	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+		*(.vsyscall_fn)
+	}
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+		*(.vsyscall_gtod_data)
+	}
+
+	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+		*(.vsyscall_clock)
+	}
+	vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+		*(.vsyscall_1)
+	}
+	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+		*(.vsyscall_2)
+	}
+
+	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+		*(.vgetcpu_mode)
+	}
+	vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+	.jiffies : AT(VLOAD(.jiffies)) {
+		*(.jiffies)
+	}
+	jiffies = VVIRT(.jiffies);
+
+	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+		*(.vsyscall_3)
+	}
+
+	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 826270147b5a0..013aa0e1dd3a1 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,71 +1,3 @@
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
-                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
-	. = VSYSCALL_ADDR;
-	.vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
-		*(.vsyscall_0)
-	} :user
-
-	__vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
-		*(.vsyscall_fn)
-	}
-
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
-		*(.vsyscall_gtod_data)
-	}
-
-	vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-	.vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
-		*(.vsyscall_clock)
-	}
-	vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
-	.vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
-		*(.vsyscall_1)
-	}
-	.vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
-		*(.vsyscall_2)
-	}
-
-	.vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
-		*(.vgetcpu_mode)
-	}
-	vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
-	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-	.jiffies : AT(VLOAD(.jiffies)) {
-		*(.jiffies)
-	}
-	jiffies = VVIRT(.jiffies);
-
-	.vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
-		*(.vsyscall_3)
-	}
-
-	. = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
 	/* init_task */
 	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
 		. = ALIGN(THREAD_SIZE);
-- 
GitLab


From e58bdaa8f810332e5c1760ce496b01e07d51642c Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:25 +0200
Subject: [PATCH 0700/2198] x86, vmlinux.lds: unify first part of initdata

32-bit:

 - Move definition of __init_begin outside output_section
   because it covers more than one section
 - Move ALIGN() for end-of-section inside .smp_locks output section.
   Same effect but the intent is better documented that
   we need both start and end aligned.

64-bit:

 - Move ALIGN() outside output section in .init.setup
 - Deleted unused __smp_alt_* symbols

None of the above should result in any functional change.

[ Impact: refactor and unify linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-9-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 61 ++++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 60 -------------------------------
 arch/x86/kernel/vmlinux_64.lds.S | 59 ------------------------------
 3 files changed, 61 insertions(+), 119 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index b3106c2a0373a..8b203c4ced9bc 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -231,6 +231,67 @@ SECTIONS
 
 #endif /* CONFIG_X86_64 */
 
+	/* init_task */
+	. = ALIGN(THREAD_SIZE);
+	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+		*(.data.init_task)
+	}
+#ifdef CONFIG_X86_64
+	 :data.init
+#endif
+
+	/*
+	 * smp_locks might be freed after init
+	 * start/end must be page aligned
+	 */
+	. = ALIGN(PAGE_SIZE);
+	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+		__smp_locks = .;
+		*(.smp_locks)
+		__smp_locks_end = .;
+		. = ALIGN(PAGE_SIZE);
+	}
+
+	/* Init code and data - will be freed after init */
+	. = ALIGN(PAGE_SIZE);
+	__init_begin = .; /* paired with __init_end */
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+		INIT_DATA
+	}
+
+	. = ALIGN(16);
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+		__x86_cpu_dev_start = .;
+		*(.x86_cpu_dev.init)
+		__x86_cpu_dev_end = .;
+	}
+
+	SECURITY_INIT
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 8ade84687b2d0..d8ba5394af036 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,63 +1,3 @@
-	/* init_task */
-	. = ALIGN(THREAD_SIZE);
-	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-		*(.data.init_task)
-	}
-
-	. = ALIGN(PAGE_SIZE);
-	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-		/* might get freed after init */
-		__smp_locks = .;
-		*(.smp_locks)
-		__smp_locks_end = .;
-	}
-	/* will be freed after init
-	 * Following ALIGN() is required to make sure no other data falls on the
-	 * same page where __smp_alt_end is pointing as that page might be freed
-	 * after boot. Always make sure that ALIGN() directive is present after
-	 * the section which contains __smp_alt_end.
-	 */
-	. = ALIGN(PAGE_SIZE);
-
-	/* Init code and data - will be freed after init */
-	. = ALIGN(PAGE_SIZE);
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		__init_begin = .;
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
-
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-		INIT_DATA
-	}
-
-	. = ALIGN(16);
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
-
-	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-		__x86_cpu_dev_start = .;
-		*(.x86_cpu_dev.init)
-		__x86_cpu_dev_end = .;
-	}
-
-	SECURITY_INIT
-
 	. = ALIGN(4);
 	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
 		__alt_instructions = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 013aa0e1dd3a1..0e8054e0c5c44 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,62 +1,3 @@
-	/* init_task */
-	.data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-		. = ALIGN(THREAD_SIZE);
-		*(.data.init_task)
-	} :data.init
-
-	.smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-		/* might get freed after init */
-		. = ALIGN(PAGE_SIZE);
-		__smp_alt_begin = .;
-		__smp_locks = .;
-		*(.smp_locks)
-		__smp_locks_end = .;
-		. = ALIGN(PAGE_SIZE);
-		__smp_alt_end = .;
-	}
-
-	/* Init code and data */
-	. = ALIGN(PAGE_SIZE);
-	__init_begin = .;	/* paired with __init_end */
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
-
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-		__initdata_begin = .;
-		INIT_DATA
-		__initdata_end = .;
-	}
-
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		. = ALIGN(16);
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
-
-	.x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-		__x86_cpu_dev_start = .;
-		*(.x86_cpu_dev.init)
-		__x86_cpu_dev_end = .;
-	}
-
-	SECURITY_INIT
-
 	. = ALIGN(8);
 	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
 		__parainstructions = .;
-- 
GitLab


From ae61836289a415351caa524d328110aaeae100d4 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:26 +0200
Subject: [PATCH 0701/2198] x86, vmlinux.lds: unify parainstructions

32 bit:

 - increase alignment from 4 to 8 for .parainstructions
 - increase alignment from 4 to 8 for .altinstructions

64 bit:

 - move ALIGN() outside output section for .altinstructions

None of the above should result in any functional change.

[ Impact: refactor and unify linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-10-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 18 ++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 18 ------------------
 arch/x86/kernel/vmlinux_64.lds.S | 18 ------------------
 3 files changed, 18 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 8b203c4ced9bc..c8dd71ecb56f9 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -291,6 +291,24 @@ SECTIONS
 
 	SECURITY_INIT
 
+	. = ALIGN(8);
+	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+		__parainstructions = .;
+		*(.parainstructions)
+		__parainstructions_end = .;
+	}
+
+	. = ALIGN(8);
+	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+		__alt_instructions = .;
+		*(.altinstructions)
+		__alt_instructions_end = .;
+	}
+
+	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+		*(.altinstr_replacement)
+	}
+
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index d8ba5394af036..5df9647bb5d98 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,21 +1,3 @@
-	. = ALIGN(4);
-	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-		__alt_instructions = .;
-		*(.altinstructions)
-		__alt_instructions_end = .;
-	}
-
-	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-		*(.altinstr_replacement)
-	}
-
-	. = ALIGN(4);
-	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-		__parainstructions = .;
-		*(.parainstructions)
-		__parainstructions_end = .;
-	}
-
 	/*
 	 * .exit.text is discard at runtime, not link time, to deal with
 	 *  references from .altinstructions and .eh_frame
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 0e8054e0c5c44..9ef709669852d 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,21 +1,3 @@
-	. = ALIGN(8);
-	.parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-		__parainstructions = .;
-		*(.parainstructions)
-		__parainstructions_end = .;
-	}
-
-	.altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-		. = ALIGN(8);
-		__alt_instructions = .;
-		*(.altinstructions)
-		__alt_instructions_end = .;
-	}
-
-	.altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-		*(.altinstr_replacement)
-	}
-
 	/*
 	 * .exit.text is discard at runtime, not link time, to deal with
 	 *  references from .altinstructions and .eh_frame
-- 
GitLab


From bf6a57418d5445c98047edbec022c9e54d1526e6 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:27 +0200
Subject: [PATCH 0702/2198] x86, vmlinux.lds: unify .exit.* and .init.ramfs

[ Impact: cleanup ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-11-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 20 ++++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S | 21 ---------------------
 arch/x86/kernel/vmlinux_64.lds.S | 21 ---------------------
 3 files changed, 20 insertions(+), 42 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index c8dd71ecb56f9..1ab62a5fa1a53 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -309,6 +309,26 @@ SECTIONS
 		*(.altinstr_replacement)
 	}
 
+	/*
+	 * .exit.text is discard at runtime, not link time, to deal with
+	 *  references from .altinstructions and .eh_frame
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+		EXIT_TEXT
+	}
+
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+		EXIT_DATA
+	}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
+#endif
 
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 5df9647bb5d98..36c8fbd3c7623 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,24 +1,3 @@
-	/*
-	 * .exit.text is discard at runtime, not link time, to deal with
-	 *  references from .altinstructions and .eh_frame
-	 */
-	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-		EXIT_TEXT
-	}
-
-	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-		EXIT_DATA
-	}
-
-#if defined(CONFIG_BLK_DEV_INITRD)
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
 	PERCPU(PAGE_SIZE)
 
 	. = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 9ef709669852d..1aa536223330b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,24 +1,3 @@
-	/*
-	 * .exit.text is discard at runtime, not link time, to deal with
-	 *  references from .altinstructions and .eh_frame
-	 */
-	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-		EXIT_TEXT
-	}
-
-	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-		EXIT_DATA
-	}
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
 #ifdef CONFIG_SMP
 	/*
 	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-- 
GitLab


From 9d16e78318f174fd4b07916a93e41749d5199267 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:28 +0200
Subject: [PATCH 0703/2198] x86, vmlinux.lds: unify percpu

32 bit:
- move __init_end outside the .bss output section
  It really did not belong in there

[ Impact: 64-bit: cleanup, 32-bit: refactor linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-12-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 30 ++++++++++++++++++++++++++++++
 arch/x86/kernel/vmlinux_32.lds.S |  6 ------
 arch/x86/kernel/vmlinux_64.lds.S | 26 --------------------------
 3 files changed, 30 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1ab62a5fa1a53..1ea2b8571e1a4 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -330,6 +330,36 @@ SECTIONS
 	}
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+	/*
+	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+	 * output PHDR, so the next output section - __data_nosave - should
+	 * start another section data.init2.  Also, pda should be at the head of
+	 * percpu area.  Preallocate it and define the percpu offset symbol
+	 * so that it can be accessed as a percpu variable.
+	 */
+	. = ALIGN(PAGE_SIZE);
+	PERCPU_VADDR(0, :percpu)
+#else
+	PERCPU(PAGE_SIZE)
+#endif
+
+	. = ALIGN(PAGE_SIZE);
+	/* freed after init ends here */
+	__init_end = .;
+
+#ifdef CONFIG_X86_64
+	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+		. = ALIGN(PAGE_SIZE);
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+	} :data.init2
+	/* use another section data.init2, see PERCPU_VADDR() above */
+#endif
+
+
 #ifdef CONFIG_X86_32
 # include "vmlinux_32.lds.S"
 #else
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index 36c8fbd3c7623..d23ee2c15c28d 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -1,11 +1,5 @@
-	PERCPU(PAGE_SIZE)
-
-	. = ALIGN(PAGE_SIZE);
-	/* freed after init ends here */
-
 	/* BSS */
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		__init_end = .;
 		__bss_start = .;
 		*(.bss.page_aligned)
 		*(.bss)
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 1aa536223330b..a53936696a082 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -1,29 +1,3 @@
-#ifdef CONFIG_SMP
-	/*
-	 * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-	 * output PHDR, so the next output section - __data_nosave - should
-	 * start another section data.init2.  Also, pda should be at the head of
-	 * percpu area.  Preallocate it and define the percpu offset symbol
-	 * so that it can be accessed as a percpu variable.
-	 */
-	. = ALIGN(PAGE_SIZE);
-	PERCPU_VADDR(0, :percpu)
-#else
-	PERCPU(PAGE_SIZE)
-#endif
-
-	. = ALIGN(PAGE_SIZE);
-	__init_end = .;
-
-	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-	} :data.init2
-	/* use another section data.init2, see PERCPU_VADDR() above */
-
 	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		. = ALIGN(PAGE_SIZE);
 		__bss_start = .;		/* BSS */
-- 
GitLab


From 091e52c3551d3031343df24b573b770b4c6c72b6 Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Wed, 29 Apr 2009 09:47:29 +0200
Subject: [PATCH 0704/2198] x86, vmlinux.lds: unify remaining parts

32 bit:
- explicit page align .bss
- move ALING() out of .brk output section
- discard *(.eh_frame)

64 bit:
- move ALIGN() out of .bss output section
- move ALIGN() out of .brk output section
- use a dedicated section to define _end

[ Impact: unify and fix section alignments in linker script ]

Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-13-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S    | 32 +++++++++++++++++++++++++++-----
 arch/x86/kernel/vmlinux_32.lds.S | 26 --------------------------
 arch/x86/kernel/vmlinux_64.lds.S | 24 ------------------------
 3 files changed, 27 insertions(+), 55 deletions(-)
 delete mode 100644 arch/x86/kernel/vmlinux_32.lds.S
 delete mode 100644 arch/x86/kernel/vmlinux_64.lds.S

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 1ea2b8571e1a4..ef3e4f1042b56 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -359,12 +359,34 @@ SECTIONS
 	/* use another section data.init2, see PERCPU_VADDR() above */
 #endif
 
+	/* BSS */
+	. = ALIGN(PAGE_SIZE);
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		__bss_start = .;
+		*(.bss.page_aligned)
+		*(.bss)
+		. = ALIGN(4);
+		__bss_stop = .;
+	}
 
-#ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
-#else
-# include "vmlinux_64.lds.S"
-#endif
+	. = ALIGN(PAGE_SIZE);
+	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+		__brk_base = .;
+		. += 64 * 1024;		/* 64k alignment slop space */
+		*(.brk_reservation)	/* areas brk users have reserved */
+		__brk_limit = .;
+	}
+
+	.end : AT(ADDR(.end) - LOAD_OFFSET) {
+		_end = .;
+	}
+
+	/* Sections to be discarded */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+		*(.eh_frame)
+		*(.discard)
+	}
 
         STABS_DEBUG
         DWARF_DEBUG
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644
index d23ee2c15c28d..0000000000000
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,26 +0,0 @@
-	/* BSS */
-	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		__bss_start = .;
-		*(.bss.page_aligned)
-		*(.bss)
-		. = ALIGN(4);
-		__bss_stop = .;
-	}
-
-	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__brk_base = .;
-		. += 64 * 1024;		/* 64k alignment slop space */
-		*(.brk_reservation)	/* areas brk users have reserved */
-		__brk_limit = .;
-	}
-
-	.end : AT(ADDR(.end) - LOAD_OFFSET) {
-		_end = . ;
-	}
-
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.discard)
-	}
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644
index a53936696a082..0000000000000
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,24 +0,0 @@
-	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__bss_start = .;		/* BSS */
-		*(.bss.page_aligned)
-		*(.bss)
-		__bss_stop = .;
-	}
-
-	.brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-		. = ALIGN(PAGE_SIZE);
-		__brk_base = .;
-		. += 64 * 1024;		/* 64k alignment slop space */
-		*(.brk_reservation)	/* areas brk users have reserved */
-		__brk_limit = .;
-	}
-
-	_end = . ;
-
-	/* Sections to be discarded */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-		*(.eh_frame)
-		*(.discard)
-	}
-- 
GitLab


From 91fd7fe809bdf4d8aa56559d17b9f25a1a6fe732 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 10:58:38 +0200
Subject: [PATCH 0705/2198] x86, vmlinux.lds: add copyright

Acked-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index ef3e4f1042b56..0bdbaa5796961 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -3,7 +3,8 @@
  *
  * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
  *
- * Modernisation and unification done by Sam Ravnborg <sam@ravnborg.org>
+ * Modernisation, unification and other changes and fixes:
+ *   Copyright (C) 2007-2009  Sam Ravnborg <sam@ravnborg.org>
  *
  *
  * Don't define absolute symbols until and unless you know that symbol
-- 
GitLab


From 0492e1bb8fe7d122901c9f3af75e537d4129712e Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:49 +0100
Subject: [PATCH 0706/2198] tracing: x86, mmiotrace: code
 consistency/legibility improvement

kmmio_probe being *p and kmmio_fault_page being sometimes *f and
sometimes *p is not helpful.

[ Impact: cleanup ]

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-3-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/kmmio.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 4f115e00486bc..869181a917d96 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -97,13 +97,13 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 {
 	struct list_head *head;
-	struct kmmio_fault_page *p;
+	struct kmmio_fault_page *f;
 
 	page &= PAGE_MASK;
 	head = kmmio_page_list(page);
-	list_for_each_entry_rcu(p, head, list) {
-		if (p->page == page)
-			return p;
+	list_for_each_entry_rcu(f, head, list) {
+		if (f->page == page)
+			return f;
 	}
 	return NULL;
 }
@@ -439,12 +439,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
 						head,
 						struct kmmio_delayed_release,
 						rcu);
-	struct kmmio_fault_page *p = dr->release_list;
-	while (p) {
-		struct kmmio_fault_page *next = p->release_next;
-		BUG_ON(p->count);
-		kfree(p);
-		p = next;
+	struct kmmio_fault_page *f = dr->release_list;
+	while (f) {
+		struct kmmio_fault_page *next = f->release_next;
+		BUG_ON(f->count);
+		kfree(f);
+		f = next;
 	}
 	kfree(dr);
 }
@@ -453,19 +453,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
 	struct kmmio_delayed_release *dr =
 		container_of(head, struct kmmio_delayed_release, rcu);
-	struct kmmio_fault_page *p = dr->release_list;
+	struct kmmio_fault_page *f = dr->release_list;
 	struct kmmio_fault_page **prevp = &dr->release_list;
 	unsigned long flags;
 
 	spin_lock_irqsave(&kmmio_lock, flags);
-	while (p) {
-		if (!p->count) {
-			list_del_rcu(&p->list);
-			prevp = &p->release_next;
+	while (f) {
+		if (!f->count) {
+			list_del_rcu(&f->list);
+			prevp = &f->release_next;
 		} else {
-			*prevp = p->release_next;
+			*prevp = f->release_next;
 		}
-		p = p->release_next;
+		f = f->release_next;
 	}
 	spin_unlock_irqrestore(&kmmio_lock, flags);
 
-- 
GitLab


From 46e91d00b1165b14b484aa33800e1bba0794ae1a Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:50 +0100
Subject: [PATCH 0707/2198] tracing: x86, mmiotrace: refactor clearing/restore
 of page presence

* change function names to clear_* from set_*: in reality we only clear
  and restore page presence, and never unconditionally set present.
  Using clear_*({true, false}, ...) is therefore more honest than
  set_*({false, true}, ...)

* upgrade presence storage to pteval_t: doing user-space tracing will
  require saving and manipulation of the _PAGE_PROTNONE bit, in addition
  to the existing _PAGE_PRESENT changes, and having multiple bools stored
  and passed around does not seem optimal

[ Impact: refactor, clean up mmiotrace code ]

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-4-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/kmmio.c | 43 ++++++++++++++++++++++---------------------
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 869181a917d96..a769d1a2d93b4 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
 	struct list_head list;
 	struct kmmio_fault_page *release_next;
 	unsigned long page; /* location of the fault page */
-	bool old_presence; /* page presence prior to arming */
+	pteval_t old_presence; /* page presence prior to arming */
 	bool armed;
 
 	/*
@@ -108,49 +108,51 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 	return NULL;
 }
 
-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
 {
 	pmdval_t v = pmd_val(*pmd);
-	*old = !!(v & _PAGE_PRESENT);
-	v &= ~_PAGE_PRESENT;
-	if (present)
-		v |= _PAGE_PRESENT;
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
 	set_pmd(pmd, __pmd(v));
 }
 
-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
 {
 	pteval_t v = pte_val(*pte);
-	*old = !!(v & _PAGE_PRESENT);
-	v &= ~_PAGE_PRESENT;
-	if (present)
-		v |= _PAGE_PRESENT;
+	if (clear) {
+		*old = v & _PAGE_PRESENT;
+		v &= ~_PAGE_PRESENT;
+	} else	/* presume this has been called with clear==true previously */
+		v |= *old;
 	set_pte_atomic(pte, __pte(v));
 }
 
-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 {
 	unsigned int level;
-	pte_t *pte = lookup_address(addr, &level);
+	pte_t *pte = lookup_address(f->page, &level);
 
 	if (!pte) {
-		pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+		pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
 		return -1;
 	}
 
 	switch (level) {
 	case PG_LEVEL_2M:
-		set_pmd_presence((pmd_t *)pte, present, old);
+		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
 		break;
 	case PG_LEVEL_4K:
-		set_pte_presence(pte, present, old);
+		clear_pte_presence(pte, clear, &f->old_presence);
 		break;
 	default:
 		pr_err("kmmio: unexpected page level 0x%x.\n", level);
 		return -1;
 	}
 
-	__flush_tlb_one(addr);
+	__flush_tlb_one(f->page);
 	return 0;
 }
 
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 	WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
 	if (f->armed) {
 		pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
-					f->page, f->count, f->old_presence);
+					f->page, f->count, !!f->old_presence);
 	}
-	ret = set_page_presence(f->page, false, &f->old_presence);
+	ret = clear_page_presence(f, true);
 	WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
 	f->armed = true;
 	return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 /** Restore the given page to saved presence state. */
 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-	bool tmp;
-	int ret = set_page_presence(f->page, f->old_presence, &tmp);
+	int ret = clear_page_presence(f, false);
 	WARN_ONCE(ret < 0,
 			KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
 	f->armed = false;
-- 
GitLab


From 0f9a623dd6c9b5b4dd00c232f29525bfc7a8ecf2 Mon Sep 17 00:00:00 2001
From: Stuart Bennett <stuart@freedesktop.org>
Date: Tue, 28 Apr 2009 20:17:51 +0100
Subject: [PATCH 0708/2198] tracing: x86, mmiotrace: only register for die
 notifier when tracer active

Follow up to afcfe024aebd74b0984a41af9a34e009cf5badaf in Linus' tree
("x86: mmiotrace: quieten spurious warning message")

Signed-off-by: Stuart Bennett <stuart@freedesktop.org>
Acked-by: Pekka Paalanen <pq@iki.fi>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1240946271-7083-5-git-send-email-stuart@freedesktop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/kmmio.c       | 27 ++++++++++++++++++++++-----
 arch/x86/mm/mmio-mod.c    |  2 ++
 include/linux/mmiotrace.h |  2 ++
 3 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index a769d1a2d93b4..256ce643b0ba6 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -311,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
 	struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
 	if (!ctx->active) {
-		pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+		/*
+		 * debug traps without an active context are due to either
+		 * something external causing them (f.e. using a debugger while
+		 * mmio tracing enabled), or erroneous behaviour
+		 */
+		pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
 							smp_processor_id());
 		goto out;
 	}
@@ -529,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 }
 EXPORT_SYMBOL(unregister_kmmio_probe);
 
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-								void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
 {
 	struct die_args *arg = args;
 
@@ -545,11 +550,23 @@ static struct notifier_block nb_die = {
 	.notifier_call = kmmio_die_notifier
 };
 
-static int __init init_kmmio(void)
+int kmmio_init(void)
 {
 	int i;
+
 	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
 		INIT_LIST_HEAD(&kmmio_page_table[i]);
+
 	return register_die_notifier(&nb_die);
 }
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+	int i;
+
+	unregister_die_notifier(&nb_die);
+	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+			KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+	}
+}
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index c9342ed8b402d..132772a8ec57a 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
 
 	if (nommiotrace)
 		pr_info(NAME "MMIO tracing disabled.\n");
+	kmmio_init();
 	enter_uniprocessor();
 	spin_lock_irq(&trace_lock);
 	atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
 
 	clear_trace_list(); /* guarantees: no more kmmio callbacks */
 	leave_uniprocessor();
+	kmmio_cleanup();
 	pr_info(NAME "disabled.\n");
 out:
 	mutex_unlock(&mmiotrace_mutex);
diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h
index 3d1b7bde12836..97491f78b08cd 100644
--- a/include/linux/mmiotrace.h
+++ b/include/linux/mmiotrace.h
@@ -30,6 +30,8 @@ extern unsigned int kmmio_count;
 
 extern int register_kmmio_probe(struct kmmio_probe *p);
 extern void unregister_kmmio_probe(struct kmmio_probe *p);
+extern int kmmio_init(void);
+extern void kmmio_cleanup(void);
 
 #ifdef CONFIG_MMIOTRACE
 /* kmmio is active by some kmmio_probes? */
-- 
GitLab


From fd0731944333db6e9e91b6954c6ef95f4b71ab04 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 12:56:58 +0200
Subject: [PATCH 0709/2198] x86, vmlinux.lds: fix relocatable symbols

__init_begin/_end symbols should be inside sections as well,
otherwise the relocatable kernel gets confused when freeing
init sections in the wrong place.

[ Impact: fix bootup crash ]

Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Tim Abbott <tabbott@MIT.EDU>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20090429105056.GA28720@uranus.ravnborg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vmlinux.lds.S | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 0bdbaa5796961..4c85b2e2bb652 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -255,8 +255,8 @@ SECTIONS
 
 	/* Init code and data - will be freed after init */
 	. = ALIGN(PAGE_SIZE);
-	__init_begin = .; /* paired with __init_end */
 	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .; /* paired with __init_end */
 		_sinittext = .;
 		INIT_TEXT
 		_einittext = .;
@@ -346,8 +346,11 @@ SECTIONS
 #endif
 
 	. = ALIGN(PAGE_SIZE);
+
 	/* freed after init ends here */
-	__init_end = .;
+	.init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+		__init_end = .;
+	}
 
 #ifdef CONFIG_X86_64
 	.data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-- 
GitLab


From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:47 -0500
Subject: [PATCH 0710/2198] tracing/filters: move preds into event_filter
 object

Create a new event_filter object, and move the pred-related members
out of the call and subsystem objects and into the filter object - the
details of the filter implementation don't need to be exposed in the
call and subsystem in any case, and it will also help make the new
parser implementation a little cleaner.

[ Impact: refactor trace-filter code to prepare for new features ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905887.6416.119.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |   4 +-
 kernel/trace/trace.h               |  10 ++-
 kernel/trace/trace_events.c        |   3 +-
 kernel/trace/trace_events_filter.c | 107 ++++++++++++++++++-----------
 4 files changed, 76 insertions(+), 48 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 78a9ba24cbf67..46a27f2695a68 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -101,8 +101,8 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
-	int			n_preds;
-	struct filter_pred	**preds;
+	int			filter_active;
+	void			*filter;
 	void			*mod;
 
 #ifdef CONFIG_EVENT_PROFILE
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7d55bcf50e499..1fb7d6ccadf44 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -731,12 +731,16 @@ struct ftrace_event_field {
 	int			size;
 };
 
+struct event_filter {
+	int			n_preds;
+	struct filter_pred	**preds;
+};
+
 struct event_subsystem {
 	struct list_head	list;
 	const char		*name;
 	struct dentry		*entry;
-	int			n_preds;
-	struct filter_pred	**preds;
+	void			*filter;
 };
 
 struct filter_pred;
@@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 		     struct ring_buffer *buffer,
 		     struct ring_buffer_event *event)
 {
-	if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) {
+	if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) {
 		ring_buffer_discard_commit(buffer, event);
 		return 1;
 	}
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index be4d3a437c17a..1cd1f37373ddd 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	list_add(&system->list, &event_subsystems);
 
-	system->preds = NULL;
-	system->n_preds = 0;
+	system->filter = NULL;
 
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 65418288f957f..1e861eca3d027 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
 /* return 1 if event matches, 0 otherwise (discard) */
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
+	struct event_filter *filter = call->filter;
 	int i, matched, and_failed = 0;
 	struct filter_pred *pred;
 
-	for (i = 0; i < call->n_preds; i++) {
-		pred = call->preds[i];
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
 		if (and_failed && !pred->or)
 			continue;
 		matched = pred->fn(pred, rec);
@@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void __filter_print_preds(struct filter_pred **preds, int n_preds,
+static void __filter_print_preds(struct event_filter *filter,
 				 struct trace_seq *s)
 {
-	char *field_name;
 	struct filter_pred *pred;
+	char *field_name;
 	int i;
 
-	if (!n_preds) {
+	if (!filter || !filter->n_preds) {
 		trace_seq_printf(s, "none\n");
 		return;
 	}
 
-	for (i = 0; i < n_preds; i++) {
-		pred = preds[i];
+	for (i = 0; i < filter->n_preds; i++) {
+		pred = filter->preds[i];
 		field_name = pred->field_name;
 		if (i)
 			trace_seq_printf(s, pred->or ? "|| " : "&& ");
@@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds,
 void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
 {
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(call->preds, call->n_preds, s);
+	__filter_print_preds(call->filter, s);
 	mutex_unlock(&filter_mutex);
 }
 
@@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(system->preds, system->n_preds, s);
+	__filter_print_preds(system->filter, s);
 	mutex_unlock(&filter_mutex);
 }
 
@@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest,
 
 static void __filter_disable_preds(struct ftrace_event_call *call)
 {
+	struct event_filter *filter = call->filter;
 	int i;
 
-	call->n_preds = 0;
+	call->filter_active = 0;
+	filter->n_preds = 0;
 
 	for (i = 0; i < MAX_FILTER_PRED; i++)
-		call->preds[i]->fn = filter_pred_none;
+		filter->preds[i]->fn = filter_pred_none;
 }
 
 void filter_disable_preds(struct ftrace_event_call *call)
@@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call)
 
 int init_preds(struct ftrace_event_call *call)
 {
+	struct event_filter *filter;
 	struct filter_pred *pred;
 	int i;
 
-	call->n_preds = 0;
-
-	call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
-	if (!call->preds)
+	filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+	if (!call->filter)
 		return -ENOMEM;
 
+	call->filter_active = 0;
+	filter->n_preds = 0;
+
+	filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+	if (!filter->preds)
+		goto oom;
+
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
 		pred = kzalloc(sizeof(*pred), GFP_KERNEL);
 		if (!pred)
 			goto oom;
 		pred->fn = filter_pred_none;
-		call->preds[i] = pred;
+		filter->preds[i] = pred;
 	}
 
 	return 0;
 
 oom:
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (call->preds[i])
-			filter_free_pred(call->preds[i]);
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
 	}
-	kfree(call->preds);
-	call->preds = NULL;
+	kfree(filter->preds);
+	kfree(call->filter);
+	call->filter = NULL;
 
 	return -ENOMEM;
 }
@@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds);
 
 static void __filter_free_subsystem_preds(struct event_subsystem *system)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	int i;
 
-	if (system->n_preds) {
-		for (i = 0; i < system->n_preds; i++)
-			filter_free_pred(system->preds[i]);
-		kfree(system->preds);
-		system->preds = NULL;
-		system->n_preds = 0;
+	if (filter && filter->n_preds) {
+		for (i = 0; i < filter->n_preds; i++)
+			filter_free_pred(filter->preds[i]);
+		kfree(filter->preds);
+		kfree(filter);
+		system->filter = NULL;
 	}
 
 	list_for_each_entry(call, &ftrace_events, list) {
@@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
+	struct event_filter *filter = call->filter;
 	int idx, err;
 
-	if (call->n_preds && !pred->compound)
+	if (filter->n_preds && !pred->compound)
 		__filter_disable_preds(call);
 
-	if (call->n_preds == MAX_FILTER_PRED)
+	if (filter->n_preds == MAX_FILTER_PRED)
 		return -ENOSPC;
 
-	idx = call->n_preds;
-	filter_clear_pred(call->preds[idx]);
-	err = filter_set_pred(call->preds[idx], pred, fn);
+	idx = filter->n_preds;
+	filter_clear_pred(filter->preds[idx]);
+	err = filter_set_pred(filter->preds[idx], pred, fn);
 	if (err)
 		return err;
 
-	call->n_preds++;
+	filter->n_preds++;
+	call->filter_active = 1;
 
 	return 0;
 }
@@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 int filter_add_subsystem_pred(struct event_subsystem *system,
 			      struct filter_pred *pred)
 {
+	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 
 	mutex_lock(&filter_mutex);
 
-	if (system->n_preds && !pred->compound)
+	if (filter && filter->n_preds && !pred->compound) {
 		__filter_free_subsystem_preds(system);
+		filter = NULL;
+	}
 
-	if (!system->n_preds) {
-		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+	if (!filter) {
+		system->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+		if (!system->filter) {
+			mutex_unlock(&filter_mutex);
+			return -ENOMEM;
+		}
+		filter = system->filter;
+		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
-		if (!system->preds) {
+
+		if (!filter->preds) {
+			kfree(system->filter);
+			system->filter = NULL;
 			mutex_unlock(&filter_mutex);
 			return -ENOMEM;
 		}
 	}
 
-	if (system->n_preds == MAX_FILTER_PRED) {
+	if (filter->n_preds == MAX_FILTER_PRED) {
 		mutex_unlock(&filter_mutex);
 		return -ENOSPC;
 	}
 
-	system->preds[system->n_preds] = pred;
-	system->n_preds++;
+	filter->preds[filter->n_preds] = pred;
+	filter->n_preds++;
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		int err;
@@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 
 		err = __filter_add_pred(call, pred);
 		if (err == -ENOMEM) {
-			system->preds[system->n_preds] = NULL;
-			system->n_preds--;
+			filter->preds[filter->n_preds] = NULL;
+			filter->n_preds--;
 			mutex_unlock(&filter_mutex);
 			return err;
 		}
-- 
GitLab


From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:53 -0500
Subject: [PATCH 0711/2198] tracing/filters: distinguish between signed and
 unsigned fields

The new filter comparison ops need to be able to distinguish between
signed and unsigned field types, so add an is_signed flag/param to the
event field struct/trace_define_fields().  Also define a simple macro,
is_signed_type() to determine the signedness at compile time, used in the
trace macros.  If the is_signed_type() macro won't work with a specific
type, a new slightly modified version of TRACE_FIELD() called
TRACE_FIELD_SIGN(), allows the signedness to be set explicitly.

[ Impact: extend trace-filter code for new feature ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905893.6416.120.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h     |  7 ++++---
 include/trace/ftrace.h           | 16 ++++++++--------
 kernel/trace/trace.h             |  1 +
 kernel/trace/trace_event_types.h |  4 ++--
 kernel/trace/trace_events.c      |  3 ++-
 kernel/trace/trace_export.c      | 29 ++++++++++++++++++++++-------
 6 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 46a27f2695a68..e61a7403f3d08 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -122,8 +122,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call,
 					struct ring_buffer_event *event);
 
 extern int trace_define_field(struct ftrace_event_call *call, char *type,
-			      char *name, int offset, int size);
+			      char *name, int offset, int size, int is_signed);
 
+#define is_signed_type(type)	(((type)(-1)) < 0)
 
 /*
  * The double __builtin_constant_p is because gcc will give us an error
@@ -144,10 +145,10 @@ do {									\
 		__trace_printk(ip, fmt, ##args);			\
 } while (0)
 
-#define __common_field(type, item)					\
+#define __common_field(type, item, is_signed)				\
 	ret = trace_define_field(event_call, #type, "common_" #item,	\
 				 offsetof(typeof(field.ent), item),	\
-				 sizeof(field.ent.item));		\
+				 sizeof(field.ent.item), is_signed);	\
 	if (ret)							\
 		return ret;
 
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 1e681142f1dad..edb02bc9f8fff 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -225,7 +225,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define __field(type, item)						\
 	ret = trace_define_field(event_call, #type, #item,		\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), is_signed_type(type));	\
 	if (ret)							\
 		return ret;
 
@@ -234,7 +234,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), 0);		\
 	if (ret)							\
 		return ret;
 
@@ -242,7 +242,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define __string(item, src)						       \
 	ret = trace_define_field(event_call, "__str_loc", #item,	       \
 				offsetof(typeof(field), __str_loc_##item),     \
-				sizeof(field.__str_loc_##item));
+				 sizeof(field.__str_loc_##item), 0);
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
@@ -253,11 +253,11 @@ ftrace_define_fields_##call(void)					\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	int ret;							\
 									\
-	__common_field(int, type);					\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
+	__common_field(int, type, 1);					\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
 									\
 	tstruct;							\
 									\
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1fb7d6ccadf44..866d0108fd2f4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -729,6 +729,7 @@ struct ftrace_event_field {
 	char			*type;
 	int			offset;
 	int			size;
+	int			is_signed;
 };
 
 struct event_filter {
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index cfcecc4fd86d4..5e32e375134d9 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 	TRACE_STRUCT(
-		TRACE_FIELD(ktime_t, state_data.stamp, stamp)
-		TRACE_FIELD(ktime_t, state_data.end, end)
+		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.state, state)
 	),
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1cd1f37373ddd..bbbea7479371f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex);
 LIST_HEAD(ftrace_events);
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
-		       char *name, int offset, int size)
+		       char *name, int offset, int size, int is_signed)
 {
 	struct ftrace_event_field *field;
 
@@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
 	field->offset = offset;
 	field->size = size;
+	field->is_signed = is_signed;
 	list_add(&field->link, &call->fields);
 
 	return 0;
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 0cb1a142c74fd..d06cf898dc86a 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -50,6 +50,9 @@ extern void __bad_type_size(void);
 	if (!ret)							\
 		return 0;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
 
 #undef TP_RAW_FMT
 #define TP_RAW_FMT(args...) args
@@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\
+	TRACE_FIELD(type, item, assign)
+
 #undef TP_CMD
 #define TP_CMD(cmd...)	cmd
 
@@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)					\
 	ret = trace_define_field(event_call, #type, #item,		\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), is_signed_type(type));	\
 	if (ret)							\
 		return ret;
 
@@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\
 	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
 				 offsetof(typeof(field), item),		\
-				 sizeof(field.item));			\
+				 sizeof(field.item), 0);		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item), is_signed);	\
 	if (ret)							\
 		return ret;
 
@@ -173,11 +188,11 @@ ftrace_define_fields_##call(void)					\
 	struct args field;						\
 	int ret;							\
 									\
-	__common_field(unsigned char, type);				\
-	__common_field(unsigned char, flags);				\
-	__common_field(unsigned char, preempt_count);			\
-	__common_field(int, pid);					\
-	__common_field(int, tgid);					\
+	__common_field(unsigned char, type, 0);				\
+	__common_field(unsigned char, flags, 0);			\
+	__common_field(unsigned char, preempt_count, 0);		\
+	__common_field(int, pid, 1);					\
+	__common_field(int, tgid, 1);					\
 									\
 	tstruct;							\
 									\
-- 
GitLab


From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 28 Apr 2009 03:04:59 -0500
Subject: [PATCH 0712/2198] tracing/filters: a better event parser

Replace the current event parser hack with a better one.  Filters are
no longer specified predicate by predicate, but all at once and can
use parens and any of the following operators:

numeric fields:

==, !=, <, <=, >, >=

string fields:

==, !=

predicates can be combined with the logical operators:

&&, ||

examples:

"common_preempt_count > 4" > filter

"((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter

If there was an error, the erroneous string along with an error
message can be seen by looking at the filter e.g.:

((sig >= 10 && sig < 15) || dsig == 17) && comm != bash
^
parse_error: Field not found

Currently the caret for an error always appears at the beginning of
the filter; a real position should be used, but the error message
should be useful even without it.

To clear a filter, '0' can be written to the filter file.

Filters can also be set or cleared for a complete subsystem by writing
the same filter as would be written to an individual event to the
filter file at the root of the subsytem.  Note however, that if any
event in the subsystem lacks a field specified in the filter being
set, the set will fail and all filters in the subsytem are
automatically cleared.  This change from the previous version was made
because using only the fields that happen to exist for a given event
would most likely result in a meaningless filter.

Because the logical operators are now implemented as predicates, the
maximum number of predicates in a filter was increased from 8 to 16.

[ Impact: add new, extended trace-filter implementation ]

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <1240905899.6416.121.camel@tropicana>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |    2 +-
 kernel/trace/trace.h               |   66 +-
 kernel/trace/trace_events.c        |   86 +--
 kernel/trace/trace_events_filter.c | 1020 ++++++++++++++++++++++------
 4 files changed, 884 insertions(+), 290 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index e61a7403f3d08..5fff40c9ff59c 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -112,7 +112,7 @@ struct ftrace_event_call {
 #endif
 };
 
-#define MAX_FILTER_PRED		8
+#define MAX_FILTER_PRED		32
 #define MAX_FILTER_STR_VAL	128
 
 extern int init_preds(struct ftrace_event_call *call);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 866d0108fd2f4..7736fe8c1b762 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -735,6 +735,7 @@ struct ftrace_event_field {
 struct event_filter {
 	int			n_preds;
 	struct filter_pred	**preds;
+	char			*filter_string;
 };
 
 struct event_subsystem {
@@ -746,7 +747,8 @@ struct event_subsystem {
 
 struct filter_pred;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+				 int val1, int val2);
 
 struct filter_pred {
 	filter_pred_fn_t fn;
@@ -756,23 +758,18 @@ struct filter_pred {
 	char *field_name;
 	int offset;
 	int not;
-	int or;
-	int compound;
-	int clear;
+	int op;
+	int pop_n;
 };
 
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct ftrace_event_call *call,
+extern void print_event_filter(struct ftrace_event_call *call,
 			       struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
-			   struct filter_pred *pred);
-extern void filter_disable_preds(struct ftrace_event_call *call);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern void filter_print_subsystem_preds(struct event_subsystem *system,
+extern int apply_event_filter(struct ftrace_event_call *call,
+			      char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+					char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
 					 struct trace_seq *s);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
-				     struct filter_pred *pred);
 
 static inline int
 filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
 	return 0;
 }
 
+#define DEFINE_COMPARISON_PRED(type)					\
+static int filter_pred_##type(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	type *addr = (type *)(event + pred->offset);			\
+	type val = (type)pred->val;					\
+	int match = 0;							\
+									\
+	switch (pred->op) {						\
+	case OP_LT:							\
+		match = (*addr < val);					\
+		break;							\
+	case OP_LE:							\
+		match = (*addr <= val);					\
+		break;							\
+	case OP_GT:							\
+		match = (*addr > val);					\
+		break;							\
+	case OP_GE:							\
+		match = (*addr >= val);					\
+		break;							\
+	default:							\
+		break;							\
+	}								\
+									\
+	return match;							\
+}
+
+#define DEFINE_EQUALITY_PRED(size)					\
+static int filter_pred_##size(struct filter_pred *pred, void *event,	\
+			      int val1, int val2)			\
+{									\
+	u##size *addr = (u##size *)(event + pred->offset);		\
+	u##size val = (u##size)pred->val;				\
+	int match;							\
+									\
+	match = (val == *addr) ^ pred->not;				\
+									\
+	return match;							\
+}
+
 extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bbbea7479371f..f789ca540fe1a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_preds(call, s);
+	print_event_filter(call, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
 	struct ftrace_event_call *call = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_disable_preds(call);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
+	buf[cnt] = '\0';
 
-	err = filter_add_pred(call, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_event_filter(call, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
-	}
-
-	filter_free_pred(pred);
 
 	*ppos += cnt;
 
@@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	filter_print_subsystem_preds(system, s);
+	print_subsystem_event_filter(system, s);
 	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
@@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		       loff_t *ppos)
 {
 	struct event_subsystem *system = filp->private_data;
-	char buf[64], *pbuf = buf;
-	struct filter_pred *pred;
+	char *buf;
 	int err;
 
-	if (cnt >= sizeof(buf))
+	if (cnt >= PAGE_SIZE)
 		return -EINVAL;
 
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-	buf[cnt] = '\0';
-
-	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-	if (!pred)
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return -ENOMEM;
 
-	err = filter_parse(&pbuf, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
-		return err;
-	}
-
-	if (pred->clear) {
-		filter_free_subsystem_preds(system);
-		filter_free_pred(pred);
-		return cnt;
+	if (copy_from_user(buf, ubuf, cnt)) {
+		free_page((unsigned long) buf);
+		return -EFAULT;
 	}
+	buf[cnt] = '\0';
 
-	err = filter_add_subsystem_pred(system, pred);
-	if (err < 0) {
-		filter_free_pred(pred);
+	err = apply_subsystem_event_filter(system, buf);
+	free_page((unsigned long) buf);
+	if (err < 0)
 		return err;
-	}
 
 	*ppos += cnt;
 
@@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->filter = NULL;
 
+	system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+	if (!system->filter) {
+		pr_warning("Could not allocate filter for subsystem "
+			   "'%s'\n", name);
+		return system->entry;
+	}
+
 	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
-	if (!entry)
+	if (!entry) {
+		kfree(system->filter);
+		system->filter = NULL;
 		pr_warning("Could not create debugfs "
 			   "'%s/filter' entry\n", name);
+	}
 
 	return system->entry;
 }
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1e861eca3d027..f49486687ee2a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -29,51 +29,130 @@
 
 static DEFINE_MUTEX(filter_mutex);
 
-static int filter_pred_64(struct filter_pred *pred, void *event)
+enum filter_op_ids
 {
-	u64 *addr = (u64 *)(event + pred->offset);
-	u64 val = (u64)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
-}
-
-static int filter_pred_32(struct filter_pred *pred, void *event)
-{
-	u32 *addr = (u32 *)(event + pred->offset);
-	u32 val = (u32)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
-}
-
-static int filter_pred_16(struct filter_pred *pred, void *event)
+	OP_OR,
+	OP_AND,
+	OP_NE,
+	OP_EQ,
+	OP_LT,
+	OP_LE,
+	OP_GT,
+	OP_GE,
+	OP_NONE,
+	OP_OPEN_PAREN,
+};
+
+struct filter_op {
+	int id;
+	char *string;
+	int precedence;
+};
+
+static struct filter_op filter_ops[] = {
+	{ OP_OR, "||", 1 },
+	{ OP_AND, "&&", 2 },
+	{ OP_NE, "!=", 4 },
+	{ OP_EQ, "==", 4 },
+	{ OP_LT, "<", 5 },
+	{ OP_LE, "<=", 5 },
+	{ OP_GT, ">", 5 },
+	{ OP_GE, ">=", 5 },
+	{ OP_NONE, "OP_NONE", 0 },
+	{ OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+	FILT_ERR_NONE,
+	FILT_ERR_INVALID_OP,
+	FILT_ERR_UNBALANCED_PAREN,
+	FILT_ERR_TOO_MANY_OPERANDS,
+	FILT_ERR_OPERAND_TOO_LONG,
+	FILT_ERR_FIELD_NOT_FOUND,
+	FILT_ERR_ILLEGAL_FIELD_OP,
+	FILT_ERR_ILLEGAL_INTVAL,
+	FILT_ERR_BAD_SUBSYS_FILTER,
+	FILT_ERR_TOO_MANY_PREDS,
+	FILT_ERR_MISSING_FIELD,
+	FILT_ERR_INVALID_FILTER,
+};
+
+static char *err_text[] = {
+	"No error",
+	"Invalid operator",
+	"Unbalanced parens",
+	"Too many operands",
+	"Operand too long",
+	"Field not found",
+	"Illegal operation for field type",
+	"Illegal integer value",
+	"Couldn't find or set field in one of a subsystem's events",
+	"Too many terms in predicate expression",
+	"Missing field name and/or value",
+	"Meaningless filter expression",
+};
+
+struct opstack_op {
+	int op;
+	struct list_head list;
+};
+
+struct postfix_elt {
+	int op;
+	char *operand;
+	struct list_head list;
+};
+
+struct filter_parse_state {
+	struct filter_op *ops;
+	struct list_head opstack;
+	struct list_head postfix;
+	int lasterr;
+	int lasterr_pos;
+
+	struct {
+		char *string;
+		unsigned int cnt;
+		unsigned int tail;
+	} infix;
+
+	struct {
+		char string[MAX_FILTER_STR_VAL];
+		int pos;
+		unsigned int tail;
+	} operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+			   void *event __attribute((unused)),
+			   int val1, int val2)
 {
-	u16 *addr = (u16 *)(event + pred->offset);
-	u16 val = (u16)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
+	return val1 && val2;
 }
 
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+			  void *event __attribute((unused)),
+			  int val1, int val2)
 {
-	u8 *addr = (u8 *)(event + pred->offset);
-	u8 val = (u8)pred->val;
-	int match;
-
-	match = (val == *addr) ^ pred->not;
-
-	return match;
+	return val1 || val2;
 }
 
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int filter_pred_string(struct filter_pred *pred, void *event,
+			      int val1, int val2)
 {
 	char *addr = (char *)(event + pred->offset);
 	int cmp, match;
@@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event)
 	return match;
 }
 
-static int filter_pred_none(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+			    int val1, int val2)
 {
 	return 0;
 }
@@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event)
 int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
 	struct event_filter *filter = call->filter;
-	int i, matched, and_failed = 0;
+	int match, top = 0, val1 = 0, val2 = 0;
+	int stack[MAX_FILTER_PRED];
 	struct filter_pred *pred;
+	int i;
 
 	for (i = 0; i < filter->n_preds; i++) {
 		pred = filter->preds[i];
-		if (and_failed && !pred->or)
+		if (!pred->pop_n) {
+			match = pred->fn(pred, rec, val1, val2);
+			stack[top++] = match;
 			continue;
-		matched = pred->fn(pred, rec);
-		if (!matched && !pred->or) {
-			and_failed = 1;
-			continue;
-		} else if (matched && pred->or)
-			return 1;
+		}
+		if (pred->pop_n > top) {
+			WARN_ON_ONCE(1);
+			return 0;
+		}
+		val1 = stack[--top];
+		val2 = stack[--top];
+		match = pred->fn(pred, rec, val1, val2);
+		stack[top++] = match;
 	}
 
-	if (and_failed)
-		return 0;
-
-	return 1;
+	return stack[--top];
 }
 EXPORT_SYMBOL_GPL(filter_match_preds);
 
-static void __filter_print_preds(struct event_filter *filter,
-				 struct trace_seq *s)
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
 {
-	struct filter_pred *pred;
-	char *field_name;
-	int i;
+	ps->lasterr = err;
+	ps->lasterr_pos = pos;
+}
 
-	if (!filter || !filter->n_preds) {
-		trace_seq_printf(s, "none\n");
+static void remove_filter_string(struct event_filter *filter)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = NULL;
+}
+
+static int replace_filter_string(struct event_filter *filter,
+				 char *filter_string)
+{
+	kfree(filter->filter_string);
+	filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+	if (!filter->filter_string)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static int append_filter_string(struct event_filter *filter,
+				char *string)
+{
+	int newlen;
+	char *new_filter_string;
+
+	BUG_ON(!filter->filter_string);
+	newlen = strlen(filter->filter_string) + strlen(string) + 1;
+	new_filter_string = kmalloc(newlen, GFP_KERNEL);
+	if (!new_filter_string)
+		return -ENOMEM;
+
+	strcpy(new_filter_string, filter->filter_string);
+	strcat(new_filter_string, string);
+	kfree(filter->filter_string);
+	filter->filter_string = new_filter_string;
+
+	return 0;
+}
+
+static void append_filter_err(struct filter_parse_state *ps,
+			      struct event_filter *filter)
+{
+	int pos = ps->lasterr_pos;
+	char *buf, *pbuf;
+
+	buf = (char *)__get_free_page(GFP_TEMPORARY);
+	if (!buf)
 		return;
-	}
 
-	for (i = 0; i < filter->n_preds; i++) {
-		pred = filter->preds[i];
-		field_name = pred->field_name;
-		if (i)
-			trace_seq_printf(s, pred->or ? "|| " : "&& ");
-		trace_seq_printf(s, "%s ", field_name);
-		trace_seq_printf(s, pred->not ? "!= " : "== ");
-		if (pred->str_len)
-			trace_seq_printf(s, "%s\n", pred->str_val);
-		else
-			trace_seq_printf(s, "%llu\n", pred->val);
-	}
+	append_filter_string(filter, "\n");
+	memset(buf, ' ', PAGE_SIZE);
+	if (pos > PAGE_SIZE - 128)
+		pos = 0;
+	buf[pos] = '^';
+	pbuf = &buf[pos] + 1;
+
+	sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+	append_filter_string(filter, buf);
+	free_page((unsigned long) buf);
 }
 
-void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
+	struct event_filter *filter = call->filter;
+
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(call->filter, s);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
 	mutex_unlock(&filter_mutex);
 }
 
-void filter_print_subsystem_preds(struct event_subsystem *system,
+void print_subsystem_event_filter(struct event_subsystem *system,
 				  struct trace_seq *s)
 {
+	struct event_filter *filter = system->filter;
+
 	mutex_lock(&filter_mutex);
-	__filter_print_preds(system->filter, s);
+	if (filter->filter_string)
+		trace_seq_printf(s, "%s\n", filter->filter_string);
+	else
+		trace_seq_printf(s, "none\n");
 	mutex_unlock(&filter_mutex);
 }
 
@@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name)
 	return NULL;
 }
 
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
 {
 	if (!pred)
 		return;
@@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest,
 			   filter_pred_fn_t fn)
 {
 	*dest = *src;
-	dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
-	if (!dest->field_name)
-		return -ENOMEM;
+	if (src->field_name) {
+		dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+		if (!dest->field_name)
+			return -ENOMEM;
+	}
 	dest->fn = fn;
 
 	return 0;
 }
 
-static void __filter_disable_preds(struct ftrace_event_call *call)
+static void filter_disable_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter = call->filter;
 	int i;
@@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
-void filter_disable_preds(struct ftrace_event_call *call)
-{
-	mutex_lock(&filter_mutex);
-	__filter_disable_preds(call);
-	mutex_unlock(&filter_mutex);
-}
-
 int init_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter;
@@ -258,48 +386,43 @@ int init_preds(struct ftrace_event_call *call)
 }
 EXPORT_SYMBOL_GPL(init_preds);
 
-static void __filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 	int i;
 
-	if (filter && filter->n_preds) {
+	if (filter->n_preds) {
 		for (i = 0; i < filter->n_preds; i++)
 			filter_free_pred(filter->preds[i]);
 		kfree(filter->preds);
-		kfree(filter);
-		system->filter = NULL;
+		filter->preds = NULL;
+		filter->n_preds = 0;
 	}
 
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
 
-		if (!strcmp(call->system, system->name))
-			__filter_disable_preds(call);
+		if (!strcmp(call->system, system->name)) {
+			filter_disable_preds(call);
+			remove_filter_string(call->filter);
+		}
 	}
 }
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
-{
-	mutex_lock(&filter_mutex);
-	__filter_free_subsystem_preds(system);
-	mutex_unlock(&filter_mutex);
-}
-
-static int filter_add_pred_fn(struct ftrace_event_call *call,
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+			      struct ftrace_event_call *call,
 			      struct filter_pred *pred,
 			      filter_pred_fn_t fn)
 {
 	struct event_filter *filter = call->filter;
 	int idx, err;
 
-	if (filter->n_preds && !pred->compound)
-		__filter_disable_preds(call);
-
-	if (filter->n_preds == MAX_FILTER_PRED)
+	if (filter->n_preds == MAX_FILTER_PRED) {
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
+	}
 
 	idx = filter->n_preds;
 	filter_clear_pred(filter->preds[idx]);
@@ -321,94 +444,132 @@ static int is_string_field(const char *type)
 	return 0;
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-			     struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
+{
+	if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+		return 0;
+
+	return 1;
+}
+
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+					     int field_is_signed)
+{
+	filter_pred_fn_t fn = NULL;
+
+	switch (field_size) {
+	case 8:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_64;
+		else if (field_is_signed)
+			fn = filter_pred_s64;
+		else
+			fn = filter_pred_u64;
+		break;
+	case 4:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_32;
+		else if (field_is_signed)
+			fn = filter_pred_s32;
+		else
+			fn = filter_pred_u32;
+		break;
+	case 2:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_16;
+		else if (field_is_signed)
+			fn = filter_pred_s16;
+		else
+			fn = filter_pred_u16;
+		break;
+	case 1:
+		if (op == OP_EQ || op == OP_NE)
+			fn = filter_pred_8;
+		else if (field_is_signed)
+			fn = filter_pred_s8;
+		else
+			fn = filter_pred_u8;
+		break;
+	}
+
+	return fn;
+}
+
+static int filter_add_pred(struct filter_parse_state *ps,
+			   struct ftrace_event_call *call,
+			   struct filter_pred *pred)
 {
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
 	unsigned long long val;
 
+	pred->fn = filter_pred_none;
+
+	if (pred->op == OP_AND) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+	} else if (pred->op == OP_OR) {
+		pred->pop_n = 2;
+		return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+	}
+
 	field = find_event_field(call, pred->field_name);
-	if (!field)
+	if (!field) {
+		parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
 		return -EINVAL;
+	}
 
-	pred->fn = filter_pred_none;
 	pred->offset = field->offset;
 
+	if (!is_legal_op(field, pred->op)) {
+		parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+		return -EINVAL;
+	}
+
 	if (is_string_field(field->type)) {
 		fn = filter_pred_string;
 		pred->str_len = field->size;
-		return filter_add_pred_fn(call, pred, fn);
+		if (pred->op == OP_NE)
+			pred->not = 1;
+		return filter_add_pred_fn(ps, call, pred, fn);
 	} else {
-		if (strict_strtoull(pred->str_val, 0, &val))
+		if (strict_strtoull(pred->str_val, 0, &val)) {
+			parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
 			return -EINVAL;
+		}
 		pred->val = val;
 	}
 
-	switch (field->size) {
-	case 8:
-		fn = filter_pred_64;
-		break;
-	case 4:
-		fn = filter_pred_32;
-		break;
-	case 2:
-		fn = filter_pred_16;
-		break;
-	case 1:
-		fn = filter_pred_8;
-		break;
-	default:
+	fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+	if (!fn) {
+		parse_error(ps, FILT_ERR_INVALID_OP, 0);
 		return -EINVAL;
 	}
 
-	return filter_add_pred_fn(call, pred, fn);
-}
-
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
-{
-	int err;
-
-	mutex_lock(&filter_mutex);
-	err = __filter_add_pred(call, pred);
-	mutex_unlock(&filter_mutex);
+	if (pred->op == OP_NE)
+		pred->not = 1;
 
-	return err;
+	return filter_add_pred_fn(ps, call, pred, fn);
 }
 
-int filter_add_subsystem_pred(struct event_subsystem *system,
-			      struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+				     struct event_subsystem *system,
+				     struct filter_pred *pred,
+				     char *filter_string)
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
 
-	mutex_lock(&filter_mutex);
-
-	if (filter && filter->n_preds && !pred->compound) {
-		__filter_free_subsystem_preds(system);
-		filter = NULL;
-	}
-
-	if (!filter) {
-		system->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
-		if (!system->filter) {
-			mutex_unlock(&filter_mutex);
-			return -ENOMEM;
-		}
-		filter = system->filter;
+	if (!filter->preds) {
 		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
 					GFP_KERNEL);
 
-		if (!filter->preds) {
-			kfree(system->filter);
-			system->filter = NULL;
-			mutex_unlock(&filter_mutex);
+		if (!filter->preds)
 			return -ENOMEM;
-		}
 	}
 
 	if (filter->n_preds == MAX_FILTER_PRED) {
-		mutex_unlock(&filter_mutex);
+		parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
 		return -ENOSPC;
 	}
 
@@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 		if (strcmp(call->system, system->name))
 			continue;
 
-		err = __filter_add_pred(call, pred);
-		if (err == -ENOMEM) {
-			filter->preds[filter->n_preds] = NULL;
-			filter->n_preds--;
-			mutex_unlock(&filter_mutex);
+		err = filter_add_pred(ps, call, pred);
+		if (err) {
+			filter_free_subsystem_preds(system);
+			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
 			return err;
 		}
+		replace_filter_string(call->filter, filter_string);
 	}
 
-	mutex_unlock(&filter_mutex);
+	return 0;
+}
+
+static void parse_init(struct filter_parse_state *ps,
+		       struct filter_op *ops,
+		       char *infix_string)
+{
+	memset(ps, '\0', sizeof(*ps));
+
+	ps->infix.string = infix_string;
+	ps->infix.cnt = strlen(infix_string);
+	ps->ops = ops;
+
+	INIT_LIST_HEAD(&ps->opstack);
+	INIT_LIST_HEAD(&ps->postfix);
+}
+
+static char infix_next(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+
+	return ps->infix.string[ps->infix.tail++];
+}
+
+static char infix_peek(struct filter_parse_state *ps)
+{
+	if (ps->infix.tail == strlen(ps->infix.string))
+		return 0;
+
+	return ps->infix.string[ps->infix.tail];
+}
+
+static void infix_advance(struct filter_parse_state *ps)
+{
+	ps->infix.cnt--;
+	ps->infix.tail++;
+}
+
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+				      int a, int b)
+{
+	return ps->ops[a].precedence < ps->ops[b].precedence;
+}
+
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+	int i;
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (ps->ops[i].string[0] == c)
+			return 1;
+	}
 
 	return 0;
 }
 
-/*
- * The filter format can be
- *   - 0, which means remove all filter preds
- *   - [||/&&] <field> ==/!= <val>
- */
-int filter_parse(char **pbuf, struct filter_pred *pred)
-{
-	char *tok, *val_str = NULL;
-	int tok_n = 0;
-
-	while ((tok = strsep(pbuf, " \n"))) {
-		if (tok_n == 0) {
-			if (!strcmp(tok, "0")) {
-				pred->clear = 1;
-				return 0;
-			} else if (!strcmp(tok, "&&")) {
-				pred->or = 0;
-				pred->compound = 1;
-			} else if (!strcmp(tok, "||")) {
-				pred->or = 1;
-				pred->compound = 1;
-			} else
-				pred->field_name = tok;
-			tok_n = 1;
-			continue;
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+	char nextc = infix_peek(ps);
+	char opstr[3];
+	int i;
+
+	opstr[0] = firstc;
+	opstr[1] = nextc;
+	opstr[2] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string)) {
+			infix_advance(ps);
+			return ps->ops[i].id;
 		}
-		if (tok_n == 1) {
-			if (!pred->field_name)
-				pred->field_name = tok;
-			else if (!strcmp(tok, "!="))
-				pred->not = 1;
-			else if (!strcmp(tok, "=="))
-				pred->not = 0;
-			else {
-				pred->field_name = NULL;
+	}
+
+	opstr[1] = '\0';
+
+	for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+		if (!strcmp(opstr, ps->ops[i].string))
+			return ps->ops[i].id;
+	}
+
+	return OP_NONE;
+}
+
+static inline void clear_operand_string(struct filter_parse_state *ps)
+{
+	memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+	ps->operand.tail = 0;
+}
+
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+	if (ps->operand.tail == MAX_FILTER_STR_VAL)
+		return -EINVAL;
+
+	ps->operand.string[ps->operand.tail++] = c;
+
+	return 0;
+}
+
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+	struct opstack_op *opstack_op;
+
+	opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL);
+	if (!opstack_op)
+		return -ENOMEM;
+
+	opstack_op->op = op;
+	list_add(&opstack_op->list, &ps->opstack);
+
+	return 0;
+}
+
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+	return list_empty(&ps->opstack);
+}
+
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+
+	return opstack_op->op;
+}
+
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+	struct opstack_op *opstack_op;
+	int op;
+
+	if (filter_opstack_empty(ps))
+		return OP_NONE;
+
+	opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list);
+	op = opstack_op->op;
+	list_del(&opstack_op->list);
+
+	kfree(opstack_op);
+
+	return op;
+}
+
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+	while (!filter_opstack_empty(ps))
+		filter_opstack_pop(ps);
+}
+
+static char *curr_operand(struct filter_parse_state *ps)
+{
+	return ps->operand.string;
+}
+
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = OP_NONE;
+	elt->operand = kstrdup(operand, GFP_KERNEL);
+	if (!elt->operand) {
+		kfree(elt);
+		return -ENOMEM;
+	}
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+	struct postfix_elt *elt;
+
+	elt = kmalloc(sizeof(*elt), GFP_KERNEL);
+	if (!elt)
+		return -ENOMEM;
+
+	elt->op = op;
+	elt->operand = NULL;
+
+	list_add_tail(&elt->list, &ps->postfix);
+
+	return 0;
+}
+
+static void postfix_clear(struct filter_parse_state *ps)
+{
+	struct postfix_elt *elt;
+
+	while (!list_empty(&ps->postfix)) {
+		elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+		kfree(elt->operand);
+		list_del(&elt->list);
+	}
+}
+
+static int filter_parse(struct filter_parse_state *ps)
+{
+	int op, top_op;
+	char ch;
+
+	while ((ch = infix_next(ps))) {
+		if (isspace(ch))
+			continue;
+
+		if (is_op_char(ps, ch)) {
+			op = infix_get_op(ps, ch);
+			if (op == OP_NONE) {
+				parse_error(ps, FILT_ERR_INVALID_OP, 0);
 				return -EINVAL;
 			}
-			tok_n = 2;
+
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			while (!filter_opstack_empty(ps)) {
+				top_op = filter_opstack_top(ps);
+				if (!is_precedence_lower(ps, top_op, op)) {
+					top_op = filter_opstack_pop(ps);
+					postfix_append_op(ps, top_op);
+					continue;
+				}
+				break;
+			}
+
+			filter_opstack_push(ps, op);
 			continue;
 		}
-		if (tok_n == 2) {
-			if (pred->compound) {
-				if (!strcmp(tok, "!="))
-					pred->not = 1;
-				else if (!strcmp(tok, "=="))
-					pred->not = 0;
-				else {
-					pred->field_name = NULL;
-					return -EINVAL;
-				}
-			} else {
-				val_str = tok;
-				break; /* done */
+
+		if (ch == '(') {
+			filter_opstack_push(ps, OP_OPEN_PAREN);
+			continue;
+		}
+
+		if (ch == ')') {
+			if (strlen(curr_operand(ps))) {
+				postfix_append_operand(ps, curr_operand(ps));
+				clear_operand_string(ps);
+			}
+
+			top_op = filter_opstack_pop(ps);
+			while (top_op != OP_NONE) {
+				if (top_op == OP_OPEN_PAREN)
+					break;
+				postfix_append_op(ps, top_op);
+				top_op = filter_opstack_pop(ps);
+			}
+			if (top_op == OP_NONE) {
+				parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+				return -EINVAL;
 			}
-			tok_n = 3;
 			continue;
 		}
-		if (tok_n == 3) {
-			val_str = tok;
-			break; /* done */
+		if (append_operand_char(ps, ch)) {
+			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+			return -EINVAL;
+		}
+	}
+
+	if (strlen(curr_operand(ps)))
+		postfix_append_operand(ps, curr_operand(ps));
+
+	while (!filter_opstack_empty(ps)) {
+		top_op = filter_opstack_pop(ps);
+		if (top_op == OP_NONE)
+			break;
+		if (top_op == OP_OPEN_PAREN) {
+			parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+			return -EINVAL;
+		}
+		postfix_append_op(ps, top_op);
+	}
+
+	return 0;
+}
+
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->field_name = kstrdup(operand1, GFP_KERNEL);
+	if (!pred->field_name) {
+		kfree(pred);
+		return NULL;
+	}
+
+	strcpy(pred->str_val, operand2);
+	pred->str_len = strlen(operand2);
+
+	pred->op = op;
+
+	return pred;
+}
+
+static struct filter_pred *create_logical_pred(int op)
+{
+	struct filter_pred *pred;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return NULL;
+
+	pred->op = op;
+
+	return pred;
+}
+
+static int check_preds(struct filter_parse_state *ps)
+{
+	int n_normal_preds = 0, n_logical_preds = 0;
+	struct postfix_elt *elt;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE)
+			continue;
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			n_logical_preds++;
+			continue;
 		}
+		n_normal_preds++;
 	}
 
-	if (!val_str || !strlen(val_str)
-	    || strlen(val_str) >= MAX_FILTER_STR_VAL) {
-		pred->field_name = NULL;
+	if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+		parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
 		return -EINVAL;
 	}
 
-	strcpy(pred->str_val, val_str);
-	pred->str_len = strlen(val_str);
+	return 0;
+}
 
-	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-	if (!pred->field_name)
-		return -ENOMEM;
+static int replace_preds(struct event_subsystem *system,
+			 struct ftrace_event_call *call,
+			 struct filter_parse_state *ps,
+			 char *filter_string)
+{
+	char *operand1 = NULL, *operand2 = NULL;
+	struct filter_pred *pred;
+	struct postfix_elt *elt;
+	int err;
+
+	err = check_preds(ps);
+	if (err)
+		return err;
+
+	list_for_each_entry(elt, &ps->postfix, list) {
+		if (elt->op == OP_NONE) {
+			if (!operand1)
+				operand1 = elt->operand;
+			else if (!operand2)
+				operand2 = elt->operand;
+			else {
+				parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+				return -EINVAL;
+			}
+			continue;
+		}
+
+		if (elt->op == OP_AND || elt->op == OP_OR) {
+			pred = create_logical_pred(elt->op);
+			if (call) {
+				err = filter_add_pred(ps, call, pred);
+				filter_free_pred(pred);
+			} else
+				err = filter_add_subsystem_pred(ps, system,
+							pred, filter_string);
+			if (err)
+				return err;
+
+			operand1 = operand2 = NULL;
+			continue;
+		}
+
+		if (!operand1 || !operand2) {
+			parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+			return -EINVAL;
+		}
+
+		pred = create_pred(elt->op, operand1, operand2);
+		if (call) {
+			err = filter_add_pred(ps, call, pred);
+			filter_free_pred(pred);
+		} else
+			err = filter_add_subsystem_pred(ps, system, pred,
+							filter_string);
+		if (err)
+			return err;
+
+		operand1 = operand2 = NULL;
+	}
 
 	return 0;
 }
 
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_disable_preds(call);
+		remove_filter_string(call->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	filter_disable_preds(call);
+	replace_filter_string(call->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, call->filter);
+		goto out;
+	}
+
+	err = replace_preds(NULL, call, ps, filter_string);
+	if (err)
+		append_filter_err(ps, call->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
+
+int apply_subsystem_event_filter(struct event_subsystem *system,
+				 char *filter_string)
+{
+	int err;
+
+	struct filter_parse_state *ps;
+
+	mutex_lock(&filter_mutex);
+
+	if (!strcmp(strstrip(filter_string), "0")) {
+		filter_free_subsystem_preds(system);
+		remove_filter_string(system->filter);
+		mutex_unlock(&filter_mutex);
+		return 0;
+	}
+
+	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
+	if (!ps)
+		return -ENOMEM;
+
+	filter_free_subsystem_preds(system);
+	replace_filter_string(system->filter, filter_string);
+
+	parse_init(ps, filter_ops, filter_string);
+	err = filter_parse(ps);
+	if (err) {
+		append_filter_err(ps, system->filter);
+		goto out;
+	}
+
+	err = replace_preds(system, NULL, ps, filter_string);
+	if (err)
+		append_filter_err(ps, system->filter);
+
+out:
+	filter_opstack_clear(ps);
+	postfix_clear(ps);
+	kfree(ps);
+
+	mutex_unlock(&filter_mutex);
+
+	return err;
+}
 
-- 
GitLab


From a0e39ed378fb6ba916522764cd508fa7d42ad495 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 29 Apr 2009 13:51:39 +0200
Subject: [PATCH 0713/2198] tracing: fix build failure on s390

"tracing: create automated trace defines" causes this compile error on s390,
as reported by Sachin Sant against linux-next:

 kernel/built-in.o: In function `__do_softirq':
 (.text+0x1c680): undefined reference to `__tracepoint_softirq_entry'

This happens because the definitions of the softirq tracepoints were moved
from kernel/softirq.c to kernel/irq/handle.c. Since s390 doesn't support
generic hardirqs handle.c doesn't get compiled and the definitions are
missing.

So move the tracepoints to softirq.c again.

[ Impact: fix build failure on s390 ]

Reported-by: Sachin Sant <sachinp@in.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: fweisbec@gmail.com
LKML-Reference: <20090429135139.5fac79b8@osiris.boeblingen.de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 2 --
 kernel/softirq.c    | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 37c63633e78b4..e68bb5aebe02e 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -18,8 +18,6 @@
 #include <linux/rculist.h>
 #include <linux/hash.h>
 #include <linux/bootmem.h>
-
-#define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
 #include "internals.h"
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7ab9dfd8d0820..d4ba347a872d9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,8 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
+
+#define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
 #include <asm/irq.h>
-- 
GitLab


From 50fa610a3b6ba7cf91d7a92229177dfaff2b81a1 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 28 Apr 2009 15:01:38 +0100
Subject: [PATCH 0714/2198] sched: Document memory barriers implied by
 sleep/wake-up primitives

Add a section to the memory barriers document to note the implied
memory barriers of sleep primitives (set_current_state() and wrappers)
and wake-up primitives (wake_up() and co.).

Also extend the in-code comments on the wake_up() functions to note
these implied barriers.

[ Impact: add documentation ]

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090428140138.1192.94723.stgit@warthog.procyon.org.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/memory-barriers.txt | 129 +++++++++++++++++++++++++++++-
 kernel/sched.c                    |  23 ++++++
 2 files changed, 151 insertions(+), 1 deletion(-)

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index f5b7127f54acb..7f5809eddee62 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -31,6 +31,7 @@ Contents:
 
      - Locking functions.
      - Interrupt disabling functions.
+     - Sleep and wake-up functions.
      - Miscellaneous functions.
 
  (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
 other means.
 
 
+SLEEP AND WAKE-UP FUNCTIONS
+---------------------------
+
+Sleeping and waking on an event flagged in global data can be viewed as an
+interaction between two pieces of data: the task state of the task waiting for
+the event and the global data used to indicate the event.  To make sure that
+these appear to happen in the right order, the primitives to begin the process
+of going to sleep, and the primitives to initiate a wake up imply certain
+barriers.
+
+Firstly, the sleeper normally follows something like this sequence of events:
+
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (event_indicated)
+			break;
+		schedule();
+	}
+
+A general memory barrier is interpolated automatically by set_current_state()
+after it has altered the task state:
+
+	CPU 1
+	===============================
+	set_current_state();
+	  set_mb();
+	    STORE current->state
+	    <general barrier>
+	LOAD event_indicated
+
+set_current_state() may be wrapped by:
+
+	prepare_to_wait();
+	prepare_to_wait_exclusive();
+
+which therefore also imply a general memory barrier after setting the state.
+The whole sequence above is available in various canned forms, all of which
+interpolate the memory barrier in the right place:
+
+	wait_event();
+	wait_event_interruptible();
+	wait_event_interruptible_exclusive();
+	wait_event_interruptible_timeout();
+	wait_event_killable();
+	wait_event_timeout();
+	wait_on_bit();
+	wait_on_bit_lock();
+
+
+Secondly, code that performs a wake up normally follows something like this:
+
+	event_indicated = 1;
+	wake_up(&event_wait_queue);
+
+or:
+
+	event_indicated = 1;
+	wake_up_process(event_daemon);
+
+A write memory barrier is implied by wake_up() and co. if and only if they wake
+something up.  The barrier occurs before the task state is cleared, and so sits
+between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+
+	CPU 1				CPU 2
+	===============================	===============================
+	set_current_state();		STORE event_indicated
+	  set_mb();			wake_up();
+	    STORE current->state	  <write barrier>
+	    <general barrier>		  STORE current->state
+	LOAD event_indicated
+
+The available waker functions include:
+
+	complete();
+	wake_up();
+	wake_up_all();
+	wake_up_bit();
+	wake_up_interruptible();
+	wake_up_interruptible_all();
+	wake_up_interruptible_nr();
+	wake_up_interruptible_poll();
+	wake_up_interruptible_sync();
+	wake_up_interruptible_sync_poll();
+	wake_up_locked();
+	wake_up_locked_poll();
+	wake_up_nr();
+	wake_up_poll();
+	wake_up_process();
+
+
+[!] Note that the memory barriers implied by the sleeper and the waker do _not_
+order multiple stores before the wake-up with respect to loads of those stored
+values after the sleeper has called set_current_state().  For instance, if the
+sleeper does:
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (event_indicated)
+		break;
+	__set_current_state(TASK_RUNNING);
+	do_something(my_data);
+
+and the waker does:
+
+	my_data = value;
+	event_indicated = 1;
+	wake_up(&event_wait_queue);
+
+there's no guarantee that the change to event_indicated will be perceived by
+the sleeper as coming after the change to my_data.  In such a circumstance, the
+code on both sides must interpolate its own memory barriers between the
+separate data accesses.  Thus the above sleeper ought to do:
+
+	set_current_state(TASK_INTERRUPTIBLE);
+	if (event_indicated) {
+		smp_rmb();
+		do_something(my_data);
+	}
+
+and the waker should do:
+
+	my_data = value;
+	smp_wmb();
+	event_indicated = 1;
+	wake_up(&event_wait_queue);
+
+
 MISCELLANEOUS FUNCTIONS
 -----------------------
 
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
 
 Under normal operation, memory operation reordering is generally not going to
 be a problem as a single-threaded linear piece of code will still appear to
-work correctly, even if it's in an SMP kernel.  There are, however, three
+work correctly, even if it's in an SMP kernel.  There are, however, four
 circumstances in which reordering definitely _could_ be a problem:
 
  (*) Interprocessor interaction.
diff --git a/kernel/sched.c b/kernel/sched.c
index b902e587a3a03..fd0c2cee3f35f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2458,6 +2458,17 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
 	return try_to_wake_up(p, TASK_ALL, 0);
@@ -5241,6 +5252,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5279,6 +5293,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, void *key)
@@ -5315,6 +5332,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5332,6 +5352,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
-- 
GitLab


From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:46:58 +0200
Subject: [PATCH 0715/2198] perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON
 flag for AMD cpus

X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not
work on AMD CPUs. The flag is now only used in Intel specific code
(especially initialization).

[ Impact: refactor code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/amd.c          | 4 ----
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index fd69c514ca2ac..7e4a459daa644 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
 	if (c->x86 >= 6)
 		set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
 
-	/* Enable Performance counter for K7 and later */
-	if (c->x86 > 6 && c->x86 <= 0x11)
-		set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
-
 	if (!c->x86_model_id[0]) {
 		switch (c->x86) {
 		case 0xf:
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0fcbaab83f9bf..7d0f81dcb5247 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	unsigned int unused;
 	unsigned int ebx;
 
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+		return NULL;
+
 	/*
 	 * Check whether the Architectural PerfMon supports
 	 * Branch Misses Retired Event or not.
@@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 void __init init_hw_perf_counters(void)
 {
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		pmc_ops = pmc_intel_init();
-- 
GitLab


From 829b42dd395c5801f6ae87da87ecbdcfd5ef1a6c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:46:59 +0200
Subject: [PATCH 0716/2198] perf_counter, x86: declare perf_max_counters only
 for CONFIG_PERF_COUNTERS

This is only needed for CONFIG_PERF_COUNTERS enabled.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241002046-8832-3-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 981432885301d..be10b3ffe3208 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -512,12 +512,13 @@ struct perf_cpu_context {
 	int			recursion[4];
 };
 
+#ifdef CONFIG_PERF_COUNTERS
+
 /*
  * Set by architecture code:
  */
 extern int perf_max_counters;
 
-#ifdef CONFIG_PERF_COUNTERS
 extern const struct hw_perf_counter_ops *
 hw_perf_counter_init(struct perf_counter *counter);
 
-- 
GitLab


From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:00 +0200
Subject: [PATCH 0717/2198] perf_counter, x86: add default path to cpu
 detection

This quits hw counter initialization immediately if no cpu is
detected.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7d0f81dcb5247..d6d6529349ddd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void)
 	case X86_VENDOR_AMD:
 		pmc_ops = pmc_amd_init();
 		break;
+	default:
+		return;
 	}
 	if (!pmc_ops)
 		return;
-- 
GitLab


From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:01 +0200
Subject: [PATCH 0718/2198] perf_counter, x86: rework
 pmc_amd_save_disable_all() and pmc_amd_restore_all()

MSR reads and writes are expensive. This patch adds checks to avoid
its usage where possible.

[ Impact: micro-optimization on AMD CPUs ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++----------
 1 file changed, 14 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d6d6529349ddd..75a090394b6d2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void)
 	for (idx = 0; idx < nr_counters_generic; idx++) {
 		u64 val;
 
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) {
-			val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
-			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		}
+		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
+			continue;
+		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 
 	return enabled;
@@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl)
 		return;
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		if (test_bit(idx, cpuc->active_mask)) {
-			u64 val;
+		u64 val;
 
-			rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-			val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-			wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		}
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
+		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
+			continue;
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 }
 
-- 
GitLab


From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:02 +0200
Subject: [PATCH 0719/2198] perf_counter, x86: protect per-cpu variables with
 compile barriers only

Per-cpu variables needn't to be protected with cpu barriers
(smp_wmb()). Protection is only needed for preemption on the same cpu
(rescheduling or the nmi handler). This can be done using a compiler
barrier only.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 75a090394b6d2..ad663d5ad2d91 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -673,7 +673,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 	/*
 	 * Make it visible before enabling the hw:
 	 */
-	smp_wmb();
+	barrier();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
 	__pmc_generic_enable(counter, hwc, idx);
@@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter)
 	 * Make sure the cleared pointer becomes visible before we
 	 * (potentially) free the counter:
 	 */
-	smp_wmb();
+	barrier();
 
 	/*
 	 * Drain the remaining delta count out of a counter
-- 
GitLab


From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:03 +0200
Subject: [PATCH 0720/2198] perfcounters: rename struct hw_perf_counter_ops
 into struct pmu

This patch renames struct hw_perf_counter_ops into struct pmu. It
introduces a structure to describe a cpu specific pmu (performance
monitoring unit). It may contain ops and data. The new name of the
structure fits better, is shorter, and thus better to handle. Where it
was appropriate, names of function and variable have been changed too.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 25 ++++++-----
 arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++--------
 include/linux/perf_counter.h       |  9 ++--
 kernel/perf_counter.c              | 68 ++++++++++++++----------------
 4 files changed, 66 insertions(+), 73 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bd76d0fa2c357..d9bbe5efc6492 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
 	return 0;
 }
 
-static void power_perf_read(struct perf_counter *counter)
+static void power_pmu_read(struct perf_counter *counter)
 {
 	long val, delta, prev;
 
@@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable)
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) {
-			power_perf_read(counter);
+			power_pmu_read(counter);
 			write_pmc(counter->hw.idx, 0);
 			counter->hw.idx = 0;
 		}
@@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu)
 	counter->oncpu = cpu;
 	counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped;
 	if (is_software_counter(counter))
-		counter->hw_ops->enable(counter);
+		counter->pmu->enable(counter);
 }
 
 /*
@@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
  * re-enable the PMU in order to get hw_perf_restore to do the
  * actual work of reconfiguring the PMU.
  */
-static int power_perf_enable(struct perf_counter *counter)
+static int power_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
@@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter)
 /*
  * Remove a counter from the PMU.
  */
-static void power_perf_disable(struct perf_counter *counter)
+static void power_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	long i;
@@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter)
 	local_irq_save(flags);
 	pmudis = hw_perf_save_disable();
 
-	power_perf_read(counter);
+	power_pmu_read(counter);
 
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	for (i = 0; i < cpuhw->n_counters; ++i) {
@@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter)
 	local_irq_restore(flags);
 }
 
-struct hw_perf_counter_ops power_perf_ops = {
-	.enable = power_perf_enable,
-	.disable = power_perf_disable,
-	.read = power_perf_read
+struct pmu power_pmu = {
+	.enable		= power_pmu_enable,
+	.disable	= power_pmu_disable,
+	.read		= power_pmu_read,
 };
 
 /* Number of perf_counters counting hardware events */
@@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	unsigned long ev;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
@@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 
 	if (err)
 		return ERR_PTR(err);
-	return &power_perf_ops;
+	return &power_pmu;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ad663d5ad2d91..95de980c74a02 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__pmc_generic_disable(struct perf_counter *counter,
-			   struct hw_perf_counter *hwc, unsigned int idx)
+__x86_pmu_disable(struct perf_counter *counter,
+		  struct hw_perf_counter *hwc, unsigned int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
@@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter,
 }
 
 static void
-__pmc_generic_enable(struct perf_counter *counter,
-			  struct hw_perf_counter *hwc, int idx)
+__x86_pmu_enable(struct perf_counter *counter,
+		 struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_enable(counter, hwc, idx);
@@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 /*
  * Find a PMC slot for the freshly enabled / scheduled in counter:
  */
-static int pmc_generic_enable(struct perf_counter *counter)
+static int x86_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -667,7 +667,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__pmc_generic_disable(counter, hwc, idx);
+	__x86_pmu_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
 	/*
@@ -676,7 +676,7 @@ static int pmc_generic_enable(struct perf_counter *counter)
 	barrier();
 
 	__hw_perf_counter_set_period(counter, hwc, idx);
-	__pmc_generic_enable(counter, hwc, idx);
+	__x86_pmu_enable(counter, hwc, idx);
 
 	return 0;
 }
@@ -731,13 +731,13 @@ void perf_counter_print_debug(void)
 	local_irq_enable();
 }
 
-static void pmc_generic_disable(struct perf_counter *counter)
+static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
-	__pmc_generic_disable(counter, hwc, idx);
+	__x86_pmu_disable(counter, hwc, idx);
 
 	clear_bit(idx, cpuc->used);
 	cpuc->counters[idx] = NULL;
@@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	__hw_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		__pmc_generic_enable(counter, hwc, idx);
+		__x86_pmu_enable(counter, hwc, idx);
 }
 
 /*
@@ -805,7 +805,7 @@ static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 
 		perf_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
-			__pmc_generic_disable(counter, &counter->hw, bit);
+			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
 
 	hw_perf_ack_status(ack);
@@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void pmc_generic_read(struct perf_counter *counter)
+static void x86_pmu_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
 
-static const struct hw_perf_counter_ops x86_perf_counter_ops = {
-	.enable		= pmc_generic_enable,
-	.disable	= pmc_generic_disable,
-	.read		= pmc_generic_read,
+static const struct pmu pmu = {
+	.enable		= x86_pmu_enable,
+	.disable	= x86_pmu_disable,
+	.read		= x86_pmu_read,
 };
 
-const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	int err;
 
@@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter)
 	if (err)
 		return ERR_PTR(err);
 
-	return &x86_perf_counter_ops;
+	return &pmu;
 }
 
 /*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index be10b3ffe3208..c3db52dc876ad 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -334,9 +334,9 @@ struct hw_perf_counter {
 struct perf_counter;
 
 /**
- * struct hw_perf_counter_ops - performance counter hw ops
+ * struct pmu - generic performance monitoring unit
  */
-struct hw_perf_counter_ops {
+struct pmu {
 	int (*enable)			(struct perf_counter *counter);
 	void (*disable)			(struct perf_counter *counter);
 	void (*read)			(struct perf_counter *counter);
@@ -381,7 +381,7 @@ struct perf_counter {
 	struct list_head		sibling_list;
 	int 				nr_siblings;
 	struct perf_counter		*group_leader;
-	const struct hw_perf_counter_ops *hw_ops;
+	const struct pmu		*pmu;
 
 	enum perf_counter_active_state	state;
 	enum perf_counter_active_state	prev_state;
@@ -519,8 +519,7 @@ struct perf_cpu_context {
  */
 extern int perf_max_counters;
 
-extern const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter);
+extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 09396098dd0df..582108addefab 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex);
 /*
  * Architecture provided APIs - weak aliases:
  */
-extern __weak const struct hw_perf_counter_ops *
-hw_perf_counter_init(struct perf_counter *counter)
+extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	return NULL;
 }
@@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter,
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_stopped = ctx->time;
-	counter->hw_ops->disable(counter);
+	counter->pmu->disable(counter);
 	counter->oncpu = -1;
 
 	if (!is_software_counter(counter))
@@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter,
 	 */
 	smp_wmb();
 
-	if (counter->hw_ops->enable(counter)) {
+	if (counter->pmu->enable(counter)) {
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->oncpu = -1;
 		return -EAGAIN;
@@ -1096,7 +1095,7 @@ static void __read(void *info)
 	local_irq_save(flags);
 	if (ctx->is_active)
 		update_context_time(ctx);
-	counter->hw_ops->read(counter);
+	counter->pmu->read(counter);
 	update_counter_times(counter);
 	local_irq_restore(flags);
 }
@@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		leader = counter->group_leader;
 		list_for_each_entry(sub, &leader->sibling_list, list_entry) {
 			if (sub != counter)
-				sub->hw_ops->read(sub);
+				sub->pmu->read(sub);
 
 			group_entry.event = sub->hw_event.config;
 			group_entry.counter = atomic64_read(&sub->count);
@@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	struct pt_regs *regs;
 
 	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
-	counter->hw_ops->read(counter);
+	counter->pmu->read(counter);
 
 	regs = get_irq_regs();
 	/*
@@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter)
 	perf_swcounter_update(counter);
 }
 
-static const struct hw_perf_counter_ops perf_ops_generic = {
+static const struct pmu perf_ops_generic = {
 	.enable		= perf_swcounter_enable,
 	.disable	= perf_swcounter_disable,
 	.read		= perf_swcounter_read,
@@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 	cpu_clock_perf_counter_update(counter);
 }
 
-static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
+static const struct pmu perf_ops_cpu_clock = {
 	.enable		= cpu_clock_perf_counter_enable,
 	.disable	= cpu_clock_perf_counter_disable,
 	.read		= cpu_clock_perf_counter_read,
@@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter)
 	task_clock_perf_counter_update(counter, time);
 }
 
-static const struct hw_perf_counter_ops perf_ops_task_clock = {
+static const struct pmu perf_ops_task_clock = {
 	.enable		= task_clock_perf_counter_enable,
 	.disable	= task_clock_perf_counter_disable,
 	.read		= task_clock_perf_counter_read,
@@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
 	cpu_migrations_perf_counter_update(counter);
 }
 
-static const struct hw_perf_counter_ops perf_ops_cpu_migrations = {
+static const struct pmu perf_ops_cpu_migrations = {
 	.enable		= cpu_migrations_perf_counter_enable,
 	.disable	= cpu_migrations_perf_counter_disable,
 	.read		= cpu_migrations_perf_counter_read,
@@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter)
 	ftrace_profile_disable(perf_event_id(&counter->hw_event));
 }
 
-static const struct hw_perf_counter_ops *
-tp_perf_counter_init(struct perf_counter *counter)
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
 	int event_id = perf_event_id(&counter->hw_event);
 	int ret;
@@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter)
 	return &perf_ops_generic;
 }
 #else
-static const struct hw_perf_counter_ops *
-tp_perf_counter_init(struct perf_counter *counter)
+static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
 	return NULL;
 }
 #endif
 
-static const struct hw_perf_counter_ops *
-sw_perf_counter_init(struct perf_counter *counter)
+static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_hw_event *hw_event = &counter->hw_event;
-	const struct hw_perf_counter_ops *hw_ops = NULL;
+	const struct pmu *pmu = NULL;
 	struct hw_perf_counter *hwc = &counter->hw;
 
 	/*
@@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter)
 	 */
 	switch (perf_event_id(&counter->hw_event)) {
 	case PERF_COUNT_CPU_CLOCK:
-		hw_ops = &perf_ops_cpu_clock;
+		pmu = &perf_ops_cpu_clock;
 
 		if (hw_event->irq_period && hw_event->irq_period < 10000)
 			hw_event->irq_period = 10000;
@@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter)
 		 * use the cpu_clock counter instead.
 		 */
 		if (counter->ctx->task)
-			hw_ops = &perf_ops_task_clock;
+			pmu = &perf_ops_task_clock;
 		else
-			hw_ops = &perf_ops_cpu_clock;
+			pmu = &perf_ops_cpu_clock;
 
 		if (hw_event->irq_period && hw_event->irq_period < 10000)
 			hw_event->irq_period = 10000;
@@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_PAGE_FAULTS_MIN:
 	case PERF_COUNT_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		hw_ops = &perf_ops_generic;
+		pmu = &perf_ops_generic;
 		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
 		if (!counter->hw_event.exclude_kernel)
-			hw_ops = &perf_ops_cpu_migrations;
+			pmu = &perf_ops_cpu_migrations;
 		break;
 	}
 
-	if (hw_ops)
+	if (pmu)
 		hwc->irq_period = hw_event->irq_period;
 
-	return hw_ops;
+	return pmu;
 }
 
 /*
@@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 		   struct perf_counter *group_leader,
 		   gfp_t gfpflags)
 {
-	const struct hw_perf_counter_ops *hw_ops;
+	const struct pmu *pmu;
 	struct perf_counter *counter;
 	long err;
 
@@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->group_leader		= group_leader;
-	counter->hw_ops			= NULL;
+	counter->pmu			= NULL;
 	counter->ctx			= ctx;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
-	hw_ops = NULL;
+	pmu = NULL;
 
 	if (perf_event_raw(hw_event)) {
-		hw_ops = hw_perf_counter_init(counter);
+		pmu = hw_perf_counter_init(counter);
 		goto done;
 	}
 
 	switch (perf_event_type(hw_event)) {
 	case PERF_TYPE_HARDWARE:
-		hw_ops = hw_perf_counter_init(counter);
+		pmu = hw_perf_counter_init(counter);
 		break;
 
 	case PERF_TYPE_SOFTWARE:
-		hw_ops = sw_perf_counter_init(counter);
+		pmu = sw_perf_counter_init(counter);
 		break;
 
 	case PERF_TYPE_TRACEPOINT:
-		hw_ops = tp_perf_counter_init(counter);
+		pmu = tp_perf_counter_init(counter);
 		break;
 	}
 done:
 	err = 0;
-	if (!hw_ops)
+	if (!pmu)
 		err = -EINVAL;
-	else if (IS_ERR(hw_ops))
-		err = PTR_ERR(hw_ops);
+	else if (IS_ERR(pmu))
+		err = PTR_ERR(pmu);
 
 	if (err) {
 		kfree(counter);
 		return ERR_PTR(err);
 	}
 
-	counter->hw_ops = hw_ops;
+	counter->pmu = pmu;
 
 	if (counter->hw_event.mmap)
 		atomic_inc(&nr_mmap_tracking);
-- 
GitLab


From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:04 +0200
Subject: [PATCH 0721/2198] perf_counter, x86: rename struct pmc_x86_ops into
 struct x86_pmu

This patch renames struct pmc_x86_ops into struct x86_pmu. It
introduces a structure to describe an x86 model specific pmu
(performance monitoring unit). It may contain ops and data. The new
name of the structure fits better, is shorter, and thus better to
handle. Where it was appropriate, names of function and variable have
been changed too.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++--------------
 1 file changed, 68 insertions(+), 67 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 95de980c74a02..808a1a1134633 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -44,9 +44,9 @@ struct cpu_hw_counters {
 };
 
 /*
- * struct pmc_x86_ops - performance counter x86 ops
+ * struct x86_pmu - generic x86 pmu
  */
-struct pmc_x86_ops {
+struct x86_pmu {
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	u64		(*get_status)(u64);
@@ -60,7 +60,7 @@ struct pmc_x86_ops {
 	int		max_events;
 };
 
-static struct pmc_x86_ops *pmc_ops __read_mostly;
+static struct x86_pmu *x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
@@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] =
   [PERF_COUNT_BUS_CYCLES]		= 0x013c,
 };
 
-static u64 pmc_intel_event_map(int event)
+static u64 intel_pmu_event_map(int event)
 {
 	return intel_perfmon_event_map[event];
 }
 
-static u64 pmc_intel_raw_event(u64 event)
+static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
@@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] =
   [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
 };
 
-static u64 pmc_amd_event_map(int event)
+static u64 amd_pmu_event_map(int event)
 {
 	return amd_perfmon_event_map[event];
 }
 
-static u64 pmc_amd_raw_event(u64 event)
+static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
@@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void)
 		disable_lapic_nmi_watchdog();
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_perfctr_nmi(pmc_ops->perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu->perfctr + i))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_evntsel_nmi(pmc_ops->eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu->eventsel + i))
 			goto eventsel_fail;
 	}
 
@@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(pmc_ops->eventsel + i);
+		release_evntsel_nmi(x86_pmu->eventsel + i);
 
 	i = nr_counters_generic;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(pmc_ops->perfctr + i);
+		release_perfctr_nmi(x86_pmu->perfctr + i);
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
@@ -216,8 +216,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		release_perfctr_nmi(pmc_ops->perfctr + i);
-		release_evntsel_nmi(pmc_ops->eventsel + i);
+		release_perfctr_nmi(x86_pmu->perfctr + i);
+		release_evntsel_nmi(x86_pmu->eventsel + i);
 	}
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (perf_event_raw(hw_event)) {
-		hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event));
+		hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event));
 	} else {
-		if (perf_event_id(hw_event) >= pmc_ops->max_events)
+		if (perf_event_id(hw_event) >= x86_pmu->max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu->event_map(perf_event_id(hw_event));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-static u64 pmc_intel_save_disable_all(void)
+static u64 intel_pmu_save_disable_all(void)
 {
 	u64 ctrl;
 
@@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void)
 	return ctrl;
 }
 
-static u64 pmc_amd_save_disable_all(void)
+static u64 amd_pmu_save_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int enabled, idx;
@@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void)
 	cpuc->enabled = 0;
 	/*
 	 * ensure we write the disable before we start disabling the
-	 * counters proper, so that pcm_amd_enable() does the right thing.
+	 * counters proper, so that amd_pmu_enable_counter() does the
+	 * right thing.
 	 */
 	barrier();
 
@@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return pmc_ops->save_disable_all();
+	return x86_pmu->save_disable_all();
 }
 /*
  * Exported because of ACPI idle
  */
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void pmc_intel_restore_all(u64 ctrl)
+static void intel_pmu_restore_all(u64 ctrl)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 }
 
-static void pmc_amd_restore_all(u64 ctrl)
+static void amd_pmu_restore_all(u64 ctrl)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
@@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->restore_all(ctrl);
+	x86_pmu->restore_all(ctrl);
 }
 /*
  * Exported because of ACPI idle
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static u64 pmc_intel_get_status(u64 mask)
+static u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
@@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask)
 	return status;
 }
 
-static u64 pmc_amd_get_status(u64 mask)
+static u64 amd_pmu_get_status(u64 mask)
 {
 	u64 status = 0;
 	int idx;
@@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return pmc_ops->get_status(mask);
+	return x86_pmu->get_status(mask);
 }
 
-static void pmc_intel_ack_status(u64 ack)
+static void intel_pmu_ack_status(u64 ack)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void pmc_amd_ack_status(u64 ack)
+static void amd_pmu_ack_status(u64 ack)
 {
 }
 
@@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->ack_status(ack);
+	x86_pmu->ack_status(ack);
 }
 
-static void pmc_intel_enable(int idx, u64 config)
+static void intel_pmu_enable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
 			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void pmc_amd_enable(int idx, u64 config)
+static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
@@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->enable(idx, config);
+	x86_pmu->enable(idx, config);
 }
 
-static void pmc_intel_disable(int idx, u64 config)
+static void intel_pmu_disable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
 }
 
-static void pmc_amd_disable(int idx, u64 config)
+static void amd_pmu_disable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
@@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	pmc_ops->disable(idx, config);
+	x86_pmu->disable(idx, config);
 }
 
 static inline void
@@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -661,8 +662,8 @@ static int x86_pmu_enable(struct perf_counter *counter)
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = pmc_ops->eventsel;
-		hwc->counter_base = pmc_ops->perfctr;
+		hwc->config_base  = x86_pmu->eventsel;
+		hwc->counter_base = x86_pmu->perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -710,8 +711,8 @@ void perf_counter_print_debug(void)
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl);
-		rdmsrl(pmc_ops->perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu->perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
-static struct pmc_x86_ops pmc_intel_ops = {
-	.save_disable_all	= pmc_intel_save_disable_all,
-	.restore_all		= pmc_intel_restore_all,
-	.get_status		= pmc_intel_get_status,
-	.ack_status		= pmc_intel_ack_status,
-	.enable			= pmc_intel_enable,
-	.disable		= pmc_intel_disable,
+static struct x86_pmu intel_pmu = {
+	.save_disable_all	= intel_pmu_save_disable_all,
+	.restore_all		= intel_pmu_restore_all,
+	.get_status		= intel_pmu_get_status,
+	.ack_status		= intel_pmu_ack_status,
+	.enable			= intel_pmu_enable_counter,
+	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
 	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0,
-	.event_map		= pmc_intel_event_map,
-	.raw_event		= pmc_intel_raw_event,
+	.event_map		= intel_pmu_event_map,
+	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 };
 
-static struct pmc_x86_ops pmc_amd_ops = {
-	.save_disable_all	= pmc_amd_save_disable_all,
-	.restore_all		= pmc_amd_restore_all,
-	.get_status		= pmc_amd_get_status,
-	.ack_status		= pmc_amd_ack_status,
-	.enable			= pmc_amd_enable,
-	.disable		= pmc_amd_disable,
+static struct x86_pmu amd_pmu = {
+	.save_disable_all	= amd_pmu_save_disable_all,
+	.restore_all		= amd_pmu_restore_all,
+	.get_status		= amd_pmu_get_status,
+	.ack_status		= amd_pmu_ack_status,
+	.enable			= amd_pmu_enable_counter,
+	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
 	.perfctr		= MSR_K7_PERFCTR0,
-	.event_map		= pmc_amd_event_map,
-	.raw_event		= pmc_amd_raw_event,
+	.event_map		= amd_pmu_event_map,
+	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
-static struct pmc_x86_ops *pmc_intel_init(void)
+static struct x86_pmu *intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void)
 	nr_counters_fixed = edx.split.num_counters_fixed;
 	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 
-	return &pmc_intel_ops;
+	return &intel_pmu;
 }
 
-static struct pmc_x86_ops *pmc_amd_init(void)
+static struct x86_pmu *amd_pmu_init(void)
 {
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
@@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void)
 
 	pr_info("AMD Performance Monitoring support detected.\n");
 
-	return &pmc_amd_ops;
+	return &amd_pmu;
 }
 
 void __init init_hw_perf_counters(void)
 {
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
-		pmc_ops = pmc_intel_init();
+		x86_pmu = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
-		pmc_ops = pmc_amd_init();
+		x86_pmu = amd_pmu_init();
 		break;
 	default:
 		return;
 	}
-	if (!pmc_ops)
+	if (!x86_pmu)
 		return;
 
 	pr_info("... num counters:    %d\n", nr_counters_generic);
-- 
GitLab


From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:05 +0200
Subject: [PATCH 0722/2198] perf_counter, x86: make interrupt handler model
 specific

This separates the perfcounter interrupt handler for AMD and Intel
cpus. The AMD interrupt handler implementation is a follow-on patch.

[ Impact: refactor and clean up code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 808a1a1134633..9d90de0bd0b0a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -4,6 +4,7 @@
  *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
  *  Copyright(C) 2009 Jaswinder Singh Rajput
+ *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -47,6 +48,7 @@ struct cpu_hw_counters {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	u64		(*get_status)(u64);
@@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
+	/* disable temporarily */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		return -ENOSYS;
+
 	if (unlikely(!perf_counters_initialized))
 		return -EINVAL;
 
@@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
@@ -827,6 +833,8 @@ static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi)
 	return ret;
 }
 
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+
 void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
@@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	ack_APIC_irq();
-	__smp_perf_counter_interrupt(regs, 0);
+	x86_pmu->handle_irq(regs, 0);
 	irq_exit();
 }
 
@@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = __smp_perf_counter_interrupt(regs, 1);
+	ret = x86_pmu->handle_irq(regs, 1);
 
 	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
@@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 };
 
 static struct x86_pmu intel_pmu = {
+	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
 	.get_status		= intel_pmu_get_status,
@@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = {
 };
 
 static struct x86_pmu amd_pmu = {
+	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
 	.get_status		= amd_pmu_get_status,
-- 
GitLab


From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:06 +0200
Subject: [PATCH 0723/2198] perf_counter, x86: remove get_status() from struct
 x86_pmu

This function is Intel only and not necessary for AMD cpus.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 39 ++++--------------------------
 1 file changed, 5 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9d90de0bd0b0a..d0bb02919c63c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -51,7 +51,6 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	u64		(*get_status)(u64);
 	void		(*ack_status)(u64);
 	void		(*enable)(int, u64);
 	void		(*disable)(int, u64);
@@ -405,41 +404,15 @@ void hw_perf_restore(u64 ctrl)
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
-	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
-
-	return status;
-}
-
-static u64 amd_pmu_get_status(u64 mask)
-{
-	u64 status = 0;
-	int idx;
-
-	for (idx = 0; idx < nr_counters_generic; idx++) {
-		s64 val;
-
-		if (!(mask & (1 << idx)))
-			continue;
-
-		rdmsrl(MSR_K7_PERFCTR0 + idx, val);
-		val <<= (64 - counter_value_bits);
-		if (val >= 0)
-			status |= (1 << idx);
-	}
-
-	return status;
-}
-
-static u64 hw_perf_get_status(u64 mask)
-{
 	if (unlikely(!perf_counters_initialized))
 		return 0;
+	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
-	return x86_pmu->get_status(mask);
+	return status;
 }
 
 static void intel_pmu_ack_status(u64 ack)
@@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	cpuc->throttle_ctrl = hw_perf_save_disable();
 
-	status = hw_perf_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (!status)
 		goto out;
 
@@ -820,7 +793,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	status = hw_perf_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (status)
 		goto again;
 out:
@@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = {
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
-	.get_status		= intel_pmu_get_status,
 	.ack_status		= intel_pmu_ack_status,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
@@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = {
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
-	.get_status		= amd_pmu_get_status,
 	.ack_status		= amd_pmu_ack_status,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
-- 
GitLab


From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:07 +0200
Subject: [PATCH 0724/2198] perf_counter, x86: remove ack_status() from struct
 x86_pmu

This function is Intel only and not necessary for AMD cpus.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d0bb02919c63c..6bbdc16cc69f9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -51,7 +51,6 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	void		(*ack_status)(u64);
 	void		(*enable)(int, u64);
 	void		(*disable)(int, u64);
 	unsigned	eventsel;
@@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask)
 	return status;
 }
 
-static void intel_pmu_ack_status(u64 ack)
+static inline void intel_pmu_ack_status(u64 ack)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void amd_pmu_ack_status(u64 ack)
-{
-}
-
-static void hw_perf_ack_status(u64 ack)
-{
-	if (unlikely(!perf_counters_initialized))
-		return;
-
-	x86_pmu->ack_status(ack);
-}
-
 static void intel_pmu_enable_counter(int idx, u64 config)
 {
 	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
@@ -788,7 +775,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
 
-	hw_perf_ack_status(ack);
+	intel_pmu_ack_status(ack);
 
 	/*
 	 * Repeat if there is more work to be done:
@@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = {
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
-	.ack_status		= intel_pmu_ack_status,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
@@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = {
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
-	.ack_status		= amd_pmu_ack_status,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
-- 
GitLab


From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:08 +0200
Subject: [PATCH 0725/2198] perf_counter, x86: rename
 __hw_perf_counter_set_period into x86_perf_counter_set_period

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6bbdc16cc69f9..fa6541d781bc9 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * To be called with the counter disabled in hw:
  */
 static void
-__hw_perf_counter_set_period(struct perf_counter *counter,
+x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
@@ -642,7 +642,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	 */
 	barrier();
 
-	__hw_perf_counter_set_period(counter, hwc, idx);
+	x86_perf_counter_set_period(counter, hwc, idx);
 	__x86_pmu_enable(counter, hwc, idx);
 
 	return 0;
@@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter)
 	int idx = hwc->idx;
 
 	x86_perf_counter_update(counter, hwc, idx);
-	__hw_perf_counter_set_period(counter, hwc, idx);
+	x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		__x86_pmu_enable(counter, hwc, idx);
-- 
GitLab


From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:09 +0200
Subject: [PATCH 0726/2198] perf_counter, x86: rename intel only functions

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fa6541d781bc9..5a52d73ccfa71 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void perf_save_and_restart(struct perf_counter *counter)
+static void intel_pmu_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
@@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
 	int ret = 0;
 
-	cpuc->throttle_ctrl = hw_perf_save_disable();
+	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
 	status = intel_pmu_get_status(cpuc->throttle_ctrl);
 	if (!status)
@@ -770,7 +770,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		if (!counter)
 			continue;
 
-		perf_save_and_restart(counter);
+		intel_pmu_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			__x86_pmu_disable(counter, &counter->hw, bit);
 	}
@@ -788,7 +788,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
 	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		hw_perf_restore(cpuc->throttle_ctrl);
+		intel_pmu_restore_all(cpuc->throttle_ctrl);
 
 	return ret;
 }
-- 
GitLab


From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:10 +0200
Subject: [PATCH 0727/2198] perf_counter, x86: modify initialization of struct
 x86_pmu

This patch adds an error handler and changes initialization of struct
x86_pmu. No functional changes. Needed for follow-on patches.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5a52d73ccfa71..7c72a94236367 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = {
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
 };
 
-static struct x86_pmu *intel_pmu_init(void)
+static int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
@@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void)
 	unsigned int ebx;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return NULL;
+		return -ENODEV;
 
 	/*
 	 * Check whether the Architectural PerfMon supports
@@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void)
 	 */
 	cpuid(10, &eax.full, &ebx, &unused, &edx.full);
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
-		return NULL;
+		return -ENODEV;
 
 	intel_perfmon_version = eax.split.version_id;
 	if (intel_perfmon_version < 2)
-		return NULL;
+		return -ENODEV;
 
 	pr_info("Intel Performance Monitoring support detected.\n");
 	pr_info("... version:         %d\n", intel_perfmon_version);
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
+	x86_pmu = &intel_pmu;
+
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
 	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
 
-	return &intel_pmu;
+	return 0;
 }
 
-static struct x86_pmu *amd_pmu_init(void)
+static int amd_pmu_init(void)
 {
+	x86_pmu = &amd_pmu;
+
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
 	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
 	counter_value_bits = 48;
 
 	pr_info("AMD Performance Monitoring support detected.\n");
-
-	return &amd_pmu;
+	return 0;
 }
 
 void __init init_hw_perf_counters(void)
 {
+	int err;
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
-		x86_pmu = intel_pmu_init();
+		err = intel_pmu_init();
 		break;
 	case X86_VENDOR_AMD:
-		x86_pmu = amd_pmu_init();
+		err = amd_pmu_init();
 		break;
 	default:
 		return;
 	}
-	if (!x86_pmu)
+	if (err != 0)
 		return;
 
 	pr_info("... num counters:    %d\n", nr_counters_generic);
-- 
GitLab


From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:11 +0200
Subject: [PATCH 0728/2198] perf_counter, x86: make x86_pmu data a static
 struct

Instead of using a pointer to reference to the x86 pmu we now have one
single data structure that is initialized at the beginning. This saves
the pointer access when using this memory.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++---------------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7c72a94236367..68597d763389b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -60,7 +60,7 @@ struct x86_pmu {
 	int		max_events;
 };
 
-static struct x86_pmu *x86_pmu __read_mostly;
+static struct x86_pmu x86_pmu __read_mostly;
 
 static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
@@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void)
 		disable_lapic_nmi_watchdog();
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_perfctr_nmi(x86_pmu->perfctr + i))
+		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		if (!reserve_evntsel_nmi(x86_pmu->eventsel + i))
+		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
 
@@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void)
 
 eventsel_fail:
 	for (i--; i >= 0; i--)
-		release_evntsel_nmi(x86_pmu->eventsel + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
 
 	i = nr_counters_generic;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
-		release_perfctr_nmi(x86_pmu->perfctr + i);
+		release_perfctr_nmi(x86_pmu.perfctr + i);
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		enable_lapic_nmi_watchdog();
@@ -216,8 +216,8 @@ static void release_pmc_hardware(void)
 	int i;
 
 	for (i = 0; i < nr_counters_generic; i++) {
-		release_perfctr_nmi(x86_pmu->perfctr + i);
-		release_evntsel_nmi(x86_pmu->eventsel + i);
+		release_perfctr_nmi(x86_pmu.perfctr + i);
+		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
 
 	if (nmi_watchdog == NMI_LOCAL_APIC)
@@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * Raw event type provide the config in the event structure
 	 */
 	if (perf_event_raw(hw_event)) {
-		hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event));
+		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
 	} else {
-		if (perf_event_id(hw_event) >= x86_pmu->max_events)
+		if (perf_event_id(hw_event) >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu->event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
@@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void)
 	if (unlikely(!perf_counters_initialized))
 		return 0;
 
-	return x86_pmu->save_disable_all();
+	return x86_pmu.save_disable_all();
 }
 /*
  * Exported because of ACPI idle
@@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->restore_all(ctrl);
+	x86_pmu.restore_all(ctrl);
 }
 /*
  * Exported because of ACPI idle
@@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->enable(idx, config);
+	x86_pmu.enable(idx, config);
 }
 
 static void intel_pmu_disable_counter(int idx, u64 config)
@@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config)
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu->disable(idx, config);
+	x86_pmu.disable(idx, config);
 }
 
 static inline void
@@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
@@ -628,8 +628,8 @@ static int x86_pmu_enable(struct perf_counter *counter)
 			set_bit(idx, cpuc->used);
 			hwc->idx = idx;
 		}
-		hwc->config_base  = x86_pmu->eventsel;
-		hwc->counter_base = x86_pmu->perfctr;
+		hwc->config_base  = x86_pmu.eventsel;
+		hwc->counter_base = x86_pmu.perfctr;
 	}
 
 	perf_counters_lapic_init(hwc->nmi);
@@ -677,8 +677,8 @@ void perf_counter_print_debug(void)
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
 	for (idx = 0; idx < nr_counters_generic; idx++) {
-		rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl);
-		rdmsrl(x86_pmu->perfctr  + idx, pmc_count);
+		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
+		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
 		prev_left = per_cpu(prev_left[idx], cpu);
 
@@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
 	irq_enter();
 	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
 	ack_APIC_irq();
-	x86_pmu->handle_irq(regs, 0);
+	x86_pmu.handle_irq(regs, 0);
 	irq_exit();
 }
 
@@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = x86_pmu->handle_irq(regs, 1);
+	ret = x86_pmu.handle_irq(regs, 1);
 
 	return ret ? NOTIFY_STOP : NOTIFY_OK;
 }
@@ -940,7 +940,7 @@ static int intel_pmu_init(void)
 	pr_info("... bit width:       %d\n", eax.split.bit_width);
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
-	x86_pmu = &intel_pmu;
+	x86_pmu = intel_pmu;
 
 	nr_counters_generic = eax.split.num_counters;
 	nr_counters_fixed = edx.split.num_counters_fixed;
@@ -951,7 +951,7 @@ static int intel_pmu_init(void)
 
 static int amd_pmu_init(void)
 {
-	x86_pmu = &amd_pmu;
+	x86_pmu = amd_pmu;
 
 	nr_counters_generic = 4;
 	nr_counters_fixed = 0;
-- 
GitLab


From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:12 +0200
Subject: [PATCH 0729/2198] perf_counter, x86: move counter parameters to
 struct x86_pmu

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++----------------
 1 file changed, 37 insertions(+), 43 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 68597d763389b..75dbb1f0900ec 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -24,16 +24,7 @@
 #include <asm/nmi.h>
 
 static bool perf_counters_initialized __read_mostly;
-
-/*
- * Number of (generic) HW counters:
- */
-static int nr_counters_generic __read_mostly;
 static u64 perf_counter_mask __read_mostly;
-static u64 counter_value_mask __read_mostly;
-static int counter_value_bits __read_mostly;
-
-static int nr_counters_fixed __read_mostly;
 
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
@@ -58,6 +49,10 @@ struct x86_pmu {
 	u64		(*event_map)(int);
 	u64		(*raw_event)(u64);
 	int		max_events;
+	int		num_counters;
+	int		num_counters_fixed;
+	int		counter_bits;
+	u64		counter_mask;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void)
 	if (nmi_watchdog == NMI_LOCAL_APIC)
 		disable_lapic_nmi_watchdog();
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
 			goto perfctr_fail;
 	}
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
 			goto eventsel_fail;
 	}
@@ -199,7 +194,7 @@ static bool reserve_pmc_hardware(void)
 	for (i--; i >= 0; i--)
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 
-	i = nr_counters_generic;
+	i = x86_pmu.num_counters;
 
 perfctr_fail:
 	for (i--; i >= 0; i--)
@@ -215,7 +210,7 @@ static void release_pmc_hardware(void)
 {
 	int i;
 
-	for (i = 0; i < nr_counters_generic; i++) {
+	for (i = 0; i < x86_pmu.num_counters; i++) {
 		release_perfctr_nmi(x86_pmu.perfctr + i);
 		release_evntsel_nmi(x86_pmu.eventsel + i);
 	}
@@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void)
 	 */
 	barrier();
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	if (!ctrl)
 		return;
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
@@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	atomic64_set(&hwc->prev_count, (u64)-left);
 
 	err = checking_wrmsrl(hwc->counter_base + idx,
-			     (u64)(-left) & counter_value_mask);
+			     (u64)(-left) & x86_pmu.counter_mask);
 }
 
 static inline void
@@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		/* Try to get the previous generic counter again */
 		if (test_and_set_bit(idx, cpuc->used)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used, nr_counters_generic);
-			if (idx == nr_counters_generic)
+			idx = find_first_zero_bit(cpuc->used,
+						  x86_pmu.num_counters);
+			if (idx == x86_pmu.num_counters)
 				return -EAGAIN;
 
 			set_bit(idx, cpuc->used);
@@ -654,7 +650,7 @@ void perf_counter_print_debug(void)
 	struct cpu_hw_counters *cpuc;
 	int cpu, idx;
 
-	if (!nr_counters_generic)
+	if (!x86_pmu.num_counters)
 		return;
 
 	local_irq_disable();
@@ -676,7 +672,7 @@ void perf_counter_print_debug(void)
 	}
 	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
 
-	for (idx = 0; idx < nr_counters_generic; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
 		rdmsrl(x86_pmu.perfctr  + idx, pmc_count);
 
@@ -689,7 +685,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n",
 			cpu, idx, prev_left);
 	}
-	for (idx = 0; idx < nr_counters_fixed; idx++) {
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
 
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = {
 	.event_map		= amd_pmu_event_map,
 	.raw_event		= amd_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(amd_perfmon_event_map),
+	.num_counters		= 4,
+	.counter_bits		= 48,
+	.counter_mask		= (1ULL << 48) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -941,10 +940,10 @@ static int intel_pmu_init(void)
 	pr_info("... mask length:     %d\n", eax.split.mask_length);
 
 	x86_pmu = intel_pmu;
-
-	nr_counters_generic = eax.split.num_counters;
-	nr_counters_fixed = edx.split.num_counters_fixed;
-	counter_value_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+	x86_pmu.counter_bits = eax.split.bit_width;
+	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
 	return 0;
 }
@@ -952,12 +951,6 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
-
-	nr_counters_generic = 4;
-	nr_counters_fixed = 0;
-	counter_value_mask = 0x0000FFFFFFFFFFFFULL;
-	counter_value_bits = 48;
-
 	pr_info("AMD Performance Monitoring support detected.\n");
 	return 0;
 }
@@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void)
 	if (err != 0)
 		return;
 
-	pr_info("... num counters:    %d\n", nr_counters_generic);
-	if (nr_counters_generic > X86_PMC_MAX_GENERIC) {
-		nr_counters_generic = X86_PMC_MAX_GENERIC;
+	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
+	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
+		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
-			nr_counters_generic, X86_PMC_MAX_GENERIC);
+		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
 	}
-	perf_counter_mask = (1 << nr_counters_generic) - 1;
-	perf_max_counters = nr_counters_generic;
+	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
+	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask:      %016Lx\n", counter_value_mask);
+	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
 
-	if (nr_counters_fixed > X86_PMC_MAX_FIXED) {
-		nr_counters_fixed = X86_PMC_MAX_FIXED;
+	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
+		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
-			nr_counters_fixed, X86_PMC_MAX_FIXED);
+		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters:  %d\n", nr_counters_fixed);
+	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
 
-	perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED;
+	perf_counter_mask |=
+		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 	perf_counters_initialized = true;
-- 
GitLab


From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:13 +0200
Subject: [PATCH 0730/2198] perf_counter, x86: make pmu version generic

This makes the use of the version variable generic. Also, some debug
messages have been generalized.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 75dbb1f0900ec..15d2c03e16f14 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -39,6 +39,8 @@ struct cpu_hw_counters {
  * struct x86_pmu - generic x86 pmu
  */
 struct x86_pmu {
+	const char	*name;
+	int		version;
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
@@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
-static __read_mostly int intel_perfmon_version;
-
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -658,7 +658,7 @@ void perf_counter_print_debug(void)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (intel_perfmon_version >= 2) {
+	if (x86_pmu.version >= 2) {
 		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
@@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 };
 
 static struct x86_pmu intel_pmu = {
+	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
 	.save_disable_all	= intel_pmu_save_disable_all,
 	.restore_all		= intel_pmu_restore_all,
@@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = {
 };
 
 static struct x86_pmu amd_pmu = {
+	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
 	.save_disable_all	= amd_pmu_save_disable_all,
 	.restore_all		= amd_pmu_restore_all,
@@ -918,6 +920,7 @@ static int intel_pmu_init(void)
 	union cpuid10_eax eax;
 	unsigned int unused;
 	unsigned int ebx;
+	int version;
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return -ENODEV;
@@ -930,16 +933,12 @@ static int intel_pmu_init(void)
 	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
 		return -ENODEV;
 
-	intel_perfmon_version = eax.split.version_id;
-	if (intel_perfmon_version < 2)
+	version = eax.split.version_id;
+	if (version < 2)
 		return -ENODEV;
 
-	pr_info("Intel Performance Monitoring support detected.\n");
-	pr_info("... version:         %d\n", intel_perfmon_version);
-	pr_info("... bit width:       %d\n", eax.split.bit_width);
-	pr_info("... mask length:     %d\n", eax.split.mask_length);
-
 	x86_pmu = intel_pmu;
+	x86_pmu.version = version;
 	x86_pmu.num_counters = eax.split.num_counters;
 	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
 	x86_pmu.counter_bits = eax.split.bit_width;
@@ -951,7 +950,6 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
-	pr_info("AMD Performance Monitoring support detected.\n");
 	return 0;
 }
 
@@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void)
 	if (err != 0)
 		return;
 
+	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
+	pr_info("... version:         %d\n", x86_pmu.version);
+	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
+
 	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
-- 
GitLab


From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:14 +0200
Subject: [PATCH 0731/2198] perf_counter, x86: make x86_pmu_read() static
 inline

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 15d2c03e16f14..3f3ae477a7dc6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void)
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-static void x86_pmu_read(struct perf_counter *counter)
+static inline void x86_pmu_read(struct perf_counter *counter)
 {
 	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
 }
-- 
GitLab


From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:15 +0200
Subject: [PATCH 0732/2198] perf_counter, x86: rename cpuc->active_mask

This is to have a consistent naming scheme with cpuc->used.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f3ae477a7dc6..9ec51a662db5d 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly;
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
 	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
-	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	int			enabled;
 };
 
@@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active_mask))
+		if (!test_bit(idx, cpuc->active))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
@@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active_mask))
+		if (!test_bit(idx, cpuc->active))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
@@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, cpuc->active_mask);
+	set_bit(idx, cpuc->active);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	clear_bit(idx, cpuc->active_mask);
+	clear_bit(idx, cpuc->active);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
-- 
GitLab


From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:16 +0200
Subject: [PATCH 0733/2198] perf_counter, x86: generic use of cpuc->active

cpuc->active will now be used to indicate an enabled counter which
implies also valid pointers of cpuc->counters[]. In contrast,
cpuc->used only locks the counter, but it can be still uninitialized.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9ec51a662db5d..f7fd4a355159b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 
-	set_bit(idx, cpuc->active);
 	if (cpuc->enabled)
 		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
@@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config)
 
 static void amd_pmu_disable_counter(int idx, u64 config)
 {
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	clear_bit(idx, cpuc->active);
 	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
 
 }
@@ -633,10 +629,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	__x86_pmu_disable(counter, hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	/*
-	 * Make it visible before enabling the hw:
-	 */
-	barrier();
+	set_bit(idx, cpuc->active);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
 	__x86_pmu_enable(counter, hwc, idx);
@@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	unsigned int idx = hwc->idx;
 
+	/*
+	 * Must be done before we disable, otherwise the nmi handler
+	 * could reenable again:
+	 */
+	clear_bit(idx, cpuc->active);
 	__x86_pmu_disable(counter, hwc, idx);
 
-	clear_bit(idx, cpuc->used);
-	cpuc->counters[idx] = NULL;
 	/*
 	 * Make sure the cleared pointer becomes visible before we
 	 * (potentially) free the counter:
@@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * that we are disabling:
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
+	cpuc->counters[idx] = NULL;
+	clear_bit(idx, cpuc->used);
 }
 
 /*
@@ -763,7 +761,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
-		if (!counter)
+		if (!test_bit(bit, cpuc->active))
 			continue;
 
 		intel_pmu_save_and_restart(counter);
-- 
GitLab


From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:17 +0200
Subject: [PATCH 0734/2198] perf_counter, x86: consistent use of type int for
 counter index

The type of counter index is sometimes implemented as unsigned
int. This patch changes this to have a consistent usage of int.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++----
 include/linux/perf_counter.h       | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f7fd4a355159b..d8beebeb270f6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config)
 
 static inline void
 __pmc_fixed_disable(struct perf_counter *counter,
-		    struct hw_perf_counter *hwc, unsigned int __idx)
+		    struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter,
 
 static inline void
 __x86_pmu_disable(struct perf_counter *counter,
-		  struct hw_perf_counter *hwc, unsigned int idx)
+		  struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
 		__pmc_fixed_disable(counter, hwc, idx);
@@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
 static inline void
 __pmc_fixed_enable(struct perf_counter *counter,
-		   struct hw_perf_counter *hwc, unsigned int __idx)
+		   struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	struct hw_perf_counter *hwc = &counter->hw;
-	unsigned int idx = hwc->idx;
+	int idx = hwc->idx;
 
 	/*
 	 * Must be done before we disable, otherwise the nmi handler
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c3db52dc876ad..41aed42700574 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -318,7 +318,7 @@ struct hw_perf_counter {
 			unsigned long			config_base;
 			unsigned long			counter_base;
 			int				nmi;
-			unsigned int			idx;
+			int				idx;
 		};
 		union { /* software */
 			atomic64_t			count;
-- 
GitLab


From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:18 +0200
Subject: [PATCH 0735/2198] perf_counter, x86: rework counter enable functions

There is vendor specific code in generic x86 code, and there is vendor
specific code that could be generic. This patch introduces
x86_pmu_enable_counter() for x86 generic code. Fixed counter code for
Intel is moved to Intel only functions. In the end, checks and calls
via function pointers were reduced to the necessary. Also, the
internal function i/f changed.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++----------------
 1 file changed, 24 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d8beebeb270f6..ae55933ce79c8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -44,7 +44,7 @@ struct x86_pmu {
 	int		(*handle_irq)(struct pt_regs *, int);
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
-	void		(*enable)(int, u64);
+	void		(*enable)(struct hw_perf_counter *, int);
 	void		(*disable)(int, u64);
 	unsigned	eventsel;
 	unsigned	perfctr;
@@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack)
 	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
 }
 
-static void intel_pmu_enable_counter(int idx, u64 config)
+static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx,
-			config | ARCH_PERFMON_EVENTSEL0_ENABLE);
-}
-
-static void amd_pmu_enable_counter(int idx, u64 config)
-{
-	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-
-	if (cpuc->enabled)
-		config |= ARCH_PERFMON_EVENTSEL0_ENABLE;
-
-	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-}
+	int err;
 
-static void hw_perf_enable(int idx, u64 config)
-{
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu.enable(idx, config);
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
 static void intel_pmu_disable_counter(int idx, u64 config)
@@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 }
 
 static inline void
-__pmc_fixed_enable(struct perf_counter *counter,
-		   struct hw_perf_counter *hwc, int __idx)
+intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, bits, mask;
@@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter,
 	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
-static void
-__x86_pmu_enable(struct perf_counter *counter,
-		 struct hw_perf_counter *hwc, int idx)
+static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		__pmc_fixed_enable(counter, hwc, idx);
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_enable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_enable_counter(hwc, idx);
+}
+
+static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (cpuc->enabled)
+		x86_pmu_enable_counter(hwc, idx);
 	else
-		hw_perf_enable(idx, hwc->config);
+		amd_pmu_disable_counter(idx, hwc->config);
 }
 
 static int
@@ -632,7 +628,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	set_bit(idx, cpuc->active);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
-	__x86_pmu_enable(counter, hwc, idx);
+	x86_pmu.enable(hwc, idx);
 
 	return 0;
 }
@@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 	x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-		__x86_pmu_enable(counter, hwc, idx);
+		intel_pmu_enable_counter(hwc, idx);
 }
 
 /*
-- 
GitLab


From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:19 +0200
Subject: [PATCH 0736/2198] perf_counter, x86: rework counter disable functions

As for the enable function, this patch reworks the disable functions
and introduces x86_pmu_disable_counter(). The internal function i/f in
struct x86_pmu changed too.

[ Impact: refactor and generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++----------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ae55933ce79c8..df9012bbd211a 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -45,7 +45,7 @@ struct x86_pmu {
 	u64		(*save_disable_all)(void);
 	void		(*restore_all)(u64);
 	void		(*enable)(struct hw_perf_counter *, int);
-	void		(*disable)(int, u64);
+	void		(*disable)(struct hw_perf_counter *, int);
 	unsigned	eventsel;
 	unsigned	perfctr;
 	u64		(*event_map)(int);
@@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
-static void intel_pmu_disable_counter(int idx, u64 config)
+static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config);
-}
-
-static void amd_pmu_disable_counter(int idx, u64 config)
-{
-	wrmsrl(MSR_K7_EVNTSEL0 + idx, config);
-
-}
+	int err;
 
-static void hw_perf_disable(int idx, u64 config)
-{
 	if (unlikely(!perf_counters_initialized))
 		return;
 
-	x86_pmu.disable(idx, config);
+	err = checking_wrmsrl(hwc->config_base + idx,
+			      hwc->config);
 }
 
 static inline void
-__pmc_fixed_disable(struct perf_counter *counter,
-		    struct hw_perf_counter *hwc, int __idx)
+intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
@@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter,
 }
 
 static inline void
-__x86_pmu_disable(struct perf_counter *counter,
-		  struct hw_perf_counter *hwc, int idx)
+intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL))
-		__pmc_fixed_disable(counter, hwc, idx);
-	else
-		hw_perf_disable(idx, hwc->config);
+	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
+		intel_pmu_disable_fixed(hwc, idx);
+		return;
+	}
+
+	x86_pmu_disable_counter(hwc, idx);
+}
+
+static inline void
+amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	x86_pmu_disable_counter(hwc, idx);
 }
 
 static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
@@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 	if (cpuc->enabled)
 		x86_pmu_enable_counter(hwc, idx);
 	else
-		amd_pmu_disable_counter(idx, hwc->config);
+		x86_pmu_disable_counter(hwc, idx);
 }
 
 static int
@@ -622,7 +620,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 
 	perf_counters_lapic_init(hwc->nmi);
 
-	__x86_pmu_disable(counter, hwc, idx);
+	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
 	set_bit(idx, cpuc->active);
@@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * could reenable again:
 	 */
 	clear_bit(idx, cpuc->active);
-	__x86_pmu_disable(counter, hwc, idx);
+	x86_pmu.disable(hwc, idx);
 
 	/*
 	 * Make sure the cleared pointer becomes visible before we
@@ -762,7 +760,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 		intel_pmu_save_and_restart(counter);
 		if (perf_counter_overflow(counter, nmi, regs, 0))
-			__x86_pmu_disable(counter, &counter->hw, bit);
+			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
 	intel_pmu_ack_status(ack);
-- 
GitLab


From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:20 +0200
Subject: [PATCH 0737/2198] perf_counter, x86: change and remove pmu
 initialization checks

Some functions are only called if the pmu was proper initialized. That
initalization checks can be removed. The way to check initialization
changed too. Now, the pointer to the interrupt handler is checked. If
it exists the pmu is initialized. This also removes a static variable
and uses struct x86_pmu as only data source for the check.

[ Impact: simplify code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++------------------
 1 file changed, 13 insertions(+), 21 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index df9012bbd211a..2d3681bbb5225 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -23,7 +23,6 @@
 #include <asm/stacktrace.h>
 #include <asm/nmi.h>
 
-static bool perf_counters_initialized __read_mostly;
 static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
@@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
+static inline int x86_pmu_initialized(void)
+{
+	return x86_pmu.handle_irq != NULL;
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
 		return -ENOSYS;
 
-	if (unlikely(!perf_counters_initialized))
-		return -EINVAL;
+	if (!x86_pmu_initialized())
+		return -ENODEV;
 
 	err = 0;
 	if (atomic_inc_not_zero(&num_counters)) {
@@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void)
 
 u64 hw_perf_save_disable(void)
 {
-	if (unlikely(!perf_counters_initialized))
+	if (!x86_pmu_initialized())
 		return 0;
-
 	return x86_pmu.save_disable_all();
 }
 /*
@@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl)
 
 void hw_perf_restore(u64 ctrl)
 {
-	if (unlikely(!perf_counters_initialized))
+	if (!x86_pmu_initialized())
 		return;
-
 	x86_pmu.restore_all(ctrl);
 }
 /*
@@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask)
 {
 	u64 status;
 
-	if (unlikely(!perf_counters_initialized))
-		return 0;
 	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 
 	return status;
@@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack)
 static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	int err;
-
-	if (unlikely(!perf_counters_initialized))
-		return;
-
 	err = checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
@@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	int err;
-
-	if (unlikely(!perf_counters_initialized))
-		return;
-
 	err = checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config);
 }
@@ -787,10 +779,10 @@ void perf_counter_unthrottle(void)
 {
 	struct cpu_hw_counters *cpuc;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+	if (!x86_pmu_initialized())
 		return;
 
-	if (unlikely(!perf_counters_initialized))
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
 		return;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi)
 {
 	u32 apic_val;
 
-	if (!perf_counters_initialized)
+	if (!x86_pmu_initialized())
 		return;
+
 	/*
 	 * Enable the performance counter vector in the APIC LVT:
 	 */
@@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void)
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-	perf_counters_initialized = true;
 
 	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
-- 
GitLab


From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:21 +0200
Subject: [PATCH 0738/2198] perf_counter, x86: implement the interrupt handler
 for AMD cpus

This patch implements the interrupt handler for AMD performance
counters. In difference to the Intel pmu, there is no single status
register and also there are no fixed counters. This makes the handler
very different and it is useful to make the handler vendor
specific. To check if a counter is overflowed the upper bit of the
counter is checked. Only counters where the active bit is set are
checked.

With this patch throttling is enabled for AMD performance counters.

This patch also reenables Linux performance counters on AMD cpus.

[ Impact: re-enable perfcounters on AMD CPUs ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 45 ++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2d3681bbb5225..f4d59d4cf3f18 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
-	/* disable temporarily */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
-		return -ENOSYS;
-
 	if (!x86_pmu_initialized())
 		return -ENODEV;
 
@@ -773,7 +769,43 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	return ret;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; }
+static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+{
+	int cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+	u64 val;
+	int handled = 0;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	int idx;
+
+	++cpuc->interrupts;
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		if (!test_bit(idx, cpuc->active))
+			continue;
+		counter = cpuc->counters[idx];
+		hwc = &counter->hw;
+		x86_perf_counter_update(counter, hwc, idx);
+		val = atomic64_read(&hwc->prev_count);
+		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+			continue;
+		/* counter overflow */
+		x86_perf_counter_set_period(counter, hwc, idx);
+		handled = 1;
+		inc_irq_stat(apic_perf_irqs);
+		if (perf_counter_overflow(counter, nmi, regs, 0))
+			amd_pmu_disable_counter(hwc, idx);
+		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
+			/*
+			 * do not reenable when throttled, but reload
+			 * the register
+			 */
+			amd_pmu_disable_counter(hwc, idx);
+		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+			amd_pmu_enable_counter(hwc, idx);
+	}
+	return handled;
+}
 
 void perf_counter_unthrottle(void)
 {
@@ -782,9 +814,6 @@ void perf_counter_unthrottle(void)
 	if (!x86_pmu_initialized())
 		return;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
-		return;
-
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-- 
GitLab


From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:22 +0200
Subject: [PATCH 0739/2198] perf_counter, x86: return raw count with
 x86_perf_counter_update()

To check on AMD cpus if a counter overflows, the upper bit of the raw
counter value must be checked. This value is already internally
available in x86_perf_counter_update(). Now, the value is returned so
that it can be used directly to check for overflows.

[ Impact: micro-optimization ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f4d59d4cf3f18..a8a53abd706d4 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event)
  * Can only be executed on the CPU where the counter is active.
  * Returns the delta events processed.
  */
-static void
+static u64
 x86_perf_counter_update(struct perf_counter *counter,
 			struct hw_perf_counter *hwc, int idx)
 {
@@ -165,6 +165,8 @@ x86_perf_counter_update(struct perf_counter *counter,
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
+
+	return new_raw_count;
 }
 
 static atomic_t num_counters;
@@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			continue;
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
-		x86_perf_counter_update(counter, hwc, idx);
-		val = atomic64_read(&hwc->prev_count);
+		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 		/* counter overflow */
-- 
GitLab


From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:23 +0200
Subject: [PATCH 0740/2198] perf_counter, x86: introduce max_period variable

In x86 pmus the allowed counter period to programm differs. This
introduces a max_period value and allows the generic implementation
for all models to check the max period.

[ Impact: generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a8a53abd706d4..4b8715b34f879 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -54,6 +54,7 @@ struct x86_pmu {
 	int		num_counters_fixed;
 	int		counter_bits;
 	u64		counter_mask;
+	u64		max_period;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 
 	hwc->irq_period		= hw_event->irq_period;
-	/*
-	 * Intel PMCs cannot be accessed sanely above 32 bit width,
-	 * so we install an artificial 1<<31 period regardless of
-	 * the generic counter period:
-	 */
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
-		if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
-			hwc->irq_period = 0x7FFFFFFF;
+	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
+		hwc->irq_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left, hwc->irq_period);
 
@@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.raw_event		= intel_pmu_raw_event,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
+	/*
+	 * Intel PMCs cannot be accessed sanely above 32 bit width,
+	 * so we install an artificial 1<<31 period regardless of
+	 * the generic counter period:
+	 */
+	.max_period		= (1ULL << 31) - 1,
 };
 
 static struct x86_pmu amd_pmu = {
@@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = {
 	.num_counters		= 4,
 	.counter_bits		= 48,
 	.counter_mask		= (1ULL << 48) - 1,
+	/* use highest bit to detect overflow */
+	.max_period		= (1ULL << 47) - 1,
 };
 
 static int intel_pmu_init(void)
@@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void)
 	perf_max_counters = x86_pmu.num_counters;
 
 	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
 
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
-- 
GitLab


From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:24 +0200
Subject: [PATCH 0741/2198] perf_counter, x86: remove vendor check in
 fixed_mode_idx()

The function fixed_mode_idx() is used generically. Now it checks the
num_counters_fixed value instead of the vendor to decide if fixed
counters are present.

[ Impact: generalize code ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 4b8715b34f879..d1c8036dcbd66 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 {
 	unsigned int event;
 
-	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
 	if (unlikely(hwc->nmi))
-- 
GitLab


From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:25 +0200
Subject: [PATCH 0742/2198] perf_counter, x86: remove unused function argument
 in intel_pmu_get_status()

The mask argument is unused and thus can be removed.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d1c8036dcbd66..856b0b8521928 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl)
  */
 EXPORT_SYMBOL_GPL(hw_perf_restore);
 
-static inline u64 intel_pmu_get_status(u64 mask)
+static inline u64 intel_pmu_get_status(void)
 {
 	u64 status;
 
@@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
-	status = intel_pmu_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status();
 	if (!status)
 		goto out;
 
@@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	/*
 	 * Repeat if there is more work to be done:
 	 */
-	status = intel_pmu_get_status(cpuc->throttle_ctrl);
+	status = intel_pmu_get_status();
 	if (status)
 		goto again;
 out:
-- 
GitLab


From 38105f0234d4795c77c7c6845916caf3a395c451 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 12:47:26 +0200
Subject: [PATCH 0743/2198] perf_counter: update 'perf top' documentation

The documentation about the perf-top build was outdated after
perfstat has been implemented. This updates it.

[ Impact: update documentation ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-30-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 6a276d2b2bbc1..8d28864a20c0f 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -3,7 +3,7 @@
 
    Build with:
 
-     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
+     make -C Documentation/perf_counter/
 
    Sample output:
 
-- 
GitLab


From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 29 Apr 2009 14:52:50 +0200
Subject: [PATCH 0744/2198] perf_counter: add/update copyrights

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++++----
 kernel/perf_counter.c              | 6 +++---
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 856b0b8521928..47e563bfd4cf2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1,10 +1,11 @@
 /*
  * Performance counter x86 architecture code
  *
- *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
- *  Copyright(C) 2009 Jaswinder Singh Rajput
- *  Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2009 Jaswinder Singh Rajput
+ *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  *  For licencing details see kernel-base/COPYING
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 582108addefab..a95a171e608cf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1,9 +1,9 @@
 /*
  * Performance counter core code
  *
- *  Copyright(C) 2008 Thomas Gleixner <tglx@linutronix.de>
- *  Copyright(C) 2008 Red Hat, Inc., Ingo Molnar
- *
+ *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
+ *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *
  *  For licensing details see kernel-base/COPYING
  */
-- 
GitLab


From ab7ef2e50a557af92f4f90689f51fadadafc16b2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 29 Apr 2009 22:38:51 +1000
Subject: [PATCH 0745/2198] perf_counter: powerpc: allow use of
 limited-function counters

POWER5+ and POWER6 have two hardware counters with limited functionality:
PMC5 counts instructions completed in run state and PMC6 counts cycles
in run state.  (Run state is the state when a hardware RUN bit is 1;
the idle task clears RUN while waiting for work to do and sets it when
there is work to do.)

These counters can't be written to by the kernel, can't generate
interrupts, and don't obey the freeze conditions.  That means we can
only use them for per-task counters (where we know we'll always be in
run state; we can't put a per-task counter on an idle task), and only
if we don't want interrupts and we do want to count in all processor
modes.

Obviously some counters can't go on a limited hardware counter, but there
are also situations where we can only put a counter on a limited hardware
counter - if there are already counters on that exclude some processor
modes and we want to put on a per-task cycle or instruction counter that
doesn't exclude any processor mode, it could go on if it can use a
limited hardware counter.

To keep track of these constraints, this adds a flags argument to the
processor-specific get_alternatives() functions, with three bits defined:
one to say that we can accept alternative event codes that go on limited
counters, one to say we only want alternatives on limited counters, and
one to say that this is a per-task counter and therefore events that are
gated by run state are equivalent to those that aren't (e.g. a "cycles"
event is equivalent to a "cycles in run state" event).  These flags
are computed for each counter and stored in the counter->hw.counter_base
field (slightly wonky name for what it does, but it was an existing
unused field).

Since the limited counters don't freeze when we freeze the other counters,
we need some special handling to avoid getting skew between things counted
on the limited counters and those counted on normal counters.  To minimize
this skew, if we are using any limited counters, we read PMC5 and PMC6
immediately after setting and clearing the freeze bit.  This is done in
a single asm in the new write_mmcr0() function.

The code here is specific to PMC5 and PMC6 being the limited hardware
counters.  Being more general (e.g. having a bitmap of limited hardware
counter numbers) would have meant more complex code to read the limited
counters when freezing and unfreezing the normal counters, with
conditional branches, which would have increased the skew.  Since it
isn't necessary for the code to be more general at this stage, it isn't.

This also extends the back-ends for POWER5+ and POWER6 to be able to
handle up to 6 counters rather than the 4 they previously handled.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <18936.19035.163066.892208@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h |  13 +-
 arch/powerpc/kernel/perf_counter.c      | 297 +++++++++++++++++++++---
 arch/powerpc/kernel/power4-pmu.c        |   3 +-
 arch/powerpc/kernel/power5+-pmu.c       | 117 ++++++++--
 arch/powerpc/kernel/power5-pmu.c        |   3 +-
 arch/powerpc/kernel/power6-pmu.c        | 119 ++++++++--
 arch/powerpc/kernel/ppc970-pmu.c        |   3 +-
 7 files changed, 479 insertions(+), 76 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 9d7ff6d7fb566..56d66c38143bc 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -12,6 +12,7 @@
 
 #define MAX_HWCOUNTERS		8
 #define MAX_EVENT_ALTERNATIVES	8
+#define MAX_LIMITED_HWCOUNTERS	2
 
 /*
  * This struct provides the constants and functions needed to
@@ -25,14 +26,24 @@ struct power_pmu {
 	int	(*compute_mmcr)(unsigned int events[], int n_ev,
 				unsigned int hwc[], u64 mmcr[]);
 	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(unsigned int event, unsigned int alt[]);
+	int	(*get_alternatives)(unsigned int event, unsigned int flags,
+				    unsigned int alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
+	int	(*limited_pmc_event)(unsigned int event);
+	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
 	int	n_generic;
 	int	*generic_events;
 };
 
 extern struct power_pmu *ppmu;
 
+/*
+ * Values for flags to get_alternatives()
+ */
+#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */
+#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
+#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
+
 /*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index d9bbe5efc6492..15cdc8e672296 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -23,10 +23,14 @@ struct cpu_hw_counters {
 	int n_percpu;
 	int disabled;
 	int n_added;
+	int n_limited;
+	u8  pmcs_enabled;
 	struct perf_counter *counter[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
+	unsigned int flags[MAX_HWCOUNTERS];
 	u64 mmcr[3];
-	u8 pmcs_enabled;
+	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
+	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS];
 };
 DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters);
 
@@ -127,7 +131,8 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], int n_ev)
+static int power_check_constraints(unsigned int event[], unsigned int cflags[],
+				   int n_ev)
 {
 	u64 mask, value, nv;
 	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
@@ -144,11 +149,15 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 
 	/* First see if the events will go on as-is */
 	for (i = 0; i < n_ev; ++i) {
-		alternatives[i][0] = event[i];
+		if ((cflags[i] & PPMU_LIMITED_PMC_REQD)
+		    && !ppmu->limited_pmc_event(event[i])) {
+			ppmu->get_alternatives(event[i], cflags[i],
+					       alternatives[i]);
+			event[i] = alternatives[i][0];
+		}
 		if (ppmu->get_constraint(event[i], &amasks[i][0],
 					 &avalues[i][0]))
 			return -1;
-		choice[i] = 0;
 	}
 	value = mask = 0;
 	for (i = 0; i < n_ev; ++i) {
@@ -166,7 +175,9 @@ static int power_check_constraints(unsigned int event[], int n_ev)
 	if (!ppmu->get_alternatives)
 		return -1;
 	for (i = 0; i < n_ev; ++i) {
-		n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]);
+		choice[i] = 0;
+		n_alt[i] = ppmu->get_alternatives(event[i], cflags[i],
+						  alternatives[i]);
 		for (j = 1; j < n_alt[i]; ++j)
 			ppmu->get_constraint(alternatives[i][j],
 					     &amasks[i][j], &avalues[i][j]);
@@ -231,28 +242,41 @@ static int power_check_constraints(unsigned int event[], int n_ev)
  * exclude_{user,kernel,hv} with each other and any previously
  * added counters.
  */
-static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new)
+static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
+			  int n_prev, int n_new)
 {
-	int eu, ek, eh;
-	int i, n;
+	int eu = 0, ek = 0, eh = 0;
+	int i, n, first;
 	struct perf_counter *counter;
 
 	n = n_prev + n_new;
 	if (n <= 1)
 		return 0;
 
-	eu = ctrs[0]->hw_event.exclude_user;
-	ek = ctrs[0]->hw_event.exclude_kernel;
-	eh = ctrs[0]->hw_event.exclude_hv;
-	if (n_prev == 0)
-		n_prev = 1;
-	for (i = n_prev; i < n; ++i) {
+	first = 1;
+	for (i = 0; i < n; ++i) {
+		if (cflags[i] & PPMU_LIMITED_PMC_OK) {
+			cflags[i] &= ~PPMU_LIMITED_PMC_REQD;
+			continue;
+		}
 		counter = ctrs[i];
-		if (counter->hw_event.exclude_user != eu ||
-		    counter->hw_event.exclude_kernel != ek ||
-		    counter->hw_event.exclude_hv != eh)
+		if (first) {
+			eu = counter->hw_event.exclude_user;
+			ek = counter->hw_event.exclude_kernel;
+			eh = counter->hw_event.exclude_hv;
+			first = 0;
+		} else if (counter->hw_event.exclude_user != eu ||
+			   counter->hw_event.exclude_kernel != ek ||
+			   counter->hw_event.exclude_hv != eh) {
 			return -EAGAIN;
+		}
 	}
+
+	if (eu || ek || eh)
+		for (i = 0; i < n; ++i)
+			if (cflags[i] & PPMU_LIMITED_PMC_OK)
+				cflags[i] |= PPMU_LIMITED_PMC_REQD;
+
 	return 0;
 }
 
@@ -279,6 +303,85 @@ static void power_pmu_read(struct perf_counter *counter)
 	atomic64_sub(delta, &counter->hw.period_left);
 }
 
+/*
+ * On some machines, PMC5 and PMC6 can't be written, don't respect
+ * the freeze conditions, and don't generate interrupts.  This tells
+ * us if `counter' is using such a PMC.
+ */
+static int is_limited_pmc(int pmcnum)
+{
+	return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
+}
+
+static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
+				    unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val, prev, delta;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		if (!counter->hw.idx)
+			continue;
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		prev = atomic64_read(&counter->hw.prev_count);
+		counter->hw.idx = 0;
+		delta = (val - prev) & 0xfffffffful;
+		atomic64_add(delta, &counter->count);
+	}
+}
+
+static void thaw_limited_counters(struct cpu_hw_counters *cpuhw,
+				  unsigned long pmc5, unsigned long pmc6)
+{
+	struct perf_counter *counter;
+	u64 val;
+	int i;
+
+	for (i = 0; i < cpuhw->n_limited; ++i) {
+		counter = cpuhw->limited_counter[i];
+		counter->hw.idx = cpuhw->limited_hwidx[i];
+		val = (counter->hw.idx == 5) ? pmc5 : pmc6;
+		atomic64_set(&counter->hw.prev_count, val);
+		perf_counter_update_userpage(counter);
+	}
+}
+
+/*
+ * Since limited counters don't respect the freeze conditions, we
+ * have to read them immediately after freezing or unfreezing the
+ * other counters.  We try to keep the values from the limited
+ * counters as consistent as possible by keeping the delay (in
+ * cycles and instructions) between freezing/unfreezing and reading
+ * the limited counters as small and consistent as possible.
+ * Therefore, if any limited counters are in use, we read them
+ * both, and always in the same order, to minimize variability,
+ * and do it inside the same asm that writes MMCR0.
+ */
+static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
+{
+	unsigned long pmc5, pmc6;
+
+	if (!cpuhw->n_limited) {
+		mtspr(SPRN_MMCR0, mmcr0);
+		return;
+	}
+
+	/*
+	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 */
+	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
+		     : "=&r" (pmc5), "=&r" (pmc6)
+		     : "r" (mmcr0), "i" (SPRN_MMCR0),
+		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
+
+	if (mmcr0 & MMCR0_FC)
+		freeze_limited_counters(cpuhw, pmc5, pmc6);
+	else
+		thaw_limited_counters(cpuhw, pmc5, pmc6);
+}
+
 /*
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
@@ -321,7 +424,7 @@ u64 hw_perf_save_disable(void)
 		 * executed and the PMU has frozen the counters
 		 * before we return.
 		 */
-		mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC);
+		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC);
 		mb();
 	}
 	local_irq_restore(flags);
@@ -342,6 +445,8 @@ void hw_perf_restore(u64 disable)
 	unsigned long val;
 	s64 left;
 	unsigned int hwc_index[MAX_HWCOUNTERS];
+	int n_lim;
+	int idx;
 
 	if (disable)
 		return;
@@ -414,10 +519,18 @@ void hw_perf_restore(u64 disable)
 	/*
 	 * Initialize the PMCs for all the new and moved counters.
 	 */
+	cpuhw->n_limited = n_lim = 0;
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
 		if (counter->hw.idx)
 			continue;
+		idx = hwc_index[i] + 1;
+		if (is_limited_pmc(idx)) {
+			cpuhw->limited_counter[n_lim] = counter;
+			cpuhw->limited_hwidx[n_lim] = idx;
+			++n_lim;
+			continue;
+		}
 		val = 0;
 		if (counter->hw_event.irq_period) {
 			left = atomic64_read(&counter->hw.period_left);
@@ -425,15 +538,16 @@ void hw_perf_restore(u64 disable)
 				val = 0x80000000L - left;
 		}
 		atomic64_set(&counter->hw.prev_count, val);
-		counter->hw.idx = hwc_index[i] + 1;
-		write_pmc(counter->hw.idx, val);
+		counter->hw.idx = idx;
+		write_pmc(idx, val);
 		perf_counter_update_userpage(counter);
 	}
+	cpuhw->n_limited = n_lim;
 	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE;
 
  out_enable:
 	mb();
-	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
 	/*
 	 * Enable instruction sampling if necessary
@@ -448,7 +562,8 @@ void hw_perf_restore(u64 disable)
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], unsigned int *events)
+			  struct perf_counter *ctrs[], unsigned int *events,
+			  unsigned int *flags)
 {
 	int n = 0;
 	struct perf_counter *counter;
@@ -457,6 +572,7 @@ static int collect_events(struct perf_counter *group, int max_count,
 		if (n >= max_count)
 			return -1;
 		ctrs[n] = group;
+		flags[n] = group->hw.counter_base;
 		events[n++] = group->hw.config;
 	}
 	list_for_each_entry(counter, &group->sibling_list, list_entry) {
@@ -465,6 +581,7 @@ static int collect_events(struct perf_counter *group, int max_count,
 			if (n >= max_count)
 				return -1;
 			ctrs[n] = counter;
+			flags[n] = counter->hw.counter_base;
 			events[n++] = counter->hw.config;
 		}
 	}
@@ -497,12 +614,14 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	n0 = cpuhw->n_counters;
 	n = collect_events(group_leader, ppmu->n_counter - n0,
-			   &cpuhw->counter[n0], &cpuhw->events[n0]);
+			   &cpuhw->counter[n0], &cpuhw->events[n0],
+			   &cpuhw->flags[n0]);
 	if (n < 0)
 		return -EAGAIN;
-	if (check_excludes(cpuhw->counter, n0, n))
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n))
 		return -EAGAIN;
-	if (power_check_constraints(cpuhw->events, n + n0))
+	i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0);
+	if (i < 0)
 		return -EAGAIN;
 	cpuhw->n_counters = n0 + n;
 	cpuhw->n_added += n;
@@ -554,9 +673,10 @@ static int power_pmu_enable(struct perf_counter *counter)
 		goto out;
 	cpuhw->counter[n0] = counter;
 	cpuhw->events[n0] = counter->hw.config;
-	if (check_excludes(cpuhw->counter, n0, 1))
+	cpuhw->flags[n0] = counter->hw.counter_base;
+	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1))
 		goto out;
-	if (power_check_constraints(cpuhw->events, n0 + 1))
+	if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1))
 		goto out;
 
 	counter->hw.config = cpuhw->events[n0];
@@ -592,12 +712,24 @@ static void power_pmu_disable(struct perf_counter *counter)
 				cpuhw->counter[i-1] = cpuhw->counter[i];
 			--cpuhw->n_counters;
 			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr);
-			write_pmc(counter->hw.idx, 0);
-			counter->hw.idx = 0;
+			if (counter->hw.idx) {
+				write_pmc(counter->hw.idx, 0);
+				counter->hw.idx = 0;
+			}
 			perf_counter_update_userpage(counter);
 			break;
 		}
 	}
+	for (i = 0; i < cpuhw->n_limited; ++i)
+		if (counter == cpuhw->limited_counter[i])
+			break;
+	if (i < cpuhw->n_limited) {
+		while (++i < cpuhw->n_limited) {
+			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i];
+			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i];
+		}
+		--cpuhw->n_limited;
+	}
 	if (cpuhw->n_counters == 0) {
 		/* disable exceptions if no counters are running */
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
@@ -613,6 +745,61 @@ struct pmu power_pmu = {
 	.read		= power_pmu_read,
 };
 
+/*
+ * Return 1 if we might be able to put counter on a limited PMC,
+ * or 0 if not.
+ * A counter can only go on a limited PMC if it counts something
+ * that a limited PMC can count, doesn't require interrupts, and
+ * doesn't exclude any processor mode.
+ */
+static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
+				 unsigned int flags)
+{
+	int n;
+	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+
+	if (counter->hw_event.exclude_user
+	    || counter->hw_event.exclude_kernel
+	    || counter->hw_event.exclude_hv
+	    || counter->hw_event.irq_period)
+		return 0;
+
+	if (ppmu->limited_pmc_event(ev))
+		return 1;
+
+	/*
+	 * The requested event isn't on a limited PMC already;
+	 * see if any alternative code goes on a limited PMC.
+	 */
+	if (!ppmu->get_alternatives)
+		return 0;
+
+	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (n)
+		return alt[0];
+
+	return 0;
+}
+
+/*
+ * Find an alternative event that goes on a normal PMC, if possible,
+ * and return the event code, or 0 if there is no such alternative.
+ * (Note: event code 0 is "don't count" on all machines.)
+ */
+static unsigned long normal_pmc_alternative(unsigned long ev,
+					    unsigned long flags)
+{
+	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	int n;
+
+	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
+	n = ppmu->get_alternatives(ev, flags, alt);
+	if (!n)
+		return 0;
+	return alt[0];
+}
+
 /* Number of perf_counters counting hardware events */
 static atomic_t num_counters;
 /* Used to avoid races in calling reserve/release_pmc_hardware */
@@ -633,9 +820,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
-	unsigned long ev;
+	unsigned long ev, flags;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
 	unsigned int events[MAX_HWCOUNTERS];
+	unsigned int cflags[MAX_HWCOUNTERS];
 	int n;
 	int err;
 
@@ -661,7 +849,36 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
 		counter->hw_event.exclude_hv = 0;
-	
+
+	/*
+	 * If this is a per-task counter, then we can use
+	 * PM_RUN_* events interchangeably with their non RUN_*
+	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC.
+	 * XXX we should check if the task is an idle task.
+	 */
+	flags = 0;
+	if (counter->ctx->task)
+		flags |= PPMU_ONLY_COUNT_RUN;
+
+	/*
+	 * If this machine has limited counters, check whether this
+	 * event could go on a limited counter.
+	 */
+	if (ppmu->limited_pmc5_6) {
+		if (can_go_on_limited_pmc(counter, ev, flags)) {
+			flags |= PPMU_LIMITED_PMC_OK;
+		} else if (ppmu->limited_pmc_event(ev)) {
+			/*
+			 * The requested event is on a limited PMC,
+			 * but we can't use a limited PMC; see if any
+			 * alternative goes on a normal PMC.
+			 */
+			ev = normal_pmc_alternative(ev, flags);
+			if (!ev)
+				return ERR_PTR(-EINVAL);
+		}
+	}
+
 	/*
 	 * If this is in a group, check if it can go on with all the
 	 * other hardware counters in the group.  We assume the counter
@@ -670,18 +887,20 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	n = 0;
 	if (counter->group_leader != counter) {
 		n = collect_events(counter->group_leader, ppmu->n_counter - 1,
-				   ctrs, events);
+				   ctrs, events, cflags);
 		if (n < 0)
 			return ERR_PTR(-EINVAL);
 	}
 	events[n] = ev;
 	ctrs[n] = counter;
-	if (check_excludes(ctrs, n, 1))
+	cflags[n] = flags;
+	if (check_excludes(ctrs, cflags, n, 1))
 		return ERR_PTR(-EINVAL);
-	if (power_check_constraints(events, n + 1))
+	if (power_check_constraints(events, cflags, n + 1))
 		return ERR_PTR(-EINVAL);
 
 	counter->hw.config = events[n];
+	counter->hw.counter_base = cflags[n];
 	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
 
 	/*
@@ -763,6 +982,10 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	int found = 0;
 	int nmi;
 
+	if (cpuhw->n_limited)
+		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
+					mfspr(SPRN_PMC6));
+
 	/*
 	 * If interrupts were soft-disabled when this PMU interrupt
 	 * occurred, treat it as an NMI.
@@ -775,6 +998,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
+		if (is_limited_pmc(counter->hw.idx))
+			continue;
 		val = read_pmc(counter->hw.idx);
 		if ((int)val < 0) {
 			/* counter has overflowed */
@@ -791,6 +1016,8 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 */
 	if (!found) {
 		for (i = 0; i < ppmu->n_counter; ++i) {
+			if (is_limited_pmc(i + 1))
+				continue;
 			val = read_pmc(i + 1);
 			if ((int)val < 0)
 				write_pmc(i + 1, 0);
@@ -804,7 +1031,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 	 * XXX might want to use MSR.PM to keep the counters frozen until
 	 * we get back out of this interrupt.
 	 */
-	mtspr(SPRN_MMCR0, cpuhw->mmcr[0]);
+	write_mmcr0(cpuhw, cpuhw->mmcr[0]);
 
 	if (nmi)
 		nmi_exit();
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 1407b19ab619c..744a2756958e9 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -320,7 +320,8 @@ static unsigned int ppc_inst_cmpl[] = {
 	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
 };
 
-static int p4_get_alternatives(unsigned int event, unsigned int alt[])
+static int p4_get_alternatives(unsigned int event, unsigned int flags,
+			       unsigned int alt[])
 {
 	int i, j, na;
 
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 1222c8ea3c26e..8154eaa2404f8 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -78,8 +78,8 @@
  * Layout of constraint bits:
  * 6666555555555544444444443333333333222222222211111111110000000000
  * 3210987654321098765432109876543210987654321098765432109876543210
- *             [  ><><>< ><> <><>[  >      <  ><  ><  ><  ><><><><>
- *             NC  G0G1G2 G3 T0T1 UC        B0  B1  B2  B3 P4P3P2P1
+ *             [  ><><>< ><> <><>[  >  <  ><  ><  ><  ><><><><><><>
+ *             NC  G0G1G2 G3 T0T1 UC    B0  B1  B2  B3 P6P5P4P3P2P1
  *
  * NC - number of counters
  *     51: NC error 0x0008_0000_0000_0000
@@ -105,18 +105,18 @@
  *     30: IDU|GRS events needed 0x00_4000_0000
  *
  * B0
- *     20-23: Byte 0 event source 0x00f0_0000
+ *     24-27: Byte 0 event source 0x0f00_0000
  *	      Encoding as for the event code
  *
  * B1, B2, B3
- *     16-19, 12-15, 8-11: Byte 1, 2, 3 event sources
+ *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources
  *
- * P4
- *     7: P1 error 0x80
- *     6-7: Count of events needing PMC4
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
  *
- * P1..P3
- *     0-6: Count of events needing PMC1..PMC3
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
  */
 
 static const int grsel_shift[8] = {
@@ -143,11 +143,13 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
-		if (pmc > 4)
+		if (pmc > 6)
 			return -1;
 		sh = (pmc - 1) * 2;
 		mask |= 2 << sh;
 		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500009 || event == 0x600005))
+			return -1;
 	}
 	if (event & PM_BUSEVENT_MSK) {
 		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
@@ -173,16 +175,26 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh;
 		}
 		/* Set byte lane select field */
-		mask  |= 0xfULL << (20 - 4 * byte);
-		value |= (u64)unit << (20 - 4 * byte);
+		mask  |= 0xfULL << (24 - 4 * byte);
+		value |= (u64)unit << (24 - 4 * byte);
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000000000000ull;
+		value |= 0x1000000000000ull;
 	}
-	mask  |= 0x8000000000000ull;
-	value |= 0x1000000000000ull;
 	*maskp = mask;
 	*valp = value;
 	return 0;
 }
 
+static int power5p_limited_pmc_event(unsigned int event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
 #define MAX_ALT	3	/* at most 3 alternatives for any event */
 
 static const unsigned int event_alternatives[][MAX_ALT] = {
@@ -193,6 +205,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */
 	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */
 	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */
+	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */
 	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */
 	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */
 	{ 0x300009, 0x400009 },			/* PM_INST_DISP */
@@ -260,24 +273,85 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5p_get_alternatives(unsigned int event, unsigned int alt[])
+static int power5p_get_alternatives(unsigned int event, unsigned int flags,
+				    unsigned int alt[])
 {
 	int i, j, ae, nalt = 1;
+	int nlim;
 
 	alt[0] = event;
 	nalt = 1;
+	nlim = power5p_limited_pmc_event(event);
 	i = find_alternative(event);
 	if (i >= 0) {
 		for (j = 0; j < MAX_ALT; ++j) {
 			ae = event_alternatives[i][j];
 			if (ae && ae != event)
 				alt[nalt++] = ae;
+			nlim += power5p_limited_pmc_event(ae);
 		}
 	} else {
 		ae = find_alternative_bdecode(event);
 		if (ae > 0)
 			alt[nalt++] = ae;
 	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 3 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0xf:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x600005:	/* PM_RUN_CYC */
+				alt[j++] = 0xf;
+				break;
+			case 0x100009:	/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x100009;	/* PM_INST_CMPL */
+				alt[j++] = 0x200009;
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (power5p_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
 	return nalt;
 }
 
@@ -390,7 +464,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned char unituse[16];
 	int ttmuse;
 
-	if (n_ev > 4)
+	if (n_ev > 6)
 		return -1;
 
 	/* First pass to count resource use */
@@ -399,7 +473,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 	for (i = 0; i < n_ev; ++i) {
 		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
 		if (pmc) {
-			if (pmc > 4)
+			if (pmc > 6)
 				return -1;
 			if (pmc_inuse & (1 << (pmc - 1)))
 				return -1;
@@ -488,13 +562,16 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev,
 			if (pmc >= 4)
 				return -1;
 			pmc_inuse |= 1 << pmc;
-		} else {
+		} else if (pmc <= 4) {
 			/* Direct event */
 			--pmc;
 			if (isbus && (byte & 2) &&
 			    (psel == 8 || psel == 0x10 || psel == 0x28))
 				/* add events on higher-numbered bus */
 				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc);
+		} else {
+			/* Instructions or run cycles on PMC5/6 */
+			--pmc;
 		}
 		if (isbus && unit == PM_GRS) {
 			bit = psel & 7;
@@ -538,7 +615,7 @@ static int power5p_generic_events[] = {
 };
 
 struct power_pmu power5p_pmu = {
-	.n_counter = 4,
+	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
 	.add_fields = 0x7000000000055ull,
 	.test_adder = 0x3000040000000ull,
@@ -548,4 +625,6 @@ struct power_pmu power5p_pmu = {
 	.disable_pmc = power5p_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
+	.limited_pmc5_6 = 1,
+	.limited_pmc_event = power5p_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 116c4bb1809ea..6e667dc86470b 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -269,7 +269,8 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5_get_alternatives(unsigned int event, unsigned int alt[])
+static int power5_get_alternatives(unsigned int event, unsigned int flags,
+				   unsigned int alt[])
 {
 	int i, j, ae, nalt = 1;
 
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index fce1fc290a1d0..d44049f0ae277 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -182,7 +182,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 	unsigned int ttmset = 0;
 	unsigned int pmc_inuse = 0;
 
-	if (n_ev > 4)
+	if (n_ev > 6)
 		return -1;
 	for (i = 0; i < n_ev; ++i) {
 		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
@@ -202,6 +202,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 			for (pmc = 0; pmc < 4; ++pmc)
 				if (!(pmc_inuse & (1 << pmc)))
 					break;
+			if (pmc >= 4)
+				return -1;
 			pmc_inuse |= 1 << pmc;
 		}
 		hwc[i] = pmc;
@@ -240,7 +242,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
 		}
 		if (power6_marked_instr_event(event[i]))
 			mmcra |= MMCRA_SAMPLE_ENABLE;
-		mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
+		if (pmc < 4)
+			mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc);
 	}
 	mmcr[0] = 0;
 	if (pmc_inuse & 1)
@@ -256,19 +259,20 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
  * Layout of constraint bits:
  *
  *	0-1	add field: number of uses of PMC1 (max 1)
- *	2-3, 4-5, 6-7: ditto for PMC2, 3, 4
- *	8-10	select field: nest (subunit) event selector
+ *	2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6
+ *	12-15	add field: number of uses of PMC1-4 (max 4)
  *	16-19	select field: unit on byte 0 of event bus
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
+ *	32-34	select field: nest (subunit) event selector
  */
 static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 {
-	int pmc, byte, sh;
-	unsigned int mask = 0, value = 0;
+	int pmc, byte, sh, subunit;
+	u64 mask = 0, value = 0;
 
 	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 	if (pmc) {
-		if (pmc > 4)
+		if (pmc > 4 && !(event == 0x500009 || event == 0x600005))
 			return -1;
 		sh = (pmc - 1) * 2;
 		mask |= 2 << sh;
@@ -276,26 +280,38 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	}
 	if (event & PM_BUSEVENT_MSK) {
 		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
-		sh = byte * 4;
+		sh = byte * 4 + (16 - PM_UNIT_SH);
 		mask |= PM_UNIT_MSKS << sh;
-		value |= (event & PM_UNIT_MSKS) << sh;
+		value |= (u64)(event & PM_UNIT_MSKS) << sh;
 		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) {
-			mask |= PM_SUBUNIT_MSKS;
-			value |= event & PM_SUBUNIT_MSKS;
+			subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK;
+			mask  |= (u64)PM_SUBUNIT_MSK << 32;
+			value |= (u64)subunit << 32;
 		}
 	}
+	if (pmc <= 4) {
+		mask  |= 0x8000;	/* add field for count of PMC1-4 uses */
+		value |= 0x1000;
+	}
 	*maskp = mask;
 	*valp = value;
 	return 0;
 }
 
+static int p6_limited_pmc_event(unsigned int event)
+{
+	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+
+	return pmc == 5 || pmc == 6;
+}
+
 #define MAX_ALT	4	/* at most 4 alternatives for any event */
 
 static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */
 	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */
 	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */
-	{ 0x10000a, 0x2000f4 },			/* PM_RUN_CYC */
+	{ 0x10000a, 0x2000f4, 0x600005 },	/* PM_RUN_CYC */
 	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */
 	{ 0x10000e, 0x400010 },			/* PM_PURR */
 	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */
@@ -340,13 +356,15 @@ static int find_alternatives_list(unsigned int event)
 	return -1;
 }
 
-static int p6_get_alternatives(unsigned int event, unsigned int alt[])
+static int p6_get_alternatives(unsigned int event, unsigned int flags,
+			       unsigned int alt[])
 {
-	int i, j;
+	int i, j, nlim;
 	unsigned int aevent, psel, pmc;
 	unsigned int nalt = 1;
 
 	alt[0] = event;
+	nlim = p6_limited_pmc_event(event);
 
 	/* check the alternatives table */
 	i = find_alternatives_list(event);
@@ -358,6 +376,7 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[])
 				break;
 			if (aevent != event)
 				alt[nalt++] = aevent;
+			nlim += p6_limited_pmc_event(aevent);
 		}
 
 	} else {
@@ -375,13 +394,75 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[])
 				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH);
 	}
 
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC,
+		 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs (e.g.
+		 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC).
+		 * Note that even with these additional alternatives
+		 * we never end up with more than 4 alternatives for any event.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600005;	/* PM_RUN_CYC */
+				++nlim;
+				break;
+			case 0x10000a:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;	/* PM_CYC */
+				break;
+			case 2:		/* PM_INST_CMPL */
+				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */
+				++nlim;
+				break;
+			case 0x500009:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 2;		/* PM_INST_CMPL */
+				break;
+			case 0x10000e:	/* PM_PURR */
+				alt[j++] = 0x4000f4;	/* PM_RUN_PURR */
+				break;
+			case 0x4000f4:	/* PM_RUN_PURR */
+				alt[j++] = 0x10000e;	/* PM_PURR */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) {
+		/* remove the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (!p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) {
+		/* remove all but the limited PMC events */
+		j = 0;
+		for (i = 0; i < nalt; ++i) {
+			if (p6_limited_pmc_event(alt[i])) {
+				alt[j] = alt[i];
+				++j;
+			}
+		}
+		nalt = j;
+	}
+
 	return nalt;
 }
 
 static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 {
 	/* Set PMCxSEL to 0 to disable PMCx */
-	mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc));
 }
 
 static int power6_generic_events[] = {
@@ -394,14 +475,16 @@ static int power6_generic_events[] = {
 };
 
 struct power_pmu power6_pmu = {
-	.n_counter = 4,
+	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
-	.add_fields = 0x55,
-	.test_adder = 0,
+	.add_fields = 0x1555,
+	.test_adder = 0x3000,
 	.compute_mmcr = p6_compute_mmcr,
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
+	.limited_pmc5_6 = 1,
+	.limited_pmc_event = p6_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index aed8ccd7c0779..af2d1884058c1 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -243,7 +243,8 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p970_get_alternatives(unsigned int event, unsigned int alt[])
+static int p970_get_alternatives(unsigned int event, unsigned int flags,
+				 unsigned int alt[])
 {
 	alt[0] = event;
 
-- 
GitLab


From b1fca26631f76a5e8b18435a43f5d82b8734da4b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 23 Mar 2009 18:22:09 +0100
Subject: [PATCH 0746/2198] mutex: add atomic_dec_and_mutex_lock()

Much like the atomic_dec_and_lock() function in which we take an hold a
spin_lock if we drop the atomic to 0 this function takes and holds the
mutex if we dec the atomic to 0.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090323172417.410913479@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mutex.h | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 3069ec7e0ab84..93054fc3635c9 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
 
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* dec if we can't possibly hit 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+	/* we might hit 0, so take the lock */
+	mutex_lock(lock);
+	if (!atomic_dec_and_test(cnt)) {
+		/* when we actually did the dec, we didn't hit 0 */
+		mutex_unlock(lock);
+		return 0;
+	}
+	/* we hit 0, and we hold the lock */
+	return 1;
+}
+
 #endif
-- 
GitLab


From 0c8b946e3ebb3846103486420ea7430a4b5e5b1b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 15 Apr 2009 17:48:18 +0200
Subject: [PATCH 0747/2198] vsprintf: introduce %pf format specifier

A printf format specifier which would allow us to print a pure
function name has been suggested by Andrew Morton a couple of
months ago.

The current %pF is very convenient to print a function symbol,
but often we only want to print the name of the function, without
its asm offset.

That's what  %pf does in this patch.  The lowecase f has been chosen
for its intuitive meaning of a 'weak kind of %pF'.

The support for this new format would be welcome by the tracing code
where the need to print pure function names is often needed. This is
also true for other parts of the kernel:

$ git-grep -E "kallsyms_lookup\(.+?\)"
arch/blackfin/kernel/traps.c:   symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf);
arch/powerpc/xmon/xmon.c:               name = kallsyms_lookup(pc, &size, &offset, NULL, tmpstr);
arch/sh/kernel/cpu/sh5/unwind.c:        sym = kallsyms_lookup(pc, NULL, &offset, NULL, namebuf);
arch/x86/kernel/ftrace.c:       kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
kernel/kprobes.c:               sym = kallsyms_lookup((unsigned long)p->addr, NULL,
kernel/lockdep.c:       return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
kernel/trace/ftrace.c:  kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
kernel/trace/ftrace.c:  kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
kernel/trace/trace_functions.c: kallsyms_lookup(ip, NULL, NULL, NULL, str);
kernel/trace/trace_output.c:    kallsyms_lookup(address, NULL, NULL, NULL, str);

Changes in v2:

- Add the explanation of the %pf role for vsnprintf() and bstr_printf()

- Change the comments by dropping the "asm offset" notion and only
  define the %pf against the actual function offset notion.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090415154817.GC5989@nowhere>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/vsprintf.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/lib/vsprintf.c b/lib/vsprintf.c
index b56f6d039d211..756ccafa9cec0 100644
--- a/lib/vsprintf.c
+++ b/lib/vsprintf.c
@@ -575,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec)
 }
 
 static char *symbol_string(char *buf, char *end, void *ptr,
-				struct printf_spec spec)
+				struct printf_spec spec, char ext)
 {
 	unsigned long value = (unsigned long) ptr;
 #ifdef CONFIG_KALLSYMS
 	char sym[KSYM_SYMBOL_LEN];
-	sprint_symbol(sym, value);
+	if (ext != 'f')
+		sprint_symbol(sym, value);
+	else
+		kallsyms_lookup(value, NULL, NULL, NULL, sym);
 	return string(buf, end, sym, spec);
 #else
 	spec.field_width = 2*sizeof(void *);
@@ -692,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr,
  *
  * Right now we handle:
  *
- * - 'F' For symbolic function descriptor pointers
+ * - 'F' For symbolic function descriptor pointers with offset
+ * - 'f' For simple symbolic function names without offset
  * - 'S' For symbolic direct pointers
  * - 'R' For a struct resource pointer, it prints the range of
  *       addresses (not the name nor the flags)
@@ -715,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 
 	switch (*fmt) {
 	case 'F':
+	case 'f':
 		ptr = dereference_function_descriptor(ptr);
 		/* Fallthrough */
 	case 'S':
-		return symbol_string(buf, end, ptr, spec);
+		return symbol_string(buf, end, ptr, spec, *fmt);
 	case 'R':
 		return resource_string(buf, end, ptr, spec);
 	case 'm':
@@ -954,7 +959,8 @@ static int format_decode(const char *fmt, struct printf_spec *spec)
  *
  * This function follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
  * %pR output the address range in a struct resource
  *
  * The return value is the number of characters which would
@@ -1412,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf);
  *
  * The format follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
  * %pR output the address range in a struct resource
  * %n is ignored
  *
-- 
GitLab


From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 29 Apr 2009 16:55:56 +0200
Subject: [PATCH 0748/2198] perf_counter, x86: rename bitmasks to ->used_mask
 and ->active_mask

Standardize on explicitly mentioning '_mask' in fields that
are not plain flags but masks. This avoids typos like:

       if (cpuc->used)

(which could easily slip through review unnoticed), while if a
typo looks like this:

       if (cpuc->used_mask)

it might get noticed during review.

[ Impact: cleanup ]

Signed-off-by: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 47e563bfd4cf2..fc06f4d326447 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly;
 
 struct cpu_hw_counters {
 	struct perf_counter	*counters[X86_PMC_IDX_MAX];
-	unsigned long		used[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
-	unsigned long		active[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
 	u64			throttle_ctrl;
 	int			enabled;
@@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE))
@@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl)
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
 		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
@@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		 * Try to get the fixed counter, if that is already taken
 		 * then try to get a generic counter:
 		 */
-		if (test_and_set_bit(idx, cpuc->used))
+		if (test_and_set_bit(idx, cpuc->used_mask))
 			goto try_generic;
 
 		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
@@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	} else {
 		idx = hwc->idx;
 		/* Try to get the previous generic counter again */
-		if (test_and_set_bit(idx, cpuc->used)) {
+		if (test_and_set_bit(idx, cpuc->used_mask)) {
 try_generic:
-			idx = find_first_zero_bit(cpuc->used,
+			idx = find_first_zero_bit(cpuc->used_mask,
 						  x86_pmu.num_counters);
 			if (idx == x86_pmu.num_counters)
 				return -EAGAIN;
 
-			set_bit(idx, cpuc->used);
+			set_bit(idx, cpuc->used_mask);
 			hwc->idx = idx;
 		}
 		hwc->config_base  = x86_pmu.eventsel;
@@ -609,7 +609,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
-	set_bit(idx, cpuc->active);
+	set_bit(idx, cpuc->active_mask);
 
 	x86_perf_counter_set_period(counter, hwc, idx);
 	x86_pmu.enable(hwc, idx);
@@ -643,7 +643,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 	}
-	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used);
+	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 * Must be done before we disable, otherwise the nmi handler
 	 * could reenable again:
 	 */
-	clear_bit(idx, cpuc->active);
+	clear_bit(idx, cpuc->active_mask);
 	x86_pmu.disable(hwc, idx);
 
 	/*
@@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	 */
 	x86_perf_counter_update(counter, hwc, idx);
 	cpuc->counters[idx] = NULL;
-	clear_bit(idx, cpuc->used);
+	clear_bit(idx, cpuc->used_mask);
 }
 
 /*
@@ -741,7 +741,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		struct perf_counter *counter = cpuc->counters[bit];
 
 		clear_bit(bit, (unsigned long *) &status);
-		if (!test_bit(bit, cpuc->active))
+		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
 		intel_pmu_save_and_restart(counter);
@@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 	++cpuc->interrupts;
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		if (!test_bit(idx, cpuc->active))
+		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
-- 
GitLab


From 23b94b967f118bef941369238f33c8140be46539 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Wed, 29 Apr 2009 21:54:51 +0100
Subject: [PATCH 0749/2198] locking, rtmutex.c: Documentation cleanup

Two minor updates on functions documentation:
 - Updated documentation for function rt_mutex_unlock(), which contained an
   incorrect name
 - Removed extra '*' from comment in function rt_mutex_destroy()

[ Impact: cleanup ]

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090429205451.GA23154@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rtmutex.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 69d9cb921ffa6..013882e834974 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -864,9 +864,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
 /**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
- *				       the timeout structure is provided
- *				       by the caller
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ *			the timeout structure is provided
+ *			by the caller
  *
  * @lock: 		the rt_mutex to be locked
  * @timeout:		timeout structure or NULL (no timeout)
@@ -913,7 +913,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 }
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
-/***
+/**
  * rt_mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
  *
-- 
GitLab


From 88c48db9788862d0290831d081bc3c64e13b592f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 29 Apr 2009 14:00:25 -0400
Subject: [PATCH 0750/2198] SELinux: drop secondary_ops->sysctl

We are still calling secondary_ops->sysctl even though the capabilities
module does not define a sysctl operation.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/selinux/hooks.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index ba808ef6babb5..dd19ba81201f9 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -1980,10 +1980,6 @@ static int selinux_sysctl(ctl_table *table, int op)
 	u32 tsid, sid;
 	int rc;
 
-	rc = secondary_ops->sysctl(table, op);
-	if (rc)
-		return rc;
-
 	sid = current_sid();
 
 	rc = selinux_sysctl_get_sid(table, (op == 0001) ?
-- 
GitLab


From 3bcac0263f0b45e67a64034ebcb69eb9abb742f4 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 29 Apr 2009 13:45:05 +0100
Subject: [PATCH 0751/2198] SELinux: Don't flush inherited SIGKILL during
 execve()

Don't flush inherited SIGKILL during execve() in SELinux's post cred commit
hook.  This isn't really a security problem: if the SIGKILL came before the
credentials were changed, then we were right to receive it at the time, and
should honour it; if it came after the creds were changed, then we definitely
should honour it; and in any case, all that will happen is that the process
will be scrapped before it ever returns to userspace.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/sched.h    |  1 +
 kernel/signal.c          | 11 ++++++++---
 security/selinux/hooks.c |  9 +++++----
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1d19c025f9d2e..d3b787c7aef36 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1875,6 +1875,7 @@ extern void sched_dead(struct task_struct *p);
 
 extern void proc_caches_init(void);
 extern void flush_signals(struct task_struct *);
+extern void __flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c8814481a117..f93efec14ff57 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -238,14 +238,19 @@ void flush_sigqueue(struct sigpending *queue)
 /*
  * Flush all pending signals for a task.
  */
+void __flush_signals(struct task_struct *t)
+{
+	clear_tsk_thread_flag(t, TIF_SIGPENDING);
+	flush_sigqueue(&t->pending);
+	flush_sigqueue(&t->signal->shared_pending);
+}
+
 void flush_signals(struct task_struct *t)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&t->sighand->siglock, flags);
-	clear_tsk_thread_flag(t, TIF_SIGPENDING);
-	flush_sigqueue(&t->pending);
-	flush_sigqueue(&t->signal->shared_pending);
+	__flush_signals(t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index dd19ba81201f9..5a345115036c3 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2394,11 +2394,12 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 		memset(&itimer, 0, sizeof itimer);
 		for (i = 0; i < 3; i++)
 			do_setitimer(i, &itimer, NULL);
-		flush_signals(current);
 		spin_lock_irq(&current->sighand->siglock);
-		flush_signal_handlers(current, 1);
-		sigemptyset(&current->blocked);
-		recalc_sigpending();
+		if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
+			__flush_signals(current);
+			flush_signal_handlers(current, 1);
+			sigemptyset(&current->blocked);
+		}
 		spin_unlock_irq(&current->sighand->siglock);
 	}
 
-- 
GitLab


From ecd6de3c88e8cbcad175b2eab48ba05c2014f7b6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 29 Apr 2009 16:02:24 +0200
Subject: [PATCH 0752/2198] selinux: selinux_bprm_committed_creds() should wake
 up ->real_parent, not ->parent.

We shouldn't worry about the tracer if current is ptraced, exec() must not
succeed if the tracer has no rights to trace this task after cred changing.
But we should notify ->real_parent which is, well, real parent.

Also, we don't need _irq to take tasklist, and we don't need parent's
->siglock to wake_up_interruptible(real_parent->signal->wait_chldexit).
Since we hold tasklist, real_parent->signal must be stable. Otherwise
spin_lock(siglock) is not safe too and can't help anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/selinux/hooks.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c
index 5a345115036c3..39046ddd90a9c 100644
--- a/security/selinux/hooks.c
+++ b/security/selinux/hooks.c
@@ -2371,10 +2371,8 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 {
 	const struct task_security_struct *tsec = current_security();
 	struct itimerval itimer;
-	struct sighand_struct *psig;
 	u32 osid, sid;
 	int rc, i;
-	unsigned long flags;
 
 	osid = tsec->osid;
 	sid = tsec->sid;
@@ -2405,12 +2403,9 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 
 	/* Wake up the parent if it is waiting so that it can recheck
 	 * wait permission to the new task SID. */
-	read_lock_irq(&tasklist_lock);
-	psig = current->parent->sighand;
-	spin_lock_irqsave(&psig->siglock, flags);
-	wake_up_interruptible(&current->parent->signal->wait_chldexit);
-	spin_unlock_irqrestore(&psig->siglock, flags);
-	read_unlock_irq(&tasklist_lock);
+	read_lock(&tasklist_lock);
+	wake_up_interruptible(&current->real_parent->signal->wait_chldexit);
+	read_unlock(&tasklist_lock);
 }
 
 /* superblock security operations */
-- 
GitLab


From e6be3a25861429166f945499c7ee616875bc3db9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 12:56:37 +0900
Subject: [PATCH 0753/2198] sh: pass through ioremap() for non-mmu processors.

All 32-bit SuperH processors currently go through __ioremap_mode()
and check for IO_TRAPPED and directly mapped segments. With this
patch we simplify the MMU less case with a pass through version of
 __ioremap_mode() which just returns the physical address.

The effects of this is change are:
 - fix non-MMU ioremap() of high address hardware blocks (sh7203 CMT)
 - make sure IO_TRAPPED is not selected

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/Kconfig   |  6 +++---
 arch/sh/include/asm/io.h | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig
index dcc1af8a2cfe6..99e2d47f79f56 100644
--- a/arch/sh/boards/Kconfig
+++ b/arch/sh/boards/Kconfig
@@ -121,7 +121,7 @@ config SH_RTS7751R2D
 	bool "RTS7751R2D"
 	depends on CPU_SUBTYPE_SH7751R
 	select SYS_SUPPORTS_PCI
-	select IO_TRAPPED
+	select IO_TRAPPED if MMU
 	help
 	  Select RTS7751R2D if configuring for a Renesas Technology
 	  Sales SH-Graphics board.
@@ -145,13 +145,13 @@ config SH_HIGHLANDER
 	bool "Highlander"
 	depends on CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785
 	select SYS_SUPPORTS_PCI
-	select IO_TRAPPED
+	select IO_TRAPPED if MMU
 
 config SH_SH7785LCR
 	bool "SH7785LCR"
 	depends on CPU_SUBTYPE_SH7785
 	select SYS_SUPPORTS_PCI
-	select IO_TRAPPED
+	select IO_TRAPPED if MMU
 
 config SH_SH7785LCR_29BIT_PHYSMAPS
 	bool "SH7785LCR 29bit physmaps"
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index 0454f8d680591..ef217344afcbf 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -228,12 +228,6 @@ void __iounmap(void __iomem *addr);
 unsigned long onchip_remap(unsigned long addr, unsigned long size,
 			   const char *name);
 extern void onchip_unmap(unsigned long vaddr);
-#else
-#define __ioremap(offset, size, flags)	((void __iomem *)(offset))
-#define __iounmap(addr)			do { } while (0)
-#define onchip_remap(addr, size, name)	(addr)
-#define onchip_unmap(addr)		do { } while (0)
-#endif /* CONFIG_MMU */
 
 static inline void __iomem *
 __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
@@ -268,6 +262,12 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
 
 	return __ioremap(offset, size, flags);
 }
+#else
+#define onchip_remap(addr, size, name)		(addr)
+#define onchip_unmap(addr)			do { } while (0)
+#define __ioremap_mode(offset, size, flags)	((void __iomem *)(offset))
+#define __iounmap(addr)				do { } while (0)
+#endif /* CONFIG_MMU */
 
 #define ioremap(offset, size)				\
 	__ioremap_mode((offset), (size), 0)
-- 
GitLab


From 3014f47460ecfb13d4169daae51f26a20bacfa17 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 29 Apr 2009 14:50:37 +0000
Subject: [PATCH 0754/2198] clocksource: sh_cmt 16-bit fixes

This patch contains various fixes for 16-bit cmt hardware.
With this applied periodic clockevents work fine on sh7203.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_cmt.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index bf3e4c11fd37a..4ff1508e5ab70 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -158,16 +158,18 @@ static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate)
 		pr_err("sh_cmt: cannot enable clock \"%s\"\n", cfg->clk);
 		return ret;
 	}
-	*rate = clk_get_rate(p->clk) / 8;
 
 	/* make sure channel is disabled */
 	sh_cmt_start_stop_ch(p, 0);
 
 	/* configure channel, periodic mode and maximum timeout */
-	if (p->width == 16)
-		sh_cmt_write(p, CMCSR, 0);
-	else
+	if (p->width == 16) {
+		*rate = clk_get_rate(p->clk) / 512;
+		sh_cmt_write(p, CMCSR, 0x43);
+	} else {
+		*rate = clk_get_rate(p->clk) / 8;
 		sh_cmt_write(p, CMCSR, 0x01a4);
+	}
 
 	sh_cmt_write(p, CMCOR, 0xffffffff);
 	sh_cmt_write(p, CMCNT, 0);
@@ -615,7 +617,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 	if (resource_size(res) == 6) {
 		p->width = 16;
 		p->overflow_bit = 0x80;
-		p->clear_bits = ~0xc0;
+		p->clear_bits = ~0x80;
 	} else {
 		p->width = 32;
 		p->overflow_bit = 0x8000;
-- 
GitLab


From 698aa99da5f5e2b4c666fd21ab77306f0225b8f5 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 04:08:18 +0000
Subject: [PATCH 0755/2198] sh: sh2/sh2a 16-bit CMT platform data

This patch adds 16-bit cmt platform data for the following cpus:
 - sh7619 (2 channels)
 - sh7203/sh7263 (2 channels)
 - sh7206 (2 channels)

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh2/setup-sh7619.c  | 84 ++++++++++++++++++++++++++
 arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 84 ++++++++++++++++++++++++++
 arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 84 ++++++++++++++++++++++++++
 3 files changed, 252 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index 0e32d8e448ca5..d70c263eb07e4 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -109,9 +111,75 @@ static struct platform_device eth_device = {
 	.resource = eth_resources,
 };
 
+static struct sh_cmt_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0xf84a0072,
+		.end	= 0xf84a0077,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 86,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_cmt_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x08,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0xf84a0078,
+		.end	= 0xf84a007d,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 87,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
 static struct platform_device *sh7619_devices[] __initdata = {
 	&sci_device,
 	&eth_device,
+	&cmt0_device,
+	&cmt1_device,
 };
 
 static int __init sh7619_devices_setup(void)
@@ -125,3 +193,19 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7619_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+};
+
+#define STBCR3 0xf80a0000
+
+void __init plat_early_device_setup(void)
+{
+	/* enable CMT clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x10, STBCR3);
+
+	early_platform_add_devices(sh7619_early_devices,
+				   ARRAY_SIZE(sh7619_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 820dfb2e86563..0836acee22892 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -11,6 +11,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -205,6 +207,70 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_cmt_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0xfffec002,
+		.end	= 0xfffec007,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 142,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_cmt_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x08,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0xfffec008,
+		.end	= 0xfffec00d,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 143,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
 static struct resource rtc_resources[] = {
 	[0] = {
 		.start	= 0xffff2000,
@@ -227,6 +293,8 @@ static struct platform_device rtc_device = {
 
 static struct platform_device *sh7203_devices[] __initdata = {
 	&sci_device,
+	&cmt0_device,
+	&cmt1_device,
 	&rtc_device,
 };
 
@@ -241,3 +309,19 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7203_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+};
+
+#define STBCR4 0xfffe040c
+
+void __init plat_early_device_setup(void)
+{
+	/* enable CMT clock */
+	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
+
+	early_platform_add_devices(sh7203_early_devices,
+				   ARRAY_SIZE(sh7203_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index c46a8355726d1..f9606e3d25152 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_cmt.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -165,8 +167,74 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_cmt_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0xfffec002,
+		.end	= 0xfffec007,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 140,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_cmt_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x08,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 0, /* disabled due to code generation issues */
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0xfffec008,
+		.end	= 0xfffec00d,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 144,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
 static struct platform_device *sh7206_devices[] __initdata = {
 	&sci_device,
+	&cmt0_device,
+	&cmt1_device,
 };
 
 static int __init sh7206_devices_setup(void)
@@ -180,3 +248,19 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7206_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+};
+
+#define STBCR4 0xfffe040c
+
+void __init plat_early_device_setup(void)
+{
+	/* enable CMT clock */
+	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
+
+	early_platform_add_devices(sh7206_early_devices,
+				   ARRAY_SIZE(sh7206_early_devices));
+}
-- 
GitLab


From f425752fc66acf1d4e47970ea704ed7d31c14173 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 04:09:26 +0000
Subject: [PATCH 0756/2198] sh: remove old CMT driver

This patch removes the old CMT driver (CONFIG_SH_CMT/timer-cmt.c)

As replacement, select the sh_cmt driver with CONFIG_SH_TIMER_CMT
and configure timer channel using platform data.

If multiple CMT channels are enabled using platform data, use the
earlytimer parameter on the kernel command line to select channel.
For instance, use "earlytimer=sh_cmt.0" to select the first channel.

To verify which timer is being used, look at printouts or the timer
irq count in /proc/interrupts.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                   |  18 +--
 arch/sh/include/asm/timer.h       |   2 +-
 arch/sh/kernel/timers/Makefile    |   1 -
 arch/sh/kernel/timers/timer-cmt.c | 188 ------------------------------
 arch/sh/kernel/timers/timer.c     |   3 -
 5 files changed, 6 insertions(+), 206 deletions(-)
 delete mode 100644 arch/sh/kernel/timers/timer-cmt.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 9db02ced57a5a..7e96adea9602d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -462,22 +462,14 @@ config SH_TMU
 	help
 	  This enables the use of the TMU as the system timer.
 
-config SH_CMT
-	bool "CMT timer support"
-	depends on SYS_SUPPORTS_CMT && CPU_SH2
-	default y
-	help
-	  This enables the use of the CMT as the system timer.
-
-#
-# Support for the new-style CMT driver. This will replace SH_CMT
-# once its other dependencies are merged.
-#
 config SH_TIMER_CMT
-	bool "CMT clockevents driver"
-	depends on SYS_SUPPORTS_CMT && !SH_CMT
+	bool "CMT timer driver"
+	depends on SYS_SUPPORTS_CMT
+	default y
 	select GENERIC_CLOCKEVENTS
 	select GENERIC_TIME
+	help
+	  This enables build of the CMT timer driver.
 
 config SH_MTU2
 	bool "MTU2 timer support"
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 4c3b66e30af23..8f80a55c04ae5 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -23,7 +23,7 @@ struct sys_timer {
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-extern struct sys_timer tmu_timer, cmt_timer, mtu2_timer;
+extern struct sys_timer tmu_timer, mtu2_timer;
 extern struct sys_timer *sys_timer;
 
 #ifndef CONFIG_GENERIC_TIME
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 0b7f8577193ff..1e9a104c2aee9 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -6,6 +6,5 @@ obj-y	:= timer.o
 
 obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
 obj-$(CONFIG_SH_MTU2)		+= timer-mtu2.o
-obj-$(CONFIG_SH_CMT)		+= timer-cmt.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= timer-broadcast.o
diff --git a/arch/sh/kernel/timers/timer-cmt.c b/arch/sh/kernel/timers/timer-cmt.c
deleted file mode 100644
index 9aa348658ae33..0000000000000
--- a/arch/sh/kernel/timers/timer-cmt.c
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer-cmt.c - CMT Timer Support
- *
- *  Copyright (C) 2005  Yoshinori Sato
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/seqlock.h>
-#include <asm/timer.h>
-#include <asm/rtc.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/clock.h>
-
-#if defined(CONFIG_CPU_SUBTYPE_SH7619)
-#define CMT_CMSTR	0xf84a0070
-#define CMT_CMCSR_0	0xf84a0072
-#define CMT_CMCNT_0	0xf84a0074
-#define CMT_CMCOR_0	0xf84a0076
-#define CMT_CMCSR_1	0xf84a0078
-#define CMT_CMCNT_1	0xf84a007a
-#define CMT_CMCOR_1	0xf84a007c
-
-#define STBCR3		0xf80a0000
-#define cmt_clock_enable() do {	ctrl_outb(ctrl_inb(STBCR3) & ~0x10, STBCR3); } while(0)
-#define CMT_CMCSR_INIT	0x0040
-#define CMT_CMCSR_CALIB	0x0000
-#elif defined(CONFIG_CPU_SUBTYPE_SH7203) || \
-      defined(CONFIG_CPU_SUBTYPE_SH7206) || \
-      defined(CONFIG_CPU_SUBTYPE_SH7263)
-#define CMT_CMSTR	0xfffec000
-#define CMT_CMCSR_0	0xfffec002
-#define CMT_CMCNT_0	0xfffec004
-#define CMT_CMCOR_0	0xfffec006
-
-#define STBCR4		0xfffe040c
-#define cmt_clock_enable() do {	ctrl_outb(ctrl_inb(STBCR4) & ~0x04, STBCR4); } while(0)
-#define CMT_CMCSR_INIT	0x0040
-#define CMT_CMCSR_CALIB	0x0000
-#else
-#error "Unknown CPU SUBTYPE"
-#endif
-
-static unsigned long cmt_timer_get_offset(void)
-{
-	int count;
-	static unsigned short count_p = 0xffff;    /* for the first call after boot */
-	static unsigned long jiffies_p = 0;
-
-	/*
-	 * cache volatile jiffies temporarily; we have IRQs turned off.
-	 */
-	unsigned long jiffies_t;
-
-	/* timer count may underflow right here */
-	count =  ctrl_inw(CMT_CMCOR_0);
-	count -= ctrl_inw(CMT_CMCNT_0);
-
-	jiffies_t = jiffies;
-
-	/*
-	 * avoiding timer inconsistencies (they are rare, but they happen)...
-	 * there is one kind of problem that must be avoided here:
-	 *  1. the timer counter underflows
-	 */
-
-	if (jiffies_t == jiffies_p) {
-		if (count > count_p) {
-			/* the nutcase */
-			if (ctrl_inw(CMT_CMCSR_0) & 0x80) { /* Check CMF bit */
-				count -= LATCH;
-			} else {
-				printk("%s (): hardware timer problem?\n",
-				       __func__);
-			}
-		}
-	} else
-		jiffies_p = jiffies_t;
-
-	count_p = count;
-
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	count = (count + LATCH/2) / LATCH;
-
-	return count;
-}
-
-static irqreturn_t cmt_timer_interrupt(int irq, void *dev_id)
-{
-	unsigned long timer_status;
-
-	/* Clear CMF bit */
-	timer_status = ctrl_inw(CMT_CMCSR_0);
-	timer_status &= ~0x80;
-	ctrl_outw(timer_status, CMT_CMCSR_0);
-
-	handle_timer_tick();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction cmt_irq = {
-	.name		= "timer",
-	.handler	= cmt_timer_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-};
-
-static void cmt_clk_init(struct clk *clk)
-{
-	u8 divisor = CMT_CMCSR_INIT & 0x3;
-	ctrl_inw(CMT_CMCSR_0);
-	ctrl_outw(CMT_CMCSR_INIT, CMT_CMCSR_0);
-	clk->parent = clk_get(NULL, "module_clk");
-	clk->rate = clk->parent->rate / (8 << (divisor << 1));
-}
-
-static void cmt_clk_recalc(struct clk *clk)
-{
-	u8 divisor = ctrl_inw(CMT_CMCSR_0) & 0x3;
-	clk->rate = clk->parent->rate / (8 << (divisor << 1));
-}
-
-static struct clk_ops cmt_clk_ops = {
-	.init		= cmt_clk_init,
-	.recalc		= cmt_clk_recalc,
-};
-
-static struct clk cmt0_clk = {
-	.name		= "cmt0_clk",
-	.ops		= &cmt_clk_ops,
-};
-
-static int cmt_timer_start(void)
-{
-	ctrl_outw(ctrl_inw(CMT_CMSTR) | 0x01, CMT_CMSTR);
-	return 0;
-}
-
-static int cmt_timer_stop(void)
-{
-	ctrl_outw(ctrl_inw(CMT_CMSTR) & ~0x01, CMT_CMSTR);
-	return 0;
-}
-
-static int cmt_timer_init(void)
-{
-	unsigned long interval;
-
-	cmt_clock_enable();
-
-	setup_irq(CONFIG_SH_TIMER_IRQ, &cmt_irq);
-
-	cmt0_clk.parent = clk_get(NULL, "module_clk");
-
-	cmt_timer_stop();
-
-	interval = cmt0_clk.parent->rate / 8 / HZ;
-	printk(KERN_INFO "Interval = %ld\n", interval);
-
-	ctrl_outw(interval, CMT_CMCOR_0);
-
-	clk_register(&cmt0_clk);
-	clk_enable(&cmt0_clk);
-
-	cmt_timer_start();
-
-	return 0;
-}
-
-static struct sys_timer_ops cmt_timer_ops = {
-	.init		= cmt_timer_init,
-	.start		= cmt_timer_start,
-	.stop		= cmt_timer_stop,
-#ifndef CONFIG_GENERIC_TIME
-	.get_offset	= cmt_timer_get_offset,
-#endif
-};
-
-struct sys_timer cmt_timer = {
-	.name	= "cmt",
-	.ops	= &cmt_timer_ops,
-};
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
index 4e7e747d1b696..f3bd1413d5681 100644
--- a/arch/sh/kernel/timers/timer.c
+++ b/arch/sh/kernel/timers/timer.c
@@ -19,9 +19,6 @@ static struct sys_timer *sys_timers[] = {
 #endif
 #ifdef CONFIG_SH_MTU2
 	&mtu2_timer,
-#endif
-#ifdef CONFIG_SH_CMT
-	&cmt_timer,
 #endif
 	NULL,
 };
-- 
GitLab


From c5dd016cdf0a040e1de0b691e274fbfe642b2cdc Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 30 Apr 2009 09:48:16 +1000
Subject: [PATCH 0757/2198] perf_counter: update copyright notice

This adds my name to the list of copyright holders on the core
perf_counter.c, since I have contributed a significant amount of the
code in there.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
LKML-Reference: <18936.59200.888049.746658@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a95a171e608cf..75f2b6c823920 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
  *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
  *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ *  Copyright  �  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  *
  *  For licensing details see kernel-base/COPYING
  */
-- 
GitLab


From a511e3f968c462a55ef58697257f5347c73d306e Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Wed, 29 Apr 2009 15:59:58 -0700
Subject: [PATCH 0758/2198] mutex: add atomic_dec_and_mutex_lock(), fix

include/linux/mutex.h:136: warning: 'mutex_lock' declared inline after being called
 include/linux/mutex.h:136: warning: previous declaration of 'mutex_lock' was here

uninline it.

[ Impact: clean up and uninline, address compiler warning ]

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Eric Paris <eparis@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <200904292318.n3TNIsi6028340@imap1.linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/mutex.h | 24 +-----------------------
 kernel/mutex.c        | 25 ++++++++++++++++++++++++-
 2 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/include/linux/mutex.h b/include/linux/mutex.h
index 93054fc3635c9..878cab4f5fcc5 100644
--- a/include/linux/mutex.h
+++ b/include/linux/mutex.h
@@ -150,28 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
-
-/**
- * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
- * @cnt: the atomic which we are to dec
- * @lock: the mutex to return holding if we dec to 0
- *
- * return true and hold lock if we dec to 0, return false otherwise
- */
-static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
-{
-	/* dec if we can't possibly hit 0 */
-	if (atomic_add_unless(cnt, -1, 1))
-		return 0;
-	/* we might hit 0, so take the lock */
-	mutex_lock(lock);
-	if (!atomic_dec_and_test(cnt)) {
-		/* when we actually did the dec, we didn't hit 0 */
-		mutex_unlock(lock);
-		return 0;
-	}
-	/* we hit 0, and we hold the lock */
-	return 1;
-}
+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 #endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 507cf2b5e9f1e..e2d25e9e62ae1 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock)
 
 	return ret;
 }
-
 EXPORT_SYMBOL(mutex_trylock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+	/* dec if we can't possibly hit 0 */
+	if (atomic_add_unless(cnt, -1, 1))
+		return 0;
+	/* we might hit 0, so take the lock */
+	mutex_lock(lock);
+	if (!atomic_dec_and_test(cnt)) {
+		/* when we actually did the dec, we didn't hit 0 */
+		mutex_unlock(lock);
+		return 0;
+	}
+	/* we hit 0, and we hold the lock */
+	return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
-- 
GitLab


From 2b72394e4089643f11669d9610907a1442fe044a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 28 Apr 2009 16:00:49 +0300
Subject: [PATCH 0759/2198] x86: move max_pfn_mapped and max_low_pfn_mapped to
 setup.c

This patch moves the max_pfn_mapped and max_low_pfn_mapped global
variables to kernel/setup.c where they're initialized.

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1240923649.1982.21.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 8 ++++++++
 arch/x86/mm/init_32.c   | 4 +---
 arch/x86/mm/init_64.c   | 8 --------
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf634..0d77e56e821be 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -112,6 +112,14 @@
 #define ARCH_SETUP
 #endif
 
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
 RESERVE_BRK(dmi_alloc, 65536);
 
 unsigned int boot_cpu_id __read_mostly;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2b27120665b32..a640a7f049055 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -49,11 +49,9 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/page_types.h>
 #include <asm/init.h>
 
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index a4e7846efb1a0..1016ea015932e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -50,14 +50,6 @@
 #include <asm/cacheflush.h>
 #include <asm/init.h>
 
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
 static unsigned long dma_reserve __initdata;
 
 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-- 
GitLab


From 9518e0e4350a5ea8ca200ce320b28d6284a7b0ce Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Tue, 28 Apr 2009 16:00:50 +0300
Subject: [PATCH 0760/2198] x86: move per-cpu mmu_gathers to mm/init.c

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1240923650.1982.22.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init.c    | 3 +++
 arch/x86/mm/init_32.c | 1 -
 arch/x86/mm/init_64.c | 2 --
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fedde5359a049..4d67c33a2e165 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -9,6 +9,9 @@
 #include <asm/sections.h>
 #include <asm/system.h>
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 unsigned long __initdata e820_table_start;
 unsigned long __meminitdata e820_table_end;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index a640a7f049055..fef1d90d4f150 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -52,7 +52,6 @@
 #include <asm/page_types.h>
 #include <asm/init.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 1016ea015932e..6a1a573e20f97 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -52,8 +52,6 @@
 
 static unsigned long dma_reserve __initdata;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
 	direct_gbpages = 0;
-- 
GitLab


From ba9c22f2c01cf5c88beed5a6b9e07d42e10bd358 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Mon, 20 Apr 2009 22:22:22 -0700
Subject: [PATCH 0761/2198] futex: remove FUTEX_REQUEUE_PI (non CMP)

The new requeue PI futex op codes were modeled after the existing
FUTEX_REQUEUE and FUTEX_CMP_REQUEUE calls.  I was unaware at the time
that FUTEX_REQUEUE was only around for compatibility reasons and
shouldn't be used in new code.  Ulrich Drepper elaborates on this in his
Futexes are Tricky paper: http://people.redhat.com/drepper/futex.pdf.
The deprecated call doesn't catch changes to the futex corresponding to
the destination futex which can lead to deadlock.

Therefor, I feel it best to remove FUTEX_REQUEUE_PI and leave only
FUTEX_CMP_REQUEUE_PI as there are not yet any existing users of the API.
This patch does change the OP code value of FUTEX_CMP_REQUEUE_PI to 12
from 13.  Since my test case is the only known user of this API, I felt
this was the right thing to do, rather than leave a hole in the
enumeration.

I chose to continue using the _CMP_ modifier in the OP code to make it
explicit to the user that the test is being done.

Builds, boots, and ran several hundred iterations requeue_pi.c.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <49ED580E.1050502@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/futex.h | 4 +---
 kernel/futex.c        | 6 +-----
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index b05519ca9e574..34956c8fdebf8 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -24,8 +24,7 @@ union ktime;
 #define FUTEX_WAIT_BITSET	9
 #define FUTEX_WAKE_BITSET	10
 #define FUTEX_WAIT_REQUEUE_PI	11
-#define FUTEX_REQUEUE_PI	12
-#define FUTEX_CMP_REQUEUE_PI	13
+#define FUTEX_CMP_REQUEUE_PI	12
 
 #define FUTEX_PRIVATE_FLAG	128
 #define FUTEX_CLOCK_REALTIME	256
@@ -43,7 +42,6 @@ union ktime;
 #define FUTEX_WAKE_BITSET_PRIVATE	(FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAIT_REQUEUE_PI_PRIVATE	(FUTEX_WAIT_REQUEUE_PI | \
 					 FUTEX_PRIVATE_FLAG)
-#define FUTEX_REQUEUE_PI_PRIVATE	(FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_CMP_REQUEUE_PI_PRIVATE	(FUTEX_CMP_REQUEUE_PI | \
 					 FUTEX_PRIVATE_FLAG)
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 6d2daa46f9ffb..aec8bf89bf4e7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2555,9 +2555,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
 					    clockrt, uaddr2);
 		break;
-	case FUTEX_REQUEUE_PI:
-		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
-		break;
 	case FUTEX_CMP_REQUEUE_PI:
 		ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
 				    1);
@@ -2596,8 +2593,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
 	 * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
 	 */
 	if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-	    cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
-	    cmd == FUTEX_WAKE_OP)
+	    cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
 		val2 = (u32) (unsigned long) utime;
 
 	return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-- 
GitLab


From 83c4832683bc8ebcd1687b3c0bf3ba1ab253dd4f Mon Sep 17 00:00:00 2001
From: Sam Ravnborg <sam@ravnborg.org>
Date: Thu, 30 Apr 2009 12:03:16 +0200
Subject: [PATCH 0762/2198] x86: boot/compressed/vmlinux.lds.S: fix build of
 bzImage with 64 bit compiler

Jesper reported that he saw following build issue:

 > ld:arch/x86/boot/compressed/vmlinux.lds:9: syntax error
 > make[2]: *** [arch/x86/boot/compressed/vmlinux] Error 1
 > make[1]: *** [arch/x86/boot/compressed/vmlinux] Error 2
 > make: *** [bzImage] Error 2

CPP defines the symbol "i386" to "1".
Undefine this to fix it.

[ Impact: build fix with certain tool chains ]

Reported-by: Jesper Dangaard Brouer <jdb@comx.dk>
Signed-off-by: Sam Ravnborg <sam@ravnborg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.00.0904260958190.3101@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/boot/compressed/vmlinux.lds.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index ffcb19134bf71..0d26c92d3c7d0 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -1,5 +1,7 @@
 OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
 
+#undef i386
+
 #ifdef CONFIG_X86_64
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(startup_64)
-- 
GitLab


From aac3f3c2c41ce49a6dbb98d9145265c00a964dc2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 30 Apr 2009 13:52:19 +0200
Subject: [PATCH 0763/2198] perf_counter tools: add perf-report to the Makefile

Build it explicitly until it's a proper builtin command.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 543ccf28ac4a6..877cf5dedb551 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -228,7 +228,7 @@ COMPAT_CFLAGS =
 COMPAT_OBJS =
 LIB_H =
 LIB_OBJS =
-PROGRAMS =
+PROGRAMS = perf-report
 SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
@@ -808,6 +808,10 @@ clean:
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
+# temporary hack:
+perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile
+	g++ -g -O2 -Wall -lrt -o $@ $<
+
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
-- 
GitLab


From 66cf782996f3d57d3cc199f0a2d47a54e2aa5991 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 30 Apr 2009 13:53:33 +0200
Subject: [PATCH 0764/2198] perf_counter tools: perf stat: make -l default-on

Turn on scaling display by default - this is less confusing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 112b94ed32986..1fde12762ca4d 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -171,7 +171,7 @@ static unsigned int		page_size;
 
 static int			zero;
 
-static int			scale;
+static int			scale				=  1;
 
 static const unsigned int default_count[] = {
 	1000000,
-- 
GitLab


From bad760089c1ef7fe525c0f268a4078b9cb483903 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 30 Apr 2009 14:14:37 +0200
Subject: [PATCH 0765/2198] perf_counter tools: fix infinite loop in
 perf-report on zeroed event records

Bail out early if a record has zero size - we have no chance to make
reliable progress in that case. Print out the offset where this happens,
and print the number of bytes we missed out on.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf-report.cc | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 1727317352bf7..933a07544534a 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -13,6 +13,7 @@
 #include <ctype.h>
 #include <time.h>
 #include <getopt.h>
+#include <assert.h>
 
 #include <sys/ioctl.h>
 #include <sys/poll.h>
@@ -226,7 +227,7 @@ void load_kallsyms(void)
 	while (!feof(file)) {
 		uint64_t start;
 		char c;
-		char sym[1024];
+		char sym[1024000];
 
 		if (getline(&line, &n, file) < 0)
 			break;
@@ -416,12 +417,23 @@ more:
 
 	if (head + event->header.size >= page_size * mmap_window) {
 		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
 
-		munmap(buf, page_size * mmap_window);
 		offset += shift;
 		head -= shift;
 		goto remap;
 	}
+
+
+	if (!event->header.size) {
+		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
+		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
+		goto done;
+	}
+
 	head += event->header.size;
 
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
@@ -458,6 +470,8 @@ more:
 	if (offset + head < stat.st_size)
 		goto more;
 
+done:
+
 	close(input);
 
 	std::map<std::string, int>::iterator hi = hist.begin();
-- 
GitLab


From 03682411b1ccd38cbde2e9a6ab43884ff34fbefc Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Thu, 30 Apr 2009 18:38:01 +0200
Subject: [PATCH 0766/2198] alim15x3: Remove historical hacks, re-enable
 init_hwif for PowerPC

Some time ago we had to disable init_hwif callback for PowerPC builds.
That was because of a historical IRQ overwrite in the driver, which
was causing IDE malfunction on the MPC8610HPCD PowerPC boards.

It's unclear whether this overwrite is still useful, but it is proven
to cause a bit of harm, and today some PowerPC targets (Xilinx ML510,
as reported by Roderick Colenbrander) need the init_hwif, so we have
to re-enable it and remove the overwrite.

Reported-by: Roderick Colenbrander <thunderbird2k@gmail.com>
Suggested-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/alim15x3.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c
index 537da1cde16d6..e59b6dee9ae24 100644
--- a/drivers/ide/alim15x3.c
+++ b/drivers/ide/alim15x3.c
@@ -402,27 +402,23 @@ static u8 ali_cable_detect(ide_hwif_t *hwif)
 	return cbl;
 }
 
-#if !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC)
+#ifndef CONFIG_SPARC64
 /**
  *	init_hwif_ali15x3	-	Initialize the ALI IDE x86 stuff
  *	@hwif: interface to configure
  *
  *	Obtain the IRQ tables for an ALi based IDE solution on the PC
  *	class platforms. This part of the code isn't applicable to the
- *	Sparc and PowerPC systems.
+ *	Sparc systems.
  */
 
 static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif)
 {
-	struct pci_dev *dev = to_pci_dev(hwif->dev);
 	u8 ideic, inmir;
 	s8 irq_routing_table[] = { -1,  9, 3, 10, 4,  5, 7,  6,
 				      1, 11, 0, 12, 0, 14, 0, 15 };
 	int irq = -1;
 
-	if (dev->device == PCI_DEVICE_ID_AL_M5229)
-		hwif->irq = hwif->channel ? 15 : 14;
-
 	if (isa_dev) {
 		/*
 		 * read IDE interface control
@@ -455,7 +451,7 @@ static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif)
 }
 #else
 #define init_hwif_ali15x3 NULL
-#endif /* !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC) */
+#endif /* CONFIG_SPARC64 */
 
 /**
  *	init_dma_ali15x3	-	set up DMA on ALi15x3
-- 
GitLab


From 30b4ae8a4498543863501f707879b7220b649602 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Apr 2009 21:01:01 +0000
Subject: [PATCH 0767/2198] signals: split do_tkill

Split out the code from do_tkill to make it reusable by the follow up
patch which implements sys_rt_tgsigqueueinfo

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
---
 kernel/signal.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index d8034737db4cb..56d27acad87e9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 	return kill_something_info(sig, &info, pid);
 }
 
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
-	int error;
-	struct siginfo info;
 	struct task_struct *p;
 	unsigned long flags;
-
-	error = -ESRCH;
-	info.si_signo = sig;
-	info.si_errno = 0;
-	info.si_code = SI_TKILL;
-	info.si_pid = task_tgid_vnr(current);
-	info.si_uid = current_uid();
+	int error = -ESRCH;
 
 	rcu_read_lock();
 	p = find_task_by_vpid(pid);
 	if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
-		error = check_kill_permission(sig, &info, p);
+		error = check_kill_permission(sig, info, p);
 		/*
 		 * The null signal is a permissions and process existence
 		 * probe.  No signal is actually delivered.
@@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 		 * signal is private anyway.
 		 */
 		if (!error && sig && lock_task_sighand(p, &flags)) {
-			error = specific_send_sig_info(sig, &info, p);
+			error = specific_send_sig_info(sig, info, p);
 			unlock_task_sighand(p, &flags);
 		}
 	}
@@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
 	return error;
 }
 
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+	struct siginfo info;
+
+	info.si_signo = sig;
+	info.si_errno = 0;
+	info.si_code = SI_TKILL;
+	info.si_pid = task_tgid_vnr(current);
+	info.si_uid = current_uid();
+
+	return do_send_specific(tgid, pid, sig, &info);
+}
+
 /**
  *  sys_tgkill - send signal to one specific thread
  *  @tgid: the thread group ID of the thread
-- 
GitLab


From 62ab4505e3efaf67784f84059e0fb9cedb1728ea Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Apr 2009 21:01:06 +0000
Subject: [PATCH 0768/2198] signals: implement sys_rt_tgsigqueueinfo

sys_kill has the per thread counterpart sys_tgkill. sigqueueinfo is
missing a thread directed counterpart. Such an interface is important
for migrating applications from other OSes which have the per thread
delivery implemented.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Ulrich Drepper <drepper@redhat.com>
---
 include/linux/compat.h |  2 ++
 include/linux/signal.h |  2 ++
 kernel/compat.c        | 11 +++++++++++
 kernel/signal.c        | 26 ++++++++++++++++++++++++++
 4 files changed, 41 insertions(+)

diff --git a/include/linux/compat.h b/include/linux/compat.h
index f2ded21f9a3c3..af931ee43dd8e 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
 int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
 int get_compat_sigevent(struct sigevent *event,
 		const struct compat_sigevent __user *u_event);
+long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+				  struct compat_siginfo __user *uinfo);
 
 static inline int compat_timeval_compare(struct compat_timeval *lhs,
 					struct compat_timeval *rhs)
diff --git a/include/linux/signal.h b/include/linux/signal.h
index 84f997f8aa53c..c7552836bd954 100644
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig)
 extern int next_signal(struct sigpending *pending, sigset_t *mask);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
+extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
+				 siginfo_t *info);
 extern long do_sigpending(void __user *, unsigned long);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern int show_unhandled_signals;
diff --git a/kernel/compat.c b/kernel/compat.c
index 42d56544460f2..f6c204f07ea60 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 
 }
 
+asmlinkage long
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+			     struct compat_siginfo __user *uinfo)
+{
+	siginfo_t info;
+
+	if (copy_siginfo_from_user32(&info, uinfo))
+		return -EFAULT;
+	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 #ifdef __ARCH_WANT_COMPAT_SYS_TIME
 
 /* compat_time_t is a 32 bit "long" and needs to get converted. */
diff --git a/kernel/signal.c b/kernel/signal.c
index 56d27acad87e9..f79b3b9f8375e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2369,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
 	return kill_proc_info(sig, &info, pid);
 }
 
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+	/* This is only valid for single tasks */
+	if (pid <= 0 || tgid <= 0)
+		return -EINVAL;
+
+	/* Not even root can pretend to send signals from the kernel.
+	   Nor can they impersonate a kill(), which adds source info.  */
+	if (info->si_code >= 0)
+		return -EPERM;
+	info->si_signo = sig;
+
+	return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+		siginfo_t __user *, uinfo)
+{
+	siginfo_t info;
+
+	if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+		return -EFAULT;
+
+	return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct task_struct *t = current;
-- 
GitLab


From 12d161147f828192b5bcc33166f468a827832767 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 4 Apr 2009 21:01:10 +0000
Subject: [PATCH 0769/2198] x86: hookup sys_rt_tgsigqueueinfo

Make the new sys_rt_tgsigqueueinfo available for x86.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/ia32/ia32entry.S          | 1 +
 arch/x86/include/asm/unistd_32.h   | 1 +
 arch/x86/include/asm/unistd_64.h   | 2 ++
 arch/x86/kernel/syscall_table_32.S | 1 +
 4 files changed, 5 insertions(+)

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index a505202086e87..dcef387ddc36a 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -830,4 +830,5 @@ ia32_sys_call_table:
 	.quad sys_inotify_init1
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
+	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 6e72d74cf8dc7..708dae61262df 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -340,6 +340,7 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_rt_tgsigqueueinfo	335
 
 #ifdef __KERNEL__
 
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index f81829462325f..4e2b054044000 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -657,6 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
 __SYSCALL(__NR_preadv, sys_preadv)
 #define __NR_pwritev				296
 __SYSCALL(__NR_pwritev, sys_pwritev)
+#define __NR_rt_tgsigqueueinfo			297
+__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 
 
 #ifndef __NO_STUBS
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index ff5c8736b491b..734f92c02dde0 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -334,3 +334,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_rt_tgsigqueueinfo	/* 335 */
-- 
GitLab


From bf293c17b26b8854241df08b9b63f7270cbde012 Mon Sep 17 00:00:00 2001
From: Remis Lima Baima <remis.developer@googlemail.com>
Date: Thu, 30 Apr 2009 18:36:23 +0200
Subject: [PATCH 0770/2198] x86: added 'ifndef _ASM_X86_IOMAP_H' to iomap.h

iomap.h misses the include guards.

[ Impact: cleanup ]

Signed-off-by: Remis Lima Baima <remis.developer@googlemail.com>
Signed-off-by: Arnd Bergmann <arnd@arndb.de>
LKML-Reference: <200904301836.23885.arnd@arndb.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/iomap.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index 86af26091d6c3..0e9fe1d9d9715 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
 /*
  * Copyright © 2008 Ingo Molnar
  *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 
 void
 iounmap_atomic(void *kvaddr, enum km_type type);
+
+#endif /* _ASM_X86_IOMAP_H */
-- 
GitLab


From 78a3d9d5654a7fd99cf8b2ab06b9497b9c7aad64 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Wed, 29 Apr 2009 18:01:23 +0200
Subject: [PATCH 0771/2198] do_wait: do take security_task_wait() into account

I was never able to understand what should we actually do when
security_task_wait() fails, but the current code doesn't look right.

If ->task_wait() returns the error, we update *notask_error correctly.
But then we either reap the child (despite the fact this was forbidden)
or clear *notask_error (and hide the securiy policy problems).

This patch assumes that "stolen by ptrace" doesn't matter. If selinux
denies the child we should ignore it but make sure we report -EACCESS
instead of -ECHLD if there are no other eligible children.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/exit.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c61..d2e8239ea187a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1582,6 +1582,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 		 */
 		if (*notask_error)
 			*notask_error = ret;
+		return 0;
 	}
 
 	if (likely(!ptrace) && unlikely(p->ptrace)) {
-- 
GitLab


From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:16 +0200
Subject: [PATCH 0772/2198] perf_counter: fix race in perf_output_*

When two (or more) contexts output to the same buffer, it is possible
to observe half written output.

Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing
perf_counter_overflow(). If CPU1 does a wakeup and exposes head to
user-space, then CPU2 can observe the data CPU0 is still writing.

[ Impact: fix occasionally corrupted profiling records ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.007821627@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |   5 +-
 kernel/perf_counter.c        | 130 +++++++++++++++++++++++++++--------
 2 files changed, 105 insertions(+), 30 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 41aed42700574..f776851f8c4b1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -358,10 +358,13 @@ struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;	/* nr of data pages  */
 
-	atomic_t			wakeup;		/* POLL_ for wakeups */
+	atomic_t			poll;		/* POLL_ for wakeups */
 	atomic_t			head;		/* write position    */
 	atomic_t			events;		/* event limit       */
 
+	atomic_t			wakeup_head;	/* completed head    */
+	atomic_t			lock;		/* concurrent writes */
+
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75f2b6c823920..8660ae5795300 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 {
 	struct perf_counter *counter = file->private_data;
 	struct perf_mmap_data *data;
-	unsigned int events;
+	unsigned int events = POLL_HUP;
 
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (data)
-		events = atomic_xchg(&data->wakeup, 0);
-	else
-		events = POLL_HUP;
+		events = atomic_xchg(&data->poll, 0);
 	rcu_read_unlock();
 
 	poll_wait(file, &counter->waitq, wait);
@@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = {
 
 void perf_counter_wakeup(struct perf_counter *counter)
 {
-	struct perf_mmap_data *data;
-
-	rcu_read_lock();
-	data = rcu_dereference(counter->data);
-	if (data) {
-		atomic_set(&data->wakeup, POLL_IN);
-		/*
-		 * Ensure all data writes are issued before updating the
-		 * user-space data head information. The matching rmb()
-		 * will be in userspace after reading this value.
-		 */
-		smp_wmb();
-		data->user_page->data_head = atomic_read(&data->head);
-	}
-	rcu_read_unlock();
-
 	wake_up_all(&counter->waitq);
 
 	if (counter->pending_kill) {
@@ -1721,10 +1703,14 @@ struct perf_output_handle {
 	int			wakeup;
 	int			nmi;
 	int			overflow;
+	int			locked;
+	unsigned long		flags;
 };
 
-static inline void __perf_output_wakeup(struct perf_output_handle *handle)
+static void perf_output_wakeup(struct perf_output_handle *handle)
 {
+	atomic_set(&handle->data->poll, POLL_IN);
+
 	if (handle->nmi) {
 		handle->counter->pending_wakeup = 1;
 		perf_pending_queue(&handle->counter->pending,
@@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle)
 		perf_counter_wakeup(handle->counter);
 }
 
+/*
+ * Curious locking construct.
+ *
+ * We need to ensure a later event doesn't publish a head when a former
+ * event isn't done writing. However since we need to deal with NMIs we
+ * cannot fully serialize things.
+ *
+ * What we do is serialize between CPUs so we only have to deal with NMI
+ * nesting on a single CPU.
+ *
+ * We only publish the head (and generate a wakeup) when the outer-most
+ * event completes.
+ */
+static void perf_output_lock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int cpu;
+
+	handle->locked = 0;
+
+	local_irq_save(handle->flags);
+	cpu = smp_processor_id();
+
+	if (in_nmi() && atomic_read(&data->lock) == cpu)
+		return;
+
+	while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+		cpu_relax();
+
+	handle->locked = 1;
+}
+
+static void perf_output_unlock(struct perf_output_handle *handle)
+{
+	struct perf_mmap_data *data = handle->data;
+	int head, cpu;
+
+	if (handle->wakeup)
+		data->wakeup_head = data->head;
+
+	if (!handle->locked)
+		goto out;
+
+again:
+	/*
+	 * The xchg implies a full barrier that ensures all writes are done
+	 * before we publish the new head, matched by a rmb() in userspace when
+	 * reading this position.
+	 */
+	while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+		data->user_page->data_head = head;
+		handle->wakeup = 1;
+	}
+
+	/*
+	 * NMI can happen here, which means we can miss a wakeup_head update.
+	 */
+
+	cpu = atomic_xchg(&data->lock, 0);
+	WARN_ON_ONCE(cpu != smp_processor_id());
+
+	/*
+	 * Therefore we have to validate we did not indeed do so.
+	 */
+	if (unlikely(atomic_read(&data->wakeup_head))) {
+		/*
+		 * Since we had it locked, we can lock it again.
+		 */
+		while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+			cpu_relax();
+
+		goto again;
+	}
+
+	if (handle->wakeup)
+		perf_output_wakeup(handle);
+out:
+	local_irq_restore(handle->flags);
+}
+
 static int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_counter *counter, unsigned int size,
 			     int nmi, int overflow)
@@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data)
 		goto out;
 
+	handle->data	 = data;
 	handle->counter	 = counter;
 	handle->nmi	 = nmi;
 	handle->overflow = overflow;
@@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	if (!data->nr_pages)
 		goto fail;
 
+	perf_output_lock(handle);
+
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
 	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
 
-	handle->data	= data;
 	handle->offset	= offset;
 	handle->head	= head;
 	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
@@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	return 0;
 
 fail:
-	__perf_output_wakeup(handle);
+	perf_output_wakeup(handle);
 out:
 	rcu_read_unlock();
 
@@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 static void perf_output_end(struct perf_output_handle *handle)
 {
-	int wakeup_events = handle->counter->hw_event.wakeup_events;
+	struct perf_counter *counter = handle->counter;
+	struct perf_mmap_data *data = handle->data;
+
+	int wakeup_events = counter->hw_event.wakeup_events;
 
 	if (handle->overflow && wakeup_events) {
-		int events = atomic_inc_return(&handle->data->events);
+		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
-			atomic_sub(wakeup_events, &handle->data->events);
-			__perf_output_wakeup(handle);
+			atomic_sub(wakeup_events, &data->events);
+			handle->wakeup = 1;
 		}
-	} else if (handle->wakeup)
-		__perf_output_wakeup(handle);
+	}
+
+	perf_output_unlock(handle);
 	rcu_read_unlock();
 }
 
-- 
GitLab


From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:17 +0200
Subject: [PATCH 0773/2198] perf_counter: fix nmi-watchdog interaction

When we don't have any perf-counters active, don't act like we know
what the NMI is for.

[ Impact: fix hard hang with nmi_watchdog=2 ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.109867793@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index fc06f4d326447..d4c0cc9d32637 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	struct pt_regs *regs;
 	int ret;
 
+	if (!atomic_read(&num_counters))
+		return NOTIFY_DONE;
+
 	switch (cmd) {
 	case DIE_NMI:
 	case DIE_NMI_IPI:
-- 
GitLab


From 585e3374d9d29376c2c37d821c8b7637dd48ca95 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:18 +0200
Subject: [PATCH 0774/2198] perf_counter: tool: handle 0-length data files

Avoid perf-report barfing on 0-length data files.

[ Impact: fix perf-report SIGBUS ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.196245693@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf-report.cc | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 933a07544534a..911d7f3e7a65c 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -402,6 +402,11 @@ int main(int argc, char *argv[])
 		exit(-1);
 	}
 
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
 	load_kallsyms();
 
 remap:
-- 
GitLab


From e5791a808ae91a9e7e1b65ea9b8de0f96a043d88 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 1 May 2009 12:23:19 +0200
Subject: [PATCH 0775/2198] perf_counter: documentation update

Update the documentation to reflect the current state of affairs

[ Impact: documentation update ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090501102533.296727903@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/design.txt | 274 +++++++++++++++++++++-----
 1 file changed, 220 insertions(+), 54 deletions(-)

diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index aaf105c02fba7..9930c4bddc6fa 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -34,41 +34,47 @@ can be poll()ed.
 
 When creating a new counter fd, 'perf_counter_hw_event' is:
 
-/*
- * Event to monitor via a performance monitoring counter:
- */
 struct perf_counter_hw_event {
-	__u64			event_config;
-
-	__u64			irq_period;
-	__u64			record_type;
-	__u64			read_format;
-
-	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
-				inherit	       :  1, /* children inherit it   */
-				pinned	       :  1, /* must always be on PMU */
-				exclusive      :  1, /* only group on PMU     */
-				exclude_user   :  1, /* don't count user      */
-				exclude_kernel :  1, /* ditto kernel          */
-				exclude_hv     :  1, /* ditto hypervisor      */
-				exclude_idle   :  1, /* don't count when idle */
-
-				__reserved_1   : 55;
-
-	__u32			extra_config_len;
-
-	__u32			__reserved_4;
-	__u64			__reserved_2;
-	__u64			__reserved_3;
+        /*
+         * The MSB of the config word signifies if the rest contains cpu
+         * specific (raw) counter configuration data, if unset, the next
+         * 7 bits are an event type and the rest of the bits are the event
+         * identifier.
+         */
+        __u64                   config;
+
+        __u64                   irq_period;
+        __u32                   record_type;
+        __u32                   read_format;
+
+        __u64                   disabled       :  1, /* off by default        */
+                                nmi            :  1, /* NMI sampling          */
+                                inherit        :  1, /* children inherit it   */
+                                pinned         :  1, /* must always be on PMU */
+                                exclusive      :  1, /* only group on PMU     */
+                                exclude_user   :  1, /* don't count user      */
+                                exclude_kernel :  1, /* ditto kernel          */
+                                exclude_hv     :  1, /* ditto hypervisor      */
+                                exclude_idle   :  1, /* don't count when idle */
+                                mmap           :  1, /* include mmap data     */
+                                munmap         :  1, /* include munmap data   */
+                                comm           :  1, /* include comm data     */
+
+                                __reserved_1   : 52;
+
+        __u32                   extra_config_len;
+        __u32                   wakeup_events;  /* wakeup every n events */
+
+        __u64                   __reserved_2;
+        __u64                   __reserved_3;
 };
 
-The 'event_config' field specifies what the counter should count.  It
+The 'config' field specifies what the counter should count.  It
 is divided into 3 bit-fields:
 
-raw_type: 1 bit (most significant bit)		0x8000_0000_0000_0000
-type:	  7 bits (next most significant)	0x7f00_0000_0000_0000
-event_id: 56 bits (least significant)		0x00ff_0000_0000_0000
+raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000
+type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000
+event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff
 
 If 'raw_type' is 1, then the counter will count a hardware event
 specified by the remaining 63 bits of event_config.  The encoding is
@@ -134,41 +140,56 @@ enum sw_event_ids {
 	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
 };
 
+Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
+tracer is available, and event_id values can be obtained from
+/debug/tracing/events/*/*/id
+
+
 Counters come in two flavours: counting counters and sampling
 counters.  A "counting" counter is one that is used for counting the
 number of events that occur, and is characterised by having
-irq_period = 0 and record_type = PERF_RECORD_SIMPLE.  A read() on a
-counting counter simply returns the current value of the counter as
-an 8-byte number.
+irq_period = 0.
+
+
+A read() on a counter returns the current value of the counter and possible
+additional values as specified by 'read_format', each value is a u64 (8 bytes)
+in size.
+
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ */
+enum perf_counter_read_format {
+        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1,
+        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2,
+};
+
+Using these additional values one can establish the overcommit ratio for a
+particular counter allowing one to take the round-robin scheduling effect
+into account.
+
 
 A "sampling" counter is one that is set up to generate an interrupt
 every N events, where N is given by 'irq_period'.  A sampling counter
-has irq_period > 0 and record_type != PERF_RECORD_SIMPLE.  The
-record_type controls what data is recorded on each interrupt, and the
-available values are currently:
+has irq_period > 0. The record_type controls what data is recorded on each
+interrupt:
 
 /*
- * IRQ-notification data record type:
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
  */
-enum perf_counter_record_type {
-	PERF_RECORD_SIMPLE		= 0,
-	PERF_RECORD_IRQ			= 1,
-	PERF_RECORD_GROUP		= 2,
+enum perf_counter_record_format {
+        PERF_RECORD_IP          = 1U << 0,
+        PERF_RECORD_TID         = 1U << 1,
+        PERF_RECORD_TIME        = 1U << 2,
+        PERF_RECORD_ADDR        = 1U << 3,
+        PERF_RECORD_GROUP       = 1U << 4,
+        PERF_RECORD_CALLCHAIN   = 1U << 5,
 };
 
-A record_type value of PERF_RECORD_IRQ will record the instruction
-pointer (IP) at which the interrupt occurred.  A record_type value of
-PERF_RECORD_GROUP will record the event_config and counter value of
-all of the other counters in the group, and should only be used on a
-group leader (see below).  Currently these two values are mutually
-exclusive, but record_type will become a bit-mask in future and
-support other values.
-
-A sampling counter has an event queue, into which an event is placed
-on each interrupt.  A read() on a sampling counter will read the next
-event from the event queue.  If the queue is empty, the read() will
-either block or return an EAGAIN error, depending on whether the fd
-has been set to non-blocking mode or not.
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
 
 The 'disabled' bit specifies whether the counter starts out disabled
 or enabled.  If it is initially disabled, it can be enabled by ioctl
@@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
 way to request that counting of events be restricted to times when the
 CPU is in user, kernel and/or hypervisor mode.
 
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations, these can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone,
+these events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
 
 The 'pid' parameter to the perf_counter_open() system call allows the
 counter to be specific to a task:
@@ -250,6 +278,138 @@ can be meaningfully compared, added, divided (to get ratios), etc.,
 with each other, since they have counted events for the same set of
 executed instructions.
 
+
+Like stated, asynchronous events, like counter overflow or PROT_EXEC mmap
+tracking are logged into a ring-buffer. This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+        __u32   version;                /* version number of this structure */
+        __u32   compat_version;         /* lowest version this is compat with */
+
+        /*
+         * Bits needed to read the hw counters in user-space.
+         *
+         *   u32 seq;
+         *   s64 count;
+         *
+         *   do {
+         *     seq = pc->lock;
+         *
+         *     barrier()
+         *     if (pc->index) {
+         *       count = pmc_read(pc->index - 1);
+         *       count += pc->offset;
+         *     } else
+         *       goto regular_read;
+         *
+         *     barrier();
+         *   } while (pc->lock != seq);
+         *
+         * NOTE: for obvious reason this only works on self-monitoring
+         *       processes.
+         */
+        __u32   lock;                   /* seqlock for synchronization */
+        __u32   index;                  /* hardware counter identifier */
+        __s64   offset;                 /* add to hardware counter value */
+
+        /*
+         * Control data for the mmap() data buffer.
+         *
+         * User-space reading this value should issue an rmb(), on SMP capable
+         * platforms, after reading this value -- see perf_counter_wakeup().
+         */
+        __u32   data_head;              /* head in the data section */
+};
+
+NOTE: the hw-counter userspace bits are arch specific and are currently only
+      implemented on powerpc.
+
+The following 2^n pages are the ring-buffer which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL          (1 << 0)
+#define PERF_EVENT_MISC_USER            (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+
+struct perf_event_header {
+        __u32   type;
+        __u16   misc;
+        __u16   size;
+};
+
+enum perf_event_type {
+
+        /*
+         * The MMAP events record the PROT_EXEC mappings so that we can
+         * correlate userspace IPs to code. They have the following structure:
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      u64                             addr;
+         *      u64                             len;
+         *      u64                             pgoff;
+         *      char                            filename[];
+         * };
+         */
+        PERF_EVENT_MMAP                 = 1,
+        PERF_EVENT_MUNMAP               = 2,
+
+        /*
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      char                            comm[];
+         * };
+         */
+        PERF_EVENT_COMM                 = 3,
+
+        /*
+         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * will be PERF_RECORD_*
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      { u64                   ip;       } && PERF_RECORD_IP
+         *      { u32                   pid, tid; } && PERF_RECORD_TID
+         *      { u64                   time;     } && PERF_RECORD_TIME
+         *      { u64                   addr;     } && PERF_RECORD_ADDR
+         *
+         *      { u64                   nr;
+         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
+         *
+         *      { u16                   nr,
+         *                              hv,
+         *                              kernel,
+         *                              user;
+         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+         * };
+         */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented
+      on x86.
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl() managing signals.
+
+Normally a notification is generated for every page filled, however one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
 Counters can be enabled and disabled in two ways: via ioctl and via
 prctl.  When a counter is disabled, it doesn't count or generate
 events but does continue to exist and maintain its count value.
@@ -269,6 +429,12 @@ group other than the leader only affects that counter - disabling an
 non-leader stops that counter from counting but doesn't affect any
 other counter.
 
+Additionally, non-inherited overflow counters can use
+
+	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
 A process can enable or disable all the counter groups that are
 attached to it, using prctl:
 
-- 
GitLab


From 56afb0f8823650f53a5f0e96d69a282e8892c61b Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 30 Apr 2009 13:29:36 -0400
Subject: [PATCH 0776/2198] kerneldoc, tracing: make kernel-doc understand
 TRACE_EVENT() macro (take #2)

Add support to kernel-doc for tracepoint comments above TRACE_EVENT()
macro definitions. Paves the way for tracepoint docbook.

[ Impact: extend DocBook infrastructure ]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm@linux-foundation.org
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: wcohen@redhat.com
LKML-Reference: <d80706b6797e277924d2f3ec9af176c6b2951f88.1241107197.git.jbaron@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 scripts/kernel-doc | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/scripts/kernel-doc b/scripts/kernel-doc
index 0f11870116dc7..2b53a55fbece2 100755
--- a/scripts/kernel-doc
+++ b/scripts/kernel-doc
@@ -1827,6 +1827,25 @@ sub reset_state {
     $state = 0;
 }
 
+sub tracepoint_munge($) {
+	my $file = shift;
+	my $tracepointname = 0;
+	my $tracepointargs = 0;
+
+	if($prototype =~ m/TRACE_EVENT\((.*?),/) {
+		$tracepointname = $1;
+	}
+	if($prototype =~ m/TP_PROTO\((.*?)\)/) {
+		$tracepointargs = $1;
+	}
+	if (($tracepointname eq 0) || ($tracepointargs eq 0)) {
+		print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+			     "$prototype\n";
+	} else {
+		$prototype = "static inline void trace_$tracepointname($tracepointargs)";
+	}
+}
+
 sub syscall_munge() {
 	my $void = 0;
 
@@ -1881,6 +1900,9 @@ sub process_state3_function($$) {
 	if ($prototype =~ /SYSCALL_DEFINE/) {
 		syscall_munge();
 	}
+	if ($prototype =~ /TRACE_EVENT/) {
+		tracepoint_munge($file);
+	}
 	dump_function($prototype, $file);
 	reset_state();
     }
-- 
GitLab


From a76f8c6da1e48fd4ef025f42c736389532ff30ba Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 30 Apr 2009 13:29:42 -0400
Subject: [PATCH 0777/2198] tracing: add new tracepoints docbook

Add tracepoint docbook. This will help us document and understand
what tracepoints are in the kernel. Since there are multiple
macros, and files that contain tracepoints.

[ Impact: add documentation ]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm@linux-foundation.org
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: wcohen@redhat.com
LKML-Reference: <84160b6bd94aff02455da7e12bad054d34c579a0.1241107197.git.jbaron@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/DocBook/Makefile        |  3 +-
 Documentation/DocBook/tracepoint.tmpl | 84 +++++++++++++++++++++++++++
 2 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/DocBook/tracepoint.tmpl

diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
index 8918a32c6b3a5..4c8f4d6e114a8 100644
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
 	    gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
 	    genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
 	    mac80211.xml debugobjects.xml sh.xml regulator.xml \
-	    alsa-driver-api.xml writing-an-alsa-driver.xml
+	    alsa-driver-api.xml writing-an-alsa-driver.xml \
+	    tracepoint.xml
 
 ###
 # The build process is as follows (targets):
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
new file mode 100644
index 0000000000000..70891bc684911
--- /dev/null
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -0,0 +1,84 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Tracepoints">
+ <bookinfo>
+  <title>The Linux Kernel Tracepoint API</title>
+
+  <authorgroup>
+   <author>
+    <firstname>Jason</firstname>
+    <surname>Baron</surname>
+    <affiliation>
+     <address>
+      <email>jbaron@redhat.com</email>
+     </address>
+    </affiliation>
+   </author>
+  </authorgroup>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License as published by the Free Software Foundation; either
+     version 2 of the License, or (at your option) any later
+     version.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+  <chapter id="intro">
+   <title>Introduction</title>
+   <para>
+     Tracepoints are static probe points that are located in strategic points
+     throughout the kernel. 'Probes' register/unregister with tracepoints
+     via a callback mechanism. The 'probes' are strictly typed functions that
+     are passed a unique set of parameters defined by each tracepoint.
+   </para>
+
+   <para>
+     From this simple callback mechanism, 'probes' can be used to profile, debug,
+     and understand kernel behavior. There are a number of tools that provide a
+     framework for using 'probes'. These tools include Systemtap, ftrace, and
+     LTTng.
+   </para>
+
+   <para>
+     Tracepoints are defined in a number of header files via various macros. Thus,
+     the purpose of this document is to provide a clear accounting of the available
+     tracepoints. The intention is to understand not only what tracepoints are
+     available but also to understand where future tracepoints might be added.
+   </para>
+
+   <para>
+     The API presented has functions of the form:
+     <function>trace_tracepointname(function parameters)</function>. These are the
+     tracepoints callbacks that are found throughout the code. Registering and
+     unregistering probes with these callback sites is covered in the
+     <filename>Documentation/trace/*</filename> directory.
+   </para>
+  </chapter>
+
+</book>
-- 
GitLab


From 9ee1983c9aa18f12388ef660d0c76a23dc112959 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 30 Apr 2009 13:29:47 -0400
Subject: [PATCH 0778/2198] tracing: add irq tracepoint documentation

Document irqs for the newly created docbook.

[ Impact: add documentation ]

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: akpm@linux-foundation.org
Cc: rostedt@goodmis.org
Cc: fweisbec@gmail.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: wcohen@redhat.com
LKML-Reference: <73ff42be3420157667ec548e9b0e409c3cfad05f.1241107197.git.jbaron@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/DocBook/tracepoint.tmpl |  5 +++
 include/trace/events/irq.h            | 46 ++++++++++++++++++++++++---
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
index 70891bc684911..b0756d0fd5791 100644
--- a/Documentation/DocBook/tracepoint.tmpl
+++ b/Documentation/DocBook/tracepoint.tmpl
@@ -81,4 +81,9 @@
    </para>
   </chapter>
 
+  <chapter id="irq">
+   <title>IRQ</title>
+!Iinclude/trace/events/irq.h
+  </chapter>
+
 </book>
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 7686864675185..32a9f7ef432bd 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -7,8 +7,16 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM irq
 
-/*
- * Tracepoint for entry of interrupt handler:
+/**
+ * irq_handler_entry - called immediately before the irq action handler
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ *
+ * The struct irqaction pointed to by @action contains various
+ * information about the handler, including the device name,
+ * @action->name, and the device id, @action->dev_id. When used in
+ * conjunction with the irq_handler_exit tracepoint, we can figure
+ * out irq handler latencies.
  */
 TRACE_EVENT(irq_handler_entry,
 
@@ -29,8 +37,16 @@ TRACE_EVENT(irq_handler_entry,
 	TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
 );
 
-/*
- * Tracepoint for return of an interrupt handler:
+/**
+ * irq_handler_exit - called immediately after the irq action handler returns
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ * @ret: return value
+ *
+ * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding
+ * @action->handler scuccessully handled this irq. Otherwise, the irq might be
+ * a shared irq line, or the irq was not handled successfully. Can be used in
+ * conjunction with the irq_handler_entry to understand irq handler latencies.
  */
 TRACE_EVENT(irq_handler_exit,
 
@@ -52,6 +68,17 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+/**
+ * softirq_entry - called immediately before the softirq handler
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter, contains a pointer to the struct softirq_action
+ * which has a pointer to the action handler that is called. By subtracting
+ * the @vec pointer from the @h pointer, we can determine the softirq
+ * number. Also, when used in combination with the softirq_exit tracepoint
+ * we can determine the softirq latency.
+ */
 TRACE_EVENT(softirq_entry,
 
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
@@ -71,6 +98,17 @@ TRACE_EVENT(softirq_entry,
 	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
 );
 
+/**
+ * softirq_exit - called immediately after the softirq handler returns
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * that has handled the softirq. By subtracting the @vec pointer from
+ * the @h pointer, we can determine the softirq number. Also, when used in
+ * combination with the softirq_entry tracepoint we can determine the softirq
+ * latency.
+ */
 TRACE_EVENT(softirq_exit,
 
 	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-- 
GitLab


From e0202f56a82cd1170c6f1c520db669431cf26ddc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 1 May 2009 16:51:44 +0200
Subject: [PATCH 0779/2198] perf_counter tools: fix x86 syscall numbers

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 4 ++--
 Documentation/perf_counter/builtin-stat.c   | 4 ++--
 Documentation/perf_counter/builtin-top.c    | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 4a50abf843ee9..9cff266fb6141 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -57,13 +57,13 @@
 #define asmlinkage
 
 #ifdef __x86_64__
-#define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 298
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-#define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 336
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 1fde12762ca4d..9fbc66173de78 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -108,13 +108,13 @@
 #define asmlinkage
 
 #ifdef __x86_64__
-#define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 298
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-#define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 336
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 8d28864a20c0f..b6d989e7b1961 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -89,13 +89,13 @@
 #define asmlinkage
 
 #ifdef __x86_64__
-#define __NR_perf_counter_open 295
+#define __NR_perf_counter_open 298
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __i386__
-#define __NR_perf_counter_open 333
+#define __NR_perf_counter_open 336
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
-- 
GitLab


From 3666932bf2212a8fa77e344c5d946e86787bdbbe Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 17:37:51 +0200
Subject: [PATCH 0780/2198] perf_counter tools: remove build generated files

These files are generated during the build process. No need to have
them in the git repository.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/perf_counter/PERF-BUILD-OPTIONS | 4 ----
 Documentation/perf_counter/PERF-CFLAGS        | 1 -
 Documentation/perf_counter/PERF-VERSION-FILE  | 1 -
 3 files changed, 6 deletions(-)
 delete mode 100644 Documentation/perf_counter/PERF-BUILD-OPTIONS
 delete mode 100644 Documentation/perf_counter/PERF-CFLAGS
 delete mode 100644 Documentation/perf_counter/PERF-VERSION-FILE

diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS
deleted file mode 100644
index 46d8d6ceb2f41..0000000000000
--- a/Documentation/perf_counter/PERF-BUILD-OPTIONS
+++ /dev/null
@@ -1,4 +0,0 @@
-SHELL_PATH='/bin/sh'
-TAR='tar'
-NO_CURL=''
-NO_PERL=''
diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS
deleted file mode 100644
index f24906ca688d6..0000000000000
--- a/Documentation/perf_counter/PERF-CFLAGS
+++ /dev/null
@@ -1 +0,0 @@
--g -O2 -Wall  -DSHA1_HEADER='<openssl/sha.h>' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo
diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE
deleted file mode 100644
index 328e244c0c81c..0000000000000
--- a/Documentation/perf_counter/PERF-VERSION-FILE
+++ /dev/null
@@ -1 +0,0 @@
-PERF_VERSION = 0.0.1.PERF
-- 
GitLab


From 6eda5838bc5771578986429cde4a0870e1e5f5e1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:29:57 +0200
Subject: [PATCH 0781/2198] perfcounter tools: move common defines ... to local
 header file

No change, move of duplicated stuff only.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/perf_counter/Makefile         |  1 +
 Documentation/perf_counter/builtin-record.c | 64 +--------------------
 Documentation/perf_counter/builtin-stat.c   | 59 +------------------
 Documentation/perf_counter/builtin-top.c    | 63 +-------------------
 Documentation/perf_counter/perf.h           | 64 +++++++++++++++++++++
 5 files changed, 70 insertions(+), 181 deletions(-)
 create mode 100644 Documentation/perf_counter/perf.h

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 877cf5dedb551..481e4c26cd453 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -287,6 +287,7 @@ export PERL_PATH
 LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
+LIB_H += perf.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
 LIB_H += util/quote.h
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 9cff266fb6141..59f1d87f41e93 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,6 +1,7 @@
 
 
-#define _GNU_SOURCE
+#include "util/util.h"
+
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/time.h>
@@ -32,66 +33,7 @@
 
 #include "../../include/linux/perf_counter.h"
 
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-extern asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags);
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+#include "perf.h"
 
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= { };
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 9fbc66173de78..6de38d2568830 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -85,64 +85,7 @@
 
 #include "../../include/linux/perf_counter.h"
 
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-extern asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags);
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+#include "perf.h"
 
 static int			system_wide			=  0;
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index b6d989e7b1961..cd6f61d734181 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -66,68 +66,7 @@
 
 #include "../../include/linux/perf_counter.h"
 
-
-/*
- * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_COUNTERS_DISABLE   31
-#define PR_TASK_PERF_COUNTERS_ENABLE    32
-
-#define rdclock()                                       \
-({                                                      \
-        struct timespec ts;                             \
-                                                        \
-        clock_gettime(CLOCK_MONOTONIC, &ts);            \
-        ts.tv_sec * 1000000000ULL + ts.tv_nsec;         \
-})
-
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#define __NR_perf_counter_open 319
-#define rmb() 		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
-#define unlikely(x)	__builtin_expect(!!(x), 0)
-#define min(x, y) ({				\
-	typeof(x) _min1 = (x);			\
-	typeof(y) _min2 = (y);			\
-	(void) (&_min1 == &_min2);		\
-	_min1 < _min2 ? _min1 : _min2; })
-
-asmlinkage int sys_perf_counter_open(
-        struct perf_counter_hw_event    *hw_event_uptr          __user,
-        pid_t                           pid,
-        int                             cpu,
-        int                             group_fd,
-        unsigned long                   flags)
-{
-        return syscall(
-                __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags);
-}
-
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
-
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+#include "perf.h"
 
 static int			system_wide			=  0;
 
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
new file mode 100644
index 0000000000000..391fcc73148af
--- /dev/null
+++ b/Documentation/perf_counter/perf.h
@@ -0,0 +1,64 @@
+#ifndef _PERF_PERF_H
+#define _PERF_PERF_H
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */
+#define PR_TASK_PERF_COUNTERS_DISABLE   31
+#define PR_TASK_PERF_COUNTERS_ENABLE    32
+
+#define rdclock()					\
+({							\
+	struct timespec ts;				\
+							\
+	clock_gettime(CLOCK_MONOTONIC, &ts);		\
+	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
+})
+
+/*
+ * Pick up some kernel type conventions:
+ */
+#define __user
+#define asmlinkage
+
+#ifdef __x86_64__
+#define __NR_perf_counter_open 298
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __i386__
+#define __NR_perf_counter_open 336
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#define __NR_perf_counter_open 319
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#define unlikely(x)	__builtin_expect(!!(x), 0)
+#define min(x, y) ({				\
+	typeof(x) _min1 = (x);			\
+	typeof(y) _min2 = (y);			\
+	(void) (&_min1 == &_min2);		\
+	_min1 < _min2 ? _min1 : _min2; })
+
+static inline int
+sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+		      pid_t pid, int cpu, int group_fd,
+		      unsigned long flags)
+{
+	return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu,
+		       group_fd, flags);
+}
+
+#define MAX_COUNTERS			64
+#define MAX_NR_CPUS			256
+
+#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
+
+#endif
-- 
GitLab


From a92e70237c8abbd1c3241133bf72f2cd07c90eae Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:39:47 +0200
Subject: [PATCH 0782/2198] perfcounter tools: make rdclock an inline function

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/perf_counter/perf.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 391fcc73148af..fb1423072286e 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -8,13 +8,17 @@
 #define PR_TASK_PERF_COUNTERS_DISABLE   31
 #define PR_TASK_PERF_COUNTERS_ENABLE    32
 
-#define rdclock()					\
-({							\
-	struct timespec ts;				\
-							\
-	clock_gettime(CLOCK_MONOTONIC, &ts);		\
-	ts.tv_sec * 1000000000ULL + ts.tv_nsec;		\
-})
+#ifndef NSEC_PER_SEC
+# define NSEC_PER_SEC			1000000000ULL
+#endif
+
+static inline unsigned long long rdclock(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
 
 /*
  * Pick up some kernel type conventions:
-- 
GitLab


From 7bd5469cd938eec6a76b3135e6becd9b5e096e98 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:42:47 +0200
Subject: [PATCH 0783/2198] perfcounter tools: fix pointer mismatch

Neither process_options nor execvp take an const **char as argument.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/perf_counter/builtin-record.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 59f1d87f41e93..3a3deb3fbbc06 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -332,7 +332,7 @@ static void sigchld_handler(int sig)
 		done = 1;
 }
 
-int cmd_record(int argc, const char **argv)
+int cmd_record(int argc, char **argv)
 {
 	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-- 
GitLab


From 4ba67c1d48aeedcc31630bb40b6179fc7d360f90 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 1 May 2009 18:48:06 +0200
Subject: [PATCH 0784/2198] perfcounter tools: get the syscall number from
 arch/*/include/asm/unistd.h

Avoid further confusion during development

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/perf_counter/perf.h | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index fb1423072286e..6fa3656399f44 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -26,20 +26,14 @@ static inline unsigned long long rdclock(void)
 #define __user
 #define asmlinkage
 
-#ifdef __x86_64__
-#define __NR_perf_counter_open 298
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __i386__
-#define __NR_perf_counter_open 336
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");
 #endif
 
 #ifdef __powerpc__
-#define __NR_perf_counter_open 319
+#include "../../arch/powerpc/include/asm/unistd.h"
 #define rmb()		asm volatile ("sync" ::: "memory")
 #define cpu_relax()	asm volatile ("" ::: "memory");
 #endif
-- 
GitLab


From 75507efb1372b6acf1aa6bf00ebd49ce196fd994 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 12:58:36 -0400
Subject: [PATCH 0785/2198] ext4: Don't avoid using BLOCK_UNINIT block groups
 in mballoc

By avoiding the use of not-yet-used block groups (i.e., block groups
with the BLOCK_UNINIT flag), mballoc had a tendency to create large
files with large non-contiguous gaps.  In addition avoiding the use of
new block groups had a tendency to push regular file data into the
first block group in a flex_bg group, which slows down the speed of
e2fsck pass 2, since it has a tendency to seek much more.  For
example:

               Before Patch                       After Patch
              Time in seconds                   Time in seconds
            Real /  User/  Sys   MB/s      Real /  User/  Sys    MB/s
Pass 1      8.52 / 2.21 / 0.46  20.43      8.84 / 4.97 / 1.11   19.68
Pass 2     21.16 / 1.02 / 1.86  11.30      6.54 / 1.77 / 1.78   36.39
Pass 3      0.01 / 0.00 / 0.00 139.00      0.01 / 0.01 / 0.00  128.90
Pass 4      0.16 / 0.15 / 0.00   0.00      0.17 / 0.17 / 0.00    0.00
Pass 5      2.52 / 1.99 / 0.09   0.79      2.31 / 1.78 / 0.06    0.86
Total      32.40 / 5.11 / 2.49  12.81     17.99 / 8.75 / 2.98   23.01

This was on a sample 80 gig root filesystem which was approximately
50% full.  Note the improved e2fsck pass 2 performance, by over a
factor of 3, due to a decreased number of seeks.  (The total amount of
I/O in pass 2 was unchanged; the layout of the directory blocks was
simply much better from e2fsck's's perspective.)

Other changes as a result of this patch on this sample filesystem:

                             Before Patch    After Patch
# of non-contig files           762             779
# of non-contig directories     571             570
# of BLOCK_UNINIT bg's          307             293
# of INODE_UNINIT bg's          503             503

Out of 640 block groups, of which 333 were in use, this patch caused
an extra 14 block groups to be utilized.  The number of non-contiguous
files did go up slightly, but when measured against the 99.9% of the
files (603,154) which were contiguously allocated, this is pretty
insignificant.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andreas Dilger <adilger@sun.com>
---
 fs/ext4/mballoc.c | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index c3af9e6b66683..dbd47eac13ecf 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -1728,7 +1728,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	unsigned free, fragments;
 	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
-	struct ext4_group_desc *desc;
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
 	BUG_ON(cr < 0 || cr >= 4);
@@ -1744,10 +1743,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	switch (cr) {
 	case 0:
 		BUG_ON(ac->ac_2order == 0);
-		/* If this group is uninitialized, skip it initially */
-		desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
-		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-			return 0;
 
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -2067,9 +2062,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 
 			ac->ac_groups_scanned++;
 			desc = ext4_get_group_desc(sb, group, NULL);
-			if (cr == 0 || (desc->bg_flags &
-					cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
-					ac->ac_2order != 0))
+			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
 					ac->ac_g_ex.fe_len == sbi->s_stripe)
-- 
GitLab


From 15e957d08dd4a841359cfec59ecb74041e0097aa Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 30 Apr 2009 01:17:50 -0700
Subject: [PATCH 0786/2198] x86/irq: use move_irq_desc() in create_irq_nr()

move_irq_desc() will try to move irq_desc to the home node if
the allocated one is not correct, in create_irq_nr().

( This can happen on devices that are on different nodes that
  are using MSI, when drivers are loaded and unloaded randomly. )

v2: fix non-smp build
v3: add NUMA_IRQ_DESC to eliminate #ifdefs

[ Impact: improve irq descriptor locality on NUMA systems ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <49F95EAE.2050903@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig               |  4 ++++
 arch/x86/kernel/apic/io_apic.c |  6 +-----
 include/linux/irq.h            | 11 +++++++++--
 kernel/irq/Makefile            |  2 +-
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e1b2543f8ed3f..674e21e9f0a0c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -274,6 +274,10 @@ config SPARSE_IRQ
 
 	  If you don't know what to do here, say N.
 
+config NUMA_IRQ_DESC
+	def_bool y
+	depends on SPARSE_IRQ && NUMA
+
 config X86_MPPARSE
 	bool "Enable MPS table" if ACPI
 	default y
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 9cd4806cdf5f6..e583291fe6c36 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3197,11 +3197,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
 		if (cfg_new->vector != 0)
 			continue;
 
-#ifdef CONFIG_NUMA_IRQ_DESC
-		/* different node ?*/
-		if (desc_new->node != node)
-			desc = move_irq_desc(desc, node);
-#endif
+		desc_new = move_irq_desc(desc_new, node);
 
 		if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
 			irq = new;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 4b95ddb5304b5..eedbb8e5e0ccf 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -206,9 +206,16 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc
 
 #ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
-#else /* CONFIG_SPARSE_IRQ */
+#endif
+
+#ifdef CONFIG_NUMA_IRQ_DESC
 extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
-#endif /* CONFIG_SPARSE_IRQ */
+#else
+static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
+{
+	return desc;
+}
+#endif
 
 extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
 
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 2f065277f8ee2..7d047808419da 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_SPARSE_IRQ) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
-- 
GitLab


From d444c3c38189b3f18337a213855ac1c07af4e2d9 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 13:44:33 -0400
Subject: [PATCH 0787/2198] ext4: Move the ext4_i.h header file into ext4.h

There is no longer a reason for a separate ext4_i.h header file, so
move it into ext4.h just to make life easier for developers to find
the relevant data structures and typedefs.  Should also speed up
compiles slightly, too.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h   | 122 ++++++++++++++++++++++++++++++++++++++++-
 fs/ext4/ext4_i.h | 140 -----------------------------------------------
 2 files changed, 121 insertions(+), 141 deletions(-)
 delete mode 100644 fs/ext4/ext4_i.h

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 02ec44bf38e60..ba57d669cb659 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -21,7 +21,10 @@
 #include <linux/magic.h>
 #include <linux/jbd2.h>
 #include <linux/quota.h>
-#include "ext4_i.h"
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
 
 /*
  * The fourth extended filesystem constants/structures
@@ -46,6 +49,19 @@
 #define ext4_debug(f, a...)	do {} while (0)
 #endif
 
+/* data type for block offset of block group */
+typedef int ext4_grpblk_t;
+
+/* data type for filesystem-wide blocks number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for file logical block number */
+typedef __u32 ext4_lblk_t;
+
+/* data type for block group number */
+typedef unsigned int ext4_group_t;
+
+
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE		1
 /* blocks already reserved */
@@ -515,6 +531,110 @@ do {									       \
 
 #endif /* defined(__KERNEL__) || defined(__linux__) */
 
+/*
+ * storage for cached extent
+ */
+struct ext4_ext_cache {
+	ext4_fsblk_t	ec_start;
+	ext4_lblk_t	ec_block;
+	__u32		ec_len; /* must be 32bit to return holes */
+	__u32		ec_type;
+};
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+	__le32	i_data[15];	/* unconverted */
+	__u32	i_flags;
+	ext4_fsblk_t	i_file_acl;
+	__u32	i_dtime;
+
+	/*
+	 * i_block_group is the number of the block group which contains
+	 * this file's inode.  Constant across the lifetime of the inode,
+	 * it is ued for making block allocation decisions - we try to
+	 * place a file's data blocks near its inode block, and new inodes
+	 * near to their parent directory's inode.
+	 */
+	ext4_group_t	i_block_group;
+	__u32	i_state;		/* Dynamic state flags for ext4 */
+
+	ext4_lblk_t		i_dir_start_lookup;
+#ifdef CONFIG_EXT4_FS_XATTR
+	/*
+	 * Extended attributes can be read independently of the main file
+	 * data. Taking i_mutex even when reading would cause contention
+	 * between readers of EAs and writers of regular file data, so
+	 * instead we synchronize on xattr_sem when reading or changing
+	 * EAs.
+	 */
+	struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	struct posix_acl	*i_acl;
+	struct posix_acl	*i_default_acl;
+#endif
+
+	struct list_head i_orphan;	/* unlinked but open inodes */
+
+	/*
+	 * i_disksize keeps track of what the inode size is ON DISK, not
+	 * in memory.  During truncate, i_size is set to the new size by
+	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
+	 * set i_disksize to 0 until the truncate is actually under way.
+	 *
+	 * The intent is that i_disksize always represents the blocks which
+	 * are used by this file.  This allows recovery to restart truncate
+	 * on orphans if we crash during truncate.  We actually write i_disksize
+	 * into the on-disk inode when writing inodes out, instead of i_size.
+	 *
+	 * The only time when i_disksize and i_size may be different is when
+	 * a truncate is in progress.  The only things which change i_disksize
+	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+	 */
+	loff_t	i_disksize;
+
+	/*
+	 * i_data_sem is for serialising ext4_truncate() against
+	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
+	 * data tree are chopped off during truncate. We can't do that in
+	 * ext4 because whenever we perform intermediate commits during
+	 * truncate, the inode and all the metadata blocks *must* be in a
+	 * consistent state which allows truncation of the orphans to restart
+	 * during recovery.  Hence we must fix the get_block-vs-truncate race
+	 * by other means, so we have i_data_sem.
+	 */
+	struct rw_semaphore i_data_sem;
+	struct inode vfs_inode;
+	struct jbd2_inode jinode;
+
+	struct ext4_ext_cache i_cached_extent;
+	/*
+	 * File creation time. Its function is same as that of
+	 * struct timespec i_{a,c,m}time in the generic inode.
+	 */
+	struct timespec i_crtime;
+
+	/* mballoc */
+	struct list_head i_prealloc_list;
+	spinlock_t i_prealloc_lock;
+
+	/* ialloc */
+	ext4_group_t	i_last_alloc_group;
+
+	/* allocation reservation info for delalloc */
+	unsigned int i_reserved_data_blocks;
+	unsigned int i_reserved_meta_blocks;
+	unsigned int i_allocated_meta_blocks;
+	unsigned short i_delalloc_reserved_flag;
+
+	/* on-disk additional length */
+	__u16 i_extra_isize;
+
+	spinlock_t i_block_reservation_lock;
+};
+
 /*
  * File system states
  */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644
index 4ce2187123aad..0000000000000
--- a/fs/ext4/ext4_i.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- *  ext4_i.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_i.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#ifndef _EXT4_I
-#define _EXT4_I
-
-#include <linux/rwsem.h>
-#include <linux/rbtree.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-
-/* data type for block offset of block group */
-typedef int ext4_grpblk_t;
-
-/* data type for filesystem-wide blocks number */
-typedef unsigned long long ext4_fsblk_t;
-
-/* data type for file logical block number */
-typedef __u32 ext4_lblk_t;
-
-/* data type for block group number */
-typedef unsigned int ext4_group_t;
-
-/*
- * storage for cached extent
- */
-struct ext4_ext_cache {
-	ext4_fsblk_t	ec_start;
-	ext4_lblk_t	ec_block;
-	__u32		ec_len; /* must be 32bit to return holes */
-	__u32		ec_type;
-};
-
-/*
- * fourth extended file system inode data in memory
- */
-struct ext4_inode_info {
-	__le32	i_data[15];	/* unconverted */
-	__u32	i_flags;
-	ext4_fsblk_t	i_file_acl;
-	__u32	i_dtime;
-
-	/*
-	 * i_block_group is the number of the block group which contains
-	 * this file's inode.  Constant across the lifetime of the inode,
-	 * it is ued for making block allocation decisions - we try to
-	 * place a file's data blocks near its inode block, and new inodes
-	 * near to their parent directory's inode.
-	 */
-	ext4_group_t	i_block_group;
-	__u32	i_state;		/* Dynamic state flags for ext4 */
-
-	ext4_lblk_t		i_dir_start_lookup;
-#ifdef CONFIG_EXT4_FS_XATTR
-	/*
-	 * Extended attributes can be read independently of the main file
-	 * data. Taking i_mutex even when reading would cause contention
-	 * between readers of EAs and writers of regular file data, so
-	 * instead we synchronize on xattr_sem when reading or changing
-	 * EAs.
-	 */
-	struct rw_semaphore xattr_sem;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	struct posix_acl	*i_acl;
-	struct posix_acl	*i_default_acl;
-#endif
-
-	struct list_head i_orphan;	/* unlinked but open inodes */
-
-	/*
-	 * i_disksize keeps track of what the inode size is ON DISK, not
-	 * in memory.  During truncate, i_size is set to the new size by
-	 * the VFS prior to calling ext4_truncate(), but the filesystem won't
-	 * set i_disksize to 0 until the truncate is actually under way.
-	 *
-	 * The intent is that i_disksize always represents the blocks which
-	 * are used by this file.  This allows recovery to restart truncate
-	 * on orphans if we crash during truncate.  We actually write i_disksize
-	 * into the on-disk inode when writing inodes out, instead of i_size.
-	 *
-	 * The only time when i_disksize and i_size may be different is when
-	 * a truncate is in progress.  The only things which change i_disksize
-	 * are ext4_get_block (growth) and ext4_truncate (shrinkth).
-	 */
-	loff_t	i_disksize;
-
-	/*
-	 * i_data_sem is for serialising ext4_truncate() against
-	 * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
-	 * data tree are chopped off during truncate. We can't do that in
-	 * ext4 because whenever we perform intermediate commits during
-	 * truncate, the inode and all the metadata blocks *must* be in a
-	 * consistent state which allows truncation of the orphans to restart
-	 * during recovery.  Hence we must fix the get_block-vs-truncate race
-	 * by other means, so we have i_data_sem.
-	 */
-	struct rw_semaphore i_data_sem;
-	struct inode vfs_inode;
-	struct jbd2_inode jinode;
-
-	struct ext4_ext_cache i_cached_extent;
-	/*
-	 * File creation time. Its function is same as that of
-	 * struct timespec i_{a,c,m}time in the generic inode.
-	 */
-	struct timespec i_crtime;
-
-	/* mballoc */
-	struct list_head i_prealloc_list;
-	spinlock_t i_prealloc_lock;
-
-	/* ialloc */
-	ext4_group_t	i_last_alloc_group;
-
-	/* allocation reservation info for delalloc */
-	unsigned int i_reserved_data_blocks;
-	unsigned int i_reserved_meta_blocks;
-	unsigned int i_allocated_meta_blocks;
-	unsigned short i_delalloc_reserved_flag;
-
-	/* on-disk additional length */
-	__u16 i_extra_isize;
-
-	spinlock_t i_block_reservation_lock;
-};
-
-#endif	/* _EXT4_I */
-- 
GitLab


From ca0faba0e8ac844dc0279825eb8db876b5962ea5 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 3 May 2009 16:33:44 -0400
Subject: [PATCH 0788/2198] ext4: Move the ext4_sb.h header file into ext4.h

There is no longer a reason for a separate ext4_sb.h header file, so
move it into ext4.h just to make life easier for developers to find
the relevant data structures and typedefs.  Should also speed up
compiles slightly, too.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    | 144 ++++++++++++++++++++++++++++++++++++++--
 fs/ext4/ext4_sb.h | 163 ----------------------------------------------
 2 files changed, 140 insertions(+), 167 deletions(-)
 delete mode 100644 fs/ext4/ext4_sb.h

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index ba57d669cb659..af3c906e705bb 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -25,6 +25,10 @@
 #include <linux/rbtree.h>
 #include <linux/seqlock.h>
 #include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
 
 /*
  * The fourth extended filesystem constants/structures
@@ -195,9 +199,6 @@ struct flex_groups {
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
 
-#ifdef __KERNEL__
-#include "ext4_sb.h"
-#endif
 /*
  * Macro-instructions used to manage group descriptors
  */
@@ -809,6 +810,136 @@ struct ext4_super_block {
 };
 
 #ifdef __KERNEL__
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
+	unsigned long s_inodes_per_block;/* Number of inodes per block */
+	unsigned long s_blocks_per_group;/* Number of blocks in a group */
+	unsigned long s_inodes_per_group;/* Number of inodes in a group */
+	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
+	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
+	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
+	ext4_group_t s_groups_count;	/* Number of groups in the fs */
+	unsigned long s_overhead_last;  /* Last calculated overhead */
+	unsigned long s_blocks_last;    /* Last seen block count */
+	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
+	struct buffer_head * s_sbh;	/* Buffer containing the super block */
+	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
+	struct buffer_head **s_group_desc;
+	unsigned long  s_mount_opt;
+	ext4_fsblk_t s_sb_block;
+	uid_t s_resuid;
+	gid_t s_resgid;
+	unsigned short s_mount_state;
+	unsigned short s_pad;
+	int s_addr_per_block_bits;
+	int s_desc_per_block_bits;
+	int s_inode_size;
+	int s_first_ino;
+	unsigned int s_inode_readahead_blks;
+	spinlock_t s_next_gen_lock;
+	u32 s_next_generation;
+	u32 s_hash_seed[4];
+	int s_def_hash_version;
+	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
+	struct percpu_counter s_freeblocks_counter;
+	struct percpu_counter s_freeinodes_counter;
+	struct percpu_counter s_dirs_counter;
+	struct percpu_counter s_dirtyblocks_counter;
+	struct blockgroup_lock *s_blockgroup_lock;
+	struct proc_dir_entry *s_proc;
+	struct kobject s_kobj;
+	struct completion s_kobj_unregister;
+
+	/* Journaling */
+	struct inode *s_journal_inode;
+	struct journal_s *s_journal;
+	struct list_head s_orphan;
+	struct mutex s_orphan_lock;
+	struct mutex s_resize_lock;
+	unsigned long s_commit_interval;
+	u32 s_max_batch_time;
+	u32 s_min_batch_time;
+	struct block_device *journal_bdev;
+#ifdef CONFIG_JBD2_DEBUG
+	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
+	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
+#endif
+#ifdef CONFIG_QUOTA
+	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
+	int s_jquota_fmt;			/* Format of quota to use */
+#endif
+	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+
+#ifdef EXTENTS_STATS
+	/* ext4 extents stats */
+	unsigned long s_ext_min;
+	unsigned long s_ext_max;
+	unsigned long s_depth_max;
+	spinlock_t s_ext_stats_lock;
+	unsigned long s_ext_blocks;
+	unsigned long s_ext_extents;
+#endif
+
+	/* for buddy allocator */
+	struct ext4_group_info ***s_group_info;
+	struct inode *s_buddy_cache;
+	long s_blocks_reserved;
+	spinlock_t s_reserve_lock;
+	spinlock_t s_md_lock;
+	tid_t s_last_transaction;
+	unsigned short *s_mb_offsets;
+	unsigned int *s_mb_maxs;
+
+	/* tunables */
+	unsigned long s_stripe;
+	unsigned int s_mb_stream_request;
+	unsigned int s_mb_max_to_scan;
+	unsigned int s_mb_min_to_scan;
+	unsigned int s_mb_stats;
+	unsigned int s_mb_order2_reqs;
+	unsigned int s_mb_group_prealloc;
+	/* where last allocation was done - for stream allocation */
+	unsigned long s_mb_last_group;
+	unsigned long s_mb_last_start;
+
+	/* history to debug policy */
+	struct ext4_mb_history *s_mb_history;
+	int s_mb_history_cur;
+	int s_mb_history_max;
+	int s_mb_history_num;
+	spinlock_t s_mb_history_lock;
+	int s_mb_history_filter;
+
+	/* stats for buddy allocator */
+	spinlock_t s_mb_pa_lock;
+	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
+	atomic_t s_bal_success;	/* we found long enough chunks */
+	atomic_t s_bal_allocated;	/* in blocks */
+	atomic_t s_bal_ex_scanned;	/* total extents scanned */
+	atomic_t s_bal_goals;	/* goal hits */
+	atomic_t s_bal_breaks;	/* too long searches */
+	atomic_t s_bal_2orders;	/* 2^order hits */
+	spinlock_t s_bal_lock;
+	unsigned long s_mb_buddies_generated;
+	unsigned long long s_mb_generation_time;
+	atomic_t s_mb_lost_chunks;
+	atomic_t s_mb_preallocated;
+	atomic_t s_mb_discarded;
+
+	/* locality groups */
+	struct ext4_locality_group *s_locality_groups;
+
+	/* for write statistics */
+	unsigned long s_sectors_written_start;
+	u64 s_kbytes_written;
+
+	unsigned int s_log_groups_per_flex;
+	struct flex_groups *s_flex_groups;
+};
+
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
@@ -824,7 +955,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
 		current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
 }
 
-
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
 	return ino == EXT4_ROOT_INO ||
@@ -833,6 +963,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
+
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{
+	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
+}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644
index 2d36223d5f579..0000000000000
--- a/fs/ext4/ext4_sb.h
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- *  ext4_sb.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_sb.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#ifndef _EXT4_SB
-#define _EXT4_SB
-
-#ifdef __KERNEL__
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
-#endif
-#include <linux/rbtree.h>
-
-/*
- * fourth extended-fs super-block data in memory
- */
-struct ext4_sb_info {
-	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
-	unsigned long s_inodes_per_block;/* Number of inodes per block */
-	unsigned long s_blocks_per_group;/* Number of blocks in a group */
-	unsigned long s_inodes_per_group;/* Number of inodes in a group */
-	unsigned long s_itb_per_group;	/* Number of inode table blocks per group */
-	unsigned long s_gdb_count;	/* Number of group descriptor blocks */
-	unsigned long s_desc_per_block;	/* Number of group descriptors per block */
-	ext4_group_t s_groups_count;	/* Number of groups in the fs */
-	unsigned long s_overhead_last;  /* Last calculated overhead */
-	unsigned long s_blocks_last;    /* Last seen block count */
-	loff_t s_bitmap_maxbytes;	/* max bytes for bitmap files */
-	struct buffer_head * s_sbh;	/* Buffer containing the super block */
-	struct ext4_super_block *s_es;	/* Pointer to the super block in the buffer */
-	struct buffer_head **s_group_desc;
-	unsigned long  s_mount_opt;
-	ext4_fsblk_t s_sb_block;
-	uid_t s_resuid;
-	gid_t s_resgid;
-	unsigned short s_mount_state;
-	unsigned short s_pad;
-	int s_addr_per_block_bits;
-	int s_desc_per_block_bits;
-	int s_inode_size;
-	int s_first_ino;
-	unsigned int s_inode_readahead_blks;
-	spinlock_t s_next_gen_lock;
-	u32 s_next_generation;
-	u32 s_hash_seed[4];
-	int s_def_hash_version;
-	int s_hash_unsigned;	/* 3 if hash should be signed, 0 if not */
-	struct percpu_counter s_freeblocks_counter;
-	struct percpu_counter s_freeinodes_counter;
-	struct percpu_counter s_dirs_counter;
-	struct percpu_counter s_dirtyblocks_counter;
-	struct blockgroup_lock *s_blockgroup_lock;
-	struct proc_dir_entry *s_proc;
-	struct kobject s_kobj;
-	struct completion s_kobj_unregister;
-
-	/* Journaling */
-	struct inode *s_journal_inode;
-	struct journal_s *s_journal;
-	struct list_head s_orphan;
-	struct mutex s_orphan_lock;
-	struct mutex s_resize_lock;
-	unsigned long s_commit_interval;
-	u32 s_max_batch_time;
-	u32 s_min_batch_time;
-	struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
-	struct timer_list turn_ro_timer;	/* For turning read-only (crash simulation) */
-	wait_queue_head_t ro_wait_queue;	/* For people waiting for the fs to go read-only */
-#endif
-#ifdef CONFIG_QUOTA
-	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
-	int s_jquota_fmt;			/* Format of quota to use */
-#endif
-	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-
-#ifdef EXTENTS_STATS
-	/* ext4 extents stats */
-	unsigned long s_ext_min;
-	unsigned long s_ext_max;
-	unsigned long s_depth_max;
-	spinlock_t s_ext_stats_lock;
-	unsigned long s_ext_blocks;
-	unsigned long s_ext_extents;
-#endif
-
-	/* for buddy allocator */
-	struct ext4_group_info ***s_group_info;
-	struct inode *s_buddy_cache;
-	long s_blocks_reserved;
-	spinlock_t s_reserve_lock;
-	spinlock_t s_md_lock;
-	tid_t s_last_transaction;
-	unsigned short *s_mb_offsets;
-	unsigned int *s_mb_maxs;
-
-	/* tunables */
-	unsigned long s_stripe;
-	unsigned int s_mb_stream_request;
-	unsigned int s_mb_max_to_scan;
-	unsigned int s_mb_min_to_scan;
-	unsigned int s_mb_stats;
-	unsigned int s_mb_order2_reqs;
-	unsigned int s_mb_group_prealloc;
-	/* where last allocation was done - for stream allocation */
-	unsigned long s_mb_last_group;
-	unsigned long s_mb_last_start;
-
-	/* history to debug policy */
-	struct ext4_mb_history *s_mb_history;
-	int s_mb_history_cur;
-	int s_mb_history_max;
-	int s_mb_history_num;
-	spinlock_t s_mb_history_lock;
-	int s_mb_history_filter;
-
-	/* stats for buddy allocator */
-	spinlock_t s_mb_pa_lock;
-	atomic_t s_bal_reqs;	/* number of reqs with len > 1 */
-	atomic_t s_bal_success;	/* we found long enough chunks */
-	atomic_t s_bal_allocated;	/* in blocks */
-	atomic_t s_bal_ex_scanned;	/* total extents scanned */
-	atomic_t s_bal_goals;	/* goal hits */
-	atomic_t s_bal_breaks;	/* too long searches */
-	atomic_t s_bal_2orders;	/* 2^order hits */
-	spinlock_t s_bal_lock;
-	unsigned long s_mb_buddies_generated;
-	unsigned long long s_mb_generation_time;
-	atomic_t s_mb_lost_chunks;
-	atomic_t s_mb_preallocated;
-	atomic_t s_mb_discarded;
-
-	/* locality groups */
-	struct ext4_locality_group *s_locality_groups;
-
-	/* for write statistics */
-	unsigned long s_sectors_written_start;
-	u64 s_kbytes_written;
-
-	unsigned int s_log_groups_per_flex;
-	struct flex_groups *s_flex_groups;
-};
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
-	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-
-#endif	/* _EXT4_SB */
-- 
GitLab


From 596397b77c895d0fa3674f579c94ad5ea88ef01d Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 13:49:15 -0400
Subject: [PATCH 0789/2198] ext4: Move fs/ext4/namei.h into ext4.h

The fs/ext4/namei.h header file had only a single function
declaration, and should have never been a standalone file.  Move it
into ext4.h, where should have been from the beginning.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 1 +
 fs/ext4/namei.c | 1 -
 fs/ext4/namei.h | 8 --------
 fs/ext4/super.c | 1 -
 4 files changed, 1 insertion(+), 10 deletions(-)
 delete mode 100644 fs/ext4/namei.h

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index af3c906e705bb..d9c5251d082c4 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1594,6 +1594,7 @@ extern const struct file_operations ext4_file_operations;
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 8018e49a72875..c9690b250e5e1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -37,7 +37,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
-#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
 
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644
index 5e4dfff36a00b..0000000000000
--- a/fs/ext4/namei.h
+++ /dev/null
@@ -1,8 +0,0 @@
-/*  linux/fs/ext4/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- *	Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext4_get_parent(struct dentry *child);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1fbf0906ae2ef..d79e1c428b4a0 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -46,7 +46,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "namei.h"
 #include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
-- 
GitLab


From 6f0aced639d346e5f54eea9fcb2784b633493d09 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Fri, 1 May 2009 23:54:25 +0400
Subject: [PATCH 0790/2198] x86, apic: use pr_ macro

Replace recenly appeared printk with pr_ macro
(the file already use a lot of them).

[ Impact: cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090501195425.GB4633@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 28f747d61d787..e258bedce7cba 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2191,7 +2191,7 @@ static int __cpuinit set_multi(const struct dmi_system_id *d)
 {
 	if (multi)
 		return 0;
-	printk(KERN_INFO "APIC: %s detected, Multi Chassis\n", d->ident);
+	pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
 	multi = 1;
 	return 0;
 }
-- 
GitLab


From bb23c20a851a5038b255a3c0d0aa56093c1da3f8 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Fri, 1 May 2009 19:44:44 -0400
Subject: [PATCH 0791/2198] ext4: Move fs/ext4/group.h into ext4.h

Move the function prototypes in group.h into ext4.h so they are all
defined in one place.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  |  1 -
 fs/ext4/ext4.h    | 17 +++++++++++++++++
 fs/ext4/group.h   | 29 -----------------------------
 fs/ext4/ialloc.c  |  1 -
 fs/ext4/mballoc.h |  1 -
 fs/ext4/resize.c  |  1 -
 fs/ext4/super.c   |  1 -
 7 files changed, 17 insertions(+), 34 deletions(-)
 delete mode 100644 fs/ext4/group.h

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index a5ba039850c5e..92f557d957d94 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -19,7 +19,6 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
 #include "mballoc.h"
 
 /*
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d9c5251d082c4..5973f3261b0c5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1270,6 +1270,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+				      ext4_group_t block_group);
+extern unsigned ext4_init_block_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t group,
+				       struct ext4_group_desc *desc);
+#define ext4_free_blocks_after_init(sb, group, desc)			\
+		ext4_init_block_bitmap(sb, NULL, group, desc)
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1294,6 +1302,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+				       struct buffer_head *bh,
+				       ext4_group_t group,
+				       struct ext4_group_desc *desc);
+extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1417,6 +1430,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
 				struct ext4_group_desc *bg, __u32 count);
 extern void ext4_itable_unused_set(struct super_block *sb,
 				   struct ext4_group_desc *bg, __u32 count);
+extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
+				   struct ext4_group_desc *gdp);
+extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+				       struct ext4_group_desc *gdp);
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644
index c2c0a8d06d0e0..0000000000000
--- a/fs/ext4/group.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  linux/fs/ext4/group.h
- *
- * Copyright (C) 2007 Cluster File Systems, Inc
- *
- * Author: Andreas Dilger <adilger@clusterfs.com>
- */
-
-#ifndef _LINUX_EXT4_GROUP_H
-#define _LINUX_EXT4_GROUP_H
-
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
-				   struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
-				       struct ext4_group_desc *gdp);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-				      ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc)			\
-		ext4_init_block_bitmap(sb, NULL, group, desc)
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-				       struct buffer_head *bh,
-				       ext4_group_t group,
-				       struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-#endif /* _LINUX_EXT4_GROUP_H */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 55ba419ca00bc..916d05c881cad 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -27,7 +27,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "group.h"
 
 /*
  * ialloc.c contains the inodes allocation and deallocation routines
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index dd9e6cd5f6cf4..75e34f69215bf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -23,7 +23,6 @@
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
-#include "group.h"
 
 /*
  * with AGGRESSIVE_CHECK allocator runs consistency checks over
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e8ded13b5cb13..27eb289eea370 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 
 #include "ext4_jbd2.h"
-#include "group.h"
 
 #define outside(b, first, last)	((b) < (first) || (b) >= (last))
 #define inside(b, first, last)	((b) >= (first) && (b) < (last))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d79e1c428b4a0..7903f20c80758 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -46,7 +46,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "group.h"
 
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
-- 
GitLab


From f40339031b04279c3fdde7ac5fe97db33b2a7694 Mon Sep 17 00:00:00 2001
From: Curt Wohlgemuth <curtw@google.com>
Date: Fri, 1 May 2009 20:27:20 -0400
Subject: [PATCH 0792/2198] ext4: Make the length of the mb_history file
 tunable

In memory-constrained systems with many partitions, the ~68K for each
partition for the mb_history buffer can be excessive.

This patch adds a new mount option, mb_history_length, as well as a
way of setting the default via a module parameter (or via a sysfs
parameter in /sys/module/ext4/parameter/default_mb_history_length).
If the mb_history_length is set to zero, the mb_history facility is
disabled entirely.

Signed-off-by: Curt Wohlgemuth <curtw@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/mballoc.c | 13 +++++++------
 fs/ext4/super.c   | 18 +++++++++++++++++-
 2 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index dbd47eac13ecf..df75855ae6f78 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb)
 
 	if (sbi->s_proc != NULL) {
 		remove_proc_entry("mb_groups", sbi->s_proc);
-		remove_proc_entry("mb_history", sbi->s_proc);
+		if (sbi->s_mb_history_max)
+			remove_proc_entry("mb_history", sbi->s_proc);
 	}
 	kfree(sbi->s_mb_history);
 }
@@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb)
 	int i;
 
 	if (sbi->s_proc != NULL) {
-		proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
-				 &ext4_mb_seq_history_fops, sb);
+		if (sbi->s_mb_history_max)
+			proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+					 &ext4_mb_seq_history_fops, sb);
 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 	}
 
-	sbi->s_mb_history_max = 1000;
 	sbi->s_mb_history_cur = 0;
 	spin_lock_init(&sbi->s_mb_history_lock);
 	i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-	sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+	sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
 	/* if we can't allocate history, then we simple won't use it */
 }
 
@@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	struct ext4_mb_history h;
 
-	if (unlikely(sbi->s_mb_history == NULL))
+	if (sbi->s_mb_history == NULL)
 		return;
 
 	if (!(ac->ac_op & sbi->s_mb_history_filter))
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 7903f20c80758..39223a52bc71b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -47,6 +47,13 @@
 #include "xattr.h"
 #include "acl.h"
 
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+		   int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+		 "Default number of entries saved for mb_history");
+
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 
@@ -1042,7 +1049,7 @@ enum {
 	Opt_journal_update, Opt_journal_dev,
 	Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-	Opt_data_err_abort, Opt_data_err_ignore,
+	Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
@@ -1088,6 +1095,7 @@ static const match_table_t tokens = {
 	{Opt_data_writeback, "data=writeback"},
 	{Opt_data_err_abort, "data_err=abort"},
 	{Opt_data_err_ignore, "data_err=ignore"},
+	{Opt_mb_history_length, "mb_history_length=%u"},
 	{Opt_offusrjquota, "usrjquota="},
 	{Opt_usrjquota, "usrjquota=%s"},
 	{Opt_offgrpjquota, "grpjquota="},
@@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_data_err_ignore:
 			clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
 			break;
+		case Opt_mb_history_length:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0)
+				return 0;
+			sbi->s_mb_history_max = option;
+			break;
 #ifdef CONFIG_QUOTA
 		case Opt_usrjquota:
 			qtype = USRQUOTA;
@@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
 	sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
 	sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+	sbi->s_mb_history_max = default_mb_history_length;
 
 	set_opt(sbi->s_mount_opt, BARRIER);
 
-- 
GitLab


From abc8746eb91fb01e8d411896f80f7687c0d8372e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 2 May 2009 22:54:32 -0400
Subject: [PATCH 0793/2198] ext4: hook fiemap operation for directories

Add fiemap callback for directories

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index c9690b250e5e1..f2bc160463b72 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2534,6 +2534,7 @@ const struct inode_operations ext4_dir_inode_operations = {
 	.removexattr	= generic_removexattr,
 #endif
 	.permission	= ext4_permission,
+	.fiemap         = ext4_fiemap,
 };
 
 const struct inode_operations ext4_special_inode_operations = {
-- 
GitLab


From 19ba0559f9ce104171ab16706893ce01f03ef116 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 13 May 2009 18:12:05 -0400
Subject: [PATCH 0794/2198] vfs: Enable FS_IOC_FIEMAP and FIGETBSZ for all
 filetypes

The fiemap and get_blk_size ioctls should be enabled even for
directories.  So move it outisde file_ioctl.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ioctl.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 82d9c42b8bac9..286f38dfc6c07 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
 	switch (cmd) {
 	case FIBMAP:
 		return ioctl_fibmap(filp, p);
-	case FS_IOC_FIEMAP:
-		return ioctl_fiemap(filp, arg);
-	case FIGETBSZ:
-		return put_user(inode->i_sb->s_blocksize, p);
 	case FIONREAD:
 		return put_user(i_size_read(inode) - filp->f_pos, p);
 	}
@@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
+	case FS_IOC_FIEMAP:
+		return ioctl_fiemap(filp, arg);
+
+	case FIGETBSZ:
+	{
+		struct inode *inode = filp->f_path.dentry->d_inode;
+		int __user *p = (int __user *)arg;
+		return put_user(inode->i_sb->s_blocksize, p);
+	}
+
 	default:
 		if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
-- 
GitLab


From c9877b205f6ce7943bb95281342f4001cc1c00ec Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Fri, 1 May 2009 23:32:06 -0400
Subject: [PATCH 0795/2198] ext4: fix for fiemap last-block test

Carl Henrik Lunde reported and debugged this; the test for the
last allocated block was comparing bytes to blocks in this test:

	if (logical + length - 1 == EXT_MAX_BLOCK ||
	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
		flags |= FIEMAP_EXTENT_LAST;

so any extent which ended right at 4G was stopping the extent
walk.  Just replacing these values with the extent block &
length should fix it.

Also give blksize_bits a saner type, and reverse the order
of the tests to make the more likely case tested first.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Reported-by: Carl Henrik Lunde <chlunde@ping.uio.no>
Tested-by: Carl Henrik Lunde <chlunde@ping.uio.no>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ea5c47608cea7..5f7295287de1d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3196,7 +3196,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 		       void *data)
 {
 	struct fiemap_extent_info *fieinfo = data;
-	unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+	unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
 	__u64	logical;
 	__u64	physical;
 	__u64	length;
@@ -3243,8 +3243,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	 *
 	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
 	 */
-	if (logical + length - 1 == EXT_MAX_BLOCK ||
-	    ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
+	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK)
 		flags |= FIEMAP_EXTENT_LAST;
 
 	error = fiemap_fill_next_extent(fieinfo, logical, physical,
-- 
GitLab


From a25cbd045a2ffc42787d4dbcbb9c7118f5f42732 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Fri, 1 May 2009 14:45:46 +0900
Subject: [PATCH 0796/2198] clocksource: setup mult_orig in
 clocksource_enable()

Setup clocksource mult_orig in clocksource_enable().

Clocksource drivers can save power by using keeping the
device clock disabled while the clocksource is unused.

In practice this means that the enable() and disable()
callbacks perform clk_enable() and clk_disable().

The enable() callback may also use clk_get_rate() to get
the clock rate from the clock framework. This information
can then be used to calculate the shift and mult variables.

Currently the mult_orig variable is setup from mult at
registration time only. This is conflicting with the above
case since the clock is disabled and the mult variable is
not yet calculated at the time of registration.

Moving the mult_orig setup code to clocksource_enable()
allows us to both handle the common case with no enable()
callback and the mult-changed-after-enable() case.

[ Impact: allow dynamic clock source usage ]

Signed-off-by: Magnus Damm <damm@igel.co.jp>
LKML-Reference: <20090501054546.8193.10688.sendpatchset@rx1.opensource.se>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clocksource.h | 10 +++++++++-
 kernel/time/clocksource.c   |  3 ---
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 5a40d14daa9fa..c56457c8334ee 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -288,7 +288,15 @@ static inline cycle_t clocksource_read(struct clocksource *cs)
  */
 static inline int clocksource_enable(struct clocksource *cs)
 {
-	return cs->enable ? cs->enable(cs) : 0;
+	int ret = 0;
+
+	if (cs->enable)
+		ret = cs->enable(cs);
+
+	/* save mult_orig on enable */
+	cs->mult_orig = cs->mult;
+
+	return ret;
 }
 
 /**
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ecfd7b5187e0a..80189f6f1c5a1 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c)
 	unsigned long flags;
 	int ret;
 
-	/* save mult_orig on registration */
-	c->mult_orig = c->mult;
-
 	spin_lock_irqsave(&clocksource_lock, flags);
 	ret = clocksource_enqueue(c);
 	if (!ret)
-- 
GitLab


From 7d27558c4138ac6b3684dea35c2f4379b940a7dd Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 1 May 2009 13:10:26 -0700
Subject: [PATCH 0797/2198] timekeeping: create arch_gettimeoffset
 infrastructure

Some arches don't supply their own clocksource. This is mainly the
case in architectures that get their inter-tick times by reading the
counter on their interval timer.  Since these timers wrap every tick,
they're not really useful as clocksources.  Wrapping them to act like
one is possible but not very efficient. So we provide a callout these
arches can implement for use with the jiffies clocksource to provide
finer then tick granular time.

[ Impact: ease the migration to generic time keeping ]

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/time.h      | 15 +++++++++++++++
 kernel/time/timekeeping.c |  7 +++++++
 2 files changed, 22 insertions(+)

diff --git a/include/linux/time.h b/include/linux/time.h
index 242f62499bb7b..ea16c1a01d518 100644
--- a/include/linux/time.h
+++ b/include/linux/time.h
@@ -113,6 +113,21 @@ struct timespec current_kernel_time(void);
 #define CURRENT_TIME		(current_kernel_time())
 #define CURRENT_TIME_SEC	((struct timespec) { get_seconds(), 0 })
 
+/* Some architectures do not supply their own clocksource.
+ * This is mainly the case in architectures that get their
+ * inter-tick times by reading the counter on their interval
+ * timer. Since these timers wrap every tick, they're not really
+ * useful as clocksources. Wrapping them to act like one is possible
+ * but not very efficient. So we provide a callout these arches
+ * can implement for use with the jiffies clocksource to provide
+ * finer then tick granular time.
+ */
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+extern u32 arch_gettimeoffset(void);
+#else
+static inline u32 arch_gettimeoffset(void) { return 0; }
+#endif
+
 extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7d..e97c50f8458b5 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -77,6 +77,10 @@ static void clocksource_forward_now(void)
 	clock->cycle_last = cycle_now;
 
 	nsec = cyc2ns(clock, cycle_delta);
+
+	/* If arch requires, add in gettimeoffset() */
+	nsec += arch_gettimeoffset();
+
 	timespec_add_ns(&xtime, nsec);
 
 	nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
@@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts)
 		/* convert to nanoseconds: */
 		nsecs = cyc2ns(clock, cycle_delta);
 
+		/* If arch requires, add in gettimeoffset() */
+		nsecs += arch_gettimeoffset();
+
 	} while (read_seqretry(&xtime_lock, seq));
 
 	timespec_add_ns(ts, nsecs);
-- 
GitLab


From eefd7f03b86b8a319890e7fac5a6fcc7f8694b76 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sat, 2 May 2009 19:05:37 -0400
Subject: [PATCH 0798/2198] ext4: fix the length returned by fiemap for an
 unallocated extent

If the file's blocks have not yet been allocated because of delayed
allocation, the length of the extent returned by fiemap is incorrect.
This commit fixes this bug.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 5f7295287de1d..4fec6b7463823 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3244,8 +3244,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
 	 * XXX this might miss a single-block extent at EXT_MAX_BLOCK
 	 */
 	if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
-	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK)
+	    newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
+		loff_t size = i_size_read(inode);
+		loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
+
 		flags |= FIEMAP_EXTENT_LAST;
+		if ((flags & FIEMAP_EXTENT_DELALLOC) &&
+		    logical+length > size)
+			length = (size - logical + bs - 1) & ~(bs-1);
+	}
 
 	error = fiemap_fill_next_extent(fieinfo, logical, physical,
 					length, flags);
-- 
GitLab


From 955ce5f5be67dfe0d1d096b543af33fe8a1ce3dd Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Sat, 2 May 2009 20:35:09 -0400
Subject: [PATCH 0799/2198] ext4: Convert ext4_lock_group to use sb_bgl_lock

We have sb_bgl_lock() and ext4_group_info.bb_state
bit spinlock to protech group information. The later is only
used within mballoc code. Consolidate them to use sb_bgl_lock().
This makes the mballoc.c code much simpler and also avoid
confusion with two locks protecting same info.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/balloc.c  | 12 ++++----
 fs/ext4/ext4.h    | 26 ++++++----------
 fs/ext4/ialloc.c  | 29 +++++++++---------
 fs/ext4/mballoc.c | 78 ++++++++++++++++-------------------------------
 fs/ext4/super.c   |  6 ++--
 5 files changed, 59 insertions(+), 92 deletions(-)

diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 92f557d957d94..e2126d70dff5b 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		ext4_init_block_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 	down_write(&grp->alloc_sem);
 	for (i = 0, blocks_freed = 0; i < count; i++) {
 		BUFFER_TRACE(bitmap_bh, "clear bit");
-		if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+		if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
 						bit + i, bitmap_bh->b_data)) {
 			ext4_error(sb, __func__,
 				   "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
 			blocks_freed++;
 		}
 	}
-	spin_lock(sb_bgl_lock(sbi, block_group));
+	ext4_lock_group(sb, block_group);
 	blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
 	ext4_free_blks_set(sb, desc, blk_free_count);
 	desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
 	if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5973f3261b0c5..149e02dc36065 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -963,12 +963,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 		(ino >= EXT4_FIRST_INO(sb) &&
 		 ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count));
 }
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
-	return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1568,33 +1562,31 @@ struct ext4_group_info {
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT	0
-#define EXT4_GROUP_INFO_LOCKED_BIT	1
 
 #define EXT4_MB_GRP_NEED_INIT(grp)	\
 	(test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+					      ext4_group_t group)
 {
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
+	return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
+}
 
-	bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+{
+	spin_lock(ext4_group_lock_ptr(sb, group));
 }
 
 static inline void ext4_unlock_group(struct super_block *sb,
 					ext4_group_t group)
 {
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+	spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
 static inline int ext4_is_group_locked(struct super_block *sb,
 					ext4_group_t group)
 {
-	struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-	return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-						&(grinfo->bb_state));
+	return spin_is_locked(ext4_group_lock_ptr(sb, group));
 }
 
 /*
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 916d05c881cad..82f7d1d7eae0e 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -122,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_lock_group(sb, block_group);
 	if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
 		ext4_init_inode_bitmap(sb, bh, block_group, desc);
 		set_bitmap_uptodate(bh);
 		set_buffer_uptodate(bh);
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+		ext4_unlock_group(sb, block_group);
 		unlock_buffer(bh);
 		return bh;
 	}
-	spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+	ext4_unlock_group(sb, block_group);
 	if (buffer_uptodate(bh)) {
 		/*
 		 * if not uninit if bh is uptodate,
@@ -246,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		goto error_return;
 
 	/* Ok, now we can actually update the inode bitmaps.. */
-	spin_lock(sb_bgl_lock(sbi, block_group));
-	cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+					bit, bitmap_bh->b_data);
 	if (!cleared)
 		ext4_error(sb, "ext4_free_inode",
 			   "bit already cleared for inode %lu", ino);
@@ -260,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 		if (fatal) goto error_return;
 
 		if (gdp) {
-			spin_lock(sb_bgl_lock(sbi, block_group));
+			ext4_lock_group(sb, block_group);
 			count = ext4_free_inodes_count(sb, gdp) + 1;
 			ext4_free_inodes_set(sb, gdp, count);
 			if (is_directory) {
@@ -276,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
 			}
 			gdp->bg_checksum = ext4_group_desc_csum(sbi,
 							block_group, gdp);
-			spin_unlock(sb_bgl_lock(sbi, block_group));
+			ext4_unlock_group(sb, block_group);
 			percpu_counter_inc(&sbi->s_freeinodes_counter);
 			if (is_directory)
 				percpu_counter_dec(&sbi->s_dirs_counter);
@@ -707,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 
 /*
  * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
+ * is uninit we need to take the groups's ext4_group_lock
  * and clear the uninit flag. The inode bitmap update
  * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * after holding ext4_group_lock so that ext4_read_inode_bitmap
  * doesn't race with the ext4_claim_inode
  */
 static int ext4_claim_inode(struct super_block *sb,
@@ -721,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
-	spin_lock(sb_bgl_lock(sbi, group));
+	ext4_lock_group(sb, group);
 	if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
 		/* not a free inode */
 		retval = 1;
@@ -730,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	ino++;
 	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
 			ino > EXT4_INODES_PER_GROUP(sb)) {
-		spin_unlock(sb_bgl_lock(sbi, group));
+		ext4_unlock_group(sb, group);
 		ext4_error(sb, __func__,
 			   "reserved inode or inode > inodes count - "
 			   "block_group = %u, inode=%lu", group,
@@ -779,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb,
 	}
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
-	spin_unlock(sb_bgl_lock(sbi, group));
+	ext4_unlock_group(sb, group);
 	return retval;
 }
 
@@ -935,7 +934,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 		}
 
 		free = 0;
-		spin_lock(sb_bgl_lock(sbi, group));
+		ext4_lock_group(sb, group);
 		/* recheck and clear flag under lock if we still need to */
 		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -944,7 +943,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
 			gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
 								gdp);
 		}
-		spin_unlock(sb_bgl_lock(sbi, group));
+		ext4_unlock_group(sb, group);
 
 		/* Don't need to dirty bitmap block if we didn't change it */
 		if (free) {
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index df75855ae6f78..e76459cedcdbe 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
 	ext4_set_bit(bit, addr);
 }
 
-static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-	addr = mb_correct_addr_and_bit(&bit, addr);
-	ext4_set_bit_atomic(lock, bit, addr);
-}
-
 static inline void mb_clear_bit(int bit, void *addr)
 {
 	addr = mb_correct_addr_and_bit(&bit, addr);
 	ext4_clear_bit(bit, addr);
 }
 
-static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-	addr = mb_correct_addr_and_bit(&bit, addr);
-	ext4_clear_bit_atomic(lock, bit, addr);
-}
-
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
 	int fix = 0, ret, tmpmax;
@@ -803,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			unlock_buffer(bh[i]);
 			continue;
 		}
-		spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+		ext4_lock_group(sb, first_group + i);
 		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 			ext4_init_block_bitmap(sb, bh[i],
 						first_group + i, desc);
 			set_bitmap_uptodate(bh[i]);
 			set_buffer_uptodate(bh[i]);
-			spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+			ext4_unlock_group(sb, first_group + i);
 			unlock_buffer(bh[i]);
 			continue;
 		}
-		spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+		ext4_unlock_group(sb, first_group + i);
 		if (buffer_uptodate(bh[i])) {
 			/*
 			 * if not uninit if bh is uptodate,
@@ -1080,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
 	return 0;
 }
 
-static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_clear_bits(void *bm, int cur, int len)
 {
 	__u32 *addr;
 
@@ -1093,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		if (lock)
-			mb_clear_bit_atomic(lock, cur, bm);
-		else
-			mb_clear_bit(cur, bm);
+		mb_clear_bit(cur, bm);
 		cur++;
 	}
 }
 
-static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_set_bits(void *bm, int cur, int len)
 {
 	__u32 *addr;
 
@@ -1114,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
 			cur += 32;
 			continue;
 		}
-		if (lock)
-			mb_set_bit_atomic(lock, cur, bm);
-		else
-			mb_set_bit(cur, bm);
+		mb_set_bit(cur, bm);
 		cur++;
 	}
 }
@@ -1332,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 	}
 
-	mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
-			EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
 
 	return ret;
@@ -2756,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
 	return 0;
 }
 
-/* need to called with ext4 group lock (ext4_lock_group) */
+/* need to called with the ext4 group lock held */
 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 {
 	struct ext4_prealloc_space *pa;
@@ -2993,14 +2974,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
 		 */
-		mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
-				bitmap_bh->b_data, ac->ac_b_ex.fe_start,
-				ac->ac_b_ex.fe_len);
+		ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+		mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+			    ac->ac_b_ex.fe_len);
+		ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 		err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
 		if (!err)
 			err = -EAGAIN;
 		goto out_err;
 	}
+
+	ext4_lock_group(sb, ac->ac_b_ex.fe_group);
 #ifdef AGGRESSIVE_CHECK
 	{
 		int i;
@@ -3010,9 +2994,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		}
 	}
 #endif
-	spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-	mb_set_bits(NULL, bitmap_bh->b_data,
-				ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+	mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
 	if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
 		gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
 		ext4_free_blks_set(sb, gdp,
@@ -3022,7 +3004,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 	len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
 	ext4_free_blks_set(sb, gdp, len);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
-	spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+
+	ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
 	percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
 	/*
 	 * Now reduce the dirty block count also. Should not go negative
@@ -3455,7 +3438,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
  * the function goes through all block freed in the group
  * but not yet committed and marks them used in in-core bitmap.
  * buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with the ext4 group lock held
  */
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group)
@@ -3469,9 +3452,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 
 	while (n) {
 		entry = rb_entry(n, struct ext4_free_data, node);
-		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
-				bitmap, entry->start_blk,
-				entry->count);
+		mb_set_bits(bitmap, entry->start_blk, entry->count);
 		n = rb_next(n);
 	}
 	return;
@@ -3480,7 +3461,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 /*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with ext4 group lock held
  */
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 					ext4_group_t group)
@@ -3512,8 +3493,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
 		if (unlikely(len == 0))
 			continue;
 		BUG_ON(groupnr != group);
-		mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
-						bitmap, start, len);
+		mb_set_bits(bitmap, start, len);
 		preallocated += len;
 		count++;
 	}
@@ -4856,29 +4836,25 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
 		new_entry->group  = block_group;
 		new_entry->count = count;
 		new_entry->t_tid = handle->h_transaction->t_tid;
+
 		ext4_lock_group(sb, block_group);
-		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-				bit, count);
+		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		ext4_mb_free_metadata(handle, &e4b, new_entry);
-		ext4_unlock_group(sb, block_group);
 	} else {
-		ext4_lock_group(sb, block_group);
 		/* need to update group_info->bb_free and bitmap
 		 * with group lock held. generate_buddy look at
 		 * them with group lock_held
 		 */
-		mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-				bit, count);
+		ext4_lock_group(sb, block_group);
+		mb_clear_bits(bitmap_bh->b_data, bit, count);
 		mb_free_blocks(inode, &e4b, bit, count);
 		ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-		ext4_unlock_group(sb, block_group);
 	}
 
-	spin_lock(sb_bgl_lock(sbi, block_group));
 	ret = ext4_free_blks_count(sb, gdp) + count;
 	ext4_free_blks_set(sb, gdp, ret);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
-	spin_unlock(sb_bgl_lock(sbi, block_group));
+	ext4_unlock_group(sb, block_group);
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
 	if (sbi->s_log_groups_per_flex) {
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 39223a52bc71b..dc34ed3d13279 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1784,18 +1784,18 @@ static int ext4_check_descriptors(struct super_block *sb)
 			       "(block %llu)!\n", i, inode_table);
 			return 0;
 		}
-		spin_lock(sb_bgl_lock(sbi, i));
+		ext4_lock_group(sb, i);
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
 			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
 			       "Checksum for group %u failed (%u!=%u)\n",
 			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
 			       gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
-				spin_unlock(sb_bgl_lock(sbi, i));
+				ext4_unlock_group(sb, i);
 				return 0;
 			}
 		}
-		spin_unlock(sb_bgl_lock(sbi, i));
+		ext4_unlock_group(sb, i);
 		if (!flexbg_flag)
 			first_block += EXT4_BLOCKS_PER_GROUP(sb);
 	}
-- 
GitLab


From dab6f6a3401f596fe934f41fc5da3f401adfdfb1 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sat, 2 May 2009 08:02:36 +0200
Subject: [PATCH 0800/2198] perf_counter tools: fix build error

ctype.h crawled out of the bit bucket :)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 3a3deb3fbbc06..ddfdcf86fb20a 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -15,7 +15,6 @@
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
-#include <ctype.h>
 #include <time.h>
 #include <sched.h>
 #include <pthread.h>
-- 
GitLab


From a454ab3110175d710f4f9a96226a26ce4d5d5de2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 3 May 2009 10:09:03 +0200
Subject: [PATCH 0801/2198] x86, mm: fault.c, use printk_once() in
 is_errata93()

Andrew pointed out that the 'once' variable has a needlessly
function-global scope. We can in fact eliminate it completely,
via the use of printk_once().

[ Impact: cleanup ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/fault.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 24a36a6426abd..b9ca6d767dbbf 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -514,8 +514,6 @@ static void dump_pagetable(unsigned long address)
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_64
-	static int once;
-
 	if (address != regs->ip)
 		return 0;
 
@@ -525,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
 	address |= 0xffffffffUL << 32;
 	if ((address >= (u64)_stext && address <= (u64)_etext) ||
 	    (address >= MODULES_VADDR && address <= MODULES_END)) {
-		if (!once) {
-			printk(errata93_warning);
-			once = 1;
-		}
+		printk_once(errata93_warning);
 		regs->ip = address;
 		return 1;
 	}
-- 
GitLab


From 0cd5f7b0c7fd96f9f00187ef2ad2328de28ae326 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 20:00:44 +0000
Subject: [PATCH 0802/2198] sh: remove obsolete no_irq_type

Impact: cleanup

convert the last remaining users to no_irq_chip

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/sh_ksyms_32.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c
index 528de2955c814..2d868c60aa6e3 100644
--- a/arch/sh/kernel/sh_ksyms_32.c
+++ b/arch/sh/kernel/sh_ksyms_32.c
@@ -19,13 +19,12 @@
 #include <asm/ftrace.h>
 
 extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
-extern struct hw_interrupt_type no_irq_type;
 
 /* platform dependent support */
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(kernel_thread);
 EXPORT_SYMBOL(irq_desc);
-EXPORT_SYMBOL(no_irq_type);
+EXPORT_SYMBOL(no_irq_chip);
 
 EXPORT_SYMBOL(strlen);
 
-- 
GitLab


From d80498398276ca8eee7ebdbe0d47e06d01317439 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 20:01:20 +0000
Subject: [PATCH 0803/2198] sh: remove obsolete hw_interrupt_type

Impact: cleanup

Convert the last remaining users to struct irq_chip and remove the
define.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/irq/imask.c    | 2 +-
 arch/sh/kernel/cpu/irq/intc-sh5.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index 301b505c42789..f43753b54e1d6 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -39,7 +39,7 @@ static unsigned int startup_imask_irq(unsigned int irq)
 	return 0; /* never anything pending */
 }
 
-static struct hw_interrupt_type imask_irq_type = {
+static struct irq_chip imask_irq_type = {
 	.typename = "SR.IMASK",
 	.startup = startup_imask_irq,
 	.shutdown = shutdown_imask_irq,
diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index 726f0335da76b..d8679a4d93a46 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -84,7 +84,7 @@ static void disable_intc_irq(unsigned int irq);
 static void mask_and_ack_intc(unsigned int);
 static void end_intc_irq(unsigned int irq);
 
-static struct hw_interrupt_type intc_irq_type = {
+static struct irq_chip intc_irq_type = {
 	.typename = "INTC",
 	.startup = startup_intc_irq,
 	.shutdown = shutdown_intc_irq,
-- 
GitLab


From 7563431107f6debf57c1dbecfb9498cf31a1c036 Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Fri, 1 May 2009 13:10:28 -0700
Subject: [PATCH 0804/2198] time: sh: convert to use arch_getoffset()
 infrastructure

Convert sh to use GENERIC_TIME via the arch_getoffset() infrastructure.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                    |  2 +-
 arch/sh/include/asm/timer.h        |  4 +-
 arch/sh/kernel/time_32.c           | 63 +++---------------------------
 arch/sh/kernel/time_64.c           | 53 +------------------------
 arch/sh/kernel/timers/timer-mtu2.c |  2 +-
 5 files changed, 11 insertions(+), 113 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7e96adea9602d..6f91478826d55 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -75,7 +75,7 @@ config GENERIC_IOMAP
 	bool
 
 config GENERIC_TIME
-	def_bool n
+	def_bool y
 
 config GENERIC_CLOCKEVENTS
 	def_bool n
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 8f80a55c04ae5..9c968d19cb9b7 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -9,7 +9,7 @@ struct sys_timer_ops {
 	int (*init)(void);
 	int (*start)(void);
 	int (*stop)(void);
-#ifndef CONFIG_GENERIC_TIME
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 	unsigned long (*get_offset)(void);
 #endif
 };
@@ -26,7 +26,7 @@ struct sys_timer {
 extern struct sys_timer tmu_timer, mtu2_timer;
 extern struct sys_timer *sys_timer;
 
-#ifndef CONFIG_GENERIC_TIME
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 static inline unsigned long get_timer_offset(void)
 {
 	return sys_timer->ops->get_offset();
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index 457332116e17d..9d34dff149942 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -83,65 +83,12 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-#ifndef CONFIG_GENERIC_TIME
-void do_gettimeofday(struct timeval *tv)
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
+u32 arch_gettimeoffset(void)
 {
-	unsigned long flags;
-	unsigned long seq;
-	unsigned long usec, sec;
-
-	do {
-		/*
-		 * Turn off IRQs when grabbing xtime_lock, so that
-		 * the sys_timer get_offset code doesn't have to handle it.
-		 */
-		seq = read_seqbegin_irqsave(&xtime_lock, flags);
-		usec = get_timer_offset();
-		sec = xtime.tv_sec;
-		usec += xtime.tv_nsec / NSEC_PER_USEC;
-	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
-
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	/*
-	 * This is revolting. We need to set "xtime" correctly. However, the
-	 * value in this location is the value at the most recent update of
-	 * wall time.  Discover what correction gettimeofday() would have
-	 * made, and then undo it!
-	 */
-	nsec -= get_timer_offset() * NSEC_PER_USEC;
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-
-	return 0;
+	return get_timer_offset() * 1000;
 }
-EXPORT_SYMBOL(do_settimeofday);
-#endif /* !CONFIG_GENERIC_TIME */
+#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */
 
 /* last time the RTC clock got updated */
 static long last_rtc_update;
@@ -238,7 +185,7 @@ struct clocksource clocksource_sh = {
 	.name		= "SuperH",
 };
 
-#ifdef CONFIG_GENERIC_TIME
+#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
 unsigned long long sched_clock(void)
 {
 	unsigned long long cycles;
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index 988c77c372312..f4f5e8ad5bec6 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -144,59 +144,10 @@ static unsigned long usecs_since_tick(void)
 	return result;
 }
 
-void do_gettimeofday(struct timeval *tv)
+u32 arch_gettimeoffset(void)
 {
-	unsigned long flags;
-	unsigned long seq;
-	unsigned long usec, sec;
-
-	do {
-		seq = read_seqbegin_irqsave(&xtime_lock, flags);
-		usec = usecs_since_tick();
-		sec = xtime.tv_sec;
-		usec += xtime.tv_nsec / 1000;
-	} while (read_seqretry_irqrestore(&xtime_lock, seq, flags));
-
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	/*
-	 * This is revolting. We need to set "xtime" correctly. However, the
-	 * value in this location is the value at the most recent update of
-	 * wall time.  Discover what correction gettimeofday() would have
-	 * made, and then undo it!
-	 */
-	nsec -= 1000 * usecs_since_tick();
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-
-	return 0;
+	return usecs_since_tick() * 1000;
 }
-EXPORT_SYMBOL(do_settimeofday);
 
 /* Dummy RTC ops */
 static void null_rtc_get_time(struct timespec *tv)
diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c
index 9b0ef01264794..8a1dcc2c37236 100644
--- a/arch/sh/kernel/timers/timer-mtu2.c
+++ b/arch/sh/kernel/timers/timer-mtu2.c
@@ -191,7 +191,7 @@ struct sys_timer_ops mtu2_timer_ops = {
 	.init		= mtu2_timer_init,
 	.start		= mtu2_timer_start,
 	.stop		= mtu2_timer_stop,
-#ifndef CONFIG_GENERIC_TIME
+#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 	.get_offset	= mtu2_timer_get_offset,
 #endif
 };
-- 
GitLab


From d5ed4c2e5ce9f5f6fd6a5a39ee1196a1f8a46eed Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 07:02:49 +0000
Subject: [PATCH 0805/2198] clocksource: SuperH MTU2 Timer driver

This patch adds a MTU2 driver for the SuperH architecture.

The MTU2 driver is a platform driver with early platform
support to allow using a MTU2 channel as only clockevent
during system bootup.

Clocksource on sh2a is currently unsupported due to code
generation issues with 64-bit math, so at this point only
periodic clockevent support is in place.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig               |  11 ++
 drivers/clocksource/Makefile  |   1 +
 drivers/clocksource/sh_mtu2.c | 357 ++++++++++++++++++++++++++++++++++
 include/linux/sh_mtu2.h       |  12 ++
 4 files changed, 381 insertions(+)
 create mode 100644 drivers/clocksource/sh_mtu2.c
 create mode 100644 include/linux/sh_mtu2.h

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6f91478826d55..6d0dd378ecadb 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -113,6 +113,9 @@ config SYS_SUPPORTS_PCI
 config SYS_SUPPORTS_CMT
 	bool
 
+config SYS_SUPPORTS_MTU2
+	bool
+
 config STACKTRACE_SUPPORT
 	def_bool y
 
@@ -478,6 +481,14 @@ config SH_MTU2
 	help
 	  This enables the use of the MTU2 as the system timer.
 
+config SH_TIMER_MTU2
+	bool "MTU2 timer driver"
+	depends on SYS_SUPPORTS_MTU2 && !SH_MTU2
+	default y
+	select GENERIC_CLOCKEVENTS
+	help
+	  This enables build of the MTU2 timer driver.
+
 config SH_TIMER_IRQ
 	int
 	default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 1efb2879a94f4..9785586dc8c4a 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_X86_CYCLONE_TIMER)	+= cyclone.o
 obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
 obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
 obj-$(CONFIG_SH_TIMER_CMT)	+= sh_cmt.o
+obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
new file mode 100644
index 0000000000000..420566f4c5011
--- /dev/null
+++ b/drivers/clocksource/sh_mtu2.c
@@ -0,0 +1,357 @@
+/*
+ * SuperH Timer Support - MTU2
+ *
+ *  Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/irq.h>
+#include <linux/err.h>
+#include <linux/clockchips.h>
+#include <linux/sh_mtu2.h>
+
+struct sh_mtu2_priv {
+	void __iomem *mapbase;
+	struct clk *clk;
+	struct irqaction irqaction;
+	struct platform_device *pdev;
+	unsigned long rate;
+	unsigned long periodic;
+	struct clock_event_device ced;
+};
+
+static DEFINE_SPINLOCK(sh_mtu2_lock);
+
+#define TSTR -1 /* shared register */
+#define TCR  0 /* channel register */
+#define TMDR 1 /* channel register */
+#define TIOR 2 /* channel register */
+#define TIER 3 /* channel register */
+#define TSR  4 /* channel register */
+#define TCNT 5 /* channel register */
+#define TGR  6 /* channel register */
+
+static unsigned long mtu2_reg_offs[] = {
+	[TCR] = 0,
+	[TMDR] = 1,
+	[TIOR] = 2,
+	[TIER] = 4,
+	[TSR] = 5,
+	[TCNT] = 6,
+	[TGR] = 8,
+};
+
+static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR)
+		return ioread8(base + cfg->channel_offset);
+
+	offs = mtu2_reg_offs[reg_nr];
+
+	if ((reg_nr == TCNT) || (reg_nr == TGR))
+		return ioread16(base + offs);
+	else
+		return ioread8(base + offs);
+}
+
+static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr,
+				unsigned long value)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR) {
+		iowrite8(value, base + cfg->channel_offset);
+		return;
+	}
+
+	offs = mtu2_reg_offs[reg_nr];
+
+	if ((reg_nr == TCNT) || (reg_nr == TGR))
+		iowrite16(value, base + offs);
+	else
+		iowrite8(value, base + offs);
+}
+
+static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	unsigned long flags, value;
+
+	/* start stop register shared by multiple timer channels */
+	spin_lock_irqsave(&sh_mtu2_lock, flags);
+	value = sh_mtu2_read(p, TSTR);
+
+	if (start)
+		value |= 1 << cfg->timer_bit;
+	else
+		value &= ~(1 << cfg->timer_bit);
+
+	sh_mtu2_write(p, TSTR, value);
+	spin_unlock_irqrestore(&sh_mtu2_lock, flags);
+}
+
+static int sh_mtu2_enable(struct sh_mtu2_priv *p)
+{
+	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	int ret;
+
+	/* enable clock */
+	ret = clk_enable(p->clk);
+	if (ret) {
+		pr_err("sh_mtu2: cannot enable clock \"%s\"\n", cfg->clk);
+		return ret;
+	}
+
+	/* make sure channel is disabled */
+	sh_mtu2_start_stop_ch(p, 0);
+
+	p->rate = clk_get_rate(p->clk) / 64;
+	p->periodic = (p->rate + HZ/2) / HZ;
+
+	/* "Periodic Counter Operation" */
+	sh_mtu2_write(p, TCR, 0x23); /* TGRA clear, divide clock by 64 */
+	sh_mtu2_write(p, TIOR, 0);
+	sh_mtu2_write(p, TGR, p->periodic);
+	sh_mtu2_write(p, TCNT, 0);
+	sh_mtu2_write(p, TMDR, 0);
+	sh_mtu2_write(p, TIER, 0x01);
+
+	/* enable channel */
+	sh_mtu2_start_stop_ch(p, 1);
+
+	return 0;
+}
+
+static void sh_mtu2_disable(struct sh_mtu2_priv *p)
+{
+	/* disable channel */
+	sh_mtu2_start_stop_ch(p, 0);
+
+	/* stop clock */
+	clk_disable(p->clk);
+}
+
+static irqreturn_t sh_mtu2_interrupt(int irq, void *dev_id)
+{
+	struct sh_mtu2_priv *p = dev_id;
+
+	/* acknowledge interrupt */
+	sh_mtu2_read(p, TSR);
+	sh_mtu2_write(p, TSR, 0xfe);
+
+	/* notify clockevent layer */
+	p->ced.event_handler(&p->ced);
+	return IRQ_HANDLED;
+}
+
+static struct sh_mtu2_priv *ced_to_sh_mtu2(struct clock_event_device *ced)
+{
+	return container_of(ced, struct sh_mtu2_priv, ced);
+}
+
+static void sh_mtu2_clock_event_mode(enum clock_event_mode mode,
+				    struct clock_event_device *ced)
+{
+	struct sh_mtu2_priv *p = ced_to_sh_mtu2(ced);
+	int disabled = 0;
+
+	/* deal with old setting first */
+	switch (ced->mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		sh_mtu2_disable(p);
+		disabled = 1;
+		break;
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		pr_info("sh_mtu2: %s used for periodic clock events\n",
+			ced->name);
+		sh_mtu2_enable(p);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+		if (!disabled)
+			sh_mtu2_disable(p);
+		break;
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	default:
+		break;
+	}
+}
+
+static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clock_event_device *ced = &p->ced;
+	int ret;
+
+	memset(ced, 0, sizeof(*ced));
+
+	ced->name = name;
+	ced->features = CLOCK_EVT_FEAT_PERIODIC;
+	ced->rating = rating;
+	ced->cpumask = cpumask_of(0);
+	ced->set_mode = sh_mtu2_clock_event_mode;
+
+	ret = setup_irq(p->irqaction.irq, &p->irqaction);
+	if (ret) {
+		pr_err("sh_mtu2: failed to request irq %d\n",
+		       p->irqaction.irq);
+		return;
+	}
+
+	pr_info("sh_mtu2: %s used for clock events\n", ced->name);
+	clockevents_register_device(ced);
+}
+
+int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
+		     unsigned long clockevent_rating)
+{
+	if (clockevent_rating)
+		sh_mtu2_register_clockevent(p, name, clockevent_rating);
+
+	return 0;
+}
+
+static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
+{
+	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	struct resource *res;
+	int irq, ret;
+	ret = -ENXIO;
+
+	memset(p, 0, sizeof(*p));
+	p->pdev = pdev;
+
+	if (!cfg) {
+		dev_err(&p->pdev->dev, "missing platform data\n");
+		goto err0;
+	}
+
+	platform_set_drvdata(pdev, p);
+
+	res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&p->pdev->dev, "failed to get I/O memory\n");
+		goto err0;
+	}
+
+	irq = platform_get_irq(p->pdev, 0);
+	if (irq < 0) {
+		dev_err(&p->pdev->dev, "failed to get irq\n");
+		goto err0;
+	}
+
+	/* map memory, let mapbase point to our channel */
+	p->mapbase = ioremap_nocache(res->start, resource_size(res));
+	if (p->mapbase == NULL) {
+		pr_err("sh_mtu2: failed to remap I/O memory\n");
+		goto err0;
+	}
+
+	/* setup data for setup_irq() (too early for request_irq()) */
+	p->irqaction.name = cfg->name;
+	p->irqaction.handler = sh_mtu2_interrupt;
+	p->irqaction.dev_id = p;
+	p->irqaction.irq = irq;
+	p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
+	p->irqaction.mask = CPU_MASK_NONE;
+
+	/* get hold of clock */
+	p->clk = clk_get(&p->pdev->dev, cfg->clk);
+	if (IS_ERR(p->clk)) {
+		pr_err("sh_mtu2: cannot get clock \"%s\"\n", cfg->clk);
+		ret = PTR_ERR(p->clk);
+		goto err1;
+	}
+
+	return sh_mtu2_register(p, cfg->name, cfg->clockevent_rating);
+ err1:
+	iounmap(p->mapbase);
+ err0:
+	return ret;
+}
+
+static int __devinit sh_mtu2_probe(struct platform_device *pdev)
+{
+	struct sh_mtu2_priv *p = platform_get_drvdata(pdev);
+	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	int ret;
+
+	if (p) {
+		pr_info("sh_mtu2: %s kept as earlytimer\n", cfg->name);
+		return 0;
+	}
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p == NULL) {
+		dev_err(&pdev->dev, "failed to allocate driver data\n");
+		return -ENOMEM;
+	}
+
+	ret = sh_mtu2_setup(p, pdev);
+	if (ret) {
+		kfree(p);
+		platform_set_drvdata(pdev, NULL);
+	}
+	return ret;
+}
+
+static int __devexit sh_mtu2_remove(struct platform_device *pdev)
+{
+	return -EBUSY; /* cannot unregister clockevent */
+}
+
+static struct platform_driver sh_mtu2_device_driver = {
+	.probe		= sh_mtu2_probe,
+	.remove		= __devexit_p(sh_mtu2_remove),
+	.driver		= {
+		.name	= "sh_mtu2",
+	}
+};
+
+static int __init sh_mtu2_init(void)
+{
+	return platform_driver_register(&sh_mtu2_device_driver);
+}
+
+static void __exit sh_mtu2_exit(void)
+{
+	platform_driver_unregister(&sh_mtu2_device_driver);
+}
+
+early_platform_init("earlytimer", &sh_mtu2_device_driver);
+module_init(sh_mtu2_init);
+module_exit(sh_mtu2_exit);
+
+MODULE_AUTHOR("Magnus Damm");
+MODULE_DESCRIPTION("SuperH MTU2 Timer Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h
new file mode 100644
index 0000000000000..a98876340ff41
--- /dev/null
+++ b/include/linux/sh_mtu2.h
@@ -0,0 +1,12 @@
+#ifndef __SH_MTU2_H__
+#define __SH_MTU2_H__
+
+struct sh_mtu2_config {
+	char *name;
+	int channel_offset;
+	int timer_bit;
+	char *clk;
+	unsigned long clockevent_rating;
+};
+
+#endif /* __SH_MTU2_H__ */
-- 
GitLab


From da107c6ef919b3afd9c9b405a4f71e03b5725b04 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 07:06:26 +0000
Subject: [PATCH 0806/2198] sh: sh2a MTU2 platform data

This patch adds MTU2 platform data for the following cpus:
 - sh7201 (3/5 channels)
 - sh7203/sh7263 (2/4 channels)
 - sh7206 (3/5 channels)
 - MXG (3/5 channels)

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   5 ++
 arch/sh/kernel/cpu/sh2a/setup-mxg.c    | 111 +++++++++++++++++++++++-
 arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 115 +++++++++++++++++++++++++
 arch/sh/kernel/cpu/sh2a/setup-sh7203.c |  71 +++++++++++++++
 arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 104 ++++++++++++++++++++++
 5 files changed, 405 insertions(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6d0dd378ecadb..2061488cc17cc 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -214,27 +214,32 @@ config CPU_SUBTYPE_SH7201
 	bool "Support SH7201 processor"
 	select CPU_SH2A
 	select CPU_HAS_FPU
+	select SYS_SUPPORTS_MTU2
  
 config CPU_SUBTYPE_SH7203
 	bool "Support SH7203 processor"
 	select CPU_SH2A
 	select CPU_HAS_FPU
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_MTU2
 
 config CPU_SUBTYPE_SH7206
 	bool "Support SH7206 processor"
 	select CPU_SH2A
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_MTU2
 
 config CPU_SUBTYPE_SH7263
 	bool "Support SH7263 processor"
 	select CPU_SH2A
 	select CPU_HAS_FPU
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_MTU2
 
 config CPU_SUBTYPE_MXG
 	bool "Support MX-G processor"
 	select CPU_SH2A
+	select SYS_SUPPORTS_MTU2
 	help
 	  Select MX-G if running on an R8A03022BG part.
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index 844293723cfc8..870030aa05bc7 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_mtu2.h>
 
 enum {
 	UNUSED = 0,
@@ -24,7 +25,7 @@ enum {
 
 	SCIF0, SCIF1,
 
-	MTU2_GROUP1, MTU2_GROUP2, MTU2_GROUP3, MTU2_GROUP4, MTU2_GROUP5
+	MTU2_GROUP1, MTU2_GROUP2, MTU2_GROUP3, MTU2_GROUP4, MTU2_GROUP5,
 	MTU2_TGI3B, MTU2_TGI3C,
 
 	/* interrupt groups */
@@ -113,6 +114,99 @@ static struct intc_mask_reg mask_registers[] __initdata = {
 static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups,
 			 mask_registers, prio_registers, NULL);
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xff801300,
+		.end	= 0xff801326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 228,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xff801380,
+		.end	= 0xff801390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 234,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
+static struct sh_mtu2_config mtu2_2_platform_data = {
+	.name = "MTU2_2",
+	.channel_offset = 0x80,
+	.timer_bit = 2,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_2_resources[] = {
+	[0] = {
+		.name	= "MTU2_2",
+		.start	= 0xff801000,
+		.end	= 0xff80100a,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 240,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_2_device = {
+	.name		= "sh_mtu2",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &mtu2_2_platform_data,
+	},
+	.resource	= mtu2_2_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xff804000,
@@ -134,6 +228,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *mxg_devices[] __initdata = {
 	&sci_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
 static int __init mxg_devices_setup(void)
@@ -147,3 +244,15 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *mxg_early_devices[] __initdata = {
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(mxg_early_devices,
+				   ARRAY_SIZE(mxg_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 00f42f9e3f5c6..074aa9062e4b3 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -12,6 +12,8 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_mtu2.h>
+#include <linux/io.h>
 
 enum {
 	UNUSED = 0,
@@ -249,9 +251,105 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xfffe4300,
+		.end	= 0xfffe4326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 108,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xfffe4380,
+		.end	= 0xfffe4390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 116,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
+static struct sh_mtu2_config mtu2_2_platform_data = {
+	.name = "MTU2_2",
+	.channel_offset = 0x80,
+	.timer_bit = 2,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_2_resources[] = {
+	[0] = {
+		.name	= "MTU2_2",
+		.start	= 0xfffe4000,
+		.end	= 0xfffe400a,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 124,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_2_device = {
+	.name		= "sh_mtu2",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &mtu2_2_platform_data,
+	},
+	.resource	= mtu2_2_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_2_resources),
+};
+
 static struct platform_device *sh7201_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
 static int __init sh7201_devices_setup(void)
@@ -265,3 +363,20 @@ void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
 }
+
+static struct platform_device *sh7201_early_devices[] __initdata = {
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
+};
+
+#define STBCR3 0xfffe0408
+
+void __init plat_early_device_setup(void)
+{
+	/* enable MTU2 clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3);
+
+	early_platform_add_devices(sh7201_early_devices,
+				   ARRAY_SIZE(sh7201_early_devices));
+}
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 0836acee22892..3448164b5734d 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -12,6 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_cmt.h>
+#include <linux/sh_mtu2.h>
 #include <linux/io.h>
 
 enum {
@@ -271,6 +272,68 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xfffe4300,
+		.end	= 0xfffe4326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 146,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xfffe4380,
+		.end	= 0xfffe4390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 153,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
 static struct resource rtc_resources[] = {
 	[0] = {
 		.start	= 0xffff2000,
@@ -295,6 +358,8 @@ static struct platform_device *sh7203_devices[] __initdata = {
 	&sci_device,
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
 	&rtc_device,
 };
 
@@ -313,8 +378,11 @@ void __init plat_irq_setup(void)
 static struct platform_device *sh7203_early_devices[] __initdata = {
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
 };
 
+#define STBCR3 0xfffe0408
 #define STBCR4 0xfffe040c
 
 void __init plat_early_device_setup(void)
@@ -322,6 +390,9 @@ void __init plat_early_device_setup(void)
 	/* enable CMT clock */
 	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
 
+	/* enable MTU2 clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3);
+
 	early_platform_add_devices(sh7203_early_devices,
 				   ARRAY_SIZE(sh7203_early_devices));
 }
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index f9606e3d25152..e700559b6b895 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -13,6 +13,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_cmt.h>
+#include <linux/sh_mtu2.h>
 #include <linux/io.h>
 
 enum {
@@ -231,10 +232,106 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
+static struct sh_mtu2_config mtu2_0_platform_data = {
+	.name = "MTU2_0",
+	.channel_offset = -0x80,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_0_resources[] = {
+	[0] = {
+		.name	= "MTU2_0",
+		.start	= 0xfffe4300,
+		.end	= 0xfffe4326,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 156,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_0_device = {
+	.name		= "sh_mtu2",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &mtu2_0_platform_data,
+	},
+	.resource	= mtu2_0_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
+};
+
+static struct sh_mtu2_config mtu2_1_platform_data = {
+	.name = "MTU2_1",
+	.channel_offset = -0x100,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_1_resources[] = {
+	[0] = {
+		.name	= "MTU2_1",
+		.start	= 0xfffe4380,
+		.end	= 0xfffe4390,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 164,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_1_device = {
+	.name		= "sh_mtu2",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &mtu2_1_platform_data,
+	},
+	.resource	= mtu2_1_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
+};
+
+static struct sh_mtu2_config mtu2_2_platform_data = {
+	.name = "MTU2_2",
+	.channel_offset = 0x80,
+	.timer_bit = 2,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource mtu2_2_resources[] = {
+	[0] = {
+		.name	= "MTU2_2",
+		.start	= 0xfffe4000,
+		.end	= 0xfffe400a,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 180,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device mtu2_2_device = {
+	.name		= "sh_mtu2",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &mtu2_2_platform_data,
+	},
+	.resource	= mtu2_2_resources,
+	.num_resources	= ARRAY_SIZE(mtu2_2_resources),
+};
+
 static struct platform_device *sh7206_devices[] __initdata = {
 	&sci_device,
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
 static int __init sh7206_devices_setup(void)
@@ -252,8 +349,12 @@ void __init plat_irq_setup(void)
 static struct platform_device *sh7206_early_devices[] __initdata = {
 	&cmt0_device,
 	&cmt1_device,
+	&mtu2_0_device,
+	&mtu2_1_device,
+	&mtu2_2_device,
 };
 
+#define STBCR3 0xfffe0408
 #define STBCR4 0xfffe040c
 
 void __init plat_early_device_setup(void)
@@ -261,6 +362,9 @@ void __init plat_early_device_setup(void)
 	/* enable CMT clock */
 	__raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4);
 
+	/* enable MTU2 clock */
+	__raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3);
+
 	early_platform_add_devices(sh7206_early_devices,
 				   ARRAY_SIZE(sh7206_early_devices));
 }
-- 
GitLab


From 1cbac972ba28e706fa9ce4d4c81830040bc811ee Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sat, 2 May 2009 13:39:56 +0400
Subject: [PATCH 0807/2198] x86: uv io-apic - use BUILD_BUG_ON instead of
 BUG_ON

The expression is known to be true/false at compilation
time so we're allowed to use build-time instead of
run-time check. Also align 'entry' items assignment.

[ Impact: shrink kernel a bit, cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090502093956.GB4791@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 8aef5f9d9479f..a80335ba12ccb 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3749,6 +3749,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 	unsigned long flags;
 	int err;
 
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
 	cfg = irq_cfg(irq);
 
 	err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3762,15 +3764,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
-	entry->vector = cfg->vector;
-	entry->delivery_mode = apic->irq_delivery_mode;
-	entry->dest_mode = apic->irq_dest_mode;
-	entry->polarity = 0;
-	entry->trigger = 0;
-	entry->mask = 0;
-	entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+	entry->vector		= cfg->vector;
+	entry->delivery_mode	= apic->irq_delivery_mode;
+	entry->dest_mode	= apic->irq_dest_mode;
+	entry->polarity		= 0;
+	entry->trigger		= 0;
+	entry->mask		= 0;
+	entry->dest		= apic->cpu_mask_to_apicid(eligible_cpu);
 
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
 	uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3788,10 +3788,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
 	struct uv_IO_APIC_route_entry *entry;
 	int mmr_pnode;
 
+	BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
 	mmr_value = 0;
 	entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-	BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
 	entry->mask = 1;
 
 	mmr_pnode = uv_blade_to_pnode(mmr_blade);
-- 
GitLab


From 3280c8865e1b738604bacdea54738acef31e8c12 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 30 Apr 2009 07:12:09 +0000
Subject: [PATCH 0808/2198] sh: remove old MTU2 driver

This patch removes the old MTU2 driver (CONFIG_SH_MTU2/timer-mtu2.c)

As replacement, select the sh_cmt driver with CONFIG_SH_TIMER_MTU2
and configure timer channel using platform data.

If multiple MTU channels are enabled using platform data, use the
earlytimer parameter on the kernel command line to select channel.
For instance, use "earlytimer=sh_mtu2.0" to select the first channel.

To verify which timer is being used, look at printouts or the timer
irq count in /proc/interrupts.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                    |   9 +-
 arch/sh/include/asm/timer.h        |   2 +-
 arch/sh/kernel/timers/Makefile     |   1 -
 arch/sh/kernel/timers/timer-mtu2.c | 202 -----------------------------
 arch/sh/kernel/timers/timer.c      |   3 -
 5 files changed, 2 insertions(+), 215 deletions(-)
 delete mode 100644 arch/sh/kernel/timers/timer-mtu2.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 2061488cc17cc..bda3dc1b0c262 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -479,16 +479,9 @@ config SH_TIMER_CMT
 	help
 	  This enables build of the CMT timer driver.
 
-config SH_MTU2
-	bool "MTU2 timer support"
-	depends on CPU_SH2A && !GENERIC_TIME
-	default y
-	help
-	  This enables the use of the MTU2 as the system timer.
-
 config SH_TIMER_MTU2
 	bool "MTU2 timer driver"
-	depends on SYS_SUPPORTS_MTU2 && !SH_MTU2
+	depends on SYS_SUPPORTS_MTU2
 	default y
 	select GENERIC_CLOCKEVENTS
 	help
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 9c968d19cb9b7..581e3fe3fe040 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -23,7 +23,7 @@ struct sys_timer {
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-extern struct sys_timer tmu_timer, mtu2_timer;
+extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
 #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 1e9a104c2aee9..44156df0bdeaf 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -5,6 +5,5 @@
 obj-y	:= timer.o
 
 obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
-obj-$(CONFIG_SH_MTU2)		+= timer-mtu2.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= timer-broadcast.o
diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c
deleted file mode 100644
index 8a1dcc2c37236..0000000000000
--- a/arch/sh/kernel/timers/timer-mtu2.c
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer-mtu2.c - MTU2 Timer Support
- *
- *  Copyright (C) 2005  Paul Mundt
- *
- * Based off of arch/sh/kernel/timers/timer-tmu.c
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/seqlock.h>
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/clock.h>
-
-/*
- * We use channel 1 for our lowly system timer. Channel 2 would be the other
- * likely candidate, but we leave it alone as it has higher divisors that
- * would be of more use to other more interesting applications.
- *
- * TODO: Presently we only implement a 16-bit single-channel system timer.
- * However, we can implement channel cascade if we go the overflow route and
- * get away with using 2 MTU2 channels as a 32-bit timer.
- */
-#define MTU2_TSTR	0xfffe4280
-#define MTU2_TCR_1	0xfffe4380
-#define MTU2_TMDR_1	0xfffe4381
-#define MTU2_TIOR_1	0xfffe4382
-#define MTU2_TIER_1	0xfffe4384
-#define MTU2_TSR_1	0xfffe4385
-#define MTU2_TCNT_1	0xfffe4386	/* 16-bit counter */
-
-#if defined(CONFIG_CPU_SUBTYPE_SH7201) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7203)
-#define MTU2_TGRA_1	0xfffe4388
-#else
-#define MTU2_TGRA_1	0xfffe438a
-#endif
-
-#define STBCR3		0xfffe0408
-
-#define MTU2_TSTR_CST1	(1 << 1)	/* Counter Start 1 */
-
-#define MTU2_TSR_TGFA	(1 << 0)	/* GRA compare match */
-
-#define MTU2_TIER_TGIEA	(1 << 0)	/* GRA compare match  interrupt enable */
-
-#define MTU2_TCR_INIT	0x22
-
-#define MTU2_TCR_CALIB  0x00
-
-static unsigned long mtu2_timer_get_offset(void)
-{
-	int count;
-	static int count_p = 0x7fff;	/* for the first call after boot */
-	static unsigned long jiffies_p = 0;
-
-	/*
-	 * cache volatile jiffies temporarily; we have IRQs turned off.
-	 */
-	unsigned long jiffies_t;
-
-	/* timer count may underflow right here */
-	count = ctrl_inw(MTU2_TCNT_1);	/* read the latched count */
-
-	jiffies_t = jiffies;
-
-	/*
-	 * avoiding timer inconsistencies (they are rare, but they happen)...
-	 * there is one kind of problem that must be avoided here:
-	 *  1. the timer counter underflows
-	 */
-
-	if (jiffies_t == jiffies_p) {
-		if (count > count_p) {
-			if (ctrl_inb(MTU2_TSR_1) & MTU2_TSR_TGFA) {
-				count -= LATCH;
-			} else {
-				printk("%s (): hardware timer problem?\n",
-				       __func__);
-			}
-		}
-	} else
-		jiffies_p = jiffies_t;
-
-	count_p = count;
-
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	count = (count + LATCH/2) / LATCH;
-
-	return count;
-}
-
-static irqreturn_t mtu2_timer_interrupt(int irq, void *dev_id)
-{
-	unsigned long timer_status;
-
-	/* Clear TGFA bit */
-	timer_status = ctrl_inb(MTU2_TSR_1);
-	timer_status &= ~MTU2_TSR_TGFA;
-	ctrl_outb(timer_status, MTU2_TSR_1);
-
-	/* Do timer tick */
-	handle_timer_tick();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction mtu2_irq = {
-	.name		= "timer",
-	.handler	= mtu2_timer_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-};
-
-static unsigned int divisors[] = { 1, 4, 16, 64, 1, 1, 256 };
-
-static void mtu2_clk_init(struct clk *clk)
-{
-	u8 idx = MTU2_TCR_INIT & 0x7;
-
-	clk->rate = clk->parent->rate / divisors[idx];
-	/* Start TCNT counting */
-	ctrl_outb(ctrl_inb(MTU2_TSTR) | MTU2_TSTR_CST1, MTU2_TSTR);
-
-}
-
-static void mtu2_clk_recalc(struct clk *clk)
-{
-	u8 idx = ctrl_inb(MTU2_TCR_1) & 0x7;
-	clk->rate = clk->parent->rate / divisors[idx];
-}
-
-static struct clk_ops mtu2_clk_ops = {
-	.init		= mtu2_clk_init,
-	.recalc		= mtu2_clk_recalc,
-};
-
-static struct clk mtu2_clk1 = {
-	.name		= "mtu2_clk1",
-	.ops		= &mtu2_clk_ops,
-};
-
-static int mtu2_timer_start(void)
-{
-	ctrl_outb(ctrl_inb(MTU2_TSTR) | MTU2_TSTR_CST1, MTU2_TSTR);
-	return 0;
-}
-
-static int mtu2_timer_stop(void)
-{
-	ctrl_outb(ctrl_inb(MTU2_TSTR) & ~MTU2_TSTR_CST1, MTU2_TSTR);
-	return 0;
-}
-
-static int mtu2_timer_init(void)
-{
-	unsigned long interval;
-
-	setup_irq(CONFIG_SH_TIMER_IRQ, &mtu2_irq);
-
-	mtu2_clk1.parent = clk_get(NULL, "module_clk");
-
-	ctrl_outb(ctrl_inb(STBCR3) & (~0x20), STBCR3);
-
-	/* Normal operation */
-	ctrl_outb(0, MTU2_TMDR_1);
-	ctrl_outb(MTU2_TCR_INIT, MTU2_TCR_1);
-	ctrl_outb(0x01, MTU2_TIOR_1);
-
-	/* Enable underflow interrupt */
-	ctrl_outb(ctrl_inb(MTU2_TIER_1) | MTU2_TIER_TGIEA, MTU2_TIER_1);
-
-	interval = CONFIG_SH_PCLK_FREQ / 16 / HZ;
-	printk(KERN_INFO "Interval = %ld\n", interval);
-
-	ctrl_outw(interval, MTU2_TGRA_1);
-	ctrl_outw(0, MTU2_TCNT_1);
-
-	clk_register(&mtu2_clk1);
-	clk_enable(&mtu2_clk1);
-
-	return 0;
-}
-
-struct sys_timer_ops mtu2_timer_ops = {
-	.init		= mtu2_timer_init,
-	.start		= mtu2_timer_start,
-	.stop		= mtu2_timer_stop,
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-	.get_offset	= mtu2_timer_get_offset,
-#endif
-};
-
-struct sys_timer mtu2_timer = {
-	.name	= "mtu2",
-	.ops	= &mtu2_timer_ops,
-};
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
index f3bd1413d5681..920891acb7797 100644
--- a/arch/sh/kernel/timers/timer.c
+++ b/arch/sh/kernel/timers/timer.c
@@ -16,9 +16,6 @@
 static struct sys_timer *sys_timers[] = {
 #ifdef CONFIG_SH_TMU
 	&tmu_timer,
-#endif
-#ifdef CONFIG_SH_MTU2
-	&mtu2_timer,
 #endif
 	NULL,
 };
-- 
GitLab


From 3969c52d4d2fef5a4b9e3ab0e51b3901e1cc8b83 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Sun, 3 May 2009 11:11:35 +0530
Subject: [PATCH 0809/2198] x86: cpufeature.h fix name for X86_FEATURE_MCE

X86_FEATURE_MCE = Machine Check Exception
X86_FEATURE_MCA = Machine Check Architecture

[ Impact: cleanup ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1241329295.6321.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cpufeature.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397aad..ccc1061b8b257 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -22,7 +22,7 @@
 #define X86_FEATURE_TSC		(0*32+ 4) /* Time Stamp Counter */
 #define X86_FEATURE_MSR		(0*32+ 5) /* Model-Specific Registers */
 #define X86_FEATURE_PAE		(0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE		(0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_MCE		(0*32+ 7) /* Machine Check Exception */
 #define X86_FEATURE_CX8		(0*32+ 8) /* CMPXCHG8 instruction */
 #define X86_FEATURE_APIC	(0*32+ 9) /* Onboard APIC */
 #define X86_FEATURE_SEP		(0*32+11) /* SYSENTER/SYSEXIT */
-- 
GitLab


From 9570ef20423b549757aa484ad388f9a7d5bdc4d9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 1 May 2009 06:51:00 +0000
Subject: [PATCH 0810/2198] clocksource: SuperH TMU Timer driver

This patch adds a TMU driver for the SuperH architecture.

The TMU driver is a platform driver with early platform
support to allow using a TMU channel as clockevent or
clocksource during system bootup or later.

Clocksource or clockevent can be selected.
Both periodic and oneshot clockevents are supported.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig              |  12 +
 drivers/clocksource/Makefile |   1 +
 drivers/clocksource/sh_tmu.c | 461 +++++++++++++++++++++++++++++++++++
 include/linux/sh_tmu.h       |  13 +
 4 files changed, 487 insertions(+)
 create mode 100644 drivers/clocksource/sh_tmu.c
 create mode 100644 include/linux/sh_tmu.h

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index bda3dc1b0c262..1f74737c4f20e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -116,6 +116,9 @@ config SYS_SUPPORTS_CMT
 config SYS_SUPPORTS_MTU2
 	bool
 
+config SYS_SUPPORTS_TMU
+	bool
+
 config STACKTRACE_SUPPORT
 	def_bool y
 
@@ -470,6 +473,15 @@ config SH_TMU
 	help
 	  This enables the use of the TMU as the system timer.
 
+config SH_TIMER_TMU
+	bool "TMU timer driver"
+	depends on !SH_TMU && SYS_SUPPORTS_TMU
+	default y
+	select GENERIC_TIME
+	select GENERIC_CLOCKEVENTS
+	help
+	  This enables the build of the TMU timer driver.
+
 config SH_TIMER_CMT
 	bool "CMT timer driver"
 	depends on SYS_SUPPORTS_CMT
diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile
index 9785586dc8c4a..eef216f7f61d6 100644
--- a/drivers/clocksource/Makefile
+++ b/drivers/clocksource/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_X86_PM_TIMER)	+= acpi_pm.o
 obj-$(CONFIG_SCx200HR_TIMER)	+= scx200_hrt.o
 obj-$(CONFIG_SH_TIMER_CMT)	+= sh_cmt.o
 obj-$(CONFIG_SH_TIMER_MTU2)	+= sh_mtu2.o
+obj-$(CONFIG_SH_TIMER_TMU)	+= sh_tmu.o
diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
new file mode 100644
index 0000000000000..21bd77aa6a34c
--- /dev/null
+++ b/drivers/clocksource/sh_tmu.c
@@ -0,0 +1,461 @@
+/*
+ * SuperH Timer Support - TMU
+ *
+ *  Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/spinlock.h>
+#include <linux/interrupt.h>
+#include <linux/ioport.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/clk.h>
+#include <linux/irq.h>
+#include <linux/err.h>
+#include <linux/clocksource.h>
+#include <linux/clockchips.h>
+#include <linux/sh_tmu.h>
+
+struct sh_tmu_priv {
+	void __iomem *mapbase;
+	struct clk *clk;
+	struct irqaction irqaction;
+	struct platform_device *pdev;
+	unsigned long rate;
+	unsigned long periodic;
+	struct clock_event_device ced;
+	struct clocksource cs;
+};
+
+static DEFINE_SPINLOCK(sh_tmu_lock);
+
+#define TSTR -1 /* shared register */
+#define TCOR  0 /* channel register */
+#define TCNT 1 /* channel register */
+#define TCR 2 /* channel register */
+
+static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR)
+		return ioread8(base - cfg->channel_offset);
+
+	offs = reg_nr << 2;
+
+	if (reg_nr == TCR)
+		return ioread16(base + offs);
+	else
+		return ioread32(base + offs);
+}
+
+static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr,
+				unsigned long value)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	void __iomem *base = p->mapbase;
+	unsigned long offs;
+
+	if (reg_nr == TSTR) {
+		iowrite8(value, base - cfg->channel_offset);
+		return;
+	}
+
+	offs = reg_nr << 2;
+
+	if (reg_nr == TCR)
+		iowrite16(value, base + offs);
+	else
+		iowrite32(value, base + offs);
+}
+
+static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	unsigned long flags, value;
+
+	/* start stop register shared by multiple timer channels */
+	spin_lock_irqsave(&sh_tmu_lock, flags);
+	value = sh_tmu_read(p, TSTR);
+
+	if (start)
+		value |= 1 << cfg->timer_bit;
+	else
+		value &= ~(1 << cfg->timer_bit);
+
+	sh_tmu_write(p, TSTR, value);
+	spin_unlock_irqrestore(&sh_tmu_lock, flags);
+}
+
+static int sh_tmu_enable(struct sh_tmu_priv *p)
+{
+	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	int ret;
+
+	/* enable clock */
+	ret = clk_enable(p->clk);
+	if (ret) {
+		pr_err("sh_tmu: cannot enable clock \"%s\"\n", cfg->clk);
+		return ret;
+	}
+
+	/* make sure channel is disabled */
+	sh_tmu_start_stop_ch(p, 0);
+
+	/* maximum timeout */
+	sh_tmu_write(p, TCOR, 0xffffffff);
+	sh_tmu_write(p, TCNT, 0xffffffff);
+
+	/* configure channel to parent clock / 4, irq off */
+	p->rate = clk_get_rate(p->clk) / 4;
+	sh_tmu_write(p, TCR, 0x0000);
+
+	/* enable channel */
+	sh_tmu_start_stop_ch(p, 1);
+
+	return 0;
+}
+
+static void sh_tmu_disable(struct sh_tmu_priv *p)
+{
+	/* disable channel */
+	sh_tmu_start_stop_ch(p, 0);
+
+	/* stop clock */
+	clk_disable(p->clk);
+}
+
+static void sh_tmu_set_next(struct sh_tmu_priv *p, unsigned long delta,
+			    int periodic)
+{
+	/* stop timer */
+	sh_tmu_start_stop_ch(p, 0);
+
+	/* acknowledge interrupt */
+	sh_tmu_read(p, TCR);
+
+	/* enable interrupt */
+	sh_tmu_write(p, TCR, 0x0020);
+
+	/* reload delta value in case of periodic timer */
+	if (periodic)
+		sh_tmu_write(p, TCOR, delta);
+	else
+		sh_tmu_write(p, TCOR, 0);
+
+	sh_tmu_write(p, TCNT, delta);
+
+	/* start timer */
+	sh_tmu_start_stop_ch(p, 1);
+}
+
+static irqreturn_t sh_tmu_interrupt(int irq, void *dev_id)
+{
+	struct sh_tmu_priv *p = dev_id;
+
+	/* disable or acknowledge interrupt */
+	if (p->ced.mode == CLOCK_EVT_MODE_ONESHOT)
+		sh_tmu_write(p, TCR, 0x0000);
+	else
+		sh_tmu_write(p, TCR, 0x0020);
+
+	/* notify clockevent layer */
+	p->ced.event_handler(&p->ced);
+	return IRQ_HANDLED;
+}
+
+static struct sh_tmu_priv *cs_to_sh_tmu(struct clocksource *cs)
+{
+	return container_of(cs, struct sh_tmu_priv, cs);
+}
+
+static cycle_t sh_tmu_clocksource_read(struct clocksource *cs)
+{
+	struct sh_tmu_priv *p = cs_to_sh_tmu(cs);
+
+	return sh_tmu_read(p, TCNT) ^ 0xffffffff;
+}
+
+static int sh_tmu_clocksource_enable(struct clocksource *cs)
+{
+	struct sh_tmu_priv *p = cs_to_sh_tmu(cs);
+	int ret;
+
+	ret = sh_tmu_enable(p);
+	if (ret)
+		return ret;
+
+	/* TODO: calculate good shift from rate and counter bit width */
+	cs->shift = 10;
+	cs->mult = clocksource_hz2mult(p->rate, cs->shift);
+	return 0;
+}
+
+static void sh_tmu_clocksource_disable(struct clocksource *cs)
+{
+	sh_tmu_disable(cs_to_sh_tmu(cs));
+}
+
+static int sh_tmu_register_clocksource(struct sh_tmu_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clocksource *cs = &p->cs;
+
+	cs->name = name;
+	cs->rating = rating;
+	cs->read = sh_tmu_clocksource_read;
+	cs->enable = sh_tmu_clocksource_enable;
+	cs->disable = sh_tmu_clocksource_disable;
+	cs->mask = CLOCKSOURCE_MASK(32);
+	cs->flags = CLOCK_SOURCE_IS_CONTINUOUS;
+	pr_info("sh_tmu: %s used as clock source\n", cs->name);
+	clocksource_register(cs);
+	return 0;
+}
+
+static struct sh_tmu_priv *ced_to_sh_tmu(struct clock_event_device *ced)
+{
+	return container_of(ced, struct sh_tmu_priv, ced);
+}
+
+static void sh_tmu_clock_event_start(struct sh_tmu_priv *p, int periodic)
+{
+	struct clock_event_device *ced = &p->ced;
+
+	sh_tmu_enable(p);
+
+	/* TODO: calculate good shift from rate and counter bit width */
+
+	ced->shift = 32;
+	ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift);
+	ced->max_delta_ns = clockevent_delta2ns(0xffffffff, ced);
+	ced->min_delta_ns = 5000;
+
+	if (periodic) {
+		p->periodic = (p->rate + HZ/2) / HZ;
+		sh_tmu_set_next(p, p->periodic, 1);
+	}
+}
+
+static void sh_tmu_clock_event_mode(enum clock_event_mode mode,
+				    struct clock_event_device *ced)
+{
+	struct sh_tmu_priv *p = ced_to_sh_tmu(ced);
+	int disabled = 0;
+
+	/* deal with old setting first */
+	switch (ced->mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+	case CLOCK_EVT_MODE_ONESHOT:
+		sh_tmu_disable(p);
+		disabled = 1;
+		break;
+	default:
+		break;
+	}
+
+	switch (mode) {
+	case CLOCK_EVT_MODE_PERIODIC:
+		pr_info("sh_tmu: %s used for periodic clock events\n",
+			ced->name);
+		sh_tmu_clock_event_start(p, 1);
+		break;
+	case CLOCK_EVT_MODE_ONESHOT:
+		pr_info("sh_tmu: %s used for oneshot clock events\n",
+			ced->name);
+		sh_tmu_clock_event_start(p, 0);
+		break;
+	case CLOCK_EVT_MODE_UNUSED:
+		if (!disabled)
+			sh_tmu_disable(p);
+		break;
+	case CLOCK_EVT_MODE_SHUTDOWN:
+	default:
+		break;
+	}
+}
+
+static int sh_tmu_clock_event_next(unsigned long delta,
+				   struct clock_event_device *ced)
+{
+	struct sh_tmu_priv *p = ced_to_sh_tmu(ced);
+
+	BUG_ON(ced->mode != CLOCK_EVT_MODE_ONESHOT);
+
+	/* program new delta value */
+	sh_tmu_set_next(p, delta, 0);
+	return 0;
+}
+
+static void sh_tmu_register_clockevent(struct sh_tmu_priv *p,
+				       char *name, unsigned long rating)
+{
+	struct clock_event_device *ced = &p->ced;
+	int ret;
+
+	memset(ced, 0, sizeof(*ced));
+
+	ced->name = name;
+	ced->features = CLOCK_EVT_FEAT_PERIODIC;
+	ced->features |= CLOCK_EVT_FEAT_ONESHOT;
+	ced->rating = rating;
+	ced->cpumask = cpumask_of(0);
+	ced->set_next_event = sh_tmu_clock_event_next;
+	ced->set_mode = sh_tmu_clock_event_mode;
+
+	ret = setup_irq(p->irqaction.irq, &p->irqaction);
+	if (ret) {
+		pr_err("sh_tmu: failed to request irq %d\n",
+		       p->irqaction.irq);
+		return;
+	}
+
+	pr_info("sh_tmu: %s used for clock events\n", ced->name);
+	clockevents_register_device(ced);
+}
+
+static int sh_tmu_register(struct sh_tmu_priv *p, char *name,
+		    unsigned long clockevent_rating,
+		    unsigned long clocksource_rating)
+{
+	if (clockevent_rating)
+		sh_tmu_register_clockevent(p, name, clockevent_rating);
+	else if (clocksource_rating)
+		sh_tmu_register_clocksource(p, name, clocksource_rating);
+
+	return 0;
+}
+
+static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
+{
+	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	struct resource *res;
+	int irq, ret;
+	ret = -ENXIO;
+
+	memset(p, 0, sizeof(*p));
+	p->pdev = pdev;
+
+	if (!cfg) {
+		dev_err(&p->pdev->dev, "missing platform data\n");
+		goto err0;
+	}
+
+	platform_set_drvdata(pdev, p);
+
+	res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&p->pdev->dev, "failed to get I/O memory\n");
+		goto err0;
+	}
+
+	irq = platform_get_irq(p->pdev, 0);
+	if (irq < 0) {
+		dev_err(&p->pdev->dev, "failed to get irq\n");
+		goto err0;
+	}
+
+	/* map memory, let mapbase point to our channel */
+	p->mapbase = ioremap_nocache(res->start, resource_size(res));
+	if (p->mapbase == NULL) {
+		pr_err("sh_tmu: failed to remap I/O memory\n");
+		goto err0;
+	}
+
+	/* setup data for setup_irq() (too early for request_irq()) */
+	p->irqaction.name = cfg->name;
+	p->irqaction.handler = sh_tmu_interrupt;
+	p->irqaction.dev_id = p;
+	p->irqaction.irq = irq;
+	p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL;
+	p->irqaction.mask = CPU_MASK_NONE;
+
+	/* get hold of clock */
+	p->clk = clk_get(&p->pdev->dev, cfg->clk);
+	if (IS_ERR(p->clk)) {
+		pr_err("sh_tmu: cannot get clock \"%s\"\n", cfg->clk);
+		ret = PTR_ERR(p->clk);
+		goto err1;
+	}
+
+	return sh_tmu_register(p, cfg->name,
+			       cfg->clockevent_rating,
+			       cfg->clocksource_rating);
+ err1:
+	iounmap(p->mapbase);
+ err0:
+	return ret;
+}
+
+static int __devinit sh_tmu_probe(struct platform_device *pdev)
+{
+	struct sh_tmu_priv *p = platform_get_drvdata(pdev);
+	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	int ret;
+
+	if (p) {
+		pr_info("sh_tmu: %s kept as earlytimer\n", cfg->name);
+		return 0;
+	}
+
+	p = kmalloc(sizeof(*p), GFP_KERNEL);
+	if (p == NULL) {
+		dev_err(&pdev->dev, "failed to allocate driver data\n");
+		return -ENOMEM;
+	}
+
+	ret = sh_tmu_setup(p, pdev);
+	if (ret) {
+		kfree(p);
+		platform_set_drvdata(pdev, NULL);
+	}
+	return ret;
+}
+
+static int __devexit sh_tmu_remove(struct platform_device *pdev)
+{
+	return -EBUSY; /* cannot unregister clockevent and clocksource */
+}
+
+static struct platform_driver sh_tmu_device_driver = {
+	.probe		= sh_tmu_probe,
+	.remove		= __devexit_p(sh_tmu_remove),
+	.driver		= {
+		.name	= "sh_tmu",
+	}
+};
+
+static int __init sh_tmu_init(void)
+{
+	return platform_driver_register(&sh_tmu_device_driver);
+}
+
+static void __exit sh_tmu_exit(void)
+{
+	platform_driver_unregister(&sh_tmu_device_driver);
+}
+
+early_platform_init("earlytimer", &sh_tmu_device_driver);
+module_init(sh_tmu_init);
+module_exit(sh_tmu_exit);
+
+MODULE_AUTHOR("Magnus Damm");
+MODULE_DESCRIPTION("SuperH TMU Timer Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h
new file mode 100644
index 0000000000000..b1195e8a37d03
--- /dev/null
+++ b/include/linux/sh_tmu.h
@@ -0,0 +1,13 @@
+#ifndef __SH_TMU_H__
+#define __SH_TMU_H__
+
+struct sh_tmu_config {
+	char *name;
+	unsigned long channel_offset;
+	int timer_bit;
+	char *clk;
+	unsigned long clockevent_rating;
+	unsigned long clocksource_rating;
+};
+
+#endif /* __SH_TMU_H__ */
-- 
GitLab


From d43a41bf8b504a1d9f0b4ce7e17d803f4ef39d84 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 1 May 2009 06:58:52 +0000
Subject: [PATCH 0811/2198] sh: TMU platform data for sh7722

This patch adds TMU platform data for sh7722. Only clockevent
mode is enabled for now, clocksource requires this patch:
"clocksource: setup mult_orig in clocksource_enable()"

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |  1 +
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 99 ++++++++++++++++++++++++++
 2 files changed, 100 insertions(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 1f74737c4f20e..c3e9455498a0a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -423,6 +423,7 @@ config CPU_SUBTYPE_SH7722
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
 	select SYS_SUPPORTS_CMT
+	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7366
 	bool "Support SH7366 processor"
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 793c50da6a842..512735c5cc868 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/uio_driver.h>
 #include <linux/sh_cmt.h>
+#include <linux/sh_tmu.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -209,6 +210,98 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_tmu_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_tmu_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 0, /* disabled for now */
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_tmu_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffe00000,
@@ -243,6 +336,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *sh7722_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&rtc_device,
 	&usbf_device,
 	&iic_device,
@@ -271,6 +367,9 @@ __initcall(sh7722_devices_setup);
 
 static struct platform_device *sh7722_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
GitLab


From 9a8709d44139748fe2e0ab56d20d8c384c8b65ad Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Sat, 2 May 2009 00:25:11 +0400
Subject: [PATCH 0812/2198] x86: uv - prevent NULL dereference in
 uv_system_init()

We may reach NULL dereference oops if kmalloc failed.
Prevent it with explicit BUG_ON.

[ Impact: more controlled assert in 'impossible' scenario ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Acked-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <20090501202511.GE4633@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 873bf7121e894..9d9e2281a829b 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -569,15 +569,18 @@ void __init uv_system_init(void)
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+	BUG_ON(!uv_blade_info);
 
 	get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
 
 	bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
 	uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+	BUG_ON(!uv_node_to_blade);
 	memset(uv_node_to_blade, 255, bytes);
 
 	bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
 	uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+	BUG_ON(!uv_cpu_to_blade);
 	memset(uv_cpu_to_blade, 255, bytes);
 
 	blade = 0;
-- 
GitLab


From 46a12f7426d71cabc08972cf8d3ffdd441d26a3a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 17:57:17 +0900
Subject: [PATCH 0813/2198] sh: Consolidate MTU2/CMT/TMU timer platform data.

All of the SH timers use a roughly identical structure for platform data,
which presently is broken out for each block. Consolidate all of these
definitions, as there is no reason for them to be broken out in the first
place.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh2/setup-sh7619.c  |  6 +++---
 arch/sh/kernel/cpu/sh2a/setup-mxg.c    |  8 ++++----
 arch/sh/kernel/cpu/sh2a/setup-sh7201.c |  8 ++++----
 arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 11 +++++------
 arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 13 ++++++-------
 arch/sh/kernel/cpu/sh4a/setup-sh7343.c |  4 ++--
 arch/sh/kernel/cpu/sh4a/setup-sh7366.c |  4 ++--
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 11 +++++------
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c |  4 ++--
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c |  4 ++--
 drivers/clocksource/sh_cmt.c           | 14 +++++++-------
 drivers/clocksource/sh_mtu2.c          | 14 +++++++-------
 drivers/clocksource/sh_tmu.c           | 14 +++++++-------
 include/linux/sh_cmt.h                 | 13 -------------
 include/linux/sh_mtu2.h                | 12 ------------
 include/linux/sh_timer.h               | 13 +++++++++++++
 include/linux/sh_tmu.h                 | 13 -------------
 17 files changed, 69 insertions(+), 97 deletions(-)
 delete mode 100644 include/linux/sh_cmt.h
 delete mode 100644 include/linux/sh_mtu2.h
 create mode 100644 include/linux/sh_timer.h
 delete mode 100644 include/linux/sh_tmu.h

diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index d70c263eb07e4..94ac27fc2237d 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -111,7 +111,7 @@ static struct platform_device eth_device = {
 	.resource = eth_resources,
 };
 
-static struct sh_cmt_config cmt0_platform_data = {
+static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
@@ -143,7 +143,7 @@ static struct platform_device cmt0_device = {
 	.num_resources	= ARRAY_SIZE(cmt0_resources),
 };
 
-static struct sh_cmt_config cmt1_platform_data = {
+static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index 870030aa05bc7..a452d9649069e 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -11,7 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 
 enum {
 	UNUSED = 0,
@@ -114,7 +114,7 @@ static struct intc_mask_reg mask_registers[] __initdata = {
 static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups,
 			 mask_registers, prio_registers, NULL);
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -145,7 +145,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
@@ -176,7 +176,7 @@ static struct platform_device mtu2_1_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
 };
 
-static struct sh_mtu2_config mtu2_2_platform_data = {
+static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 074aa9062e4b3..772358b7685ed 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -12,7 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -251,7 +251,7 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -282,7 +282,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
@@ -313,7 +313,7 @@ static struct platform_device mtu2_1_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
 };
 
-static struct sh_mtu2_config mtu2_2_platform_data = {
+static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index 3448164b5734d..d7493418ba60b 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -11,8 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_cmt.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -208,7 +207,7 @@ static struct platform_device sci_device = {
 	},
 };
 
-static struct sh_cmt_config cmt0_platform_data = {
+static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
@@ -240,7 +239,7 @@ static struct platform_device cmt0_device = {
 	.num_resources	= ARRAY_SIZE(cmt0_resources),
 };
 
-static struct sh_cmt_config cmt1_platform_data = {
+static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
@@ -272,7 +271,7 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -303,7 +302,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index e700559b6b895..2fc6bff5c5fb1 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -12,8 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
-#include <linux/sh_cmt.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 
 enum {
@@ -168,7 +167,7 @@ static struct platform_device sci_device = {
 	},
 };
 
-static struct sh_cmt_config cmt0_platform_data = {
+static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
@@ -200,7 +199,7 @@ static struct platform_device cmt0_device = {
 	.num_resources	= ARRAY_SIZE(cmt0_resources),
 };
 
-static struct sh_cmt_config cmt1_platform_data = {
+static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
@@ -232,7 +231,7 @@ static struct platform_device cmt1_device = {
 	.num_resources	= ARRAY_SIZE(cmt1_resources),
 };
 
-static struct sh_mtu2_config mtu2_0_platform_data = {
+static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
@@ -263,7 +262,7 @@ static struct platform_device mtu2_0_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_0_resources),
 };
 
-static struct sh_mtu2_config mtu2_1_platform_data = {
+static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
@@ -294,7 +293,7 @@ static struct platform_device mtu2_1_device = {
 	.num_resources	= ARRAY_SIZE(mtu2_1_resources),
 };
 
-static struct sh_mtu2_config mtu2_2_platform_data = {
+static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index cb5b4db1ca259..f327b7eaf47c0 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -12,7 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 
 static struct resource iic0_resources[] = {
@@ -141,7 +141,7 @@ static struct platform_device jpu_device = {
 	.num_resources	= ARRAY_SIZE(jpu_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index 2a771f48e9e09..7361ea989b96f 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -14,7 +14,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 
 static struct resource iic_resources[] = {
@@ -148,7 +148,7 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 512735c5cc868..624e8e5a605c2 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -13,8 +13,7 @@
 #include <linux/serial_sci.h>
 #include <linux/mm.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
-#include <linux/sh_tmu.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -178,7 +177,7 @@ static struct platform_device jpu_device = {
 	.num_resources	= ARRAY_SIZE(jpu_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
@@ -210,7 +209,7 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
-static struct sh_tmu_config tmu0_platform_data = {
+static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
@@ -241,7 +240,7 @@ static struct platform_device tmu0_device = {
 	.num_resources	= ARRAY_SIZE(tmu0_resources),
 };
 
-static struct sh_tmu_config tmu1_platform_data = {
+static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
@@ -272,7 +271,7 @@ static struct platform_device tmu1_device = {
 	.num_resources	= ARRAY_SIZE(tmu1_resources),
 };
 
-static struct sh_tmu_config tmu2_platform_data = {
+static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index dbb44949ed195..fb9b5427a068c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -13,7 +13,7 @@
 #include <linux/mm.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -101,7 +101,7 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 8429396acb591..e074951b88b9d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -18,7 +18,7 @@
 #include <linux/mm.h>
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
@@ -230,7 +230,7 @@ static struct platform_device veu1_device = {
 	.num_resources	= ARRAY_SIZE(veu1_resources),
 };
 
-static struct sh_cmt_config cmt_platform_data = {
+static struct sh_timer_config cmt_platform_data = {
 	.name = "CMT",
 	.channel_offset = 0x60,
 	.timer_bit = 5,
diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index 4ff1508e5ab70..aeb8c9b27b5b9 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -28,7 +28,7 @@
 #include <linux/err.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
-#include <linux/sh_cmt.h>
+#include <linux/sh_timer.h>
 
 struct sh_cmt_priv {
 	void __iomem *mapbase;
@@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(sh_cmt_lock);
 
 static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -83,7 +83,7 @@ static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr)
 static inline void sh_cmt_write(struct sh_cmt_priv *p, int reg_nr,
 				unsigned long value)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -131,7 +131,7 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p,
 
 static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	unsigned long flags, value;
 
 	/* start stop register shared by multiple timer channels */
@@ -149,7 +149,7 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start)
 
 static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate)
 {
-	struct sh_cmt_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	int ret;
 
 	/* enable clock */
@@ -560,7 +560,7 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name,
 
 static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 {
-	struct sh_cmt_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	struct resource *res;
 	int irq, ret;
 	ret = -ENXIO;
@@ -638,7 +638,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev)
 static int __devinit sh_cmt_probe(struct platform_device *pdev)
 {
 	struct sh_cmt_priv *p = platform_get_drvdata(pdev);
-	struct sh_cmt_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	int ret;
 
 	if (p) {
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
index 420566f4c5011..ef02185a827a5 100644
--- a/drivers/clocksource/sh_mtu2.c
+++ b/drivers/clocksource/sh_mtu2.c
@@ -28,7 +28,7 @@
 #include <linux/irq.h>
 #include <linux/err.h>
 #include <linux/clockchips.h>
-#include <linux/sh_mtu2.h>
+#include <linux/sh_timer.h>
 
 struct sh_mtu2_priv {
 	void __iomem *mapbase;
@@ -63,7 +63,7 @@ static unsigned long mtu2_reg_offs[] = {
 
 static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -81,7 +81,7 @@ static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr)
 static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr,
 				unsigned long value)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -100,7 +100,7 @@ static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr,
 
 static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	unsigned long flags, value;
 
 	/* start stop register shared by multiple timer channels */
@@ -118,7 +118,7 @@ static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start)
 
 static int sh_mtu2_enable(struct sh_mtu2_priv *p)
 {
-	struct sh_mtu2_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	int ret;
 
 	/* enable clock */
@@ -243,7 +243,7 @@ int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
 
 static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
 {
-	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	struct resource *res;
 	int irq, ret;
 	ret = -ENXIO;
@@ -303,7 +303,7 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev)
 static int __devinit sh_mtu2_probe(struct platform_device *pdev)
 {
 	struct sh_mtu2_priv *p = platform_get_drvdata(pdev);
-	struct sh_mtu2_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	int ret;
 
 	if (p) {
diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c
index 21bd77aa6a34c..d6ea4398bf623 100644
--- a/drivers/clocksource/sh_tmu.c
+++ b/drivers/clocksource/sh_tmu.c
@@ -29,7 +29,7 @@
 #include <linux/err.h>
 #include <linux/clocksource.h>
 #include <linux/clockchips.h>
-#include <linux/sh_tmu.h>
+#include <linux/sh_timer.h>
 
 struct sh_tmu_priv {
 	void __iomem *mapbase;
@@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(sh_tmu_lock);
 
 static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -69,7 +69,7 @@ static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr)
 static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr,
 				unsigned long value)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	void __iomem *base = p->mapbase;
 	unsigned long offs;
 
@@ -88,7 +88,7 @@ static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr,
 
 static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	unsigned long flags, value;
 
 	/* start stop register shared by multiple timer channels */
@@ -106,7 +106,7 @@ static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start)
 
 static int sh_tmu_enable(struct sh_tmu_priv *p)
 {
-	struct sh_tmu_config *cfg = p->pdev->dev.platform_data;
+	struct sh_timer_config *cfg = p->pdev->dev.platform_data;
 	int ret;
 
 	/* enable clock */
@@ -345,7 +345,7 @@ static int sh_tmu_register(struct sh_tmu_priv *p, char *name,
 
 static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
 {
-	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	struct resource *res;
 	int irq, ret;
 	ret = -ENXIO;
@@ -407,7 +407,7 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev)
 static int __devinit sh_tmu_probe(struct platform_device *pdev)
 {
 	struct sh_tmu_priv *p = platform_get_drvdata(pdev);
-	struct sh_tmu_config *cfg = pdev->dev.platform_data;
+	struct sh_timer_config *cfg = pdev->dev.platform_data;
 	int ret;
 
 	if (p) {
diff --git a/include/linux/sh_cmt.h b/include/linux/sh_cmt.h
deleted file mode 100644
index 68cacde5954f4..0000000000000
--- a/include/linux/sh_cmt.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __SH_CMT_H__
-#define __SH_CMT_H__
-
-struct sh_cmt_config {
-	char *name;
-	unsigned long channel_offset;
-	int timer_bit;
-	char *clk;
-	unsigned long clockevent_rating;
-	unsigned long clocksource_rating;
-};
-
-#endif /* __SH_CMT_H__ */
diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h
deleted file mode 100644
index a98876340ff41..0000000000000
--- a/include/linux/sh_mtu2.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef __SH_MTU2_H__
-#define __SH_MTU2_H__
-
-struct sh_mtu2_config {
-	char *name;
-	int channel_offset;
-	int timer_bit;
-	char *clk;
-	unsigned long clockevent_rating;
-};
-
-#endif /* __SH_MTU2_H__ */
diff --git a/include/linux/sh_timer.h b/include/linux/sh_timer.h
new file mode 100644
index 0000000000000..864bd56bd3b0b
--- /dev/null
+++ b/include/linux/sh_timer.h
@@ -0,0 +1,13 @@
+#ifndef __SH_TIMER_H__
+#define __SH_TIMER_H__
+
+struct sh_timer_config {
+	char *name;
+	long channel_offset;
+	int timer_bit;
+	char *clk;
+	unsigned long clockevent_rating;
+	unsigned long clocksource_rating;
+};
+
+#endif /* __SH_TIMER_H__ */
diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h
deleted file mode 100644
index b1195e8a37d03..0000000000000
--- a/include/linux/sh_tmu.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef __SH_TMU_H__
-#define __SH_TMU_H__
-
-struct sh_tmu_config {
-	char *name;
-	unsigned long channel_offset;
-	int timer_bit;
-	char *clk;
-	unsigned long clockevent_rating;
-	unsigned long clocksource_rating;
-};
-
-#endif /* __SH_TMU_H__ */
-- 
GitLab


From d1fcc0a8db5e47c1abaa783a3e83dbf5f2184969 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:05:42 +0900
Subject: [PATCH 0814/2198] clocksource: sh_mtu2/cmt_register() should be
 static.

Neither of these need to be exported, so just make them static.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/clocksource/sh_cmt.c  | 6 +++---
 drivers/clocksource/sh_mtu2.c | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c
index aeb8c9b27b5b9..cf56a2af5fe11 100644
--- a/drivers/clocksource/sh_cmt.c
+++ b/drivers/clocksource/sh_cmt.c
@@ -537,9 +537,9 @@ static void sh_cmt_register_clockevent(struct sh_cmt_priv *p,
 	clockevents_register_device(ced);
 }
 
-int sh_cmt_register(struct sh_cmt_priv *p, char *name,
-		    unsigned long clockevent_rating,
-		    unsigned long clocksource_rating)
+static int sh_cmt_register(struct sh_cmt_priv *p, char *name,
+			   unsigned long clockevent_rating,
+			   unsigned long clocksource_rating)
 {
 	if (p->width == (sizeof(p->max_match_value) * 8))
 		p->max_match_value = ~0;
diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c
index ef02185a827a5..d1ae75454d10e 100644
--- a/drivers/clocksource/sh_mtu2.c
+++ b/drivers/clocksource/sh_mtu2.c
@@ -232,8 +232,8 @@ static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p,
 	clockevents_register_device(ced);
 }
 
-int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
-		     unsigned long clockevent_rating)
+static int sh_mtu2_register(struct sh_mtu2_priv *p, char *name,
+			    unsigned long clockevent_rating)
 {
 	if (clockevent_rating)
 		sh_mtu2_register_clockevent(p, name, clockevent_rating);
-- 
GitLab


From 938edae11ee3a7b20b6d754074a0f2c2edc4534b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:12:26 +0900
Subject: [PATCH 0815/2198] sh: select both GENERIC_TIME and
 GENERIC_CLOCKEVENTS.

Now that the rest of the timers that didn't support clockevents have been
rewritten, both of these can be enabled by default.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c3e9455498a0a..565f39042ad94 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -78,7 +78,7 @@ config GENERIC_TIME
 	def_bool y
 
 config GENERIC_CLOCKEVENTS
-	def_bool n
+	def_bool y
 
 config GENERIC_CLOCKEVENTS_BROADCAST
 	bool
@@ -469,8 +469,6 @@ config SH_TMU
 	bool "TMU timer support"
 	depends on CPU_SH3 || CPU_SH4
 	default y
-	select GENERIC_TIME
-	select GENERIC_CLOCKEVENTS
 	help
 	  This enables the use of the TMU as the system timer.
 
@@ -478,8 +476,6 @@ config SH_TIMER_TMU
 	bool "TMU timer driver"
 	depends on !SH_TMU && SYS_SUPPORTS_TMU
 	default y
-	select GENERIC_TIME
-	select GENERIC_CLOCKEVENTS
 	help
 	  This enables the build of the TMU timer driver.
 
@@ -487,8 +483,6 @@ config SH_TIMER_CMT
 	bool "CMT timer driver"
 	depends on SYS_SUPPORTS_CMT
 	default y
-	select GENERIC_CLOCKEVENTS
-	select GENERIC_TIME
 	help
 	  This enables build of the CMT timer driver.
 
@@ -496,7 +490,6 @@ config SH_TIMER_MTU2
 	bool "MTU2 timer driver"
 	depends on SYS_SUPPORTS_MTU2
 	default y
-	select GENERIC_CLOCKEVENTS
 	help
 	  This enables build of the MTU2 timer driver.
 
-- 
GitLab


From dec56e6312434b2536fedf9d7e9e73d666aaf0a8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:18:14 +0900
Subject: [PATCH 0816/2198] sh: Kill off the now unused ARCH_USES_GETTIMEOFFSET
 code.

Now that the stragglers (MTU2/CMT/etc.) have been rewritten and we are
selecting both GENERIC_TIME and GENERIC_CLOCKEVENTS, the get_offset()
timer op is completely unused. As a result, we are now able to kill off
the ARCH_USES_GETTIMEOFFSET references.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/timer.h | 10 ----------
 arch/sh/kernel/time_32.c    |  9 ---------
 2 files changed, 19 deletions(-)

diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 581e3fe3fe040..cb16645d83972 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -9,9 +9,6 @@ struct sys_timer_ops {
 	int (*init)(void);
 	int (*start)(void);
 	int (*stop)(void);
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-	unsigned long (*get_offset)(void);
-#endif
 };
 
 struct sys_timer {
@@ -26,13 +23,6 @@ struct sys_timer {
 extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-static inline unsigned long get_timer_offset(void)
-{
-	return sys_timer->ops->get_offset();
-}
-#endif
-
 /* arch/sh/kernel/timers/timer.c */
 struct sys_timer *get_sys_timer(void);
 
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c
index 9d34dff149942..d41ca4cf20cf9 100644
--- a/arch/sh/kernel/time_32.c
+++ b/arch/sh/kernel/time_32.c
@@ -83,13 +83,6 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
-u32 arch_gettimeoffset(void)
-{
-	return get_timer_offset() * 1000;
-}
-#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */
-
 /* last time the RTC clock got updated */
 static long last_rtc_update;
 
@@ -185,7 +178,6 @@ struct clocksource clocksource_sh = {
 	.name		= "SuperH",
 };
 
-#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET
 unsigned long long sched_clock(void)
 {
 	unsigned long long cycles;
@@ -197,7 +189,6 @@ unsigned long long sched_clock(void)
 	cycles = clocksource_sh.read(&clocksource_sh);
 	return cyc2ns(&clocksource_sh, cycles);
 }
-#endif
 
 static void __init sh_late_time_init(void)
 {
-- 
GitLab


From 25483efeb2e56521e418a59fa93401be156dc3bb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 3 May 2009 18:29:27 +0900
Subject: [PATCH 0817/2198] sh: Move dummy clockevents broadcast timer to its
 new home.

The old arch/sh/kernel/timers/ directly will be going away completely
once the rest of the TMU users are migrated, so move the dummy broadcast
driver up a level in preparation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/Makefile_32                                | 2 ++
 arch/sh/kernel/Makefile_64                                | 2 ++
 arch/sh/kernel/{timers/timer-broadcast.c => localtimer.c} | 0
 arch/sh/kernel/timers/Makefile                            | 2 --
 4 files changed, 4 insertions(+), 2 deletions(-)
 rename arch/sh/kernel/{timers/timer-broadcast.c => localtimer.c} (100%)

diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32
index 82a3a150c00da..5b1d4d0fe04d8 100644
--- a/arch/sh/kernel/Makefile_32
+++ b/arch/sh/kernel/Makefile_32
@@ -32,4 +32,6 @@ obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_DUMP_CODE)		+= disassemble.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o
 
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= localtimer.o
+
 EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index fe425d7f6871b..d256c77469574 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -17,4 +17,6 @@ obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_IO_TRAPPED)	+= io_trapped.o
 obj-$(CONFIG_GENERIC_GPIO)	+= gpio.o
 
+obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= localtimer.o
+
 EXTRA_CFLAGS += -Werror
diff --git a/arch/sh/kernel/timers/timer-broadcast.c b/arch/sh/kernel/localtimer.c
similarity index 100%
rename from arch/sh/kernel/timers/timer-broadcast.c
rename to arch/sh/kernel/localtimer.c
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 44156df0bdeaf..8bceba59e76f5 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -5,5 +5,3 @@
 obj-y	:= timer.o
 
 obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
-
-obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST)	+= timer-broadcast.o
-- 
GitLab


From b82914ce33146186d554b0f5c41e4e13693614ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 18:54:32 +0200
Subject: [PATCH 0818/2198] perf_counter: round-robin per-CPU counters too

This used to be unstable when we had the rq->lock dependencies,
but now that they are that of the past we can turn on percpu
counter RR too.

[ Impact: handle counter over-commit for per-CPU counters too ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8660ae5795300..b9679c36bcc28 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1069,18 +1069,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
-	const int rotate_percpu = 0;
 
-	if (rotate_percpu)
-		perf_counter_cpu_sched_out(cpuctx);
+	perf_counter_cpu_sched_out(cpuctx);
 	perf_counter_task_sched_out(curr, cpu);
 
-	if (rotate_percpu)
-		rotate_ctx(&cpuctx->ctx);
+	rotate_ctx(&cpuctx->ctx);
 	rotate_ctx(ctx);
 
-	if (rotate_percpu)
-		perf_counter_cpu_sched_in(cpuctx, cpu);
+	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
 }
 
-- 
GitLab


From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 4 May 2009 18:47:44 +0200
Subject: [PATCH 0819/2198] perf_counter: x86: fixup nmi_watchdog vs
 perf_counter boo-boo

Invert the atomic_inc_not_zero() test so that we will indeed detect the
first activation.

Also rename the global num_counters, since its easy to confuse with
x86_pmu.num_counters.

[ Impact: fix non-working perfcounters on AMD CPUs, cleanup ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241455664.7620.4938.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index d4c0cc9d32637..196b58f044483 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -171,7 +171,7 @@ x86_perf_counter_update(struct perf_counter *counter,
 	return new_raw_count;
 }
 
-static atomic_t num_counters;
+static atomic_t active_counters;
 static DEFINE_MUTEX(pmc_reserve_mutex);
 
 static bool reserve_pmc_hardware(void)
@@ -224,7 +224,7 @@ static void release_pmc_hardware(void)
 
 static void hw_perf_counter_destroy(struct perf_counter *counter)
 {
-	if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) {
+	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) {
 		release_pmc_hardware();
 		mutex_unlock(&pmc_reserve_mutex);
 	}
@@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		return -ENODEV;
 
 	err = 0;
-	if (atomic_inc_not_zero(&num_counters)) {
+	if (!atomic_inc_not_zero(&active_counters)) {
 		mutex_lock(&pmc_reserve_mutex);
-		if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware())
+		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware())
 			err = -EBUSY;
 		else
-			atomic_inc(&num_counters);
+			atomic_inc(&active_counters);
 		mutex_unlock(&pmc_reserve_mutex);
 	}
 	if (err)
@@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
 		hwc->nmi = 1;
 
-	hwc->irq_period		= hw_event->irq_period;
+	hwc->irq_period	= hw_event->irq_period;
 	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
 		hwc->irq_period = x86_pmu.max_period;
 
@@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	struct pt_regs *regs;
 	int ret;
 
-	if (!atomic_read(&num_counters))
+	if (!atomic_read(&active_counters))
 		return NOTIFY_DONE;
 
 	switch (cmd) {
-- 
GitLab


From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:13:30 +0200
Subject: [PATCH 0820/2198] perf_counter: initialize the per-cpu context
 earlier

percpu scheduling for perfcounters wants to take the context lock,
but that lock first needs to be initialized. Currently it is an
early_initcall() - but that is too late, the task tick runs much
sooner than that.

Call it explicitly from the scheduler init sequence instead.

[ Impact: fix access-before-init crash ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 5 ++++-
 kernel/perf_counter.c        | 5 +----
 kernel/sched.c               | 5 ++++-
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f776851f8c4b1..a356fa69796c5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -573,6 +573,8 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_priv;
 
+extern void perf_counter_init(void);
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -600,9 +602,10 @@ perf_counter_mmap(unsigned long addr, unsigned long len,
 
 static inline void
 perf_counter_munmap(unsigned long addr, unsigned long len,
-		    unsigned long pgoff, struct file *file) 		{ }
+		    unsigned long pgoff, struct file *file)		{ }
 
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
+static inline void perf_counter_init(void)				{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index b9679c36bcc28..fcdafa234a5d9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3265,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = {
 	.notifier_call		= perf_cpu_notify,
 };
 
-static int __init perf_counter_init(void)
+void __init perf_counter_init(void)
 {
 	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	register_cpu_notifier(&perf_cpu_nb);
-
-	return 0;
 }
-early_initcall(perf_counter_init);
 
 static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index 2f600e30dcf05..a728976a3a6c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -8996,7 +8997,7 @@ void __init sched_init(void)
 		 * 1024) and two child groups A0 and A1 (of weight 1024 each),
 		 * then A0's share of the cpu resource is:
 		 *
-		 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
 		 *
 		 * We achieve this by letting init_task_group's tasks sit
 		 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9097,6 +9098,8 @@ void __init sched_init(void)
 	alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
 
+	perf_counter_init();
+
 	scheduler_running = 1;
 }
 
-- 
GitLab


From 1dce8d99b85aba6eddb8b8260baea944922e6fe7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:23:18 +0200
Subject: [PATCH 0821/2198] perf_counter: convert perf_resource_mutex to a
 spinlock

Now percpu counters can be initialized very early. But the init
sequence uses mutex_lock(). Fortunately, perf_resource_mutex should
be a spinlock anyway, so convert it.

[ Impact: fix crash due to early init mutex use ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fcdafa234a5d9..5f86a1156c94c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,9 +46,9 @@ static atomic_t nr_comm_tracking __read_mostly;
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 
 /*
- * Mutex for (sysadmin-configurable) counter reservations:
+ * Lock for (sysadmin-configurable) counter reservations:
  */
-static DEFINE_MUTEX(perf_resource_mutex);
+static DEFINE_SPINLOCK(perf_resource_lock);
 
 /*
  * Architecture provided APIs - weak aliases:
@@ -3207,9 +3207,9 @@ static void __cpuinit perf_counter_init_cpu(int cpu)
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 	__perf_counter_init_context(&cpuctx->ctx, NULL);
 
-	mutex_lock(&perf_resource_mutex);
+	spin_lock(&perf_resource_lock);
 	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu;
-	mutex_unlock(&perf_resource_mutex);
+	spin_unlock(&perf_resource_lock);
 
 	hw_perf_counter_setup(cpu);
 }
@@ -3292,7 +3292,7 @@ perf_set_reserve_percpu(struct sysdev_class *class,
 	if (val > perf_max_counters)
 		return -EINVAL;
 
-	mutex_lock(&perf_resource_mutex);
+	spin_lock(&perf_resource_lock);
 	perf_reserved_percpu = val;
 	for_each_online_cpu(cpu) {
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
@@ -3302,7 +3302,7 @@ perf_set_reserve_percpu(struct sysdev_class *class,
 		cpuctx->max_pertask = mpt;
 		spin_unlock_irq(&cpuctx->ctx.lock);
 	}
-	mutex_unlock(&perf_resource_mutex);
+	spin_unlock(&perf_resource_lock);
 
 	return count;
 }
@@ -3324,9 +3324,9 @@ perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count)
 	if (val > 1)
 		return -EINVAL;
 
-	mutex_lock(&perf_resource_mutex);
+	spin_lock(&perf_resource_lock);
 	perf_overcommit = val;
-	mutex_unlock(&perf_resource_mutex);
+	spin_unlock(&perf_resource_lock);
 
 	return count;
 }
-- 
GitLab


From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 4 May 2009 19:04:09 +0200
Subject: [PATCH 0822/2198] perf_counter: fix fixed-purpose counter support on
 v2 Intel-PERFMON

Fixed-purpose counters stopped working in a simple 'perf stat ls' run:

   <not counted>  cache references
   <not counted>  cache misses

Due to:

  ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx()

Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the
fixed-purpose counters are utilized.

But on v2 perfmon this field is not set (despite there being
fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose
counters to at least three.

[ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ]

Cc: Robert Richter <robert.richter@amd.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 196b58f044483..a6878b0798e5b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -962,7 +962,13 @@ static int intel_pmu_init(void)
 	x86_pmu = intel_pmu;
 	x86_pmu.version = version;
 	x86_pmu.num_counters = eax.split.num_counters;
-	x86_pmu.num_counters_fixed = edx.split.num_counters_fixed;
+
+	/*
+	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
+	 * assume at least 3 counters:
+	 */
+	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
 	x86_pmu.counter_bits = eax.split.bit_width;
 	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
-- 
GitLab


From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 01:15:24 -0400
Subject: [PATCH 0823/2198] ring-buffer: export symbols

I'm adding a module to do a series of tests on the ring buffer as well
as benchmarks. This module needs to have more of the ring buffer API
exported. There's nothing wrong with reading the ring buffer from a
module.

[ Impact: allow modules to read pages from the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f4cc59040ebf8..3e86da9b2a09e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
 	return bpage;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
 
 /**
  * ring_buffer_free_read_page - free an allocated read page
@@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
 {
 	free_page((unsigned long)data);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
 /**
  * ring_buffer_read_page - extract a page from the ring buffer
@@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
  out:
 	return ret;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
-- 
GitLab


From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 13:43:37 -0400
Subject: [PATCH 0824/2198] ring-buffer: add counters for commit overrun and
 nmi dropped entries

The WARN_ON in the ring buffer when a commit is preempted and the
buffer is filled by preceding writes can happen in normal operations.
The WARN_ON makes it look like a bug, not to mention, because
it does not stop tracing and calls printk which can also recurse, this
is prone to deadlock (the WARN_ON is not in a position to recurse).

This patch removes the WARN_ON and replaces it with a counter that
can be retrieved by a tracer. This counter is called commit_overrun.

While at it, I added a nmi_dropped counter to count any time an NMI entry
is dropped because the NMI could not take the spinlock.

[ Impact: prevent deadlock by printing normal case warning ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h |  2 ++
 kernel/trace/ring_buffer.c  | 52 ++++++++++++++++++++++++++++++++++---
 2 files changed, 51 insertions(+), 3 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 1c2f80911fbe2..f1345828c7c58 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -153,6 +153,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer);
 unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
 
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
 void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3e86da9b2a09e..26e1359fe193e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -402,6 +402,8 @@ struct ring_buffer_per_cpu {
 	struct buffer_page		*tail_page;	/* write to tail */
 	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
+	unsigned long			nmi_dropped;
+	unsigned long			commit_overrun;
 	unsigned long			overrun;
 	unsigned long			entries;
 	u64				write_stamp;
@@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * simply fail.
 		 */
 		if (unlikely(in_nmi())) {
-			if (!__raw_spin_trylock(&cpu_buffer->lock))
+			if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+				cpu_buffer->nmi_dropped++;
 				goto out_reset;
+			}
 		} else
 			__raw_spin_lock(&cpu_buffer->lock);
 
@@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 * about it.
 		 */
 		if (unlikely(next_page == commit_page)) {
-			/* This can easily happen on small ring buffers */
-			WARN_ON_ONCE(buffer->pages > 2);
+			cpu_buffer->commit_overrun++;
 			goto out_reset;
 		}
 
@@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
+/**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = cpu_buffer->nmi_dropped;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret;
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		return 0;
+
+	cpu_buffer = buffer->buffers[cpu];
+	ret = cpu_buffer->commit_overrun;
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
 /**
  * ring_buffer_entries - get the number of entries in a buffer
  * @buffer: The ring buffer
@@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
+	cpu_buffer->nmi_dropped = 0;
+	cpu_buffer->commit_overrun = 0;
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;
 
-- 
GitLab


From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 29 Apr 2009 18:03:45 -0400
Subject: [PATCH 0825/2198] tracing: export stats of ring buffers to userspace

This patch adds stats to the ftrace ring buffers:

 # cat /debugfs/tracing/per_cpu/cpu0/stats
 entries: 42360
 overrun: 30509326
 commit overrun: 0
 nmi dropped: 0

Where entries are the total number of data entries in the buffer.

overrun is the number of entries not consumed and were overwritten by
the writer.

commit overrun is the number of entries dropped due to nested writers
wrapping the buffer before the initial writer finished the commit.

nmi dropped is the number of entries dropped due to the ring buffer
lock being held when an nmi was going to write to the ring buffer.
Note, this field will be meaningless and will go away when the ring
buffer becomes lockless.

[ Impact: let userspace know what is happening in the ring buffers ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f5427e0fc982a..74df029056b04 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = {
 	.llseek		= no_llseek,
 };
 
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	unsigned long cpu = (unsigned long)filp->private_data;
+	struct trace_array *tr = &global_trace;
+	struct trace_seq *s;
+	unsigned long cnt;
+
+	s = kmalloc(sizeof(*s), GFP_ATOMIC);
+	if (!s)
+		return ENOMEM;
+
+	trace_seq_init(s);
+
+	cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "entries: %ld\n", cnt);
+
+	cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+
+	cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+	trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+	count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+
+	kfree(s);
+
+	return count;
+}
+
+static const struct file_operations tracing_stats_fops = {
+	.open		= tracing_open_generic,
+	.read		= tracing_stats_read,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu)
 
 	trace_create_file("trace_pipe_raw", 0444, d_cpu,
 			(void *) cpu, &tracing_buffers_fops);
+
+	trace_create_file("stats", 0444, d_cpu,
+			(void *) cpu, &tracing_stats_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
-- 
GitLab


From 60aa605dfce2976e54fa76e805ab0f221372d4d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:21 +0200
Subject: [PATCH 0826/2198] sched: rt: document the risk of small values in the
 bandwidth settings

Thomas noted that we should disallow sysctl_sched_rt_runtime == 0 for
(!RT_GROUP) since the root group always has some RT tasks in it.

Further, update the documentation to inspire clue.

[ Impact: exclude corner-case sysctl_sched_rt_runtime value ]

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090505155436.863098054@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/scheduler/sched-rt-group.txt | 18 ++++++++++++++++++
 kernel/sched.c                             |  7 +++++++
 2 files changed, 25 insertions(+)

diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index 5ba4d3fc625a4..eb74b014a3f8b 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -4,6 +4,7 @@
 CONTENTS
 ========
 
+0. WARNING
 1. Overview
   1.1 The problem
   1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
 3. Future plans
 
 
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system, the knobs are
+ root only and assumes root knows what he is doing.
+
+Most notable:
+
+ * very small values in sched_rt_period_us can result in an unstable
+   system when the period is smaller than either the available hrtimer
+   resolution, or the time it takes to handle the budget refresh itself.
+
+ * very small values in sched_rt_runtime_us can result in an unstable
+   system when the runtime is so small the system has difficulty making
+   forward progress (NOTE: the migration thread and kstopmachine both
+   are real-time processes).
+
 1. Overview
 ===========
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 54d67b94f1a9f..2a43a581ead32 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9917,6 +9917,13 @@ static int sched_rt_global_constraints(void)
 	if (sysctl_sched_rt_period <= 0)
 		return -EINVAL;
 
+	/*
+	 * There's always some RT tasks in the root group
+	 * -- migration, kstopmachine etc..
+	 */
+	if (sysctl_sched_rt_runtime == 0)
+		return -EBUSY;
+
 	spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
 		struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-- 
GitLab


From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:22 +0200
Subject: [PATCH 0827/2198] perf_counter: uncouple data_head updates from
 wakeups

Keep data_head up-to-date irrespective of notifications. This fixes
the case where you disable a counter and don't get a notification for
the last few pending events, and it also allows polling usage.

[ Impact: increase precision of perfcounter mmap-ed fields ]

Suggested-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090505155436.925084300@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 +++-
 kernel/perf_counter.c        | 20 +++++++++-----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a356fa69796c5..17b63105f2aa7 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -362,9 +362,11 @@ struct perf_mmap_data {
 	atomic_t			head;		/* write position    */
 	atomic_t			events;		/* event limit       */
 
-	atomic_t			wakeup_head;	/* completed head    */
+	atomic_t			done_head;	/* completed head    */
 	atomic_t			lock;		/* concurrent writes */
 
+	atomic_t			wakeup;		/* needs a wakeup    */
+
 	struct perf_counter_mmap_page   *user_page;
 	void 				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5f86a1156c94c..ba5e921e1f36d 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1696,7 +1696,6 @@ struct perf_output_handle {
 	struct perf_mmap_data	*data;
 	unsigned int		offset;
 	unsigned int		head;
-	int			wakeup;
 	int			nmi;
 	int			overflow;
 	int			locked;
@@ -1752,8 +1751,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	struct perf_mmap_data *data = handle->data;
 	int head, cpu;
 
-	if (handle->wakeup)
-		data->wakeup_head = data->head;
+	data->done_head = data->head;
 
 	if (!handle->locked)
 		goto out;
@@ -1764,13 +1762,11 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	 * before we publish the new head, matched by a rmb() in userspace when
 	 * reading this position.
 	 */
-	while ((head = atomic_xchg(&data->wakeup_head, 0))) {
+	while ((head = atomic_xchg(&data->done_head, 0)))
 		data->user_page->data_head = head;
-		handle->wakeup = 1;
-	}
 
 	/*
-	 * NMI can happen here, which means we can miss a wakeup_head update.
+	 * NMI can happen here, which means we can miss a done_head update.
 	 */
 
 	cpu = atomic_xchg(&data->lock, 0);
@@ -1779,7 +1775,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	/*
 	 * Therefore we have to validate we did not indeed do so.
 	 */
-	if (unlikely(atomic_read(&data->wakeup_head))) {
+	if (unlikely(atomic_read(&data->done_head))) {
 		/*
 		 * Since we had it locked, we can lock it again.
 		 */
@@ -1789,7 +1785,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 		goto again;
 	}
 
-	if (handle->wakeup)
+	if (atomic_xchg(&data->wakeup, 0))
 		perf_output_wakeup(handle);
 out:
 	local_irq_restore(handle->flags);
@@ -1824,7 +1820,9 @@ static int perf_output_begin(struct perf_output_handle *handle,
 
 	handle->offset	= offset;
 	handle->head	= head;
-	handle->wakeup	= (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT);
+
+	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+		atomic_set(&data->wakeup, 1);
 
 	return 0;
 
@@ -1882,7 +1880,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 		int events = atomic_inc_return(&data->events);
 		if (events >= wakeup_events) {
 			atomic_sub(wakeup_events, &data->events);
-			handle->wakeup = 1;
+			atomic_set(&data->wakeup, 1);
 		}
 	}
 
-- 
GitLab


From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:23 +0200
Subject: [PATCH 0828/2198] perf_counter: add ioctl(PERF_COUNTER_IOC_RESET)

Provide a way to reset an existing counter - this eases PAPI
libraries around perfcounters.

Similar to read() it doesn't collapse pending child counters.

[ Impact: new perfcounter fd ioctl method to reset counters ]

Suggested-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090505155437.022272933@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 1 +
 kernel/perf_counter.c        | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 17b63105f2aa7..0fcbf34a4f73b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -160,6 +160,7 @@ struct perf_counter_hw_event {
 #define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
 #define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
 #define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
+#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
 
 /*
  * Structure of the page that can be mapped via mmap
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ba5e921e1f36d..6e6834e0587e0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1288,6 +1288,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 	return events;
 }
 
+static void perf_counter_reset(struct perf_counter *counter)
+{
+	atomic_set(&counter->count, 0);
+}
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1303,6 +1308,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case PERF_COUNTER_IOC_REFRESH:
 		perf_counter_refresh(counter, arg);
 		break;
+	case PERF_COUNTER_IOC_RESET:
+		perf_counter_reset(counter);
+		break;
 	default:
 		err = -ENOTTY;
 	}
-- 
GitLab


From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:24 +0200
Subject: [PATCH 0829/2198] perf_counter: provide an mlock threshold

Provide a threshold to relax the mlock accounting, increasing usability.

Each counter gets perf_counter_mlock_kb for free.

[ Impact: allow more mmap buffering ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.112113632@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 15 +++++++++++----
 kernel/sysctl.c              |  8 ++++++++
 3 files changed, 21 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0fcbf34a4f73b..00081d84169f5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -358,6 +358,7 @@ struct file;
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
 	int				nr_pages;	/* nr of data pages  */
+	int				nr_locked;	/* nr pages mlocked  */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
 	atomic_t			head;		/* write position    */
@@ -575,6 +576,7 @@ struct perf_callchain_entry {
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_priv;
+extern int sysctl_perf_counter_mlock;
 
 extern void perf_counter_init(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 6e6834e0587e0..2d13427383051 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,6 +44,7 @@ static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -1461,7 +1462,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
-		vma->vm_mm->locked_vm -= counter->data->nr_pages + 1;
+		vma->vm_mm->locked_vm -= counter->data->nr_locked;
 		perf_mmap_data_free(counter);
 		mutex_unlock(&counter->mmap_mutex);
 	}
@@ -1480,6 +1481,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long nr_pages;
 	unsigned long locked, lock_limit;
 	int ret = 0;
+	long extra;
 
 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
@@ -1507,8 +1509,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	locked = vma->vm_mm->locked_vm;
-	locked += nr_pages + 1;
+	extra = nr_pages /* + 1 only account the data pages */;
+	extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+	if (extra < 0)
+		extra = 0;
+
+	locked = vma->vm_mm->locked_vm + extra;
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 	lock_limit >>= PAGE_SHIFT;
@@ -1524,7 +1530,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 
 	atomic_set(&counter->mmap_count, 1);
-	vma->vm_mm->locked_vm += nr_pages + 1;
+	vma->vm_mm->locked_vm += extra;
+	counter->data->nr_locked = extra;
 unlock:
 	mutex_unlock(&counter->mmap_mutex);
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8203d70928d54..3b05c2b088d2a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -920,6 +920,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_mlock_kb",
+		.data		= &sysctl_perf_counter_mlock,
+		.maxlen		= sizeof(sysctl_perf_counter_mlock),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 /*
  * NOTE: do not add new entries to this table unless you have read
-- 
GitLab


From 22c1558e51c210787c6cf75d8905246fc91ec030 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:25 +0200
Subject: [PATCH 0830/2198] perf_counter: fix the output lock

Use -1 instead of 0 as unlocked, since 0 is a valid cpu number.

( This is not an issue right now but will be once we allow multiple
  counters to output to the same mmap area. )

[ Impact: prepare code for multi-counter profile output ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.232686598@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2d13427383051..c881afef997b9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1409,6 +1409,7 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
 	}
 
 	data->nr_pages = nr_pages;
+	atomic_set(&data->lock, -1);
 
 	rcu_assign_pointer(counter->data, data);
 
@@ -1755,7 +1756,7 @@ static void perf_output_lock(struct perf_output_handle *handle)
 	if (in_nmi() && atomic_read(&data->lock) == cpu)
 		return;
 
-	while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 		cpu_relax();
 
 	handle->locked = 1;
@@ -1784,7 +1785,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	 * NMI can happen here, which means we can miss a done_head update.
 	 */
 
-	cpu = atomic_xchg(&data->lock, 0);
+	cpu = atomic_xchg(&data->lock, -1);
 	WARN_ON_ONCE(cpu != smp_processor_id());
 
 	/*
@@ -1794,7 +1795,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 		/*
 		 * Since we had it locked, we can lock it again.
 		 */
-		while (atomic_cmpxchg(&data->lock, 0, cpu) != 0)
+		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1)
 			cpu_relax();
 
 		goto again;
-- 
GitLab


From 2023b359214bbc5bad31571cf50d7fb83b535c0a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:26 +0200
Subject: [PATCH 0831/2198] perf_counter: inheritable sample counters

Redirect the output to the parent counter and put in some sanity checks.

[ Impact: new perfcounter feature - inherited sampling counters ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.331556171@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c881afef997b9..60e55f0b48f4b 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -738,10 +738,18 @@ static void perf_counter_enable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
-static void perf_counter_refresh(struct perf_counter *counter, int refresh)
+static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 {
+	/*
+	 * not supported on inherited counters
+	 */
+	if (counter->hw_event.inherit)
+		return -EINVAL;
+
 	atomic_add(refresh, &counter->event_limit);
 	perf_counter_enable(counter);
+
+	return 0;
 }
 
 /*
@@ -1307,7 +1315,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		perf_counter_disable_family(counter);
 		break;
 	case PERF_COUNTER_IOC_REFRESH:
-		perf_counter_refresh(counter, arg);
+		err = perf_counter_refresh(counter, arg);
 		break;
 	case PERF_COUNTER_IOC_RESET:
 		perf_counter_reset(counter);
@@ -1814,6 +1822,12 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	struct perf_mmap_data *data;
 	unsigned int offset, head;
 
+	/*
+	 * For inherited counters we send all the output towards the parent.
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
 	if (!data)
@@ -1995,6 +2009,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_ADDR)
 		perf_output_put(&handle, addr);
 
+	/*
+	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
+	 */
 	if (record_type & PERF_RECORD_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
@@ -2281,6 +2298,11 @@ int perf_counter_overflow(struct perf_counter *counter,
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
 
+	/*
+	 * XXX event_limit might not quite work as expected on inherited
+	 * counters
+	 */
+
 	counter->pending_kill = POLL_IN;
 	if (events && atomic_dec_and_test(&counter->event_limit)) {
 		ret = 1;
@@ -2801,6 +2823,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	pmu = NULL;
 
+	/*
+	 * we currently do not support PERF_RECORD_GROUP on inherited counters
+	 */
+	if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
+		goto done;
+
 	if (perf_event_raw(hw_event)) {
 		pmu = hw_perf_counter_init(counter);
 		goto done;
-- 
GitLab


From 16c8a10932aef971292c9570eb5f60b5d4e83ed2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 5 May 2009 17:50:27 +0200
Subject: [PATCH 0832/2198] perf_counter: tools: update the tools to support
 process and inherited counters

"perf record":
 - per task counter
 - inherit switch
 - nmi switch

"perf report":
 - userspace/kernel filter

"perf stat":
 - userspace/kernel filter

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090505155437.389163017@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 155 ++++++++++++--------
 Documentation/perf_counter/builtin-stat.c   |  24 ++-
 Documentation/perf_counter/perf-report.cc   |  27 +++-
 3 files changed, 140 insertions(+), 66 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index ddfdcf86fb20a..5f5e6df0260df 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -45,7 +45,10 @@ static unsigned int		mmap_pages			= 16;
 static int			output;
 static char 			*output_name			= "output.perf";
 static int			group				= 0;
-static unsigned int		realtime_prio			=  0;
+static unsigned int		realtime_prio			= 0;
+static int			system_wide			= 0;
+static int			inherit				= 1;
+static int			nmi				= 1;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -167,7 +170,7 @@ static void display_events_help(void)
 static void display_help(void)
 {
 	printf(
-	"Usage: perf-record [<options>]\n"
+	"Usage: perf-record [<options>] <cmd>\n"
 	"perf-record Options (up to %d event types can be specified at once):\n\n",
 		 MAX_COUNTERS);
 
@@ -178,12 +181,13 @@ static void display_help(void)
 	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
 	" -o file   --output=<file>      # output file\n"
 	" -r prio   --realtime=<prio>    # use RT prio\n"
+	" -s        --system             # system wide profiling\n"
 	);
 
 	exit(0);
 }
 
-static void process_options(int argc, char *argv[])
+static void process_options(int argc, const char *argv[])
 {
 	int error = 0, counter;
 
@@ -196,9 +200,12 @@ static void process_options(int argc, char *argv[])
 			{"mmap_pages",	required_argument,	NULL, 'm'},
 			{"output",	required_argument,	NULL, 'o'},
 			{"realtime",	required_argument,	NULL, 'r'},
+			{"system",	no_argument,		NULL, 's'},
+			{"inherit",	no_argument,		NULL, 'i'},
+			{"nmi",		no_argument,		NULL, 'n'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:r:",
+		int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -209,9 +216,16 @@ static void process_options(int argc, char *argv[])
 		case 'm': mmap_pages			=   atoi(optarg); break;
 		case 'o': output_name			= strdup(optarg); break;
 		case 'r': realtime_prio			=   atoi(optarg); break;
+		case 's': system_wide                   ^=             1; break;
+		case 'i': inherit			^=	       1; break;
+		case 'n': nmi				^=	       1; break;
 		default: error = 1; break;
 		}
 	}
+
+	if (argc - optind == 0)
+		error = 1;
+
 	if (error)
 		display_help();
 
@@ -325,18 +339,82 @@ static void mmap_read(struct mmap_data *md)
 
 static volatile int done = 0;
 
-static void sigchld_handler(int sig)
+static void sig_handler(int sig)
 {
-	if (sig == SIGCHLD)
-		done = 1;
+	done = 1;
 }
 
-int cmd_record(int argc, char **argv)
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
+static int nr_poll;
+static int nr_cpu;
+
+static void open_counters(int cpu)
 {
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
-	int i, counter, group_fd, nr_poll = 0;
+	int counter, group_fd;
+	int track = 1;
+	pid_t pid = -1;
+
+	if (cpu < 0)
+		pid = 0;
+
+	group_fd = -1;
+	for (counter = 0; counter < nr_counters; counter++) {
+
+		memset(&hw_event, 0, sizeof(hw_event));
+		hw_event.config		= event_id[counter];
+		hw_event.irq_period	= event_count[counter];
+		hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
+		hw_event.nmi		= nmi;
+		hw_event.mmap		= track;
+		hw_event.comm		= track;
+		hw_event.inherit	= (cpu < 0) && inherit;
+
+		track = 0; // only the first counter needs these
+
+		fd[nr_cpu][counter] =
+			sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0);
+
+		if (fd[nr_cpu][counter] < 0) {
+			int err = errno;
+			printf("kerneltop error: syscall returned with %d (%s)\n",
+					fd[nr_cpu][counter], strerror(err));
+			if (err == EPERM)
+				printf("Are you root?\n");
+			exit(-1);
+		}
+		assert(fd[nr_cpu][counter] >= 0);
+		fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+
+		/*
+		 * First counter acts as the group leader:
+		 */
+		if (group && group_fd == -1)
+			group_fd = fd[nr_cpu][counter];
+
+		event_array[nr_poll].fd = fd[nr_cpu][counter];
+		event_array[nr_poll].events = POLLIN;
+		nr_poll++;
+
+		mmap_array[nr_cpu][counter].counter = counter;
+		mmap_array[nr_cpu][counter].prev = 0;
+		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+			printf("kerneltop error: failed to mmap with %d (%s)\n",
+					errno, strerror(errno));
+			exit(-1);
+		}
+	}
+	nr_cpu++;
+}
+
+int cmd_record(int argc, const char **argv)
+{
+	int i, counter;
 	pid_t pid;
 	int ret;
 
@@ -357,54 +435,13 @@ int cmd_record(int argc, char **argv)
 	argc -= optind;
 	argv += optind;
 
-	for (i = 0; i < nr_cpus; i++) {
-		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= 1;
-			hw_event.mmap		= 1;
-			hw_event.comm		= 1;
-
-			fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
-		}
-	}
+	if (!system_wide)
+		open_counters(-1);
+	else for (i = 0; i < nr_cpus; i++)
+		open_counters(i);
 
-	signal(SIGCHLD, sigchld_handler);
+	signal(SIGCHLD, sig_handler);
+	signal(SIGINT, sig_handler);
 
 	pid = fork();
 	if (pid < 0)
@@ -434,7 +471,7 @@ int cmd_record(int argc, char **argv)
 	while (!done) {
 		int hits = events;
 
-		for (i = 0; i < nr_cpus; i++) {
+		for (i = 0; i < nr_cpu; i++) {
 			for (counter = 0; counter < nr_counters; counter++)
 				mmap_read(&mmap_array[i][counter]);
 		}
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 6de38d2568830..e2fa117eab58e 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -87,6 +87,9 @@
 
 #include "perf.h"
 
+#define EVENT_MASK_KERNEL		1
+#define EVENT_MASK_USER			2
+
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
@@ -104,6 +107,7 @@ static __u64			event_id[MAX_COUNTERS]		= {
 static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			event_mask[MAX_COUNTERS];
 
 static int			tid				= -1;
 static int			profile_cpu			= -1;
@@ -258,12 +262,23 @@ static __u64 match_event_symbols(char *str)
 	__u64 config, id;
 	int type;
 	unsigned int i;
+	char mask_str[4];
 
 	if (sscanf(str, "r%llx", &config) == 1)
 		return config | PERF_COUNTER_RAW_MASK;
 
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
+	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
+		case 3:
+			if (strchr(mask_str, 'u'))
+				event_mask[nr_counters] |= EVENT_MASK_USER;
+			if (strchr(mask_str, 'k'))
+				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
+		case 2:
+			return EID(type, id);
+
+		default:
+			break;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
@@ -313,6 +328,11 @@ static void create_perfstat_counter(int counter)
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= 0;
 	hw_event.nmi		= 0;
+	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
+	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
+
+printf("exclude: %d\n", event_mask[counter]);
+
 	if (scale)
 		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 					  PERF_FORMAT_TOTAL_TIME_RUNNING;
diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
index 911d7f3e7a65c..8855107fe6b3c 100644
--- a/Documentation/perf_counter/perf-report.cc
+++ b/Documentation/perf_counter/perf-report.cc
@@ -33,8 +33,13 @@
 #include <string>
 
 
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
 static char 		const *input_name = "output.perf";
 static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -359,15 +364,21 @@ static void process_options(int argc, char *argv[])
 		/** Options for getopt */
 		static struct option long_options[] = {
 			{"input",	required_argument,	NULL, 'i'},
+			{"no-user",	no_argument,		NULL, 'u'},
+			{"no-kernel",	no_argument,		NULL, 'k'},
+			{"no-hv",	no_argument,		NULL, 'h'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:i:",
+		int c = getopt_long(argc, argv, "+:i:kuh",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
 
 		switch (c) {
 		case 'i': input_name			= strdup(optarg); break;
+		case 'k': show_mask &= ~SHOW_KERNEL; break;
+		case 'u': show_mask &= ~SHOW_USER; break;
+		case 'h': show_mask &= ~SHOW_HV; break;
 		default: error = 1; break;
 		}
 	}
@@ -443,22 +454,28 @@ more:
 
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		std::string comm, sym, level;
+		int show = 0;
 		char output[1024];
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+			show |= SHOW_KERNEL;
 			level = " [k] ";
 			sym = resolve_kernel_symbol(event->ip.ip);
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			show |= SHOW_USER;
 			level = " [.] ";
 			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
 		} else {
+			show |= SHOW_HV;
 			level = " [H] ";
 		}
-		comm = resolve_comm(event->ip.pid);
 
-		snprintf(output, sizeof(output), "%16s %s %s",
-				comm.c_str(), level.c_str(), sym.c_str());
-		hist[output]++;
+		if (show & show_mask) {
+			comm = resolve_comm(event->ip.pid);
+			snprintf(output, sizeof(output), "%16s %s %s",
+					comm.c_str(), level.c_str(), sym.c_str());
+			hist[output]++;
+		}
 
 		total++;
 
-- 
GitLab


From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 30 Apr 2009 20:49:44 -0400
Subject: [PATCH 0833/2198] ring-buffer: convert cpu buffer entries to local_t

The entries counter in cpu buffer is not atomic. It can be updated by
other interrupts or from another CPU (readers).

But making entries into "atomic_t" causes an atomic operation that can
hurt performance. Instead we convert it to a local_t that will increment
a counter with a local CPU atomic operation (if the arch supports it).

Instead of fighting with readers and overwrites that decrement the counter,
I added a "read" counter. Every time a reader reads an entry it is
incremented.

We already have a overrun counter and with that, the entries counter and
the read counter, we can calculate the total number of entries in the
buffer with:

  (entries - overrun) - read

As long as the total number of entries in the ring buffer is less than
the word size, this will work. But since the entries counter was previously
a long, this is no different than what we had before.

Thanks to Andrew Morton for pointing out in the first version that
atomic_t does not replace unsigned long. I switched to atomic_long_t
even though it is signed. A negative count is most likely a bug.

[ Impact: keep accurate count of cpu buffer entries ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 26e1359fe193e..c792ea893b016 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,8 @@ struct ring_buffer_per_cpu {
 	unsigned long			nmi_dropped;
 	unsigned long			commit_overrun;
 	unsigned long			overrun;
-	unsigned long			entries;
+	unsigned long			read;
+	local_t				entries;
 	u64				write_stamp;
 	u64				read_stamp;
 	atomic_t			record_disabled;
@@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
 		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
 		cpu_buffer->overrun++;
-		cpu_buffer->entries--;
 	}
 }
 
@@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		      struct ring_buffer_event *event)
 {
-	cpu_buffer->entries++;
+	local_inc(&cpu_buffer->entries);
 
 	/* Only process further if we own the commit */
 	if (!rb_is_commit(cpu_buffer, event))
@@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 * The commit is still visible by the reader, so we
 	 * must increment entries.
 	 */
-	cpu_buffer->entries++;
+	local_inc(&cpu_buffer->entries);
  out:
 	/*
 	 * If a write came in and pushed the tail page
@@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
-	ret = cpu_buffer->entries;
+	ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+		- cpu_buffer->read;
 
 	return ret;
 }
@@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
-		entries += cpu_buffer->entries;
+		entries += (local_read(&cpu_buffer->entries) -
+			    cpu_buffer->overrun) - cpu_buffer->read;
 	}
 
 	return entries;
@@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 			|| rb_discarded_event(event))
-		cpu_buffer->entries--;
+		cpu_buffer->read++;
 
 	rb_update_read_stamp(cpu_buffer, event);
 
@@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->nmi_dropped = 0;
 	cpu_buffer->commit_overrun = 0;
 	cpu_buffer->overrun = 0;
-	cpu_buffer->entries = 0;
+	cpu_buffer->read = 0;
+	local_set(&cpu_buffer->entries, 0);
 
 	cpu_buffer->write_stamp = 0;
 	cpu_buffer->read_stamp = 0;
@@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Only count data entries */
 		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
 			continue;
-		cpu_buffer->entries--;
+		cpu_buffer->read++;
 	}
 	__raw_spin_unlock(&cpu_buffer->lock);
 }
-- 
GitLab


From 41c51c98f588edcdf6141cff1895df738e03ddd4 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Sun, 3 May 2009 23:11:18 +0200
Subject: [PATCH 0834/2198] rcu: rcu_sched_grace_period(): kill the bogus
 flush_signals()

As a kernel thread, rcu_sched_grace_period() runs with all signals ignored.
It can never receive a signal even if it sleeps in TASK_INTERRUPTIBLE, it
needs the explicit allow_signal() to be visible for signals.

[ Impact: reduce kernel size, remove dead code ]

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090503211118.GA22973@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupreempt.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index ce97a4df64d35..beb0e659adcc6 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
 
 		rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
 		spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-		ret = 0;
+		ret = 0; /* unused */
 		__wait_event_interruptible(rcu_ctrlblk.sched_wq,
 			rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
 			ret);
 
-		/*
-		 * Signals would prevent us from sleeping, and we cannot
-		 * do much with them in any case.  So flush them.
-		 */
-		if (ret)
-			flush_signals(current);
 		couldsleepnext = 0;
 
 	} while (!kthread_should_stop());
-- 
GitLab


From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 18:44:45 -0400
Subject: [PATCH 0835/2198] ring-buffer: record page entries in buffer page
 descriptor

Currently, when the ring buffer writer overflows the buffer and must
write over non consumed data, we increment the overrun counter by
reading the entries on the page we are about to overwrite. This reads
the entries one by one.

This is not very effecient. This patch adds another entry counter
into each buffer page descriptor that keeps track of the number of
entries on the page. Now on overwrite, the overrun counter simply
needs to add the number of entries that is on the page it is about
to overwrite.

[ Impact: speed up of ring buffer in overwrite mode ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 39 +++++++++++++-------------------------
 1 file changed, 13 insertions(+), 26 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c792ea893b016..342eacc4baa85 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -321,9 +321,10 @@ struct buffer_data_page {
 };
 
 struct buffer_page {
+	struct list_head list;		/* list of buffer pages */
 	local_t		 write;		/* index for next write */
 	unsigned	 read;		/* index for next read */
-	struct list_head list;		/* list of free pages */
+	local_t		 entries;	/* entries on this page */
 	struct buffer_data_page *page;	/* Actual data page */
 };
 
@@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
 	return rb_page_commit(cpu_buffer->head_page);
 }
 
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
-	struct ring_buffer_event *event;
-	unsigned long head;
-
-	for (head = 0; head < rb_head_size(cpu_buffer);
-	     head += rb_event_length(event)) {
-
-		event = __rb_page_index(cpu_buffer->head_page, head);
-		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-			return;
-		/* Only count data entries */
-		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-			continue;
-		cpu_buffer->overrun++;
-	}
-}
-
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
 			       struct buffer_page **bpage)
 {
@@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
 				/* count overflows */
-				rb_update_overflow(cpu_buffer);
+				cpu_buffer->overrun +=
+					local_read(&head_page->entries);
 
 				rb_inc_page(cpu_buffer, &head_page);
 				cpu_buffer->head_page = head_page;
@@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (tail_page == cpu_buffer->tail_page) {
 			local_set(&next_page->write, 0);
+			local_set(&next_page->entries, 0);
 			local_set(&next_page->page->commit, 0);
 			cpu_buffer->tail_page = next_page;
 
@@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	event = __rb_page_index(tail_page, tail);
 	rb_update_event(event, type, length);
 
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
+
 	/*
 	 * If this is a commit and the tail is zero, then update
 	 * this page's time stamp.
@@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->list.prev = reader->list.prev;
 
 	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 
 	/* Make the reader page now replace the head */
@@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->head_page
 		= list_entry(cpu_buffer->pages.next, struct buffer_page, list);
 	local_set(&cpu_buffer->head_page->write, 0);
+	local_set(&cpu_buffer->head_page->entries, 0);
 	local_set(&cpu_buffer->head_page->page->commit, 0);
 
 	cpu_buffer->head_page->read = 0;
@@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
 	INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
 	local_set(&cpu_buffer->reader_page->write, 0);
+	local_set(&cpu_buffer->reader_page->entries, 0);
 	local_set(&cpu_buffer->reader_page->page->commit, 0);
 	cpu_buffer->reader_page->read = 0;
 
@@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		bpage = reader->page;
 		reader->page = *data_page;
 		local_set(&reader->write, 0);
+		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
 
-- 
GitLab


From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 19:40:05 -0400
Subject: [PATCH 0836/2198] ring-buffer: have read page swap increment counter
 with page entries

In the swap page ring buffer code that is used by the ftrace splice code,
we scan the page to increment the counter of entries read.

With the number of entries already in the page we simply need to add it.

[ Impact: speed up reading page from ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 28 +++-------------------------
 1 file changed, 3 insertions(+), 25 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 342eacc4baa85..9e42a742a3f90 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2785,28 +2785,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-			      struct buffer_data_page *bpage,
-			      unsigned int offset)
-{
-	struct ring_buffer_event *event;
-	unsigned long head;
-
-	__raw_spin_lock(&cpu_buffer->lock);
-	for (head = offset; head < local_read(&bpage->commit);
-	     head += rb_event_length(event)) {
-
-		event = __rb_data_page_index(bpage, head);
-		if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-			return;
-		/* Only count data entries */
-		if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
-			continue;
-		cpu_buffer->read++;
-	}
-	__raw_spin_unlock(&cpu_buffer->lock);
-}
-
 /**
  * ring_buffer_alloc_read_page - allocate a page to read from buffer
  * @buffer: the buffer to allocate for.
@@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		/* we copied everything to the beginning */
 		read = 0;
 	} else {
+		/* update the entry counter */
+		cpu_buffer->read += local_read(&reader->entries);
+
 		/* swap the pages */
 		rb_init_page(bpage);
 		bpage = reader->page;
@@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&reader->entries, 0);
 		reader->read = 0;
 		*data_page = bpage;
-
-		/* update the entry counter */
-		rb_remove_entries(cpu_buffer, bpage, read);
 	}
 	ret = read;
 
-- 
GitLab


From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 1 May 2009 20:26:54 -0400
Subject: [PATCH 0837/2198] ring-buffer: disable writers when resetting buffers

As a precaution, it is best to disable writing to the ring buffers
when reseting them.

[ Impact: prevent weird things if write happens during reset ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9e42a742a3f90..7876df00695fc 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return;
 
+	atomic_inc(&cpu_buffer->record_disabled);
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	__raw_spin_lock(&cpu_buffer->lock);
@@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+	atomic_dec(&cpu_buffer->record_disabled);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
-- 
GitLab


From 31b6e76e21b2ffd3cb2f6fe4149790a9fdadce2d Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@MIT.EDU>
Date: Thu, 30 Apr 2009 20:06:11 -0400
Subject: [PATCH 0838/2198] ftrace: use .sched.text, not .text.sched in
 recordmcount.pl

The only references in the kernel to the .text.sched section are in
recordmcount.pl.  Since the code it has is intended to be example code
it should refer to real kernel sections.  So change it to .sched.text
instead.

[ Impact: consistency in comments ]

Signed-off-by: Tim Abbott <tabbott@mit.edu>
LKML-Reference: <1241136371-10768-1-git-send-email-tabbott@mit.edu>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 scripts/recordmcount.pl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 409596eca124f..0fae7da0529ca 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -26,7 +26,7 @@
 # which will also be the location of that section after final link.
 # e.g.
 #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
 #  .globl my_func
 #  my_func:
 #        [...]
@@ -39,7 +39,7 @@
 #        [...]
 #
 # Both relocation offsets for the mcounts in the above example will be
-# offset from .text.sched. If we make another file called tmp.s with:
+# offset from .sched.text. If we make another file called tmp.s with:
 #
 #  .section __mcount_loc
 #  .quad  my_func + 0x5
@@ -51,7 +51,7 @@
 # But this gets hard if my_func is not globl (a static function).
 # In such a case we have:
 #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
 #  my_func:
 #        [...]
 #        call mcount  (offset: 0x5)
-- 
GitLab


From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 19:22:53 -0400
Subject: [PATCH 0839/2198] tracing: use proper export symbol for tracing api

When adding the EXPORT_SYMBOL to some of the tracing API, I accidently
used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes
that mistake.

[ Impact: export the tracing code only for GPL modules ]

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 74df029056b04..4164a344e72aa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len,
 	return trace_buffer_lock_reserve(&global_trace,
 					 type, len, flags, pc);
 }
-EXPORT_SYMBOL(trace_current_buffer_lock_reserve);
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
-EXPORT_SYMBOL(trace_current_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
 	__trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
-EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit);
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
 
 void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
 {
-- 
GitLab


From 53fc0e2259f261602a2750dcc82b8d7bf04d3c35 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 5 May 2009 13:12:48 -0400
Subject: [PATCH 0840/2198] integrity: lsm audit rule matching fix

An audit subsystem change replaced AUDIT_EQUAL with Audit_equal.
Update calls to security_filter_rule_init()/match() to reflect
the change.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_policy.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index b5291ad5ef563..b168c1d595cea 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -96,7 +96,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
 	if ((rule->flags & IMA_UID) && rule->uid != tsk->cred->uid)
 		return false;
 	for (i = 0; i < MAX_LSM_RULES; i++) {
-		int rc;
+		int rc = 0;
 		u32 osid, sid;
 
 		if (!rule->lsm[i].rule)
@@ -109,7 +109,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
 			security_inode_getsecid(inode, &osid);
 			rc = security_filter_rule_match(osid,
 							rule->lsm[i].type,
-							AUDIT_EQUAL,
+							Audit_equal,
 							rule->lsm[i].rule,
 							NULL);
 			break;
@@ -119,7 +119,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
 			security_task_getsecid(tsk, &sid);
 			rc = security_filter_rule_match(sid,
 							rule->lsm[i].type,
-							AUDIT_EQUAL,
+							Audit_equal,
 							rule->lsm[i].rule,
 							NULL);
 		default:
@@ -227,7 +227,7 @@ static int ima_lsm_rule_init(struct ima_measure_rule_entry *entry,
 
 	entry->lsm[lsm_rule].type = audit_type;
 	result = security_filter_rule_init(entry->lsm[lsm_rule].type,
-					   AUDIT_EQUAL, args,
+					   Audit_equal, args,
 					   &entry->lsm[lsm_rule].rule);
 	return result;
 }
-- 
GitLab


From e5e520a715dcea6b72f6b9417b203a4b1e813a8b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 5 May 2009 13:13:00 -0400
Subject: [PATCH 0841/2198] integrity: use audit_log_string

Based on a request from Eric Paris to simplify parsing, replace
audit_log_format statements containing "%s" with audit_log_string().

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_audit.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c
index 1e082bb987bee..c1461150691cb 100644
--- a/security/integrity/ima/ima_audit.c
+++ b/security/integrity/ima/ima_audit.c
@@ -54,19 +54,10 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
 			 audit_get_loginuid(current),
 			 audit_get_sessionid(current));
 	audit_log_task_context(ab);
-	switch (audit_msgno) {
-	case AUDIT_INTEGRITY_DATA:
-	case AUDIT_INTEGRITY_METADATA:
-	case AUDIT_INTEGRITY_PCR:
-	case AUDIT_INTEGRITY_STATUS:
-		audit_log_format(ab, " op=%s cause=%s", op, cause);
-		break;
-	case AUDIT_INTEGRITY_HASH:
-		audit_log_format(ab, " op=%s hash=%s", op, cause);
-		break;
-	default:
-		audit_log_format(ab, " op=%s", op);
-	}
+	audit_log_format(ab, " op=");
+	audit_log_string(ab, op);
+	audit_log_format(ab, " cause=");
+	audit_log_string(ab, cause);
 	audit_log_format(ab, " comm=");
 	audit_log_untrustedstring(ab, current->comm);
 	if (fname) {
-- 
GitLab


From 07ff7a0b187f3951788f64ae1f30e8109bc8e9eb Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 5 May 2009 13:13:10 -0400
Subject: [PATCH 0842/2198] integrity: remove __setup auditing msgs

Remove integrity audit messages from __setup()

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_audit.c | 13 ++-----------
 security/integrity/ima/ima_main.c  | 16 ++--------------
 2 files changed, 4 insertions(+), 25 deletions(-)

diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c
index c1461150691cb..b628eea477a68 100644
--- a/security/integrity/ima/ima_audit.c
+++ b/security/integrity/ima/ima_audit.c
@@ -22,18 +22,9 @@ static int ima_audit;
 static int __init ima_audit_setup(char *str)
 {
 	unsigned long audit;
-	int rc, result = 0;
-	char *op = "ima_audit";
-	char *cause;
 
-	rc = strict_strtoul(str, 0, &audit);
-	if (rc || audit > 1)
-		result = 1;
-	else
-		ima_audit = audit;
-	cause = ima_audit ? "enabled" : "not_enabled";
-	integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL,
-			    op, cause, result, 0);
+	if (!strict_strtoul(str, 0, &audit))
+		ima_audit = audit ? 1 : 0;
 	return 1;
 }
 __setup("ima_audit=", ima_audit_setup);
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index f4e7266f5aeec..122f17fc7fc1f 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -29,20 +29,8 @@ int ima_initialized;
 char *ima_hash = "sha1";
 static int __init hash_setup(char *str)
 {
-	const char *op = "hash_setup";
-	const char *hash = "sha1";
-	int result = 0;
-	int audit_info = 0;
-
-	if (strncmp(str, "md5", 3) == 0) {
-		hash = "md5";
-		ima_hash = str;
-	} else if (strncmp(str, "sha1", 4) != 0) {
-		hash = "invalid_hash_type";
-		result = 1;
-	}
-	integrity_audit_msg(AUDIT_INTEGRITY_HASH, NULL, NULL, op, hash,
-			    result, audit_info);
+	if (strncmp(str, "md5", 3) == 0)
+		ima_hash = "md5";
 	return 1;
 }
 __setup("ima_hash=", hash_setup);
-- 
GitLab


From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 21:16:11 -0400
Subject: [PATCH 0843/2198] ring-buffer: move big if statement down

In the hot path of the ring buffer "__rb_reserve_next" there's a big
if statement that does not even return back to the work flow.

	code;

	if (cross to next page) {

		[ lots of code ]

		return;
	}

	more code;

The condition is even the unlikely path, although we do not denote it
with an unlikely because gcc is fine with it. The condition is true when
the write crosses a page boundary, and we need to start at a new page.

Having this if statement makes it hard to read, but calling another
function to do the work is also not appropriate, because we are using a lot
of variables that were set before the if statement, and we do not want to
send them as parameters.

This patch changes it to a goto:

	code;

	if (cross to next page)
		goto next_page;

	more code;

	return;

next_page:

	[ lots of code]

This makes the code easier to understand, and a bit more obvious.

The output from gcc is practically identical. For some reason, gcc decided
to use different registers when I switched it to a goto. But other than that,
the logic is the same.

[ Impact: easier to read code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 218 +++++++++++++++++++------------------
 1 file changed, 111 insertions(+), 107 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7876df00695fc..424129eb20a45 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		  unsigned type, unsigned long length, u64 *ts)
 {
 	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
+	struct buffer_page *next_page;
 	unsigned long tail, write;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
@@ -1173,137 +1174,140 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	tail = write - length;
 
 	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE) {
-		struct buffer_page *next_page = tail_page;
+	if (write > BUF_PAGE_SIZE)
+		goto next_page;
 
-		local_irq_save(flags);
-		/*
-		 * Since the write to the buffer is still not
-		 * fully lockless, we must be careful with NMIs.
-		 * The locks in the writers are taken when a write
-		 * crosses to a new page. The locks protect against
-		 * races with the readers (this will soon be fixed
-		 * with a lockless solution).
-		 *
-		 * Because we can not protect against NMIs, and we
-		 * want to keep traces reentrant, we need to manage
-		 * what happens when we are in an NMI.
-		 *
-		 * NMIs can happen after we take the lock.
-		 * If we are in an NMI, only take the lock
-		 * if it is not already taken. Otherwise
-		 * simply fail.
-		 */
-		if (unlikely(in_nmi())) {
-			if (!__raw_spin_trylock(&cpu_buffer->lock)) {
-				cpu_buffer->nmi_dropped++;
-				goto out_reset;
-			}
-		} else
-			__raw_spin_lock(&cpu_buffer->lock);
-
-		lock_taken = true;
+	/* We reserved something on the buffer */
 
-		rb_inc_page(cpu_buffer, &next_page);
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
 
-		head_page = cpu_buffer->head_page;
-		reader_page = cpu_buffer->reader_page;
+	event = __rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
 
-		/* we grabbed the lock before incrementing */
-		if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-			goto out_reset;
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
 
-		/*
-		 * If for some reason, we had an interrupt storm that made
-		 * it all the way around the buffer, bail, and warn
-		 * about it.
-		 */
-		if (unlikely(next_page == commit_page)) {
-			cpu_buffer->commit_overrun++;
-			goto out_reset;
-		}
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->page->time_stamp = *ts;
 
-		if (next_page == head_page) {
-			if (!(buffer->flags & RB_FL_OVERWRITE))
-				goto out_reset;
+	return event;
 
-			/* tail_page has not moved yet? */
-			if (tail_page == cpu_buffer->tail_page) {
-				/* count overflows */
-				cpu_buffer->overrun +=
-					local_read(&head_page->entries);
+ next_page:
 
-				rb_inc_page(cpu_buffer, &head_page);
-				cpu_buffer->head_page = head_page;
-				cpu_buffer->head_page->read = 0;
-			}
-		}
+	next_page = tail_page;
 
-		/*
-		 * If the tail page is still the same as what we think
-		 * it is, then it is up to us to update the tail
-		 * pointer.
-		 */
-		if (tail_page == cpu_buffer->tail_page) {
-			local_set(&next_page->write, 0);
-			local_set(&next_page->entries, 0);
-			local_set(&next_page->page->commit, 0);
-			cpu_buffer->tail_page = next_page;
-
-			/* reread the time stamp */
-			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
-			cpu_buffer->tail_page->page->time_stamp = *ts;
+	local_irq_save(flags);
+	/*
+	 * Since the write to the buffer is still not
+	 * fully lockless, we must be careful with NMIs.
+	 * The locks in the writers are taken when a write
+	 * crosses to a new page. The locks protect against
+	 * races with the readers (this will soon be fixed
+	 * with a lockless solution).
+	 *
+	 * Because we can not protect against NMIs, and we
+	 * want to keep traces reentrant, we need to manage
+	 * what happens when we are in an NMI.
+	 *
+	 * NMIs can happen after we take the lock.
+	 * If we are in an NMI, only take the lock
+	 * if it is not already taken. Otherwise
+	 * simply fail.
+	 */
+	if (unlikely(in_nmi())) {
+		if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+			cpu_buffer->nmi_dropped++;
+			goto out_reset;
 		}
+	} else
+		__raw_spin_lock(&cpu_buffer->lock);
 
-		/*
-		 * The actual tail page has moved forward.
-		 */
-		if (tail < BUF_PAGE_SIZE) {
-			/* Mark the rest of the page with padding */
-			event = __rb_page_index(tail_page, tail);
-			rb_event_set_padding(event);
-		}
+	lock_taken = true;
 
-		if (tail <= BUF_PAGE_SIZE)
-			/* Set the write back to the previous setting */
-			local_set(&tail_page->write, tail);
+	rb_inc_page(cpu_buffer, &next_page);
 
-		/*
-		 * If this was a commit entry that failed,
-		 * increment that too
-		 */
-		if (tail_page == cpu_buffer->commit_page &&
-		    tail == rb_commit_index(cpu_buffer)) {
-			rb_set_commit_to_write(cpu_buffer);
-		}
+	head_page = cpu_buffer->head_page;
+	reader_page = cpu_buffer->reader_page;
 
-		__raw_spin_unlock(&cpu_buffer->lock);
-		local_irq_restore(flags);
+	/* we grabbed the lock before incrementing */
+	if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+		goto out_reset;
 
-		/* fail and let the caller try again */
-		return ERR_PTR(-EAGAIN);
+	/*
+	 * If for some reason, we had an interrupt storm that made
+	 * it all the way around the buffer, bail, and warn
+	 * about it.
+	 */
+	if (unlikely(next_page == commit_page)) {
+		cpu_buffer->commit_overrun++;
+		goto out_reset;
 	}
 
-	/* We reserved something on the buffer */
+	if (next_page == head_page) {
+		if (!(buffer->flags & RB_FL_OVERWRITE))
+			goto out_reset;
 
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
+		/* tail_page has not moved yet? */
+		if (tail_page == cpu_buffer->tail_page) {
+			/* count overflows */
+			cpu_buffer->overrun +=
+				local_read(&head_page->entries);
 
-	event = __rb_page_index(tail_page, tail);
-	rb_update_event(event, type, length);
+			rb_inc_page(cpu_buffer, &head_page);
+			cpu_buffer->head_page = head_page;
+			cpu_buffer->head_page->read = 0;
+		}
+	}
 
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
+	/*
+	 * If the tail page is still the same as what we think
+	 * it is, then it is up to us to update the tail
+	 * pointer.
+	 */
+	if (tail_page == cpu_buffer->tail_page) {
+		local_set(&next_page->write, 0);
+		local_set(&next_page->entries, 0);
+		local_set(&next_page->page->commit, 0);
+		cpu_buffer->tail_page = next_page;
+
+		/* reread the time stamp */
+		*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
+		cpu_buffer->tail_page->page->time_stamp = *ts;
+	}
 
 	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
+	 * The actual tail page has moved forward.
 	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
+	if (tail < BUF_PAGE_SIZE) {
+		/* Mark the rest of the page with padding */
+		event = __rb_page_index(tail_page, tail);
+		rb_event_set_padding(event);
+	}
 
-	return event;
+	if (tail <= BUF_PAGE_SIZE)
+		/* Set the write back to the previous setting */
+		local_set(&tail_page->write, tail);
+
+	/*
+	 * If this was a commit entry that failed,
+	 * increment that too
+	 */
+	if (tail_page == cpu_buffer->commit_page &&
+	    tail == rb_commit_index(cpu_buffer)) {
+		rb_set_commit_to_write(cpu_buffer);
+	}
+
+	__raw_spin_unlock(&cpu_buffer->lock);
+	local_irq_restore(flags);
+
+	/* fail and let the caller try again */
+	return ERR_PTR(-EAGAIN);
 
  out_reset:
 	/* reset write */
-- 
GitLab


From c898faf91b3ec6b0f6efa35831b3984fa3331db0 Mon Sep 17 00:00:00 2001
From: Rik van Riel <riel@redhat.com>
Date: Tue, 5 May 2009 17:28:56 -0400
Subject: [PATCH 0844/2198] x86: 46 bit physical address support on 64 bits

Extend the maximum addressable memory on x86-64 from 2^44 to
2^46 bytes. This requires some shuffling around of the vmalloc
and virtual memmap memory areas, to keep them away from the
direct mapping of up to 64TB of physical memory.

This patch also introduces a guard hole between the vmalloc
area and the virtual memory map space.  There's really no
good reason why we wouldn't have a guard hole there.

[ Impact: future hardware enablement ]

Signed-off-by: Rik van Riel <riel@redhat.com>
LKML-Reference: <20090505172856.6820db22@cuia.bos.redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/x86/x86_64/mm.txt         | 9 +++++----
 arch/x86/include/asm/page_64_types.h    | 2 +-
 arch/x86/include/asm/pgtable_64_types.h | 8 ++++----
 arch/x86/include/asm/sparsemem.h        | 2 +-
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 29b52b14d0b43..539413235845d 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
 ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
-ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory
-ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
-ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
+ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory
+ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
+ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
+ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 3f587188ae68a..6fadb020bd2ba 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -47,7 +47,7 @@
 #define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
 
-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #define __PHYSICAL_MASK_SHIFT	46
 #define __VIRTUAL_MASK_SHIFT	48
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index fbf42b8e03833..766ea16fbbbda 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE	(_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE - 1))
 
-
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #define MAXMEM		 _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START    _AC(0xffffc20000000000, UL)
-#define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START	 _AC(0xffffe20000000000, UL)
+#define VMALLOC_START    _AC(0xffffc90000000000, UL)
+#define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START	 _AC(0xffffea0000000000, UL)
 #define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
index e3cc3c063ec5e..4517d6b93188a 100644
--- a/arch/x86/include/asm/sparsemem.h
+++ b/arch/x86/include/asm/sparsemem.h
@@ -27,7 +27,7 @@
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS	27 /* matt - 128 is convenient right now */
 # define MAX_PHYSADDR_BITS	44
-# define MAX_PHYSMEM_BITS	44 /* Can be max 45 bits */
+# define MAX_PHYSMEM_BITS	46
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
-- 
GitLab


From 2feceeff1e771850e49f9074307f071964fd9e3e Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 19:07:07 -0700
Subject: [PATCH 0845/2198] x86: fix typo in address space documentation

Fix a trivial typo in Documentation/x86/x86_64/mm.txt.

[ Impact: documentation only ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Rik van Riel <riel@redhat.com>
---
 Documentation/x86/x86_64/mm.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt
index 539413235845d..d6498e3cd7133 100644
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -6,7 +6,7 @@ Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
 ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
-ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory
+ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
 ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
 ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
-- 
GitLab


From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 5 May 2009 22:47:18 -0400
Subject: [PATCH 0846/2198] ring-buffer: add benchmark and tester

This patch adds code that can benchmark the ring buffer as well as
test it. This code can be compiled into the kernel (not recommended)
or as a module.

A separate ring buffer is used to not interfer with other users, like
ftrace. It creates a producer and a consumer (option to disable creation
of the consumer) and will run for 10 seconds, then sleep for 10 seconds
and then repeat.

While running, the producer will write 10 byte loads into the ring
buffer with just putting in the current CPU number. The reader will
continually try to read the buffer. The reader will alternate from reading
the buffer via event by event, or by full pages.

The output is a pr_info, thus it will fill up the syslogs.

  Starting ring buffer hammer
  End ring buffer hammer
  Time:     9000349 (usecs)
  Overruns: 12578640
  Read:     5358440  (by events)
  Entries:  0
  Total:    17937080
  Missed:   0
  Hit:      17937080
  Entries per millisec: 1993
  501 ns per entry
  Sleeping for 10 secs
  Starting ring buffer hammer
  End ring buffer hammer
  Time:     9936350 (usecs)
  Overruns: 0
  Read:     28146644  (by pages)
  Entries:  74
  Total:    28146718
  Missed:   0
  Hit:      28146718
  Entries per millisec: 2832
  353 ns per entry
  Sleeping for 10 secs

Time:      is the time the test ran
Overruns:  the number of events that were overwritten and not read
Read:      the number of events read (either by pages or events)
Entries:   the number of entries left in the buffer
                 (the by pages will only read full pages)
Total:     Entries + Read + Overruns
Missed:    the number of entries that failed to write
Hit:       the number of entries that were written

The above example shows that it takes ~353 nanosecs per entry when
there is a reader, reading by pages (and no overruns)

The event by event reader slowed the producer down to 501 nanosecs.

[ Impact: see how changes to the ring buffer affect stability and performance ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig                 |  16 ++
 kernel/trace/Makefile                |   1 +
 kernel/trace/ring_buffer_benchmark.c | 379 +++++++++++++++++++++++++++
 3 files changed, 396 insertions(+)
 create mode 100644 kernel/trace/ring_buffer_benchmark.c

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 450d3c2cfbd28..50f62a296e1d0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -471,6 +471,22 @@ config MMIOTRACE_TEST
 
 	  Say N, unless you absolutely know what you are doing.
 
+config RING_BUFFER_BENCHMARK
+	tristate "Ring buffer benchmark stress tester"
+	depends on RING_BUFFER
+	help
+	  This option creates a test to stress the ring buffer and bench mark it.
+	  It creates its own ring buffer such that it will not interfer with
+	  any other users of the ring buffer (such as ftrace). It then creates
+	  a producer and consumer that will run for 10 seconds and sleep for
+	  10 seconds. Each interval it will print out the number of events
+	  it recorded and give a rough estimate of how long each iteration took.
+
+	  It does not disable interrupts or raise its priority, so it may be
+	  affected by processes that are running.
+
+	  If unsure, say N
+
 endif # FTRACE
 
 endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index fb9d7f964898b..7c34cbfff96e6 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@ endif
 
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644
index 0000000000000..747244acb8fdb
--- /dev/null
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -0,0 +1,379 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+struct rb_page {
+	u64		ts;
+	local_t		commit;
+	char		data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME	10
+#define SLEEP_TIME	10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+static int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+#define KILL_TEST()				\
+	do {					\
+		if (!kill_test) {		\
+			kill_test = 1;		\
+			WARN_ON(1);		\
+		}				\
+	} while (0)
+
+enum event_status {
+	EVENT_FOUND,
+	EVENT_DROPPED,
+};
+
+static enum event_status read_event(int cpu)
+{
+	struct ring_buffer_event *event;
+	int *entry;
+	u64 ts;
+
+	event = ring_buffer_consume(buffer, cpu, &ts);
+	if (!event)
+		return EVENT_DROPPED;
+
+	entry = ring_buffer_event_data(event);
+	if (*entry != cpu) {
+		KILL_TEST();
+		return EVENT_DROPPED;
+	}
+
+	read++;
+	return EVENT_FOUND;
+}
+
+static enum event_status read_page(int cpu)
+{
+	struct ring_buffer_event *event;
+	struct rb_page *rpage;
+	unsigned long commit;
+	void *bpage;
+	int *entry;
+	int ret;
+	int inc;
+	int i;
+
+	bpage = ring_buffer_alloc_read_page(buffer);
+	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+	if (ret >= 0) {
+		rpage = bpage;
+		commit = local_read(&rpage->commit);
+		for (i = 0; i < commit && !kill_test; i += inc) {
+
+			if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+				KILL_TEST();
+				break;
+			}
+
+			inc = -1;
+			event = (void *)&rpage->data[i];
+			switch (event->type_len) {
+			case RINGBUF_TYPE_PADDING:
+				/* We don't expect any padding */
+				KILL_TEST();
+				break;
+			case RINGBUF_TYPE_TIME_EXTEND:
+				inc = 8;
+				break;
+			case 0:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				if (!event->array[0]) {
+					KILL_TEST();
+					break;
+				}
+				inc = event->array[0];
+				break;
+			default:
+				entry = ring_buffer_event_data(event);
+				if (*entry != cpu) {
+					KILL_TEST();
+					break;
+				}
+				read++;
+				inc = ((event->type_len + 1) * 4);
+			}
+			if (kill_test)
+				break;
+
+			if (inc <= 0) {
+				KILL_TEST();
+				break;
+			}
+		}
+	}
+	ring_buffer_free_read_page(buffer, bpage);
+
+	if (ret < 0)
+		return EVENT_DROPPED;
+	return EVENT_FOUND;
+}
+
+static void ring_buffer_consumer(void)
+{
+	/* toggle between reading pages and events */
+	read_events ^= 1;
+
+	read = 0;
+	while (!reader_finish && !kill_test) {
+		int found;
+
+		do {
+			int cpu;
+
+			found = 0;
+			for_each_online_cpu(cpu) {
+				enum event_status stat;
+
+				if (read_events)
+					stat = read_event(cpu);
+				else
+					stat = read_page(cpu);
+
+				if (kill_test)
+					break;
+				if (stat == EVENT_FOUND)
+					found = 1;
+			}
+		} while (found && !kill_test);
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (reader_finish)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	reader_finish = 0;
+	complete(&read_done);
+}
+
+static void ring_buffer_producer(void)
+{
+	struct timeval start_tv;
+	struct timeval end_tv;
+	unsigned long long time;
+	unsigned long long entries;
+	unsigned long long overruns;
+	unsigned long missed = 0;
+	unsigned long hit = 0;
+	unsigned long avg;
+	int cnt = 0;
+
+	/*
+	 * Hammer the buffer for 10 secs (this may
+	 * make the system stall)
+	 */
+	pr_info("Starting ring buffer hammer\n");
+	do_gettimeofday(&start_tv);
+	do {
+		struct ring_buffer_event *event;
+		int *entry;
+
+		event = ring_buffer_lock_reserve(buffer, 10);
+		if (!event) {
+			missed++;
+		} else {
+			hit++;
+			entry = ring_buffer_event_data(event);
+			*entry = smp_processor_id();
+			ring_buffer_unlock_commit(buffer, event);
+		}
+		do_gettimeofday(&end_tv);
+
+		if (consumer && !(++cnt % wakeup_interval))
+			wake_up_process(consumer);
+
+	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+	pr_info("End ring buffer hammer\n");
+
+	if (consumer) {
+		/* Init both completions here to avoid races */
+		init_completion(&read_start);
+		init_completion(&read_done);
+		/* the completions must be visible before the finish var */
+		smp_wmb();
+		reader_finish = 1;
+		/* finish var visible before waking up the consumer */
+		smp_wmb();
+		wake_up_process(consumer);
+		wait_for_completion(&read_done);
+	}
+
+	time = end_tv.tv_sec - start_tv.tv_sec;
+	time *= 1000000;
+	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+	entries = ring_buffer_entries(buffer);
+	overruns = ring_buffer_overruns(buffer);
+
+	if (kill_test)
+		pr_info("ERROR!\n");
+	pr_info("Time:     %lld (usecs)\n", time);
+	pr_info("Overruns: %lld\n", overruns);
+	if (disable_reader)
+		pr_info("Read:     (reader disabled)\n");
+	else
+		pr_info("Read:     %ld  (by %s)\n", read,
+			read_events ? "events" : "pages");
+	pr_info("Entries:  %lld\n", entries);
+	pr_info("Total:    %lld\n", entries + overruns + read);
+	pr_info("Missed:   %ld\n", missed);
+	pr_info("Hit:      %ld\n", hit);
+
+	do_div(time, 1000);
+	if (time)
+		hit /= (long)time;
+	else
+		pr_info("TIME IS ZERO??\n");
+
+	pr_info("Entries per millisec: %ld\n", hit);
+
+	if (hit) {
+		avg = 1000000 / hit;
+		pr_info("%ld ns per entry\n", avg);
+	}
+}
+
+static void wait_to_die(void)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	while (!kthread_should_stop()) {
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+}
+
+static int ring_buffer_consumer_thread(void *arg)
+{
+	while (!kthread_should_stop() && !kill_test) {
+		complete(&read_start);
+
+		ring_buffer_consumer();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop() || kill_test)
+			break;
+
+		schedule();
+		__set_current_state(TASK_RUNNING);
+	}
+	__set_current_state(TASK_RUNNING);
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int ring_buffer_producer_thread(void *arg)
+{
+	init_completion(&read_start);
+
+	while (!kthread_should_stop() && !kill_test) {
+		ring_buffer_reset(buffer);
+
+		if (consumer) {
+			smp_wmb();
+			wake_up_process(consumer);
+			wait_for_completion(&read_start);
+		}
+
+		ring_buffer_producer();
+
+		pr_info("Sleeping for 10 secs\n");
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ * SLEEP_TIME);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	if (kill_test)
+		wait_to_die();
+
+	return 0;
+}
+
+static int __init ring_buffer_benchmark_init(void)
+{
+	int ret;
+
+	/* make a one meg buffer in overwite mode */
+	buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+	if (!buffer)
+		return -ENOMEM;
+
+	if (!disable_reader) {
+		consumer = kthread_create(ring_buffer_consumer_thread,
+					  NULL, "rb_consumer");
+		ret = PTR_ERR(consumer);
+		if (IS_ERR(consumer))
+			goto out_fail;
+	}
+
+	producer = kthread_run(ring_buffer_producer_thread,
+			       NULL, "rb_producer");
+	ret = PTR_ERR(producer);
+
+	if (IS_ERR(producer))
+		goto out_kill;
+
+	return 0;
+
+ out_kill:
+	if (consumer)
+		kthread_stop(consumer);
+
+ out_fail:
+	ring_buffer_free(buffer);
+	return ret;
+}
+
+static void __exit ring_buffer_benchmark_exit(void)
+{
+	kthread_stop(producer);
+	if (consumer)
+		kthread_stop(consumer);
+	ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
-- 
GitLab


From b2e5d8588de0b5341eddad87dbe48d2185eaa3dd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 6 May 2009 07:55:33 +0200
Subject: [PATCH 0847/2198] irq: change ->set_affinity() to return status, fix

This build failure:

 arch/powerpc/sysdev/mpic.c:810: error: conflicting types for 'mpic_set_affinity'
 arch/powerpc/sysdev/mpic.h:39: error: previous declaration of 'mpic_set_affinity' was here
 make[2]: *** [arch/powerpc/sysdev/mpic.o] Error 1
 make[2]: *** Waiting for unfinished jobs....

Triggers because the function prototype was not updated when the
function call signature got changed by:

   d5dedd4: irq: change ->set_affinity() to return status

[ Impact: build fix on powerpc ]

Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: linux-arch@vger.kernel.org
LKML-Reference: <49F654E9.4070809@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/sysdev/mpic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/sysdev/mpic.h b/arch/powerpc/sysdev/mpic.h
index 3cef2af10f425..eff433c322a0c 100644
--- a/arch/powerpc/sysdev/mpic.h
+++ b/arch/powerpc/sysdev/mpic.h
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
 
 extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
 extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
+extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 #endif /* _POWERPC_SYSDEV_MPIC_H */
-- 
GitLab


From fd6da10a617f483348ee32bcfe53fd20c302eca1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:32:13 +0800
Subject: [PATCH 0848/2198] tracing/events: don't say hi when loading the trace
 event sample

The sample is useful for testing, and I'm using it. But after
loading the module, it keeps saying hi every 10 seconds, this may
be disturbing.

Also Steven said commenting out the "hi" helped in causing races. :)

[ Impact: make testing a bit easier ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A00F6AD.2070008@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 samples/trace_events/trace-events-sample.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
index f33b3ba744ac8..aabc4e9709112 100644
--- a/samples/trace_events/trace-events-sample.c
+++ b/samples/trace_events/trace-events-sample.c
@@ -16,10 +16,6 @@ static void simple_thread_func(int cnt)
 	set_current_state(TASK_INTERRUPTIBLE);
 	schedule_timeout(HZ);
 	trace_foo_bar("hello", cnt);
-
-	if (!(cnt % 10))
-		/* It is really important that I say "hi!" */
-		printk(KERN_EMERG "hi!\n");
 }
 
 static int simple_thread(void *arg)
-- 
GitLab


From 96d17980fabeb757706d2d6db5a28580a6156bfc Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:32:32 +0800
Subject: [PATCH 0849/2198] tracing/events: make SAMPLE_TRACE_EVENTS default to
 n

Normally a config should be default to n. This patch also makes the
sample module-only, like SAMPLE_MARKERS and SAMPLE_TRACEPOINTS.

[ Impact: don't build trace event sample by default ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A00F6C0.8090803@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 samples/Kconfig | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/samples/Kconfig b/samples/Kconfig
index 93f41c0510920..b75d28cba3f75 100644
--- a/samples/Kconfig
+++ b/samples/Kconfig
@@ -20,9 +20,8 @@ config SAMPLE_TRACEPOINTS
 	  This build tracepoints example modules.
 
 config SAMPLE_TRACE_EVENTS
-	tristate "Build trace_events examples"
-	depends on EVENT_TRACING
-	default m
+	tristate "Build trace_events examples -- loadable modules only"
+	depends on EVENT_TRACING && m
 	help
 	  This build trace event example modules.
 
-- 
GitLab


From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:33:04 +0800
Subject: [PATCH 0850/2198] tracing/events: fix memory leak when unloading
 module

When unloading a module, memory allocated by init_preds() and
trace_define_field() is not freed.

[ Impact: fix memory leak ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace_event.h       |  1 +
 kernel/trace/trace_events.c        | 18 ++++++++++++++++++
 kernel/trace/trace_events_filter.c | 22 +++++++++++++++-------
 3 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 5fff40c9ff59c..662c1becf367f 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -116,6 +116,7 @@ struct ftrace_event_call {
 #define MAX_FILTER_STR_VAL	128
 
 extern int init_preds(struct ftrace_event_call *call);
+extern void destroy_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
 extern int filter_current_check_discard(struct ftrace_event_call *call,
 					void *rec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f789ca540fe1a..f251a150e75e8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -60,6 +60,22 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 }
 EXPORT_SYMBOL_GPL(trace_define_field);
 
+#ifdef CONFIG_MODULES
+
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+	struct ftrace_event_field *field, *next;
+
+	list_for_each_entry_safe(field, next, &call->fields, link) {
+		list_del(&field->link);
+		kfree(field->type);
+		kfree(field->name);
+		kfree(field);
+	}
+}
+
+#endif /* CONFIG_MODULES */
+
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call;
@@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod)
 				unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
+			trace_destroy_fields(call);
+			destroy_preds(call);
 		}
 	}
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index f49486687ee2a..ce07b81867102 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call)
 		filter->preds[i]->fn = filter_pred_none;
 }
 
+void destroy_preds(struct ftrace_event_call *call)
+{
+	struct event_filter *filter = call->filter;
+	int i;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (filter->preds[i])
+			filter_free_pred(filter->preds[i]);
+	}
+	kfree(filter->preds);
+	kfree(filter);
+	call->filter = NULL;
+}
+
 int init_preds(struct ftrace_event_call *call)
 {
 	struct event_filter *filter;
@@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call)
 	return 0;
 
 oom:
-	for (i = 0; i < MAX_FILTER_PRED; i++) {
-		if (filter->preds[i])
-			filter_free_pred(filter->preds[i]);
-	}
-	kfree(filter->preds);
-	kfree(call->filter);
-	call->filter = NULL;
+	destroy_preds(call);
 
 	return -ENOMEM;
 }
-- 
GitLab


From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 6 May 2009 10:33:45 +0800
Subject: [PATCH 0851/2198] tracing/events: fix concurrent access to
 ftrace_events list

A module will add/remove its trace events when it gets loaded/unloaded, so
the ftrace_events list is not "const", and concurrent access needs to be
protected.

This patch thus fixes races between loading/unloding modules and read
'available_events' or read/write 'set_event', etc.

Below shows how to reproduce the race:

 # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null &
 # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } &

After a while:

BUG: unable to handle kernel paging request at 0010011c
IP: [<c1080f27>] t_next+0x1b/0x2d
...
Call Trace:
 [<c10c90e6>] ? seq_read+0x217/0x30d
 [<c10c8ecf>] ? seq_read+0x0/0x30d
 [<c10b4c19>] ? vfs_read+0x8f/0x136
 [<c10b4fc3>] ? sys_read+0x40/0x65
 [<c1002a68>] ? sysenter_do_call+0x12/0x36

[ Impact: fix races when concurrent accessing ftrace_events list ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4A00F709.3080800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  1 +
 kernel/trace/trace_event_profile.c | 19 ++++++++++++++-----
 kernel/trace/trace_events.c        | 20 +++++++++++---------
 kernel/trace/trace_events_filter.c | 10 +++++++---
 4 files changed, 33 insertions(+), 17 deletions(-)

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7736fe8c1b762..777c6c3a0cdee 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event,	\
 	return match;							\
 }
 
+extern struct mutex event_mutex;
 extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index 7bf2ad65eee58..5b5895afecfe4 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -10,21 +10,30 @@
 int ftrace_profile_enable(int event_id)
 {
 	struct ftrace_event_call *event;
+	int ret = -EINVAL;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id)
-			return event->profile_enable(event);
+		if (event->id == event_id) {
+			ret = event->profile_enable(event);
+			break;
+		}
 	}
+	mutex_unlock(&event_mutex);
 
-	return -EINVAL;
+	return ret;
 }
 
 void ftrace_profile_disable(int event_id)
 {
 	struct ftrace_event_call *event;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(event, &ftrace_events, list) {
-		if (event->id == event_id)
-			return event->profile_disable(event);
+		if (event->id == event_id) {
+			event->profile_disable(event);
+			break;
+		}
 	}
+	mutex_unlock(&event_mutex);
 }
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f251a150e75e8..8d579ff236101 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -21,7 +21,7 @@
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
 
 LIST_HEAD(ftrace_events);
 
@@ -80,6 +80,7 @@ static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 
 		if (call->enabled) {
@@ -87,6 +88,7 @@ static void ftrace_clear_events(void)
 			call->unregfunc();
 		}
 	}
+	mutex_unlock(&event_mutex);
 }
 
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
@@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return t_next(m, NULL, pos);
 }
 
@@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
+	mutex_lock(&event_mutex);
+	if (*pos == 0)
+		m->private = ftrace_events.next;
 	return s_next(m, NULL, pos);
 }
 
@@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v)
 
 static void t_stop(struct seq_file *m, void *p)
 {
+	mutex_unlock(&event_mutex);
 }
 
 static int
 ftrace_event_seq_open(struct inode *inode, struct file *file)
 {
-	int ret;
 	const struct seq_operations *seq_ops;
 
 	if ((file->f_mode & FMODE_WRITE) &&
@@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 		ftrace_clear_events();
 
 	seq_ops = inode->i_private;
-	ret = seq_open(file, seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-
-		m->private = ftrace_events.next;
-	}
-	return ret;
+	return seq_open(file, seq_ops);
 }
 
 static ssize_t
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index ce07b81867102..7ac691085276a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 		filter->n_preds = 0;
 	}
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->define_fields)
 			continue;
@@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system)
 			remove_filter_string(call->filter);
 		}
 	}
+	mutex_unlock(&event_mutex);
 }
 
 static int filter_add_pred_fn(struct filter_parse_state *ps,
@@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 {
 	struct event_filter *filter = system->filter;
 	struct ftrace_event_call *call;
+	int err = 0;
 
 	if (!filter->preds) {
 		filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
@@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 	filter->preds[filter->n_preds] = pred;
 	filter->n_preds++;
 
+	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-		int err;
 
 		if (!call->define_fields)
 			continue;
@@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 		if (err) {
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			return err;
+			break;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
+	mutex_unlock(&event_mutex);
 
-	return 0;
+	return err;
 }
 
 static void parse_init(struct filter_parse_state *ps,
-- 
GitLab


From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Tue, 5 May 2009 16:49:59 +0800
Subject: [PATCH 0852/2198] tracepoint: trace_sched_migrate_task(): remove
 parameter

The orig_cpu parameter in trace_sched_migrate_task() is not necessary,
it can be got by using task_cpu(p) in the probe.

[ Impact: micro-optimization ]

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
[ modified from Mathieu's patch. The original patch is at:
  http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ]
Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: zhaolei@cn.fujitsu.com
Cc: laijs@cn.fujitsu.com
LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/sched.h | 6 +++---
 kernel/sched.c               | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ffa1cab586b9b..dd4033cf5b091 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -180,9 +180,9 @@ TRACE_EVENT(sched_switch,
  */
 TRACE_EVENT(sched_migrate_task,
 
-	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+	TP_PROTO(struct task_struct *p, int dest_cpu),
 
-	TP_ARGS(p, orig_cpu, dest_cpu),
+	TP_ARGS(p, dest_cpu),
 
 	TP_STRUCT__entry(
 		__array(	char,	comm,	TASK_COMM_LEN	)
@@ -196,7 +196,7 @@ TRACE_EVENT(sched_migrate_task,
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
-		__entry->orig_cpu	= orig_cpu;
+		__entry->orig_cpu	= task_cpu(p);
 		__entry->dest_cpu	= dest_cpu;
 	),
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 9f7ffd00b6ea4..9cdedbd181ce1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	clock_offset = old_rq->clock - new_rq->clock;
 
-	trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+	trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
 	if (p->se.wait_start)
-- 
GitLab


From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:27:26 -0400
Subject: [PATCH 0853/2198] blktrace: correct remap names

This attempts to clarify names utilized during block I/O remap
operations (partition, volume manager). It correctly matches up the
/from/ information for both device & sector. This takes in the concept
from Kosaki Motohiro and extends it to include better naming for the
"device_from" field.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF4FAE.3000301@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/blktrace_api.h |  4 ++--
 include/trace/block.h        |  4 ++--
 kernel/trace/blktrace.c      | 24 ++++++++++++------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 62763c9528546..82b4636030e9e 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -116,9 +116,9 @@ struct blk_io_trace {
  * The remap event
  */
 struct blk_io_trace_remap {
-	__be32 device;
 	__be32 device_from;
-	__be64 sector;
+	__be32 device_to;
+	__be64 sector_from;
 };
 
 enum {
diff --git a/include/trace/block.h b/include/trace/block.h
index 25b7068b819e1..87f6456fd32e0 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from, sector_t to),
-	      TP_ARGS(q, bio, dev, from, to));
+		 sector_t to, sector_t from),
+	      TP_ARGS(q, bio, dev, to, from));
 
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c32062bd10b39..f8d46d6f5d344 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @dev:	target device
- * @from:	source sector
  * @to:		target sector
+ * @from:	source sector
  *
  * Description:
  *     Device mapper or raid target sometimes need to split a bio because
@@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
+				       dev_t dev, sector_t to, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	if (likely(!bt))
 		return;
 
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
+	r.device_from = cpu_to_be32(dev);
+	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector_from = cpu_to_be64(from);
 
 	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
 			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
@@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
 			  struct blk_io_trace_remap *r)
 {
 	const struct blk_io_trace_remap *__r = pdu_start(ent);
-	__u64 sector = __r->sector;
+	__u64 sector_from = __r->sector_from;
 
-	r->device = be32_to_cpu(__r->device);
 	r->device_from = be32_to_cpu(__r->device_from);
-	r->sector = be64_to_cpu(sector);
+	r->device_to   = be32_to_cpu(__r->device_to);
+	r->sector_from = be64_to_cpu(sector_from);
 }
 
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s,
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 {
-	struct blk_io_trace_remap r = { .device = 0, };
+	struct blk_io_trace_remap r = { .device_from = 0, };
 
 	get_pdu_remap(ent, &r);
 	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-			       t_sector(ent),
-			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
-			       (unsigned long long)r.sector);
+				t_sector(ent), t_sec(ent),
+				MAJOR(r.device_from), MINOR(r.device_from),
+				(unsigned long long)r.sector_from);
 }
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
-- 
GitLab


From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:35:08 -0400
Subject: [PATCH 0854/2198] blktrace: from-sector redundant in
 trace_block_remap

Remove redundant from-sector parameter: it's /always/ the bio's sector
passed in.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF517C.7000503@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blk-core.c        | 5 ++---
 drivers/md/dm.c         | 3 +--
 include/trace/block.h   | 4 ++--
 kernel/trace/blktrace.c | 8 ++++----
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 07ab75403e1a5..a5f747a8312e4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1275,7 +1275,7 @@ static inline void blk_partition_remap(struct bio *bio)
 		bio->bi_bdev = bdev->bd_contains;
 
 		trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-				    bdev->bd_dev, bio->bi_sector,
+				    bdev->bd_dev,
 				    bio->bi_sector - p->start_sect);
 	}
 }
@@ -1444,8 +1444,7 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 
 		if (old_sector != -1)
-			trace_block_remap(q, bio, old_dev, bio->bi_sector,
-					    old_sector);
+			trace_block_remap(q, bio, old_dev, old_sector);
 
 		trace_block_bio_queue(q, bio);
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8a994be035ba4..b01514afb6b5d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		/* the bio has been remapped so dispatch it */
 
 		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+				    tio->io->bio->bi_bdev->bd_dev, sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
diff --git a/include/trace/block.h b/include/trace/block.h
index 87f6456fd32e0..8ac945b7746ea 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t to, sector_t from),
-	      TP_ARGS(q, bio, dev, to, from));
+		 sector_t to),
+	      TP_ARGS(q, bio, dev, to));
 
 #endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f8d46d6f5d344..e099f8cc1d1c4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @q:		queue the io is for
  * @bio:	the source bio
  * @dev:	target device
- * @to:		target sector
  * @from:	source sector
  *
  * Description:
@@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t to, sector_t from)
+				       dev_t dev, sector_t from)
 {
 	struct blk_trace *bt = q->blk_trace;
 	struct blk_io_trace_remap r;
@@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 	r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
 	r.sector_from = cpu_to_be64(from);
 
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+			BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+			sizeof(r), &r);
 }
 
 /**
-- 
GitLab


From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 6 May 2009 15:45:45 +0530
Subject: [PATCH 0855/2198] tracing: trace_output.c, fix false positive
 compiler warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This compiler warning:

  CC      kernel/trace/trace_output.o
 kernel/trace/trace_output.c: In function ‘register_ftrace_event’:
 kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function

Is wrong as 'list' is always initialized - but GCC (4.3.2) does not
recognize this relationship properly.

Work around the warning by initializing the variable to NULL.

[ Impact: fix false positive compiler warning ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5fc51f0f75fc0..8bd9a2c1a46a0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event)
 	INIT_LIST_HEAD(&event->list);
 
 	if (!event->type) {
-		struct list_head *list;
+		struct list_head *list = NULL;
 
 		if (next_event_type > FTRACE_MAX_EVENT) {
 
-- 
GitLab


From 35cf723e99c0e26ddf51f037dffaa4ff2c2c9106 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Wed, 6 May 2009 12:33:38 +0200
Subject: [PATCH 0856/2198] tracing: small trave_events sample Makefile cleanup

Use -I$(src) to add the current directory the include path.

[ Impact: cleanup ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 samples/trace_events/Makefile | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
index 06c6dea1eb84c..0d428dc672833 100644
--- a/samples/trace_events/Makefile
+++ b/samples/trace_events/Makefile
@@ -1,8 +1,6 @@
 # builds the trace events example kernel modules;
 # then to use one (as root):  insmod <module_name.ko>
 
-PWD := $(shell pwd)
-
-CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/
+CFLAGS_trace-events-sample.o := -I$(src)
 
 obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
-- 
GitLab


From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 10:26:45 -0400
Subject: [PATCH 0857/2198] ring-buffer: remove unneeded conditional in
 rb_reserve_next

The code in __rb_reserve_next checks on page overflow if it is the
original commiter and then resets the page back to the original
setting.  Although this is fine, and the code is correct, it is
a bit fragil. Some experimental work I did breaks it easily.

The better and more robust solution is to have all commiters that
overflow the page, simply subtract what they added.

[ Impact: more robust ring buffer account management ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 424129eb20a45..03ed52b67db30 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		rb_event_set_padding(event);
 	}
 
-	if (tail <= BUF_PAGE_SIZE)
-		/* Set the write back to the previous setting */
-		local_set(&tail_page->write, tail);
+	/* Set the write back to the previous setting */
+	local_sub(length, &tail_page->write);
 
 	/*
 	 * If this was a commit entry that failed,
@@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
  out_reset:
 	/* reset write */
-	if (tail <= BUF_PAGE_SIZE)
-		local_set(&tail_page->write, tail);
+	local_sub(length, &tail_page->write);
 
 	if (likely(lock_taken))
 		__raw_spin_unlock(&cpu_buffer->lock);
-- 
GitLab


From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 12:40:51 -0400
Subject: [PATCH 0858/2198] ring-buffer: check for failed allocation in ring
 buffer benchmark

The result of the allocation of the ring buffer read page in the
ring buffer bench mark does not check the return to see if a page
was actually allocated. This patch fixes that.

[ Impact: avoid NULL dereference ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 747244acb8fdb..dcd75e9e49f44 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -84,6 +84,9 @@ static enum event_status read_page(int cpu)
 	int i;
 
 	bpage = ring_buffer_alloc_read_page(buffer);
+	if (!bpage)
+		return EVENT_DROPPED;
+
 	ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
 	if (ret >= 0) {
 		rpage = bpage;
-- 
GitLab


From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 15:30:07 -0400
Subject: [PATCH 0859/2198] ring-buffer: make moving the tail page a separate
 function

Ingo Molnar thought the code would be cleaner if we used a function call
instead of a goto for moving the tail page. After implementing this,
it seems that gcc still inlines the result and the output is pretty much
the same. Since this is considered a cleaner approach, might as well
implement it.

[ Impact: code clean up ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 89 +++++++++++++++++++++-----------------
 1 file changed, 49 insertions(+), 40 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 03ed52b67db30..3ae5ccf2c0fcd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length)
 	return length;
 }
 
+
 static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-		  unsigned type, unsigned long length, u64 *ts)
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+	     unsigned long length, unsigned long tail,
+	     struct buffer_page *commit_page,
+	     struct buffer_page *tail_page, u64 *ts)
 {
-	struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
-	struct buffer_page *next_page;
-	unsigned long tail, write;
+	struct buffer_page *next_page, *head_page, *reader_page;
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
-	unsigned long flags;
 	bool lock_taken = false;
-
-	commit_page = cpu_buffer->commit_page;
-	/* we just need to protect against interrupts */
-	barrier();
-	tail_page = cpu_buffer->tail_page;
-	write = local_add_return(length, &tail_page->write);
-	tail = write - length;
-
-	/* See if we shot pass the end of this buffer page */
-	if (write > BUF_PAGE_SIZE)
-		goto next_page;
-
-	/* We reserved something on the buffer */
-
-	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
-		return NULL;
-
-	event = __rb_page_index(tail_page, tail);
-	rb_update_event(event, type, length);
-
-	/* The passed in type is zero for DATA */
-	if (likely(!type))
-		local_inc(&tail_page->entries);
-
-	/*
-	 * If this is a commit and the tail is zero, then update
-	 * this page's time stamp.
-	 */
-	if (!tail && rb_is_commit(cpu_buffer, event))
-		cpu_buffer->commit_page->page->time_stamp = *ts;
-
-	return event;
-
- next_page:
+	unsigned long flags;
 
 	next_page = tail_page;
 
@@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return NULL;
 }
 
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+		  unsigned type, unsigned long length, u64 *ts)
+{
+	struct buffer_page *tail_page, *commit_page;
+	struct ring_buffer_event *event;
+	unsigned long tail, write;
+
+	commit_page = cpu_buffer->commit_page;
+	/* we just need to protect against interrupts */
+	barrier();
+	tail_page = cpu_buffer->tail_page;
+	write = local_add_return(length, &tail_page->write);
+	tail = write - length;
+
+	/* See if we shot pass the end of this buffer page */
+	if (write > BUF_PAGE_SIZE)
+		return rb_move_tail(cpu_buffer, length, tail,
+				    commit_page, tail_page, ts);
+
+	/* We reserved something on the buffer */
+
+	if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE))
+		return NULL;
+
+	event = __rb_page_index(tail_page, tail);
+	rb_update_event(event, type, length);
+
+	/* The passed in type is zero for DATA */
+	if (likely(!type))
+		local_inc(&tail_page->entries);
+
+	/*
+	 * If this is a commit and the tail is zero, then update
+	 * this page's time stamp.
+	 */
+	if (!tail && rb_is_commit(cpu_buffer, event))
+		cpu_buffer->commit_page->page->time_stamp = *ts;
+
+	return event;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
-- 
GitLab


From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 18:36:59 -0400
Subject: [PATCH 0860/2198] ring-buffer: change test to be more latency
 friendly

The ring buffer benchmark/test runs a producer for 10 seconds.
This is done with preemption and interrupts enabled. But if the kernel
is not compiled with CONFIG_PREEMPT, it basically stops everything
but interrupts for 10 seconds.

Although this is just a test and is not for production, this attribute
can be quite annoying. It can also spawn badness elsewhere.

This patch solves the issues by calling "cond_resched" when the system
is not compiled with CONFIG_PREEMPT. It also keeps track of the time
spent to call cond_resched such that it does not go against the
time calculations. That is, if the task schedules away, the time scheduled
out is removed from the test data. Note, this only works for non PREEMPT
because we do not know when the task is scheduled out if we have PREEMPT
enabled.

[ Impact: prevent test from stopping the world for 10 seconds ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 31 ++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index dcd75e9e49f44..a26fc67b63bb8 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -185,6 +185,35 @@ static void ring_buffer_consumer(void)
 	complete(&read_done);
 }
 
+/*
+ * If we are a non preempt kernel, the 10 second run will
+ * stop everything while it runs. Instead, we will call cond_resched
+ * and also add any time that was lost by a rescedule.
+ */
+#ifdef CONFIG_PREEMPT
+static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
+{
+}
+#else
+static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
+{
+	struct timeval tv;
+
+	cond_resched();
+	do_gettimeofday(&tv);
+	if (tv.tv_usec < end_tv->tv_usec) {
+		tv.tv_usec += 1000000;
+		tv.tv_sec--;
+	}
+	start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec;
+	start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec;
+	if (start_tv->tv_usec > 1000000) {
+		start_tv->tv_usec -= 1000000;
+		start_tv->tv_sec++;
+	}
+}
+#endif
+
 static void ring_buffer_producer(void)
 {
 	struct timeval start_tv;
@@ -221,6 +250,8 @@ static void ring_buffer_producer(void)
 		if (consumer && !(++cnt % wakeup_interval))
 			wake_up_process(consumer);
 
+		sched_if_needed(&start_tv, &end_tv);
+
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
 
-- 
GitLab


From 71e1c8ac42ae4038ddb1367cce7097ab868dc532 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 21:20:39 -0400
Subject: [PATCH 0861/2198] tracing: update sample with TRACE_INCLUDE_FILE

When creating trace events for ftrace, the header file with the TRACE_EVENT
macros must also have a macro called TRACE_SYSTEM. This macro describes
the name of the system the TRACE_EVENTS are defined for. It also doubles
as a way for the define_trace.h file to include the file that included
it.

For example:

in irq.h

 #define TRACE_SYSTEM irq

[...]

 #include <trace/define_trace.h>

The define_trace will use TRACE_SYSTEM to include irq.h. But if the name
of the trace system does not match the name of the trace header file,
one can override it with:

Which will change define_trace.h to inclued foo_trace.h instead of foo.h

The sample comments this, but people that use the sample code will more
likely use the code and not read the comments. This patch changes the
sample code to use the TRACE_INCLUDE_FILE to better show developers how to
use it.

[ Impact: make sample less confusing to developers ]

Reported-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 samples/trace_events/trace-events-sample.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
index eab46443e610e..128a897687c51 100644
--- a/samples/trace_events/trace-events-sample.h
+++ b/samples/trace_events/trace-events-sample.h
@@ -31,7 +31,7 @@
  *
  */
 #undef TRACE_SYSTEM
-#define TRACE_SYSTEM trace-events-sample
+#define TRACE_SYSTEM sample
 
 /*
  * The TRACE_EVENT macro is broken up into 5 parts.
@@ -120,5 +120,10 @@ TRACE_EVENT(foo_bar,
  * result.
  */
 #undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
 #define TRACE_INCLUDE_PATH .
+/*
+ * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
+ */
+#define TRACE_INCLUDE_FILE trace-events-sample
 #include <trace/define_trace.h>
-- 
GitLab


From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 21:54:09 -0400
Subject: [PATCH 0862/2198] tracing: reset ring buffer when removing modules
 with events

Li Zefan found that there's a race using the event ids of events and
modules. When a module is loaded, an event id is incremented. We only
have 16 bits for event ids (65536) and there is a possible (but highly
unlikely) race that we could load and unload a module that registers
events so many times that the event id counter overflows.

When it overflows, it then restarts and goes looking for available
ids. An id is available if it was added by a module and released.

The race is if you have one module add an id, and then is removed.
Another module loaded can use that same event id. But if the old module
still had events in the ring buffer, the new module's call back would
get bogus data.  At best (and most likely) the output would just be
garbage. But if the module for some reason used pointers (not recommended)
then this could potentially crash.

The safest thing to do is just reset the ring buffer if a module that
registered events is removed.

[ Impact: prevent unpredictable results of event id overflows ]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c        | 10 ++++++++++
 kernel/trace/trace.h        |  2 ++
 kernel/trace/trace_events.c |  9 +++++++++
 3 files changed, 21 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4164a344e72aa..dd40d23203462 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 		tracing_reset(tr, cpu);
 }
 
+void tracing_reset_current(int cpu)
+{
+	tracing_reset(&global_trace, cpu);
+}
+
+void tracing_reset_current_online_cpus(void)
+{
+	tracing_reset_online_cpus(&global_trace);
+}
+
 #define SAVED_CMDLINES 128
 #define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 777c6c3a0cdee..ba25793ffe671 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -409,6 +409,8 @@ int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *trace_create_file(const char *name,
 				 mode_t mode,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8d579ff236101..6d2c842a02482 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod)
 {
 	struct ftrace_module_file_ops *file_ops;
 	struct ftrace_event_call *call, *p;
+	bool found = false;
 
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
+			found = true;
 			if (call->enabled) {
 				call->enabled = 0;
 				call->unregfunc();
@@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod)
 		list_del(&file_ops->list);
 		kfree(file_ops);
 	}
+
+	/*
+	 * It is safest to reset the ring buffer if the module being unloaded
+	 * registered any events.
+	 */
+	if (found)
+		tracing_reset_current_online_cpus();
 }
 
 static int trace_module_notify(struct notifier_block *self,
-- 
GitLab


From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 6 May 2009 22:52:15 -0400
Subject: [PATCH 0863/2198] tracing: add hierarchical enabling of events

With the current event directory, you can only enable individual events.
The file debugfs/tracing/set_event is used to be able to enable or
disable several events at once. But that can still be awkward.

This patch adds hierarchical enabling of events. That is, each directory
in debugfs/tracing/events has an "enable" file. This file can enable
or disable all events within the directory and below.

 # echo 1 > /debugfs/tracing/events/enable

will enable all events.

 # echo 1 > /debugfs/tracing/events/sched/enable

will enable all events in the sched subsystem.

 # echo 1 > /debugfs/tracing/events/enable
 # echo 0 > /debugfs/tracing/events/irq/enable

will enable all events, but then disable just the irq subsystem events.

When reading one of these enable files, there are four results:

 0 - all events this file affects are disabled
 1 - all events this file affects are enabled
 X - there is a mixture of events enabled and disabled
 ? - this file does not affect any event

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 140 ++++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6d2c842a02482..87feb0117ce28 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	struct ftrace_event_call *call;
+	char buf[2];
+	int set = -1;
+	int all = 0;
+	int ret;
+
+	if (system[0] == '*')
+		all = 1;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!all && strcmp(call->system, system) != 0)
+			continue;
+
+		/*
+		 * We need to find out if all the events are set
+		 * or if all events or cleared, or if we have
+		 * a mixture.
+		 */
+		if (call->enabled) {
+			switch (set) {
+			case -1:
+				set = 1;
+				break;
+			case 0:
+				set = 2;
+				break;
+			}
+		} else {
+			switch (set) {
+			case -1:
+				set = 0;
+				break;
+			case 1:
+				set = 2;
+				break;
+			}
+		}
+		/*
+		 * If we have a mixture, no need to look further.
+		 */
+		if (set == 2)
+			break;
+	}
+	mutex_unlock(&event_mutex);
+
+	buf[1] = '\n';
+	switch (set) {
+	case 0:
+		buf[0] = '0';
+		break;
+	case 1:
+		buf[0] = '1';
+		break;
+	case 2:
+		buf[0] = 'X';
+		break;
+	default:
+		buf[0] = '?';
+	}
+
+	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+	return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		    loff_t *ppos)
+{
+	const char *system = filp->private_data;
+	unsigned long val;
+	char *command;
+	char buf[64];
+	ssize_t ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+	case 1:
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	command = kstrdup(system, GFP_KERNEL);
+	if (!command)
+		return -ENOMEM;
+
+	ret = ftrace_set_clr_event(command, val);
+	if (ret)
+		goto out_free;
+
+	ret = cnt;
+
+ out_free:
+	kfree(command);
+
+	*ppos += cnt;
+
+	return ret;
+}
+
 extern char *__bad_type_size(void);
 
 #undef FIELD
@@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
 	.write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_system_enable_fops = {
+	.open = tracing_open_generic,
+	.read = system_enable_read,
+	.write = system_enable_write,
+};
+
 static const struct file_operations ftrace_show_header_fops = {
 	.open = tracing_open_generic,
 	.read = show_header,
@@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 			   "'%s/filter' entry\n", name);
 	}
 
+	entry = trace_create_file("enable", 0644, system->entry,
+				  (void *)system->name,
+				  &ftrace_system_enable_fops);
+
 	return system->entry;
 }
 
@@ -1041,6 +1178,9 @@ static __init int event_trace_init(void)
 			  ring_buffer_print_entry_header,
 			  &ftrace_show_header_fops);
 
+	trace_create_file("enable", 0644, d_events,
+			  "*:*", &ftrace_system_enable_fops);
+
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
 		if (!call->name)
-- 
GitLab


From 975e5f45500dff6d15c0001bb662e9aac0ce0076 Mon Sep 17 00:00:00 2001
From: Samuel Bronson <naesten@gmail.com>
Date: Wed, 6 May 2009 22:27:55 -0400
Subject: [PATCH 0864/2198] x86: use symbolic name for VM86_SIGNAL when used as
 vm86 default return

This code has apparently used "0" and not VM86_SIGNAL since Linux
1.1.9, when Linus added VM86_SIGNAL to vm86.h. This patch changes the
code to use the symbolic name.

The magic 0 tripped me up in trying to extend the vm86(2) manpage to
actually explain vm86()'s interface -- my greps for VM86_SIGNAL came up
fruitless.

[ Impact: cleanup; no object code change ]

Signed-off-by: Samuel Bronson <naesten@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/vm86_32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c7..b8035a0f4048a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -318,9 +318,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	}
 
 /*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
-	info->regs32->ax = 0;
+	info->regs32->ax = VM86_SIGNAL;
 	tsk->thread.saved_sp0 = tsk->thread.sp0;
 	tsk->thread.saved_fs = info->regs32->fs;
 	tsk->thread.saved_gs = get_user_gs(info->regs32);
-- 
GitLab


From 40c8bca76ecaa6b663d403d34f0fcd422bbdbffd Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 15:24:36 +0900
Subject: [PATCH 0865/2198] sh: Flag IRQSTACKS as BROKEN for now.

There still seems to be some stack corruption to sort out here, so flag
this as BROKEN until this issue is sorted out.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index 0d62681f72a09..fdb049373e7c0 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -92,7 +92,7 @@ config 4KSTACKS
 
 config IRQSTACKS
 	bool "Use separate kernel stacks when processing interrupts"
-	depends on DEBUG_KERNEL && SUPERH32
+	depends on DEBUG_KERNEL && SUPERH32 && BROKEN
 	help
 	  If you say Y here the kernel will use separate kernel stacks
 	  for handling hard and soft interrupts.  This can help avoid
-- 
GitLab


From 643bec956544d376b7c2a80a3d5c3d0bf94da8d3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 7 May 2009 09:12:50 +0200
Subject: [PATCH 0866/2198] x86: clean up arch/x86/kernel/tsc_sync.c a bit

 - remove unused define
 - make the lock variable definition stand out some more
 - convert KERN_* to pr_info() / pr_warning()

[ Impact: cleanup ]

LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/tsc_sync.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index bf36328f6ef92..027b5b498993b 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
  * of a critical section, to be able to prove TSC time-warps:
  */
 static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
 static __cpuinitdata cycles_t last_tsc;
 static __cpuinitdata cycles_t max_warp;
 static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
 		return;
 
 	if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
-		printk(KERN_INFO
-		       "Skipping synchronization checks as TSC is reliable.\n");
+		pr_info("Skipping synchronization checks as TSC is reliable.\n");
 		return;
 	}
 
-	printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
-			  smp_processor_id(), cpu);
+	pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+		smp_processor_id(), cpu);
 
 	/*
 	 * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
 
 	if (nr_warps) {
 		printk("\n");
-		printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
-				    " turning off TSC clock.\n", max_warp);
+		pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+			   "turning off TSC clock.\n", max_warp);
 		mark_tsc_unstable("check_tsc_sync_source failed");
 	} else {
 		printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
 	while (atomic_read(&stop_count) != cpus)
 		cpu_relax();
 }
-#undef NR_LOOPS
-
-- 
GitLab


From aa47b7e0f89b9998dad4d1667447e8cb7703ff4e Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Mon, 4 May 2009 01:38:05 -0700
Subject: [PATCH 0867/2198] sched: emit thread info flags with stack trace

When a thread is oom killed and fails to exit, it's helpful to know which
threads have access to memory reserves if the machine livelocks.  This is
done by testing for the TIF_MEMDIE thread info flag and should be
displayed alongside stack traces to identify tasks that have access to
such reserves but are still stuck allocating pages, for instance.

It would probably be helpful in other cases as well, so all thread info
flags are emitted when showing a task.

( v2: fix warning reported by Stephen Rothwell )

[ Impact: extend debug printout info ]

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
LKML-Reference: <alpine.DEB.2.00.0905040136390.15831@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 2a43a581ead32..5aa63f50c6969 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6610,8 +6610,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
-	printk(KERN_CONT "%5lu %5d %6d\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent));
+	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+		task_pid_nr(p), task_pid_nr(p->real_parent),
+		(unsigned long)task_thread_info(p)->flags);
 
 	show_stack(p, NULL);
 }
-- 
GitLab


From ee1acbfabd5270b40ce2cfdc202070b7ca91cdff Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 16:38:16 +0900
Subject: [PATCH 0868/2198] sh: Handle shm_align_mask also for
 HAVE_ARCH_UNMAPPED_AREA_TOPDOWN.

Presently shm_align_mask is only looked at for the bottom up case, but we
still want this for proper colouring constraints in the topdown case.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/cacheflush.h |   2 -
 arch/sh/include/asm/pgtable.h    |   4 +
 arch/sh/mm/mmap.c                | 136 ++++++++++++++++++++++++++++++-
 3 files changed, 136 insertions(+), 6 deletions(-)

diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h
index 09acbc32d6c7e..4c5462daa74cf 100644
--- a/arch/sh/include/asm/cacheflush.h
+++ b/arch/sh/include/asm/cacheflush.h
@@ -75,7 +75,5 @@ extern void copy_from_user_page(struct vm_area_struct *vma,
 #define flush_cache_vmap(start, end)		flush_cache_all()
 #define flush_cache_vunmap(start, end)		flush_cache_all()
 
-#define HAVE_ARCH_UNMAPPED_AREA
-
 #endif /* __KERNEL__ */
 #endif /* __ASM_SH_CACHEFLUSH_H */
diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h
index b517ae08b9c0f..2a011b18090b6 100644
--- a/arch/sh/include/asm/pgtable.h
+++ b/arch/sh/include/asm/pgtable.h
@@ -154,6 +154,10 @@ extern void kmap_coherent_init(void);
 #define kmap_coherent_init()	do { } while (0)
 #endif
 
+/* arch/sh/mm/mmap.c */
+#define HAVE_ARCH_UNMAPPED_AREA
+#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
+
 #include <asm-generic/pgtable.h>
 
 #endif /* __ASM_SH_PGTABLE_H */
diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c
index 931f4d003fa07..1b5fdfb4e0c2a 100644
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/mm/mmap.c
  *
- * Copyright (C) 2008  Paul Mundt
+ * Copyright (C) 2008 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -21,9 +21,26 @@ EXPORT_SYMBOL(shm_align_mask);
 /*
  * To avoid cache aliases, we map the shared page with same color.
  */
-#define COLOUR_ALIGN(addr, pgoff)				\
-	((((addr) + shm_align_mask) & ~shm_align_mask) +	\
-	 (((pgoff) << PAGE_SHIFT) & shm_align_mask))
+static inline unsigned long COLOUR_ALIGN(unsigned long addr,
+					 unsigned long pgoff)
+{
+	unsigned long base = (addr + shm_align_mask) & ~shm_align_mask;
+	unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
+
+	return base + off;
+}
+
+static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr,
+					      unsigned long pgoff)
+{
+	unsigned long base = addr & ~shm_align_mask;
+	unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask;
+
+	if (base + off <= addr)
+		return base + off;
+
+	return base - off;
+}
 
 unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	unsigned long len, unsigned long pgoff, unsigned long flags)
@@ -103,6 +120,117 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 			addr = COLOUR_ALIGN(addr, pgoff);
 	}
 }
+
+unsigned long
+arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
+			  const unsigned long len, const unsigned long pgoff,
+			  const unsigned long flags)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm = current->mm;
+	unsigned long addr = addr0;
+	int do_colour_align;
+
+	if (flags & MAP_FIXED) {
+		/* We do not accept a shared mapping if it would violate
+		 * cache aliasing constraints.
+		 */
+		if ((flags & MAP_SHARED) &&
+		    ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask))
+			return -EINVAL;
+		return addr;
+	}
+
+	if (unlikely(len > TASK_SIZE))
+		return -ENOMEM;
+
+	do_colour_align = 0;
+	if (filp || (flags & MAP_SHARED))
+		do_colour_align = 1;
+
+	/* requesting a specific address */
+	if (addr) {
+		if (do_colour_align)
+			addr = COLOUR_ALIGN(addr, pgoff);
+		else
+			addr = PAGE_ALIGN(addr);
+
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	/* check if free_area_cache is useful for us */
+	if (len <= mm->cached_hole_size) {
+	        mm->cached_hole_size = 0;
+		mm->free_area_cache = mm->mmap_base;
+	}
+
+	/* either no address requested or can't fit in requested address hole */
+	addr = mm->free_area_cache;
+	if (do_colour_align) {
+		unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff);
+
+		addr = base + len;
+	}
+
+	/* make sure it can fit in the remaining address space */
+	if (likely(addr > len)) {
+		vma = find_vma(mm, addr-len);
+		if (!vma || addr <= vma->vm_start) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr-len);
+		}
+	}
+
+	if (unlikely(mm->mmap_base < len))
+		goto bottomup;
+
+	addr = mm->mmap_base-len;
+	if (do_colour_align)
+		addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+
+	do {
+		/*
+		 * Lookup failure means no vma is above this address,
+		 * else if new region fits below vma->vm_start,
+		 * return with success:
+		 */
+		vma = find_vma(mm, addr);
+		if (likely(!vma || addr+len <= vma->vm_start)) {
+			/* remember the address as a hint for next time */
+			return (mm->free_area_cache = addr);
+		}
+
+		/* remember the largest hole we saw so far */
+		if (addr + mm->cached_hole_size < vma->vm_start)
+		        mm->cached_hole_size = vma->vm_start - addr;
+
+		/* try just below the current vma->vm_start */
+		addr = vma->vm_start-len;
+		if (do_colour_align)
+			addr = COLOUR_ALIGN_DOWN(addr, pgoff);
+	} while (likely(len < vma->vm_start));
+
+bottomup:
+	/*
+	 * A failed mmap() very likely causes application failure,
+	 * so fall back to the bottom-up function here. This scenario
+	 * can happen with large stack limits and large mmap()
+	 * allocations.
+	 */
+	mm->cached_hole_size = ~0UL;
+	mm->free_area_cache = TASK_UNMAPPED_BASE;
+	addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags);
+	/*
+	 * Restore the topdown base:
+	 */
+	mm->free_area_cache = mm->mmap_base;
+	mm->cached_hole_size = ~0UL;
+
+	return addr;
+}
 #endif /* CONFIG_MMU */
 
 /*
-- 
GitLab


From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 3 May 2009 02:48:52 +0200
Subject: [PATCH 0869/2198] tracing/filters: support for filters of dynamic
 sized arrays

Currently the filtering infrastructure supports well the
numeric types and fixed sized array types.

But the recently added __string() field uses a specific
indirect offset mechanism which requires a specific
predicate. Until now it wasn't supported.

This patch adds this support and implies very few changes,
only a new predicate is needed, the management of this specific
field can be done through the usual string helpers in the
filtering infrastructure.

[ Impact: support all kinds of strings in the tracing filters ]

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events_filter.c | 44 ++++++++++++++++++++++++++++--
 1 file changed, 41 insertions(+), 3 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 7ac691085276a..01c76eb3e168a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
 	return val1 || val2;
 }
 
+/* Filter predicate for fixed sized arrays of characters */
 static int filter_pred_string(struct filter_pred *pred, void *event,
 			      int val1, int val2)
 {
@@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
 	return match;
 }
 
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+			      int val1, int val2)
+{
+	int str_loc = *(int *)(event + pred->offset);
+	char *addr = (char *)(event + str_loc);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+	match = (!cmp) ^ pred->not;
+
+	return match;
+}
+
 static int filter_pred_none(struct filter_pred *pred, void *event,
 			    int val1, int val2)
 {
@@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
 	return 0;
 }
 
+enum {
+	FILTER_STATIC_STRING = 1,
+	FILTER_DYN_STRING
+};
+
 static int is_string_field(const char *type)
 {
 	if (strchr(type, '[') && strstr(type, "char"))
-		return 1;
+		return FILTER_STATIC_STRING;
+
+	if (!strcmp(type, "__str_loc"))
+		return FILTER_DYN_STRING;
 
 	return 0;
 }
@@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
 	struct ftrace_event_field *field;
 	filter_pred_fn_t fn;
 	unsigned long long val;
+	int string_type;
 
 	pred->fn = filter_pred_none;
 
@@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
 		return -EINVAL;
 	}
 
-	if (is_string_field(field->type)) {
-		fn = filter_pred_string;
+	string_type = is_string_field(field->type);
+	if (string_type) {
+		if (string_type == FILTER_STATIC_STRING)
+			fn = filter_pred_string;
+		else
+			fn = filter_pred_strloc;
 		pred->str_len = field->size;
 		if (pred->op == OP_NE)
 			pred->not = 1;
-- 
GitLab


From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 3 May 2009 03:03:57 +0200
Subject: [PATCH 0870/2198] tracing/filters: support for operator reserved
 characters in strings

When we set a filter for an event, such as:

echo "name == my_lock_name" > \
	/debug/tracing/events/lockdep/lock_acquired/filter

then the following order of token type is parsed:

- space
- operator
- parentheses
- operand

Because the operators and parentheses have a higher precedence
than the operand characters, which is normal, then we can't
use any string containing such special characters:

()=<>!&|

To get this support and also avoid ambiguous intepretation from
the parser or the human, we can do it using double quotes so that
we keep the usual languages habits.

Then after this patch you can still declare string condition like
before:

echo name == myname

But if you want to compare against a string containing an operator
character, you can use double quotes:

echo 'name == "&myname"'

Don't forget to include the whole expression into single quotes or
the double ones will be eaten by echo.

[ Impact: support strings with special characters for tracing filters ]

Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Zhaolei <zhaolei@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events_filter.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 01c76eb3e168a..8c62e5bdff098 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps)
 
 static int filter_parse(struct filter_parse_state *ps)
 {
+	int in_string = 0;
 	int op, top_op;
 	char ch;
 
 	while ((ch = infix_next(ps))) {
+		if (ch == '"') {
+			in_string ^= 1;
+			continue;
+		}
+
+		if (in_string)
+			goto parse_operand;
+
 		if (isspace(ch))
 			continue;
 
@@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps)
 			}
 			continue;
 		}
+parse_operand:
 		if (append_operand_char(ps, ch)) {
 			parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
 			return -EINVAL;
-- 
GitLab


From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 7 May 2009 15:11:15 +0800
Subject: [PATCH 0871/2198] tracing/events: fix concurrent access to
 ftrace_events list, fix

In filter_add_subsystem_pred() we should release event_mutex before
calling filter_free_subsystem_preds(), since both functions hold
event_mutex.

[ Impact: fix deadlock when writing invalid pred into subsystem filter ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: tzanussi@gmail.com
Cc: a.p.zijlstra@chello.nl
Cc: fweisbec@gmail.com
Cc: rostedt@goodmis.org
LKML-Reference: <4A028993.7020509@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8c62e5bdff098..85ad6a8939ad2 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps,
 
 		err = filter_add_pred(ps, call, pred);
 		if (err) {
+			mutex_unlock(&event_mutex);
 			filter_free_subsystem_preds(system);
 			parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
-			break;
+			goto out;
 		}
 		replace_filter_string(call->filter, filter_string);
 	}
 	mutex_unlock(&event_mutex);
-
+out:
 	return err;
 }
 
-- 
GitLab


From ae318a148e4d255dfbc87d963fdd6031c2af9c46 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 17:55:09 +0900
Subject: [PATCH 0872/2198] sh: sh64 still needs ARCH_USES_GETTIMEOFFSET
 temporarily.

sh64 is still using this, so re-enable it temporarily while SH-5 gets
converted to use the generic code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 565f39042ad94..02688d7909a18 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -141,6 +141,10 @@ config ARCH_NO_VIRT_TO_BUS
 config ARCH_HAS_DEFAULT_IDLE
 	def_bool y
 
+config ARCH_USES_GETTIMEOFFSET
+	def_bool y 
+	depends on SUPERH64
+
 config IO_TRAPPED
 	bool
 
-- 
GitLab


From 0fb849b9d743a20056f2418cd955e5c650658663 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 18:10:27 +0900
Subject: [PATCH 0873/2198] sh: Integrate the SH-5 onchip_remap() more
 coherently.

Presently this is special-cased for early initialization. While there are
situations where these static early initializations are still necessary,
with minor changes it is possible to use this for the regular ioremap
implementation as well. This allows us to kill off the special-casing for
the remap completely and to start tidying up all of the SH-5
special-casing in drivers.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-cayman/irq.c   |   2 +-
 arch/sh/boards/mach-cayman/setup.c |   2 +-
 arch/sh/drivers/pci/pci-sh5.c      |   4 +-
 arch/sh/include/asm/io.h           |   7 -
 arch/sh/kernel/cpu/irq/intc-sh5.c  |   2 +-
 arch/sh/kernel/cpu/sh5/Makefile    |   3 +
 arch/sh/kernel/cpu/sh5/clock-sh5.c |   2 +-
 arch/sh/kernel/cpu/sh5/entry.S     |   8 +-
 arch/sh/kernel/cpu/sh5/setup-sh5.c |  46 +++++
 arch/sh/kernel/time_64.c           |   4 +-
 arch/sh/mm/ioremap_64.c            | 265 ++++++++++-------------------
 drivers/serial/sh-sci.c            |   6 -
 12 files changed, 155 insertions(+), 196 deletions(-)
 create mode 100644 arch/sh/kernel/cpu/sh5/setup-sh5.c

diff --git a/arch/sh/boards/mach-cayman/irq.c b/arch/sh/boards/mach-cayman/irq.c
index da62ad516994b..dbb00c9913241 100644
--- a/arch/sh/boards/mach-cayman/irq.c
+++ b/arch/sh/boards/mach-cayman/irq.c
@@ -161,7 +161,7 @@ void init_cayman_irq(void)
 {
 	int i;
 
-	epld_virt = onchip_remap(EPLD_BASE, 1024, "EPLD");
+	epld_virt = (unsigned long)ioremap_nocache(EPLD_BASE, 1024);
 	if (!epld_virt) {
 		printk(KERN_ERR "Cayman IRQ: Unable to remap EPLD\n");
 		return;
diff --git a/arch/sh/boards/mach-cayman/setup.c b/arch/sh/boards/mach-cayman/setup.c
index e7f9cc5f2ff11..7e8216ac31bda 100644
--- a/arch/sh/boards/mach-cayman/setup.c
+++ b/arch/sh/boards/mach-cayman/setup.c
@@ -102,7 +102,7 @@ static int __init smsc_superio_setup(void)
 {
 	unsigned char devid, devrev;
 
-	smsc_superio_virt = onchip_remap(SMSC_SUPERIO_BASE, 1024, "SMSC SuperIO");
+	smsc_superio_virt = (unsigned long)ioremap_nocache(SMSC_SUPERIO_BASE, 1024);
 	if (!smsc_superio_virt) {
 		panic("Unable to remap SMSC SuperIO\n");
 	}
diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c
index cf431852213c0..873ed2b440557 100644
--- a/arch/sh/drivers/pci/pci-sh5.c
+++ b/arch/sh/drivers/pci/pci-sh5.c
@@ -119,12 +119,12 @@ static int __init sh5pci_init(void)
                 return -EINVAL;
         }
 
-	pcicr_virt = onchip_remap(SH5PCI_ICR_BASE, 1024, "PCICR");
+	pcicr_virt = (unsigned long)ioremap_nocache(SH5PCI_ICR_BASE, 1024);
 	if (!pcicr_virt) {
 		panic("Unable to remap PCICR\n");
 	}
 
-	PCI_IO_AREA = onchip_remap(SH5PCI_IO_BASE, 0x10000, "PCIIO");
+	PCI_IO_AREA = (unsigned long)ioremap_nocache(SH5PCI_IO_BASE, 0x10000);
 	if (!PCI_IO_AREA) {
 		panic("Unable to remap PCIIO\n");
 	}
diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index ef217344afcbf..c7c360b586671 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -224,11 +224,6 @@ void __iomem *__ioremap(unsigned long offset, unsigned long size,
 			unsigned long flags);
 void __iounmap(void __iomem *addr);
 
-/* arch/sh/mm/ioremap_64.c */
-unsigned long onchip_remap(unsigned long addr, unsigned long size,
-			   const char *name);
-extern void onchip_unmap(unsigned long vaddr);
-
 static inline void __iomem *
 __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
 {
@@ -263,8 +258,6 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags)
 	return __ioremap(offset, size, flags);
 }
 #else
-#define onchip_remap(addr, size, name)		(addr)
-#define onchip_unmap(addr)			do { } while (0)
 #define __ioremap_mode(offset, size, flags)	((void __iomem *)(offset))
 #define __iounmap(addr)				do { } while (0)
 #endif /* CONFIG_MMU */
diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index d8679a4d93a46..02d8137547e8b 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -188,7 +188,7 @@ void __init plat_irq_setup(void)
 	unsigned long reg;
 	int i;
 
-	intc_virt = onchip_remap(INTC_BASE, 1024, "INTC");
+	intc_virt = (unsigned long)ioremap_nocache(INTC_BASE, 1024);
 	if (!intc_virt) {
 		panic("Unable to remap INTC\n");
 	}
diff --git a/arch/sh/kernel/cpu/sh5/Makefile b/arch/sh/kernel/cpu/sh5/Makefile
index ce4602ea23a81..a184a31e686e0 100644
--- a/arch/sh/kernel/cpu/sh5/Makefile
+++ b/arch/sh/kernel/cpu/sh5/Makefile
@@ -6,6 +6,9 @@ obj-y := entry.o probe.o switchto.o
 obj-$(CONFIG_SH_FPU)		+= fpu.o
 obj-$(CONFIG_KALLSYMS)		+= unwind.o
 
+# CPU subtype setup
+obj-$(CONFIG_CPU_SH5)		+= setup-sh5.o
+
 # Primary on-chip clocks (common)
 clock-$(CONFIG_CPU_SH5)		:= clock-sh5.o
 
diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c
index 52c49248833ac..5486324880e10 100644
--- a/arch/sh/kernel/cpu/sh5/clock-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/clock-sh5.c
@@ -71,7 +71,7 @@ static struct clk_ops *sh5_clk_ops[] = {
 
 void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 {
-	cprc_base = onchip_remap(CPRC_BASE, 1024, "CPRC");
+	cprc_base = (unsigned long)ioremap_nocache(CPRC_BASE, 1024);
 	BUG_ON(!cprc_base);
 
 	if (idx < ARRAY_SIZE(sh5_clk_ops))
diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S
index 7e49cb812f8b6..516297515b25b 100644
--- a/arch/sh/kernel/cpu/sh5/entry.S
+++ b/arch/sh/kernel/cpu/sh5/entry.S
@@ -1410,8 +1410,8 @@ peek_real_address_q:
 	   r2(out) : result quadword
 
 	   This is provided as a cheapskate way of manipulating device
-	   registers for debugging (to avoid the need to onchip_remap the debug
-	   module, and to avoid the need to onchip_remap the watchpoint
+	   registers for debugging (to avoid the need to ioremap the debug
+	   module, and to avoid the need to ioremap the watchpoint
 	   controller in a way that identity maps sufficient bits to avoid the
 	   SH5-101 cut2 silicon defect).
 
@@ -1459,8 +1459,8 @@ poke_real_address_q:
 	   r3 : quadword value to write.
 
 	   This is provided as a cheapskate way of manipulating device
-	   registers for debugging (to avoid the need to onchip_remap the debug
-	   module, and to avoid the need to onchip_remap the watchpoint
+	   registers for debugging (to avoid the need to ioremap the debug
+	   module, and to avoid the need to ioremap the watchpoint
 	   controller in a way that identity maps sufficient bits to avoid the
 	   SH5-101 cut2 silicon defect).
 
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
new file mode 100644
index 0000000000000..d8d59fea10825
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -0,0 +1,46 @@
+/*
+ * SH5-101/SH5-103 CPU Setup
+ *
+ *  Copyright (C) 2009  Paul Mundt
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/platform_device.h>
+#include <linux/init.h>
+#include <linux/serial.h>
+#include <linux/serial_sci.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <asm/addrspace.h>
+
+static struct plat_sci_port sci_platform_data[] = {
+	{
+		.mapbase	= PHYS_PERIPHERAL_BLOCK + 0x01030000,
+		.flags		= UPF_BOOT_AUTOCONF | UPF_IOREMAP,
+		.type		= PORT_SCIF,
+		.irqs		= { 39, 40, 42, 0 },
+	}, {
+		.flags = 0,
+	}
+};
+
+static struct platform_device sci_device = {
+	.name		= "sh-sci",
+	.id		= -1,
+	.dev		= {
+		.platform_data	= sci_platform_data,
+	},
+};
+
+static struct platform_device *sh5_devices[] __initdata = {
+	&sci_device,
+};
+
+static int __init sh5_devices_setup(void)
+{
+	return platform_add_devices(sh5_devices,
+				    ARRAY_SIZE(sh5_devices));
+}
+__initcall(sh5_devices_setup);
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index f4f5e8ad5bec6..7bfeaa09b9479 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -243,12 +243,12 @@ void __init time_init(void)
 	unsigned long interval;
 	struct clk *clk;
 
-	tmu_base = onchip_remap(TMU_BASE, 1024, "TMU");
+	tmu_base = (unsigned long)ioremap_nocache(TMU_BASE, 1024);
 	if (!tmu_base) {
 		panic("Unable to remap TMU\n");
 	}
 
-	rtc_base = onchip_remap(RTC_BASE, 1024, "RTC");
+	rtc_base = (unsigned long)ioremap_nocache(RTC_BASE, 1024);
 	if (!rtc_base) {
 		panic("Unable to remap RTC\n");
 	}
diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c
index 31e1bb5effbeb..2331229f81266 100644
--- a/arch/sh/mm/ioremap_64.c
+++ b/arch/sh/mm/ioremap_64.c
@@ -27,88 +27,17 @@
 #include <asm/tlbflush.h>
 #include <asm/mmu.h>
 
-static void shmedia_mapioaddr(unsigned long, unsigned long);
-static unsigned long shmedia_ioremap(struct resource *, u32, int);
-
-/*
- * Generic mapping function (not visible outside):
- */
-
-/*
- * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
- *
- * NOTE! We need to allow non-page-aligned mappings too: we will obviously
- * have to convert them into an offset in a page-aligned mapping, but the
- * caller shouldn't need to know that small detail.
- */
-void *__ioremap(unsigned long phys_addr, unsigned long size,
-		unsigned long flags)
-{
-	void * addr;
-	struct vm_struct * area;
-	unsigned long offset, last_addr;
-	pgprot_t pgprot;
-
-	/* Don't allow wraparound or zero size */
-	last_addr = phys_addr + size - 1;
-	if (!size || last_addr < phys_addr)
-		return NULL;
-
-	pgprot = __pgprot(_PAGE_PRESENT  | _PAGE_READ   |
-			  _PAGE_WRITE    | _PAGE_DIRTY  |
-			  _PAGE_ACCESSED | _PAGE_SHARED | flags);
-
-	/*
-	 * Mappings have to be page-aligned
-	 */
-	offset = phys_addr & ~PAGE_MASK;
-	phys_addr &= PAGE_MASK;
-	size = PAGE_ALIGN(last_addr + 1) - phys_addr;
-
-	/*
-	 * Ok, go for it..
-	 */
-	area = get_vm_area(size, VM_IOREMAP);
-	if (!area)
-		return NULL;
-	pr_debug("Get vm_area returns %p addr %p\n", area, area->addr);
-	area->phys_addr = phys_addr;
-	addr = area->addr;
-	if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size,
-			       phys_addr, pgprot)) {
-		vunmap(addr);
-		return NULL;
-	}
-	return (void *) (offset + (char *)addr);
-}
-EXPORT_SYMBOL(__ioremap);
-
-void __iounmap(void *addr)
-{
-	struct vm_struct *area;
-
-	vfree((void *) (PAGE_MASK & (unsigned long) addr));
-	area = remove_vm_area((void *) (PAGE_MASK & (unsigned long) addr));
-	if (!area) {
-		printk(KERN_ERR "iounmap: bad address %p\n", addr);
-		return;
-	}
-
-	kfree(area);
-}
-EXPORT_SYMBOL(__iounmap);
-
 static struct resource shmedia_iomap = {
 	.name	= "shmedia_iomap",
 	.start	= IOBASE_VADDR + PAGE_SIZE,
 	.end	= IOBASE_END - 1,
 };
 
-static void shmedia_mapioaddr(unsigned long pa, unsigned long va);
+static void shmedia_mapioaddr(unsigned long pa, unsigned long va,
+			      unsigned long flags);
 static void shmedia_unmapioaddr(unsigned long vaddr);
-static unsigned long shmedia_ioremap(struct resource *res, u32 pa, int sz);
+static void __iomem *shmedia_ioremap(struct resource *res, u32 pa,
+				     int sz, unsigned long flags);
 
 /*
  * We have the same problem as the SPARC, so lets have the same comment:
@@ -130,18 +59,18 @@ static struct xresource xresv[XNRES];
 
 static struct xresource *xres_alloc(void)
 {
-        struct xresource *xrp;
-        int n;
-
-        xrp = xresv;
-        for (n = 0; n < XNRES; n++) {
-                if (xrp->xflag == 0) {
-                        xrp->xflag = 1;
-                        return xrp;
-                }
-                xrp++;
-        }
-        return NULL;
+	struct xresource *xrp;
+	int n;
+
+	xrp = xresv;
+	for (n = 0; n < XNRES; n++) {
+		if (xrp->xflag == 0) {
+			xrp->xflag = 1;
+			return xrp;
+		}
+		xrp++;
+	}
+	return NULL;
 }
 
 static void xres_free(struct xresource *xrp)
@@ -161,76 +90,71 @@ static struct resource *shmedia_find_resource(struct resource *root,
 	return NULL;
 }
 
-static unsigned long shmedia_alloc_io(unsigned long phys, unsigned long size,
-				      const char *name)
+static void __iomem *shmedia_alloc_io(unsigned long phys, unsigned long size,
+				      const char *name, unsigned long flags)
 {
-        static int printed_full = 0;
-        struct xresource *xres;
-        struct resource *res;
-        char *tack;
-        int tlen;
-
-        if (name == NULL) name = "???";
-
-        if ((xres = xres_alloc()) != 0) {
-                tack = xres->xname;
-                res = &xres->xres;
-        } else {
-                if (!printed_full) {
-                        printk("%s: done with statics, switching to kmalloc\n",
-			       __func__);
-                        printed_full = 1;
-                }
-                tlen = strlen(name);
-                tack = kmalloc(sizeof (struct resource) + tlen + 1, GFP_KERNEL);
-                if (!tack)
-			return -ENOMEM;
-                memset(tack, 0, sizeof(struct resource));
-                res = (struct resource *) tack;
-                tack += sizeof (struct resource);
-        }
-
-        strncpy(tack, name, XNMLN);
-        tack[XNMLN] = 0;
-        res->name = tack;
-
-        return shmedia_ioremap(res, phys, size);
+	static int printed_full;
+	struct xresource *xres;
+	struct resource *res;
+	char *tack;
+	int tlen;
+
+	if (name == NULL)
+		name = "???";
+
+	xres = xres_alloc();
+	if (xres != 0) {
+		tack = xres->xname;
+		res = &xres->xres;
+	} else {
+		if (!printed_full) {
+			printk(KERN_NOTICE "%s: done with statics, "
+			       "switching to kmalloc\n", __func__);
+			printed_full = 1;
+		}
+		tlen = strlen(name);
+		tack = kmalloc(sizeof(struct resource) + tlen + 1, GFP_KERNEL);
+		if (!tack)
+			return NULL;
+		memset(tack, 0, sizeof(struct resource));
+		res = (struct resource *) tack;
+		tack += sizeof(struct resource);
+	}
+
+	strncpy(tack, name, XNMLN);
+	tack[XNMLN] = 0;
+	res->name = tack;
+
+	return shmedia_ioremap(res, phys, size, flags);
 }
 
-static unsigned long shmedia_ioremap(struct resource *res, u32 pa, int sz)
+static void __iomem *shmedia_ioremap(struct resource *res, u32 pa, int sz,
+				     unsigned long flags)
 {
-        unsigned long offset = ((unsigned long) pa) & (~PAGE_MASK);
+	unsigned long offset = ((unsigned long) pa) & (~PAGE_MASK);
 	unsigned long round_sz = (offset + sz + PAGE_SIZE-1) & PAGE_MASK;
-        unsigned long va;
-        unsigned int psz;
+	unsigned long va;
+	unsigned int psz;
 
-        if (allocate_resource(&shmedia_iomap, res, round_sz,
+	if (allocate_resource(&shmedia_iomap, res, round_sz,
 			      shmedia_iomap.start, shmedia_iomap.end,
 			      PAGE_SIZE, NULL, NULL) != 0) {
-                panic("alloc_io_res(%s): cannot occupy\n",
-                    (res->name != NULL)? res->name: "???");
-        }
+		panic("alloc_io_res(%s): cannot occupy\n",
+		      (res->name != NULL) ? res->name : "???");
+	}
 
-        va = res->start;
-        pa &= PAGE_MASK;
+	va = res->start;
+	pa &= PAGE_MASK;
 
 	psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE;
 
-	/* log at boot time ... */
-	printk("mapioaddr: %6s  [%2d page%s]  va 0x%08lx   pa 0x%08x\n",
-	       ((res->name != NULL) ? res->name : "???"),
-	       psz, psz == 1 ? " " : "s", va, pa);
-
-        for (psz = res->end - res->start + 1; psz != 0; psz -= PAGE_SIZE) {
-                shmedia_mapioaddr(pa, va);
-                va += PAGE_SIZE;
-                pa += PAGE_SIZE;
-        }
-
-        res->start += offset;
-        res->end = res->start + sz - 1;         /* not strictly necessary.. */
+	for (psz = res->end - res->start + 1; psz != 0; psz -= PAGE_SIZE) {
+		shmedia_mapioaddr(pa, va, flags);
+		va += PAGE_SIZE;
+		pa += PAGE_SIZE;
+	}
 
-        return res->start;
+	return (void __iomem *)(unsigned long)(res->start + offset);
 }
 
 static void shmedia_free_io(struct resource *res)
@@ -249,14 +173,12 @@ static void shmedia_free_io(struct resource *res)
 
 static __init_refok void *sh64_get_page(void)
 {
-	extern int after_bootmem;
 	void *page;
 
-	if (after_bootmem) {
-		page = (void *)get_zeroed_page(GFP_ATOMIC);
-	} else {
+	if (after_bootmem)
+		page = (void *)get_zeroed_page(GFP_KERNEL);
+	else
 		page = alloc_bootmem_pages(PAGE_SIZE);
-	}
 
 	if (!page || ((unsigned long)page & ~PAGE_MASK))
 		panic("sh64_get_page: Out of memory already?\n");
@@ -264,17 +186,20 @@ static __init_refok void *sh64_get_page(void)
 	return page;
 }
 
-static void shmedia_mapioaddr(unsigned long pa, unsigned long va)
+static void shmedia_mapioaddr(unsigned long pa, unsigned long va,
+			      unsigned long flags)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
 	pmd_t *pmdp;
 	pte_t *ptep, pte;
 	pgprot_t prot;
-	unsigned long flags = 1; /* 1 = CB0-1 device */
 
 	pr_debug("shmedia_mapiopage pa %08lx va %08lx\n",  pa, va);
 
+	if (!flags)
+		flags = 1; /* 1 = CB0-1 device */
+
 	pgdp = pgd_offset_k(va);
 	if (pgd_none(*pgdp) || !pgd_present(*pgdp)) {
 		pudp = (pud_t *)sh64_get_page();
@@ -288,7 +213,7 @@ static void shmedia_mapioaddr(unsigned long pa, unsigned long va)
 	}
 
 	pmdp = pmd_offset(pudp, va);
-	if (pmd_none(*pmdp) || !pmd_present(*pmdp) ) {
+	if (pmd_none(*pmdp) || !pmd_present(*pmdp)) {
 		ptep = (pte_t *)sh64_get_page();
 		set_pmd(pmdp, __pmd((unsigned long)ptep + _PAGE_TABLE));
 	}
@@ -336,17 +261,19 @@ static void shmedia_unmapioaddr(unsigned long vaddr)
 	pte_clear(&init_mm, vaddr, ptep);
 }
 
-unsigned long onchip_remap(unsigned long phys, unsigned long size, const char *name)
+void __iomem *__ioremap(unsigned long offset, unsigned long size,
+			unsigned long flags)
 {
-	if (size < PAGE_SIZE)
-		size = PAGE_SIZE;
+	char name[14];
 
-	return shmedia_alloc_io(phys, size, name);
+	sprintf(name, "phys_%08x", (u32)offset);
+	return shmedia_alloc_io(offset, size, name, flags);
 }
-EXPORT_SYMBOL(onchip_remap);
+EXPORT_SYMBOL(__ioremap);
 
-void onchip_unmap(unsigned long vaddr)
+void __iounmap(void __iomem *virtual)
 {
+	unsigned long vaddr = (unsigned long)virtual & PAGE_MASK;
 	struct resource *res;
 	unsigned int psz;
 
@@ -357,10 +284,7 @@ void onchip_unmap(unsigned long vaddr)
 		return;
 	}
 
-        psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE;
-
-        printk(KERN_DEBUG "unmapioaddr: %6s  [%2d page%s] freed\n",
-	       res->name, psz, psz == 1 ? " " : "s");
+	psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE;
 
 	shmedia_free_io(res);
 
@@ -371,9 +295,8 @@ void onchip_unmap(unsigned long vaddr)
 		kfree(res);
 	}
 }
-EXPORT_SYMBOL(onchip_unmap);
+EXPORT_SYMBOL(__iounmap);
 
-#ifdef CONFIG_PROC_FS
 static int
 ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof,
 		  void *data)
@@ -385,7 +308,10 @@ ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof,
 	for (r = ((struct resource *)data)->child; r != NULL; r = r->sibling) {
 		if (p + 32 >= e)        /* Better than nothing */
 			break;
-		if ((nm = r->name) == 0) nm = "???";
+		nm = r->name;
+		if (nm == NULL)
+			nm = "???";
+
 		p += sprintf(p, "%08lx-%08lx: %s\n",
 			     (unsigned long)r->start,
 			     (unsigned long)r->end, nm);
@@ -393,14 +319,11 @@ ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof,
 
 	return p-buf;
 }
-#endif /* CONFIG_PROC_FS */
 
 static int __init register_proc_onchip(void)
 {
-#ifdef CONFIG_PROC_FS
-	create_proc_read_entry("io_map",0,0, ioremap_proc_info, &shmedia_iomap);
-#endif
+	create_proc_read_entry("io_map", 0, 0, ioremap_proc_info,
+			       &shmedia_iomap);
 	return 0;
 }
-
-__initcall(register_proc_onchip);
+late_initcall(register_proc_onchip);
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index dbf5357a77b30..728d6a062bf6d 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -985,13 +985,7 @@ static void sci_config_port(struct uart_port *port, int flags)
 	port->type = s->type;
 
 	if (port->flags & UPF_IOREMAP && !port->membase) {
-#if defined(CONFIG_SUPERH64)
-		port->mapbase = onchip_remap(SCIF_ADDR_SH5, 1024, "SCIF");
-		port->membase = (void __iomem *)port->mapbase;
-#else
 		port->membase = ioremap_nocache(port->mapbase, 0x40);
-#endif
-
 		dev_err(port->dev, "can't remap port#%d\n", port->line);
 	}
 }
-- 
GitLab


From c51279ec0d2d959e13831ae84b714301f0494f27 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 7 May 2009 18:17:20 +0900
Subject: [PATCH 0874/2198] sh: Kill off unused SH-5 irq_describe cruft.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-cayman/irq.c  | 15 ---------------
 arch/sh/include/cpu-sh5/cpu/irq.h |  1 -
 arch/sh/kernel/cpu/irq/intc-sh5.c | 22 ----------------------
 3 files changed, 38 deletions(-)

diff --git a/arch/sh/boards/mach-cayman/irq.c b/arch/sh/boards/mach-cayman/irq.c
index dbb00c9913241..33f7708563193 100644
--- a/arch/sh/boards/mach-cayman/irq.c
+++ b/arch/sh/boards/mach-cayman/irq.c
@@ -142,21 +142,6 @@ int cayman_irq_demux(int evt)
 	return irq;
 }
 
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_SYSCTL)
-int cayman_irq_describe(char* p, int irq)
-{
-	if (irq < NR_INTC_IRQS) {
-		return intc_irq_describe(p, irq);
-	} else if (irq < NR_INTC_IRQS + 8) {
-		return sprintf(p, "(SMSC %d)", irq - NR_INTC_IRQS);
-	} else if ((irq >= NR_INTC_IRQS + 24) && (irq < NR_INTC_IRQS + 32)) {
-		return sprintf(p, "(PCI2 %d)", irq - (NR_INTC_IRQS + 24));
-	}
-
-	return 0;
-}
-#endif
-
 void init_cayman_irq(void)
 {
 	int i;
diff --git a/arch/sh/include/cpu-sh5/cpu/irq.h b/arch/sh/include/cpu-sh5/cpu/irq.h
index f0f0756e6e849..0ccf257a72d1f 100644
--- a/arch/sh/include/cpu-sh5/cpu/irq.h
+++ b/arch/sh/include/cpu-sh5/cpu/irq.h
@@ -111,7 +111,6 @@
 #define TOP_PRIORITY	15
 
 extern int intc_evt_to_irq[(0xE20/0x20)+1];
-int intc_irq_describe(char* p, int irq);
 extern int platform_int_priority[NR_INTC_IRQS];
 
 #endif /* __ASM_SH_CPU_SH5_IRQ_H */
diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index 02d8137547e8b..a619eaf6e6b4a 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -160,28 +160,6 @@ void make_intc_irq(unsigned int irq)
 	disable_intc_irq(irq);
 }
 
-#if defined(CONFIG_PROC_FS) && defined(CONFIG_SYSCTL)
-static int IRQ_to_vectorN[NR_INTC_IRQS] = {
-	0x12, 0x15, 0x18, 0x1B, 0x40, 0x41, 0x42, 0x43, /*  0- 7 */
-	  -1,   -1,   -1,   -1, 0x50, 0x51, 0x52, 0x53,	/*  8-15 */
-	0x54, 0x55, 0x32, 0x33, 0x34, 0x35, 0x36,   -1, /* 16-23 */
-	  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, /* 24-31 */
-	0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x38,	/* 32-39 */
-        0x39, 0x3A, 0x3B,   -1,   -1,   -1,   -1,   -1, /* 40-47 */
-	  -1,   -1,   -1,   -1,   -1,   -1,   -1,   -1, /* 48-55 */
-	  -1,   -1,   -1,   -1,   -1,   -1,   -1, 0x2B, /* 56-63 */
-
-};
-
-int intc_irq_describe(char* p, int irq)
-{
-	if (irq < NR_INTC_IRQS)
-		return sprintf(p, "(0x%3x)", IRQ_to_vectorN[irq]*0x20);
-	else
-		return 0;
-}
-#endif
-
 void __init plat_irq_setup(void)
 {
 	unsigned long long __dummy0, __dummy1=~0x00000000100000f0;
-- 
GitLab


From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 11:13:42 -0400
Subject: [PATCH 0875/2198] ring-buffer: remove complex calculations in
 ring-buffer-test

Ingo Molnar thought that the code to calculate the time in cond_resched
is a bit too ugly and is not needed. This patch removes it and replaces
it with a simple call to cond_resched. I kept the comment that explains
the reason for the cond_resched.

[ Impact: remove ugly code ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 37 ++++++----------------------
 1 file changed, 7 insertions(+), 30 deletions(-)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a26fc67b63bb8..f4ceb453c7ddd 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -185,35 +185,6 @@ static void ring_buffer_consumer(void)
 	complete(&read_done);
 }
 
-/*
- * If we are a non preempt kernel, the 10 second run will
- * stop everything while it runs. Instead, we will call cond_resched
- * and also add any time that was lost by a rescedule.
- */
-#ifdef CONFIG_PREEMPT
-static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
-{
-}
-#else
-static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv)
-{
-	struct timeval tv;
-
-	cond_resched();
-	do_gettimeofday(&tv);
-	if (tv.tv_usec < end_tv->tv_usec) {
-		tv.tv_usec += 1000000;
-		tv.tv_sec--;
-	}
-	start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec;
-	start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec;
-	if (start_tv->tv_usec > 1000000) {
-		start_tv->tv_usec -= 1000000;
-		start_tv->tv_sec++;
-	}
-}
-#endif
-
 static void ring_buffer_producer(void)
 {
 	struct timeval start_tv;
@@ -250,7 +221,13 @@ static void ring_buffer_producer(void)
 		if (consumer && !(++cnt % wakeup_interval))
 			wake_up_process(consumer);
 
-		sched_if_needed(&start_tv, &end_tv);
+		/*
+		 * If we are a non preempt kernel, the 10 second run will
+		 * stop everything while it runs. Instead, we will call
+		 * cond_resched and also add any time that was lost by a
+		 * rescedule.
+		 */
+		cond_resched();
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
-- 
GitLab


From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 11:49:35 -0400
Subject: [PATCH 0876/2198] tracing: append ":*" to internal setting of system
 events

The system enabling of events uses the same code as the set_event file.
It passes in the name of the system to the parser and that will enable
all the events that has that system as a name.

The problem is that it will also enable events with the same name as the
system.

If you have system name foo, and system name bar, but within the system
bar, there exists an event called foo. By setting the system name foo,
you will also be enabling the event foo in the system bar. This is not
an expected result.

The solution is to pass in "foo:*", which will only enable the system
foo and not events called foo.

[ Impact: prevent accidental enabling of events with same name as a system ]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 87feb0117ce28..8d0fae3af5954 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 		return -EINVAL;
 	}
 
-	command = kstrdup(system, GFP_KERNEL);
+	/* +3 for the ":*\0" */
+	command = kmalloc(strlen(system)+3, GFP_KERNEL);
 	if (!command)
 		return -ENOMEM;
+	sprintf(command, "%s:*", system);
 
 	ret = ftrace_set_clr_event(command, val);
 	if (ret)
@@ -1179,7 +1181,7 @@ static __init int event_trace_init(void)
 			  &ftrace_show_header_fops);
 
 	trace_create_file("enable", 0644, d_events,
-			  "*:*", &ftrace_system_enable_fops);
+			  "*", &ftrace_system_enable_fops);
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
-- 
GitLab


From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 12:49:27 -0400
Subject: [PATCH 0877/2198] tracing: have menu default enabled when kernel
 debug is configured

Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is
enabled it is nice to enable the trace menu as well.

This patch only make the tracing menu enabled by default, it does not
make any of the tracers enabled. And the menu is only enabled by
default if DEBUG_KERNEL is enabled.

[ Impact: show tracing options to those debugging the kernel ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 50f62a296e1d0..f61be30157832 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -79,6 +79,7 @@ if TRACING_SUPPORT
 
 menuconfig FTRACE
 	bool "Tracers"
+	default y if DEBUG_KERNEL
 	help
 	 Enable the kernel tracing infrastructure.
 
-- 
GitLab


From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 14:20:28 -0400
Subject: [PATCH 0878/2198] ring-buffer: only periodically call cond_resched to
 ring-buffer-benchmark

Calling cond_resched at every iteration of the loop adds a bit of
overhead to the benchmark.

This patch does two things.

1) only calls cond-resched when CONFIG_PREEMPT is not enabled
2) only calls cond-resched after so many traces has been performed.

[ Impact: less overhead to the ring-buffer-benchmark ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index f4ceb453c7ddd..a7c048bb44668 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -218,16 +218,23 @@ static void ring_buffer_producer(void)
 		}
 		do_gettimeofday(&end_tv);
 
-		if (consumer && !(++cnt % wakeup_interval))
+		cnt++;
+		if (consumer && !(cnt % wakeup_interval))
 			wake_up_process(consumer);
 
+#ifndef CONFIG_PREEMPT
 		/*
 		 * If we are a non preempt kernel, the 10 second run will
 		 * stop everything while it runs. Instead, we will call
 		 * cond_resched and also add any time that was lost by a
 		 * rescedule.
+		 *
+		 * Do a cond resched at the same frequency we would wake up
+		 * the reader.
 		 */
-		cond_resched();
+		if (cnt % wakeup_interval)
+			cond_resched();
+#endif
 
 	} while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
 	pr_info("End ring buffer hammer\n");
-- 
GitLab


From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 19:52:20 -0400
Subject: [PATCH 0879/2198] ring-buffer: add total count in
 ring-buffer-benchmark

It is nice to see the overhead of the benchmark test when tracing is
disabled. That is, we turn off the ring buffer just to see what the
cost of running the loop that calls into the ring buffer is.

Currently, if no entries wer made, we get 0. This is not informative.
This patch changes it to check if we had any "missed" (non recorded)
events. If so, a total count is also reported.

[ Impact: evaluate the over head of the ring buffer benchmark test ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a7c048bb44668..a21aa7b3d05e0 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -285,6 +285,17 @@ static void ring_buffer_producer(void)
 		avg = 1000000 / hit;
 		pr_info("%ld ns per entry\n", avg);
 	}
+
+
+	if (missed) {
+		if (time)
+			missed /= (long)time;
+
+		pr_info("Total iterations per millisec: %ld\n", hit + missed);
+
+		avg = 1000000 / (hit + missed);
+		pr_info("%ld ns per entry\n", avg);
+	}
 }
 
 static void wait_to_die(void)
-- 
GitLab


From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 7 May 2009 19:58:55 -0400
Subject: [PATCH 0880/2198] ring-buffer: change WARN_ON from checking
 preempt_count to preemptible

There's a WARN_ON in the ring buffer code that makes sure preemption
is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not
enabled, preempt_count() is always zero, and this will trigger the warning.

[ Impact: prevent false warning on non preemptible kernels ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ae5ccf2c0fcd..361170609bd04 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	 * committed yet. Thus we can assume that preemption
 	 * is still disabled.
 	 */
-	RB_WARN_ON(buffer, !preempt_count());
+	RB_WARN_ON(buffer, preemptible());
 
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
-- 
GitLab


From 5dafc91fca9135a7b0acb76d9709f4abfad92d6e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:17:44 +0000
Subject: [PATCH 0881/2198] sh: sh7785 early scif fix

This patch moves the SH4 case of EARLY_SCIF_CONSOLE_PORT
so the SH7785 default value gets used. Without this patch
the value for SH7785 is set to 0xffe80000.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index fdb049373e7c0..fafa907e14ac6 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -38,10 +38,10 @@ config EARLY_SCIF_CONSOLE_PORT
 	default "0xffe00000" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7763 || \
 				CPU_SUBTYPE_SH7722 || CPU_SUBTYPE_SH7366 || \
 				CPU_SUBTYPE_SH7343
-	default "0xffe80000" if CPU_SH4
 	default "0xffea0000" if CPU_SUBTYPE_SH7785
 	default "0xfffe8000" if CPU_SUBTYPE_SH7203
 	default "0xfffe9800" if CPU_SUBTYPE_SH7206 || CPU_SUBTYPE_SH7263
+	default "0xffe80000" if CPU_SH4
 	default "0x00000000"
 
 config EARLY_PRINTK
-- 
GitLab


From b3cacf318172757783d8272fc333284dd4f50140 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:31:39 +0000
Subject: [PATCH 0882/2198] sh: call clock framework init() callback once

Make sure that clk->ops->init() only gets called once in
the case of CLK_ALWAYS_ENABLED. Without this patch the
init() callback may be called multiple times.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 1dc896483b591..099373ae57d84 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -99,15 +99,18 @@ static int __clk_enable(struct clk *clk)
 	 * changes and the clock needs to hunt for the proper set of
 	 * divisors to use before it can effectively recalc.
 	 */
+
+	if (clk->flags & CLK_ALWAYS_ENABLED) {
+		kref_get(&clk->kref);
+		return 0;
+	}
+
 	if (unlikely(atomic_read(&clk->kref.refcount) == 1))
 		if (clk->ops && clk->ops->init)
 			clk->ops->init(clk);
 
 	kref_get(&clk->kref);
 
-	if (clk->flags & CLK_ALWAYS_ENABLED)
-		return 0;
-
 	if (likely(clk->ops && clk->ops->enable))
 		clk->ops->enable(clk);
 
-- 
GitLab


From 06ee846a25642c434c55209151c633f538c12022 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:44:55 +0000
Subject: [PATCH 0883/2198] sh: r7785 highlander clock fixes

Update the r7785 highlander defconfig to fix
PCLK value and that mode4 is set high.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/configs/r7785rp_defconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index 890fe3cc91422..5e677a8e3a366 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -269,7 +269,7 @@ CONFIG_SH_R7785RP=y
 #
 CONFIG_SH_TMU=y
 CONFIG_SH_TIMER_IRQ=28
-CONFIG_SH_PCLK_FREQ=50000000
+CONFIG_SH_PCLK_FREQ=33333333
 CONFIG_TICK_ONESHOT=y
 CONFIG_NO_HZ=y
 CONFIG_HIGH_RES_TIMERS=y
@@ -333,7 +333,7 @@ CONFIG_GUSA=y
 CONFIG_ZERO_PAGE_OFFSET=0x00001000
 CONFIG_BOOT_LINK_OFFSET=0x00800000
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
+CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1 mode4_pin=high"
 
 #
 # Bus options
-- 
GitLab


From e367592cc93ac653e7bc0bebbc9bb713a77e2696 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 7 May 2009 10:55:37 +0000
Subject: [PATCH 0884/2198] sh: TMU platform data for sh7785

This patch adds TMU platform data for sh7785. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   1 +
 arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 204 +++++++++++++++++++++++++
 2 files changed, 205 insertions(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 02688d7909a18..b773e7d1b06bb 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -393,6 +393,7 @@ config CPU_SUBTYPE_SH7785
 	select CPU_SHX2
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
+	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7786
 	bool "Support SH7786 processor"
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index d80802a49dbd1..dc5d3e507a219 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -13,8 +13,191 @@
 #include <linux/serial_sci.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 28,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 29,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 30,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffdc0008,
+		.end	= 0xffdc0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 96,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffdc0014,
+		.end	= 0xffdc001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 97,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffdc0020,
+		.end	= 0xffdc002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 98,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffea0000,
@@ -60,6 +243,12 @@ static struct platform_device sci_device = {
 };
 
 static struct platform_device *sh7785_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&sci_device,
 };
 
@@ -70,6 +259,21 @@ static int __init sh7785_devices_setup(void)
 }
 __initcall(sh7785_devices_setup);
 
+static struct platform_device *sh7785_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7785_early_devices,
+				   ARRAY_SIZE(sh7785_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From 4fa48e1774992fd03a4cd83a0f2adecb288c46f8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 15:28:15 +0900
Subject: [PATCH 0885/2198] sh: Enable new TMU driver support for all SH-3 and
 SH-4 CPUs.

The TMU block is supported on all SH-3 and SH-4 subtypes, so just select
it there, rather than conditionalizing it per subtype.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b773e7d1b06bb..88eb118e9fa66 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -168,6 +168,7 @@ config CPU_SH3
 	bool
 	select CPU_HAS_INTEVT
 	select CPU_HAS_SR_RB
+	select SYS_SUPPORTS_TMU
 
 config CPU_SH4
 	bool
@@ -175,6 +176,7 @@ config CPU_SH4
 	select CPU_HAS_SR_RB
 	select CPU_HAS_PTEA if !CPU_SH4A || CPU_SHX2
 	select CPU_HAS_FPU if !CPU_SH4AL_DSP
+	select SYS_SUPPORTS_TMU
 
 config CPU_SH4A
 	bool
@@ -393,7 +395,6 @@ config CPU_SUBTYPE_SH7785
 	select CPU_SHX2
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
-	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7786
 	bool "Support SH7786 processor"
@@ -428,7 +429,6 @@ config CPU_SUBTYPE_SH7722
 	select ARCH_SPARSEMEM_ENABLE
 	select SYS_SUPPORTS_NUMA
 	select SYS_SUPPORTS_CMT
-	select SYS_SUPPORTS_TMU
 
 config CPU_SUBTYPE_SH7366
 	bool "Support SH7366 processor"
-- 
GitLab


From c2ecb4c4a7da16288062a057b693b7b1e16aaf88 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 15:39:25 +0900
Subject: [PATCH 0886/2198] sh: Move out rtc-sh registration from time_64.c to
 setup-sh5.c

Now that the onchip_remap() mess is sorted out, the rtc-sh support code
for SH-5 can follow the same approach as the other CPUs.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh5/setup-sh5.c | 31 ++++++++++++++++++
 arch/sh/kernel/time_64.c           | 52 +-----------------------------
 2 files changed, 32 insertions(+), 51 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index d8d59fea10825..9a362c8f3fe97 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -34,8 +34,39 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct resource rtc_resources[] = {
+	[0] = {
+		.start	= PHYS_PERIPHERAL_BLOCK + 0x01040000,
+		.end	= PHYS_PERIPHERAL_BLOCK + 0x01040000 + 0x58 - 1,
+		.flags	= IORESOURCE_IO,
+	},
+	[1] = {
+		/* Period IRQ */
+		.start	= IRQ_PRI,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* Carry IRQ */
+		.start	= IRQ_CUI,
+		.flags	= IORESOURCE_IRQ,
+	},
+	[3] = {
+		/* Alarm IRQ */
+		.start	= IRQ_ATI,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device rtc_device = {
+	.name		= "sh-rtc",
+	.id		= -1,
+	.num_resources	= ARRAY_SIZE(rtc_resources),
+	.resource	= rtc_resources,
+};
+
 static struct platform_device *sh5_devices[] __initdata = {
 	&sci_device,
+	&rtc_device,
 };
 
 static int __init sh5_devices_setup(void)
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
index 7bfeaa09b9479..b4fe770e2a3f5 100644
--- a/arch/sh/kernel/time_64.c
+++ b/arch/sh/kernel/time_64.c
@@ -46,12 +46,6 @@
 #define TMU_TSTR_INIT	1
 #define TMU_TSTR_OFF	0
 
-/* Real Time Clock */
-#define	RTC_BLOCK_OFF	0x01040000
-#define RTC_BASE	PHYS_PERIPHERAL_BLOCK + RTC_BLOCK_OFF
-#define RTC_RCR1_CIE	0x10	/* Carry Interrupt Enable */
-#define RTC_RCR1	(rtc_base + 0x38)
-
 /* Time Management Unit */
 #define	TMU_BLOCK_OFF	0x01020000
 #define TMU_BASE	PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF
@@ -68,8 +62,7 @@
 
 #define TICK_SIZE (tick_nsec / 1000)
 
-static unsigned long tmu_base, rtc_base;
-unsigned long cprc_base;
+static unsigned long tmu_base;
 
 /* Variables to allow interpolation of time of day to resolution better than a
  * jiffy. */
@@ -248,11 +241,6 @@ void __init time_init(void)
 		panic("Unable to remap TMU\n");
 	}
 
-	rtc_base = (unsigned long)ioremap_nocache(RTC_BASE, 1024);
-	if (!rtc_base) {
-		panic("Unable to remap RTC\n");
-	}
-
 	clk = clk_get(NULL, "cpu_clk");
 	scaled_recip_ctc_ticks_per_jiffy = ((1ULL << CTC_JIFFY_SCALE_SHIFT) /
 			(unsigned long long)(clk_get_rate(clk) / HZ));
@@ -274,41 +262,3 @@ void __init time_init(void)
 	ctrl_outl(interval, TMU0_TCNT);
 	ctrl_outb(TMU_TSTR_INIT, TMU_TSTR);
 }
-
-static struct resource rtc_resources[] = {
-	[0] = {
-		/* RTC base, filled in by rtc_init */
-		.flags	= IORESOURCE_IO,
-	},
-	[1] = {
-		/* Period IRQ */
-		.start	= IRQ_PRI,
-		.flags	= IORESOURCE_IRQ,
-	},
-	[2] = {
-		/* Carry IRQ */
-		.start	= IRQ_CUI,
-		.flags	= IORESOURCE_IRQ,
-	},
-	[3] = {
-		/* Alarm IRQ */
-		.start	= IRQ_ATI,
-		.flags	= IORESOURCE_IRQ,
-	},
-};
-
-static struct platform_device rtc_device = {
-	.name		= "sh-rtc",
-	.id		= -1,
-	.num_resources	= ARRAY_SIZE(rtc_resources),
-	.resource	= rtc_resources,
-};
-
-static int __init rtc_init(void)
-{
-	rtc_resources[0].start	= rtc_base;
-	rtc_resources[0].end	= rtc_resources[0].start + 0x58 - 1;
-
-	return platform_device_register(&rtc_device);
-}
-device_initcall(rtc_init);
-- 
GitLab


From add47067a8ca324e9d26c4373350dc3cce7f1e7f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:12:17 +0900
Subject: [PATCH 0887/2198] sh: Finish the sh64 migration off of
 ARCH_USES_GETTIMEOFFSET.

This adds sh_tmu support to the SH-5 subtypes, which subsequently allows
us to kill off time_64.c and use the now generic time_32.c. As a bonus,
SH-5 now supports highres timers and tickless for the first time.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                    |   5 +-
 arch/sh/kernel/Makefile_64         |   2 +-
 arch/sh/kernel/cpu/sh5/setup-sh5.c | 118 +++++++++++++
 arch/sh/kernel/time_64.c           | 264 -----------------------------
 4 files changed, 120 insertions(+), 269 deletions(-)
 delete mode 100644 arch/sh/kernel/time_64.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 88eb118e9fa66..ca5c09b241c30 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -141,10 +141,6 @@ config ARCH_NO_VIRT_TO_BUS
 config ARCH_HAS_DEFAULT_IDLE
 	def_bool y
 
-config ARCH_USES_GETTIMEOFFSET
-	def_bool y 
-	depends on SUPERH64
-
 config IO_TRAPPED
 	bool
 
@@ -190,6 +186,7 @@ config CPU_SH4AL_DSP
 config CPU_SH5
 	bool
 	select CPU_HAS_FPU
+	select SYS_SUPPORTS_TMU
 
 config CPU_SHX2
 	bool
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index d256c77469574..4667d4cac9538 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -2,7 +2,7 @@ extra-y	:= head_64.o init_task.o vmlinux.lds
 
 obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \
-	   syscalls_64.o time_64.o topology.o traps.o traps_64.o
+	   syscalls_64.o time_32.o topology.o traps.o traps_64.o
 
 obj-y				+= cpu/ timers/
 obj-$(CONFIG_VSYSCALL)		+= vsyscall/
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index 9a362c8f3fe97..678d69bdebbae 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -13,6 +13,7 @@
 #include <linux/serial_sci.h>
 #include <linux/io.h>
 #include <linux/mm.h>
+#include <linux/sh_timer.h>
 #include <asm/addrspace.h>
 
 static struct plat_sci_port sci_platform_data[] = {
@@ -64,6 +65,110 @@ static struct platform_device rtc_device = {
 	.resource	= rtc_resources,
 };
 
+#define	TMU_BLOCK_OFF	0x01020000
+#define TMU_BASE	PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF
+#define TMU0_BASE	(TMU_BASE + 0x8 + (0xc * 0x0))
+#define TMU1_BASE	(TMU_BASE + 0x8 + (0xc * 0x1))
+#define TMU2_BASE	(TMU_BASE + 0x8 + (0xc * 0x2))
+
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= TMU0_BASE,
+		.end	= TMU0_BASE + 0xc - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= IRQ_TUNI0,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= TMU1_BASE,
+		.end	= TMU1_BASE + 0xc - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= IRQ_TUNI1,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= TMU2_BASE,
+		.end	= TMU2_BASE + 0xc - 1,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= IRQ_TUNI2,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct platform_device *sh5_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
 static struct platform_device *sh5_devices[] __initdata = {
 	&sci_device,
 	&rtc_device,
@@ -71,7 +176,20 @@ static struct platform_device *sh5_devices[] __initdata = {
 
 static int __init sh5_devices_setup(void)
 {
+	int ret;
+
+	ret = platform_add_devices(sh5_early_devices,
+				   ARRAY_SIZE(sh5_early_devices));
+	if (unlikely(ret != 0))
+		return ret;
+
 	return platform_add_devices(sh5_devices,
 				    ARRAY_SIZE(sh5_devices));
 }
 __initcall(sh5_devices_setup);
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh5_early_devices,
+				   ARRAY_SIZE(sh5_early_devices));
+}
diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c
deleted file mode 100644
index b4fe770e2a3f5..0000000000000
--- a/arch/sh/kernel/time_64.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * arch/sh/kernel/time_64.c
- *
- * Copyright (C) 2000, 2001  Paolo Alberelli
- * Copyright (C) 2003 - 2007  Paul Mundt
- * Copyright (C) 2003  Richard Curnow
- *
- *    Original TMU/RTC code taken from sh version.
- *    Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
- *      Some code taken from i386 version.
- *      Copyright (C) 1991, 1992, 1995  Linus Torvalds
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/errno.h>
-#include <linux/rwsem.h>
-#include <linux/sched.h>
-#include <linux/kernel.h>
-#include <linux/param.h>
-#include <linux/string.h>
-#include <linux/mm.h>
-#include <linux/interrupt.h>
-#include <linux/time.h>
-#include <linux/delay.h>
-#include <linux/init.h>
-#include <linux/profile.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-#include <linux/bcd.h>
-#include <linux/timex.h>
-#include <linux/irq.h>
-#include <linux/io.h>
-#include <linux/platform_device.h>
-#include <cpu/registers.h>	 /* required by inline __asm__ stmt. */
-#include <cpu/irq.h>
-#include <asm/addrspace.h>
-#include <asm/processor.h>
-#include <asm/uaccess.h>
-#include <asm/delay.h>
-#include <asm/clock.h>
-
-#define TMU_TOCR_INIT	0x00
-#define TMU0_TCR_INIT	0x0020
-#define TMU_TSTR_INIT	1
-#define TMU_TSTR_OFF	0
-
-/* Time Management Unit */
-#define	TMU_BLOCK_OFF	0x01020000
-#define TMU_BASE	PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF
-#define TMU0_BASE	tmu_base + 0x8 + (0xc * 0x0)
-#define TMU1_BASE	tmu_base + 0x8 + (0xc * 0x1)
-#define TMU2_BASE	tmu_base + 0x8 + (0xc * 0x2)
-
-#define TMU_TOCR	tmu_base+0x0	/* Byte access */
-#define TMU_TSTR	tmu_base+0x4	/* Byte access */
-
-#define TMU0_TCOR	TMU0_BASE+0x0	/* Long access */
-#define TMU0_TCNT	TMU0_BASE+0x4	/* Long access */
-#define TMU0_TCR	TMU0_BASE+0x8	/* Word access */
-
-#define TICK_SIZE (tick_nsec / 1000)
-
-static unsigned long tmu_base;
-
-/* Variables to allow interpolation of time of day to resolution better than a
- * jiffy. */
-
-/* This is effectively protected by xtime_lock */
-static unsigned long ctc_last_interrupt;
-static unsigned long long usecs_per_jiffy = 1000000/HZ; /* Approximation */
-
-#define CTC_JIFFY_SCALE_SHIFT 40
-
-/* 2**CTC_JIFFY_SCALE_SHIFT / ctc_ticks_per_jiffy */
-static unsigned long long scaled_recip_ctc_ticks_per_jiffy;
-
-/* Estimate number of microseconds that have elapsed since the last timer tick,
-   by scaling the delta that has occurred in the CTC register.
-
-   WARNING WARNING WARNING : This algorithm relies on the CTC decrementing at
-   the CPU clock rate.  If the CPU sleeps, the CTC stops counting.  Bear this
-   in mind if enabling SLEEP_WORKS in process.c.  In that case, this algorithm
-   probably needs to use TMU.TCNT0 instead.  This will work even if the CPU is
-   sleeping, though will be coarser.
-
-   FIXME : What if usecs_per_tick is moving around too much, e.g. if an adjtime
-   is running or if the freq or tick arguments of adjtimex are modified after
-   we have calibrated the scaling factor?  This will result in either a jump at
-   the end of a tick period, or a wrap backwards at the start of the next one,
-   if the application is reading the time of day often enough.  I think we
-   ought to do better than this.  For this reason, usecs_per_jiffy is left
-   separated out in the calculation below.  This allows some future hook into
-   the adjtime-related stuff in kernel/timer.c to remove this hazard.
-
-*/
-
-static unsigned long usecs_since_tick(void)
-{
-	unsigned long long current_ctc;
-	long ctc_ticks_since_interrupt;
-	unsigned long long ull_ctc_ticks_since_interrupt;
-	unsigned long result;
-
-	unsigned long long mul1_out;
-	unsigned long long mul1_out_high;
-	unsigned long long mul2_out_low, mul2_out_high;
-
-	/* Read CTC register */
-	asm ("getcon cr62, %0" : "=r" (current_ctc));
-	/* Note, the CTC counts down on each CPU clock, not up.
-	   Note(2), use long type to get correct wraparound arithmetic when
-	   the counter crosses zero. */
-	ctc_ticks_since_interrupt = (long) ctc_last_interrupt - (long) current_ctc;
-	ull_ctc_ticks_since_interrupt = (unsigned long long) ctc_ticks_since_interrupt;
-
-	/* Inline assembly to do 32x32x32->64 multiplier */
-	asm volatile ("mulu.l %1, %2, %0" :
-	     "=r" (mul1_out) :
-	     "r" (ull_ctc_ticks_since_interrupt), "r" (usecs_per_jiffy));
-
-	mul1_out_high = mul1_out >> 32;
-
-	asm volatile ("mulu.l %1, %2, %0" :
-	     "=r" (mul2_out_low) :
-	     "r" (mul1_out), "r" (scaled_recip_ctc_ticks_per_jiffy));
-
-#if 1
-	asm volatile ("mulu.l %1, %2, %0" :
-	     "=r" (mul2_out_high) :
-	     "r" (mul1_out_high), "r" (scaled_recip_ctc_ticks_per_jiffy));
-#endif
-
-	result = (unsigned long) (((mul2_out_high << 32) + mul2_out_low) >> CTC_JIFFY_SCALE_SHIFT);
-
-	return result;
-}
-
-u32 arch_gettimeoffset(void)
-{
-	return usecs_since_tick() * 1000;
-}
-
-/* Dummy RTC ops */
-static void null_rtc_get_time(struct timespec *tv)
-{
-	tv->tv_sec = mktime(2000, 1, 1, 0, 0, 0);
-	tv->tv_nsec = 0;
-}
-
-static int null_rtc_set_time(const time_t secs)
-{
-	return 0;
-}
-
-void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
-int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
-
-/* last time the RTC clock got updated */
-static long last_rtc_update;
-
-/*
- * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
- */
-static inline void do_timer_interrupt(void)
-{
-	unsigned long long current_ctc;
-
-	if (current->pid)
-		profile_tick(CPU_PROFILING);
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-	asm ("getcon cr62, %0" : "=r" (current_ctc));
-	ctc_last_interrupt = (unsigned long) current_ctc;
-
-	do_timer(1);
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 */
-	if (ntp_synced() &&
-	    xtime.tv_sec > last_rtc_update + 660 &&
-	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
-	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
-		if (rtc_sh_set_time(xtime.tv_sec) == 0)
-			last_rtc_update = xtime.tv_sec;
-		else
-			/* do it again in 60 s */
-			last_rtc_update = xtime.tv_sec - 600;
-	}
-	write_sequnlock(&xtime_lock);
-
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-}
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-static irqreturn_t timer_interrupt(int irq, void *dev_id)
-{
-	unsigned long timer_status;
-
-	/* Clear UNF bit */
-	timer_status = ctrl_inw(TMU0_TCR);
-	timer_status &= ~0x100;
-	ctrl_outw(timer_status, TMU0_TCR);
-
-	do_timer_interrupt();
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction irq0  = {
-	.handler = timer_interrupt,
-	.flags = IRQF_DISABLED,
-	.name = "timer",
-};
-
-void __init time_init(void)
-{
-	unsigned long interval;
-	struct clk *clk;
-
-	tmu_base = (unsigned long)ioremap_nocache(TMU_BASE, 1024);
-	if (!tmu_base) {
-		panic("Unable to remap TMU\n");
-	}
-
-	clk = clk_get(NULL, "cpu_clk");
-	scaled_recip_ctc_ticks_per_jiffy = ((1ULL << CTC_JIFFY_SCALE_SHIFT) /
-			(unsigned long long)(clk_get_rate(clk) / HZ));
-
-	rtc_sh_get_time(&xtime);
-
-	setup_irq(TIMER_IRQ, &irq0);
-
-	clk = clk_get(NULL, "module_clk");
-	interval = (clk_get_rate(clk)/(HZ*4));
-
-	printk("Interval = %ld\n", interval);
-
-	/* Start TMU0 */
-	ctrl_outb(TMU_TSTR_OFF, TMU_TSTR);
-	ctrl_outb(TMU_TOCR_INIT, TMU_TOCR);
-	ctrl_outw(TMU0_TCR_INIT, TMU0_TCR);
-	ctrl_outl(interval, TMU0_TCOR);
-	ctrl_outl(interval, TMU0_TCNT);
-	ctrl_outb(TMU_TSTR_INIT, TMU_TSTR);
-}
-- 
GitLab


From b179b72fad5c88c3b616fb88a9ae7cbbc1a750d3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:17:36 +0900
Subject: [PATCH 0888/2198] sh: Rename arch/sh/kernel/time_32.c to
 arch/sh/kernel/time.c.

This is now fully generic, and used both by _32 and _64 variants.
Rename it accordingly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/Makefile_32           | 2 +-
 arch/sh/kernel/Makefile_64           | 2 +-
 arch/sh/kernel/{time_32.c => time.c} | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename arch/sh/kernel/{time_32.c => time.c} (100%)

diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32
index 5b1d4d0fe04d8..aee08afe5ff78 100644
--- a/arch/sh/kernel/Makefile_32
+++ b/arch/sh/kernel/Makefile_32
@@ -11,7 +11,7 @@ endif
 
 obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o			\
 	   machvec.o process_32.o ptrace_32.o setup.o signal_32.o	\
-	   sys_sh.o sys_sh32.o syscalls_32.o time_32.o topology.o	\
+	   sys_sh.o sys_sh32.o syscalls_32.o time.o topology.o	\
 	   traps.o traps_32.o
 
 obj-y				+= cpu/ timers/
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index 4667d4cac9538..31f6ebf375eb1 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -2,7 +2,7 @@ extra-y	:= head_64.o init_task.o vmlinux.lds
 
 obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \
-	   syscalls_64.o time_32.o topology.o traps.o traps_64.o
+	   syscalls_64.o time.o topology.o traps.o traps_64.o
 
 obj-y				+= cpu/ timers/
 obj-$(CONFIG_VSYSCALL)		+= vsyscall/
diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time.c
similarity index 100%
rename from arch/sh/kernel/time_32.c
rename to arch/sh/kernel/time.c
-- 
GitLab


From 6d134b9e8d3f32331ad2faca2db8186f54198931 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:36:13 +0900
Subject: [PATCH 0889/2198] sh: Wire up GENERIC_CMOS_UPDATE for the platforms
 that need it.

Now that everything has converted over to generic timekeeping, we need an
alternate method for keeping the RTC updated for those platforms that are
still using the rtc_sh_get/set_time pairs, presently limited to SH-03 and
the Dreamcast. This wires up the GENERIC_CMOS_UPDATE hooks for those to
maintain the same behaviour.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig       |  4 ++++
 arch/sh/kernel/time.c | 14 ++++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index ca5c09b241c30..d88a61b11d321 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -83,6 +83,10 @@ config GENERIC_CLOCKEVENTS
 config GENERIC_CLOCKEVENTS_BROADCAST
 	bool
 
+config GENERIC_CMOS_UPDATE
+	def_bool y
+	depends on SH_SH03 || SH_DREAMCAST
+
 config GENERIC_LOCKBREAK
 	def_bool y
 	depends on SMP && PREEMPT
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index d41ca4cf20cf9..77b841a99c01f 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -46,6 +46,20 @@ static int null_rtc_set_time(const time_t secs)
 void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time;
 int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time;
 
+#ifdef CONFIG_GENERIC_CMOS_UPDATE
+unsigned long read_persistent_clock(void)
+{
+	struct timespec tv;
+	rtc_sh_get_time(&tv);
+	return tv.tv_sec;
+}
+
+int update_persistent_clock(struct timespec now)
+{
+	return rtc_sh_set_time(now.tv_sec);
+}
+#endif
+
 unsigned int get_rtc_time(struct rtc_time *tm)
 {
 	if (rtc_sh_get_time != null_rtc_get_time) {
-- 
GitLab


From 5ac5496411b30d41945a996fe7a7fb5abccf2aaa Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:44:00 +0900
Subject: [PATCH 0890/2198] sh: Kill off dead handle_timer_tick() code.

Nothing is using this anymore now that we have fully converted to generic
time, so kill it off completely.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/timer.h |  5 ----
 arch/sh/kernel/time.c       | 53 +++----------------------------------
 2 files changed, 4 insertions(+), 54 deletions(-)

diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index cb16645d83972..6c0851188c00f 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -18,17 +18,12 @@ struct sys_timer {
 	struct sys_timer_ops	*ops;
 };
 
-#define TICK_SIZE (tick_nsec / 1000)
-
 extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
 /* arch/sh/kernel/timers/timer.c */
 struct sys_timer *get_sys_timer(void);
 
-/* arch/sh/kernel/time.c */
-void handle_timer_tick(void);
-
 extern struct clocksource clocksource_sh;
 
 #endif /* __ASM_SH_TIMER_H */
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index 77b841a99c01f..f4c304adec42a 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -1,13 +1,14 @@
 /*
- *  arch/sh/kernel/time_32.c
+ *  arch/sh/kernel/time.c
  *
  *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
  *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
  *  Copyright (C) 2002 - 2009  Paul Mundt
  *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
  *
- *  Some code taken from i386 version.
- *    Copyright (C) 1991, 1992, 1995  Linus Torvalds
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -23,7 +24,6 @@
 #include <asm/clock.h>
 #include <asm/rtc.h>
 #include <asm/timer.h>
-#include <asm/kgdb.h>
 
 struct sys_timer *sys_timer;
 
@@ -97,50 +97,6 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-/* last time the RTC clock got updated */
-static long last_rtc_update;
-
-/*
- * handle_timer_tick() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
- */
-void handle_timer_tick(void)
-{
-	if (current->pid)
-		profile_tick(CPU_PROFILING);
-
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-	do_timer(1);
-
-	/*
-	 * If we have an externally synchronized Linux clock, then update
-	 * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
-	 * called as close as possible to 500 ms before the new second starts.
-	 */
-	if (ntp_synced() &&
-	    xtime.tv_sec > last_rtc_update + 660 &&
-	    (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 &&
-	    (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) {
-		if (rtc_sh_set_time(xtime.tv_sec) == 0)
-			last_rtc_update = xtime.tv_sec;
-		else
-			/* do it again in 60s */
-			last_rtc_update = xtime.tv_sec - 600;
-	}
-	write_sequnlock(&xtime_lock);
-
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(get_irq_regs()));
-#endif
-}
-
 #ifdef CONFIG_PM
 int timer_suspend(struct sys_device *dev, pm_message_t state)
 {
@@ -242,4 +198,3 @@ void __init time_init(void)
 
 	late_time_init = sh_late_time_init;
 }
-
-- 
GitLab


From 6459d7bb72e9767bc7d22f2ee44aab35188e4b8a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:47:48 +0900
Subject: [PATCH 0891/2198] sh: Kill off dead timer sysclass pm hooks.

With the conversion to generic clockevents these are completely unused,
so just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/time.c | 45 -------------------------------------------
 1 file changed, 45 deletions(-)

diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index f4c304adec42a..c26576a5a454c 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -97,51 +97,6 @@ static int __init rtc_generic_init(void)
 }
 module_init(rtc_generic_init);
 
-#ifdef CONFIG_PM
-int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
-	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
-
-	sys_timer->ops->stop();
-
-	return 0;
-}
-
-int timer_resume(struct sys_device *dev)
-{
-	struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev);
-
-	sys_timer->ops->start();
-
-	return 0;
-}
-#else
-#define timer_suspend NULL
-#define timer_resume NULL
-#endif
-
-static struct sysdev_class timer_sysclass = {
-	.name	 = "timer",
-	.suspend = timer_suspend,
-	.resume	 = timer_resume,
-};
-
-static int __init timer_init_sysfs(void)
-{
-	int ret;
-
-	if (!sys_timer)
-		return 0;
-
-	ret = sysdev_class_register(&timer_sysclass);
-	if (ret != 0)
-		return ret;
-
-	sys_timer->dev.cls = &timer_sysclass;
-	return sysdev_register(&sys_timer->dev);
-}
-device_initcall(timer_init_sysfs);
-
 void (*board_time_init)(void);
 
 struct clocksource clocksource_sh = {
-- 
GitLab


From cd1408f22d2fd8f0d09c082b07cf0b9bcfb6eac9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:57:35 +0900
Subject: [PATCH 0892/2198] sh: mach-sh03: Give the sh03 rtc its own spinlock.

This converts the sh03 rtc code off of using the global rtc_lock and on
to its own spinlock. There are no other possible users of the rtc_lock,
so serializing with it is not necessary.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-sh03/rtc.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/sh/boards/mach-sh03/rtc.c b/arch/sh/boards/mach-sh03/rtc.c
index 0a9266bb51c58..a8b9f844ab5b5 100644
--- a/arch/sh/boards/mach-sh03/rtc.c
+++ b/arch/sh/boards/mach-sh03/rtc.c
@@ -35,13 +35,13 @@
 #define RTC_BUSY	1
 #define RTC_STOP	2
 
-extern spinlock_t rtc_lock;
+static DEFINE_SPINLOCK(sh03_rtc_lock);
 
 unsigned long get_cmos_time(void)
 {
 	unsigned int year, mon, day, hour, min, sec;
 
-	spin_lock(&rtc_lock);
+	spin_lock(&sh03_rtc_lock);
  again:
 	do {
 		sec  = (ctrl_inb(RTC_SEC1) & 0xf) + (ctrl_inb(RTC_SEC10) & 0x7) * 10;
@@ -73,7 +73,7 @@ unsigned long get_cmos_time(void)
 		goto again;
 	}
 
-	spin_unlock(&rtc_lock);
+	spin_unlock(&sh03_rtc_lock);
 	return mktime(year, mon, day, hour, min, sec);
 }
 
@@ -91,7 +91,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 	int i;
 
 	/* gets recalled with irq locally disabled */
-	spin_lock(&rtc_lock);
+	spin_lock(&sh03_rtc_lock);
 	for (i = 0 ; i < 1000000 ; i++)	/* may take up to 1 second... */
 		if (!(ctrl_inb(RTC_CTL) & RTC_BUSY))
 			break;
@@ -113,7 +113,7 @@ static int set_rtc_mmss(unsigned long nowtime)
 		       cmos_minutes, real_minutes);
 		retval = -1;
 	}
-	spin_unlock(&rtc_lock);
+	spin_unlock(&sh03_rtc_lock);
 
 	return retval;
 }
-- 
GitLab


From 1af2fe45fec15bdba446c22b9b602699cdabfc9f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 16:59:05 +0900
Subject: [PATCH 0893/2198] sh: Kill off the global rtc_lock with extreme
 prejudice.

Now that all of the possible users for rtc_lock have gone away, it is no
longer necessary to keep this lock definition around.

This follows several other architectures that have either recently
dropped it or never supported it in the first place.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/time.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index c26576a5a454c..e0aa769481ff0 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -17,7 +17,6 @@
 #include <linux/timex.h>
 #include <linux/sched.h>
 #include <linux/clockchips.h>
-#include <linux/mc146818rtc.h>	/* for rtc_lock */
 #include <linux/platform_device.h>
 #include <linux/smp.h>
 #include <linux/rtc.h>
@@ -27,10 +26,6 @@
 
 struct sys_timer *sys_timer;
 
-/* Move this somewhere more sensible.. */
-DEFINE_SPINLOCK(rtc_lock);
-EXPORT_SYMBOL(rtc_lock);
-
 /* Dummy RTC ops */
 static void null_rtc_get_time(struct timespec *tv)
 {
-- 
GitLab


From cb3a86c89ebdf917b88665f70e06863986fbab5c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 17:25:35 +0900
Subject: [PATCH 0894/2198] sh: Kill off sh64's hand-rolled syscall tracer.

This is no longer necessary, as there are now sufficient generic
alternatives available.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig.debug          |   8 --
 arch/sh/Makefile               |  34 +--------
 arch/sh/kernel/cpu/sh5/entry.S |  57 --------------
 arch/sh/lib64/.gitignore       |   1 -
 arch/sh/lib64/dbg.c            | 134 ---------------------------------
 5 files changed, 2 insertions(+), 232 deletions(-)
 delete mode 100644 arch/sh/lib64/.gitignore

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index fafa907e14ac6..44627ef52541b 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -137,12 +137,4 @@ config SH64_SR_WATCH
 	bool "Debug: set SR.WATCH to enable hardware watchpoints and trace"
 	depends on SUPERH64
 
-config POOR_MANS_STRACE
-	bool "Debug: enable rudimentary strace facility"
-	depends on SUPERH64
-	help
-	  This option allows system calls to be traced to the console.  It also
-	  aids in detecting kernel stack underflow.  It is useful for debugging
-	  early-userland problems (e.g. init incurring fatal exceptions.)
-
 endmenu
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index bece1f7535f20..493da979eff1d 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -193,7 +193,7 @@ zImage uImage uImage.srec vmlinux.srec: vmlinux
 
 compressed: zImage
 
-archprepare: maketools arch/sh/lib64/syscalltab.h
+archprepare: maketools
 
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
@@ -205,34 +205,4 @@ define archhelp
 	@echo '  uImage.srec  	           - Create an S-record for U-Boot'
 endef
 
-define filechk_gen-syscalltab
-       (set -e; \
-	echo "/*"; \
-	echo " * DO NOT MODIFY."; \
-	echo " *"; \
-	echo " * This file was generated by arch/sh/Makefile"; \
-	echo " * Any changes will be reverted at build time."; \
-	echo " */"; \
-	echo ""; \
-	echo "#ifndef __SYSCALLTAB_H"; \
-	echo "#define __SYSCALLTAB_H"; \
-	echo ""; \
-	echo "#include <linux/kernel.h>"; \
-	echo ""; \
-	echo "struct syscall_info {"; \
-	echo "	const char *name;"; \
-	echo "} syscall_info_table[] = {"; \
-	sed -e '/^.*\.long /!d;s//	{ "/;s/\(\([^/]*\)\/\)\{1\}.*/\2/; \
-		s/[ \t]*$$//g;s/$$/" },/;s/\("\)sys_/\1/g'; \
-	echo "};"; \
-	echo ""; \
-	echo "#define NUM_SYSCALL_INFO_ENTRIES ARRAY_SIZE(syscall_info_table)";\
-	echo ""; \
-	echo "#endif /* __SYSCALLTAB_H */" )
-endef
-
-arch/sh/lib64/syscalltab.h: arch/sh/kernel/syscalls_64.S
-	$(call filechk,gen-syscalltab)
-
-CLEAN_FILES += arch/sh/lib64/syscalltab.h \
-	       include/asm-sh/machtypes.h
+CLEAN_FILES += include/asm-sh/machtypes.h
diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S
index 516297515b25b..b0aacf675258f 100644
--- a/arch/sh/kernel/cpu/sh5/entry.S
+++ b/arch/sh/kernel/cpu/sh5/entry.S
@@ -812,27 +812,6 @@ no_underflow:
 	! exceptions
 	add	SP, ZERO, r14
 
-#ifdef CONFIG_POOR_MANS_STRACE
-	/* We've pushed all the registers now, so only r2-r4 hold anything
-	 * useful. Move them into callee save registers */
-	or	r2, ZERO, r28
-	or	r3, ZERO, r29
-	or	r4, ZERO, r30
-
-	/* Preserve r2 as the event code */
-	movi	evt_debug, r3
-	ori	r3, 1, r3
-	ptabs	r3, tr0
-
-	or	SP, ZERO, r6
-	getcon	TRA, r5
-	blink	tr0, LINK
-
-	or	r28, ZERO, r2
-	or	r29, ZERO, r3
-	or	r30, ZERO, r4
-#endif
-
 	/* For syscall and debug race condition, get TRA now */
 	getcon	TRA, r5
 
@@ -887,11 +866,6 @@ no_underflow:
  */
 	.global ret_from_irq
 ret_from_irq:
-#ifdef CONFIG_POOR_MANS_STRACE
-	pta	evt_debug_ret_from_irq, tr0
-	ori	SP, 0, r2
-	blink	tr0, LINK
-#endif
 	ld.q	SP, FRAME_S(FSSR), r6
 	shlri	r6, 30, r6
 	andi	r6, 1, r6
@@ -905,12 +879,6 @@ ret_from_irq:
 ret_from_exception:
 	preempt_stop()
 
-#ifdef CONFIG_POOR_MANS_STRACE
-	pta	evt_debug_ret_from_exc, tr0
-	ori	SP, 0, r2
-	blink	tr0, LINK
-#endif
-
 	ld.q	SP, FRAME_S(FSSR), r6
 	shlri	r6, 30, r6
 	andi	r6, 1, r6
@@ -1236,18 +1204,6 @@ syscall_bad:
 	.global syscall_ret
 syscall_ret:
 	st.q	SP, FRAME_R(9), r2	/* Expecting SP back to BASIC frame */
-
-#ifdef CONFIG_POOR_MANS_STRACE
-	/* nothing useful in registers at this point */
-
-	movi	evt_debug2, r5
-	ori	r5, 1, r5
-	ptabs	r5, tr0
-	ld.q	SP, FRAME_R(9), r2
-	or	SP, ZERO, r3
-	blink	tr0, LINK
-#endif
-
 	ld.q	SP, FRAME_S(FSPC), r2
 	addi	r2, 4, r2		/* Move PC, being pre-execution event */
 	st.q	SP, FRAME_S(FSPC), r2
@@ -1268,25 +1224,12 @@ ret_from_fork:
 	ptabs	r5, tr0
 	blink	tr0, LINK
 
-#ifdef CONFIG_POOR_MANS_STRACE
-	/* nothing useful in registers at this point */
-
-	movi	evt_debug2, r5
-	ori	r5, 1, r5
-	ptabs	r5, tr0
-	ld.q	SP, FRAME_R(9), r2
-	or	SP, ZERO, r3
-	blink	tr0, LINK
-#endif
-
 	ld.q	SP, FRAME_S(FSPC), r2
 	addi	r2, 4, r2		/* Move PC, being pre-execution event */
 	st.q	SP, FRAME_S(FSPC), r2
 	pta	ret_from_syscall, tr0
 	blink	tr0, ZERO
 
-
-
 syscall_allowed:
 	/* Use LINK to deflect the exit point, default is syscall_ret */
 	pta	syscall_ret, tr0
diff --git a/arch/sh/lib64/.gitignore b/arch/sh/lib64/.gitignore
deleted file mode 100644
index 3508c2cb23c4b..0000000000000
--- a/arch/sh/lib64/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-syscalltab.h
diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c
index 2fb8eaf6de60a..25f2481bd89ed 100644
--- a/arch/sh/lib64/dbg.c
+++ b/arch/sh/lib64/dbg.c
@@ -135,140 +135,6 @@ void print_itlb(void)
 	    (" =============================================================\n");
 }
 
-/* ======================================================================= */
-
-#ifdef CONFIG_POOR_MANS_STRACE
-
-#include "syscalltab.h"
-
-struct ring_node {
-	int evt;
-	int ret_addr;
-	int event;
-	int tra;
-	int pid;
-	unsigned long sp;
-	unsigned long pc;
-};
-
-static struct ring_node event_ring[16];
-static int event_ptr = 0;
-
-struct stored_syscall_data {
-	int pid;
-	int syscall_number;
-};
-
-#define N_STORED_SYSCALLS 16
-
-static struct stored_syscall_data stored_syscalls[N_STORED_SYSCALLS];
-static int syscall_next=0;
-static int syscall_next_print=0;
-
-void evt_debug(int evt, int ret_addr, int event, int tra, struct pt_regs *regs)
-{
-	int syscallno = tra & 0xff;
-	unsigned long sp;
-	unsigned long stack_bottom;
-	int pid;
-	struct ring_node *rr;
-
-	pid = current->pid;
-	stack_bottom = (unsigned long) task_stack_page(current);
-	asm volatile("ori r15, 0, %0" : "=r" (sp));
-	rr = event_ring + event_ptr;
-	rr->evt = evt;
-	rr->ret_addr = ret_addr;
-	rr->event = event;
-	rr->tra = tra;
-	rr->pid = pid;
-	rr->sp = sp;
-	rr->pc = regs->pc;
-
-	if (sp < stack_bottom + 3092) {
-		int i, j;
-		printk("evt_debug : stack underflow report\n");
-		for (j=0, i = event_ptr; j<16; j++) {
-			rr = event_ring + i;
-			printk("evt=%08x event=%08x tra=%08x pid=%5d sp=%08lx pc=%08lx\n",
-				rr->evt, rr->event, rr->tra, rr->pid, rr->sp, rr->pc);
-			i--;
-			i &= 15;
-		}
-		panic("STACK UNDERFLOW\n");
-	}
-
-	event_ptr = (event_ptr + 1) & 15;
-
-	if ((event == 2) && (evt == 0x160)) {
-		if (syscallno < NUM_SYSCALL_INFO_ENTRIES) {
-			/* Store the syscall information to print later.  We
-			 * can't print this now - currently we're running with
-			 * SR.BL=1, so we can't take a tlbmiss (which could occur
-			 * in the console drivers under printk).
-			 *
-			 * Just overwrite old entries on ring overflow - this
-			 * is only for last-hope debugging. */
-			stored_syscalls[syscall_next].pid = current->pid;
-			stored_syscalls[syscall_next].syscall_number = syscallno;
-			syscall_next++;
-			syscall_next &= (N_STORED_SYSCALLS - 1);
-		}
-	}
-}
-
-static void drain_syscalls(void) {
-	while (syscall_next_print != syscall_next) {
-		printk("Task %d: %s()\n",
-			stored_syscalls[syscall_next_print].pid,
-			syscall_info_table[stored_syscalls[syscall_next_print].syscall_number].name);
-			syscall_next_print++;
-			syscall_next_print &= (N_STORED_SYSCALLS - 1);
-	}
-}
-
-void evt_debug2(unsigned int ret)
-{
-	drain_syscalls();
-	printk("Task %d: syscall returns %08x\n", current->pid, ret);
-}
-
-void evt_debug_ret_from_irq(struct pt_regs *regs)
-{
-	int pid;
-	struct ring_node *rr;
-
-	pid = current->pid;
-	rr = event_ring + event_ptr;
-	rr->evt = 0xffff;
-	rr->ret_addr = 0;
-	rr->event = 0;
-	rr->tra = 0;
-	rr->pid = pid;
-	rr->pc = regs->pc;
-	event_ptr = (event_ptr + 1) & 15;
-}
-
-void evt_debug_ret_from_exc(struct pt_regs *regs)
-{
-	int pid;
-	struct ring_node *rr;
-
-	pid = current->pid;
-	rr = event_ring + event_ptr;
-	rr->evt = 0xfffe;
-	rr->ret_addr = 0;
-	rr->event = 0;
-	rr->tra = 0;
-	rr->pid = pid;
-	rr->pc = regs->pc;
-	event_ptr = (event_ptr + 1) & 15;
-}
-
-#endif /* CONFIG_POOR_MANS_STRACE */
-
-/* ======================================================================= */
-
 void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs)
 {
 
-- 
GitLab


From ef9f89996e450e9686816b6fc1e34d7b9a65766c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 17:36:57 +0900
Subject: [PATCH 0895/2198] sh: Kill off unused sh64 debug code.

None of the print_page() code and associated helpers are presently used
by anything in-tree, so just kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/lib64/dbg.c    | 48 ------------------------------------------
 arch/sh/lib64/udelay.c |  2 +-
 2 files changed, 1 insertion(+), 49 deletions(-)

diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c
index 25f2481bd89ed..6152a6a6d9c6e 100644
--- a/arch/sh/lib64/dbg.c
+++ b/arch/sh/lib64/dbg.c
@@ -246,51 +246,3 @@ void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs)
 	print_dtlb();
 	print_itlb();
 }
-
-/* ======================================================================= */
-
-/*
-** Depending on <base> scan the MMU, Data or Instruction side
-** looking for a valid mapping matching Eaddr & asid.
-** Return -1 if not found or the TLB id entry otherwise.
-** Note: it works only for 4k pages!
-*/
-static unsigned long
-lookup_mmu_side(unsigned long base, unsigned long Eaddr, unsigned long asid)
-{
-	regType_t pteH;
-	unsigned long epn;
-	int count;
-
-	epn = Eaddr & 0xfffff000;
-
-	for (count = 0; count < MAX_TLBs; count++, base += TLB_STEP) {
-		pteH = getConfigReg(base);
-		if (GET_VALID(pteH))
-			if ((unsigned long) GET_EPN(pteH) == epn)
-				if ((unsigned long) GET_ASID(pteH) == asid)
-					break;
-	}
-	return ((unsigned long) ((count < MAX_TLBs) ? base : -1));
-}
-
-unsigned long lookup_dtlb(unsigned long Eaddr)
-{
-	unsigned long asid = get_asid();
-	return (lookup_mmu_side((u64) DTLB_BASE, Eaddr, asid));
-}
-
-unsigned long lookup_itlb(unsigned long Eaddr)
-{
-	unsigned long asid = get_asid();
-	return (lookup_mmu_side((u64) ITLB_BASE, Eaddr, asid));
-}
-
-void print_page(struct page *page)
-{
-	printk("  page[%p] -> index 0x%lx,  count 0x%x,  flags 0x%lx\n",
-	       page, page->index, page_count(page), page->flags);
-	printk("       address_space = %p, pages =%ld\n", page->mapping,
-	       page->mapping->nrpages);
-
-}
diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c
index d76bd801194f5..f215b063da704 100644
--- a/arch/sh/lib64/udelay.c
+++ b/arch/sh/lib64/udelay.c
@@ -33,7 +33,7 @@ void __delay(unsigned long loops)
 			     :"0"(loops));
 }
 
-inline void __const_udelay(unsigned long xloops)
+void __const_udelay(unsigned long xloops)
 {
 	__delay(xloops * (HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy));
 }
-- 
GitLab


From 7d170b1bc540a1d83098a9f27cf4939e026fda81 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 17:41:59 +0900
Subject: [PATCH 0896/2198] sh: Move out cayman-specific panic handler code to
 its own file.

This moves out the cayman-specific panic handler code to a better
location, and leaves the generic implementation a simple stub that is
still used under emulation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-cayman/Makefile |  2 +-
 arch/sh/boards/mach-cayman/panic.c  | 49 +++++++++++++++++++++++++++++
 arch/sh/lib64/panic.c               | 43 -------------------------
 3 files changed, 50 insertions(+), 44 deletions(-)
 create mode 100644 arch/sh/boards/mach-cayman/panic.c

diff --git a/arch/sh/boards/mach-cayman/Makefile b/arch/sh/boards/mach-cayman/Makefile
index cafe1ac3b29c2..00fa3eaecb1be 100644
--- a/arch/sh/boards/mach-cayman/Makefile
+++ b/arch/sh/boards/mach-cayman/Makefile
@@ -1,4 +1,4 @@
 #
 # Makefile for the Hitachi Cayman specific parts of the kernel
 #
-obj-y := setup.o irq.o
+obj-y := setup.o irq.o panic.o
diff --git a/arch/sh/boards/mach-cayman/panic.c b/arch/sh/boards/mach-cayman/panic.c
new file mode 100644
index 0000000000000..d1e67306d07c0
--- /dev/null
+++ b/arch/sh/boards/mach-cayman/panic.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2003  Richard Curnow, SuperH UK Limited
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <cpu/registers.h>
+
+/* THIS IS A PHYSICAL ADDRESS */
+#define HDSP2534_ADDR (0x04002100)
+
+static void poor_mans_delay(void)
+{
+	int i;
+
+	for (i = 0; i < 2500000; i++)
+		cpu_relax();
+}
+
+static void show_value(unsigned long x)
+{
+	int i;
+	unsigned nibble;
+	for (i = 0; i < 8; i++) {
+		nibble = ((x >> (i * 4)) & 0xf);
+
+		__raw_writeb(nibble + ((nibble > 9) ? 55 : 48),
+			  HDSP2534_ADDR + 0xe0 + ((7 - i) << 2));
+	}
+}
+
+void
+panic_handler(unsigned long panicPC, unsigned long panicSSR,
+	      unsigned long panicEXPEVT)
+{
+	while (1) {
+		/* This piece of code displays the PC on the LED display */
+		show_value(panicPC);
+		poor_mans_delay();
+		show_value(panicSSR);
+		poor_mans_delay();
+		show_value(panicEXPEVT);
+		poor_mans_delay();
+	}
+}
diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c
index da32ba7b5fcc0..38c954e04f6a5 100644
--- a/arch/sh/lib64/panic.c
+++ b/arch/sh/lib64/panic.c
@@ -6,53 +6,10 @@
  * for more details.
  */
 
-#include <linux/kernel.h>
-#include <asm/io.h>
-#include <cpu/registers.h>
-
-/* THIS IS A PHYSICAL ADDRESS */
-#define HDSP2534_ADDR (0x04002100)
-
-#ifdef CONFIG_SH_CAYMAN
-
-static void poor_mans_delay(void)
-{
-	int i;
-	for (i = 0; i < 2500000; i++) {
-	}		/* poor man's delay */
-}
-
-static void show_value(unsigned long x)
-{
-	int i;
-	unsigned nibble;
-	for (i = 0; i < 8; i++) {
-		nibble = ((x >> (i * 4)) & 0xf);
-
-		ctrl_outb(nibble + ((nibble > 9) ? 55 : 48),
-			  HDSP2534_ADDR + 0xe0 + ((7 - i) << 2));
-	}
-}
-
-#endif
-
 void
 panic_handler(unsigned long panicPC, unsigned long panicSSR,
 	      unsigned long panicEXPEVT)
 {
-#ifdef CONFIG_SH_CAYMAN
-	while (1) {
-		/* This piece of code displays the PC on the LED display */
-		show_value(panicPC);
-		poor_mans_delay();
-		show_value(panicSSR);
-		poor_mans_delay();
-		show_value(panicEXPEVT);
-		poor_mans_delay();
-	}
-#endif
-
 	/* Never return from the panic handler */
 	for (;;) ;
-
 }
-- 
GitLab


From 4f5ecaa05493dfddf155b40224b951592bfce325 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 8 May 2009 08:23:29 +0000
Subject: [PATCH 0897/2198] sh: clock framework update, fix count and kill off
 kref

This patch updates the clock framework use count code.
With this patch the enable() and disable() callbacks
only get called when counting from and to zero.
While at it the kref stuff gets replaced with an int.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h |  4 +--
 arch/sh/kernel/cpu/clock.c  | 69 ++++++++++++++++++++-----------------
 2 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 2f6c9627bc1f8..b1f29199e4bd2 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -1,7 +1,6 @@
 #ifndef __ASM_SH_CLOCK_H
 #define __ASM_SH_CLOCK_H
 
-#include <linux/kref.h>
 #include <linux/list.h>
 #include <linux/seq_file.h>
 #include <linux/clk.h>
@@ -28,7 +27,7 @@ struct clk {
 	struct clk		*parent;
 	struct clk_ops		*ops;
 
-	struct kref		kref;
+	int			usecount;
 
 	unsigned long		rate;
 	unsigned long		flags;
@@ -37,6 +36,7 @@ struct clk {
 
 #define CLK_ALWAYS_ENABLED	(1 << 0)
 #define CLK_RATE_PROPAGATES	(1 << 1)
+#define CLK_NEEDS_INIT		(1 << 2)
 
 /* Should be defined by processor-specific code */
 void arch_init_clk_ops(struct clk_ops **, int type);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 099373ae57d84..133dbe4033412 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -19,7 +19,6 @@
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/list.h>
-#include <linux/kref.h>
 #include <linux/kobject.h>
 #include <linux/sysdev.h>
 #include <linux/seq_file.h>
@@ -90,7 +89,7 @@ static void propagate_rate(struct clk *clk)
 	}
 }
 
-static int __clk_enable(struct clk *clk)
+static void __clk_init(struct clk *clk)
 {
 	/*
 	 * See if this is the first time we're enabling the clock, some
@@ -100,19 +99,33 @@ static int __clk_enable(struct clk *clk)
 	 * divisors to use before it can effectively recalc.
 	 */
 
-	if (clk->flags & CLK_ALWAYS_ENABLED) {
-		kref_get(&clk->kref);
-		return 0;
-	}
-
-	if (unlikely(atomic_read(&clk->kref.refcount) == 1))
+	if (clk->flags & CLK_NEEDS_INIT) {
 		if (clk->ops && clk->ops->init)
 			clk->ops->init(clk);
 
-	kref_get(&clk->kref);
+		clk->flags &= ~CLK_NEEDS_INIT;
+	}
+}
+
+static int __clk_enable(struct clk *clk)
+{
+	if (!clk)
+		return -EINVAL;
+
+	clk->usecount++;
+
+	/* nothing to do if always enabled */
+	if (clk->flags & CLK_ALWAYS_ENABLED)
+		return 0;
+
+	if (clk->usecount == 1) {
+		__clk_init(clk);
 
-	if (likely(clk->ops && clk->ops->enable))
-		clk->ops->enable(clk);
+		__clk_enable(clk->parent);
+
+		if (clk->ops && clk->ops->enable)
+			clk->ops->enable(clk);
+	}
 
 	return 0;
 }
@@ -122,11 +135,6 @@ int clk_enable(struct clk *clk)
 	unsigned long flags;
 	int ret;
 
-	if (!clk)
-		return -EINVAL;
-
-	clk_enable(clk->parent);
-
 	spin_lock_irqsave(&clock_lock, flags);
 	ret = __clk_enable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
@@ -135,21 +143,23 @@ int clk_enable(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_enable);
 
-static void clk_kref_release(struct kref *kref)
-{
-	/* Nothing to do */
-}
-
 static void __clk_disable(struct clk *clk)
 {
-	int count = kref_put(&clk->kref, clk_kref_release);
+	if (!clk)
+		return;
+
+	clk->usecount--;
+
+	WARN_ON(clk->usecount < 0);
 
 	if (clk->flags & CLK_ALWAYS_ENABLED)
 		return;
 
-	if (!count) {	/* count reaches zero, disable the clock */
+	if (clk->usecount == 0) {
 		if (likely(clk->ops && clk->ops->disable))
 			clk->ops->disable(clk);
+
+		__clk_disable(clk->parent);
 	}
 }
 
@@ -157,14 +167,9 @@ void clk_disable(struct clk *clk)
 {
 	unsigned long flags;
 
-	if (!clk)
-		return;
-
 	spin_lock_irqsave(&clock_lock, flags);
 	__clk_disable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
-
-	clk_disable(clk->parent);
 }
 EXPORT_SYMBOL_GPL(clk_disable);
 
@@ -173,14 +178,14 @@ int clk_register(struct clk *clk)
 	mutex_lock(&clock_list_sem);
 
 	list_add(&clk->node, &clock_list);
-	kref_init(&clk->kref);
+	clk->usecount = 0;
+	clk->flags |= CLK_NEEDS_INIT;
 
 	mutex_unlock(&clock_list_sem);
 
 	if (clk->flags & CLK_ALWAYS_ENABLED) {
+		__clk_init(clk);
 		pr_debug( "Clock '%s' is ALWAYS_ENABLED\n", clk->name);
-		if (clk->ops && clk->ops->init)
-			clk->ops->init(clk);
 		if (clk->ops && clk->ops->enable)
 			clk->ops->enable(clk);
 		pr_debug( "Enabled.");
@@ -356,7 +361,7 @@ static int show_clocks(char *buf, char **start, off_t off,
 		p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name,
 			     rate / 1000000, (rate % 1000000) / 10000,
 			     ((clk->flags & CLK_ALWAYS_ENABLED) ||
-			      (atomic_read(&clk->kref.refcount) != 1)) ?
+			      clk->usecount > 0) ?
 			     "enabled" : "disabled");
 	}
 
-- 
GitLab


From 583d1d549ff107500481461cec0325189c5349ec Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 8 May 2009 08:27:19 +0000
Subject: [PATCH 0898/2198] sh: enable TMU clocksource on sh7722

This patch enables the TMU clocksource on sh7722.
To prioritize TMU over CMT we also adjust the CMT
clock source rating.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 624e8e5a605c2..b1c3e7f9284a9 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -183,7 +183,7 @@ static struct sh_timer_config cmt_platform_data = {
 	.timer_bit = 5,
 	.clk = "cmt0",
 	.clockevent_rating = 125,
-	.clocksource_rating = 200,
+	.clocksource_rating = 125,
 };
 
 static struct resource cmt_resources[] = {
@@ -245,7 +245,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.channel_offset = 0x10,
 	.timer_bit = 1,
 	.clk = "tmu0",
-	.clocksource_rating = 0, /* disabled for now */
+	.clocksource_rating = 200,
 };
 
 static struct resource tmu1_resources[] = {
-- 
GitLab


From 47dd6f4439811c6c83d9abf2f364bcb6d2684640 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 8 May 2009 08:32:18 +0000
Subject: [PATCH 0899/2198] sh: TMU platform data for sh7723

This patch adds TMU platform data for sh7723. Both clockevent
and clocksource support is enabled. While at it, adjust the
CMT clocksource rating to prioritize the TMU.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 196 ++++++++++++++++++++++++-
 1 file changed, 195 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index fb9b5427a068c..484ca2dabdeb5 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -107,7 +107,7 @@ static struct sh_timer_config cmt_platform_data = {
 	.timer_bit = 5,
 	.clk = "cmt0",
 	.clockevent_rating = 125,
-	.clocksource_rating = 200,
+	.clocksource_rating = 125,
 };
 
 static struct resource cmt_resources[] = {
@@ -133,6 +133,188 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu1",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd90008,
+		.end	= 0xffd90013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu1",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd90014,
+		.end	= 0xffd9001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 58,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu1",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd90020,
+		.end	= 0xffd9002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase        = 0xffe00000,
@@ -255,6 +437,12 @@ static struct platform_device iic_device = {
 
 static struct platform_device *sh7723_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&sci_device,
 	&rtc_device,
 	&iic_device,
@@ -282,6 +470,12 @@ __initcall(sh7723_devices_setup);
 
 static struct platform_device *sh7723_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
GitLab


From 6b2e8523df148c15ea5abf13075026fb8bdb3f86 Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy@goop.org>
Date: Thu, 7 May 2009 11:56:49 -0700
Subject: [PATCH 0900/2198] xen: reserve Xen start_info rather than e820
 reserving

Use reserve_early rather than e820 reservations for Xen start info and mfn->pfn
table, so that the memory use is a bit more self-documenting.

[ Impact: cleanup ]

Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Xen-devel <xen-devel@lists.xensource.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <4A032EF1.6070708@goop.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/xen/setup.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 15c6c68db6a25..ad0047f47cd47 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
 	 *  - xen_start_info
 	 * See comment above "struct start_info" in <xen/interface/xen.h>
 	 */
-	e820_add_region(__pa(xen_start_info->mfn_list),
-			xen_start_info->pt_base - xen_start_info->mfn_list,
-			E820_RESERVED);
+	reserve_early(__pa(xen_start_info->mfn_list),
+		      __pa(xen_start_info->pt_base),
+			"XEN START INFO");
 
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
-- 
GitLab


From b366328335b98373325977e116d2423f500708ac Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 18:01:03 +0900
Subject: [PATCH 0901/2198] sh: Drop dead rules from
 arch/sh/kernel/Makefile_64.

Several of these options are specific to the SHcompact ISA and will need
to be rewritten for SHmedia if they are to be supported at all. Drop
the impossible rules for now.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/Makefile_64 | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index 31f6ebf375eb1..c845a0d619795 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -5,13 +5,10 @@ obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   syscalls_64.o time.o topology.o traps.o traps_64.o
 
 obj-y				+= cpu/ timers/
-obj-$(CONFIG_VSYSCALL)		+= vsyscall/
 obj-$(CONFIG_SMP)		+= smp.o
-obj-$(CONFIG_SH_STANDARD_BIOS)	+= sh_bios.o
 obj-$(CONFIG_SH_CPU_FREQ)	+= cpufreq.o
 obj-$(CONFIG_MODULES)		+= sh_ksyms_64.o module.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_KEXEC)		+= machine_kexec.o relocate_kernel.o
 obj-$(CONFIG_CRASH_DUMP)	+= crash_dump.o
 obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_IO_TRAPPED)	+= io_trapped.o
-- 
GitLab


From 1dcdb5a9e7c235e6e80f1f4d5b8247b3e5347e48 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 27 Apr 2009 17:44:11 +0200
Subject: [PATCH 0902/2198] oprofile: re-add force_arch_perfmon option

This re-adds the force_arch_perfmon option that was in the original
arch perfmon patchkit. Originally this was rejected in favour
of a generalized perfmon=name option, but it turned out implementing
the later in a reliable way is hard (and it would have been easy
to crash the kernel if a user gets it wrong)

But now Atom and Core i7 support being readded a user would
need to update their oprofile userland to beyond 0.9.4 to use oprofile again
on Atom or Core i7.

To avoid this problem readd the force_arch_perfmon option.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 Documentation/kernel-parameters.txt | 6 ++++++
 arch/x86/oprofile/nmi_int.c         | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 90b3924071b6c..9b9566bf3301f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1650,6 +1650,12 @@ and is between 256 and 4096 characters. It is defined in the file
 	oprofile.timer=	[HW]
 			Use timer interrupt instead of performance counters
 
+	oprofile.force_arch_perfmon=1	[X86]
+			Force use of architectural perfmon instead of
+			the CPU specific event set.
+			This might be useful if you have older oprofile
+			userland or if you want common events over Intel CPUs.
+
 	osst=		[HW,SCSI] SCSI Tape Driver
 			Format: <buffer_size>,<write_threshold>
 			See also Documentation/scsi/st.txt.
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 202864ad49a7e..e5171c99e1576 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -389,10 +389,16 @@ static int __init p4_init(char **cpu_type)
 	return 0;
 }
 
+int force_arch_perfmon;
+module_param(force_arch_perfmon, int, 0);
+
 static int __init ppro_init(char **cpu_type)
 {
 	__u8 cpu_model = boot_cpu_data.x86_model;
 
+	if (force_arch_perfmon && cpu_has_arch_perfmon)
+		return 0;
+
 	switch (cpu_model) {
 	case 0 ... 2:
 		*cpu_type = "i386/ppro";
-- 
GitLab


From 1f3d7b60691993d8d368d8dd7d5d85871d41e8f5 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 27 Apr 2009 17:44:12 +0200
Subject: [PATCH 0903/2198] oprofile: remove undocumented oprofile.p4force
 option

There are no new P4s and the oprofile code knows about all existing
ones, so we don't really need the p4force option anymore.

Remove it.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index e5171c99e1576..f472c0c48a3ef 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -356,14 +356,11 @@ static void exit_sysfs(void)
 #define exit_sysfs() do { } while (0)
 #endif /* CONFIG_PM */
 
-static int p4force;
-module_param(p4force, int, 0);
-
 static int __init p4_init(char **cpu_type)
 {
 	__u8 cpu_model = boot_cpu_data.x86_model;
 
-	if (!p4force && (cpu_model > 6 || cpu_model == 5))
+	if (cpu_model > 6 || cpu_model == 5)
 		return 0;
 
 #ifndef CONFIG_SMP
-- 
GitLab


From 6adf406f0a0eaf37251018d15f51e93f5b538ee6 Mon Sep 17 00:00:00 2001
From: Andi Kleen <andi@firstfloor.org>
Date: Mon, 27 Apr 2009 17:44:13 +0200
Subject: [PATCH 0904/2198] oprofile: add support for Core i7 and Atom

The registers are about the same as other Family 6 CPUs
so we only need to add detection.

I'm not completely happy with calling Nehalem Core i7 because
there will be undoubtedly other Nehalem based CPUs
in the future with different marketing names, but it's
the best we got for now.

Requires updated oprofile userland for the new event files.

If you don't want to update right now you can also use
oprofile.force_arch_perfmon=1 (added in the next patch) with 0.9.4

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f472c0c48a3ef..3308147182aef 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -417,6 +417,13 @@ static int __init ppro_init(char **cpu_type)
 	case 15: case 23:
 		*cpu_type = "i386/core_2";
 		break;
+	case 26:
+		arch_perfmon_setup_counters();
+		*cpu_type = "i386/core_i7";
+		break;
+	case 28:
+		*cpu_type = "i386/atom";
+		break;
 	default:
 		/* Unknown */
 		return 0;
-- 
GitLab


From 7e4e0bd50e80df2fe5501f48f872448376cdd997 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 6 May 2009 12:10:23 +0200
Subject: [PATCH 0905/2198] oprofile: introduce module_param oprofile.cpu_type

This patch removes module_param oprofile.force_arch_perfmon and
introduces oprofile.cpu_type=archperfmon instead. This new parameter
can be reused for other models and architectures.

Currently only archperfmon is supported.

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 Documentation/kernel-parameters.txt | 12 +++++++-----
 arch/x86/oprofile/nmi_int.c         | 13 +++++++++++--
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9b9566bf3301f..6ce5f48859cc5 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1650,11 +1650,13 @@ and is between 256 and 4096 characters. It is defined in the file
 	oprofile.timer=	[HW]
 			Use timer interrupt instead of performance counters
 
-	oprofile.force_arch_perfmon=1	[X86]
-			Force use of architectural perfmon instead of
-			the CPU specific event set.
-			This might be useful if you have older oprofile
-			userland or if you want common events over Intel CPUs.
+	oprofile.cpu_type=	Force an oprofile cpu type
+			This might be useful if you have an older oprofile
+			userland or if you want common events.
+			Format: { archperfmon }
+			archperfmon: [X86] Force use of architectural
+				perfmon on Intel CPUs instead of the
+				CPU specific event set.
 
 	osst=		[HW,SCSI] SCSI Tape Driver
 			Format: <buffer_size>,<write_threshold>
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 3308147182aef..3b285e656e27e 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -386,8 +386,17 @@ static int __init p4_init(char **cpu_type)
 	return 0;
 }
 
-int force_arch_perfmon;
-module_param(force_arch_perfmon, int, 0);
+static int force_arch_perfmon;
+static int force_cpu_type(const char *str, struct kernel_param *kp)
+{
+	if (!strcmp(str, "archperfmon")) {
+		force_arch_perfmon = 1;
+		printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+	}
+
+	return 0;
+}
+module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
 
 static int __init ppro_init(char **cpu_type)
 {
-- 
GitLab


From 6eac1af01112d1919b1c432f6591e6368e8d538f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 19:48:47 +0900
Subject: [PATCH 0906/2198] sh: Always select RTC_LIB, not just for SUPERH32.

The RTC_LIB helpers are used in arch/sh/kernel/time.c, which was
previously only the case for the 32-bit variant. Now that this has
become the common implementation, move the RTC_LIB select to reflect
that. Fixes up the sh64 build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d88a61b11d321..6ed16c4548c92 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -15,6 +15,7 @@ config SUPERH
 	select HAVE_IOREMAP_PROT if MMU
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_DMA_API_DEBUG
+	select RTC_LIB
 	help
 	  The SuperH is a RISC processor targeted for use in embedded systems
 	  and consumer electronics; it was also used in the Sega Dreamcast
@@ -30,7 +31,6 @@ config SUPERH32
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_ARCH_KGDB
 	select ARCH_HIBERNATION_POSSIBLE if MMU
-	select RTC_LIB
 
 config SUPERH64
 	def_bool ARCH = "sh64"
-- 
GitLab


From 30d88cf52f229c3e0c90b8d71036960ccc2db4e2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 20:20:56 +0900
Subject: [PATCH 0907/2198] sh: Kill off extra cflags Kconfig entry.

There is no real reason to use this anymore, as the build system
generally knows what it is doing with regards to cflags mangling.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig.debug | 11 -----------
 arch/sh/Makefile      |  3 ---
 2 files changed, 14 deletions(-)

diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug
index 44627ef52541b..8179cc9be9a4a 100644
--- a/arch/sh/Kconfig.debug
+++ b/arch/sh/Kconfig.debug
@@ -122,17 +122,6 @@ config SH_NO_BSS_INIT
 	  For all other cases, say N. If this option seems perplexing, or
 	  you aren't sure, say N.
 
-config MORE_COMPILE_OPTIONS
-	bool "Add any additional compile options"
-	help
-	  If you want to add additional CFLAGS to the kernel build, enable this
-	  option and then enter what you would like to add in the next question.
-	  Note however that -g is already appended with the selection of KGDB.
-
-config COMPILE_OPTIONS
-	string "Additional compile arguments"
-	depends on MORE_COMPILE_OPTIONS
-
 config SH64_SR_WATCH
 	bool "Debug: set SR.WATCH to enable hardware watchpoints and trace"
 	depends on SUPERH64
diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 493da979eff1d..68348e7002882 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -70,9 +70,6 @@ cflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -ml
 cflags-y	+= $(call cc-option,-mno-fdpic)
 cflags-y	+= $(isaflags-y) -ffreestanding
 
-cflags-$(CONFIG_MORE_COMPILE_OPTIONS)	+= \
-	$(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g')
-
 OBJCOPYFLAGS	:= -O binary -R .note -R .note.gnu.build-id -R .comment \
 		   -R .stab -R .stabstr -S
 
-- 
GitLab


From c29418c2ae15ee9171bc136ad261c497b95a46fc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 20:32:56 +0900
Subject: [PATCH 0908/2198] sh: Always fixup unaligned userspace accesses on
 sh64.

sh64 has traditionally had this configurable via a Kconfig option
(CONFIG_SH64_USER_MISALIGNED_FIXUP). In practice it has never really been
terribly useful to turn this off, so just get rid of the option entirely.

We leave the sysctl around so we don't end up breaking existing root
file systems, and to allow folks that really want this off to do so at
their own risk.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig.cpu       |  5 -----
 arch/sh/kernel/traps_64.c | 35 +++++------------------------------
 2 files changed, 5 insertions(+), 35 deletions(-)

diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu
index c7d704381a6d5..9eb1712372d80 100644
--- a/arch/sh/Kconfig.cpu
+++ b/arch/sh/Kconfig.cpu
@@ -76,11 +76,6 @@ config SPECULATIVE_EXECUTION
 
 	  If unsure, say N.
 
-config SH64_USER_MISALIGNED_FIXUP
-	def_bool y
-	prompt "Fixup misaligned loads/stores occurring in user mode"
-	depends on SUPERH64
-
 config SH64_ID2815_WORKAROUND
 	bool "Include workaround for SH5-101 cut2 silicon defect ID2815"
 	depends on CPU_SUBTYPE_SH5_101
diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c
index a85831cbf18ba..267e5ebbb4753 100644
--- a/arch/sh/kernel/traps_64.c
+++ b/arch/sh/kernel/traps_64.c
@@ -370,7 +370,6 @@ static int generate_and_check_address(struct pt_regs *regs,
 		return -1;
 	}
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	/* Check accessible.  For misaligned access in the kernel, assume the
 	   address is always accessible (and if not, just fault when the
 	   load/store gets done.) */
@@ -380,18 +379,13 @@ static int generate_and_check_address(struct pt_regs *regs,
 		}
 		/* Do access_ok check later - it depends on whether it's a load or a store. */
 	}
-#endif
 
 	*address = addr;
 	return 0;
 }
 
-/* Default value as for sh */
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 static int user_mode_unaligned_fixup_count = 10;
 static int user_mode_unaligned_fixup_enable = 1;
-#endif
-
 static int kernel_mode_unaligned_fixup_count = 32;
 
 static void misaligned_kernel_word_load(__u64 address, int do_sign_extend, __u64 *result)
@@ -440,7 +434,6 @@ static int misaligned_load(struct pt_regs *regs,
 	}
 
 	destreg = (opcode >> 4) & 0x3f;
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	if (user_mode(regs)) {
 		__u64 buffer;
 
@@ -470,9 +463,7 @@ static int misaligned_load(struct pt_regs *regs,
 				width_shift, (unsigned long) regs->pc);
 			break;
 		}
-	} else
-#endif
-	{
+	} else {
 		/* kernel mode - we can take short cuts since if we fault, it's a genuine bug */
 		__u64 lo, hi;
 
@@ -519,7 +510,6 @@ static int misaligned_store(struct pt_regs *regs,
 	}
 
 	srcreg = (opcode >> 4) & 0x3f;
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	if (user_mode(regs)) {
 		__u64 buffer;
 
@@ -546,9 +536,7 @@ static int misaligned_store(struct pt_regs *regs,
 		if (__copy_user((void *)(int)address, &buffer, (1 << width_shift)) > 0) {
 			return -1; /* fault */
 		}
-	} else
-#endif
-	{
+	} else {
 		/* kernel mode - we can take short cuts since if we fault, it's a genuine bug */
 		__u64 val = regs->regs[srcreg];
 
@@ -576,7 +564,6 @@ static int misaligned_store(struct pt_regs *regs,
 
 }
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 /* Never need to fix up misaligned FPU accesses within the kernel since that's a real
    error. */
 static int misaligned_fpu_load(struct pt_regs *regs,
@@ -727,7 +714,6 @@ static int misaligned_fpu_store(struct pt_regs *regs,
 		return -1;
 	}
 }
-#endif
 
 static int misaligned_fixup(struct pt_regs *regs)
 {
@@ -735,12 +721,8 @@ static int misaligned_fixup(struct pt_regs *regs)
 	int error;
 	int major, minor;
 
-#if !defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
-	/* Never fixup user mode misaligned accesses without this option enabled. */
-	return -1;
-#else
-	if (!user_mode_unaligned_fixup_enable) return -1;
-#endif
+	if (!user_mode_unaligned_fixup_enable)
+		return -1;
 
 	error = read_opcode(regs->pc, &opcode, user_mode(regs));
 	if (error < 0) {
@@ -749,15 +731,12 @@ static int misaligned_fixup(struct pt_regs *regs)
 	major = (opcode >> 26) & 0x3f;
 	minor = (opcode >> 16) & 0xf;
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	if (user_mode(regs) && (user_mode_unaligned_fixup_count > 0)) {
 		--user_mode_unaligned_fixup_count;
 		/* Only do 'count' worth of these reports, to remove a potential DoS against syslog */
 		printk("Fixing up unaligned userspace access in \"%s\" pid=%d pc=0x%08x ins=0x%08lx\n",
 		       current->comm, task_pid_nr(current), (__u32)regs->pc, opcode);
-	} else
-#endif
-	if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) {
+	} else if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) {
 		--kernel_mode_unaligned_fixup_count;
 		if (in_interrupt()) {
 			printk("Fixing up unaligned kernelspace access in interrupt pc=0x%08x ins=0x%08lx\n",
@@ -830,7 +809,6 @@ static int misaligned_fixup(struct pt_regs *regs)
 			}
 			break;
 
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 		case (0x94>>2): /* FLD.S */
 			error = misaligned_fpu_load(regs, opcode, 1, 2, 0);
 			break;
@@ -881,7 +859,6 @@ static int misaligned_fixup(struct pt_regs *regs)
 				break;
 			}
 			break;
-#endif
 
 		default:
 			/* Fault */
@@ -907,7 +884,6 @@ static ctl_table unaligned_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec
 	},
-#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP)
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "user_reports",
@@ -923,7 +899,6 @@ static ctl_table unaligned_table[] = {
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec},
-#endif
 	{}
 };
 
-- 
GitLab


From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 8 May 2009 10:31:42 +0800
Subject: [PATCH 0909/2198] tracing/events: clean up for ftrace_set_clr_event()

Add a helper function __ftrace_set_clr_event(), and replace some
ftrace_set_clr_event() calls with this helper, thus we don't need any
kstrdup() or kmalloc().

As a side effect, this patch fixes an issue in self tests code, which is
similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96
("tracing: append ":*" to internal setting of system events")

It's a small issue and won't cause any bug in fact, but we should do things
right anyway.

[ Impact: prevent spurious event-enabling in tracing self-tests ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A03998E.3020503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 126 +++++++++++++-----------------------
 1 file changed, 46 insertions(+), 80 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 8d0fae3af5954..45f1099386b6b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -111,11 +111,44 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	}
 }
 
-static int ftrace_set_clr_event(char *buf, int set)
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+				  const char *event, int set)
 {
 	struct ftrace_event_call *call;
+	int ret;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (match &&
+		    strcmp(match, call->name) != 0 &&
+		    strcmp(match, call->system) != 0)
+			continue;
+
+		if (sub && strcmp(sub, call->system) != 0)
+			continue;
+
+		if (event && strcmp(event, call->name) != 0)
+			continue;
+
+		ftrace_event_enable_disable(call, set);
+
+		ret = 0;
+	}
+	mutex_unlock(&event_mutex);
+
+	return ret;
+}
+
+static int ftrace_set_clr_event(char *buf, int set)
+{
 	char *event = NULL, *sub = NULL, *match;
-	int ret = -EINVAL;
 
 	/*
 	 * The buf format can be <subsystem>:<event-name>
@@ -141,30 +174,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 	}
 
-	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (!call->name || !call->regfunc)
-			continue;
-
-		if (match &&
-		    strcmp(match, call->name) != 0 &&
-		    strcmp(match, call->system) != 0)
-			continue;
-
-		if (sub && strcmp(sub, call->system) != 0)
-			continue;
-
-		if (event && strcmp(event, call->name) != 0)
-			continue;
-
-		ftrace_event_enable_disable(call, set);
-
-		ret = 0;
-	}
-	mutex_unlock(&event_mutex);
-
-	return ret;
+	return __ftrace_set_clr_event(match, sub, event, set);
 }
 
 /* 128 should be much more than enough */
@@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call;
 	char buf[2];
 	int set = -1;
-	int all = 0;
 	int ret;
 
-	if (system[0] == '*')
-		all = 1;
-
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
 		if (!call->name || !call->regfunc)
 			continue;
 
-		if (!all && strcmp(call->system, system) != 0)
+		if (system && strcmp(call->system, system) != 0)
 			continue;
 
 		/*
@@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 {
 	const char *system = filp->private_data;
 	unsigned long val;
-	char *command;
 	char buf[64];
 	ssize_t ret;
 
@@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret < 0)
 		return ret;
 
-	switch (val) {
-	case 0:
-	case 1:
-		break;
-
-	default:
+	if (val != 0 && val != 1)
 		return -EINVAL;
-	}
 
-	/* +3 for the ":*\0" */
-	command = kmalloc(strlen(system)+3, GFP_KERNEL);
-	if (!command)
-		return -ENOMEM;
-	sprintf(command, "%s:*", system);
-
-	ret = ftrace_set_clr_event(command, val);
+	ret = __ftrace_set_clr_event(NULL, system, NULL, val);
 	if (ret)
-		goto out_free;
+		goto out;
 
 	ret = cnt;
 
- out_free:
-	kfree(command);
-
+out:
 	*ppos += cnt;
 
 	return ret;
@@ -1181,7 +1172,7 @@ static __init int event_trace_init(void)
 			  &ftrace_show_header_fops);
 
 	trace_create_file("enable", 0644, d_events,
-			  "*", &ftrace_system_enable_fops);
+			  NULL, &ftrace_system_enable_fops);
 
 	for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
 		/* The linker may leave blanks */
@@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void)
 {
 	struct ftrace_event_call *call;
 	struct event_subsystem *system;
-	char *sysname;
 	int ret;
 
 	pr_info("Running tests on trace events:\n");
@@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void)
 
 		pr_info("Testing event system %s: ", system->name);
 
-		/* ftrace_set_clr_event can modify the name passed in. */
-		sysname = kstrdup(system->name, GFP_KERNEL);
-		if (WARN_ON(!sysname)) {
-			pr_warning("Can't allocate memory, giving up!\n");
-			return;
-		}
-		ret = ftrace_set_clr_event(sysname, 1);
-		kfree(sysname);
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
 		if (WARN_ON_ONCE(ret)) {
 			pr_warning("error enabling system %s\n",
 				   system->name);
@@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void)
 
 		event_test_stuff();
 
-		sysname = kstrdup(system->name, GFP_KERNEL);
-		if (WARN_ON(!sysname)) {
-			pr_warning("Can't allocate memory, giving up!\n");
-			return;
-		}
-		ret = ftrace_set_clr_event(sysname, 0);
-		kfree(sysname);
-
+		ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
 		if (WARN_ON_ONCE(ret))
 			pr_warning("error disabling system %s\n",
 				   system->name);
@@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void)
 	pr_info("Running tests on all trace events:\n");
 	pr_info("Testing all events: ");
 
-	sysname = kmalloc(4, GFP_KERNEL);
-	if (WARN_ON(!sysname)) {
-		pr_warning("Can't allocate memory, giving up!\n");
-		return;
-	}
-	memcpy(sysname, "*:*", 4);
-	ret = ftrace_set_clr_event(sysname, 1);
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
 	if (WARN_ON_ONCE(ret)) {
-		kfree(sysname);
 		pr_warning("error enabling all events\n");
 		return;
 	}
@@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void)
 	event_test_stuff();
 
 	/* reset sysname */
-	memcpy(sysname, "*:*", 4);
-	ret = ftrace_set_clr_event(sysname, 0);
-	kfree(sysname);
-
+	ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
 	if (WARN_ON_ONCE(ret)) {
 		pr_warning("error disabling all events\n");
 		return;
-- 
GitLab


From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 8 May 2009 10:32:05 +0800
Subject: [PATCH 0910/2198] tracing/events: simplify system_enable_read()

A smarter way to figure out the output of an enable file.

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 40 ++++++-------------------------------
 1 file changed, 6 insertions(+), 34 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 45f1099386b6b..df394bc6d54bb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -414,10 +414,11 @@ static ssize_t
 system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		   loff_t *ppos)
 {
+	const char set_to_char[4] = { '?', '0', '1', 'X' };
 	const char *system = filp->private_data;
 	struct ftrace_event_call *call;
 	char buf[2];
-	int set = -1;
+	int set = 0;
 	int ret;
 
 	mutex_lock(&event_mutex);
@@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 		 * or if all events or cleared, or if we have
 		 * a mixture.
 		 */
-		if (call->enabled) {
-			switch (set) {
-			case -1:
-				set = 1;
-				break;
-			case 0:
-				set = 2;
-				break;
-			}
-		} else {
-			switch (set) {
-			case -1:
-				set = 0;
-				break;
-			case 1:
-				set = 2;
-				break;
-			}
-		}
+		set |= (1 << !!call->enabled);
+
 		/*
 		 * If we have a mixture, no need to look further.
 		 */
-		if (set == 2)
+		if (set == 3)
 			break;
 	}
 	mutex_unlock(&event_mutex);
 
+	buf[0] = set_to_char[set];
 	buf[1] = '\n';
-	switch (set) {
-	case 0:
-		buf[0] = '0';
-		break;
-	case 1:
-		buf[0] = '1';
-		break;
-	case 2:
-		buf[0] = 'X';
-		break;
-	default:
-		buf[0] = '?';
-	}
 
 	ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
 
-- 
GitLab


From c3d480ded1584dc17f6e82f49af4155380a51dda Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 21:57:04 +0900
Subject: [PATCH 0911/2198] sh: TMU platform data for SH7786.

Wires up all 12 TMU channels, with TMU0 and 1 used as clockevent and
clocksource respectively.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 393 +++++++++++++++++++++++++
 1 file changed, 393 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
index 90e8cfff55fd4..2c464bf5a8990 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
@@ -3,6 +3,7 @@
  *
  * Copyright (C) 2009  Renesas Solutions Corp.
  * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ * Paul Mundt <paul.mundt@renesas.com>
  *
  * Based on SH7785 Setup
  *
@@ -19,6 +20,7 @@
 #include <linux/io.h>
 #include <linux/mm.h>
 #include <linux/dma-mapping.h>
+#include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
 static struct plat_sci_port sci_platform_data[] = {
@@ -69,6 +71,368 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffda0008,
+		.end	= 0xffda0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 20,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffda0014,
+		.end	= 0xffda001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 21,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffda0020,
+		.end	= 0xffda002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 22,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
+static struct sh_timer_config tmu6_platform_data = {
+	.name = "TMU6",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu6_resources[] = {
+	[0] = {
+		.name	= "TMU6",
+		.start	= 0xffdc0008,
+		.end	= 0xffdc0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 45,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu6_device = {
+	.name		= "sh_tmu",
+	.id		= 6,
+	.dev = {
+		.platform_data	= &tmu6_platform_data,
+	},
+	.resource	= tmu6_resources,
+	.num_resources	= ARRAY_SIZE(tmu6_resources),
+};
+
+static struct sh_timer_config tmu7_platform_data = {
+	.name = "TMU7",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu7_resources[] = {
+	[0] = {
+		.name	= "TMU7",
+		.start	= 0xffdc0014,
+		.end	= 0xffdc001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 45,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu7_device = {
+	.name		= "sh_tmu",
+	.id		= 7,
+	.dev = {
+		.platform_data	= &tmu7_platform_data,
+	},
+	.resource	= tmu7_resources,
+	.num_resources	= ARRAY_SIZE(tmu7_resources),
+};
+
+static struct sh_timer_config tmu8_platform_data = {
+	.name = "TMU8",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu8_resources[] = {
+	[0] = {
+		.name	= "TMU8",
+		.start	= 0xffdc0020,
+		.end	= 0xffdc002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 45,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu8_device = {
+	.name		= "sh_tmu",
+	.id		= 8,
+	.dev = {
+		.platform_data	= &tmu8_platform_data,
+	},
+	.resource	= tmu8_resources,
+	.num_resources	= ARRAY_SIZE(tmu8_resources),
+};
+
+static struct sh_timer_config tmu9_platform_data = {
+	.name = "TMU9",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu9_resources[] = {
+	[0] = {
+		.name	= "TMU9",
+		.start	= 0xffde0008,
+		.end	= 0xffde0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 46,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu9_device = {
+	.name		= "sh_tmu",
+	.id		= 9,
+	.dev = {
+		.platform_data	= &tmu9_platform_data,
+	},
+	.resource	= tmu9_resources,
+	.num_resources	= ARRAY_SIZE(tmu9_resources),
+};
+
+static struct sh_timer_config tmu10_platform_data = {
+	.name = "TMU10",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu10_resources[] = {
+	[0] = {
+		.name	= "TMU10",
+		.start	= 0xffde0014,
+		.end	= 0xffde001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 46,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu10_device = {
+	.name		= "sh_tmu",
+	.id		= 10,
+	.dev = {
+		.platform_data	= &tmu10_platform_data,
+	},
+	.resource	= tmu10_resources,
+	.num_resources	= ARRAY_SIZE(tmu10_resources),
+};
+
+static struct sh_timer_config tmu11_platform_data = {
+	.name = "TMU11",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu11_resources[] = {
+	[0] = {
+		.name	= "TMU11",
+		.start	= 0xffde0020,
+		.end	= 0xffde002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 46,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu11_device = {
+	.name		= "sh_tmu",
+	.id		= 11,
+	.dev = {
+		.platform_data	= &tmu11_platform_data,
+	},
+	.resource	= tmu11_resources,
+	.num_resources	= ARRAY_SIZE(tmu11_resources),
+};
+
 static struct resource usb_ohci_resources[] = {
 	[0] = {
 		.start	= 0xffe70400,
@@ -94,6 +458,21 @@ static struct platform_device usb_ohci_device = {
 	.resource	= usb_ohci_resources,
 };
 
+static struct platform_device *sh7786_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+	&tmu6_device,
+	&tmu7_device,
+	&tmu8_device,
+	&tmu9_device,
+	&tmu10_device,
+	&tmu11_device,
+};
+
 static struct platform_device *sh7786_devices[] __initdata = {
 	&sci_device,
 	&usb_ohci_device,
@@ -156,12 +535,26 @@ static void __init sh7786_usb_setup(void)
 
 static int __init sh7786_devices_setup(void)
 {
+	int ret;
+
 	sh7786_usb_setup();
+
+	ret = platform_add_devices(sh7786_early_devices,
+				   ARRAY_SIZE(sh7786_early_devices));
+	if (unlikely(ret != 0))
+		return ret;
+
 	return platform_add_devices(sh7786_devices,
 				    ARRAY_SIZE(sh7786_devices));
 }
 device_initcall(sh7786_devices_setup);
 
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7786_early_devices,
+				   ARRAY_SIZE(sh7786_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From ccdaeb4c8f982900a2abf722e34b60131394d8eb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 22:09:30 +0900
Subject: [PATCH 0912/2198] sh: TMU platform data for SH-X3 proto CPU.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-shx3.c | 209 ++++++++++++++++++++++++++-
 1 file changed, 207 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index bd35f32534b98..9d5185b42f13d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -1,7 +1,7 @@
 /*
- * SH-X3 Setup
+ * SH-X3 Prototype Setup
  *
- *  Copyright (C) 2007  Paul Mundt
+ *  Copyright (C) 2007 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -12,6 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/io.h>
+#include <linux/sh_timer.h>
 #include <asm/mmzone.h>
 
 static struct plat_sci_port sci_platform_data[] = {
@@ -48,17 +49,221 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffc10008,
+		.end	= 0xffc10013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffc10014,
+		.end	= 0xffc1001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffc10020,
+		.end	= 0xffc1002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffc20008,
+		.end	= 0xffc20013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 19,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffc20014,
+		.end	= 0xffc2001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 20,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffc20020,
+		.end	= 0xffc2002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 21,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
+static struct platform_device *shx3_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
 static struct platform_device *shx3_devices[] __initdata = {
 	&sci_device,
 };
 
 static int __init shx3_devices_setup(void)
 {
+	int ret;
+
+	ret = platform_add_devices(shx3_early_devices,
+				   ARRAY_SIZE(shx3_early_devices));
+	if (unlikely(ret != 0))
+		return ret;
+
 	return platform_add_devices(shx3_devices,
 				    ARRAY_SIZE(shx3_devices));
 }
 __initcall(shx3_devices_setup);
 
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(shx3_early_devices,
+				   ARRAY_SIZE(shx3_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From 7b551f9daa9bd9533ba4ce31622ed4be1dd97d3e Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 22:14:01 +0900
Subject: [PATCH 0913/2198] sh: Kill off the GENERIC_CALIBRATE_DELAY ifndef.

Now that everyone is using the clock framework directly and we
unconditionally provide our own calibrate_delay() function, having it
wrapped in an ifndef is no longer useful. So, kill it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/setup.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 38515a7959206..00086b234278c 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -156,7 +156,7 @@ static void __init reserve_crashkernel(void)
 			&crash_size, &crash_base);
 	if (ret == 0 && crash_size) {
 		if (crash_base <= 0) {
-			vp = alloc_bootmem_nopanic(crash_size); 
+			vp = alloc_bootmem_nopanic(crash_size);
 			if (!vp) {
 				printk(KERN_INFO "crashkernel allocation "
 				       "failed\n");
@@ -185,7 +185,6 @@ static inline void __init reserve_crashkernel(void)
 {}
 #endif
 
-#ifndef CONFIG_GENERIC_CALIBRATE_DELAY
 void __cpuinit calibrate_delay(void)
 {
 	struct clk *clk = clk_get(NULL, "cpu_clk");
@@ -201,7 +200,6 @@ void __cpuinit calibrate_delay(void)
 			 (loops_per_jiffy/(5000/HZ)) % 100,
 			 loops_per_jiffy);
 }
-#endif
 
 void __init __add_active_range(unsigned int nid, unsigned long start_pfn,
 						unsigned long end_pfn)
-- 
GitLab


From e552de2413edad1a7b0c7f82a2f2753e4f905d93 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:13:42 +0000
Subject: [PATCH 0914/2198] sh-sci: add platform device private data

This patch adds per-platform private data to the sh-sci driver.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 127 +++++++++++++++++++++++++++-------------
 1 file changed, 85 insertions(+), 42 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 728d6a062bf6d..af5da110d52bd 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -47,6 +47,7 @@
 #include <linux/clk.h>
 #include <linux/ctype.h>
 #include <linux/err.h>
+#include <linux/list.h>
 
 #ifdef CONFIG_SUPERH
 #include <asm/clock.h>
@@ -77,6 +78,16 @@ struct sci_port {
 #ifdef CONFIG_HAVE_CLK
 	/* Port clock */
 	struct clk		*clk;
+#endif
+	struct list_head	node;
+};
+
+struct sh_sci_priv {
+	spinlock_t lock;
+	struct list_head ports;
+
+#ifdef CONFIG_HAVE_CLK
+	struct notifier_block clk_nb;
 #endif
 };
 
@@ -726,19 +737,22 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr)
 static int sci_notifier(struct notifier_block *self,
 			unsigned long phase, void *p)
 {
-	int i;
+	struct sh_sci_priv *priv = container_of(self,
+						struct sh_sci_priv, clk_nb);
+	struct sci_port *sci_port;
+	unsigned long flags;
 
 	if ((phase == CPUFREQ_POSTCHANGE) ||
-	    (phase == CPUFREQ_RESUMECHANGE))
-		for (i = 0; i < SCI_NPORTS; i++) {
-			struct sci_port *s = &sci_ports[i];
-			s->port.uartclk = clk_get_rate(s->clk);
-		}
+	    (phase == CPUFREQ_RESUMECHANGE)) {
+		spin_lock_irqsave(&priv->lock, flags);
+		list_for_each_entry(sci_port, &priv->ports, node)
+			sci_port->port.uartclk = clk_get_rate(sci_port->clk);
+
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
 
 	return NOTIFY_OK;
 }
-
-static struct notifier_block sci_nb = { &sci_notifier, NULL, 0 };
 #endif
 
 static int sci_request_irq(struct sci_port *port)
@@ -1201,6 +1215,27 @@ static struct uart_driver sci_uart_driver = {
 	.cons		= SCI_CONSOLE,
 };
 
+
+static int __devexit sci_remove(struct platform_device *dev)
+{
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	struct sci_port *p;
+	unsigned long flags;
+
+#ifdef CONFIG_HAVE_CLK
+	cpufreq_unregister_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
+#endif
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(p, &priv->ports, node)
+		uart_remove_one_port(&sci_uart_driver, &p->port);
+
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	kfree(priv);
+	return 0;
+}
+
 /*
  * Register a set of serial devices attached to a platform device.  The
  * list is terminated with a zero flags entry, which means we expect
@@ -1210,7 +1245,22 @@ static struct uart_driver sci_uart_driver = {
 static int __devinit sci_probe(struct platform_device *dev)
 {
 	struct plat_sci_port *p = dev->dev.platform_data;
+	struct sh_sci_priv *priv;
 	int i, ret = -EINVAL;
+	unsigned long flags;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	INIT_LIST_HEAD(&priv->ports);
+	spin_lock_init(&priv->lock);
+	platform_set_drvdata(dev, priv);
+
+#ifdef CONFIG_HAVE_CLK
+	priv->clk_nb.notifier_call = sci_notifier;
+	cpufreq_register_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
+#endif
 
 	for (i = 0; p && p->flags != 0; p++, i++) {
 		struct sci_port *sciport = &sci_ports[i];
@@ -1254,12 +1304,19 @@ static int __devinit sci_probe(struct platform_device *dev)
 
 		memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
 
-		uart_add_one_port(&sci_uart_driver, &sciport->port);
-	}
+		ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 
-#ifdef CONFIG_HAVE_CLK
-	cpufreq_register_notifier(&sci_nb, CPUFREQ_TRANSITION_NOTIFIER);
-#endif
+		if (ret && (p->flags & UPF_IOREMAP)) {
+			iounmap(p->membase);
+			goto err_unreg;
+		}
+
+		INIT_LIST_HEAD(&sciport->node);
+
+		spin_lock_irqsave(&priv->lock, flags);
+		list_add(&sciport->node, &priv->ports);
+		spin_unlock_irqrestore(&priv->lock, flags);
+	}
 
 #ifdef CONFIG_SH_STANDARD_BIOS
 	sh_bios_gdb_detach();
@@ -1268,50 +1325,36 @@ static int __devinit sci_probe(struct platform_device *dev)
 	return 0;
 
 err_unreg:
-	for (i = i - 1; i >= 0; i--)
-		uart_remove_one_port(&sci_uart_driver, &sci_ports[i].port);
-
+	sci_remove(dev);
 	return ret;
 }
 
-static int __devexit sci_remove(struct platform_device *dev)
-{
-	int i;
-
-#ifdef CONFIG_HAVE_CLK
-	cpufreq_unregister_notifier(&sci_nb, CPUFREQ_TRANSITION_NOTIFIER);
-#endif
-
-	for (i = 0; i < SCI_NPORTS; i++)
-		uart_remove_one_port(&sci_uart_driver, &sci_ports[i].port);
-
-	return 0;
-}
-
 static int sci_suspend(struct platform_device *dev, pm_message_t state)
 {
-	int i;
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	struct sci_port *p;
+	unsigned long flags;
 
-	for (i = 0; i < SCI_NPORTS; i++) {
-		struct sci_port *p = &sci_ports[i];
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(p, &priv->ports, node)
+		uart_suspend_port(&sci_uart_driver, &p->port);
 
-		if (p->type != PORT_UNKNOWN && p->port.dev == &dev->dev)
-			uart_suspend_port(&sci_uart_driver, &p->port);
-	}
+	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return 0;
 }
 
 static int sci_resume(struct platform_device *dev)
 {
-	int i;
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	struct sci_port *p;
+	unsigned long flags;
 
-	for (i = 0; i < SCI_NPORTS; i++) {
-		struct sci_port *p = &sci_ports[i];
+	spin_lock_irqsave(&priv->lock, flags);
+	list_for_each_entry(p, &priv->ports, node)
+		uart_resume_port(&sci_uart_driver, &p->port);
 
-		if (p->type != PORT_UNKNOWN && p->port.dev == &dev->dev)
-			uart_resume_port(&sci_uart_driver, &p->port);
-	}
+	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return 0;
 }
-- 
GitLab


From 9080b72819650c3a757d173a19bc930d603b79d6 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:13:58 +0000
Subject: [PATCH 0915/2198] sh-sci: remove early_sci_setup()

Remove unused early_sci_setup() function from sh-sci.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c    | 14 --------------
 include/linux/serial_sci.h |  2 --
 2 files changed, 16 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index af5da110d52bd..5f37f7d31663e 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1082,20 +1082,6 @@ static void __init sci_init_ports(void)
 	}
 }
 
-int __init early_sci_setup(struct uart_port *port)
-{
-	if (unlikely(port->line > SCI_NPORTS))
-		return -ENODEV;
-
-	sci_init_ports();
-
-	sci_ports[port->line].port.membase	= port->membase;
-	sci_ports[port->line].port.mapbase	= port->mapbase;
-	sci_ports[port->line].port.type		= port->type;
-
-	return 0;
-}
-
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
 /*
  *	Print a string to the serial port trying not to disturb
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 893cc53486bc6..717059d6791f5 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -27,6 +27,4 @@ struct plat_sci_port {
 	upf_t		flags;			/* UPF_* flags */
 };
 
-int early_sci_setup(struct uart_port *port);
-
 #endif /* __LINUX_SERIAL_SCI_H */
-- 
GitLab


From dc8e6f5bfcd6a307a8196d3e41fd9798be5a1c76 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:06 +0000
Subject: [PATCH 0916/2198] sh-sci: rework serial console support

Rework sh-sci serial console code.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 47 ++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 5f37f7d31663e..d2cd1a400c12c 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -91,10 +91,6 @@ struct sh_sci_priv {
 #endif
 };
 
-#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
-static struct sci_port *serial_console_port;
-#endif
-
 /* Function prototypes */
 static void sci_stop_tx(struct uart_port *port);
 
@@ -1083,6 +1079,18 @@ static void __init sci_init_ports(void)
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+static struct tty_driver *serial_console_device(struct console *co, int *index)
+{
+	struct uart_driver *p = &sci_uart_driver;
+	*index = co->index;
+	return p->tty_driver;
+}
+
+static void serial_console_putchar(struct uart_port *port, int ch)
+{
+	sci_poll_put_char(port, ch);
+}
+
 /*
  *	Print a string to the serial port trying not to disturb
  *	any possible real use of the port...
@@ -1090,16 +1098,10 @@ static void __init sci_init_ports(void)
 static void serial_console_write(struct console *co, const char *s,
 				 unsigned count)
 {
-	struct uart_port *port = &serial_console_port->port;
+	struct uart_port *port = co->data;
 	unsigned short bits;
-	int i;
 
-	for (i = 0; i < count; i++) {
-		if (*s == 10)
-			sci_poll_put_char(port, '\r');
-
-		sci_poll_put_char(port, *s++);
-	}
+	uart_console_write(co->data, s, count, serial_console_putchar);
 
 	/* wait until fifo is empty and last bit has been transmitted */
 	bits = SCxSR_TDxE(port) | SCxSR_TEND(port);
@@ -1109,6 +1111,7 @@ static void serial_console_write(struct console *co, const char *s,
 
 static int __init serial_console_setup(struct console *co, char *options)
 {
+	struct sci_port *sci_port;
 	struct uart_port *port;
 	int baud = 115200;
 	int bits = 8;
@@ -1124,8 +1127,9 @@ static int __init serial_console_setup(struct console *co, char *options)
 	if (co->index >= SCI_NPORTS)
 		co->index = 0;
 
-	serial_console_port = &sci_ports[co->index];
-	port = &serial_console_port->port;
+	sci_port = &sci_ports[co->index];
+	port = &sci_port->port;
+	co->data = port;
 
 	/*
 	 * Also need to check port->type, we don't actually have any
@@ -1135,21 +1139,17 @@ static int __init serial_console_setup(struct console *co, char *options)
 	 */
 	if (!port->type)
 		return -ENODEV;
-	if (!port->membase || !port->mapbase)
-		return -ENODEV;
-
-	port->type = serial_console_port->type;
 
 #ifdef CONFIG_HAVE_CLK
-	if (!serial_console_port->clk)
-		serial_console_port->clk = clk_get(NULL, "module_clk");
+	if (!sci_port->clk)
+		sci_port->clk = clk_get(NULL, "module_clk");
 #endif
 
 	if (port->flags & UPF_IOREMAP)
 		sci_config_port(port, 0);
 
-	if (serial_console_port->enable)
-		serial_console_port->enable(port);
+	if (sci_port->enable)
+		sci_port->enable(port);
 
 	if (options)
 		uart_parse_options(options, &baud, &parity, &bits, &flow);
@@ -1165,12 +1165,11 @@ static int __init serial_console_setup(struct console *co, char *options)
 
 static struct console serial_console = {
 	.name		= "ttySC",
-	.device		= uart_console_device,
+	.device		= serial_console_device,
 	.write		= serial_console_write,
 	.setup		= serial_console_setup,
 	.flags		= CON_PRINTBUFFER,
 	.index		= -1,
-	.data		= &sci_uart_driver,
 };
 
 static int __init sci_console_init(void)
-- 
GitLab


From a5660adae85918f2ab6b10ab58e2f574c1bd5ce1 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:38 +0000
Subject: [PATCH 0917/2198] sh-sci: use to_sci_port() if possible

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index d2cd1a400c12c..17fa7f17bbe3d 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -618,7 +618,7 @@ static inline int sci_handle_breaks(struct uart_port *port)
 	int copied = 0;
 	unsigned short status = sci_in(port, SCxSR);
 	struct tty_struct *tty = port->info->port.tty;
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	if (uart_handle_break(port))
 		return 0;
@@ -875,7 +875,7 @@ static void sci_break_ctl(struct uart_port *port, int break_state)
 
 static int sci_startup(struct uart_port *port)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	if (s->enable)
 		s->enable(port);
@@ -893,7 +893,7 @@ static int sci_startup(struct uart_port *port)
 
 static void sci_shutdown(struct uart_port *port)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	sci_stop_rx(port);
 	sci_stop_tx(port);
@@ -990,7 +990,7 @@ static int sci_request_port(struct uart_port *port)
 
 static void sci_config_port(struct uart_port *port, int flags)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	port->type = s->type;
 
@@ -1002,7 +1002,7 @@ static void sci_config_port(struct uart_port *port, int flags)
 
 static int sci_verify_port(struct uart_port *port, struct serial_struct *ser)
 {
-	struct sci_port *s = &sci_ports[port->line];
+	struct sci_port *s = to_sci_port(port);
 
 	if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > nr_irqs)
 		return -EINVAL;
-- 
GitLab


From 0ee70712922c15252183db8b50a7e369c96017c0 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:13:50 +0000
Subject: [PATCH 0918/2198] sh-sci: allow single port platform devices

Allow registration of single port sh-sci platform devices.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 125 +++++++++++++++++++++++-----------------
 1 file changed, 72 insertions(+), 53 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 17fa7f17bbe3d..e9b350c58ba8a 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1221,6 +1221,70 @@ static int __devexit sci_remove(struct platform_device *dev)
 	return 0;
 }
 
+static int __devinit sci_probe_single(struct platform_device *dev,
+				      unsigned int index,
+				      struct plat_sci_port *p,
+				      struct sci_port *sciport)
+{
+	struct sh_sci_priv *priv = platform_get_drvdata(dev);
+	unsigned long flags;
+	int ret;
+
+	/* Sanity check */
+	if (unlikely(index >= SCI_NPORTS)) {
+		dev_notice(&dev->dev, "Attempting to register port "
+			   "%d when only %d are available.\n",
+			   index+1, SCI_NPORTS);
+		dev_notice(&dev->dev, "Consider bumping "
+			   "CONFIG_SERIAL_SH_SCI_NR_UARTS!\n");
+		return 0;
+	}
+
+	sciport->port.mapbase	= p->mapbase;
+
+	if (p->mapbase && !p->membase) {
+		if (p->flags & UPF_IOREMAP) {
+			p->membase = ioremap_nocache(p->mapbase, 0x40);
+			if (IS_ERR(p->membase))
+				return PTR_ERR(p->membase);
+		} else {
+			/*
+			 * For the simple (and majority of) cases
+			 * where we don't need to do any remapping,
+			 * just cast the cookie directly.
+			 */
+			p->membase = (void __iomem *)p->mapbase;
+		}
+	}
+
+	sciport->port.membase	= p->membase;
+
+	sciport->port.irq	= p->irqs[SCIx_TXI_IRQ];
+	sciport->port.flags	= p->flags;
+	sciport->port.dev	= &dev->dev;
+
+	sciport->type		= sciport->port.type = p->type;
+
+	memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
+
+	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
+
+	if (ret) {
+		if (p->flags & UPF_IOREMAP)
+			iounmap(p->membase);
+
+		return ret;
+	}
+
+	INIT_LIST_HEAD(&sciport->node);
+
+	spin_lock_irqsave(&priv->lock, flags);
+	list_add(&sciport->node, &priv->ports);
+	spin_unlock_irqrestore(&priv->lock, flags);
+
+	return 0;
+}
+
 /*
  * Register a set of serial devices attached to a platform device.  The
  * list is terminated with a zero flags entry, which means we expect
@@ -1232,7 +1296,6 @@ static int __devinit sci_probe(struct platform_device *dev)
 	struct plat_sci_port *p = dev->dev.platform_data;
 	struct sh_sci_priv *priv;
 	int i, ret = -EINVAL;
-	unsigned long flags;
 
 	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
 	if (!priv)
@@ -1247,60 +1310,16 @@ static int __devinit sci_probe(struct platform_device *dev)
 	cpufreq_register_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER);
 #endif
 
-	for (i = 0; p && p->flags != 0; p++, i++) {
-		struct sci_port *sciport = &sci_ports[i];
-
-		/* Sanity check */
-		if (unlikely(i == SCI_NPORTS)) {
-			dev_notice(&dev->dev, "Attempting to register port "
-				   "%d when only %d are available.\n",
-				   i+1, SCI_NPORTS);
-			dev_notice(&dev->dev, "Consider bumping "
-				   "CONFIG_SERIAL_SH_SCI_NR_UARTS!\n");
-			break;
-		}
-
-		sciport->port.mapbase	= p->mapbase;
-
-		if (p->mapbase && !p->membase) {
-			if (p->flags & UPF_IOREMAP) {
-				p->membase = ioremap_nocache(p->mapbase, 0x40);
-				if (IS_ERR(p->membase)) {
-					ret = PTR_ERR(p->membase);
-					goto err_unreg;
-				}
-			} else {
-				/*
-				 * For the simple (and majority of) cases
-				 * where we don't need to do any remapping,
-				 * just cast the cookie directly.
-				 */
-				p->membase = (void __iomem *)p->mapbase;
-			}
-		}
-
-		sciport->port.membase	= p->membase;
-
-		sciport->port.irq	= p->irqs[SCIx_TXI_IRQ];
-		sciport->port.flags	= p->flags;
-		sciport->port.dev	= &dev->dev;
-
-		sciport->type		= sciport->port.type = p->type;
-
-		memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
-
-		ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
-
-		if (ret && (p->flags & UPF_IOREMAP)) {
-			iounmap(p->membase);
+	if (dev->id != -1) {
+		ret = sci_probe_single(dev, dev->id, p, &sci_ports[dev->id]);
+		if (ret)
 			goto err_unreg;
+	} else {
+		for (i = 0; p && p->flags != 0; p++, i++) {
+			ret = sci_probe_single(dev, i, p, &sci_ports[i]);
+			if (ret)
+				goto err_unreg;
 		}
-
-		INIT_LIST_HEAD(&sciport->node);
-
-		spin_lock_irqsave(&priv->lock, flags);
-		list_add(&sciport->node, &priv->ports);
-		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
 #ifdef CONFIG_SH_STANDARD_BIOS
-- 
GitLab


From 7ed7e0711b3ff85b3e15591081b42f2af96d584b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:14 +0000
Subject: [PATCH 0919/2198] sh-sci: replace sci_init_ports()

Replace sci_init_ports() with sci_init_single().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 107 +++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 57 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index e9b350c58ba8a..039c700ce1ec9 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1036,46 +1036,64 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static void __init sci_init_ports(void)
+static int __devinit sci_init_single(struct sci_port *sci_port,
+				     unsigned int index,
+				     struct plat_sci_port *p)
 {
-	static int first = 1;
-	int i;
-
-	if (!first)
-		return;
-
-	first = 0;
-
-	for (i = 0; i < SCI_NPORTS; i++) {
-		sci_ports[i].port.ops		= &sci_uart_ops;
-		sci_ports[i].port.iotype	= UPIO_MEM;
-		sci_ports[i].port.line		= i;
-		sci_ports[i].port.fifosize	= 1;
+	sci_port->port.ops	= &sci_uart_ops;
+	sci_port->port.iotype	= UPIO_MEM;
+	sci_port->port.line	= index;
+	sci_port->port.fifosize	= 1;
 
 #if defined(__H8300H__) || defined(__H8300S__)
 #ifdef __H8300S__
-		sci_ports[i].enable	= h8300_sci_enable;
-		sci_ports[i].disable	= h8300_sci_disable;
+	sci_port->enable	= h8300_sci_enable;
+	sci_port->disable	= h8300_sci_disable;
 #endif
-		sci_ports[i].port.uartclk = CONFIG_CPU_CLOCK;
+	sci_port->port.uartclk	= CONFIG_CPU_CLOCK;
 #elif defined(CONFIG_HAVE_CLK)
-		/*
-		 * XXX: We should use a proper SCI/SCIF clock
-		 */
-		{
-			struct clk *clk = clk_get(NULL, "module_clk");
-			sci_ports[i].port.uartclk = clk_get_rate(clk);
-			clk_put(clk);
-		}
+	/*
+	 * XXX: We should use a proper SCI/SCIF clock
+	 */
+	{
+		struct clk *clk = clk_get(NULL, "module_clk");
+		sci_port->port.uartclk = clk_get_rate(clk);
+		clk_put(clk);
+	}
 #else
 #error "Need a valid uartclk"
 #endif
 
-		sci_ports[i].break_timer.data = (unsigned long)&sci_ports[i];
-		sci_ports[i].break_timer.function = sci_break_timer;
+	sci_port->break_timer.data = (unsigned long)sci_port;
+	sci_port->break_timer.function = sci_break_timer;
+	init_timer(&sci_port->break_timer);
+
+	sci_port->port.mapbase	= p->mapbase;
 
-		init_timer(&sci_ports[i].break_timer);
+	if (p->mapbase && !p->membase) {
+		if (p->flags & UPF_IOREMAP) {
+			p->membase = ioremap_nocache(p->mapbase, 0x40);
+			if (IS_ERR(p->membase))
+				return PTR_ERR(p->membase);
+		} else {
+			/*
+			 * For the simple (and majority of) cases
+			 * where we don't need to do any remapping,
+			 * just cast the cookie directly.
+			 */
+			p->membase = (void __iomem *)p->mapbase;
+		}
 	}
+
+	sci_port->port.membase	= p->membase;
+
+	sci_port->port.irq	= p->irqs[SCIx_TXI_IRQ];
+	sci_port->port.flags	= p->flags;
+	sci_port->type		= sci_port->port.type = p->type;
+
+	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
+
+	return 0;
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1174,7 +1192,6 @@ static struct console serial_console = {
 
 static int __init sci_console_init(void)
 {
-	sci_init_ports();
 	register_console(&serial_console);
 	return 0;
 }
@@ -1240,32 +1257,10 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 		return 0;
 	}
 
-	sciport->port.mapbase	= p->mapbase;
-
-	if (p->mapbase && !p->membase) {
-		if (p->flags & UPF_IOREMAP) {
-			p->membase = ioremap_nocache(p->mapbase, 0x40);
-			if (IS_ERR(p->membase))
-				return PTR_ERR(p->membase);
-		} else {
-			/*
-			 * For the simple (and majority of) cases
-			 * where we don't need to do any remapping,
-			 * just cast the cookie directly.
-			 */
-			p->membase = (void __iomem *)p->mapbase;
-		}
-	}
-
-	sciport->port.membase	= p->membase;
-
-	sciport->port.irq	= p->irqs[SCIx_TXI_IRQ];
-	sciport->port.flags	= p->flags;
-	sciport->port.dev	= &dev->dev;
-
-	sciport->type		= sciport->port.type = p->type;
-
-	memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs));
+	sciport->port.dev = &dev->dev;
+	ret = sci_init_single(sciport, index, p);
+	if (ret)
+		return ret;
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 
@@ -1380,8 +1375,6 @@ static int __init sci_init(void)
 
 	printk(banner);
 
-	sci_init_ports();
-
 	ret = uart_register_driver(&sci_uart_driver);
 	if (likely(ret == 0)) {
 		ret = platform_driver_register(&sci_driver);
-- 
GitLab


From 08f8cb315fdf9195b472aeb440ae65b189b151da Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:22 +0000
Subject: [PATCH 0920/2198] sh-sci: ioremap() in a single place

Handle ioremap() in sci_config_port only.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 54 +++++++++++++++--------------------------
 1 file changed, 20 insertions(+), 34 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 039c700ce1ec9..408624ae7fecb 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -994,9 +994,21 @@ static void sci_config_port(struct uart_port *port, int flags)
 
 	port->type = s->type;
 
-	if (port->flags & UPF_IOREMAP && !port->membase) {
+	if (port->membase)
+		return;
+
+	if (port->flags & UPF_IOREMAP) {
 		port->membase = ioremap_nocache(port->mapbase, 0x40);
-		dev_err(port->dev, "can't remap port#%d\n", port->line);
+
+		if (IS_ERR(port->membase))
+			dev_err(port->dev, "can't remap port#%d\n", port->line);
+	} else {
+		/*
+		 * For the simple (and majority of) cases where we don't
+		 * need to do any remapping, just cast the cookie
+		 * directly.
+		 */
+		port->membase = (void __iomem *)port->mapbase;
 	}
 }
 
@@ -1036,9 +1048,9 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static int __devinit sci_init_single(struct sci_port *sci_port,
-				     unsigned int index,
-				     struct plat_sci_port *p)
+static void __devinit sci_init_single(struct sci_port *sci_port,
+				      unsigned int index,
+				      struct plat_sci_port *p)
 {
 	sci_port->port.ops	= &sci_uart_ops;
 	sci_port->port.iotype	= UPIO_MEM;
@@ -1069,22 +1081,6 @@ static int __devinit sci_init_single(struct sci_port *sci_port,
 	init_timer(&sci_port->break_timer);
 
 	sci_port->port.mapbase	= p->mapbase;
-
-	if (p->mapbase && !p->membase) {
-		if (p->flags & UPF_IOREMAP) {
-			p->membase = ioremap_nocache(p->mapbase, 0x40);
-			if (IS_ERR(p->membase))
-				return PTR_ERR(p->membase);
-		} else {
-			/*
-			 * For the simple (and majority of) cases
-			 * where we don't need to do any remapping,
-			 * just cast the cookie directly.
-			 */
-			p->membase = (void __iomem *)p->mapbase;
-		}
-	}
-
 	sci_port->port.membase	= p->membase;
 
 	sci_port->port.irq	= p->irqs[SCIx_TXI_IRQ];
@@ -1092,8 +1088,6 @@ static int __devinit sci_init_single(struct sci_port *sci_port,
 	sci_port->type		= sci_port->port.type = p->type;
 
 	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
-
-	return 0;
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1163,8 +1157,7 @@ static int __init serial_console_setup(struct console *co, char *options)
 		sci_port->clk = clk_get(NULL, "module_clk");
 #endif
 
-	if (port->flags & UPF_IOREMAP)
-		sci_config_port(port, 0);
+	sci_config_port(port, 0);
 
 	if (sci_port->enable)
 		sci_port->enable(port);
@@ -1258,18 +1251,11 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 	}
 
 	sciport->port.dev = &dev->dev;
-	ret = sci_init_single(sciport, index, p);
-	if (ret)
-		return ret;
+	sci_init_single(sciport, index, p);
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
-
-	if (ret) {
-		if (p->flags & UPF_IOREMAP)
-			iounmap(p->membase);
-
+	if (ret)
 		return ret;
-	}
 
 	INIT_LIST_HEAD(&sciport->node);
 
-- 
GitLab


From 501b825d01efb93766c87d29f299851152cf4eb0 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 21 Jan 2009 15:14:30 +0000
Subject: [PATCH 0921/2198] sh-sci: improve clock framework support

Use enable/disable hooks for clock framework integration.
Make sure we control the clock for the serial console as well.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c    | 77 +++++++++++++++++++++++---------------
 include/linux/serial_sci.h |  1 +
 2 files changed, 47 insertions(+), 31 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 408624ae7fecb..3daf76725ac6f 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -76,8 +76,10 @@ struct sci_port {
 	int			break_flag;
 
 #ifdef CONFIG_HAVE_CLK
-	/* Port clock */
-	struct clk		*clk;
+	/* Interface clock */
+	struct clk		*iclk;
+	/* Data clock */
+	struct clk		*dclk;
 #endif
 	struct list_head	node;
 };
@@ -166,12 +168,12 @@ static void h8300_sci_config(struct uart_port *port, unsigned int ctrl)
 		*mstpcrl &= ~mask;
 }
 
-static inline void h8300_sci_enable(struct uart_port *port)
+static void h8300_sci_enable(struct uart_port *port)
 {
 	h8300_sci_config(port, sci_enable);
 }
 
-static inline void h8300_sci_disable(struct uart_port *port)
+static void h8300_sci_disable(struct uart_port *port)
 {
 	h8300_sci_config(port, sci_disable);
 }
@@ -742,13 +744,34 @@ static int sci_notifier(struct notifier_block *self,
 	    (phase == CPUFREQ_RESUMECHANGE)) {
 		spin_lock_irqsave(&priv->lock, flags);
 		list_for_each_entry(sci_port, &priv->ports, node)
-			sci_port->port.uartclk = clk_get_rate(sci_port->clk);
+			sci_port->port.uartclk = clk_get_rate(sci_port->dclk);
 
 		spin_unlock_irqrestore(&priv->lock, flags);
 	}
 
 	return NOTIFY_OK;
 }
+
+static void sci_clk_enable(struct uart_port *port)
+{
+	struct sci_port *sci_port = to_sci_port(port);
+
+	clk_enable(sci_port->dclk);
+	sci_port->port.uartclk = clk_get_rate(sci_port->dclk);
+
+	if (sci_port->iclk)
+		clk_enable(sci_port->iclk);
+}
+
+static void sci_clk_disable(struct uart_port *port)
+{
+	struct sci_port *sci_port = to_sci_port(port);
+
+	if (sci_port->iclk)
+		clk_disable(sci_port->iclk);
+
+	clk_disable(sci_port->dclk);
+}
 #endif
 
 static int sci_request_irq(struct sci_port *port)
@@ -880,10 +903,6 @@ static int sci_startup(struct uart_port *port)
 	if (s->enable)
 		s->enable(port);
 
-#ifdef CONFIG_HAVE_CLK
-	s->clk = clk_get(NULL, "module_clk");
-#endif
-
 	sci_request_irq(s);
 	sci_start_tx(port);
 	sci_start_rx(port, 1);
@@ -901,11 +920,6 @@ static void sci_shutdown(struct uart_port *port)
 
 	if (s->disable)
 		s->disable(port);
-
-#ifdef CONFIG_HAVE_CLK
-	clk_put(s->clk);
-	s->clk = NULL;
-#endif
 }
 
 static void sci_set_termios(struct uart_port *port, struct ktermios *termios,
@@ -1048,7 +1062,8 @@ static struct uart_ops sci_uart_ops = {
 #endif
 };
 
-static void __devinit sci_init_single(struct sci_port *sci_port,
+static void __devinit sci_init_single(struct platform_device *dev,
+				      struct sci_port *sci_port,
 				      unsigned int index,
 				      struct plat_sci_port *p)
 {
@@ -1064,14 +1079,10 @@ static void __devinit sci_init_single(struct sci_port *sci_port,
 #endif
 	sci_port->port.uartclk	= CONFIG_CPU_CLOCK;
 #elif defined(CONFIG_HAVE_CLK)
-	/*
-	 * XXX: We should use a proper SCI/SCIF clock
-	 */
-	{
-		struct clk *clk = clk_get(NULL, "module_clk");
-		sci_port->port.uartclk = clk_get_rate(clk);
-		clk_put(clk);
-	}
+	sci_port->iclk		= p->clk ? clk_get(&dev->dev, p->clk) : NULL;
+	sci_port->dclk		= clk_get(&dev->dev, "module_clk");
+	sci_port->enable	= sci_clk_enable;
+	sci_port->disable	= sci_clk_disable;
 #else
 #error "Need a valid uartclk"
 #endif
@@ -1085,9 +1096,11 @@ static void __devinit sci_init_single(struct sci_port *sci_port,
 
 	sci_port->port.irq	= p->irqs[SCIx_TXI_IRQ];
 	sci_port->port.flags	= p->flags;
+	sci_port->port.dev	= &dev->dev;
 	sci_port->type		= sci_port->port.type = p->type;
 
 	memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs));
+
 }
 
 #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
@@ -1111,14 +1124,21 @@ static void serial_console_write(struct console *co, const char *s,
 				 unsigned count)
 {
 	struct uart_port *port = co->data;
+	struct sci_port *sci_port = to_sci_port(port);
 	unsigned short bits;
 
-	uart_console_write(co->data, s, count, serial_console_putchar);
+	if (sci_port->enable)
+		sci_port->enable(port);
+
+	uart_console_write(port, s, count, serial_console_putchar);
 
 	/* wait until fifo is empty and last bit has been transmitted */
 	bits = SCxSR_TDxE(port) | SCxSR_TEND(port);
 	while ((sci_in(port, SCxSR) & bits) != bits)
 		cpu_relax();
+
+	if (sci_port->disable);
+		sci_port->disable(port);
 }
 
 static int __init serial_console_setup(struct console *co, char *options)
@@ -1152,11 +1172,6 @@ static int __init serial_console_setup(struct console *co, char *options)
 	if (!port->type)
 		return -ENODEV;
 
-#ifdef CONFIG_HAVE_CLK
-	if (!sci_port->clk)
-		sci_port->clk = clk_get(NULL, "module_clk");
-#endif
-
 	sci_config_port(port, 0);
 
 	if (sci_port->enable)
@@ -1171,6 +1186,7 @@ static int __init serial_console_setup(struct console *co, char *options)
 	if (ret == 0)
 		sci_stop_rx(port);
 #endif
+	/* TODO: disable clock */
 	return ret;
 }
 
@@ -1250,8 +1266,7 @@ static int __devinit sci_probe_single(struct platform_device *dev,
 		return 0;
 	}
 
-	sciport->port.dev = &dev->dev;
-	sci_init_single(sciport, index, p);
+	sci_init_single(dev, sciport, index, p);
 
 	ret = uart_add_one_port(&sci_uart_driver, &sciport->port);
 	if (ret)
diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h
index 717059d6791f5..1c297ddc9d5a4 100644
--- a/include/linux/serial_sci.h
+++ b/include/linux/serial_sci.h
@@ -25,6 +25,7 @@ struct plat_sci_port {
 	unsigned int	irqs[SCIx_NR_IRQS];	/* ERI, RXI, TXI, BRI */
 	unsigned int	type;			/* SCI / SCIF / IRDA */
 	upf_t		flags;			/* UPF_* flags */
+	char		*clk;			/* clock string */
 };
 
 #endif /* __LINUX_SERIAL_SCI_H */
-- 
GitLab


From 3b226e15beb5ecf068738e796811afd1e5b3f81f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 23:28:54 +0900
Subject: [PATCH 0922/2198] sh: Add clock id to sh-sci platform data on
 SH-Mobile CPUs.

This adds the clock specifier to all of the SH-Mobile sh-sci ports.
Impacted CPUs are SH7343/SH7366/SH7722/SH7723/SH7724.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 4 ++++
 arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 1 +
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 3 +++
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 6 ++++++
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 6 ++++++
 5 files changed, 20 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index f327b7eaf47c0..b9c361e8cc822 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -179,21 +179,25 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	}, {
 		.mapbase	= 0xffe10000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	}, {
 		.mapbase	= 0xffe20000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	}, {
 		.mapbase	= 0xffe30000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 83, 83, 83, 83 },
+		.clk		= "scif3",
 	}, {
 		.flags = 0,
 	}
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index 7361ea989b96f..d4a994be4a7d8 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -186,6 +186,7 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	}, {
 		.flags = 0,
 	}
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index b1c3e7f9284a9..5d6247fecd63c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -307,18 +307,21 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	},
 	{
 		.mapbase	= 0xffe10000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	},
 	{
 		.mapbase	= 0xffe20000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	},
 	{
 		.flags = 0,
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 484ca2dabdeb5..1429fc5e42866 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -321,31 +321,37 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	},{
 		.mapbase        = 0xffe10000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	},{
 		.mapbase        = 0xffe20000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	},{
 		.mapbase	= 0xa4e30000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 56, 56, 56, 56 },
+		.clk		= "scif3",
 	},{
 		.mapbase	= 0xa4e40000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 88, 88, 88, 88 },
+		.clk		= "scif4",
 	},{
 		.mapbase	= 0xa4e50000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 109, 109, 109, 109 },
+		.clk		= "scif5",
 	}, {
 		.flags = 0,
 	}
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index e074951b88b9d..a3039ce1a6a59 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -30,31 +30,37 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 80, 80, 80, 80 },
+		.clk		= "scif0",
 	}, {
 		.mapbase        = 0xffe10000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 81, 81, 81, 81 },
+		.clk		= "scif1",
 	}, {
 		.mapbase        = 0xffe20000,
 		.flags          = UPF_BOOT_AUTOCONF,
 		.type           = PORT_SCIF,
 		.irqs           = { 82, 82, 82, 82 },
+		.clk		= "scif2",
 	}, {
 		.mapbase	= 0xa4e30000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 56, 56, 56, 56 },
+		.clk		= "scif3",
 	}, {
 		.mapbase	= 0xa4e40000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 88, 88, 88, 88 },
+		.clk		= "scif4",
 	}, {
 		.mapbase	= 0xa4e50000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIFA,
 		.irqs		= { 109, 109, 109, 109 },
+		.clk		= "scif5",
 	}, {
 		.flags = 0,
 	}
-- 
GitLab


From 54507f6ee99778a727ff1b38a1f4050fe6479835 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 8 May 2009 23:48:33 +0900
Subject: [PATCH 0923/2198] serial: sh-sci: Fix up section mismatch in error
 path.

The sci_probe_single() path attempts to use sci_remove() for the error
path, while sci_remove() is still flagged as __devexit. So, we simply
discard the section annotation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 3daf76725ac6f..686e4a456e32d 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1227,7 +1227,7 @@ static struct uart_driver sci_uart_driver = {
 };
 
 
-static int __devexit sci_remove(struct platform_device *dev)
+static int sci_remove(struct platform_device *dev)
 {
 	struct sh_sci_priv *priv = platform_get_drvdata(dev);
 	struct sci_port *p;
-- 
GitLab


From 168f36237b16e2b3159e24c7d3b658e3c912d149 Mon Sep 17 00:00:00 2001
From: Yoshinori Sato <ysato@users.sourceforge.jp>
Date: Tue, 28 Apr 2009 04:40:15 +0000
Subject: [PATCH 0924/2198] serial: sh-sci: Fix up h8300 support.

- Dummy SCIF functions define.
- h8300 specific header include.

Signed-off-by: Yoshinori Sato <ysato@users.sourceforge.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c |  4 ++++
 drivers/serial/sh-sci.h | 17 +++++++++++++++--
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 686e4a456e32d..4e3248d586020 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -54,6 +54,10 @@
 #include <asm/sh_bios.h>
 #endif
 
+#ifdef CONFIG_H8300
+#include <asm/gpio.h>
+#endif
+
 #include "sh-sci.h"
 
 struct sci_port {
diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index 84cc6512f0812..e3eae4eaaddf8 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -317,7 +317,18 @@
     }									\
   }
 
-#define CPU_SCIF_FNS(name, scif_offset, scif_size)				\
+#ifdef CONFIG_H8300
+/* h8300 don't have SCIF */
+#define CPU_SCIF_FNS(name)						\
+  static inline unsigned int sci_##name##_in(struct uart_port *port)	\
+  {									\
+    return 0;								\
+  }									\
+  static inline void sci_##name##_out(struct uart_port *port, unsigned int value) \
+  {									\
+  }
+#else
+#define CPU_SCIF_FNS(name, scif_offset, scif_size)			\
   static inline unsigned int sci_##name##_in(struct uart_port *port)	\
   {									\
     SCI_IN(scif_size, scif_offset);					\
@@ -326,6 +337,7 @@
   {									\
     SCI_OUT(scif_size, scif_offset, value);				\
   }
+#endif
 
 #define CPU_SCI_FNS(name, sci_offset, sci_size)				\
   static inline unsigned int sci_##name##_in(struct uart_port* port)	\
@@ -363,7 +375,8 @@
 		 sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size, \
                  h8_sci_offset, h8_sci_size) \
   CPU_SCI_FNS(name, h8_sci_offset, h8_sci_size)
-#define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size)
+#define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size) \
+  CPU_SCIF_FNS(name)
 #elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\
       defined(CONFIG_CPU_SUBTYPE_SH7724)
         #define SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) \
-- 
GitLab


From be6514c6295cc79498eeb9a8f933451082ca9e69 Mon Sep 17 00:00:00 2001
From: Kieran Bingham <kieranbingham@gmail.com>
Date: Fri, 8 May 2009 15:48:15 +0100
Subject: [PATCH 0925/2198] sh: Add in some ptrace definitions from GDB.

Plugs in PT_TEXT_END_ADDR/PT_TEXT_ADDR/PT_DATA_ADDR/PT_TEXT_LEN
definitions.

Signed-off-by: Kieran Bingham <kieranbingham@gmail.com>
Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/ptrace.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/sh/include/asm/ptrace.h b/arch/sh/include/asm/ptrace.h
index 68e20ff9aa9bb..1dc12cb44a2d7 100644
--- a/arch/sh/include/asm/ptrace.h
+++ b/arch/sh/include/asm/ptrace.h
@@ -102,6 +102,11 @@ struct pt_dspregs {
 #define	PTRACE_GETDSPREGS	55	/* DSP registers */
 #define	PTRACE_SETDSPREGS	56
 
+#define PT_TEXT_END_ADDR 	240
+#define PT_TEXT_ADDR 		244	/* &(struct user)->start_code */
+#define PT_DATA_ADDR 		248	/* &(struct user)->start_data */
+#define PT_TEXT_LEN		252
+
 #ifdef __KERNEL__
 #include <asm/addrspace.h>
 
-- 
GitLab


From e73173dbe55e5b4c2306728aad50c8e42194f6d5 Mon Sep 17 00:00:00 2001
From: Kieran Bingham <kieranbingham@gmail.com>
Date: Fri, 8 May 2009 15:49:50 +0100
Subject: [PATCH 0926/2198] sh: Fix UBC setup and registers for SH2A

Signed-off-by: Kieran Bingham <kieranbingham@gmail.com>
Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/ubc.h          | 11 +++++++++++
 arch/sh/include/cpu-sh2a/cpu/ubc.h | 29 ++++++++++++++++++++++++++++-
 arch/sh/kernel/process_32.c        |  4 +++-
 3 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/arch/sh/include/asm/ubc.h b/arch/sh/include/asm/ubc.h
index a7b9028bbfbb2..4ca4b77173713 100644
--- a/arch/sh/include/asm/ubc.h
+++ b/arch/sh/include/asm/ubc.h
@@ -42,12 +42,23 @@
 
 #define BRCR_CMFA		(1 << 15)
 #define BRCR_CMFB		(1 << 14)
+
+#if defined CONFIG_CPU_SH2A
+#define BRCR_CMFCA		(1 << 15)
+#define BRCR_CMFCB		(1 << 14)
+#define BRCR_CMFDA		(1 << 13)
+#define BRCR_CMFDB		(1 << 12)
+#define BRCR_PCBB		(1 << 6)	/* 1: after execution */
+#define BRCR_PCBA		(1 << 5)	/* 1: after execution */
+#define BRCR_PCTE		0
+#else
 #define BRCR_PCTE		(1 << 11)
 #define BRCR_PCBA		(1 << 10)	/* 1: after execution */
 #define BRCR_DBEB		(1 << 7)
 #define BRCR_PCBB		(1 << 6)
 #define BRCR_SEQ		(1 << 3)
 #define BRCR_UBDE		(1 << 0)
+#endif
 
 #ifndef __ASSEMBLY__
 /* arch/sh/kernel/cpu/ubc.S */
diff --git a/arch/sh/include/cpu-sh2a/cpu/ubc.h b/arch/sh/include/cpu-sh2a/cpu/ubc.h
index 8ce2fc1cf625a..1192e1c761a71 100644
--- a/arch/sh/include/cpu-sh2a/cpu/ubc.h
+++ b/arch/sh/include/cpu-sh2a/cpu/ubc.h
@@ -1 +1,28 @@
-#include <cpu-sh2/cpu/ubc.h>
+/*
+ * SH-2A UBC definitions
+ *
+ * Copyright (C) 2008 Kieran Bingham
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#ifndef __ASM_CPU_SH2A_UBC_H
+#define __ASM_CPU_SH2A_UBC_H
+
+#define UBC_BARA                0xfffc0400
+#define UBC_BAMRA               0xfffc0404
+#define UBC_BBRA                0xfffc04a0	/* 16 bit access */
+#define UBC_BDRA                0xfffc0408
+#define UBC_BDMRA               0xfffc040c
+
+#define UBC_BARB                0xfffc0410
+#define UBC_BAMRB               0xfffc0414
+#define UBC_BBRB                0xfffc04b0	/* 16 bit access */
+#define UBC_BDRB                0xfffc0418
+#define UBC_BDMRB               0xfffc041c
+
+#define UBC_BRCR                0xfffc04c0
+
+#endif /* __ASM_CPU_SH2A_UBC_H */
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index 6d94725d22f24..9289ede29c7b4 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -251,7 +251,8 @@ static void ubc_set_tracing(int asid, unsigned long pc)
 
 	if (current_cpu_data.type == CPU_SH7729 ||
 	    current_cpu_data.type == CPU_SH7710 ||
-	    current_cpu_data.type == CPU_SH7712) {
+	    current_cpu_data.type == CPU_SH7712 ||
+	    current_cpu_data.type == CPU_SH7203){
 		ctrl_outw(BBR_INST | BBR_READ | BBR_CPU, UBC_BBRA);
 		ctrl_outl(BRCR_PCBA | BRCR_PCTE, UBC_BRCR);
 	} else {
@@ -407,6 +408,7 @@ asmlinkage void break_point_trap(void)
 #else
 	ctrl_outw(0, UBC_BBRA);
 	ctrl_outw(0, UBC_BBRB);
+	ctrl_outl(0, UBC_BRCR);
 #endif
 	current->thread.ubc_pc = 0;
 	ubc_usercnt -= 1;
-- 
GitLab


From ba0d474082dfa37fb0efd439b4d0c0234ceb1e80 Mon Sep 17 00:00:00 2001
From: Peter Griffin <pgriffin@mpc-data.co.uk>
Date: Fri, 8 May 2009 15:50:54 +0100
Subject: [PATCH 0927/2198] sh: Add ptrace support for NOMMU debugging

Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/ptrace_32.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c
index f7b22dd83b0c1..3392e835a374a 100644
--- a/arch/sh/kernel/ptrace_32.c
+++ b/arch/sh/kernel/ptrace_32.c
@@ -334,6 +334,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 					[(addr - (long)&dummy->fpu) >> 2];
 		} else if (addr == (long) &dummy->u_fpvalid)
 			tmp = !!tsk_used_math(child);
+		else if (addr == PT_TEXT_ADDR)
+			tmp = child->mm->start_code;
+		else if (addr == PT_DATA_ADDR)
+			tmp = child->mm->start_data;
+		else if (addr == PT_TEXT_END_ADDR)
+			tmp = child->mm->end_code;
+		else if (addr == PT_TEXT_LEN)
+			tmp = child->mm->end_code - child->mm->start_code;
 		else
 			tmp = 0;
 		ret = put_user(tmp, datap);
-- 
GitLab


From cd89436e54b29a07a383ee82f2f718d8c9d24cc4 Mon Sep 17 00:00:00 2001
From: Peter Griffin <pgriffin@mpc-data.co.uk>
Date: Fri, 8 May 2009 15:51:51 +0100
Subject: [PATCH 0928/2198] sh: Add UBC trap vector for SH2A

Signed-off-by: Peter Griffin <pgriffin@mpc-data.co.uk>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/traps_32.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index 30ca9c51e52db..67550d88c4e66 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -34,6 +34,7 @@
 # define TRAP_ILLEGAL_SLOT_INST	6
 # define TRAP_ADDRESS_ERROR	9
 # ifdef CONFIG_CPU_SH2A
+#  define TRAP_UBC		12
 #  define TRAP_FPU_ERROR	13
 #  define TRAP_DIVZERO_ERROR	17
 #  define TRAP_DIVOVF_ERROR	18
@@ -849,6 +850,10 @@ void __init trap_init(void)
 #endif
 #endif
 
+#ifdef TRAP_UBC
+	set_exception_table_vec(TRAP_UBC, break_point_trap);
+#endif
+
 	/* Setup VBR for boot cpu */
 	per_cpu_trap_init();
 }
-- 
GitLab


From bf8b9a63c18a1a7777571650de0c9f4fd4368ca0 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Fri, 8 May 2009 20:53:58 +0530
Subject: [PATCH 0929/2198] x86: msr-index.h remove duplicate MSR C001_0015
 declaration

MSRC001_0015 Hardware Configuration Register (HWCR) is already defined
as MSR_K7_HWCR.

And HWCR is available for >= K7.

So MSR_K8_HWCR is not required and no-one is using it.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
---
 arch/x86/include/asm/msr-index.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ec41fc16c167c..4d58d04fca830 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -121,7 +121,6 @@
 #define MSR_K8_TOP_MEM1			0xc001001a
 #define MSR_K8_TOP_MEM2			0xc001001d
 #define MSR_K8_SYSCFG			0xc0010010
-#define MSR_K8_HWCR			0xc0010015
 #define MSR_K8_INT_PENDING_MSG		0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK		0x18000000
-- 
GitLab


From 7fc23a5380797012e92a9633169440f2f4a21253 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:21 +0200
Subject: [PATCH 0930/2198] perf_counter: optimize perf_counter_task_tick()

perf_counter_task_tick() does way too much work to find out
there's nothing to do. Provide an easy short-circuit for the
normal case where there are no counters on the system.

[ Impact: micro-optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170028.750619201@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 60e55f0b48f4b..fdb0d24212763 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -39,6 +39,7 @@ int perf_max_counters __read_mostly = 1;
 static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
+static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_tracking __read_mostly;
 static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
@@ -1076,8 +1077,14 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 
 void perf_counter_task_tick(struct task_struct *curr, int cpu)
 {
-	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	if (!atomic_read(&nr_counters))
+		return;
+
+	cpuctx = &per_cpu(perf_cpu_context, cpu);
+	ctx = &curr->perf_counter_ctx;
 
 	perf_counter_cpu_sched_out(cpuctx);
 	perf_counter_task_sched_out(curr, cpu);
@@ -1197,6 +1204,7 @@ static void free_counter(struct perf_counter *counter)
 {
 	perf_pending_sync(counter);
 
+	atomic_dec(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_dec(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)
@@ -2861,6 +2869,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	counter->pmu = pmu;
 
+	atomic_inc(&nr_counters);
 	if (counter->hw_event.mmap)
 		atomic_inc(&nr_mmap_tracking);
 	if (counter->hw_event.munmap)
-- 
GitLab


From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:22 +0200
Subject: [PATCH 0931/2198] perf_counter: rework ioctl()s

Corey noticed that ioctl()s on grouped counters didn't work on
the whole group. This extends the ioctl() interface to take a
second argument that is interpreted as a flags field. We then
provide PERF_IOC_FLAG_GROUP to toggle the behaviour.

Having this flag gives the greatest flexibility, allowing you
to individually enable/disable/reset counters in a group, or
all together.

[ Impact: fix group counter enable/disable semantics ]

Reported-by: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090508170028.837558214@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  10 +++-
 kernel/perf_counter.c        | 104 +++++++++++++++++++----------------
 2 files changed, 65 insertions(+), 49 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 00081d84169f5..88f863ec27487 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -157,10 +157,14 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
-#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
+#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
 #define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+
+enum perf_counter_ioc_flags {
+	PERF_IOC_FLAG_GROUP		= 1U << 0,
+};
 
 /*
  * Structure of the page that can be mapped via mmap
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fdb0d24212763..f4883f1f47ebc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	 * add it straight to the context's counter list, or to the group
 	 * leader's sibling list:
 	 */
-	if (counter->group_leader == counter)
+	if (group_leader == counter)
 		list_add_tail(&counter->list_entry, &ctx->counter_list);
 	else {
 		list_add_tail(&counter->list_entry, &group_leader->sibling_list);
@@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter)
 	spin_unlock_irq(&ctx->lock);
 }
 
-/*
- * Disable a counter and all its children.
- */
-static void perf_counter_disable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_disable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_disable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 static int
 counter_sched_in(struct perf_counter *counter,
 		 struct perf_cpu_context *cpuctx,
@@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 	return 0;
 }
 
-/*
- * Enable a counter and all its children.
- */
-static void perf_counter_enable_family(struct perf_counter *counter)
-{
-	struct perf_counter *child;
-
-	perf_counter_enable(counter);
-
-	/*
-	 * Lock the mutex to protect the list of children
-	 */
-	mutex_lock(&counter->mutex);
-	list_for_each_entry(child, &counter->child_list, child_list)
-		perf_counter_enable(child);
-	mutex_unlock(&counter->mutex);
-}
-
 void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
@@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 
 static void perf_counter_reset(struct perf_counter *counter)
 {
+	(void)perf_counter_read(counter);
 	atomic_set(&counter->count, 0);
+	perf_counter_update_userpage(counter);
+}
+
+static void perf_counter_for_each_sibling(struct perf_counter *counter,
+					  void (*func)(struct perf_counter *))
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	struct perf_counter *sibling;
+
+	spin_lock_irq(&ctx->lock);
+	counter = counter->group_leader;
+
+	func(counter);
+	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
+		func(sibling);
+	spin_unlock_irq(&ctx->lock);
+}
+
+static void perf_counter_for_each_child(struct perf_counter *counter,
+					void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	func(counter);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		func(child);
+	mutex_unlock(&counter->mutex);
+}
+
+static void perf_counter_for_each(struct perf_counter *counter,
+				  void (*func)(struct perf_counter *))
+{
+	struct perf_counter *child;
+
+	mutex_lock(&counter->mutex);
+	perf_counter_for_each_sibling(counter, func);
+	list_for_each_entry(child, &counter->child_list, child_list)
+		perf_counter_for_each_sibling(child, func);
+	mutex_unlock(&counter->mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
-	int err = 0;
+	void (*func)(struct perf_counter *);
+	u32 flags = arg;
 
 	switch (cmd) {
 	case PERF_COUNTER_IOC_ENABLE:
-		perf_counter_enable_family(counter);
+		func = perf_counter_enable;
 		break;
 	case PERF_COUNTER_IOC_DISABLE:
-		perf_counter_disable_family(counter);
-		break;
-	case PERF_COUNTER_IOC_REFRESH:
-		err = perf_counter_refresh(counter, arg);
+		func = perf_counter_disable;
 		break;
 	case PERF_COUNTER_IOC_RESET:
-		perf_counter_reset(counter);
+		func = perf_counter_reset;
 		break;
+
+	case PERF_COUNTER_IOC_REFRESH:
+		return perf_counter_refresh(counter, arg);
 	default:
-		err = -ENOTTY;
+		return -ENOTTY;
 	}
-	return err;
+
+	if (flags & PERF_IOC_FLAG_GROUP)
+		perf_counter_for_each(counter, func);
+	else
+		perf_counter_for_each_child(counter, func);
+
+	return 0;
 }
 
 /*
-- 
GitLab


From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:23 +0200
Subject: [PATCH 0932/2198] perf_counter: add PERF_RECORD_CONFIG

Much like CONFIG_RECORD_GROUP records the hw_event.config to
identify the values, allow to record this for all counters.

[ Impact: extend perfcounter output record format ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170028.923228280@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 2 ++
 kernel/perf_counter.c        | 8 ++++++++
 2 files changed, 10 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 88f863ec27487..0e6303d36c66f 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -104,6 +104,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_ADDR	= 1U << 3,
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
+	PERF_RECORD_CONFIG	= 1U << 6,
 };
 
 /*
@@ -258,6 +259,7 @@ enum perf_event_type {
 	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
+	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f4883f1f47ebc..c615f52aa4083 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CONFIG) {
+		header.type |= PERF_RECORD_CONFIG;
+		header.size += sizeof(u64);
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_ADDR)
 		perf_output_put(&handle, addr);
 
+	if (record_type & PERF_RECORD_CONFIG)
+		perf_output_put(&handle, counter->hw_event.config);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */
-- 
GitLab


From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 8 May 2009 18:52:24 +0200
Subject: [PATCH 0933/2198] perf_counter: add PERF_RECORD_CPU

Allow recording the CPU number the event was generated on.

RFC: this leaves a u32 as reserved, should we fill in the
     node_id() there, or leave this open for future extention,
     as userspace can already easily do the cpu->node mapping
     if needed.

[ Impact: extend perfcounter output record format ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <20090508170029.008627711@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 13 +++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0e6303d36c66f..614f921d616ab 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -105,6 +105,7 @@ enum perf_counter_record_format {
 	PERF_RECORD_GROUP	= 1U << 4,
 	PERF_RECORD_CALLCHAIN	= 1U << 5,
 	PERF_RECORD_CONFIG	= 1U << 6,
+	PERF_RECORD_CPU		= 1U << 7,
 };
 
 /*
@@ -260,6 +261,7 @@ enum perf_event_type {
 	 * 	{ u64			time;     } && PERF_RECORD_TIME
 	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
 	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
+	 * 	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
 	 * 	{ u64			nr;
 	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c615f52aa4083..d850a1fb8d4cf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	struct perf_callchain_entry *callchain = NULL;
 	int callchain_size = 0;
 	u64 time;
+	struct {
+		u32 cpu, reserved;
+	} cpu_entry;
 
 	header.type = 0;
 	header.size = sizeof(header);
@@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
+	if (record_type & PERF_RECORD_CPU) {
+		header.type |= PERF_RECORD_CPU;
+		header.size += sizeof(cpu_entry);
+
+		cpu_entry.cpu = raw_smp_processor_id();
+	}
+
 	if (record_type & PERF_RECORD_GROUP) {
 		header.type |= PERF_RECORD_GROUP;
 		header.size += sizeof(u64) +
@@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (record_type & PERF_RECORD_CONFIG)
 		perf_output_put(&handle, counter->hw_event.config);
 
+	if (record_type & PERF_RECORD_CPU)
+		perf_output_put(&handle, cpu_entry);
+
 	/*
 	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
 	 */
-- 
GitLab


From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 16:06:47 -0400
Subject: [PATCH 0934/2198] tracing: initialize return value for
 __ftrace_set_clr_event

Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2
tracing/events: clean up for ftrace_set_clr_event()

Moved out the code for ftrace_set_clr_event into a helper funciton but
did not initialize the return value. As a result, we do not warn about
a typo in the echoing of events in set_event.

This patch restores the old warning:

 # echo foobar > set_event
-bash: echo: write error: Invalid argument

[ Impact: restore warning of invalid entries to set_event ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index df394bc6d54bb..2eecb87e42d3a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
 				  const char *event, int set)
 {
 	struct ftrace_event_call *call;
-	int ret;
+	int ret = -EINVAL;
 
 	mutex_lock(&event_mutex);
 	list_for_each_entry(call, &ftrace_events, list) {
-- 
GitLab


From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 16:27:41 -0400
Subject: [PATCH 0935/2198] tracing: add trace_set_clr_event to export event
 enabling function

Other parts of the kernel may need to be able to enable or disable
specific events. Especially parts that create trace events.

[ Impact: allow enabling of trace events by those that create the event ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h |  2 ++
 kernel/trace/trace_events.c  | 17 +++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 662c1becf367f..bae51ddfabd35 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -127,6 +127,8 @@ extern int trace_define_field(struct ftrace_event_call *call, char *type,
 
 #define is_signed_type(type)	(((type)(-1)) < 0)
 
+int trace_set_clr_event(const char *system, const char *event, int set);
+
 /*
  * The double __builtin_constant_p is because gcc will give us an error
  * if we try to allocate the static variable to fmt if it is not a
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 2eecb87e42d3a..0eec0c55dd87d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set)
 	return __ftrace_set_clr_event(match, sub, event, set);
 }
 
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+	return __ftrace_set_clr_event(NULL, system, event, set);
+}
+
 /* 128 should be much more than enough */
 #define EVENT_BUF_SIZE		127
 
-- 
GitLab


From 6cac5a924668a56c7ccefc345805f1fe0536a90e Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Sun, 29 Mar 2009 19:56:29 -0700
Subject: [PATCH 0936/2198] xen/x86-64: fix breakpoints and hardware
 watchpoints

Native x86-64 uses the IST mechanism to run int3 and debug traps on
an alternative stack.  Xen does not do this, and so the frames were
being misinterpreted by the ptrace code.  This change special-cases
these two exceptions by using Xen variants which run on the normal
kernel stack properly.

Impact: avoid crash or bad data when IST trap is invoked under Xen
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/include/asm/traps.h |  3 +++
 arch/x86/kernel/entry_64.S   |  5 +++++
 arch/x86/xen/enlighten.c     | 19 ++++++++++++++++++-
 3 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 0d5342515b869..c44e5002f2ff4 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,6 +13,9 @@ asmlinkage void divide_error(void);
 asmlinkage void debug(void);
 asmlinkage void nmi(void);
 asmlinkage void int3(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_stack_segment(void);
 asmlinkage void overflow(void);
 asmlinkage void bounds(void);
 asmlinkage void invalid_op(void);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 38946c6e84338..bb01ce080b80b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1379,6 +1379,11 @@ END(xen_failsafe_callback)
 paranoidzeroentry_ist debug do_debug DEBUG_STACK
 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
 paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
 #ifdef CONFIG_X86_MCE
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 12a3159333bc0..7566e13c0cac2 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/start_kernel.h>
 #include <linux/sched.h>
+#include <linux/kprobes.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -44,6 +45,7 @@
 #include <asm/processor.h>
 #include <asm/proto.h>
 #include <asm/msr-index.h>
+#include <asm/traps.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
@@ -428,11 +430,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
 			    struct trap_info *info)
 {
+	unsigned long addr;
+
 	if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
 		return 0;
 
 	info->vector = vector;
-	info->address = gate_offset(*val);
+
+	addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+	if (addr == (unsigned long)debug)
+		addr = (unsigned long)xen_debug;
+	else if (addr == (unsigned long)int3)
+		addr = (unsigned long)xen_int3;
+	else if (addr == (unsigned long)stack_segment)
+		addr = (unsigned long)xen_stack_segment;
+	else
+		WARN_ON(val->ist != 0);
+#endif	/* CONFIG_X86_64 */
+	info->address = addr;
+
 	info->cs = gate_segment(*val);
 	info->flags = val->dpl;
 	/* interrupt gates clear IF */
-- 
GitLab


From b80119bb35a49a4e8dbfb9708872adfd5cf38dee Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 24 Apr 2009 00:22:08 -0700
Subject: [PATCH 0937/2198] xen/x86-64: clean up warnings about IST-using traps

Ignore known IST-using traps.  Aside from the debugger traps, they're
low-level faults which Xen will handle for us, so the kernel needn't
worry about them.  Keep warning in case unknown trap starts using IST.

Impact: suppress spurious warnings
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7566e13c0cac2..e9df942aa143d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -439,14 +439,32 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
 
 	addr = gate_offset(*val);
 #ifdef CONFIG_X86_64
+	/*
+	 * Look for known traps using IST, and substitute them
+	 * appropriately.  The debugger ones are the only ones we care
+	 * about.  Xen will handle faults like double_fault and
+	 * machine_check, so we should never see them.  Warn if
+	 * there's an unexpected IST-using fault handler.
+	 */
 	if (addr == (unsigned long)debug)
 		addr = (unsigned long)xen_debug;
 	else if (addr == (unsigned long)int3)
 		addr = (unsigned long)xen_int3;
 	else if (addr == (unsigned long)stack_segment)
 		addr = (unsigned long)xen_stack_segment;
-	else
-		WARN_ON(val->ist != 0);
+	else if (addr == (unsigned long)double_fault ||
+		 addr == (unsigned long)nmi) {
+		/* Don't need to handle these */
+		return 0;
+#ifdef CONFIG_X86_MCE
+	} else if (addr == (unsigned long)machine_check) {
+		return 0;
+#endif
+	} else {
+		/* Some other trap using IST? */
+		if (WARN_ON(val->ist != 0))
+			return 0;
+	}
 #endif	/* CONFIG_X86_64 */
 	info->address = addr;
 
-- 
GitLab


From a789ed5fb6d0256c4177c2cc27e06520ddbe4d4c Mon Sep 17 00:00:00 2001
From: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Date: Fri, 24 Apr 2009 00:26:50 -0700
Subject: [PATCH 0938/2198] xen: cache cr0 value to avoid trap'n'emulate for
 read_cr0

stts() is implemented in terms of read_cr0/write_cr0 to update the
state of the TS bit.  This happens during context switch, and so
is fairly performance critical.  Rather than falling back to
a trap-and-emulate native read_cr0, implement our own by caching
the last-written value from write_cr0 (the TS bit is the only one
we really care about).

Impact: optimise Xen context switches
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/enlighten.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e9df942aa143d..0a1700a2be9c8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -658,10 +658,26 @@ static void xen_clts(void)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+static unsigned long xen_read_cr0(void)
+{
+	unsigned long cr0 = percpu_read(xen_cr0_value);
+
+	if (unlikely(cr0 == 0)) {
+		cr0 = native_read_cr0();
+		percpu_write(xen_cr0_value, cr0);
+	}
+
+	return cr0;
+}
+
 static void xen_write_cr0(unsigned long cr0)
 {
 	struct multicall_space mcs;
 
+	percpu_write(xen_cr0_value, cr0);
+
 	/* Only pay attention to cr0.TS; everything else is
 	   ignored. */
 	mcs = xen_mc_entry(0);
@@ -847,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.clts = xen_clts,
 
-	.read_cr0 = native_read_cr0,
+	.read_cr0 = xen_read_cr0,
 	.write_cr0 = xen_write_cr0,
 
 	.read_cr4 = native_read_cr4,
-- 
GitLab


From 0b4eb462da10f832b28d518abffa4d77805928a0 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 30 Apr 2009 17:59:36 -0700
Subject: [PATCH 0939/2198] x86, boot: align the .bss section in the
 decompressor

Aligning the .bss section makes it trivial to use large operation
sizes for moving the initialized sections and clearing the .bss.
The alignment chosen (L1 cache) is somewhat arbitrary, but should be
large enough to avoid all known performance traps and small enough to
not cause troubles.

[ Impact: trivial performance enhancement, future patch prep	]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/vmlinux.lds.S | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index 0d26c92d3c7d0..dbe515e13fef2 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -42,6 +42,7 @@ SECTIONS
 		*(.data.*)
 		_edata = . ;
 	}
+	. = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
 	.bss : {
 		_bss = . ;
 		*(.bss)
-- 
GitLab


From d3dd3b5a29bb9582957451531fed461628dfc834 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 21:17:15 -0700
Subject: [PATCH 0940/2198] kbuild: allow compressors (gzip, bzip2, lzma) to
 take multiple inputs

Allow the compression commands in Kbuild (i.e. gzip, bzip2, lzma) to
take multiple input files and emit the concatenated compressed
output.  This avoids an intermediate step when a kernel image is built
from multiple components, such as the relocatable x86-32 kernel.

Sam Ravnborg integrated the bin_size script into the Makefile.

[ Impact: new build feature, not yet used ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
---
 scripts/Makefile.lib | 28 +++++++++++++++++++++-------
 scripts/bin_size     | 10 ----------
 2 files changed, 21 insertions(+), 17 deletions(-)
 delete mode 100644 scripts/bin_size

diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 979619574f70f..f8cf938dde98d 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -183,20 +183,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
 # ---------------------------------------------------------------------------
 
 quiet_cmd_gzip = GZIP    $@
-cmd_gzip = gzip -f -9 < $< > $@
+cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \
+	(rm -f $@ ; false)
 
 
 # Bzip2
 # ---------------------------------------------------------------------------
 
-# Bzip2 does not include size in file... so we have to fake that
-size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
-
-quiet_cmd_bzip2 = BZIP2    $@
-cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
+# Bzip2 and LZMA do not include size in file... so we have to fake that;
+# append the size as a 32-bit littleendian number as gzip does.
+size_append = echo -ne $(shell						\
+dec_size=0;								\
+for F in $1; do								\
+	fsize=$$(stat -c "%s" $$F);					\
+	dec_size=$$(expr $$dec_size + $$fsize);				\
+done;									\
+printf "%08x" $$dec_size |						\
+	sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'	\
+)
+
+quiet_cmd_bzip2 = BZIP2   $@
+cmd_bzip2 = (cat $(filter-out FORCE,$^) | \
+	bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
 
 # Lzma
 # ---------------------------------------------------------------------------
 
 quiet_cmd_lzma = LZMA    $@
-cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)
+cmd_lzma = (cat $(filter-out FORCE,$^) | \
+	lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+	(rm -f $@ ; false)
diff --git a/scripts/bin_size b/scripts/bin_size
deleted file mode 100644
index 43e1b360cee69..0000000000000
--- a/scripts/bin_size
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-
-if [ $# = 0 ] ; then
-   echo Usage: $0 file
-fi
-
-size_dec=`stat -c "%s" $1`
-size_hex_echo_string=`printf "%08x" $size_dec |
-     sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'`
-/bin/echo -ne $size_hex_echo_string
-- 
GitLab


From 845adf7266a7ba6970bf982ffd96abc60d2018ab Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 21:20:51 -0700
Subject: [PATCH 0941/2198] x86: add a Kconfig symbol for when relocations are
 needed

We only need to build relocations when we are building a 32-bit
relocatable kernel.  Rather than unnecessarily complicating the
Makefiles, make an explicit Kbuild symbol for this.

[ Impact: permits future cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
---
 arch/x86/Kconfig | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 039c3f04aac5b..5aee45356b58d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1513,6 +1513,11 @@ config RELOCATABLE
 	  it has been loaded at and the compile time physical address
 	  (CONFIG_PHYSICAL_START) is ignored.
 
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+	def_bool y
+	depends on X86_32 && RELOCATABLE
+
 config PHYSICAL_ALIGN
 	hex
 	prompt "Alignment value to which kernel should be aligned" if X86_32
-- 
GitLab


From 5f11e02019ef44f041e6e38a1363fa2fd4b8785d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 22:53:11 -0700
Subject: [PATCH 0942/2198] x86, boot: simplify
 arch/x86/boot/compressed/Makefile

Simplify the arch/x86/boot/compressed/Makefile, by using the new
capability of specifying multiple inputs to a compressor, and the
CONFIG_X86_NEED_RELOCS Kconfig symbol.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Acked-by: Sam Ravnborg <sam@ravnborg.org>
---
 arch/x86/boot/compressed/Makefile | 39 +++++--------------------------
 1 file changed, 6 insertions(+), 33 deletions(-)

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0f4b5e2abd3fd..b35c3bb709074 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -29,7 +29,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 
 targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_32) += relocs
+hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +37,19 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
 	$(call if_changed,relocs)
 
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD   $@
-      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
-	$(call if_changed,relocbin)
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
-ifeq ($(CONFIG_X86_32),y)
-
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
-	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
-	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
-	$(call if_changed,lzma)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,lzma)
-endif
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
-
-else
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzma)
 
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-endif
-
 suffix_$(CONFIG_KERNEL_GZIP)  = gz
 suffix_$(CONFIG_KERNEL_BZIP2) = bz2
 suffix_$(CONFIG_KERNEL_LZMA)  = lzma
 
+LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T
 $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
 	$(call if_changed,ld)
-- 
GitLab


From 283ab1c0bd462dd0b179393fb081a626f6687413 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 15:32:47 -0700
Subject: [PATCH 0943/2198] x86, boot: follow standard Kbuild style for
 compression suffix

When generating the compression suffix in
arch/x86/boot/compressed/Makefile, follow standard Kbuild
conventions, that is:

- Use a dash not underscore before y/m/n endings
- Use := whenever possible.

Requested-by: Sam Ravnborg <sam@ravnborg.org>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index b35c3bb709074..7f24fdb584e65 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -46,10 +46,10 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
 $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
 	$(call if_changed,lzma)
 
-suffix_$(CONFIG_KERNEL_GZIP)  = gz
-suffix_$(CONFIG_KERNEL_BZIP2) = bz2
-suffix_$(CONFIG_KERNEL_LZMA)  = lzma
+suffix-$(CONFIG_KERNEL_GZIP)	:= gz
+suffix-$(CONFIG_KERNEL_BZIP2)	:= bz2
+suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 
 LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
+$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE
 	$(call if_changed,ld)
-- 
GitLab


From bd2a36984c50bb546a7d04cb395fddcf98a1092c Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 5 May 2009 23:24:50 -0700
Subject: [PATCH 0944/2198] x86, boot: use BP_scratch in
 arch/x86/boot/compressed/head_*.S

Use the BP_scratch symbol from asm-offsets.h instead of hard-coding
the location.

[ Impact: cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S | 2 +-
 arch/x86/boot/compressed/head_64.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 85bd3285706da..e3398f3d1b34b 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -53,7 +53,7 @@ ENTRY(startup_32)
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-	leal (0x1e4+4)(%esi), %esp
+	leal (BP_scratch+4)(%esi), %esp
 	call 1f
 1:	popl %ebp
 	subl $1b, %ebp
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index ed4a829480026..06cc7e59352b9 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -56,7 +56,7 @@ ENTRY(startup_32)
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-	leal	(0x1e4+4)(%esi), %esp
+	leal	(BP_scratch+4)(%esi), %esp
 	call	1f
 1:	popl	%ebp
 	subl	$1b, %ebp
-- 
GitLab


From 5f64ec64e7f9b246c0a94f34cdf7782f98a6e55d Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 15:45:17 -0700
Subject: [PATCH 0945/2198] x86, boot: stylistic cleanups for
 boot/compressed/head_32.S

Reformat arch/x86/boot/compressed/head_32.S to be closer to currently
preferred kernel assembly style, that is:

- opcode and operand separated by tab
- operands separated by ", "
- C-style comments

This also makes it more similar to head_64.S.

[ Impact: cleanup, no object code change ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S | 170 +++++++++++++++--------------
 1 file changed, 89 insertions(+), 81 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index e3398f3d1b34b..7bd7766ffabfd 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -12,16 +12,16 @@
  * the page directory. [According to comments etc elsewhere on a compressed
  * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
  *
- * Page 0 is deliberately kept safe, since System Management Mode code in 
+ * Page 0 is deliberately kept safe, since System Management Mode code in
  * laptops may need to access the BIOS data stored there.  This is also
- * useful for future device drivers that either access the BIOS via VM86 
+ * useful for future device drivers that either access the BIOS via VM86
  * mode.
  */
 
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
-.text
+	.text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
@@ -29,75 +29,80 @@
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
 
-.section ".text.head","ax",@progbits
+	.section ".text.head","ax",@progbits
 ENTRY(startup_32)
 	cld
-	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
-	 * us to not reload segments */
-	testb $(1<<6), BP_loadflags(%esi)
-	jnz 1f
+	/*
+	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+	 * us to not reload segments
+	 */
+	testb	$(1<<6), BP_loadflags(%esi)
+	jnz	1f
 
 	cli
-	movl $(__BOOT_DS),%eax
-	movl %eax,%ds
-	movl %eax,%es
-	movl %eax,%fs
-	movl %eax,%gs
-	movl %eax,%ss
+	movl	$__BOOT_DS, %eax
+	movl	%eax, %ds
+	movl	%eax, %es
+	movl	%eax, %fs
+	movl	%eax, %gs
+	movl	%eax, %ss
 1:
 
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing  else will tell us what
  * address we are running at.  The reserved chunk of the real-mode
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-	leal (BP_scratch+4)(%esi), %esp
-	call 1f
-1:	popl %ebp
-	subl $1b, %ebp
+	leal	(BP_scratch+4)(%esi), %esp
+	call	1f
+1:	popl	%ebp
+	subl	$1b, %ebp
 
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
  * contains the address where we should move the kernel image temporarily
  * for safe in-place decompression.
  */
 
 #ifdef CONFIG_RELOCATABLE
-	movl 	%ebp, %ebx
+	movl	%ebp, %ebx
 	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
 	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
 #else
-	movl $LOAD_PHYSICAL_ADDR, %ebx
+	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
 	/* Replace the compressed data size with the uncompressed size */
-	subl input_len(%ebp), %ebx
-	movl output_len(%ebp), %eax
-	addl %eax, %ebx
+	subl	input_len(%ebp), %ebx
+	movl	output_len(%ebp), %eax
+	addl	%eax, %ebx
 	/* Add 8 bytes for every 32K input block */
-	shrl $12, %eax
-	addl %eax, %ebx
+	shrl	$12, %eax
+	addl	%eax, %ebx
 	/* Add 32K + 18 bytes of extra slack */
-	addl $(32768 + 18), %ebx
+	addl	$(32768 + 18), %ebx
 	/* Align on a 4K boundary */
-	addl $4095, %ebx
-	andl $~4095, %ebx
+	addl	$4095, %ebx
+	andl	$~4095, %ebx
 
-/* Copy the compressed kernel to the end of our buffer
+/*
+ * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-	pushl %esi
-	leal _ebss(%ebp), %esi
-	leal _ebss(%ebx), %edi
-	movl $(_ebss - startup_32), %ecx
+	pushl	%esi
+	leal	_ebss(%ebp), %esi
+	leal	_ebss(%ebx), %edi
+	movl	$(_ebss - startup_32), %ecx
 	std
-	rep
-	movsb
+	rep	movsb
 	cld
-	popl %esi
+	popl	%esi
 
-/* Compute the kernel start address.
+/*
+ * Compute the kernel start address.
  */
 #ifdef CONFIG_RELOCATABLE
 	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
@@ -109,81 +114,84 @@ ENTRY(startup_32)
 /*
  * Jump to the relocated address.
  */
-	leal relocated(%ebx), %eax
-	jmp *%eax
+	leal	relocated(%ebx), %eax
+	jmp	*%eax
 ENDPROC(startup_32)
 
-.section ".text"
+	.text
 relocated:
 
 /*
  * Clear BSS
  */
-	xorl %eax,%eax
-	leal _edata(%ebx),%edi
-	leal _ebss(%ebx), %ecx
-	subl %edi,%ecx
+	xorl	%eax, %eax
+	leal	_edata(%ebx), %edi
+	leal	_ebss(%ebx), %ecx
+	subl	%edi, %ecx
 	cld
-	rep
-	stosb
+	rep	stosb
 
 /*
  * Setup the stack for the decompressor
  */
-	leal boot_stack_end(%ebx), %esp
+	leal	boot_stack_end(%ebx), %esp
 
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	movl output_len(%ebx), %eax
-	pushl %eax
-			# push arguments for decompress_kernel:
-	pushl %ebp	# output address
-	movl input_len(%ebx), %eax
-	pushl %eax	# input_len
-	leal input_data(%ebx), %eax
-	pushl %eax	# input_data
-	leal boot_heap(%ebx), %eax
-	pushl %eax	# heap area
-	pushl %esi	# real mode pointer
-	call decompress_kernel
-	addl $20, %esp
-	popl %ecx
+	movl	output_len(%ebx), %eax
+	pushl	%eax
+				/* push arguments for decompress_kernel: */
+	pushl	%ebp		/* output address */
+	movl	input_len(%ebx), %eax
+	pushl	%eax		/* input_len */
+	leal	input_data(%ebx), %eax
+	pushl	%eax		/* input_data */
+	leal	boot_heap(%ebx), %eax
+	pushl	%eax		/* heap area */
+	pushl	%esi		/* real mode pointer */
+	call	decompress_kernel
+	addl	$20, %esp
+	popl	%ecx
 
 #if CONFIG_RELOCATABLE
-/* Find the address of the relocations.
+/*
+ * Find the address of the relocations.
  */
-	movl %ebp, %edi
-	addl %ecx, %edi
+	movl	%ebp, %edi
+	addl	%ecx, %edi
 
-/* Calculate the delta between where vmlinux was compiled to run
+/*
+ * Calculate the delta between where vmlinux was compiled to run
  * and where it was actually loaded.
  */
-	movl %ebp, %ebx
-	subl $LOAD_PHYSICAL_ADDR, %ebx
-	jz   2f		/* Nothing to be done if loaded at compiled addr. */
+	movl	%ebp, %ebx
+	subl	$LOAD_PHYSICAL_ADDR, %ebx
+	jz	2f	/* Nothing to be done if loaded at compiled addr. */
 /*
  * Process relocations.
  */
 
-1:	subl $4, %edi
-	movl 0(%edi), %ecx
-	testl %ecx, %ecx
-	jz 2f
-	addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
-	jmp 1b
+1:	subl	$4, %edi
+	movl	(%edi), %ecx
+	testl	%ecx, %ecx
+	jz	2f
+	addl	%ebx, -__PAGE_OFFSET(%ebx, %ecx)
+	jmp	1b
 2:
 #endif
 
 /*
  * Jump to the decompressed kernel.
  */
-	xorl %ebx,%ebx
-	jmp *%ebp
+	xorl	%ebx, %ebx
+	jmp	*%ebp
 
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+	.bss
+	.balign 4
 boot_heap:
 	.fill BOOT_HEAP_SIZE, 1, 0
 boot_stack:
-- 
GitLab


From b40d68d5b5b799caaf99d2e073e62962e6d917ce Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 15:59:13 -0700
Subject: [PATCH 0946/2198] x86, boot: stylistic cleanups for
 boot/compressed/head_64.S

Clean up style issues in arch/x86/boot/compressed/head_64.S.  This
file had a lot fewer style issues than its 32-bit cousin, but the ones
it has are worth fixing, especially since it makes the two files more
similar.

[ Impact: cleanup, no object code change ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_64.S | 52 ++++++++++++++++++------------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 06cc7e59352b9..26c3def43ace9 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -21,8 +21,8 @@
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
-.code32
-.text
+	.code32
+	.text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
@@ -33,12 +33,14 @@
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
 
-.section ".text.head"
+	.section ".text.head"
 	.code32
 ENTRY(startup_32)
 	cld
-	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
-	 * us to not reload segments */
+	/*
+	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+	 * us to not reload segments
+	 */
 	testb $(1<<6), BP_loadflags(%esi)
 	jnz 1f
 
@@ -49,7 +51,8 @@ ENTRY(startup_32)
 	movl	%eax, %ss
 1:
 
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing  else will tell us what
  * address we are running at.  The reserved chunk of the real-mode
@@ -70,10 +73,11 @@ ENTRY(startup_32)
 	testl	%eax, %eax
 	jnz	no_longmode
 
-/* Compute the delta between where we were compiled to run at
+/*
+ * Compute the delta between where we were compiled to run at
  * and where the code will actually run at.
- */
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
  * contains the address where we should move the kernel image temporarily
  * for safe in-place decompression.
  */
@@ -114,7 +118,7 @@ ENTRY(startup_32)
  /*
   * Build early 4G boot pagetable
   */
-	/* Initialize Page tables to 0*/
+	/* Initialize Page tables to 0 */
 	leal	pgtable(%ebx), %edi
 	xorl	%eax, %eax
 	movl	$((4096*6)/4), %ecx
@@ -155,7 +159,8 @@ ENTRY(startup_32)
 	btsl	$_EFER_LME, %eax
 	wrmsr
 
-	/* Setup for the jump to 64bit mode
+	/*
+	 * Setup for the jump to 64bit mode
 	 *
 	 * When the jump is performend we will be in long mode but
 	 * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +189,8 @@ no_longmode:
 
 #include "../../kernel/verify_cpu_64.S"
 
-	/* Be careful here startup_64 needs to be at a predictable
+	/*
+	 * Be careful here startup_64 needs to be at a predictable
 	 * address so I can export it in an ELF header.  Bootloaders
 	 * should look at the ELF header to find this address, as
 	 * it may change in the future.
@@ -192,7 +198,8 @@ no_longmode:
 	.code64
 	.org 0x200
 ENTRY(startup_64)
-	/* We come here either from startup_32 or directly from a
+	/*
+	 * We come here either from startup_32 or directly from a
 	 * 64bit bootloader.  If we come here from a bootloader we depend on
 	 * an identity mapped page table being provied that maps our
 	 * entire text+data+bss and hopefully all of memory.
@@ -209,7 +216,8 @@ ENTRY(startup_64)
 	movl    $0x20, %eax
 	ltr	%ax
 
-	/* Compute the decompressed kernel start address.  It is where
+	/*
+	 * Compute the decompressed kernel start address.  It is where
 	 * we were loaded at aligned to a 2M boundary. %rbp contains the
 	 * decompressed kernel start address.
 	 *
@@ -241,7 +249,8 @@ ENTRY(startup_64)
 	addq	$(32768 + 18 + 4095), %rbx
 	andq	$~4095, %rbx
 
-/* Copy the compressed kernel to the end of our buffer
+/*
+ * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
 	leaq	_end_before_pgt(%rip), %r8
@@ -260,7 +269,7 @@ ENTRY(startup_64)
 	leaq	relocated(%rbx), %rax
 	jmp	*%rax
 
-.section ".text"
+	.text
 relocated:
 
 /*
@@ -271,8 +280,7 @@ relocated:
 	leaq    _end_before_pgt(%rbx), %rcx
 	subq	%rdi, %rcx
 	cld
-	rep
-	stosb
+	rep	stosb
 
 	/* Setup the stack */
 	leaq	boot_stack_end(%rip), %rsp
@@ -311,9 +319,11 @@ gdt:
 	.quad   0x0000000000000000	/* TS continued */
 gdt_end:
 
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+	.bss
+	.balign 4
 boot_heap:
 	.fill BOOT_HEAP_SIZE, 1, 0
 boot_stack:
-- 
GitLab


From 5b11f1cee5797b38d16b94d8745b12b6727a8373 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 16:20:34 -0700
Subject: [PATCH 0947/2198] x86, boot: straighten out ranges to copy/zero in
 compressed/head*.S

Both on 32 and 64 bits, we copy all the way up to the end of bss,
except that on 64 bits there is a hack to avoid copying on top of the
page tables.  There is no point in copying bss at all, especially
since we are just about to zero it all anyway.

To clean up and unify the handling, we now do:

  - copy from startup_32 to _bss.
  - zero from _bss to _ebss.
  - the _ebss symbol is aligned to an 8-byte boundary.
  - the page tables are moved to a separate section.

Use _bss as the copy endpoint since _edata may be misaligned.

[ Impact: cleanup, trivial performance improvement ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S     |  8 ++++----
 arch/x86/boot/compressed/head_64.S     | 18 +++++++++++++-----
 arch/x86/boot/compressed/vmlinux.lds.S | 19 ++++++++++++-------
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 7bd7766ffabfd..59425e157df3b 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -93,9 +93,9 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl	%esi
-	leal	_ebss(%ebp), %esi
-	leal	_ebss(%ebx), %edi
-	movl	$(_ebss - startup_32), %ecx
+	leal	_bss(%ebp), %esi
+	leal	_bss(%ebx), %edi
+	movl	$(_bss - startup_32), %ecx
 	std
 	rep	movsb
 	cld
@@ -125,7 +125,7 @@ relocated:
  * Clear BSS
  */
 	xorl	%eax, %eax
-	leal	_edata(%ebx), %edi
+	leal	_bss(%ebx), %edi
 	leal	_ebss(%ebx), %ecx
 	subl	%edi, %ecx
 	cld
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 26c3def43ace9..5bc9052615b6d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -253,9 +253,9 @@ ENTRY(startup_64)
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-	leaq	_end_before_pgt(%rip), %r8
-	leaq	_end_before_pgt(%rbx), %r9
-	movq	$_end_before_pgt /* - $startup_32 */, %rcx
+	leaq	_bss(%rip), %r8
+	leaq	_bss(%rbx), %r9
+	movq	$_bss /* - $startup_32 */, %rcx
 1:	subq	$8, %r8
 	subq	$8, %r9
 	movq	0(%r8), %rax
@@ -276,8 +276,8 @@ relocated:
  * Clear BSS
  */
 	xorq	%rax, %rax
-	leaq    _edata(%rbx), %rdi
-	leaq    _end_before_pgt(%rbx), %rcx
+	leaq    _bss(%rbx), %rdi
+	leaq    _ebss(%rbx), %rcx
 	subq	%rdi, %rcx
 	cld
 	rep	stosb
@@ -329,3 +329,11 @@ boot_heap:
 boot_stack:
 	.fill BOOT_STACK_SIZE, 1, 0
 boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+	.section ".pgtable","a",@nobits
+	.balign 4096
+pgtable:
+	.fill 6*4096, 1, 0
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index dbe515e13fef2..cc353e1b3ffd4 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -2,6 +2,8 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
 
 #undef i386
 
+#include <asm/page_types.h>
+
 #ifdef CONFIG_X86_64
 OUTPUT_ARCH(i386:x86-64)
 ENTRY(startup_64)
@@ -48,13 +50,16 @@ SECTIONS
 		*(.bss)
 		*(.bss.*)
 		*(COMMON)
-#ifdef CONFIG_X86_64
-		. = ALIGN(8);
-		_end_before_pgt = . ;
-		. = ALIGN(4096);
-		pgtable = . ;
-		. = . + 4096 * 6;
-#endif
+		. = ALIGN(8);	/* For convenience during zeroing */
 		_ebss = .;
 	}
+#ifdef CONFIG_X86_64
+       . = ALIGN(PAGE_SIZE);
+       .pgtable : {
+		_pgtable = . ;
+		*(.pgtable)
+		_epgtable = . ;
+	}
+#endif
+	_end = .;
 }
-- 
GitLab


From 0a137736704ef9af719409933b3c33e138461786 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 16:27:41 -0700
Subject: [PATCH 0948/2198] x86, boot: set up the decompression stack as early
 as possible

Set up the decompression stack as soon as we know where it needs to
go.  That way we have a full-service stack as soon as possible, rather
than relying on the BP_scratch field.

Note that the stack does need to be empty during bss zeroing (or
else the stack needs to be moved out of the bss segment, which is also
an option.)

[ Impact: cleanup, minor paranoia ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S | 10 ++++------
 arch/x86/boot/compressed/head_64.S | 16 ++++++++--------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 59425e157df3b..d7245cf802617 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -88,6 +88,9 @@ ENTRY(startup_32)
 	addl	$4095, %ebx
 	andl	$~4095, %ebx
 
+	/* Set up the stack */
+	leal	boot_stack_end(%ebx), %esp
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
@@ -122,7 +125,7 @@ ENDPROC(startup_32)
 relocated:
 
 /*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
  */
 	xorl	%eax, %eax
 	leal	_bss(%ebx), %edi
@@ -131,11 +134,6 @@ relocated:
 	cld
 	rep	stosb
 
-/*
- * Setup the stack for the decompressor
- */
-	leal	boot_stack_end(%ebx), %esp
-
 /*
  * Do the decompression, and jump to the new kernel..
  */
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 5bc9052615b6d..a0b18426069a1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -249,6 +249,13 @@ ENTRY(startup_64)
 	addq	$(32768 + 18 + 4095), %rbx
 	andq	$~4095, %rbx
 
+	/* Set up the stack */
+	leaq	boot_stack_end(%rbx), %rsp
+
+	/* Zero EFLAGS */
+	pushq	$0
+	popfq
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
@@ -273,7 +280,7 @@ ENTRY(startup_64)
 relocated:
 
 /*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
  */
 	xorq	%rax, %rax
 	leaq    _bss(%rbx), %rdi
@@ -282,13 +289,6 @@ relocated:
 	cld
 	rep	stosb
 
-	/* Setup the stack */
-	leaq	boot_stack_end(%rip), %rsp
-
-	/* zero EFLAGS after setting rsp */
-	pushq	$0
-	popfq
-
 /*
  * Do the decompression, and jump to the new kernel..
  */
-- 
GitLab


From 97541912785369925723b6255438ad9fce2ddf04 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 6 May 2009 17:56:51 -0700
Subject: [PATCH 0949/2198] x86, boot: zero EFLAGS on 32 bits

The 64-bit code already clears EFLAGS as soon as it has a stack.  This
seems like a reasonable precaution, so do it on 32 bits as well.

[ Impact: extra paranoia ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d7245cf802617..d02a4f02be138 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -91,6 +91,10 @@ ENTRY(startup_32)
 	/* Set up the stack */
 	leal	boot_stack_end(%ebx), %esp
 
+	/* Zero EFLAGS */
+	pushl	$0
+	popfl
+
 /*
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
-- 
GitLab


From 36d3793c947f1ef7ba3d24eeeddc1be41adc5ab4 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 16:45:15 -0700
Subject: [PATCH 0950/2198] x86, boot: use appropriate rep string for move and
 clear

In the pre-decompression code, use the appropriate largest possible
rep movs and rep stos to move code and clear bss, respectively.  For
reverse copy, do note that the initial values are supposed to be the
address of the first (highest) copy datum, not one byte beyond the end
of the buffer.

rep strings are not necessarily the fastest way to perform these
operations on all current processors, but are likely to be in the
future, and perhaps more importantly, we want to encourage the
architecturally right thing to do here.

This also fixes a couple of trivial inefficiencies on 64 bits.

[ Impact: trivial performance enhancement, increase code similarity ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S | 11 ++++++-----
 arch/x86/boot/compressed/head_64.S | 26 +++++++++++++-------------
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index d02a4f02be138..6710dc78ac599 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -100,11 +100,12 @@ ENTRY(startup_32)
  * where decompression in place becomes safe.
  */
 	pushl	%esi
-	leal	_bss(%ebp), %esi
-	leal	_bss(%ebx), %edi
+	leal	(_bss-4)(%ebp), %esi
+	leal	(_bss-4)(%ebx), %edi
 	movl	$(_bss - startup_32), %ecx
+	shrl	$2, %ecx
 	std
-	rep	movsb
+	rep	movsl
 	cld
 	popl	%esi
 
@@ -135,8 +136,8 @@ relocated:
 	leal	_bss(%ebx), %edi
 	leal	_ebss(%ebx), %ecx
 	subl	%edi, %ecx
-	cld
-	rep	stosb
+	shrl	$2, %ecx
+	rep	stosl
 
 /*
  * Do the decompression, and jump to the new kernel..
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index a0b18426069a1..723c72dfd7bc5 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -260,15 +260,15 @@ ENTRY(startup_64)
  * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-	leaq	_bss(%rip), %r8
-	leaq	_bss(%rbx), %r9
+	pushq	%rsi
+	leaq	(_bss-8)(%rip), %rsi
+	leaq	(_bss-8)(%rbx), %rdi
 	movq	$_bss /* - $startup_32 */, %rcx
-1:	subq	$8, %r8
-	subq	$8, %r9
-	movq	0(%r8), %rax
-	movq	%rax, 0(%r9)
-	subq	$8, %rcx
-	jnz	1b
+	shrq	$3, %rcx
+	std
+	rep	movsq
+	cld
+	popq	%rsi
 
 /*
  * Jump to the relocated address.
@@ -282,12 +282,12 @@ relocated:
 /*
  * Clear BSS (stack is currently empty)
  */
-	xorq	%rax, %rax
-	leaq    _bss(%rbx), %rdi
-	leaq    _ebss(%rbx), %rcx
+	xorl	%eax, %eax
+	leaq    _bss(%rip), %rdi
+	leaq    _ebss(%rip), %rcx
 	subq	%rdi, %rcx
-	cld
-	rep	stosb
+	shrq	$3, %rcx
+	rep	stosq
 
 /*
  * Do the decompression, and jump to the new kernel..
-- 
GitLab


From 02a884c0fe7ec8459d00d34b7d4101af21fc4a86 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Fri, 8 May 2009 17:42:16 -0700
Subject: [PATCH 0951/2198] x86, boot: determine compressed code offset at
 compile time

Determine the compressed code offset (from the kernel runtime address)
at compile time.  This allows some minor optimizations in
arch/x86/boot/compressed/head_*.S, but more importantly it makes this
value available to the build process, which will enable a future patch
to export the necessary linear memory footprint into the bzImage
header.

[ Impact: cleanup, future patch enabling ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/Makefile    | 11 +++-
 arch/x86/boot/compressed/head_32.S   | 24 ++-----
 arch/x86/boot/compressed/head_64.S   | 41 ++++--------
 arch/x86/boot/compressed/mkpiggy.c   | 97 ++++++++++++++++++++++++++++
 arch/x86/boot/compressed/vmlinux.scr | 10 ---
 5 files changed, 123 insertions(+), 60 deletions(-)
 create mode 100644 arch/x86/boot/compressed/mkpiggy.c
 delete mode 100644 arch/x86/boot/compressed/vmlinux.scr

diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 7f24fdb584e65..49c8a4c37d7c3 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,8 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
+hostprogs-y	:= mkpiggy
+
 $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
 	@:
@@ -50,6 +52,9 @@ suffix-$(CONFIG_KERNEL_GZIP)	:= gz
 suffix-$(CONFIG_KERNEL_BZIP2)	:= bz2
 suffix-$(CONFIG_KERNEL_LZMA)	:= lzma
 
-LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE
-	$(call if_changed,ld)
+quiet_cmd_mkpiggy = MKPIGGY $@
+      cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
+
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+	$(call if_changed,mkpiggy)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 6710dc78ac599..470474bafc4dc 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -75,18 +75,8 @@ ENTRY(startup_32)
 	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
-	/* Replace the compressed data size with the uncompressed size */
-	subl	input_len(%ebp), %ebx
-	movl	output_len(%ebp), %eax
-	addl	%eax, %ebx
-	/* Add 8 bytes for every 32K input block */
-	shrl	$12, %eax
-	addl	%eax, %ebx
-	/* Add 32K + 18 bytes of extra slack */
-	addl	$(32768 + 18), %ebx
-	/* Align on a 4K boundary */
-	addl	$4095, %ebx
-	andl	$~4095, %ebx
+	/* Target address to relocate to for decompression */
+	addl	$z_extract_offset, %ebx
 
 	/* Set up the stack */
 	leal	boot_stack_end(%ebx), %esp
@@ -142,12 +132,10 @@ relocated:
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	movl	output_len(%ebx), %eax
-	pushl	%eax
+	leal	z_extract_offset_negative(%ebx), %ebp
 				/* push arguments for decompress_kernel: */
 	pushl	%ebp		/* output address */
-	movl	input_len(%ebx), %eax
-	pushl	%eax		/* input_len */
+	pushl	$z_input_len	/* input_len */
 	leal	input_data(%ebx), %eax
 	pushl	%eax		/* input_data */
 	leal	boot_heap(%ebx), %eax
@@ -155,14 +143,12 @@ relocated:
 	pushl	%esi		/* real mode pointer */
 	call	decompress_kernel
 	addl	$20, %esp
-	popl	%ecx
 
 #if CONFIG_RELOCATABLE
 /*
  * Find the address of the relocations.
  */
-	movl	%ebp, %edi
-	addl	%ecx, %edi
+	leal	z_output_len(%ebp), %edi
 
 /*
  * Calculate the delta between where vmlinux was compiled to run
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 723c72dfd7bc5..2b9f2510507b4 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -90,16 +90,8 @@ ENTRY(startup_32)
 	movl	$CONFIG_PHYSICAL_START, %ebx
 #endif
 
-	/* Replace the compressed data size with the uncompressed size */
-	subl	input_len(%ebp), %ebx
-	movl	output_len(%ebp), %eax
-	addl	%eax, %ebx
-	/* Add 8 bytes for every 32K input block */
-	shrl	$12, %eax
-	addl	%eax, %ebx
-	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-	addl	$(32768 + 18 + 4095), %ebx
-	andl	$~4095, %ebx
+	/* Target address to relocate to for decompression */
+	addl	$z_extract_offset, %ebx
 
 /*
  * Prepare for entering 64 bit mode
@@ -224,6 +216,9 @@ ENTRY(startup_64)
 	 * If it is a relocatable kernel then decompress and run the kernel
 	 * from load address aligned to 2MB addr, otherwise decompress and
 	 * run the kernel from CONFIG_PHYSICAL_START
+	 *
+	 * We cannot rely on the calculation done in 32-bit mode, since we
+	 * may have been invoked via the 64-bit entry point.
 	 */
 
 	/* Start with the delta to where the kernel will run at. */
@@ -237,17 +232,8 @@ ENTRY(startup_64)
 	movq	%rbp, %rbx
 #endif
 
-	/* Replace the compressed data size with the uncompressed size */
-	movl	input_len(%rip), %eax
-	subq	%rax, %rbx
-	movl	output_len(%rip), %eax
-	addq	%rax, %rbx
-	/* Add 8 bytes for every 32K input block */
-	shrq	$12, %rax
-	addq	%rax, %rbx
-	/* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-	addq	$(32768 + 18 + 4095), %rbx
-	andq	$~4095, %rbx
+	/* Target address to relocate to for decompression */
+	leaq	z_extract_offset(%rbp), %rbx
 
 	/* Set up the stack */
 	leaq	boot_stack_end(%rbx), %rsp
@@ -292,13 +278,12 @@ relocated:
 /*
  * Do the decompression, and jump to the new kernel..
  */
-	pushq	%rsi			# Save the real mode argument
-	movq	%rsi, %rdi		# real mode address
-	leaq	boot_heap(%rip), %rsi	# malloc area for uncompression
-	leaq	input_data(%rip), %rdx  # input_data
-	movl	input_len(%rip), %eax
-	movq	%rax, %rcx		# input_len
-	movq	%rbp, %r8		# output
+	pushq	%rsi			/* Save the real mode argument */
+	movq	%rsi, %rdi		/* real mode address */
+	leaq	boot_heap(%rip), %rsi	/* malloc area for uncompression */
+	leaq	input_data(%rip), %rdx  /* input_data */
+	movl	$z_input_len, %ecx	/* input_len */
+	movq	%rbp, %r8		/* output target address */
 	call	decompress_kernel
 	popq	%rsi
 
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644
index 0000000000000..bcbd36c414321
--- /dev/null
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *  Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License version
+ *  2 as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ *  02110-1301, USA.
+ *
+ *  H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Compute the desired load offset from a compressed program; outputs
+ * a small assembly wrapper with the appropriate symbols defined.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static uint32_t getle32(const void *p)
+{
+	const uint8_t *cp = p;
+
+	return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +
+		((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
+}
+
+int main(int argc, char *argv[])
+{
+	uint32_t olen;
+	long ilen;
+	unsigned long offs;
+	FILE *f;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+		return 1;
+	}
+
+	/* Get the information for the compressed kernel image first */
+
+	f = fopen(argv[1], "r");
+	if (!f) {
+		perror(argv[1]);
+		return 1;
+	}
+
+
+	if (fseek(f, -4L, SEEK_END)) {
+		perror(argv[1]);
+	}
+	fread(&olen, sizeof olen, 1, f);
+	ilen = ftell(f);
+	olen = getle32(&olen);
+	fclose(f);
+
+	/*
+	 * Now we have the input (compressed) and output (uncompressed)
+	 * sizes, compute the necessary decompression offset...
+	 */
+
+	offs = (olen > ilen) ? olen - ilen : 0;
+	offs += olen >> 12;	/* Add 8 bytes for each 32K block */
+	offs += 32*1024 + 18;	/* Add 32K + 18 bytes slack */
+	offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+
+	printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+	printf(".globl z_input_len\n");
+	printf("z_input_len = %lu\n", ilen);
+	printf(".globl z_output_len\n");
+	printf("z_output_len = %lu\n", (unsigned long)olen);
+	printf(".globl z_extract_offset\n");
+	printf("z_extract_offset = 0x%lx\n", offs);
+	/* z_extract_offset_negative allows simplification of head_32.S */
+	printf(".globl z_extract_offset_negative\n");
+	printf("z_extract_offset_negative = -0x%lx\n", offs);
+
+	printf(".globl input_data, input_data_end\n");
+	printf("input_data:\n");
+	printf(".incbin \"%s\"\n", argv[1]);
+	printf("input_data_end:\n");
+
+	return 0;
+}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644
index f02382ae5c48b..0000000000000
--- a/arch/x86/boot/compressed/vmlinux.scr
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
-  .rodata.compressed : {
-	input_len = .;
-	LONG(input_data_end - input_data) input_data = .;
-	*(.data)
-	output_len = . - 4;
-	input_data_end = .;
-	}
-}
-- 
GitLab


From 778dedae0cb76a441145f3a0c5d59fcb3ba296d5 Mon Sep 17 00:00:00 2001
From: Huang Weiyi <weiyi.huang@gmail.com>
Date: Sat, 9 May 2009 12:54:34 +0800
Subject: [PATCH 0952/2198] x86: mce: remove duplicated #include

Remove duplicated #include in arch/x86/kernel/cpu/mcheck/mce_intel_64.c.

[ Impact: cleanup ]

Signed-off-by: Huang Weiyi <weiyi.huang@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
index d6b72df89d697..aedf9766ee952 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c
@@ -15,7 +15,6 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
-#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
-- 
GitLab


From b30505c81a9d4adea8b70ecff512b0216929b797 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 7 May 2009 15:40:14 -0700
Subject: [PATCH 0953/2198] futex: add requeue-pi documentation

Add Documentation/futex-requeue-pi.txt describing the motivation for the
newly added FUTEX_*REQUEUE_PI op codes and their implementation.

[ Impact: add documentation ]

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Sripathi Kodi <sripathik@in.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Jakub Jelinek <jakub@redhat.com>
LKML-Reference: <4A03634E.3080609@us.ibm.com>
[ reformatted the file ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/futex-requeue-pi.txt | 131 +++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 Documentation/futex-requeue-pi.txt

diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt
new file mode 100644
index 0000000000000..9dc1ff4fd5369
--- /dev/null
+++ b/Documentation/futex-requeue-pi.txt
@@ -0,0 +1,131 @@
+Futex Requeue PI
+----------------
+
+Requeueing of tasks from a non-PI futex to a PI futex requires
+special handling in order to ensure the underlying rt_mutex is never
+left without an owner if it has waiters; doing so would break the PI
+boosting logic [see rt-mutex-desgin.txt] For the purposes of
+brevity, this action will be referred to as "requeue_pi" throughout
+this document.  Priority inheritance is abbreviated throughout as
+"PI".
+
+Motivation
+----------
+
+Without requeue_pi, the glibc implementation of
+pthread_cond_broadcast() must resort to waking all the tasks waiting
+on a pthread_condvar and letting them try to sort out which task
+gets to run first in classic thundering-herd formation.  An ideal
+implementation would wake the highest-priority waiter, and leave the
+rest to the natural wakeup inherent in unlocking the mutex
+associated with the condvar.
+
+Consider the simplified glibc calls:
+
+/* caller must lock mutex */
+pthread_cond_wait(cond, mutex)
+{
+	lock(cond->__data.__lock);
+	unlock(mutex);
+	do {
+	   unlock(cond->__data.__lock);
+	   futex_wait(cond->__data.__futex);
+	   lock(cond->__data.__lock);
+	} while(...)
+	unlock(cond->__data.__lock);
+	lock(mutex);
+}
+
+pthread_cond_broadcast(cond)
+{
+	lock(cond->__data.__lock);
+	unlock(cond->__data.__lock);
+	futex_requeue(cond->data.__futex, cond->mutex);
+}
+
+Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
+has waiters. Note that pthread_cond_wait() attempts to lock the
+mutex only after it has returned to user space.  This will leave the
+underlying rt_mutex with waiters, and no owner, breaking the
+previously mentioned PI-boosting algorithms.
+
+In order to support PI-aware pthread_condvar's, the kernel needs to
+be able to requeue tasks to PI futexes.  This support implies that
+upon a successful futex_wait system call, the caller would return to
+user space already holding the PI futex.  The glibc implementation
+would be modified as follows:
+
+
+/* caller must lock mutex */
+pthread_cond_wait_pi(cond, mutex)
+{
+	lock(cond->__data.__lock);
+	unlock(mutex);
+	do {
+	   unlock(cond->__data.__lock);
+	   futex_wait_requeue_pi(cond->__data.__futex);
+	   lock(cond->__data.__lock);
+	} while(...)
+	unlock(cond->__data.__lock);
+        /* the kernel acquired the the mutex for us */
+}
+
+pthread_cond_broadcast_pi(cond)
+{
+	lock(cond->__data.__lock);
+	unlock(cond->__data.__lock);
+	futex_requeue_pi(cond->data.__futex, cond->mutex);
+}
+
+The actual glibc implementation will likely test for PI and make the
+necessary changes inside the existing calls rather than creating new
+calls for the PI cases.  Similar changes are needed for
+pthread_cond_timedwait() and pthread_cond_signal().
+
+Implementation
+--------------
+
+In order to ensure the rt_mutex has an owner if it has waiters, it
+is necessary for both the requeue code, as well as the waiting code,
+to be able to acquire the rt_mutex before returning to user space.
+The requeue code cannot simply wake the waiter and leave it to
+acquire the rt_mutex as it would open a race window between the
+requeue call returning to user space and the waiter waking and
+starting to run.  This is especially true in the uncontended case.
+
+The solution involves two new rt_mutex helper routines,
+rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
+allow the requeue code to acquire an uncontended rt_mutex on behalf
+of the waiter and to enqueue the waiter on a contended rt_mutex.
+Two new system calls provide the kernel<->user interface to
+requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI.
+
+FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
+and pthread_cond_timedwait()) to block on the initial futex and wait
+to be requeued to a PI-aware futex.  The implementation is the
+result of a high-speed collision between futex_wait() and
+futex_lock_pi(), with some extra logic to check for the additional
+wake-up scenarios.
+
+FUTEX_REQUEUE_CMP_PI is called by the waker
+(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
+possibly wake the waiting tasks. Internally, this system call is
+still handled by futex_requeue (by passing requeue_pi=1).  Before
+requeueing, futex_requeue() attempts to acquire the requeue target
+PI futex on behalf of the top waiter.  If it can, this waiter is
+woken.  futex_requeue() then proceeds to requeue the remaining
+nr_wake+nr_requeue tasks to the PI futex, calling
+rt_mutex_start_proxy_lock() prior to each requeue to prepare the
+task as a waiter on the underlying rt_mutex.  It is possible that
+the lock can be acquired at this stage as well, if so, the next
+waiter is woken to finish the acquisition of the lock.
+
+FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
+their sum is all that really matters.  futex_requeue() will wake or
+requeue up to nr_wake + nr_requeue tasks.  It will wake only as many
+tasks as it can acquire the lock for, which in the majority of cases
+should be 0 as good programming practice dictates that the caller of
+either pthread_cond_broadcast() or pthread_cond_signal() acquire the
+mutex prior to making the call. FUTEX_REQUEUE_PI requires that
+nr_wake=1.  nr_requeue should be INT_MAX for broadcast and 0 for
+signal.
-- 
GitLab


From e9e8b1fb995543c4cc3930f465923a552b2e48b7 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 14:31:37 +0900
Subject: [PATCH 0954/2198] sh: Fix up the sh64 earlyprintk build.

sci_rxd_in() on SH-5 wants to be using SCSPTR, not SCSPTR2.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h
index e3eae4eaaddf8..38072c15b845d 100644
--- a/drivers/serial/sh-sci.h
+++ b/drivers/serial/sh-sci.h
@@ -636,7 +636,7 @@ static inline int sci_rxd_in(struct uart_port *port)
 #elif defined(CONFIG_CPU_SUBTYPE_SH5_101) || defined(CONFIG_CPU_SUBTYPE_SH5_103)
 static inline int sci_rxd_in(struct uart_port *port)
 {
-         return sci_in(port, SCSPTR2)&0x0001 ? 1 : 0; /* SCIF */
+         return sci_in(port, SCSPTR)&0x0001 ? 1 : 0; /* SCIF */
 }
 #elif defined(__H8300H__) || defined(__H8300S__)
 static inline int sci_rxd_in(struct uart_port *port)
-- 
GitLab


From 2fedaacdc07e053d93e0607047a96d282f62aca2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 14:38:49 +0900
Subject: [PATCH 0955/2198] sh: Cleanup irqflags size mismatch on SH-5 build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/mm/cache-sh5.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c
index 9e277ec7d5360..86762092508c4 100644
--- a/arch/sh/mm/cache-sh5.c
+++ b/arch/sh/mm/cache-sh5.c
@@ -60,7 +60,7 @@ static inline void sh64_teardown_dtlb_cache_slot(void)
 static inline void sh64_icache_inv_all(void)
 {
 	unsigned long long addr, flag, data;
-	unsigned int flags;
+	unsigned long flags;
 
 	addr = ICCR0;
 	flag = ICCR0_ICI;
@@ -172,7 +172,7 @@ static void sh64_icache_inv_user_page_range(struct mm_struct *mm,
 		unsigned long eaddr;
 		unsigned long after_last_page_start;
 		unsigned long mm_asid, current_asid;
-		unsigned long long flags = 0ULL;
+		unsigned long flags = 0;
 
 		mm_asid = cpu_asid(smp_processor_id(), mm);
 		current_asid = get_asid();
@@ -236,7 +236,7 @@ static void sh64_icache_inv_user_small_range(struct mm_struct *mm,
 	unsigned long long eaddr = start;
 	unsigned long long eaddr_end = start + len;
 	unsigned long current_asid, mm_asid;
-	unsigned long long flags;
+	unsigned long flags;
 	unsigned long long epage_start;
 
 	/*
@@ -342,7 +342,7 @@ static void inline sh64_dcache_purge_sets(int sets_to_purge_base, int n_sets)
 			 * alloco is a NOP if the cache is write-through.
 			 */
 			if (test_bit(SH_CACHE_MODE_WT, &(cpu_data->dcache.flags)))
-				ctrl_inb(eaddr);
+				__raw_readb((unsigned long)eaddr);
 		}
 	}
 
-- 
GitLab


From 6dbe47a170f80159f23c856ad4e02f2685c6f6cb Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 14:44:30 +0900
Subject: [PATCH 0956/2198] sh: Provide __read_{read,write}sl() definitions for
 sh64.

These are presently only defined for sh32, use the plain unoptimized
versions for sh64. Fixes up smsc911x build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/io.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h
index c7c360b586671..25348141674ba 100644
--- a/arch/sh/include/asm/io.h
+++ b/arch/sh/include/asm/io.h
@@ -123,10 +123,15 @@ static inline void __raw_reads##bwlq(volatile void __iomem *mem,	\
 
 __BUILD_MEMORY_STRING(b, u8)
 __BUILD_MEMORY_STRING(w, u16)
-__BUILD_MEMORY_STRING(q, u64)
 
+#ifdef CONFIG_SUPERH32
 void __raw_writesl(void __iomem *addr, const void *data, int longlen);
 void __raw_readsl(const void __iomem *addr, void *data, int longlen);
+#else
+__BUILD_MEMORY_STRING(l, u32)
+#endif
+
+__BUILD_MEMORY_STRING(q, u64)
 
 #define writesb			__raw_writesb
 #define writesw			__raw_writesw
-- 
GitLab


From 2bcfffa42309b6f73042c62459bf5207762a271d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 16:02:08 +0900
Subject: [PATCH 0957/2198] sh: Rename opcode_t to insn_size_t.

This is now clashing with a driver, so just rename it.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/kprobes.h   |  2 +-
 arch/sh/include/asm/system_32.h |  2 +-
 arch/sh/include/asm/types.h     |  4 ++--
 arch/sh/kernel/io_trapped.c     |  2 +-
 arch/sh/kernel/kgdb.c           |  4 ++--
 arch/sh/kernel/traps.c          |  6 +++---
 arch/sh/kernel/traps_32.c       | 10 +++++-----
 7 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h
index 613644a758e8e..036c3311233cd 100644
--- a/arch/sh/include/asm/kprobes.h
+++ b/arch/sh/include/asm/kprobes.h
@@ -6,7 +6,7 @@
 #include <linux/types.h>
 #include <linux/ptrace.h>
 
-typedef u16 kprobe_opcode_t;
+typedef insn_size_t kprobe_opcode_t;
 #define BREAKPOINT_INSTRUCTION	0xc33a
 
 #define MAX_INSN_SIZE 16
diff --git a/arch/sh/include/asm/system_32.h b/arch/sh/include/asm/system_32.h
index 240b31e1142c0..6c68a51f1cc58 100644
--- a/arch/sh/include/asm/system_32.h
+++ b/arch/sh/include/asm/system_32.h
@@ -198,7 +198,7 @@ do {							\
 })
 #endif
 
-int handle_unaligned_access(opcode_t instruction, struct pt_regs *regs,
+int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs,
 			    struct mem_access *ma);
 
 asmlinkage void do_address_error(struct pt_regs *regs,
diff --git a/arch/sh/include/asm/types.h b/arch/sh/include/asm/types.h
index beea4e6f8dfd9..b13caca62a768 100644
--- a/arch/sh/include/asm/types.h
+++ b/arch/sh/include/asm/types.h
@@ -23,9 +23,9 @@ typedef unsigned short umode_t;
 typedef u32 dma_addr_t;
 
 #ifdef CONFIG_SUPERH32
-typedef u16 opcode_t;
+typedef u16 insn_size_t;
 #else
-typedef u32 opcode_t;
+typedef u32 insn_size_t;
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c
index c22853b059efd..77dfecb643739 100644
--- a/arch/sh/kernel/io_trapped.c
+++ b/arch/sh/kernel/io_trapped.c
@@ -267,7 +267,7 @@ static struct mem_access trapped_io_access = {
 int handle_trapped_io(struct pt_regs *regs, unsigned long address)
 {
 	mm_segment_t oldfs;
-	opcode_t instruction;
+	insn_size_t instruction;
 	int tmp;
 
 	if (!lookup_tiop(address))
diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c
index 7c747e7d71b86..305aad742aec8 100644
--- a/arch/sh/kernel/kgdb.c
+++ b/arch/sh/kernel/kgdb.c
@@ -47,7 +47,7 @@ char in_nmi = 0;	/* Set during NMI to prevent re-entry */
 /* Calculate the new address for after a step */
 static short *get_step_address(struct pt_regs *linux_regs)
 {
-	opcode_t op = __raw_readw(linux_regs->pc);
+	insn_size_t op = __raw_readw(linux_regs->pc);
 	long addr;
 
 	/* BT */
@@ -134,7 +134,7 @@ static short *get_step_address(struct pt_regs *linux_regs)
  */
 
 static unsigned long stepped_address;
-static opcode_t stepped_opcode;
+static insn_size_t stepped_opcode;
 
 static void do_single_step(struct pt_regs *linux_regs)
 {
diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c
index 438f1ebcc4533..46348ed07cc35 100644
--- a/arch/sh/kernel/traps.c
+++ b/arch/sh/kernel/traps.c
@@ -22,11 +22,11 @@ static void handle_BUG(struct pt_regs *regs)
 
 int is_valid_bugaddr(unsigned long addr)
 {
-	unsigned short opcode;
+	insn_size_t opcode;
 
 	if (addr < PAGE_OFFSET)
 		return 0;
-	if (probe_kernel_address((u16 *)addr, opcode))
+	if (probe_kernel_address((insn_size_t *)addr, opcode))
 		return 0;
 
 	return opcode == TRAPA_BUG_OPCODE;
@@ -66,7 +66,7 @@ BUILD_TRAP_HANDLER(bug)
 
 #ifdef CONFIG_BUG
 	if (__kernel_text_address(instruction_pointer(regs))) {
-		opcode_t insn = *(opcode_t *)instruction_pointer(regs);
+		insn_size_t insn = *(insn_size_t *)instruction_pointer(regs);
 		if (insn == TRAPA_BUG_OPCODE)
 			handle_BUG(regs);
 	}
diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c
index 67550d88c4e66..2b772776fcda9 100644
--- a/arch/sh/kernel/traps_32.c
+++ b/arch/sh/kernel/traps_32.c
@@ -177,7 +177,7 @@ static struct mem_access user_mem_access = {
  *   (if that instruction is in a branch delay slot)
  * - return 0 if emulation okay, -EFAULT on existential error
  */
-static int handle_unaligned_ins(opcode_t instruction, struct pt_regs *regs,
+static int handle_unaligned_ins(insn_size_t instruction, struct pt_regs *regs,
 				struct mem_access *ma)
 {
 	int ret, index, count;
@@ -322,10 +322,10 @@ static int handle_unaligned_ins(opcode_t instruction, struct pt_regs *regs,
  * - fetches the instruction from PC+2
  */
 static inline int handle_delayslot(struct pt_regs *regs,
-				   opcode_t old_instruction,
+				   insn_size_t old_instruction,
 				   struct mem_access *ma)
 {
-	opcode_t instruction;
+	insn_size_t instruction;
 	void __user *addr = (void __user *)(regs->pc +
 		instruction_size(old_instruction));
 
@@ -365,7 +365,7 @@ static inline int handle_delayslot(struct pt_regs *regs,
 
 static int handle_unaligned_notify_count = 10;
 
-int handle_unaligned_access(opcode_t instruction, struct pt_regs *regs,
+int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs,
 			    struct mem_access *ma)
 {
 	u_int rm;
@@ -523,7 +523,7 @@ asmlinkage void do_address_error(struct pt_regs *regs,
 	unsigned long error_code = 0;
 	mm_segment_t oldfs;
 	siginfo_t info;
-	opcode_t instruction;
+	insn_size_t instruction;
 	int tmp;
 
 	/* Intentional ifdef */
-- 
GitLab


From f15b18d0755b3ee4b29991fc2fde535ee41df53c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 9 May 2009 10:04:22 +0200
Subject: [PATCH 0958/2198] perf_counter tools: remove debug code from
 builtin-stat.c

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index e2fa117eab58e..cf575c305a6c5 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -331,8 +331,6 @@ static void create_perfstat_counter(int counter)
 	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
 	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
-printf("exclude: %d\n", event_mask[counter]);
-
 	if (scale)
 		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 					  PERF_FORMAT_TOTAL_TIME_RUNNING;
-- 
GitLab


From 1031a3a57ed76f5cb8754a89ddb22f3d9ebe8861 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 17:57:21 +0900
Subject: [PATCH 0959/2198] sh: Provide an __sdivsi3_2 export for sh64.

Newer code paths that are heavier in 64-bit math manage to get this
generated by newer compilers, provide a definition and export
accordingly. This is trivially wrapped around the existing __sdivsi3
code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/sh_ksyms_64.c | 2 ++
 arch/sh/lib64/sdivsi3.S      | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c
index 0d74d6b8774e4..8f54ef0cfbcaa 100644
--- a/arch/sh/kernel/sh_ksyms_64.c
+++ b/arch/sh/kernel/sh_ksyms_64.c
@@ -76,5 +76,7 @@ EXPORT_SYMBOL(strcpy);
 #define DECLARE_EXPORT(name) extern void name(void);EXPORT_SYMBOL(name)
 
 DECLARE_EXPORT(__sdivsi3);
+DECLARE_EXPORT(__sdivsi3_1);
+DECLARE_EXPORT(__sdivsi3_2);
 DECLARE_EXPORT(__udivsi3);
 DECLARE_EXPORT(__div_table);
diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S
index 6a800c6a49043..1963bbd42288d 100644
--- a/arch/sh/lib64/sdivsi3.S
+++ b/arch/sh/lib64/sdivsi3.S
@@ -1,4 +1,6 @@
 	.global	__sdivsi3
+	.global	__sdivsi3_1
+	.global	__sdivsi3_2
 	.section	.text..SHmedia32,"ax"
 	.align	2
 
@@ -6,13 +8,15 @@
 	/* clobbered: r1,r18,r19,r20,r21,r25,tr0 */
 	/* result in r0 */
 __sdivsi3:
+__sdivsi3_1:
 	ptb __div_table,tr0
+	gettr tr0,r20
 
+__sdivsi3_2:
 	nsb r5, r1
 	shlld r5, r1, r25    /* normalize; [-2 ..1, 1..2) in s2.62 */
 	shari r25, 58, r21   /* extract 5(6) bit index (s2.4 with hole -1..1) */
 	/* bubble */
-	gettr tr0,r20
 	ldx.ub r20, r21, r19 /* u0.8 */
 	shari r25, 32, r25   /* normalize to s2.30 */
 	shlli r21, 1, r21
-- 
GitLab


From 7cd0378ef4c0d7b05e7dcb1daa115b841ffdc31a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 18:03:37 +0900
Subject: [PATCH 0960/2198] sh: Fix up SHmedia module ELF relocations.

This fixes up the LSB setting for SHmedia branching in updated symbols
when processing module relocations.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c
index c43081039dd57..c19b0f7d2cc13 100644
--- a/arch/sh/kernel/module.c
+++ b/arch/sh/kernel/module.c
@@ -90,7 +90,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs,
 		 * SHmedia, the LSB of the symbol needs to be asserted
 		 * for the CPU to be in SHmedia mode when it starts executing
 		 * the branch target. */
-		relocation |= (sym->st_other & 4);
+		relocation |= !!(sym->st_other & 4);
 #endif
 
 		switch (ELF32_R_TYPE(rel[i].r_info)) {
-- 
GitLab


From c3e2586b794b12ffcdf69b4e547030b51e18e6d9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 23:33:02 +0900
Subject: [PATCH 0961/2198] sh: Integrate sh64 bits in vmlinux_32.lds.S.

This adds all of the requisite bits from vmlinux_64.lds.S in to the _32
variant, resulting in a unified and generic linker script that can be
shared across both.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/vmlinux_32.lds.S | 113 ++++++++++++++++++++------------
 1 file changed, 70 insertions(+), 43 deletions(-)

diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S
index dd9b2ee1312d9..8d97f52ce3228 100644
--- a/arch/sh/kernel/vmlinux_32.lds.S
+++ b/arch/sh/kernel/vmlinux_32.lds.S
@@ -1,17 +1,23 @@
 /*
  * ld script to make SuperH Linux kernel
- * Written by Niibe Yutaka
+ * Written by Niibe Yutaka and Paul Mundt
  */
-#include <asm/thread_info.h>
-#include <asm/cache.h>
-#include <asm-generic/vmlinux.lds.h>
-
+#ifdef CONFIG_SUPERH64
+#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
+OUTPUT_ARCH(sh:sh5)
+#else
+OUTPUT_ARCH(sh)
 #ifdef CONFIG_CPU_LITTLE_ENDIAN
 OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
 #else
 OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
 #endif
-OUTPUT_ARCH(sh)
+#endif
+
+#include <asm/thread_info.h>
+#include <asm/cache.h>
+#include <asm-generic/vmlinux.lds.h>
+
 ENTRY(_start)
 SECTIONS
 {
@@ -24,28 +30,35 @@ SECTIONS
 	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
 #endif
 
-	_text = .;			/* Text and read-only data */
-
-	.empty_zero_page : {
+	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
 		*(.empty_zero_page)
 	} = 0
 
-	.text : {
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;		/* Text and read-only data */
 		HEAD_TEXT
 		TEXT_TEXT
+
+#ifdef CONFIG_SUPERH64
+		*(.text64)
+		*(.text..SHmedia32)
+#endif
+
 		SCHED_TEXT
 		LOCK_TEXT
 		KPROBES_TEXT
+		IRQENTRY_TEXT
 		*(.fixup)
 		*(.gnu.warning)
+		_etext = .;		/* End of text section */
 	} = 0x0009
 
 	. = ALIGN(16);		/* Exception table */
-	__start___ex_table = .;
-	__ex_table : { *(__ex_table) }
-	__stop___ex_table = .;
-
-	_etext = .;			/* End of text section */
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	}
 
 	NOTES
 	RO_DATA(PAGE_SIZE)
@@ -54,13 +67,15 @@ SECTIONS
 	 * Code which must be executed uncached and the associated data
 	 */
 	. = ALIGN(PAGE_SIZE);
-	__uncached_start = .;
-	.uncached.text : { *(.uncached.text) }
-	.uncached.data : { *(.uncached.data) }
-	__uncached_end = .;
+	.uncached : AT(ADDR(.uncached) - LOAD_OFFSET) {
+		__uncached_start = .;
+		*(.uncached.text)
+		*(.uncached.data)
+		__uncached_end = .;
+	}
 
 	. = ALIGN(THREAD_SIZE);
-	.data : {			/* Data */
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {		/* Data */
 		*(.data.init_task)
 
 		. = ALIGN(L1_CACHE_BYTES);
@@ -84,39 +99,51 @@ SECTIONS
 	_edata = .;			/* End of data section */
 
 	. = ALIGN(PAGE_SIZE);		/* Init code and data */
-	__init_begin = .;
-	_sinittext = .;
-	.init.text : { INIT_TEXT }
-	_einittext = .;
-	.init.data : { INIT_DATA }
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .;
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA }
 
 	. = ALIGN(16);
-	__setup_start = .;
-	.init.setup : { *(.init.setup) }
-	__setup_end = .;
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
 
-	__initcall_start = .;
-	.initcall.init : {
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
 		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
 	}
-	__initcall_end = .;
-	__con_initcall_start = .;
-	.con_initcall.init : { *(.con_initcall.init) }
-	__con_initcall_end = .;
 
 	SECURITY_INIT
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	. = ALIGN(PAGE_SIZE);
-	__initramfs_start = .;
-	.init.ramfs : { *(.init.ramfs) }
-	__initramfs_end = .;
+	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
 
 	. = ALIGN(4);
-	__machvec_start = .;
-	.machvec.init : { *(.machvec.init) }
-	__machvec_end = .;
+	.machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) {
+		__machvec_start = .;
+		*(.machvec.init)
+		__machvec_end = .;
+	}
 
 	PERCPU(PAGE_SIZE)
 
@@ -124,11 +151,11 @@ SECTIONS
 	 * .exit.text is discarded at runtime, not link time, to deal with
 	 * references from __bug_table
 	 */
-	.exit.text : { EXIT_TEXT }
-	.exit.data : { EXIT_DATA }
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT }
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA }
 
 	. = ALIGN(PAGE_SIZE);
-	.bss : {
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
 		__init_end = .;
 		__bss_start = .;		/* BSS */
 		*(.bss.page_aligned)
-- 
GitLab


From dce97c8cb2ad365fa87dc6924d99528c21c63912 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sat, 9 May 2009 23:36:10 +0900
Subject: [PATCH 0962/2198] sh: Move the unified linker script in place, kill
 off old _64 one.

Just forcefully rename the _32 variant overtop, and kill off the now
unused _64 version.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/vmlinux.lds.S    | 182 +++++++++++++++++++++++++++++++-
 arch/sh/kernel/vmlinux_32.lds.S | 181 -------------------------------
 arch/sh/kernel/vmlinux_64.lds.S | 163 ----------------------------
 3 files changed, 179 insertions(+), 347 deletions(-)
 delete mode 100644 arch/sh/kernel/vmlinux_32.lds.S
 delete mode 100644 arch/sh/kernel/vmlinux_64.lds.S

diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index d7d4991f32af2..8d97f52ce3228 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -1,5 +1,181 @@
-#ifdef CONFIG_SUPERH32
-# include "vmlinux_32.lds.S"
+/*
+ * ld script to make SuperH Linux kernel
+ * Written by Niibe Yutaka and Paul Mundt
+ */
+#ifdef CONFIG_SUPERH64
+#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
+OUTPUT_ARCH(sh:sh5)
 #else
-# include "vmlinux_64.lds.S"
+OUTPUT_ARCH(sh)
+#ifdef CONFIG_CPU_LITTLE_ENDIAN
+OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
+#else
+OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
+#endif
+#endif
+
+#include <asm/thread_info.h>
+#include <asm/cache.h>
+#include <asm-generic/vmlinux.lds.h>
+
+ENTRY(_start)
+SECTIONS
+{
+#ifdef CONFIG_PMB_FIXED
+	. = CONFIG_PAGE_OFFSET + (CONFIG_MEMORY_START & 0x1fffffff) +
+	    CONFIG_ZERO_PAGE_OFFSET;
+#elif defined(CONFIG_32BIT)
+	. = CONFIG_PAGE_OFFSET + CONFIG_ZERO_PAGE_OFFSET;
+#else
+	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
+#endif
+
+	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
+		*(.empty_zero_page)
+	} = 0
+
+	.text : AT(ADDR(.text) - LOAD_OFFSET) {
+		_text = .;		/* Text and read-only data */
+		HEAD_TEXT
+		TEXT_TEXT
+
+#ifdef CONFIG_SUPERH64
+		*(.text64)
+		*(.text..SHmedia32)
+#endif
+
+		SCHED_TEXT
+		LOCK_TEXT
+		KPROBES_TEXT
+		IRQENTRY_TEXT
+		*(.fixup)
+		*(.gnu.warning)
+		_etext = .;		/* End of text section */
+	} = 0x0009
+
+	. = ALIGN(16);		/* Exception table */
+	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+		__start___ex_table = .;
+		*(__ex_table)
+		__stop___ex_table = .;
+	}
+
+	NOTES
+	RO_DATA(PAGE_SIZE)
+
+	/*
+	 * Code which must be executed uncached and the associated data
+	 */
+	. = ALIGN(PAGE_SIZE);
+	.uncached : AT(ADDR(.uncached) - LOAD_OFFSET) {
+		__uncached_start = .;
+		*(.uncached.text)
+		*(.uncached.data)
+		__uncached_end = .;
+	}
+
+	. = ALIGN(THREAD_SIZE);
+	.data : AT(ADDR(.data) - LOAD_OFFSET) {		/* Data */
+		*(.data.init_task)
+
+		. = ALIGN(L1_CACHE_BYTES);
+		*(.data.cacheline_aligned)
+
+		. = ALIGN(L1_CACHE_BYTES);
+		*(.data.read_mostly)
+
+		. = ALIGN(PAGE_SIZE);
+		*(.data.page_aligned)
+
+		__nosave_begin = .;
+		*(.data.nosave)
+		. = ALIGN(PAGE_SIZE);
+		__nosave_end = .;
+
+		DATA_DATA
+		CONSTRUCTORS
+	}
+
+	_edata = .;			/* End of data section */
+
+	. = ALIGN(PAGE_SIZE);		/* Init code and data */
+	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+		__init_begin = .;
+		_sinittext = .;
+		INIT_TEXT
+		_einittext = .;
+	}
+
+	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA }
+
+	. = ALIGN(16);
+	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+		__setup_start = .;
+		*(.init.setup)
+		__setup_end = .;
+	}
+
+	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+		__initcall_start = .;
+		INITCALLS
+		__initcall_end = .;
+	}
+
+	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+		__con_initcall_start = .;
+		*(.con_initcall.init)
+		__con_initcall_end = .;
+	}
+
+	SECURITY_INIT
+
+#ifdef CONFIG_BLK_DEV_INITRD
+	. = ALIGN(PAGE_SIZE);
+	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
+		__initramfs_start = .;
+		*(.init.ramfs)
+		__initramfs_end = .;
+	}
 #endif
+
+	. = ALIGN(4);
+	.machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) {
+		__machvec_start = .;
+		*(.machvec.init)
+		__machvec_end = .;
+	}
+
+	PERCPU(PAGE_SIZE)
+
+	/*
+	 * .exit.text is discarded at runtime, not link time, to deal with
+	 * references from __bug_table
+	 */
+	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT }
+	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA }
+
+	. = ALIGN(PAGE_SIZE);
+	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+		__init_end = .;
+		__bss_start = .;		/* BSS */
+		*(.bss.page_aligned)
+		*(.bss)
+		*(COMMON)
+		. = ALIGN(4);
+		_ebss = .;			/* uClinux MTD sucks */
+		_end = . ;
+	}
+
+	/*
+	 * When something in the kernel is NOT compiled as a module, the
+	 * module cleanup code and data are put into these segments. Both
+	 * can then be thrown away, as cleanup code is never called unless
+	 * it's a module.
+	 */
+	/DISCARD/ : {
+		*(.exitcall.exit)
+	}
+
+	STABS_DEBUG
+	DWARF_DEBUG
+}
diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S
deleted file mode 100644
index 8d97f52ce3228..0000000000000
--- a/arch/sh/kernel/vmlinux_32.lds.S
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * ld script to make SuperH Linux kernel
- * Written by Niibe Yutaka and Paul Mundt
- */
-#ifdef CONFIG_SUPERH64
-#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
-OUTPUT_ARCH(sh:sh5)
-#else
-OUTPUT_ARCH(sh)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
-#else
-OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
-#endif
-#endif
-
-#include <asm/thread_info.h>
-#include <asm/cache.h>
-#include <asm-generic/vmlinux.lds.h>
-
-ENTRY(_start)
-SECTIONS
-{
-#ifdef CONFIG_PMB_FIXED
-	. = CONFIG_PAGE_OFFSET + (CONFIG_MEMORY_START & 0x1fffffff) +
-	    CONFIG_ZERO_PAGE_OFFSET;
-#elif defined(CONFIG_32BIT)
-	. = CONFIG_PAGE_OFFSET + CONFIG_ZERO_PAGE_OFFSET;
-#else
-	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
-#endif
-
-	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
-		*(.empty_zero_page)
-	} = 0
-
-	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		_text = .;		/* Text and read-only data */
-		HEAD_TEXT
-		TEXT_TEXT
-
-#ifdef CONFIG_SUPERH64
-		*(.text64)
-		*(.text..SHmedia32)
-#endif
-
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		IRQENTRY_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-		_etext = .;		/* End of text section */
-	} = 0x0009
-
-	. = ALIGN(16);		/* Exception table */
-	__ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-		__start___ex_table = .;
-		*(__ex_table)
-		__stop___ex_table = .;
-	}
-
-	NOTES
-	RO_DATA(PAGE_SIZE)
-
-	/*
-	 * Code which must be executed uncached and the associated data
-	 */
-	. = ALIGN(PAGE_SIZE);
-	.uncached : AT(ADDR(.uncached) - LOAD_OFFSET) {
-		__uncached_start = .;
-		*(.uncached.text)
-		*(.uncached.data)
-		__uncached_end = .;
-	}
-
-	. = ALIGN(THREAD_SIZE);
-	.data : AT(ADDR(.data) - LOAD_OFFSET) {		/* Data */
-		*(.data.init_task)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.cacheline_aligned)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.read_mostly)
-
-		. = ALIGN(PAGE_SIZE);
-		*(.data.page_aligned)
-
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-
-		DATA_DATA
-		CONSTRUCTORS
-	}
-
-	_edata = .;			/* End of data section */
-
-	. = ALIGN(PAGE_SIZE);		/* Init code and data */
-	.init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-		__init_begin = .;
-		_sinittext = .;
-		INIT_TEXT
-		_einittext = .;
-	}
-
-	.init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA }
-
-	. = ALIGN(16);
-	.init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-		__setup_start = .;
-		*(.init.setup)
-		__setup_end = .;
-	}
-
-	.initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-		__initcall_start = .;
-		INITCALLS
-		__initcall_end = .;
-	}
-
-	.con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-		__con_initcall_start = .;
-		*(.con_initcall.init)
-		__con_initcall_end = .;
-	}
-
-	SECURITY_INIT
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
-		__initramfs_start = .;
-		*(.init.ramfs)
-		__initramfs_end = .;
-	}
-#endif
-
-	. = ALIGN(4);
-	.machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) {
-		__machvec_start = .;
-		*(.machvec.init)
-		__machvec_end = .;
-	}
-
-	PERCPU(PAGE_SIZE)
-
-	/*
-	 * .exit.text is discarded at runtime, not link time, to deal with
-	 * references from __bug_table
-	 */
-	.exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT }
-	.exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA }
-
-	. = ALIGN(PAGE_SIZE);
-	.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-		__init_end = .;
-		__bss_start = .;		/* BSS */
-		*(.bss.page_aligned)
-		*(.bss)
-		*(COMMON)
-		. = ALIGN(4);
-		_ebss = .;			/* uClinux MTD sucks */
-		_end = . ;
-	}
-
-	/*
-	 * When something in the kernel is NOT compiled as a module, the
-	 * module cleanup code and data are put into these segments. Both
-	 * can then be thrown away, as cleanup code is never called unless
-	 * it's a module.
-	 */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S
deleted file mode 100644
index 69664460c6886..0000000000000
--- a/arch/sh/kernel/vmlinux_64.lds.S
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * ld script to make SH64 Linux kernel
- *
- * Copyright (C) 2000, 2001  Paolo Alberelli
- *
- * benedict.gaster@superh.com:	 2nd May 2002
- *    Add definition of empty_zero_page to be the first page of kernel image.
- *
- * benedict.gaster@superh.com:	 3rd May 2002
- *    Added support for ramdisk, removing statically linked romfs at the
- *    same time.
- *
- * lethal@linux-sh.org:          9th May 2003
- *    Kill off GLOBAL_NAME() usage and other CDC-isms.
- *
- * lethal@linux-sh.org:         19th May 2003
- *    Remove support for ancient toolchains.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <asm/page.h>
-#include <asm/cache.h>
-#include <asm/thread_info.h>
-
-#define LOAD_OFFSET	CONFIG_PAGE_OFFSET
-#include <asm-generic/vmlinux.lds.h>
-
-OUTPUT_ARCH(sh:sh5)
-
-#define C_PHYS(x) AT (ADDR(x) - LOAD_OFFSET)
-
-ENTRY(__start)
-SECTIONS
-{
-	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + PAGE_SIZE;
-	_text = .;			/* Text and read-only data */
-
-	.empty_zero_page : C_PHYS(.empty_zero_page) {
-		*(.empty_zero_page)
-	} = 0
-
-	.text : C_PHYS(.text) {
-		HEAD_TEXT
-		TEXT_TEXT
-		*(.text64)
-		*(.text..SHmedia32)
-		SCHED_TEXT
-		LOCK_TEXT
-		KPROBES_TEXT
-		*(.fixup)
-		*(.gnu.warning)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-	} = 0x6ff0fff0
-#else
-	} = 0xf0fff06f
-#endif
-
-	/* We likely want __ex_table to be Cache Line aligned */
-	. = ALIGN(L1_CACHE_BYTES);		/* Exception table */
-	__start___ex_table = .;
-	__ex_table : C_PHYS(__ex_table) { *(__ex_table) }
-	__stop___ex_table = .;
-
-	_etext = .;			/* End of text section */
-
-	NOTES 
-	RO_DATA(PAGE_SIZE)
-
-	. = ALIGN(THREAD_SIZE);
-	.data : C_PHYS(.data) {			/* Data */
-		*(.data.init_task)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.cacheline_aligned)
-
-		. = ALIGN(L1_CACHE_BYTES);
-		*(.data.read_mostly)
-
-		. = ALIGN(PAGE_SIZE);
-		*(.data.page_aligned)
-
-		__nosave_begin = .;
-		*(.data.nosave)
-		. = ALIGN(PAGE_SIZE);
-		__nosave_end = .;
-
-		DATA_DATA
-		CONSTRUCTORS
-	}
-
-	_edata = .;			/* End of data section */
-
-	. = ALIGN(PAGE_SIZE);		/* Init code and data */
-	__init_begin = .;
-	_sinittext = .;
-	.init.text : C_PHYS(.init.text) { INIT_TEXT }
-	_einittext = .;
-	.init.data : C_PHYS(.init.data) { INIT_DATA }
-	. = ALIGN(L1_CACHE_BYTES);	/* Better if Cache Line aligned */
-	__setup_start = .;
-	.init.setup : C_PHYS(.init.setup) { *(.init.setup) }
-	__setup_end = .;
-	__initcall_start = .;
-	.initcall.init : C_PHYS(.initcall.init) {
-		INITCALLS
-	}
-	__initcall_end = .;
-	__con_initcall_start = .;
-	.con_initcall.init : C_PHYS(.con_initcall.init) {
-		*(.con_initcall.init)
-	}
-	__con_initcall_end = .;
-
-	SECURITY_INIT
-
-#ifdef CONFIG_BLK_DEV_INITRD
-	. = ALIGN(PAGE_SIZE);
-	__initramfs_start = .;
-	.init.ramfs : C_PHYS(.init.ramfs) { *(.init.ramfs) }
-	__initramfs_end = .;
-#endif
-
-	. = ALIGN(8);
-	__machvec_start = .;
-	.machvec.init : C_PHYS(.machvec.init) { *(.machvec.init) }
-	__machvec_end = .;
-
-	PERCPU(PAGE_SIZE)
-
-	/*
-	 * .exit.text is discarded at runtime, not link time, to deal with
-	 * references from __bug_table
-	 */
-	.exit.text : C_PHYS(.exit.text) { EXIT_TEXT }
-	.exit.data : C_PHYS(.exit.data) { EXIT_DATA }
-
-	. = ALIGN(PAGE_SIZE);
-	.bss : C_PHYS(.bss) {
-		__init_end = .;
-		__bss_start = .;		/* BSS */
-		*(.bss.page_aligned)
-		*(.bss)
-		*(COMMON)
-		. = ALIGN(4);
-		_ebss = .;			/* uClinux MTD sucks */
-		_end = . ;
-	}
-
-	/*
-	 * When something in the kernel is NOT compiled as a module, the
-	 * module cleanup code and data are put into these segments. Both
-	 * can then be thrown away, as cleanup code is never called unless
-	 * it's a module.
-	 */
-	/DISCARD/ : {
-		*(.exitcall.exit)
-	}
-
-	STABS_DEBUG
-	DWARF_DEBUG
-}
-- 
GitLab


From 7b022d07a0fd2ce02d4456b732c674ff1d16f5ce Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:25:08 +0900
Subject: [PATCH 0963/2198] sh: Tidy up the ldscript output format specifier.

Tie this in to the Makefile directly, where we already know what we are
running on. This tidies up the linker script a bit, and is prep work for
unifying the arch/sh/boot/compressed linker scripts.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Makefile                    | 8 ++++++--
 arch/sh/boot/compressed/Makefile_32 | 3 ++-
 arch/sh/boot/compressed/Makefile_64 | 4 ++--
 arch/sh/kernel/vmlinux.lds.S        | 6 +-----
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 68348e7002882..b941dc9b20f41 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -100,13 +100,17 @@ LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
 endif
 
 ifdef CONFIG_CPU_LITTLE_ENDIAN
-LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64'
+ld-bfd			:= elf32-$(UTS_MACHINE)-linux
+LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64' --oformat $(ld-bfd)
 LDFLAGS			+= -EL
 else
-LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64+4'
+ld-bfd			:= elf32-$(UTS_MACHINE)big-linux
+LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64+4' --oformat $(ld-bfd)
 LDFLAGS			+= -EB
 endif
 
+export ld-bfd
+
 head-y			:= arch/sh/kernel/init_task.o
 head-$(CONFIG_SUPERH32)	+= arch/sh/kernel/head_32.o
 head-$(CONFIG_SUPERH64)	+= arch/sh/kernel/head_64.o
diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32
index b96a055b053eb..249255729d7c0 100644
--- a/arch/sh/boot/compressed/Makefile_32
+++ b/arch/sh/boot/compressed/Makefile_32
@@ -28,7 +28,8 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
 endif
 
-LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup -T $(obj)/../../kernel/vmlinux.lds
+LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
+		   -T $(obj)/../../kernel/vmlinux.lds
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
 	$(call if_changed,ld)
diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64
index 658d4f9155567..541a529d066c7 100644
--- a/arch/sh/boot/compressed/Makefile_64
+++ b/arch/sh/boot/compressed/Makefile_64
@@ -24,8 +24,8 @@ OBJECTS		:= $(obj)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o \
 ZIMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_PAGE_OFFSET)+0x400000+0x10000]')
 
-LDFLAGS_vmlinux := -Ttext $(ZIMAGE_OFFSET) -e startup \
-		    -T $(obj)/../../kernel/vmlinux.lds
+LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(ZIMAGE_OFFSET) -e startup \
+		   -T $(obj)/../../kernel/vmlinux.lds
 
 $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o FORCE
 	$(call if_changed,ld)
diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 8d97f52ce3228..1a2c8db5cb2a8 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -6,12 +6,8 @@
 #define LOAD_OFFSET	CONFIG_PAGE_OFFSET
 OUTPUT_ARCH(sh:sh5)
 #else
+#define LOAD_OFFSET	0
 OUTPUT_ARCH(sh)
-#ifdef CONFIG_CPU_LITTLE_ENDIAN
-OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux")
-#else
-OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux")
-#endif
 #endif
 
 #include <asm/thread_info.h>
-- 
GitLab


From 20b27fa33743c6ef77a1248421fab51e8bf21a25 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:36:22 +0900
Subject: [PATCH 0964/2198] sh: Fix up the sh64 zImage build.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boot/compressed/Makefile_64    |  3 +-
 arch/sh/boot/compressed/head_64.S      |  5 +-
 arch/sh/boot/compressed/vmlinux_64.lds | 64 --------------------------
 3 files changed, 2 insertions(+), 70 deletions(-)
 delete mode 100644 arch/sh/boot/compressed/vmlinux_64.lds

diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64
index 541a529d066c7..7093255d345bd 100644
--- a/arch/sh/boot/compressed/Makefile_64
+++ b/arch/sh/boot/compressed/Makefile_64
@@ -14,8 +14,7 @@
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
 		   head_64.o misc_64.o cache.o piggy.o
 
-OBJECTS		:= $(obj)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o \
-		   $(obj)/cache.o
+OBJECTS		:= $(obj)/head_64.o $(obj)/misc_64.o $(obj)/cache.o
 
 #
 # ZIMAGE_OFFSET is the load offset of the compression loader
diff --git a/arch/sh/boot/compressed/head_64.S b/arch/sh/boot/compressed/head_64.S
index 622eac3cf556c..9993113c67138 100644
--- a/arch/sh/boot/compressed/head_64.S
+++ b/arch/sh/boot/compressed/head_64.S
@@ -14,6 +14,7 @@
  *   Copyright (C) 2002 Stuart Menefy (stuart.menefy@st.com)
  */
 #include <asm/cache.h>
+#include <asm/tlb.h>
 #include <cpu/mmu_context.h>
 #include <cpu/registers.h>
 
@@ -33,11 +34,7 @@
 #define	ICCR0_INIT_VAL	ICCR0_ON | ICCR0_ICI		/* ICE + ICI */
 #define	ICCR1_INIT_VAL	ICCR1_NOLOCK			/* No locking */
 
-#if 1
 #define	OCCR0_INIT_VAL	OCCR0_ON | OCCR0_OCI | OCCR0_WB	/* OCE + OCI + WB */
-#else
-#define	OCCR0_INIT_VAL	OCCR0_OFF
-#endif
 #define	OCCR1_INIT_VAL	OCCR1_NOLOCK			/* No locking */
 
 	.text
diff --git a/arch/sh/boot/compressed/vmlinux_64.lds b/arch/sh/boot/compressed/vmlinux_64.lds
deleted file mode 100644
index 59c2ef4aeda5b..0000000000000
--- a/arch/sh/boot/compressed/vmlinux_64.lds
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * ld script to make compressed SuperH/shmedia Linux kernel+decompression
- *		bootstrap
- * Modified by Stuart Menefy from arch/sh/vmlinux.lds.S written by Niibe Yutaka
- */
-
-
-#ifdef CONFIG_LITTLE_ENDIAN
-/* OUTPUT_FORMAT("elf32-sh64l-linux", "elf32-sh64l-linux", "elf32-sh64l-linux") */
-#define NOP 0x6ff0fff0
-#else
-/* OUTPUT_FORMAT("elf32-sh64", "elf32-sh64", "elf32-sh64") */
-#define NOP 0xf0fff06f
-#endif
-
-OUTPUT_FORMAT("elf32-sh64-linux")
-OUTPUT_ARCH(sh)
-ENTRY(_start)
-
-#define ALIGNED_GAP(section, align) (((ADDR(section)+SIZEOF(section)+(align)-1) & ~((align)-1))-ADDR(section))
-#define FOLLOWING(section, align) AT (LOADADDR(section) + ALIGNED_GAP(section,align))
-
-SECTIONS
-{
-  _text = .;			/* Text and read-only data */
-
-  .text : {
-	*(.text)
-	*(.text64)
-	*(.text..SHmedia32)
-	*(.fixup)
-	*(.gnu.warning)
-	} = NOP
-  . = ALIGN(4);
-  .rodata : { *(.rodata) }
-
-  /* There is no 'real' reason for eight byte alignment, four would work
-   * as well, but gdb downloads much (*4) faster with this.
-   */
-  . = ALIGN(8);
-  .image : { *(.image) }
-  . = ALIGN(4);
-  _etext = .;			/* End of text section */
-
-  .data :			/* Data */
-	FOLLOWING(.image, 4)
-	{
-	_data = .;
-	*(.data)
-	}
-  _data_image = LOADADDR(.data);/* Address of data section in ROM */
-
-  _edata = .;			/* End of data section */
-
-  .stack : { stack = .;  _stack = .; }
-
-  . = ALIGN(4);
-  __bss_start = .;		/* BSS */
-  .bss : {
-	*(.bss)
-	}
-  . = ALIGN(4);
-  _end = . ;
-}
-- 
GitLab


From a2e76c80d93ec3c59a030b6ca37b9087033565c1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:54:39 +0900
Subject: [PATCH 0965/2198] sh: Provide a tighter BOOT_LINK_OFFSET definition
 for the Cayman board.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 6ed16c4548c92..bca6a2a0759c7 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -704,6 +704,7 @@ config ZERO_PAGE_OFFSET
 
 config BOOT_LINK_OFFSET
 	hex "Link address offset for booting"
+	default "0x00400000" if SH_CAYMAN
 	default "0x00800000"
 	help
 	  This option allows you to set the link address offset of the zImage.
-- 
GitLab


From b20883562455060272126c36563a7d8edafc30d3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:55:45 +0900
Subject: [PATCH 0966/2198] sh: Provide a BITS definition, use it in the
 arch/sh/boot/ Makefiles.

This introduces a BITS export that can handily be picked up by Makefiles
for cleaner sharing. Reflect its use in arch/sh/boot/compressed/ in
preparation for unifying the Makefiles.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Makefile                    | 8 ++++----
 arch/sh/boot/Makefile               | 6 +++---
 arch/sh/boot/compressed/Makefile    | 4 ----
 arch/sh/boot/compressed/Makefile_32 | 6 +++---
 4 files changed, 10 insertions(+), 14 deletions(-)

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index b941dc9b20f41..c1bbae1f65e37 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -91,9 +91,11 @@ KBUILD_IMAGE		:= $(defaultimage-y)
 #
 ifdef CONFIG_SUPERH32
 UTS_MACHINE	:= sh
+BITS		:= 32
 LDFLAGS_vmlinux	+= -e _stext
 else
 UTS_MACHINE	:= sh64
+BITS		:= 64
 LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
 		   --defsym phys_stext_shmedia=phys_stext+1 \
 		   -e phys_stext_shmedia
@@ -109,11 +111,9 @@ LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64+4' --oformat $(ld-bfd)
 LDFLAGS			+= -EB
 endif
 
-export ld-bfd
+export ld-bfd BITS
 
-head-y			:= arch/sh/kernel/init_task.o
-head-$(CONFIG_SUPERH32)	+= arch/sh/kernel/head_32.o
-head-$(CONFIG_SUPERH64)	+= arch/sh/kernel/head_64.o
+head-y	:= arch/sh/kernel/init_task.o arch/sh/kernel/head_$(BITS).o
 
 core-y				+= arch/sh/kernel/ arch/sh/mm/ arch/sh/boards/
 core-$(CONFIG_SH_FPU_EMU)	+= arch/sh/math-emu/
diff --git a/arch/sh/boot/Makefile b/arch/sh/boot/Makefile
index 95483d1612588..78efb04c28f3d 100644
--- a/arch/sh/boot/Makefile
+++ b/arch/sh/boot/Makefile
@@ -20,9 +20,6 @@ CONFIG_BOOT_LINK_OFFSET	?= 0x00800000
 CONFIG_ZERO_PAGE_OFFSET	?= 0x00001000
 CONFIG_ENTRY_OFFSET	?= 0x00001000
 
-export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \
-       CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET
-
 targets := zImage vmlinux.srec uImage uImage.srec
 subdir- := compressed
 
@@ -43,6 +40,9 @@ KERNEL_MEMORY := $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_MEMORY_START)]')
 endif
 
+export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \
+       CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET KERNEL_MEMORY
+
 KERNEL_LOAD	:= $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_PAGE_OFFSET)  + \
 			$(KERNEL_MEMORY) + \
diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index efb01dc3c8c3a..f0a71df26096c 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -1,5 +1 @@
-ifeq ($(CONFIG_SUPERH32),y)
 include ${srctree}/arch/sh/boot/compressed/Makefile_32
-else
-include ${srctree}/arch/sh/boot/compressed/Makefile_64
-endif
diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32
index 249255729d7c0..9531bf1b7c2f8 100644
--- a/arch/sh/boot/compressed/Makefile_32
+++ b/arch/sh/boot/compressed/Makefile_32
@@ -5,9 +5,9 @@
 #
 
 targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
-		   head_32.o misc_32.o piggy.o
+		   head_$(BITS).o misc_$(BITS).o piggy.o
 
-OBJECTS = $(obj)/head_32.o $(obj)/misc_32.o
+OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o
 
 ifdef CONFIG_SH_STANDARD_BIOS
 OBJECTS += $(obj)/../../kernel/sh_bios.o
@@ -18,7 +18,7 @@ endif
 #
 IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
 		     $$[$(CONFIG_PAGE_OFFSET)  + \
-			$(CONFIG_MEMORY_START) + \
+			$(KERNEL_MEMORY) + \
 			$(CONFIG_BOOT_LINK_OFFSET)]')
 
 LIBGCC	:= $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
-- 
GitLab


From 1eca133cc9f978a8c44788fc5b2fe54219f9425c Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 00:58:21 +0900
Subject: [PATCH 0967/2198] sh: Merge the split arch/sh/boot/compressed/
 Makefiles.

This kills off the _64 variant and moves the _32 one over as the generic
one to use.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boot/compressed/Makefile    | 48 ++++++++++++++++++++++++++++-
 arch/sh/boot/compressed/Makefile_32 | 47 ----------------------------
 arch/sh/boot/compressed/Makefile_64 | 42 -------------------------
 3 files changed, 47 insertions(+), 90 deletions(-)
 delete mode 100644 arch/sh/boot/compressed/Makefile_32
 delete mode 100644 arch/sh/boot/compressed/Makefile_64

diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile
index f0a71df26096c..9531bf1b7c2f8 100644
--- a/arch/sh/boot/compressed/Makefile
+++ b/arch/sh/boot/compressed/Makefile
@@ -1 +1,47 @@
-include ${srctree}/arch/sh/boot/compressed/Makefile_32
+#
+# linux/arch/sh/boot/compressed/Makefile
+#
+# create a compressed vmlinux image from the original vmlinux
+#
+
+targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
+		   head_$(BITS).o misc_$(BITS).o piggy.o
+
+OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o
+
+ifdef CONFIG_SH_STANDARD_BIOS
+OBJECTS += $(obj)/../../kernel/sh_bios.o
+endif
+
+#
+# IMAGE_OFFSET is the load offset of the compression loader
+#
+IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
+		     $$[$(CONFIG_PAGE_OFFSET)  + \
+			$(KERNEL_MEMORY) + \
+			$(CONFIG_BOOT_LINK_OFFSET)]')
+
+LIBGCC	:= $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
+
+ifeq ($(CONFIG_FUNCTION_TRACER),y)
+ORIG_CFLAGS := $(KBUILD_CFLAGS)
+KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
+endif
+
+LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
+		   -T $(obj)/../../kernel/vmlinux.lds
+
+$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
+	$(call if_changed,ld)
+	@:
+
+$(obj)/vmlinux.bin: vmlinux FORCE
+	$(call if_changed,objcopy)
+
+$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+	$(call if_changed,gzip)
+
+OBJCOPYFLAGS += -R .empty_zero_page
+
+$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE
+	$(call if_changed,as_o_S)
diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32
deleted file mode 100644
index 9531bf1b7c2f8..0000000000000
--- a/arch/sh/boot/compressed/Makefile_32
+++ /dev/null
@@ -1,47 +0,0 @@
-#
-# linux/arch/sh/boot/compressed/Makefile
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-
-targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
-		   head_$(BITS).o misc_$(BITS).o piggy.o
-
-OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o
-
-ifdef CONFIG_SH_STANDARD_BIOS
-OBJECTS += $(obj)/../../kernel/sh_bios.o
-endif
-
-#
-# IMAGE_OFFSET is the load offset of the compression loader
-#
-IMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
-		     $$[$(CONFIG_PAGE_OFFSET)  + \
-			$(KERNEL_MEMORY) + \
-			$(CONFIG_BOOT_LINK_OFFSET)]')
-
-LIBGCC	:= $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name)
-
-ifeq ($(CONFIG_FUNCTION_TRACER),y)
-ORIG_CFLAGS := $(KBUILD_CFLAGS)
-KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS))
-endif
-
-LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \
-		   -T $(obj)/../../kernel/vmlinux.lds
-
-$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE
-	$(call if_changed,ld)
-	@:
-
-$(obj)/vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,gzip)
-
-OBJCOPYFLAGS += -R .empty_zero_page
-
-$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE
-	$(call if_changed,as_o_S)
diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64
deleted file mode 100644
index 7093255d345bd..0000000000000
--- a/arch/sh/boot/compressed/Makefile_64
+++ /dev/null
@@ -1,42 +0,0 @@
-#
-# arch/sh/boot/compressed/Makefile_64
-#
-# create a compressed vmlinux image from the original vmlinux
-#
-# Copyright (C) 2002 Stuart Menefy
-# Copyright (C) 2004 Paul Mundt
-#
-# This file is subject to the terms and conditions of the GNU General Public
-# License.  See the file "COPYING" in the main directory of this archive
-# for more details.
-#
-
-targets		:= vmlinux vmlinux.bin vmlinux.bin.gz \
-		   head_64.o misc_64.o cache.o piggy.o
-
-OBJECTS		:= $(obj)/head_64.o $(obj)/misc_64.o $(obj)/cache.o
-
-#
-# ZIMAGE_OFFSET is the load offset of the compression loader
-# (4M for the kernel plus 64K for this loader)
-#
-ZIMAGE_OFFSET	:= $(shell /bin/bash -c 'printf "0x%08x" \
-		     $$[$(CONFIG_PAGE_OFFSET)+0x400000+0x10000]')
-
-LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(ZIMAGE_OFFSET) -e startup \
-		   -T $(obj)/../../kernel/vmlinux.lds
-
-$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o FORCE
-	$(call if_changed,ld)
-	@:
-
-$(obj)/vmlinux.bin: vmlinux FORCE
-	$(call if_changed,objcopy)
-
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-	$(call if_changed,gzip)
-
-OBJCOPYFLAGS += -R .empty_zero_page
-
-$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE
-	$(call if_changed,as_o_S)
-- 
GitLab


From b412a49af970e703124fa3259bf24165c0c74024 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 01:23:25 +0900
Subject: [PATCH 0968/2198] sh: Consolidate the boot link and entry offset
 definitions.

Consolidate these in a single place in the Kconfig menus. At the same
time, disable their interactivity and set them according to the board
config defaults.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig    | 22 ++++++++++++++++++----
 arch/sh/mm/Kconfig |  8 --------
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index bca6a2a0759c7..a9dee13ddd8b3 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -694,23 +694,37 @@ endmenu
 menu "Boot options"
 
 config ZERO_PAGE_OFFSET
-	hex "Zero page offset"
-	default "0x00004000" if SH_SH03
-	default "0x00010000" if PAGE_SIZE_64KB
+	hex
+	default "0x00010000" if PAGE_SIZE_64KB || SH_RTS7751R2D || \
+				SH_7751_SOLUTION_ENGINE
+	default "0x00004000" if PAGE_SIZE_16KB || SH_SH03
 	default "0x00002000" if PAGE_SIZE_8KB
 	default "0x00001000"
 	help
 	  This sets the default offset of zero page.
 
 config BOOT_LINK_OFFSET
-	hex "Link address offset for booting"
+	hex
+	default "0x00210000" if SH_SHMIN
 	default "0x00400000" if SH_CAYMAN
+	default "0x00810000" if SH_7780_SOLUTION_ENGINE
+	default "0x009e0000" if SH_TITAN
+	default "0x01800000" if SH_SDK7780
+	default "0x02000000" if SH_EDOSK7760
 	default "0x00800000"
 	help
 	  This option allows you to set the link address offset of the zImage.
 	  This can be useful if you are on a board which has a small amount of
 	  memory.
 
+config ENTRY_OFFSET
+	hex
+	default "0x00001000" if PAGE_SIZE_4KB
+	default "0x00002000" if PAGE_SIZE_8KB
+	default "0x00004000" if PAGE_SIZE_16KB
+	default "0x00010000" if PAGE_SIZE_64KB
+	default "0x00000000"
+
 config UBC_WAKEUP
 	bool "Wakeup UBC on startup"
 	depends on CPU_SH4 && !CPU_SH4A
diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index d4079cab2d581..b900d2cd18f7f 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -201,14 +201,6 @@ config PAGE_SIZE_64KB
 
 endchoice
 
-config ENTRY_OFFSET
-	hex
-	default "0x00001000" if PAGE_SIZE_4KB
-	default "0x00002000" if PAGE_SIZE_8KB
-	default "0x00004000" if PAGE_SIZE_16KB
-	default "0x00010000" if PAGE_SIZE_64KB
-	default "0x00000000"
-
 choice
 	prompt "HugeTLB page size"
 	depends on HUGETLB_PAGE && (CPU_SH4 || CPU_SH5) && MMU
-- 
GitLab


From 457daa2b66e07bbd2280b9f8d2b03e800f357243 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 01:28:01 +0900
Subject: [PATCH 0969/2198] sh: Hook up cc-cross-prefix support.

This implements a simple case that just iterates through the common
cases, looking at UTS_MACHINE for hints.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Makefile | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index c1bbae1f65e37..a47fbd260ba4e 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -101,6 +101,12 @@ LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
 		   -e phys_stext_shmedia
 endif
 
+ifneq ($(SUBARCH),$(ARCH))
+  ifeq ($(CROSS_COMPILE),)
+    CROSS_COMPILE := $(call cc-cross-prefix, $(UTS_MACHINE)-linux-  $(UTS_MACHINE)-linux-gnu-  $(UTS_MACHINE)-unknown-linux-gnu-)
+  endif
+endif
+
 ifdef CONFIG_CPU_LITTLE_ENDIAN
 ld-bfd			:= elf32-$(UTS_MACHINE)-linux
 LDFLAGS_vmlinux		+= --defsym 'jiffies=jiffies_64' --oformat $(ld-bfd)
-- 
GitLab


From 567bb8fd47624cb9f894c64ce9530d43d5862a71 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Sun, 10 May 2009 14:25:39 +0900
Subject: [PATCH 0970/2198] sh: Fix up R0 dependence in __arch_swab16/32.

There is nothing in these routines that inherently depends on R0 use.
Given that these routines are inlined, it is rather easy to blow up the
compiler by exhausting the spill class when performing a 64-bit swab.

This presently manifests itself as the following:

CC      fs/ocfs2/suballoc.o
fs/ocfs2/suballoc.c: In function 'ocfs2_reserve_suballoc_bits':
fs/ocfs2/suballoc.c:638: error: unrecognizable insn:
(insn 2793 1230 1231 103 arch/sh/include/asm/swab.h:33 (set (reg:HI 853)
        (subreg:HI (reg:SI 149 macl) 2)) -1 (expr_list:REG_DEAD (reg:SI 149 macl)
        (nil)))
fs/ocfs2/suballoc.c:638: internal compiler error: in extract_insn, at recog.c:1991

This patch switches over to using an arbitrarily assigned register instead.

While the same issue does not exist in the SH-5 case, there is likewise no harm
in having an alternate register used for the byterev/shari pair.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/swab.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/sh/include/asm/swab.h b/arch/sh/include/asm/swab.h
index e69315935107d..0e08fe54ad71f 100644
--- a/arch/sh/include/asm/swab.h
+++ b/arch/sh/include/asm/swab.h
@@ -14,15 +14,15 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 x)
 {
 	__asm__(
 #ifdef __SH5__
-		"byterev	%0, %0\n\t"
+		"byterev	%1, %0\n\t"
 		"shari		%0, 32, %0"
 #else
-		"swap.b		%0, %0\n\t"
+		"swap.b		%1, %0\n\t"
 		"swap.w		%0, %0\n\t"
 		"swap.b		%0, %0"
 #endif
 		: "=r" (x)
-		: "0" (x));
+		: "r" (x));
 
 	return x;
 }
@@ -32,13 +32,13 @@ static inline __attribute_const__ __u16 __arch_swab16(__u16 x)
 {
 	__asm__(
 #ifdef __SH5__
-		"byterev	%0, %0\n\t"
+		"byterev	%1, %0\n\t"
 		"shari		%0, 32, %0"
 #else
-		"swap.b		%0, %0"
+		"swap.b		%1, %0"
 #endif
 		: "=r" (x)
-		:  "0" (x));
+		:  "r" (x));
 
 	return x;
 }
-- 
GitLab


From 82afae6016b672acb90ceb8e773bba0bd977d2a3 Mon Sep 17 00:00:00 2001
From: Erdem Aktas <eaktas1@gmail.com>
Date: Sun, 10 May 2009 02:13:19 -0400
Subject: [PATCH 0971/2198] perf_counter tools: fix buffer overwrite problem
 for perf top command

There is a buffer overwrite problem in builtin-top.c line 526, When I
tried to use ./perf top command, it was giving memory corruption
problem.

[ Impact: fix 'perf top' crash ]

LKML-Reference: <3fee128b0905092313x608e65e0l7b1116d86914114f@mail.gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index cd6f61d734181..b1549dd277200 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -523,7 +523,7 @@ static int read_symbol(FILE *in, struct sym_entry *s)
 	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
 		return 1;
 
-	s->sym = malloc(strlen(str));
+	s->sym = malloc(strlen(str)+1);
 	assert(s->sym);
 
 	strcpy((char *)s->sym, str);
-- 
GitLab


From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 8 May 2009 13:55:22 +0100
Subject: [PATCH 0972/2198] CRED: Rename cred_exec_mutex to reflect that it's a
 guard against ptrace

Rename cred_exec_mutex to reflect that it's a guard against foreign
intervention on a process's credential state, such as is made by ptrace().  The
attachment of a debugger to a process affects execve()'s calculation of the new
credential state - _and_ also setprocattr()'s calculation of that state.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/compat.c               |  6 +++---
 fs/exec.c                 | 10 +++++-----
 include/linux/init_task.h |  4 ++--
 include/linux/sched.h     |  4 +++-
 kernel/cred.c             |  4 ++--
 kernel/ptrace.c           |  9 +++++----
 6 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/fs/compat.c b/fs/compat.c
index 681ed81e6be03..bb2a9b2e81738 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&current->cred_guard_mutex);
 	if (retval < 0)
 		goto out_free;
 	current->in_execve = 1;
@@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1573,7 +1573,7 @@ int compat_do_execve(char * filename,
 
 out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/fs/exec.c b/fs/exec.c
index 639177b0eeac9..998e856c3079c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1045,7 +1045,7 @@ void install_exec_creds(struct linux_binprm *bprm)
 	commit_creds(bprm->cred);
 	bprm->cred = NULL;
 
-	/* cred_exec_mutex must be held at least to this point to prevent
+	/* cred_guard_mutex must be held at least to this point to prevent
 	 * ptrace_attach() from altering our determination of the task's
 	 * credentials; any time after this it may be unlocked */
 
@@ -1055,7 +1055,7 @@ EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_exec_mutex to protect against
+ * - the caller must hold current->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1297,7 +1297,7 @@ int do_execve(char * filename,
 	if (!bprm)
 		goto out_files;
 
-	retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&current->cred_guard_mutex);
 	if (retval < 0)
 		goto out_free;
 	current->in_execve = 1;
@@ -1360,7 +1360,7 @@ int do_execve(char * filename,
 	/* execve succeeded */
 	current->fs->in_exec = 0;
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 	acct_update_integrals(current);
 	free_bprm(bprm);
 	if (displaced)
@@ -1383,7 +1383,7 @@ int do_execve(char * filename,
 
 out_unlock:
 	current->in_execve = 0;
-	mutex_unlock(&current->cred_exec_mutex);
+	mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
 	free_bprm(bprm);
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d2641f5..7f54ba942429d 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -145,8 +145,8 @@ extern struct cred init_cred;
 	.group_leader	= &tsk,						\
 	.real_cred	= &init_cred,					\
 	.cred		= &init_cred,					\
-	.cred_exec_mutex =						\
-		 __MUTEX_INITIALIZER(tsk.cred_exec_mutex),		\
+	.cred_guard_mutex =						\
+		 __MUTEX_INITIALIZER(tsk.cred_guard_mutex),		\
 	.comm		= "swapper",					\
 	.thread		= INIT_THREAD,					\
 	.fs		= &init_fs,					\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3fa82b353c985..5932ace224001 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1247,7 +1247,9 @@ struct task_struct {
 					 * credentials (COW) */
 	const struct cred *cred;	/* effective (overridable) subjective task
 					 * credentials (COW) */
-	struct mutex cred_exec_mutex;	/* execve vs ptrace cred calculation mutex */
+	struct mutex cred_guard_mutex;	/* guard against foreign influences on
+					 * credential calculations
+					 * (notably. ptrace) */
 
 	char comm[TASK_COMM_LEN]; /* executable name excluding path
 				     - access with [gs]et_task_comm (which lock
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a039189d7074..1bb4d7e5d6169 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
 
 /*
  * Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_exec_mutex
+ * - The caller must hold current->cred_guard_mutex
  */
 struct cred *prepare_exec_creds(void)
 {
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
 	struct cred *new;
 	int ret;
 
-	mutex_init(&p->cred_exec_mutex);
+	mutex_init(&p->cred_guard_mutex);
 
 	if (
 #ifdef CONFIG_KEYS
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 0692ab5a0d672..27ac80298bfa7 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -185,10 +185,11 @@ int ptrace_attach(struct task_struct *task)
 	if (same_thread_group(task, current))
 		goto out;
 
-	/* Protect exec's credential calculations against our interference;
-	 * SUID, SGID and LSM creds get determined differently under ptrace.
+	/* Protect the target's credential calculations against our
+	 * interference; SUID, SGID and LSM creds get determined differently
+	 * under ptrace.
 	 */
-	retval = mutex_lock_interruptible(&task->cred_exec_mutex);
+	retval = mutex_lock_interruptible(&task->cred_guard_mutex);
 	if (retval  < 0)
 		goto out;
 
@@ -232,7 +233,7 @@ int ptrace_attach(struct task_struct *task)
 bad:
 	write_unlock_irqrestore(&tasklist_lock, flags);
 	task_unlock(task);
-	mutex_unlock(&task->cred_exec_mutex);
+	mutex_unlock(&task->cred_guard_mutex);
 out:
 	return retval;
 }
-- 
GitLab


From 107db7c7dd137aeb7361b8c2606ac936c0be58ff Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 8 May 2009 13:55:27 +0100
Subject: [PATCH 0973/2198] CRED: Guard the setprocattr security hook against
 ptrace

Guard the setprocattr security hook against ptrace by taking the target task's
cred_guard_mutex around it.  The problem is that setprocattr() may otherwise
note the lack of a debugger, and then perform an action on that basis whilst
letting a debugger attach between the two points.  Holding cred_guard_mutex
across the test and the action prevents ptrace_attach() from doing that.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/proc/base.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index fb45615943c2a..23342e188a666 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	if (copy_from_user(page, buf, count))
 		goto out_free;
 
+	/* Guard against adverse ptrace interaction */
+	length = mutex_lock_interruptible(&task->cred_guard_mutex);
+	if (length < 0)
+		goto out_free;
+
 	length = security_setprocattr(task,
 				      (char*)file->f_path.dentry->d_name.name,
 				      (void*)page, count);
+	mutex_unlock(&task->cred_guard_mutex);
 out_free:
 	free_page((unsigned long) page);
 out:
-- 
GitLab


From d9d674e50007362567edb5df65c05dd56a5964bf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 11 May 2009 12:12:38 +0900
Subject: [PATCH 0974/2198] sh: Fix up typo in arch/sh/kernel/vmlinux.lds.S

.init_ramfs ought to be .init.ramfs, fix it up.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/vmlinux.lds.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index 1a2c8db5cb2a8..d73723080fd13 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -127,7 +127,7 @@ SECTIONS
 
 #ifdef CONFIG_BLK_DEV_INITRD
 	. = ALIGN(PAGE_SIZE);
-	.init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) {
+	.init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
 		__initramfs_start = .;
 		*(.init.ramfs)
 		__initramfs_end = .;
-- 
GitLab


From 780f98ff1fa9cfcab177f6b5ab09b11321f1e5c8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 11 May 2009 12:15:14 +0900
Subject: [PATCH 0975/2198] sh: Account for INITIAL_JIFFIES when using jiffies
 clocksource.

In the case where we fall back on the generic jiffies clocksource,
INITIAL_JIFFIES needs to be accounted for so that printk times aren't
completely skewed.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/time.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index e0aa769481ff0..a77838f539f84 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -104,7 +104,7 @@ unsigned long long sched_clock(void)
 
 	/* jiffies based sched_clock if no clocksource is installed */
 	if (!clocksource_sh.rating)
-		return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
+		return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
 
 	cycles = clocksource_sh.read(&clocksource_sh);
 	return cyc2ns(&clocksource_sh, cycles);
-- 
GitLab


From 45fbe3ee01b8e463b28c2751b5dcc0cbdc142d90 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 6 May 2009 08:06:44 -0700
Subject: [PATCH 0976/2198] x86, e820, pci: reserve extra free space near end
 of RAM

The point is to take all RAM resources we have, and
_after_ we've added all the resources we've seen in
the E820 tree, we then _also_ try to add fake reserved
entries for any "round up to X" at the end of the RAM
resources.

[ Impact: improve PCI mem-resource allocation robustness, protect "stolen RAM" ]

Reported-by: Yannick Roehlly <yannick.roehlly@free.fr>
Acked-by: Jesse Barnes <jesse.barnes@intel.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: yannick.roehlly@free.fr
LKML-Reference: <4A01A784.2050407@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0062813029256..a2335d9de052c 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -1371,6 +1371,23 @@ void __init e820_reserve_resources(void)
 	}
 }
 
+/* How much should we pad RAM ending depending on where it is? */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+	unsigned long mb = pos >> 20;
+
+	/* To 64kB in the first megabyte */
+	if (!mb)
+		return 64*1024;
+
+	/* To 1MB in the first 16MB */
+	if (mb < 16)
+		return 1024*1024;
+
+	/* To 32MB for anything above that */
+	return 32*1024*1024;
+}
+
 void __init e820_reserve_resources_late(void)
 {
 	int i;
@@ -1382,6 +1399,24 @@ void __init e820_reserve_resources_late(void)
 			insert_resource_expand_to_fit(&iomem_resource, res);
 		res++;
 	}
+
+	/*
+	 * Try to bump up RAM regions to reasonable boundaries to
+	 * avoid stolen RAM:
+	 */
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *entry = &e820_saved.map[i];
+		resource_size_t start, end;
+
+		if (entry->type != E820_RAM)
+			continue;
+		start = entry->addr + entry->size;
+		end = round_up(start, ram_alignment(start));
+		if (start == end)
+			continue;
+		reserve_region_with_split(&iomem_resource, start,
+						  end - 1, "RAM buffer");
+	}
 }
 
 char *__init default_machine_specific_memory_setup(void)
-- 
GitLab


From 5d423ccd7ba4285f1084e91b26805e1d0ae978ed Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 08:07:52 -0700
Subject: [PATCH 0977/2198] x86/pci: remove rounding quirk from
 e820_setup_gap()

Now that the e820 code explicitly reserves 'potentially dangerous'
free physical memory address space to protect ACPI stolen RAM,
there's no need for the rounding quirk in the PCI allocator anymore.

Also, this quirk was open-ended iteration that could end up reserving
a lot of free space and potentially breaking drivers - such as the one
reported by Yannick Roehlly <yannick.roehlly@free.fr> where there's
a PCI device with a large memory resource.

So remove it.

[ Impact: make more of the PCI hole available for assigning pci devices ]

Reported-by: Yannick Roehlly <yannick.roehlly@free.fr>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Jesse Barnes <jesse.barnes@intel.com>
Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A01A7C8.5090701@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/e820.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index a2335d9de052c..7271fa33d7913 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
  */
 __init void e820_setup_gap(void)
 {
-	unsigned long gapstart, gapsize, round;
+	unsigned long gapstart, gapsize;
 	int found;
 
 	gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
 #endif
 
 	/*
-	 * See how much we want to round up: start off with
-	 * rounding to the next 1MB area.
+	 * e820_reserve_resources_late protect stolen RAM already
 	 */
-	round = 0x100000;
-	while ((gapsize >> 4) > round)
-		round += round;
-	/* Fun with two's complement */
-	pci_mem_start = (gapstart + round) & -round;
+	pci_mem_start = gapstart;
 
 	printk(KERN_INFO
 	       "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
-- 
GitLab


From 53d6979ab6747e758207e8ac861b96d0da0d3332 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:35 +0900
Subject: [PATCH 0978/2198] nbd: don't clear rq->sector and nr_sectors
 unnecessarily

There's no reason to clear rq->sector and nr_sectors after calling
blk_rq_init().  They're guaranteed to be clear.  Drop unnecessary
clearing.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/nbd.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 4d6de4f15ccb3..a9ab8be9d92f1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -580,13 +580,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		blk_rq_init(NULL, &sreq);
 		sreq.cmd_type = REQ_TYPE_SPECIAL;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
-		/*
-		 * Set these to sane values in case server implementation
-		 * fails to check the request type first and also to keep
-		 * debugging output cleaner.
-		 */
-		sreq.sector = 0;
-		sreq.nr_sectors = 0;
 		if (!lo->sock)
 			return -EINVAL;
 		nbd_send_req(lo, &sreq);
-- 
GitLab


From 9720aef2539c10e3a872e9a92beec225030d99db Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:36 +0900
Subject: [PATCH 0979/2198] ide-tape: don't initialize rq->sector for rw
 requests

rq->sector is set to the tape->first_frame but it's never actually
used and not even in the correct unit (512 byte sectors).  Don't set
it.

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Borislav Petkov <petkovbb@gmail.com>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-tape.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8dfc68892d6ae..7149224d1fe98 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -892,7 +892,6 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	rq->cmd_type = REQ_TYPE_SPECIAL;
 	rq->cmd[13] = cmd;
 	rq->rq_disk = tape->disk;
-	rq->sector = tape->first_frame;
 
 	if (size) {
 		ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
-- 
GitLab


From c3a4d78c580de4edc9ef0f7c59812fb02ceb037f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:37 +0900
Subject: [PATCH 0980/2198] block: add rq->resid_len

rq->data_len served two purposes - the length of data buffer on issue
and the residual count on completion.  This duality creates some
headaches.

First of all, block layer and low level drivers can't really determine
what rq->data_len contains while a request is executing.  It could be
the total request length or it coulde be anything else one of the
lower layers is using to keep track of residual count.  This
complicates things because blk_rq_bytes() and thus
[__]blk_end_request_all() relies on rq->data_len for PC commands.
Drivers which want to report residual count should first cache the
total request length, update rq->data_len and then complete the
request with the cached data length.

Secondly, it makes requests default to reporting full residual count,
ie. reporting that no data transfer occurred.  The residual count is
an exception not the norm; however, the driver should clear
rq->data_len to zero to signify the normal cases while leaving it
alone means no data transfer occurred at all.  This reverse default
behavior complicates code unnecessarily and renders block PC on some
drivers (ide-tape/floppy) unuseable.

This patch adds rq->resid_len which is used only for residual count.

While at it, remove now unnecessasry blk_rq_bytes() caching in
ide_pc_intr() as rq->data_len is not changed anymore.

Boaz	: spotted missing conversion in osd
Sergei	: spotted too early conversion to blk_rq_bytes() in ide-tape

[ Impact: cleanup residual count handling, report 0 resid by default ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Doug Gilbert <dgilbert@interlog.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Darrick J. Wong <djwong@us.ibm.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/bsg.c                              |  8 ++---
 block/scsi_ioctl.c                       |  2 +-
 drivers/block/cciss.c                    | 13 +++-----
 drivers/block/ub.c                       |  6 ++--
 drivers/ide/ide-atapi.c                  |  9 +-----
 drivers/ide/ide-cd.c                     | 13 ++++----
 drivers/ide/ide-tape.c                   |  4 +--
 drivers/message/fusion/mptsas.c          |  3 +-
 drivers/scsi/libsas/sas_expander.c       |  6 +---
 drivers/scsi/libsas/sas_host_smp.c       | 38 +++++++++++++-----------
 drivers/scsi/mpt2sas/mpt2sas_transport.c |  4 +--
 drivers/scsi/scsi_lib.c                  | 24 +++++++--------
 drivers/scsi/sg.c                        |  2 +-
 drivers/scsi/st.c                        |  2 +-
 fs/exofs/osd.c                           |  4 +--
 include/linux/blkdev.h                   |  1 +
 16 files changed, 59 insertions(+), 80 deletions(-)

diff --git a/block/bsg.c b/block/bsg.c
index 206060e795da4..2d746e34f4c24 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -445,14 +445,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr,
 	}
 
 	if (rq->next_rq) {
-		hdr->dout_resid = rq->data_len;
-		hdr->din_resid = rq->next_rq->data_len;
+		hdr->dout_resid = rq->resid_len;
+		hdr->din_resid = rq->next_rq->resid_len;
 		blk_rq_unmap_user(bidi_bio);
 		blk_put_request(rq->next_rq);
 	} else if (rq_data_dir(rq) == READ)
-		hdr->din_resid = rq->data_len;
+		hdr->din_resid = rq->resid_len;
 	else
-		hdr->dout_resid = rq->data_len;
+		hdr->dout_resid = rq->resid_len;
 
 	/*
 	 * If the request generated a negative error number, return it
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index 58cf4560f742f..a9670dd4b5de0 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -230,7 +230,7 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
 	hdr->info = 0;
 	if (hdr->masked_status || hdr->host_status || hdr->driver_status)
 		hdr->info |= SG_INFO_CHECK;
-	hdr->resid = rq->data_len;
+	hdr->resid = rq->resid_len;
 	hdr->sb_len_wr = 0;
 
 	if (rq->sense_len && hdr->sbp) {
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 4d4d5e0d3fa64..f22d4932433fe 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1299,7 +1299,6 @@ static void cciss_softirq_done(struct request *rq)
 {
 	CommandList_struct *cmd = rq->completion_data;
 	ctlr_info_t *h = hba[cmd->ctlr];
-	unsigned int nr_bytes;
 	unsigned long flags;
 	u64bit temp64;
 	int i, ddir;
@@ -1321,15 +1320,11 @@ static void cciss_softirq_done(struct request *rq)
 	printk("Done with %p\n", rq);
 #endif				/* CCISS_DEBUG */
 
-	/*
-	 * Store the full size and set the residual count for pc requests
-	 */
-	nr_bytes = blk_rq_bytes(rq);
+	/* set the residual count for pc requests */
 	if (blk_pc_request(rq))
-		rq->data_len = cmd->err_info->ResidualCnt;
+		rq->resid_len = cmd->err_info->ResidualCnt;
 
-	if (blk_end_request(rq, (rq->errors == 0) ? 0 : -EIO, nr_bytes))
-		BUG();
+	blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO);
 
 	spin_lock_irqsave(&h->lock, flags);
 	cmd_free(h, cmd, 1);
@@ -2691,7 +2686,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd,
 			printk(KERN_WARNING "cciss: cmd %p has"
 			       " completed with data underrun "
 			       "reported\n", cmd);
-			cmd->rq->data_len = cmd->err_info->ResidualCnt;
+			cmd->rq->resid_len = cmd->err_info->ResidualCnt;
 		}
 		break;
 	case CMD_DATA_OVERRUN:
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 689cd27ac890a..8c2cc71327e3f 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -783,10 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	if (cmd->error == 0) {
 		if (blk_pc_request(rq)) {
-			if (cmd->act_len >= rq->data_len)
-				rq->data_len = 0;
-			else
-				rq->data_len -= cmd->act_len;
+			if (cmd->act_len < rq->data_len)
+				rq->resid_len = rq->data_len - cmd->act_len;
 			scsi_status = 0;
 		} else {
 			if (cmd->act_len != cmd->len) {
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index afe5a43238793..e4a02a05fc819 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -367,7 +367,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 	/* No more interrupts */
 	if ((stat & ATA_DRQ) == 0) {
 		int uptodate, error;
-		unsigned int done;
 
 		debug_log("Packet command completed, %d bytes transferred\n",
 			  pc->xferred);
@@ -406,12 +405,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 		if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
 			dsc = 1;
 
-		/*
-		 * ->pc_callback() might change rq->data_len for
-		 * residual count, cache total length.
-		 */
-		done = blk_rq_bytes(rq);
-
 		/* Command finished - Call the callback function */
 		uptodate = drive->pc_callback(drive, dsc);
 
@@ -431,7 +424,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 			error = uptodate ? 0 : -EIO;
 		}
 
-		ide_complete_rq(drive, error, done);
+		ide_complete_rq(drive, error, blk_rq_bytes(rq));
 		return ide_stopped;
 	}
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 673628790f108..8bbe222c5e423 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -519,7 +519,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
 		error = blk_execute_rq(drive->queue, info->disk, rq, 0);
 
 		if (buffer)
-			*bufflen = rq->data_len;
+			*bufflen = rq->resid_len;
 
 		flags = rq->cmd_flags;
 		blk_put_request(rq);
@@ -707,11 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (blk_pc_request(rq) && rc == 0) {
-		unsigned int dlen = rq->data_len;
-
-		rq->data_len = 0;
-
-		if (blk_end_request(rq, 0, dlen))
+		if (blk_end_request(rq, 0, rq->data_len))
 			BUG();
 
 		hwif->rq = NULL;
@@ -740,9 +736,10 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			nsectors = 1;
 
 		if (blk_fs_request(rq) == 0) {
-			rq->data_len -= (cmd->nbytes - cmd->nleft);
+			rq->resid_len = rq->data_len -
+				(cmd->nbytes - cmd->nleft);
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
-				rq->data_len += cmd->last_xfer_len;
+				rq->resid_len += cmd->last_xfer_len;
 		}
 
 		ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 7149224d1fe98..65c5b705883ab 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->data_len -= blocks * tape->blk_size;
+		rq->resid_len = rq->data_len - blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
@@ -903,7 +903,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 	blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
 	/* calculate the number of transferred bytes and update buffer state */
-	size -= rq->data_len;
+	size -= rq->resid_len;
 	tape->cur = tape->buf;
 	if (cmd == REQ_IDETAPE_READ)
 		tape->valid = size;
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index a9019f081b971..5d5f34715de45 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1357,8 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
 		memcpy(req->sense, smprep, sizeof(*smprep));
 		req->sense_len = sizeof(*smprep);
-		req->data_len = 0;
-		rsp->data_len -= smprep->ResponseDataLength;
+		rsp->resid_len = rsp->data_len - smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n",
 		    ioc->name, __func__);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 3da02e4367884..6605ec905cc03 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1936,12 +1936,8 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 			       bio_data(rsp->bio), rsp->data_len);
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
-		rsp->data_len = ret;
-		req->data_len = 0;
+		rsp->resid_len = ret;
 		ret = 0;
-	} else if (ret == 0) {
-		rsp->data_len = 0;
-		req->data_len = 0;
 	}
 
 	return ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index d110a366c48a1..89952edd0be37 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -134,7 +134,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 {
 	u8 *req_data = NULL, *resp_data = NULL, *buf;
 	struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost);
-	int error = -EINVAL, resp_data_len = rsp->data_len;
+	int error = -EINVAL;
 
 	/* eight is the minimum size for request and response frames */
 	if (req->data_len < 8 || rsp->data_len < 8)
@@ -176,17 +176,20 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	resp_data[1] = req_data[1];
 	resp_data[2] = SMP_RESP_FUNC_UNK;
 
+	req->resid_len = req->data_len;
+	rsp->resid_len = rsp->data_len;
+
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
-		req->data_len -= 8;
-		resp_data_len -= 32;
+		req->resid_len -= 8;
+		rsp->resid_len -= 32;
 		resp_data[2] = SMP_RESP_FUNC_ACC;
 		resp_data[9] = sas_ha->num_phys;
 		break;
 
 	case SMP_REPORT_MANUF_INFO:
-		req->data_len -= 8;
-		resp_data_len -= 64;
+		req->resid_len -= 8;
+		rsp->resid_len -= 64;
 		resp_data[2] = SMP_RESP_FUNC_ACC;
 		memcpy(resp_data + 12, shost->hostt->name,
 		       SAS_EXPANDER_VENDOR_ID_LEN);
@@ -199,13 +202,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_DISCOVER:
-		req->data_len -= 16;
-		if ((int)req->data_len < 0) {
-			req->data_len = 0;
+		req->resid_len -= 16;
+		if ((int)req->resid_len < 0) {
+			req->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		resp_data_len -= 56;
+		rsp->resid_len -= 56;
 		sas_host_smp_discover(sas_ha, resp_data, req_data[9]);
 		break;
 
@@ -215,13 +218,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_REPORT_PHY_SATA:
-		req->data_len -= 16;
-		if ((int)req->data_len < 0) {
-			req->data_len = 0;
+		req->resid_len -= 16;
+		if ((int)req->resid_len < 0) {
+			req->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		resp_data_len -= 60;
+		rsp->resid_len -= 60;
 		sas_report_phy_sata(sas_ha, resp_data, req_data[9]);
 		break;
 
@@ -238,13 +241,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 		break;
 
 	case SMP_PHY_CONTROL:
-		req->data_len -= 44;
-		if ((int)req->data_len < 0) {
-			req->data_len = 0;
+		req->resid_len -= 44;
+		if ((int)req->resid_len < 0) {
+			req->resid_len = 0;
 			error = -EINVAL;
 			goto out;
 		}
-		resp_data_len -= 8;
+		rsp->resid_len -= 8;
 		sas_phy_control(sas_ha, req_data[9], req_data[10],
 				req_data[32] >> 4, req_data[33] >> 4,
 				resp_data);
@@ -265,7 +268,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	flush_kernel_dcache_page(bio_page(rsp->bio));
 	kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0);
 	local_irq_enable();
-	rsp->data_len = resp_data_len;
 
  out:
 	kfree(req_data);
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index e03dc0b1e1a0f..53759c566bfed 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1170,9 +1170,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
 		req->sense_len = sizeof(*mpi_reply);
-		req->data_len = 0;
-		rsp->data_len -= mpi_reply->ResponseDataLength;
-
+		rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength;
 	} else {
 		dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT
 		    "%s - no reply\n", ioc->name, __func__));
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index aa9fc572e45fa..7d49ef589f336 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -240,11 +240,11 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd,
 	 * is invalid.  Prevent the garbage from being misinterpreted
 	 * and prevent security leaks by zeroing out the excess data.
 	 */
-	if (unlikely(req->data_len > 0 && req->data_len <= bufflen))
-		memset(buffer + (bufflen - req->data_len), 0, req->data_len);
+	if (unlikely(req->resid_len > 0 && req->resid_len <= bufflen))
+		memset(buffer + (bufflen - req->resid_len), 0, req->resid_len);
 
 	if (resid)
-		*resid = req->data_len;
+		*resid = req->resid_len;
 	ret = req->errors;
  out:
 	blk_put_request(req);
@@ -549,7 +549,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 		int leftover = (req->hard_nr_sectors << 9);
 
 		if (blk_pc_request(req))
-			leftover = req->data_len;
+			leftover = req->resid_len;
 
 		/* kill remainder if no retrys */
 		if (error && scsi_noretry_cmd(cmd))
@@ -673,11 +673,11 @@ void scsi_release_buffers(struct scsi_cmnd *cmd)
 EXPORT_SYMBOL(scsi_release_buffers);
 
 /*
- * Bidi commands Must be complete as a whole, both sides at once.
- * If part of the bytes were written and lld returned
- * scsi_in()->resid and/or scsi_out()->resid this information will be left
- * in req->data_len and req->next_rq->data_len. The upper-layer driver can
- * decide what to do with this information.
+ * Bidi commands Must be complete as a whole, both sides at once.  If
+ * part of the bytes were written and lld returned scsi_in()->resid
+ * and/or scsi_out()->resid this information will be left in
+ * req->resid_len and req->next_rq->resid_len. The upper-layer driver
+ * can decide what to do with this information.
  */
 static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 {
@@ -685,8 +685,8 @@ static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 	unsigned int dlen = req->data_len;
 	unsigned int next_dlen = req->next_rq->data_len;
 
-	req->data_len = scsi_out(cmd)->resid;
-	req->next_rq->data_len = scsi_in(cmd)->resid;
+	req->resid_len = scsi_out(cmd)->resid;
+	req->next_rq->resid_len = scsi_in(cmd)->resid;
 
 	/* The req and req->next_rq have not been completed */
 	BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
@@ -778,7 +778,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			scsi_end_bidi_request(cmd);
 			return;
 		}
-		req->data_len = scsi_get_resid(cmd);
+		req->resid_len = scsi_get_resid(cmd);
 	}
 
 	BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 82312df9b0bfb..dec4c70677dee 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1260,7 +1260,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 
 	sense = rq->sense;
 	result = rq->errors;
-	resid = rq->data_len;
+	resid = rq->resid_len;
 
 	SCSI_LOG_TIMEOUT(4, printk("sg_cmd_done: %s, pack_id=%d, res=0x%x\n",
 		sdp->disk->disk_name, srp->header.pack_id, result));
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index eb24efea8f145..8681b708344f0 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -463,7 +463,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate)
 	struct scsi_tape *STp = SRpnt->stp;
 
 	STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors;
-	STp->buffer->cmdstat.residual = req->data_len;
+	STp->buffer->cmdstat.residual = req->resid_len;
 
 	if (SRpnt->waiting)
 		complete(SRpnt->waiting);
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
index b249ae97fb15b..06ca92672eb5d 100644
--- a/fs/exofs/osd.c
+++ b/fs/exofs/osd.c
@@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
 
 	/* FIXME: should be include in osd_sense_info */
 	if (in_resid)
-		*in_resid = or->in.req ? or->in.req->data_len : 0;
+		*in_resid = or->in.req ? or->in.req->resid_len : 0;
 
 	if (out_resid)
-		*out_resid = or->out.req ? or->out.req->data_len : 0;
+		*out_resid = or->out.req ? or->out.req->resid_len : 0;
 
 	return ret;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 3a5b1bd6582c3..6a967cad89fa6 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -229,6 +229,7 @@ struct request {
 	unsigned int data_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
+	unsigned int resid_len;	/* residual count */
 	void *sense;
 
 	unsigned long deadline;
-- 
GitLab


From 5b93629b4509c03ffa87a9316412fedf6f58cb37 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:38 +0900
Subject: [PATCH 0981/2198] block: implement blk_rq_pos/[cur_]sectors() and
 convert obvious ones

Implement accessors - blk_rq_pos(), blk_rq_sectors() and
blk_rq_cur_sectors() which return rq->hard_sector, rq->hard_nr_sectors
and rq->hard_cur_sectors respectively and convert direct references of
the said fields to the accessors.

This is in preparation of request data length handling cleanup.

Geert	: suggested adding const to struct request * parameter to accessors
Sergei	: spotted error in patch description

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Acked-by: Stephen Rothwell <sfr@canb.auug.org.au>
Tested-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Ackec-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c             |  2 +-
 block/blk-core.c                |  2 +-
 block/cfq-iosched.c             |  2 +-
 drivers/block/ps3disk.c         |  2 +-
 drivers/block/viodasd.c         |  6 +++---
 drivers/block/xsysace.c         | 10 +++++-----
 drivers/ide/ide-cd.c            |  8 ++++----
 drivers/ide/ide-io.c            |  4 ++--
 drivers/message/i2o/i2o_block.c |  2 +-
 drivers/scsi/scsi_lib.c         |  2 +-
 include/linux/blkdev.h          | 23 ++++++++++++++++++++---
 kernel/trace/blktrace.c         |  4 ++--
 12 files changed, 42 insertions(+), 25 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c8d087655eff8..c167de5b9eff4 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -163,7 +163,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	 * For an empty barrier, there's no actual BAR request, which
 	 * in turn makes POSTFLUSH unnecessary.  Mask them off.
 	 */
-	if (!rq->hard_nr_sectors) {
+	if (!blk_rq_sectors(rq)) {
 		q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
 				QUEUE_ORDERED_DO_POSTFLUSH);
 		/*
diff --git a/block/blk-core.c b/block/blk-core.c
index 394c5bd812718..895e55b74a40b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1683,7 +1683,7 @@ static void blk_account_io_done(struct request *req)
 unsigned int blk_rq_bytes(struct request *rq)
 {
 	if (blk_fs_request(rq))
-		return rq->hard_nr_sectors << 9;
+		return blk_rq_sectors(rq) << 9;
 
 	return rq->data_len;
 }
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index def0c698a4bc7..575083a9ffe4a 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -760,7 +760,7 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq)
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 						cfqd->rq_in_driver);
 
-	cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors;
+	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 }
 
 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index f6586e4d351c0..c2388673684eb 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -136,7 +136,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core,
 		"%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n",
 		__func__, __LINE__, op, n, req->nr_sectors,
-		req->hard_nr_sectors);
+		blk_rq_sectors(req));
 #endif
 
 	start_sector = req->sector * priv->blocking_factor;
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index ecccf65dce2f0..e821eed7132f4 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -368,12 +368,12 @@ static void do_viodasd_request(struct request_queue *q)
 		blkdev_dequeue_request(req);
 		/* check that request contains a valid command */
 		if (!blk_fs_request(req)) {
-			viodasd_end_request(req, -EIO, req->hard_nr_sectors);
+			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
 			continue;
 		}
 		/* Try sending the request */
 		if (send_request(req) != 0)
-			viodasd_end_request(req, -EIO, req->hard_nr_sectors);
+			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
 	}
 }
 
@@ -590,7 +590,7 @@ static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
 		err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
 		printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n",
 				event->xRc, bevent->sub_result, err->msg);
-		num_sect = req->hard_nr_sectors;
+		num_sect = blk_rq_sectors(req);
 	}
 	qlock = req->q->queue_lock;
 	spin_lock_irqsave(qlock, irq_flags);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index b1e1d7e5ab1e8..5722931d14c50 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -645,8 +645,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
 
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
-			"request: sec=%llx hcnt=%lx, ccnt=%x, dir=%i\n",
-			(unsigned long long) req->sector, req->hard_nr_sectors,
+			"request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n",
+			(unsigned long long) req->sector, blk_rq_sectors(req),
 			req->current_nr_sectors, rq_data_dir(req));
 
 		ace->req = req;
@@ -654,7 +654,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR;
 		ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF);
 
-		count = req->hard_nr_sectors;
+		count = blk_rq_sectors(req);
 		if (rq_data_dir(req)) {
 			/* Kick off write request */
 			dev_dbg(ace->dev, "write data\n");
@@ -719,8 +719,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		/* bio finished; is there another one? */
 		if (__blk_end_request(ace->req, 0,
 					blk_rq_cur_bytes(ace->req))) {
-			/* dev_dbg(ace->dev, "next block; h=%li c=%i\n",
-			 *      ace->req->hard_nr_sectors,
+			/* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
+			 *      blk_rq_sectors(ace->req),
 			 *      ace->req->current_nr_sectors);
 			 */
 			ace->data_ptr = ace->req->buffer;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 8bbe222c5e423..182320dd6ea14 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -730,7 +730,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 		if (blk_pc_request(rq))
 			nsectors = (rq->data_len + 511) >> 9;
 		else
-			nsectors = rq->hard_nr_sectors;
+			nsectors = blk_rq_sectors(rq);
 
 		if (nsectors == 0)
 			nsectors = 1;
@@ -875,7 +875,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	return ide_issue_pc(drive, &cmd);
 out_end:
-	nsectors = rq->hard_nr_sectors;
+	nsectors = blk_rq_sectors(rq);
 
 	if (nsectors == 0)
 		nsectors = 1;
@@ -1359,8 +1359,8 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 {
 	int hard_sect = queue_hardsect_size(q);
-	long block = (long)rq->hard_sector / (hard_sect >> 9);
-	unsigned long blocks = rq->hard_nr_sectors / (hard_sect >> 9);
+	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
+	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
 
 	memset(rq->cmd, 0, BLK_MAX_CDB);
 
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index a0309ea661ac0..df23bcbd94b4a 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -118,7 +118,7 @@ unsigned int ide_rq_bytes(struct request *rq)
 	if (blk_pc_request(rq))
 		return rq->data_len;
 	else
-		return rq->hard_cur_sectors << 9;
+		return blk_rq_cur_sectors(rq) << 9;
 }
 EXPORT_SYMBOL_GPL(ide_rq_bytes);
 
@@ -133,7 +133,7 @@ int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes)
 	 * and complete the whole request right now
 	 */
 	if (blk_noretry_request(rq) && error <= 0)
-		nr_bytes = rq->hard_nr_sectors << 9;
+		nr_bytes = blk_rq_sectors(rq) << 9;
 
 	rc = ide_end_rq(drive, rq, error, nr_bytes);
 	if (rc == 0)
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 221317e6a006d..56e60f0d53123 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -427,7 +427,7 @@ static void i2o_block_end_request(struct request *req, int error,
 	unsigned long flags;
 
 	if (blk_end_request(req, error, nr_bytes)) {
-		int leftover = (req->hard_nr_sectors << KERNEL_SECTOR_SHIFT);
+		int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
 
 		if (blk_pc_request(req))
 			leftover = req->data_len;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 7d49ef589f336..9ff0ca9988a93 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 	 * to queue the remainder of them.
 	 */
 	if (blk_end_request(req, error, bytes)) {
-		int leftover = (req->hard_nr_sectors << 9);
+		int leftover = blk_rq_sectors(req) << 9;
 
 		if (blk_pc_request(req))
 			leftover = req->resid_len;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6a967cad89fa6..4e5f855987280 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -832,13 +832,30 @@ static inline void blk_run_address_space(struct address_space *mapping)
 extern void blkdev_dequeue_request(struct request *req);
 
 /*
- * blk_end_request() takes bytes instead of sectors as a complete size.
- * blk_rq_bytes() returns bytes left to complete in the entire request.
- * blk_rq_cur_bytes() returns bytes left to complete in the current segment.
+ * blk_rq_pos()		: the current sector
+ * blk_rq_bytes()	: bytes left in the entire request
+ * blk_rq_cur_bytes()	: bytes left in the current segment
+ * blk_rq_sectors()	: sectors left in the entire request
+ * blk_rq_cur_sectors()	: sectors left in the current segment
  */
+static inline sector_t blk_rq_pos(const struct request *rq)
+{
+	return rq->hard_sector;
+}
+
 extern unsigned int blk_rq_bytes(struct request *rq);
 extern unsigned int blk_rq_cur_bytes(struct request *rq);
 
+static inline unsigned int blk_rq_sectors(const struct request *rq)
+{
+	return rq->hard_nr_sectors;
+}
+
+static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
+{
+	return rq->hard_cur_sectors;
+}
+
 /*
  * Request completion related functions.
  *
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 921ef5d1f0ba9..42f1c11e754cd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -646,7 +646,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 				rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
 				rw, what, rq->errors, 0, NULL);
 	}
 }
@@ -857,7 +857,7 @@ void blk_add_driver_data(struct request_queue *q,
 		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
 				rq->errors, len, data);
 	else
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
 				0, BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
-- 
GitLab


From 83096ebf1263b2c1ee5e653ba37d993d02e3eb7b Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:39 +0900
Subject: [PATCH 0982/2198] block: convert to pos and nr_sectors accessors

With recent cleanups, there is no place where low level driver
directly manipulates request fields.  This means that the 'hard'
request fields always equal the !hard fields.  Convert all
rq->sectors, nr_sectors and current_nr_sectors references to
accessors.

While at it, drop superflous blk_rq_pos() < 0 test in swim.c.

[ Impact: use pos and nr_sectors accessors ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Tested-by: Grant Likely <grant.likely@secretlab.ca>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Tested-by: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Acked-by: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Acked-by: Mike Miller <mike.miller@hp.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Dario Ballabio <ballabio_dario@emc.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: unsik Kim <donari75@gmail.com>
Cc: Laurent Vivier <Laurent@lvivier.info>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/um/drivers/ubd_kern.c          |  2 +-
 block/as-iosched.c                  | 18 +++++-----
 block/blk-barrier.c                 |  2 +-
 block/blk-core.c                    | 17 +++++----
 block/blk-merge.c                   | 10 +++---
 block/cfq-iosched.c                 | 18 +++++-----
 block/deadline-iosched.c            |  2 +-
 block/elevator.c                    | 22 ++++++------
 drivers/block/DAC960.c              |  6 ++--
 drivers/block/amiflop.c             |  6 ++--
 drivers/block/ataflop.c             | 10 +++---
 drivers/block/cciss.c               | 22 ++++++------
 drivers/block/cpqarray.c            |  9 ++---
 drivers/block/floppy.c              | 53 +++++++++++++++--------------
 drivers/block/hd.c                  | 14 ++++----
 drivers/block/mg_disk.c             | 25 +++++++-------
 drivers/block/nbd.c                 | 12 +++----
 drivers/block/paride/pcd.c          |  4 +--
 drivers/block/paride/pd.c           |  8 ++---
 drivers/block/paride/pf.c           |  8 ++---
 drivers/block/ps3disk.c             |  9 +++--
 drivers/block/sunvdc.c              |  2 +-
 drivers/block/swim.c                |  6 ++--
 drivers/block/swim3.c               | 34 +++++++++---------
 drivers/block/sx8.c                 |  6 ++--
 drivers/block/ub.c                  |  6 ++--
 drivers/block/viodasd.c             |  2 +-
 drivers/block/virtio_blk.c          |  2 +-
 drivers/block/xd.c                  |  4 +--
 drivers/block/xen-blkfront.c        | 11 +++---
 drivers/block/xsysace.c             | 17 ++++-----
 drivers/block/z2ram.c               |  6 ++--
 drivers/cdrom/gdrom.c               |  6 ++--
 drivers/cdrom/viocd.c               |  2 +-
 drivers/memstick/core/mspro_block.c |  6 ++--
 drivers/message/i2o/i2o_block.c     | 20 ++++++-----
 drivers/mmc/card/block.c            | 10 +++---
 drivers/mtd/mtd_blkdevs.c           |  7 ++--
 drivers/s390/block/dasd.c           |  2 +-
 drivers/s390/block/dasd_diag.c      |  5 +--
 drivers/s390/block/dasd_eckd.c      |  6 ++--
 drivers/s390/block/dasd_fba.c       |  7 ++--
 drivers/s390/char/tape_34xx.c       |  2 +-
 drivers/s390/char/tape_3590.c       |  2 +-
 drivers/s390/char/tape_block.c      |  2 +-
 drivers/sbus/char/jsflash.c         |  4 +--
 drivers/scsi/eata.c                 | 24 ++++++-------
 drivers/scsi/lpfc/lpfc_scsi.c       | 22 ++++++------
 drivers/scsi/scsi_lib.c             |  6 ++--
 drivers/scsi/sd.c                   | 24 ++++++-------
 drivers/scsi/sd_dif.c               |  2 +-
 drivers/scsi/sr.c                   | 15 ++++----
 drivers/scsi/u14-34f.c              | 22 ++++++------
 include/scsi/scsi_cmnd.h            |  2 +-
 54 files changed, 292 insertions(+), 279 deletions(-)

diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 433012764a376..402ba8f70fc90 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1239,7 +1239,7 @@ static void do_ubd_request(struct request_queue *q)
 		}
 
 		req = dev->request;
-		sector = req->sector;
+		sector = blk_rq_pos(req);
 		while(dev->start_sg < dev->end_sg){
 			struct scatterlist *sg = &dev->sg[dev->start_sg];
 
diff --git a/block/as-iosched.c b/block/as-iosched.c
index 45bd07059c285..7a12cf6ee1d35 100644
--- a/block/as-iosched.c
+++ b/block/as-iosched.c
@@ -306,8 +306,8 @@ as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2)
 	data_dir = rq_is_sync(rq1);
 
 	last = ad->last_sector[data_dir];
-	s1 = rq1->sector;
-	s2 = rq2->sector;
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
 
 	BUG_ON(data_dir != rq_is_sync(rq2));
 
@@ -566,13 +566,15 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic,
 			as_update_thinktime(ad, aic, thinktime);
 
 			/* Calculate read -> read seek distance */
-			if (aic->last_request_pos < rq->sector)
-				seek_dist = rq->sector - aic->last_request_pos;
+			if (aic->last_request_pos < blk_rq_pos(rq))
+				seek_dist = blk_rq_pos(rq) -
+					    aic->last_request_pos;
 			else
-				seek_dist = aic->last_request_pos - rq->sector;
+				seek_dist = aic->last_request_pos -
+					    blk_rq_pos(rq);
 			as_update_seekdist(ad, aic, seek_dist);
 		}
-		aic->last_request_pos = rq->sector + rq->nr_sectors;
+		aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 		set_bit(AS_TASK_IOSTARTED, &aic->state);
 		spin_unlock(&aic->lock);
 	}
@@ -587,7 +589,7 @@ static int as_close_req(struct as_data *ad, struct as_io_context *aic,
 {
 	unsigned long delay;	/* jiffies */
 	sector_t last = ad->last_sector[ad->batch_data_dir];
-	sector_t next = rq->sector;
+	sector_t next = blk_rq_pos(rq);
 	sector_t delta; /* acceptable close offset (in sectors) */
 	sector_t s;
 
@@ -981,7 +983,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq)
 	 * This has to be set in order to be correctly updated by
 	 * as_find_next_rq
 	 */
-	ad->last_sector[data_dir] = rq->sector + rq->nr_sectors;
+	ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (data_dir == BLK_RW_SYNC) {
 		struct io_context *ioc = RQ_IOC(rq);
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index c167de5b9eff4..8713c2fbc4f64 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -324,7 +324,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
 	/*
 	 * The driver must store the error location in ->bi_sector, if
 	 * it supports it. For non-stacked drivers, this should be copied
-	 * from rq->sector.
+	 * from blk_rq_pos(rq).
 	 */
 	if (error_sector)
 		*error_sector = bio->bi_sector;
diff --git a/block/blk-core.c b/block/blk-core.c
index 895e55b74a40b..82dc20621c068 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -72,7 +72,7 @@ static void drive_stat_acct(struct request *rq, int new_io)
 		return;
 
 	cpu = part_stat_lock();
-	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);
+	part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
 
 	if (!new_io)
 		part_stat_inc(cpu, part, merges[rw]);
@@ -185,10 +185,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
 		rq->cmd_flags);
 
-	printk(KERN_INFO "  sector %llu, nr/cnr %lu/%u\n",
-						(unsigned long long)rq->sector,
-						rq->nr_sectors,
-						rq->current_nr_sectors);
+	printk(KERN_INFO "  sector %llu, nr/cnr %u/%u\n",
+	       (unsigned long long)blk_rq_pos(rq),
+	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
 						rq->bio, rq->biotail,
 						rq->buffer, rq->data_len);
@@ -1557,7 +1556,7 @@ EXPORT_SYMBOL(submit_bio);
  */
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
-	if (rq->nr_sectors > q->max_sectors ||
+	if (blk_rq_sectors(rq) > q->max_sectors ||
 	    rq->data_len > q->max_hw_sectors << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
@@ -1645,7 +1644,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 		part_stat_add(cpu, part, sectors[rw], bytes >> 9);
 		part_stat_unlock();
 	}
@@ -1665,7 +1664,7 @@ static void blk_account_io_done(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
@@ -1846,7 +1845,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
 		printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
 				req->rq_disk ? req->rq_disk->disk_name : "?",
-				(unsigned long long)req->sector);
+				(unsigned long long)blk_rq_pos(req));
 	}
 
 	blk_account_io_completion(req, nr_bytes);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 23d2a6fe34a38..bf62a87a9da22 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -259,7 +259,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 	else
 		max_sectors = q->max_sectors;
 
-	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -284,7 +284,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 		max_sectors = q->max_sectors;
 
 
-	if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
+	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -315,7 +315,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	/*
 	 * Will it become too large?
 	 */
-	if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
+	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors)
 		return 0;
 
 	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -345,7 +345,7 @@ static void blk_account_io_merge(struct request *req)
 		int cpu;
 
 		cpu = part_stat_lock();
-		part = disk_map_sector_rcu(req->rq_disk, req->sector);
+		part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
 
 		part_round_stats(cpu, part);
 		part_dec_in_flight(part);
@@ -366,7 +366,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	/*
 	 * not contiguous
 	 */
-	if (req->sector + req->nr_sectors != next->sector)
+	if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
 		return 0;
 
 	if (rq_data_dir(req) != rq_data_dir(next)
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 575083a9ffe4a..db4d990a66bfd 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -349,8 +349,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2)
 	else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
 		return rq2;
 
-	s1 = rq1->sector;
-	s2 = rq2->sector;
+	s1 = blk_rq_pos(rq1);
+	s2 = blk_rq_pos(rq2);
 
 	last = cfqd->last_position;
 
@@ -949,10 +949,10 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 					  struct request *rq)
 {
-	if (rq->sector >= cfqd->last_position)
-		return rq->sector - cfqd->last_position;
+	if (blk_rq_pos(rq) >= cfqd->last_position)
+		return blk_rq_pos(rq) - cfqd->last_position;
 	else
-		return cfqd->last_position - rq->sector;
+		return cfqd->last_position - blk_rq_pos(rq);
 }
 
 #define CIC_SEEK_THR	8 * 1024
@@ -1918,10 +1918,10 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic,
 
 	if (!cic->last_request_pos)
 		sdist = 0;
-	else if (cic->last_request_pos < rq->sector)
-		sdist = rq->sector - cic->last_request_pos;
+	else if (cic->last_request_pos < blk_rq_pos(rq))
+		sdist = blk_rq_pos(rq) - cic->last_request_pos;
 	else
-		sdist = cic->last_request_pos - rq->sector;
+		sdist = cic->last_request_pos - blk_rq_pos(rq);
 
 	/*
 	 * Don't allow the seek distance to get too large from the
@@ -2071,7 +2071,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 	cfq_update_io_seektime(cfqd, cic, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 
-	cic->last_request_pos = rq->sector + rq->nr_sectors;
+	cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 
 	if (cfqq == cfqd->active_queue) {
 		/*
diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c
index c4d991d4adef0..b547cbca7b23a 100644
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -138,7 +138,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
 
 		__rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
 		if (__rq) {
-			BUG_ON(sector != __rq->sector);
+			BUG_ON(sector != blk_rq_pos(__rq));
 
 			if (elv_rq_merge_ok(__rq, bio)) {
 				ret = ELEVATOR_FRONT_MERGE;
diff --git a/block/elevator.c b/block/elevator.c
index 1af5d9f04affe..918920056e42b 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -52,7 +52,7 @@ static const int elv_hash_shift = 6;
 #define ELV_HASH_FN(sec)	\
 		(hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
 #define ELV_HASH_ENTRIES	(1 << elv_hash_shift)
-#define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
+#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))
 
 DEFINE_TRACE(block_rq_insert);
 DEFINE_TRACE(block_rq_issue);
@@ -119,9 +119,9 @@ static inline int elv_try_merge(struct request *__rq, struct bio *bio)
 	 * we can merge and sequence is ok, check if it's possible
 	 */
 	if (elv_rq_merge_ok(__rq, bio)) {
-		if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
+		if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector)
 			ret = ELEVATOR_BACK_MERGE;
-		else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
+		else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector)
 			ret = ELEVATOR_FRONT_MERGE;
 	}
 
@@ -370,9 +370,9 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq)
 		parent = *p;
 		__rq = rb_entry(parent, struct request, rb_node);
 
-		if (rq->sector < __rq->sector)
+		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
 			p = &(*p)->rb_left;
-		else if (rq->sector > __rq->sector)
+		else if (blk_rq_pos(rq) > blk_rq_pos(__rq))
 			p = &(*p)->rb_right;
 		else
 			return __rq;
@@ -400,9 +400,9 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector)
 	while (n) {
 		rq = rb_entry(n, struct request, rb_node);
 
-		if (sector < rq->sector)
+		if (sector < blk_rq_pos(rq))
 			n = n->rb_left;
-		else if (sector > rq->sector)
+		else if (sector > blk_rq_pos(rq))
 			n = n->rb_right;
 		else
 			return rq;
@@ -441,14 +441,14 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq)
 			break;
 		if (pos->cmd_flags & stop_flags)
 			break;
-		if (rq->sector >= boundary) {
-			if (pos->sector < boundary)
+		if (blk_rq_pos(rq) >= boundary) {
+			if (blk_rq_pos(pos) < boundary)
 				continue;
 		} else {
-			if (pos->sector >= boundary)
+			if (blk_rq_pos(pos) >= boundary)
 				break;
 		}
-		if (rq->sector >= pos->sector)
+		if (blk_rq_pos(rq) >= blk_rq_pos(pos))
 			break;
 	}
 
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index f22ed6cc69f28..774ab05973a92 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3338,8 +3338,8 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_
 	}
 	Command->Completion = Request->end_io_data;
 	Command->LogicalDriveNumber = (long)Request->rq_disk->private_data;
-	Command->BlockNumber = Request->sector;
-	Command->BlockCount = Request->nr_sectors;
+	Command->BlockNumber = blk_rq_pos(Request);
+	Command->BlockCount = blk_rq_sectors(Request);
 	Command->Request = Request;
 	blkdev_dequeue_request(Request);
 	Command->SegmentCount = blk_rq_map_sg(req_q,
@@ -3431,7 +3431,7 @@ static void DAC960_queue_partial_rw(DAC960_Command_T *Command)
    * successfully as possible.
    */
   Command->SegmentCount = 1;
-  Command->BlockNumber = Request->sector;
+  Command->BlockNumber = blk_rq_pos(Request);
   Command->BlockCount = 1;
   DAC960_QueueReadWriteCommand(Command);
   return;
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 8ff95f2c0ede2..e4a14b94828ea 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1351,13 +1351,13 @@ static void redo_fd_request(void)
 	drive = floppy - unit;
 
 	/* Here someone could investigate to be more efficient */
-	for (cnt = 0; cnt < CURRENT->current_nr_sectors; cnt++) { 
+	for (cnt = 0; cnt < blk_rq_cur_sectors(CURRENT); cnt++) {
 #ifdef DEBUG
 		printk("fd: sector %ld + %d requested for %s\n",
-		       CURRENT->sector,cnt,
+		       blk_rq_pos(CURRENT), cnt,
 		       (rq_data_dir(CURRENT) == READ) ? "read" : "write");
 #endif
-		block = CURRENT->sector + cnt;
+		block = blk_rq_pos(CURRENT) + cnt;
 		if ((int)block > floppy->blocks) {
 			__blk_end_request_cur(CURRENT, -EIO);
 			goto repeat;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 25067287211fb..234024cda5ecf 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -725,7 +725,7 @@ static void do_fd_action( int drive )
 	    if (IS_BUFFERED( drive, ReqSide, ReqTrack )) {
 		if (ReqCmd == READ) {
 		    copy_buffer( SECTOR_BUFFER(ReqSector), ReqData );
-		    if (++ReqCnt < CURRENT->current_nr_sectors) {
+		    if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
 			/* read next sector */
 			setup_req_params( drive );
 			goto repeat;
@@ -1130,7 +1130,7 @@ static void fd_rwsec_done1(int status)
 		}
 	}
   
-	if (++ReqCnt < CURRENT->current_nr_sectors) {
+	if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
 		/* read next sector */
 		setup_req_params( SelectedDrive );
 		do_fd_action( SelectedDrive );
@@ -1394,7 +1394,7 @@ static void redo_fd_request(void)
 
 	DPRINT(("redo_fd_request: CURRENT=%p dev=%s CURRENT->sector=%ld\n",
 		CURRENT, CURRENT ? CURRENT->rq_disk->disk_name : "",
-		CURRENT ? CURRENT->sector : 0 ));
+		CURRENT ? blk_rq_pos(CURRENT) : 0 ));
 
 	IsFormatting = 0;
 
@@ -1440,7 +1440,7 @@ static void redo_fd_request(void)
 		UD.autoprobe = 0;
 	}
 	
-	if (CURRENT->sector + 1 > UDT->blocks) {
+	if (blk_rq_pos(CURRENT) + 1 > UDT->blocks) {
 		__blk_end_request_cur(CURRENT, -EIO);
 		goto repeat;
 	}
@@ -1450,7 +1450,7 @@ static void redo_fd_request(void)
 		
 	ReqCnt = 0;
 	ReqCmd = rq_data_dir(CURRENT);
-	ReqBlock = CURRENT->sector;
+	ReqBlock = blk_rq_pos(CURRENT);
 	ReqBuffer = CURRENT->buffer;
 	setup_req_params( drive );
 	do_fd_action( drive );
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index f22d4932433fe..ab7b04c0db70a 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2835,10 +2835,10 @@ static void do_cciss_request(struct request_queue *q)
 	c->Request.Timeout = 0;	// Don't time out
 	c->Request.CDB[0] =
 	    (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write;
-	start_blk = creq->sector;
+	start_blk = blk_rq_pos(creq);
 #ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n", (int)creq->sector,
-	       (int)creq->nr_sectors);
+	printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n",
+	       (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq));
 #endif				/* CCISS_DEBUG */
 
 	sg_init_table(tmp_sg, MAXSGENTRIES);
@@ -2864,8 +2864,8 @@ static void do_cciss_request(struct request_queue *q)
 		h->maxSG = seg;
 
 #ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss: Submitting %lu sectors in %d segments\n",
-	       creq->nr_sectors, seg);
+	printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n",
+	       blk_rq_sectors(creq), seg);
 #endif				/* CCISS_DEBUG */
 
 	c->Header.SGList = c->Header.SGTotal = seg;
@@ -2877,8 +2877,8 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[4] = (start_blk >> 8) & 0xff;
 			c->Request.CDB[5] = start_blk & 0xff;
 			c->Request.CDB[6] = 0;	// (sect >> 24) & 0xff; MSB
-			c->Request.CDB[7] = (creq->nr_sectors >> 8) & 0xff;
-			c->Request.CDB[8] = creq->nr_sectors & 0xff;
+			c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff;
+			c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0;
 		} else {
 			u32 upper32 = upper_32_bits(start_blk);
@@ -2893,10 +2893,10 @@ static void do_cciss_request(struct request_queue *q)
 			c->Request.CDB[7]= (start_blk >> 16) & 0xff;
 			c->Request.CDB[8]= (start_blk >>  8) & 0xff;
 			c->Request.CDB[9]= start_blk & 0xff;
-			c->Request.CDB[10]= (creq->nr_sectors >>  24) & 0xff;
-			c->Request.CDB[11]= (creq->nr_sectors >>  16) & 0xff;
-			c->Request.CDB[12]= (creq->nr_sectors >>  8) & 0xff;
-			c->Request.CDB[13]= creq->nr_sectors & 0xff;
+			c->Request.CDB[10]= (blk_rq_sectors(creq) >> 24) & 0xff;
+			c->Request.CDB[11]= (blk_rq_sectors(creq) >> 16) & 0xff;
+			c->Request.CDB[12]= (blk_rq_sectors(creq) >>  8) & 0xff;
+			c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff;
 			c->Request.CDB[14] = c->Request.CDB[15] = 0;
 		}
 	} else if (blk_pc_request(creq)) {
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 488a8f4a60aa2..a5caeff4718e1 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -919,10 +919,11 @@ static void do_ida_request(struct request_queue *q)
 	c->hdr.size = sizeof(rblk_t) >> 2;
 	c->size += sizeof(rblk_t);
 
-	c->req.hdr.blk = creq->sector;
+	c->req.hdr.blk = blk_rq_pos(creq);
 	c->rq = creq;
 DBGPX(
-	printk("sector=%d, nr_sectors=%d\n", creq->sector, creq->nr_sectors);
+	printk("sector=%d, nr_sectors=%u\n",
+	       blk_rq_pos(creq), blk_rq_sectors(creq));
 );
 	sg_init_table(tmp_sg, SG_MAX);
 	seg = blk_rq_map_sg(q, creq, tmp_sg);
@@ -940,9 +941,9 @@ DBGPX(
 						 tmp_sg[i].offset,
 						 tmp_sg[i].length, dir);
 	}
-DBGPX(	printk("Submitting %d sectors in %d segments\n", creq->nr_sectors, seg); );
+DBGPX(	printk("Submitting %u sectors in %d segments\n", blk_rq_sectors(creq), seg); );
 	c->req.hdr.sg_cnt = seg;
-	c->req.hdr.blk_cnt = creq->nr_sectors;
+	c->req.hdr.blk_cnt = blk_rq_sectors(creq);
 	c->req.hdr.cmd = (rq_data_dir(creq) == READ) ? IDA_READ : IDA_WRITE;
 	c->type = CMD_RWREQ;
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 1300df6f1642a..452486283386b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2303,7 +2303,7 @@ static void floppy_end_request(struct request *req, int error)
 
 	/* current_count_sectors can be zero if transfer failed */
 	if (error)
-		nr_sectors = req->current_nr_sectors;
+		nr_sectors = blk_rq_cur_sectors(req);
 	if (__blk_end_request(req, error, nr_sectors << 9))
 		return;
 
@@ -2332,7 +2332,7 @@ static void request_done(int uptodate)
 	if (uptodate) {
 		/* maintain values for invalidation on geometry
 		 * change */
-		block = current_count_sectors + req->sector;
+		block = current_count_sectors + blk_rq_pos(req);
 		INFBOUND(DRS->maxblock, block);
 		if (block > _floppy->sect)
 			DRS->maxtrack = 1;
@@ -2346,10 +2346,10 @@ static void request_done(int uptodate)
 			/* record write error information */
 			DRWE->write_errors++;
 			if (DRWE->write_errors == 1) {
-				DRWE->first_error_sector = req->sector;
+				DRWE->first_error_sector = blk_rq_pos(req);
 				DRWE->first_error_generation = DRS->generation;
 			}
-			DRWE->last_error_sector = req->sector;
+			DRWE->last_error_sector = blk_rq_pos(req);
 			DRWE->last_error_generation = DRS->generation;
 		}
 		spin_lock_irqsave(q->queue_lock, flags);
@@ -2503,24 +2503,24 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	max_sector = transfer_size(ssize,
 				   min(max_sector, max_sector_2),
-				   current_req->nr_sectors);
+				   blk_rq_sectors(current_req));
 
 	if (current_count_sectors <= 0 && CT(COMMAND) == FD_WRITE &&
-	    buffer_max > fsector_t + current_req->nr_sectors)
+	    buffer_max > fsector_t + blk_rq_sectors(current_req))
 		current_count_sectors = min_t(int, buffer_max - fsector_t,
-					      current_req->nr_sectors);
+					      blk_rq_sectors(current_req));
 
 	remaining = current_count_sectors << 9;
 #ifdef FLOPPY_SANITY_CHECK
-	if ((remaining >> 9) > current_req->nr_sectors &&
+	if ((remaining >> 9) > blk_rq_sectors(current_req) &&
 	    CT(COMMAND) == FD_WRITE) {
 		DPRINT("in copy buffer\n");
 		printk("current_count_sectors=%ld\n", current_count_sectors);
 		printk("remaining=%d\n", remaining >> 9);
-		printk("current_req->nr_sectors=%ld\n",
-		       current_req->nr_sectors);
+		printk("current_req->nr_sectors=%u\n",
+		       blk_rq_sectors(current_req));
 		printk("current_req->current_nr_sectors=%u\n",
-		       current_req->current_nr_sectors);
+		       blk_rq_cur_sectors(current_req));
 		printk("max_sector=%d\n", max_sector);
 		printk("ssize=%d\n", ssize);
 	}
@@ -2530,7 +2530,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9);
 
-	size = current_req->current_nr_sectors << 9;
+	size = blk_rq_cur_sectors(current_req) << 9;
 
 	rq_for_each_segment(bv, current_req, iter) {
 		if (!remaining)
@@ -2648,10 +2648,10 @@ static int make_raw_rw_request(void)
 
 	max_sector = _floppy->sect * _floppy->head;
 
-	TRACK = (int)current_req->sector / max_sector;
-	fsector_t = (int)current_req->sector % max_sector;
+	TRACK = (int)blk_rq_pos(current_req) / max_sector;
+	fsector_t = (int)blk_rq_pos(current_req) % max_sector;
 	if (_floppy->track && TRACK >= _floppy->track) {
-		if (current_req->current_nr_sectors & 1) {
+		if (blk_rq_cur_sectors(current_req) & 1) {
 			current_count_sectors = 1;
 			return 1;
 		} else
@@ -2669,7 +2669,7 @@ static int make_raw_rw_request(void)
 		if (fsector_t >= max_sector) {
 			current_count_sectors =
 			    min_t(int, _floppy->sect - fsector_t,
-				  current_req->nr_sectors);
+				  blk_rq_sectors(current_req));
 			return 1;
 		}
 		SIZECODE = 2;
@@ -2720,7 +2720,7 @@ static int make_raw_rw_request(void)
 
 	in_sector_offset = (fsector_t % _floppy->sect) % ssize;
 	aligned_sector_t = fsector_t - in_sector_offset;
-	max_size = current_req->nr_sectors;
+	max_size = blk_rq_sectors(current_req);
 	if ((raw_cmd->track == buffer_track) &&
 	    (current_drive == buffer_drive) &&
 	    (fsector_t >= buffer_min) && (fsector_t < buffer_max)) {
@@ -2729,10 +2729,10 @@ static int make_raw_rw_request(void)
 			copy_buffer(1, max_sector, buffer_max);
 			return 1;
 		}
-	} else if (in_sector_offset || current_req->nr_sectors < ssize) {
+	} else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) {
 		if (CT(COMMAND) == FD_WRITE) {
-			if (fsector_t + current_req->nr_sectors > ssize &&
-			    fsector_t + current_req->nr_sectors < ssize + ssize)
+			if (fsector_t + blk_rq_sectors(current_req) > ssize &&
+			    fsector_t + blk_rq_sectors(current_req) < ssize + ssize)
 				max_size = ssize + ssize;
 			else
 				max_size = ssize;
@@ -2776,7 +2776,7 @@ static int make_raw_rw_request(void)
 		    (indirect * 2 > direct * 3 &&
 		     *errors < DP->max_errors.read_track && ((!probing
 		       || (DP->read_track & (1 << DRS->probed_format)))))) {
-			max_size = current_req->nr_sectors;
+			max_size = blk_rq_sectors(current_req);
 		} else {
 			raw_cmd->kernel_data = current_req->buffer;
 			raw_cmd->length = current_count_sectors << 9;
@@ -2801,7 +2801,7 @@ static int make_raw_rw_request(void)
 	    fsector_t > buffer_max ||
 	    fsector_t < buffer_min ||
 	    ((CT(COMMAND) == FD_READ ||
-	      (!in_sector_offset && current_req->nr_sectors >= ssize)) &&
+	      (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) &&
 	     max_sector > 2 * max_buffer_sectors + buffer_min &&
 	     max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)
 	    /* not enough space */
@@ -2879,8 +2879,8 @@ static int make_raw_rw_request(void)
 				printk("write\n");
 			return 0;
 		}
-	} else if (raw_cmd->length > current_req->nr_sectors << 9 ||
-		   current_count_sectors > current_req->nr_sectors) {
+	} else if (raw_cmd->length > blk_rq_sectors(current_req) << 9 ||
+		   current_count_sectors > blk_rq_sectors(current_req)) {
 		DPRINT("buffer overrun in direct transfer\n");
 		return 0;
 	} else if (raw_cmd->length < current_count_sectors << 9) {
@@ -2990,8 +2990,9 @@ static void do_fd_request(struct request_queue * q)
 	if (usage_count == 0) {
 		printk("warning: usage count=0, current_req=%p exiting\n",
 		       current_req);
-		printk("sect=%ld type=%x flags=%x\n", (long)current_req->sector,
-		       current_req->cmd_type, current_req->cmd_flags);
+		printk("sect=%ld type=%x flags=%x\n",
+		       (long)blk_rq_pos(current_req), current_req->cmd_type,
+		       current_req->cmd_flags);
 		return;
 	}
 	if (test_bit(0, &fdc_busy)) {
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 75b9ca95c4eb8..a3b39940ce02d 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -228,7 +228,7 @@ static void dump_status(const char *msg, unsigned int stat)
 			printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
 				inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
 			if (CURRENT)
-				printk(", sector=%ld", CURRENT->sector);
+				printk(", sector=%ld", blk_rq_pos(CURRENT));
 		}
 		printk("\n");
 	}
@@ -457,9 +457,9 @@ static void read_intr(void)
 	req = CURRENT;
 	insw(HD_DATA, req->buffer, 256);
 #ifdef DEBUG
-	printk("%s: read: sector %ld, remaining = %ld, buffer=%p\n",
-		req->rq_disk->disk_name, req->sector + 1, req->nr_sectors - 1,
-		req->buffer+512);
+	printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
+	       req->rq_disk->disk_name, blk_rq_pos(req) + 1,
+	       blk_rq_sectors(req) - 1, req->buffer+512);
 #endif
 	if (__blk_end_request(req, 0, 512)) {
 		SET_HANDLER(&read_intr);
@@ -485,7 +485,7 @@ static void write_intr(void)
 			continue;
 		if (!OK_STATUS(i))
 			break;
-		if ((req->nr_sectors <= 1) || (i & DRQ_STAT))
+		if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT))
 			goto ok_to_write;
 	} while (--retries > 0);
 	dump_status("write_intr", i);
@@ -589,8 +589,8 @@ static void hd_request(void)
 		return;
 	}
 	disk = req->rq_disk->private_data;
-	block = req->sector;
-	nsect = req->nr_sectors;
+	block = blk_rq_pos(req);
+	nsect = blk_rq_sectors(req);
 	if (block >= get_capacity(req->rq_disk) ||
 	    ((block+nsect) > get_capacity(req->rq_disk))) {
 		printk("%s: bad access: block=%d, count=%d\n",
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 71e56cc28cac3..826c3492b9fe2 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -220,7 +220,8 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 			if (host->breq) {
 				req = elv_next_request(host->breq);
 				if (req)
-					printk(", sector=%u", (u32)req->sector);
+					printk(", sector=%u",
+					       (unsigned int)blk_rq_pos(req));
 			}
 
 		}
@@ -493,12 +494,12 @@ static void mg_read(struct request *req)
 	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) !=
-			MG_ERR_NONE)
+	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
+		   MG_CMD_RD, NULL) != MG_ERR_NONE)
 		mg_bad_rw_intr(host);
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       req->nr_sectors, req->sector, req->buffer);
+	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
 
 	do {
 		u16 *buff = (u16 *)req->buffer;
@@ -522,14 +523,14 @@ static void mg_write(struct request *req)
 	u32 j;
 	struct mg_host *host = req->rq_disk->private_data;
 
-	if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) !=
-			MG_ERR_NONE) {
+	if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
+		   MG_CMD_WR, NULL) != MG_ERR_NONE) {
 		mg_bad_rw_intr(host);
 		return;
 	}
 
 	MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
-	       req->nr_sectors, req->sector, req->buffer);
+	       blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
 
 	do {
 		u16 *buff = (u16 *)req->buffer;
@@ -579,7 +580,7 @@ static void mg_read_intr(struct mg_host *host)
 			      (i << 1));
 
 	MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-			req->sector, req->nr_sectors - 1, req->buffer);
+	       blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer);
 
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
@@ -609,7 +610,7 @@ static void mg_write_intr(struct mg_host *host)
 			break;
 		if (!MG_READY_OK(i))
 			break;
-		if ((req->nr_sectors <= 1) || (i & ATA_DRQ))
+		if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ))
 			goto ok_to_write;
 	} while (0);
 	mg_dump_status("mg_write_intr", i, host);
@@ -627,7 +628,7 @@ static void mg_write_intr(struct mg_host *host)
 			buff++;
 		}
 		MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
-				req->sector, req->nr_sectors, req->buffer);
+		       blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
 		host->mg_do_intr = mg_write_intr;
 		mod_timer(&host->timer, jiffies + 3 * HZ);
 	}
@@ -749,9 +750,9 @@ static void mg_request(struct request_queue *q)
 
 		del_timer(&host->timer);
 
-		sect_num = req->sector;
+		sect_num = blk_rq_pos(req);
 		/* deal whole segments */
-		sect_cnt = req->nr_sectors;
+		sect_cnt = blk_rq_sectors(req);
 
 		/* sanity check */
 		if (sect_num >= get_capacity(req->rq_disk) ||
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index a9ab8be9d92f1..977a57377930d 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -110,7 +110,7 @@ static void nbd_end_request(struct request *req)
 			req, error ? "failed" : "done");
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_end_request(req, error, req->nr_sectors << 9);
+	__blk_end_request(req, error, blk_rq_sectors(req) << 9);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -231,19 +231,19 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
-	unsigned long size = req->nr_sectors << 9;
+	unsigned long size = blk_rq_sectors(req) << 9;
 
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(nbd_cmd(req));
-	request.from = cpu_to_be64((u64) req->sector << 9);
+	request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
 	request.len = htonl(size);
 	memcpy(request.handle, &req, sizeof(req));
 
-	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n",
+	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
 			lo->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
-			(unsigned long long)req->sector << 9,
-			req->nr_sectors << 9);
+			(unsigned long long)blk_rq_pos(req) << 9,
+			blk_rq_sectors(req) << 9);
 	result = sock_xmit(lo, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 9fd57c2aa4636..2d5dc0af55e4d 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -728,8 +728,8 @@ static void do_pcd_request(struct request_queue * q)
 			if (cd != pcd_current)
 				pcd_bufblk = -1;
 			pcd_current = cd;
-			pcd_sector = pcd_req->sector;
-			pcd_count = pcd_req->current_nr_sectors;
+			pcd_sector = blk_rq_pos(pcd_req);
+			pcd_count = blk_rq_cur_sectors(pcd_req);
 			pcd_buf = pcd_req->buffer;
 			pcd_busy = 1;
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 0732df4e901a5..9ec5d4ac0b644 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -444,11 +444,11 @@ static enum action do_pd_io_start(void)
 
 	pd_cmd = rq_data_dir(pd_req);
 	if (pd_cmd == READ || pd_cmd == WRITE) {
-		pd_block = pd_req->sector;
-		pd_count = pd_req->current_nr_sectors;
+		pd_block = blk_rq_pos(pd_req);
+		pd_count = blk_rq_cur_sectors(pd_req);
 		if (pd_block + pd_count > get_capacity(pd_req->rq_disk))
 			return Fail;
-		pd_run = pd_req->nr_sectors;
+		pd_run = blk_rq_sectors(pd_req);
 		pd_buf = pd_req->buffer;
 		pd_retries = 0;
 		if (pd_cmd == READ)
@@ -479,7 +479,7 @@ static int pd_next_buf(void)
 		return 0;
 	spin_lock_irqsave(&pd_lock, saved_flags);
 	__blk_end_request_cur(pd_req, 0);
-	pd_count = pd_req->current_nr_sectors;
+	pd_count = blk_rq_cur_sectors(pd_req);
 	pd_buf = pd_req->buffer;
 	spin_unlock_irqrestore(&pd_lock, saved_flags);
 	return 0;
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 3871e3586d6d2..e88c889aa7f21 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -768,9 +768,9 @@ static void do_pf_request(struct request_queue * q)
 		return;
 
 	pf_current = pf_req->rq_disk->private_data;
-	pf_block = pf_req->sector;
-	pf_run = pf_req->nr_sectors;
-	pf_count = pf_req->current_nr_sectors;
+	pf_block = blk_rq_pos(pf_req);
+	pf_run = blk_rq_sectors(pf_req);
+	pf_count = blk_rq_cur_sectors(pf_req);
 
 	if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) {
 		pf_end_request(-EIO);
@@ -810,7 +810,7 @@ static int pf_next_buf(void)
 		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 		if (!pf_req)
 			return 1;
-		pf_count = pf_req->current_nr_sectors;
+		pf_count = blk_rq_cur_sectors(pf_req);
 		pf_buf = pf_req->buffer;
 	}
 	return 0;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index c2388673684eb..8d583081b50a9 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -134,13 +134,12 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	rq_for_each_segment(bv, req, iter)
 		n++;
 	dev_dbg(&dev->sbd.core,
-		"%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n",
-		__func__, __LINE__, op, n, req->nr_sectors,
-		blk_rq_sectors(req));
+		"%s:%u: %s req has %u bvecs for %u sectors\n",
+		__func__, __LINE__, op, n, blk_rq_sectors(req));
 #endif
 
-	start_sector = req->sector * priv->blocking_factor;
-	sectors = req->nr_sectors * priv->blocking_factor;
+	start_sector = blk_rq_pos(req) * priv->blocking_factor;
+	sectors = blk_rq_sectors(req) * priv->blocking_factor;
 	dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n",
 		__func__, __LINE__, op, sectors, start_sector);
 
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index f59887c5ffbdb..9f351bfa15ea8 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -416,7 +416,7 @@ static int __send_request(struct request *req)
 		desc->slice = 0;
 	}
 	desc->status = ~0;
-	desc->offset = (req->sector << 9) / port->vdisk_block_size;
+	desc->offset = (blk_rq_pos(req) << 9) / port->vdisk_block_size;
 	desc->size = len;
 	desc->ncookies = err;
 
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 97ef4266c4c7e..fc6a1c3229337 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -531,7 +531,7 @@ static void redo_fd_request(struct request_queue *q)
 	while ((req = elv_next_request(q))) {
 
 		fs = req->rq_disk->private_data;
-		if (req->sector < 0 || req->sector >= fs->total_secs) {
+		if (blk_rq_pos(req) >= fs->total_secs) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
@@ -551,8 +551,8 @@ static void redo_fd_request(struct request_queue *q)
 			__blk_end_request_cur(req, -EIO);
 			break;
 		case READ:
-			if (floppy_read_sectors(fs, req->sector,
-						req->current_nr_sectors,
+			if (floppy_read_sectors(fs, blk_rq_pos(req),
+						blk_rq_cur_sectors(req),
 						req->buffer)) {
 				__blk_end_request_cur(req, -EIO);
 				continue;
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index 424855945b9b3..c1b9a4dc11ba9 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -312,14 +312,14 @@ static void start_request(struct floppy_state *fs)
 	}
 	while (fs->state == idle && (req = elv_next_request(swim3_queue))) {
 #if 0
-		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n",
+		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
 		       req->rq_disk->disk_name, req->cmd,
-		       (long)req->sector, req->nr_sectors, req->buffer);
-		printk("           errors=%d current_nr_sectors=%ld\n",
-		       req->errors, req->current_nr_sectors);
+		       (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
+		printk("           errors=%d current_nr_sectors=%u\n",
+		       req->errors, blk_rq_cur_sectors(req));
 #endif
 
-		if (req->sector >= fs->total_secs) {
+		if (blk_rq_pos(req) >= fs->total_secs) {
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
@@ -337,13 +337,14 @@ static void start_request(struct floppy_state *fs)
 			}
 		}
 
-		/* Do not remove the cast. req->sector is now a sector_t and
-		 * can be 64 bits, but it will never go past 32 bits for this
-		 * driver anyway, so we can safely cast it down and not have
-		 * to do a 64/32 division
+		/* Do not remove the cast. blk_rq_pos(req) is now a
+		 * sector_t and can be 64 bits, but it will never go
+		 * past 32 bits for this driver anyway, so we can
+		 * safely cast it down and not have to do a 64/32
+		 * division
 		 */
-		fs->req_cyl = ((long)req->sector) / fs->secpercyl;
-		x = ((long)req->sector) % fs->secpercyl;
+		fs->req_cyl = ((long)blk_rq_pos(req)) / fs->secpercyl;
+		x = ((long)blk_rq_pos(req)) % fs->secpercyl;
 		fs->head = x / fs->secpertrack;
 		fs->req_sector = x % fs->secpertrack + 1;
 		fd_req = req;
@@ -420,7 +421,7 @@ static inline void setup_transfer(struct floppy_state *fs)
 	struct dbdma_cmd *cp = fs->dma_cmd;
 	struct dbdma_regs __iomem *dr = fs->dma;
 
-	if (fd_req->current_nr_sectors <= 0) {
+	if (blk_rq_cur_sectors(fd_req) <= 0) {
 		printk(KERN_ERR "swim3: transfer 0 sectors?\n");
 		return;
 	}
@@ -428,8 +429,8 @@ static inline void setup_transfer(struct floppy_state *fs)
 		n = 1;
 	else {
 		n = fs->secpertrack - fs->req_sector + 1;
-		if (n > fd_req->current_nr_sectors)
-			n = fd_req->current_nr_sectors;
+		if (n > blk_rq_cur_sectors(fd_req))
+			n = blk_rq_cur_sectors(fd_req);
 	}
 	fs->scount = n;
 	swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
@@ -600,7 +601,8 @@ static void xfer_timeout(unsigned long data)
 	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
 	out_8(&sw->select, RELAX);
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
-	       (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector);
+	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
+	       (long)blk_rq_pos(fd_req));
 	__blk_end_request_cur(fd_req, -EIO);
 	fs->state = idle;
 	start_request(fs);
@@ -714,7 +716,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 			} else {
 				printk("swim3: error %sing block %ld (err=%x)\n",
 				       rq_data_dir(fd_req) == WRITE? "writ": "read",
-				       (long)fd_req->sector, err);
+				       (long)blk_rq_pos(fd_req), err);
 				__blk_end_request_cur(fd_req, -EIO);
 				fs->state = idle;
 			}
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 60e85bb6f7902..087c94c8b2dab 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -903,10 +903,10 @@ static void carm_rq_fn(struct request_queue *q)
 	msg->sg_count	= n_elem;
 	msg->sg_type	= SGT_32BIT;
 	msg->handle	= cpu_to_le32(TAG_ENCODE(crq->tag));
-	msg->lba	= cpu_to_le32(rq->sector & 0xffffffff);
-	tmp		= (rq->sector >> 16) >> 16;
+	msg->lba	= cpu_to_le32(blk_rq_pos(rq) & 0xffffffff);
+	tmp		= (blk_rq_pos(rq) >> 16) >> 16;
 	msg->lba_high	= cpu_to_le16( (u16) tmp );
-	msg->lba_count	= cpu_to_le16(rq->nr_sectors);
+	msg->lba_count	= cpu_to_le16(blk_rq_sectors(rq));
 
 	msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg);
 	for (i = 0; i < n_elem; i++) {
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 8c2cc71327e3f..dc3b899ad2b3c 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -726,8 +726,8 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	 * The call to blk_queue_hardsect_size() guarantees that request
 	 * is aligned, but it is given in terms of 512 byte units, always.
 	 */
-	block = rq->sector >> lun->capacity.bshift;
-	nblks = rq->nr_sectors >> lun->capacity.bshift;
+	block = blk_rq_pos(rq) >> lun->capacity.bshift;
+	nblks = blk_rq_sectors(rq) >> lun->capacity.bshift;
 
 	cmd->cdb[0] = (cmd->dir == UB_DIR_READ)? READ_10: WRITE_10;
 	/* 10-byte uses 4 bytes of LBA: 2147483648KB, 2097152MB, 2048GB */
@@ -739,7 +739,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	cmd->cdb[8] = nblks;
 	cmd->cdb_len = 10;
 
-	cmd->len = rq->nr_sectors * 512;
+	cmd->len = blk_rq_sectors(rq) * 512;
 }
 
 static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index e821eed7132f4..2086cb12d3ec7 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -252,7 +252,7 @@ static int send_request(struct request *req)
 	struct viodasd_device *d;
 	unsigned long flags;
 
-	start = (u64)req->sector << 9;
+	start = (u64)blk_rq_pos(req) << 9;
 
 	if (rq_data_dir(req) == READ) {
 		direction = DMA_FROM_DEVICE;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 50745e64414e5..1980ab4563569 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -85,7 +85,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	vbr->req = req;
 	if (blk_fs_request(vbr->req)) {
 		vbr->out_hdr.type = 0;
-		vbr->out_hdr.sector = vbr->req->sector;
+		vbr->out_hdr.sector = blk_rq_pos(vbr->req);
 		vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
 	} else if (blk_pc_request(vbr->req)) {
 		vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 14be4c1ed1aaf..4ef88018bcde1 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -306,8 +306,8 @@ static void do_xd_request (struct request_queue * q)
 		return;
 
 	while ((req = elv_next_request(q)) != NULL) {
-		unsigned block = req->sector;
-		unsigned count = req->nr_sectors;
+		unsigned block = blk_rq_pos(req);
+		unsigned count = blk_rq_sectors(req);
 		XD_INFO *disk = req->rq_disk->private_data;
 		int res = 0;
 		int retry;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index b4564479f6416..91fc56597e9b7 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -231,7 +231,7 @@ static int blkif_queue_request(struct request *req)
 	info->shadow[id].request = (unsigned long)req;
 
 	ring_req->id = id;
-	ring_req->sector_number = (blkif_sector_t)req->sector;
+	ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
 	ring_req->handle = info->handle;
 
 	ring_req->operation = rq_data_dir(req) ?
@@ -310,11 +310,10 @@ static void do_blkif_request(struct request_queue *rq)
 			goto wait;
 
 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-			 "(%u/%li) buffer:%p [%s]\n",
-			 req, req->cmd, (unsigned long)req->sector,
-			 req->current_nr_sectors,
-			 req->nr_sectors, req->buffer,
-			 rq_data_dir(req) ? "write" : "read");
+			 "(%u/%u) buffer:%p [%s]\n",
+			 req, req->cmd, (unsigned long)blk_rq_pos(req),
+			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
+			 req->buffer, rq_data_dir(req) ? "write" : "read");
 
 
 		blkdev_dequeue_request(req);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 5722931d14c50..97c99b43f8815 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -646,13 +646,14 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
 			"request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n",
-			(unsigned long long) req->sector, blk_rq_sectors(req),
-			req->current_nr_sectors, rq_data_dir(req));
+			(unsigned long long)blk_rq_pos(req),
+			blk_rq_sectors(req), blk_rq_cur_sectors(req),
+			rq_data_dir(req));
 
 		ace->req = req;
 		ace->data_ptr = req->buffer;
-		ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR;
-		ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF);
+		ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR;
+		ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF);
 
 		count = blk_rq_sectors(req);
 		if (rq_data_dir(req)) {
@@ -688,7 +689,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			dev_dbg(ace->dev,
 				"CFBSY set; t=%i iter=%i c=%i dc=%i irq=%i\n",
 				ace->fsm_task, ace->fsm_iter_num,
-				ace->req->current_nr_sectors * 16,
+				blk_rq_cur_sectors(ace->req) * 16,
 				ace->data_count, ace->in_irq);
 			ace_fsm_yield(ace);	/* need to poll CFBSY bit */
 			break;
@@ -697,7 +698,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			dev_dbg(ace->dev,
 				"DATABUF not set; t=%i iter=%i c=%i dc=%i irq=%i\n",
 				ace->fsm_task, ace->fsm_iter_num,
-				ace->req->current_nr_sectors * 16,
+				blk_rq_cur_sectors(ace->req) * 16,
 				ace->data_count, ace->in_irq);
 			ace_fsm_yieldirq(ace);
 			break;
@@ -721,10 +722,10 @@ static void ace_fsm_dostate(struct ace_device *ace)
 					blk_rq_cur_bytes(ace->req))) {
 			/* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
 			 *      blk_rq_sectors(ace->req),
-			 *      ace->req->current_nr_sectors);
+			 *      blk_rq_cur_sectors(ace->req));
 			 */
 			ace->data_ptr = ace->req->buffer;
-			ace->data_count = ace->req->current_nr_sectors * 16;
+			ace->data_count = blk_rq_cur_sectors(ace->req) * 16;
 			ace_fsm_yieldirq(ace);
 			break;
 		}
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index b66ad58a3c38a..d4e6b71f514ac 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -71,12 +71,12 @@ static void do_z2_request(struct request_queue *q)
 {
 	struct request *req;
 	while ((req = elv_next_request(q)) != NULL) {
-		unsigned long start = req->sector << 9;
-		unsigned long len  = req->current_nr_sectors << 9;
+		unsigned long start = blk_rq_pos(req) << 9;
+		unsigned long len  = blk_rq_cur_sectors(req) << 9;
 
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
-				req->sector, req->current_nr_sectors);
+				blk_rq_pos(req), blk_rq_cur_sectors(req));
 			__blk_end_request_cur(req, -EIO);
 			continue;
 		}
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index cab2b1fb2fe7a..488423cab51af 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -584,8 +584,8 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 	list_for_each_safe(elem, next, &gdrom_deferred) {
 		req = list_entry(elem, struct request, queuelist);
 		spin_unlock(&gdrom_lock);
-		block = req->sector/GD_TO_BLK + GD_SESSION_OFFSET;
-		block_cnt = req->nr_sectors/GD_TO_BLK;
+		block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET;
+		block_cnt = blk_rq_sectors(req)/GD_TO_BLK;
 		ctrl_outl(PHYSADDR(req->buffer), GDROM_DMA_STARTADDR_REG);
 		ctrl_outl(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG);
 		ctrl_outl(1, GDROM_DMA_DIRECTION_REG);
@@ -661,7 +661,7 @@ static void gdrom_request(struct request_queue *rq)
 			printk(" write request ignored\n");
 			__blk_end_request_cur(req, -EIO);
 		}
-		if (req->nr_sectors)
+		if (blk_rq_sectors(req))
 			gdrom_request_handler_dma(req);
 		else
 			__blk_end_request_cur(req, -EIO);
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index cc3efa096e1a0..6e190a93d8dfc 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -282,7 +282,7 @@ static int send_request(struct request *req)
 			viopath_targetinst(viopath_hostLp),
 			(u64)req, VIOVERSION << 16,
 			((u64)DEVICE_NR(diskinfo) << 48) | dmaaddr,
-			(u64)req->sector * 512, len, 0);
+			(u64)blk_rq_pos(req) * 512, len, 0);
 	if (hvrc != HvLpEvent_Rc_Good) {
 		printk(VIOCD_KERN_WARNING "hv error on op %d\n", (int)hvrc);
 		return -1;
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index a41634699f846..9e600d22f40e3 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -677,10 +677,10 @@ static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
 			continue;
 		}
 
-		t_sec = msb->block_req->sector << 9;
+		t_sec = blk_rq_pos(msb->block_req) << 9;
 		sector_div(t_sec, msb->page_size);
 
-		count = msb->block_req->nr_sectors << 9;
+		count = blk_rq_sectors(msb->block_req) << 9;
 		count /= msb->page_size;
 
 		param.system = msb->system;
@@ -745,7 +745,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
 					t_len *= msb->page_size;
 			}
 		} else
-			t_len = msb->block_req->nr_sectors << 9;
+			t_len = blk_rq_sectors(msb->block_req) << 9;
 
 		dev_dbg(&card->dev, "transferred %x (%d)\n", t_len, error);
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 56e60f0d53123..510eb47263746 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -761,7 +761,7 @@ static int i2o_block_transfer(struct request *req)
 			break;
 
 		case CACHE_SMARTFETCH:
-			if (req->nr_sectors > 16)
+			if (blk_rq_sectors(req) > 16)
 				ctl_flags = 0x201F0008;
 			else
 				ctl_flags = 0x001F0000;
@@ -781,13 +781,13 @@ static int i2o_block_transfer(struct request *req)
 			ctl_flags = 0x001F0010;
 			break;
 		case CACHE_SMARTBACK:
-			if (req->nr_sectors > 16)
+			if (blk_rq_sectors(req) > 16)
 				ctl_flags = 0x001F0004;
 			else
 				ctl_flags = 0x001F0010;
 			break;
 		case CACHE_SMARTTHROUGH:
-			if (req->nr_sectors > 16)
+			if (blk_rq_sectors(req) > 16)
 				ctl_flags = 0x001F0004;
 			else
 				ctl_flags = 0x001F0010;
@@ -827,22 +827,24 @@ static int i2o_block_transfer(struct request *req)
 
 		*mptr++ = cpu_to_le32(scsi_flags);
 
-		*((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec);
-		*((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec);
+		*((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec);
+		*((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec);
 
 		memcpy(mptr, cmd, 10);
 		mptr += 4;
-		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
+		*mptr++ =
+		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
 	} else
 #endif
 	{
 		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
 		*mptr++ = cpu_to_le32(ctl_flags);
-		*mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT);
 		*mptr++ =
-		    cpu_to_le32((u32) (req->sector << KERNEL_SECTOR_SHIFT));
+		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
+		*mptr++ =
+		    cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT));
 		*mptr++ =
-		    cpu_to_le32(req->sector >> (32 - KERNEL_SECTOR_SHIFT));
+		    cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT));
 	}
 
 	if (!i2o_block_sglist_alloc(c, ireq, &mptr)) {
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index fe8041e619eab..949e99770ad6f 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -243,7 +243,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		brq.mrq.cmd = &brq.cmd;
 		brq.mrq.data = &brq.data;
 
-		brq.cmd.arg = req->sector;
+		brq.cmd.arg = blk_rq_pos(req);
 		if (!mmc_card_blockaddr(card))
 			brq.cmd.arg <<= 9;
 		brq.cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC;
@@ -251,7 +251,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		brq.stop.opcode = MMC_STOP_TRANSMISSION;
 		brq.stop.arg = 0;
 		brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC;
-		brq.data.blocks = req->nr_sectors;
+		brq.data.blocks = blk_rq_sectors(req);
 
 		/*
 		 * After a read error, we redo the request one sector at a time
@@ -293,7 +293,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 		 * Adjust the sg list so it is the same size as the
 		 * request.
 		 */
-		if (brq.data.blocks != req->nr_sectors) {
+		if (brq.data.blocks != blk_rq_sectors(req)) {
 			int i, data_size = brq.data.blocks << 9;
 			struct scatterlist *sg;
 
@@ -344,8 +344,8 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req)
 			printk(KERN_ERR "%s: error %d transferring data,"
 			       " sector %u, nr %u, card status %#x\n",
 			       req->rq_disk->disk_name, brq.data.error,
-			       (unsigned)req->sector,
-			       (unsigned)req->nr_sectors, status);
+			       (unsigned)blk_rq_pos(req),
+			       (unsigned)blk_rq_sectors(req), status);
 		}
 
 		if (brq.stop.error) {
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 76c4c8d130736..4ea2e67ac97cf 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -47,8 +47,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	unsigned long block, nsect;
 	char *buf;
 
-	block = req->sector << 9 >> tr->blkshift;
-	nsect = req->current_nr_sectors << 9 >> tr->blkshift;
+	block = blk_rq_pos(req) << 9 >> tr->blkshift;
+	nsect = blk_rq_cur_sectors(req) << 9 >> tr->blkshift;
 
 	buf = req->buffer;
 
@@ -59,7 +59,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	if (!blk_fs_request(req))
 		return -EIO;
 
-	if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk))
+	if (blk_rq_pos(req) + blk_rq_cur_sectors(req) >
+	    get_capacity(req->rq_disk))
 		return -EIO;
 
 	switch(rq_data_dir(req)) {
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index fabec95686b0f..7df03c7aea0d3 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -603,7 +603,7 @@ static void dasd_profile_end(struct dasd_block *block,
 	if (dasd_profile_level != DASD_PROFILE_ON)
 		return;
 
-	sectors = req->nr_sectors;
+	sectors = blk_rq_sectors(req);
 	if (!cqr->buildclk || !cqr->startclk ||
 	    !cqr->stopclk || !cqr->endclk ||
 	    !sectors)
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index b9a7f77334468..2efaddfae5608 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -505,8 +505,9 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev,
 		return ERR_PTR(-EINVAL);
 	blksize = block->bp_block;
 	/* Calculate record id of first and last block. */
-	first_rec = req->sector >> block->s2b_shift;
-	last_rec = (req->sector + req->nr_sectors - 1) >> block->s2b_shift;
+	first_rec = blk_rq_pos(req) >> block->s2b_shift;
+	last_rec =
+		(blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift;
 	/* Check struct bio and count the number of blocks for the request. */
 	count = 0;
 	rq_for_each_segment(bv, req, iter) {
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index cb52da033f061..a41c94053e64b 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -2354,10 +2354,10 @@ static struct dasd_ccw_req *dasd_eckd_build_cp(struct dasd_device *startdev,
 	blksize = block->bp_block;
 	blk_per_trk = recs_per_track(&private->rdc_data, 0, blksize);
 	/* Calculate record id of first and last block. */
-	first_rec = first_trk = req->sector >> block->s2b_shift;
+	first_rec = first_trk = blk_rq_pos(req) >> block->s2b_shift;
 	first_offs = sector_div(first_trk, blk_per_trk);
 	last_rec = last_trk =
-		(req->sector + req->nr_sectors - 1) >> block->s2b_shift;
+		(blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift;
 	last_offs = sector_div(last_trk, blk_per_trk);
 	cdlspecial = (private->uses_cdl && first_rec < 2*blk_per_trk);
 
@@ -2420,7 +2420,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req)
 	private = (struct dasd_eckd_private *) cqr->block->base->private;
 	blksize = cqr->block->bp_block;
 	blk_per_trk = recs_per_track(&private->rdc_data, 0, blksize);
-	recid = req->sector >> cqr->block->s2b_shift;
+	recid = blk_rq_pos(req) >> cqr->block->s2b_shift;
 	ccw = cqr->cpaddr;
 	/* Skip over define extent & locate record. */
 	ccw++;
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index a3eb6fd146731..8912358daa2fd 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -270,8 +270,9 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev,
 		return ERR_PTR(-EINVAL);
 	blksize = block->bp_block;
 	/* Calculate record id of first and last block. */
-	first_rec = req->sector >> block->s2b_shift;
-	last_rec = (req->sector + req->nr_sectors - 1) >> block->s2b_shift;
+	first_rec = blk_rq_pos(req) >> block->s2b_shift;
+	last_rec =
+		(blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift;
 	/* Check struct bio and count the number of blocks for the request. */
 	count = 0;
 	cidaw = 0;
@@ -309,7 +310,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev,
 	ccw = cqr->cpaddr;
 	/* First ccw is define extent. */
 	define_extent(ccw++, cqr->data, rq_data_dir(req),
-		      block->bp_block, req->sector, req->nr_sectors);
+		      block->bp_block, blk_rq_pos(req), blk_rq_sectors(req));
 	/* Build locate_record + read/write ccws. */
 	idaws = (unsigned long *) (cqr->data + sizeof(struct DE_fba_data));
 	LO_data = (struct LO_fba_data *) (idaws + cidaw);
diff --git a/drivers/s390/char/tape_34xx.c b/drivers/s390/char/tape_34xx.c
index 5f8e8ef43dd31..2d00a383a475c 100644
--- a/drivers/s390/char/tape_34xx.c
+++ b/drivers/s390/char/tape_34xx.c
@@ -1134,7 +1134,7 @@ tape_34xx_bread(struct tape_device *device, struct request *req)
 	/* Setup ccws. */
 	request->op = TO_BLOCK;
 	start_block = (struct tape_34xx_block_id *) request->cpdata;
-	start_block->block = req->sector >> TAPEBLOCK_HSEC_S2B;
+	start_block->block = blk_rq_pos(req) >> TAPEBLOCK_HSEC_S2B;
 	DBF_EVENT(6, "start_block = %i\n", start_block->block);
 
 	ccw = request->cpaddr;
diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c
index 823b05bd0dd79..c453b2f3e9f4c 100644
--- a/drivers/s390/char/tape_3590.c
+++ b/drivers/s390/char/tape_3590.c
@@ -633,7 +633,7 @@ tape_3590_bread(struct tape_device *device, struct request *req)
 	struct req_iterator iter;
 
 	DBF_EVENT(6, "xBREDid:");
-	start_block = req->sector >> TAPEBLOCK_HSEC_S2B;
+	start_block = blk_rq_pos(req) >> TAPEBLOCK_HSEC_S2B;
 	DBF_EVENT(6, "start_block = %i\n", start_block);
 
 	rq_for_each_segment(bv, req, iter)
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 86596d3813b5c..5d035e4939dc0 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -87,7 +87,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 	if (ccw_req->rc == 0)
 		/* Update position. */
 		device->blk_data.block_position =
-			(req->sector + req->nr_sectors) >> TAPEBLOCK_HSEC_S2B;
+		  (blk_rq_pos(req) + blk_rq_sectors(req)) >> TAPEBLOCK_HSEC_S2B;
 	else
 		/* We lost the position information due to an error. */
 		device->blk_data.block_position = -1;
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 09617884a50b1..2132c906e53a4 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -188,8 +188,8 @@ static void jsfd_do_request(struct request_queue *q)
 
 	while ((req = elv_next_request(q)) != NULL) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
-		unsigned long offset = req->sector << 9;
-		size_t len = req->current_nr_sectors << 9;
+		unsigned long offset = blk_rq_pos(req) << 9;
+		size_t len = blk_rq_cur_sectors(req) << 9;
 
 		if ((offset + len) > jdp->dsize) {
 			__blk_end_request_cur(req, -EIO);
diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c
index be5099dd94b5d..c7076ce25e212 100644
--- a/drivers/scsi/eata.c
+++ b/drivers/scsi/eata.c
@@ -1825,7 +1825,7 @@ static int eata2x_queuecommand(struct scsi_cmnd *SCpnt,
 	if (linked_comm && SCpnt->device->queue_depth > 2
 	    && TLDEV(SCpnt->device->type)) {
 		ha->cp_stat[i] = READY;
-		flush_dev(SCpnt->device, SCpnt->request->sector, ha, 0);
+		flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), ha, 0);
 		return 0;
 	}
 
@@ -2144,13 +2144,13 @@ static int reorder(struct hostdata *ha, unsigned long cursec,
 		if (!cpp->din)
 			input_only = 0;
 
-		if (SCpnt->request->sector < minsec)
-			minsec = SCpnt->request->sector;
-		if (SCpnt->request->sector > maxsec)
-			maxsec = SCpnt->request->sector;
+		if (blk_rq_pos(SCpnt->request) < minsec)
+			minsec = blk_rq_pos(SCpnt->request);
+		if (blk_rq_pos(SCpnt->request) > maxsec)
+			maxsec = blk_rq_pos(SCpnt->request);
 
-		sl[n] = SCpnt->request->sector;
-		ioseek += SCpnt->request->nr_sectors;
+		sl[n] = blk_rq_pos(SCpnt->request);
+		ioseek += blk_rq_sectors(SCpnt->request);
 
 		if (!n)
 			continue;
@@ -2190,7 +2190,7 @@ static int reorder(struct hostdata *ha, unsigned long cursec,
 			k = il[n];
 			cpp = &ha->cp[k];
 			SCpnt = cpp->SCpnt;
-			ll[n] = SCpnt->request->nr_sectors;
+			ll[n] = blk_rq_sectors(SCpnt->request);
 			pl[n] = SCpnt->serial_number;
 
 			if (!n)
@@ -2236,12 +2236,12 @@ static int reorder(struct hostdata *ha, unsigned long cursec,
 			cpp = &ha->cp[k];
 			SCpnt = cpp->SCpnt;
 			scmd_printk(KERN_INFO, SCpnt,
-			    "%s pid %ld mb %d fc %d nr %d sec %ld ns %ld"
+			    "%s pid %ld mb %d fc %d nr %d sec %ld ns %u"
 			     " cur %ld s:%c r:%c rev:%c in:%c ov:%c xd %d.\n",
 			     (ihdlr ? "ihdlr" : "qcomm"),
 			     SCpnt->serial_number, k, flushcount,
-			     n_ready, SCpnt->request->sector,
-			     SCpnt->request->nr_sectors, cursec, YESNO(s),
+			     n_ready, blk_rq_pos(SCpnt->request),
+			     blk_rq_sectors(SCpnt->request), cursec, YESNO(s),
 			     YESNO(r), YESNO(rev), YESNO(input_only),
 			     YESNO(overlap), cpp->din);
 		}
@@ -2408,7 +2408,7 @@ static irqreturn_t ihdlr(struct Scsi_Host *shost)
 
 	if (linked_comm && SCpnt->device->queue_depth > 2
 	    && TLDEV(SCpnt->device->type))
-		flush_dev(SCpnt->device, SCpnt->request->sector, ha, 1);
+		flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), ha, 1);
 
 	tstatus = status_byte(spp->target_status);
 
diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c
index 36fd2e75da1c0..a8fab39771168 100644
--- a/drivers/scsi/lpfc/lpfc_scsi.c
+++ b/drivers/scsi/lpfc/lpfc_scsi.c
@@ -1313,10 +1313,10 @@ lpfc_parse_bg_err(struct lpfc_hba *phba, struct lpfc_scsi_buf *lpfc_cmd,
 	uint32_t bgstat = bgf->bgstat;
 	uint64_t failing_sector = 0;
 
-	printk(KERN_ERR "BG ERROR in cmd 0x%x lba 0x%llx blk cnt 0x%lx "
+	printk(KERN_ERR "BG ERROR in cmd 0x%x lba 0x%llx blk cnt 0x%x "
 			"bgstat=0x%x bghm=0x%x\n",
 			cmd->cmnd[0], (unsigned long long)scsi_get_lba(cmd),
-			cmd->request->nr_sectors, bgstat, bghm);
+			blk_rq_sectors(cmd->request), bgstat, bghm);
 
 	spin_lock(&_dump_buf_lock);
 	if (!_dump_buf_done) {
@@ -2375,15 +2375,15 @@ lpfc_queuecommand(struct scsi_cmnd *cmnd, void (*done) (struct scsi_cmnd *))
 		if (cmnd->cmnd[0] == READ_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					"9035 BLKGRD: READ @ sector %llu, "
-					 "count %lu\n",
-					 (unsigned long long)scsi_get_lba(cmnd),
-					cmnd->request->nr_sectors);
+					"count %u\n",
+					(unsigned long long)scsi_get_lba(cmnd),
+					blk_rq_sectors(cmnd->request));
 		else if (cmnd->cmnd[0] == WRITE_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					"9036 BLKGRD: WRITE @ sector %llu, "
-					"count %lu cmd=%p\n",
+					"count %u cmd=%p\n",
 					(unsigned long long)scsi_get_lba(cmnd),
-					cmnd->request->nr_sectors,
+					blk_rq_sectors(cmnd->request),
 					cmnd);
 
 		err = lpfc_bg_scsi_prep_dma_buf(phba, lpfc_cmd);
@@ -2403,15 +2403,15 @@ lpfc_queuecommand(struct scsi_cmnd *cmnd, void (*done) (struct scsi_cmnd *))
 		if (cmnd->cmnd[0] == READ_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					 "9040 dbg: READ @ sector %llu, "
-					 "count %lu\n",
+					 "count %u\n",
 					 (unsigned long long)scsi_get_lba(cmnd),
-					 cmnd->request->nr_sectors);
+					 blk_rq_sectors(cmnd->request));
 		else if (cmnd->cmnd[0] == WRITE_10)
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					 "9041 dbg: WRITE @ sector %llu, "
-					 "count %lu cmd=%p\n",
+					 "count %u cmd=%p\n",
 					 (unsigned long long)scsi_get_lba(cmnd),
-					 cmnd->request->nr_sectors, cmnd);
+					 blk_rq_sectors(cmnd->request), cmnd);
 		else
 			lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG,
 					 "9042 dbg: parser not implemented\n");
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 9ff0ca9988a93..39b3acfc0ddf4 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -787,9 +787,9 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	 * Next deal with any sectors which we were able to correctly
 	 * handle.
 	 */
-	SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, "
+	SCSI_LOG_HLCOMPLETE(1, printk("%u sectors total, "
 				      "%d bytes done.\n",
-				      req->nr_sectors, good_bytes));
+				      blk_rq_sectors(req), good_bytes));
 
 	/*
 	 * Recovered errors need reporting, but they're always treated
@@ -968,7 +968,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 	if (blk_pc_request(req))
 		sdb->length = req->data_len;
 	else
-		sdb->length = req->nr_sectors << 9;
+		sdb->length = blk_rq_sectors(req) << 9;
 	return BLKPREP_OK;
 }
 
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 3fcb64b91c43d..70c4dd99bbf01 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -383,9 +383,9 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	struct scsi_device *sdp = q->queuedata;
 	struct gendisk *disk = rq->rq_disk;
 	struct scsi_disk *sdkp;
-	sector_t block = rq->sector;
+	sector_t block = blk_rq_pos(rq);
 	sector_t threshold;
-	unsigned int this_count = rq->nr_sectors;
+	unsigned int this_count = blk_rq_sectors(rq);
 	int ret, host_dif;
 
 	if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
@@ -412,10 +412,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 					this_count));
 
 	if (!sdp || !scsi_device_online(sdp) ||
- 	    block + rq->nr_sectors > get_capacity(disk)) {
+	    block + blk_rq_sectors(rq) > get_capacity(disk)) {
 		SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
-						"Finishing %ld sectors\n",
-						rq->nr_sectors));
+						"Finishing %u sectors\n",
+						blk_rq_sectors(rq)));
 		SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
 						"Retry with 0x%p\n", SCpnt));
 		goto out;
@@ -462,7 +462,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	 * for this.
 	 */
 	if (sdp->sector_size == 1024) {
-		if ((block & 1) || (rq->nr_sectors & 1)) {
+		if ((block & 1) || (blk_rq_sectors(rq) & 1)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
 			goto out;
@@ -472,7 +472,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		}
 	}
 	if (sdp->sector_size == 2048) {
-		if ((block & 3) || (rq->nr_sectors & 3)) {
+		if ((block & 3) || (blk_rq_sectors(rq) & 3)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
 			goto out;
@@ -482,7 +482,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 		}
 	}
 	if (sdp->sector_size == 4096) {
-		if ((block & 7) || (rq->nr_sectors & 7)) {
+		if ((block & 7) || (blk_rq_sectors(rq) & 7)) {
 			scmd_printk(KERN_ERR, SCpnt,
 				    "Bad block number requested\n");
 			goto out;
@@ -511,10 +511,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq)
 	}
 
 	SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
-					"%s %d/%ld 512 byte blocks.\n",
+					"%s %d/%u 512 byte blocks.\n",
 					(rq_data_dir(rq) == WRITE) ?
 					"writing" : "reading", this_count,
-					rq->nr_sectors));
+					blk_rq_sectors(rq)));
 
 	/* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */
 	host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
@@ -970,8 +970,8 @@ static struct block_device_operations sd_fops = {
 
 static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
 {
-	u64 start_lba = scmd->request->sector;
-	u64 end_lba = scmd->request->sector + (scsi_bufflen(scmd) / 512);
+	u64 start_lba = blk_rq_pos(scmd->request);
+	u64 end_lba = blk_rq_pos(scmd->request) + (scsi_bufflen(scmd) / 512);
 	u64 bad_lba;
 	int info_valid;
 
diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c
index 184dff492797e..82f14a9482d07 100644
--- a/drivers/scsi/sd_dif.c
+++ b/drivers/scsi/sd_dif.c
@@ -507,7 +507,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes)
 	sector_sz = scmd->device->sector_size;
 	sectors = good_bytes / sector_sz;
 
-	phys = scmd->request->sector & 0xffffffff;
+	phys = blk_rq_pos(scmd->request) & 0xffffffff;
 	if (sector_sz == 4096)
 		phys >>= 3;
 
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index 0e1a0f2d2ad55..fddba53c7fe51 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -292,7 +292,8 @@ static int sr_done(struct scsi_cmnd *SCpnt)
 			if (cd->device->sector_size == 2048)
 				error_sector <<= 2;
 			error_sector &= ~(block_sectors - 1);
-			good_bytes = (error_sector - SCpnt->request->sector) << 9;
+			good_bytes = (error_sector -
+				      blk_rq_pos(SCpnt->request)) << 9;
 			if (good_bytes < 0 || good_bytes >= this_count)
 				good_bytes = 0;
 			/*
@@ -349,8 +350,8 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 				cd->disk->disk_name, block));
 
 	if (!cd->device || !scsi_device_online(cd->device)) {
-		SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors\n",
-					rq->nr_sectors));
+		SCSI_LOG_HLQUEUE(2, printk("Finishing %u sectors\n",
+					   blk_rq_sectors(rq)));
 		SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p\n", SCpnt));
 		goto out;
 	}
@@ -413,7 +414,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 	/*
 	 * request doesn't start on hw block boundary, add scatter pads
 	 */
-	if (((unsigned int)rq->sector % (s_size >> 9)) ||
+	if (((unsigned int)blk_rq_pos(rq) % (s_size >> 9)) ||
 	    (scsi_bufflen(SCpnt) % s_size)) {
 		scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n");
 		goto out;
@@ -422,14 +423,14 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq)
 	this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9);
 
 
-	SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n",
+	SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%u 512 byte blocks.\n",
 				cd->cdi.name,
 				(rq_data_dir(rq) == WRITE) ?
 					"writing" : "reading",
-				this_count, rq->nr_sectors));
+				this_count, blk_rq_sectors(rq)));
 
 	SCpnt->cmnd[1] = 0;
-	block = (unsigned int)rq->sector / (s_size >> 9);
+	block = (unsigned int)blk_rq_pos(rq) / (s_size >> 9);
 
 	if (this_count > 0xffff) {
 		this_count = 0xffff;
diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c
index 601e95141cbe1..54023d41fd15c 100644
--- a/drivers/scsi/u14-34f.c
+++ b/drivers/scsi/u14-34f.c
@@ -1306,7 +1306,7 @@ static int u14_34f_queuecommand(struct scsi_cmnd *SCpnt, void (*done)(struct scs
    if (linked_comm && SCpnt->device->queue_depth > 2
                                      && TLDEV(SCpnt->device->type)) {
       HD(j)->cp_stat[i] = READY;
-      flush_dev(SCpnt->device, SCpnt->request->sector, j, FALSE);
+      flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), j, FALSE);
       return 0;
       }
 
@@ -1610,11 +1610,13 @@ static int reorder(unsigned int j, unsigned long cursec,
 
       if (!(cpp->xdir == DTD_IN)) input_only = FALSE;
 
-      if (SCpnt->request->sector < minsec) minsec = SCpnt->request->sector;
-      if (SCpnt->request->sector > maxsec) maxsec = SCpnt->request->sector;
+      if (blk_rq_pos(SCpnt->request) < minsec)
+	 minsec = blk_rq_pos(SCpnt->request);
+      if (blk_rq_pos(SCpnt->request) > maxsec)
+	 maxsec = blk_rq_pos(SCpnt->request);
 
-      sl[n] = SCpnt->request->sector;
-      ioseek += SCpnt->request->nr_sectors;
+      sl[n] = blk_rq_pos(SCpnt->request);
+      ioseek += blk_rq_sectors(SCpnt->request);
 
       if (!n) continue;
 
@@ -1642,7 +1644,7 @@ static int reorder(unsigned int j, unsigned long cursec,
 
    if (!input_only) for (n = 0; n < n_ready; n++) {
       k = il[n]; cpp = &HD(j)->cp[k]; SCpnt = cpp->SCpnt;
-      ll[n] = SCpnt->request->nr_sectors; pl[n] = SCpnt->serial_number;
+      ll[n] = blk_rq_sectors(SCpnt->request); pl[n] = SCpnt->serial_number;
 
       if (!n) continue;
 
@@ -1666,12 +1668,12 @@ static int reorder(unsigned int j, unsigned long cursec,
    if (link_statistics && (overlap || !(flushcount % link_statistics)))
       for (n = 0; n < n_ready; n++) {
          k = il[n]; cpp = &HD(j)->cp[k]; SCpnt = cpp->SCpnt;
-         printk("%s %d.%d:%d pid %ld mb %d fc %d nr %d sec %ld ns %ld"\
+         printk("%s %d.%d:%d pid %ld mb %d fc %d nr %d sec %ld ns %u"\
                 " cur %ld s:%c r:%c rev:%c in:%c ov:%c xd %d.\n",
                 (ihdlr ? "ihdlr" : "qcomm"), SCpnt->channel, SCpnt->target,
                 SCpnt->lun, SCpnt->serial_number, k, flushcount, n_ready,
-                SCpnt->request->sector, SCpnt->request->nr_sectors, cursec,
-                YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only),
+                blk_rq_pos(SCpnt->request), blk_rq_sectors(SCpnt->request),
+		cursec, YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only),
                 YESNO(overlap), cpp->xdir);
          }
 #endif
@@ -1799,7 +1801,7 @@ static irqreturn_t ihdlr(unsigned int j)
 
    if (linked_comm && SCpnt->device->queue_depth > 2
                                      && TLDEV(SCpnt->device->type))
-      flush_dev(SCpnt->device, SCpnt->request->sector, j, TRUE);
+      flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), j, TRUE);
 
    tstatus = status_byte(spp->target_status);
 
diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h
index 43b50d36925ca..3878d1dc7f596 100644
--- a/include/scsi/scsi_cmnd.h
+++ b/include/scsi/scsi_cmnd.h
@@ -270,7 +270,7 @@ static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd)
 
 static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd)
 {
-	return scmd->request->sector;
+	return blk_rq_pos(scmd->request);
 }
 
 static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd)
-- 
GitLab


From 9780e2dd8254351f6cbe11304849126b51dbd561 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:40 +0900
Subject: [PATCH 0983/2198] ide: convert to rq pos and nr_sectors accessors

ide doesn't manipulate request fields anymore and thus all hard and
their soft equivalents are always equal.  Convert all references to
accessors.

[ Impact: use pos and nr_sectors accessors ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-cd.c       | 8 ++++----
 drivers/ide/ide-disk.c     | 8 ++++----
 drivers/ide/ide-dma.c      | 2 +-
 drivers/ide/ide-floppy.c   | 6 +++---
 drivers/ide/ide-io.c       | 4 ++--
 drivers/ide/ide-lib.c      | 2 +-
 drivers/ide/ide-tape.c     | 6 +++---
 drivers/ide/ide-taskfile.c | 2 +-
 drivers/ide/pdc202xx_old.c | 2 +-
 drivers/ide/tc86c001.c     | 2 +-
 drivers/ide/tx4939ide.c    | 2 +-
 11 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 182320dd6ea14..eb4f3dc9f1824 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -775,8 +775,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	}
 
 	/* fs requests *must* be hardware frame aligned */
-	if ((rq->nr_sectors & (sectors_per_frame - 1)) ||
-	    (rq->sector & (sectors_per_frame - 1)))
+	if ((blk_rq_sectors(rq) & (sectors_per_frame - 1)) ||
+	    (blk_rq_pos(rq) & (sectors_per_frame - 1)))
 		return ide_stopped;
 
 	/* use DMA, if possible */
@@ -868,8 +868,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 	cmd.rq = rq;
 
 	if (blk_fs_request(rq) || rq->data_len) {
-		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? (rq->nr_sectors << 9)
-							 : rq->data_len);
+		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ?
+				(blk_rq_sectors(rq) << 9) : rq->data_len);
 		ide_map_sg(drive, &cmd);
 	}
 
diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index c2438804d3c4a..ad18e14043c57 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -82,7 +82,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 					sector_t block)
 {
 	ide_hwif_t *hwif	= drive->hwif;
-	u16 nsectors		= (u16)rq->nr_sectors;
+	u16 nsectors		= (u16)blk_rq_sectors(rq);
 	u8 lba48		= !!(drive->dev_flags & IDE_DFLAG_LBA48);
 	u8 dma			= !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 	struct ide_cmd		cmd;
@@ -90,7 +90,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 	ide_startstop_t		rc;
 
 	if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) {
-		if (block + rq->nr_sectors > 1ULL << 28)
+		if (block + blk_rq_sectors(rq) > 1ULL << 28)
 			dma = 0;
 		else
 			lba48 = 0;
@@ -195,9 +195,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
 
 	ledtrig_ide_activity();
 
-	pr_debug("%s: %sing: block=%llu, sectors=%lu, buffer=0x%08lx\n",
+	pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n",
 		 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
-		 (unsigned long long)block, rq->nr_sectors,
+		 (unsigned long long)block, blk_rq_sectors(rq),
 		 (unsigned long)rq->buffer);
 
 	if (hwif->rw_disk)
diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c
index d9123ecae4a98..001f68f0bb285 100644
--- a/drivers/ide/ide-dma.c
+++ b/drivers/ide/ide-dma.c
@@ -103,7 +103,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive)
 				ide_finish_cmd(drive, cmd, stat);
 			else
 				ide_complete_rq(drive, 0,
-						cmd->rq->nr_sectors << 9);
+						blk_rq_sectors(cmd->rq) << 9);
 			return ide_stopped;
 		}
 		printk(KERN_ERR "%s: %s: bad DMA status (0x%02x)\n",
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 537b7c5580339..1c460bd563752 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -194,7 +194,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive,
 {
 	struct ide_disk_obj *floppy = drive->driver_data;
 	int block = sector / floppy->bs_factor;
-	int blocks = rq->nr_sectors / floppy->bs_factor;
+	int blocks = blk_rq_sectors(rq) / floppy->bs_factor;
 	int cmd = rq_data_dir(rq);
 
 	ide_debug_log(IDE_DBG_FUNC, "block: %d, blocks: %d", block, blocks);
@@ -259,8 +259,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
 			goto out_end;
 	}
 	if (blk_fs_request(rq)) {
-		if (((long)rq->sector % floppy->bs_factor) ||
-		    (rq->nr_sectors % floppy->bs_factor)) {
+		if (((long)blk_rq_pos(rq) % floppy->bs_factor) ||
+		    (blk_rq_sectors(rq) % floppy->bs_factor)) {
 			printk(KERN_ERR PFX "%s: unsupported r/w rq size\n",
 				drive->name);
 			goto out_end;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index df23bcbd94b4a..59799ca552460 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -279,7 +279,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive,
 
 	if (cmd) {
 		if (cmd->protocol == ATA_PROT_PIO) {
-			ide_init_sg_cmd(cmd, rq->nr_sectors << 9);
+			ide_init_sg_cmd(cmd, blk_rq_sectors(rq) << 9);
 			ide_map_sg(drive, cmd);
 		}
 
@@ -387,7 +387,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
 
 		drv = *(struct ide_driver **)rq->rq_disk->private_data;
 
-		return drv->do_request(drive, rq, rq->sector);
+		return drv->do_request(drive, rq, blk_rq_pos(rq));
 	}
 	return do_special(drive);
 kill_rq:
diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c
index 56ff8c46c7d10..05b7fbc7ead54 100644
--- a/drivers/ide/ide-lib.c
+++ b/drivers/ide/ide-lib.c
@@ -114,7 +114,7 @@ static void ide_dump_ata_error(ide_drive_t *drive, u8 err)
 
 		if (rq)
 			printk(KERN_CONT ", sector=%llu",
-			       (unsigned long long)rq->sector);
+			       (unsigned long long)blk_rq_pos(rq));
 	}
 	printk(KERN_CONT "\n");
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 65c5b705883ab..c6cf3a951f28a 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -586,7 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
 				   struct ide_atapi_pc *pc, struct request *rq,
 				   u8 opcode)
 {
-	unsigned int length = rq->nr_sectors;
+	unsigned int length = blk_rq_sectors(rq);
 
 	ide_init_pc(pc);
 	put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
@@ -617,8 +617,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
-		  (unsigned long long)rq->sector, rq->nr_sectors);
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %u\n"
+		  (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq));
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
 		/* We do not support buffer cache originated requests. */
diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c
index f400eb4d4aff7..a0c3e1b2f73c2 100644
--- a/drivers/ide/ide-taskfile.c
+++ b/drivers/ide/ide-taskfile.c
@@ -385,7 +385,7 @@ static ide_startstop_t task_pio_intr(ide_drive_t *drive)
 	if ((cmd->tf_flags & IDE_TFLAG_FS) == 0)
 		ide_finish_cmd(drive, cmd, stat);
 	else
-		ide_complete_rq(drive, 0, cmd->rq->nr_sectors << 9);
+		ide_complete_rq(drive, 0, blk_rq_sectors(cmd->rq) << 9);
 	return ide_stopped;
 out_err:
 	ide_error_cmd(drive, cmd);
diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c
index 248a54bd2386f..c2a16a8f486de 100644
--- a/drivers/ide/pdc202xx_old.c
+++ b/drivers/ide/pdc202xx_old.c
@@ -177,7 +177,7 @@ static void pdc202xx_dma_start(ide_drive_t *drive)
 		u8 clock = inb(high_16 + 0x11);
 
 		outb(clock | (hwif->channel ? 0x08 : 0x02), high_16 + 0x11);
-		word_count = (rq->nr_sectors << 8);
+		word_count = (blk_rq_sectors(rq) << 8);
 		word_count = (rq_data_dir(rq) == READ) ?
 					word_count | 0x05000000 :
 					word_count | 0x06000000;
diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c
index b4cf42dc8a6fc..05a93d6baecc6 100644
--- a/drivers/ide/tc86c001.c
+++ b/drivers/ide/tc86c001.c
@@ -112,7 +112,7 @@ static void tc86c001_dma_start(ide_drive_t *drive)
 	ide_hwif_t *hwif	= drive->hwif;
 	unsigned long sc_base	= hwif->config_data;
 	unsigned long twcr_port	= sc_base + (drive->dn ? 0x06 : 0x04);
-	unsigned long nsectors	= hwif->rq->nr_sectors;
+	unsigned long nsectors	= blk_rq_sectors(hwif->rq);
 
 	/*
 	 * We have to manually load the sector count and size into
diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c
index 564422d239766..5ca76224f6d11 100644
--- a/drivers/ide/tx4939ide.c
+++ b/drivers/ide/tx4939ide.c
@@ -307,7 +307,7 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd)
 	tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ?
 			 TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1);
 
-	tx4939ide_writew(cmd->rq->nr_sectors, base, TX4939IDE_Sec_Cnt);
+	tx4939ide_writew(blk_rq_sectors(cmd->rq), base, TX4939IDE_Sec_Cnt);
 
 	return 0;
 }
-- 
GitLab


From 2e46e8b27aa57c6bd34b3102b40ee4d0144b4fab Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:41 +0900
Subject: [PATCH 0984/2198] block: drop request->hard_* and *nr_sectors

struct request has had a few different ways to represent some
properties of a request.  ->hard_* represent block layer's view of the
request progress (completion cursor) and the ones without the prefix
are supposed to represent the issue cursor and allowed to be updated
as necessary by the low level drivers.  The thing is that as block
layer supports partial completion, the two cursors really aren't
necessary and only cause confusion.  In addition, manual management of
request detail from low level drivers is cumbersome and error-prone at
the very least.

Another interesting duplicate fields are rq->[hard_]nr_sectors and
rq->{hard_cur|current}_nr_sectors against rq->data_len and
rq->bio->bi_size.  This is more convoluted than the hard_ case.

rq->[hard_]nr_sectors are initialized for requests with bio but
blk_rq_bytes() uses it only for !pc requests.  rq->data_len is
initialized for all request but blk_rq_bytes() uses it only for pc
requests.  This causes good amount of confusion throughout block layer
and its drivers and determining the request length has been a bit of
black magic which may or may not work depending on circumstances and
what the specific LLD is actually doing.

rq->{hard_cur|current}_nr_sectors represent the number of sectors in
the contiguous data area at the front.  This is mainly used by drivers
which transfers data by walking request segment-by-segment.  This
value always equals rq->bio->bi_size >> 9.  However, data length for
pc requests may not be multiple of 512 bytes and using this field
becomes a bit confusing.

In general, having multiple fields to represent the same property
leads only to confusion and subtle bugs.  With recent block low level
driver cleanups, no driver is accessing or manipulating these
duplicate fields directly.  Drop all the duplicates.  Now rq->sector
means the current sector, rq->data_len the current total length and
rq->bio->bi_size the current segment length.  Everything else is
defined in terms of these three and available only through accessors.

* blk_recalc_rq_sectors() is collapsed into blk_update_request() and
  now handles pc and fs requests equally other than rq->sector update.
  This means that now pc requests can use partial completion too (no
  in-kernel user yet tho).

* bio_cur_sectors() is replaced with bio_cur_bytes() as block layer
  now uses byte count as the primary data length.

* blk_rq_pos() is now guranteed to be always correct.  In-block users
  converted.

* blk_rq_bytes() is now guaranteed to be always valid as is
  blk_rq_sectors().  In-block users converted.

* blk_rq_sectors() is now guaranteed to equal blk_rq_bytes() >> 9.
  More convenient one is used.

* blk_rq_bytes() and blk_rq_cur_bytes() are now inlined and take const
  pointer to request.

[ Impact: API cleanup, single way to represent one property of a request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c         | 81 +++++++++++++++-------------------------
 block/blk-merge.c        | 36 ++----------------
 block/blk.h              |  1 -
 block/cfq-iosched.c      | 10 ++---
 include/linux/bio.h      |  6 +--
 include/linux/blkdev.h   | 37 ++++++++----------
 include/linux/elevator.h |  2 +-
 kernel/trace/blktrace.c  | 16 ++++----
 8 files changed, 67 insertions(+), 122 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 82dc20621c068..3596ca71909b4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->cpu = -1;
 	rq->q = q;
-	rq->sector = rq->hard_sector = (sector_t) -1;
+	rq->sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->cmd = rq->__cmd;
@@ -189,8 +189,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg)
 	       (unsigned long long)blk_rq_pos(rq),
 	       blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
 	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
-						rq->bio, rq->biotail,
-						rq->buffer, rq->data_len);
+	       rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
 
 	if (blk_pc_request(rq)) {
 		printk(KERN_INFO "  cdb: ");
@@ -1096,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
-	req->hard_sector = req->sector = bio->bi_sector;
+	req->sector = bio->bi_sector;
 	req->ioprio = bio_prio(bio);
 	blk_rq_bio_prep(req->q, req, bio);
 }
@@ -1113,14 +1112,13 @@ static inline bool queue_should_plug(struct request_queue *q)
 static int __make_request(struct request_queue *q, struct bio *bio)
 {
 	struct request *req;
-	int el_ret, nr_sectors;
+	int el_ret;
+	unsigned int bytes = bio->bi_size;
 	const unsigned short prio = bio_prio(bio);
 	const int sync = bio_sync(bio);
 	const int unplug = bio_unplug(bio);
 	int rw_flags;
 
-	nr_sectors = bio_sectors(bio);
-
 	/*
 	 * low level driver can indicate that it wants pages above a
 	 * certain limit bounced to low memory (ie for highmem, or even
@@ -1145,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
-		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+		req->data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1171,10 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		 * not touch req->buffer either...
 		 */
 		req->buffer = bio_data(bio);
-		req->current_nr_sectors = bio_cur_sectors(bio);
-		req->hard_cur_sectors = req->current_nr_sectors;
-		req->sector = req->hard_sector = bio->bi_sector;
-		req->nr_sectors = req->hard_nr_sectors += nr_sectors;
+		req->sector = bio->bi_sector;
+		req->data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1557,7 +1553,7 @@ EXPORT_SYMBOL(submit_bio);
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
 	if (blk_rq_sectors(rq) > q->max_sectors ||
-	    rq->data_len > q->max_hw_sectors << 9) {
+	    blk_rq_bytes(rq) > q->max_hw_sectors << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
 	}
@@ -1675,35 +1671,6 @@ static void blk_account_io_done(struct request *req)
 	}
 }
 
-/**
- * blk_rq_bytes - Returns bytes left to complete in the entire request
- * @rq: the request being processed
- **/
-unsigned int blk_rq_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return blk_rq_sectors(rq) << 9;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_bytes);
-
-/**
- * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
- * @rq: the request being processed
- **/
-unsigned int blk_rq_cur_bytes(struct request *rq)
-{
-	if (blk_fs_request(rq))
-		return rq->current_nr_sectors << 9;
-
-	if (rq->bio)
-		return rq->bio->bi_size;
-
-	return rq->data_len;
-}
-EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
-
 struct request *elv_next_request(struct request_queue *q)
 {
 	struct request *rq;
@@ -1736,7 +1703,7 @@ struct request *elv_next_request(struct request_queue *q)
 		if (rq->cmd_flags & REQ_DONTPREP)
 			break;
 
-		if (q->dma_drain_size && rq->data_len) {
+		if (q->dma_drain_size && blk_rq_bytes(rq)) {
 			/*
 			 * make sure space for the drain appears we
 			 * know we can do this because max_hw_segments
@@ -1759,7 +1726,7 @@ struct request *elv_next_request(struct request_queue *q)
 			 * avoid resource deadlock.  REQ_STARTED will
 			 * prevent other fs requests from passing this one.
 			 */
-			if (q->dma_drain_size && rq->data_len &&
+			if (q->dma_drain_size && blk_rq_bytes(rq) &&
 			    !(rq->cmd_flags & REQ_DONTPREP)) {
 				/*
 				 * remove the space for the drain we added
@@ -1911,8 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		 * can find how many bytes remain in the request
 		 * later.
 		 */
-		req->nr_sectors = req->hard_nr_sectors = 0;
-		req->current_nr_sectors = req->hard_cur_sectors = 0;
+		req->data_len = 0;
 		return false;
 	}
 
@@ -1926,8 +1892,25 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		bio_iovec(bio)->bv_len -= nr_bytes;
 	}
 
-	blk_recalc_rq_sectors(req, total_bytes >> 9);
+	req->data_len -= total_bytes;
+	req->buffer = bio_data(req->bio);
+
+	/* update sector only for requests with clear definition of sector */
+	if (blk_fs_request(req) || blk_discard_rq(req))
+		req->sector += total_bytes >> 9;
+
+	/*
+	 * If total number of sectors is less than the first segment
+	 * size, something has gone terribly wrong.
+	 */
+	if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+		printk(KERN_ERR "blk: request botched\n");
+		req->data_len = blk_rq_cur_bytes(req);
+	}
+
+	/* recalculate the number of segments */
 	blk_recalc_rq_segments(req);
+
 	return true;
 }
 EXPORT_SYMBOL_GPL(blk_update_request);
@@ -2049,11 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
 		rq->buffer = bio_data(bio);
 	}
-	rq->current_nr_sectors = bio_cur_sectors(bio);
-	rq->hard_cur_sectors = rq->current_nr_sectors;
-	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
 	rq->data_len = bio->bi_size;
-
 	rq->bio = rq->biotail = bio;
 
 	if (bio->bi_bdev)
diff --git a/block/blk-merge.c b/block/blk-merge.c
index bf62a87a9da22..b8df66aef0f84 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -9,35 +9,6 @@
 
 #include "blk.h"
 
-void blk_recalc_rq_sectors(struct request *rq, int nsect)
-{
-	if (blk_fs_request(rq) || blk_discard_rq(rq)) {
-		rq->hard_sector += nsect;
-		rq->hard_nr_sectors -= nsect;
-
-		/*
-		 * Move the I/O submission pointers ahead if required.
-		 */
-		if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
-		    (rq->sector <= rq->hard_sector)) {
-			rq->sector = rq->hard_sector;
-			rq->nr_sectors = rq->hard_nr_sectors;
-			rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
-			rq->current_nr_sectors = rq->hard_cur_sectors;
-			rq->buffer = bio_data(rq->bio);
-		}
-
-		/*
-		 * if total number of sectors is less than the first segment
-		 * size, something has gone terribly wrong
-		 */
-		if (rq->nr_sectors < rq->current_nr_sectors) {
-			printk(KERN_ERR "blk: request botched\n");
-			rq->nr_sectors = rq->current_nr_sectors;
-		}
-	}
-}
-
 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 					     struct bio *bio)
 {
@@ -199,8 +170,9 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 
 
 	if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
-	    (rq->data_len & q->dma_pad_mask)) {
-		unsigned int pad_len = (q->dma_pad_mask & ~rq->data_len) + 1;
+	    (blk_rq_bytes(rq) & q->dma_pad_mask)) {
+		unsigned int pad_len =
+			(q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
 
 		sg->length += pad_len;
 		rq->extra_len += pad_len;
@@ -398,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
 
-	req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
+	req->data_len += blk_rq_bytes(next);
 
 	elv_merge_requests(q, req, next);
 
diff --git a/block/blk.h b/block/blk.h
index 51115599df9b8..ab54529103c00 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -101,7 +101,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 int attempt_back_merge(struct request_queue *q, struct request *rq);
 int attempt_front_merge(struct request_queue *q, struct request *rq);
 void blk_recalc_rq_segments(struct request *rq);
-void blk_recalc_rq_sectors(struct request *rq, int nsect);
 
 void blk_queue_congestion_threshold(struct request_queue *q);
 
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index db4d990a66bfd..99ac4304d711e 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -579,9 +579,9 @@ cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
 		 * Sort strictly based on sector.  Smallest to the left,
 		 * largest to the right.
 		 */
-		if (sector > cfqq->next_rq->sector)
+		if (sector > blk_rq_pos(cfqq->next_rq))
 			n = &(*p)->rb_right;
-		else if (sector < cfqq->next_rq->sector)
+		else if (sector < blk_rq_pos(cfqq->next_rq))
 			n = &(*p)->rb_left;
 		else
 			break;
@@ -611,8 +611,8 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 		return;
 
 	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
-	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector,
-					 &parent, &p);
+	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
+				      blk_rq_pos(cfqq->next_rq), &parent, &p);
 	if (!__cfqq) {
 		rb_link_node(&cfqq->p_node, parent, p);
 		rb_insert_color(&cfqq->p_node, cfqq->p_root);
@@ -996,7 +996,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 	if (cfq_rq_close(cfqd, __cfqq->next_rq))
 		return __cfqq;
 
-	if (__cfqq->next_rq->sector < sector)
+	if (blk_rq_pos(__cfqq->next_rq) < sector)
 		node = rb_next(&__cfqq->p_node);
 	else
 		node = rb_prev(&__cfqq->p_node);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index f37ca8c726ba0..d30ec6f30dd7e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -218,12 +218,12 @@ struct bio {
 #define bio_sectors(bio)	((bio)->bi_size >> 9)
 #define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio))
 
-static inline unsigned int bio_cur_sectors(struct bio *bio)
+static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
 	if (bio->bi_vcnt)
-		return bio_iovec(bio)->bv_len >> 9;
+		return bio_iovec(bio)->bv_len;
 	else /* dataless requests such as discard */
-		return bio->bi_size >> 9;
+		return bio->bi_size;
 }
 
 static inline void *bio_data(struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4e5f855987280..ce2bf5efa9bae 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -166,19 +166,8 @@ struct request {
 	enum rq_cmd_type_bits cmd_type;
 	unsigned long atomic_flags;
 
-	/* Maintain bio traversal state for part by part I/O submission.
-	 * hard_* are block layer internals, no driver should touch them!
-	 */
-
-	sector_t sector;		/* next sector to submit */
-	sector_t hard_sector;		/* next sector to complete */
-	unsigned long nr_sectors;	/* no. of sectors left to submit */
-	unsigned long hard_nr_sectors;	/* no. of sectors left to complete */
-	/* no. of sectors left to submit in the current segment */
-	unsigned int current_nr_sectors;
-
-	/* no. of sectors left to complete in the current segment */
-	unsigned int hard_cur_sectors;
+	sector_t sector;	/* sector cursor */
+	unsigned int data_len;	/* total data len, don't access directly */
 
 	struct bio *bio;
 	struct bio *biotail;
@@ -226,7 +215,6 @@ struct request {
 	unsigned char __cmd[BLK_MAX_CDB];
 	unsigned char *cmd;
 
-	unsigned int data_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
 	unsigned int resid_len;	/* residual count */
@@ -840,20 +828,27 @@ extern void blkdev_dequeue_request(struct request *req);
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
-	return rq->hard_sector;
+	return rq->sector;
+}
+
+static inline unsigned int blk_rq_bytes(const struct request *rq)
+{
+	return rq->data_len;
 }
 
-extern unsigned int blk_rq_bytes(struct request *rq);
-extern unsigned int blk_rq_cur_bytes(struct request *rq);
+static inline int blk_rq_cur_bytes(const struct request *rq)
+{
+	return rq->bio ? bio_cur_bytes(rq->bio) : 0;
+}
 
 static inline unsigned int blk_rq_sectors(const struct request *rq)
 {
-	return rq->hard_nr_sectors;
+	return blk_rq_bytes(rq) >> 9;
 }
 
 static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 {
-	return rq->hard_cur_sectors;
+	return blk_rq_cur_bytes(rq) >> 9;
 }
 
 /*
@@ -928,7 +923,7 @@ static inline void blk_end_request_all(struct request *rq, int error)
  */
 static inline bool blk_end_request_cur(struct request *rq, int error)
 {
-	return blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
 }
 
 /**
@@ -981,7 +976,7 @@ static inline void __blk_end_request_all(struct request *rq, int error)
  */
 static inline bool __blk_end_request_cur(struct request *rq, int error)
 {
-	return __blk_end_request(rq, error, rq->hard_cur_sectors << 9);
+	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
 }
 
 extern void blk_complete_request(struct request *);
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index c59b769f62b0d..4e462878c9ca3 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -171,7 +171,7 @@ enum {
 	ELV_MQUEUE_MUST,
 };
 
-#define rq_end_sector(rq)	((rq)->sector + (rq)->nr_sectors)
+#define rq_end_sector(rq)	(blk_rq_pos(rq) + blk_rq_sectors(rq))
 #define rb_entry_rq(node)	rb_entry((node), struct request, rb_node)
 
 /*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 42f1c11e754cd..5708a14bee54d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -642,12 +642,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 
 	if (blk_pc_request(rq)) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
-				rq->cmd_len, rq->cmd);
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+				what, rq->errors, rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
-				rw, what, rq->errors, 0, NULL);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
+				what, rq->errors, 0, NULL);
 	}
 }
 
@@ -854,11 +854,11 @@ void blk_add_driver_data(struct request_queue *q,
 		return;
 
 	if (blk_pc_request(rq))
-		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-				rq->errors, len, data);
+		__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
+				BLK_TA_DRV_DATA, rq->errors, len, data);
 	else
-		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9,
-				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+		__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
+				BLK_TA_DRV_DATA, rq->errors, len, data);
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
-- 
GitLab


From b0790410300abaaf4f25f702803beff701baebf1 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:42 +0900
Subject: [PATCH 0985/2198] block: cleanup rq->data_len usages

With recent unification of fields, it's now guaranteed that
rq->data_len always equals blk_rq_bytes().  Convert all non-IDE direct
users to accessors.  IDE will be converted in a separate patch.

Boaz: spotted incorrect data_len/resid_len conversion in osd.

[ Impact: convert direct rq->data_len usages to blk_rq_bytes() ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Darrick J. Wong <djwong@us.ibm.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ata/libata-scsi.c                |  2 +-
 drivers/block/ub.c                       |  8 ++++----
 drivers/message/fusion/mptsas.c          | 20 ++++++++++----------
 drivers/message/i2o/i2o_block.c          |  2 +-
 drivers/scsi/libsas/sas_expander.c       |  8 ++++----
 drivers/scsi/libsas/sas_host_smp.c       | 18 +++++++++---------
 drivers/scsi/mpt2sas/mpt2sas_transport.c | 21 +++++++++++----------
 drivers/scsi/osd/osd_initiator.c         |  4 ++--
 drivers/scsi/scsi_lib.c                  | 13 ++++++-------
 drivers/scsi/scsi_tgt_lib.c              |  2 +-
 10 files changed, 49 insertions(+), 49 deletions(-)

diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c
index 2733b0c90b751..6e4c600f5a1c5 100644
--- a/drivers/ata/libata-scsi.c
+++ b/drivers/ata/libata-scsi.c
@@ -1084,7 +1084,7 @@ static int atapi_drain_needed(struct request *rq)
 	if (likely(!blk_pc_request(rq)))
 		return 0;
 
-	if (!rq->data_len || (rq->cmd_flags & REQ_RW))
+	if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_RW))
 		return 0;
 
 	return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index dc3b899ad2b3c..1591f61b6c60d 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -747,7 +747,7 @@ static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
 {
 	struct request *rq = urq->rq;
 
-	if (rq->data_len == 0) {
+	if (blk_rq_bytes(rq) == 0) {
 		cmd->dir = UB_DIR_NONE;
 	} else {
 		if (rq_data_dir(rq) == WRITE)
@@ -762,7 +762,7 @@ static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
 	memcpy(&cmd->cdb, rq->cmd, rq->cmd_len);
 	cmd->cdb_len = rq->cmd_len;
 
-	cmd->len = rq->data_len;
+	cmd->len = blk_rq_bytes(rq);
 
 	/*
 	 * To reapply this to every URB is not as incorrect as it looks.
@@ -783,8 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	if (cmd->error == 0) {
 		if (blk_pc_request(rq)) {
-			if (cmd->act_len < rq->data_len)
-				rq->resid_len = rq->data_len - cmd->act_len;
+			if (cmd->act_len < blk_rq_bytes(rq))
+				rq->resid_len = blk_rq_bytes(rq) - cmd->act_len;
 			scsi_status = 0;
 		} else {
 			if (cmd->act_len != cmd->len) {
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 5d5f34715de45..4e6fcf06a6f2d 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1277,8 +1277,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	/* do we need to support multiple segments? */
 	if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
 		printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
-		    ioc->name, __func__, req->bio->bi_vcnt, req->data_len,
-		    rsp->bio->bi_vcnt, rsp->data_len);
+		    ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
+		    rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
 		return -EINVAL;
 	}
 
@@ -1295,7 +1295,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	smpreq = (SmpPassthroughRequest_t *)mf;
 	memset(smpreq, 0, sizeof(*smpreq));
 
-	smpreq->RequestDataLength = cpu_to_le16(req->data_len - 4);
+	smpreq->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4);
 	smpreq->Function = MPI_FUNCTION_SMP_PASSTHROUGH;
 
 	if (rphy)
@@ -1321,10 +1321,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		       MPI_SGE_FLAGS_END_OF_BUFFER |
 		       MPI_SGE_FLAGS_DIRECTION |
 		       mpt_addr_size()) << MPI_SGE_FLAGS_SHIFT;
-	flagsLength |= (req->data_len - 4);
+	flagsLength |= (blk_rq_bytes(req) - 4);
 
 	dma_addr_out = pci_map_single(ioc->pcidev, bio_data(req->bio),
-				      req->data_len, PCI_DMA_BIDIRECTIONAL);
+				      blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_out)
 		goto put_mf;
 	mpt_add_sge(psge, flagsLength, dma_addr_out);
@@ -1332,9 +1332,9 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 	/* response */
 	flagsLength = MPT_SGE_FLAGS_SSIMPLE_READ;
-	flagsLength |= rsp->data_len + 4;
+	flagsLength |= blk_rq_bytes(rsp) + 4;
 	dma_addr_in =  pci_map_single(ioc->pcidev, bio_data(rsp->bio),
-				      rsp->data_len, PCI_DMA_BIDIRECTIONAL);
+				      blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_in)
 		goto unmap;
 	mpt_add_sge(psge, flagsLength, dma_addr_in);
@@ -1357,7 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
 		memcpy(req->sense, smprep, sizeof(*smprep));
 		req->sense_len = sizeof(*smprep);
-		rsp->resid_len = rsp->data_len - smprep->ResponseDataLength;
+		rsp->resid_len = blk_rq_bytes(rsp) - smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n",
 		    ioc->name, __func__);
@@ -1365,10 +1365,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	}
 unmap:
 	if (dma_addr_out)
-		pci_unmap_single(ioc->pcidev, dma_addr_out, req->data_len,
+		pci_unmap_single(ioc->pcidev, dma_addr_out, blk_rq_bytes(req),
 				 PCI_DMA_BIDIRECTIONAL);
 	if (dma_addr_in)
-		pci_unmap_single(ioc->pcidev, dma_addr_in, rsp->data_len,
+		pci_unmap_single(ioc->pcidev, dma_addr_in, blk_rq_bytes(rsp),
 				 PCI_DMA_BIDIRECTIONAL);
 put_mf:
 	if (mf)
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 510eb47263746..6b61d289d6c9a 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -430,7 +430,7 @@ static void i2o_block_end_request(struct request *req, int error,
 		int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
 
 		if (blk_pc_request(req))
-			leftover = req->data_len;
+			leftover = blk_rq_bytes(req);
 
 		if (error)
 			blk_end_request(req, -EIO, leftover);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 6605ec905cc03..531af9ed71991 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1927,13 +1927,13 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	/* do we need to support multiple segments? */
 	if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
 		printk("%s: multiple segments req %u %u, rsp %u %u\n",
-		       __func__, req->bio->bi_vcnt, req->data_len,
-		       rsp->bio->bi_vcnt, rsp->data_len);
+		       __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
+		       rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
 		return -EINVAL;
 	}
 
-	ret = smp_execute_task(dev, bio_data(req->bio), req->data_len,
-			       bio_data(rsp->bio), rsp->data_len);
+	ret = smp_execute_task(dev, bio_data(req->bio), blk_rq_bytes(req),
+			       bio_data(rsp->bio), blk_rq_bytes(rsp));
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
 		rsp->resid_len = ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index 89952edd0be37..be9a951b977d8 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -137,21 +137,21 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	int error = -EINVAL;
 
 	/* eight is the minimum size for request and response frames */
-	if (req->data_len < 8 || rsp->data_len < 8)
+	if (blk_rq_bytes(req) < 8 || blk_rq_bytes(rsp) < 8)
 		goto out;
 
-	if (bio_offset(req->bio) + req->data_len > PAGE_SIZE ||
-	    bio_offset(rsp->bio) + rsp->data_len > PAGE_SIZE) {
+	if (bio_offset(req->bio) + blk_rq_bytes(req) > PAGE_SIZE ||
+	    bio_offset(rsp->bio) + blk_rq_bytes(rsp) > PAGE_SIZE) {
 		shost_printk(KERN_ERR, shost,
 			"SMP request/response frame crosses page boundary");
 		goto out;
 	}
 
-	req_data = kzalloc(req->data_len, GFP_KERNEL);
+	req_data = kzalloc(blk_rq_bytes(req), GFP_KERNEL);
 
 	/* make sure frame can always be built ... we copy
 	 * back only the requested length */
-	resp_data = kzalloc(max(rsp->data_len, 128U), GFP_KERNEL);
+	resp_data = kzalloc(max(blk_rq_bytes(rsp), 128U), GFP_KERNEL);
 
 	if (!req_data || !resp_data) {
 		error = -ENOMEM;
@@ -160,7 +160,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 
 	local_irq_disable();
 	buf = kmap_atomic(bio_page(req->bio), KM_USER0) + bio_offset(req->bio);
-	memcpy(req_data, buf, req->data_len);
+	memcpy(req_data, buf, blk_rq_bytes(req));
 	kunmap_atomic(buf - bio_offset(req->bio), KM_USER0);
 	local_irq_enable();
 
@@ -176,8 +176,8 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	resp_data[1] = req_data[1];
 	resp_data[2] = SMP_RESP_FUNC_UNK;
 
-	req->resid_len = req->data_len;
-	rsp->resid_len = rsp->data_len;
+	req->resid_len = blk_rq_bytes(req);
+	rsp->resid_len = blk_rq_bytes(rsp);
 
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
@@ -264,7 +264,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 
 	local_irq_disable();
 	buf = kmap_atomic(bio_page(rsp->bio), KM_USER0) + bio_offset(rsp->bio);
-	memcpy(buf, resp_data, rsp->data_len);
+	memcpy(buf, resp_data, blk_rq_bytes(rsp));
 	flush_kernel_dcache_page(bio_page(rsp->bio));
 	kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0);
 	local_irq_enable();
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index 53759c566bfed..af95a449930ea 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1041,7 +1041,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
 		printk(MPT2SAS_ERR_FMT "%s: multiple segments req %u %u, "
 		    "rsp %u %u\n", ioc->name, __func__, req->bio->bi_vcnt,
-		    req->data_len, rsp->bio->bi_vcnt, rsp->data_len);
+		    blk_rq_bytes(req), rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
 		return -EINVAL;
 	}
 
@@ -1104,7 +1104,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	*((u64 *)&mpi_request->SASAddress) = (rphy) ?
 	    cpu_to_le64(rphy->identify.sas_address) :
 	    cpu_to_le64(ioc->sas_hba.sas_address);
-	mpi_request->RequestDataLength = cpu_to_le16(req->data_len - 4);
+	mpi_request->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4);
 	psge = &mpi_request->SGL;
 
 	/* WRITE sgel first */
@@ -1112,13 +1112,13 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	    MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC);
 	sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
 	dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio),
-	      req->data_len, PCI_DMA_BIDIRECTIONAL);
+		blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_out) {
 		mpt2sas_base_free_smid(ioc, le16_to_cpu(smid));
 		goto unmap;
 	}
 
-	ioc->base_add_sg_single(psge, sgl_flags | (req->data_len - 4),
+	ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(req) - 4),
 	    dma_addr_out);
 
 	/* incr sgel */
@@ -1129,14 +1129,14 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	    MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER |
 	    MPI2_SGE_FLAGS_END_OF_LIST);
 	sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
-	dma_addr_in =  pci_map_single(ioc->pdev, bio_data(rsp->bio),
-	      rsp->data_len, PCI_DMA_BIDIRECTIONAL);
+	dma_addr_in = pci_map_single(ioc->pdev, bio_data(rsp->bio),
+				     blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL);
 	if (!dma_addr_in) {
 		mpt2sas_base_free_smid(ioc, le16_to_cpu(smid));
 		goto unmap;
 	}
 
-	ioc->base_add_sg_single(psge, sgl_flags | (rsp->data_len + 4),
+	ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(rsp) + 4),
 	    dma_addr_in);
 
 	dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - "
@@ -1170,7 +1170,8 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
 		req->sense_len = sizeof(*mpi_reply);
-		rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength;
+		rsp->resid_len = blk_rq_bytes(rsp) -
+				 mpi_reply->ResponseDataLength;
 	} else {
 		dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT
 		    "%s - no reply\n", ioc->name, __func__));
@@ -1186,10 +1187,10 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
  unmap:
 	if (dma_addr_out)
-		pci_unmap_single(ioc->pdev, dma_addr_out, req->data_len,
+		pci_unmap_single(ioc->pdev, dma_addr_out, blk_rq_bytes(req),
 		    PCI_DMA_BIDIRECTIONAL);
 	if (dma_addr_in)
-		pci_unmap_single(ioc->pdev, dma_addr_in, rsp->data_len,
+		pci_unmap_single(ioc->pdev, dma_addr_in, blk_rq_bytes(rsp),
 		    PCI_DMA_BIDIRECTIONAL);
 
  out:
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 2a5f0777148d1..d178a8b799ec0 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1299,7 +1299,7 @@ int osd_finalize_request(struct osd_request *or,
 			return ret;
 		}
 		OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->out.total_bytes), or->out.req->data_len);
+			_LLU(or->out.total_bytes), blk_rq_bytes(or->out.req));
 	}
 	if (or->in.bio) {
 		ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio);
@@ -1308,7 +1308,7 @@ int osd_finalize_request(struct osd_request *or,
 			return ret;
 		}
 		OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->in.total_bytes), or->in.req->data_len);
+			_LLU(or->in.total_bytes), blk_rq_bytes(or->in.req));
 	}
 
 	or->out.pad_buff = sg_out_pad_buffer;
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 39b3acfc0ddf4..3d16c70fbde04 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -682,14 +682,13 @@ EXPORT_SYMBOL(scsi_release_buffers);
 static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
 {
 	struct request *req = cmd->request;
-	unsigned int dlen = req->data_len;
-	unsigned int next_dlen = req->next_rq->data_len;
 
 	req->resid_len = scsi_out(cmd)->resid;
 	req->next_rq->resid_len = scsi_in(cmd)->resid;
 
 	/* The req and req->next_rq have not been completed */
-	BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen));
+	BUG_ON(blk_end_bidi_request(req, 0, blk_rq_bytes(req),
+				    blk_rq_bytes(req->next_rq)));
 
 	scsi_release_buffers(cmd);
 
@@ -966,7 +965,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
 	if (blk_pc_request(req))
-		sdb->length = req->data_len;
+		sdb->length = blk_rq_bytes(req);
 	else
 		sdb->length = blk_rq_sectors(req) << 9;
 	return BLKPREP_OK;
@@ -1087,21 +1086,21 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req)
 		if (unlikely(ret))
 			return ret;
 	} else {
-		BUG_ON(req->data_len);
+		BUG_ON(blk_rq_bytes(req));
 
 		memset(&cmd->sdb, 0, sizeof(cmd->sdb));
 		req->buffer = NULL;
 	}
 
 	cmd->cmd_len = req->cmd_len;
-	if (!req->data_len)
+	if (!blk_rq_bytes(req))
 		cmd->sc_data_direction = DMA_NONE;
 	else if (rq_data_dir(req) == WRITE)
 		cmd->sc_data_direction = DMA_TO_DEVICE;
 	else
 		cmd->sc_data_direction = DMA_FROM_DEVICE;
 	
-	cmd->transfersize = req->data_len;
+	cmd->transfersize = blk_rq_bytes(req);
 	cmd->allowed = req->retries;
 	return BLKPREP_OK;
 }
diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c
index 48ba413f7f6af..10303272ba457 100644
--- a/drivers/scsi/scsi_tgt_lib.c
+++ b/drivers/scsi/scsi_tgt_lib.c
@@ -387,7 +387,7 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd,
 	 * we use REQ_TYPE_BLOCK_PC so scsi_init_io doesn't set the
 	 * length for us.
 	 */
-	cmd->sdb.length = rq->data_len;
+	cmd->sdb.length = blk_rq_bytes(rq);
 
 	return 0;
 
-- 
GitLab


From 34b7d2c957199834c474c9d46739265643f4d9c7 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:43 +0900
Subject: [PATCH 0986/2198] ide: cleanup rq->data_len usages

With recent unification of fields, it's now guaranteed that
rq->data_len always equals blk_rq_bytes().  Convert all direct users
to accessors.

[ Impact: convert direct rq->data_len usages to blk_rq_bytes() ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-atapi.c  |  4 ++--
 drivers/ide/ide-cd.c     | 25 +++++++------------------
 drivers/ide/ide-floppy.c |  4 ++--
 drivers/ide/ide-io.c     |  2 +-
 drivers/ide/ide-tape.c   |  2 +-
 5 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index e4a02a05fc819..792534db8f857 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -255,7 +255,7 @@ void ide_retry_pc(ide_drive_t *drive)
 	ide_init_pc(pc);
 	memcpy(pc->c, sense_rq->cmd, 12);
 	pc->buf = bio_data(sense_rq->bio);	/* pointer to mapped address */
-	pc->req_xfer = sense_rq->data_len;
+	pc->req_xfer = blk_rq_bytes(sense_rq);
 
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
@@ -303,7 +303,7 @@ int ide_cd_get_xferlen(struct request *rq)
 		return 32768;
 	else if (blk_sense_request(rq) || blk_pc_request(rq) ||
 			 rq->cmd_type == REQ_TYPE_ATA_PC)
-		return rq->data_len;
+		return blk_rq_bytes(rq);
 	else
 		return 0;
 }
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index eb4f3dc9f1824..2eadc9d2e9652 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -577,7 +577,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 	struct request *rq = hwif->rq;
 	ide_expiry_t *expiry = NULL;
 	int dma_error = 0, dma, thislen, uptodate = 0;
-	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0, nsectors;
+	int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0;
 	int sense = blk_sense_request(rq);
 	unsigned int timeout;
 	u16 len;
@@ -707,9 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (blk_pc_request(rq) && rc == 0) {
-		if (blk_end_request(rq, 0, rq->data_len))
-			BUG();
-
+		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
 	} else {
 		if (sense && uptodate)
@@ -727,22 +725,14 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 			ide_cd_error_cmd(drive, cmd);
 
 		/* make sure it's fully ended */
-		if (blk_pc_request(rq))
-			nsectors = (rq->data_len + 511) >> 9;
-		else
-			nsectors = blk_rq_sectors(rq);
-
-		if (nsectors == 0)
-			nsectors = 1;
-
 		if (blk_fs_request(rq) == 0) {
-			rq->resid_len = rq->data_len -
+			rq->resid_len = blk_rq_bytes(rq) -
 				(cmd->nbytes - cmd->nleft);
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
 				rq->resid_len += cmd->last_xfer_len;
 		}
 
-		ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9);
+		ide_complete_rq(drive, uptodate ? 0 : -EIO, blk_rq_bytes(rq));
 
 		if (sense && rc == 2)
 			ide_error(drive, "request sense failure", stat);
@@ -819,7 +809,7 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
 		 */
 		alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 		if ((unsigned long)buf & alignment
-		    || rq->data_len & q->dma_pad_mask
+		    || blk_rq_bytes(rq) & q->dma_pad_mask
 		    || object_is_on_stack(buf))
 			drive->dma = 0;
 	}
@@ -867,9 +857,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
 
 	cmd.rq = rq;
 
-	if (blk_fs_request(rq) || rq->data_len) {
-		ide_init_sg_cmd(&cmd, blk_fs_request(rq) ?
-				(blk_rq_sectors(rq) << 9) : rq->data_len);
+	if (blk_fs_request(rq) || blk_rq_bytes(rq)) {
+		ide_init_sg_cmd(&cmd, blk_rq_bytes(rq));
 		ide_map_sg(drive, &cmd);
 	}
 
diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c
index 1c460bd563752..650981758f15c 100644
--- a/drivers/ide/ide-floppy.c
+++ b/drivers/ide/ide-floppy.c
@@ -220,14 +220,14 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
 	ide_init_pc(pc);
 	memcpy(pc->c, rq->cmd, sizeof(pc->c));
 	pc->rq = rq;
-	if (rq->data_len) {
+	if (blk_rq_bytes(rq)) {
 		pc->flags |= PC_FLAG_DMA_OK;
 		if (rq_data_dir(rq) == WRITE)
 			pc->flags |= PC_FLAG_WRITING;
 	}
 	/* pio will be performed by ide_pio_bytes() which handles sg fine */
 	pc->buf = NULL;
-	pc->req_xfer = pc->buf_size = rq->data_len;
+	pc->req_xfer = pc->buf_size = blk_rq_bytes(rq);
 }
 
 static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index 59799ca552460..ca2519d7b9943 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -116,7 +116,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err)
 unsigned int ide_rq_bytes(struct request *rq)
 {
 	if (blk_pc_request(rq))
-		return rq->data_len;
+		return blk_rq_bytes(rq);
 	else
 		return blk_rq_cur_sectors(rq) << 9;
 }
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index c6cf3a951f28a..e16604562f281 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->resid_len = rq->data_len - blocks * tape->blk_size;
+		rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
-- 
GitLab


From a2dec7b36364a5cc564c4d76cf16d2e7d33f5c05 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:44 +0900
Subject: [PATCH 0987/2198] block: hide request sector and data_len

Block low level drivers for some reason have been pretty good at
abusing block layer API.  Especially struct request's fields tend to
get violated in all possible ways.  Make it clear that low level
drivers MUST NOT access or manipulate rq->sector and rq->data_len
directly by prefixing them with double underscores.

This change is also necessary to break build of out-of-tree codes
which assume the previous block API where internal fields can be
manipulated and rq->data_len carries residual count on completion.

[ Impact: hide internal fields, block API change ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 20 ++++++++++----------
 block/blk-map.c        |  2 +-
 block/blk-merge.c      |  2 +-
 include/linux/blkdev.h |  9 +++++----
 4 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 3596ca71909b4..6226a380fb6d5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_LIST_HEAD(&rq->timeout_list);
 	rq->cpu = -1;
 	rq->q = q;
-	rq->sector = (sector_t) -1;
+	rq->__sector = (sector_t) -1;
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->cmd = rq->__cmd;
@@ -1095,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio)
 		req->cmd_flags |= REQ_NOIDLE;
 
 	req->errors = 0;
-	req->sector = bio->bi_sector;
+	req->__sector = bio->bi_sector;
 	req->ioprio = bio_prio(bio);
 	blk_rq_bio_prep(req->q, req, bio);
 }
@@ -1143,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 
 		req->biotail->bi_next = bio;
 		req->biotail = bio;
-		req->data_len += bytes;
+		req->__data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1169,8 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio)
 		 * not touch req->buffer either...
 		 */
 		req->buffer = bio_data(bio);
-		req->sector = bio->bi_sector;
-		req->data_len += bytes;
+		req->__sector = bio->bi_sector;
+		req->__data_len += bytes;
 		req->ioprio = ioprio_best(req->ioprio, prio);
 		if (!blk_rq_cpu_valid(req))
 			req->cpu = bio->bi_comp_cpu;
@@ -1878,7 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		 * can find how many bytes remain in the request
 		 * later.
 		 */
-		req->data_len = 0;
+		req->__data_len = 0;
 		return false;
 	}
 
@@ -1892,12 +1892,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 		bio_iovec(bio)->bv_len -= nr_bytes;
 	}
 
-	req->data_len -= total_bytes;
+	req->__data_len -= total_bytes;
 	req->buffer = bio_data(req->bio);
 
 	/* update sector only for requests with clear definition of sector */
 	if (blk_fs_request(req) || blk_discard_rq(req))
-		req->sector += total_bytes >> 9;
+		req->__sector += total_bytes >> 9;
 
 	/*
 	 * If total number of sectors is less than the first segment
@@ -1905,7 +1905,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 	 */
 	if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
 		printk(KERN_ERR "blk: request botched\n");
-		req->data_len = blk_rq_cur_bytes(req);
+		req->__data_len = blk_rq_cur_bytes(req);
 	}
 
 	/* recalculate the number of segments */
@@ -2032,7 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		rq->nr_phys_segments = bio_phys_segments(q, bio);
 		rq->buffer = bio_data(bio);
 	}
-	rq->data_len = bio->bi_size;
+	rq->__data_len = bio->bi_size;
 	rq->bio = rq->biotail = bio;
 
 	if (bio->bi_bdev)
diff --git a/block/blk-map.c b/block/blk-map.c
index 694fefad34e7c..56082bea45041 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 		rq->biotail->bi_next = bio;
 		rq->biotail = bio;
 
-		rq->data_len += bio->bi_size;
+		rq->__data_len += bio->bi_size;
 	}
 	return 0;
 }
diff --git a/block/blk-merge.c b/block/blk-merge.c
index b8df66aef0f84..4974dd5767e51 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -370,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req,
 	req->biotail->bi_next = next->bio;
 	req->biotail = next->biotail;
 
-	req->data_len += blk_rq_bytes(next);
+	req->__data_len += blk_rq_bytes(next);
 
 	elv_merge_requests(q, req, next);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ce2bf5efa9bae..c75580345700f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -166,8 +166,9 @@ struct request {
 	enum rq_cmd_type_bits cmd_type;
 	unsigned long atomic_flags;
 
-	sector_t sector;	/* sector cursor */
-	unsigned int data_len;	/* total data len, don't access directly */
+	/* the following two fields are internal, NEVER access directly */
+	sector_t __sector;		/* sector cursor */
+	unsigned int __data_len;	/* total data len */
 
 	struct bio *bio;
 	struct bio *biotail;
@@ -828,12 +829,12 @@ extern void blkdev_dequeue_request(struct request *req);
  */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
-	return rq->sector;
+	return rq->__sector;
 }
 
 static inline unsigned int blk_rq_bytes(const struct request *rq)
 {
-	return rq->data_len;
+	return rq->__data_len;
 }
 
 static inline int blk_rq_cur_bytes(const struct request *rq)
-- 
GitLab


From 1011c1b9f2e45ce7c6e38888d2b83936aec38771 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Thu, 7 May 2009 22:24:45 +0900
Subject: [PATCH 0988/2198] block: blk_rq_[cur_]_{sectors|bytes}() usage
 cleanup

With the previous changes, the followings are now guaranteed for all
requests in any valid state.

* blk_rq_sectors() == blk_rq_bytes() >> 9
* blk_rq_cur_sectors() == blk_rq_cur_bytes() >> 9

Clean up accessor usages.  Notable changes are

* nbd,i2o_block: end_all used instead of explicit byte count
* scsi_lib: unnecessary conditional on request type removed

[ Impact: cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/floppy.c              |  7 +++----
 drivers/block/nbd.c                 |  6 +++---
 drivers/block/ub.c                  |  2 +-
 drivers/block/z2ram.c               |  2 +-
 drivers/memstick/core/mspro_block.c |  4 ++--
 drivers/message/i2o/i2o_block.c     | 16 ++++------------
 drivers/mtd/mtd_blkdevs.c           |  2 +-
 drivers/sbus/char/jsflash.c         |  2 +-
 drivers/scsi/scsi_lib.c             |  7 ++-----
 9 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 452486283386b..1e27ed9208b45 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -2512,8 +2512,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	remaining = current_count_sectors << 9;
 #ifdef FLOPPY_SANITY_CHECK
-	if ((remaining >> 9) > blk_rq_sectors(current_req) &&
-	    CT(COMMAND) == FD_WRITE) {
+	if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) {
 		DPRINT("in copy buffer\n");
 		printk("current_count_sectors=%ld\n", current_count_sectors);
 		printk("remaining=%d\n", remaining >> 9);
@@ -2530,7 +2529,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2)
 
 	dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9);
 
-	size = blk_rq_cur_sectors(current_req) << 9;
+	size = blk_rq_cur_bytes(current_req);
 
 	rq_for_each_segment(bv, current_req, iter) {
 		if (!remaining)
@@ -2879,7 +2878,7 @@ static int make_raw_rw_request(void)
 				printk("write\n");
 			return 0;
 		}
-	} else if (raw_cmd->length > blk_rq_sectors(current_req) << 9 ||
+	} else if (raw_cmd->length > blk_rq_bytes(current_req) ||
 		   current_count_sectors > blk_rq_sectors(current_req)) {
 		DPRINT("buffer overrun in direct transfer\n");
 		return 0;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 977a57377930d..fad167de23b44 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -110,7 +110,7 @@ static void nbd_end_request(struct request *req)
 			req, error ? "failed" : "done");
 
 	spin_lock_irqsave(q->queue_lock, flags);
-	__blk_end_request(req, error, blk_rq_sectors(req) << 9);
+	__blk_end_request_all(req, error);
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -231,7 +231,7 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
-	unsigned long size = blk_rq_sectors(req) << 9;
+	unsigned long size = blk_rq_bytes(req);
 
 	request.magic = htonl(NBD_REQUEST_MAGIC);
 	request.type = htonl(nbd_cmd(req));
@@ -243,7 +243,7 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 			lo->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
 			(unsigned long long)blk_rq_pos(req) << 9,
-			blk_rq_sectors(req) << 9);
+			blk_rq_bytes(req));
 	result = sock_xmit(lo, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 1591f61b6c60d..40d03cf63f2ec 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -739,7 +739,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	cmd->cdb[8] = nblks;
 	cmd->cdb_len = 10;
 
-	cmd->len = blk_rq_sectors(rq) * 512;
+	cmd->len = blk_rq_bytes(rq);
 }
 
 static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index d4e6b71f514ac..6a1383834ecff 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -72,7 +72,7 @@ static void do_z2_request(struct request_queue *q)
 	struct request *req;
 	while ((req = elv_next_request(q)) != NULL) {
 		unsigned long start = blk_rq_pos(req) << 9;
-		unsigned long len  = blk_rq_cur_sectors(req) << 9;
+		unsigned long len  = blk_rq_cur_bytes(req);
 
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 9e600d22f40e3..93b2c61856566 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -680,7 +680,7 @@ static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
 		t_sec = blk_rq_pos(msb->block_req) << 9;
 		sector_div(t_sec, msb->page_size);
 
-		count = blk_rq_sectors(msb->block_req) << 9;
+		count = blk_rq_bytes(msb->block_req);
 		count /= msb->page_size;
 
 		param.system = msb->system;
@@ -745,7 +745,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error)
 					t_len *= msb->page_size;
 			}
 		} else
-			t_len = blk_rq_sectors(msb->block_req) << 9;
+			t_len = blk_rq_bytes(msb->block_req);
 
 		dev_dbg(&card->dev, "transferred %x (%d)\n", t_len, error);
 
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 6b61d289d6c9a..e153f5d5237de 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -426,15 +426,9 @@ static void i2o_block_end_request(struct request *req, int error,
 	struct request_queue *q = req->q;
 	unsigned long flags;
 
-	if (blk_end_request(req, error, nr_bytes)) {
-		int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
-
-		if (blk_pc_request(req))
-			leftover = blk_rq_bytes(req);
-
+	if (blk_end_request(req, error, nr_bytes))
 		if (error)
-			blk_end_request(req, -EIO, leftover);
-	}
+			blk_end_request_all(req, -EIO);
 
 	spin_lock_irqsave(q->queue_lock, flags);
 
@@ -832,15 +826,13 @@ static int i2o_block_transfer(struct request *req)
 
 		memcpy(mptr, cmd, 10);
 		mptr += 4;
-		*mptr++ =
-		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
+		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
 	} else
 #endif
 	{
 		msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid);
 		*mptr++ = cpu_to_le32(ctl_flags);
-		*mptr++ =
-		    cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT);
+		*mptr++ = cpu_to_le32(blk_rq_bytes(req));
 		*mptr++ =
 		    cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT));
 		*mptr++ =
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 4ea2e67ac97cf..50c76a2ca76e3 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -48,7 +48,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr,
 	char *buf;
 
 	block = blk_rq_pos(req) << 9 >> tr->blkshift;
-	nsect = blk_rq_cur_sectors(req) << 9 >> tr->blkshift;
+	nsect = blk_rq_cur_bytes(req) >> tr->blkshift;
 
 	buf = req->buffer;
 
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index 2132c906e53a4..d56ddaa770361 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -189,7 +189,7 @@ static void jsfd_do_request(struct request_queue *q)
 	while ((req = elv_next_request(q)) != NULL) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
 		unsigned long offset = blk_rq_pos(req) << 9;
-		size_t len = blk_rq_cur_sectors(req) << 9;
+		size_t len = blk_rq_cur_bytes(req);
 
 		if ((offset + len) > jdp->dsize) {
 			__blk_end_request_cur(req, -EIO);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 3d16c70fbde04..ee308f6f7982c 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 	 * to queue the remainder of them.
 	 */
 	if (blk_end_request(req, error, bytes)) {
-		int leftover = blk_rq_sectors(req) << 9;
+		int leftover = blk_rq_bytes(req);
 
 		if (blk_pc_request(req))
 			leftover = req->resid_len;
@@ -964,10 +964,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb,
 	count = blk_rq_map_sg(req->q, req, sdb->table.sgl);
 	BUG_ON(count > sdb->table.nents);
 	sdb->table.nents = count;
-	if (blk_pc_request(req))
-		sdb->length = blk_rq_bytes(req);
-	else
-		sdb->length = blk_rq_sectors(req) << 9;
+	sdb->length = blk_rq_bytes(req);
 	return BLKPREP_OK;
 }
 
-- 
GitLab


From 8f6205cd572fece673da0255d74843680f67f879 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:53:59 +0900
Subject: [PATCH 0989/2198] ide: dequeue in-flight request

ide generally has single request in flight and tracks it using
hwif->rq and all state handlers follow the following convention.

* ide_started is returned if the request is in flight.

* ide_stopped is returned if the queue needs to be restarted.  The
  request might or might not have been processed fully or partially.

* hwif->rq is set to NULL, when an issued request completes.

So, dequeueing model can be implemented by dequeueing after fetch,
requeueing if hwif->rq isn't NULL on ide_stopped return and doing
about the same thing on completion / port unlock paths.  These changes
can be made in ide-io proper.

In addition to the above main changes, the following updates are
necessary.

* ide-cd shouldn't dequeue a request when issuing REQUEST SENSE for it
  as the request is already dequeued.

* ide-atapi uses request queue as stack when issuing REQUEST SENSE to
  put the REQUEST SENSE in front of the failed request.  This now
  needs to be done using requeueing.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-atapi.c | 14 ++++++++++++--
 drivers/ide/ide-cd.c    |  8 --------
 drivers/ide/ide-io.c    | 34 +++++++++++++++++++++++++++-------
 3 files changed, 39 insertions(+), 17 deletions(-)

diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 792534db8f857..2874c3d703a92 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -246,6 +246,7 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
  */
 void ide_retry_pc(ide_drive_t *drive)
 {
+	struct request *failed_rq = drive->hwif->rq;
 	struct request *sense_rq = &drive->sense_rq;
 	struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
@@ -260,8 +261,17 @@ void ide_retry_pc(ide_drive_t *drive)
 	if (drive->media == ide_tape)
 		set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
 
-	if (ide_queue_sense_rq(drive, pc))
-		ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
+	/*
+	 * Push back the failed request and put request sense on top
+	 * of it.  The failed command will be retried after sense data
+	 * is acquired.
+	 */
+	blk_requeue_request(failed_rq->q, failed_rq);
+	drive->hwif->rq = NULL;
+	if (ide_queue_sense_rq(drive, pc)) {
+		blkdev_dequeue_request(failed_rq);
+		ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq));
+	}
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 2eadc9d2e9652..4c7792fd5f934 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -405,15 +405,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
 end_request:
 	if (stat & ATA_ERR) {
-		struct request_queue *q = drive->queue;
-		unsigned long flags;
-
-		spin_lock_irqsave(q->queue_lock, flags);
-		blkdev_dequeue_request(rq);
-		spin_unlock_irqrestore(q->queue_lock, flags);
-
 		hwif->rq = NULL;
-
 		return ide_queue_sense_rq(drive, rq) ? 2 : 1;
 	} else
 		return 2;
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index ca2519d7b9943..abda7337b3f49 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -487,10 +487,10 @@ void do_ide_request(struct request_queue *q)
 
 	if (!ide_lock_port(hwif)) {
 		ide_hwif_t *prev_port;
+
+		WARN_ON_ONCE(hwif->rq);
 repeat:
 		prev_port = hwif->host->cur_port;
-		hwif->rq = NULL;
-
 		if (drive->dev_flags & IDE_DFLAG_SLEEPING &&
 		    time_after(drive->sleep, jiffies)) {
 			ide_unlock_port(hwif);
@@ -519,7 +519,12 @@ void do_ide_request(struct request_queue *q)
 		 * we know that the queue isn't empty, but this can happen
 		 * if the q->prep_rq_fn() decides to kill a request
 		 */
-		rq = elv_next_request(drive->queue);
+		if (!rq) {
+			rq = elv_next_request(drive->queue);
+			if (rq)
+				blkdev_dequeue_request(rq);
+		}
+
 		spin_unlock_irq(q->queue_lock);
 		spin_lock_irq(&hwif->lock);
 
@@ -555,8 +560,11 @@ void do_ide_request(struct request_queue *q)
 		startstop = start_request(drive, rq);
 		spin_lock_irq(&hwif->lock);
 
-		if (startstop == ide_stopped)
+		if (startstop == ide_stopped) {
+			rq = hwif->rq;
+			hwif->rq = NULL;
 			goto repeat;
+		}
 	} else
 		goto plug_device;
 out:
@@ -572,18 +580,24 @@ void do_ide_request(struct request_queue *q)
 plug_device_2:
 	spin_lock_irq(q->queue_lock);
 
+	if (rq)
+		blk_requeue_request(q, rq);
 	if (!elv_queue_empty(q))
 		blk_plug_device(q);
 }
 
-static void ide_plug_device(ide_drive_t *drive)
+static void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq)
 {
 	struct request_queue *q = drive->queue;
 	unsigned long flags;
 
 	spin_lock_irqsave(q->queue_lock, flags);
+
+	if (rq)
+		blk_requeue_request(q, rq);
 	if (!elv_queue_empty(q))
 		blk_plug_device(q);
+
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
@@ -632,6 +646,7 @@ void ide_timer_expiry (unsigned long data)
 	unsigned long	flags;
 	int		wait = -1;
 	int		plug_device = 0;
+	struct request	*uninitialized_var(rq_in_flight);
 
 	spin_lock_irqsave(&hwif->lock, flags);
 
@@ -693,6 +708,8 @@ void ide_timer_expiry (unsigned long data)
 		spin_lock_irq(&hwif->lock);
 		enable_irq(hwif->irq);
 		if (startstop == ide_stopped) {
+			rq_in_flight = hwif->rq;
+			hwif->rq = NULL;
 			ide_unlock_port(hwif);
 			plug_device = 1;
 		}
@@ -701,7 +718,7 @@ void ide_timer_expiry (unsigned long data)
 
 	if (plug_device) {
 		ide_unlock_host(hwif->host);
-		ide_plug_device(drive);
+		ide_requeue_and_plug(drive, rq_in_flight);
 	}
 }
 
@@ -787,6 +804,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	ide_startstop_t startstop;
 	irqreturn_t irq_ret = IRQ_NONE;
 	int plug_device = 0;
+	struct request *uninitialized_var(rq_in_flight);
 
 	if (host->host_flags & IDE_HFLAG_SERIALIZE) {
 		if (hwif != host->cur_port)
@@ -866,6 +884,8 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 	 */
 	if (startstop == ide_stopped) {
 		BUG_ON(hwif->handler);
+		rq_in_flight = hwif->rq;
+		hwif->rq = NULL;
 		ide_unlock_port(hwif);
 		plug_device = 1;
 	}
@@ -875,7 +895,7 @@ irqreturn_t ide_intr (int irq, void *dev_id)
 out_early:
 	if (plug_device) {
 		ide_unlock_host(hwif->host);
-		ide_plug_device(drive);
+		ide_requeue_and_plug(drive, rq_in_flight);
 	}
 
 	return irq_ret;
-- 
GitLab


From 9a8d23d8855e554fc5887f14cb008b55c4300ccc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:00 +0900
Subject: [PATCH 0990/2198] mg_disk: fix queue hang / infinite retry on !fs
 requests

Both request functions in mg_disk simply return when they encounter a
!fs request, which means the request will never be cleared from the
queue causing queue hang and indefinite retry of the request.  Fix it.

While at it, flatten condition checks and add unlikely to !fs tests.

[ Impact: fix possible queue hang / infinite retry of !fs requests ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/mg_disk.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 826c3492b9fe2..be323880f24ab 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -672,16 +672,16 @@ static void mg_request_poll(struct request_queue *q)
 
 	while ((req = elv_next_request(q)) != NULL) {
 		host = req->rq_disk->private_data;
-		if (blk_fs_request(req)) {
-			switch (rq_data_dir(req)) {
-			case READ:
-				mg_read(req);
-				break;
-			case WRITE:
-				mg_write(req);
-				break;
-			}
+
+		if (unlikely(!blk_fs_request(req))) {
+			__blk_end_request_cur(req, -EIO);
+			continue;
 		}
+
+		if (rq_data_dir(req) == READ)
+			mg_read(req);
+		else
+			mg_write(req);
 	}
 }
 
@@ -766,8 +766,10 @@ static void mg_request(struct request_queue *q)
 			continue;
 		}
 
-		if (!blk_fs_request(req))
-			return;
+		if (unlikely(!blk_fs_request(req))) {
+			__blk_end_request_cur(req, -EIO);
+			continue;
+		}
 
 		if (!mg_issue_req(req, host, sect_num, sect_cnt))
 			return;
-- 
GitLab


From 5b36ad6000ddea390aca3c3b67ead7616ace2ffc Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:01 +0900
Subject: [PATCH 0991/2198] mg_disk: dequeue and track in-flight request

mg_disk has at most single request in flight per device.  Till now,
whenever it needs to access the in-flight request it called
elv_next_request().  This patch makes mg_disk track the in-flight
request directly using mg_host->req and dequeue it when processing
starts.

q->queuedata is set to mg_host so that mg_host can be determined
without fetching request from the queue.

[ Impact: dequeue in-flight request, one elv_next_request() per request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: unsik Kim <donari75@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/mg_disk.c | 109 ++++++++++++++++++++++------------------
 1 file changed, 59 insertions(+), 50 deletions(-)

diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index be323880f24ab..1ca5d1423fa3a 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -135,6 +135,7 @@ struct mg_host {
 	struct device *dev;
 
 	struct request_queue *breq;
+	struct request *req;
 	spinlock_t lock;
 	struct gendisk *gd;
 
@@ -171,17 +172,27 @@ struct mg_host {
 
 static void mg_request(struct request_queue *);
 
+static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes)
+{
+	if (__blk_end_request(host->req, err, nr_bytes))
+		return true;
+
+	host->req = NULL;
+	return false;
+}
+
+static bool mg_end_request_cur(struct mg_host *host, int err)
+{
+	return mg_end_request(host, err, blk_rq_cur_bytes(host->req));
+}
+
 static void mg_dump_status(const char *msg, unsigned int stat,
 		struct mg_host *host)
 {
 	char *name = MG_DISK_NAME;
-	struct request *req;
 
-	if (host->breq) {
-		req = elv_next_request(host->breq);
-		if (req)
-			name = req->rq_disk->disk_name;
-	}
+	if (host->req)
+		name = host->req->rq_disk->disk_name;
 
 	printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
 	if (stat & ATA_BUSY)
@@ -217,13 +228,9 @@ static void mg_dump_status(const char *msg, unsigned int stat,
 			printk("AddrMarkNotFound ");
 		printk("}");
 		if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) {
-			if (host->breq) {
-				req = elv_next_request(host->breq);
-				if (req)
-					printk(", sector=%u",
-					       (unsigned int)blk_rq_pos(req));
-			}
-
+			if (host->req)
+				printk(", sector=%u",
+				       (unsigned int)blk_rq_pos(host->req));
 		}
 		printk("\n");
 	}
@@ -453,11 +460,10 @@ static int mg_disk_init(struct mg_host *host)
 
 static void mg_bad_rw_intr(struct mg_host *host)
 {
-	struct request *req = elv_next_request(host->breq);
-	if (req != NULL)
-		if (++req->errors >= MG_MAX_ERRORS ||
-				host->error == MG_ERR_TIMEOUT)
-			__blk_end_request_cur(req, -EIO);
+	if (host->req)
+		if (++host->req->errors >= MG_MAX_ERRORS ||
+		    host->error == MG_ERR_TIMEOUT)
+			mg_end_request_cur(host, -EIO);
 }
 
 static unsigned int mg_out(struct mg_host *host,
@@ -515,7 +521,7 @@ static void mg_read(struct request *req)
 
 		outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
+	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_write(struct request *req)
@@ -545,14 +551,14 @@ static void mg_write(struct request *req)
 
 		outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
 				MG_REG_COMMAND);
-	} while (__blk_end_request(req, 0, MG_SECTOR_SIZE));
+	} while (mg_end_request(host, 0, MG_SECTOR_SIZE));
 }
 
 static void mg_read_intr(struct mg_host *host)
 {
+	struct request *req = host->req;
 	u32 i;
 	u16 *buff;
-	struct request *req;
 
 	/* check status */
 	do {
@@ -571,7 +577,6 @@ static void mg_read_intr(struct mg_host *host)
 
 ok_to_read:
 	/* get current segment of request */
-	req = elv_next_request(host->breq);
 	buff = (u16 *)req->buffer;
 
 	/* read 1 sector */
@@ -585,7 +590,7 @@ static void mg_read_intr(struct mg_host *host)
 	/* send read confirm */
 	outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND);
 
-	if (__blk_end_request(req, 0, MG_SECTOR_SIZE)) {
+	if (mg_end_request(host, 0, MG_SECTOR_SIZE)) {
 		/* set handler if read remains */
 		host->mg_do_intr = mg_read_intr;
 		mod_timer(&host->timer, jiffies + 3 * HZ);
@@ -595,14 +600,11 @@ static void mg_read_intr(struct mg_host *host)
 
 static void mg_write_intr(struct mg_host *host)
 {
+	struct request *req = host->req;
 	u32 i, j;
 	u16 *buff;
-	struct request *req;
 	bool rem;
 
-	/* get current segment of request */
-	req = elv_next_request(host->breq);
-
 	/* check status */
 	do {
 		i = inb((unsigned long)host->dev_base + MG_REG_STATUS);
@@ -619,7 +621,7 @@ static void mg_write_intr(struct mg_host *host)
 	return;
 
 ok_to_write:
-	if ((rem = __blk_end_request(req, 0, MG_SECTOR_SIZE))) {
+	if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
 		/* write 1 sector and set handler if remains */
 		buff = (u16 *)req->buffer;
 		for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) {
@@ -644,44 +646,47 @@ void mg_times_out(unsigned long data)
 {
 	struct mg_host *host = (struct mg_host *)data;
 	char *name;
-	struct request *req;
 
 	spin_lock_irq(&host->lock);
 
-	req = elv_next_request(host->breq);
-	if (!req)
+	if (!host->req)
 		goto out_unlock;
 
 	host->mg_do_intr = NULL;
 
-	name = req->rq_disk->disk_name;
+	name = host->req->rq_disk->disk_name;
 	printk(KERN_DEBUG "%s: timeout\n", name);
 
 	host->error = MG_ERR_TIMEOUT;
 	mg_bad_rw_intr(host);
 
-	mg_request(host->breq);
 out_unlock:
+	mg_request(host->breq);
 	spin_unlock_irq(&host->lock);
 }
 
 static void mg_request_poll(struct request_queue *q)
 {
-	struct request *req;
-	struct mg_host *host;
+	struct mg_host *host = q->queuedata;
 
-	while ((req = elv_next_request(q)) != NULL) {
-		host = req->rq_disk->private_data;
+	while (1) {
+		if (!host->req) {
+			host->req = elv_next_request(q);
+			if (host->req)
+				blkdev_dequeue_request(host->req);
+			else
+				break;
+		}
 
-		if (unlikely(!blk_fs_request(req))) {
-			__blk_end_request_cur(req, -EIO);
+		if (unlikely(!blk_fs_request(host->req))) {
+			mg_end_request_cur(host, -EIO);
 			continue;
 		}
 
-		if (rq_data_dir(req) == READ)
-			mg_read(req);
+		if (rq_data_dir(host->req) == READ)
+			mg_read(host->req);
 		else
-			mg_write(req);
+			mg_write(host->req);
 	}
 }
 
@@ -733,16 +738,19 @@ static unsigned int mg_issue_req(struct request *req,
 /* This function also called from IRQ context */
 static void mg_request(struct request_queue *q)
 {
+	struct mg_host *host = q->queuedata;
 	struct request *req;
-	struct mg_host *host;
 	u32 sect_num, sect_cnt;
 
 	while (1) {
-		req = elv_next_request(q);
-		if (!req)
-			return;
-
-		host = req->rq_disk->private_data;
+		if (!host->req) {
+			host->req = elv_next_request(q);
+			if (host->req)
+				blkdev_dequeue_request(host->req);
+			else
+				break;
+		}
+		req = host->req;
 
 		/* check unwanted request call */
 		if (host->mg_do_intr)
@@ -762,12 +770,12 @@ static void mg_request(struct request_queue *q)
 					"%s: bad access: sector=%d, count=%d\n",
 					req->rq_disk->disk_name,
 					sect_num, sect_cnt);
-			__blk_end_request_cur(req, -EIO);
+			mg_end_request_cur(host, -EIO);
 			continue;
 		}
 
 		if (unlikely(!blk_fs_request(req))) {
-			__blk_end_request_cur(req, -EIO);
+			mg_end_request_cur(host, -EIO);
 			continue;
 		}
 
@@ -981,6 +989,7 @@ static int mg_probe(struct platform_device *plat_dev)
 				__func__, __LINE__);
 		goto probe_err_5;
 	}
+	host->breq->queuedata = host;
 
 	/* mflash is random device, thanx for the noop */
 	elevator_exit(host->breq->elevator);
-- 
GitLab


From 8a12c4a456c7ee261a85c2cbec835601e6aae05a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:02 +0900
Subject: [PATCH 0992/2198] hd: dequeue and track in-flight request

hd has at most single request in flight.  Till now, whenever it needs
to access the in-flight request it called elv_next_request().  This
patch makes hd track the in-flight request directly and dequeue it
when processing starts.  The added complexity is minimal and this will
help future block layer changes.

[ Impact: dequeue in-flight request, one elv_next_request() per request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/hd.c | 63 +++++++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 23 deletions(-)

diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index a3b39940ce02d..288ab63c1029f 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -98,10 +98,9 @@
 
 static DEFINE_SPINLOCK(hd_lock);
 static struct request_queue *hd_queue;
+static struct request *hd_req;
 
 #define MAJOR_NR HD_MAJOR
-#define QUEUE (hd_queue)
-#define CURRENT elv_next_request(hd_queue)
 
 #define TIMEOUT_VALUE	(6*HZ)
 #define	HD_DELAY	0
@@ -195,11 +194,24 @@ static void __init hd_setup(char *str, int *ints)
 	NR_HD = hdind+1;
 }
 
+static bool hd_end_request(int err, unsigned int bytes)
+{
+	if (__blk_end_request(hd_req, err, bytes))
+		return true;
+	hd_req = NULL;
+	return false;
+}
+
+static bool hd_end_request_cur(int err)
+{
+	return hd_end_request(err, blk_rq_cur_bytes(hd_req));
+}
+
 static void dump_status(const char *msg, unsigned int stat)
 {
 	char *name = "hd?";
-	if (CURRENT)
-		name = CURRENT->rq_disk->disk_name;
+	if (hd_req)
+		name = hd_req->rq_disk->disk_name;
 
 #ifdef VERBOSE_ERRORS
 	printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff);
@@ -227,8 +239,8 @@ static void dump_status(const char *msg, unsigned int stat)
 		if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) {
 			printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL),
 				inb(HD_CURRENT) & 0xf, inb(HD_SECTOR));
-			if (CURRENT)
-				printk(", sector=%ld", blk_rq_pos(CURRENT));
+			if (hd_req)
+				printk(", sector=%ld", blk_rq_pos(hd_req));
 		}
 		printk("\n");
 	}
@@ -406,11 +418,12 @@ static void unexpected_hd_interrupt(void)
  */
 static void bad_rw_intr(void)
 {
-	struct request *req = CURRENT;
+	struct request *req = hd_req;
+
 	if (req != NULL) {
 		struct hd_i_struct *disk = req->rq_disk->private_data;
 		if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) {
-			__blk_end_request_cur(req, -EIO);
+			hd_end_request_cur(-EIO);
 			disk->special_op = disk->recalibrate = 1;
 		} else if (req->errors % RESET_FREQ == 0)
 			reset = 1;
@@ -454,14 +467,14 @@ static void read_intr(void)
 	return;
 
 ok_to_read:
-	req = CURRENT;
+	req = hd_req;
 	insw(HD_DATA, req->buffer, 256);
 #ifdef DEBUG
 	printk("%s: read: sector %ld, remaining = %u, buffer=%p\n",
 	       req->rq_disk->disk_name, blk_rq_pos(req) + 1,
 	       blk_rq_sectors(req) - 1, req->buffer+512);
 #endif
-	if (__blk_end_request(req, 0, 512)) {
+	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&read_intr);
 		return;
 	}
@@ -475,7 +488,7 @@ static void read_intr(void)
 
 static void write_intr(void)
 {
-	struct request *req = CURRENT;
+	struct request *req = hd_req;
 	int i;
 	int retries = 100000;
 
@@ -494,7 +507,7 @@ static void write_intr(void)
 	return;
 
 ok_to_write:
-	if (__blk_end_request(req, 0, 512)) {
+	if (hd_end_request(0, 512)) {
 		SET_HANDLER(&write_intr);
 		outsw(HD_DATA, req->buffer, 256);
 		return;
@@ -525,18 +538,18 @@ static void hd_times_out(unsigned long dummy)
 
 	do_hd = NULL;
 
-	if (!CURRENT)
+	if (!hd_req)
 		return;
 
 	spin_lock_irq(hd_queue->queue_lock);
 	reset = 1;
-	name = CURRENT->rq_disk->disk_name;
+	name = hd_req->rq_disk->disk_name;
 	printk("%s: timeout\n", name);
-	if (++CURRENT->errors >= MAX_ERRORS) {
+	if (++hd_req->errors >= MAX_ERRORS) {
 #ifdef DEBUG
 		printk("%s: too many errors\n", name);
 #endif
-		__blk_end_request_cur(CURRENT, -EIO);
+		hd_end_request_cur(-EIO);
 	}
 	hd_request();
 	spin_unlock_irq(hd_queue->queue_lock);
@@ -551,7 +564,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req)
 	}
 	if (disk->head > 16) {
 		printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name);
-		__blk_end_request_cur(req, -EIO);
+		hd_end_request_cur(-EIO);
 	}
 	disk->special_op = 0;
 	return 1;
@@ -578,11 +591,15 @@ static void hd_request(void)
 repeat:
 	del_timer(&device_timer);
 
-	req = CURRENT;
-	if (!req) {
-		do_hd = NULL;
-		return;
+	if (!hd_req) {
+		hd_req = elv_next_request(hd_queue);
+		if (!hd_req) {
+			do_hd = NULL;
+			return;
+		}
+		blkdev_dequeue_request(hd_req);
 	}
+	req = hd_req;
 
 	if (reset) {
 		reset_hd();
@@ -595,7 +612,7 @@ static void hd_request(void)
 	    ((block+nsect) > get_capacity(req->rq_disk))) {
 		printk("%s: bad access: block=%d, count=%d\n",
 			req->rq_disk->disk_name, block, nsect);
-		__blk_end_request_cur(req, -EIO);
+		hd_end_request_cur(-EIO);
 		goto repeat;
 	}
 
@@ -635,7 +652,7 @@ static void hd_request(void)
 			break;
 		default:
 			printk("unknown hd-command\n");
-			__blk_end_request_cur(req, -EIO);
+			hd_end_request_cur(-EIO);
 			break;
 		}
 	}
-- 
GitLab


From a336ca6fe6e0c46b2eef0e520951acb4e6cb2976 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:03 +0900
Subject: [PATCH 0993/2198] ataflop: dequeue and track in-flight request

ataflop has single request in flight.  Till now, whenever it needs to
access the in-flight request it called elv_next_request().  This patch
makes ataflop track the in-flight request directly and dequeue it when
processing starts.  The added complexity is minimal and this will help
future block layer changes.

[ Impact: dequeue in-flight request, one elv_next_request() per request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/ataflop.c | 63 +++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 234024cda5ecf..89a591d9c83b6 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -79,9 +79,7 @@
 #undef DEBUG
 
 static struct request_queue *floppy_queue;
-
-#define QUEUE (floppy_queue)
-#define CURRENT elv_next_request(floppy_queue)
+static struct request *fd_request;
 
 /* Disk types: DD, HD, ED */
 static struct atari_disk_type {
@@ -376,6 +374,12 @@ static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0);
 static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0);
 static DEFINE_TIMER(fd_timer, check_change, 0, 0);
 	
+static void fd_end_request_cur(int err)
+{
+	if (!__blk_end_request_cur(fd_request, err))
+		fd_request = NULL;
+}
+
 static inline void start_motor_off_timer(void)
 {
 	mod_timer(&motor_off_timer, jiffies + FD_MOTOR_OFF_DELAY);
@@ -606,15 +610,15 @@ static void fd_error( void )
 		return;
 	}
 
-	if (!CURRENT)
+	if (!fd_request)
 		return;
 
-	CURRENT->errors++;
-	if (CURRENT->errors >= MAX_ERRORS) {
+	fd_request->errors++;
+	if (fd_request->errors >= MAX_ERRORS) {
 		printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
-		__blk_end_request_cur(CURRENT, -EIO);
+		fd_end_request_cur(-EIO);
 	}
-	else if (CURRENT->errors == RECALIBRATE_ERRORS) {
+	else if (fd_request->errors == RECALIBRATE_ERRORS) {
 		printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
 		if (SelectedDrive != -1)
 			SUD.track = -1;
@@ -725,14 +729,14 @@ static void do_fd_action( int drive )
 	    if (IS_BUFFERED( drive, ReqSide, ReqTrack )) {
 		if (ReqCmd == READ) {
 		    copy_buffer( SECTOR_BUFFER(ReqSector), ReqData );
-		    if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
+		    if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
 			/* read next sector */
 			setup_req_params( drive );
 			goto repeat;
 		    }
 		    else {
 			/* all sectors finished */
-			__blk_end_request_cur(CURRENT, 0);
+			fd_end_request_cur(0);
 			redo_fd_request();
 			return;
 		    }
@@ -1130,14 +1134,14 @@ static void fd_rwsec_done1(int status)
 		}
 	}
   
-	if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) {
+	if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
 		/* read next sector */
 		setup_req_params( SelectedDrive );
 		do_fd_action( SelectedDrive );
 	}
 	else {
 		/* all sectors finished */
-		__blk_end_request_cur(CURRENT, 0);
+		fd_end_request_cur(0);
 		redo_fd_request();
 	}
 	return;
@@ -1378,7 +1382,7 @@ static void setup_req_params( int drive )
 	ReqData = ReqBuffer + 512 * ReqCnt;
 
 	if (UseTrackbuffer)
-		read_track = (ReqCmd == READ && CURRENT->errors == 0);
+		read_track = (ReqCmd == READ && fd_request->errors == 0);
 	else
 		read_track = 0;
 
@@ -1392,25 +1396,28 @@ static void redo_fd_request(void)
 	int drive, type;
 	struct atari_floppy_struct *floppy;
 
-	DPRINT(("redo_fd_request: CURRENT=%p dev=%s CURRENT->sector=%ld\n",
-		CURRENT, CURRENT ? CURRENT->rq_disk->disk_name : "",
-		CURRENT ? blk_rq_pos(CURRENT) : 0 ));
+	DPRINT(("redo_fd_request: fd_request=%p dev=%s fd_request->sector=%ld\n",
+		fd_request, fd_request ? fd_request->rq_disk->disk_name : "",
+		fd_request ? blk_rq_pos(fd_request) : 0 ));
 
 	IsFormatting = 0;
 
 repeat:
+	if (!fd_request) {
+		fd_request = elv_next_request(floppy_queue);
+		if (!fd_request)
+			goto the_end;
+		blkdev_dequeue_request(fd_request);
+	}
 
-	if (!CURRENT)
-		goto the_end;
-
-	floppy = CURRENT->rq_disk->private_data;
+	floppy = fd_request->rq_disk->private_data;
 	drive = floppy - unit;
 	type = floppy->type;
 	
 	if (!UD.connected) {
 		/* drive not connected */
 		printk(KERN_ERR "Unknown Device: fd%d\n", drive );
-		__blk_end_request_cur(CURRENT, -EIO);
+		fd_end_request_cur(-EIO);
 		goto repeat;
 	}
 		
@@ -1426,12 +1433,12 @@ static void redo_fd_request(void)
 		/* user supplied disk type */
 		if (--type >= NUM_DISK_MINORS) {
 			printk(KERN_WARNING "fd%d: invalid disk format", drive );
-			__blk_end_request_cur(CURRENT, -EIO);
+			fd_end_request_cur(-EIO);
 			goto repeat;
 		}
 		if (minor2disktype[type].drive_types > DriveType)  {
 			printk(KERN_WARNING "fd%d: unsupported disk format", drive );
-			__blk_end_request_cur(CURRENT, -EIO);
+			fd_end_request_cur(-EIO);
 			goto repeat;
 		}
 		type = minor2disktype[type].index;
@@ -1440,8 +1447,8 @@ static void redo_fd_request(void)
 		UD.autoprobe = 0;
 	}
 	
-	if (blk_rq_pos(CURRENT) + 1 > UDT->blocks) {
-		__blk_end_request_cur(CURRENT, -EIO);
+	if (blk_rq_pos(fd_request) + 1 > UDT->blocks) {
+		fd_end_request_cur(-EIO);
 		goto repeat;
 	}
 
@@ -1449,9 +1456,9 @@ static void redo_fd_request(void)
 	del_timer( &motor_off_timer );
 		
 	ReqCnt = 0;
-	ReqCmd = rq_data_dir(CURRENT);
-	ReqBlock = blk_rq_pos(CURRENT);
-	ReqBuffer = CURRENT->buffer;
+	ReqCmd = rq_data_dir(fd_request);
+	ReqBlock = blk_rq_pos(fd_request);
+	ReqBuffer = fd_request->buffer;
 	setup_req_params( drive );
 	do_fd_action( drive );
 
-- 
GitLab


From f4bd4b90bfa86e867111aa182895cb5ac203aa7a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:04 +0900
Subject: [PATCH 0994/2198] swim3: dequeue in-flight request

swim3 has at most single request in flight and already tracks it using
fd_req.  Convert it to dequeuing model by updating request fetching
and wrapping completion function.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/swim3.c | 47 +++++++++++++++++++++++++++++++------------
 1 file changed, 34 insertions(+), 13 deletions(-)

diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c1b9a4dc11ba9..f48c6dd47e04d 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -251,6 +251,20 @@ static int floppy_release(struct gendisk *disk, fmode_t mode);
 static int floppy_check_change(struct gendisk *disk);
 static int floppy_revalidate(struct gendisk *disk);
 
+static bool swim3_end_request(int err, unsigned int nr_bytes)
+{
+	if (__blk_end_request(fd_req, err, nr_bytes))
+		return true;
+
+	fd_req = NULL;
+	return false;
+}
+
+static bool swim3_end_request_cur(int err)
+{
+	return swim3_end_request(err, blk_rq_cur_bytes(fd_req));
+}
+
 static void swim3_select(struct floppy_state *fs, int sel)
 {
 	struct swim3 __iomem *sw = fs->swim3;
@@ -310,7 +324,14 @@ static void start_request(struct floppy_state *fs)
 		wake_up(&fs->wait);
 		return;
 	}
-	while (fs->state == idle && (req = elv_next_request(swim3_queue))) {
+	while (fs->state == idle) {
+		if (!fd_req) {
+			fd_req = elv_next_request(swim3_queue);
+			if (!fd_req)
+				break;
+			blkdev_dequeue_request(fd_req);
+		}
+		req = fd_req;
 #if 0
 		printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n",
 		       req->rq_disk->disk_name, req->cmd,
@@ -320,11 +341,11 @@ static void start_request(struct floppy_state *fs)
 #endif
 
 		if (blk_rq_pos(req) >= fs->total_secs) {
-			__blk_end_request_cur(req, -EIO);
+			swim3_end_request_cur(-EIO);
 			continue;
 		}
 		if (fs->ejected) {
-			__blk_end_request_cur(req, -EIO);
+			swim3_end_request_cur(-EIO);
 			continue;
 		}
 
@@ -332,7 +353,7 @@ static void start_request(struct floppy_state *fs)
 			if (fs->write_prot < 0)
 				fs->write_prot = swim3_readbit(fs, WRITE_PROT);
 			if (fs->write_prot) {
-				__blk_end_request_cur(req, -EIO);
+				swim3_end_request_cur(-EIO);
 				continue;
 			}
 		}
@@ -505,7 +526,7 @@ static void act(struct floppy_state *fs)
 		case do_transfer:
 			if (fs->cur_cyl != fs->req_cyl) {
 				if (fs->retries > 5) {
-					__blk_end_request_cur(fd_req, -EIO);
+					swim3_end_request_cur(-EIO);
 					fs->state = idle;
 					return;
 				}
@@ -537,7 +558,7 @@ static void scan_timeout(unsigned long data)
 	out_8(&sw->intr_enable, 0);
 	fs->cur_cyl = -1;
 	if (fs->retries > 5) {
-		__blk_end_request_cur(fd_req, -EIO);
+		swim3_end_request_cur(-EIO);
 		fs->state = idle;
 		start_request(fs);
 	} else {
@@ -556,7 +577,7 @@ static void seek_timeout(unsigned long data)
 	out_8(&sw->select, RELAX);
 	out_8(&sw->intr_enable, 0);
 	printk(KERN_ERR "swim3: seek timeout\n");
-	__blk_end_request_cur(fd_req, -EIO);
+	swim3_end_request_cur(-EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -580,7 +601,7 @@ static void settle_timeout(unsigned long data)
 		return;
 	}
 	printk(KERN_ERR "swim3: seek settle timeout\n");
-	__blk_end_request_cur(fd_req, -EIO);
+	swim3_end_request_cur(-EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -603,7 +624,7 @@ static void xfer_timeout(unsigned long data)
 	printk(KERN_ERR "swim3: timeout %sing sector %ld\n",
 	       (rq_data_dir(fd_req)==WRITE? "writ": "read"),
 	       (long)blk_rq_pos(fd_req));
-	__blk_end_request_cur(fd_req, -EIO);
+	swim3_end_request_cur(-EIO);
 	fs->state = idle;
 	start_request(fs);
 }
@@ -634,7 +655,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: seen sector but cyl=ff?\n");
 				fs->cur_cyl = -1;
 				if (fs->retries > 5) {
-					__blk_end_request_cur(fd_req, -EIO);
+					swim3_end_request_cur(-EIO);
 					fs->state = idle;
 					start_request(fs);
 				} else {
@@ -717,7 +738,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk("swim3: error %sing block %ld (err=%x)\n",
 				       rq_data_dir(fd_req) == WRITE? "writ": "read",
 				       (long)blk_rq_pos(fd_req), err);
-				__blk_end_request_cur(fd_req, -EIO);
+				swim3_end_request_cur(-EIO);
 				fs->state = idle;
 			}
 		} else {
@@ -726,12 +747,12 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id)
 				printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid);
 				printk(KERN_ERR "  state=%d, dir=%x, intr=%x, err=%x\n",
 				       fs->state, rq_data_dir(fd_req), intr, err);
-				__blk_end_request_cur(fd_req, -EIO);
+				swim3_end_request_cur(-EIO);
 				fs->state = idle;
 				start_request(fs);
 				break;
 			}
-			if (__blk_end_request(fd_req, 0, fs->scount << 9)) {
+			if (swim3_end_request(0, fs->scount << 9)) {
 				fs->req_sector += fs->scount;
 				if (fs->req_sector > fs->secpertrack) {
 					fs->req_sector -= fs->secpertrack;
-- 
GitLab


From 2d75ce084eec2a8154225c7e978191b364029003 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:05 +0900
Subject: [PATCH 0995/2198] xsysace: dequeue in-flight request

xsysace already tracks in-flight request using ace->req.  Converting
to dequeueing model is mostly a matter of adding dequeueing call after
request fetching.  The only tricky part is handling CF removal which
should complete both in flight and on queue requests.  Convert to
dequeueing model.

While at it, remove explicit blk_rq_cur_bytes() and use
__blk_end_request_cur() instead.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Grant Likely <grant.likely@secretlab.ca>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/xsysace.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 97c99b43f8815..edf137b6c3794 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -466,7 +466,8 @@ struct request *ace_get_next_request(struct request_queue * q)
 	while ((req = elv_next_request(q)) != NULL) {
 		if (blk_fs_request(req))
 			break;
-		__blk_end_request_cur(req, -EIO);
+		blkdev_dequeue_request(req);
+		__blk_end_request_all(req, -EIO);
 	}
 	return req;
 }
@@ -492,9 +493,15 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		set_capacity(ace->gd, 0);
 		dev_info(ace->dev, "No CF in slot\n");
 
-		/* Drop all pending requests */
-		while ((req = elv_next_request(ace->queue)) != NULL)
-			__blk_end_request_cur(req, -EIO);
+		/* Drop all in-flight and pending requests */
+		if (ace->req) {
+			__blk_end_request_all(ace->req, -EIO);
+			ace->req = NULL;
+		}
+		while ((req = elv_next_request(ace->queue)) != NULL) {
+			blkdev_dequeue_request(req);
+			__blk_end_request_all(req, -EIO);
+		}
 
 		/* Drop back to IDLE state and notify waiters */
 		ace->fsm_state = ACE_FSM_STATE_IDLE;
@@ -642,6 +649,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			ace->fsm_state = ACE_FSM_STATE_IDLE;
 			break;
 		}
+		blkdev_dequeue_request(req);
 
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
@@ -718,8 +726,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 		}
 
 		/* bio finished; is there another one? */
-		if (__blk_end_request(ace->req, 0,
-					blk_rq_cur_bytes(ace->req))) {
+		if (__blk_end_request_cur(ace->req, 0)) {
 			/* dev_dbg(ace->dev, "next block; h=%u c=%u\n",
 			 *      blk_rq_sectors(ace->req),
 			 *      blk_rq_cur_sectors(ace->req));
-- 
GitLab


From b12d4f82c1a3cdcb2441c803a3368a9426f2f47f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:06 +0900
Subject: [PATCH 0996/2198] paride: dequeue in-flight request

pd/pf/pcd have track in-flight request by pd/pf/pcd_req.  They can be
converted to dequeueing model by updating fetching and completion
paths.  Convert them.

Note that removal of elv_next_request() call from pf_next_buf()
doesn't make any functional difference.  The path is traveled only
during partial completion of a request and elv_next_request() call
must return the same request anyway.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Tim Waugh <tim@cyberelk.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/paride/pcd.c | 18 ++++++++++++------
 drivers/block/paride/pd.c  | 14 +++++++++-----
 drivers/block/paride/pf.c  | 14 +++++++-------
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 2d5dc0af55e4d..425f81586a311 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -719,9 +719,12 @@ static void do_pcd_request(struct request_queue * q)
 	if (pcd_busy)
 		return;
 	while (1) {
-		pcd_req = elv_next_request(q);
-		if (!pcd_req)
-			return;
+		if (!pcd_req) {
+			pcd_req = elv_next_request(q);
+			if (!pcd_req)
+				return;
+			blkdev_dequeue_request(pcd_req);
+		}
 
 		if (rq_data_dir(pcd_req) == READ) {
 			struct pcd_unit *cd = pcd_req->rq_disk->private_data;
@@ -734,8 +737,10 @@ static void do_pcd_request(struct request_queue * q)
 			pcd_busy = 1;
 			ps_set_intr(do_pcd_read, NULL, 0, nice);
 			return;
-		} else
-			__blk_end_request_cur(pcd_req, -EIO);
+		} else {
+			__blk_end_request_all(pcd_req, -EIO);
+			pcd_req = NULL;
+		}
 	}
 }
 
@@ -744,7 +749,8 @@ static inline void next_request(int err)
 	unsigned long saved_flags;
 
 	spin_lock_irqsave(&pcd_lock, saved_flags);
-	__blk_end_request_cur(pcd_req, err);
+	if (!__blk_end_request_cur(pcd_req, err))
+		pcd_req = NULL;
 	pcd_busy = 0;
 	do_pcd_request(pcd_queue);
 	spin_unlock_irqrestore(&pcd_lock, saved_flags);
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 9ec5d4ac0b644..d2ca3f5520610 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -410,11 +410,14 @@ static void run_fsm(void)
 				pd_claimed = 0;
 				phase = NULL;
 				spin_lock_irqsave(&pd_lock, saved_flags);
-				__blk_end_request_cur(pd_req,
-						      res == Ok ? 0 : -EIO);
-				pd_req = elv_next_request(pd_queue);
-				if (!pd_req)
-					stop = 1;
+				if (!__blk_end_request_cur(pd_req,
+						res == Ok ? 0 : -EIO)) {
+					pd_req = elv_next_request(pd_queue);
+					if (!pd_req)
+						stop = 1;
+					else
+						blkdev_dequeue_request(pd_req);
+				}
 				spin_unlock_irqrestore(&pd_lock, saved_flags);
 				if (stop)
 					return;
@@ -706,6 +709,7 @@ static void do_pd_request(struct request_queue * q)
 	pd_req = elv_next_request(q);
 	if (!pd_req)
 		return;
+	blkdev_dequeue_request(pd_req);
 
 	schedule_fsm();
 }
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index e88c889aa7f21..d6f7bd84ed392 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -752,10 +752,8 @@ static struct request_queue *pf_queue;
 
 static void pf_end_request(int err)
 {
-	if (pf_req) {
-		__blk_end_request_cur(pf_req, err);
+	if (pf_req && !__blk_end_request_cur(pf_req, err))
 		pf_req = NULL;
-	}
 }
 
 static void do_pf_request(struct request_queue * q)
@@ -763,9 +761,12 @@ static void do_pf_request(struct request_queue * q)
 	if (pf_busy)
 		return;
 repeat:
-	pf_req = elv_next_request(q);
-	if (!pf_req)
-		return;
+	if (!pf_req) {
+		pf_req = elv_next_request(q);
+		if (!pf_req)
+			return;
+		blkdev_dequeue_request(pf_req);
+	}
 
 	pf_current = pf_req->rq_disk->private_data;
 	pf_block = blk_rq_pos(pf_req);
@@ -806,7 +807,6 @@ static int pf_next_buf(void)
 	if (!pf_count) {
 		spin_lock_irqsave(&pf_spin_lock, saved_flags);
 		pf_end_request(0);
-		pf_req = elv_next_request(pf_queue);
 		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
 		if (!pf_req)
 			return 1;
-- 
GitLab


From 10e1e629b386aef97bf66de6ef28d450bec06ee3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:07 +0900
Subject: [PATCH 0997/2198] ps3disk: dequeue in-flight request

Other than in issue error paths, ps3disk always completely finishes
fetched requests.  With full completion on error paths, it can be
easily converted to dequeueing model.

* After L1 r/w call failure, ps3disk_submit_request_sg() now fails the
  whole request.  Issue failure isn't likely to benefit from partial
  retry anyway and ps3disk uses full failure in completion error path
  too, so I don't think this amounts to any meaningful functionality
  loss.

* flush completion is converted to _all for consistency.  It doesn't
  make any functional difference.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/ps3disk.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 8d583081b50a9..f4d8db944e7d7 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -157,7 +157,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
 			__LINE__, op, res);
-		__blk_end_request_cur(req, -EIO);
+		__blk_end_request_all(req, -EIO);
 		return 0;
 	}
 
@@ -179,7 +179,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
 	if (res) {
 		dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
 			__func__, __LINE__, res);
-		__blk_end_request_cur(req, -EIO);
+		__blk_end_request_all(req, -EIO);
 		return 0;
 	}
 
@@ -195,6 +195,8 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
 	while ((req = elv_next_request(q))) {
+		blkdev_dequeue_request(req);
+
 		if (blk_fs_request(req)) {
 			if (ps3disk_submit_request_sg(dev, req))
 				break;
@@ -204,7 +206,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 				break;
 		} else {
 			blk_dump_rq_flags(req, DEVICE_NAME " bad request");
-			__blk_end_request_cur(req, -EIO);
+			__blk_end_request_all(req, -EIO);
 			continue;
 		}
 	}
-- 
GitLab


From 9e31bebee2d8b5f8abe9677f2a29b90cba848657 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:08 +0900
Subject: [PATCH 0998/2198] amiflop: dequeue in-flight request

Request processing in amiflop is done sequentially in
redo_fd_request() proper and redo_fd_request() can easily be converted
to track in-flight request.  Remove CURRENT, track in-flight request
directly and dequeue it when processing starts.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/amiflop.c | 48 ++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index e4a14b94828ea..80a68b2e04515 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -112,8 +112,6 @@ module_param(fd_def_df0, ulong, 0);
 MODULE_LICENSE("GPL");
 
 static struct request_queue *floppy_queue;
-#define QUEUE (floppy_queue)
-#define CURRENT elv_next_request(floppy_queue)
 
 /*
  *  Macros
@@ -1335,59 +1333,61 @@ static int get_track(int drive, int track)
 
 static void redo_fd_request(void)
 {
+	struct request *rq;
 	unsigned int cnt, block, track, sector;
 	int drive;
 	struct amiga_floppy_struct *floppy;
 	char *data;
 	unsigned long flags;
+	int err;
 
- repeat:
-	if (!CURRENT) {
+next_req:
+	rq = elv_next_request(floppy_queue);
+	if (!rq) {
 		/* Nothing left to do */
 		return;
 	}
+	blkdev_dequeue_request(rq);
 
-	floppy = CURRENT->rq_disk->private_data;
+	floppy = rq->rq_disk->private_data;
 	drive = floppy - unit;
 
+next_segment:
 	/* Here someone could investigate to be more efficient */
-	for (cnt = 0; cnt < blk_rq_cur_sectors(CURRENT); cnt++) {
+	for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) {
 #ifdef DEBUG
 		printk("fd: sector %ld + %d requested for %s\n",
-		       blk_rq_pos(CURRENT), cnt,
-		       (rq_data_dir(CURRENT) == READ) ? "read" : "write");
+		       blk_rq_pos(rq), cnt,
+		       (rq_data_dir(rq) == READ) ? "read" : "write");
 #endif
-		block = blk_rq_pos(CURRENT) + cnt;
+		block = blk_rq_pos(rq) + cnt;
 		if ((int)block > floppy->blocks) {
-			__blk_end_request_cur(CURRENT, -EIO);
-			goto repeat;
+			err = -EIO;
+			break;
 		}
 
 		track = block / (floppy->dtype->sects * floppy->type->sect_mult);
 		sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
-		data = CURRENT->buffer + 512 * cnt;
+		data = rq->buffer + 512 * cnt;
 #ifdef DEBUG
 		printk("access to track %d, sector %d, with buffer at "
 		       "0x%08lx\n", track, sector, data);
 #endif
 
 		if (get_track(drive, track) == -1) {
-			__blk_end_request_cur(CURRENT, -EIO);
-			goto repeat;
+			err = -EIO;
+			break;
 		}
 
-		switch (rq_data_dir(CURRENT)) {
-		case READ:
+		if (rq_data_dir(rq) == READ) {
 			memcpy(data, floppy->trackbuf + sector * 512, 512);
-			break;
-
-		case WRITE:
+		} else {
 			memcpy(floppy->trackbuf + sector * 512, data, 512);
 
 			/* keep the drive spinning while writes are scheduled */
 			if (!fd_motor_on(drive)) {
-				__blk_end_request_cur(CURRENT, -EIO);
-				goto repeat;
+				err = -EIO;
+				break;
 			}
 			/*
 			 * setup a callback to write the track buffer
@@ -1399,12 +1399,12 @@ static void redo_fd_request(void)
 		        /* reset the timer */
 			mod_timer (flush_track_timer + drive, jiffies + 1);
 			local_irq_restore(flags);
-			break;
 		}
 	}
 
-	__blk_end_request_cur(CURRENT, 0);
-	goto repeat;
+	if (__blk_end_request_cur(rq, err))
+		goto next_segment;
+	goto next_req;
 }
 
 static void do_fd_request(struct request_queue * q)
-- 
GitLab


From 06b0608e2b46465e8e663214e7db982ddb000346 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:09 +0900
Subject: [PATCH 0999/2198] swim: dequeue in-flight request

swim processes requests one-by-one synchronously and can easily be
converted to dequeuing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Laurent Vivier <Laurent@lvivier.info>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/swim.c | 47 ++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index fc6a1c3229337..dedd4893f5ea5 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -514,7 +514,7 @@ static int floppy_read_sectors(struct floppy_state *fs,
 			ret = swim_read_sector(fs, side, track, sector,
 						buffer);
 			if (try-- == 0)
-				return -1;
+				return -EIO;
 		} while (ret != 512);
 
 		buffer += ret;
@@ -528,38 +528,37 @@ static void redo_fd_request(struct request_queue *q)
 	struct request *req;
 	struct floppy_state *fs;
 
-	while ((req = elv_next_request(q))) {
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
+		int err = -EIO;
 
 		fs = req->rq_disk->private_data;
-		if (blk_rq_pos(req) >= fs->total_secs) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
-		if (!fs->disk_in) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
-		if (rq_data_dir(req) == WRITE) {
-			if (fs->write_protected) {
-				__blk_end_request_cur(req, -EIO);
-				continue;
-			}
-		}
+		if (blk_rq_pos(req) >= fs->total_secs)
+			goto done;
+		if (!fs->disk_in)
+			goto done;
+		if (rq_data_dir(req) == WRITE && fs->write_protected)
+			goto done;
+
 		switch (rq_data_dir(req)) {
 		case WRITE:
 			/* NOT IMPLEMENTED */
-			__blk_end_request_cur(req, -EIO);
 			break;
 		case READ:
-			if (floppy_read_sectors(fs, blk_rq_pos(req),
-						blk_rq_cur_sectors(req),
-						req->buffer)) {
-				__blk_end_request_cur(req, -EIO);
-				continue;
-			}
-			__blk_end_request_cur(req, 0);
+			err = floppy_read_sectors(fs, blk_rq_pos(req),
+						  blk_rq_cur_sectors(req),
+						  req->buffer);
 			break;
 		}
+	done:
+		if (!__blk_end_request_cur(req, err)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
GitLab


From bab2a807a489822ded0c9d4a5344c80bcac10b0a Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:10 +0900
Subject: [PATCH 1000/2198] xd: dequeue in-flight request

xd processes requests one-by-one synchronously and can be easily
converted to dequeueing model.  Convert it.

While at it, use rq_cur_bytes instead of rq_bytes when checking for
sector overflow.  This is for for consistency and better behavior for
merged requests.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/xd.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 4ef88018bcde1..d4c4352354b55 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -305,26 +305,31 @@ static void do_xd_request (struct request_queue * q)
 	if (xdc_busy)
 		return;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
 		unsigned block = blk_rq_pos(req);
-		unsigned count = blk_rq_sectors(req);
+		unsigned count = blk_rq_cur_sectors(req);
 		XD_INFO *disk = req->rq_disk->private_data;
-		int res = 0;
+		int res = -EIO;
 		int retry;
 
-		if (!blk_fs_request(req)) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
-		if (block + count > get_capacity(req->rq_disk)) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
+		if (!blk_fs_request(req))
+			goto done;
+		if (block + count > get_capacity(req->rq_disk))
+			goto done;
 		for (retry = 0; (retry < XD_RETRIES) && !res; retry++)
 			res = xd_readwrite(rq_data_dir(req), disk, req->buffer,
 					   block, count);
+	done:
 		/* wrap up, 0 = success, -errno = fail */
-		__blk_end_request_cur(req, res);
+		if (!__blk_end_request_cur(req, res)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
GitLab


From 1498ada7a8e80afe324f2b8d86158925d0096ec9 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:11 +0900
Subject: [PATCH 1001/2198] mtd_blkdevs: dequeue in-flight request

mtd_blkdevs processes requests one-by-one synchronously from a kthread
and can be easily converted to dequeueing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Woodhouse <dwmw2@infradead.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/mtd/mtd_blkdevs.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 50c76a2ca76e3..3e10442615d17 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -89,18 +89,22 @@ static int mtd_blktrans_thread(void *arg)
 {
 	struct mtd_blktrans_ops *tr = arg;
 	struct request_queue *rq = tr->blkcore_priv->rq;
+	struct request *req = NULL;
 
 	/* we might get involved when memory gets low, so use PF_MEMALLOC */
 	current->flags |= PF_MEMALLOC;
 
 	spin_lock_irq(rq->queue_lock);
+
 	while (!kthread_should_stop()) {
-		struct request *req;
 		struct mtd_blktrans_dev *dev;
 		int res;
 
-		req = elv_next_request(rq);
-
+		if (!req) {
+			req = elv_next_request(rq);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 		if (!req) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(rq->queue_lock);
@@ -120,8 +124,13 @@ static int mtd_blktrans_thread(void *arg)
 
 		spin_lock_irq(rq->queue_lock);
 
-		__blk_end_request_cur(req, res);
+		if (!__blk_end_request_cur(req, res))
+			req = NULL;
 	}
+
+	if (req)
+		__blk_end_request_all(req, -EIO);
+
 	spin_unlock_irq(rq->queue_lock);
 
 	return 0;
-- 
GitLab


From 6b0bf407b586b6ba8e060ad9979cb2bc3370b7eb Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:12 +0900
Subject: [PATCH 1002/2198] jsflash: dequeue in-flight request

jsflash processes requests one-by-one synchronously from a kthread and
can be easily converted to dequeueing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/sbus/char/jsflash.c | 28 +++++++++++++++++-----------
 1 file changed, 17 insertions(+), 11 deletions(-)

diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index d56ddaa770361..f572a4a1d1419 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -186,31 +186,37 @@ static void jsfd_do_request(struct request_queue *q)
 {
 	struct request *req;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
 		unsigned long offset = blk_rq_pos(req) << 9;
 		size_t len = blk_rq_cur_bytes(req);
+		int err = -EIO;
 
-		if ((offset + len) > jdp->dsize) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
+		if ((offset + len) > jdp->dsize)
+			goto end;
 
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_ERR "jsfd: write\n");
-			__blk_end_request_cur(req, -EIO);
-			continue;
+			goto end;
 		}
 
 		if ((jdp->dbase & 0xff000000) != 0x20000000) {
 			printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase);
-			__blk_end_request_cur(req, -EIO);
-			continue;
+			goto end;
 		}
 
 		jsfd_read(req->buffer, jdp->dbase + offset, len);
-
-		__blk_end_request_cur(req, 0);
+		err = 0;
+	end:
+		if (!__blk_end_request_cur(req, err)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
GitLab


From fb3ac7f6b811eac8e0aafa3df1c16ed872e898a8 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:13 +0900
Subject: [PATCH 1003/2198] z2ram: dequeue in-flight request

z2ram processes requests one-by-one synchronously and can be easily
converted to dequeueing model.  Convert it.

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/z2ram.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 6a1383834ecff..c909c1a3f6507 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -70,15 +70,21 @@ static struct gendisk *z2ram_gendisk;
 static void do_z2_request(struct request_queue *q)
 {
 	struct request *req;
-	while ((req = elv_next_request(q)) != NULL) {
+
+	req = elv_next_request(q);
+	if (req)
+		blkdev_dequeue_request(req);
+
+	while (req) {
 		unsigned long start = blk_rq_pos(req) << 9;
 		unsigned long len  = blk_rq_cur_bytes(req);
+		int err = 0;
 
 		if (start + len > z2ram_size) {
 			printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
 				blk_rq_pos(req), blk_rq_cur_sectors(req));
-			__blk_end_request_cur(req, -EIO);
-			continue;
+			err = -EIO;
+			goto done;
 		}
 		while (len) {
 			unsigned long addr = start & Z2RAM_CHUNKMASK;
@@ -93,7 +99,12 @@ static void do_z2_request(struct request_queue *q)
 			start += size;
 			len -= size;
 		}
-		__blk_end_request_cur(req, 0);
+	done:
+		if (!__blk_end_request_cur(req, err)) {
+			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 	}
 }
 
-- 
GitLab


From 296b2f6ae654581adc27f0d6f0af454c7f3d06ee Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:15 +0900
Subject: [PATCH 1004/2198] block: convert to dequeueing model (easy ones)

plat-omap/mailbox, floppy, viocd, mspro_block, i2o_block and
mmc/card/queue are already pretty close to dequeueing model and can be
converted with simple changes.  Convert them.

While at it,

* xen-blkfront: !fs check moved downwards to share dequeue call with
  normal path.

* mspro_block: __blk_end_request(..., blk_rq_cur_byte()) converted to
  __blk_end_request_cur()

* mmc/card/queue: loop of __blk_end_request() converted to
  __blk_end_request_all()

[ Impact: dequeue in-flight request ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/arm/plat-omap/mailbox.c        |  9 +++++++++
 drivers/block/floppy.c              |  4 +++-
 drivers/block/xen-blkfront.c        | 13 +++++++------
 drivers/cdrom/viocd.c               |  2 ++
 drivers/memstick/core/mspro_block.c |  8 +++++---
 drivers/message/i2o/i2o_block.c     |  6 ++++--
 drivers/mmc/card/queue.c            | 12 ++++++------
 7 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index 538ba7541d3f6..7a1f5c25fd17c 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -198,6 +198,8 @@ static void mbox_tx_work(struct work_struct *work)
 
 		spin_lock(q->queue_lock);
 		rq = elv_next_request(q);
+		if (rq)
+			blkdev_dequeue_request(rq);
 		spin_unlock(q->queue_lock);
 
 		if (!rq)
@@ -208,6 +210,9 @@ static void mbox_tx_work(struct work_struct *work)
 		ret = __mbox_msg_send(mbox, tx_data->msg, tx_data->arg);
 		if (ret) {
 			enable_mbox_irq(mbox, IRQ_TX);
+			spin_lock(q->queue_lock);
+			blk_requeue_request(q, rq);
+			spin_unlock(q->queue_lock);
 			return;
 		}
 
@@ -238,6 +243,8 @@ static void mbox_rx_work(struct work_struct *work)
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
 		rq = elv_next_request(q);
+		if (rq)
+			blkdev_dequeue_request(rq);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 		if (!rq)
 			break;
@@ -345,6 +352,8 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
 		rq = elv_next_request(q);
+		if (rq)
+			blkdev_dequeue_request(rq);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 
 		if (!rq)
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 1e27ed9208b45..e2c70d2085ae1 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -931,7 +931,7 @@ static inline void unlock_fdc(void)
 	del_timer(&fd_timeout);
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (elv_next_request(floppy_queue))
+	if (current_req || elv_next_request(floppy_queue))
 		do_fd_request(floppy_queue);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
@@ -2913,6 +2913,8 @@ static void redo_fd_request(void)
 
 			spin_lock_irq(floppy_queue->queue_lock);
 			req = elv_next_request(floppy_queue);
+			if (req)
+				blkdev_dequeue_request(req);
 			spin_unlock_irq(floppy_queue->queue_lock);
 			if (!req) {
 				do_floppy = NULL;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 91fc56597e9b7..66f834571b88c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -301,22 +301,23 @@ static void do_blkif_request(struct request_queue *rq)
 
 	while ((req = elv_next_request(rq)) != NULL) {
 		info = req->rq_disk->private_data;
-		if (!blk_fs_request(req)) {
-			__blk_end_request_cur(req, -EIO);
-			continue;
-		}
 
 		if (RING_FULL(&info->ring))
 			goto wait;
 
+		blkdev_dequeue_request(req);
+
+		if (!blk_fs_request(req)) {
+			__blk_end_request_all(req, -EIO);
+			continue;
+		}
+
 		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
 			 "(%u/%u) buffer:%p [%s]\n",
 			 req, req->cmd, (unsigned long)blk_rq_pos(req),
 			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
 			 req->buffer, rq_data_dir(req) ? "write" : "read");
 
-
-		blkdev_dequeue_request(req);
 		if (blkif_queue_request(req)) {
 			blk_requeue_request(rq, req);
 wait:
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index 6e190a93d8dfc..bbe9f08673475 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -298,6 +298,8 @@ static void do_viocd_request(struct request_queue *q)
 	struct request *req;
 
 	while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) {
+		blkdev_dequeue_request(req);
+
 		if (!blk_fs_request(req))
 			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 93b2c61856566..58f5be8cd69e0 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -672,8 +672,7 @@ static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
 					       msb->req_sg);
 
 		if (!msb->seg_count) {
-			chunk = __blk_end_request(msb->block_req, -ENOMEM,
-					blk_rq_cur_bytes(msb->block_req));
+			chunk = __blk_end_request_cur(msb->block_req, -ENOMEM);
 			continue;
 		}
 
@@ -711,6 +710,7 @@ static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
 		dev_dbg(&card->dev, "issue end\n");
 		return -EAGAIN;
 	}
+	blkdev_dequeue_request(msb->block_req);
 
 	dev_dbg(&card->dev, "trying again\n");
 	chunk = 1;
@@ -825,8 +825,10 @@ static void mspro_block_submit_req(struct request_queue *q)
 		return;
 
 	if (msb->eject) {
-		while ((req = elv_next_request(q)) != NULL)
+		while ((req = elv_next_request(q)) != NULL) {
+			blkdev_dequeue_request(req);
 			__blk_end_request_all(req, -ENODEV);
+		}
 
 		return;
 	}
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index e153f5d5237de..8b5cbfc3ba97d 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -916,8 +916,10 @@ static void i2o_block_request_fn(struct request_queue *q)
 				blk_stop_queue(q);
 				break;
 			}
-		} else
-			__blk_end_request_cur(req, -EIO);
+		} else {
+			blkdev_dequeue_request(req);
+			__blk_end_request_all(req, -EIO);
+		}
 	}
 };
 
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 7a72e75d5c674..4b70f1e28347c 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -54,8 +54,11 @@ static int mmc_queue_thread(void *d)
 
 		spin_lock_irq(q->queue_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!blk_queue_plugged(q))
+		if (!blk_queue_plugged(q)) {
 			req = elv_next_request(q);
+			if (req)
+				blkdev_dequeue_request(req);
+		}
 		mq->req = req;
 		spin_unlock_irq(q->queue_lock);
 
@@ -88,15 +91,12 @@ static void mmc_request(struct request_queue *q)
 {
 	struct mmc_queue *mq = q->queuedata;
 	struct request *req;
-	int ret;
 
 	if (!mq) {
 		printk(KERN_ERR "MMC: killing requests for dead queue\n");
 		while ((req = elv_next_request(q)) != NULL) {
-			do {
-				ret = __blk_end_request(req, -EIO,
-							blk_rq_cur_bytes(req));
-			} while (ret);
+			blkdev_dequeue_request(req);
+			__blk_end_request_all(req, -EIO);
 		}
 		return;
 	}
-- 
GitLab


From 2343046826a8ca426b07601d9593ee046c298b68 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:14 +0900
Subject: [PATCH 1005/2198] gdrom: dequeue in-flight request

gdrom already dequeues and fully completes requests on normal path and
the error paths can be easily converted to do so too.  Clean it up and
dequeue requests on error paths too.

While at it remove superflous blk_fs_request() && !blk_rq_sectors()
condition check.

[ Impact: dequeue in-flight request, cleanup ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/cdrom/gdrom.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 488423cab51af..3cc02bfe828dd 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -638,33 +638,31 @@ static void gdrom_readdisk_dma(struct work_struct *work)
 	kfree(read_command);
 }
 
-static void gdrom_request_handler_dma(struct request *req)
-{
-	/* dequeue, add to list of deferred work
-	* and then schedule workqueue */
-	blkdev_dequeue_request(req);
-	list_add_tail(&req->queuelist, &gdrom_deferred);
-	schedule_work(&work);
-}
-
 static void gdrom_request(struct request_queue *rq)
 {
 	struct request *req;
 
 	while ((req = elv_next_request(rq)) != NULL) {
+		blkdev_dequeue_request(req);
+
 		if (!blk_fs_request(req)) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
-			__blk_end_request_cur(req, -EIO);
+			__blk_end_request_all(req, -EIO);
+			continue;
 		}
 		if (rq_data_dir(req) != READ) {
 			printk(KERN_NOTICE "GDROM: Read only device -");
 			printk(" write request ignored\n");
-			__blk_end_request_cur(req, -EIO);
+			__blk_end_request_all(req, -EIO);
+			continue;
 		}
-		if (blk_rq_sectors(req))
-			gdrom_request_handler_dma(req);
-		else
-			__blk_end_request_cur(req, -EIO);
+
+		/*
+		 * Add to list of deferred work and then schedule
+		 * workqueue.
+		 */
+		list_add_tail(&req->queuelist, &gdrom_deferred);
+		schedule_work(&work);
 	}
 }
 
-- 
GitLab


From 9934c8c04561413609d2bc38c6b9f268cba774a4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 8 May 2009 11:54:16 +0900
Subject: [PATCH 1006/2198] block: implement and enforce request
 peek/start/fetch

Till now block layer allowed two separate modes of request execution.
A request is always acquired from the request queue via
elv_next_request().  After that, drivers are free to either dequeue it
or process it without dequeueing.  Dequeue allows elv_next_request()
to return the next request so that multiple requests can be in flight.

Executing requests without dequeueing has its merits mostly in
allowing drivers for simpler devices which can't do sg to deal with
segments only without considering request boundary.  However, the
benefit this brings is dubious and declining while the cost of the API
ambiguity is increasing.  Segment based drivers are usually for very
old or limited devices and as converting to dequeueing model isn't
difficult, it doesn't justify the API overhead it puts on block layer
and its more modern users.

Previous patches converted all block low level drivers to dequeueing
model.  This patch completes the API transition by...

* renaming elv_next_request() to blk_peek_request()

* renaming blkdev_dequeue_request() to blk_start_request()

* adding blk_fetch_request() which is combination of peek and start

* disallowing completion of queued (not started) requests

* applying new API to all LLDs

Renamings are for consistency and to break out of tree code so that
it's apparent that out of tree drivers need updating.

[ Impact: block request issue API cleanup, no functional change ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Mike Miller <mike.miller@hp.com>
Cc: unsik Kim <donari75@gmail.com>
Cc: Paul Clements <paul.clements@steeleye.com>
Cc: Tim Waugh <tim@cyberelk.net>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Laurent Vivier <Laurent@lvivier.info>
Cc: Jeff Garzik <jgarzik@pobox.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: Adrian McMenamin <adrian@mcmen.demon.co.uk>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Borislav Petkov <petkovbb@googlemail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Alex Dubov <oakad@yahoo.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Markus Lidel <Markus.Lidel@shadowconnect.com>
Cc: Stefan Weinhuber <wein@de.ibm.com>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/arm/plat-omap/mailbox.c        |  12 +---
 arch/um/drivers/ubd_kern.c          |   3 +-
 block/blk-barrier.c                 |   4 +-
 block/blk-core.c                    | 105 ++++++++++++++++++++--------
 block/blk-tag.c                     |   2 +-
 block/blk.h                         |   1 +
 drivers/block/DAC960.c              |   4 +-
 drivers/block/amiflop.c             |   3 +-
 drivers/block/ataflop.c             |   3 +-
 drivers/block/cciss.c               |   4 +-
 drivers/block/cpqarray.c            |   4 +-
 drivers/block/floppy.c              |   6 +-
 drivers/block/hd.c                  |   3 +-
 drivers/block/mg_disk.c             |  12 ++--
 drivers/block/nbd.c                 |   4 +-
 drivers/block/paride/pcd.c          |   3 +-
 drivers/block/paride/pd.c           |   7 +-
 drivers/block/paride/pf.c           |   3 +-
 drivers/block/ps3disk.c             |   4 +-
 drivers/block/sunvdc.c              |   3 +-
 drivers/block/swim.c                |  12 +---
 drivers/block/swim3.c               |   3 +-
 drivers/block/sx8.c                 |   8 +--
 drivers/block/ub.c                  |   8 +--
 drivers/block/viodasd.c             |   4 +-
 drivers/block/virtio_blk.c          |   4 +-
 drivers/block/xd.c                  |  12 +---
 drivers/block/xen-blkfront.c        |   4 +-
 drivers/block/xsysace.c             |  10 ++-
 drivers/block/z2ram.c               |  12 +---
 drivers/cdrom/gdrom.c               |   4 +-
 drivers/cdrom/viocd.c               |   4 +-
 drivers/ide/ide-atapi.c             |   2 +-
 drivers/ide/ide-io.c                |   9 +--
 drivers/memstick/core/mspro_block.c |   9 +--
 drivers/message/i2o/i2o_block.c     |   6 +-
 drivers/mmc/card/queue.c            |  11 +--
 drivers/mtd/mtd_blkdevs.c           |   7 +-
 drivers/s390/block/dasd.c           |  16 ++---
 drivers/s390/char/tape_block.c      |   7 +-
 drivers/sbus/char/jsflash.c         |  12 +---
 drivers/scsi/scsi_lib.c             |  10 +--
 drivers/scsi/scsi_transport_sas.c   |   4 +-
 include/linux/blkdev.h              |   9 ++-
 include/linux/elevator.h            |   2 -
 45 files changed, 172 insertions(+), 207 deletions(-)

diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c
index 7a1f5c25fd17c..40424edae9391 100644
--- a/arch/arm/plat-omap/mailbox.c
+++ b/arch/arm/plat-omap/mailbox.c
@@ -197,9 +197,7 @@ static void mbox_tx_work(struct work_struct *work)
 		struct omap_msg_tx_data *tx_data;
 
 		spin_lock(q->queue_lock);
-		rq = elv_next_request(q);
-		if (rq)
-			blkdev_dequeue_request(rq);
+		rq = blk_fetch_request(q);
 		spin_unlock(q->queue_lock);
 
 		if (!rq)
@@ -242,9 +240,7 @@ static void mbox_rx_work(struct work_struct *work)
 
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
-		rq = elv_next_request(q);
-		if (rq)
-			blkdev_dequeue_request(rq);
+		rq = blk_fetch_request(q);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 		if (!rq)
 			break;
@@ -351,9 +347,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf)
 
 	while (1) {
 		spin_lock_irqsave(q->queue_lock, flags);
-		rq = elv_next_request(q);
-		if (rq)
-			blkdev_dequeue_request(rq);
+		rq = blk_fetch_request(q);
 		spin_unlock_irqrestore(q->queue_lock, flags);
 
 		if (!rq)
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 402ba8f70fc90..aa9e926e13d73 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1228,12 +1228,11 @@ static void do_ubd_request(struct request_queue *q)
 	while(1){
 		struct ubd *dev = q->queuedata;
 		if(dev->end_sg == 0){
-			struct request *req = elv_next_request(q);
+			struct request *req = blk_fetch_request(q);
 			if(req == NULL)
 				return;
 
 			dev->request = req;
-			blkdev_dequeue_request(req);
 			dev->start_sg = 0;
 			dev->end_sg = blk_rq_map_sg(q, req, dev->sg);
 		}
diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 8713c2fbc4f64..0ab81a0a75023 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -180,7 +180,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	}
 
 	/* stash away the original request */
-	elv_dequeue_request(q, rq);
+	blk_dequeue_request(rq);
 	q->orig_bar_rq = rq;
 	rq = NULL;
 
@@ -248,7 +248,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp)
 			 * Queue ordering not supported.  Terminate
 			 * with prejudice.
 			 */
-			elv_dequeue_request(q, rq);
+			blk_dequeue_request(rq);
 			__blk_end_request_all(rq, -EOPNOTSUPP);
 			*rqp = NULL;
 			return false;
diff --git a/block/blk-core.c b/block/blk-core.c
index 6226a380fb6d5..93691d2ac5a07 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -902,6 +902,8 @@ EXPORT_SYMBOL(blk_get_request);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
+	BUG_ON(blk_queued_rq(rq));
+
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
@@ -1610,28 +1612,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
 }
 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
 
-/**
- * blkdev_dequeue_request - dequeue request and start timeout timer
- * @req: request to dequeue
- *
- * Dequeue @req and start timeout timer on it.  This hands off the
- * request to the driver.
- *
- * Block internal functions which don't want to start timer should
- * call elv_dequeue_request().
- */
-void blkdev_dequeue_request(struct request *req)
-{
-	elv_dequeue_request(req->q, req);
-
-	/*
-	 * We are now handing the request to the hardware, add the
-	 * timeout handler.
-	 */
-	blk_add_timer(req);
-}
-EXPORT_SYMBOL(blkdev_dequeue_request);
-
 static void blk_account_io_completion(struct request *req, unsigned int bytes)
 {
 	if (blk_do_io_stat(req)) {
@@ -1671,7 +1651,23 @@ static void blk_account_io_done(struct request *req)
 	}
 }
 
-struct request *elv_next_request(struct request_queue *q)
+/**
+ * blk_peek_request - peek at the top of a request queue
+ * @q: request queue to peek at
+ *
+ * Description:
+ *     Return the request at the top of @q.  The returned request
+ *     should be started using blk_start_request() before LLD starts
+ *     processing it.
+ *
+ * Return:
+ *     Pointer to the request at the top of @q if available.  Null
+ *     otherwise.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+struct request *blk_peek_request(struct request_queue *q)
 {
 	struct request *rq;
 	int ret;
@@ -1748,10 +1744,12 @@ struct request *elv_next_request(struct request_queue *q)
 
 	return rq;
 }
-EXPORT_SYMBOL(elv_next_request);
+EXPORT_SYMBOL(blk_peek_request);
 
-void elv_dequeue_request(struct request_queue *q, struct request *rq)
+void blk_dequeue_request(struct request *rq)
 {
+	struct request_queue *q = rq->q;
+
 	BUG_ON(list_empty(&rq->queuelist));
 	BUG_ON(ELV_ON_HASH(rq));
 
@@ -1766,6 +1764,58 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq)
 		q->in_flight++;
 }
 
+/**
+ * blk_start_request - start request processing on the driver
+ * @req: request to dequeue
+ *
+ * Description:
+ *     Dequeue @req and start timeout timer on it.  This hands off the
+ *     request to the driver.
+ *
+ *     Block internal functions which don't want to start timer should
+ *     call blk_dequeue_request().
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+void blk_start_request(struct request *req)
+{
+	blk_dequeue_request(req);
+
+	/*
+	 * We are now handing the request to the hardware, add the
+	 * timeout handler.
+	 */
+	blk_add_timer(req);
+}
+EXPORT_SYMBOL(blk_start_request);
+
+/**
+ * blk_fetch_request - fetch a request from a request queue
+ * @q: request queue to fetch a request from
+ *
+ * Description:
+ *     Return the request at the top of @q.  The request is started on
+ *     return and LLD can start processing it immediately.
+ *
+ * Return:
+ *     Pointer to the request at the top of @q if available.  Null
+ *     otherwise.
+ *
+ * Context:
+ *     queue_lock must be held.
+ */
+struct request *blk_fetch_request(struct request_queue *q)
+{
+	struct request *rq;
+
+	rq = blk_peek_request(q);
+	if (rq)
+		blk_start_request(rq);
+	return rq;
+}
+EXPORT_SYMBOL(blk_fetch_request);
+
 /**
  * blk_update_request - Special helper function for request stacking drivers
  * @rq:	      the request being processed
@@ -1937,12 +1987,11 @@ static bool blk_update_bidi_request(struct request *rq, int error,
  */
 static void blk_finish_request(struct request *req, int error)
 {
+	BUG_ON(blk_queued_rq(req));
+
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
-	if (blk_queued_rq(req))
-		elv_dequeue_request(req->q, req);
-
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
 
diff --git a/block/blk-tag.c b/block/blk-tag.c
index 3c518e3303ae3..c260f7c30ddae 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -374,7 +374,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	rq->cmd_flags |= REQ_QUEUED;
 	rq->tag = tag;
 	bqt->tag_index[tag] = rq;
-	blkdev_dequeue_request(rq);
+	blk_start_request(rq);
 	list_add(&rq->queuelist, &q->tag_busy_list);
 	return 0;
 }
diff --git a/block/blk.h b/block/blk.h
index ab54529103c00..9e0042ca94959 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,7 @@ extern struct kobj_type blk_queue_ktype;
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
+void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 
 void blk_unplug_work(struct work_struct *work);
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 774ab05973a92..668dc234b8e22 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -3321,7 +3321,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_
 	DAC960_Command_T *Command;
 
    while(1) {
-	Request = elv_next_request(req_q);
+	Request = blk_peek_request(req_q);
 	if (!Request)
 		return 1;
 
@@ -3341,7 +3341,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_
 	Command->BlockNumber = blk_rq_pos(Request);
 	Command->BlockCount = blk_rq_sectors(Request);
 	Command->Request = Request;
-	blkdev_dequeue_request(Request);
+	blk_start_request(Request);
 	Command->SegmentCount = blk_rq_map_sg(req_q,
 		  Command->Request, Command->cmd_sglist);
 	/* pci_map_sg MAY change the value of SegCount */
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 80a68b2e04515..9c6e5b0fe894f 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1342,12 +1342,11 @@ static void redo_fd_request(void)
 	int err;
 
 next_req:
-	rq = elv_next_request(floppy_queue);
+	rq = blk_fetch_request(floppy_queue);
 	if (!rq) {
 		/* Nothing left to do */
 		return;
 	}
-	blkdev_dequeue_request(rq);
 
 	floppy = rq->rq_disk->private_data;
 	drive = floppy - unit;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 89a591d9c83b6..f5e7180d7f47d 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1404,10 +1404,9 @@ static void redo_fd_request(void)
 
 repeat:
 	if (!fd_request) {
-		fd_request = elv_next_request(floppy_queue);
+		fd_request = blk_fetch_request(floppy_queue);
 		if (!fd_request)
 			goto the_end;
-		blkdev_dequeue_request(fd_request);
 	}
 
 	floppy = fd_request->rq_disk->private_data;
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index ab7b04c0db70a..e714e7cce6f27 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2801,7 +2801,7 @@ static void do_cciss_request(struct request_queue *q)
 		goto startio;
 
       queue:
-	creq = elv_next_request(q);
+	creq = blk_peek_request(q);
 	if (!creq)
 		goto startio;
 
@@ -2810,7 +2810,7 @@ static void do_cciss_request(struct request_queue *q)
 	if ((c = cmd_alloc(h, 1)) == NULL)
 		goto full;
 
-	blkdev_dequeue_request(creq);
+	blk_start_request(creq);
 
 	spin_unlock_irq(q->queue_lock);
 
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index a5caeff4718e1..a02dcfc00f134 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -903,7 +903,7 @@ static void do_ida_request(struct request_queue *q)
 		goto startio;
 
 queue_next:
-	creq = elv_next_request(q);
+	creq = blk_peek_request(q);
 	if (!creq)
 		goto startio;
 
@@ -912,7 +912,7 @@ static void do_ida_request(struct request_queue *q)
 	if ((c = cmd_alloc(h,1)) == NULL)
 		goto startio;
 
-	blkdev_dequeue_request(creq);
+	blk_start_request(creq);
 
 	c->ctlr = h->ctlr;
 	c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv;
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index e2c70d2085ae1..90877fee0ee00 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -931,7 +931,7 @@ static inline void unlock_fdc(void)
 	del_timer(&fd_timeout);
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || elv_next_request(floppy_queue))
+	if (current_req || blk_peek_request(floppy_queue))
 		do_fd_request(floppy_queue);
 	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
@@ -2912,9 +2912,7 @@ static void redo_fd_request(void)
 			struct request *req;
 
 			spin_lock_irq(floppy_queue->queue_lock);
-			req = elv_next_request(floppy_queue);
-			if (req)
-				blkdev_dequeue_request(req);
+			req = blk_fetch_request(floppy_queue);
 			spin_unlock_irq(floppy_queue->queue_lock);
 			if (!req) {
 				do_floppy = NULL;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 288ab63c1029f..961de56d00a94 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -592,12 +592,11 @@ static void hd_request(void)
 	del_timer(&device_timer);
 
 	if (!hd_req) {
-		hd_req = elv_next_request(hd_queue);
+		hd_req = blk_fetch_request(hd_queue);
 		if (!hd_req) {
 			do_hd = NULL;
 			return;
 		}
-		blkdev_dequeue_request(hd_req);
 	}
 	req = hd_req;
 
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index 1ca5d1423fa3a..c0cd0a03f6985 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -671,10 +671,8 @@ static void mg_request_poll(struct request_queue *q)
 
 	while (1) {
 		if (!host->req) {
-			host->req = elv_next_request(q);
-			if (host->req)
-				blkdev_dequeue_request(host->req);
-			else
+			host->req = blk_fetch_request(q);
+			if (!host->req)
 				break;
 		}
 
@@ -744,10 +742,8 @@ static void mg_request(struct request_queue *q)
 
 	while (1) {
 		if (!host->req) {
-			host->req = elv_next_request(q);
-			if (host->req)
-				blkdev_dequeue_request(host->req);
-			else
+			host->req = blk_fetch_request(q);
+			if (!host->req)
 				break;
 		}
 		req = host->req;
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index fad167de23b44..5d23ffad7c77f 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -533,11 +533,9 @@ static void do_nbd_request(struct request_queue *q)
 {
 	struct request *req;
 	
-	while ((req = elv_next_request(q)) != NULL) {
+	while ((req = blk_fetch_request(q)) != NULL) {
 		struct nbd_device *lo;
 
-		blkdev_dequeue_request(req);
-
 		spin_unlock_irq(q->queue_lock);
 
 		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 425f81586a311..911dfd98d813c 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -720,10 +720,9 @@ static void do_pcd_request(struct request_queue * q)
 		return;
 	while (1) {
 		if (!pcd_req) {
-			pcd_req = elv_next_request(q);
+			pcd_req = blk_fetch_request(q);
 			if (!pcd_req)
 				return;
-			blkdev_dequeue_request(pcd_req);
 		}
 
 		if (rq_data_dir(pcd_req) == READ) {
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index d2ca3f5520610..bf5955b3d8735 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -412,11 +412,9 @@ static void run_fsm(void)
 				spin_lock_irqsave(&pd_lock, saved_flags);
 				if (!__blk_end_request_cur(pd_req,
 						res == Ok ? 0 : -EIO)) {
-					pd_req = elv_next_request(pd_queue);
+					pd_req = blk_fetch_request(pd_queue);
 					if (!pd_req)
 						stop = 1;
-					else
-						blkdev_dequeue_request(pd_req);
 				}
 				spin_unlock_irqrestore(&pd_lock, saved_flags);
 				if (stop)
@@ -706,10 +704,9 @@ static void do_pd_request(struct request_queue * q)
 {
 	if (pd_req)
 		return;
-	pd_req = elv_next_request(q);
+	pd_req = blk_fetch_request(q);
 	if (!pd_req)
 		return;
-	blkdev_dequeue_request(pd_req);
 
 	schedule_fsm();
 }
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index d6f7bd84ed392..68a90834e9938 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -762,10 +762,9 @@ static void do_pf_request(struct request_queue * q)
 		return;
 repeat:
 	if (!pf_req) {
-		pf_req = elv_next_request(q);
+		pf_req = blk_fetch_request(q);
 		if (!pf_req)
 			return;
-		blkdev_dequeue_request(pf_req);
 	}
 
 	pf_current = pf_req->rq_disk->private_data;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index f4d8db944e7d7..338cee4cc0ba4 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -194,9 +194,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev,
 
 	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
 
-	while ((req = elv_next_request(q))) {
-		blkdev_dequeue_request(req);
-
+	while ((req = blk_fetch_request(q))) {
 		if (blk_fs_request(req)) {
 			if (ps3disk_submit_request_sg(dev, req))
 				break;
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 9f351bfa15ea8..cbfd9c0aef034 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -441,12 +441,11 @@ static int __send_request(struct request *req)
 static void do_vdc_request(struct request_queue *q)
 {
 	while (1) {
-		struct request *req = elv_next_request(q);
+		struct request *req = blk_fetch_request(q);
 
 		if (!req)
 			break;
 
-		blkdev_dequeue_request(req);
 		if (__send_request(req) < 0)
 			__blk_end_request_all(req, -EIO);
 	}
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index dedd4893f5ea5..cf7877fb8a7d7 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -528,10 +528,7 @@ static void redo_fd_request(struct request_queue *q)
 	struct request *req;
 	struct floppy_state *fs;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		int err = -EIO;
 
@@ -554,11 +551,8 @@ static void redo_fd_request(struct request_queue *q)
 			break;
 		}
 	done:
-		if (!__blk_end_request_cur(req, err)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, err))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index f48c6dd47e04d..80df93e3cdd05 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -326,10 +326,9 @@ static void start_request(struct floppy_state *fs)
 	}
 	while (fs->state == idle) {
 		if (!fd_req) {
-			fd_req = elv_next_request(swim3_queue);
+			fd_req = blk_fetch_request(swim3_queue);
 			if (!fd_req)
 				break;
-			blkdev_dequeue_request(fd_req);
 		}
 		req = fd_req;
 #if 0
diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c
index 087c94c8b2dab..da403b6a7f434 100644
--- a/drivers/block/sx8.c
+++ b/drivers/block/sx8.c
@@ -810,12 +810,10 @@ static void carm_oob_rq_fn(struct request_queue *q)
 
 	while (1) {
 		DPRINTK("get req\n");
-		rq = elv_next_request(q);
+		rq = blk_fetch_request(q);
 		if (!rq)
 			break;
 
-		blkdev_dequeue_request(rq);
-
 		crq = rq->special;
 		assert(crq != NULL);
 		assert(crq->rq == rq);
@@ -846,7 +844,7 @@ static void carm_rq_fn(struct request_queue *q)
 
 queue_one_request:
 	VPRINTK("get req\n");
-	rq = elv_next_request(q);
+	rq = blk_peek_request(q);
 	if (!rq)
 		return;
 
@@ -857,7 +855,7 @@ static void carm_rq_fn(struct request_queue *q)
 	}
 	crq->rq = rq;
 
-	blkdev_dequeue_request(rq);
+	blk_start_request(rq);
 
 	if (rq_data_dir(rq) == WRITE) {
 		writing = 1;
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 40d03cf63f2ec..178f459a50ed8 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -627,7 +627,7 @@ static void ub_request_fn(struct request_queue *q)
 	struct ub_lun *lun = q->queuedata;
 	struct request *rq;
 
-	while ((rq = elv_next_request(q)) != NULL) {
+	while ((rq = blk_peek_request(q)) != NULL) {
 		if (ub_request_fn_1(lun, rq) != 0) {
 			blk_stop_queue(q);
 			break;
@@ -643,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 	int n_elem;
 
 	if (atomic_read(&sc->poison)) {
-		blkdev_dequeue_request(rq);
+		blk_start_request(rq);
 		ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq));
 		return 0;
 	}
 
 	if (lun->changed && !blk_pc_request(rq)) {
-		blkdev_dequeue_request(rq);
+		blk_start_request(rq);
 		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq));
 		return 0;
 	}
@@ -660,7 +660,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 		return -1;
 	memset(cmd, 0, sizeof(struct ub_scsi_cmd));
 
-	blkdev_dequeue_request(rq);
+	blk_start_request(rq);
 
 	urq = &lun->urq;
 	memset(urq, 0, sizeof(struct ub_request));
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index 2086cb12d3ec7..390d69bb7c482 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -361,11 +361,9 @@ static void do_viodasd_request(struct request_queue *q)
 	 * back later.
 	 */
 	while (num_req_outstanding < VIOMAXREQ) {
-		req = elv_next_request(q);
+		req = blk_fetch_request(q);
 		if (req == NULL)
 			return;
-		/* dequeue the current request from the queue */
-		blkdev_dequeue_request(req);
 		/* check that request contains a valid command */
 		if (!blk_fs_request(req)) {
 			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1980ab4563569..29a9daf48621f 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -128,7 +128,7 @@ static void do_virtblk_request(struct request_queue *q)
 	struct request *req;
 	unsigned int issued = 0;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	while ((req = blk_peek_request(q)) != NULL) {
 		vblk = req->rq_disk->private_data;
 		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
@@ -138,7 +138,7 @@ static void do_virtblk_request(struct request_queue *q)
 			blk_stop_queue(q);
 			break;
 		}
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 		issued++;
 	}
 
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index d4c4352354b55..ce24292199255 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -305,10 +305,7 @@ static void do_xd_request (struct request_queue * q)
 	if (xdc_busy)
 		return;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		unsigned block = blk_rq_pos(req);
 		unsigned count = blk_rq_cur_sectors(req);
@@ -325,11 +322,8 @@ static void do_xd_request (struct request_queue * q)
 					   block, count);
 	done:
 		/* wrap up, 0 = success, -errno = fail */
-		if (!__blk_end_request_cur(req, res)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, res))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 66f834571b88c..6d4ac76c2806a 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -299,13 +299,13 @@ static void do_blkif_request(struct request_queue *rq)
 
 	queued = 0;
 
-	while ((req = elv_next_request(rq)) != NULL) {
+	while ((req = blk_peek_request(rq)) != NULL) {
 		info = req->rq_disk->private_data;
 
 		if (RING_FULL(&info->ring))
 			goto wait;
 
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 
 		if (!blk_fs_request(req)) {
 			__blk_end_request_all(req, -EIO);
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index edf137b6c3794..3a4397edab71a 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -463,10 +463,10 @@ struct request *ace_get_next_request(struct request_queue * q)
 {
 	struct request *req;
 
-	while ((req = elv_next_request(q)) != NULL) {
+	while ((req = blk_peek_request(q)) != NULL) {
 		if (blk_fs_request(req))
 			break;
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 		__blk_end_request_all(req, -EIO);
 	}
 	return req;
@@ -498,10 +498,8 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			__blk_end_request_all(ace->req, -EIO);
 			ace->req = NULL;
 		}
-		while ((req = elv_next_request(ace->queue)) != NULL) {
-			blkdev_dequeue_request(req);
+		while ((req = blk_fetch_request(ace->queue)) != NULL)
 			__blk_end_request_all(req, -EIO);
-		}
 
 		/* Drop back to IDLE state and notify waiters */
 		ace->fsm_state = ACE_FSM_STATE_IDLE;
@@ -649,7 +647,7 @@ static void ace_fsm_dostate(struct ace_device *ace)
 			ace->fsm_state = ACE_FSM_STATE_IDLE;
 			break;
 		}
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 
 		/* Okay, it's a data request, set it up for transfer */
 		dev_dbg(ace->dev,
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index c909c1a3f6507..4575171e5beb1 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -71,10 +71,7 @@ static void do_z2_request(struct request_queue *q)
 {
 	struct request *req;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		unsigned long start = blk_rq_pos(req) << 9;
 		unsigned long len  = blk_rq_cur_bytes(req);
@@ -100,11 +97,8 @@ static void do_z2_request(struct request_queue *q)
 			len -= size;
 		}
 	done:
-		if (!__blk_end_request_cur(req, err)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, err))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 3cc02bfe828dd..1e366ad8f6806 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -642,9 +642,7 @@ static void gdrom_request(struct request_queue *rq)
 {
 	struct request *req;
 
-	while ((req = elv_next_request(rq)) != NULL) {
-		blkdev_dequeue_request(req);
-
+	while ((req = blk_fetch_request(rq)) != NULL) {
 		if (!blk_fs_request(req)) {
 			printk(KERN_DEBUG "GDROM: Non-fs request ignored\n");
 			__blk_end_request_all(req, -EIO);
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index bbe9f08673475..ca741c21e4aaa 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -297,9 +297,7 @@ static void do_viocd_request(struct request_queue *q)
 {
 	struct request *req;
 
-	while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) {
-		blkdev_dequeue_request(req);
-
+	while ((rwreq == 0) && ((req = blk_fetch_request(q)) != NULL)) {
 		if (!blk_fs_request(req))
 			__blk_end_request_all(req, -EIO);
 		else if (send_request(req) < 0) {
diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c
index 2874c3d703a92..8a894fa37b530 100644
--- a/drivers/ide/ide-atapi.c
+++ b/drivers/ide/ide-atapi.c
@@ -269,7 +269,7 @@ void ide_retry_pc(ide_drive_t *drive)
 	blk_requeue_request(failed_rq->q, failed_rq);
 	drive->hwif->rq = NULL;
 	if (ide_queue_sense_rq(drive, pc)) {
-		blkdev_dequeue_request(failed_rq);
+		blk_start_request(failed_rq);
 		ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq));
 	}
 }
diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c
index abda7337b3f49..e4e3a0e3201e5 100644
--- a/drivers/ide/ide-io.c
+++ b/drivers/ide/ide-io.c
@@ -519,11 +519,8 @@ void do_ide_request(struct request_queue *q)
 		 * we know that the queue isn't empty, but this can happen
 		 * if the q->prep_rq_fn() decides to kill a request
 		 */
-		if (!rq) {
-			rq = elv_next_request(drive->queue);
-			if (rq)
-				blkdev_dequeue_request(rq);
-		}
+		if (!rq)
+			rq = blk_fetch_request(drive->queue);
 
 		spin_unlock_irq(q->queue_lock);
 		spin_lock_irq(&hwif->lock);
@@ -536,7 +533,7 @@ void do_ide_request(struct request_queue *q)
 		/*
 		 * Sanity: don't accept a request that isn't a PM request
 		 * if we are currently power managed. This is very important as
-		 * blk_stop_queue() doesn't prevent the elv_next_request()
+		 * blk_stop_queue() doesn't prevent the blk_fetch_request()
 		 * above to return us whatever is in the queue. Since we call
 		 * ide_do_request() ourselves, we end up taking requests while
 		 * the queue is blocked...
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 58f5be8cd69e0..c0bebc6a2f2cc 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -704,13 +704,12 @@ static int mspro_block_issue_req(struct memstick_dev *card, int chunk)
 		return 0;
 	}
 
-	dev_dbg(&card->dev, "elv_next\n");
-	msb->block_req = elv_next_request(msb->queue);
+	dev_dbg(&card->dev, "blk_fetch\n");
+	msb->block_req = blk_fetch_request(msb->queue);
 	if (!msb->block_req) {
 		dev_dbg(&card->dev, "issue end\n");
 		return -EAGAIN;
 	}
-	blkdev_dequeue_request(msb->block_req);
 
 	dev_dbg(&card->dev, "trying again\n");
 	chunk = 1;
@@ -825,10 +824,8 @@ static void mspro_block_submit_req(struct request_queue *q)
 		return;
 
 	if (msb->eject) {
-		while ((req = elv_next_request(q)) != NULL) {
-			blkdev_dequeue_request(req);
+		while ((req = blk_fetch_request(q)) != NULL)
 			__blk_end_request_all(req, -ENODEV);
-		}
 
 		return;
 	}
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 8b5cbfc3ba97d..6573ef4408f1d 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -877,7 +877,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 	struct request *req;
 
 	while (!blk_queue_plugged(q)) {
-		req = elv_next_request(q);
+		req = blk_peek_request(q);
 		if (!req)
 			break;
 
@@ -890,7 +890,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 
 			if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) {
 				if (!i2o_block_transfer(req)) {
-					blkdev_dequeue_request(req);
+					blk_start_request(req);
 					continue;
 				} else
 					osm_info("transfer error\n");
@@ -917,7 +917,7 @@ static void i2o_block_request_fn(struct request_queue *q)
 				break;
 			}
 		} else {
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 			__blk_end_request_all(req, -EIO);
 		}
 	}
diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c
index 4b70f1e28347c..49e582356c65a 100644
--- a/drivers/mmc/card/queue.c
+++ b/drivers/mmc/card/queue.c
@@ -54,11 +54,8 @@ static int mmc_queue_thread(void *d)
 
 		spin_lock_irq(q->queue_lock);
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (!blk_queue_plugged(q)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!blk_queue_plugged(q))
+			req = blk_fetch_request(q);
 		mq->req = req;
 		spin_unlock_irq(q->queue_lock);
 
@@ -94,10 +91,8 @@ static void mmc_request(struct request_queue *q)
 
 	if (!mq) {
 		printk(KERN_ERR "MMC: killing requests for dead queue\n");
-		while ((req = elv_next_request(q)) != NULL) {
-			blkdev_dequeue_request(req);
+		while ((req = blk_fetch_request(q)) != NULL)
 			__blk_end_request_all(req, -EIO);
-		}
 		return;
 	}
 
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 3e10442615d17..502622f628bc0 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -100,12 +100,7 @@ static int mtd_blktrans_thread(void *arg)
 		struct mtd_blktrans_dev *dev;
 		int res;
 
-		if (!req) {
-			req = elv_next_request(rq);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
-		if (!req) {
+		if (!req && !(req = blk_fetch_request(rq))) {
 			set_current_state(TASK_INTERRUPTIBLE);
 			spin_unlock_irq(rq->queue_lock);
 			schedule();
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index 7df03c7aea0d3..e64f62d5e0fc5 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1656,17 +1656,13 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 	if (basedev->state < DASD_STATE_READY)
 		return;
 	/* Now we try to fetch requests from the request queue */
-	while (!blk_queue_plugged(queue) &&
-	       elv_next_request(queue)) {
-
-		req = elv_next_request(queue);
-
+	while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) {
 		if (basedev->features & DASD_FEATURE_READONLY &&
 		    rq_data_dir(req) == WRITE) {
 			DBF_DEV_EVENT(DBF_ERR, basedev,
 				      "Rejecting write request %p",
 				      req);
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
@@ -1695,7 +1691,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 				      "CCW creation failed (rc=%ld) "
 				      "on request %p",
 				      PTR_ERR(cqr), req);
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 			__blk_end_request_all(req, -EIO);
 			continue;
 		}
@@ -1705,7 +1701,7 @@ static void __dasd_process_request_queue(struct dasd_block *block)
 		 */
 		cqr->callback_data = (void *) req;
 		cqr->status = DASD_CQR_FILLED;
-		blkdev_dequeue_request(req);
+		blk_start_request(req);
 		list_add_tail(&cqr->blocklist, &block->ccw_queue);
 		dasd_profile_start(block, cqr, req);
 	}
@@ -2029,10 +2025,8 @@ static void dasd_flush_request_queue(struct dasd_block *block)
 		return;
 
 	spin_lock_irq(&block->request_queue_lock);
-	while ((req = elv_next_request(block->request_queue))) {
-		blkdev_dequeue_request(req);
+	while ((req = blk_fetch_request(block->request_queue)))
 		__blk_end_request_all(req, -EIO);
-	}
 	spin_unlock_irq(&block->request_queue_lock);
 }
 
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 5d035e4939dc0..1e79676759805 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -93,7 +93,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data)
 		device->blk_data.block_position = -1;
 	device->discipline->free_bread(ccw_req);
 	if (!list_empty(&device->req_queue) ||
-	    elv_next_request(device->blk_data.request_queue))
+	    blk_peek_request(device->blk_data.request_queue))
 		tapeblock_trigger_requeue(device);
 }
 
@@ -162,19 +162,16 @@ tapeblock_requeue(struct work_struct *work) {
 	spin_lock_irq(&device->blk_data.request_queue_lock);
 	while (
 		!blk_queue_plugged(queue) &&
-		elv_next_request(queue)   &&
+		(req = blk_fetch_request(queue)) &&
 		nr_queued < TAPEBLOCK_MIN_REQUEUE
 	) {
-		req = elv_next_request(queue);
 		if (rq_data_dir(req) == WRITE) {
 			DBF_EVENT(1, "TBLOCK: Rejecting write request\n");
-			blkdev_dequeue_request(req);
 			spin_unlock_irq(&device->blk_data.request_queue_lock);
 			blk_end_request_all(req, -EIO);
 			spin_lock_irq(&device->blk_data.request_queue_lock);
 			continue;
 		}
-		blkdev_dequeue_request(req);
 		nr_queued++;
 		spin_unlock_irq(&device->blk_data.request_queue_lock);
 		rc = tapeblock_start_request(device, req);
diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c
index f572a4a1d1419..6d46516846884 100644
--- a/drivers/sbus/char/jsflash.c
+++ b/drivers/sbus/char/jsflash.c
@@ -186,10 +186,7 @@ static void jsfd_do_request(struct request_queue *q)
 {
 	struct request *req;
 
-	req = elv_next_request(q);
-	if (req)
-		blkdev_dequeue_request(req);
-
+	req = blk_fetch_request(q);
 	while (req) {
 		struct jsfd_part *jdp = req->rq_disk->private_data;
 		unsigned long offset = blk_rq_pos(req) << 9;
@@ -212,11 +209,8 @@ static void jsfd_do_request(struct request_queue *q)
 		jsfd_read(req->buffer, jdp->dbase + offset, len);
 		err = 0;
 	end:
-		if (!__blk_end_request_cur(req, err)) {
-			req = elv_next_request(q);
-			if (req)
-				blkdev_dequeue_request(req);
-		}
+		if (!__blk_end_request_cur(req, err))
+			req = blk_fetch_request(q);
 	}
 }
 
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index ee308f6f7982c..b12750f82169f 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1207,7 +1207,7 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret)
 		break;
 	case BLKPREP_DEFER:
 		/*
-		 * If we defer, the elv_next_request() returns NULL, but the
+		 * If we defer, the blk_peek_request() returns NULL, but the
 		 * queue must be restarted, so we plug here if no returning
 		 * command will automatically do that.
 		 */
@@ -1385,7 +1385,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
 	struct scsi_target *starget = scsi_target(sdev);
 	struct Scsi_Host *shost = sdev->host;
 
-	blkdev_dequeue_request(req);
+	blk_start_request(req);
 
 	if (unlikely(cmd == NULL)) {
 		printk(KERN_CRIT "impossible request in %s.\n",
@@ -1477,7 +1477,7 @@ static void scsi_request_fn(struct request_queue *q)
 
 	if (!sdev) {
 		printk("scsi: killing requests for dead queue\n");
-		while ((req = elv_next_request(q)) != NULL)
+		while ((req = blk_peek_request(q)) != NULL)
 			scsi_kill_request(req, q);
 		return;
 	}
@@ -1498,7 +1498,7 @@ static void scsi_request_fn(struct request_queue *q)
 		 * that the request is fully prepared even if we cannot 
 		 * accept it.
 		 */
-		req = elv_next_request(q);
+		req = blk_peek_request(q);
 		if (!req || !scsi_dev_queue_ready(q, sdev))
 			break;
 
@@ -1514,7 +1514,7 @@ static void scsi_request_fn(struct request_queue *q)
 		 * Remove the request from the request list.
 		 */
 		if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req)))
-			blkdev_dequeue_request(req);
+			blk_start_request(req);
 		sdev->device_busy++;
 
 		spin_unlock(q->queue_lock);
diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c
index 50988cbf7b2df..d606452297cf3 100644
--- a/drivers/scsi/scsi_transport_sas.c
+++ b/drivers/scsi/scsi_transport_sas.c
@@ -163,12 +163,10 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost,
 	int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *);
 
 	while (!blk_queue_plugged(q)) {
-		req = elv_next_request(q);
+		req = blk_fetch_request(q);
 		if (!req)
 			break;
 
-		blkdev_dequeue_request(req);
-
 		spin_unlock_irq(q->queue_lock);
 
 		handler = to_sas_internal(shost->transportt)->f->smp_handler;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index c75580345700f..6e59d3b92ff2a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -818,8 +818,6 @@ static inline void blk_run_address_space(struct address_space *mapping)
 		blk_run_backing_dev(mapping->backing_dev_info, NULL);
 }
 
-extern void blkdev_dequeue_request(struct request *req);
-
 /*
  * blk_rq_pos()		: the current sector
  * blk_rq_bytes()	: bytes left in the entire request
@@ -852,6 +850,13 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 	return blk_rq_cur_bytes(rq) >> 9;
 }
 
+/*
+ * Request issue related functions.
+ */
+extern struct request *blk_peek_request(struct request_queue *q);
+extern void blk_start_request(struct request *rq);
+extern struct request *blk_fetch_request(struct request_queue *q);
+
 /*
  * Request completion related functions.
  *
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 4e462878c9ca3..1cb3372e65d89 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -103,10 +103,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *);
 extern void elv_merge_requests(struct request_queue *, struct request *,
 			       struct request *);
 extern void elv_merged_request(struct request_queue *, struct request *, int);
-extern void elv_dequeue_request(struct request_queue *, struct request *);
 extern void elv_requeue_request(struct request_queue *, struct request *);
 extern int elv_queue_empty(struct request_queue *);
-extern struct request *elv_next_request(struct request_queue *q);
 extern struct request *elv_former_request(struct request_queue *, struct request *);
 extern struct request *elv_latter_request(struct request_queue *, struct request *);
 extern int elv_register_queue(struct request_queue *q);
-- 
GitLab


From 7bce6c2740fab36708233e998a9e53115649b193 Mon Sep 17 00:00:00 2001
From: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Date: Mon, 11 May 2009 06:51:28 +0000
Subject: [PATCH 1007/2198] sh: sh7785lcr: fix I2C device address map for
 32-bit mode

This fixes up the broken I2C offset in 32-bit mode.
The cause is because the board datasheet had a mistake.

Signed-off-by: Yoshihiro Shimoda <shimoda.yoshihiro@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/mach-common/mach/sh7785lcr.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/sh/include/mach-common/mach/sh7785lcr.h b/arch/sh/include/mach-common/mach/sh7785lcr.h
index 1ce27d5c74918..90011d435f304 100644
--- a/arch/sh/include/mach-common/mach/sh7785lcr.h
+++ b/arch/sh/include/mach-common/mach/sh7785lcr.h
@@ -9,11 +9,11 @@
  * -----------------------------+---------------+---------------
  * 0x00000000 - 0x03ffffff(CS0)	| NOR Flash	| NOR Flash
  * 0x04000000 - 0x05ffffff(CS1)	| PLD		| PLD
- * 0x06000000 - 0x07ffffff(CS1)	| reserved	| I2C
+ * 0x06000000 - 0x07ffffff(CS1)	| I2C		| I2C
  * 0x08000000 - 0x0bffffff(CS2)	| USB		| DDR SDRAM
  * 0x0c000000 - 0x0fffffff(CS3)	| SD		| DDR SDRAM
  * 0x10000000 - 0x13ffffff(CS4)	| SM107		| SM107
- * 0x14000000 - 0x17ffffff(CS5)	| I2C		| USB
+ * 0x14000000 - 0x17ffffff(CS5)	| reserved	| USB
  * 0x18000000 - 0x1bffffff(CS6)	| reserved	| SD
  * 0x40000000 - 0x5fffffff	| DDR SDRAM	| (cannot use)
  *
@@ -32,6 +32,9 @@
 #define PLD_VERSR		(PLD_BASE_ADDR + 0x0c)
 #define PLD_MMSR		(PLD_BASE_ADDR + 0x0e)
 
+#define PCA9564_ADDR		0x06000000	/* I2C */
+#define PCA9564_SIZE		0x00000100
+
 #define SM107_MEM_ADDR		0x10000000
 #define SM107_MEM_SIZE		0x00e00000
 #define SM107_REG_ADDR		0x13e00000
@@ -40,16 +43,13 @@
 #if defined(CONFIG_SH_SH7785LCR_29BIT_PHYSMAPS)
 #define R8A66597_ADDR		0x14000000	/* USB */
 #define CG200_ADDR		0x18000000	/* SD */
-#define PCA9564_ADDR		0x06000000	/* I2C */
 #else
 #define R8A66597_ADDR		0x08000000
 #define CG200_ADDR		0x0c000000
-#define PCA9564_ADDR		0x14000000
 #endif
 
 #define R8A66597_SIZE		0x00000100
 #define CG200_SIZE		0x00010000
-#define PCA9564_SIZE		0x00000100
 
 #endif  /* __ASM_SH_RENESAS_SH7785LCR_H */
 
-- 
GitLab


From b9e0353fc85dab4ef5ebcef2bd09ebc4ce6d5a7b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:05:32 -0700
Subject: [PATCH 1008/2198] x86/acpi: remove irq-compression trick on 32-bit

We already have a per cpu vector on 32-bit via recent changes, and
don't need this trick any more (which trick obfuscates the real GSI
mappings and which only triggers on larger systems to begin with):

On 3 ioapic system (24 per ioapic) before patch I got:

ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71
IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 64 Mode:1 Active:1)
pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 64
ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67
IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 65 Mode:1 Active:1)
pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65
ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66
IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1)
pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65
IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 67 Mode:1 Active:1)
pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67
ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64
IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 68 Mode:1 Active:1)
pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68
pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67
pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68
pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65
pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 67
pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 68
pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 65

after the patch we get:

ACPI: PCI Interrupt Link [ILSB] enabled at IRQ 71
IOAPIC[2]: Set routing entry (10-23 -> 0xa9 -> IRQ 71 Mode:1 Active:1)
pci 0000:80:01.1: PCI INT A -> Link[ILSB] -> GSI 71 (level, low) -> IRQ 71
ACPI: PCI Interrupt Link [LE5B] enabled at IRQ 67
IOAPIC[2]: Set routing entry (10-19 -> 0xb1 -> IRQ 67 Mode:1 Active:1)
pci 0000:83:00.0: PCI INT B -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67
ACPI: PCI Interrupt Link [LE5A] enabled at IRQ 66
IOAPIC[2]: Set routing entry (10-18 -> 0xb9 -> IRQ 66 Mode:1 Active:1)
pci 0000:83:00.1: PCI INT A -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
ACPI: PCI Interrupt Link [LE5D] enabled at IRQ 65
IOAPIC[2]: Set routing entry (10-17 -> 0xc1 -> IRQ 65 Mode:1 Active:1)
pci 0000:84:00.0: PCI INT B -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65
ACPI: PCI Interrupt Link [LE5C] enabled at IRQ 64
IOAPIC[2]: Set routing entry (10-16 -> 0xc9 -> IRQ 64 Mode:1 Active:1)
pci 0000:84:00.1: PCI INT A -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64
pci 0000:87:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:87:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65
pci 0000:88:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64
pci 0000:88:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67
pci 0000:8b:00.0: PCI INT B -> Link[LE5A] -> GSI 66 (level, low) -> IRQ 66
pci 0000:8b:00.1: PCI INT A -> Link[LE5D] -> GSI 65 (level, low) -> IRQ 65
pci 0000:8c:00.0: PCI INT B -> Link[LE5C] -> GSI 64 (level, low) -> IRQ 64
pci 0000:8c:00.1: PCI INT A -> Link[LE5B] -> GSI 67 (level, low) -> IRQ 67

As it can be seen that GSIs now get mapped lineary.

[ Impact: simplify irq number mapping on bigger 32-bit systems ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C35C.7060207@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c | 65 ++++---------------------------------
 1 file changed, 7 insertions(+), 58 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 6ee96b5530f15..fb5e88262d207 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1162,22 +1162,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	int ioapic;
 	int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM	4096
-#define IRQ_COMPRESSION_START	64
-
-	static int pci_irq = IRQ_COMPRESSION_START;
-	/*
-	 * Mapping between Global System Interrupts, which
-	 * represent all possible interrupts, and IRQs
-	 * assigned to actual devices.
-	 */
-	static int gsi_to_irq[MAX_GSI_NUM];
-#else
 
 	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
 		return gsi;
-#endif
 
 	/* Don't set up the ACPI SCI because it's already set up */
 	if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,66 +1183,28 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 		gsi = ioapic_renumber_irq(ioapic, gsi);
 #endif
 
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
 		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
 		       ioapic_pin);
 		return gsi;
 	}
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
 	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
 			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
 		return gsi;
-#endif
 	}
-
 	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
-	/*
-	 * For GSI >= 64, use IRQ compression
-	 */
-	if ((gsi >= IRQ_COMPRESSION_START)
-	    && (triggering == ACPI_LEVEL_SENSITIVE)) {
-		/*
-		 * For PCI devices assign IRQs in order, avoiding gaps
-		 * due to unused I/O APIC pins.
-		 */
-		int irq = gsi;
-		if (gsi < MAX_GSI_NUM) {
-			/*
-			 * Retain the VIA chipset work-around (gsi > 15), but
-			 * avoid a problem where the 8254 timer (IRQ0) is setup
-			 * via an override (so it's not on pin 0 of the ioapic),
-			 * and at the same time, the pin 0 interrupt is a PCI
-			 * type.  The gsi > 15 test could cause these two pins
-			 * to be shared as IRQ0, and they are not shareable.
-			 * So test for this condition, and if necessary, avoid
-			 * the pin collision.
-			 */
-			gsi = pci_irq++;
-			/*
-			 * Don't assign IRQ used by ACPI SCI
-			 */
-			if (gsi == acpi_gbl_FADT.sci_interrupt)
-				gsi = pci_irq++;
-			gsi_to_irq[irq] = gsi;
-		} else {
-			printk(KERN_ERR "GSI %u is too high\n", gsi);
-			return gsi;
-		}
-	}
-#endif
 	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
 				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+
 	return gsi;
 }
 
-- 
GitLab


From ee214558c2e959781a406e76c5b34364da638e1d Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:07:07 -0700
Subject: [PATCH 1009/2198] x86: fix alloc_mptable()

Fix the conditions when we stop updating the mptable due to
running out of slots.

[ Impact: fix memory corruption / non-working update_mptable boot parameter ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C3BB.1000609@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 70fd7e414c154..cd2a41a7c45c3 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -870,24 +870,17 @@ static
 inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
 #endif /* CONFIG_X86_IO_APIC */
 
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
-		      int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 {
-	if (!mpc_new_phys) {
-		pr_info("No spare slots, try to append...take your risk, "
-			"new mpc_length %x\n", count);
-	} else {
-		if (count <= mpc_new_length)
-			pr_info("No spare slots, try to append..., "
-				"new mpc_length %x\n", count);
-		else {
-			pr_err("mpc_new_length %lx is too small\n",
-				mpc_new_length);
-			return -1;
-		}
+	int ret = 0;
+
+	if (!mpc_new_phys || count <= mpc_new_length) {
+		WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+		return -1;
 	}
 
-	return 0;
+	return ret;
 }
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +939,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 		} else {
 			struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
 			count += sizeof(struct mpc_intsrc);
-			if (!check_slot(mpc_new_phys, mpc_new_length, count))
+			if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
 				goto out;
 			assign_to_mpc_intsrc(&mp_irqs[i], m);
 			mpc->length = count;
-- 
GitLab


From a31f82057ce6f7ced578d64c07a72ccbdc7336e4 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:06:15 -0700
Subject: [PATCH 1010/2198] x86/acpi: call mp_config_acpi_gsi() in
 mp_register_gsi()

The patch to call mp_config_acpi_gsi() from the ACPI IRQ registration
code never got mainline because there were open discussions about it.

This call is needed to properly update the kernel's copy of the mptable,
when the update_mptable boot parameter is needed.

Now that the dust has settled with the APIC unification, and since there
were no objections when the patch was re-submitted, try this again.

[ Impact: fix the update_mptable boot parameter ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C387.7090103@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/mpspec.h |  9 -----
 arch/x86/kernel/acpi/boot.c   | 66 +++++++++++++++++++++--------------
 2 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3ea1f531f5324..c34961a45ec00 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -77,17 +77,8 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
 				 int active_high_low);
 extern int acpi_probe_gsi(void);
 #ifdef CONFIG_X86_IO_APIC
-extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-				u32 gsi, int triggering, int polarity);
 extern int mp_find_ioapic(int gsi);
 extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#else
-static inline int
-mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-		   u32 gsi, int triggering, int polarity)
-{
-	return 0;
-}
 #endif
 #else /* !CONFIG_ACPI: */
 static inline int acpi_probe_gsi(void)
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index fb5e88262d207..8019ecf66e955 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -33,6 +33,7 @@
 #include <linux/irq.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
+#include <linux/pci.h>
 
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -1158,6 +1159,44 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
+			int polarity)
+{
+#ifdef CONFIG_X86_MPPARSE
+	struct mpc_intsrc mp_irq;
+	struct pci_dev *pdev;
+	unsigned char number;
+	unsigned int devfn;
+	int ioapic;
+	u8 pin;
+
+	if (!acpi_ioapic)
+		return 0;
+	if (!dev)
+		return 0;
+	if (dev->bus != &pci_bus_type)
+		return 0;
+
+	pdev = to_pci_dev(dev);
+	number = pdev->bus->number;
+	devfn = pdev->devfn;
+	pin = pdev->pin;
+	/* print the entry should happen on mptable identically */
+	mp_irq.type = MP_INTSRC;
+	mp_irq.irqtype = mp_INT;
+	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+	mp_irq.srcbus = number;
+	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+	ioapic = mp_find_ioapic(gsi);
+	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+	save_mp_irq(&mp_irq);
+#endif
+	return 0;
+}
+
 int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
 	int ioapic;
@@ -1189,6 +1228,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 		       ioapic_pin);
 		return gsi;
 	}
+	mp_config_acpi_gsi(dev, gsi, triggering, polarity);
 
 	/*
 	 * Avoid pin reprogramming.  PRTs typically include entries
@@ -1208,32 +1248,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 	return gsi;
 }
 
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-			u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
-	struct mpc_intsrc mp_irq;
-	int ioapic;
-
-	if (!acpi_ioapic)
-		return 0;
-
-	/* print the entry should happen on mptable identically */
-	mp_irq.type = MP_INTSRC;
-	mp_irq.irqtype = mp_INT;
-	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
-				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-	mp_irq.srcbus = number;
-	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
-	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
-	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
-	save_mp_irq(&mp_irq);
-#endif
-	return 0;
-}
-
 /*
  * Parse IOAPIC related entries in MADT
  * returns 0 on success, < 0 on error
-- 
GitLab


From bdfe8ac153546537ed24de69610ea781a734f785 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:07:41 -0700
Subject: [PATCH 1011/2198] x86/acpi: move pin_programmed bit map to io_apic.c

Prepare to call setup_io_apic_routing() in pcibios_irq_enable()
also remove not needed member apic_id.

[ Impact: clean up, prepare for future change ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C3DD.3050104@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/acpi/boot.c    | 18 ++----------------
 arch/x86/kernel/apic/io_apic.c | 25 ++++++++++++++++++++++++-
 2 files changed, 26 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 8019ecf66e955..dcfbc3ab9e46e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -904,10 +904,8 @@ extern int es7000_plat;
 #endif
 
 static struct {
-	int apic_id;
 	int gsi_base;
 	int gsi_end;
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } mp_ioapic_routing[MAX_IO_APICS];
 
 int mp_find_ioapic(int gsi)
@@ -996,7 +994,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
 	 */
-	mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
 	mp_ioapic_routing[idx].gsi_base = gsi_base;
 	mp_ioapic_routing[idx].gsi_end = gsi_base +
 	    io_apic_get_redir_entries(idx);
@@ -1189,7 +1186,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
 	mp_irq.srcbus = number;
 	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
 	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
+	mp_irq.dstapic = mp_ioapics[ioapic].apicid;
 	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
 
 	save_mp_irq(&mp_irq);
@@ -1224,23 +1221,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-		       "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+		       "%d-%d\n", mp_ioapics[ioapic].apicid,
 		       ioapic_pin);
 		return gsi;
 	}
 	mp_config_acpi_gsi(dev, gsi, triggering, polarity);
 
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-		return gsi;
-	}
-	set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
 	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
 				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
 				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 21c30e1121ee9..e279ae3392858 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3922,7 +3922,7 @@ int __init io_apic_get_version(int ioapic)
 }
 #endif
 
-int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 				 int triggering, int polarity)
 {
 	struct irq_desc *desc;
@@ -3959,6 +3959,29 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 	return 0;
 }
 
+static struct {
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
+{
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapics[ioapic].apicid, pin);
+		return 0;
+	}
+	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
+					 triggering, polarity);
+}
 
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
-- 
GitLab


From e20c06fd6950265a899edd96a02dc2e6ae2d1ce5 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:08:22 -0700
Subject: [PATCH 1012/2198] x86/pci: add 4 more return parameters to
 IO_APIC_get_PCI_irq_vector()

To prepare those params for pcibios_irq_enable() to call setup_io_apic_routing().

[ Impact: extend function call API to prepare for new functionality ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A01C406.2040303@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/hw_irq.h     |   4 +-
 arch/x86/kernel/apic/io_apic.c    | 107 ++++++++++++++++--------------
 arch/x86/pci/irq.c                |  24 +++++--
 drivers/pci/hotplug/ibmphp_core.c |  56 ++++++++--------
 4 files changed, 112 insertions(+), 79 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b762ea49bd703..26a40ab701313 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -63,7 +63,9 @@ extern unsigned long io_apic_irqs;
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
+					int *ioapic, int *ioapic_pin,
+					int *trigger, int *polarity);
 extern void setup_ioapic_dest(void);
 
 extern void enable_IO_APIC(void);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index e279ae3392858..caf9dbdde0501 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -873,54 +873,6 @@ static int __init find_isa_irq_apic(int irq, int type)
 	return -1;
 }
 
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-	int apic, i, best_guess = -1;
-
-	apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
-		bus, slot, pin);
-	if (test_bit(bus, mp_bus_not_pci)) {
-		apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-		return -1;
-	}
-	for (i = 0; i < mp_irq_entries; i++) {
-		int lbus = mp_irqs[i].srcbus;
-
-		for (apic = 0; apic < nr_ioapics; apic++)
-			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
-			    mp_irqs[i].dstapic == MP_APIC_ALL)
-				break;
-
-		if (!test_bit(lbus, mp_bus_not_pci) &&
-		    !mp_irqs[i].irqtype &&
-		    (bus == lbus) &&
-		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
-			if (!(apic || IO_APIC_IRQ(irq)))
-				continue;
-
-			if (pin == (mp_irqs[i].srcbusirq & 3))
-				return irq;
-			/*
-			 * Use the first all-but-pin matching entry as a
-			 * best-guess fuzzy result for broken mptables.
-			 */
-			if (best_guess < 0)
-				best_guess = irq;
-		}
-	}
-	return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
 /*
  * EISA Edge/Level control register, ELCR
@@ -1139,6 +1091,65 @@ static int pin_2_irq(int idx, int apic, int pin)
 	return irq;
 }
 
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+				int *ioapic, int *ioapic_pin,
+				int *trigger, int *polarity)
+{
+	int apic, i, best_guess = -1;
+
+	apic_printk(APIC_DEBUG,
+		    "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+		    bus, slot, pin);
+	if (test_bit(bus, mp_bus_not_pci)) {
+		apic_printk(APIC_VERBOSE,
+			    "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+		return -1;
+	}
+	for (i = 0; i < mp_irq_entries; i++) {
+		int lbus = mp_irqs[i].srcbus;
+
+		for (apic = 0; apic < nr_ioapics; apic++)
+			if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+			    mp_irqs[i].dstapic == MP_APIC_ALL)
+				break;
+
+		if (!test_bit(lbus, mp_bus_not_pci) &&
+		    !mp_irqs[i].irqtype &&
+		    (bus == lbus) &&
+		    (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+			int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+			if (!(apic || IO_APIC_IRQ(irq)))
+				continue;
+
+			if (pin == (mp_irqs[i].srcbusirq & 3)) {
+				*ioapic = apic;
+				*ioapic_pin = mp_irqs[i].dstirq;
+				*trigger = irq_trigger(i);
+				*polarity = irq_polarity(i);
+				return irq;
+			}
+			/*
+			 * Use the first all-but-pin matching entry as a
+			 * best-guess fuzzy result for broken mptables.
+			 */
+			if (best_guess < 0) {
+				*ioapic = apic;
+				*ioapic_pin = mp_irqs[i].dstirq;
+				*trigger = irq_trigger(i);
+				*polarity = irq_polarity(i);
+				best_guess = irq;
+			}
+		}
+	}
+	return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
 void lock_vector_lock(void)
 {
 	/* Used to the online set of cpus does not change
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index fecbce6e7d7c2..a2f6bde9c4eb1 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1051,12 +1051,16 @@ static void __init pcibios_fixup_irqs(void)
 		 */
 		if (io_apic_assign_pci_irqs) {
 			int irq;
+			int ioapic = -1, ioapic_pin = -1;
+			int triggering, polarity;
 
 			/*
 			 * interrupt pins are numbered starting from 1
 			 */
 			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-				PCI_SLOT(dev->devfn), pin - 1);
+						PCI_SLOT(dev->devfn), pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 			/*
 			 * Busses behind bridges are typically not listed in the
 			 * MP-table.  In this case we have to look up the IRQ
@@ -1072,7 +1076,10 @@ static void __init pcibios_fixup_irqs(void)
 				pin = pci_swizzle_interrupt_pin(dev, pin);
 				bus = bridge->bus->number;
 				irq = IO_APIC_get_PCI_irq_vector(bus,
-						PCI_SLOT(bridge->devfn), pin - 1);
+						PCI_SLOT(bridge->devfn),
+						pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 				if (irq >= 0)
 					dev_warn(&dev->dev,
 						"using bridge %s INT %c to "
@@ -1221,8 +1228,14 @@ static int pirq_enable_irq(struct pci_dev *dev)
 
 		if (io_apic_assign_pci_irqs) {
 			int irq;
+			int ioapic = -1, ioapic_pin = -1;
+			int triggering, polarity;
 
-			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
+			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+						PCI_SLOT(dev->devfn),
+						pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 			/*
 			 * Busses behind bridges are typically not listed in the MP-table.
 			 * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1248,10 @@ static int pirq_enable_irq(struct pci_dev *dev)
 
 				pin = pci_swizzle_interrupt_pin(dev, pin);
 				irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
-						PCI_SLOT(bridge->devfn), pin - 1);
+						PCI_SLOT(bridge->devfn),
+						pin - 1,
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
 				if (irq >= 0)
 					dev_warn(&dev->dev, "using bridge %s "
 						 "INT %c to get IRQ %d\n",
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index dd18f857dfb04..ef53b05a411a2 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -153,45 +153,49 @@ int ibmphp_init_devno(struct slot **cur_slot)
 		return -1;
 	}
 	for (loop = 0; loop < len; loop++) {
-		if ((*cur_slot)->number == rtable->slots[loop].slot) {
-		if ((*cur_slot)->bus == rtable->slots[loop].bus) {
+		if ((*cur_slot)->number == rtable->slots[loop].slot &&
+		    (*cur_slot)->bus == rtable->slots[loop].bus) {
+			int ioapic = -1, ioapic_pin = -1;
+			int triggering, polarity;
+
 			(*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
 			for (i = 0; i < 4; i++)
 				(*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
-						(int) (*cur_slot)->device, i);
-
-				debug("(*cur_slot)->irq[0] = %x\n",
-						(*cur_slot)->irq[0]);
-				debug("(*cur_slot)->irq[1] = %x\n",
-						(*cur_slot)->irq[1]);
-				debug("(*cur_slot)->irq[2] = %x\n",
-						(*cur_slot)->irq[2]);
-				debug("(*cur_slot)->irq[3] = %x\n",
-						(*cur_slot)->irq[3]);
-
-				debug("rtable->exlusive_irqs = %x\n",
+						(int) (*cur_slot)->device, i.
+						&ioapic, &ioapic_pin,
+						&triggering, &polarity);
+
+			debug("(*cur_slot)->irq[0] = %x\n",
+					(*cur_slot)->irq[0]);
+			debug("(*cur_slot)->irq[1] = %x\n",
+					(*cur_slot)->irq[1]);
+			debug("(*cur_slot)->irq[2] = %x\n",
+					(*cur_slot)->irq[2]);
+			debug("(*cur_slot)->irq[3] = %x\n",
+					(*cur_slot)->irq[3]);
+
+			debug("rtable->exlusive_irqs = %x\n",
 					rtable->exclusive_irqs);
-				debug("rtable->slots[loop].irq[0].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[0].bitmap = %x\n",
 					rtable->slots[loop].irq[0].bitmap);
-				debug("rtable->slots[loop].irq[1].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[1].bitmap = %x\n",
 					rtable->slots[loop].irq[1].bitmap);
-				debug("rtable->slots[loop].irq[2].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[2].bitmap = %x\n",
 					rtable->slots[loop].irq[2].bitmap);
-				debug("rtable->slots[loop].irq[3].bitmap = %x\n",
+			debug("rtable->slots[loop].irq[3].bitmap = %x\n",
 					rtable->slots[loop].irq[3].bitmap);
 
-				debug("rtable->slots[loop].irq[0].link = %x\n",
+			debug("rtable->slots[loop].irq[0].link = %x\n",
 					rtable->slots[loop].irq[0].link);
-				debug("rtable->slots[loop].irq[1].link = %x\n",
+			debug("rtable->slots[loop].irq[1].link = %x\n",
 					rtable->slots[loop].irq[1].link);
-				debug("rtable->slots[loop].irq[2].link = %x\n",
+			debug("rtable->slots[loop].irq[2].link = %x\n",
 					rtable->slots[loop].irq[2].link);
-				debug("rtable->slots[loop].irq[3].link = %x\n",
+			debug("rtable->slots[loop].irq[3].link = %x\n",
 					rtable->slots[loop].irq[3].link);
-				debug("end of init_devno\n");
-				kfree(rtable);
-				return 0;
-			}
+			debug("end of init_devno\n");
+			kfree(rtable);
+			return 0;
 		}
 	}
 
-- 
GitLab


From 5ef2183768bb7d64b85eccbfa1537a61cbefa97c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:08:50 -0700
Subject: [PATCH 1013/2198] x86/acpi: move setup io apic routing out of
 CONFIG_ACPI scope

So we could set io apic routing when ACPI is not enabled.

[ Impact: prepare for new functionality ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A01C422.5070400@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/io_apic.h |   4 +-
 arch/x86/kernel/apic/io_apic.c | 122 ++++++++++++++++-----------------
 2 files changed, 63 insertions(+), 63 deletions(-)

diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 27bd2fdd00aec..6fd99f96eb0ad 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -154,10 +154,10 @@ extern int timer_through_8259;
 extern int io_apic_get_unique_id(int ioapic, int apic_id);
 extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
-				  int irq, int edge_level, int active_high_low);
 #endif /* CONFIG_ACPI */
 
+extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
+				  int irq, int edge_level, int active_high_low);
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index caf9dbdde0501..3a68daee0d995 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -3839,6 +3839,67 @@ int __init arch_probe_nr_irqs(void)
 }
 #endif
 
+static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
+{
+	struct irq_desc *desc;
+	struct irq_cfg *cfg;
+	int node;
+
+	if (!IO_APIC_IRQ(irq)) {
+		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+			ioapic);
+		return -EINVAL;
+	}
+
+	if (dev)
+		node = dev_to_node(dev);
+	else
+		node = cpu_to_node(boot_cpu_id);
+
+	desc = irq_to_desc_alloc_node(irq, node);
+	if (!desc) {
+		printk(KERN_INFO "can not get irq_desc %d\n", irq);
+		return 0;
+	}
+
+	/*
+	 * IRQs < 16 are already in the irq_2_pin[] map
+	 */
+	if (irq >= NR_IRQS_LEGACY) {
+		cfg = desc->chip_data;
+		add_pin_to_irq_node(cfg, node, ioapic, pin);
+	}
+
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
+
+	return 0;
+}
+
+static struct {
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
+int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
+				 int triggering, int polarity)
+{
+
+	/*
+	 * Avoid pin reprogramming.  PRTs typically include entries
+	 * with redundant pin->gsi mappings (but unique PCI devices);
+	 * we only program the IOAPIC on the first.
+	 */
+	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+		pr_debug("Pin %d-%d already programmed\n",
+			 mp_ioapics[ioapic].apicid, pin);
+		return 0;
+	}
+	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
+					 triggering, polarity);
+}
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -3933,67 +3994,6 @@ int __init io_apic_get_version(int ioapic)
 }
 #endif
 
-static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
-{
-	struct irq_desc *desc;
-	struct irq_cfg *cfg;
-	int node;
-
-	if (!IO_APIC_IRQ(irq)) {
-		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-			ioapic);
-		return -EINVAL;
-	}
-
-	if (dev)
-		node = dev_to_node(dev);
-	else
-		node = cpu_to_node(boot_cpu_id);
-
-	desc = irq_to_desc_alloc_node(irq, node);
-	if (!desc) {
-		printk(KERN_INFO "can not get irq_desc %d\n", irq);
-		return 0;
-	}
-
-	/*
-	 * IRQs < 16 are already in the irq_2_pin[] map
-	 */
-	if (irq >= NR_IRQS_LEGACY) {
-		cfg = desc->chip_data;
-		add_pin_to_irq_node(cfg, node, ioapic, pin);
-	}
-
-	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
-	return 0;
-}
-
-static struct {
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
-int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
-{
-
-	/*
-	 * Avoid pin reprogramming.  PRTs typically include entries
-	 * with redundant pin->gsi mappings (but unique PCI devices);
-	 * we only program the IOAPIC on the first.
-	 */
-	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		pr_debug("Pin %d-%d already programmed\n",
-			 mp_ioapics[ioapic].apicid, pin);
-		return 0;
-	}
-	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
-
-	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
-					 triggering, polarity);
-}
-
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
 	int i;
-- 
GitLab


From b9c61b70075c87a8612624736faf4a2de5b1ed30 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 6 May 2009 10:10:06 -0700
Subject: [PATCH 1014/2198] x86/pci: update pirq_enable_irq() to setup io apic
 routing

So we can set io apic routing only when enabling the device irq.

This is advantageous for IRQ descriptor allocation affinity: if we set up
the IO-APIC entry later, we have a chance to allocate the IRQ descriptor
later and know which device it is on and can set affinity accordingly.

[ Impact: standardize/enhance irq-enabling sequence for mptable irqs ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A01C46E.8000501@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 150 ++++++++++++++++-----------------
 arch/x86/pci/irq.c             |  84 +++++++-----------
 2 files changed, 104 insertions(+), 130 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 3a68daee0d995..5d5f4120c7436 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1480,9 +1480,13 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
 	ioapic_write_entry(apic_id, pin, entry);
 }
 
+static struct {
+	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
 static void __init setup_IO_APIC_irqs(void)
 {
-	int apic_id, pin, idx, irq;
+	int apic_id = 0, pin, idx, irq;
 	int notcon = 0;
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
@@ -1490,48 +1494,53 @@ static void __init setup_IO_APIC_irqs(void)
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-	for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-		for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
-			idx = find_irq_entry(apic_id, pin, mp_INT);
-			if (idx == -1) {
-				if (!notcon) {
-					notcon = 1;
-					apic_printk(APIC_VERBOSE,
-						KERN_DEBUG " %d-%d",
-						mp_ioapics[apic_id].apicid, pin);
-				} else
-					apic_printk(APIC_VERBOSE, " %d-%d",
-						mp_ioapics[apic_id].apicid, pin);
-				continue;
-			}
-			if (notcon) {
-				apic_printk(APIC_VERBOSE,
-					" (apicid-pin) not connected\n");
-				notcon = 0;
-			}
+#ifdef CONFIG_ACPI
+	if (!acpi_disabled && acpi_ioapic) {
+		apic_id = mp_find_ioapic(0);
+		if (apic_id < 0)
+			apic_id = 0;
+	}
+#endif
 
-			irq = pin_2_irq(idx, apic_id, pin);
+	for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+		idx = find_irq_entry(apic_id, pin, mp_INT);
+		if (idx == -1) {
+			if (!notcon) {
+				notcon = 1;
+				apic_printk(APIC_VERBOSE,
+					KERN_DEBUG " %d-%d",
+					mp_ioapics[apic_id].apicid, pin);
+			} else
+				apic_printk(APIC_VERBOSE, " %d-%d",
+					mp_ioapics[apic_id].apicid, pin);
+			continue;
+		}
+		if (notcon) {
+			apic_printk(APIC_VERBOSE,
+				" (apicid-pin) not connected\n");
+			notcon = 0;
+		}
 
-			/*
-			 * Skip the timer IRQ if there's a quirk handler
-			 * installed and if it returns 1:
-			 */
-			if (apic->multi_timer_check &&
-					apic->multi_timer_check(apic_id, irq))
-				continue;
+		irq = pin_2_irq(idx, apic_id, pin);
 
-			desc = irq_to_desc_alloc_node(irq, node);
-			if (!desc) {
-				printk(KERN_INFO "can not get irq_desc for %d\n", irq);
-				continue;
-			}
-			cfg = desc->chip_data;
-			add_pin_to_irq_node(cfg, node, apic_id, pin);
+		/*
+		 * Skip the timer IRQ if there's a quirk handler
+		 * installed and if it returns 1:
+		 */
+		if (apic->multi_timer_check &&
+				apic->multi_timer_check(apic_id, irq))
+			continue;
 
-			setup_IO_APIC_irq(apic_id, pin, irq, desc,
-					irq_trigger(idx), irq_polarity(idx));
+		desc = irq_to_desc_alloc_node(irq, node);
+		if (!desc) {
+			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+			continue;
 		}
+		cfg = desc->chip_data;
+		add_pin_to_irq_node(cfg, node, apic_id, pin);
+		set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+		setup_IO_APIC_irq(apic_id, pin, irq, desc,
+				irq_trigger(idx), irq_polarity(idx));
 	}
 
 	if (notcon)
@@ -3876,10 +3885,6 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in
 	return 0;
 }
 
-static struct {
-	DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
-} mp_ioapic_routing[MAX_IO_APICS];
-
 int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 				 int triggering, int polarity)
 {
@@ -4023,51 +4028,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 #ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
-	int pin, ioapic, irq, irq_entry;
+	int pin, ioapic = 0, irq, irq_entry;
 	struct irq_desc *desc;
-	struct irq_cfg *cfg;
 	const struct cpumask *mask;
 
 	if (skip_ioapic_setup == 1)
 		return;
 
-	for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-		for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-			irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-			if (irq_entry == -1)
-				continue;
-			irq = pin_2_irq(irq_entry, ioapic, pin);
-
-			/* setup_IO_APIC_irqs could fail to get vector for some device
-			 * when you have too many devices, because at that time only boot
-			 * cpu is online.
-			 */
-			desc = irq_to_desc(irq);
-			cfg = desc->chip_data;
-			if (!cfg->vector) {
-				setup_IO_APIC_irq(ioapic, pin, irq, desc,
-						  irq_trigger(irq_entry),
-						  irq_polarity(irq_entry));
-				continue;
+#ifdef CONFIG_ACPI
+	if (!acpi_disabled && acpi_ioapic) {
+		ioapic = mp_find_ioapic(0);
+		if (ioapic < 0)
+			ioapic = 0;
+	}
+#endif
 
-			}
+	for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+		irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+		if (irq_entry == -1)
+			continue;
+		irq = pin_2_irq(irq_entry, ioapic, pin);
 
-			/*
-			 * Honour affinities which have been set in early boot
-			 */
-			if (desc->status &
-			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-				mask = desc->affinity;
-			else
-				mask = apic->target_cpus();
+		desc = irq_to_desc(irq);
 
-			if (intr_remapping_enabled)
-				set_ir_ioapic_affinity_irq_desc(desc, mask);
-			else
-				set_ioapic_affinity_irq_desc(desc, mask);
-		}
+		/*
+		 * Honour affinities which have been set in early boot
+		 */
+		if (desc->status &
+		    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+			mask = desc->affinity;
+		else
+			mask = apic->target_cpus();
 
+		if (intr_remapping_enabled)
+			set_ir_ioapic_affinity_irq_desc(desc, mask);
+		else
+			set_ioapic_affinity_irq_desc(desc, mask);
 	}
+
 }
 #endif
 
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index a2f6bde9c4eb1..2f3e192615c0e 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
 		return 0;
 	}
 
+	if (io_apic_assign_pci_irqs)
+		return 0;
+
 	/* Find IRQ routing entry */
 
 	if (!pirq_table)
@@ -1039,63 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
 		pirq_penalty[dev->irq]++;
 	}
 
+	if (io_apic_assign_pci_irqs)
+		return;
+
 	dev = NULL;
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
 		pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
 		if (!pin)
 			continue;
 
-#ifdef CONFIG_X86_IO_APIC
-		/*
-		 * Recalculate IRQ numbers if we use the I/O APIC.
-		 */
-		if (io_apic_assign_pci_irqs) {
-			int irq;
-			int ioapic = -1, ioapic_pin = -1;
-			int triggering, polarity;
-
-			/*
-			 * interrupt pins are numbered starting from 1
-			 */
-			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-						PCI_SLOT(dev->devfn), pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
-			/*
-			 * Busses behind bridges are typically not listed in the
-			 * MP-table.  In this case we have to look up the IRQ
-			 * based on the parent bus, parent slot, and pin number.
-			 * The SMP code detects such bridged busses itself so we
-			 * should get into this branch reliably.
-			 */
-			if (irq < 0 && dev->bus->parent) {
-				/* go back to the bridge */
-				struct pci_dev *bridge = dev->bus->self;
-				int bus;
-
-				pin = pci_swizzle_interrupt_pin(dev, pin);
-				bus = bridge->bus->number;
-				irq = IO_APIC_get_PCI_irq_vector(bus,
-						PCI_SLOT(bridge->devfn),
-						pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
-				if (irq >= 0)
-					dev_warn(&dev->dev,
-						"using bridge %s INT %c to "
-							"get IRQ %d\n",
-						 pci_name(bridge),
-						 'A' + pin - 1, irq);
-			}
-			if (irq >= 0) {
-				dev_info(&dev->dev,
-					"PCI->APIC IRQ transform: INT %c "
-						"-> IRQ %d\n",
-					'A' + pin - 1, irq);
-				dev->irq = irq;
-			}
-		}
-#endif
 		/*
 		 * Still no IRQ? Try to lookup one...
 		 */
@@ -1190,6 +1145,19 @@ int __init pcibios_irq_init(void)
 	pcibios_enable_irq = pirq_enable_irq;
 
 	pcibios_fixup_irqs();
+
+	if (io_apic_assign_pci_irqs && pci_routeirq) {
+		struct pci_dev *dev = NULL;
+		/*
+		 * PCI IRQ routing is set up by pci_enable_device(), but we
+		 * also do it here in case there are still broken drivers that
+		 * don't use pci_enable_device().
+		 */
+		printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+		for_each_pci_dev(dev)
+			pirq_enable_irq(dev);
+	}
+
 	return 0;
 }
 
@@ -1220,13 +1188,17 @@ void pcibios_penalize_isa_irq(int irq, int active)
 static int pirq_enable_irq(struct pci_dev *dev)
 {
 	u8 pin;
-	struct pci_dev *temp_dev;
 
 	pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
-	if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+	if (pin && !pcibios_lookup_irq(dev, 1)) {
 		char *msg = "";
 
+		if (!io_apic_assign_pci_irqs && dev->irq)
+			return 0;
+
 		if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+			struct pci_dev *temp_dev;
 			int irq;
 			int ioapic = -1, ioapic_pin = -1;
 			int triggering, polarity;
@@ -1261,12 +1233,16 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
+				io_apic_set_pci_routing(&dev->dev, ioapic,
+							ioapic_pin, irq,
+							triggering, polarity);
+				dev->irq = irq;
 				dev_info(&dev->dev, "PCI->APIC IRQ transform: "
 					 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
-				dev->irq = irq;
 				return 0;
 			} else
 				msg = "; probably buggy MP table";
+#endif
 		} else if (pci_probe & PCI_BIOS_IRQ_SCAN)
 			msg = "";
 		else
-- 
GitLab


From 61fe91e1319556f32bebfd7ed2c68ef02e2c17f7 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 9 May 2009 23:47:42 -0700
Subject: [PATCH 1015/2198] x86: apic: Check rev 3 fadt correctly for
 physical_apic bit

Impact: fix fadt version checking

FADT2_REVISION_ID has value 3 aka rev 3 FADT. So need to use >= instead
of >, as other places in the code do.

[ Impact: extend scope of APIC boot quirk ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic_flat_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 306e5e88fb6f4..744e6d8af27b8 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	 * regardless of how many processors are present (x86_64 ES7000
 	 * is an example).
 	 */
-	if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+	if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
 		(acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 		printk(KERN_DEBUG "system APIC only can use physical flat");
 		return 1;
-- 
GitLab


From 3e0c373749d7eb5b354ac0b043f2b2cdf84eefef Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 9 May 2009 23:47:42 -0700
Subject: [PATCH 1016/2198] x86: clean up and fix setup_clear/force_cpu_cap
 handling

setup_force_cpu_cap() only have one user (Xen guest code),
but it should not reuse cleared_cpu_cpus, otherwise it
will have problems on SMP.

Need to have a separate cpu_cpus_set array too, for forced-on
flags, beyond the forced-off flags.

Also need to setup handling before all cpus caps are combined.

[ Impact: fix the forced-set CPU feature flag logic ]

Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cpufeature.h |  4 ++--
 arch/x86/include/asm/processor.h  |  3 ++-
 arch/x86/kernel/cpu/common.c      | 17 ++++++++++++-----
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index ccc1061b8b257..13cc6a503a024 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -192,11 +192,11 @@ extern const char * const x86_power_flags[32];
 #define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
 	clear_cpu_cap(&boot_cpu_data, bit);	\
-	set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+	set_bit(bit, (unsigned long *)cpu_caps_cleared); \
 } while (0)
 #define setup_force_cpu_cap(bit) do { \
 	set_cpu_cap(&boot_cpu_data, bit);	\
-	clear_bit(bit, (unsigned long *)cleared_cpu_caps);	\
+	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 
 #define cpu_has_fpu		boot_cpu_has(X86_FEATURE_FPU)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae709c86..fed93fec9764a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86	boot_cpu_data;
 extern struct cpuinfo_x86	new_cpu_data;
 
 extern struct tss_struct	doublefault_tss;
-extern __u32			cleared_cpu_caps[NCAPINTS];
+extern __u32			cpu_caps_cleared[NCAPINTS];
+extern __u32			cpu_caps_set[NCAPINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c4f667896c28f..e7fd5c4935a35 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -292,7 +292,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
 	return NULL;		/* Not found */
 }
 
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
 
 void load_percpu_segment(int cpu)
 {
@@ -806,6 +807,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 #endif
 
 	init_hypervisor(c);
+
+	/*
+	 * Clear/Set all flags overriden by options, need do it
+	 * before following smp all cpus cap AND.
+	 */
+	for (i = 0; i < NCAPINTS; i++) {
+		c->x86_capability[i] &= ~cpu_caps_cleared[i];
+		c->x86_capability[i] |= cpu_caps_set[i];
+	}
+
 	/*
 	 * On SMP, boot_cpu_data holds the common feature set between
 	 * all CPUs; so make sure that we indicate which features are
@@ -818,10 +829,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 			boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
 	}
 
-	/* Clear all flags overriden by options */
-	for (i = 0; i < NCAPINTS; i++)
-		c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
 #ifdef CONFIG_X86_MCE
 	/* Init Machine Check Exception if available. */
 	mcheck_init(c);
-- 
GitLab


From 1822952ba2b9f22f79019d07ebbeca31dc14b718 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 11 May 2009 17:56:07 +0900
Subject: [PATCH 1017/2198] block: let blk_end_request_all handle bidi requests

blk_end_request_all() and __blk_end_request_all() should finish all
bytes including bidi, by definition. That's what all bidi users need ,
bidi requests must be complete as a whole (partial completion is
impossible).

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/blkdev.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6e59d3b92ff2a..1069f4483c6ef 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,8 +910,12 @@ static inline bool blk_end_request(struct request *rq, int error,
 static inline void blk_end_request_all(struct request *rq, int error)
 {
 	bool pending;
+	unsigned int bidi_bytes = 0;
 
-	pending = blk_end_request(rq, error, blk_rq_bytes(rq));
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
+
+	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
 	BUG_ON(pending);
 }
 
@@ -962,8 +966,12 @@ static inline bool __blk_end_request(struct request *rq, int error,
 static inline void __blk_end_request_all(struct request *rq, int error)
 {
 	bool pending;
+	unsigned int bidi_bytes = 0;
+
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
 
-	pending = __blk_end_request(rq, error, blk_rq_bytes(rq));
+	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
 	BUG_ON(pending);
 }
 
-- 
GitLab


From e6bb7a96c2c36f20c05ef648f15bd3c2b1834c78 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 11 May 2009 17:56:08 +0900
Subject: [PATCH 1018/2198] scsi: simplify the bidi completion

Let's use blk_end_request_all() instead of blk_end_bidi_request().

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/scsi_lib.c | 43 +++++++++++++----------------------------
 1 file changed, 13 insertions(+), 30 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index b12750f82169f..a54bec9943869 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -672,33 +672,6 @@ void scsi_release_buffers(struct scsi_cmnd *cmd)
 }
 EXPORT_SYMBOL(scsi_release_buffers);
 
-/*
- * Bidi commands Must be complete as a whole, both sides at once.  If
- * part of the bytes were written and lld returned scsi_in()->resid
- * and/or scsi_out()->resid this information will be left in
- * req->resid_len and req->next_rq->resid_len. The upper-layer driver
- * can decide what to do with this information.
- */
-static void scsi_end_bidi_request(struct scsi_cmnd *cmd)
-{
-	struct request *req = cmd->request;
-
-	req->resid_len = scsi_out(cmd)->resid;
-	req->next_rq->resid_len = scsi_in(cmd)->resid;
-
-	/* The req and req->next_rq have not been completed */
-	BUG_ON(blk_end_bidi_request(req, 0, blk_rq_bytes(req),
-				    blk_rq_bytes(req->next_rq)));
-
-	scsi_release_buffers(cmd);
-
-	/*
-	 * This will goose the queue request function at the end, so we don't
-	 * need to worry about launching another command.
-	 */
-	scsi_next_command(cmd);
-}
-
 /*
  * Function:    scsi_io_completion()
  *
@@ -772,12 +745,22 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 			if (!sense_deferred)
 				error = -EIO;
 		}
+
+		req->resid_len = scsi_get_resid(cmd);
+
 		if (scsi_bidi_cmnd(cmd)) {
-			/* will also release_buffers */
-			scsi_end_bidi_request(cmd);
+			/*
+			 * Bidi commands Must be complete as a whole,
+			 * both sides at once.
+			 */
+			req->next_rq->resid_len = scsi_in(cmd)->resid;
+
+			blk_end_request_all(req, 0);
+
+			scsi_release_buffers(cmd);
+			scsi_next_command(cmd);
 			return;
 		}
-		req->resid_len = scsi_get_resid(cmd);
 	}
 
 	BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */
-- 
GitLab


From b1f744937f1be3e6d3009382a755679133cf782d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Mon, 11 May 2009 17:56:09 +0900
Subject: [PATCH 1019/2198] block: move completion related functions back to
 blk-core.c

Let's put the completion related functions back to block/blk-core.c
where they have lived. We can also unexport blk_end_bidi_request() and
__blk_end_bidi_request(), which nobody uses.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 128 +++++++++++++++++++++++++++++++++++++++--
 include/linux/blkdev.h | 128 +++--------------------------------------
 2 files changed, 130 insertions(+), 126 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 93691d2ac5a07..a2d97de1a12c3 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2026,8 +2026,8 @@ static void blk_finish_request(struct request *req, int error)
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool blk_end_bidi_request(struct request *rq, int error,
-			  unsigned int nr_bytes, unsigned int bidi_bytes)
+static bool blk_end_bidi_request(struct request *rq, int error,
+				 unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	struct request_queue *q = rq->q;
 	unsigned long flags;
@@ -2041,7 +2041,6 @@ bool blk_end_bidi_request(struct request *rq, int error,
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(blk_end_bidi_request);
 
 /**
  * __blk_end_bidi_request - Complete a bidi request with queue lock held
@@ -2058,8 +2057,8 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request);
  *     %false - we are done with this request
  *     %true  - still buffers pending for this request
  **/
-bool __blk_end_bidi_request(struct request *rq, int error,
-			    unsigned int nr_bytes, unsigned int bidi_bytes)
+static bool __blk_end_bidi_request(struct request *rq, int error,
+				   unsigned int nr_bytes, unsigned int bidi_bytes)
 {
 	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
 		return true;
@@ -2068,7 +2067,124 @@ bool __blk_end_bidi_request(struct request *rq, int error,
 
 	return false;
 }
-EXPORT_SYMBOL_GPL(__blk_end_bidi_request);
+
+/**
+ * blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Ends I/O on a number of bytes attached to @rq.
+ *     If @rq has leftover, sets it up for the next range of segments.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	return blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+EXPORT_SYMBOL_GPL(blk_end_request);
+
+/**
+ * blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.
+ */
+void blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+	unsigned int bidi_bytes = 0;
+
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
+
+	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
+	BUG_ON(pending);
+}
+EXPORT_SYMBOL_GPL(blk_end_request_all);
+
+/**
+ * blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Complete the current consecutively mapped chunk from @rq.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool blk_end_request_cur(struct request *rq, int error)
+{
+	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(blk_end_request_cur);
+
+/**
+ * __blk_end_request - Helper function for drivers to complete the request.
+ * @rq:       the request being processed
+ * @error:    %0 for success, < %0 for error
+ * @nr_bytes: number of bytes to complete
+ *
+ * Description:
+ *     Must be called with queue lock held unlike blk_end_request().
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ **/
+bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
+{
+	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
+}
+EXPORT_SYMBOL_GPL(__blk_end_request);
+
+/**
+ * __blk_end_request_all - Helper function for drives to finish the request.
+ * @rq: the request to finish
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Completely finish @rq.  Must be called with queue lock held.
+ */
+void __blk_end_request_all(struct request *rq, int error)
+{
+	bool pending;
+	unsigned int bidi_bytes = 0;
+
+	if (unlikely(blk_bidi_rq(rq)))
+		bidi_bytes = blk_rq_bytes(rq->next_rq);
+
+	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
+	BUG_ON(pending);
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_all);
+
+/**
+ * __blk_end_request_cur - Helper function to finish the current request chunk.
+ * @rq: the request to finish the current chunk for
+ * @err: %0 for success, < %0 for error
+ *
+ * Description:
+ *     Complete the current consecutively mapped chunk from @rq.  Must
+ *     be called with queue lock held.
+ *
+ * Return:
+ *     %false - we are done with this request
+ *     %true  - still buffers pending for this request
+ */
+bool __blk_end_request_cur(struct request *rq, int error)
+{
+	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
+}
+EXPORT_SYMBOL_GPL(__blk_end_request_cur);
 
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 		     struct bio *bio)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 1069f4483c6ef..f9d60a78c08a4 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -872,126 +872,14 @@ extern struct request *blk_fetch_request(struct request_queue *q);
  */
 extern bool blk_update_request(struct request *rq, int error,
 			       unsigned int nr_bytes);
-extern bool blk_end_bidi_request(struct request *rq, int error,
-				 unsigned int nr_bytes,
-				 unsigned int bidi_bytes);
-extern bool __blk_end_bidi_request(struct request *rq, int error,
-				   unsigned int nr_bytes,
-				   unsigned int bidi_bytes);
-
-/**
- * blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Ends I/O on a number of bytes attached to @rq.
- *     If @rq has leftover, sets it up for the next range of segments.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static inline bool blk_end_request(struct request *rq, int error,
-				   unsigned int nr_bytes)
-{
-	return blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-
-/**
- * blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Completely finish @rq.
- */
-static inline void blk_end_request_all(struct request *rq, int error)
-{
-	bool pending;
-	unsigned int bidi_bytes = 0;
-
-	if (unlikely(blk_bidi_rq(rq)))
-		bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-	pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-	BUG_ON(pending);
-}
-
-/**
- * blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-static inline bool blk_end_request_cur(struct request *rq, int error)
-{
-	return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
-
-/**
- * __blk_end_request - Helper function for drivers to complete the request.
- * @rq:       the request being processed
- * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete
- *
- * Description:
- *     Must be called with queue lock held unlike blk_end_request().
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- **/
-static inline bool __blk_end_request(struct request *rq, int error,
-				     unsigned int nr_bytes)
-{
-	return __blk_end_bidi_request(rq, error, nr_bytes, 0);
-}
-
-/**
- * __blk_end_request_all - Helper function for drives to finish the request.
- * @rq: the request to finish
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Completely finish @rq.  Must be called with queue lock held.
- */
-static inline void __blk_end_request_all(struct request *rq, int error)
-{
-	bool pending;
-	unsigned int bidi_bytes = 0;
-
-	if (unlikely(blk_bidi_rq(rq)))
-		bidi_bytes = blk_rq_bytes(rq->next_rq);
-
-	pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
-	BUG_ON(pending);
-}
-
-/**
- * __blk_end_request_cur - Helper function to finish the current request chunk.
- * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
- *
- * Description:
- *     Complete the current consecutively mapped chunk from @rq.  Must
- *     be called with queue lock held.
- *
- * Return:
- *     %false - we are done with this request
- *     %true  - still buffers pending for this request
- */
-static inline bool __blk_end_request_cur(struct request *rq, int error)
-{
-	return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
-}
+extern bool blk_end_request(struct request *rq, int error,
+			    unsigned int nr_bytes);
+extern void blk_end_request_all(struct request *rq, int error);
+extern bool blk_end_request_cur(struct request *rq, int error);
+extern bool __blk_end_request(struct request *rq, int error,
+			      unsigned int nr_bytes);
+extern void __blk_end_request_all(struct request *rq, int error);
+extern bool __blk_end_request_cur(struct request *rq, int error);
 
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
-- 
GitLab


From 80989ce0643c1034822f3e339ed8d790b649abe1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 9 May 2009 23:47:42 -0700
Subject: [PATCH 1020/2198] x86: clean up and and print out initial
 max_pfn_mapped

Do this so we can check the range that is mapped before
init_memory_mapping().

To be able to print out meaningful info, we first have to fix
64-bit to have max_pfn_mapped assigned before that call. This
also unifies the code-path a bit.

[ Impact: print more debug info, cleanup ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <49BF0978.40605@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup.c | 4 ++++
 arch/x86/mm/init.c      | 7 +++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0d77e56e821be..4031d6cb3ff97 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -862,12 +862,16 @@ void __init setup_arch(char **cmdline_p)
 		max_low_pfn = max_pfn;
 
 	high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+	max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 #endif
 
 #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
 	setup_bios_corruption_check();
 #endif
 
+	printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+			max_pfn_mapped<<PAGE_SHIFT);
+
 	reserve_brk();
 
 	/* max_pfn_mapped is updated here */
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 95f5ecf2be502..92d2108a62c3d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -132,12 +132,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
 	 */
 #ifdef CONFIG_X86_32
 	start = 0x7000;
-	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-					tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
+#else
 	start = 0x8000;
-	e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
 #endif
+	e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+					tables, PAGE_SIZE);
 	if (e820_table_start == -1UL)
 		panic("Cannot find space for the kernel page tables");
 
-- 
GitLab


From 4401da6111ac58f94234417427d06a72c4048c74 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 2 May 2009 10:40:57 -0700
Subject: [PATCH 1021/2198] x86: read apic ID in the !acpi_lapic case

Ed found that on 32-bit, boot_cpu_physical_apicid is not read right,
when the mptable is broken.

Interestingly, actually three paths use/set it:

 1. acpi: at that time that is already read from reg
 2. mptable: only read from mptable
 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit

so we could read the apic id for the 2/3 path. We trust the hardware
register more than we trust a BIOS data structure (the mptable).

We can also avoid the double set_fixmap() when acpi_lapic
is used, and also need to move cpu_has_apic earlier and
call apic_disable().

Also when need to update the apic id, we'd better read and
set the apic version as well - so that quirks are applied precisely.

v2: make path 3 with 64bit, use -1 as apic id, so could read it later.
v3: fix whitespace problem pointed out by Ed Swierk

[ Impact: get correct apic id for bsp other than acpi path ]

Reported-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <49FC85A9.2070702@kernel.org>
[ v4: sanity-check in the ACPI case too ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e258bedce7cba..1ee966f4ae956 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1456,7 +1456,6 @@ static int __init detect_init_APIC(void)
 	}
 
 	mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-	boot_cpu_physical_apicid = 0;
 	return 0;
 }
 #else
@@ -1570,6 +1569,8 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
+	unsigned int new_apicid;
+
 	if (x2apic_mode) {
 		boot_cpu_physical_apicid = read_apic_id();
 		return;
@@ -1586,21 +1587,31 @@ void __init init_apic_mappings(void)
 	} else
 		apic_phys = mp_lapic_addr;
 
-	set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+	/* lets check if we may NOP'ify apic operations */
+	if (!cpu_has_apic) {
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+		return;
+	}
+
+	/*
+	 * acpi lapic path already maps that address in
+	 * acpi_register_lapic_address()
+	 */
+	if (!acpi_lapic)
+		set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
 	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
 				APIC_BASE, apic_phys);
-
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
 	 */
-	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = read_apic_id();
-
-	/* lets check if we may to NOP'ify apic operations */
-	if (!cpu_has_apic) {
-		pr_info("APIC: disable apic facility\n");
-		apic_disable();
+	new_apicid = read_apic_id();
+	if (boot_cpu_physical_apicid != new_apicid) {
+		boot_cpu_physical_apicid = new_apicid;
+		apic_version[new_apicid] =
+			 GET_APIC_VERSION(apic_read(APIC_LVR));
 	}
 }
 
-- 
GitLab


From b37ab91907e9002925f4217e3bbd496aa12c2fa3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 8 May 2009 00:36:44 -0700
Subject: [PATCH 1022/2198] x86: Sanity check the e820 against the SRAT table
 using e820 map only

node_cover_memory() sanity checks the SRAT table by ensuring that all
PXMs cover the memory reported in the e820.

However, when calculating the size of the holes in the e820, it uses
the early_node_map[] which contains information taken from both SRAT
and e820. If the SRAT is missing an entry, then it is not detected
that the SRAT table is incorrect and missing entries.

This patch uses the e820 map to calculate the holes instead of
early_node_map[].

comment is from Mel.

[ Impact: reject incorrect SRAT tables ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A03E10C.60906@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 01765955baaf6..c7a18aaccf8d2 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -345,7 +345,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 			pxmram = 0;
 	}
 
-	e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
+	e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
 	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
 	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
 		printk(KERN_ERR
-- 
GitLab


From 0964b0562bb9c93194e852b47bab2397b9e11c18 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 8 May 2009 00:37:34 -0700
Subject: [PATCH 1023/2198] x86: Allow 1MB of slack between the e820 map and
 SRAT, not 4GB

It is expected that there might be slight differences between the e820
map and the SRAT table and the intention was that 1MB of slack be allowed.

The calculation comparing e820ram and pxmram assumes the units are bytes,
when they are in fact pages. This means 4GB of slack is being allowed,
not 1MB. This patch makes the correct comparison.

comment is from Mel.

[ Impact: don't accept buggy SRATs that could dump up to 4G of RAM ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A03E13E.6050107@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/srat_64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index c7a18aaccf8d2..87b45bff250dd 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -346,8 +346,8 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 	}
 
 	e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
-	/* We seem to lose 3 pages somewhere. Allow a bit of slack. */
-	if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+	/* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+	if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
 		printk(KERN_ERR
 	"SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
 			(pxmram << PAGE_SHIFT) >> 20,
-- 
GitLab


From 53c0054c3f11b49fc09f24e46f58661def952728 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 08:45:27 +0000
Subject: [PATCH 1024/2198] sh: include empty_zero_page in text

Include empty_zero_page in _text. This fixes a problem
introduced by c3e2586b794b12ffcdf69b4e547030b51e18e6d9
which results in broken boot on R2D-Plus.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/vmlinux.lds.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S
index d73723080fd13..f53c76acaede2 100644
--- a/arch/sh/kernel/vmlinux.lds.S
+++ b/arch/sh/kernel/vmlinux.lds.S
@@ -26,12 +26,13 @@ SECTIONS
 	. = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET;
 #endif
 
+	_text = .;		/* Text and read-only data */
+
 	.empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) {
 		*(.empty_zero_page)
 	} = 0
 
 	.text : AT(ADDR(.text) - LOAD_OFFSET) {
-		_text = .;		/* Text and read-only data */
 		HEAD_TEXT
 		TEXT_TEXT
 
-- 
GitLab


From 03f408f1aad8d0f5eb6380732bffc0f72b250fa7 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 08:54:54 +0000
Subject: [PATCH 1025/2198] sh: TMU platform data for sh775x

This patch adds TMU platform data for sh775x. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/setup-sh7750.c | 187 ++++++++++++++++++++++++++
 1 file changed, 187 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index a1c80d909cd63..09da0c187d4cd 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/io.h>
+#include <linux/sh_timer.h>
 #include <linux/serial_sci.h>
 
 static struct resource rtc_resources[] = {
@@ -60,9 +61,177 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+/* SH7750R, SH7751 and SH7751R all have two extra timer channels */
+#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751R)
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xfe100008,
+		.end	= 0xfe100013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 72,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xfe100014,
+		.end	= 0xfe10001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 76,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+#endif
+
 static struct platform_device *sh7750_devices[] __initdata = {
 	&rtc_device,
 	&sci_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751R)
+	&tmu3_device,
+	&tmu4_device,
+#endif
 };
 
 static int __init sh7750_devices_setup(void)
@@ -72,6 +241,24 @@ static int __init sh7750_devices_setup(void)
 }
 __initcall(sh7750_devices_setup);
 
+static struct platform_device *sh7750_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751) || \
+	defined(CONFIG_CPU_SUBTYPE_SH7751R)
+	&tmu3_device,
+	&tmu4_device,
+#endif
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7750_early_devices,
+				   ARRAY_SIZE(sh7750_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From c42f32dca3855d8f867387ec6993d9b62516a00e Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 09:06:24 +0000
Subject: [PATCH 1026/2198] sh: TMU platform data for sh7760

This patch adds TMU platform data for sh7760. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/setup-sh7760.c | 109 ++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index d9bdc931ac091..4a6fd0d3c7bca 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -10,6 +10,7 @@
 #include <linux/platform_device.h>
 #include <linux/init.h>
 #include <linux/serial.h>
+#include <linux/sh_timer.h>
 #include <linux/serial_sci.h>
 #include <linux/io.h>
 
@@ -168,8 +169,104 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+
 static struct platform_device *sh7760_devices[] __initdata = {
 	&sci_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 static int __init sh7760_devices_setup(void)
@@ -179,6 +276,18 @@ static int __init sh7760_devices_setup(void)
 }
 __initcall(sh7760_devices_setup);
 
+static struct platform_device *sh7760_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7760_early_devices,
+				   ARRAY_SIZE(sh7760_early_devices));
+}
+
 #define INTC_ICR	0xffd00000UL
 #define INTC_ICR_IRLM	(1 << 7)
 
-- 
GitLab


From 3551f88f6439cf4da3f5a3747b320280e30500de Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 7 May 2009 15:35:41 +0300
Subject: [PATCH 1027/2198] x86: unify 64-bit UMA and NUMA paging_init()

64-bit UMA and NUMA versions of paging_init() are almost identical.
Therefore, merge the copy in mm/numa_64.c to mm/init_64.c to remove
duplicate code.

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1241699741.17846.30.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c |  6 +++++-
 arch/x86/mm/numa_64.c | 15 ---------------
 2 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 6a1a573e20f97..be7e127917877 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -585,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
 	early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
 	reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
 }
+#endif
 
 void __init paging_init(void)
 {
@@ -595,11 +596,14 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
+#ifdef CONFIG_NUMA
+	sparse_memory_present_with_active_regions(MAX_NUMNODES);
+#else
 	memory_present(0, 0, max_pfn);
+#endif
 	sparse_init();
 	free_area_init_nodes(max_zone_pfns);
 }
-#endif
 
 /*
  * Memory hotplug specific functions
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 2d05a12029dc3..fb61d81a656f7 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -578,21 +578,6 @@ unsigned long __init numa_free_all_bootmem(void)
 	return pages;
 }
 
-void __init paging_init(void)
-{
-	unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-	max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-	max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
-	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-	sparse_init();
-
-	free_area_init_nodes(max_zone_pfns);
-}
-
 static __init int numa_setup(char *opt)
 {
 	if (!opt)
-- 
GitLab


From 087fa4e964e04371a67f340e1f6ac92c5db5e0a6 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 7 May 2009 15:35:42 +0300
Subject: [PATCH 1028/2198] x86: use
 sparse_memory_present_with_active_regions() on UMA

There's no need to use call memory_present() manually on UMA because
initmem_init() sets up early_node_map by calling
e820_register_active_regions().

[ Impact: cleanup ]

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <1241699742.17846.31.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_64.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index be7e127917877..52bb9519bb86b 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -596,11 +596,7 @@ void __init paging_init(void)
 	max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
 	max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
-#ifdef CONFIG_NUMA
 	sparse_memory_present_with_active_regions(MAX_NUMNODES);
-#else
-	memory_present(0, 0, max_pfn);
-#endif
 	sparse_init();
 	free_area_init_nodes(max_zone_pfns);
 }
-- 
GitLab


From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 10 May 2009 10:53:05 +0200
Subject: [PATCH 1029/2198] perf_counter, x86: clean up throttling printk

s/PERFMON/perfcounters for perfcounter interrupt throttling warning.

'perfmon' is the CPU feature name that is Intel-only, while we do
throttling in a generic way.

[ Impact: cleanup ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index a6878b0798e5b..da27419923a8b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -814,7 +814,7 @@ void perf_counter_unthrottle(void)
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
 		if (printk_ratelimit())
-			printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n");
+			printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n");
 		hw_perf_restore(cpuc->throttle_ctrl);
 	}
 	cpuc->interrupts = 0;
-- 
GitLab


From 6751b71ea2c7ab8c0d65f01973a3fc8ea16992f4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 11 May 2009 12:08:02 +1000
Subject: [PATCH 1030/2198] perf_counter: Put whole group on when enabling
 group leader

Currently, if you have a group where the leader is disabled and there
are siblings that are enabled, and then you enable the leader, we only
put the leader on the PMU, and not its enabled siblings.  This is
incorrect, since the enabled group members should be all on or all off
at any given point.

This fixes it by adding a call to group_sched_in in
__perf_counter_enable in the case where we're enabling a group leader.

To avoid the need for a forward declaration this also moves
group_sched_in up before __perf_counter_enable.  The actual content of
group_sched_in is unchanged by this patch.

[ Impact: fix bug in counter enable code ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18951.34946.451546.691693@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 99 ++++++++++++++++++++++---------------------
 1 file changed, 51 insertions(+), 48 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d850a1fb8d4cf..a5bdc93ac4775 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -419,6 +419,54 @@ counter_sched_in(struct perf_counter *counter,
 	return 0;
 }
 
+static int
+group_sched_in(struct perf_counter *group_counter,
+	       struct perf_cpu_context *cpuctx,
+	       struct perf_counter_context *ctx,
+	       int cpu)
+{
+	struct perf_counter *counter, *partial_group;
+	int ret;
+
+	if (group_counter->state == PERF_COUNTER_STATE_OFF)
+		return 0;
+
+	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
+	if (ret)
+		return ret < 0 ? ret : 0;
+
+	group_counter->prev_state = group_counter->state;
+	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
+		return -EAGAIN;
+
+	/*
+	 * Schedule in siblings as one group (if any):
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		counter->prev_state = counter->state;
+		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
+			partial_group = counter;
+			goto group_error;
+		}
+	}
+
+	return 0;
+
+group_error:
+	/*
+	 * Groups can be scheduled in as one unit only, so undo any
+	 * partial group before returning:
+	 */
+	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
+		if (counter == partial_group)
+			break;
+		counter_sched_out(counter, cpuctx, ctx);
+	}
+	counter_sched_out(group_counter, cpuctx, ctx);
+
+	return -EAGAIN;
+}
+
 /*
  * Return 1 for a group consisting entirely of software counters,
  * 0 if the group contains any hardware counters.
@@ -643,6 +691,9 @@ static void __perf_counter_enable(void *info)
 
 	if (!group_can_go_on(counter, cpuctx, 1))
 		err = -EEXIST;
+	else if (counter == leader)
+		err = group_sched_in(counter, cpuctx, ctx,
+				     smp_processor_id());
 	else
 		err = counter_sched_in(counter, cpuctx, ctx,
 				       smp_processor_id());
@@ -791,54 +842,6 @@ static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 }
 
-static int
-group_sched_in(struct perf_counter *group_counter,
-	       struct perf_cpu_context *cpuctx,
-	       struct perf_counter_context *ctx,
-	       int cpu)
-{
-	struct perf_counter *counter, *partial_group;
-	int ret;
-
-	if (group_counter->state == PERF_COUNTER_STATE_OFF)
-		return 0;
-
-	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu);
-	if (ret)
-		return ret < 0 ? ret : 0;
-
-	group_counter->prev_state = group_counter->state;
-	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
-		return -EAGAIN;
-
-	/*
-	 * Schedule in siblings as one group (if any):
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		counter->prev_state = counter->state;
-		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
-			partial_group = counter;
-			goto group_error;
-		}
-	}
-
-	return 0;
-
-group_error:
-	/*
-	 * Groups can be scheduled in as one unit only, so undo any
-	 * partial group before returning:
-	 */
-	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		if (counter == partial_group)
-			break;
-		counter_sched_out(counter, cpuctx, ctx);
-	}
-	counter_sched_out(group_counter, cpuctx, ctx);
-
-	return -EAGAIN;
-}
-
 static void
 __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
-- 
GitLab


From a08b159fc243dbfe415250466d24cfc5010deee5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 11 May 2009 15:46:10 +1000
Subject: [PATCH 1031/2198] perf_counter: don't count scheduler ticks as
 context switches

The context-switch software counter gives inflated values at present
because each scheduler tick and each process-wide counter
enable/disable prctl gets counted as a context switch.

This happens because perf_counter_task_tick, perf_counter_task_disable
and perf_counter_task_enable all call perf_counter_task_sched_out,
which calls perf_swcounter_event to record a context switch event.

This fixes it by introducing a variant of perf_counter_task_sched_out
with two underscores in front for internal use within the perf_counter
code, and makes perf_counter_task_{tick,disable,enable} call it.  This
variant doesn't record a context switch event, and takes a struct
perf_counter_context *.  This adds the new variant rather than
changing the behaviour or interface of perf_counter_task_sched_out
because that is called from other code.

[ Impact: fix inflated context-switch event counts ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18951.48034.485580.498953@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a5bdc93ac4775..7373b96bc36ce 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -837,6 +837,14 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 	cpuctx->task_ctx = NULL;
 }
 
+static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
+{
+	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
+
+	__perf_counter_sched_out(ctx, cpuctx);
+	cpuctx->task_ctx = NULL;
+}
+
 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
 	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
@@ -943,15 +951,13 @@ int perf_counter_task_disable(void)
 	struct perf_counter *counter;
 	unsigned long flags;
 	u64 perf_flags;
-	int cpu;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
 
 	local_irq_save(flags);
-	cpu = smp_processor_id();
 
-	perf_counter_task_sched_out(curr, cpu);
+	__perf_counter_task_sched_out(ctx);
 
 	spin_lock(&ctx->lock);
 
@@ -989,7 +995,7 @@ int perf_counter_task_enable(void)
 	local_irq_save(flags);
 	cpu = smp_processor_id();
 
-	perf_counter_task_sched_out(curr, cpu);
+	__perf_counter_task_sched_out(ctx);
 
 	spin_lock(&ctx->lock);
 
@@ -1054,7 +1060,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	ctx = &curr->perf_counter_ctx;
 
 	perf_counter_cpu_sched_out(cpuctx);
-	perf_counter_task_sched_out(curr, cpu);
+	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
 	rotate_ctx(ctx);
-- 
GitLab


From 615a3f1e055ac9b0ae74e1f935a12ea2cfe2a2ad Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 11 May 2009 15:50:21 +1000
Subject: [PATCH 1032/2198] perf_counter: call atomic64_set for counter->count

A compile warning triggered because we are calling
atomic_set(&counter->count). But since counter->count
is an atomic64_t, we have to use atomic64_set.

So the count can be set short, resulting in the reset ioctl
only resetting the low word.

[ Impact: clear counter properly during the reset ioctl ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18951.48285.270311.981806@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7373b96bc36ce..5ea0240adab23 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1281,7 +1281,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
 static void perf_counter_reset(struct perf_counter *counter)
 {
 	(void)perf_counter_read(counter);
-	atomic_set(&counter->count, 0);
+	atomic64_set(&counter->count, 0);
 	perf_counter_update_userpage(counter);
 }
 
-- 
GitLab


From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 11 May 2009 14:33:23 +0800
Subject: [PATCH 1033/2198] blktrace: pdu_buf of pc events should be unsigned

I got this:
  8,0    1   305.417782332  2037  I   R 32 (ffffff9e 10 00 ...) [bash]

It should be:
  8,0    1   305.417782332  2037  I   R 32 (9e 10 00 ...) [bash]

[ Impact: fix output of pc events ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e099f8cc1d1c4..05b4747fd8732 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
 
 static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
 {
-	const char *pdu_buf;
+	const unsigned char *pdu_buf;
 	int pdu_len;
 	int i, end, ret;
 
-- 
GitLab


From 79c5d3ce614d8fe706545c7bca2158b63db6bb5e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 11 May 2009 15:06:46 +0800
Subject: [PATCH 1034/2198] blktrace: from-sector redundant in
 trace_block_remap, cleanup

The last argument of block_remap prober is the original sector
before remap, so it should be 'from', not 'to'.

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
LKML-Reference: <4A07CE86.5090301@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/block.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/block.h b/include/trace/block.h
index 8ac945b7746ea..5b12efa096b67 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -70,7 +70,7 @@ DECLARE_TRACE(block_split,
 
 DECLARE_TRACE(block_remap,
 	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t to),
-	      TP_ARGS(q, bio, dev, to));
+		 sector_t from),
+	      TP_ARGS(q, bio, dev, from));
 
 #endif
-- 
GitLab


From c969f58ca43fc403c75f5d3da4cf1e21de7afaa0 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 7 Apr 2009 14:13:01 +0100
Subject: [PATCH 1035/2198] GFS2: Update the rw flags

After Jens recent updates:
http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=a1f242524c3c1f5d40f1c9c343427e34d1aadd6e
et al. this is a patch to bring gfs2 uptodate with the core
code. Also I've managed to squash another call to ll_rw_block()
along the way.

There is still one part of the GFS2 I/O paths which are not correctly
annotated and that is due to the sharing of the writeback code between
the data and metadata address spaces. I would like to change that too,
but this patch is still worth doing on its own, I think.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/log.c     |  6 +++---
 fs/gfs2/lops.c    | 14 ++++++++------
 fs/gfs2/meta_io.c | 38 +++++++++++++++++++++++++++-----------
 3 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 98918a756410c..aa62cf5976e80 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -120,7 +120,7 @@ __acquires(&sdp->sd_log_lock)
 			lock_buffer(bh);
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
-				submit_bh(WRITE, bh);
+				submit_bh(WRITE_SYNC_PLUG, bh);
 			} else {
 				unlock_buffer(bh);
 				brelse(bh);
@@ -604,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
 		goto skip_barrier;
 	get_bh(bh);
-	submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh);
+	submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh);
 	wait_on_buffer(bh);
 	if (buffer_eopnotsupp(bh)) {
 		clear_buffer_eopnotsupp(bh);
@@ -664,7 +664,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp)
 		lock_buffer(bh);
 		if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) {
 			bh->b_end_io = end_buffer_write_sync;
-			submit_bh(WRITE, bh);
+			submit_bh(WRITE_SYNC_PLUG, bh);
 		} else {
 			unlock_buffer(bh);
 			brelse(bh);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 80e4f5f898bb5..00315f50fa467 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -13,6 +13,8 @@
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/bio.h>
+#include <linux/fs.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -189,7 +191,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 		}
 
 		gfs2_log_unlock(sdp);
-		submit_bh(WRITE, bh);
+		submit_bh(WRITE_SYNC_PLUG, bh);
 		gfs2_log_lock(sdp);
 
 		n = 0;
@@ -199,7 +201,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp)
 			gfs2_log_unlock(sdp);
 			lock_buffer(bd2->bd_bh);
 			bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
-			submit_bh(WRITE, bh);
+			submit_bh(WRITE_SYNC_PLUG, bh);
 			gfs2_log_lock(sdp);
 			if (++n >= num)
 				break;
@@ -341,7 +343,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 		sdp->sd_log_num_revoke--;
 
 		if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) {
-			submit_bh(WRITE, bh);
+			submit_bh(WRITE_SYNC_PLUG, bh);
 
 			bh = gfs2_log_get_buf(sdp);
 			mh = (struct gfs2_meta_header *)bh->b_data;
@@ -358,7 +360,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
 	}
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 
-	submit_bh(WRITE, bh);
+	submit_bh(WRITE_SYNC_PLUG, bh);
 }
 
 static void revoke_lo_before_scan(struct gfs2_jdesc *jd,
@@ -560,7 +562,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 	ptr = bh_log_ptr(bh);
 	
 	get_bh(bh);
-	submit_bh(WRITE, bh);
+	submit_bh(WRITE_SYNC_PLUG, bh);
 	gfs2_log_lock(sdp);
 	while(!list_empty(list)) {
 		bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list);
@@ -586,7 +588,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh,
 		} else {
 			bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh);
 		}
-		submit_bh(WRITE, bh1);
+		submit_bh(WRITE_SYNC_PLUG, bh1);
 		gfs2_log_lock(sdp);
 		ptr += 2;
 	}
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 8d6f13256b263..75b2aec06f85a 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -201,16 +201,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 		   struct buffer_head **bhp)
 {
-	*bhp = gfs2_getbuf(gl, blkno, CREATE);
-	if (!buffer_uptodate(*bhp)) {
-		ll_rw_block(READ_META, 1, bhp);
-		if (flags & DIO_WAIT) {
-			int error = gfs2_meta_wait(gl->gl_sbd, *bhp);
-			if (error) {
-				brelse(*bhp);
-				return error;
-			}
-		}
+	struct gfs2_sbd *sdp = gl->gl_sbd;
+	struct buffer_head *bh;
+
+	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
+		return -EIO;
+
+	*bhp = bh = gfs2_getbuf(gl, blkno, CREATE);
+
+	lock_buffer(bh);
+	if (buffer_uptodate(bh)) {
+		unlock_buffer(bh);
+		return 0;
+	}
+	bh->b_end_io = end_buffer_read_sync;
+	get_bh(bh);
+	submit_bh(READ_SYNC | (1 << BIO_RW_META), bh);
+	if (!(flags & DIO_WAIT))
+		return 0;
+
+	wait_on_buffer(bh);
+	if (unlikely(!buffer_uptodate(bh))) {
+		struct gfs2_trans *tr = current->journal_info;
+		if (tr && tr->tr_touched)
+			gfs2_io_error_bh(sdp, bh);
+		brelse(bh);
+		return -EIO;
 	}
 
 	return 0;
@@ -404,7 +420,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
 	if (buffer_uptodate(first_bh))
 		goto out;
 	if (!buffer_locked(first_bh))
-		ll_rw_block(READ_META, 1, &first_bh);
+		ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh);
 
 	dblock++;
 	extlen--;
-- 
GitLab


From 4a0f9a321a113392b448e477018311d14fba2b34 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 20 Apr 2009 08:16:26 +0100
Subject: [PATCH 1036/2198] GFS2: Optimise writepage for metadata

This adds a GFS2 specific writepage for metadata, rather than
continuing to use the VFS function. As a result we now tag all
our metadata I/O with the correct flag so that blktraces will
now be less confusing.

Also, the generic function was checking for a number of corner
cases which cannot happen on the metadata address spaces so that
this should be faster too.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/meta_io.c | 66 ++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 57 insertions(+), 9 deletions(-)

diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 75b2aec06f85a..78a5f43126677 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -33,17 +33,65 @@
 #include "util.h"
 #include "ops_address.h"
 
-static int aspace_get_block(struct inode *inode, sector_t lblock,
-			    struct buffer_head *bh_result, int create)
+static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
-	gfs2_assert_warn(inode->i_sb->s_fs_info, 0);
-	return -EOPNOTSUPP;
-}
+	int err;
+	struct buffer_head *bh, *head;
+	int nr_underway = 0;
+	int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE));
+
+	BUG_ON(!PageLocked(page));
+	BUG_ON(!page_has_buffers(page));
+
+	head = page_buffers(page);
+	bh = head;
+
+	do {
+		if (!buffer_mapped(bh))
+			continue;
+		/*
+		 * If it's a fully non-blocking write attempt and we cannot
+		 * lock the buffer then redirty the page.  Note that this can
+		 * potentially cause a busy-wait loop from pdflush and kswapd
+		 * activity, but those code paths have their own higher-level
+		 * throttling.
+		 */
+		if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
+			lock_buffer(bh);
+		} else if (!trylock_buffer(bh)) {
+			redirty_page_for_writepage(wbc, page);
+			continue;
+		}
+		if (test_clear_buffer_dirty(bh)) {
+			mark_buffer_async_write(bh);
+		} else {
+			unlock_buffer(bh);
+		}
+	} while ((bh = bh->b_this_page) != head);
+
+	/*
+	 * The page and its buffers are protected by PageWriteback(), so we can
+	 * drop the bh refcounts early.
+	 */
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	do {
+		struct buffer_head *next = bh->b_this_page;
+		if (buffer_async_write(bh)) {
+			submit_bh(write_op, bh);
+			nr_underway++;
+		}
+		bh = next;
+	} while (bh != head);
+	unlock_page(page);
 
-static int gfs2_aspace_writepage(struct page *page,
-				 struct writeback_control *wbc)
-{
-	return block_write_full_page(page, aspace_get_block, wbc);
+	err = 0;
+	if (nr_underway == 0)
+		end_page_writeback(page);
+
+	return err;
 }
 
 static const struct address_space_operations aspace_aops = {
-- 
GitLab


From 48bf2b1711dc498494e77705c415ee46bb508fd9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 29 Apr 2009 13:59:35 +0100
Subject: [PATCH 1037/2198] GFS2: Something nonlinear this way comes!

For some reason GFS2 has been missing support for non-linear
mappings. This patch fixes that, and also avoids taking any
locks for mmap in the O_NOATIME case. In fact we don't actually need
to take the lock here at all - just doing file_accessed() would be
enough, but we have to take the lock eventually and this helps
it hit disk (and thus be seen by other nodes) faster.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_file.c | 29 ++++++++++++++++-------------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 5d82e91887e31..0ee7bd287c5a6 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -425,33 +425,36 @@ static struct vm_operations_struct gfs2_vm_ops = {
 	.page_mkwrite = gfs2_page_mkwrite,
 };
 
-
 /**
  * gfs2_mmap -
  * @file: The file to map
  * @vma: The VMA which described the mapping
  *
- * Returns: 0 or error code
+ * There is no need to get a lock here unless we should be updating
+ * atime. We ignore any locking errors since the only consequence is
+ * a missed atime update (which will just be deferred until later).
+ *
+ * Returns: 0
  */
 
 static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-	struct gfs2_holder i_gh;
-	int error;
 
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-	error = gfs2_glock_nq(&i_gh);
-	if (error) {
-		gfs2_holder_uninit(&i_gh);
-		return error;
-	}
+	if (!(file->f_flags & O_NOATIME)) {
+		struct gfs2_holder i_gh;
+		int error;
 
+		gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh);
+		error = gfs2_glock_nq(&i_gh);
+		file_accessed(file);
+		if (error == 0)
+			gfs2_glock_dq_uninit(&i_gh);
+	}
 	vma->vm_ops = &gfs2_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
 
-	gfs2_glock_dq_uninit(&i_gh);
-
-	return error;
+	return 0;
 }
 
 /**
-- 
GitLab


From 7c77f0b3f9208c339a4b40737bb2cb0f0319bb8d Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 May 2009 15:37:35 +0200
Subject: [PATCH 1038/2198] splice: implement pipe to pipe splicing

Allow splice(2) to work when both the input and the output is a pipe.

Based on the impementation of the tee(2) syscall, but instead of
duplicating the buffer references move the buffers from the input pipe
to the output pipe.

Moving the whole buffer only succeeds if the full length of the buffer
is spliced.  Otherwise duplicate the buffer, just like tee(2), set the
length of the output buffer and advance the offset on the input
buffer.

Since splice is operating on two pipes, special care needs to be taken
with locking to prevent AN ABBA deadlock.  Again this is done
similarly to the tee(2) syscall, first preparing the input and output
pipes so there's data to consume and space for that data, and then
doing the move operation while holding both locks.

If other processes are doing I/O on the same pipes parallel to the
splice, then by the time both inodes are locked there might be no
buffers left to move, or no space to move them to.  In this case retry
the whole operation, including the preparation phase.  This could lead
to starvation, but I'm not sure if that's serious enough to worry
about.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 151 insertions(+), 11 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 666953d59a35c..e405cf552f5cb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1112,6 +1112,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out,
 	return ret;
 }
 
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags);
 /*
  * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same
  * location, so checking ->i_pipe is not enough to verify that this is a
@@ -1132,12 +1135,32 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		      struct file *out, loff_t __user *off_out,
 		      size_t len, unsigned int flags)
 {
-	struct pipe_inode_info *pipe;
+	struct pipe_inode_info *ipipe;
+	struct pipe_inode_info *opipe;
 	loff_t offset, *off;
 	long ret;
 
-	pipe = pipe_info(in->f_path.dentry->d_inode);
-	if (pipe) {
+	ipipe = pipe_info(in->f_path.dentry->d_inode);
+	opipe = pipe_info(out->f_path.dentry->d_inode);
+
+	if (ipipe && opipe) {
+		if (off_in || off_out)
+			return -ESPIPE;
+
+		if (!(in->f_mode & FMODE_READ))
+			return -EBADF;
+
+		if (!(out->f_mode & FMODE_WRITE))
+			return -EBADF;
+
+		/* Splicing to self would be fun, but... */
+		if (ipipe == opipe)
+			return -EINVAL;
+
+		return splice_pipe_to_pipe(ipipe, opipe, len, flags);
+	}
+
+	if (ipipe) {
 		if (off_in)
 			return -ESPIPE;
 		if (off_out) {
@@ -1149,7 +1172,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &out->f_pos;
 
-		ret = do_splice_from(pipe, out, off, len, flags);
+		ret = do_splice_from(ipipe, out, off, len, flags);
 
 		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1157,8 +1180,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		return ret;
 	}
 
-	pipe = pipe_info(out->f_path.dentry->d_inode);
-	if (pipe) {
+	if (opipe) {
 		if (off_out)
 			return -ESPIPE;
 		if (off_in) {
@@ -1170,7 +1192,7 @@ static long do_splice(struct file *in, loff_t __user *off_in,
 		} else
 			off = &in->f_pos;
 
-		ret = do_splice_to(in, off, pipe, len, flags);
+		ret = do_splice_to(in, off, opipe, len, flags);
 
 		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
 			ret = -EFAULT;
@@ -1511,7 +1533,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in,
  * Make sure there's data to read. Wait for input if we can, otherwise
  * return an appropriate error.
  */
-static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
@@ -1549,7 +1571,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
  * Make sure there's writeable room. Wait for room if we can, otherwise
  * return an appropriate error.
  */
-static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
+static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 {
 	int ret;
 
@@ -1586,6 +1608,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags)
 	return ret;
 }
 
+/*
+ * Splice contents of ipipe to opipe.
+ */
+static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe,
+			       struct pipe_inode_info *opipe,
+			       size_t len, unsigned int flags)
+{
+	struct pipe_buffer *ibuf, *obuf;
+	int ret = 0, nbuf;
+	bool input_wakeup = false;
+
+
+retry:
+	ret = ipipe_prep(ipipe, flags);
+	if (ret)
+		return ret;
+
+	ret = opipe_prep(opipe, flags);
+	if (ret)
+		return ret;
+
+	/*
+	 * Potential ABBA deadlock, work around it by ordering lock
+	 * grabbing by pipe info address. Otherwise two different processes
+	 * could deadlock (one doing tee from A -> B, the other from B -> A).
+	 */
+	pipe_double_lock(ipipe, opipe);
+
+	do {
+		if (!opipe->readers) {
+			send_sig(SIGPIPE, current, 0);
+			if (!ret)
+				ret = -EPIPE;
+			break;
+		}
+
+		if (!ipipe->nrbufs && !ipipe->writers)
+			break;
+
+		/*
+		 * Cannot make any progress, because either the input
+		 * pipe is empty or the output pipe is full.
+		 */
+		if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) {
+			/* Already processed some buffers, break */
+			if (ret)
+				break;
+
+			if (flags & SPLICE_F_NONBLOCK) {
+				ret = -EAGAIN;
+				break;
+			}
+
+			/*
+			 * We raced with another reader/writer and haven't
+			 * managed to process any buffers.  A zero return
+			 * value means EOF, so retry instead.
+			 */
+			pipe_unlock(ipipe);
+			pipe_unlock(opipe);
+			goto retry;
+		}
+
+		ibuf = ipipe->bufs + ipipe->curbuf;
+		nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS;
+		obuf = opipe->bufs + nbuf;
+
+		if (len >= ibuf->len) {
+			/*
+			 * Simply move the whole buffer from ipipe to opipe
+			 */
+			*obuf = *ibuf;
+			ibuf->ops = NULL;
+			opipe->nrbufs++;
+			ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS;
+			ipipe->nrbufs--;
+			input_wakeup = true;
+		} else {
+			/*
+			 * Get a reference to this pipe buffer,
+			 * so we can copy the contents over.
+			 */
+			ibuf->ops->get(ipipe, ibuf);
+			*obuf = *ibuf;
+
+			/*
+			 * Don't inherit the gift flag, we need to
+			 * prevent multiple steals of this page.
+			 */
+			obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+
+			obuf->len = len;
+			opipe->nrbufs++;
+			ibuf->offset += obuf->len;
+			ibuf->len -= obuf->len;
+		}
+		ret += obuf->len;
+		len -= obuf->len;
+	} while (len);
+
+	pipe_unlock(ipipe);
+	pipe_unlock(opipe);
+
+	/*
+	 * If we put data in the output pipe, wakeup any potential readers.
+	 */
+	if (ret > 0) {
+		smp_mb();
+		if (waitqueue_active(&opipe->wait))
+			wake_up_interruptible(&opipe->wait);
+		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
+	}
+	if (input_wakeup)
+		wakeup_pipe_writers(ipipe);
+
+	return ret;
+}
+
 /*
  * Link contents of ipipe to opipe.
  */
@@ -1690,9 +1830,9 @@ static long do_tee(struct file *in, struct file *out, size_t len,
 		 * Keep going, unless we encounter an error. The ipipe/opipe
 		 * ordering doesn't really matter.
 		 */
-		ret = link_ipipe_prep(ipipe, flags);
+		ret = ipipe_prep(ipipe, flags);
 		if (!ret) {
-			ret = link_opipe_prep(opipe, flags);
+			ret = opipe_prep(opipe, flags);
 			if (!ret)
 				ret = link_pipe(ipipe, opipe, len, flags);
 		}
-- 
GitLab


From 6818173bd658439b83896a2a7586f64ab51bf29c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 May 2009 15:37:36 +0200
Subject: [PATCH 1039/2198] splice: implement default splice_read method

If f_op->splice_read() is not implemented, fall back to a plain read.
Use vfs_readv() to read into previously allocated pages.

This will allow splice and functions using splice, such as the loop
device, to work on all filesystems.  This includes "direct_io" files
in fuse which bypass the page cache.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/loop.c      |  11 +---
 fs/coda/file.c            |   9 ++-
 fs/pipe.c                 |  14 +++++
 fs/read_write.c           |   7 +--
 fs/splice.c               | 120 ++++++++++++++++++++++++++++++++++++--
 include/linux/fs.h        |   2 +
 include/linux/pipe_fs_i.h |   1 +
 7 files changed, 140 insertions(+), 24 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9ca4bb0146571..801f4ab833025 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -709,10 +709,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
 	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
 		goto out_putf;
 
-	/* new backing store needs to support loop (eg splice_read) */
-	if (!inode->i_fop->splice_read)
-		goto out_putf;
-
 	/* size of the new backing store needs to be the same */
 	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
 		goto out_putf;
@@ -788,12 +784,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
 	error = -EINVAL;
 	if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		const struct address_space_operations *aops = mapping->a_ops;
-		/*
-		 * If we can't read - sorry. If we only can't write - well,
-		 * it's going to be read-only.
-		 */
-		if (!file->f_op->splice_read)
-			goto out_putf;
+
 		if (aops->write_begin)
 			lo_flags |= LO_FLAGS_USE_AOPS;
 		if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6a347fbc998a4..ffd42815fda1e 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
 		      struct pipe_inode_info *pipe, size_t count,
 		      unsigned int flags)
 {
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
 	struct coda_file_info *cfi;
 	struct file *host_file;
 
@@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos,
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
 	host_file = cfi->cfi_container;
 
-	if (!host_file->f_op || !host_file->f_op->splice_read)
-		return -EINVAL;
+	splice_read = host_file->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
 
-	return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags);
+	return splice_read(host_file, ppos, pipe, count, flags);
 }
 
 static ssize_t
diff --git a/fs/pipe.c b/fs/pipe.c
index 13414ec45b8d5..f7dd21ad85a61 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 	return 0;
 }
 
+/**
+ * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
+ * @pipe:	the pipe that the buffer belongs to
+ * @buf:	the buffer to put a reference to
+ *
+ * Description:
+ *	This function releases a reference to @buf.
+ */
+void generic_pipe_buf_release(struct pipe_inode_info *pipe,
+			      struct pipe_buffer *buf)
+{
+	page_cache_release(buf->page);
+}
+
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
 	.map = generic_pipe_buf_map,
diff --git a/fs/read_write.c b/fs/read_write.c
index 9d1e76bb9ee14..6c8c55dec2bcd 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 		goto out;
 	if (!(in_file->f_mode & FMODE_READ))
 		goto fput_in;
-	retval = -EINVAL;
-	in_inode = in_file->f_path.dentry->d_inode;
-	if (!in_inode)
-		goto fput_in;
-	if (!in_file->f_op || !in_file->f_op->splice_read)
-		goto fput_in;
 	retval = -ESPIPE;
 	if (!ppos)
 		ppos = &in_file->f_pos;
@@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos,
 	retval = -EINVAL;
 	if (!out_file->f_op || !out_file->f_op->sendpage)
 		goto fput_out;
+	in_inode = in_file->f_path.dentry->d_inode;
 	out_inode = out_file->f_path.dentry->d_inode;
 	retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
 	if (retval < 0)
diff --git a/fs/splice.c b/fs/splice.c
index e405cf552f5cb..3bd9cb21b38e9 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -507,9 +507,116 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos,
 
 	return ret;
 }
-
 EXPORT_SYMBOL(generic_file_splice_read);
 
+static const struct pipe_buf_operations default_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = generic_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t offset)
+{
+	mm_segment_t old_fs;
+	loff_t pos = offset;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos);
+	set_fs(old_fs);
+
+	return res;
+}
+
+ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
+				 struct pipe_inode_info *pipe, size_t len,
+				 unsigned int flags)
+{
+	unsigned int nr_pages;
+	unsigned int nr_freed;
+	size_t offset;
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct iovec vec[PIPE_BUFFERS];
+	pgoff_t index;
+	ssize_t res;
+	size_t this_len;
+	int error;
+	int i;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.flags = flags,
+		.ops = &default_pipe_buf_ops,
+		.spd_release = spd_release_page,
+	};
+
+	index = *ppos >> PAGE_CACHE_SHIFT;
+	offset = *ppos & ~PAGE_CACHE_MASK;
+	nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
+		struct page *page;
+
+		page = alloc_page(GFP_HIGHUSER);
+		error = -ENOMEM;
+		if (!page)
+			goto err;
+
+		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
+		vec[i].iov_base = (void __user *) kmap(page);
+		vec[i].iov_len = this_len;
+		pages[i] = page;
+		spd.nr_pages++;
+		len -= this_len;
+		offset = 0;
+	}
+
+	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
+	if (res < 0)
+		goto err;
+
+	error = 0;
+	if (!res)
+		goto err;
+
+	nr_freed = 0;
+	for (i = 0; i < spd.nr_pages; i++) {
+		kunmap(pages[i]);
+		this_len = min_t(size_t, vec[i].iov_len, res);
+		partial[i].offset = 0;
+		partial[i].len = this_len;
+		if (!this_len) {
+			__free_page(pages[i]);
+			pages[i] = NULL;
+			nr_freed++;
+		}
+		res -= this_len;
+	}
+	spd.nr_pages -= nr_freed;
+
+	res = splice_to_pipe(pipe, &spd);
+	if (res > 0)
+		*ppos += res;
+
+	return res;
+
+err:
+	for (i = 0; i < spd.nr_pages; i++) {
+		kunmap(pages[i]);
+		__free_page(pages[i]);
+	}
+	return error;
+}
+EXPORT_SYMBOL(default_file_splice_read);
+
 /*
  * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos'
  * using sendpage(). Return the number of bytes sent.
@@ -933,11 +1040,10 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 			 struct pipe_inode_info *pipe, size_t len,
 			 unsigned int flags)
 {
+	ssize_t (*splice_read)(struct file *, loff_t *,
+			       struct pipe_inode_info *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!in->f_op || !in->f_op->splice_read))
-		return -EINVAL;
-
 	if (unlikely(!(in->f_mode & FMODE_READ)))
 		return -EBADF;
 
@@ -945,7 +1051,11 @@ static long do_splice_to(struct file *in, loff_t *ppos,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return in->f_op->splice_read(in, ppos, pipe, len, flags);
+	splice_read = in->f_op->splice_read;
+	if (!splice_read)
+		splice_read = default_file_splice_read;
+
+	return splice_read(in, ppos, pipe, len, flags);
 }
 
 /**
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5bed436f43539..d926c2bea1662 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2204,6 +2204,8 @@ extern int generic_segment_checks(const struct iovec *iov,
 /* fs/splice.c */
 extern ssize_t generic_file_splice_read(struct file *, loff_t *,
 		struct pipe_inode_info *, size_t, unsigned int);
+extern ssize_t default_file_splice_read(struct file *, loff_t *,
+		struct pipe_inode_info *, size_t, unsigned int);
 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h
index c8f038554e80d..b43a9e0390591 100644
--- a/include/linux/pipe_fs_i.h
+++ b/include/linux/pipe_fs_i.h
@@ -152,5 +152,6 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void
 void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *);
 int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *);
+void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *);
 
 #endif
-- 
GitLab


From 0b0a47f5c4a30b58432e20ae1706a27baea91a88 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <miklos@szeredi.hu>
Date: Thu, 7 May 2009 15:37:37 +0200
Subject: [PATCH 1040/2198] splice: implement default splice_write method

If f_op->splice_write() is not implemented, fall back to a plain write.
Use vfs_writev() to write from the pipe buffers.

This will allow splice on all filesystems and file types.  This
includes "direct_io" files in fuse which bypass the page cache.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 138 insertions(+), 4 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 3bd9cb21b38e9..eefd96b1d7fbf 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -535,6 +535,21 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 	return res;
 }
 
+static ssize_t kernel_writev(struct file *file, const struct iovec *vec,
+			    unsigned long vlen, loff_t *ppos)
+{
+	mm_segment_t old_fs;
+	ssize_t res;
+
+	old_fs = get_fs();
+	set_fs(get_ds());
+	/* The cast to a user pointer is valid due to the set_fs() */
+	res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos);
+	set_fs(old_fs);
+
+	return res;
+}
+
 ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 				 struct pipe_inode_info *pipe, size_t len,
 				 unsigned int flags)
@@ -988,6 +1003,122 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
+static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n)
+{
+	return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS];
+}
+
+static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out, loff_t *ppos,
+					 size_t len, unsigned int flags)
+{
+	ssize_t ret = 0;
+	ssize_t total_len = 0;
+	int do_wakeup = 0;
+
+	pipe_lock(pipe);
+	while (len) {
+		struct pipe_buffer *buf;
+		void *data[PIPE_BUFFERS];
+		struct iovec vec[PIPE_BUFFERS];
+		unsigned int nr_pages = 0;
+		unsigned int write_len = 0;
+		unsigned int now_len = len;
+		unsigned int this_len;
+		int i;
+
+		BUG_ON(pipe->nrbufs > PIPE_BUFFERS);
+		for (i = 0; i < pipe->nrbufs && now_len; i++) {
+			buf = nth_pipe_buf(pipe, i);
+
+			ret = buf->ops->confirm(pipe, buf);
+			if (ret)
+				break;
+
+			data[i] = buf->ops->map(pipe, buf, 0);
+			this_len = min(buf->len, now_len);
+			vec[i].iov_base = (void __user *) data[i] + buf->offset;
+			vec[i].iov_len = this_len;
+			now_len -= this_len;
+			write_len += this_len;
+			nr_pages++;
+		}
+
+		if (nr_pages) {
+			ret = kernel_writev(out, vec, nr_pages, ppos);
+			if (ret == 0)
+				ret = -EIO;
+			if (ret > 0) {
+				len -= ret;
+				total_len += ret;
+			}
+		}
+
+		for (i = 0; i < nr_pages; i++) {
+			buf = nth_pipe_buf(pipe, i);
+			buf->ops->unmap(pipe, buf, data[i]);
+
+			if (ret > 0) {
+				this_len = min_t(unsigned, vec[i].iov_len, ret);
+				buf->offset += this_len;
+				buf->len -= this_len;
+				ret -= this_len;
+			}
+		}
+
+		if (ret < 0)
+			break;
+
+		while (pipe->nrbufs) {
+			const struct pipe_buf_operations *ops;
+
+			buf = nth_pipe_buf(pipe, 0);
+			if (buf->len)
+				break;
+
+			ops = buf->ops;
+			buf->ops = NULL;
+			ops->release(pipe, buf);
+			pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS;
+			pipe->nrbufs--;
+			if (pipe->inode)
+				do_wakeup = 1;
+		}
+
+		if (pipe->nrbufs)
+			continue;
+		if (!pipe->writers)
+			break;
+		if (!pipe->waiting_writers) {
+			if (total_len)
+				break;
+		}
+
+		if (flags & SPLICE_F_NONBLOCK) {
+			ret = -EAGAIN;
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		if (do_wakeup) {
+			wakeup_pipe_writers(pipe);
+			do_wakeup = 0;
+		}
+
+		pipe_wait(pipe);
+	}
+	pipe_unlock(pipe);
+
+	if (do_wakeup)
+		wakeup_pipe_writers(pipe);
+
+	return total_len ? total_len : ret;
+}
+
 /**
  * generic_splice_sendpage - splice data from a pipe to a socket
  * @pipe:	pipe to splice from
@@ -1015,11 +1146,10 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 			   loff_t *ppos, size_t len, unsigned int flags)
 {
+	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
+				loff_t *, size_t, unsigned int);
 	int ret;
 
-	if (unlikely(!out->f_op || !out->f_op->splice_write))
-		return -EINVAL;
-
 	if (unlikely(!(out->f_mode & FMODE_WRITE)))
 		return -EBADF;
 
@@ -1030,7 +1160,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 	if (unlikely(ret < 0))
 		return ret;
 
-	return out->f_op->splice_write(pipe, out, ppos, len, flags);
+	splice_write = out->f_op->splice_write;
+	if (!splice_write)
+		splice_write = default_file_splice_write;
+
+	return splice_write(pipe, out, ppos, len, flags);
 }
 
 /*
-- 
GitLab


From 97a52714658cd959a3cfa35c5b6f489859f0204b Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Fri, 8 May 2009 18:23:50 +0200
Subject: [PATCH 1041/2198] x86: display extended apic registers with
 print_local_APIC and cpu_debug code

Both print_local_APIC (used when apic=debug kernel param is set) and
cpu_debug code missed support for some extended APIC registers that
I'd like to see.

This adds support to show:

 - extended APIC feature register
 - extended APIC control register
 - extended LVT registers

[ Impact: print more debug info ]

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090508162350.GO29045@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apicdef.h  |  8 ++++----
 arch/x86/kernel/apic/apic.c     |  2 +-
 arch/x86/kernel/apic/io_apic.c  | 14 +++++++++++++-
 arch/x86/kernel/cpu/cpu_debug.c | 14 +++++++++++++-
 4 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index bc9514fb3b13f..7ddb36ab933b9 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -22,6 +22,7 @@
 #  define	APIC_INTEGRATED(x)	(1)
 #endif
 #define		APIC_XAPIC(x)		((x) >= 0x14)
+#define		APIC_EXT_SPACE(x)	((x) & 0x80000000)
 #define	APIC_TASKPRI	0x80
 #define		APIC_TPRI_MASK		0xFFu
 #define	APIC_ARBPRI	0x90
@@ -116,7 +117,9 @@
 #define		APIC_TDR_DIV_32		0x8
 #define		APIC_TDR_DIV_64		0x9
 #define		APIC_TDR_DIV_128	0xA
-#define	APIC_EILVT0     0x500
+#define	APIC_EFEAT	0x400
+#define	APIC_ECTRL	0x410
+#define APIC_EILVTn(n)	(0x500 + 0x10 * n)
 #define		APIC_EILVT_NR_AMD_K8	1	/* # of extended interrupts */
 #define		APIC_EILVT_NR_AMD_10H	4
 #define		APIC_EILVT_LVTOFF(x)	(((x) >> 4) & 0xF)
@@ -125,9 +128,6 @@
 #define		APIC_EILVT_MSG_NMI	0x4
 #define		APIC_EILVT_MSG_EXT	0x7
 #define		APIC_EILVT_MASKED	(1 << 16)
-#define	APIC_EILVT1     0x510
-#define	APIC_EILVT2     0x520
-#define	APIC_EILVT3     0x530
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 #define APIC_BASE_MSR	0x800
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 1ee966f4ae956..0e6543fafb507 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -395,7 +395,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 
 static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
 {
-	unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+	unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
 	unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
 
 	apic_write(reg, v);
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 2afe145d277f0..65b824c9c4fc1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1739,7 +1739,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
 
 __apicdebuginit(void) print_local_APIC(void *dummy)
 {
-	unsigned int v, ver, maxlvt;
+	unsigned int i, v, ver, maxlvt;
 	u64 icr;
 
 	if (apic_verbosity == APIC_QUIET)
@@ -1827,6 +1827,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
 	printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
 	v = apic_read(APIC_TDCR);
 	printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+		v = apic_read(APIC_EFEAT);
+		maxlvt = (v >> 16) & 0xff;
+		printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+		v = apic_read(APIC_ECTRL);
+		printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+		for (i = 0; i < maxlvt; i++) {
+			v = apic_read(APIC_EILVTn(i));
+			printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+		}
+	}
 	printk("\n");
 }
 
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6ac..2fc4f6bb9ca51 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -588,8 +588,20 @@ static void print_apic(void *arg)
 	seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
 	seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
 	seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+	if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+		unsigned int i, v, maxeilvt;
+
+		v = apic_read(APIC_EFEAT);
+		maxeilvt = (v >> 16) & 0xff;
+		seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+		seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
 
+		for (i = 0; i < maxeilvt; i++) {
+			v = apic_read(APIC_EILVTn(i));
+			seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+		}
+	}
+#endif /* CONFIG_X86_LOCAL_APIC */
 	seq_printf(seq, "\n MSR\t:\n");
 }
 
-- 
GitLab


From 3d6ad460214cc72b93333f51f498441a56d622e9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 09:01:08 +0000
Subject: [PATCH 1042/2198] sh: multiple vectors per irq - sh7760

Update intc tables and platform data to use one linux irq
per maskable interrupt source instead of keeping the one-to-one
mapping between vectors and linux irqs.

This fixes potential irq masking issues for sh7760 hardware
blocks such as DMAC/TMU2/REF.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/dma/Kconfig           |  3 ++-
 arch/sh/kernel/cpu/sh4/setup-sh7760.c | 31 ++++++++++-----------------
 2 files changed, 13 insertions(+), 21 deletions(-)

diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig
index 666713ac5fcff..63e9dd30b41c0 100644
--- a/arch/sh/drivers/dma/Kconfig
+++ b/arch/sh/drivers/dma/Kconfig
@@ -16,7 +16,8 @@ config SH_DMA_IRQ_MULTI
 		     CPU_SUBTYPE_SH7750S || CPU_SUBTYPE_SH7750R || \
 		     CPU_SUBTYPE_SH7751R || CPU_SUBTYPE_SH7091  || \
 		     CPU_SUBTYPE_SH7763  || CPU_SUBTYPE_SH7764  || \
-		     CPU_SUBTYPE_SH7780  || CPU_SUBTYPE_SH7785
+		     CPU_SUBTYPE_SH7780  || CPU_SUBTYPE_SH7785  || \
+		     CPU_SUBTYPE_SH7760
 
 config NR_ONCHIP_DMA_CHANNELS
 	int
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index 4a6fd0d3c7bca..cd097335758f1 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -19,10 +19,7 @@ enum {
 
 	/* interrupt sources */
 	IRL0, IRL1, IRL2, IRL3,
-	HUDI, GPIOI,
-	DMAC_DMTE0, DMAC_DMTE1, DMAC_DMTE2, DMAC_DMTE3,
-	DMAC_DMTE4, DMAC_DMTE5, DMAC_DMTE6, DMAC_DMTE7,
-	DMAC_DMAE,
+	HUDI, GPIOI, DMAC,
 	IRQ4, IRQ5, IRQ6, IRQ7,
 	HCAN20, HCAN21,
 	SSI0, SSI1,
@@ -37,21 +34,20 @@ enum {
 	HSPI,
 	MMCIF0, MMCIF1, MMCIF2, MMCIF3,
 	MFI, ADC, CMT,
-	TMU0, TMU1, TMU2_TUNI, TMU2_TICPI,
-	WDT,
-	REF_RCMI, REF_ROVI,
+	TMU0, TMU1, TMU2,
+	WDT, REF,
 
 	/* interrupt groups */
-	DMAC, DMABRG, SCIF0, SCIF1, SCIF2, SIM, MMCIF, TMU2, REF,
+	DMABRG, SCIF0, SCIF1, SCIF2, SIM, MMCIF,
 };
 
 static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(HUDI, 0x600), INTC_VECT(GPIOI, 0x620),
-	INTC_VECT(DMAC_DMTE0, 0x640), INTC_VECT(DMAC_DMTE1, 0x660),
-	INTC_VECT(DMAC_DMTE2, 0x680), INTC_VECT(DMAC_DMTE3, 0x6a0),
-	INTC_VECT(DMAC_DMTE4, 0x780), INTC_VECT(DMAC_DMTE5, 0x7a0),
-	INTC_VECT(DMAC_DMTE6, 0x7c0), INTC_VECT(DMAC_DMTE7, 0x7e0),
-	INTC_VECT(DMAC_DMAE, 0x6c0),
+	INTC_VECT(DMAC, 0x640), INTC_VECT(DMAC, 0x660),
+	INTC_VECT(DMAC, 0x680), INTC_VECT(DMAC, 0x6a0),
+	INTC_VECT(DMAC, 0x780), INTC_VECT(DMAC, 0x7a0),
+	INTC_VECT(DMAC, 0x7c0), INTC_VECT(DMAC, 0x7e0),
+	INTC_VECT(DMAC, 0x6c0),
 	INTC_VECT(IRQ4, 0x800), INTC_VECT(IRQ5, 0x820),
 	INTC_VECT(IRQ6, 0x840), INTC_VECT(IRQ6, 0x860),
 	INTC_VECT(HCAN20, 0x900), INTC_VECT(HCAN21, 0x920),
@@ -75,23 +71,18 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(MFI, 0xe80), /* 0xf80 according to data sheet */
 	INTC_VECT(ADC, 0xf80), INTC_VECT(CMT, 0xfa0),
 	INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420),
-	INTC_VECT(TMU2_TUNI, 0x440), INTC_VECT(TMU2_TICPI, 0x460),
+	INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2, 0x460),
 	INTC_VECT(WDT, 0x560),
-	INTC_VECT(REF_RCMI, 0x580), INTC_VECT(REF_ROVI, 0x5a0),
+	INTC_VECT(REF, 0x580), INTC_VECT(REF, 0x5a0),
 };
 
 static struct intc_group groups[] __initdata = {
-	INTC_GROUP(DMAC, DMAC_DMTE0, DMAC_DMTE1, DMAC_DMTE2,
-		   DMAC_DMTE3, DMAC_DMTE4, DMAC_DMTE5,
-		   DMAC_DMTE6, DMAC_DMTE7, DMAC_DMAE),
 	INTC_GROUP(DMABRG, DMABRG0, DMABRG1, DMABRG2),
 	INTC_GROUP(SCIF0, SCIF0_ERI, SCIF0_RXI, SCIF0_BRI, SCIF0_TXI),
 	INTC_GROUP(SCIF1, SCIF1_ERI, SCIF1_RXI, SCIF1_BRI, SCIF1_TXI),
 	INTC_GROUP(SCIF2, SCIF2_ERI, SCIF2_RXI, SCIF2_BRI, SCIF2_TXI),
 	INTC_GROUP(SIM, SIM_ERI, SIM_RXI, SIM_TXI, SIM_TEI),
 	INTC_GROUP(MMCIF, MMCIF0, MMCIF1, MMCIF2, MMCIF3),
-	INTC_GROUP(TMU2, TMU2_TUNI, TMU2_TICPI),
-	INTC_GROUP(REF, REF_RCMI, REF_ROVI),
 };
 
 static struct intc_mask_reg mask_registers[] __initdata = {
-- 
GitLab


From 50e2d0d3b4ffc945a6c27f16d4e5e557e725ec67 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 11:34:16 +0000
Subject: [PATCH 1043/2198] sh: r7780 highlander clock fixes

Update the r7780 highlander defconfig to fix
PCLK value, while at it fix cmdline on r7785.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/configs/r7780mp_defconfig | 2 +-
 arch/sh/configs/r7785rp_defconfig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig
index 68e5eff5d7fe6..943da63a38523 100644
--- a/arch/sh/configs/r7780mp_defconfig
+++ b/arch/sh/configs/r7780mp_defconfig
@@ -260,7 +260,7 @@ CONFIG_SH_R7780MP=y
 #
 CONFIG_SH_TMU=y
 CONFIG_SH_TIMER_IRQ=28
-CONFIG_SH_PCLK_FREQ=32000000
+CONFIG_SH_PCLK_FREQ=33333333
 # CONFIG_NO_HZ is not set
 # CONFIG_HIGH_RES_TIMERS is not set
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig
index 5e677a8e3a366..82658f6723980 100644
--- a/arch/sh/configs/r7785rp_defconfig
+++ b/arch/sh/configs/r7785rp_defconfig
@@ -333,7 +333,7 @@ CONFIG_GUSA=y
 CONFIG_ZERO_PAGE_OFFSET=0x00001000
 CONFIG_BOOT_LINK_OFFSET=0x00800000
 CONFIG_CMDLINE_BOOL=y
-CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1 mode4_pin=high"
+CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1"
 
 #
 # Bus options
-- 
GitLab


From ccc195655fb25d7d967b278c4a4725dc5e7a6bf4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 11 May 2009 11:37:16 +0000
Subject: [PATCH 1044/2198] sh: TMU platform data for sh7780

This patch adds TMU platform data for sh7780. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7780.c | 204 +++++++++++++++++++++++++
 1 file changed, 204 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
index 6f7227cd65bff..f1df020950629 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
@@ -12,6 +12,189 @@
 #include <linux/serial.h>
 #include <linux/io.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
+
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 28,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 29,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 30,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffdc0008,
+		.end	= 0xffdc0013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 96,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffdc0014,
+		.end	= 0xffdc001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 97,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffdc0020,
+		.end	= 0xffdc002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 98,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
 
 static struct resource rtc_resources[] = {
 	[0] = {
@@ -58,6 +241,12 @@ static struct platform_device sci_device = {
 };
 
 static struct platform_device *sh7780_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&rtc_device,
 	&sci_device,
 };
@@ -69,6 +258,21 @@ static int __init sh7780_devices_setup(void)
 }
 __initcall(sh7780_devices_setup);
 
+static struct platform_device *sh7780_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7780_early_devices,
+				   ARRAY_SIZE(sh7780_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From cec6be6d1069d697beb490bbb40a290d5ff554a2 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@gmail.com>
Date: Mon, 11 May 2009 17:41:40 +0400
Subject: [PATCH 1045/2198] x86: apic: Fixmap apic address even if apic
 disabled

In case if apic were disabled by boot option
we still need read_apic operation. So fixmap
a fake apic area if needed.

[ Impact: fix boot crash ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: yinghai@kernel.org
Cc: eswierk@aristanetworks.com
LKML-Reference: <20090511134140.GH4624@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 0e6543fafb507..07cffc1214cbd 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1587,13 +1587,6 @@ void __init init_apic_mappings(void)
 	} else
 		apic_phys = mp_lapic_addr;
 
-	/* lets check if we may NOP'ify apic operations */
-	if (!cpu_has_apic) {
-		pr_info("APIC: disable apic facility\n");
-		apic_disable();
-		return;
-	}
-
 	/*
 	 * acpi lapic path already maps that address in
 	 * acpi_register_lapic_address()
@@ -1602,7 +1595,15 @@ void __init init_apic_mappings(void)
 		set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
 	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
-				APIC_BASE, apic_phys);
+			APIC_BASE, apic_phys);
+
+	/* lets check if we may NOP'ify apic operations */
+	if (!cpu_has_apic) {
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+		return;
+	}
+
 	/*
 	 * Fetch the APIC ID of the BSP in case we have a
 	 * default configuration (or the MP table is broken).
-- 
GitLab


From d756f4adb9d8a86e347a2d5435bb5cc95744733e Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 4 May 2009 03:29:52 +0400
Subject: [PATCH 1046/2198] x86, 32-bit: ifdef out struct thread_struct::fs

After commit 464d1a78fbf8cf6c7fd970e7b3e2db50a320ce28 aka
"[PATCH] i386: Convert i386 PDA code to use %fs"
%fs saved during context switch moved from thread_struct to pt_regs
and value on thread_struct became unused.

[ Impact: reduce thread_struct size on 32-bit ]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: containers@lists.linux-foundation.org
LKML-Reference: <20090503232952.GI16631@x200.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c2cceae709c86..a6732ff7b0163 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -428,7 +428,9 @@ struct thread_struct {
 	unsigned short		gsindex;
 #endif
 	unsigned long		ip;
+#ifdef CONFIG_X86_64
 	unsigned long		fs;
+#endif
 	unsigned long		gs;
 	/* Hardware debugging registers: */
 	unsigned long		debugreg0;
@@ -874,7 +876,6 @@ static inline void spin_lock_prefetch(const void *x)
 	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
-	.fs			= __KERNEL_PERCPU,			  \
 }
 
 /*
-- 
GitLab


From 0c23590f00f85467b318ad0c20c36796a5bd4c60 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 4 May 2009 03:30:15 +0400
Subject: [PATCH 1047/2198] x86, 64-bit: ifdef out struct thread_struct::ip

struct thread_struct::ip isn't used on x86_64, struct pt_regs::ip is used
instead.

kgdb should be reading 0 always, but I can't check it.

[ Impact: (potentially) reduce thread_struct size on 64-bit ]

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: containers@lists.linux-foundation.org
LKML-Reference: <20090503233015.GJ16631@x200.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h | 2 ++
 arch/x86/kernel/kgdb.c           | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a6732ff7b0163..a9ba7436821e4 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -427,7 +427,9 @@ struct thread_struct {
 	unsigned short		fsindex;
 	unsigned short		gsindex;
 #endif
+#ifdef CONFIG_X86_32
 	unsigned long		ip;
+#endif
 #ifdef CONFIG_X86_64
 	unsigned long		fs;
 #endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index eedfaebe1063d..d07706f1976a6 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -141,7 +141,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
 	gdb_regs32[GDB_PS]	= *(unsigned long *)(p->thread.sp + 8);
 	gdb_regs32[GDB_CS]	= __KERNEL_CS;
 	gdb_regs32[GDB_SS]	= __KERNEL_DS;
-	gdb_regs[GDB_PC]	= p->thread.ip;
+	gdb_regs[GDB_PC]	= 0;
 	gdb_regs[GDB_R8]	= 0;
 	gdb_regs[GDB_R9]	= 0;
 	gdb_regs[GDB_R10]	= 0;
-- 
GitLab


From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 10:56:33 -0400
Subject: [PATCH 1048/2198] ring-buffer: replace constants with time macros in
 ring-buffer-benchmark

The use of numeric constants is discouraged. It is cleaner and more
descriptive to use macros for constant time conversions.

This patch also removes an extra new line.

[ Impact: more descriptive time conversions ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index a21aa7b3d05e0..7d3aef93c49f0 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -253,7 +253,7 @@ static void ring_buffer_producer(void)
 	}
 
 	time = end_tv.tv_sec - start_tv.tv_sec;
-	time *= 1000000;
+	time *= USEC_PER_SEC;
 	time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
 
 	entries = ring_buffer_entries(buffer);
@@ -273,7 +273,8 @@ static void ring_buffer_producer(void)
 	pr_info("Missed:   %ld\n", missed);
 	pr_info("Hit:      %ld\n", hit);
 
-	do_div(time, 1000);
+	/* Convert time from usecs to millisecs */
+	do_div(time, USEC_PER_MSEC);
 	if (time)
 		hit /= (long)time;
 	else
@@ -282,18 +283,19 @@ static void ring_buffer_producer(void)
 	pr_info("Entries per millisec: %ld\n", hit);
 
 	if (hit) {
-		avg = 1000000 / hit;
+		/* Calculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / hit;
 		pr_info("%ld ns per entry\n", avg);
 	}
 
-
 	if (missed) {
 		if (time)
 			missed /= (long)time;
 
 		pr_info("Total iterations per millisec: %ld\n", hit + missed);
 
-		avg = 1000000 / (hit + missed);
+		/* Caculate the average time in nanosecs */
+		avg = NSEC_PER_MSEC / (hit + missed);
 		pr_info("%ld ns per entry\n", avg);
 	}
 }
-- 
GitLab


From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 8 May 2009 11:03:57 -0400
Subject: [PATCH 1049/2198] ring-buffer: check for divide by zero in
 ring-buffer-benchmark

Although we check if "missed" is not zero, we divide by hit + missed,
and the addition can possible overflow and become a divide by zero.

This patch checks for this case, and will report it when it happens
then modify "hit" to make the calculation be non zero.

[ Impact: prevent possible divide by zero in ring-buffer-benchmark ]

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer_benchmark.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 7d3aef93c49f0..8d68e149a8b3d 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -294,6 +294,12 @@ static void ring_buffer_producer(void)
 
 		pr_info("Total iterations per millisec: %ld\n", hit + missed);
 
+		/* it is possible that hit + missed will overflow and be zero */
+		if (!(hit + missed)) {
+			pr_info("hit + missed overflowed and totalled zero!\n");
+			hit--; /* make it non zero */
+		}
+
 		/* Caculate the average time in nanosecs */
 		avg = NSEC_PER_MSEC / (hit + missed);
 		pr_info("%ld ns per entry\n", avg);
-- 
GitLab


From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 14:08:09 -0400
Subject: [PATCH 1050/2198] ring-buffer: remove type parameter from
 rb_reserve_next_event

The rb_reserve_next_event is only called for the data type (type = 0).
There is no reason to pass in the type to the function.

Before:
   text    data     bss     dec     hex filename
  16554      24      12   16590    40ce kernel/trace/ring_buffer.o

After:
   text    data     bss     dec     hex filename
  16538      24      12   16574    40be kernel/trace/ring_buffer.o

[ Impact: cleaner, smaller and slightly more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 361170609bd04..fe40f6c3507c3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
-		      unsigned type, unsigned long length)
+		      unsigned long length)
 {
 	struct ring_buffer_event *event;
 	u64 ts, delta;
@@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		/* Non commits have zero deltas */
 		delta = 0;
 
-	event = __rb_reserve_next(cpu_buffer, type, length, &ts);
+	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
 	if (PTR_ERR(event) == -EAGAIN)
 		goto again;
 
@@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (length > BUF_PAGE_SIZE)
 		goto out;
 
-	event = rb_reserve_next_event(cpu_buffer, 0, length);
+	event = rb_reserve_next_event(cpu_buffer, length);
 	if (!event)
 		goto out;
 
@@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 		goto out;
 
 	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer, 0, event_length);
+	event = rb_reserve_next_event(cpu_buffer, event_length);
 	if (!event)
 		goto out;
 
-- 
GitLab


From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 14:42:53 -0400
Subject: [PATCH 1051/2198] ring-buffer: move calculation of event length

The event length is calculated and passed in to rb_reserve_next_event
in two different locations. Having rb_reserve_next_event do the
calculations directly makes only one location to do the change and
causes the calculation to be inlined by gcc.

Before:
   text    data     bss     dec     hex filename
  16538      24      12   16574    40be kernel/trace/ring_buffer.o

After:
   text    data     bss     dec     hex filename
  16490      24      12   16526    408e kernel/trace/ring_buffer.o

[ Impact: smaller more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fe40f6c3507c3..493cba46abc9b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	int commit = 0;
 	int nr_loops = 0;
 
+	length = rb_calculate_event_length(length);
  again:
 	/*
 	 * We allow for interrupts to reenter here and do a trace.
@@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	length = rb_calculate_event_length(length);
-	if (length > BUF_PAGE_SIZE)
+	if (length > BUF_MAX_DATA_SIZE)
 		goto out;
 
 	event = rb_reserve_next_event(cpu_buffer, length);
@@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
-	unsigned long event_length;
 	void *body;
 	int ret = -EBUSY;
 	int cpu, resched;
@@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
 	if (atomic_read(&cpu_buffer->record_disabled))
 		goto out;
 
-	event_length = rb_calculate_event_length(length);
-	event = rb_reserve_next_event(cpu_buffer, event_length);
+	if (length > BUF_MAX_DATA_SIZE)
+		goto out;
+
+	event = rb_reserve_next_event(cpu_buffer, length);
 	if (!event)
 		goto out;
 
-- 
GitLab


From b68d8201433a91cabbcbeae48b53d8c1c426433a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 03:45:08 +0900
Subject: [PATCH 1052/2198] sh: clkfwk: Make recalc return an unsigned long.

This is prep work for cleaning up some of the rate propagation bits.
Trivial conversion.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |  3 ++-
 arch/sh/kernel/cpu/clock.c             | 35 ++++++++++++++------------
 arch/sh/kernel/cpu/sh2/clock-sh7619.c  | 13 +++++-----
 arch/sh/kernel/cpu/sh2a/clock-sh7201.c | 14 +++++------
 arch/sh/kernel/cpu/sh2a/clock-sh7203.c | 12 ++++-----
 arch/sh/kernel/cpu/sh2a/clock-sh7206.c | 12 ++++-----
 arch/sh/kernel/cpu/sh3/clock-sh3.c     | 12 ++++-----
 arch/sh/kernel/cpu/sh3/clock-sh7705.c  | 12 ++++-----
 arch/sh/kernel/cpu/sh3/clock-sh7706.c  | 12 ++++-----
 arch/sh/kernel/cpu/sh3/clock-sh7709.c  | 12 ++++-----
 arch/sh/kernel/cpu/sh3/clock-sh7710.c  | 12 ++++-----
 arch/sh/kernel/cpu/sh3/clock-sh7712.c  |  8 +++---
 arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 14 +++++------
 arch/sh/kernel/cpu/sh4/clock-sh4.c     | 12 ++++-----
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 31 ++++++++++-------------
 arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 16 ++++++------
 arch/sh/kernel/cpu/sh4a/clock-sh7770.c | 12 ++++-----
 arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 16 ++++++------
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 24 +++++++++---------
 arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 20 +++++++--------
 arch/sh/kernel/cpu/sh4a/clock-shx3.c   | 16 ++++++------
 arch/sh/kernel/cpu/sh5/clock-sh5.c     | 12 ++++-----
 arch/sh/kernel/timers/timer-tmu.c      | 16 ++++++------
 23 files changed, 171 insertions(+), 175 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index b1f29199e4bd2..d63352b375cca 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -12,7 +12,7 @@ struct clk_ops {
 	void (*init)(struct clk *clk);
 	void (*enable)(struct clk *clk);
 	void (*disable)(struct clk *clk);
-	void (*recalc)(struct clk *clk);
+	unsigned long (*recalc)(struct clk *clk);
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
 	int (*set_parent)(struct clk *clk, struct clk *parent);
 	long (*round_rate)(struct clk *clk, unsigned long rate);
@@ -96,4 +96,5 @@ enum clk_sh_algo_id {
 
 	IP_N1,
 };
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 133dbe4033412..b022affb44cdb 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -75,6 +75,7 @@ static struct clk *onchip_clocks[] = {
 	&cpu_clk,
 };
 
+/* Propagate rate to children */
 static void propagate_rate(struct clk *clk)
 {
 	struct clk *clkp;
@@ -83,7 +84,7 @@ static void propagate_rate(struct clk *clk)
 		if (likely(clkp->parent != clk))
 			continue;
 		if (likely(clkp->ops && clkp->ops->recalc))
-			clkp->ops->recalc(clkp);
+			clkp->rate = clkp->ops->recalc(clkp);
 		if (unlikely(clkp->flags & CLK_RATE_PROPAGATES))
 			propagate_rate(clkp);
 	}
@@ -240,7 +241,7 @@ void clk_recalc_rate(struct clk *clk)
 		unsigned long flags;
 
 		spin_lock_irqsave(&clock_lock, flags);
-		clk->ops->recalc(clk);
+		clk->rate = clk->ops->recalc(clk);
 		spin_unlock_irqrestore(&clock_lock, flags);
 	}
 
@@ -377,20 +378,22 @@ static int clks_sysdev_suspend(struct sys_device *dev, pm_message_t state)
 	switch (state.event) {
 	case PM_EVENT_ON:
 		/* Resumeing from hibernation */
-		if (prev_state.event == PM_EVENT_FREEZE) {
-			list_for_each_entry(clkp, &clock_list, node)
-				if (likely(clkp->ops)) {
-					unsigned long rate = clkp->rate;
-
-					if (likely(clkp->ops->set_parent))
-						clkp->ops->set_parent(clkp,
-							clkp->parent);
-					if (likely(clkp->ops->set_rate))
-						clkp->ops->set_rate(clkp,
-							rate, NO_CHANGE);
-					else if (likely(clkp->ops->recalc))
-						clkp->ops->recalc(clkp);
-					}
+		if (prev_state.event != PM_EVENT_FREEZE)
+			break;
+
+		list_for_each_entry(clkp, &clock_list, node) {
+			if (likely(clkp->ops)) {
+				unsigned long rate = clkp->rate;
+
+				if (likely(clkp->ops->set_parent))
+					clkp->ops->set_parent(clkp,
+						clkp->parent);
+				if (likely(clkp->ops->set_rate))
+					clkp->ops->set_rate(clkp,
+						rate, NO_CHANGE);
+				else if (likely(clkp->ops->recalc))
+					clkp->rate = clkp->ops->recalc(clkp);
+			}
 		}
 		break;
 	case PM_EVENT_FREEZE:
diff --git a/arch/sh/kernel/cpu/sh2/clock-sh7619.c b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
index d2c1579179998..26799139aa7a3 100644
--- a/arch/sh/kernel/cpu/sh2/clock-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
@@ -38,28 +38,28 @@ static struct clk_ops sh7619_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7619_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 7];
+	return clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 7];
 }
 
 static struct clk_ops sh7619_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7619_cpu_clk_ops = {
@@ -78,4 +78,3 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 	if (idx < ARRAY_SIZE(sh7619_clk_ops))
 		*ops = sh7619_clk_ops[idx];
 }
-
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c
index 4a5e59732334c..7814c76159a72 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c
@@ -34,37 +34,37 @@ static const int pfc_divisors[]={1,2,3,4,6,8,12};
 
 static void master_clk_init(struct clk *clk)
 {
-	clk->rate = 10000000 * PLL2 * pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
+	return 10000000 * PLL2 * pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
 }
 
 static struct clk_ops sh7201_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7201_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7201_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inw(FREQCR) >> 4) & 0x0007);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7201_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
index fb781329848af..f8c6933857b3d 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
@@ -46,29 +46,29 @@ static struct clk_ops sh7203_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7203_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx-2];
+	return clk->parent->rate / pfc_divisors[idx-2];
 }
 
 static struct clk_ops sh7203_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7203_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7206.c b/arch/sh/kernel/cpu/sh2a/clock-sh7206.c
index 82d7f991ef6bd..c2268bdeceeb8 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7206.c
@@ -41,29 +41,29 @@ static struct clk_ops sh7206_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7206_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
+	return clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007];
 }
 
 static struct clk_ops sh7206_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FREQCR) & 0x0007);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7206_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh3.c b/arch/sh/kernel/cpu/sh3/clock-sh3.c
index c3c945958baf6..27b8738f0b098 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh3.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh3.c
@@ -38,36 +38,36 @@ static struct clk_ops sh3_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003);
 
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh3_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4);
 
-	clk->rate = clk->parent->rate / stc_multipliers[idx];
+	return clk->parent->rate / stc_multipliers[idx];
 }
 
 static struct clk_ops sh3_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2);
 
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh3_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7705.c b/arch/sh/kernel/cpu/sh3/clock-sh7705.c
index dfdbf3277fd7d..0ca8f2c3646c5 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7705.c
@@ -39,30 +39,30 @@ static struct clk_ops sh7705_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ctrl_inw(FRQCR) & 0x0003;
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7705_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0300) >> 8;
-	clk->rate = clk->parent->rate / stc_multipliers[idx];
+	return clk->parent->rate / stc_multipliers[idx];
 }
 
 static struct clk_ops sh7705_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0030) >> 4;
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7705_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7706.c b/arch/sh/kernel/cpu/sh3/clock-sh7706.c
index 0cf96f9833bcb..4bf7887d310af 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7706.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7706.c
@@ -34,36 +34,36 @@ static struct clk_ops sh7706_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003);
 
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7706_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4);
 
-	clk->rate = clk->parent->rate / stc_multipliers[idx];
+	return clk->parent->rate / stc_multipliers[idx];
 }
 
 static struct clk_ops sh7706_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2);
 
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7706_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7709.c b/arch/sh/kernel/cpu/sh3/clock-sh7709.c
index b791a29fdb624..fa30b60177304 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7709.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7709.c
@@ -41,12 +41,12 @@ static struct clk_ops sh7709_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003);
 
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7709_module_clk_ops = {
@@ -56,25 +56,25 @@ static struct clk_ops sh7709_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = (frqcr & 0x0080) ?
 		((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4) : 1;
 
-	clk->rate = clk->parent->rate * stc_multipliers[idx];
+	return clk->parent->rate * stc_multipliers[idx];
 }
 
 static struct clk_ops sh7709_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2);
 
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7709_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7710.c b/arch/sh/kernel/cpu/sh3/clock-sh7710.c
index 4744c50ec449c..030a58ba18a50 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7710.c
@@ -33,30 +33,30 @@ static struct clk_ops sh7710_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0007);
-	clk->rate = clk->parent->rate / md_table[idx];
+	return clk->parent->rate / md_table[idx];
 }
 
 static struct clk_ops sh7710_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0700) >> 8;
-	clk->rate = clk->parent->rate / md_table[idx];
+	return clk->parent->rate / md_table[idx];
 }
 
 static struct clk_ops sh7710_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0070) >> 4;
-	clk->rate = clk->parent->rate / md_table[idx];
+	return clk->parent->rate / md_table[idx];
 }
 
 static struct clk_ops sh7710_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7712.c b/arch/sh/kernel/cpu/sh3/clock-sh7712.c
index 54f54df51ef0e..6428ee6c77edc 100644
--- a/arch/sh/kernel/cpu/sh3/clock-sh7712.c
+++ b/arch/sh/kernel/cpu/sh3/clock-sh7712.c
@@ -33,24 +33,24 @@ static struct clk_ops sh7712_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = frqcr & 0x0007;
 
-	clk->rate = clk->parent->rate / divisors[idx];
+	return clk->parent->rate / divisors[idx];
 }
 
 static struct clk_ops sh7712_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int frqcr = ctrl_inw(FRQCR);
 	int idx = (frqcr & 0x0030) >> 4;
 
-	clk->rate = clk->parent->rate / divisors[idx];
+	return clk->parent->rate / divisors[idx];
 }
 
 static struct clk_ops sh7712_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index a33429463e96c..628d50ea6f6bf 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -21,10 +21,10 @@
 static int frqcr3_divisors[] = { 1, 2, 3, 4, 6, 8, 16 };
 static int frqcr3_values[]   = { 0, 1, 2, 3, 4, 5, 6  };
 
-static void emi_clk_recalc(struct clk *clk)
+static unsigned long emi_clk_recalc(struct clk *clk)
 {
 	int idx = ctrl_inl(CPG2_FRQCR3) & 0x0007;
-	clk->rate = clk->parent->rate / frqcr3_divisors[idx];
+	return clk->parent->rate / frqcr3_divisors[idx];
 }
 
 static inline int frqcr3_lookup(struct clk *clk, unsigned long rate)
@@ -50,10 +50,10 @@ static struct clk sh4202_emi_clk = {
 	.ops		= &sh4202_emi_clk_ops,
 };
 
-static void femi_clk_recalc(struct clk *clk)
+static unsigned long femi_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(CPG2_FRQCR3) >> 3) & 0x0007;
-	clk->rate = clk->parent->rate / frqcr3_divisors[idx];
+	return clk->parent->rate / frqcr3_divisors[idx];
 }
 
 static struct clk_ops sh4202_femi_clk_ops = {
@@ -90,10 +90,10 @@ static void shoc_clk_init(struct clk *clk)
 	WARN_ON(i == ARRAY_SIZE(frqcr3_divisors));	/* Undefined clock */
 }
 
-static void shoc_clk_recalc(struct clk *clk)
+static unsigned long shoc_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(CPG2_FRQCR3) >> 6) & 0x0007;
-	clk->rate = clk->parent->rate / frqcr3_divisors[idx];
+	return clk->parent->rate / frqcr3_divisors[idx];
 }
 
 static int shoc_clk_verify_rate(struct clk *clk, unsigned long rate)
@@ -127,7 +127,7 @@ static int shoc_clk_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 	frqcr3 |= tmp << 6;
 	ctrl_outl(frqcr3, CPG2_FRQCR3);
 
-	clk->rate = clk->parent->rate / frqcr3_divisors[tmp];
+	return clk->parent->rate / frqcr3_divisors[tmp];
 
 	return 0;
 }
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4.c b/arch/sh/kernel/cpu/sh4/clock-sh4.c
index dca9f87a12d68..73294d9cd0492 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4.c
@@ -35,30 +35,30 @@ static struct clk_ops sh4_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) & 0x0007);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh4_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) >> 3) & 0x0007;
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh4_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(FRQCR) >> 6) & 0x0007;
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh4_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 1ccdfc561fefa..5b1427f1ed41a 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -151,11 +151,11 @@ static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 };
 static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
 #endif
 
-static void master_clk_recalc(struct clk *clk)
+static unsigned long master_clk_recalc(struct clk *clk)
 {
 	unsigned frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
+	return CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
 }
 
 static void master_clk_init(struct clk *clk)
@@ -166,12 +166,11 @@ static void master_clk_init(struct clk *clk)
 	master_clk_recalc(clk);
 }
 
-
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	unsigned long frqcr = ctrl_inl(FRQCR);
 
-	clk->rate = clk->parent->rate / STCPLL(frqcr);
+	return clk->parent->rate / STCPLL(frqcr);
 }
 
 #if defined(CONFIG_CPU_SUBTYPE_SH7724)
@@ -283,14 +282,14 @@ static int sh7722_find_div_index(unsigned long parent_rate, unsigned rate)
 	return index;
 }
 
-static void sh7722_frqcr_recalc(struct clk *clk)
+static unsigned long sh7722_frqcr_recalc(struct clk *clk)
 {
 	struct frqcr_context ctx = sh7722_get_clk_context(clk->name);
 	unsigned long frqcr = ctrl_inl(FRQCR);
 	int index;
 
 	index = (frqcr >> ctx.shift) & ctx.mask;
-	clk->rate = clk->parent->rate * 2 / divisors2[index];
+	return clk->parent->rate * 2 / divisors2[index];
 }
 
 static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate,
@@ -439,11 +438,8 @@ static struct clk_ops sh7722_frqcr_clk_ops = {
 
 /*
  * clock ops methods for SIU A/B and IrDA clock
- *
  */
-
 #ifndef CONFIG_CPU_SUBTYPE_SH7343
-
 static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 {
 	unsigned long r;
@@ -458,12 +454,12 @@ static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 	return 0;
 }
 
-static void sh7722_siu_recalc(struct clk *clk)
+static unsigned long sh7722_siu_recalc(struct clk *clk)
 {
 	unsigned long r;
 
 	r = ctrl_inl(clk->arch_flags);
-	clk->rate = clk->parent->rate * 2 / divisors2[r & 0xF];
+	return clk->parent->rate * 2 / divisors2[r & 0xF];
 }
 
 static int sh7722_siu_start_stop(struct clk *clk, int enable)
@@ -525,12 +521,12 @@ static int sh7722_video_set_rate(struct clk *clk, unsigned long rate,
 	return 0;
 }
 
-static void sh7722_video_recalc(struct clk *clk)
+static unsigned long sh7722_video_recalc(struct clk *clk)
 {
 	unsigned long r;
 
 	r = ctrl_inl(VCLKCR);
-	clk->rate = clk->parent->rate / ((r & 0x3F) + 1);
+	return clk->parent->rate / ((r & 0x3F) + 1);
 }
 
 static struct clk_ops sh7722_video_clk_ops = {
@@ -627,7 +623,7 @@ static int sh7722_mstpcr_start_stop(struct clk *clk, int enable)
 		break;
 	default:
 		return -EINVAL;
-	}  
+	}
 
 	r = ctrl_inl(reg);
 
@@ -650,10 +646,9 @@ static void sh7722_mstpcr_disable(struct clk *clk)
 	sh7722_mstpcr_start_stop(clk, 0);
 }
 
-static void sh7722_mstpcr_recalc(struct clk *clk)
+static unsigned long sh7722_mstpcr_recalc(struct clk *clk)
 {
-	if (clk->parent)
-		clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7722_mstpcr_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 3177d0d1e06da..26630fb190c76 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -29,29 +29,29 @@ static struct clk_ops sh7763_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 4) & 0x07);
-	clk->rate = clk->parent->rate / p0fc_divisors[idx];
+	return clk->parent->rate / p0fc_divisors[idx];
 }
 
 static struct clk_ops sh7763_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 16) & 0x07);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7763_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
-	clk->rate = clk->parent->rate;
+	return clk->parent->rate;
 }
 
 static struct clk_ops sh7763_cpu_clk_ops = {
@@ -71,10 +71,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7763_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 20) & 0x07);
-	clk->rate = clk->parent->rate / cfc_divisors[idx];
+	return clk->parent->rate / cfc_divisors[idx];
 }
 
 static struct clk_ops sh7763_shyway_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
index 8e236062c7218..e0b896769205a 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c
@@ -28,30 +28,30 @@ static struct clk_ops sh7770_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 28) & 0x000f);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7770_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQCR) & 0x000f);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7770_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 24) & 0x000f);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7770_cpu_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 01f3da619d3d9..ba8dacc4ba238 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -29,30 +29,30 @@ static struct clk_ops sh7780_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQCR) & 0x0003);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7780_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 16) & 0x0007);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7780_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 24) & 0x0001);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7780_cpu_clk_ops = {
@@ -72,10 +72,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7780_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> 20) & 0x0007);
-	clk->rate = clk->parent->rate / cfc_divisors[idx];
+	return clk->parent->rate / cfc_divisors[idx];
 }
 
 static struct clk_ops sh7780_shyway_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 27fa81bef6a07..52691eaeb9ba3 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -33,30 +33,30 @@ static struct clk_ops sh7785_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQMR1) & 0x000f);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7785_cpu_clk_ops = {
@@ -76,10 +76,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7785_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003);
-	clk->rate = clk->parent->rate / sfc_divisors[idx];
+	return clk->parent->rate / sfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_shyway_clk_ops = {
@@ -92,10 +92,10 @@ static struct clk sh7785_shyway_clk = {
 	.ops		= &sh7785_shyway_clk_ops,
 };
 
-static void ddr_clk_recalc(struct clk *clk)
+static unsigned long ddr_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003);
-	clk->rate = clk->parent->rate / mfc_divisors[idx];
+	return clk->parent->rate / mfc_divisors[idx];
 }
 
 static struct clk_ops sh7785_ddr_clk_ops = {
@@ -108,10 +108,10 @@ static struct clk sh7785_ddr_clk = {
 	.ops		= &sh7785_ddr_clk_ops,
 };
 
-static void ram_clk_recalc(struct clk *clk)
+static unsigned long ram_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 24) & 0x0003);
-	clk->rate = clk->parent->rate / ufc_divisors[idx];
+	return clk->parent->rate / ufc_divisors[idx];
 }
 
 static struct clk_ops sh7785_ram_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index f84a9c1344710..2e00ff436c632 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -36,30 +36,30 @@ static struct clk_ops sh7786_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inl(FRQMR1) & 0x000f);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops sh7786_cpu_clk_ops = {
@@ -79,10 +79,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = sh7786_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003);
-	clk->rate = clk->parent->rate / sfc_divisors[idx];
+	return clk->parent->rate / sfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_shyway_clk_ops = {
@@ -95,10 +95,10 @@ static struct clk sh7786_shyway_clk = {
 	.ops		= &sh7786_shyway_clk_ops,
 };
 
-static void ddr_clk_recalc(struct clk *clk)
+static unsigned long ddr_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003);
-	clk->rate = clk->parent->rate / mfc_divisors[idx];
+	return clk->parent->rate / mfc_divisors[idx];
 }
 
 static struct clk_ops sh7786_ddr_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index c630b29e06a81..770934e682813 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -40,30 +40,30 @@ static struct clk_ops shx3_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> PFC_POS) & PFC_MSK);
-	clk->rate = clk->parent->rate / pfc_divisors[idx];
+	return clk->parent->rate / pfc_divisors[idx];
 }
 
 static struct clk_ops shx3_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> BFC_POS) & BFC_MSK);
-	clk->rate = clk->parent->rate / bfc_divisors[idx];
+	return clk->parent->rate / bfc_divisors[idx];
 }
 
 static struct clk_ops shx3_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> IFC_POS) & IFC_MSK);
-	clk->rate = clk->parent->rate / ifc_divisors[idx];
+	return clk->parent->rate / ifc_divisors[idx];
 }
 
 static struct clk_ops shx3_cpu_clk_ops = {
@@ -83,10 +83,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
 		*ops = shx3_clk_ops[idx];
 }
 
-static void shyway_clk_recalc(struct clk *clk)
+static unsigned long shyway_clk_recalc(struct clk *clk)
 {
 	int idx = ((ctrl_inl(FRQCR) >> CFC_POS) & CFC_MSK);
-	clk->rate = clk->parent->rate / cfc_divisors[idx];
+	return clk->parent->rate / cfc_divisors[idx];
 }
 
 static struct clk_ops shx3_shyway_clk_ops = {
diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c
index 5486324880e10..7f864ebc51d3b 100644
--- a/arch/sh/kernel/cpu/sh5/clock-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/clock-sh5.c
@@ -32,30 +32,30 @@ static struct clk_ops sh5_master_clk_ops = {
 	.init		= master_clk_init,
 };
 
-static void module_clk_recalc(struct clk *clk)
+static unsigned long module_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(cprc_base) >> 12) & 0x0007;
-	clk->rate = clk->parent->rate / ifc_table[idx];
+	return clk->parent->rate / ifc_table[idx];
 }
 
 static struct clk_ops sh5_module_clk_ops = {
 	.recalc		= module_clk_recalc,
 };
 
-static void bus_clk_recalc(struct clk *clk)
+static unsigned long bus_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(cprc_base) >> 3) & 0x0007;
-	clk->rate = clk->parent->rate / ifc_table[idx];
+	return clk->parent->rate / ifc_table[idx];
 }
 
 static struct clk_ops sh5_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static void cpu_clk_recalc(struct clk *clk)
+static unsigned long cpu_clk_recalc(struct clk *clk)
 {
 	int idx = (ctrl_inw(cprc_base) & 0x0007);
-	clk->rate = clk->parent->rate / ifc_table[idx];
+	return clk->parent->rate / ifc_table[idx];
 }
 
 static struct clk_ops sh5_cpu_clk_ops = {
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
index fe8d8930ccb62..a693dcdbe13ec 100644
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ b/arch/sh/kernel/timers/timer-tmu.c
@@ -172,22 +172,19 @@ static void __init tmu_clk_init(struct clk *clk)
 	clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1));
 }
 
-static void tmu_clk_recalc(struct clk *clk)
+static unsigned long tmu_clk_recalc(struct clk *clk)
 {
 	int tmu_num = clk->name[3]-'0';
-	unsigned long prev_rate = clk_get_rate(clk);
+	unsigned long new_rate;
 	unsigned long flags;
 	u8 divisor = ctrl_inw(TMU0_TCR+tmu_num*0xC) & 0x7;
-	clk->rate  = clk_get_rate(clk->parent) / (4 << (divisor << 1));
 
-	if(prev_rate==clk_get_rate(clk))
-		return;
-
-	if(tmu_num)
-		return; /* No more work on TMU1 */
+	new_rate = clk_get_rate(clk->parent) / (4 << (divisor << 1));
+	if (clk->rate == new_rate || tmu_num)
+		return clk->rate; /* No more work on TMU1 */
 
 	local_irq_save(flags);
-	tmus_are_scaled = (prev_rate > clk->rate);
+	tmus_are_scaled = (clk->rate > new_rate);
 
 	_tmu_stop(TMU0);
 
@@ -210,6 +207,7 @@ static void tmu_clk_recalc(struct clk *clk)
 	_tmu_start(TMU0);
 
 	local_irq_restore(flags);
+	return new_rate;
 }
 
 static struct clk_ops tmu_clk_ops = {
-- 
GitLab


From a02cb230bb4fca04f091746c593de720a0e3a94a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 03:50:44 +0900
Subject: [PATCH 1053/2198] sh: clkfwk: Add a followparent_recalc() helper.

This adds a followparent_recalc() helper for clocks that just follow the
parent's rate. Switch over the few CPUs that use this scheme for some of
their clocks.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            | 2 +-
 arch/sh/kernel/cpu/clock.c             | 6 ++++++
 arch/sh/kernel/cpu/sh2/clock-sh7619.c  | 7 +------
 arch/sh/kernel/cpu/sh2a/clock-sh7203.c | 7 +------
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 7 +------
 arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 7 +------
 6 files changed, 11 insertions(+), 25 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index d63352b375cca..241f1c1d9ce12 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -44,7 +44,7 @@ int __init arch_clk_init(void);
 
 /* arch/sh/kernel/cpu/clock.c */
 int clk_init(void);
-
+unsigned long followparent_recalc(struct clk *clk);
 void clk_recalc_rate(struct clk *);
 
 int clk_register(struct clk *);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index b022affb44cdb..17f6c078e8517 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -90,6 +90,12 @@ static void propagate_rate(struct clk *clk)
 	}
 }
 
+/* Used for clocks that always have same value as the parent clock */
+unsigned long followparent_recalc(struct clk *clk)
+{
+	return clk->parent->rate;
+}
+
 static void __clk_init(struct clk *clk)
 {
 	/*
diff --git a/arch/sh/kernel/cpu/sh2/clock-sh7619.c b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
index 26799139aa7a3..4fe863170e315 100644
--- a/arch/sh/kernel/cpu/sh2/clock-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/clock-sh7619.c
@@ -57,13 +57,8 @@ static struct clk_ops sh7619_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7619_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+	.recalc		= followparent_recalc,
 };
 
 static struct clk_ops *sh7619_clk_ops[] = {
diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
index f8c6933857b3d..9409869651024 100644
--- a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c
@@ -66,13 +66,8 @@ static struct clk_ops sh7203_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7203_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+	.recalc		= followparent_recalc,
 };
 
 static struct clk_ops *sh7203_clk_ops[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 5b1427f1ed41a..4bdae84aa6b0c 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -646,15 +646,10 @@ static void sh7722_mstpcr_disable(struct clk *clk)
 	sh7722_mstpcr_start_stop(clk, 0);
 }
 
-static unsigned long sh7722_mstpcr_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7722_mstpcr_clk_ops = {
 	.enable = sh7722_mstpcr_enable,
 	.disable = sh7722_mstpcr_disable,
-	.recalc = sh7722_mstpcr_recalc,
+	.recalc = followparent_recalc,
 };
 
 #define MSTPCR(_name, _parent, regnr, bitnr) \
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 26630fb190c76..db51cffc5d5bb 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -49,13 +49,8 @@ static struct clk_ops sh7763_bus_clk_ops = {
 	.recalc		= bus_clk_recalc,
 };
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static struct clk_ops sh7763_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+	.recalc		= followparent_recalc,
 };
 
 static struct clk_ops *sh7763_clk_ops[] = {
-- 
GitLab


From b1f6cfe48c3cb1dfa77db3d2f42f765febaef9bc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 04:27:43 +0900
Subject: [PATCH 1054/2198] sh: clkfwk: refactor rate propagation.

This resyncs the rate propagation strategy with the scheme used by the
OMAP clock framework. Child clocks are tracked on a list under each
parent and propagation happens there specifically rather than constantly
iterating over the global clock list.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |   9 +-
 arch/sh/kernel/cpu/clock.c             | 120 ++++++++++++++++---------
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |   8 +-
 3 files changed, 85 insertions(+), 52 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 241f1c1d9ce12..5dc8b73a2bd5f 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -27,6 +27,9 @@ struct clk {
 	struct clk		*parent;
 	struct clk_ops		*ops;
 
+	struct list_head	children;
+	struct list_head	sibling;	/* node for children */
+
 	int			usecount;
 
 	unsigned long		rate;
@@ -35,7 +38,6 @@ struct clk {
 };
 
 #define CLK_ALWAYS_ENABLED	(1 << 0)
-#define CLK_RATE_PROPAGATES	(1 << 1)
 #define CLK_NEEDS_INIT		(1 << 2)
 
 /* Should be defined by processor-specific code */
@@ -44,9 +46,10 @@ int __init arch_clk_init(void);
 
 /* arch/sh/kernel/cpu/clock.c */
 int clk_init(void);
-unsigned long followparent_recalc(struct clk *clk);
+unsigned long followparent_recalc(struct clk *);
+void recalculate_root_clocks(void);
+void propagate_rate(struct clk *);
 void clk_recalc_rate(struct clk *);
-
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 17f6c078e8517..0a06df8cde2b5 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -1,11 +1,11 @@
 /*
  * arch/sh/kernel/cpu/clock.c - SuperH clock framework
  *
- *  Copyright (C) 2005, 2006, 2007  Paul Mundt
+ *  Copyright (C) 2005 - 2009  Paul Mundt
  *
  * This clock framework is derived from the OMAP version by:
  *
- *	Copyright (C) 2004 - 2005 Nokia Corporation
+ *	Copyright (C) 2004 - 2008 Nokia Corporation
  *	Written by Tuukka Tikkanen <tuukka.tikkanen@elektrobit.com>
  *
  *  Modified for omap shared clock framework by Tony Lindgren <tony@atomide.com>
@@ -43,20 +43,20 @@ static DEFINE_MUTEX(clock_list_sem);
  */
 static struct clk master_clk = {
 	.name		= "master_clk",
-	.flags		= CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES,
+	.flags		= CLK_ALWAYS_ENABLED,
 	.rate		= CONFIG_SH_PCLK_FREQ,
 };
 
 static struct clk module_clk = {
 	.name		= "module_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES,
+	.flags		= CLK_ALWAYS_ENABLED,
 };
 
 static struct clk bus_clk = {
 	.name		= "bus_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES,
+	.flags		= CLK_ALWAYS_ENABLED,
 };
 
 static struct clk cpu_clk = {
@@ -75,27 +75,24 @@ static struct clk *onchip_clocks[] = {
 	&cpu_clk,
 };
 
+/* Used for clocks that always have same value as the parent clock */
+unsigned long followparent_recalc(struct clk *clk)
+{
+	return clk->parent->rate;
+}
+
 /* Propagate rate to children */
-static void propagate_rate(struct clk *clk)
+void propagate_rate(struct clk *tclk)
 {
 	struct clk *clkp;
 
-	list_for_each_entry(clkp, &clock_list, node) {
-		if (likely(clkp->parent != clk))
-			continue;
-		if (likely(clkp->ops && clkp->ops->recalc))
+	list_for_each_entry(clkp, &tclk->children, sibling) {
+		if (clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
-		if (unlikely(clkp->flags & CLK_RATE_PROPAGATES))
-			propagate_rate(clkp);
+		propagate_rate(clkp);
 	}
 }
 
-/* Used for clocks that always have same value as the parent clock */
-unsigned long followparent_recalc(struct clk *clk)
-{
-	return clk->parent->rate;
-}
-
 static void __clk_init(struct clk *clk)
 {
 	/*
@@ -180,10 +177,46 @@ void clk_disable(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_disable);
 
+static LIST_HEAD(root_clks);
+
+/**
+ * recalculate_root_clocks - recalculate and propagate all root clocks
+ *
+ * Recalculates all root clocks (clocks with no parent), which if the
+ * clock's .recalc is set correctly, should also propagate their rates.
+ * Called at init.
+ */
+void recalculate_root_clocks(void)
+{
+	struct clk *clkp;
+
+	list_for_each_entry(clkp, &root_clks, sibling) {
+		if (clkp->ops->recalc)
+			clkp->rate = clkp->ops->recalc(clkp);
+		propagate_rate(clkp);
+	}
+}
+
 int clk_register(struct clk *clk)
 {
+	if (clk == NULL || IS_ERR(clk))
+		return -EINVAL;
+
+	/*
+	 * trap out already registered clocks
+	 */
+	if (clk->node.next || clk->node.prev)
+		return 0;
+
 	mutex_lock(&clock_list_sem);
 
+	INIT_LIST_HEAD(&clk->children);
+
+	if (clk->parent)
+		list_add(&clk->sibling, &clk->parent->children);
+	else
+		list_add(&clk->sibling, &root_clks);
+
 	list_add(&clk->node, &clock_list);
 	clk->usecount = 0;
 	clk->flags |= CLK_NEEDS_INIT;
@@ -205,6 +238,7 @@ EXPORT_SYMBOL_GPL(clk_register);
 void clk_unregister(struct clk *clk)
 {
 	mutex_lock(&clock_list_sem);
+	list_del(&clk->sibling);
 	list_del(&clk->node);
 	mutex_unlock(&clock_list_sem);
 }
@@ -231,50 +265,53 @@ int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id)
 
 		spin_lock_irqsave(&clock_lock, flags);
 		ret = clk->ops->set_rate(clk, rate, algo_id);
+		if (ret == 0) {
+			if (clk->ops->recalc)
+				clk->rate = clk->ops->recalc(clk);
+			propagate_rate(clk);
+		}
 		spin_unlock_irqrestore(&clock_lock, flags);
 	}
 
-	if (unlikely(clk->flags & CLK_RATE_PROPAGATES))
-		propagate_rate(clk);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clk_set_rate_ex);
 
 void clk_recalc_rate(struct clk *clk)
 {
-	if (likely(clk->ops && clk->ops->recalc)) {
-		unsigned long flags;
+	unsigned long flags;
 
-		spin_lock_irqsave(&clock_lock, flags);
-		clk->rate = clk->ops->recalc(clk);
-		spin_unlock_irqrestore(&clock_lock, flags);
-	}
+	if (!clk->ops->recalc)
+		return;
 
-	if (unlikely(clk->flags & CLK_RATE_PROPAGATES))
-		propagate_rate(clk);
+	spin_lock_irqsave(&clock_lock, flags);
+	clk->rate = clk->ops->recalc(clk);
+	propagate_rate(clk);
+	spin_unlock_irqrestore(&clock_lock, flags);
 }
 EXPORT_SYMBOL_GPL(clk_recalc_rate);
 
 int clk_set_parent(struct clk *clk, struct clk *parent)
 {
+	unsigned long flags;
 	int ret = -EINVAL;
-	struct clk *old;
 
 	if (!parent || !clk)
 		return ret;
 
-	old = clk->parent;
-	if (likely(clk->ops && clk->ops->set_parent)) {
-		unsigned long flags;
-		spin_lock_irqsave(&clock_lock, flags);
-		ret = clk->ops->set_parent(clk, parent);
-		spin_unlock_irqrestore(&clock_lock, flags);
-		clk->parent = (ret ? old : parent);
-	}
+	spin_lock_irqsave(&clock_lock, flags);
+	if (clk->usecount == 0) {
+		if (clk->ops->set_parent)
+			ret = clk->ops->set_parent(clk, parent);
+		if (ret == 0) {
+			if (clk->ops->recalc)
+				clk->rate = clk->ops->recalc(clk);
+			propagate_rate(clk);
+		}
+	} else
+		ret = -EBUSY;
+	spin_unlock_irqrestore(&clock_lock, flags);
 
-	if (unlikely(clk->flags & CLK_RATE_PROPAGATES))
-		propagate_rate(clk);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clk_set_parent);
@@ -457,8 +494,7 @@ int __init clk_init(void)
 	ret |= arch_clk_init();
 
 	/* Kick the child clocks.. */
-	propagate_rate(&master_clk);
-	propagate_rate(&bus_clk);
+	recalculate_root_clocks();
 
 	return ret;
 }
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 4bdae84aa6b0c..8e53829ca078b 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -161,9 +161,7 @@ static unsigned long master_clk_recalc(struct clk *clk)
 static void master_clk_init(struct clk *clk)
 {
 	clk->parent = NULL;
-	clk->flags |= CLK_RATE_PROPAGATES;
-	clk->rate = CONFIG_SH_PCLK_FREQ;
-	master_clk_recalc(clk);
+	clk->rate = master_clk_recalc(clk);
 }
 
 static unsigned long module_clk_recalc(struct clk *clk)
@@ -541,19 +539,16 @@ static struct clk_ops sh7722_video_clk_ops = {
 static struct clk sh7722_umem_clock = {
 	.name = "umem_clk",
 	.ops = &sh7722_frqcr_clk_ops,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 static struct clk sh7722_sh_clock = {
 	.name = "sh_clk",
 	.ops = &sh7722_frqcr_clk_ops,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 static struct clk sh7722_peripheral_clock = {
 	.name = "peripheral_clk",
 	.ops = &sh7722_frqcr_clk_ops,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 static struct clk sh7722_sdram_clock = {
@@ -564,7 +559,6 @@ static struct clk sh7722_sdram_clock = {
 static struct clk sh7722_r_clock = {
 	.name = "r_clk",
 	.rate = 32768,
-	.flags = CLK_RATE_PROPAGATES,
 };
 
 #if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
-- 
GitLab


From 4ff29ff8e8723a41e7defd8bc78a7b16cbf940a2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:14:53 +0900
Subject: [PATCH 1055/2198] sh: clkfwk: Consolidate the ALWAYS_ENABLED /
 NEEDS_INIT mess.

There is no real distinction here in behaviour, either a clock needs to
be enabled on initialiation or not. The ALWAYS_ENABLED flag was always
intended to only apply to clocks that were physically always on and could
simply not be disabled at all from software. Unfortunately over time this
was abused and the meaning became a bit blurry.

So, we kill off both of all of those paths now, as well as the newer
NEEDS_INIT flag, and consolidate on a CLK_ENABLE_ON_INIT. Clocks that
need to be enabled on initialization can set this, and it will purposely
enable them and bump the refcount up.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |  3 +-
 arch/sh/kernel/cpu/clock.c             | 94 +++++++++-----------------
 arch/sh/kernel/cpu/sh4/clock-sh4-202.c |  6 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7763.c |  2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7780.c |  2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c |  6 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7786.c |  4 +-
 arch/sh/kernel/cpu/sh4a/clock-shx3.c   |  2 +-
 8 files changed, 44 insertions(+), 75 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 5dc8b73a2bd5f..246f9ebbed235 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -37,8 +37,7 @@ struct clk {
 	unsigned long		arch_flags;
 };
 
-#define CLK_ALWAYS_ENABLED	(1 << 0)
-#define CLK_NEEDS_INIT		(1 << 2)
+#define CLK_ENABLE_ON_INIT	(1 << 0)
 
 /* Should be defined by processor-specific code */
 void arch_init_clk_ops(struct clk_ops **, int type);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 0a06df8cde2b5..c683be5ba8b2b 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -43,26 +43,26 @@ static DEFINE_MUTEX(clock_list_sem);
  */
 static struct clk master_clk = {
 	.name		= "master_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.rate		= CONFIG_SH_PCLK_FREQ,
 };
 
 static struct clk module_clk = {
 	.name		= "module_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
 static struct clk bus_clk = {
 	.name		= "bus_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
 static struct clk cpu_clk = {
 	.name		= "cpu_clk",
 	.parent		= &master_clk,
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
 /*
@@ -93,39 +93,11 @@ void propagate_rate(struct clk *tclk)
 	}
 }
 
-static void __clk_init(struct clk *clk)
-{
-	/*
-	 * See if this is the first time we're enabling the clock, some
-	 * clocks that are always enabled still require "special"
-	 * initialization. This is especially true if the clock mode
-	 * changes and the clock needs to hunt for the proper set of
-	 * divisors to use before it can effectively recalc.
-	 */
-
-	if (clk->flags & CLK_NEEDS_INIT) {
-		if (clk->ops && clk->ops->init)
-			clk->ops->init(clk);
-
-		clk->flags &= ~CLK_NEEDS_INIT;
-	}
-}
-
 static int __clk_enable(struct clk *clk)
 {
-	if (!clk)
-		return -EINVAL;
-
-	clk->usecount++;
-
-	/* nothing to do if always enabled */
-	if (clk->flags & CLK_ALWAYS_ENABLED)
-		return 0;
-
-	if (clk->usecount == 1) {
-		__clk_init(clk);
-
-		__clk_enable(clk->parent);
+	if (clk->usecount++ == 0) {
+		if (clk->parent)
+			__clk_enable(clk->parent);
 
 		if (clk->ops && clk->ops->enable)
 			clk->ops->enable(clk);
@@ -139,6 +111,9 @@ int clk_enable(struct clk *clk)
 	unsigned long flags;
 	int ret;
 
+	if (!clk)
+		return -EINVAL;
+
 	spin_lock_irqsave(&clock_lock, flags);
 	ret = __clk_enable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
@@ -149,21 +124,11 @@ EXPORT_SYMBOL_GPL(clk_enable);
 
 static void __clk_disable(struct clk *clk)
 {
-	if (!clk)
-		return;
-
-	clk->usecount--;
-
-	WARN_ON(clk->usecount < 0);
-
-	if (clk->flags & CLK_ALWAYS_ENABLED)
-		return;
-
-	if (clk->usecount == 0) {
+	if (clk->usecount > 0 && !(--clk->usecount)) {
 		if (likely(clk->ops && clk->ops->disable))
 			clk->ops->disable(clk);
-
-		__clk_disable(clk->parent);
+		if (likely(clk->parent))
+			__clk_disable(clk->parent);
 	}
 }
 
@@ -171,6 +136,9 @@ void clk_disable(struct clk *clk)
 {
 	unsigned long flags;
 
+	if (!clk)
+		return;
+
 	spin_lock_irqsave(&clock_lock, flags);
 	__clk_disable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
@@ -211,6 +179,7 @@ int clk_register(struct clk *clk)
 	mutex_lock(&clock_list_sem);
 
 	INIT_LIST_HEAD(&clk->children);
+	clk->usecount = 0;
 
 	if (clk->parent)
 		list_add(&clk->sibling, &clk->parent->children);
@@ -218,19 +187,10 @@ int clk_register(struct clk *clk)
 		list_add(&clk->sibling, &root_clks);
 
 	list_add(&clk->node, &clock_list);
-	clk->usecount = 0;
-	clk->flags |= CLK_NEEDS_INIT;
-
+	if (clk->ops->init)
+		clk->ops->init(clk);
 	mutex_unlock(&clock_list_sem);
 
-	if (clk->flags & CLK_ALWAYS_ENABLED) {
-		__clk_init(clk);
-		pr_debug( "Clock '%s' is ALWAYS_ENABLED\n", clk->name);
-		if (clk->ops && clk->ops->enable)
-			clk->ops->enable(clk);
-		pr_debug( "Enabled.");
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL_GPL(clk_register);
@@ -244,6 +204,15 @@ void clk_unregister(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_unregister);
 
+static void clk_enable_init_clocks(void)
+{
+	struct clk *clkp;
+
+	list_for_each_entry(clkp, &clock_list, node)
+		if (clkp->flags & CLK_ENABLE_ON_INIT)
+			clk_enable(clkp);
+}
+
 unsigned long clk_get_rate(struct clk *clk)
 {
 	return clk->rate;
@@ -404,9 +373,7 @@ static int show_clocks(char *buf, char **start, off_t off,
 
 		p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name,
 			     rate / 1000000, (rate % 1000000) / 10000,
-			     ((clk->flags & CLK_ALWAYS_ENABLED) ||
-			      clk->usecount > 0) ?
-			     "enabled" : "disabled");
+			      (clk->usecount > 0) ?  "enabled" : "disabled");
 	}
 
 	return p - buf;
@@ -496,6 +463,9 @@ int __init clk_init(void)
 	/* Kick the child clocks.. */
 	recalculate_root_clocks();
 
+	/* Enable the necessary init clocks */
+	clk_enable_init_clocks();
+
 	return ret;
 }
 
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index 628d50ea6f6bf..0caca9f99fe83 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -46,7 +46,7 @@ static struct clk_ops sh4202_emi_clk_ops = {
 
 static struct clk sh4202_emi_clk = {
 	.name		= "emi_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh4202_emi_clk_ops,
 };
 
@@ -62,7 +62,7 @@ static struct clk_ops sh4202_femi_clk_ops = {
 
 static struct clk sh4202_femi_clk = {
 	.name		= "femi_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh4202_femi_clk_ops,
 };
 
@@ -140,7 +140,7 @@ static struct clk_ops sh4202_shoc_clk_ops = {
 
 static struct clk sh4202_shoc_clk = {
 	.name		= "shoc_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh4202_shoc_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index db51cffc5d5bb..21bd70f9ee457 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -78,7 +78,7 @@ static struct clk_ops sh7763_shyway_clk_ops = {
 
 static struct clk sh7763_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7763_shyway_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index ba8dacc4ba238..4c11f8917e408 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -84,7 +84,7 @@ static struct clk_ops sh7780_shyway_clk_ops = {
 
 static struct clk sh7780_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7780_shyway_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 52691eaeb9ba3..edd432894bd93 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -88,7 +88,7 @@ static struct clk_ops sh7785_shyway_clk_ops = {
 
 static struct clk sh7785_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7785_shyway_clk_ops,
 };
 
@@ -104,7 +104,7 @@ static struct clk_ops sh7785_ddr_clk_ops = {
 
 static struct clk sh7785_ddr_clk = {
 	.name		= "ddr_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7785_ddr_clk_ops,
 };
 
@@ -120,7 +120,7 @@ static struct clk_ops sh7785_ram_clk_ops = {
 
 static struct clk sh7785_ram_clk = {
 	.name		= "ram_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7785_ram_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 2e00ff436c632..2825494f85dca 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -91,7 +91,7 @@ static struct clk_ops sh7786_shyway_clk_ops = {
 
 static struct clk sh7786_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7786_shyway_clk_ops,
 };
 
@@ -107,7 +107,7 @@ static struct clk_ops sh7786_ddr_clk_ops = {
 
 static struct clk sh7786_ddr_clk = {
 	.name		= "ddr_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &sh7786_ddr_clk_ops,
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 770934e682813..6e5c864cf40f8 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -95,7 +95,7 @@ static struct clk_ops shx3_shyway_clk_ops = {
 
 static struct clk shx3_shyway_clk = {
 	.name		= "shyway_clk",
-	.flags		= CLK_ALWAYS_ENABLED,
+	.flags		= CLK_ENABLE_ON_INIT,
 	.ops		= &shx3_shyway_clk_ops,
 };
 
-- 
GitLab


From 154502e160e02dee7b00ec2149762ae5d48e0bb4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:18:13 +0900
Subject: [PATCH 1056/2198] sh: clkfwk: Convert SH-Mobile CPUs to use
 CLK_ENABLE_ON_INIT.

Kill off all of the clk_always_enabled leftovers and use the new flag
directly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |  16 --
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 365 +++++++++++++------------
 arch/sh/kernel/cpu/sh4a/setup-sh7343.c |   6 -
 arch/sh/kernel/cpu/sh4a/setup-sh7366.c |   6 -
 arch/sh/kernel/cpu/sh4a/setup-sh7722.c |   6 -
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c |   5 -
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c |   4 -
 7 files changed, 183 insertions(+), 225 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 246f9ebbed235..e9fced9bd8f5f 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -52,22 +52,6 @@ void clk_recalc_rate(struct clk *);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
-static inline int clk_always_enable(const char *id)
-{
-	struct clk *clk;
-	int ret;
-
-	clk = clk_get(NULL, id);
-	if (IS_ERR(clk))
-		return PTR_ERR(clk);
-
-	ret = clk_enable(clk);
-	if (ret)
-		clk_put(clk);
-
-	return ret;
-}
-
 /* the exported API, in addition to clk_set_rate */
 /**
  * clk_set_rate_ex - set the clock rate for a clock source, with additional parameter
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 8e53829ca078b..f777d00d4af68 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -646,207 +646,208 @@ static struct clk_ops sh7722_mstpcr_clk_ops = {
 	.recalc = followparent_recalc,
 };
 
-#define MSTPCR(_name, _parent, regnr, bitnr) \
+#define MSTPCR(_name, _parent, regnr, bitnr, _flags) \
 {						\
 	.name = _name,				\
+	.flags = _flags,			\
 	.arch_flags = MSTPCR_ARCH_FLAGS(regnr, bitnr),	\
 	.ops = (void *)_parent,		\
 }
 
 static struct clk sh7722_mstpcr_clocks[] = {
 #if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	MSTPCR("uram0", "umem_clk", 0, 28),
-	MSTPCR("xymem0", "bus_clk", 0, 26),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 7),
-	MSTPCR("scif1", "peripheral_clk", 0, 6),
-	MSTPCR("scif2", "peripheral_clk", 0, 5),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("rtc0", "r_clk", 1, 8),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18),
-	MSTPCR("keysc0", "r_clk", 2, 14),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11),
-	MSTPCR("2dg0", "bus_clk", 2, 9),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("jpu0", "bus_clk", 2, 6),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
+	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("rtc0", "r_clk", 1, 8, 0),
+	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
+	MSTPCR("keysc0", "r_clk", 2, 14, 0),
+	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
+	MSTPCR("2dg0", "bus_clk", 2, 9, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7723)
 	/* See page 60 of Datasheet V1.0: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31),
-	MSTPCR("ic0", "cpu_clk", 0, 30),
-	MSTPCR("oc0", "cpu_clk", 0, 29),
-	MSTPCR("l2c0", "sh_clk", 0, 28),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27),
-	MSTPCR("fpu0", "cpu_clk", 0, 24),
-	MSTPCR("intc0", "cpu_clk", 0, 22),
-	MSTPCR("dmac0", "bus_clk", 0, 21),
-	MSTPCR("sh0", "sh_clk", 0, 20),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19),
-	MSTPCR("ubc0", "cpu_clk", 0, 17),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("dmac1", "bus_clk", 0, 12),
-	MSTPCR("tmu1", "peripheral_clk", 0, 11),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 9),
-	MSTPCR("scif1", "peripheral_clk", 0, 8),
-	MSTPCR("scif2", "peripheral_clk", 0, 7),
-	MSTPCR("scif3", "bus_clk", 0, 6),
-	MSTPCR("scif4", "bus_clk", 0, 5),
-	MSTPCR("scif5", "bus_clk", 0, 4),
-	MSTPCR("msiof0", "bus_clk", 0, 2),
-	MSTPCR("msiof1", "bus_clk", 0, 1),
-	MSTPCR("meram0", "sh_clk", 0, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("rtc0", "r_clk", 1, 8),
-	MSTPCR("atapi0", "sh_clk", 2, 28),
-	MSTPCR("adc0", "peripheral_clk", 2, 28),
-	MSTPCR("tpu0", "bus_clk", 2, 25),
-	MSTPCR("irda0", "peripheral_clk", 2, 24),
-	MSTPCR("tsif0", "bus_clk", 2, 22),
-	MSTPCR("icb0", "bus_clk", 2, 21),
-	MSTPCR("sdhi0", "bus_clk", 2, 18),
-	MSTPCR("sdhi1", "bus_clk", 2, 17),
-	MSTPCR("keysc0", "r_clk", 2, 14),
-	MSTPCR("usb0", "bus_clk", 2, 11),
-	MSTPCR("2dg0", "bus_clk", 2, 10),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("veu1", "bus_clk", 2, 6),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
+	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
+	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
+	MSTPCR("l2c0", "sh_clk", 0, 28, 0),
+	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
+	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
+	MSTPCR("intc0", "cpu_clk", 0, 22, 0),
+	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
+	MSTPCR("sh0", "sh_clk", 0, 20, 0),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
+	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
+	MSTPCR("tmu1", "peripheral_clk", 0, 11, 0),
+	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif3", "bus_clk", 0, 6, 0),
+	MSTPCR("scif4", "bus_clk", 0, 5, 0),
+	MSTPCR("scif5", "bus_clk", 0, 4, 0),
+	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
+	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
+	MSTPCR("meram0", "sh_clk", 0, 0, CLK_ENABLE_ON_INIT),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("rtc0", "r_clk", 1, 8, 0),
+	MSTPCR("atapi0", "sh_clk", 2, 28, 0),
+	MSTPCR("adc0", "peripheral_clk", 2, 28, 0),
+	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
+	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
+	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
+	MSTPCR("icb0", "bus_clk", 2, 21, 0),
+	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
+	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
+	MSTPCR("keysc0", "r_clk", 2, 14, 0),
+	MSTPCR("usb0", "bus_clk", 2, 11, 0),
+	MSTPCR("2dg0", "bus_clk", 2, 10, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("veu1", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7724)
 	/* See Datasheet : Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31),
-	MSTPCR("ic0", "cpu_clk", 0, 30),
-	MSTPCR("oc0", "cpu_clk", 0, 29),
-	MSTPCR("rs0", "bus_clk", 0, 28),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27),
-	MSTPCR("l2c0", "sh_clk", 0, 26),
-	MSTPCR("fpu0", "cpu_clk", 0, 24),
-	MSTPCR("intc0", "peripheral_clk", 0, 22),
-	MSTPCR("dmac0", "bus_clk", 0, 21),
-	MSTPCR("sh0", "sh_clk", 0, 20),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19),
-	MSTPCR("ubc0", "cpu_clk", 0, 17),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("dmac1", "bus_clk", 0, 12),
-	MSTPCR("tmu1", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 9),
-	MSTPCR("scif1", "peripheral_clk", 0, 8),
-	MSTPCR("scif2", "peripheral_clk", 0, 7),
-	MSTPCR("scif3", "bus_clk", 0, 6),
-	MSTPCR("scif4", "bus_clk", 0, 5),
-	MSTPCR("scif5", "bus_clk", 0, 4),
-	MSTPCR("msiof0", "bus_clk", 0, 2),
-	MSTPCR("msiof1", "bus_clk", 0, 1),
-	MSTPCR("keysc0", "r_clk", 1, 12),
-	MSTPCR("rtc0", "r_clk", 1, 11),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8),
-	MSTPCR("mmc0", "bus_clk", 2, 29),
-	MSTPCR("eth0", "bus_clk", 2, 28),
-	MSTPCR("atapi0", "bus_clk", 2, 26),
-	MSTPCR("tpu0", "bus_clk", 2, 25),
-	MSTPCR("irda0", "peripheral_clk", 2, 24),
-	MSTPCR("tsif0", "bus_clk", 2, 22),
-	MSTPCR("usb1", "bus_clk", 2, 21),
-	MSTPCR("usb0", "bus_clk", 2, 20),
-	MSTPCR("2dg0", "bus_clk", 2, 19),
-	MSTPCR("sdhi0", "bus_clk", 2, 18),
-	MSTPCR("sdhi1", "bus_clk", 2, 17),
-	MSTPCR("veu1", "bus_clk", 2, 15),
-	MSTPCR("ceu1", "bus_clk", 2, 13),
-	MSTPCR("beu1", "bus_clk", 2, 12),
-	MSTPCR("2ddmac0", "sh_clk", 2, 10),
-	MSTPCR("spu0", "bus_clk", 2, 9),
-	MSTPCR("jpu0", "bus_clk", 2, 6),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
+	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
+	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
+	MSTPCR("rs0", "bus_clk", 0, 28, 0),
+	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
+	MSTPCR("l2c0", "sh_clk", 0, 26, 0),
+	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
+	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
+	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
+	MSTPCR("sh0", "sh_clk", 0, 20, 0),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
+	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
+	MSTPCR("tmu1", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif3", "bus_clk", 0, 6, 0),
+	MSTPCR("scif4", "bus_clk", 0, 5, 0),
+	MSTPCR("scif5", "bus_clk", 0, 4, 0),
+	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
+	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
+	MSTPCR("keysc0", "r_clk", 1, 12, 0),
+	MSTPCR("rtc0", "r_clk", 1, 11, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
+	MSTPCR("mmc0", "bus_clk", 2, 29, 0),
+	MSTPCR("eth0", "bus_clk", 2, 28, 0),
+	MSTPCR("atapi0", "bus_clk", 2, 26, 0),
+	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
+	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
+	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
+	MSTPCR("usb1", "bus_clk", 2, 21, 0),
+	MSTPCR("usb0", "bus_clk", 2, 20, 0),
+	MSTPCR("2dg0", "bus_clk", 2, 19, 0),
+	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
+	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
+	MSTPCR("veu1", "bus_clk", 2, 15, CLK_ENABLE_ON_INIT),
+	MSTPCR("ceu1", "bus_clk", 2, 13, 0),
+	MSTPCR("beu1", "bus_clk", 2, 12, 0),
+	MSTPCR("2ddmac0", "sh_clk", 2, 10, 0),
+	MSTPCR("spu0", "bus_clk", 2, 9, 0),
+	MSTPCR("jpu0", "bus_clk", 2, 6, 0),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7343)
-	MSTPCR("uram0", "umem_clk", 0, 28),
-	MSTPCR("xymem0", "bus_clk", 0, 26),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("scif0", "peripheral_clk", 0, 7),
-	MSTPCR("scif1", "peripheral_clk", 0, 6),
-	MSTPCR("scif2", "peripheral_clk", 0, 5),
-	MSTPCR("scif3", "peripheral_clk", 0, 4),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18),
-	MSTPCR("keysc0", "r_clk", 2, 14),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("jpu0", "bus_clk", 2, 6),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
+	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
+	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
+	MSTPCR("scif3", "peripheral_clk", 0, 4, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
+	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
+	MSTPCR("keysc0", "r_clk", 2, 14, 0),
+	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7366)
 	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31),
-	MSTPCR("ic0", "cpu_clk", 0, 30),
-	MSTPCR("oc0", "cpu_clk", 0, 29),
-	MSTPCR("rsmem0", "sh_clk", 0, 28),
-	MSTPCR("xymem0", "cpu_clk", 0, 26),
-	MSTPCR("intc30", "peripheral_clk", 0, 23),
-	MSTPCR("intc0", "peripheral_clk", 0, 22),
-	MSTPCR("dmac0", "bus_clk", 0, 21),
-	MSTPCR("sh0", "sh_clk", 0, 20),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19),
-	MSTPCR("ubc0", "cpu_clk", 0, 17),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15),
-	MSTPCR("cmt0", "r_clk", 0, 14),
-	MSTPCR("rwdt0", "r_clk", 0, 13),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10),
-	MSTPCR("scif0", "peripheral_clk", 0, 7),
-	MSTPCR("scif1", "bus_clk", 0, 6),
-	MSTPCR("scif2", "bus_clk", 0, 5),
-	MSTPCR("msiof0", "peripheral_clk", 0, 2),
-	MSTPCR("sbr0", "peripheral_clk", 0, 1),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9),
-	MSTPCR("icb0", "bus_clk", 2, 27),
-	MSTPCR("meram0", "sh_clk", 2, 26),
-	MSTPCR("dacc0", "peripheral_clk", 2, 24),
-	MSTPCR("dacy0", "peripheral_clk", 2, 23),
-	MSTPCR("tsif0", "bus_clk", 2, 22),
-	MSTPCR("sdhi0", "bus_clk", 2, 18),
-	MSTPCR("mmcif0", "bus_clk", 2, 17),
-	MSTPCR("usb0", "bus_clk", 2, 11),
-	MSTPCR("siu0", "bus_clk", 2, 8),
-	MSTPCR("veu1", "bus_clk", 2, 7),
-	MSTPCR("vou0", "bus_clk", 2, 5),
-	MSTPCR("beu0", "bus_clk", 2, 4),
-	MSTPCR("ceu0", "bus_clk", 2, 3),
-	MSTPCR("veu0", "bus_clk", 2, 2),
-	MSTPCR("vpu0", "bus_clk", 2, 1),
-	MSTPCR("lcdc0", "bus_clk", 2, 0),
+	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
+	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
+	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
+	MSTPCR("rsmem0", "sh_clk", 0, 28, CLK_ENABLE_ON_INIT),
+	MSTPCR("xymem0", "cpu_clk", 0, 26, CLK_ENABLE_ON_INIT),
+	MSTPCR("intc30", "peripheral_clk", 0, 23, 0),
+	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
+	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
+	MSTPCR("sh0", "sh_clk", 0, 20, 0),
+	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
+	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
+	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
+	MSTPCR("cmt0", "r_clk", 0, 14, 0),
+	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
+	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
+	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
+	MSTPCR("scif1", "bus_clk", 0, 6, 0),
+	MSTPCR("scif2", "bus_clk", 0, 5, 0),
+	MSTPCR("msiof0", "peripheral_clk", 0, 2, 0),
+	MSTPCR("sbr0", "peripheral_clk", 0, 1, 0),
+	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
+	MSTPCR("icb0", "bus_clk", 2, 27, 0),
+	MSTPCR("meram0", "sh_clk", 2, 26, 0),
+	MSTPCR("dacc0", "peripheral_clk", 2, 24, 0),
+	MSTPCR("dacy0", "peripheral_clk", 2, 23, 0),
+	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
+	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
+	MSTPCR("mmcif0", "bus_clk", 2, 17, 0),
+	MSTPCR("usb0", "bus_clk", 2, 11, 0),
+	MSTPCR("siu0", "bus_clk", 2, 8, 0),
+	MSTPCR("veu1", "bus_clk", 2, 7, CLK_ENABLE_ON_INIT),
+	MSTPCR("vou0", "bus_clk", 2, 5, 0),
+	MSTPCR("beu0", "bus_clk", 2, 4, 0),
+	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
+	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
+	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
+	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
 };
 
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index b9c361e8cc822..42ce5fcbd735f 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -223,12 +223,6 @@ static struct platform_device *sh7343_devices[] __initdata = {
 
 static int __init sh7343_devices_setup(void)
 {
-	clk_always_enable("uram0"); /* URAM */
-	clk_always_enable("xymem0"); /* XYMEM */
-	clk_always_enable("veu0"); /* VEU */
-	clk_always_enable("vpu0"); /* VPU */
-	clk_always_enable("jpu0"); /* JPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 1 << 20);
 	platform_resource_setup_memory(&veu_device, "veu", 2 << 20);
 	platform_resource_setup_memory(&jpu_device, "jpu", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index d4a994be4a7d8..d0172afa0b5d4 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -212,12 +212,6 @@ static struct platform_device *sh7366_devices[] __initdata = {
 
 static int __init sh7366_devices_setup(void)
 {
-	clk_always_enable("rsmem0"); /* RSMEM */
-	clk_always_enable("xymem0"); /* XYMEM */
-	clk_always_enable("veu1"); /* VEU-2 */
-	clk_always_enable("veu0"); /* VEU-1 */
-	clk_always_enable("vpu0"); /* VPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
index 5d6247fecd63c..ea524a2da3e47 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c
@@ -352,12 +352,6 @@ static struct platform_device *sh7722_devices[] __initdata = {
 
 static int __init sh7722_devices_setup(void)
 {
-	clk_always_enable("uram0"); /* URAM */
-	clk_always_enable("xymem0"); /* XYMEM */
-	clk_always_enable("veu0"); /* VEU */
-	clk_always_enable("vpu0"); /* VPU */
-	clk_always_enable("jpu0"); /* JPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 1 << 20);
 	platform_resource_setup_memory(&veu_device, "veu", 2 << 20);
 	platform_resource_setup_memory(&jpu_device, "jpu", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 1429fc5e42866..04cb4aae7ea75 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -460,11 +460,6 @@ static struct platform_device *sh7723_devices[] __initdata = {
 
 static int __init sh7723_devices_setup(void)
 {
-	clk_always_enable("meram0"); /* MERAM */
-	clk_always_enable("veu1"); /* VEU2H1 */
-	clk_always_enable("veu0"); /* VEU2H0 */
-	clk_always_enable("vpu0"); /* VPU */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index a3039ce1a6a59..080f541ae08d1 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -281,10 +281,6 @@ static struct platform_device *sh7724_devices[] __initdata = {
 
 static int __init sh7724_devices_setup(void)
 {
-	clk_always_enable("vpu0");   /* VPU */
-	clk_always_enable("veu1");   /* VEU3F1 */
-	clk_always_enable("veu0");   /* VEU3F0 */
-
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
-- 
GitLab


From ae891a4264c91246c0b4c22be68b9838747ae48d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:30:10 +0900
Subject: [PATCH 1057/2198] sh: clkfwk: Fix up the clk_enable() error path.

There are a couple of instances where a clk_enable() can fail, which the
SH-Mobile code presently handles, but doesn't get reported all the way
back up. This fixes up the return type so the errors make it all the way
down to the drivers.

Additionally, we now also error out properly if the parent enable fails.
Prep work for aggressively turning off unused clocks on boot.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |  2 +-
 arch/sh/kernel/cpu/clock.c             | 71 +++++++++++++++++---------
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 11 ++--
 3 files changed, 53 insertions(+), 31 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index e9fced9bd8f5f..5de72eef97244 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -10,7 +10,7 @@ struct clk;
 
 struct clk_ops {
 	void (*init)(struct clk *clk);
-	void (*enable)(struct clk *clk);
+	int (*enable)(struct clk *clk);
 	void (*disable)(struct clk *clk);
 	unsigned long (*recalc)(struct clk *clk);
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index c683be5ba8b2b..e027fe5898d68 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -93,57 +93,78 @@ void propagate_rate(struct clk *tclk)
 	}
 }
 
-static int __clk_enable(struct clk *clk)
+static void __clk_disable(struct clk *clk)
 {
-	if (clk->usecount++ == 0) {
-		if (clk->parent)
-			__clk_enable(clk->parent);
-
-		if (clk->ops && clk->ops->enable)
-			clk->ops->enable(clk);
+	if (clk->usecount == 0) {
+		printk(KERN_ERR "Trying disable clock %s with 0 usecount\n",
+		       clk->name);
+		WARN_ON(1);
+		return;
 	}
 
-	return 0;
+	if (!(--clk->usecount)) {
+		if (likely(clk->ops && clk->ops->disable))
+			clk->ops->disable(clk);
+		if (likely(clk->parent))
+			__clk_disable(clk->parent);
+	}
 }
 
-int clk_enable(struct clk *clk)
+void clk_disable(struct clk *clk)
 {
 	unsigned long flags;
-	int ret;
 
 	if (!clk)
-		return -EINVAL;
+		return;
 
 	spin_lock_irqsave(&clock_lock, flags);
-	ret = __clk_enable(clk);
+	__clk_disable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
-
-	return ret;
 }
-EXPORT_SYMBOL_GPL(clk_enable);
+EXPORT_SYMBOL_GPL(clk_disable);
 
-static void __clk_disable(struct clk *clk)
+static int __clk_enable(struct clk *clk)
 {
-	if (clk->usecount > 0 && !(--clk->usecount)) {
-		if (likely(clk->ops && clk->ops->disable))
-			clk->ops->disable(clk);
-		if (likely(clk->parent))
-			__clk_disable(clk->parent);
+	int ret = 0;
+
+	if (clk->usecount++ == 0) {
+		if (clk->parent) {
+			ret = __clk_enable(clk->parent);
+			if (unlikely(ret))
+				goto err;
+		}
+
+		if (clk->ops && clk->ops->enable) {
+			ret = clk->ops->enable(clk);
+			if (ret) {
+				if (clk->parent)
+					__clk_disable(clk->parent);
+				goto err;
+			}
+		}
 	}
+
+	return ret;
+err:
+	clk->usecount--;
+	return ret;
 }
 
-void clk_disable(struct clk *clk)
+int clk_enable(struct clk *clk)
 {
 	unsigned long flags;
+	int ret;
 
 	if (!clk)
-		return;
+		return -EINVAL;
 
 	spin_lock_irqsave(&clock_lock, flags);
-	__clk_disable(clk);
+	ret = __clk_enable(clk);
 	spin_unlock_irqrestore(&clock_lock, flags);
+
+	return ret;
 }
-EXPORT_SYMBOL_GPL(clk_disable);
+EXPORT_SYMBOL_GPL(clk_enable);
 
 static LIST_HEAD(root_clks);
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index f777d00d4af68..1c41db41de7e7 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -472,9 +472,9 @@ static int sh7722_siu_start_stop(struct clk *clk, int enable)
 	return 0;
 }
 
-static void sh7722_siu_enable(struct clk *clk)
+static int sh7722_siu_enable(struct clk *clk)
 {
-	sh7722_siu_start_stop(clk, 1);
+	return sh7722_siu_start_stop(clk, 1);
 }
 
 static void sh7722_siu_disable(struct clk *clk)
@@ -491,12 +491,13 @@ static struct clk_ops sh7722_siu_clk_ops = {
 
 #endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
-static void sh7722_video_enable(struct clk *clk)
+static int sh7722_video_enable(struct clk *clk)
 {
 	unsigned long r;
 
 	r = ctrl_inl(VCLKCR);
 	ctrl_outl( r & ~(1<<8), VCLKCR);
+	return 0;
 }
 
 static void sh7722_video_disable(struct clk *clk)
@@ -630,9 +631,9 @@ static int sh7722_mstpcr_start_stop(struct clk *clk, int enable)
 	return 0;
 }
 
-static void sh7722_mstpcr_enable(struct clk *clk)
+static int sh7722_mstpcr_enable(struct clk *clk)
 {
-	sh7722_mstpcr_start_stop(clk, 1);
+	return sh7722_mstpcr_start_stop(clk, 1);
 }
 
 static void sh7722_mstpcr_disable(struct clk *clk)
-- 
GitLab


From aa87aa343f2cd236b5eccd643abd4df918ed5c4f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:51:05 +0900
Subject: [PATCH 1058/2198] sh: clkfwk: Improve the generic clk_set_parent()
 implementation.

This causes the generic clk_set_parent() implementation to be a bit more
intelligent. A clk_reparent() is added to move the clock over to the new
parent's sibling list, which then allows the generic rate propagation
code to succeed. This also becomes a nop if the new and old parents are
unchanged.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h |  1 +
 arch/sh/kernel/cpu/clock.c  | 20 ++++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 5de72eef97244..fdb915608dbc2 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -48,6 +48,7 @@ int clk_init(void);
 unsigned long followparent_recalc(struct clk *);
 void recalculate_root_clocks(void);
 void propagate_rate(struct clk *);
+int clk_reparent(struct clk *child, struct clk *parent);
 void clk_recalc_rate(struct clk *);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index e027fe5898d68..e3d1de8a46fd2 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -81,6 +81,19 @@ unsigned long followparent_recalc(struct clk *clk)
 	return clk->parent->rate;
 }
 
+int clk_reparent(struct clk *child, struct clk *parent)
+{
+	list_del_init(&child->sibling);
+	if (parent)
+		list_add(&child->sibling, &parent->children);
+	child->parent = parent;
+
+	/* now do the debugfs renaming to reattach the child
+	   to the proper parent */
+
+	return 0;
+}
+
 /* Propagate rate to children */
 void propagate_rate(struct clk *tclk)
 {
@@ -288,12 +301,19 @@ int clk_set_parent(struct clk *clk, struct clk *parent)
 
 	if (!parent || !clk)
 		return ret;
+	if (clk->parent == parent)
+		return 0;
 
 	spin_lock_irqsave(&clock_lock, flags);
 	if (clk->usecount == 0) {
 		if (clk->ops->set_parent)
 			ret = clk->ops->set_parent(clk, parent);
+		else
+			ret = clk_reparent(clk, parent);
+
 		if (ret == 0) {
+			pr_debug("clock: set parent of %s to %s (new rate %ld)\n",
+				 clk->name, clk->parent->name, clk->rate);
 			if (clk->ops->recalc)
 				clk->rate = clk->ops->recalc(clk);
 			propagate_rate(clk);
-- 
GitLab


From f5c84cf50812c80133e64683d0500b2416d55cb3 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 05:59:27 +0900
Subject: [PATCH 1059/2198] sh: clkfwk: Tidy up on-chip clock registration and
 rate propagation.

This tidies up the set_rate hack that the on-chip clocks were abusing to
trigger rate propagation, which is now handled generically.

Additionally, now that CLK_ENABLE_ON_INIT is wired up where it needs to
be for these clocks, the clk_enable() can go away. In some cases this was
bumping up the refcount higher than it should have been.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 19 ++++---------------
 arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 17 +++--------------
 arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 17 +++--------------
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 15 +++------------
 arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 15 +++------------
 arch/sh/kernel/cpu/sh4a/clock-shx3.c   | 15 +++------------
 6 files changed, 19 insertions(+), 79 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index 0caca9f99fe83..435f4f12ffb8b 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -127,7 +127,7 @@ static int shoc_clk_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 	frqcr3 |= tmp << 6;
 	ctrl_outl(frqcr3, CPG2_FRQCR3);
 
-	return clk->parent->rate / frqcr3_divisors[tmp];
+	clk->rate = clk->parent->rate / frqcr3_divisors[tmp];
 
 	return 0;
 }
@@ -153,28 +153,17 @@ static struct clk *sh4202_onchip_clocks[] = {
 static int __init sh4202_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh4202_onchip_clocks); i++) {
 		struct clk *clkp = sh4202_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
-
 arch_initcall(sh4202_clk_init);
-
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 21bd70f9ee457..0110da64a43b7 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -93,28 +93,17 @@ static struct clk *sh7763_onchip_clocks[] = {
 static int __init sh7763_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7763_onchip_clocks); i++) {
 		struct clk *clkp = sh7763_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
-
 arch_initcall(sh7763_clk_init);
-
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 4c11f8917e408..0a22d50b109f4 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -99,28 +99,17 @@ static struct clk *sh7780_onchip_clocks[] = {
 static int __init sh7780_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7780_onchip_clocks); i++) {
 		struct clk *clkp = sh7780_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
-
 arch_initcall(sh7780_clk_init);
-
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index edd432894bd93..4dcd1f6f0cbd6 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -137,26 +137,17 @@ static struct clk *sh7785_onchip_clocks[] = {
 static int __init sh7785_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) {
 		struct clk *clkp = sh7785_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
 arch_initcall(sh7785_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 2825494f85dca..825556fe23063 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -123,26 +123,17 @@ static struct clk *sh7786_onchip_clocks[] = {
 static int __init sh7786_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(sh7786_onchip_clocks); i++) {
 		struct clk *clkp = sh7786_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
 arch_initcall(sh7786_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 6e5c864cf40f8..1eb149b0fe6e4 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -110,26 +110,17 @@ static struct clk *shx3_onchip_clocks[] = {
 static int __init shx3_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
-	int i;
+	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(shx3_onchip_clocks); i++) {
 		struct clk *clkp = shx3_onchip_clocks[i];
 
 		clkp->parent = clk;
-		clk_register(clkp);
-		clk_enable(clkp);
+		ret |= clk_register(clkp);
 	}
 
-	/*
-	 * Now that we have the rest of the clocks registered, we need to
-	 * force the parent clock to propagate so that these clocks will
-	 * automatically figure out their rate. We cheat by handing the
-	 * parent clock its current rate and forcing child propagation.
-	 */
-	clk_set_rate(clk, clk_get_rate(clk));
-
 	clk_put(clk);
 
-	return 0;
+	return ret;
 }
 arch_initcall(shx3_clk_init);
-- 
GitLab


From 007e8363b656768fe3f59c180824ff704680bb25 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 06:05:32 +0900
Subject: [PATCH 1060/2198] sh: clkfwk: Kill off clk_recalc_rate().

The only user for this is the SH-Mobile r_clk, which is now added as a
root clock and can be kicked via propagate_rate() as usual. Given that,
there is no longer any need for the special clk_recalc_rate(), so we kill
it off.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |  1 -
 arch/sh/kernel/cpu/clock.c             | 14 --------------
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |  2 +-
 3 files changed, 1 insertion(+), 16 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index fdb915608dbc2..6a2f46333376e 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -49,7 +49,6 @@ unsigned long followparent_recalc(struct clk *);
 void recalculate_root_clocks(void);
 void propagate_rate(struct clk *);
 int clk_reparent(struct clk *child, struct clk *parent);
-void clk_recalc_rate(struct clk *);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index e3d1de8a46fd2..7a356de99d7d0 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -280,20 +280,6 @@ int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id)
 }
 EXPORT_SYMBOL_GPL(clk_set_rate_ex);
 
-void clk_recalc_rate(struct clk *clk)
-{
-	unsigned long flags;
-
-	if (!clk->ops->recalc)
-		return;
-
-	spin_lock_irqsave(&clock_lock, flags);
-	clk->rate = clk->ops->recalc(clk);
-	propagate_rate(clk);
-	spin_unlock_irqrestore(&clock_lock, flags);
-}
-EXPORT_SYMBOL_GPL(clk_recalc_rate);
-
 int clk_set_parent(struct clk *clk, struct clk *parent)
 {
 	unsigned long flags;
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 1c41db41de7e7..426065b432694 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -912,7 +912,7 @@ int __init arch_clk_init(void)
 		clk_put(clk);
 	}
 
-	clk_recalc_rate(&sh7722_r_clock); /* make sure rate gets propagated */
+	propagate_rate(&sh7722_r_clock); /* make sure rate gets propagated */
 
 	return 0;
 }
-- 
GitLab


From 0dae89572cbcd5f676ea52a9448d9639d97a53d6 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 06:18:09 +0900
Subject: [PATCH 1061/2198] sh: clkfwk: Wire up clk_get_sys() support.

This stubs in clk_get_sys() from the ARM clkdev implementation.
Tentatively conver the clk_get() lookup code to use this, and once the
rest of the in-tree users are happy with this, it can replace the
fallback lookups.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h |  7 +++++
 arch/sh/kernel/cpu/clock.c  | 59 +++++++++++++++++++++++++++++++++++++
 2 files changed, 66 insertions(+)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 6a2f46333376e..e2f5bf1b4a9c6 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -37,6 +37,13 @@ struct clk {
 	unsigned long		arch_flags;
 };
 
+struct clk_lookup {
+	struct list_head	node;
+	const char		*dev_id;
+	const char		*con_id;
+	struct clk		*clk;
+};
+
 #define CLK_ENABLE_ON_INIT	(1 << 0)
 
 /* Should be defined by processor-specific code */
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 7a356de99d7d0..34707b8677608 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -10,6 +10,10 @@
  *
  *  Modified for omap shared clock framework by Tony Lindgren <tony@atomide.com>
  *
+ *  With clkdev bits:
+ *
+ *	Copyright (C) 2008 Russell King.
+ *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
@@ -334,15 +338,70 @@ long clk_round_rate(struct clk *clk, unsigned long rate)
 }
 EXPORT_SYMBOL_GPL(clk_round_rate);
 
+/*
+ * Find the correct struct clk for the device and connection ID.
+ * We do slightly fuzzy matching here:
+ *  An entry with a NULL ID is assumed to be a wildcard.
+ *  If an entry has a device ID, it must match
+ *  If an entry has a connection ID, it must match
+ * Then we take the most specific entry - with the following
+ * order of precidence: dev+con > dev only > con only.
+ */
+static struct clk *clk_find(const char *dev_id, const char *con_id)
+{
+	struct clk_lookup *p;
+	struct clk *clk = NULL;
+	int match, best = 0;
+
+	list_for_each_entry(p, &clock_list, node) {
+		match = 0;
+		if (p->dev_id) {
+			if (!dev_id || strcmp(p->dev_id, dev_id))
+				continue;
+			match += 2;
+		}
+		if (p->con_id) {
+			if (!con_id || strcmp(p->con_id, con_id))
+				continue;
+			match += 1;
+		}
+		if (match == 0)
+			continue;
+
+		if (match > best) {
+			clk = p->clk;
+			best = match;
+		}
+	}
+	return clk;
+}
+
+struct clk *clk_get_sys(const char *dev_id, const char *con_id)
+{
+	struct clk *clk;
+
+	mutex_lock(&clock_list_sem);
+	clk = clk_find(dev_id, con_id);
+	mutex_unlock(&clock_list_sem);
+
+	return clk ? clk : ERR_PTR(-ENOENT);
+}
+EXPORT_SYMBOL_GPL(clk_get_sys);
+
 /*
  * Returns a clock. Note that we first try to use device id on the bus
  * and clock name. If this fails, we try to use clock name only.
  */
 struct clk *clk_get(struct device *dev, const char *id)
 {
+	const char *dev_id = dev ? dev_name(dev) : NULL;
 	struct clk *p, *clk = ERR_PTR(-ENOENT);
 	int idno;
 
+	clk = clk_get_sys(dev_id, id);
+	if (clk)
+		return clk;
+
 	if (dev == NULL || dev->bus != &platform_bus_type)
 		idno = -1;
 	else
-- 
GitLab


From 77d1a4999502c260df0eb2de437d320bf8c64b36 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 14:21:12 -0700
Subject: [PATCH 1062/2198] x86, boot: make symbols from the main vmlinux
 available

Make symbols from the main vmlinux, as opposed to just
compressed/vmlinux, available to header.S.  Also, export a few
additional symbols.

This will be used in a subsequent patch to export the total memory
footprint of the kernel.

[ Impact: enable future enhancement ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/Makefile | 24 ++++++++++++++++--------
 arch/x86/boot/header.S |  7 ++++---
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 6633b6e7505a6..75e0301fc69a9 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -86,19 +86,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-offsets := -e 's/^00*/0/' \
-        -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
 
-quiet_cmd_offsets = OFFSETS $@
-      cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+quiet_cmd_voffset = VOFFSET $@
+      cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
 
-$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
-	$(call if_changed,offsets)
+targets += voffset.h
+$(obj)/voffset.h: vmlinux FORCE
+	$(call if_changed,voffset)
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+      cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+	$(call if_changed,zoffset)
 
-targets += offsets.h
 
 AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/offsets.h
+$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
 
 LDFLAGS_setup.elf	:= -T
 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 5d84d1c74e4c6..27285143adeb4 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -22,7 +22,8 @@
 #include <asm/page_types.h>
 #include <asm/setup.h>
 #include "boot.h"
-#include "offsets.h"
+#include "voffset.h"
+#include "zoffset.h"
 
 BOOTSEG		= 0x07C0		/* original address of boot-sector */
 SYSSEG		= 0x1000		/* historical load address >> 4 */
@@ -212,8 +213,8 @@ hardware_subarch:	.long 0			# subarchitecture, added with 2.07
 
 hardware_subarch_data:	.quad 0
 
-payload_offset:		.long input_data
-payload_length:		.long input_data_end-input_data
+payload_offset:		.long ZO_input_data
+payload_length:		.long ZO_z_input_len
 
 setup_data:		.quad 0			# 64-bit physical pointer to
 						# single linked list of
-- 
GitLab


From 40b387a8a9a821878ecdf9fb117958c426fc1385 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 14:41:55 -0700
Subject: [PATCH 1063/2198] x86, boot: use LOAD_PHYSICAL_ADDR on 64 bits

Use LOAD_PHYSICAL_ADDR instead of CONFIG_PHYSICAL_START in the 64-bit
decompression code, for equivalence with the 32-bit code.

[ Impact: cleanup, increases code similarity ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_64.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2b9f2510507b4..4135d438b6626 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -87,7 +87,7 @@ ENTRY(startup_32)
 	addl	$(PMD_PAGE_SIZE -1), %ebx
 	andl	$PMD_PAGE_MASK, %ebx
 #else
-	movl	$CONFIG_PHYSICAL_START, %ebx
+	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
 	/* Target address to relocate to for decompression */
@@ -215,7 +215,7 @@ ENTRY(startup_64)
 	 *
 	 * If it is a relocatable kernel then decompress and run the kernel
 	 * from load address aligned to 2MB addr, otherwise decompress and
-	 * run the kernel from CONFIG_PHYSICAL_START
+	 * run the kernel from LOAD_PHYSICAL_ADDR
 	 *
 	 * We cannot rely on the calculation done in 32-bit mode, since we
 	 * may have been invoked via the 64-bit entry point.
@@ -228,7 +228,7 @@ ENTRY(startup_64)
 	andq	$PMD_PAGE_MASK, %rbp
 	movq	%rbp, %rbx
 #else
-	movq	$CONFIG_PHYSICAL_START, %rbp
+	movq	$LOAD_PHYSICAL_ADDR, %rbp
 	movq	%rbp, %rbx
 #endif
 
-- 
GitLab


From 2ff799d3cff1ecb274049378b28120ee5c1c5e5f Mon Sep 17 00:00:00 2001
From: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Date: Mon, 11 May 2009 20:09:14 +0530
Subject: [PATCH 1064/2198] sched: Don't export sched_mc_power_savings on
 multi-socket single core system

Fix to prevent sched_mc_power_saving from being exported through sysfs
for multi-scoket single core system. Max cores should be always greater than
one (1). My earlier patch that introduced fix for not exporting
'sched_mc_power_saving' on laptops  broke it on multi-socket single core
system. This fix addresses issue on both laptop and multi-socket single
core system.
Below are the Test results:

1. Single socket - multi-core
       Before Patch: Does not export 'sched_mc_power_saving'
       After Patch: Does not export 'sched_mc_power_saving'
       Result: Pass

2. Multi Socket - single core
      Before Patch: exports 'sched_mc_power_saving'
      After Patch: Does not export 'sched_mc_power_saving'
      Result: Pass

3. Multi Socket - Multi core
      Before Patch: exports 'sched_mc_power_saving'
      After Patch: exports 'sched_mc_power_saving'

[ Impact: make the sched_mc_power_saving control available more consistently ]

Signed-off-by: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
Cc: Suresh B Siddha <suresh.b.siddha@intel.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090511143914.GB4853@dirshya.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/topology.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index f44b49abca49b..066ef590d7e05 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -203,7 +203,8 @@ struct pci_bus;
 void x86_pci_root_bus_res_quirks(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
-#define mc_capable()	(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable()	((boot_cpu_data.x86_max_cores > 1) && \
+			(cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
 #define smt_capable()			(smp_num_siblings > 1)
 #endif
 
-- 
GitLab


From 99aa45595f45603526513d5e29fc00f8afbf3913 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:02:10 -0700
Subject: [PATCH 1065/2198] x86, boot: remove dead code from
 boot/compressed/head_*.S

Remove a couple of lines of dead code from
arch/x86/boot/compressed/head_*.S; all of these update registers that
are dead in the current code.

[ Impact: cleanup ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S | 10 ----------
 arch/x86/boot/compressed/head_64.S |  2 --
 2 files changed, 12 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 470474bafc4dc..2b8e0dfa4b276 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -99,16 +99,6 @@ ENTRY(startup_32)
 	cld
 	popl	%esi
 
-/*
- * Compute the kernel start address.
- */
-#ifdef CONFIG_RELOCATABLE
-	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
-	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
-#else
-	movl	$LOAD_PHYSICAL_ADDR, %ebp
-#endif
-
 /*
  * Jump to the relocated address.
  */
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 4135d438b6626..2bb500af1bd76 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -226,10 +226,8 @@ ENTRY(startup_64)
 	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
 	addq	$(PMD_PAGE_SIZE - 1), %rbp
 	andq	$PMD_PAGE_MASK, %rbp
-	movq	%rbp, %rbx
 #else
 	movq	$LOAD_PHYSICAL_ADDR, %rbp
-	movq	%rbp, %rbx
 #endif
 
 	/* Target address to relocate to for decompression */
-- 
GitLab


From 37ba7ab5e33cebc25c68fffe33e9f21e7c2014e8 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 15:56:08 -0700
Subject: [PATCH 1066/2198] x86, boot: make kernel_alignment adjustable; new
 bzImage fields

Make the kernel_alignment field adjustable; this allows us to set it
to a large value (intended to be 16 MB to avoid ZONE_DMA contention,
memory holes and other weirdness) while a smart bootloader can still
force a loading at a lesser alignment if absolutely necessary.

Also export pref_address (preferred loading address, corresponding to
the link-time address) and init_size, the total amount of linear
memory the kernel will require during initialization.

[ Impact: allows better kernel placement, gives bootloader more info ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/head_32.S |  7 +++++--
 arch/x86/boot/compressed/head_64.S | 14 ++++++++++----
 arch/x86/boot/header.S             | 15 +++++++++++++--
 arch/x86/include/asm/boot.h        | 15 +++++++++++++++
 arch/x86/kernel/asm-offsets_32.c   |  1 +
 arch/x86/kernel/asm-offsets_64.c   |  1 +
 6 files changed, 45 insertions(+), 8 deletions(-)

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 2b8e0dfa4b276..75e4f001e7061 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -69,8 +69,11 @@ ENTRY(startup_32)
 
 #ifdef CONFIG_RELOCATABLE
 	movl	%ebp, %ebx
-	addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
-	andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
+	movl	BP_kernel_alignment(%esi), %eax
+	decl	%eax
+	addl    %eax, %ebx
+	notl	%eax
+	andl    %eax, %ebx
 #else
 	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 2bb500af1bd76..f62c284db9eb5 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -84,8 +84,11 @@ ENTRY(startup_32)
 
 #ifdef CONFIG_RELOCATABLE
 	movl	%ebp, %ebx
-	addl	$(PMD_PAGE_SIZE -1), %ebx
-	andl	$PMD_PAGE_MASK, %ebx
+	movl	BP_kernel_alignment(%esi), %eax
+	decl	%eax
+	addl	%eax, %ebx
+	notl	%eax
+	andl	%eax, %ebx
 #else
 	movl	$LOAD_PHYSICAL_ADDR, %ebx
 #endif
@@ -224,8 +227,11 @@ ENTRY(startup_64)
 	/* Start with the delta to where the kernel will run at. */
 #ifdef CONFIG_RELOCATABLE
 	leaq	startup_32(%rip) /* - $startup_32 */, %rbp
-	addq	$(PMD_PAGE_SIZE - 1), %rbp
-	andq	$PMD_PAGE_MASK, %rbp
+	movl	BP_kernel_alignment(%rsi), %eax
+	decl	%eax
+	addq	%rax, %rbp
+	notq	%rax
+	andq	%rax, %rbp
 #else
 	movq	$LOAD_PHYSICAL_ADDR, %rbp
 #endif
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 27285143adeb4..a0b426978d554 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -116,7 +116,7 @@ _start:
 	# Part 2 of the header, from the old setup.S
 
 		.ascii	"HdrS"		# header signature
-		.word	0x0209		# header version number (>= 0x0105)
+		.word	0x020a		# header version number (>= 0x0105)
 					# or else old loadlin-1.5 will fail)
 		.globl realmode_swtch
 realmode_swtch:	.word	0, 0		# default_switch, SETUPSEG
@@ -201,7 +201,7 @@ relocatable_kernel:    .byte 1
 #else
 relocatable_kernel:    .byte 0
 #endif
-pad2:			.byte 0
+min_alignment:		.byte MIN_KERNEL_ALIGN_LG2	# minimum alignment
 pad3:			.word 0
 
 cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
@@ -220,6 +220,17 @@ setup_data:		.quad 0			# 64-bit physical pointer to
 						# single linked list of
 						# struct setup_data
 
+pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
+
+#define ZO_INIT_SIZE	(ZO__end - ZO_startup_32 + ZO_extract_offset)
+#define VO_INIT_SIZE	(VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+#define INIT_SIZE ZO_INIT_SIZE
+#else
+#define INIT_SIZE VO_INIT_SIZE
+#endif
+init_size:		.long INIT_SIZE		# kernel initialization size
+
 # End of setup header #####################################################
 
 	.section ".inittext", "ax"
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 6ba23dd9fc921..418e632d4a801 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -8,11 +8,26 @@
 
 #ifdef __KERNEL__
 
+#include <asm/page_types.h>
+
 /* Physical address where kernel should be loaded. */
 #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
 				+ (CONFIG_PHYSICAL_ALIGN - 1)) \
 				& ~(CONFIG_PHYSICAL_ALIGN - 1))
 
+/* Minimum kernel alignment, as a power of two */
+#ifdef CONFIG_x86_64
+#define MIN_KERNEL_ALIGN_LG2	PMD_SHIFT
+#else
+#define MIN_KERNEL_ALIGN_LG2	(PAGE_SHIFT+1)
+#endif
+#define MIN_KERNEL_ALIGN	(_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+	(CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
 #ifdef CONFIG_KERNEL_BZIP2
 #define BOOT_HEAP_SIZE             0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 5a6aa1c1162f0..1a830cbd70153 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -146,4 +146,5 @@ void foo(void)
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
+	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index e72f062fb4b5f..898ecc47e129e 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -125,6 +125,7 @@ int main(void)
 	OFFSET(BP_loadflags, boot_params, hdr.loadflags);
 	OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
 	OFFSET(BP_version, boot_params, hdr.version);
+	OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 
 	BLANK();
 	DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
-- 
GitLab


From d297366ba692faf1f0384811a6ff0b20c3470b1b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:06:23 -0700
Subject: [PATCH 1067/2198] x86: document new bzImage fields

Document the new bzImage fields for kernel memory placement.

[ Impact: adds documentation ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/x86/boot.txt | 69 +++++++++++++++++++++++++++++++++++---
 1 file changed, 65 insertions(+), 4 deletions(-)

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index e0203662f9e9f..cf8dfc70a118d 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,6 +50,11 @@ Protocol 2.08:	(Kernel 2.6.26) Added crc32 checksum and ELF format
 Protocol 2.09:	(Kernel 2.6.26) Added a field of 64-bit physical
 		pointer to single linked list of struct	setup_data.
 
+Protocol 2.10:	(Kernel 2.6.31) A protocol for relaxed alignment
+		beyond the kernel_alignment added, new init_size and
+		pref_address fields.
+
+
 **** MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -173,7 +178,7 @@ Offset	Proto	Name		Meaning
 022C/4	2.03+	ramdisk_max	Highest legal initrd address
 0230/4	2.05+	kernel_alignment Physical addr alignment required for kernel
 0234/1	2.05+	relocatable_kernel Whether kernel is relocatable or not
-0235/1	N/A	pad2		Unused
+0235/1	2.10+	min_alignment	Minimum alignment, as a power of two
 0236/2	N/A	pad3		Unused
 0238/4	2.06+	cmdline_size	Maximum size of the kernel command line
 023C/4	2.07+	hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset	Proto	Name		Meaning
 024C/4	2.08+	payload_length	Length of kernel payload
 0250/8	2.09+	setup_data	64-bit physical pointer to linked list
 				of struct setup_data
+0258/8	2.10+	pref_address	Preferred loading address
+0260/4	2.10+	init_size	Linear memory required during initialization
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
     real value is 4.
@@ -482,11 +489,19 @@ Protocol:	2.03+
   0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
 
 Field name:	kernel_alignment
-Type:		read (reloc)
+Type:		read/modify (reloc)
 Offset/size:	0x230/4
-Protocol:	2.05+
+Protocol:	2.05+ (read), 2.10+ (modify)
+
+  Alignment unit required by the kernel (if relocatable_kernel is
+  true.)  A relocatable kernel that is loaded at an alignment
+  incompatible with the value in this field will be realigned during
+  kernel initialization.
 
-  Alignment unit required by the kernel (if relocatable_kernel is true.)
+  Starting with protocol version 2.10, this reflects the kernel
+  alignment preferred for optimal performance; it is possible for the
+  loader to modify this field to permit a lesser alignment.  See the
+  min_alignment and pref_address field below.
 
 Field name:	relocatable_kernel
 Type:		read (reloc)
@@ -498,6 +513,22 @@ Protocol:	2.05+
   After loading, the boot loader must set the code32_start field to
   point to the loaded code, or to a boot loader hook.
 
+Field name:	min_alignment
+Type:		read (reloc)
+Offset/size:	0x235/1
+Protocol:	2.10+
+
+  This field, if nonzero, indicates as a power of two the minimum
+  alignment required, as opposed to preferred, by the kernel to boot.
+  If a boot loader makes use of this field, it should update the
+  kernel_alignment field with the alignment unit desired; typically:
+
+	kernel_alignment = 1 << min_alignment
+
+  There may be a considerable performance cost with an excessively
+  misaligned kernel.  Therefore, a loader should typically try each
+  power-of-two alignment from kernel_alignment down to this alignment.
+
 Field name:	cmdline_size
 Type:		read
 Offset/size:	0x238/4
@@ -582,6 +613,36 @@ Protocol:	2.09+
   sure to consider the case where the linked list already contains
   entries.
 
+Field name:	pref_address
+Type:		read (reloc)
+Offset/size:	0x258/8
+Protocol:	2.10+
+
+  This field, if nonzero, represents a preferred load address for the
+  kernel.  A relocating bootloader should attempt to load at this
+  address if possible.
+
+  A non-relocatable kernel will unconditionally move itself and to run
+  at this address.
+
+Field name:	init_size
+Type:		read
+Offset/size:	0x25c/4
+
+  This field indicates the amount of linear contiguous memory starting
+  at the kernel runtime start address that the kernel needs before it
+  is capable of examining its memory map.  This is not the same thing
+  as the total amount of memory the kernel needs to boot, but it can
+  be used by a relocating boot loader to help select a safe load
+  address for the kernel.
+
+  The kernel runtime start address is determined by the following algorithm:
+
+  if (relocatable_kernel)
+	runtime_start = align_up(load_address, kernel_alignment)
+  else
+	runtime_start = pref_address
+
 
 **** THE IMAGE CHECKSUM
 
-- 
GitLab


From ceefccc93932b920a8ec6f35f596db05202a12fe Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:12:16 -0700
Subject: [PATCH 1068/2198] x86: default CONFIG_PHYSICAL_START and
 CONFIG_PHYSICAL_ALIGN to 16 MB

Default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN each to 16 MB,
so that both non-relocatable and relocatable kernels are loaded at
16 MB by a non-relocating bootloader.  This is somewhat hacky, but it
appears to be the only way to do this that does not break some some
set of existing bootloaders.

We want to avoid the bottom 16 MB because of large page breakup,
memory holes, and ZONE_DMA.  Embedded systems may need to reduce this,
or update their bootloaders to be aware of the new min_alignment field.

[ Impact: performance improvement, avoids problems on some systems ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 27 ++++++++++++---------------
 1 file changed, 12 insertions(+), 15 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5aee45356b58d..50fbb47f5295b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1455,9 +1455,7 @@ config KEXEC_JUMP
 
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-	default "0x1000000" if X86_NUMAQ
-	default "0x200000" if X86_64
-	default "0x100000"
+	default "0x1000000"
 	---help---
 	  This gives the physical address where the kernel is loaded.
 
@@ -1476,15 +1474,15 @@ config PHYSICAL_START
 	  to be specifically compiled to run from a specific memory area
 	  (normally a reserved region) and this option comes handy.
 
-	  So if you are using bzImage for capturing the crash dump, leave
-	  the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
-	  Otherwise if you plan to use vmlinux for capturing the crash dump
-	  change this value to start of the reserved region (Typically 16MB
-	  0x1000000). In other words, it can be set based on the "X" value as
-	  specified in the "crashkernel=YM@XM" command line boot parameter
-	  passed to the panic-ed kernel. Typically this parameter is set as
-	  crashkernel=64M@16M. Please take a look at
-	  Documentation/kdump/kdump.txt for more details about crash dumps.
+	  So if you are using bzImage for capturing the crash dump,
+	  leave the value here unchanged to 0x1000000 and set
+	  CONFIG_RELOCATABLE=y.  Otherwise if you plan to use vmlinux
+	  for capturing the crash dump change this value to start of
+	  the reserved region.  In other words, it can be set based on
+	  the "X" value as specified in the "crashkernel=YM@XM"
+	  command line boot parameter passed to the panic-ed
+	  kernel. Please take a look at Documentation/kdump/kdump.txt
+	  for more details about crash dumps.
 
 	  Usage of bzImage for capturing the crash dump is recommended as
 	  one does not have to build two kernels. Same kernel can be used
@@ -1521,9 +1519,8 @@ config X86_NEED_RELOCS
 config PHYSICAL_ALIGN
 	hex
 	prompt "Alignment value to which kernel should be aligned" if X86_32
-	default "0x100000" if X86_32
-	default "0x200000" if X86_64
-	range 0x2000 0x400000
+	default "0x1000000"
+	range 0x2000 0x1000000
 	---help---
 	  This value puts the alignment restrictions on physical address
 	  where kernel is loaded and run from. Kernel is compiled for an
-- 
GitLab


From 26717808f93a27c22d4853c4fb17fa225f4ccc68 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 7 May 2009 14:19:34 -0700
Subject: [PATCH 1069/2198] x86: make CONFIG_RELOCATABLE the default

Remove the EXPERIMENTAL tag from CONFIG_RELOCATABLE and make it the
default.  Relocatable kernels have been used for a while now, and
should now have identical semantics to non-relocatable kernels when
loaded by a non-relocating bootloader.

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 50fbb47f5295b..3e0f80a764a78 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1495,8 +1495,8 @@ config PHYSICAL_START
 	  Don't change this unless you know what you are doing.
 
 config RELOCATABLE
-	bool "Build a relocatable kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Build a relocatable kernel"
+	default y
 	---help---
 	  This builds a kernel image that retains relocation information
 	  so it can be loaded someplace besides the default 1MB.
-- 
GitLab


From c4a994645d04d5fa6bfa52a3204af87dd92168d5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:20:12 -0700
Subject: [PATCH 1070/2198] x86, defconfig: update to current, no material
 changes

Update defconfigs to reflect current configuration files.  No other
changes.

[ Impact: updates defconfigs to match what "make defconfig" generates ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/configs/i386_defconfig   | 143 ++++++++++++++++++++++-------
 arch/x86/configs/x86_64_defconfig | 148 ++++++++++++++++++++++--------
 2 files changed, 223 insertions(+), 68 deletions(-)

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 235b81d0f6f2b..70036a7a95062 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,12 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:50:58 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:19:47 2009
 #
 # CONFIG_64BIT is not set
 CONFIG_X86_32=y
 # CONFIG_X86_64 is not set
 CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf32-i386"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_DEFAULT_IDLE=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
 # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 # CONFIG_AUDIT_ARCH is not set
 CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_X86_32_SMP=y
 CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
+CONFIG_X86_32_LAZY_GS=y
 CONFIG_KTIME_SCALAR=y
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_LOCALVERSION=""
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
-CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
+# CONFIG_X86_BIGSMP is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_VSMP is not set
 # CONFIG_X86_RDC321X is not set
+# CONFIG_X86_32_NON_STANDARD is not set
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
 # CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
 # CONFIG_GENERIC_CPU is not set
 CONFIG_X86_GENERIC=y
 CONFIG_X86_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_X86_XADD=y
 # CONFIG_X86_PPRO_FENCE is not set
 CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_CPU_SUP_INTEL=y
 CONFIG_CPU_SUP_CYRIX_32=y
 CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_CENTAUR=y
 CONFIG_CPU_SUP_TRANSMETA_32=y
 CONFIG_CPU_SUP_UMC_32=y
 CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
 # CONFIG_NOHIGHMEM is not set
 CONFIG_HIGHMEM4G=y
 # CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
 CONFIG_HIGHPTE=y
 CONFIG_X86_CHECK_BIOS_CORRUPTION=y
 CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_300 is not set
@@ -363,7 +384,6 @@ CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_ACPI_CONTAINER=y
 # CONFIG_ACPI_SBS is not set
@@ -425,6 +445,7 @@ CONFIG_PCI_BIOS=y
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 CONFIG_PCI_DOMAINS=y
+# CONFIG_DMAR is not set
 CONFIG_PCIEPORTBUS=y
 # CONFIG_HOTPLUG_PCI_PCIE is not set
 CONFIG_PCIEAER=y
@@ -435,6 +456,7 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_PCI_STUB is not set
 CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
 CONFIG_ISA_DMA_API=y
 # CONFIG_ISA is not set
 # CONFIG_MCA is not set
@@ -481,7 +503,6 @@ CONFIG_NET=y
 #
 # Networking options
 #
-CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_PACKET=y
 CONFIG_PACKET_MMAP=y
 CONFIG_UNIX=y
@@ -639,6 +660,7 @@ CONFIG_LLC=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 CONFIG_NET_SCHED=y
 
 #
@@ -696,6 +718,7 @@ CONFIG_NET_SCH_FIFO=y
 #
 # CONFIG_NET_PKTGEN is not set
 # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
 CONFIG_HAMRADIO=y
 
 #
@@ -706,12 +729,10 @@ CONFIG_HAMRADIO=y
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=y
 # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +810,7 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
 # CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
 # CONFIG_C2PORT is not set
 
 #
@@ -842,6 +864,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -940,6 +963,7 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -977,6 +1001,8 @@ CONFIG_MII=y
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 CONFIG_NET_TULIP=y
 # CONFIG_DE2104X is not set
 # CONFIG_TULIP is not set
@@ -1026,6 +1052,7 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1067,7 @@ CONFIG_BNX2=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1077,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1058,6 +1087,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 CONFIG_TR=y
 # CONFIG_IBMOL is not set
 # CONFIG_IBMLS is not set
@@ -1073,8 +1103,8 @@ CONFIG_WLAN_80211=y
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 # CONFIG_PRISM54 is not set
@@ -1084,21 +1114,21 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
 # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
 # CONFIG_IPW2100 is not set
 # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
 # CONFIG_B43LEGACY is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
 
 #
 # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1239,8 @@ CONFIG_INPUT_TABLET=y
 # CONFIG_TABLET_USB_KBTAB is not set
 # CONFIG_TABLET_USB_WACOM is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1335,7 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 CONFIG_HW_RANDOM_INTEL=y
 CONFIG_HW_RANDOM_AMD=y
 CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1423,6 @@ CONFIG_I2C_I801=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1456,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_ADT7475 is not set
 # CONFIG_SENSORS_K8TEMP is not set
 # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
 # CONFIG_SENSORS_ATXP1 is not set
 # CONFIG_SENSORS_DS1621 is not set
 # CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1466,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_FSCHER is not set
 # CONFIG_SENSORS_FSCPOS is not set
 # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1482,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_LM90 is not set
 # CONFIG_SENSORS_LM92 is not set
 # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
 # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
 # CONFIG_SENSORS_MAX1619 is not set
 # CONFIG_SENSORS_MAX6650 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1680,6 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_3DFX is not set
 # CONFIG_FB_VOODOO1 is not set
 # CONFIG_FB_VT8623 is not set
-# CONFIG_FB_CYBLA is not set
 # CONFIG_FB_TRIDENT is not set
 # CONFIG_FB_ARK is not set
 # CONFIG_FB_PM3 is not set
@@ -1652,6 +1688,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1775,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1850,17 @@ CONFIG_USB_HIDDEV=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 CONFIG_LOGITECH_FF=y
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1926,11 @@ CONFIG_USB_PRINTER=y
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1972,6 @@ CONFIG_USB_LIBUSUAL=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 # CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1987,7 @@ CONFIG_USB_LIBUSUAL=y
 #
 # OTG and related infrastructure
 #
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1958,8 +1999,10 @@ CONFIG_LEDS_CLASS=y
 #
 # CONFIG_LEDS_ALIX2 is not set
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1969,6 +2012,10 @@ CONFIG_LEDS_TRIGGERS=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 CONFIG_EDAC=y
@@ -2037,6 +2084,7 @@ CONFIG_DMADEVICES=y
 # DMA Devices
 #
 # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2119,7 @@ CONFIG_DMIID=y
 #
 # CONFIG_EXT2_FS is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -2100,6 +2149,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -2151,6 +2205,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -2164,7 +2219,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
@@ -2251,6 +2305,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
@@ -2266,6 +2321,7 @@ CONFIG_TIMER_STATS=y
 # CONFIG_LOCK_STAT is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_HIGHMEM is not set
 CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2345,19 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2305,13 +2367,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
 # CONFIG_SYSPROF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_POWER_TRACER is not set
 # CONFIG_STACK_TRACER is not set
 # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -2321,7 +2391,6 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_PER_CPU_MAPS is not set
 # CONFIG_X86_PTDUMP is not set
 CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2398,7 @@ CONFIG_DEBUG_RODATA=y
 CONFIG_DEBUG_NX_TEST=m
 # CONFIG_4KSTACKS is not set
 CONFIG_DOUBLEFAULT=y
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2434,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
 CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
 # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
 CONFIG_CRYPTO=y
 
 #
@@ -2380,10 +2451,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_HASH2=y
 CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2529,7 @@ CONFIG_CRYPTO_DES=y
 # Compression
 #
 # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
@@ -2467,11 +2541,13 @@ CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_GEODE is not set
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
 CONFIG_VIRTUALIZATION=y
 # CONFIG_KVM is not set
 # CONFIG_LGUEST is not set
 # CONFIG_VIRTIO_PCI is not set
 # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
@@ -2489,7 +2565,10 @@ CONFIG_CRC32=y
 # CONFIG_LIBCRC32C is not set
 CONFIG_AUDIT_GENERIC=y
 CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 9fe5d212ab4cc..f3e53e21f438d 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,12 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:44:16 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:19:24 2009
 #
 CONFIG_64BIT=y
 # CONFIG_X86_32 is not set
 CONFIG_X86_64=y
 CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_DEFAULT_IDLE=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
 CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 CONFIG_AUDIT_ARCH=y
 CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_X86_64_SMP=y
 CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
 # CONFIG_KTIME_SCALAR is not set
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_LOCALVERSION=""
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
-CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 CONFIG_BLOCK_COMPAT=y
@@ -196,11 +210,10 @@ CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
 # CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
-CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_VSMP is not set
+# CONFIG_X86_UV is not set
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
 # CONFIG_MEMTEST is not set
@@ -230,10 +243,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_MCORE2 is not set
 CONFIG_GENERIC_CPU=y
 CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
 CONFIG_X86_WP_WORKS_OK=y
 CONFIG_X86_TSC=y
 CONFIG_X86_CMPXCHG64=y
@@ -242,7 +255,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
 CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_CPU_SUP_INTEL=y
 CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_CPU_SUP_CENTAUR=y
 CONFIG_X86_DS=y
 CONFIG_X86_PTRACE_BTS=y
 CONFIG_HPET_TIMER=y
@@ -269,6 +282,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
 CONFIG_X86_MCE=y
 CONFIG_X86_MCE_INTEL=y
 CONFIG_X86_MCE_AMD=y
+CONFIG_X86_MCE_THRESHOLD=y
 # CONFIG_I8K is not set
 CONFIG_MICROCODE=y
 CONFIG_MICROCODE_INTEL=y
@@ -276,6 +290,7 @@ CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
 CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
 CONFIG_DIRECT_GBPAGES=y
 CONFIG_NUMA=y
@@ -309,6 +324,8 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
 CONFIG_X86_CHECK_BIOS_CORRUPTION=y
 CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
 CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +334,7 @@ CONFIG_MTRR=y
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_300 is not set
@@ -325,9 +343,10 @@ CONFIG_HZ=1000
 CONFIG_SCHED_HRTICK=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
 # CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +389,6 @@ CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_ACPI_CONTAINER=y
 # CONFIG_ACPI_SBS is not set
@@ -436,6 +454,7 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_PCI_STUB is not set
 CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
 CONFIG_ISA_DMA_API=y
 CONFIG_K8_NB=y
 CONFIG_PCCARD=y
@@ -481,7 +500,6 @@ CONFIG_NET=y
 #
 # Networking options
 #
-CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_PACKET=y
 CONFIG_PACKET_MMAP=y
 CONFIG_UNIX=y
@@ -639,6 +657,7 @@ CONFIG_LLC=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 CONFIG_NET_SCHED=y
 
 #
@@ -696,6 +715,7 @@ CONFIG_NET_SCH_FIFO=y
 #
 # CONFIG_NET_PKTGEN is not set
 # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
 CONFIG_HAMRADIO=y
 
 #
@@ -706,12 +726,10 @@ CONFIG_HAMRADIO=y
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=y
 # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +806,8 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_TIFM_CORE is not set
 # CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_SGI_XP is not set
 # CONFIG_HP_ILO is not set
-# CONFIG_SGI_GRU is not set
+# CONFIG_ISL29003 is not set
 # CONFIG_C2PORT is not set
 
 #
@@ -844,6 +861,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -940,6 +958,7 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -977,6 +996,8 @@ CONFIG_MII=y
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 CONFIG_NET_TULIP=y
 # CONFIG_DE2104X is not set
 # CONFIG_TULIP is not set
@@ -1026,6 +1047,7 @@ CONFIG_E1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1062,7 @@ CONFIG_TIGON3=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1072,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1058,6 +1082,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 CONFIG_TR=y
 # CONFIG_IBMOL is not set
 # CONFIG_3C359 is not set
@@ -1072,8 +1097,8 @@ CONFIG_WLAN_80211=y
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 # CONFIG_PRISM54 is not set
@@ -1083,21 +1108,21 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
 # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
 # CONFIG_IPW2100 is not set
 # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
 # CONFIG_B43LEGACY is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
 
 #
 # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1233,8 @@ CONFIG_INPUT_TABLET=y
 # CONFIG_TABLET_USB_KBTAB is not set
 # CONFIG_TABLET_USB_WACOM is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1328,7 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 # CONFIG_HW_RANDOM_INTEL is not set
 # CONFIG_HW_RANDOM_AMD is not set
 CONFIG_NVRAM=y
@@ -1382,7 +1410,6 @@ CONFIG_I2C_I801=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1443,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_ADT7475 is not set
 # CONFIG_SENSORS_K8TEMP is not set
 # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
 # CONFIG_SENSORS_ATXP1 is not set
 # CONFIG_SENSORS_DS1621 is not set
 # CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1453,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_FSCHER is not set
 # CONFIG_SENSORS_FSCPOS is not set
 # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1469,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_LM90 is not set
 # CONFIG_SENSORS_LM92 is not set
 # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
 # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
 # CONFIG_SENSORS_MAX1619 is not set
 # CONFIG_SENSORS_MAX6650 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1667,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1753,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1827,17 @@ CONFIG_USB_HIDDEV=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 CONFIG_LOGITECH_FF=y
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1903,11 @@ CONFIG_USB_PRINTER=y
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1949,6 @@ CONFIG_USB_LIBUSUAL=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 # CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1964,7 @@ CONFIG_USB_LIBUSUAL=y
 #
 # OTG and related infrastructure
 #
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1939,8 +1976,10 @@ CONFIG_LEDS_CLASS=y
 #
 # CONFIG_LEDS_ALIX2 is not set
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1950,6 +1989,10 @@ CONFIG_LEDS_TRIGGERS=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 CONFIG_EDAC=y
@@ -2018,6 +2061,7 @@ CONFIG_DMADEVICES=y
 # DMA Devices
 #
 # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2095,7 @@ CONFIG_DMIID=y
 #
 # CONFIG_EXT2_FS is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -2081,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -2132,6 +2182,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -2145,7 +2196,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
@@ -2232,6 +2282,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
@@ -2247,6 +2298,7 @@ CONFIG_TIMER_STATS=y
 # CONFIG_LOCK_STAT is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
 CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2321,19 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2285,13 +2343,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
 # CONFIG_SYSPROF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_POWER_TRACER is not set
 # CONFIG_STACK_TRACER is not set
 # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -2301,14 +2367,13 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_PER_CPU_MAPS is not set
 # CONFIG_X86_PTDUMP is not set
 CONFIG_DEBUG_RODATA=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_NX_TEST=m
 # CONFIG_IOMMU_DEBUG is not set
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2409,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
 CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
 # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
 CONFIG_CRYPTO=y
 
 #
@@ -2359,10 +2426,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_HASH2=y
 CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2483,7 @@ CONFIG_CRYPTO_SHA1=y
 #
 CONFIG_CRYPTO_AES=y
 # CONFIG_CRYPTO_AES_X86_64 is not set
+# CONFIG_CRYPTO_AES_NI_INTEL is not set
 # CONFIG_CRYPTO_ANUBIS is not set
 CONFIG_CRYPTO_ARC4=y
 # CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2505,7 @@ CONFIG_CRYPTO_DES=y
 # Compression
 #
 # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
@@ -2444,10 +2515,12 @@ CONFIG_CRYPTO_DES=y
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
 CONFIG_VIRTUALIZATION=y
 # CONFIG_KVM is not set
 # CONFIG_VIRTIO_PCI is not set
 # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
@@ -2464,7 +2537,10 @@ CONFIG_CRC32=y
 # CONFIG_CRC7 is not set
 # CONFIG_LIBCRC32C is not set
 CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
-- 
GitLab


From fe83fcc0a14dcf71996de5eb84771b2369ba7abc Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Mon, 11 May 2009 16:23:16 -0700
Subject: [PATCH 1071/2198] x86, defconfig: update kernel position parameters

Update CONFIG_RELOCATABLE, CONFIG_PHYSICAL_START and
CONFIG_PHYSICAL_ALIGN to reflect the current defaults.

[ Impact: make defconfig match Kconfig defaults ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/configs/i386_defconfig   | 7 ++++---
 arch/x86/configs/x86_64_defconfig | 4 ++--
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 70036a7a95062..edb992ebef92e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:19:47 2009
+# Mon May 11 16:21:55 2009
 #
 # CONFIG_64BIT is not set
 CONFIG_X86_32=y
@@ -343,8 +343,9 @@ CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index f3e53e21f438d..4ba7d4ef9aac6 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
 #
 # Automatically generated make config: don't edit
 # Linux kernel version: 2.6.30-rc2
-# Mon May 11 16:19:24 2009
+# Mon May 11 16:22:00 2009
 #
 CONFIG_64BIT=y
 # CONFIG_X86_32 is not set
@@ -345,7 +345,7 @@ CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
+CONFIG_RELOCATABLE=y
 CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
-- 
GitLab


From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Thu, 7 May 2009 16:54:11 -0700
Subject: [PATCH 1072/2198] x86: add extension fields for bootloader type and
 version

A long ago, in days of yore, it all began with a god named Thor.
There were vikings and boats and some plans for a Linux kernel
header.  Unfortunately, a single 8-bit field was used for bootloader
type and version.  This has generally worked without *too* much pain,
but we're getting close to flat running out of ID fields.

Add extension fields for both type and version.  The type will be
extended if it the old field is 0xE; the version is a simple MSB
extension.

Keep /proc/sys/kernel/bootloader_type containing
(type << 4) + (ver & 0xf) for backwards compatiblity, but also add
/proc/sys/kernel/bootloader_version which contains the full version
number.

[ Impact: new feature to support more bootloaders ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 Documentation/x86/boot.txt       | 59 ++++++++++++++++++++++++++++----
 arch/x86/boot/header.S           |  6 +++-
 arch/x86/include/asm/bootparam.h |  3 +-
 arch/x86/include/asm/processor.h |  1 +
 arch/x86/kernel/setup.c          | 10 ++++--
 kernel/sysctl.c                  |  8 +++++
 6 files changed, 76 insertions(+), 11 deletions(-)

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index cf8dfc70a118d..8da3a795083fe 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -50,10 +50,9 @@ Protocol 2.08:	(Kernel 2.6.26) Added crc32 checksum and ELF format
 Protocol 2.09:	(Kernel 2.6.26) Added a field of 64-bit physical
 		pointer to single linked list of struct	setup_data.
 
-Protocol 2.10:	(Kernel 2.6.31) A protocol for relaxed alignment
+Protocol 2.10:	(Kernel 2.6.31) Added a protocol for relaxed alignment
 		beyond the kernel_alignment added, new init_size and
-		pref_address fields.
-
+		pref_address fields.  Added extended boot loader IDs.
 
 **** MEMORY LAYOUT
 
@@ -173,7 +172,8 @@ Offset	Proto	Name		Meaning
 021C/4	2.00+	ramdisk_size	initrd size (set by boot loader)
 0220/4	2.00+	bootsect_kludge	DO NOT USE - for bootsect.S use only
 0224/2	2.01+	heap_end_ptr	Free memory after setup end
-0226/2	N/A	pad1		Unused
+0226/1	2.02+(3 ext_loader_ver	Extended boot loader version
+0227/1	2.02+(3	ext_loader_type	Extended boot loader ID
 0228/4	2.02+	cmd_line_ptr	32-bit pointer to the kernel command line
 022C/4	2.03+	ramdisk_max	Highest legal initrd address
 0230/4	2.05+	kernel_alignment Physical addr alignment required for kernel
@@ -197,6 +197,8 @@ Offset	Proto	Name		Meaning
     field are unusable, which means the size of a bzImage kernel
     cannot be determined.
 
+(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
+
 If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
 the boot protocol version is "old".  Loading an old kernel, the
 following parameters should be assumed:
@@ -350,18 +352,32 @@ Protocol:	2.00+
   0xTV here, where T is an identifier for the boot loader and V is
   a version number.  Otherwise, enter 0xFF here.
 
+  For boot loader IDs above T = 0xD, write T = 0xE to this field and
+  write the extended ID minus 0x10 to the ext_loader_type field.
+  Similarly, the ext_loader_ver field can be used to provide more than
+  four bits for the bootloader version.
+
+  For example, for T = 0x15, V = 0x234, write:
+
+  type_of_loader  <- 0xE4
+  ext_loader_type <- 0x05
+  ext_loader_ver  <- 0x23
+
   Assigned boot loader ids:
 	0  LILO			(0x00 reserved for pre-2.00 bootloader)
 	1  Loadlin
 	2  bootsect-loader	(0x20, all other values reserved)
-	3  SYSLINUX
-	4  EtherBoot
+	3  Syslinux
+	4  Etherboot/gPXE
 	5  ELILO
 	7  GRUB
-	8  U-BOOT
+	8  U-Boot
 	9  Xen
 	A  Gujin
 	B  Qemu
+	C  Arcturus Networks uCbootloader
+	E  Extended		(see ext_loader_type)
+	F  Special		(0xFF = undefined)
 
   Please contact <hpa@zytor.com> if you need a bootloader ID
   value assigned.
@@ -460,6 +476,35 @@ Protocol:	2.01+
   Set this field to the offset (from the beginning of the real-mode
   code) of the end of the setup stack/heap, minus 0x0200.
 
+Field name:	ext_loader_ver
+Type:		write (optional)
+Offset/size:	0x226/1
+Protocol:	2.02+
+
+  This field is used as an extension of the version number in the
+  type_of_loader field.  The total version number is considered to be
+  (type_of_loader & 0x0f) + (ext_loader_ver << 4).
+
+  The use of this field is boot loader specific.  If not written, it
+  is zero.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
+Field name:	ext_loader_type
+Type:		write (obligatory if (type_of_loader & 0xf0) == 0xe0)
+Offset/size:	0x227/1
+Protocol:	2.02+
+
+  This field is used as an extension of the type number in
+  type_of_loader field.  If the type in type_of_loader is 0xE, then
+  the actual type is (ext_loader_type + 0x10).
+
+  This field is ignored if the type in type_of_loader is not 0xE.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
 Field name:	cmd_line_ptr
 Type:		write (obligatory)
 Offset/size:	0x228/4
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index a0b426978d554..68c3bfbaff240 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -169,7 +169,11 @@ heap_end_ptr:	.word	_end+STACK_SIZE-512
 					# end of setup code can be used by setup
 					# for local heap purposes.
 
-pad1:		.word	0
+ext_loader_ver:
+		.byte	0		# Extended boot loader version
+ext_loader_type:
+		.byte	0		# Extended boot loader type
+
 cmd_line_ptr:	.long	0		# (Header version 0x0202 or later)
 					# If nonzero, a 32-bit pointer
 					# to the kernel command line.
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 433adaebf9b63..1724e8de317c5 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -50,7 +50,8 @@ struct setup_header {
 	__u32	ramdisk_size;
 	__u32	bootsect_kludge;
 	__u16	heap_end_ptr;
-	__u16	_pad1;
+	__u8	ext_loader_ver;
+	__u8	ext_loader_type;
 	__u32	cmd_line_ptr;
 	__u32	initrd_addr_max;
 	__u32	kernel_alignment;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fcf4d92e7e04d..6384d25121cab 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -814,6 +814,7 @@ extern unsigned int		BIOS_revision;
 
 /* Boot loader type from the setup header: */
 extern int			bootloader_type;
+extern int			bootloader_version;
 
 extern char			ignore_fpu_irq;
 
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index b4158439bf634..2b093451aec95 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -214,8 +214,8 @@ unsigned long mmu_cr4_features;
 unsigned long mmu_cr4_features = X86_CR4_PAE;
 #endif
 
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
 
 /*
  * Setup options
@@ -706,6 +706,12 @@ void __init setup_arch(char **cmdline_p)
 #endif
 	saved_video_mode = boot_params.hdr.vid_mode;
 	bootloader_type = boot_params.hdr.type_of_loader;
+	if ((bootloader_type >> 4) == 0xe) {
+		bootloader_type &= 0xf;
+		bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+	}
+	bootloader_version  = bootloader_type & 0xf;
+	bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
 
 #ifdef CONFIG_BLK_DEV_RAM
 	rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e3d2c7dd59b97..cf91c9317b26f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -727,6 +727,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0444,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "bootloader_version",
+		.data		= &bootloader_version,
+		.maxlen		= sizeof (int),
+		.mode		= 0444,
+		.proc_handler	= &proc_dointvec,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "kstack_depth_to_print",
-- 
GitLab


From 37bcbf13d32e4e453e9def79ee72bd953b88302f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 13:59:10 -0400
Subject: [PATCH 1073/2198] IMA: use current_cred() instead of current->cred

Proper invocation of the current credentials is to use current_cred() not
current->cred.  This patches makes IMA use the new method.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_audit.c | 2 +-
 security/integrity/ima/ima_main.c  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c
index b628eea477a68..ff513ff737f5c 100644
--- a/security/integrity/ima/ima_audit.c
+++ b/security/integrity/ima/ima_audit.c
@@ -41,7 +41,7 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
 
 	ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno);
 	audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u ses=%u",
-			 current->pid, current->cred->uid,
+			 current->pid, current_cred()->uid,
 			 audit_get_loginuid(current),
 			 audit_get_sessionid(current));
 	audit_log_task_context(ab);
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 122f17fc7fc1f..cdae13c5ae05f 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -184,7 +184,7 @@ int ima_path_check(struct path *path, int mask)
 		struct dentry *dentry = dget(path->dentry);
 		struct vfsmount *mnt = mntget(path->mnt);
 
-		file = dentry_open(dentry, mnt, O_RDONLY, current->cred);
+		file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
 		rc = get_path_measurement(iint, file, dentry->d_name.name);
 	}
 out:
-- 
GitLab


From f06dd16a03f6f7f72fab4db03be36e28c28c6fd6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 13:59:16 -0400
Subject: [PATCH 1074/2198] IMA: Handle dentry_open failures

Currently IMA does not handle failures from dentry_open().  This means that we
leave a pointer set to ERR_PTR(errno) and then try to use it just a few lines
later in fput().  Oops.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_main.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index cdae13c5ae05f..1987424623c24 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -116,10 +116,6 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
 {
 	int rc = 0;
 
-	if (IS_ERR(file)) {
-		pr_info("%s dentry_open failed\n", filename);
-		return rc;
-	}
 	iint->opencount++;
 	iint->readcount++;
 
@@ -185,6 +181,12 @@ int ima_path_check(struct path *path, int mask)
 		struct vfsmount *mnt = mntget(path->mnt);
 
 		file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
+		if (IS_ERR(file)) {
+			pr_info("%s dentry_open failed\n", dentry->d_name.name);
+			rc = PTR_ERR(file);
+			file = NULL;
+			goto out;
+		}
 		rc = get_path_measurement(iint, file, dentry->d_name.name);
 	}
 out:
-- 
GitLab


From 1a62e958fa4aaeeb752311b4f5e16b2a86737b23 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 13:59:22 -0400
Subject: [PATCH 1075/2198] IMA: open all files O_LARGEFILE

If IMA tried to measure a file which was larger than 4G dentry_open would fail
with -EOVERFLOW since IMA wasn't passing O_LARGEFILE.  This patch passes
O_LARGEFILE to all IMA opens to avoid this problem.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_main.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index 1987424623c24..c4228c0eb2d0f 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -180,7 +180,8 @@ int ima_path_check(struct path *path, int mask)
 		struct dentry *dentry = dget(path->dentry);
 		struct vfsmount *mnt = mntget(path->mnt);
 
-		file = dentry_open(dentry, mnt, O_RDONLY, current_cred());
+		file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE,
+				   current_cred());
 		if (IS_ERR(file)) {
 			pr_info("%s dentry_open failed\n", dentry->d_name.name);
 			rc = PTR_ERR(file);
-- 
GitLab


From d93e4c940f51ae06b59c14523c4d55947f9597d6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 11 May 2009 20:47:15 -0400
Subject: [PATCH 1076/2198] securityfs: securityfs_remove should handle IS_ERR
 pointers

Both of the securityfs users (TPM and IMA) can call securityfs_remove and pass
an IS_ERR(dentry) in their failure paths.  This patch handles those rather
than panicing when it tries to start deferencing some negative memory.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/inode.c b/security/inode.c
index f3b91bfbe4cb9..f7496c6a022b7 100644
--- a/security/inode.c
+++ b/security/inode.c
@@ -287,7 +287,7 @@ void securityfs_remove(struct dentry *dentry)
 {
 	struct dentry *parent;
 
-	if (!dentry)
+	if (!dentry || IS_ERR(dentry))
 		return;
 
 	parent = dentry->d_parent;
-- 
GitLab


From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 16:08:00 -0400
Subject: [PATCH 1077/2198] ring-buffer: small optimizations

Doing some small changes in the fast path of the ring buffer recording
saves over 3% in the ring-buffer-benchmark test.

[ Impact: a little faster ring buffer recording ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 493cba46abc9b..f452de2ce4900 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event)
 	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static int
+static inline int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	     struct ring_buffer_event *event)
 {
@@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * also be made. But only the entry that did the actual
 	 * commit will be something other than zero.
 	 */
-	if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
-	    rb_page_write(cpu_buffer->tail_page) ==
-	    rb_commit_index(cpu_buffer)) {
+	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+		   rb_page_write(cpu_buffer->tail_page) ==
+		   rb_commit_index(cpu_buffer))) {
 
 		delta = ts - cpu_buffer->write_stamp;
 
@@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		if (unlikely(ts < cpu_buffer->write_stamp))
 			delta = 0;
 
-		if (test_time_stamp(delta)) {
+		else if (unlikely(test_time_stamp(delta))) {
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
 
@@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	 * If the timestamp was commited, make the commit our entry
 	 * now so that we will update it when needed.
 	 */
-	if (commit)
+	if (unlikely(commit))
 		rb_set_commit_event(cpu_buffer, event);
 	else if (!rb_is_commit(cpu_buffer, event))
 		delta = 0;
-- 
GitLab


From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 16:28:23 -0400
Subject: [PATCH 1078/2198] ring-buffer: use internal time stamp function

The ring_buffer_time_stamp that is exported adds a little more overhead
than is needed for using it internally. This patch adds an internal
timestamp function that can be inlined (a single line function)
and used internally for the ring buffer.

[ Impact: a little less overhead to the ring buffer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f452de2ce4900..a9e645a5bc10c 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -454,13 +454,18 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	return buffer->clock() << DEBUG_SHIFT;
+}
+
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 {
 	u64 time;
 
 	preempt_disable_notrace();
-	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = buffer->clock() << DEBUG_SHIFT;
+	time = rb_time_stamp(buffer, cpu);
 	preempt_enable_no_resched_notrace();
 
 	return time;
@@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		cpu_buffer->tail_page = next_page;
 
 		/* reread the time stamp */
-		*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
+		*ts = rb_time_stamp(buffer, cpu_buffer->cpu);
 		cpu_buffer->tail_page->page->time_stamp = *ts;
 	}
 
@@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		return NULL;
 
-	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
 	/*
 	 * Only the first commit can update the timestamp.
-- 
GitLab


From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 11 May 2009 22:11:05 -0400
Subject: [PATCH 1079/2198] ring-buffer: move code around to remove some
 branches

This is a bit of micro-optimizations. But since the ring buffer is used
in tracing every function call, it is an extreme hot path. Every nanosecond
counts.

This change shows over 5% improvement in the ring-buffer-benchmark.

[ Impact: more efficient code ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a9e645a5bc10c..16b24d49604ca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 		      unsigned long length)
 {
 	struct ring_buffer_event *event;
-	u64 ts, delta;
+	u64 ts, delta = 0;
 	int commit = 0;
 	int nr_loops = 0;
 
@@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
 		   rb_page_write(cpu_buffer->tail_page) ==
 		   rb_commit_index(cpu_buffer))) {
+		u64 diff;
 
-		delta = ts - cpu_buffer->write_stamp;
+		diff = ts - cpu_buffer->write_stamp;
 
-		/* make sure this delta is calculated here */
+		/* make sure this diff is calculated here */
 		barrier();
 
 		/* Did the write stamp get updated already? */
 		if (unlikely(ts < cpu_buffer->write_stamp))
-			delta = 0;
+			goto get_event;
 
-		else if (unlikely(test_time_stamp(delta))) {
+		delta = diff;
+		if (unlikely(test_time_stamp(delta))) {
 
 			commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
 			if (commit == -EBUSY)
 				return NULL;
 
@@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
 			RB_WARN_ON(cpu_buffer, commit < 0);
 		}
-	} else
-		/* Non commits have zero deltas */
-		delta = 0;
+	}
 
+ get_event:
 	event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
-	if (PTR_ERR(event) == -EAGAIN)
+	if (unlikely(PTR_ERR(event) == -EAGAIN))
 		goto again;
 
 	if (!event) {
-- 
GitLab


From bc8e67409ccdcff72c3f1656b1fb1aad7ff396db Mon Sep 17 00:00:00 2001
From: Vincent Minet <vincent@vincent-minet.net>
Date: Fri, 15 May 2009 08:33:18 -0400
Subject: [PATCH 1080/2198] ext4: Fix spinlock assertions on UP systems

On UP systems without DEBUG_SPINLOCK, ext4_is_group_locked always fails
which triggers a BUG_ON() call.
This patch fixes it by using assert_spin_locked instead.

Signed-off-by: Vincent Minet <vincent@vincent-minet.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  6 ------
 fs/ext4/mballoc.c | 10 +++++-----
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 149e02dc36065..89190ae671f62 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1583,12 +1583,6 @@ static inline void ext4_unlock_group(struct super_block *sb,
 	spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
-static inline int ext4_is_group_locked(struct super_block *sb,
-					ext4_group_t group)
-{
-	return spin_is_locked(ext4_group_lock_ptr(sb, group));
-}
-
 /*
  * Inodes and files operations
  */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index e76459cedcdbe..541bd9adffa20 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -436,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 
 	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 		return;
-	BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 	for (i = 0; i < count; i++) {
 		if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
 			ext4_fsblk_t blocknr;
@@ -460,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 
 	if (unlikely(e4b->bd_info->bb_bitmap == NULL))
 		return;
-	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	for (i = 0; i < count; i++) {
 		BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
 		mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -1115,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 	struct super_block *sb = e4b->bd_sb;
 
 	BUG_ON(first + count > (sb->s_blocksize << 3));
-	BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
 	mb_check_buddy(e4b);
 	mb_free_blocks_double(inode, e4b, first, count);
 
@@ -1196,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
 	int ord;
 	void *buddy;
 
-	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	BUG_ON(ex == NULL);
 
 	buddy = mb_find_buddy(e4b, order, &max);
@@ -1260,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 
 	BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
 	BUG_ON(e4b->bd_group != ex->fe_group);
-	BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+	assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
 	mb_check_buddy(e4b);
 	mb_mark_used_double(e4b, start, len);
 
-- 
GitLab


From f888e652d758bfe0c04c209b72a05972daeba386 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 00:21:29 -0400
Subject: [PATCH 1081/2198] ext4: Simplify function signature for
 ext4_da_get_block_write()

The function ext4_da_get_block_write() is called in exactly one write,
and the last argument, create, is always 1.  Remove it to simplify the
code slightly.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 4e7f363e3030a..476d843610aca 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2000,7 +2000,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 
 #define		EXT4_DELALLOC_RSVED	1
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result, int create)
+				   struct buffer_head *bh_result)
 {
 	int ret;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
@@ -2010,7 +2010,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
 	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+				   bh_result, 1, 0, EXT4_DELALLOC_RSVED);
 	if (ret <= 0)
 		return ret;
 
@@ -2088,7 +2088,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	if (!new.b_size)
 		return 0;
 
-	err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+	err = ext4_da_get_block_write(mpd->inode, next, &new);
 	if (err) {
 		/*
 		 * If get block returns with error we simply
-- 
GitLab


From e4d996ca806e93dddb5d76c0d3d859b494c559f6 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 00:25:28 -0400
Subject: [PATCH 1082/2198] ext4: Rename ext4_get_blocks_handle() to be
 ext4_ind_get_blocks()

The static function ext4_get_blocks_handle() is badly named.  Of
*course* it takes a handle.  Since its counterpart for extent-based
file is ext4_ext_get_blocks(), rename it to be ext4_ind_get_blocks().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 476d843610aca..f758e8021d1a9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -914,7 +914,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
  * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
  * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
  */
-static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 				  ext4_lblk_t iblock, unsigned int maxblocks,
 				  struct buffer_head *bh_result,
 				  int create, int extend_disksize)
@@ -1129,7 +1129,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  * mapped.
  *
  * If file type is extents based, it will call ext4_ext_get_blocks(),
- * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping
+ * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
@@ -1160,8 +1160,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, 0, 0);
 	} else {
-		retval = ext4_get_blocks_handle(handle,
-				inode, block, max_blocks, bh, 0, 0);
+		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
+					     bh, 0, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
@@ -1215,7 +1215,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
 				bh, create, extend_disksize);
 	} else {
-		retval = ext4_get_blocks_handle(handle, inode, block,
+		retval = ext4_ind_get_blocks(handle, inode, block,
 				max_blocks, bh, create, extend_disksize);
 
 		if (retval > 0 && buffer_new(bh)) {
@@ -1297,7 +1297,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	err = ext4_get_blocks_wrap(handle, inode, block, 1,
 					&dummy, create, 1, 0);
 	/*
-	 * ext4_get_blocks_handle() returns number of blocks
+	 * ext4_get_blocks_wrap() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
-- 
GitLab


From e458824f9d32e9bf7700d1eb0d201749af48eee0 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 12 May 2009 08:49:32 +0200
Subject: [PATCH 1083/2198] scsi: fix resid_len mis-conversion in
 scsi_end_request()

Commit c3a4d78c580de4edc9ef0f7c59812fb02ceb037f introduced
rq->data_len and converted residual count users to it.  While
converting, it mistakenly converted scsi_end_request() to finish
requests with residual count when it wants to do is fully complete the
request.  Fix it by using blk_end_request_all() instead.

This bug was spotted by Boaz Harrosh.

Signed-off-by: Tejun Heo <tj@kernel.org>
Spotted-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/scsi_lib.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index a54bec9943869..e410d667910de 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -546,14 +546,9 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error,
 	 * to queue the remainder of them.
 	 */
 	if (blk_end_request(req, error, bytes)) {
-		int leftover = blk_rq_bytes(req);
-
-		if (blk_pc_request(req))
-			leftover = req->resid_len;
-
 		/* kill remainder if no retrys */
 		if (error && scsi_noretry_cmd(cmd))
-			blk_end_request(req, error, leftover);
+			blk_end_request_all(req, error);
 		else {
 			if (requeue) {
 				/*
-- 
GitLab


From f3f8290cb3fa4aa627321530bb85d5f35e487433 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 16:07:40 +0900
Subject: [PATCH 1084/2198] sh: clkfwk: Handle clk_get_sys() returning an
 ERR_PTR.

clk_get() needs to also perform an IS_ERR() check to see whether
clk_get_sys() failed or not.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 34707b8677608..9e1fc133a4732 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -399,7 +399,7 @@ struct clk *clk_get(struct device *dev, const char *id)
 	int idno;
 
 	clk = clk_get_sys(dev_id, id);
-	if (clk)
+	if (clk && !IS_ERR(clk))
 		return clk;
 
 	if (dev == NULL || dev->bus != &platform_bus_type)
-- 
GitLab


From 871b72dd1e12afc3f024479531d25a9339d2e3f9 Mon Sep 17 00:00:00 2001
From: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Date: Mon, 11 May 2009 23:48:27 +0200
Subject: [PATCH 1085/2198] x86: microcode: use smp_call_function_single
 instead of set_cpus_allowed, cleanup of synchronization logic

* Solve issues described in 6f66cbc63081fd70e3191b4dbb796746780e5ae1
  in a way that doesn't resort to set_cpus_allowed();

* in fact, only collect_cpu_info and apply_microcode callbacks
  must run on a target cpu, others will do just fine on any other.
  smp_call_function_single() (as suggested by Ingo) is used to run
  these callbacks on a target cpu.

* cleanup of synchronization logic of the 'microcode_core' part

  The generic 'microcode_core' part guarantees that only a single cpu
  (be it a full-fledged cpu, one of the cores or HT)
  is being updated at any particular moment of time.

  In general, there is no need for any additional sync. mechanism in
  arch-specific parts (the patch removes existing spinlocks).

  See also the "Synchronization" section in microcode_core.c.

* return -EINVAL instead of -1 (which is translated into -EPERM) in
  microcode_write(), reload_cpu() and mc_sysdev_add(). Other suggestions
  for an error code?

* use 'enum ucode_state' as return value of request_microcode_{fw, user}
  to gain more flexibility by distinguishing between real error cases
  and situations when an appropriate ucode was not found (which is not an
  error per-se).

* some minor cleanups

Thanks a lot to Hugh Dickins for review/suggestions/testing!

   Reference: http://marc.info/?l=linux-kernel&m=124025889012541&w=2

[ Impact: refactor and clean up microcode driver locking code ]

Signed-off-by: Dmitry Adamushko <dmitry.adamushko@gmail.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Peter Oruba <peter.oruba@amd.com>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <1242078507.5560.9.camel@earth>
[ did some more cleanups ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
 arch/x86/include/asm/microcode.h  |   25 ++
 arch/x86/kernel/microcode_amd.c   |   58 ++----
 arch/x86/kernel/microcode_core.c  |  326 +++++++++++++++++++++-----------------
 arch/x86/kernel/microcode_intel.c |   92 +++-------
 4 files changed, 261 insertions(+), 240 deletions(-)

(~20 new comment lines)
---
 arch/x86/include/asm/microcode.h  |  25 ++-
 arch/x86/kernel/microcode_amd.c   |  58 ++----
 arch/x86/kernel/microcode_core.c  | 329 +++++++++++++++++-------------
 arch/x86/kernel/microcode_intel.c |  90 +++-----
 4 files changed, 262 insertions(+), 240 deletions(-)

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index c882664716c1d..ef51b501e22a6 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -9,20 +9,31 @@ struct cpu_signature {
 
 struct device;
 
+enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+
 struct microcode_ops {
-	int  (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
-	int  (*request_microcode_fw) (int cpu, struct device *device);
+	enum ucode_state (*request_microcode_user) (int cpu,
+				const void __user *buf, size_t size);
 
-	void (*apply_microcode) (int cpu);
+	enum ucode_state (*request_microcode_fw) (int cpu,
+				struct device *device);
 
-	int  (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
 	void (*microcode_fini_cpu) (int cpu);
+
+	/*
+	 * The generic 'microcode_core' part guarantees that
+	 * the callbacks below run on a target cpu when they
+	 * are being called.
+	 * See also the "Synchronization" section in microcode_core.c.
+	 */
+	int (*apply_microcode) (int cpu);
+	int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
 };
 
 struct ucode_cpu_info {
-	struct cpu_signature cpu_sig;
-	int valid;
-	void *mc;
+	struct cpu_signature	cpu_sig;
+	int			valid;
+	void			*mc;
 };
 extern struct ucode_cpu_info ucode_cpu_info[];
 
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index 453b5795a5c6a..c8be20f16447d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,25 +13,13 @@
  *  Licensed under the terms of the GNU General Public
  *  License version 2. See file COPYING for details.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/pci_ids.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
 #include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
 #define UCODE_CONTAINER_SECTION_HDR	8
 #define UCODE_CONTAINER_HEADER_SIZE	12
 
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static struct equiv_cpu_entry *equiv_cpu_table;
 
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
 	return 1;
 }
 
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
 {
-	unsigned long flags;
 	u32 rev, dummy;
 	int cpu_num = raw_smp_processor_id();
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
 	BUG_ON(cpu_num != cpu);
 
 	if (mc_amd == NULL)
-		return;
+		return 0;
 
-	spin_lock_irqsave(&microcode_update_lock, flags);
 	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
 	/* get patch id after patching */
 	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 
 	/* check current patch id and patch's id for match */
 	if (rev != mc_amd->hdr.patch_id) {
 		printk(KERN_ERR "microcode: CPU%d: update failed "
 		       "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
-		return;
+		return -1;
 	}
 
 	printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
 	       cpu, rev);
 
 	uci->cpu_sig.rev = rev;
+
+	return 0;
 }
 
 static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -263,7 +247,8 @@ static void free_equiv_cpu_table(void)
 	}
 }
 
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	const u8 *ucode_ptr = data;
@@ -272,12 +257,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover;
 	unsigned long offset;
+	enum ucode_state state = UCODE_OK;
 
 	offset = install_equiv_cpu_table(ucode_ptr);
 	if (!offset) {
 		printk(KERN_ERR "microcode: failed to create "
 		       "equivalent cpu table\n");
-		return -EINVAL;
+		return UCODE_ERROR;
 	}
 
 	ucode_ptr += offset;
@@ -312,28 +298,27 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 			pr_debug("microcode: CPU%d found a matching microcode "
 				 "update with version 0x%x (current=0x%x)\n",
 				 cpu, new_rev, uci->cpu_sig.rev);
-		} else
+		} else {
 			vfree(new_mc);
-	}
+			state = UCODE_ERROR;
+		}
+	} else
+		state = UCODE_NFOUND;
 
 	free_equiv_cpu_table();
 
-	return (int)leftover;
+	return state;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
 	const char *fw_name = "amd-ucode/microcode_amd.bin";
 	const struct firmware *firmware;
-	int ret;
-
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
+	enum ucode_state ret;
 
-	ret = request_firmware(&firmware, fw_name, device);
-	if (ret) {
+	if (request_firmware(&firmware, fw_name, device)) {
 		printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-		return ret;
+		return UCODE_NFOUND;
 	}
 
 	ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +328,12 @@ static int request_microcode_fw(int cpu, struct device *device)
 	return ret;
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
 	printk(KERN_INFO "microcode: AMD microcode update via "
 	       "/dev/cpu/microcode not supported\n");
-	return -1;
+	return UCODE_ERROR;
 }
 
 static void microcode_fini_cpu_amd(int cpu)
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 98c470c069d15..9c4461501fcbb 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -71,27 +71,18 @@
  *		Thanks to Stuart Swales for pointing out this bug.
  */
 #include <linux/platform_device.h>
-#include <linux/capability.h>
 #include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
 #include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
-#include <asm/msr.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
 
 static struct microcode_ops	*microcode_ops;
 
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
 static DEFINE_MUTEX(microcode_mutex);
 
 struct ucode_cpu_info		ucode_cpu_info[NR_CPUS];
 EXPORT_SYMBOL_GPL(ucode_cpu_info);
 
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+	struct cpu_signature	*cpu_sig;
+	int			err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+	struct cpu_info_ctx *ctx = arg;
+
+	ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+						   ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+	struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+	if (!ret)
+		ret = ctx.err;
+
+	return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	int ret;
+
+	memset(uci, 0, sizeof(*uci));
+
+	ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+	if (!ret)
+		uci->valid = 1;
+
+	return ret;
+}
+
+struct apply_microcode_ctx {
+	int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+	struct apply_microcode_ctx *ctx = arg;
+
+	ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+	struct apply_microcode_ctx ctx = { .err = 0 };
+	int ret;
+
+	ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+	if (!ret)
+		ret = ctx.err;
+
+	return ret;
+}
+
 #ifdef CONFIG_MICROCODE_OLD_INTERFACE
 static int do_microcode_update(const void __user *buf, size_t size)
 {
-	cpumask_t old;
 	int error = 0;
 	int cpu;
 
-	old = current->cpus_allowed;
-
 	for_each_online_cpu(cpu) {
 		struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+		enum ucode_state ustate;
 
 		if (!uci->valid)
 			continue;
 
-		set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-		error = microcode_ops->request_microcode_user(cpu, buf, size);
-		if (error < 0)
-			goto out;
-		if (!error)
-			microcode_ops->apply_microcode(cpu);
+		ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+		if (ustate == UCODE_ERROR) {
+			error = -1;
+			break;
+		} else if (ustate == UCODE_OK)
+			apply_microcode_on_target(cpu);
 	}
-out:
-	set_cpus_allowed_ptr(current, &old);
+
 	return error;
 }
 
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
 static ssize_t microcode_write(struct file *file, const char __user *buf,
 			       size_t len, loff_t *ppos)
 {
-	ssize_t ret;
+	ssize_t ret = -EINVAL;
 
 	if ((len >> PAGE_SHIFT) > num_physpages) {
-		printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
-		       num_physpages);
-		return -EINVAL;
+		pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+		return ret;
 	}
 
 	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 
-	ret = do_microcode_update(buf, len);
-	if (!ret)
+	if (do_microcode_update(buf, len) == 0)
 		ret = (ssize_t)len;
 
 	mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 }
 
 static const struct file_operations microcode_fops = {
-	.owner		= THIS_MODULE,
-	.write		= microcode_write,
-	.open		= microcode_open,
+	.owner			= THIS_MODULE,
+	.write			= microcode_write,
+	.open			= microcode_open,
 };
 
 static struct miscdevice microcode_dev = {
-	.minor		= MICROCODE_MINOR,
-	.name		= "microcode",
-	.fops		= &microcode_fops,
+	.minor			= MICROCODE_MINOR,
+	.name			= "microcode",
+	.fops			= &microcode_fops,
 };
 
 static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
 
 	error = misc_register(&microcode_dev);
 	if (error) {
-		printk(KERN_ERR
-			"microcode: can't misc_register on minor=%d\n",
-			MICROCODE_MINOR);
+		pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
 		return error;
 	}
 
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
 /* fake device for request_firmware */
 static struct platform_device	*microcode_pdev;
 
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	int err = 0;
 
 	mutex_lock(&microcode_mutex);
 	if (uci->valid) {
-		err = microcode_ops->request_microcode_fw(smp_processor_id(),
-							  &microcode_pdev->dev);
-		if (!err)
-			microcode_ops->apply_microcode(smp_processor_id());
+		enum ucode_state ustate;
+
+		ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+		if (ustate == UCODE_OK)
+			apply_microcode_on_target(cpu);
+		else
+			if (ustate == UCODE_ERROR)
+				err = -EINVAL;
 	}
 	mutex_unlock(&microcode_mutex);
+
 	return err;
 }
 
 static ssize_t reload_store(struct sys_device *dev,
 			    struct sysdev_attribute *attr,
-			    const char *buf, size_t sz)
+			    const char *buf, size_t size)
 {
-	char *end;
-	unsigned long val = simple_strtoul(buf, &end, 0);
-	int err = 0;
+	unsigned long val;
 	int cpu = dev->id;
+	int ret = 0;
+	char *end;
 
+	val = simple_strtoul(buf, &end, 0);
 	if (end == buf)
 		return -EINVAL;
+
 	if (val == 1) {
 		get_online_cpus();
 		if (cpu_online(cpu))
-			err = work_on_cpu(cpu, reload_for_cpu, NULL);
+			ret = reload_for_cpu(cpu);
 		put_online_cpus();
 	}
-	if (err)
-		return err;
-	return sz;
+
+	if (!ret)
+		ret = size;
+
+	return ret;
 }
 
 static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
 };
 
 static struct attribute_group mc_attr_group = {
-	.attrs		= mc_default_attrs,
-	.name		= "microcode",
+	.attrs			= mc_default_attrs,
+	.name			= "microcode",
 };
 
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
 	uci->valid = 0;
 }
 
-static void microcode_fini_cpu(int cpu)
-{
-	mutex_lock(&microcode_mutex);
-	__microcode_fini_cpu(cpu);
-	mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-	memset(uci, 0, sizeof(*uci));
-	if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
-		uci->valid = 1;
+	if (!uci->mc)
+		return UCODE_NFOUND;
+
+	pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+	apply_microcode_on_target(cpu);
+
+	return UCODE_OK;
 }
 
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-	struct cpu_signature nsig;
+	enum ucode_state ustate;
 
-	pr_debug("microcode: CPU%d resumed\n", cpu);
+	if (collect_cpu_info(cpu))
+		return UCODE_ERROR;
 
-	if (!uci->mc)
-		return 1;
+	/* --dimm. Trigger a delayed update? */
+	if (system_state != SYSTEM_RUNNING)
+		return UCODE_NFOUND;
 
-	/*
-	 * Let's verify that the 'cached' ucode does belong
-	 * to this cpu (a bit of paranoia):
-	 */
-	if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
-		__microcode_fini_cpu(cpu);
-		printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
-				cpu);
-		return -1;
-	}
+	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
 
-	if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
-		__microcode_fini_cpu(cpu);
-		printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
-				cpu);
-		/* Should we look for a new ucode here? */
-		return 1;
+	if (ustate == UCODE_OK) {
+		pr_debug("microcode: CPU%d updated upon init\n", cpu);
+		apply_microcode_on_target(cpu);
 	}
 
-	return 0;
+	return ustate;
 }
 
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
 {
-	struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
-	int err = 0;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+	enum ucode_state ustate;
 
-	/*
-	 * Check if the system resume is in progress (uci->valid != NULL),
-	 * otherwise just request a firmware:
-	 */
-	if (uci->valid) {
-		err = microcode_resume_cpu(smp_processor_id());
-	} else {
-		collect_cpu_info(smp_processor_id());
-		if (uci->valid && system_state == SYSTEM_RUNNING)
-			err = microcode_ops->request_microcode_fw(
-					smp_processor_id(),
-					&microcode_pdev->dev);
-	}
-	if (!err)
-		microcode_ops->apply_microcode(smp_processor_id());
-	return err;
-}
+	if (uci->valid)
+		ustate = microcode_resume_cpu(cpu);
+	else
+		ustate = microcode_init_cpu(cpu);
 
-static int microcode_init_cpu(int cpu)
-{
-	int err;
-	mutex_lock(&microcode_mutex);
-	err = work_on_cpu(cpu, microcode_update_cpu, NULL);
-	mutex_unlock(&microcode_mutex);
-
-	return err;
+	return ustate;
 }
 
 static int mc_sysdev_add(struct sys_device *sys_dev)
 {
 	int err, cpu = sys_dev->id;
-	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 	if (!cpu_online(cpu))
 		return 0;
 
 	pr_debug("microcode: CPU%d added\n", cpu);
-	memset(uci, 0, sizeof(*uci));
 
 	err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
 	if (err)
 		return err;
 
-	err = microcode_init_cpu(cpu);
+	if (microcode_init_cpu(cpu) == UCODE_ERROR)
+		err = -EINVAL;
 
 	return err;
 }
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
 static int mc_sysdev_resume(struct sys_device *dev)
 {
 	int cpu = dev->id;
+	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
 	if (!cpu_online(cpu))
 		return 0;
 
-	/* only CPU 0 will apply ucode here */
-	microcode_update_cpu(NULL);
+	/*
+	 * All non-bootup cpus are still disabled,
+	 * so only CPU 0 will apply ucode here.
+	 *
+	 * Moreover, there can be no concurrent
+	 * updates from any other places at this point.
+	 */
+	WARN_ON(cpu != 0);
+
+	if (uci->valid && uci->mc)
+		microcode_ops->apply_microcode(cpu);
+
 	return 0;
 }
 
 static struct sysdev_driver mc_sysdev_driver = {
-	.add		= mc_sysdev_add,
-	.remove		= mc_sysdev_remove,
-	.resume		= mc_sysdev_resume,
+	.add			= mc_sysdev_add,
+	.remove			= mc_sysdev_remove,
+	.resume			= mc_sysdev_resume,
 };
 
 static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	switch (action) {
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		if (microcode_init_cpu(cpu))
-			printk(KERN_ERR "microcode: failed to init CPU%d\n",
-			       cpu);
+		microcode_update_cpu(cpu);
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
 		pr_debug("microcode: CPU%d added\n", cpu);
 		if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-			printk(KERN_ERR "microcode: Failed to create the sysfs "
-				"group for CPU%d\n", cpu);
+			pr_err("microcode: Failed to create group for CPU%d\n", cpu);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
 		microcode_ops = init_amd_microcode();
 
 	if (!microcode_ops) {
-		printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+		pr_err("microcode: no support for this CPU vendor\n");
 		return -ENODEV;
 	}
 
-	error = microcode_dev_init();
-	if (error)
-		return error;
 	microcode_pdev = platform_device_register_simple("microcode", -1,
 							 NULL, 0);
 	if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
 	}
 
 	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
 	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
+
 	if (error) {
-		microcode_dev_exit();
 		platform_device_unregister(microcode_pdev);
 		return error;
 	}
 
+	error = microcode_dev_init();
+	if (error)
+		return error;
+
 	register_hotcpu_notifier(&mc_cpu_notifier);
 
-	printk(KERN_INFO
-	       "Microcode Update Driver: v" MICROCODE_VERSION
+	pr_info("Microcode Update Driver: v" MICROCODE_VERSION
 	       " <tigran@aivazian.fsnet.co.uk>,"
 	       " Peter Oruba\n");
 
 	return 0;
 }
+module_init(microcode_init);
 
 static void __exit microcode_exit(void)
 {
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
 	unregister_hotcpu_notifier(&mc_cpu_notifier);
 
 	get_online_cpus();
+	mutex_lock(&microcode_mutex);
+
 	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+	mutex_unlock(&microcode_mutex);
 	put_online_cpus();
 
 	platform_device_unregister(microcode_pdev);
 
 	microcode_ops = NULL;
 
-	printk(KERN_INFO
-	       "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+	pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
 }
-
-module_init(microcode_init);
 module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 149b9ec7c1ab7..0d334ddd0a964 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,24 +70,11 @@
  *		Fix sigmatch() macro to handle old CPUs with pf == 0.
  *		Thanks to Stuart Swales for pointing out this bug.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
 
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-	unsigned long flags;
 	unsigned int val[2];
 
 	memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 		csig->pf = 1 << ((val[1] >> 18) & 7);
 	}
 
-	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);
-
 	wrmsr(MSR_IA32_UCODE_REV, 0, 0);
 	/* see notes above for revision 1.07.  Apparent chip bug */
 	sync_core();
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 
-	pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-			csig->sig, csig->pf, csig->rev);
+	printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+			cpu_num, csig->sig, csig->pf, csig->rev);
 
 	return 0;
 }
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
 	return 0;
 }
 
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
 {
 	struct microcode_intel *mc_intel;
 	struct ucode_cpu_info *uci;
-	unsigned long flags;
 	unsigned int val[2];
 	int cpu_num;
 
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
 	BUG_ON(cpu_num != cpu);
 
 	if (mc_intel == NULL)
-		return;
-
-	/* serialize access to the physical write to MSR 0x79 */
-	spin_lock_irqsave(&microcode_update_lock, flags);
+		return 0;
 
 	/* write microcode via MSR 0x79 */
 	wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
 	/* get the current revision from MSR 0x8B */
 	rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
 
-	spin_unlock_irqrestore(&microcode_update_lock, flags);
 	if (val[1] != mc_intel->hdr.rev) {
-		printk(KERN_ERR "microcode: CPU%d update from revision "
-				"0x%x to 0x%x failed\n",
-			cpu_num, uci->cpu_sig.rev, val[1]);
-		return;
+		printk(KERN_ERR "microcode: CPU%d update "
+				"to revision 0x%x failed\n",
+			cpu_num, mc_intel->hdr.rev);
+		return -1;
 	}
-	printk(KERN_INFO "microcode: CPU%d updated from revision "
-			 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
-		cpu_num, uci->cpu_sig.rev, val[1],
+	printk(KERN_INFO "microcode: CPU%d updated to revision "
+			 "0x%x, date = %04x-%02x-%02x \n",
+		cpu_num, val[1],
 		mc_intel->hdr.date & 0xffff,
 		mc_intel->hdr.date >> 24,
 		(mc_intel->hdr.date >> 16) & 0xff);
 
 	uci->cpu_sig.rev = val[1];
+
+	return 0;
 }
 
-static int generic_load_microcode(int cpu, void *data, size_t size,
-		int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+				int (*get_ucode_data)(void *, const void *, size_t))
 {
 	struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 	u8 *ucode_ptr = data, *new_mc = NULL, *mc;
 	int new_rev = uci->cpu_sig.rev;
 	unsigned int leftover = size;
+	enum ucode_state state = UCODE_OK;
 
 	while (leftover) {
 		struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
 		leftover  -= mc_size;
 	}
 
-	if (!new_mc)
+	if (leftover) {
+		if (new_mc)
+			vfree(new_mc);
+		state = UCODE_ERROR;
 		goto out;
+	}
 
-	if (leftover) {
-		vfree(new_mc);
+	if (!new_mc) {
+		state = UCODE_NFOUND;
 		goto out;
 	}
 
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
 	pr_debug("microcode: CPU%d found a matching microcode update with"
 		 " version 0x%x (current=0x%x)\n",
 			cpu, new_rev, uci->cpu_sig.rev);
-
- out:
-	return (int)leftover;
+out:
+	return state;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
 	return 0;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
 	char name[30];
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 	const struct firmware *firmware;
-	int ret;
+	enum ucode_state ret;
 
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
 	sprintf(name, "intel-ucode/%02x-%02x-%02x",
 		c->x86, c->x86_model, c->x86_mask);
-	ret = request_firmware(&firmware, name, device);
-	if (ret) {
+
+	if (request_firmware(&firmware, name, device)) {
 		pr_debug("microcode: data file %s load failed\n", name);
-		return ret;
+		return UCODE_NFOUND;
 	}
 
 	ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
 	return copy_from_user(to, from, n);
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
-	/* We should bind the task to the CPU */
-	BUG_ON(cpu != raw_smp_processor_id());
-
 	return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
-- 
GitLab


From 2b1ccc0ee918a653d0483fdad9dd6112ce8e9043 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 12 May 2009 11:11:48 +0200
Subject: [PATCH 1086/2198] splice: fix misleading comment

Splice is tied to pipes by design, it'll not change. And now that
the splice stuff is in splice.h (and note pipe.h), the rest of the comment
is out-of-date as well.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 include/linux/splice.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/include/linux/splice.h b/include/linux/splice.h
index 5f3faa9d15aea..18e7c7c0cae6d 100644
--- a/include/linux/splice.h
+++ b/include/linux/splice.h
@@ -11,8 +11,7 @@
 #include <linux/pipe_fs_i.h>
 
 /*
- * splice is tied to pipes as a transport (at least for now), so we'll just
- * add the splice flags here.
+ * Flags passed in from splice/tee/vmsplice
  */
 #define SPLICE_F_MOVE	(0x01)	/* move pages instead of copying */
 #define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */
-- 
GitLab


From 9d62dcdfa6f6fc843f7d9b494bcd48f00b94f883 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Mon, 11 May 2009 22:05:28 -0400
Subject: [PATCH 1087/2198] x86: merge process.c a bit

Merge arch_align_stack() and arch_randomize_brk(), since
they are the same.

Tested on x86_64.

[ Impact: cleanup ]

Signed-off-by: Amerigo Wang <amwang@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c    | 14 ++++++++++++++
 arch/x86/kernel/process_32.c | 13 -------------
 arch/x86/kernel/process_64.c | 13 -------------
 3 files changed, 14 insertions(+), 26 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ca989158e8479..2b9a8d0fb4746 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,6 +8,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/random.h>
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
@@ -613,3 +614,16 @@ static int __init idle_setup(char *str)
 }
 early_param("idle", idle_setup);
 
+unsigned long arch_align_stack(unsigned long sp)
+{
+	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+		sp -= get_random_int() % 8192;
+	return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+	unsigned long range_end = mm->brk + 0x02000000;
+	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 76f8f84043a2a..a3bb049ad0827 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -33,7 +33,6 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/personality.h>
 #include <linux/tick.h>
 #include <linux/percpu.h>
@@ -497,15 +496,3 @@ unsigned long get_wchan(struct task_struct *p)
 	return 0;
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-		sp -= get_random_int() % 8192;
-	return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long range_end = mm->brk + 0x02000000;
-	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index b751a41392b1b..34386f72bdcf8 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -32,7 +32,6 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
@@ -660,15 +659,3 @@ long sys_arch_prctl(int code, unsigned long addr)
 	return do_arch_prctl(current, code, addr);
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-		sp -= get_random_int() % 8192;
-	return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-	unsigned long range_end = mm->brk + 0x02000000;
-	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
-- 
GitLab


From bf78ad69cd351798b9447a269c6bd41ce1f111f4 Mon Sep 17 00:00:00 2001
From: Amerigo Wang <amwang@redhat.com>
Date: Mon, 11 May 2009 23:29:09 -0400
Subject: [PATCH 1088/2198] x86: process.c, remove useless headers

<stdarg.h> is not needed by these files, remove them.

[ Impact: cleanup ]

Signed-off-by: WANG Cong <amwang@redhat.com>
Cc: akpm@linux-foundation.org
LKML-Reference: <20090512032956.5040.77055.sendpatchset@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process_32.c | 2 --
 arch/x86/kernel/process_64.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index a3bb049ad0827..56d50b7d71dfc 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -9,8 +9,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 34386f72bdcf8..9d6b20e6cd804 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -14,8 +14,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
-- 
GitLab


From ed077b58f6146684069975122b1728a9d248a501 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Tue, 12 May 2009 16:40:00 +0800
Subject: [PATCH 1089/2198] x86: make sparse mem work in non-NUMA mode

With sparse memory, holes should not be marked present for memmap.
This patch makes sure sparsemem really works on SMP mode (!NUMA).

[ Impact: use less memory to map fragmented RAM, avoid boot-OOM/crash ]

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Sheng Yang <sheng.yang@intel.com>
LKML-Reference: <1242117600.22431.0.camel@sli10-desk.sh.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/init_32.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index fef1d90d4f150..949708d7a481e 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -706,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
 	highstart_pfn = highend_pfn = max_pfn;
 	if (max_pfn > max_low_pfn)
 		highstart_pfn = max_low_pfn;
-	memory_present(0, 0, highend_pfn);
 	e820_register_active_regions(0, 0, highend_pfn);
+	sparse_memory_present_with_active_regions(0);
 	printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
 		pages_to_mb(highend_pfn - highstart_pfn));
 	num_physpages = highend_pfn;
 	high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-	memory_present(0, 0, max_low_pfn);
 	e820_register_active_regions(0, 0, max_low_pfn);
+	sparse_memory_present_with_active_regions(0);
 	num_physpages = max_low_pfn;
 	high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
-- 
GitLab


From 4797f6b021a3fa399942245d07a1feb30df81bb8 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sat, 2 May 2009 10:40:57 -0700
Subject: [PATCH 1090/2198] x86: read apic ID in the !acpi_lapic case

Ed found that on 32-bit, boot_cpu_physical_apicid is not read right,
when the mptable is broken.

Interestingly, actually three paths use/set it:

 1. acpi: at that time that is already read from reg
 2. mptable: only read from mptable
 3. no madt, and no mptable, that use default apic id 0 for 64-bit, -1 for 32-bit

so we could read the apic id for the 2/3 path. We trust the hardware
register more than we trust a BIOS data structure (the mptable).

We can also avoid the double set_fixmap() when acpi_lapic
is used, and also need to move cpu_has_apic earlier and
call apic_disable().

Also when need to update the apic id, we'd better read and
set the apic version as well - so that quirks are applied precisely.

v2: make path 3 with 64bit, use -1 as apic id, so could read it later.
v3: fix whitespace problem pointed out by Ed Swierk
v5: fix boot crash

[ Impact: get correct apic id for bsp other than acpi path ]

Reported-by: Ed Swierk <eswierk@aristanetworks.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <49FC85A9.2070702@kernel.org>
[ v4: sanity-check in the ACPI case too ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/smp.h     |  2 +-
 arch/x86/kernel/apic/apic.c    | 46 ++++++++++++++++------------------
 arch/x86/kernel/apic/io_apic.c |  5 ++++
 3 files changed, 28 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 19e0d88b966d7..6a84ed166aec1 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
 static inline int logical_smp_processor_id(void)
 {
 	/* we don't want to mark this access volatile - bad code generation */
-	return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+	return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
 }
 
 #endif
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 07cffc1214cbd..b0fd26442c418 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -242,17 +242,24 @@ static int modern_apic(void)
  * bare function to substitute write operation
  * and it's _that_ fast :)
  */
-void native_apic_write_dummy(u32 reg, u32 v)
+static void native_apic_write_dummy(u32 reg, u32 v)
 {
 	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
 }
 
+static u32 native_apic_read_dummy(u32 reg)
+{
+	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+	return 0;
+}
+
 /*
- * right after this call apic->write doesn't do anything
+ * right after this call apic->write/read doesn't do anything
  * note that there is no restore operation it works one way
  */
 void apic_disable(void)
 {
+	apic->read = native_apic_read_dummy;
 	apic->write = native_apic_write_dummy;
 }
 
@@ -1576,32 +1583,23 @@ void __init init_apic_mappings(void)
 		return;
 	}
 
-	/*
-	 * If no local APIC can be found then set up a fake all
-	 * zeroes page to simulate the local APIC and another
-	 * one for the IO-APIC.
-	 */
+	/* If no local APIC can be found return early */
 	if (!smp_found_config && detect_init_APIC()) {
-		apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-		apic_phys = __pa(apic_phys);
-	} else
+		/* lets NOP'ify apic operations */
+		pr_info("APIC: disable apic facility\n");
+		apic_disable();
+	} else {
 		apic_phys = mp_lapic_addr;
 
-	/*
-	 * acpi lapic path already maps that address in
-	 * acpi_register_lapic_address()
-	 */
-	if (!acpi_lapic)
-		set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-
-	apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
-			APIC_BASE, apic_phys);
+		/*
+		 * acpi lapic path already maps that address in
+		 * acpi_register_lapic_address()
+		 */
+		if (!acpi_lapic)
+			set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
 
-	/* lets check if we may NOP'ify apic operations */
-	if (!cpu_has_apic) {
-		pr_info("APIC: disable apic facility\n");
-		apic_disable();
-		return;
+		apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+					APIC_BASE, apic_phys);
 	}
 
 	/*
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1f3d3669dae8d..74d2b480a20bf 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1878,6 +1878,11 @@ __apicdebuginit(void) print_PIC(void)
 __apicdebuginit(int) print_all_ICs(void)
 {
 	print_PIC();
+
+	/* don't print out if apic is not there */
+	if (!cpu_has_apic || disable_apic)
+		return 0;
+
 	print_all_local_APICs();
 	print_IO_APIC();
 
-- 
GitLab


From 9fe5ee0efb1b1d4a0939bc4252a8427e3337d96a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 19:29:04 +0900
Subject: [PATCH 1091/2198] sh: clkfwk: Use arch_clk_init() for on-chip clock
 registration.

CPUs registering on-chip clocks should be using arch_clk_init() with the
new scheme so that the CPUs have the opportunity to establish the
topology prior to the initial root clock rate propagation. This ensures
that CPUs with on-chip clocks that use CLK_ENABLE_ON_INIT are properly
enabled at the initial propagation time, without having to further poke
the root clocks.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock.c             | 8 +-------
 arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 3 +--
 arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 3 +--
 arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 3 +--
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 3 +--
 arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 3 +--
 arch/sh/kernel/cpu/sh4a/clock-shx3.c   | 3 +--
 7 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 9e1fc133a4732..f833843a194a0 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -437,13 +437,7 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_put);
 
-void __init __attribute__ ((weak))
-arch_init_clk_ops(struct clk_ops **ops, int type)
-{
-}
-
-int __init __attribute__ ((weak))
-arch_clk_init(void)
+int __init __weak arch_clk_init(void)
 {
 	return 0;
 }
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index 435f4f12ffb8b..a72dc326d0b33 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -150,7 +150,7 @@ static struct clk *sh4202_onchip_clocks[] = {
 	&sh4202_shoc_clk,
 };
 
-static int __init sh4202_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -166,4 +166,3 @@ static int __init sh4202_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh4202_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index 0110da64a43b7..ce3d4e6319a31 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -90,7 +90,7 @@ static struct clk *sh7763_onchip_clocks[] = {
 	&sh7763_shyway_clk,
 };
 
-static int __init sh7763_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -106,4 +106,3 @@ static int __init sh7763_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7763_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 0a22d50b109f4..38b8b1ddb283b 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -96,7 +96,7 @@ static struct clk *sh7780_onchip_clocks[] = {
 	&sh7780_shyway_clk,
 };
 
-static int __init sh7780_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -112,4 +112,3 @@ static int __init sh7780_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7780_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 4dcd1f6f0cbd6..fa14f35bc116d 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -134,7 +134,7 @@ static struct clk *sh7785_onchip_clocks[] = {
 	&sh7785_ram_clk,
 };
 
-static int __init sh7785_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -150,4 +150,3 @@ static int __init sh7785_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7785_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 825556fe23063..7907002fa1df0 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -120,7 +120,7 @@ static struct clk *sh7786_onchip_clocks[] = {
 	&sh7786_ddr_clk,
 };
 
-static int __init sh7786_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -136,4 +136,3 @@ static int __init sh7786_clk_init(void)
 
 	return ret;
 }
-arch_initcall(sh7786_clk_init);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index 1eb149b0fe6e4..c14553e3e13f7 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -107,7 +107,7 @@ static struct clk *shx3_onchip_clocks[] = {
 	&shx3_shyway_clk,
 };
 
-static int __init shx3_clk_init(void)
+int __init arch_clk_init(void)
 {
 	struct clk *clk = clk_get(NULL, "master_clk");
 	int i, ret = 0;
@@ -123,4 +123,3 @@ static int __init shx3_clk_init(void)
 
 	return ret;
 }
-arch_initcall(shx3_clk_init);
-- 
GitLab


From 0ee8b4d7c7d39d9a9913e27686ef786642c3ccbb Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:13:59 +0000
Subject: [PATCH 1092/2198] sh: TMU platform data for sh7763

This patch adds TMU platform data for sh7763. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7763.c | 204 +++++++++++++++++++++++++
 1 file changed, 204 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
index bdf0f61ae1edd..c91f34c9aa83a 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
@@ -12,6 +12,7 @@
 #include <linux/platform_device.h>
 #include <linux/init.h>
 #include <linux/serial.h>
+#include <linux/sh_timer.h>
 #include <linux/io.h>
 #include <linux/serial_sci.h>
 
@@ -113,7 +114,195 @@ static struct platform_device usbf_device = {
 	.resource	= usbf_resources,
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 28,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 29,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 30,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd88008,
+		.end	= 0xffd88013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 96,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd88014,
+		.end	= 0xffd8801f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 97,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd88020,
+		.end	= 0xffd8802b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 98,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct platform_device *sh7763_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&rtc_device,
 	&sci_device,
 	&usb_ohci_device,
@@ -127,6 +316,21 @@ static int __init sh7763_devices_setup(void)
 }
 __initcall(sh7763_devices_setup);
 
+static struct platform_device *sh7763_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7763_early_devices,
+				   ARRAY_SIZE(sh7763_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From 5d8728a71f416fd38a0945271c71cb2933fc5734 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:18:13 +0000
Subject: [PATCH 1093/2198] sh: add sh7770_generic_defconfig

This patch adds a generic sh7770 defconfig file.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/configs/sh7770_generic_defconfig | 700 +++++++++++++++++++++++
 1 file changed, 700 insertions(+)
 create mode 100644 arch/sh/configs/sh7770_generic_defconfig

diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig
new file mode 100644
index 0000000000000..d6489b46ca5ba
--- /dev/null
+++ b/arch/sh/configs/sh7770_generic_defconfig
@@ -0,0 +1,700 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc4
+# Tue May 12 14:48:21 2009
+#
+CONFIG_SUPERH=y
+CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
+CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
+CONFIG_GENERIC_IRQ_PROBE=y
+# CONFIG_GENERIC_GPIO is not set
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+# CONFIG_ARCH_SUSPEND_POSSIBLE is not set
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_SYS_SUPPORTS_TMU=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_HAVE_LATENCYTOP_SUPPORT=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_ARCH_NO_VIRT_TO_BUS=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+# CONFIG_BSD_PROCESS_ACCT is not set
+
+#
+# RCU Subsystem
+#
+# CONFIG_CLASSIC_RCU is not set
+CONFIG_TREE_RCU=y
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_RCU_TRACE is not set
+CONFIG_RCU_FANOUT=32
+# CONFIG_RCU_FANOUT_EXACT is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=17
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+CONFIG_CGROUPS=y
+# CONFIG_CGROUP_DEBUG is not set
+# CONFIG_CGROUP_NS is not set
+# CONFIG_CGROUP_FREEZER is not set
+# CONFIG_CGROUP_DEVICE is not set
+# CONFIG_CPUSETS is not set
+# CONFIG_CGROUP_CPUACCT is not set
+# CONFIG_RESOURCE_COUNTERS is not set
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+# CONFIG_BLK_DEV_INITRD is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+# CONFIG_UID16 is not set
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS=y
+# CONFIG_KALLSYMS_EXTRA_PASS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+# CONFIG_COMPAT_BRK is not set
+# CONFIG_SLAB is not set
+CONFIG_SLUB=y
+# CONFIG_SLOB is not set
+CONFIG_PROFILING=y
+CONFIG_TRACEPOINTS=y
+# CONFIG_MARKERS is not set
+CONFIG_OPROFILE=y
+CONFIG_HAVE_OPROFILE=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+# CONFIG_MODULES is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+CONFIG_DEFAULT_AS=y
+# CONFIG_DEFAULT_DEADLINE is not set
+# CONFIG_DEFAULT_CFQ is not set
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="anticipatory"
+CONFIG_FREEZER=y
+
+#
+# System type
+#
+CONFIG_CPU_SH4=y
+CONFIG_CPU_SH4A=y
+# CONFIG_CPU_SUBTYPE_SH7619 is not set
+# CONFIG_CPU_SUBTYPE_SH7201 is not set
+# CONFIG_CPU_SUBTYPE_SH7203 is not set
+# CONFIG_CPU_SUBTYPE_SH7206 is not set
+# CONFIG_CPU_SUBTYPE_SH7263 is not set
+# CONFIG_CPU_SUBTYPE_MXG is not set
+# CONFIG_CPU_SUBTYPE_SH7705 is not set
+# CONFIG_CPU_SUBTYPE_SH7706 is not set
+# CONFIG_CPU_SUBTYPE_SH7707 is not set
+# CONFIG_CPU_SUBTYPE_SH7708 is not set
+# CONFIG_CPU_SUBTYPE_SH7709 is not set
+# CONFIG_CPU_SUBTYPE_SH7710 is not set
+# CONFIG_CPU_SUBTYPE_SH7712 is not set
+# CONFIG_CPU_SUBTYPE_SH7720 is not set
+# CONFIG_CPU_SUBTYPE_SH7721 is not set
+# CONFIG_CPU_SUBTYPE_SH7750 is not set
+# CONFIG_CPU_SUBTYPE_SH7091 is not set
+# CONFIG_CPU_SUBTYPE_SH7750R is not set
+# CONFIG_CPU_SUBTYPE_SH7750S is not set
+# CONFIG_CPU_SUBTYPE_SH7751 is not set
+# CONFIG_CPU_SUBTYPE_SH7751R is not set
+# CONFIG_CPU_SUBTYPE_SH7760 is not set
+# CONFIG_CPU_SUBTYPE_SH4_202 is not set
+# CONFIG_CPU_SUBTYPE_SH7723 is not set
+# CONFIG_CPU_SUBTYPE_SH7724 is not set
+# CONFIG_CPU_SUBTYPE_SH7763 is not set
+CONFIG_CPU_SUBTYPE_SH7770=y
+# CONFIG_CPU_SUBTYPE_SH7780 is not set
+# CONFIG_CPU_SUBTYPE_SH7785 is not set
+# CONFIG_CPU_SUBTYPE_SH7786 is not set
+# CONFIG_CPU_SUBTYPE_SHX3 is not set
+# CONFIG_CPU_SUBTYPE_SH7343 is not set
+# CONFIG_CPU_SUBTYPE_SH7722 is not set
+# CONFIG_CPU_SUBTYPE_SH7366 is not set
+
+#
+# Memory management options
+#
+CONFIG_QUICKLIST=y
+CONFIG_MMU=y
+CONFIG_PAGE_OFFSET=0x80000000
+CONFIG_MEMORY_START=0x08000000
+CONFIG_MEMORY_SIZE=0x04000000
+CONFIG_29BIT=y
+CONFIG_VSYSCALL=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_MAX_ACTIVE_REGIONS=1
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y
+CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y
+CONFIG_PAGE_SIZE_4KB=y
+# CONFIG_PAGE_SIZE_8KB is not set
+# CONFIG_PAGE_SIZE_16KB is not set
+# CONFIG_PAGE_SIZE_64KB is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+# CONFIG_FLATMEM_MANUAL is not set
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+CONFIG_SPARSEMEM_MANUAL=y
+CONFIG_SPARSEMEM=y
+CONFIG_HAVE_MEMORY_PRESENT=y
+CONFIG_SPARSEMEM_STATIC=y
+
+#
+# Memory hotplug is currently incompatible with Software Suspend
+#
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+CONFIG_MIGRATION=y
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+
+#
+# Cache configuration
+#
+CONFIG_CACHE_WRITEBACK=y
+# CONFIG_CACHE_WRITETHROUGH is not set
+# CONFIG_CACHE_OFF is not set
+
+#
+# Processor features
+#
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_SH_FPU=y
+# CONFIG_SH_STORE_QUEUES is not set
+CONFIG_CPU_HAS_INTEVT=y
+CONFIG_CPU_HAS_SR_RB=y
+CONFIG_CPU_HAS_FPU=y
+
+#
+# Board support
+#
+
+#
+# Timer and clock configuration
+#
+CONFIG_SH_TMU=y
+CONFIG_SH_TIMER_IRQ=16
+CONFIG_SH_PCLK_FREQ=41666666
+CONFIG_TICK_ONESHOT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+
+#
+# CPU Frequency scaling
+#
+CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_TABLE=y
+# CONFIG_CPU_FREQ_DEBUG is not set
+CONFIG_CPU_FREQ_STAT=y
+# CONFIG_CPU_FREQ_STAT_DETAILS is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set
+CONFIG_CPU_FREQ_GOV_PERFORMANCE=y
+# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set
+# CONFIG_CPU_FREQ_GOV_USERSPACE is not set
+# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set
+# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set
+CONFIG_SH_CPU_FREQ=y
+
+#
+# DMA support
+#
+# CONFIG_SH_DMA is not set
+
+#
+# Companion Chips
+#
+
+#
+# Additional SuperH Device Drivers
+#
+# CONFIG_HEARTBEAT is not set
+# CONFIG_PUSH_SWITCH is not set
+
+#
+# Kernel features
+#
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+CONFIG_SCHED_HRTICK=y
+CONFIG_KEXEC=y
+# CONFIG_CRASH_DUMP is not set
+CONFIG_KEXEC_JUMP=y
+CONFIG_PREEMPT_NONE=y
+# CONFIG_PREEMPT_VOLUNTARY is not set
+# CONFIG_PREEMPT is not set
+CONFIG_GUSA=y
+
+#
+# Boot options
+#
+CONFIG_ZERO_PAGE_OFFSET=0x00001000
+CONFIG_BOOT_LINK_OFFSET=0x00800000
+CONFIG_ENTRY_OFFSET=0x00001000
+# CONFIG_CMDLINE_BOOL is not set
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Power management options (EXPERIMENTAL)
+#
+CONFIG_PM=y
+# CONFIG_PM_DEBUG is not set
+CONFIG_PM_SLEEP=y
+CONFIG_HIBERNATION=y
+CONFIG_PM_STD_PARTITION=""
+CONFIG_CPU_IDLE=y
+CONFIG_CPU_IDLE_GOV_LADDER=y
+CONFIG_CPU_IDLE_GOV_MENU=y
+# CONFIG_NET is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+# CONFIG_PREVENT_FIRMWARE_BUILD is not set
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_MTD is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+# CONFIG_BLK_DEV_LOOP is not set
+# CONFIG_BLK_DEV_RAM is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_BLK_DEV_HD is not set
+# CONFIG_MISC_DEVICES is not set
+CONFIG_HAVE_IDE=y
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+# CONFIG_SCSI is not set
+# CONFIG_SCSI_DMA is not set
+# CONFIG_SCSI_NETLINK is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+# CONFIG_INPUT is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+# CONFIG_VT is not set
+# CONFIG_DEVKMEM is not set
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_SERIAL_SH_SCI_NR_UARTS=6
+CONFIG_SERIAL_SH_SCI_CONSOLE=y
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+# CONFIG_UNIX98_PTYS is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_IPMI_HANDLER is not set
+# CONFIG_HW_RANDOM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_SH_MOBILE=y
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+# CONFIG_SPI is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+# CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+# CONFIG_VIDEO_DEV is not set
+# CONFIG_VIDEO_MEDIA is not set
+
+#
+# Multimedia drivers
+#
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+# CONFIG_FB is not set
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+# CONFIG_SOUND is not set
+# CONFIG_USB_SUPPORT is not set
+# CONFIG_MMC is not set
+# CONFIG_MEMSTICK is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+# CONFIG_RTC_DRV_PCF8563 is not set
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+CONFIG_RTC_DRV_SH=y
+# CONFIG_RTC_DRV_GENERIC is not set
+# CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+CONFIG_UIO_PDRV_GENIRQ=y
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+# CONFIG_EXT2_FS is not set
+# CONFIG_EXT3_FS is not set
+# CONFIG_EXT4_FS is not set
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+# CONFIG_FS_POSIX_ACL is not set
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_BTRFS_FS is not set
+# CONFIG_DNOTIFY is not set
+# CONFIG_INOTIFY is not set
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+# CONFIG_MSDOS_FS is not set
+# CONFIG_VFAT_FS is not set
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+# CONFIG_PROC_FS is not set
+# CONFIG_SYSFS is not set
+# CONFIG_TMPFS is not set
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+# CONFIG_MISC_FILESYSTEMS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+# CONFIG_NLS is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_MAGIC_SYSRQ is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_DEBUG_FS=y
+# CONFIG_HEADERS_CHECK is not set
+# CONFIG_DEBUG_KERNEL is not set
+CONFIG_STACKTRACE=y
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LATENCYTOP is not set
+# CONFIG_SYSCTL_SYSCALL_CHECK is not set
+CONFIG_NOP_TRACER=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_SAMPLES is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_SH_STANDARD_BIOS is not set
+# CONFIG_EARLY_SCIF_CONSOLE is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+# CONFIG_CRYPTO is not set
+CONFIG_BINARY_PRINTF=y
+
+#
+# Library routines
+#
+CONFIG_GENERIC_FIND_LAST_BIT=y
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+# CONFIG_CRC_T10DIF is not set
+# CONFIG_CRC_ITU_T is not set
+# CONFIG_CRC32 is not set
+# CONFIG_CRC7 is not set
+# CONFIG_LIBCRC32C is not set
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
-- 
GitLab


From f251935e02e89aa203e0729bfd727175018cec6f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:20:08 +0000
Subject: [PATCH 1094/2198] sh: TMU platform data for sh7770

This patch adds TMU platform data for sh7770. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 300 +++++++++++++++++++++++++
 1 file changed, 300 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index b73578ee295d4..b61d6143aaaa9 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -76,7 +77,288 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd81008,
+		.end	= 0xffd81013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 19,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd81014,
+		.end	= 0xffd8101f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 20,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd81020,
+		.end	= 0xffd8102f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 21,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
+static struct sh_timer_config tmu6_platform_data = {
+	.name = "TMU6",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+};
+
+static struct resource tmu6_resources[] = {
+	[0] = {
+		.name	= "TMU6",
+		.start	= 0xffd82008,
+		.end	= 0xffd82013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 22,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu6_device = {
+	.name		= "sh_tmu",
+	.id		= 6,
+	.dev = {
+		.platform_data	= &tmu6_platform_data,
+	},
+	.resource	= tmu6_resources,
+	.num_resources	= ARRAY_SIZE(tmu6_resources),
+};
+
+static struct sh_timer_config tmu7_platform_data = {
+	.name = "TMU7",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource tmu7_resources[] = {
+	[0] = {
+		.name	= "TMU7",
+		.start	= 0xffd82014,
+		.end	= 0xffd8201f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 23,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu7_device = {
+	.name		= "sh_tmu",
+	.id		= 7,
+	.dev = {
+		.platform_data	= &tmu7_platform_data,
+	},
+	.resource	= tmu7_resources,
+	.num_resources	= ARRAY_SIZE(tmu7_resources),
+};
+
+static struct sh_timer_config tmu8_platform_data = {
+	.name = "TMU8",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu8_resources[] = {
+	[0] = {
+		.name	= "TMU8",
+		.start	= 0xffd82020,
+		.end	= 0xffd8202b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 24,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu8_device = {
+	.name		= "sh_tmu",
+	.id		= 8,
+	.dev = {
+		.platform_data	= &tmu8_platform_data,
+	},
+	.resource	= tmu8_resources,
+	.num_resources	= ARRAY_SIZE(tmu8_resources),
+};
+
 static struct platform_device *sh7770_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+	&tmu6_device,
+	&tmu7_device,
+	&tmu8_device,
 	&sci_device,
 };
 
@@ -87,6 +369,24 @@ static int __init sh7770_devices_setup(void)
 }
 __initcall(sh7770_devices_setup);
 
+static struct platform_device *sh7770_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
+	&tmu6_device,
+	&tmu7_device,
+	&tmu8_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7770_early_devices,
+				   ARRAY_SIZE(sh7770_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 }
-- 
GitLab


From 5f8a29ba39d52b2eaaed907b3cb3016b949a8f9b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:28:25 +0000
Subject: [PATCH 1095/2198] sh: TMU platform data for sh4-202

This patch adds TMU platform data for sh4-202. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 108 +++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index 7371abf64f808..034c069f7155d 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -31,8 +32,103 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh4202_devices[] __initdata = {
 	&sci_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 static int __init sh4202_devices_setup(void)
@@ -42,6 +138,18 @@ static int __init sh4202_devices_setup(void)
 }
 __initcall(sh4202_devices_setup);
 
+static struct platform_device *sh4202_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh4202_early_devices,
+				   ARRAY_SIZE(sh4202_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	/* do nothing - all IRL interrupts are handled by the board code */
-- 
GitLab


From 67d889bd828a3cbf36ef27bd4a5c5f7ec76e20b9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:34:26 +0000
Subject: [PATCH 1096/2198] sh: add sh4-202 INTC tables

This patch adds INTC tables for sh4-202 with support
for HUDI, TMU0, TMU1, TMU2, RTC, SCIF and WDT.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 56 +++++++++++++++++++++++++-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index 034c069f7155d..be79fa136255f 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -2,6 +2,7 @@
  * SH4-202 Setup
  *
  *  Copyright (C) 2006  Paul Mundt
+ *  Copyright (C) 2009  Magnus Damm
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -12,6 +13,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_timer.h>
+#include <linux/io.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -150,7 +152,59 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh4202_early_devices));
 }
 
+enum {
+	UNUSED = 0,
+
+	/* interrupt sources */
+	IRL0, IRL1, IRL2, IRL3, /* only IRLM mode supported */
+	HUDI, TMU0, TMU1, TMU2, RTC, SCIF, WDT,
+};
+
+static struct intc_vect vectors[] __initdata = {
+	INTC_VECT(HUDI, 0x600),
+	INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420),
+	INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2, 0x460),
+	INTC_VECT(RTC, 0x480), INTC_VECT(RTC, 0x4a0),
+	INTC_VECT(RTC, 0x4c0),
+	INTC_VECT(SCIF, 0x700), INTC_VECT(SCIF, 0x720),
+	INTC_VECT(SCIF, 0x740), INTC_VECT(SCIF, 0x760),
+	INTC_VECT(WDT, 0x560),
+};
+
+static struct intc_prio_reg prio_registers[] __initdata = {
+	{ 0xffd00004, 0, 16, 4, /* IPRA */ { TMU0, TMU1, TMU2, RTC } },
+	{ 0xffd00008, 0, 16, 4, /* IPRB */ { WDT, 0, 0, 0 } },
+	{ 0xffd0000c, 0, 16, 4, /* IPRC */ { 0, 0, SCIF, HUDI } },
+	{ 0xffd00010, 0, 16, 4, /* IPRD */ { IRL0, IRL1, IRL2, IRL3 } },
+};
+
+static DECLARE_INTC_DESC(intc_desc, "sh4-202", vectors, NULL,
+			 NULL, prio_registers, NULL);
+
+static struct intc_vect vectors_irlm[] __initdata = {
+	INTC_VECT(IRL0, 0x240), INTC_VECT(IRL1, 0x2a0),
+	INTC_VECT(IRL2, 0x300), INTC_VECT(IRL3, 0x360),
+};
+
+static DECLARE_INTC_DESC(intc_desc_irlm, "sh4-202_irlm", vectors_irlm, NULL,
+			 NULL, prio_registers, NULL);
+
 void __init plat_irq_setup(void)
 {
-	/* do nothing - all IRL interrupts are handled by the board code */
+	register_intc_controller(&intc_desc);
+}
+
+#define INTC_ICR	0xffd00000UL
+#define INTC_ICR_IRLM   (1<<7)
+
+void __init plat_irq_setup_pins(int mode)
+{
+	switch (mode) {
+	case IRQ_MODE_IRQ: /* individual interrupt mode for IRL3-0 */
+		ctrl_outw(ctrl_inw(INTC_ICR) | INTC_ICR_IRLM, INTC_ICR);
+		register_intc_controller(&intc_desc_irlm);
+		break;
+	default:
+		BUG();
+	}
 }
-- 
GitLab


From e37677a429a67cb45004b060ea3376e500bc73c4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:39:06 +0000
Subject: [PATCH 1097/2198] sh: TMU platform data for sh7343

This patch adds TMU platform data for sh7343. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 98 ++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
index b9c361e8cc822..51204dc7ca21c 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c
@@ -173,6 +173,98 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffe00000,
@@ -213,6 +305,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *sh7343_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&iic0_device,
 	&iic1_device,
 	&sci_device,
@@ -240,6 +335,9 @@ __initcall(sh7343_devices_setup);
 
 static struct platform_device *sh7343_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
GitLab


From f2710ebcd0a0404ccca3bbad68fd3639fe87ba6f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:48:21 +0000
Subject: [PATCH 1098/2198] sh: TMU platform data for sh7366

This patch adds TMU platform data for sh7366. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 98 ++++++++++++++++++++++++++
 1 file changed, 98 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
index d4a994be4a7d8..04de0fa85120b 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c
@@ -180,6 +180,98 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct plat_sci_port sci_platform_data[] = {
 	{
 		.mapbase	= 0xffe00000,
@@ -202,6 +294,9 @@ static struct platform_device sci_device = {
 
 static struct platform_device *sh7366_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&iic_device,
 	&sci_device,
 	&usb_host_device,
@@ -229,6 +324,9 @@ __initcall(sh7366_devices_setup);
 
 static struct platform_device *sh7366_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
GitLab


From 6a3501b63d30643120566cc1efba5acd0e64b12d Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 09:50:46 +0000
Subject: [PATCH 1099/2198] sh: TMU platform data for sh7724

This patch adds TMU platform data for sh7724. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 195 +++++++++++++++++++++++++
 1 file changed, 195 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index a3039ce1a6a59..852f8104f03ac 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -268,8 +268,197 @@ static struct platform_device cmt_device = {
 	.num_resources	= ARRAY_SIZE(cmt_resources),
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu0",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xffd80008,
+		.end	= 0xffd80013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu0",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xffd80014,
+		.end	= 0xffd8001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu0",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xffd80020,
+		.end	= 0xffd8002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
+
+static struct sh_timer_config tmu3_platform_data = {
+	.name = "TMU3",
+	.channel_offset = 0x04,
+	.timer_bit = 0,
+	.clk = "tmu1",
+};
+
+static struct resource tmu3_resources[] = {
+	[0] = {
+		.name	= "TMU3",
+		.start	= 0xffd90008,
+		.end	= 0xffd90013,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu3_device = {
+	.name		= "sh_tmu",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &tmu3_platform_data,
+	},
+	.resource	= tmu3_resources,
+	.num_resources	= ARRAY_SIZE(tmu3_resources),
+};
+
+static struct sh_timer_config tmu4_platform_data = {
+	.name = "TMU4",
+	.channel_offset = 0x10,
+	.timer_bit = 1,
+	.clk = "tmu1",
+};
+
+static struct resource tmu4_resources[] = {
+	[0] = {
+		.name	= "TMU4",
+		.start	= 0xffd90014,
+		.end	= 0xffd9001f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 58,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu4_device = {
+	.name		= "sh_tmu",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &tmu4_platform_data,
+	},
+	.resource	= tmu4_resources,
+	.num_resources	= ARRAY_SIZE(tmu4_resources),
+};
+
+static struct sh_timer_config tmu5_platform_data = {
+	.name = "TMU5",
+	.channel_offset = 0x1c,
+	.timer_bit = 2,
+	.clk = "tmu1",
+};
+
+static struct resource tmu5_resources[] = {
+	[0] = {
+		.name	= "TMU5",
+		.start	= 0xffd90020,
+		.end	= 0xffd9002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 57,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu5_device = {
+	.name		= "sh_tmu",
+	.id		= 5,
+	.dev = {
+		.platform_data	= &tmu5_platform_data,
+	},
+	.resource	= tmu5_resources,
+	.num_resources	= ARRAY_SIZE(tmu5_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 	&sci_device,
 	&rtc_device,
 	&iic0_device,
@@ -296,6 +485,12 @@ device_initcall(sh7724_devices_setup);
 
 static struct platform_device *sh7724_early_devices[] __initdata = {
 	&cmt_device,
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+	&tmu3_device,
+	&tmu4_device,
+	&tmu5_device,
 };
 
 void __init plat_early_device_setup(void)
-- 
GitLab


From acd664ab54a449a7257dcb3c7ef9cd1e310b4c4f Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:01:01 +0000
Subject: [PATCH 1100/2198] sh: TMU platform data for sh7705

This patch adds TMU platform data for sh7705. Both clockevent
and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh3/setup-sh7705.c | 108 ++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
index 63b67badd67e6..39513664d5d73 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
@@ -13,6 +13,7 @@
 #include <linux/irq.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 #include <asm/rtc.h>
 
 enum {
@@ -116,7 +117,102 @@ static struct platform_device rtc_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xfffffe94,
+		.end	= 0xfffffe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xfffffea0,
+		.end	= 0xfffffeab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xfffffeac,
+		.end	= 0xfffffebb,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh7705_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&sci_device,
 	&rtc_device,
 };
@@ -128,6 +224,18 @@ static int __init sh7705_devices_setup(void)
 }
 __initcall(sh7705_devices_setup);
 
+static struct platform_device *sh7705_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7705_early_devices,
+				   ARRAY_SIZE(sh7705_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
-- 
GitLab


From c8a9011bce6c43f4988cbcf1db8fc31433e69964 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:04:49 +0000
Subject: [PATCH 1101/2198] sh: TMU platform data for
 sh7706/sh7707/sh7708/sh7709

Add TMU platform data for sh7706/sh7707/sh7708/sh7709.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh3/setup-sh770x.c | 108 ++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index a74f960b5e79a..9412d915b84ed 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -18,6 +18,7 @@
 #include <linux/platform_device.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 
 enum {
 	UNUSED = 0,
@@ -144,7 +145,102 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xfffffe94,
+		.end	= 0xfffffe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xfffffea0,
+		.end	= 0xfffffeab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xfffffeac,
+		.end	= 0xfffffebb,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh770x_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&sci_device,
 	&rtc_device,
 };
@@ -156,6 +252,18 @@ static int __init sh770x_devices_setup(void)
 }
 __initcall(sh770x_devices_setup);
 
+static struct platform_device *sh770x_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh770x_early_devices,
+				   ARRAY_SIZE(sh770x_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
-- 
GitLab


From e5ad00896a381937326ac55fc173630fe731d041 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:15:30 +0000
Subject: [PATCH 1102/2198] sh: TMU platform data for sh7710/sh7712

This patch adds TMU platform data for sh7710 and sh7712.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh3/setup-sh7710.c | 108 ++++++++++++++++++++++++++
 1 file changed, 108 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
index 335098b66e2f6..07ff38d055a76 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
@@ -13,6 +13,7 @@
 #include <linux/irq.h>
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 #include <asm/rtc.h>
 
 enum {
@@ -120,7 +121,102 @@ static struct platform_device sci_device = {
 	},
 };
 
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xa412fe94,
+		.end	= 0xa412fe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xa412fea0,
+		.end	= 0xa412feab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xa412feac,
+		.end	= 0xa412feb5,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh7710_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&sci_device,
 	&rtc_device,
 };
@@ -132,6 +228,18 @@ static int __init sh7710_devices_setup(void)
 }
 __initcall(sh7710_devices_setup);
 
+static struct platform_device *sh7710_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7710_early_devices,
+				   ARRAY_SIZE(sh7710_early_devices));
+}
+
 void __init plat_irq_setup(void)
 {
 	register_intc_controller(&intc_desc);
-- 
GitLab


From 4a1a5a2f60ceabc026ba28cdbf81d7d47603b480 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:17:52 +0000
Subject: [PATCH 1103/2198] sh: TMU platform data for sh7720/sh7721

This patch adds TMU platform data for sh7720 and sh7721.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh3/setup-sh7720.c | 109 ++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index 003874a2fd2a1..c09619c25d9de 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -18,6 +18,7 @@
 #include <linux/serial.h>
 #include <linux/io.h>
 #include <linux/serial_sci.h>
+#include <linux/sh_timer.h>
 #include <asm/rtc.h>
 
 static struct resource rtc_resources[] = {
@@ -123,7 +124,103 @@ static struct platform_device usbf_device = {
 	.resource	= usbf_resources,
 };
 
+
+static struct sh_timer_config tmu0_platform_data = {
+	.name = "TMU0",
+	.channel_offset = 0x02,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 200,
+};
+
+static struct resource tmu0_resources[] = {
+	[0] = {
+		.name	= "TMU0",
+		.start	= 0xa412fe94,
+		.end	= 0xa412fe9f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 16,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu0_device = {
+	.name		= "sh_tmu",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &tmu0_platform_data,
+	},
+	.resource	= tmu0_resources,
+	.num_resources	= ARRAY_SIZE(tmu0_resources),
+};
+
+static struct sh_timer_config tmu1_platform_data = {
+	.name = "TMU1",
+	.channel_offset = 0xe,
+	.timer_bit = 1,
+	.clk = "module_clk",
+	.clocksource_rating = 200,
+};
+
+static struct resource tmu1_resources[] = {
+	[0] = {
+		.name	= "TMU1",
+		.start	= 0xa412fea0,
+		.end	= 0xa412feab,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 17,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu1_device = {
+	.name		= "sh_tmu",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &tmu1_platform_data,
+	},
+	.resource	= tmu1_resources,
+	.num_resources	= ARRAY_SIZE(tmu1_resources),
+};
+
+static struct sh_timer_config tmu2_platform_data = {
+	.name = "TMU2",
+	.channel_offset = 0x1a,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource tmu2_resources[] = {
+	[0] = {
+		.name	= "TMU2",
+		.start	= 0xa412feac,
+		.end	= 0xa412feb5,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 18,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device tmu2_device = {
+	.name		= "sh_tmu",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &tmu2_platform_data,
+	},
+	.resource	= tmu2_resources,
+	.num_resources	= ARRAY_SIZE(tmu2_resources),
+};
+
 static struct platform_device *sh7720_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
 	&rtc_device,
 	&sci_device,
 	&usb_ohci_device,
@@ -137,6 +234,18 @@ static int __init sh7720_devices_setup(void)
 }
 __initcall(sh7720_devices_setup);
 
+static struct platform_device *sh7720_early_devices[] __initdata = {
+	&tmu0_device,
+	&tmu1_device,
+	&tmu2_device,
+};
+
+void __init plat_early_device_setup(void)
+{
+	early_platform_add_devices(sh7720_early_devices,
+				   ARRAY_SIZE(sh7720_early_devices));
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From 2b23a8826a60268ec52302729911dd7ac6b10776 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:21:11 +0000
Subject: [PATCH 1104/2198] sh: CMT platform data for sh7720/sh7721

This patch adds CMT platform data for sh7720 and sh7721.
All 5 32-bit CMT channels unfortunately share a single IRQ.
Both clockevent and clocksource support is enabled.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                       |   2 +
 arch/sh/kernel/cpu/sh3/setup-sh7720.c | 161 ++++++++++++++++++++++++++
 2 files changed, 163 insertions(+)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index a9dee13ddd8b3..5391746be1e9e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -302,6 +302,7 @@ config CPU_SUBTYPE_SH7720
 	bool "Support SH7720 processor"
 	select CPU_SH3
 	select CPU_HAS_DSP
+	select SYS_SUPPORTS_CMT
 	help
 	  Select SH7720 if you have a SH3-DSP SH7720 CPU.
 
@@ -309,6 +310,7 @@ config CPU_SUBTYPE_SH7721
 	bool "Support SH7721 processor"
 	select CPU_SH3
 	select CPU_HAS_DSP
+	select SYS_SUPPORTS_CMT
 	help
 	  Select SH7721 if you have a SH3-DSP SH7721 CPU.
 
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index c09619c25d9de..d8b46f5dff607 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -124,6 +124,157 @@ static struct platform_device usbf_device = {
 	.resource	= usbf_resources,
 };
 
+static struct sh_timer_config cmt0_platform_data = {
+	.name = "CMT0",
+	.channel_offset = 0x10,
+	.timer_bit = 0,
+	.clk = "module_clk",
+	.clockevent_rating = 125,
+	.clocksource_rating = 125,
+};
+
+static struct resource cmt0_resources[] = {
+	[0] = {
+		.name	= "CMT0",
+		.start	= 0x044a0010,
+		.end	= 0x044a001b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt0_device = {
+	.name		= "sh_cmt",
+	.id		= 0,
+	.dev = {
+		.platform_data	= &cmt0_platform_data,
+	},
+	.resource	= cmt0_resources,
+	.num_resources	= ARRAY_SIZE(cmt0_resources),
+};
+
+static struct sh_timer_config cmt1_platform_data = {
+	.name = "CMT1",
+	.channel_offset = 0x20,
+	.timer_bit = 1,
+	.clk = "module_clk",
+};
+
+static struct resource cmt1_resources[] = {
+	[0] = {
+		.name	= "CMT1",
+		.start	= 0x044a0020,
+		.end	= 0x044a002b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt1_device = {
+	.name		= "sh_cmt",
+	.id		= 1,
+	.dev = {
+		.platform_data	= &cmt1_platform_data,
+	},
+	.resource	= cmt1_resources,
+	.num_resources	= ARRAY_SIZE(cmt1_resources),
+};
+
+static struct sh_timer_config cmt2_platform_data = {
+	.name = "CMT2",
+	.channel_offset = 0x30,
+	.timer_bit = 2,
+	.clk = "module_clk",
+};
+
+static struct resource cmt2_resources[] = {
+	[0] = {
+		.name	= "CMT2",
+		.start	= 0x044a0030,
+		.end	= 0x044a003b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt2_device = {
+	.name		= "sh_cmt",
+	.id		= 2,
+	.dev = {
+		.platform_data	= &cmt2_platform_data,
+	},
+	.resource	= cmt2_resources,
+	.num_resources	= ARRAY_SIZE(cmt2_resources),
+};
+
+static struct sh_timer_config cmt3_platform_data = {
+	.name = "CMT3",
+	.channel_offset = 0x40,
+	.timer_bit = 3,
+	.clk = "module_clk",
+};
+
+static struct resource cmt3_resources[] = {
+	[0] = {
+		.name	= "CMT3",
+		.start	= 0x044a0040,
+		.end	= 0x044a004b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt3_device = {
+	.name		= "sh_cmt",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &cmt3_platform_data,
+	},
+	.resource	= cmt3_resources,
+	.num_resources	= ARRAY_SIZE(cmt3_resources),
+};
+
+static struct sh_timer_config cmt4_platform_data = {
+	.name = "CMT4",
+	.channel_offset = 0x50,
+	.timer_bit = 4,
+	.clk = "module_clk",
+};
+
+static struct resource cmt4_resources[] = {
+	[0] = {
+		.name	= "CMT4",
+		.start	= 0x044a0050,
+		.end	= 0x044a005b,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 104,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device cmt4_device = {
+	.name		= "sh_cmt",
+	.id		= 4,
+	.dev = {
+		.platform_data	= &cmt4_platform_data,
+	},
+	.resource	= cmt4_resources,
+	.num_resources	= ARRAY_SIZE(cmt4_resources),
+};
 
 static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
@@ -218,6 +369,11 @@ static struct platform_device tmu2_device = {
 };
 
 static struct platform_device *sh7720_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+	&cmt2_device,
+	&cmt3_device,
+	&cmt4_device,
 	&tmu0_device,
 	&tmu1_device,
 	&tmu2_device,
@@ -235,6 +391,11 @@ static int __init sh7720_devices_setup(void)
 __initcall(sh7720_devices_setup);
 
 static struct platform_device *sh7720_early_devices[] __initdata = {
+	&cmt0_device,
+	&cmt1_device,
+	&cmt2_device,
+	&cmt3_device,
+	&cmt4_device,
 	&tmu0_device,
 	&tmu1_device,
 	&tmu2_device,
-- 
GitLab


From f19900b2e608b604777a74d6d711bbf744657756 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 12 May 2009 10:25:54 +0000
Subject: [PATCH 1105/2198] sh: remove old TMU driver

This patch removes the old TMU driver (CONFIG_SH_TMU/timer-tmu.c)

As replacement, select the sh_tmu driver with CONFIG_SH_TIMER_TMU
and configure timer channel using platform data.

If multiple TMU channels are enabled using platform data, use the
earlytimer parameter on the kernel command line to select channel.
For instance, use "earlytimer=sh_tmu.0" to select the first channel.

To verify which timer is being used, look at printouts or the timer
irq count in /proc/interrupts.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                     |   9 +-
 arch/sh/include/asm/timer.h         |   1 -
 arch/sh/include/cpu-sh3/cpu/timer.h |  67 -------
 arch/sh/include/cpu-sh4/cpu/timer.h |  60 ------
 arch/sh/kernel/timers/Makefile      |   2 -
 arch/sh/kernel/timers/timer-tmu.c   | 297 ----------------------------
 arch/sh/kernel/timers/timer.c       |   3 -
 7 files changed, 1 insertion(+), 438 deletions(-)
 delete mode 100644 arch/sh/include/cpu-sh3/cpu/timer.h
 delete mode 100644 arch/sh/include/cpu-sh4/cpu/timer.h
 delete mode 100644 arch/sh/kernel/timers/timer-tmu.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 5391746be1e9e..e7b6406c96b6c 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -473,16 +473,9 @@ source "arch/sh/boards/Kconfig"
 
 menu "Timer and clock configuration"
 
-config SH_TMU
-	bool "TMU timer support"
-	depends on CPU_SH3 || CPU_SH4
-	default y
-	help
-	  This enables the use of the TMU as the system timer.
-
 config SH_TIMER_TMU
 	bool "TMU timer driver"
-	depends on !SH_TMU && SYS_SUPPORTS_TMU
+	depends on SYS_SUPPORTS_TMU
 	default y
 	help
 	  This enables the build of the TMU timer driver.
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
index 6c0851188c00f..f27a88bfdcc72 100644
--- a/arch/sh/include/asm/timer.h
+++ b/arch/sh/include/asm/timer.h
@@ -18,7 +18,6 @@ struct sys_timer {
 	struct sys_timer_ops	*ops;
 };
 
-extern struct sys_timer tmu_timer;
 extern struct sys_timer *sys_timer;
 
 /* arch/sh/kernel/timers/timer.c */
diff --git a/arch/sh/include/cpu-sh3/cpu/timer.h b/arch/sh/include/cpu-sh3/cpu/timer.h
deleted file mode 100644
index 793acf12aa086..0000000000000
--- a/arch/sh/include/cpu-sh3/cpu/timer.h
+++ /dev/null
@@ -1,67 +0,0 @@
-/*
- * include/asm-sh/cpu-sh3/timer.h
- *
- * Copyright (C) 2004 Lineo Solutions, Inc.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __ASM_CPU_SH3_TIMER_H
-#define __ASM_CPU_SH3_TIMER_H
-
-/*
- * ---------------------------------------------------------------------------
- * TMU Common definitions for SH3 processors
- *	SH7706
- *	SH7709S
- *	SH7727
- *	SH7729R
- *	SH7710
- *	SH7720
- *	SH7710
- * ---------------------------------------------------------------------------
- */
-
-#if  !defined(CONFIG_CPU_SUBTYPE_SH7720) && !defined(CONFIG_CPU_SUBTYPE_SH7721)
-#define TMU_TOCR	0xfffffe90	/* Byte access */
-#endif
-
-#if defined(CONFIG_CPU_SUBTYPE_SH7710) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7720) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7721)
-#define TMU_012_TSTR	0xa412fe92	/* Byte access */
-
-#define TMU0_TCOR	0xa412fe94	/* Long access */
-#define TMU0_TCNT	0xa412fe98	/* Long access */
-#define TMU0_TCR	0xa412fe9c	/* Word access */
-
-#define TMU1_TCOR	0xa412fea0	/* Long access */
-#define TMU1_TCNT	0xa412fea4	/* Long access */
-#define TMU1_TCR	0xa412fea8	/* Word access */
-
-#define TMU2_TCOR	0xa412feac	/* Long access */
-#define TMU2_TCNT	0xa412feb0	/* Long access */
-#define TMU2_TCR	0xa412feb4	/* Word access */
-
-#else
-#define TMU_012_TSTR	0xfffffe92	/* Byte access */
-
-#define TMU0_TCOR	0xfffffe94	/* Long access */
-#define TMU0_TCNT	0xfffffe98	/* Long access */
-#define TMU0_TCR	0xfffffe9c	/* Word access */
-
-#define TMU1_TCOR	0xfffffea0	/* Long access */
-#define TMU1_TCNT	0xfffffea4	/* Long access */
-#define TMU1_TCR	0xfffffea8	/* Word access */
-
-#define TMU2_TCOR	0xfffffeac	/* Long access */
-#define TMU2_TCNT	0xfffffeb0	/* Long access */
-#define TMU2_TCR	0xfffffeb4	/* Word access */
-#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && !defined(CONFIG_CPU_SUBTYPE_SH7721)
-#define TMU2_TCPR2	0xfffffeb8	/* Long access */
-#endif
-#endif
-
-#endif /* __ASM_CPU_SH3_TIMER_H */
-
diff --git a/arch/sh/include/cpu-sh4/cpu/timer.h b/arch/sh/include/cpu-sh4/cpu/timer.h
deleted file mode 100644
index d1e796b968885..0000000000000
--- a/arch/sh/include/cpu-sh4/cpu/timer.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * include/asm-sh/cpu-sh4/timer.h
- *
- * Copyright (C) 2004 Lineo Solutions, Inc.
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#ifndef __ASM_CPU_SH4_TIMER_H
-#define __ASM_CPU_SH4_TIMER_H
-
-/*
- * ---------------------------------------------------------------------------
- * TMU Common definitions for SH4 processors
- *	SH7750S/SH7750R
- *	SH7751/SH7751R
- *	SH7760
- *	SH-X3
- * ---------------------------------------------------------------------------
- */
-#ifdef CONFIG_CPU_SUBTYPE_SHX3
-#define TMU_012_BASE	0xffc10000
-#define TMU_345_BASE	0xffc20000
-#else
-#define TMU_012_BASE	0xffd80000
-#define TMU_345_BASE	0xfe100000
-#endif
-
-#define TMU_TOCR	TMU_012_BASE	/* Not supported on all CPUs */
-
-#define TMU_012_TSTR	(TMU_012_BASE + 0x04)
-#define TMU_345_TSTR	(TMU_345_BASE + 0x04)
-
-#define TMU0_TCOR	(TMU_012_BASE + 0x08)
-#define TMU0_TCNT	(TMU_012_BASE + 0x0c)
-#define TMU0_TCR	(TMU_012_BASE + 0x10)
-
-#define TMU1_TCOR       (TMU_012_BASE + 0x14)
-#define TMU1_TCNT       (TMU_012_BASE + 0x18)
-#define TMU1_TCR        (TMU_012_BASE + 0x1c)
-
-#define TMU2_TCOR       (TMU_012_BASE + 0x20)
-#define TMU2_TCNT       (TMU_012_BASE + 0x24)
-#define TMU2_TCR	(TMU_012_BASE + 0x28)
-#define TMU2_TCPR	(TMU_012_BASE + 0x2c)
-
-#define TMU3_TCOR	(TMU_345_BASE + 0x08)
-#define TMU3_TCNT	(TMU_345_BASE + 0x0c)
-#define TMU3_TCR	(TMU_345_BASE + 0x10)
-
-#define TMU4_TCOR	(TMU_345_BASE + 0x14)
-#define TMU4_TCNT	(TMU_345_BASE + 0x18)
-#define TMU4_TCR	(TMU_345_BASE + 0x1c)
-
-#define TMU5_TCOR	(TMU_345_BASE + 0x20)
-#define TMU5_TCNT	(TMU_345_BASE + 0x24)
-#define TMU5_TCR	(TMU_345_BASE + 0x28)
-
-#endif /* __ASM_CPU_SH4_TIMER_H */
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
index 8bceba59e76f5..fefd7edd413de 100644
--- a/arch/sh/kernel/timers/Makefile
+++ b/arch/sh/kernel/timers/Makefile
@@ -3,5 +3,3 @@
 #
 
 obj-y	:= timer.o
-
-obj-$(CONFIG_SH_TMU)		+= timer-tmu.o
diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c
deleted file mode 100644
index fe8d8930ccb62..0000000000000
--- a/arch/sh/kernel/timers/timer-tmu.c
+++ /dev/null
@@ -1,297 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer-tmu.c - TMU Timer Support
- *
- *  Copyright (C) 2005 - 2007  Paul Mundt
- *
- * TMU handling code hacked out of arch/sh/kernel/time.c
- *
- *  Copyright (C) 1999  Tetsuya Okada & Niibe Yutaka
- *  Copyright (C) 2000  Philipp Rumpf <prumpf@tux.org>
- *  Copyright (C) 2002, 2003, 2004  Paul Mundt
- *  Copyright (C) 2002  M. R. Brown  <mrbrown@linux-sh.org>
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/interrupt.h>
-#include <linux/seqlock.h>
-#include <linux/clockchips.h>
-#include <asm/timer.h>
-#include <asm/rtc.h>
-#include <asm/io.h>
-#include <asm/irq.h>
-#include <asm/clock.h>
-
-#define TMU_TOCR_INIT	0x00
-#define TMU_TCR_INIT	0x0020
-
-#define TMU0		(0)
-#define TMU1		(1)
-
-static inline void _tmu_start(int tmu_num)
-{
-	ctrl_outb(ctrl_inb(TMU_012_TSTR) | (0x1<<tmu_num), TMU_012_TSTR);
-}
-
-static inline void _tmu_set_irq(int tmu_num, int enabled)
-{
-	register unsigned long tmu_tcr = TMU0_TCR + (0xc*tmu_num);
-	ctrl_outw( (enabled ? ctrl_inw(tmu_tcr) | (1<<5) : ctrl_inw(tmu_tcr) & ~(1<<5)), tmu_tcr);
-}
-
-static inline void _tmu_stop(int tmu_num)
-{
-	ctrl_outb(ctrl_inb(TMU_012_TSTR) & ~(0x1<<tmu_num), TMU_012_TSTR);
-}
-
-static inline void _tmu_clear_status(int tmu_num)
-{
-	register unsigned long tmu_tcr = TMU0_TCR + (0xc*tmu_num);
-	/* Clear UNF bit */
-	ctrl_outw(ctrl_inw(tmu_tcr) & ~0x100, tmu_tcr);
-}
-
-static inline unsigned long _tmu_read(int tmu_num)
-{
-        return ctrl_inl(TMU0_TCNT+0xC*tmu_num);
-}
-
-static int tmu_timer_start(void)
-{
-	_tmu_start(TMU0);
-	_tmu_start(TMU1);
-	_tmu_set_irq(TMU0,1);
-	return 0;
-}
-
-static int tmu_timer_stop(void)
-{
-	_tmu_stop(TMU0);
-	_tmu_stop(TMU1);
-	_tmu_clear_status(TMU0);
-	return 0;
-}
-
-/*
- * also when the module_clk is scaled the TMU1
- * will show the same frequency
- */
-static int tmus_are_scaled;
-
-static cycle_t tmu_timer_read(struct clocksource *cs)
-{
-	return ((cycle_t)(~_tmu_read(TMU1)))<<tmus_are_scaled;
-}
-
-
-static unsigned long tmu_latest_interval[3];
-static void tmu_timer_set_interval(int tmu_num, unsigned long interval, unsigned int reload)
-{
-	unsigned long tmu_tcnt = TMU0_TCNT + tmu_num*0xC;
-	unsigned long tmu_tcor = TMU0_TCOR + tmu_num*0xC;
-
-	_tmu_stop(tmu_num);
-
-	ctrl_outl(interval, tmu_tcnt);
-	tmu_latest_interval[tmu_num] = interval;
-
-	/*
-	 * TCNT reloads from TCOR on underflow, clear it if we don't
-	 * intend to auto-reload
-	 */
-	ctrl_outl( reload ? interval : 0 , tmu_tcor);
-
-	_tmu_start(tmu_num);
-}
-
-static int tmu_set_next_event(unsigned long cycles,
-			      struct clock_event_device *evt)
-{
-	tmu_timer_set_interval(TMU0,cycles, evt->mode == CLOCK_EVT_MODE_PERIODIC);
-	_tmu_set_irq(TMU0,1);
-	return 0;
-}
-
-static void tmu_set_mode(enum clock_event_mode mode,
-			 struct clock_event_device *evt)
-{
-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		ctrl_outl(tmu_latest_interval[TMU0], TMU0_TCOR);
-		break;
-	case CLOCK_EVT_MODE_ONESHOT:
-		ctrl_outl(0, TMU0_TCOR);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-	case CLOCK_EVT_MODE_RESUME:
-		break;
-	}
-}
-
-static struct clock_event_device tmu0_clockevent = {
-	.name		= "tmu0",
-	.shift		= 32,
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= tmu_set_mode,
-	.set_next_event	= tmu_set_next_event,
-};
-
-static irqreturn_t tmu_timer_interrupt(int irq, void *dummy)
-{
-	struct clock_event_device *evt = &tmu0_clockevent;
-	_tmu_clear_status(TMU0);
-	_tmu_set_irq(TMU0,tmu0_clockevent.mode != CLOCK_EVT_MODE_ONESHOT);
-
-	switch (tmu0_clockevent.mode) {
-	case CLOCK_EVT_MODE_ONESHOT:
-	case CLOCK_EVT_MODE_PERIODIC:
-		evt->event_handler(evt);
-		break;
-	default:
-		break;
-	}
-
-	return IRQ_HANDLED;
-}
-
-static struct irqaction tmu0_irq = {
-	.name		= "periodic/oneshot timer",
-	.handler	= tmu_timer_interrupt,
-	.flags		= IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL,
-};
-
-static void __init tmu_clk_init(struct clk *clk)
-{
-	u8 divisor  = TMU_TCR_INIT & 0x7;
-	int tmu_num = clk->name[3]-'0';
-	ctrl_outw(TMU_TCR_INIT, TMU0_TCR+(tmu_num*0xC));
-	clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1));
-}
-
-static void tmu_clk_recalc(struct clk *clk)
-{
-	int tmu_num = clk->name[3]-'0';
-	unsigned long prev_rate = clk_get_rate(clk);
-	unsigned long flags;
-	u8 divisor = ctrl_inw(TMU0_TCR+tmu_num*0xC) & 0x7;
-	clk->rate  = clk_get_rate(clk->parent) / (4 << (divisor << 1));
-
-	if(prev_rate==clk_get_rate(clk))
-		return;
-
-	if(tmu_num)
-		return; /* No more work on TMU1 */
-
-	local_irq_save(flags);
-	tmus_are_scaled = (prev_rate > clk->rate);
-
-	_tmu_stop(TMU0);
-
-	tmu0_clockevent.mult = div_sc(clk->rate, NSEC_PER_SEC,
-				tmu0_clockevent.shift);
-	tmu0_clockevent.max_delta_ns =
-			clockevent_delta2ns(-1, &tmu0_clockevent);
-	tmu0_clockevent.min_delta_ns =
-			clockevent_delta2ns(1, &tmu0_clockevent);
-
-	if (tmus_are_scaled)
-		tmu_latest_interval[TMU0] >>= 1;
-	else
-		tmu_latest_interval[TMU0] <<= 1;
-
-	tmu_timer_set_interval(TMU0,
-		tmu_latest_interval[TMU0],
-		tmu0_clockevent.mode == CLOCK_EVT_MODE_PERIODIC);
-
-	_tmu_start(TMU0);
-
-	local_irq_restore(flags);
-}
-
-static struct clk_ops tmu_clk_ops = {
-	.init		= tmu_clk_init,
-	.recalc		= tmu_clk_recalc,
-};
-
-static struct clk tmu0_clk = {
-	.name		= "tmu0_clk",
-	.ops		= &tmu_clk_ops,
-};
-
-static struct clk tmu1_clk = {
-	.name		= "tmu1_clk",
-	.ops		= &tmu_clk_ops,
-};
-
-static int tmu_timer_init(void)
-{
-	unsigned long interval;
-	unsigned long frequency;
-
-	setup_irq(CONFIG_SH_TIMER_IRQ, &tmu0_irq);
-
-	tmu0_clk.parent = clk_get(NULL, "module_clk");
-	tmu1_clk.parent = clk_get(NULL, "module_clk");
-
-	tmu_timer_stop();
-
-#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7721) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7760) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7785) && \
-    !defined(CONFIG_CPU_SUBTYPE_SH7786) && \
-    !defined(CONFIG_CPU_SUBTYPE_SHX3)
-	ctrl_outb(TMU_TOCR_INIT, TMU_TOCR);
-#endif
-
-	clk_register(&tmu0_clk);
-	clk_register(&tmu1_clk);
-	clk_enable(&tmu0_clk);
-	clk_enable(&tmu1_clk);
-
-	frequency = clk_get_rate(&tmu0_clk);
-	interval = (frequency + HZ / 2) / HZ;
-
-	tmu_timer_set_interval(TMU0,interval, 1);
-	tmu_timer_set_interval(TMU1,~0,1);
-
-	_tmu_start(TMU1);
-
-	clocksource_sh.rating = 200;
-	clocksource_sh.mask = CLOCKSOURCE_MASK(32);
-	clocksource_sh.read = tmu_timer_read;
-	clocksource_sh.shift = 10;
-	clocksource_sh.mult = clocksource_hz2mult(clk_get_rate(&tmu1_clk),
-						  clocksource_sh.shift);
-	clocksource_sh.flags = CLOCK_SOURCE_IS_CONTINUOUS;
-	clocksource_register(&clocksource_sh);
-
-	tmu0_clockevent.mult = div_sc(frequency, NSEC_PER_SEC,
-				      tmu0_clockevent.shift);
-	tmu0_clockevent.max_delta_ns =
-			clockevent_delta2ns(-1, &tmu0_clockevent);
-	tmu0_clockevent.min_delta_ns =
-			clockevent_delta2ns(1, &tmu0_clockevent);
-
-	tmu0_clockevent.cpumask = cpumask_of(0);
-	tmu0_clockevent.rating = 100;
-
-	clockevents_register_device(&tmu0_clockevent);
-
-	return 0;
-}
-
-static struct sys_timer_ops tmu_timer_ops = {
-	.init		= tmu_timer_init,
-	.start		= tmu_timer_start,
-	.stop		= tmu_timer_stop,
-};
-
-struct sys_timer tmu_timer = {
-	.name	= "tmu",
-	.ops	= &tmu_timer_ops,
-};
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
index 920891acb7797..f8812bd3b7973 100644
--- a/arch/sh/kernel/timers/timer.c
+++ b/arch/sh/kernel/timers/timer.c
@@ -14,9 +14,6 @@
 #include <asm/timer.h>
 
 static struct sys_timer *sys_timers[] = {
-#ifdef CONFIG_SH_TMU
-	&tmu_timer,
-#endif
 	NULL,
 };
 
-- 
GitLab


From 8be5f1a68f2c14082939dd54e7037dcee2eb54f8 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 12 May 2009 19:53:55 +0900
Subject: [PATCH 1106/2198] sh: Kill off the remnants of the old timer code.

Now with all of the TMU users moved over to the new TMU driver, and the
old TMU driver killed off, the left-over infrastructure can go along
with it.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                | 11 --------
 arch/sh/include/asm/timer.h    | 28 ---------------------
 arch/sh/kernel/Makefile_32     |  2 +-
 arch/sh/kernel/Makefile_64     |  2 +-
 arch/sh/kernel/cpu/clock.c     |  1 -
 arch/sh/kernel/time.c          | 29 ++-------------------
 arch/sh/kernel/timers/Makefile |  5 ----
 arch/sh/kernel/timers/timer.c  | 46 ----------------------------------
 8 files changed, 4 insertions(+), 120 deletions(-)
 delete mode 100644 arch/sh/include/asm/timer.h
 delete mode 100644 arch/sh/kernel/timers/Makefile
 delete mode 100644 arch/sh/kernel/timers/timer.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e7b6406c96b6c..fb75c2d1928df 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -494,17 +494,6 @@ config SH_TIMER_MTU2
 	help
 	  This enables build of the MTU2 timer driver.
 
-config SH_TIMER_IRQ
-	int
-	default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \
-			CPU_SUBTYPE_SH7763
-	default "86" if CPU_SUBTYPE_SH7619
-	default "140" if CPU_SUBTYPE_SH7206
-	default "142" if CPU_SUBTYPE_SH7203 && SH_CMT
-	default "153" if CPU_SUBTYPE_SH7203 && SH_MTU2
-	default "238" if CPU_SUBTYPE_MXG
-	default "16"
-
 config SH_PCLK_FREQ
 	int "Peripheral clock frequency (in Hz)"
 	default "27000000" if CPU_SUBTYPE_SH7343
diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h
deleted file mode 100644
index f27a88bfdcc72..0000000000000
--- a/arch/sh/include/asm/timer.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef __ASM_SH_TIMER_H
-#define __ASM_SH_TIMER_H
-
-#include <linux/sysdev.h>
-#include <linux/clocksource.h>
-#include <cpu/timer.h>
-
-struct sys_timer_ops {
-	int (*init)(void);
-	int (*start)(void);
-	int (*stop)(void);
-};
-
-struct sys_timer {
-	const char		*name;
-
-	struct sys_device	dev;
-	struct sys_timer_ops	*ops;
-};
-
-extern struct sys_timer *sys_timer;
-
-/* arch/sh/kernel/timers/timer.c */
-struct sys_timer *get_sys_timer(void);
-
-extern struct clocksource clocksource_sh;
-
-#endif /* __ASM_SH_TIMER_H */
diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32
index aee08afe5ff78..9411e3e31e686 100644
--- a/arch/sh/kernel/Makefile_32
+++ b/arch/sh/kernel/Makefile_32
@@ -14,7 +14,7 @@ obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o			\
 	   sys_sh.o sys_sh32.o syscalls_32.o time.o topology.o	\
 	   traps.o traps_32.o
 
-obj-y				+= cpu/ timers/
+obj-y				+= cpu/
 obj-$(CONFIG_VSYSCALL)		+= vsyscall/
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_SH_STANDARD_BIOS)	+= sh_bios.o
diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64
index c845a0d619795..67b9f6c6326b6 100644
--- a/arch/sh/kernel/Makefile_64
+++ b/arch/sh/kernel/Makefile_64
@@ -4,7 +4,7 @@ obj-y	:= debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \
 	   ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \
 	   syscalls_64.o time.o topology.o traps.o traps_64.o
 
-obj-y				+= cpu/ timers/
+obj-y				+= cpu/
 obj-$(CONFIG_SMP)		+= smp.o
 obj-$(CONFIG_SH_CPU_FREQ)	+= cpufreq.o
 obj-$(CONFIG_MODULES)		+= sh_ksyms_64.o module.o
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 133dbe4033412..f54769f455b11 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -26,7 +26,6 @@
 #include <linux/platform_device.h>
 #include <linux/proc_fs.h>
 #include <asm/clock.h>
-#include <asm/timer.h>
 
 static LIST_HEAD(clock_list);
 static DEFINE_SPINLOCK(clock_lock);
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index a77838f539f84..2edde32c764be 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -22,9 +22,6 @@
 #include <linux/rtc.h>
 #include <asm/clock.h>
 #include <asm/rtc.h>
-#include <asm/timer.h>
-
-struct sys_timer *sys_timer;
 
 /* Dummy RTC ops */
 static void null_rtc_get_time(struct timespec *tv)
@@ -94,20 +91,9 @@ module_init(rtc_generic_init);
 
 void (*board_time_init)(void);
 
-struct clocksource clocksource_sh = {
-	.name		= "SuperH",
-};
-
 unsigned long long sched_clock(void)
 {
-	unsigned long long cycles;
-
-	/* jiffies based sched_clock if no clocksource is installed */
-	if (!clocksource_sh.rating)
-		return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
-
-	cycles = clocksource_sh.read(&clocksource_sh);
-	return cyc2ns(&clocksource_sh, cycles);
+	return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
 }
 
 static void __init sh_late_time_init(void)
@@ -117,18 +103,7 @@ static void __init sh_late_time_init(void)
 	 * Run probe() for one "earlytimer" device.
 	 */
 	early_platform_driver_register_all("earlytimer");
-	if (early_platform_driver_probe("earlytimer", 1, 0))
-		return;
-
-	/*
-	 * Find the timer to use as the system timer, it will be
-	 * initialized for us.
-	 */
-	sys_timer = get_sys_timer();
-	if (unlikely(!sys_timer))
-		panic("System timer missing.\n");
-
-	printk(KERN_INFO "Using %s for system timer\n", sys_timer->name);
+	early_platform_driver_probe("earlytimer", 1, 0);
 }
 
 void __init time_init(void)
diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile
deleted file mode 100644
index fefd7edd413de..0000000000000
--- a/arch/sh/kernel/timers/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the various Linux/SuperH timers
-#
-
-obj-y	:= timer.o
diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c
deleted file mode 100644
index f8812bd3b7973..0000000000000
--- a/arch/sh/kernel/timers/timer.c
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * arch/sh/kernel/timers/timer.c - Common timer code
- *
- *  Copyright (C) 2005  Paul Mundt
- *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
- */
-#include <linux/kernel.h>
-#include <linux/init.h>
-#include <linux/timer.h>
-#include <linux/string.h>
-#include <asm/timer.h>
-
-static struct sys_timer *sys_timers[] = {
-	NULL,
-};
-
-static char timer_override[10];
-static int __init timer_setup(char *str)
-{
-	if (str)
-		strlcpy(timer_override, str, sizeof(timer_override));
-	return 1;
-}
-__setup("timer=", timer_setup);
-
-struct sys_timer *get_sys_timer(void)
-{
-	int i;
-
-	for (i = 0; i < ARRAY_SIZE(sys_timers); i++) {
-		struct sys_timer *t = sys_timers[i];
-
-		if (unlikely(!t))
-			break;
-		if (unlikely(timer_override[0]))
-			if ((strcmp(timer_override, t->name) != 0))
-				continue;
-		if (likely(t->ops->init() == 0))
-			return t;
-	}
-
-	return NULL;
-}
-- 
GitLab


From e758a33d6fc5b9d6a3ae489863d04fcecad8120b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 12 May 2009 21:59:01 +1000
Subject: [PATCH 1107/2198] perf_counter: call hw_perf_save_disable/restore
 around group_sched_in

I noticed that when enabling a group via the PERF_COUNTER_IOC_ENABLE
ioctl on the group leader, the counters weren't enabled and counting
immediately on return from the ioctl, but did start counting a little
while later (presumably after a context switch).

The reason was that __perf_counter_enable calls group_sched_in which
calls hw_perf_group_sched_in, which on powerpc assumes that the caller
has called hw_perf_save_disable already.  Until commit 46d686c6
("perf_counter: put whole group on when enabling group leader") it was
true that all callers of group_sched_in had called
hw_perf_save_disable first, and the powerpc hw_perf_group_sched_in
relies on that (there isn't an x86 version).

This fixes the problem by putting calls to hw_perf_save_disable /
hw_perf_restore around the calls to group_sched_in and
counter_sched_in in __perf_counter_enable.  Having the calls to
hw_perf_save_disable/restore around the counter_sched_in call is
harmless and makes this call consistent with the other call sites
of counter_sched_in, which have all called hw_perf_save_disable first.

[ Impact: more precise counter group disable/enable functionality ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18953.25733.53359.147452@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5ea0240adab23..ff166c11b69ae 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -663,6 +663,7 @@ static void __perf_counter_enable(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
+	unsigned long pmuflags;
 	unsigned long flags;
 	int err;
 
@@ -689,14 +690,18 @@ static void __perf_counter_enable(void *info)
 	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)
 		goto unlock;
 
-	if (!group_can_go_on(counter, cpuctx, 1))
+	if (!group_can_go_on(counter, cpuctx, 1)) {
 		err = -EEXIST;
-	else if (counter == leader)
-		err = group_sched_in(counter, cpuctx, ctx,
-				     smp_processor_id());
-	else
-		err = counter_sched_in(counter, cpuctx, ctx,
-				       smp_processor_id());
+	} else {
+		pmuflags = hw_perf_save_disable();
+		if (counter == leader)
+			err = group_sched_in(counter, cpuctx, ctx,
+					     smp_processor_id());
+		else
+			err = counter_sched_in(counter, cpuctx, ctx,
+					       smp_processor_id());
+		hw_perf_restore(pmuflags);
+	}
 
 	if (err) {
 		/*
-- 
GitLab


From b5710ce92a8cf8e3fc0ffc230cfdbfa23463f1c8 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Tue, 12 May 2009 18:51:28 +0400
Subject: [PATCH 1108/2198] x86/pci: add 4 more return parameters to
 IO_APIC_get_PCI_irq_vector(), fix

Fix trivial typo in the drivers/pci/hotplug/ibmphp_core.c changes.

[ Impact: build fix ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
CC: Yinghai Lu <yinghai@kernel.org>
Cc: eswierk@aristanetworks.com
LKML-Reference: <20090512145128.GA10220@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/pci/hotplug/ibmphp_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index ef53b05a411a2..79901a0db885e 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -161,7 +161,7 @@ int ibmphp_init_devno(struct slot **cur_slot)
 			(*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
 			for (i = 0; i < 4; i++)
 				(*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
-						(int) (*cur_slot)->device, i.
+						(int) (*cur_slot)->device, i,
 						&ioapic, &ioapic_pin,
 						&triggering, &polarity);
 
-- 
GitLab


From 7537d81aa7b7cd31b0caeac8091456e93d96fa8d Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Tue, 12 May 2009 11:16:20 -0500
Subject: [PATCH 1109/2198] GFS2: Fix timestamps on write

This patch copies the timestamps from the vfs inode into gfs2 and syncs
it to the disk inode during writes.

Signed-off-by: Abhijith Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_address.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index a6dde1751e17d..e5664210f0d88 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -781,10 +781,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh,
 	unlock_page(page);
 	page_cache_release(page);
 
-	if (inode->i_size < to) {
-		i_size_write(inode, to);
-		ip->i_disksize = inode->i_size;
-		di->di_size = cpu_to_be64(inode->i_size);
+	if (copied) {
+		if (inode->i_size < to) {
+			i_size_write(inode, to);
+			ip->i_disksize = inode->i_size;
+		}
+		gfs2_dinode_out(ip, di);
 		mark_inode_dirty(inode);
 	}
 
@@ -824,7 +826,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct buffer_head *dibh;
 	struct gfs2_alloc *al = ip->i_alloc;
-	struct gfs2_dinode *di;
 	unsigned int from = pos & (PAGE_CACHE_SIZE - 1);
 	unsigned int to = from + len;
 	int ret;
@@ -847,11 +848,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 		gfs2_page_add_databufs(ip, page, from, to);
 
 	ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
-
-	if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) {
-		di = (struct gfs2_dinode *)dibh->b_data;
-		ip->i_disksize = inode->i_size;
-		di->di_size = cpu_to_be64(inode->i_size);
+	if (ret > 0) {
+		if (inode->i_size > ip->i_disksize)
+			ip->i_disksize = inode->i_size;
+		gfs2_dinode_out(ip, dibh->b_data);
 		mark_inode_dirty(inode);
 	}
 
-- 
GitLab


From 12b7ac176831df1aa58a787e67c3e5d698b30163 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 00:57:44 -0400
Subject: [PATCH 1110/2198] ext4: Rename ext4_get_blocks_wrap() to be
 ext4_get_blocks()

Another function rename for clarity's sake.  The _wrap prefix simply
confuses people, and didn't add much people trying to follow the code
paths.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c     |  3 +--
 fs/ext4/ext4.h    |  8 ++++----
 fs/ext4/extents.c |  6 +++---
 fs/ext4/inode.c   | 35 +++++++++++++++++------------------
 4 files changed, 25 insertions(+), 27 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index b64789929a65b..052d6378f9979 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
-						0, 0, 0);
+		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 89190ae671f62..5dc8368e46bc5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1617,10 +1617,10 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-			sector_t block, unsigned int max_blocks,
-			struct buffer_head *bh, int create,
-			int extend_disksize, int flag);
+extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
+			   sector_t block, unsigned int max_blocks,
+			   struct buffer_head *bh, int create,
+			   int extend_disksize, int flag);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 4fec6b7463823..7e7d02dd27390 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3151,9 +3151,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 			break;
 		}
 		map_bh.b_state = 0;
-		ret = ext4_get_blocks_wrap(handle, inode, block,
-					  max_blocks, &map_bh,
-					  EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+		ret = ext4_get_blocks(handle, inode, block,
+				      max_blocks, &map_bh,
+				      EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index f758e8021d1a9..a9a9b9b77e8ec 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1121,7 +1121,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 }
 
 /*
- * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
@@ -1142,9 +1142,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  *
  * It returns the error in case of allocation failure.
  */
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-			unsigned int max_blocks, struct buffer_head *bh,
-			int create, int extend_disksize, int flag)
+int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+		    unsigned int max_blocks, struct buffer_head *bh,
+		    int create, int extend_disksize, int flag)
 {
 	int retval;
 
@@ -1268,8 +1268,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 		started = 1;
 	}
 
-	ret = ext4_get_blocks_wrap(handle, inode, iblock,
-					max_blocks, bh_result, create, 0, 0);
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+			      create, 0, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1294,10 +1294,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	err = ext4_get_blocks_wrap(handle, inode, block, 1,
-					&dummy, create, 1, 0);
+	err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0);
 	/*
-	 * ext4_get_blocks_wrap() returns number of blocks
+	 * ext4_get_blocks() returns number of blocks
 	 * mapped. 0 in case of a HOLE.
 	 */
 	if (err > 0) {
@@ -2009,8 +2008,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
-	ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-				   bh_result, 1, 0, EXT4_DELALLOC_RSVED);
+	ret = ext4_get_blocks(handle, inode, iblock, max_blocks,
+			      bh_result, 1, 0, EXT4_DELALLOC_RSVED);
 	if (ret <= 0)
 		return ret;
 
@@ -2067,11 +2066,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
 	 * ext4_da_get_block_write(), since it calls
-	 * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag.
-	 * This flag causes ext4_get_blocks_wrap() to call
+	 * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag.
+	 * This flag causes ext4_get_blocks() to call
 	 * ext4_da_update_reserve_space() if the passed buffer head
 	 * has the BH_Delay flag set.  In the future, once we clean up
-	 * the interfaces to ext4_get_blocks_wrap(), we should pass in
+	 * the interfaces to ext4_get_blocks(), we should pass in
 	 * a separate flag which requests that the delayed allocation
 	 * statistics should be updated, instead of depending on the
 	 * state information getting passed down via the map_bh's
@@ -2363,7 +2362,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
 	if ((ret == 0) && !buffer_delay(bh_result)) {
 		/* the block isn't (pre)allocated yet, let's reserve space */
 		/*
@@ -2407,8 +2406,8 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 	 * we don't want to do block allocation in writepage
 	 * so call get_block_wrap with create = 0
 	 */
-	ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
-				   bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks,
+			      bh_result, 0, 0, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -5034,7 +5033,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
-- 
GitLab


From c21770573319922e3f3fcb331cfaa290c49f1c81 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 00:58:52 -0400
Subject: [PATCH 1111/2198] ext4: Define a new set of flags for
 ext4_get_blocks()

The functions ext4_get_blocks(), ext4_ext_get_blocks(), and
ext4_ind_get_blocks() used an ad-hoc set of integer variables used as
boolean flags passed in as arguments.  Use a single flags parameter
and a setandard set of bitfield flags instead.  This saves space on
the call stack, and it also makes the code a bit more understandable.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/dir.c     |  2 +-
 fs/ext4/ext4.h    | 22 ++++++++++++------
 fs/ext4/extents.c | 22 +++++++++---------
 fs/ext4/inode.c   | 57 +++++++++++++++++++++++++----------------------
 4 files changed, 57 insertions(+), 46 deletions(-)

diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 052d6378f9979..9dc93168e2623 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -131,7 +131,7 @@ static int ext4_readdir(struct file *filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0);
+		err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5dc8368e46bc5..17feb4ac633a1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -314,10 +314,20 @@ struct ext4_new_group_data {
 };
 
 /*
- * Following is used by preallocation code to tell get_blocks() that we
- * want uninitialzed extents.
+ * Flags used by ext4_get_blocks()
  */
-#define EXT4_CREATE_UNINITIALIZED_EXT		2
+	/* Allocate any needed blocks and/or convert an unitialized
+	   extent to be an initialized ext4 */
+#define EXT4_GET_BLOCKS_CREATE			1
+	/* Request the creation of an unitialized extent */
+#define EXT4_GET_BLOCKS_UNINIT_EXT		2
+#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
+						 EXT4_GET_BLOCKS_CREATE)
+	/* Update the ext4_inode_info i_disksize field */
+#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		4
+	/* Caller is from the delayed allocation writeout path,
+	   so the filesystem blocks have already been accounted for */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	8
 
 /*
  * ioctl commands
@@ -1610,8 +1620,7 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
 				       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			       ext4_lblk_t iblock, unsigned int max_blocks,
-			       struct buffer_head *bh_result,
-			       int create, int extend_disksize);
+			       struct buffer_head *bh_result, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
@@ -1619,8 +1628,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 			  loff_t len);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
 			   sector_t block, unsigned int max_blocks,
-			   struct buffer_head *bh, int create,
-			   int extend_disksize, int flag);
+			   struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 7e7d02dd27390..27c383c7b43c6 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2784,7 +2784,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 			ext4_lblk_t iblock,
 			unsigned int max_blocks, struct buffer_head *bh_result,
-			int create, int extend_disksize)
+			int flags)
 {
 	struct ext4_ext_path *path = NULL;
 	struct ext4_extent_header *eh;
@@ -2803,7 +2803,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	cache_type = ext4_ext_in_cache(inode, iblock, &newex);
 	if (cache_type) {
 		if (cache_type == EXT4_EXT_CACHE_GAP) {
-			if (!create) {
+			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				/*
 				 * block isn't allocated yet and
 				 * user doesn't want to allocate it
@@ -2869,9 +2869,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 							EXT4_EXT_CACHE_EXTENT);
 				goto out;
 			}
-			if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+			if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
 				goto out;
-			if (!create) {
+			if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 				if (allocated > max_blocks)
 					allocated = max_blocks;
 				/*
@@ -2903,7 +2903,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * requested block isn't allocated yet;
 	 * we couldn't try to create block if create flag is zero
 	 */
-	if (!create) {
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
 		/*
 		 * put just found gap into cache to speed up
 		 * subsequent requests
@@ -2932,10 +2932,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	 * EXT_UNINIT_MAX_LEN.
 	 */
 	if (max_blocks > EXT_INIT_MAX_LEN &&
-	    create != EXT4_CREATE_UNINITIALIZED_EXT)
+	    !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
 		max_blocks = EXT_INIT_MAX_LEN;
 	else if (max_blocks > EXT_UNINIT_MAX_LEN &&
-		 create == EXT4_CREATE_UNINITIALIZED_EXT)
+		 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
 		max_blocks = EXT_UNINIT_MAX_LEN;
 
 	/* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2966,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	/* try to insert new extent into found leaf and return */
 	ext4_ext_store_pblock(&newex, newblock);
 	newex.ee_len = cpu_to_le16(ar.len);
-	if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
+	if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
 		ext4_ext_mark_uninitialized(&newex);
 	err = ext4_ext_insert_extent(handle, inode, path, &newex);
 	if (err) {
@@ -2983,7 +2983,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-	if (extend_disksize) {
+	if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) {
 		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
@@ -2994,7 +2994,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
-	if (create != EXT4_CREATE_UNINITIALIZED_EXT)
+	if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
 		ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
 						EXT4_EXT_CACHE_EXTENT);
 out:
@@ -3153,7 +3153,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 		map_bh.b_state = 0;
 		ret = ext4_get_blocks(handle, inode, block,
 				      max_blocks, &map_bh,
-				      EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+				      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
 		if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
 			WARN_ON(ret <= 0);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a9a9b9b77e8ec..8b7564dfacdfa 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -917,7 +917,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 				  ext4_lblk_t iblock, unsigned int maxblocks,
 				  struct buffer_head *bh_result,
-				  int create, int extend_disksize)
+				  int flags)
 {
 	int err = -EIO;
 	ext4_lblk_t offsets[4];
@@ -934,7 +934,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
-	J_ASSERT(handle != NULL || create == 0);
+	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
 	depth = ext4_block_to_path(inode, iblock, offsets,
 					&blocks_to_boundary);
 
@@ -963,7 +963,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	}
 
 	/* Next simple case - plain lookup or failed read of indirect block */
-	if (!create || err == -EIO)
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
 		goto cleanup;
 
 	/*
@@ -1002,7 +1002,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	 * protect it if you're about to implement concurrent
 	 * ext4_get_block() -bzzz
 	*/
-	if (!err && extend_disksize) {
+	if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) {
 		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
 		if (disksize > i_size_read(inode))
 			disksize = i_size_read(inode);
@@ -1144,7 +1144,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  */
 int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		    unsigned int max_blocks, struct buffer_head *bh,
-		    int create, int extend_disksize, int flag)
+		    int flags)
 {
 	int retval;
 
@@ -1158,15 +1158,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, 0, 0);
+				bh, 0);
 	} else {
 		retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
-					     bh, 0, 0);
+					     bh, 0);
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
 	/* If it is only a block(s) look up */
-	if (!create)
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 		return retval;
 
 	/*
@@ -1205,7 +1205,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 * let the underlying get_block() function know to
 	 * avoid double accounting
 	 */
-	if (flag)
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		EXT4_I(inode)->i_delalloc_reserved_flag = 1;
 	/*
 	 * We need to check for EXT4 here because migrate
@@ -1213,10 +1213,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	 */
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
 		retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-				bh, create, extend_disksize);
+					      bh, flags);
 	} else {
 		retval = ext4_ind_get_blocks(handle, inode, block,
-				max_blocks, bh, create, extend_disksize);
+					     max_blocks, bh, flags);
 
 		if (retval > 0 && buffer_new(bh)) {
 			/*
@@ -1229,7 +1229,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		}
 	}
 
-	if (flag) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
 		/*
 		 * Update reserved blocks/metadata blocks
@@ -1269,7 +1269,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
 	}
 
 	ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-			      create, 0, 0);
+			      create ? EXT4_GET_BLOCKS_CREATE : 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
@@ -1288,16 +1288,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head dummy;
 	int fatal = 0, err;
+	int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE;
 
 	J_ASSERT(handle != NULL || create == 0);
 
 	dummy.b_state = 0;
 	dummy.b_blocknr = -1000;
 	buffer_trace_init(&dummy.b_history);
-	err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0);
+	if (create)
+		flags |= EXT4_GET_BLOCKS_CREATE;
+	err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
 	/*
-	 * ext4_get_blocks() returns number of blocks
-	 * mapped. 0 in case of a HOLE.
+	 * ext4_get_blocks() returns number of blocks mapped. 0 in
+	 * case of a HOLE.
 	 */
 	if (err > 0) {
 		if (err > 1)
@@ -1997,7 +2000,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-#define		EXT4_DELALLOC_RSVED	1
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result)
 {
@@ -2009,7 +2011,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 	handle = ext4_journal_current_handle();
 	BUG_ON(!handle);
 	ret = ext4_get_blocks(handle, inode, iblock, max_blocks,
-			      bh_result, 1, 0, EXT4_DELALLOC_RSVED);
+			      bh_result, EXT4_GET_BLOCKS_CREATE|
+			      EXT4_GET_BLOCKS_DELALLOC_RESERVE);
 	if (ret <= 0)
 		return ret;
 
@@ -2065,16 +2068,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		return 0;
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
-	 * ext4_da_get_block_write(), since it calls
-	 * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag.
-	 * This flag causes ext4_get_blocks() to call
+	 * ext4_da_get_block_write(), since it calls ext4_get_blocks()
+	 * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.  This flag
+	 * causes ext4_get_blocks() to call
 	 * ext4_da_update_reserve_space() if the passed buffer head
 	 * has the BH_Delay flag set.  In the future, once we clean up
-	 * the interfaces to ext4_get_blocks(), we should pass in
-	 * a separate flag which requests that the delayed allocation
+	 * the interfaces to ext4_get_blocks(), we should pass in a
+	 * separate flag which requests that the delayed allocation
 	 * statistics should be updated, instead of depending on the
 	 * state information getting passed down via the map_bh's
-	 * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag.
+	 * state bitmasks plus the magic
+	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
 	 */
 	new.b_state = mpd->b_state & (1 << BH_Delay);
 	new.b_blocknr = 0;
@@ -2362,7 +2366,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	 * preallocated blocks are unmapped but should treated
 	 * the same as allocated blocks.
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
 	if ((ret == 0) && !buffer_delay(bh_result)) {
 		/* the block isn't (pre)allocated yet, let's reserve space */
 		/*
@@ -2406,8 +2410,7 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 	 * we don't want to do block allocation in writepage
 	 * so call get_block_wrap with create = 0
 	 */
-	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks,
-			      bh_result, 0, 0, 0);
+	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
-- 
GitLab


From b920c75502cb2c48654ef196d647c8eb81ab608a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 00:54:29 -0400
Subject: [PATCH 1112/2198] ext4: Add documentation to the ext4_*get_block*
 functions

This adds more documentation to various internal functions in
fs/ext4/inode.c, most notably ext4_ind_get_blocks(),
ext4_da_get_block_write(), ext4_da_get_block_prep(),
ext4_normal_get_block_write().

In addition, the static function ext4_normal_get_block_write() has
been renamed noalloc_get_block_write(), since it is used in many
places far beyond ext4_normal_writepage().

Plenty of warnings have been added to the noalloc_get_block_write()
function, since the way it is used is amazingly fragile.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 86 +++++++++++++++++++++++++++++++------------------
 1 file changed, 55 insertions(+), 31 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8b7564dfacdfa..fd5f27a9b81bb 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -892,6 +892,10 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 }
 
 /*
+ * The ext4_ind_get_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_get_blocks().
+ *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
  * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,10 +913,11 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
  * return = 0, if plain lookup failed.
  * return < 0, error case.
  *
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
  */
 static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 				  ext4_lblk_t iblock, unsigned int maxblocks,
@@ -1152,8 +1157,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	clear_buffer_unwritten(bh);
 
 	/*
-	 * Try to see if we can get  the block without requesting
-	 * for new file system block.
+	 * Try to see if we can get the block without requesting a new
+	 * file system block.
 	 */
 	down_read((&EXT4_I(inode)->i_data_sem));
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -2000,6 +2005,12 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
+/*
+ * This function is used by mpage_da_map_blocks().  We separate it out
+ * as a separate function just to make life easier, and because
+ * mpage_da_map_blocks() used to be a generic function that took a
+ * get_block_t.
+ */
 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result)
 {
@@ -2031,8 +2042,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
 
 	/*
 	 * Update on-disk size along with block allocation we don't
-	 * use 'extend_disksize' as size may change within already
-	 * allocated block -bzzz
+	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
+	 * within already allocated block -bzzz
 	 */
 	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
 	if (disksize > i_size_read(inode))
@@ -2338,8 +2349,9 @@ static int __mpage_da_writepage(struct page *page,
 }
 
 /*
- * this is a special callback for ->write_begin() only
- * it's intention is to return mapped block or reserve space
+ * This is a special get_blocks_t callback which is used by
+ * ext4_da_write_begin().  It will either return mapped block or
+ * reserve space for a single block.
  *
  * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
  * We also have b_blocknr = -1 and b_bdev initialized properly
@@ -2347,7 +2359,6 @@ static int __mpage_da_writepage(struct page *page,
  * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
  * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
  * initialized properly.
- *
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 				  struct buffer_head *bh_result, int create)
@@ -2400,7 +2411,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 	return ret;
 }
 
-static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+/*
+ * This function is used as a standard get_block_t calback function
+ * when there is no desire to allocate any blocks.  It is used as a
+ * callback function for block_prepare_write(), nobh_writepage(), and
+ * block_write_full_page().  These functions should only try to map a
+ * single block at a time.
+ *
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
+ * any caller checks to make sure that any buffer heads are returned
+ * by this function are either all already mapped or marked for
+ * delayed allocation before calling nobh_writepage() or
+ * block_write_full_page().  Otherwise, b_blocknr could be left
+ * unitialized, and the page write functions will be taken by
+ * surprise.
+ */
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 				   struct buffer_head *bh_result, int create)
 {
 	int ret = 0;
@@ -2419,10 +2446,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 }
 
 /*
- * get called vi ext4_da_writepages after taking page lock (have journal handle)
- * get called via journal_submit_inode_data_buffers (no journal handle)
- * get called via shrink_page_list via pdflush (no journal handle)
- * or grab_page_cache when doing write_begin (have journal handle)
+ * This function can get called via...
+ *   - ext4_da_writepages after taking page lock (have journal handle)
+ *   - journal_submit_inode_data_buffers (no journal handle)
+ *   - shrink_page_list via pdflush (no journal handle)
+ *   - grab_page_cache when doing write_begin (have journal handle)
  */
 static int ext4_da_writepage(struct page *page,
 				struct writeback_control *wbc)
@@ -2473,7 +2501,7 @@ static int ext4_da_writepage(struct page *page,
 		 * do block allocation here.
 		 */
 		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-						ext4_normal_get_block_write);
+					  noalloc_get_block_write);
 		if (!ret) {
 			page_bufs = page_buffers(page);
 			/* check whether all are mapped and non delay */
@@ -2498,11 +2526,10 @@ static int ext4_da_writepage(struct page *page,
 	}
 
 	if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-		ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+		ret = nobh_writepage(page, noalloc_get_block_write, wbc);
 	else
-		ret = block_write_full_page(page,
-						ext4_normal_get_block_write,
-						wbc);
+		ret = block_write_full_page(page, noalloc_get_block_write,
+					    wbc);
 
 	return ret;
 }
@@ -2814,7 +2841,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
 	*pagep = page;
 
 	ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-							ext4_da_get_block_prep);
+				ext4_da_get_block_prep);
 	if (ret < 0) {
 		unlock_page(page);
 		ext4_journal_stop(handle);
@@ -3122,12 +3149,10 @@ static int __ext4_normal_writepage(struct page *page,
 	struct inode *inode = page->mapping->host;
 
 	if (test_opt(inode->i_sb, NOBH))
-		return nobh_writepage(page,
-					ext4_normal_get_block_write, wbc);
+		return nobh_writepage(page, noalloc_get_block_write, wbc);
 	else
-		return block_write_full_page(page,
-						ext4_normal_get_block_write,
-						wbc);
+		return block_write_full_page(page, noalloc_get_block_write,
+					     wbc);
 }
 
 static int ext4_normal_writepage(struct page *page,
@@ -3179,7 +3204,7 @@ static int __ext4_journalled_writepage(struct page *page,
 	int err;
 
 	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-					ext4_normal_get_block_write);
+				  noalloc_get_block_write);
 	if (ret != 0)
 		goto out_unlock;
 
@@ -3264,9 +3289,8 @@ static int ext4_journalled_writepage(struct page *page,
 		 * really know unless we go poke around in the buffer_heads.
 		 * But block_write_full_page will do the right thing.
 		 */
-		return block_write_full_page(page,
-						ext4_normal_get_block_write,
-						wbc);
+		return block_write_full_page(page, noalloc_get_block_write,
+					     wbc);
 	}
 no_write:
 	redirty_page_for_writepage(wbc, page);
-- 
GitLab


From a2dc52b5d1d8cc280b3e795abf1c80ac8c49f30c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 12 May 2009 13:51:29 -0400
Subject: [PATCH 1113/2198] ext4: Add BUG_ON debugging checks to
 noalloc_get_block_write()

Enforce that noalloc_get_block_write() is only called to map one block
at a time, and that it always is successful in finding a mapping for
given an inode's logical block block number if it is called with
create == 1.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index fd5f27a9b81bb..e6113c3a126f8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2433,11 +2433,14 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
 	int ret = 0;
 	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
+	BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
 	/*
 	 * we don't want to do block allocation in writepage
 	 * so call get_block_wrap with create = 0
 	 */
 	ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
+	BUG_ON(create && ret == 0);
 	if (ret > 0) {
 		bh_result->b_size = (ret << inode->i_blkbits);
 		ret = 0;
-- 
GitLab


From 7ed42a28b269f8682eefae27f5c11187eb56e63b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 12 May 2009 11:33:08 -0700
Subject: [PATCH 1114/2198] x86, boot: correct sanity checks in
 boot/compressed/misc.c

arch/x86/boot/compressed/misc.c contains several sanity checks on the
output address.  Correct constraints that are no longer correct:

- the alignment test should be MIN_KERNEL_ALIGN on both 32 and 64
  bits.
- the 64 bit maximum address was set to 2^40, which was the limit of
  one specific x86-64 implementation.  Change the test to 2^46, the
  current Linux limit, and at least try to test the end rather than
  the beginning.
- for non-relocatable kernels, test against LOAD_PHYSICAL_ADDR on both
  32 and 64 bits.

[ Impact: fix potential boot failure due to invalid tests ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/compressed/misc.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index e45be73684ffe..842b2a36174a2 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -325,20 +325,18 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
+	if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+		error("Destination address inappropriately aligned");
 #ifdef CONFIG_X86_64
-	if ((unsigned long)output & (__KERNEL_ALIGN - 1))
-		error("Destination address not 2M aligned");
-	if ((unsigned long)output >= 0xffffffffffUL)
+	if (heap > 0x3fffffffffffUL)
 		error("Destination address too large");
 #else
-	if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
-		error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
 	if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
 		error("Destination address too large");
+#endif
 #ifndef CONFIG_RELOCATABLE
-	if ((u32)output != LOAD_PHYSICAL_ADDR)
+	if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
 		error("Wrong destination address");
-#endif
 #endif
 
 	if (!quiet)
-- 
GitLab


From c4f68236e41641494f9c8a418ccc0678c335bbb5 Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Tue, 12 May 2009 11:37:34 -0700
Subject: [PATCH 1115/2198] x86-64: align __PHYSICAL_START, remove
 __KERNEL_ALIGN

Handle the misconfiguration where CONFIG_PHYSICAL_START is
incompatible with CONFIG_PHYSICAL_ALIGN.  This is a configuration
error, but one which arises easily since Kconfig doesn't have the
smarts to express the true relationship between these two variables.
Hence, align __PHYSICAL_START the same way we align LOAD_PHYSICAL_ADDR
in <asm/boot.h>.

For non-relocatable kernels, this would cause the boot to fail.

[ Impact: fix boot failures for non-relocatable kernels ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/include/asm/page_64_types.h | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index d38c91b702483..e11900f2500e3 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -32,17 +32,9 @@
  */
 #define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
 
-#define __PHYSICAL_START	CONFIG_PHYSICAL_START
-#define __KERNEL_ALIGN		0x200000
-
-/*
- * Make sure kernel is aligned to 2MB address. Catching it at compile
- * time is better. Change your config file and compile the kernel
- * for a 2MB aligned address (CONFIG_PHYSICAL_START)
- */
-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
-#endif
+#define __PHYSICAL_START	((CONFIG_PHYSICAL_START +	 	\
+				  (CONFIG_PHYSICAL_ALIGN - 1)) &	\
+				 ~(CONFIG_PHYSICAL_ALIGN - 1))
 
 #define __START_KERNEL		(__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
-- 
GitLab


From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 08:12:51 +0200
Subject: [PATCH 1116/2198] perf_counter: fix print debug irq disable

inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (sysrq_key_table_lock){?.....},

Don't unconditionally enable interrupts in the perf_counter_print_debug()
path.

[ Impact: fix potential deadlock pointed out by lockdep ]

LKML-Reference: <new-submission>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index da27419923a8b..f7772ff7936ed 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -621,12 +621,13 @@ void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
 	struct cpu_hw_counters *cpuc;
+	unsigned long flags;
 	int cpu, idx;
 
 	if (!x86_pmu.num_counters)
 		return;
 
-	local_irq_disable();
+	local_irq_save(flags);
 
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
@@ -664,7 +665,7 @@ void perf_counter_print_debug(void)
 		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
 			cpu, idx, pmc_count);
 	}
-	local_irq_enable();
+	local_irq_restore(flags);
 }
 
 static void x86_pmu_disable(struct perf_counter *counter)
-- 
GitLab


From 4f23122858a27ba97444b9b37a066d83edebd4c8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 13 May 2009 08:35:35 +0200
Subject: [PATCH 1117/2198] splice: fix repeated kmap()'s in
 default_file_splice_read()

We cannot reliably map more than one page at the time, or we risk
deadlocking. Just allocate the pages from low mem instead.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index eefd96b1d7fbf..c5e3c79b95a8a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -580,13 +580,13 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) {
 		struct page *page;
 
-		page = alloc_page(GFP_HIGHUSER);
+		page = alloc_page(GFP_USER);
 		error = -ENOMEM;
 		if (!page)
 			goto err;
 
 		this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset);
-		vec[i].iov_base = (void __user *) kmap(page);
+		vec[i].iov_base = (void __user *) page_address(page);
 		vec[i].iov_len = this_len;
 		pages[i] = page;
 		spd.nr_pages++;
@@ -604,7 +604,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 
 	nr_freed = 0;
 	for (i = 0; i < spd.nr_pages; i++) {
-		kunmap(pages[i]);
 		this_len = min_t(size_t, vec[i].iov_len, res);
 		partial[i].offset = 0;
 		partial[i].len = this_len;
@@ -624,10 +623,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	return res;
 
 err:
-	for (i = 0; i < spd.nr_pages; i++) {
-		kunmap(pages[i]);
+	for (i = 0; i < spd.nr_pages; i++)
 		__free_page(pages[i]);
-	}
+
 	return error;
 }
 EXPORT_SYMBOL(default_file_splice_read);
-- 
GitLab


From af777ce42d3d51cdef353ce296d6f99dc503feef Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 16:59:40 +0900
Subject: [PATCH 1118/2198] sh: clkfwk: module_clk -> peripheral_clk rename.

For consistenct naming, and to allow us to fix up some confusion in the
SH-Mobile clock framework, amongst other places.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock.c             |  6 +++---
 arch/sh/kernel/cpu/sh2/setup-sh7619.c  |  4 ++--
 arch/sh/kernel/cpu/sh2a/setup-mxg.c    |  6 +++---
 arch/sh/kernel/cpu/sh2a/setup-sh7201.c |  6 +++---
 arch/sh/kernel/cpu/sh2a/setup-sh7203.c |  8 ++++----
 arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 10 +++++-----
 arch/sh/kernel/cpu/sh3/setup-sh7705.c  |  6 +++---
 arch/sh/kernel/cpu/sh3/setup-sh770x.c  |  6 +++---
 arch/sh/kernel/cpu/sh3/setup-sh7710.c  |  6 +++---
 arch/sh/kernel/cpu/sh3/setup-sh7720.c  | 16 ++++++++--------
 arch/sh/kernel/cpu/sh4/setup-sh4-202.c |  6 +++---
 arch/sh/kernel/cpu/sh4/setup-sh7750.c  | 10 +++++-----
 arch/sh/kernel/cpu/sh4/setup-sh7760.c  |  6 +++---
 arch/sh/kernel/cpu/sh4a/setup-sh7763.c | 12 ++++++------
 arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 18 +++++++++---------
 arch/sh/kernel/cpu/sh4a/setup-sh7780.c | 12 ++++++------
 arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 12 ++++++------
 arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 24 ++++++++++++------------
 arch/sh/kernel/cpu/sh4a/setup-shx3.c   | 12 ++++++------
 arch/sh/kernel/cpu/sh5/setup-sh5.c     |  6 +++---
 drivers/i2c/busses/i2c-sh7760.c        |  2 +-
 drivers/serial/sh-sci.c                |  2 +-
 sound/oss/sh_dac_audio.c               | 10 +++++-----
 23 files changed, 103 insertions(+), 103 deletions(-)

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 61ff227561dce..0eedf93926471 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -50,8 +50,8 @@ static struct clk master_clk = {
 	.rate		= CONFIG_SH_PCLK_FREQ,
 };
 
-static struct clk module_clk = {
-	.name		= "module_clk",
+static struct clk peripheral_clk = {
+	.name		= "peripheral_clk",
 	.parent		= &master_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 };
@@ -73,7 +73,7 @@ static struct clk cpu_clk = {
  */
 static struct clk *onchip_clocks[] = {
 	&master_clk,
-	&module_clk,
+	&peripheral_clk,
 	&bus_clk,
 	&cpu_clk,
 };
diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
index 94ac27fc2237d..13798733f2db9 100644
--- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c
+++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c
@@ -115,7 +115,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -147,7 +147,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
index a452d9649069e..869c2da4820b3 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c
@@ -118,7 +118,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -149,7 +149,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -180,7 +180,7 @@ static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
index 772358b7685ed..d8febe1280662 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c
@@ -255,7 +255,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -286,7 +286,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -317,7 +317,7 @@ static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
index d7493418ba60b..62e3039d2398a 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c
@@ -211,7 +211,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -243,7 +243,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -275,7 +275,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -306,7 +306,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
index 2fc6bff5c5fb1..3e6f3d7a58be9 100644
--- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
+++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c
@@ -171,7 +171,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -203,7 +203,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x08,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 0, /* disabled due to code generation issues */
 };
@@ -235,7 +235,7 @@ static struct sh_timer_config mtu2_0_platform_data = {
 	.name = "MTU2_0",
 	.channel_offset = -0x80,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -266,7 +266,7 @@ static struct sh_timer_config mtu2_1_platform_data = {
 	.name = "MTU2_1",
 	.channel_offset = -0x100,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -297,7 +297,7 @@ static struct sh_timer_config mtu2_2_platform_data = {
 	.name = "MTU2_2",
 	.channel_offset = 0x80,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
index 39513664d5d73..88f742fed9edf 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c
@@ -121,7 +121,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -152,7 +152,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -183,7 +183,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
index 9412d915b84ed..c56306798584f 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c
@@ -149,7 +149,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -180,7 +180,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -211,7 +211,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
index 07ff38d055a76..efa76c8148f4e 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c
@@ -125,7 +125,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -156,7 +156,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -187,7 +187,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
index d8b46f5dff607..5b2107798edbe 100644
--- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c
+++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c
@@ -128,7 +128,7 @@ static struct sh_timer_config cmt0_platform_data = {
 	.name = "CMT0",
 	.channel_offset = 0x10,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 125,
 	.clocksource_rating = 125,
 };
@@ -160,7 +160,7 @@ static struct sh_timer_config cmt1_platform_data = {
 	.name = "CMT1",
 	.channel_offset = 0x20,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt1_resources[] = {
@@ -190,7 +190,7 @@ static struct sh_timer_config cmt2_platform_data = {
 	.name = "CMT2",
 	.channel_offset = 0x30,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt2_resources[] = {
@@ -220,7 +220,7 @@ static struct sh_timer_config cmt3_platform_data = {
 	.name = "CMT3",
 	.channel_offset = 0x40,
 	.timer_bit = 3,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt3_resources[] = {
@@ -250,7 +250,7 @@ static struct sh_timer_config cmt4_platform_data = {
 	.name = "CMT4",
 	.channel_offset = 0x50,
 	.timer_bit = 4,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource cmt4_resources[] = {
@@ -280,7 +280,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x02,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -311,7 +311,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0xe,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -342,7 +342,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1a,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
index be79fa136255f..6d088d1235917 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c
@@ -38,7 +38,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -69,7 +69,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -100,7 +100,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
index 09da0c187d4cd..851672d15cf4a 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c
@@ -65,7 +65,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -96,7 +96,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -127,7 +127,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -162,7 +162,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -192,7 +192,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
index cd097335758f1..5b822519bd90c 100644
--- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c
+++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c
@@ -164,7 +164,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -195,7 +195,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -226,7 +226,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
index c91f34c9aa83a..f1e0c0d36da74 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c
@@ -118,7 +118,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -149,7 +149,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -180,7 +180,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -210,7 +210,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -240,7 +240,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -270,7 +270,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index b61d6143aaaa9..12fb8752d4ff7 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -81,7 +81,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -112,7 +112,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -143,7 +143,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -173,7 +173,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -203,7 +203,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -233,7 +233,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
@@ -263,7 +263,7 @@ static struct sh_timer_config tmu6_platform_data = {
 	.name = "TMU6",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu6_resources[] = {
@@ -293,7 +293,7 @@ static struct sh_timer_config tmu7_platform_data = {
 	.name = "TMU7",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu7_resources[] = {
@@ -323,7 +323,7 @@ static struct sh_timer_config tmu8_platform_data = {
 	.name = "TMU8",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu8_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
index f1df020950629..715e05b431e5e 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c
@@ -18,7 +18,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -49,7 +49,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -80,7 +80,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -110,7 +110,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -140,7 +140,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -170,7 +170,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index dc5d3e507a219..d7e77bc77e288 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -20,7 +20,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -51,7 +51,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -82,7 +82,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -112,7 +112,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -142,7 +142,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -172,7 +172,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
index 2c464bf5a8990..93e0d2c017e82 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c
@@ -75,7 +75,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -106,7 +106,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -137,7 +137,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -167,7 +167,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -197,7 +197,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -227,7 +227,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
@@ -257,7 +257,7 @@ static struct sh_timer_config tmu6_platform_data = {
 	.name = "TMU6",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu6_resources[] = {
@@ -287,7 +287,7 @@ static struct sh_timer_config tmu7_platform_data = {
 	.name = "TMU7",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu7_resources[] = {
@@ -317,7 +317,7 @@ static struct sh_timer_config tmu8_platform_data = {
 	.name = "TMU8",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu8_resources[] = {
@@ -347,7 +347,7 @@ static struct sh_timer_config tmu9_platform_data = {
 	.name = "TMU9",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu9_resources[] = {
@@ -377,7 +377,7 @@ static struct sh_timer_config tmu10_platform_data = {
 	.name = "TMU10",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu10_resources[] = {
@@ -407,7 +407,7 @@ static struct sh_timer_config tmu11_platform_data = {
 	.name = "TMU11",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu11_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
index 9d5185b42f13d..53c65fd9ccef7 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c
@@ -53,7 +53,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -84,7 +84,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -115,7 +115,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
@@ -145,7 +145,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu3_resources[] = {
@@ -175,7 +175,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu4_resources[] = {
@@ -205,7 +205,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu5_resources[] = {
diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c
index 678d69bdebbae..f5ff1ac57fc28 100644
--- a/arch/sh/kernel/cpu/sh5/setup-sh5.c
+++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c
@@ -75,7 +75,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clockevent_rating = 200,
 };
 
@@ -106,7 +106,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 	.clocksource_rating = 200,
 };
 
@@ -137,7 +137,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "module_clk",
+	.clk = "peripheral_clk",
 };
 
 static struct resource tmu2_resources[] = {
diff --git a/drivers/i2c/busses/i2c-sh7760.c b/drivers/i2c/busses/i2c-sh7760.c
index baa28b73ae423..b9680f50f541b 100644
--- a/drivers/i2c/busses/i2c-sh7760.c
+++ b/drivers/i2c/busses/i2c-sh7760.c
@@ -396,7 +396,7 @@ static int __devinit calc_CCR(unsigned long scl_hz)
 	signed char cdf, cdfm;
 	int scgd, scgdm, scgds;
 
-	mclk = clk_get(NULL, "module_clk");
+	mclk = clk_get(NULL, "peripheral_clk");
 	if (IS_ERR(mclk)) {
 		return PTR_ERR(mclk);
 	} else {
diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index 4e3248d586020..fa4d52a6c032a 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -1084,7 +1084,7 @@ static void __devinit sci_init_single(struct platform_device *dev,
 	sci_port->port.uartclk	= CONFIG_CPU_CLOCK;
 #elif defined(CONFIG_HAVE_CLK)
 	sci_port->iclk		= p->clk ? clk_get(&dev->dev, p->clk) : NULL;
-	sci_port->dclk		= clk_get(&dev->dev, "module_clk");
+	sci_port->dclk		= clk_get(&dev->dev, "peripheral_clk");
 	sci_port->enable	= sci_clk_enable;
 	sci_port->disable	= sci_clk_disable;
 #else
diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c
index 78cfb66e4c59c..6fa3140720bcf 100644
--- a/sound/oss/sh_dac_audio.c
+++ b/sound/oss/sh_dac_audio.c
@@ -95,18 +95,18 @@ static void dac_audio_stop(void)
 		outw(v, HD64461_GPADR);
 	}
 
- 	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
+	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 	sh_dac_disable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 }
 
 static void dac_audio_set_rate(void)
 {
 	unsigned long interval;
- 	struct clk *clk;
+	struct clk *clk;
 
- 	clk = clk_get(NULL, "module_clk");
- 	interval = (clk_get_rate(clk) / 4) / rate;
- 	clk_put(clk);
+	clk = clk_get(NULL, "peripheral_clk");
+	interval = (clk_get_rate(clk) / 4) / rate;
+	clk_put(clk);
 	ctrl_outl(interval, TMU1_TCOR);
 	ctrl_outl(interval, TMU1_TCNT);
 }
-- 
GitLab


From d672fef02738582bdeae6e77176e141eeb9169bc Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:03:09 +0900
Subject: [PATCH 1119/2198] sh: clkfwk: Handle NULL clkops for root clocks.

root clocks may simply be placeholders for rate and ancestry information,
and have no real associated operations of their own. Account for this,
so we are still able to use these sorts of clocks for rate propagation.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 0eedf93926471..2ced20f870d1c 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -103,7 +103,7 @@ void propagate_rate(struct clk *tclk)
 	struct clk *clkp;
 
 	list_for_each_entry(clkp, &tclk->children, sibling) {
-		if (clkp->ops->recalc)
+		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
 		propagate_rate(clkp);
 	}
@@ -196,7 +196,7 @@ void recalculate_root_clocks(void)
 	struct clk *clkp;
 
 	list_for_each_entry(clkp, &root_clks, sibling) {
-		if (clkp->ops->recalc)
+		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
 		propagate_rate(clkp);
 	}
@@ -224,7 +224,7 @@ int clk_register(struct clk *clk)
 		list_add(&clk->sibling, &root_clks);
 
 	list_add(&clk->node, &clock_list);
-	if (clk->ops->init)
+	if (clk->ops && clk->ops->init)
 		clk->ops->init(clk);
 	mutex_unlock(&clock_list_sem);
 
-- 
GitLab


From 100890c55e326a9acb4429593c5ad2012c194564 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:05:51 +0900
Subject: [PATCH 1120/2198] sh: clkfwk: Provide a generic clk_set_rate_ex()
 path for root clocks.

In the case of root clocks (such as clkin oscillators, extal, etc.), the
rate information is entirely platform dependent and needs to be lazily
set and propagated from the platform code. This provides a method for
establishing the rate update on these types of clocks that define no
set_rate() op of their own.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 2ced20f870d1c..0a7755cc8a25b 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -265,20 +265,27 @@ EXPORT_SYMBOL_GPL(clk_set_rate);
 int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id)
 {
 	int ret = -EOPNOTSUPP;
+	unsigned long flags;
 
-	if (likely(clk->ops && clk->ops->set_rate)) {
-		unsigned long flags;
+	spin_lock_irqsave(&clock_lock, flags);
 
-		spin_lock_irqsave(&clock_lock, flags);
+	if (likely(clk->ops && clk->ops->set_rate)) {
 		ret = clk->ops->set_rate(clk, rate, algo_id);
-		if (ret == 0) {
-			if (clk->ops->recalc)
-				clk->rate = clk->ops->recalc(clk);
-			propagate_rate(clk);
-		}
-		spin_unlock_irqrestore(&clock_lock, flags);
+		if (ret != 0)
+			goto out_unlock;
+	} else {
+		clk->rate = rate;
+		ret = 0;
 	}
 
+	if (clk->ops && clk->ops->recalc)
+		clk->rate = clk->ops->recalc(clk);
+
+	propagate_rate(clk);
+
+out_unlock:
+	spin_unlock_irqrestore(&clock_lock, flags);
+
 	return ret;
 }
 EXPORT_SYMBOL_GPL(clk_set_rate_ex);
-- 
GitLab


From 253b0887b3736160feac9ccdcf146a2073e41463 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:38:11 +0900
Subject: [PATCH 1121/2198] sh: clkfwk: Rework legacy CPG clock handling.

This moves out the old legacy CPG clocks to their own file, and converts
over the existing users. With these clocks going away and each CPU
dealing with them on their own, CPUs can gradually move over to the new
interface.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |  3 ++
 arch/sh/include/asm/clock.h            |  5 +-
 arch/sh/include/asm/machvec.h          |  2 +
 arch/sh/kernel/cpu/Makefile            |  1 +
 arch/sh/kernel/cpu/clock-cpg.c         | 60 ++++++++++++++++++++++
 arch/sh/kernel/cpu/clock.c             | 70 ++++++--------------------
 arch/sh/kernel/cpu/sh4/clock-sh4-202.c |  5 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |  2 +
 arch/sh/kernel/cpu/sh4a/clock-sh7763.c |  5 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7780.c |  5 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c |  5 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7786.c |  5 +-
 arch/sh/kernel/cpu/sh4a/clock-shx3.c   |  5 +-
 13 files changed, 110 insertions(+), 63 deletions(-)
 create mode 100644 arch/sh/kernel/cpu/clock-cpg.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index fb75c2d1928df..d7990cd2f8d4d 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -513,6 +513,9 @@ config SH_PCLK_FREQ
 	  This is necessary for determining the reference clock value on
 	  platforms lacking an RTC.
 
+config SH_CLK_CPG_LEGACY
+	def_bool y
+
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
 	depends on CPU_SH2
diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index e2f5bf1b4a9c6..c27e844db0de3 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -47,7 +47,7 @@ struct clk_lookup {
 #define CLK_ENABLE_ON_INIT	(1 << 0)
 
 /* Should be defined by processor-specific code */
-void arch_init_clk_ops(struct clk_ops **, int type);
+void __deprecated arch_init_clk_ops(struct clk_ops **, int type);
 int __init arch_clk_init(void);
 
 /* arch/sh/kernel/cpu/clock.c */
@@ -59,6 +59,9 @@ int clk_reparent(struct clk *child, struct clk *parent);
 int clk_register(struct clk *);
 void clk_unregister(struct clk *);
 
+/* arch/sh/kernel/cpu/clock-cpg.c */
+int __init __deprecated cpg_clk_init(void);
+
 /* the exported API, in addition to clk_set_rate */
 /**
  * clk_set_rate_ex - set the clock rate for a clock source, with additional parameter
diff --git a/arch/sh/include/asm/machvec.h b/arch/sh/include/asm/machvec.h
index 64b1c16a0f035..73d6d16fa06b3 100644
--- a/arch/sh/include/asm/machvec.h
+++ b/arch/sh/include/asm/machvec.h
@@ -46,6 +46,8 @@ struct sh_machine_vector {
 
 	void __iomem *(*mv_ioport_map)(unsigned long port, unsigned int size);
 	void (*mv_ioport_unmap)(void __iomem *);
+
+	int (*mv_clk_init)(void);
 };
 
 extern struct sh_machine_vector sh_mv;
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index 2600641a483f7..05105b980c21b 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -17,5 +17,6 @@ obj-$(CONFIG_ARCH_SHMOBILE)	+= shmobile/
 
 obj-$(CONFIG_UBC_WAKEUP)	+= ubc.o
 obj-$(CONFIG_SH_ADC)		+= adc.o
+obj-$(CONFIG_SH_CLK_CPG_LEGACY)	+= clock-cpg.o
 
 obj-y	+= irq/ init.o clock.o
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
new file mode 100644
index 0000000000000..b4ca20048446e
--- /dev/null
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -0,0 +1,60 @@
+#include <linux/clk.h>
+#include <linux/compiler.h>
+#include <asm/clock.h>
+
+static struct clk master_clk = {
+	.name		= "master_clk",
+	.flags		= CLK_ENABLE_ON_INIT,
+	.rate		= CONFIG_SH_PCLK_FREQ,
+};
+
+static struct clk peripheral_clk = {
+	.name		= "peripheral_clk",
+	.parent		= &master_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static struct clk bus_clk = {
+	.name		= "bus_clk",
+	.parent		= &master_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static struct clk cpu_clk = {
+	.name		= "cpu_clk",
+	.parent		= &master_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+/*
+ * The ordering of these clocks matters, do not change it.
+ */
+static struct clk *onchip_clocks[] = {
+	&master_clk,
+	&peripheral_clk,
+	&bus_clk,
+	&cpu_clk,
+};
+
+int __init __deprecated cpg_clk_init(void)
+{
+	int i, ret = 0;
+
+	for (i = 0; i < ARRAY_SIZE(onchip_clocks); i++) {
+		struct clk *clk = onchip_clocks[i];
+		arch_init_clk_ops(&clk->ops, i);
+		if (clk->ops)
+			ret |= clk_register(clk);
+	}
+
+	return ret;
+}
+
+/*
+ * Placeholder for compatability, until the lazy CPUs do this
+ * on their own.
+ */
+int __init __weak arch_clk_init(void)
+{
+	return cpg_clk_init();
+}
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 0a7755cc8a25b..033f4662b59d9 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -30,54 +30,12 @@
 #include <linux/platform_device.h>
 #include <linux/proc_fs.h>
 #include <asm/clock.h>
+#include <asm/machvec.h>
 
 static LIST_HEAD(clock_list);
 static DEFINE_SPINLOCK(clock_lock);
 static DEFINE_MUTEX(clock_list_sem);
 
-/*
- * Each subtype is expected to define the init routines for these clocks,
- * as each subtype (or processor family) will have these clocks at the
- * very least. These are all provided through the CPG, which even some of
- * the more quirky parts (such as ST40, SH4-202, etc.) still have.
- *
- * The processor-specific code is expected to register any additional
- * clock sources that are of interest.
- */
-static struct clk master_clk = {
-	.name		= "master_clk",
-	.flags		= CLK_ENABLE_ON_INIT,
-	.rate		= CONFIG_SH_PCLK_FREQ,
-};
-
-static struct clk peripheral_clk = {
-	.name		= "peripheral_clk",
-	.parent		= &master_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-};
-
-static struct clk bus_clk = {
-	.name		= "bus_clk",
-	.parent		= &master_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-};
-
-static struct clk cpu_clk = {
-	.name		= "cpu_clk",
-	.parent		= &master_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-};
-
-/*
- * The ordering of these clocks matters, do not change it.
- */
-static struct clk *onchip_clocks[] = {
-	&master_clk,
-	&peripheral_clk,
-	&bus_clk,
-	&cpu_clk,
-};
-
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
@@ -443,10 +401,6 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_put);
 
-int __init __weak arch_clk_init(void)
-{
-	return 0;
-}
 
 static int show_clocks(char *buf, char **start, off_t off,
 		       int len, int *eof, void *data)
@@ -533,18 +487,22 @@ subsys_initcall(clk_sysdev_init);
 
 int __init clk_init(void)
 {
-	int i, ret = 0;
-
-	BUG_ON(!master_clk.rate);
-
-	for (i = 0; i < ARRAY_SIZE(onchip_clocks); i++) {
-		struct clk *clk = onchip_clocks[i];
+	int ret;
 
-		arch_init_clk_ops(&clk->ops, i);
-		ret |= clk_register(clk);
+	ret = arch_clk_init();
+	if (unlikely(ret)) {
+		pr_err("%s: CPU clock registration failed.\n", __func__);
+		return ret;
 	}
 
-	ret |= arch_clk_init();
+	if (sh_mv.mv_clk_init) {
+		ret = sh_mv.mv_clk_init();
+		if (unlikely(ret)) {
+			pr_err("%s: machvec clock initialization failed.\n",
+			       __func__);
+			return ret;
+		}
+	}
 
 	/* Kick the child clocks.. */
 	recalculate_root_clocks();
diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
index a72dc326d0b33..21421e34e7d5b 100644
--- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
+++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c
@@ -152,9 +152,12 @@ static struct clk *sh4202_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh4202_onchip_clocks); i++) {
 		struct clk *clkp = sh4202_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 426065b432694..a98d7da67b97b 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -892,6 +892,8 @@ int __init arch_clk_init(void)
 	struct clk *clk;
 	int i;
 
+	clk_cpg_init();
+
 	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) {
 		pr_debug( "Registering clock '%s'\n", sh7722_clocks[i]->name);
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
index ce3d4e6319a31..370cd47642efc 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c
@@ -92,9 +92,12 @@ static struct clk *sh7763_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7763_onchip_clocks); i++) {
 		struct clk *clkp = sh7763_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
index 38b8b1ddb283b..a249d823578ea 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c
@@ -98,9 +98,12 @@ static struct clk *sh7780_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7780_onchip_clocks); i++) {
 		struct clk *clkp = sh7780_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index fa14f35bc116d..bf5a0dacf8e57 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -136,9 +136,12 @@ static struct clk *sh7785_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) {
 		struct clk *clkp = sh7785_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
index 7907002fa1df0..a0e8869071ace 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c
@@ -122,9 +122,12 @@ static struct clk *sh7786_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7786_onchip_clocks); i++) {
 		struct clk *clkp = sh7786_onchip_clocks[i];
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
index c14553e3e13f7..23c27d32d9823 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c
@@ -109,9 +109,12 @@ static struct clk *shx3_onchip_clocks[] = {
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk = clk_get(NULL, "master_clk");
+	struct clk *clk;
 	int i, ret = 0;
 
+	cpg_clk_init();
+
+	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(shx3_onchip_clocks); i++) {
 		struct clk *clkp = shx3_onchip_clocks[i];
 
-- 
GitLab


From a77b5ac0ea8e47c77008d3a9a9976dcfbc01c42a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 17:55:00 +0900
Subject: [PATCH 1122/2198] sh: clkfwk: Update SH7785 for refactored clock
 framework.

This updates the SH7785 CPU code as well as the SH7785LCR board support
code for making use of the newly refactored clock framework. Support for
the legacy CPG clocks is dropped at this point, with the extal frequency
fed in from the board code.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   2 +-
 arch/sh/boards/board-sh7785lcr.c       |  22 ++-
 arch/sh/include/asm/clock.h            |   2 +
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 203 +++++++++++++------------
 4 files changed, 125 insertions(+), 104 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d7990cd2f8d4d..df764c56b0508 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -514,7 +514,7 @@ config SH_PCLK_FREQ
 	  platforms lacking an RTC.
 
 config SH_CLK_CPG_LEGACY
-	def_bool y
+	def_bool y if !CPU_SUBTYPE_SH7785
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c
index 6f94f17adc467..33b194b0454d5 100644
--- a/arch/sh/boards/board-sh7785lcr.c
+++ b/arch/sh/boards/board-sh7785lcr.c
@@ -2,12 +2,12 @@
  * Renesas Technology Corp. R0P7785LC0011RL Support.
  *
  * Copyright (C) 2008  Yoshihiro Shimoda
+ * Copyright (C) 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
  * for more details.
  */
-
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/sm501.h>
@@ -19,8 +19,11 @@
 #include <linux/i2c-pca-platform.h>
 #include <linux/i2c-algo-pca.h>
 #include <linux/irq.h>
-#include <asm/heartbeat.h>
+#include <linux/clk.h>
+#include <linux/errno.h>
 #include <mach/sh7785lcr.h>
+#include <asm/heartbeat.h>
+#include <asm/clock.h>
 
 /*
  * NOTE: This board has 2 physical memory maps.
@@ -273,6 +276,20 @@ void __init init_sh7785lcr_IRQ(void)
 	plat_irq_setup_pins(IRQ_MODE_IRQ3210);
 }
 
+static int sh7785lcr_clk_init(void)
+{
+	struct clk *clk;
+	int ret;
+
+	clk = clk_get(NULL, "extal");
+	if (!clk || IS_ERR(clk))
+		return PTR_ERR(clk);
+	ret = clk_set_rate(clk, 33333333);
+	clk_put(clk);
+
+	return ret;
+}
+
 static void sh7785lcr_power_off(void)
 {
 	unsigned char *p;
@@ -309,6 +326,7 @@ static void __init sh7785lcr_setup(char **cmdline_p)
 static struct sh_machine_vector mv_sh7785lcr __initmv = {
 	.mv_name		= "SH7785LCR",
 	.mv_setup		= sh7785lcr_setup,
+	.mv_clk_init		= sh7785lcr_clk_init,
 	.mv_init_irq		= init_sh7785lcr_IRQ,
 };
 
diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index c27e844db0de3..40cf3c07d7e62 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -34,7 +34,9 @@ struct clk {
 
 	unsigned long		rate;
 	unsigned long		flags;
+
 	unsigned long		arch_flags;
+	void			*priv;
 };
 
 struct clk_lookup {
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index bf5a0dacf8e57..87584dc819260 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -3,7 +3,7 @@
  *
  * SH7785 support for the clock framework
  *
- *  Copyright (C) 2007  Paul Mundt
+ *  Copyright (C) 2007 - 2009  Paul Mundt
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License.  See the file "COPYING" in the main directory of this archive
@@ -11,145 +11,146 @@
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/clk.h>
+#include <linux/io.h>
 #include <asm/clock.h>
 #include <asm/freq.h>
-#include <asm/io.h>
-
-static int ifc_divisors[] = { 1, 2, 4, 6 };
-static int ufc_divisors[] = { 1, 1, 4, 6 };
-static int sfc_divisors[] = { 1, 1, 4, 6 };
-static int bfc_divisors[] = { 1, 1, 1, 1, 1, 12, 16, 18,
-			     24, 32, 36, 48, 1, 1, 1, 1 };
-static int mfc_divisors[] = { 1, 1, 4, 6 };
-static int pfc_divisors[] = { 1, 1, 1, 1, 1, 1, 1, 18,
-			      24, 32, 36, 48, 1, 1, 1, 1 };
-
-static void master_clk_init(struct clk *clk)
-{
-	clk->rate *= pfc_divisors[ctrl_inl(FRQMR1) & 0x000f];
-}
 
-static struct clk_ops sh7785_master_clk_ops = {
-	.init		= master_clk_init,
+static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
+			       24, 32, 36, 48 };
+struct clk_priv {
+	unsigned int shift;
 };
 
-static unsigned long module_clk_recalc(struct clk *clk)
-{
-	int idx = (ctrl_inl(FRQMR1) & 0x000f);
-	return clk->parent->rate / pfc_divisors[idx];
-}
+#define FRQMR_CLK_DATA(_name, _shift)	\
+static struct clk_priv _name##_data = {	.shift = _shift, }
 
-static struct clk_ops sh7785_module_clk_ops = {
-	.recalc		= module_clk_recalc,
-};
+FRQMR_CLK_DATA(pfc,  0);
+FRQMR_CLK_DATA(s3fc, 4);
+FRQMR_CLK_DATA(s2fc, 8);
+FRQMR_CLK_DATA(mfc, 12);
+FRQMR_CLK_DATA(bfc, 16);
+FRQMR_CLK_DATA(sfc, 20);
+FRQMR_CLK_DATA(ufc, 24);
+FRQMR_CLK_DATA(ifc, 28);
 
-static unsigned long bus_clk_recalc(struct clk *clk)
+static unsigned long frqmr_clk_recalc(struct clk *clk)
 {
-	int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f);
-	return clk->parent->rate / bfc_divisors[idx];
-}
+	struct clk_priv *data = clk->priv;
+	unsigned int idx;
 
-static struct clk_ops sh7785_bus_clk_ops = {
-	.recalc		= bus_clk_recalc,
-};
+	idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
 
-static unsigned long cpu_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003);
-	return clk->parent->rate / ifc_divisors[idx];
+	/*
+	 * XXX: PLL1 multiplier is locked for the default clock mode,
+	 * when mode pin detection and configuration support is added,
+	 * select the multiplier dynamically.
+	 */
+	return clk->parent->rate * 36 / div2[idx];
 }
 
-static struct clk_ops sh7785_cpu_clk_ops = {
-	.recalc		= cpu_clk_recalc,
+static struct clk_ops frqmr_clk_ops = {
+	.recalc		= frqmr_clk_recalc,
 };
 
-static struct clk_ops *sh7785_clk_ops[] = {
-	&sh7785_master_clk_ops,
-	&sh7785_module_clk_ops,
-	&sh7785_bus_clk_ops,
-	&sh7785_cpu_clk_ops,
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+static struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
 };
 
-void __init arch_init_clk_ops(struct clk_ops **ops, int idx)
-{
-	if (idx < ARRAY_SIZE(sh7785_clk_ops))
-		*ops = sh7785_clk_ops[idx];
-}
-
-static unsigned long shyway_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003);
-	return clk->parent->rate / sfc_divisors[idx];
-}
-
-static struct clk_ops sh7785_shyway_clk_ops = {
-	.recalc		= shyway_clk_recalc,
+static struct clk cpu_clk = {
+	.name		= "cpu_clk",		/* Ick */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+	.priv		= &ifc_data,
 };
 
-static struct clk sh7785_shyway_clk = {
-	.name		= "shyway_clk",
+static struct clk shyway_clk = {
+	.name		= "shyway_clk",		/* SHck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
-	.ops		= &sh7785_shyway_clk_ops,
+	.priv		= &sfc_data,
 };
 
-static unsigned long ddr_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003);
-	return clk->parent->rate / mfc_divisors[idx];
-}
+static struct clk peripheral_clk = {
+	.name		= "peripheral_clk",	/* Pck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+	.priv		= &pfc_data,
+};
 
-static struct clk_ops sh7785_ddr_clk_ops = {
-	.recalc		= ddr_clk_recalc,
+static struct clk ddr_clk = {
+	.name		= "ddr_clk",		/* DDRck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+	.priv		= &mfc_data,
 };
 
-static struct clk sh7785_ddr_clk = {
-	.name		= "ddr_clk",
+static struct clk bus_clk = {
+	.name		= "bus_clk",		/* Bck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
-	.ops		= &sh7785_ddr_clk_ops,
+	.priv		= &bfc_data,
 };
 
-static unsigned long ram_clk_recalc(struct clk *clk)
-{
-	int idx = ((ctrl_inl(FRQMR1) >> 24) & 0x0003);
-	return clk->parent->rate / ufc_divisors[idx];
-}
+static struct clk ga_clk = {
+	.name		= "ga_clk",		/* GAck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.priv		= &s2fc_data,
+};
 
-static struct clk_ops sh7785_ram_clk_ops = {
-	.recalc		= ram_clk_recalc,
+static struct clk du_clk = {
+	.name		= "du_clk",		/* DUck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
+	.priv		= &s3fc_data,
 };
 
-static struct clk sh7785_ram_clk = {
-	.name		= "ram_clk",
+static struct clk umem_clk = {
+	.name		= "umem_clk",		/* uck */
+	.id		= -1,
+	.ops		= &frqmr_clk_ops,
+	.parent		= &extal_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
-	.ops		= &sh7785_ram_clk_ops,
+	.priv		= &ufc_data,
 };
 
-/*
- * Additional SH7785-specific on-chip clocks that aren't already part of the
- * clock framework
- */
-static struct clk *sh7785_onchip_clocks[] = {
-	&sh7785_shyway_clk,
-	&sh7785_ddr_clk,
-	&sh7785_ram_clk,
+static struct clk *clks[] = {
+	&extal_clk,
+	&cpu_clk,
+	&shyway_clk,
+	&peripheral_clk,
+	&ddr_clk,
+	&bus_clk,
+	&ga_clk,
+	&du_clk,
+	&umem_clk,
 };
 
 int __init arch_clk_init(void)
 {
-	struct clk *clk;
 	int i, ret = 0;
 
-	cpg_clk_init();
-
-	clk = clk_get(NULL, "master_clk");
-	for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) {
-		struct clk *clkp = sh7785_onchip_clocks[i];
-
-		clkp->parent = clk;
-		ret |= clk_register(clkp);
-	}
-
-	clk_put(clk);
+	for (i = 0; i < ARRAY_SIZE(clks); i++)
+		ret |= clk_register(clks[i]);
 
 	return ret;
 }
-- 
GitLab


From a1c0643ff9f360a30644f6e3cd643ca2a5083aea Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 13 May 2009 10:56:52 +0100
Subject: [PATCH 1123/2198] GFS2: Move journal live test at transaction start

There seems little point grabbing the transaction glock
only to have to release it again if the journal isn't
live. This moves the test earlier to avoid grabbing the lock
when we don't need it in the first place.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/trans.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 053752d4b27f0..4ef0e9fa3549b 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	BUG_ON(current->journal_info);
 	BUG_ON(blocks == 0 && revokes == 0);
 
+	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))
+		return -EROFS;
+
 	tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS);
 	if (!tr)
 		return -ENOMEM;
@@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	if (error)
 		goto fail_holder_uninit;
 
-	if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		tr->tr_t_gh.gh_flags |= GL_NOCACHE;
-		error = -EROFS;
-		goto fail_gunlock;
-	}
-
 	error = gfs2_log_reserve(sdp, tr->tr_reserved);
 	if (error)
 		goto fail_gunlock;
-- 
GitLab


From cc96eace48fdf0f8925a74c6c1f7ffa512e458d2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 20:28:15 +0900
Subject: [PATCH 1124/2198] sh: clkfwk: rate table construction and rounding
 for SH7785.

This adds support for constructing a rate table by looking at potential
divisors for a specified clock. Each FQRMR clock is given its own table.
Presently each table is rebuilt when the parent propagates down a new
rate, so some more logic needs to be added to do this more intelligently.

Additionally, a fairly generic round_rate() implementation is then
layered on top of it, which subsequently provides us with cpufreq support.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |   1 +
 arch/sh/kernel/cpu/clock.c             |   3 +
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 109 ++++++++++++++++++++++---
 3 files changed, 100 insertions(+), 13 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 40cf3c07d7e62..da681de1500ae 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -16,6 +16,7 @@ struct clk_ops {
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
 	int (*set_parent)(struct clk *clk, struct clk *parent);
 	long (*round_rate)(struct clk *clk, unsigned long rate);
+	void (*build_rate_table)(struct clk *clk);
 };
 
 struct clk {
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 033f4662b59d9..56c6e11fa83be 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -63,6 +63,9 @@ void propagate_rate(struct clk *tclk)
 	list_for_each_entry(clkp, &tclk->children, sibling) {
 		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
+		if (clkp->ops && clkp->ops->build_rate_table)
+			clkp->ops->build_rate_table(clkp);
+
 		propagate_rate(clkp);
 	}
 }
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 87584dc819260..b7a32dd1b2dbe 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -13,28 +13,43 @@
 #include <linux/kernel.h>
 #include <linux/clk.h>
 #include <linux/io.h>
+#include <linux/cpufreq.h>
 #include <asm/clock.h>
 #include <asm/freq.h>
 
 static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
 			       24, 32, 36, 48 };
 struct clk_priv {
-	unsigned int shift;
+	unsigned int			shift;
+
+	/* allowable divisor bitmap */
+	unsigned long			div_bitmap;
+
+	/* Supportable frequencies + termination entry */
+	struct cpufreq_frequency_table	freq_table[ARRAY_SIZE(div2)+1];
 };
 
-#define FRQMR_CLK_DATA(_name, _shift)	\
-static struct clk_priv _name##_data = {	.shift = _shift, }
+#define FRQMR_CLK_DATA(_name, _shift, _div_bitmap)	\
+static struct clk_priv _name##_data = {			\
+	.shift		= _shift,			\
+	.div_bitmap	= _div_bitmap,			\
+							\
+	.freq_table[0]	= {				\
+		.index = 0,				\
+		.frequency = CPUFREQ_TABLE_END,		\
+	},						\
+}
 
-FRQMR_CLK_DATA(pfc,  0);
-FRQMR_CLK_DATA(s3fc, 4);
-FRQMR_CLK_DATA(s2fc, 8);
-FRQMR_CLK_DATA(mfc, 12);
-FRQMR_CLK_DATA(bfc, 16);
-FRQMR_CLK_DATA(sfc, 20);
-FRQMR_CLK_DATA(ufc, 24);
-FRQMR_CLK_DATA(ifc, 28);
+FRQMR_CLK_DATA(pfc,  0, 0x0f80);
+FRQMR_CLK_DATA(s3fc, 4, 0x0ff0);
+FRQMR_CLK_DATA(s2fc, 8, 0x0030);
+FRQMR_CLK_DATA(mfc, 12, 0x000c);
+FRQMR_CLK_DATA(bfc, 16, 0x0fe0);
+FRQMR_CLK_DATA(sfc, 20, 0x000c);
+FRQMR_CLK_DATA(ufc, 24, 0x000c);
+FRQMR_CLK_DATA(ifc, 28, 0x000e);
 
-static unsigned long frqmr_clk_recalc(struct clk *clk)
+static unsigned long frqmr_recalc(struct clk *clk)
 {
 	struct clk_priv *data = clk->priv;
 	unsigned int idx;
@@ -49,8 +64,76 @@ static unsigned long frqmr_clk_recalc(struct clk *clk)
 	return clk->parent->rate * 36 / div2[idx];
 }
 
+static void frqmr_build_rate_table(struct clk *clk)
+{
+	struct clk_priv *data = clk->priv;
+	int i, entry;
+
+	for (i = entry = 0; i < ARRAY_SIZE(div2); i++) {
+		if ((data->div_bitmap & (1 << i)) == 0)
+			continue;
+
+		data->freq_table[entry].index = entry;
+		data->freq_table[entry].frequency =
+			clk->parent->rate * 36 / div2[i];
+
+		entry++;
+	}
+
+	if (entry == 0) {
+		pr_warning("clkfwk: failed to build frequency table "
+			   "for \"%s\" clk!\n", clk->name);
+		return;
+	}
+
+	/* Termination entry */
+	data->freq_table[entry].index = entry;
+	data->freq_table[entry].frequency = CPUFREQ_TABLE_END;
+}
+
+static long frqmr_round_rate(struct clk *clk, unsigned long rate)
+{
+	struct clk_priv *data = clk->priv;
+	unsigned long rate_error, rate_error_prev = ~0UL;
+	unsigned long rate_best_fit = rate;
+	unsigned long highest, lowest;
+	int i;
+
+	highest = lowest = 0;
+
+	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		unsigned long freq = data->freq_table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		if (freq > highest)
+			highest = freq;
+		if (freq < lowest)
+			lowest = freq;
+
+		rate_error = abs(freq - rate);
+		if (rate_error < rate_error_prev) {
+			rate_best_fit = freq;
+			rate_error_prev = rate_error;
+		}
+
+		if (rate_error == 0)
+			break;
+	}
+
+	if (rate >= highest)
+		rate_best_fit = highest;
+	if (rate <= lowest)
+		rate_best_fit = lowest;
+
+	return rate_best_fit;
+}
+
 static struct clk_ops frqmr_clk_ops = {
-	.recalc		= frqmr_clk_recalc,
+	.recalc			= frqmr_recalc,
+	.build_rate_table	= frqmr_build_rate_table,
+	.round_rate		= frqmr_round_rate,
 };
 
 /*
-- 
GitLab


From cedcf3366f2191885aff92d33d6078ef08203e52 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 13 May 2009 21:51:28 +0900
Subject: [PATCH 1125/2198] sh: clkfwk: Map tree hierarchy in debugfs.

This adopts the OMAP clock framework debugfs bits and replaces the aging
procfs bits. The procfs clocks entry was primarily a debugging aid, and
used to be tied in to cpuinfo before the clock list grew too unweildly.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h |   1 +
 arch/sh/kernel/cpu/clock.c  | 110 +++++++++++++++++++++++++++---------
 2 files changed, 85 insertions(+), 26 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index da681de1500ae..c499d470b8c92 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -38,6 +38,7 @@ struct clk {
 
 	unsigned long		arch_flags;
 	void			*priv;
+	struct dentry		*dentry;
 };
 
 struct clk_lookup {
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 56c6e11fa83be..686477f8ae5bb 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -28,7 +28,7 @@
 #include <linux/seq_file.h>
 #include <linux/err.h>
 #include <linux/platform_device.h>
-#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include <asm/clock.h>
 #include <asm/machvec.h>
 
@@ -404,24 +404,6 @@ void clk_put(struct clk *clk)
 }
 EXPORT_SYMBOL_GPL(clk_put);
 
-
-static int show_clocks(char *buf, char **start, off_t off,
-		       int len, int *eof, void *data)
-{
-	struct clk *clk;
-	char *p = buf;
-
-	list_for_each_entry_reverse(clk, &clock_list, node) {
-		unsigned long rate = clk_get_rate(clk);
-
-		p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name,
-			     rate / 1000000, (rate % 1000000) / 10000,
-			      (clk->usecount > 0) ?  "enabled" : "disabled");
-	}
-
-	return p - buf;
-}
-
 #ifdef CONFIG_PM
 static int clks_sysdev_suspend(struct sys_device *dev, pm_message_t state)
 {
@@ -516,14 +498,90 @@ int __init clk_init(void)
 	return ret;
 }
 
-static int __init clk_proc_init(void)
+/*
+ *	debugfs support to trace clock tree hierarchy and attributes
+ */
+static struct dentry *clk_debugfs_root;
+
+static int clk_debugfs_register_one(struct clk *c)
 {
-	struct proc_dir_entry *p;
-	p = create_proc_read_entry("clocks", S_IRUSR, NULL,
-				   show_clocks, NULL);
-	if (unlikely(!p))
-		return -EINVAL;
+	int err;
+	struct dentry *d, *child;
+	struct clk *pa = c->parent;
+	char s[255];
+	char *p = s;
+
+	p += sprintf(p, "%s", c->name);
+	if (c->id > 0)
+		sprintf(p, ":%d", c->id);
+	d = debugfs_create_dir(s, pa ? pa->dentry : clk_debugfs_root);
+	if (!d)
+		return -ENOMEM;
+	c->dentry = d;
+
+	d = debugfs_create_u8("usecount", S_IRUGO, c->dentry, (u8 *)&c->usecount);
+	if (!d) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	d = debugfs_create_u32("rate", S_IRUGO, c->dentry, (u32 *)&c->rate);
+	if (!d) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	d = debugfs_create_x32("flags", S_IRUGO, c->dentry, (u32 *)&c->flags);
+	if (!d) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	return 0;
+
+err_out:
+	d = c->dentry;
+	list_for_each_entry(child, &d->d_subdirs, d_u.d_child)
+		debugfs_remove(child);
+	debugfs_remove(c->dentry);
+	return err;
+}
+
+static int clk_debugfs_register(struct clk *c)
+{
+	int err;
+	struct clk *pa = c->parent;
+
+	if (pa && !pa->dentry) {
+		err = clk_debugfs_register(pa);
+		if (err)
+			return err;
+	}
 
+	if (!c->dentry) {
+		err = clk_debugfs_register_one(c);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int __init clk_debugfs_init(void)
+{
+	struct clk *c;
+	struct dentry *d;
+	int err;
+
+	d = debugfs_create_dir("clock", NULL);
+	if (!d)
+		return -ENOMEM;
+	clk_debugfs_root = d;
+
+	list_for_each_entry(c, &clock_list, node) {
+		err = clk_debugfs_register(c);
+		if (err)
+			goto err_out;
+	}
 	return 0;
+err_out:
+	debugfs_remove(clk_debugfs_root); /* REVISIT: Cleanup correctly */
+	return err;
 }
-subsys_initcall(clk_proc_init);
+late_initcall(clk_debugfs_init);
-- 
GitLab


From 48c2b613616235d7c97fda5982f50100a6c79166 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 13 May 2009 14:49:48 +0100
Subject: [PATCH 1126/2198] GFS2: Add commit= mount option

It has always been possible to adjust the gfs2 log commit
interval, but only from the sysfs interface. This adds a
mount option, commit=<nn>, which will be familar to ext3
users.

The sysfs interface continues to be available as well, although
this might be removed in the future.

Also this patch cleans up some duplicated structures in the GFS2
sysfs code.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h     |  1 +
 fs/gfs2/mount.c      | 10 +++++
 fs/gfs2/ops_fstype.c |  4 +-
 fs/gfs2/ops_super.c  | 13 ++++++-
 fs/gfs2/sys.c        | 92 ++++++++++++++++----------------------------
 5 files changed, 60 insertions(+), 60 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 399d1b9780497..65f438e9537a0 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -418,6 +418,7 @@ struct gfs2_args {
 	unsigned int ar_data:2;			/* ordered/writeback */
 	unsigned int ar_meta:1;			/* mount metafs */
 	unsigned int ar_discard:1;		/* discard requests */
+	int ar_commit;				/* Commit interval */
 };
 
 struct gfs2_tune {
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
index f7e8527a21e07..947af151fa244 100644
--- a/fs/gfs2/mount.c
+++ b/fs/gfs2/mount.c
@@ -45,6 +45,7 @@ enum {
 	Opt_meta,
 	Opt_discard,
 	Opt_nodiscard,
+	Opt_commit,
 	Opt_err,
 };
 
@@ -73,6 +74,7 @@ static const match_table_t tokens = {
 	{Opt_meta, "meta"},
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
+	{Opt_commit, "commit=%d"},
 	{Opt_err, NULL}
 };
 
@@ -89,6 +91,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 	char *o;
 	int token;
 	substring_t tmp[MAX_OPT_ARGS];
+	int rv;
 
 	/* Split the options into tokens with the "," character and
 	   process them */
@@ -173,6 +176,13 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
 		case Opt_nodiscard:
 			args->ar_discard = 0;
 			break;
+		case Opt_commit:
+			rv = match_int(&tmp[0], &args->ar_commit);
+			if (rv || args->ar_commit <= 0) {
+				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
 		case Opt_err:
 		default:
 			fs_info(sdp, "invalid mount option: %s\n", o);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea7531..7981fbc9fc3be 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -55,7 +55,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	spin_lock_init(&gt->gt_spin);
 
 	gt->gt_incore_log_blocks = 1024;
-	gt->gt_log_flush_secs = 60;
 	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
@@ -1165,6 +1164,7 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 
 	sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT;
 	sdp->sd_args.ar_data = GFS2_DATA_DEFAULT;
+	sdp->sd_args.ar_commit = 60;
 
 	error = gfs2_mount_args(sdp, &sdp->sd_args, data);
 	if (error) {
@@ -1191,6 +1191,8 @@ static int fill_super(struct super_block *sb, void *data, int silent)
                                GFS2_BASIC_BLOCK_SHIFT;
 	sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift;
 
+	sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit;
+
 	error = init_names(sdp, silent);
 	if (error)
 		goto fail;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 458019569dcb4..0677a83785600 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -436,8 +436,12 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
+	struct gfs2_tune *gt = &sdp->sd_tune;
 	int error;
 
+	spin_lock(&gt->gt_spin);
+	args.ar_commit = gt->gt_log_flush_secs;
+	spin_unlock(&gt->gt_spin);
 	error = gfs2_mount_args(sdp, &args, data);
 	if (error)
 		return error;
@@ -473,6 +477,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
 		sb->s_flags |= MS_POSIXACL;
 	else
 		sb->s_flags &= ~MS_POSIXACL;
+	spin_lock(&gt->gt_spin);
+	gt->gt_log_flush_secs = args.ar_commit;
+	spin_unlock(&gt->gt_spin);
+
 	return 0;
 }
 
@@ -550,6 +558,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 {
 	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
 	struct gfs2_args *args = &sdp->sd_args;
+	int lfsecs;
 
 	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
 		seq_printf(s, ",meta");
@@ -610,7 +619,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
 	}
 	if (args->ar_discard)
 		seq_printf(s, ",discard");
-
+	lfsecs = sdp->sd_tune.gt_log_flush_secs;
+	if (lfsecs != 60)
+		seq_printf(s, ",commit=%d", lfsecs);
 	return 0;
 }
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 7655f5025fec4..d53b22edc9806 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -26,6 +26,36 @@
 #include "util.h"
 #include "glops.h"
 
+struct gfs2_attr {
+	struct attribute attr;
+	ssize_t (*show)(struct gfs2_sbd *, char *);
+	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
+};
+
+static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
+			      char *buf)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->show ? a->show(sdp, buf) : 0;
+}
+
+static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
+			       const char *buf, size_t len)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
+	return a->store ? a->store(sdp, buf, len) : len;
+}
+
+static struct sysfs_ops gfs2_attr_ops = {
+	.show  = gfs2_attr_show,
+	.store = gfs2_attr_store,
+};
+
+
+static struct kset *gfs2_kset;
+
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 	return len;
 }
 
-struct gfs2_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
 
 #define GFS2_ATTR(name, mode, show, store) \
 static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store)
@@ -246,49 +271,21 @@ static struct attribute *gfs2_attrs[] = {
 	NULL,
 };
 
-static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr,
-			      char *buf)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->show ? a->show(sdp, buf) : 0;
-}
-
-static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr,
-			       const char *buf, size_t len)
-{
-	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
-	struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr);
-	return a->store ? a->store(sdp, buf, len) : len;
-}
-
-static struct sysfs_ops gfs2_attr_ops = {
-	.show  = gfs2_attr_show,
-	.store = gfs2_attr_store,
-};
-
 static struct kobj_type gfs2_ktype = {
 	.default_attrs = gfs2_attrs,
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
-static struct kset *gfs2_kset;
-
 /*
  * display struct lm_lockstruct fields
  */
 
-struct lockstruct_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
 #define LOCKSTRUCT_ATTR(name, fmt)                                          \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
 {                                                                           \
 	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
 }                                                                           \
-static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name)
+static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name)
 
 LOCKSTRUCT_ATTR(jid,      "%u\n");
 LOCKSTRUCT_ATTR(first,    "%u\n");
@@ -401,14 +398,8 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
 
-struct gdlm_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *sdp, char *);
-	ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t);
-};
-
 #define GDLM_ATTR(_name,_mode,_show,_store) \
-static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
+static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
 GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
 GDLM_ATTR(block,          0644, block_show,          block_store);
@@ -434,21 +425,12 @@ static struct attribute *lock_module_attrs[] = {
 	NULL,
 };
 
-/*
- * display struct gfs2_args fields
- */
-
-struct args_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-};
-
 #define ARGS_ATTR(name, fmt)                                                \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
 {                                                                           \
 	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
 }                                                                           \
-static struct args_attr args_attr_##name = __ATTR_RO(name)
+static struct gfs2_attr args_attr_##name = __ATTR_RO(name)
 
 ARGS_ATTR(lockproto,       "%s\n");
 ARGS_ATTR(locktable,       "%s\n");
@@ -531,14 +513,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field,
 	return len;
 }
 
-struct tune_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct gfs2_sbd *, char *);
-	ssize_t (*store)(struct gfs2_sbd *, const char *, size_t);
-};
-
 #define TUNE_ATTR_3(name, show, store)                                        \
-static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store)
+static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store)
 
 #define TUNE_ATTR_2(name, store)                                              \
 static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                   \
-- 
GitLab


From 9582d41135c0d362f04ed6bf3dc8d693a7eafee2 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 13 May 2009 15:06:25 +0100
Subject: [PATCH 1127/2198] GFS2: Remove a couple of unused sysfs entries

These two tunables are pointless and would never need to be
changed anyway. There is also a race between them and umount
as the deamons which they refer to might have gone away. The
easiest way to fix the race is to remove the interface.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index d53b22edc9806..894bf773ec930 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -530,15 +530,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
 }                                                                             \
 TUNE_ATTR_2(name, name##_store)
 
-#define TUNE_ATTR_DAEMON(name, process)                                       \
-static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\
-{                                                                             \
-	ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len);      \
-	wake_up_process(sdp->sd_##process);                                   \
-	return r;                                                             \
-}                                                                             \
-TUNE_ATTR_2(name, name##_store)
-
 TUNE_ATTR(incore_log_blocks, 0);
 TUNE_ATTR(log_flush_secs, 0);
 TUNE_ATTR(quota_warn_period, 0);
@@ -550,8 +541,6 @@ TUNE_ATTR(new_files_jdata, 0);
 TUNE_ATTR(quota_simul_sync, 1);
 TUNE_ATTR(stall_secs, 1);
 TUNE_ATTR(statfs_quantum, 1);
-TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process);
-TUNE_ATTR_DAEMON(logd_secs, logd_process);
 TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store);
 
 static struct attribute *tune_attrs[] = {
@@ -565,8 +554,6 @@ static struct attribute *tune_attrs[] = {
 	&tune_attr_quota_simul_sync.attr,
 	&tune_attr_stall_secs.attr,
 	&tune_attr_statfs_quantum.attr,
-	&tune_attr_recoverd_secs.attr,
-	&tune_attr_logd_secs.attr,
 	&tune_attr_quota_scale.attr,
 	&tune_attr_new_files_jdata.attr,
 	NULL,
-- 
GitLab


From b103387037cea2ba0f04b44d408d54c53f678061 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 13 May 2009 12:50:40 -0400
Subject: [PATCH 1128/2198] TPM: get_event_name stack corruption

get_event_name uses sprintf to fill a buffer declared on the stack.  It fills
the buffer 2 bytes at a time.  What the code doesn't take into account is that
sprintf(buf, "%02x", data) actually writes 3 bytes.  2 bytes for the data and
then it nul terminates the string.  Since we declare buf to be 40 characters
long and then we write 40 bytes of data into buf sprintf is going to write 41
characters.  The fix is to leave room in buf for the nul terminator.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 drivers/char/tpm/tpm_bios.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c
index ed306eb1057fd..0c2f55a38b954 100644
--- a/drivers/char/tpm/tpm_bios.c
+++ b/drivers/char/tpm/tpm_bios.c
@@ -212,7 +212,8 @@ static int get_event_name(char *dest, struct tcpa_event *event,
 			unsigned char * event_entry)
 {
 	const char *name = "";
-	char data[40] = "";
+	/* 41 so there is room for 40 data and 1 nul */
+	char data[41] = "";
 	int i, n_len = 0, d_len = 0;
 	struct tcpa_pc_event *pc_event;
 
-- 
GitLab


From 77f6bf57ba9d2c50173536dbfdacdab27cb867ca Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 14 May 2009 09:49:44 +0200
Subject: [PATCH 1129/2198] splice: fix error return code

fs/splice.c: In function 'default_file_splice_read':
fs/splice.c:566: warning: 'error' may be used uninitialized in this function

which is sort-of true.  The code will in fact return -ENOMEM instead of the
kernel_readv() return value.

Cc: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/splice.c b/fs/splice.c
index c5e3c79b95a8a..41179c0a655bb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -595,8 +595,10 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos,
 	}
 
 	res = kernel_readv(in, vec, spd.nr_pages, *ppos);
-	if (res < 0)
+	if (res < 0) {
+		error = res;
 		goto err;
+	}
 
 	error = 0;
 	if (!res)
-- 
GitLab


From 549b5e358d17a8c04953ed80896ce07d37722451 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 14 May 2009 17:38:46 +0900
Subject: [PATCH 1130/2198] sh: clkfwk: Add MSTP bits to SH7785 clock
 framework.

This plugs in all of the MSTP functions in to the clock framework,
and hands them off to the platform devices that want them.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h            |  3 ++
 arch/sh/kernel/cpu/clock.c             |  4 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 63 ++++++++++++++++++++++++++
 arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 18 +++++---
 4 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index c499d470b8c92..64c93cb3d685a 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -36,6 +36,9 @@ struct clk {
 	unsigned long		rate;
 	unsigned long		flags;
 
+	void __iomem		*enable_reg;
+	unsigned int		enable_bit;
+
 	unsigned long		arch_flags;
 	void			*priv;
 	struct dentry		*dentry;
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 686477f8ae5bb..012d23476a72c 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -39,7 +39,7 @@ static DEFINE_MUTEX(clock_list_sem);
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
-	return clk->parent->rate;
+	return clk->parent ? clk->parent->rate : 0;
 }
 
 int clk_reparent(struct clk *child, struct clk *parent)
@@ -512,7 +512,7 @@ static int clk_debugfs_register_one(struct clk *c)
 	char *p = s;
 
 	p += sprintf(p, "%s", c->name);
-	if (c->id > 0)
+	if (c->id >= 0)
 		sprintf(p, ":%d", c->id);
 	d = debugfs_create_dir(s, pa ? pa->dentry : clk_debugfs_root);
 	if (!d)
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index b7a32dd1b2dbe..cf042b53b3aeb 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -228,12 +228,75 @@ static struct clk *clks[] = {
 	&umem_clk,
 };
 
+static int mstpcr_clk_enable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit),
+		     clk->enable_reg);
+	return 0;
+}
+
+static void mstpcr_clk_disable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit),
+		     clk->enable_reg);
+}
+
+static struct clk_ops mstpcr_clk_ops = {
+	.enable		= mstpcr_clk_enable,
+	.disable	= mstpcr_clk_disable,
+	.recalc		= followparent_recalc,
+};
+
+#define MSTPCR0		0xffc80030
+#define MSTPCR1		0xffc80034
+
+#define CLK(_name, _id, _parent, _enable_reg,		\
+	    _enable_bit, _flags)			\
+{							\
+	.name		= _name,			\
+	.id		= _id,				\
+	.parent		= _parent,			\
+	.enable_reg	= (void __iomem *)_enable_reg,	\
+	.enable_bit	= _enable_bit,			\
+	.flags		= _flags,			\
+	.ops		= &mstpcr_clk_ops,		\
+}
+
+static struct clk mstpcr_clks[] = {
+	/* MSTPCR0 */
+	CLK("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
+	CLK("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
+	CLK("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
+	CLK("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
+	CLK("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
+	CLK("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
+	CLK("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
+	CLK("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
+	CLK("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
+	CLK("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
+	CLK("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
+	CLK("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
+	CLK("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
+	CLK("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
+	CLK("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
+	CLK("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
+
+	/* MSTPCR1 */
+	CLK("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
+	CLK("ubc_fck", -1, NULL, MSTPCR1, 17, 0),
+	CLK("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0),
+	CLK("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0),
+	CLK("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
+};
+
 int __init arch_clk_init(void)
 {
 	int i, ret = 0;
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
+	for (i = 0; i < ARRAY_SIZE(mstpcr_clks); i++)
+		ret |= clk_register(&mstpcr_clks[i]);
 
 	return ret;
 }
diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
index d7e77bc77e288..af561402570bd 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c
@@ -20,7 +20,7 @@ static struct sh_timer_config tmu0_platform_data = {
 	.name = "TMU0",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
+	.clk = "tmu012_fck",
 	.clockevent_rating = 200,
 };
 
@@ -51,7 +51,7 @@ static struct sh_timer_config tmu1_platform_data = {
 	.name = "TMU1",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
+	.clk = "tmu012_fck",
 	.clocksource_rating = 200,
 };
 
@@ -82,7 +82,7 @@ static struct sh_timer_config tmu2_platform_data = {
 	.name = "TMU2",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
+	.clk = "tmu012_fck",
 };
 
 static struct resource tmu2_resources[] = {
@@ -112,7 +112,7 @@ static struct sh_timer_config tmu3_platform_data = {
 	.name = "TMU3",
 	.channel_offset = 0x04,
 	.timer_bit = 0,
-	.clk = "peripheral_clk",
+	.clk = "tmu345_fck",
 };
 
 static struct resource tmu3_resources[] = {
@@ -142,7 +142,7 @@ static struct sh_timer_config tmu4_platform_data = {
 	.name = "TMU4",
 	.channel_offset = 0x10,
 	.timer_bit = 1,
-	.clk = "peripheral_clk",
+	.clk = "tmu345_fck",
 };
 
 static struct resource tmu4_resources[] = {
@@ -172,7 +172,7 @@ static struct sh_timer_config tmu5_platform_data = {
 	.name = "TMU5",
 	.channel_offset = 0x1c,
 	.timer_bit = 2,
-	.clk = "peripheral_clk",
+	.clk = "tmu345_fck",
 };
 
 static struct resource tmu5_resources[] = {
@@ -204,31 +204,37 @@ static struct plat_sci_port sci_platform_data[] = {
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 40, 40, 40, 40 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffeb0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 44, 44, 44, 44 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffec0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 60, 60, 60, 60 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffed0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 61, 61, 61, 61 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffee0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 62, 62, 62, 62 },
+		.clk		= "scif_fck",
 	}, {
 		.mapbase	= 0xffef0000,
 		.flags		= UPF_BOOT_AUTOCONF,
 		.type		= PORT_SCIF,
 		.irqs		= { 63, 63, 63, 63 },
+		.clk		= "scif_fck",
 	}, {
 		.flags = 0,
 	}
-- 
GitLab


From ad3256e361663923342bff7f292dd289f794aa33 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 14 May 2009 17:40:08 +0900
Subject: [PATCH 1131/2198] sh: Provide FORCE_MAX_ZONEORDER.

Several platforms want to be able to do large physically contiguous
allocations (primarily nommu and video codecs on SH-Mobile), provide a
MAX_ORDER override for those cases.

Tested-by: Conrad Parker <conrad@metadecks.org>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/mm/Kconfig | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig
index b900d2cd18f7f..2795618e4f072 100644
--- a/arch/sh/mm/Kconfig
+++ b/arch/sh/mm/Kconfig
@@ -21,6 +21,29 @@ config PAGE_OFFSET
 	default "0x20000000" if MMU && SUPERH64
 	default "0x00000000"
 
+config FORCE_MAX_ZONEORDER
+	int "Maximum zone order"
+	range 9 64 if PAGE_SIZE_16KB
+	default "9" if PAGE_SIZE_16KB
+	range 7 64 if PAGE_SIZE_64KB
+	default "7" if PAGE_SIZE_64KB
+	range 11 64
+	default "14" if !MMU
+	default "11"
+	help
+	  The kernel memory allocator divides physically contiguous memory
+	  blocks into "zones", where each zone is a power of two number of
+	  pages.  This option selects the largest power of two that the kernel
+	  keeps in the memory allocator.  If you need to allocate very large
+	  blocks of physically contiguous memory, then you may need to
+	  increase this value.
+
+	  This config option is actually maximum order plus one. For example,
+	  a value of 11 means that the largest free memory block is 2^10 pages.
+
+	  The page size is not necessarily 4KB. Keep this in mind when
+	  choosing a value for this option.
+
 config MEMORY_START
 	hex "Physical memory start address"
 	default "0x08000000"
-- 
GitLab


From 9304d0ccf1922b9b954b6e9223d1b2bc83198ad2 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 14 May 2009 07:53:39 +0000
Subject: [PATCH 1132/2198] sh: intc tables for sh7770

This patch adds INTC tables for sh7770, thanks
goes to Paul for the first prototype version.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 246 +++++++++++++++++++++++++
 1 file changed, 246 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
index b61d6143aaaa9..0feba41d218de 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c
@@ -12,6 +12,7 @@
 #include <linux/serial.h>
 #include <linux/serial_sci.h>
 #include <linux/sh_timer.h>
+#include <linux/io.h>
 
 static struct plat_sci_port sci_platform_data[] = {
 	{
@@ -387,6 +388,251 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh7770_early_devices));
 }
 
+enum {
+	UNUSED = 0,
+
+	/* interrupt sources */
+	IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	IRL_HHLL, IRL_HHLH, IRL_HHHL,
+
+	IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5,
+
+	GPIO,
+	TMU0, TMU1, TMU2, TMU2_TICPI,
+	TMU3, TMU4, TMU5, TMU5_TICPI,
+	TMU6, TMU7, TMU8,
+	HAC, IPI, SPDIF, HUDI, I2C,
+	DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2,
+	I2S0, I2S1, I2S2, I2S3,
+	SRC_RX, SRC_TX, SRC_SPDIF,
+	DU, VIDEO_IN, REMOTE, YUV, USB, ATAPI, CAN, GPS, GFX2D,
+	GFX3D_MBX, GFX3D_DMAC,
+	EXBUS_ATA,
+	SPI0, SPI1,
+	SCIF089, SCIF1234, SCIF567,
+	ADC,
+	BBDMAC_0_3, BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14,
+	BBDMAC_15_18, BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27,
+	BBDMAC_28, BBDMAC_29, BBDMAC_30, BBDMAC_31,
+
+	/* interrupt groups */
+	TMU, DMAC, I2S, SRC, GFX3D, SPI, SCIF, BBDMAC,
+};
+
+static struct intc_vect vectors[] __initdata = {
+	INTC_VECT(GPIO, 0x3e0),
+	INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420),
+	INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2_TICPI, 0x460),
+	INTC_VECT(TMU3, 0x480), INTC_VECT(TMU4, 0x4a0),
+	INTC_VECT(TMU5, 0x4c0), INTC_VECT(TMU5_TICPI, 0x4e0),
+	INTC_VECT(TMU6, 0x500), INTC_VECT(TMU7, 0x520),
+	INTC_VECT(TMU8, 0x540),
+	INTC_VECT(HAC, 0x580), INTC_VECT(IPI, 0x5c0),
+	INTC_VECT(SPDIF, 0x5e0),
+	INTC_VECT(HUDI, 0x600), INTC_VECT(I2C, 0x620),
+	INTC_VECT(DMAC0_DMINT0, 0x640), INTC_VECT(DMAC0_DMINT1, 0x660),
+	INTC_VECT(DMAC0_DMINT2, 0x680),
+	INTC_VECT(I2S0, 0x6a0), INTC_VECT(I2S1, 0x6c0),
+	INTC_VECT(I2S2, 0x6e0), INTC_VECT(I2S3, 0x700),
+	INTC_VECT(SRC_RX, 0x720), INTC_VECT(SRC_TX, 0x740),
+	INTC_VECT(SRC_SPDIF, 0x760),
+	INTC_VECT(DU, 0x780), INTC_VECT(VIDEO_IN, 0x7a0),
+	INTC_VECT(REMOTE, 0x7c0), INTC_VECT(YUV, 0x7e0),
+	INTC_VECT(USB, 0x840), INTC_VECT(ATAPI, 0x860),
+	INTC_VECT(CAN, 0x880), INTC_VECT(GPS, 0x8a0),
+	INTC_VECT(GFX2D, 0x8c0),
+	INTC_VECT(GFX3D_MBX, 0x900), INTC_VECT(GFX3D_DMAC, 0x920),
+	INTC_VECT(EXBUS_ATA, 0x940),
+	INTC_VECT(SPI0, 0x960), INTC_VECT(SPI1, 0x980),
+	INTC_VECT(SCIF089, 0x9a0), INTC_VECT(SCIF1234, 0x9c0),
+	INTC_VECT(SCIF1234, 0x9e0), INTC_VECT(SCIF1234, 0xa00),
+	INTC_VECT(SCIF1234, 0xa20), INTC_VECT(SCIF567, 0xa40),
+	INTC_VECT(SCIF567, 0xa60), INTC_VECT(SCIF567, 0xa80),
+	INTC_VECT(SCIF089, 0xaa0), INTC_VECT(SCIF089, 0xac0),
+	INTC_VECT(ADC, 0xb20),
+	INTC_VECT(BBDMAC_0_3, 0xba0), INTC_VECT(BBDMAC_0_3, 0xbc0),
+	INTC_VECT(BBDMAC_0_3, 0xbe0), INTC_VECT(BBDMAC_0_3, 0xc00),
+	INTC_VECT(BBDMAC_4_7, 0xc20), INTC_VECT(BBDMAC_4_7, 0xc40),
+	INTC_VECT(BBDMAC_4_7, 0xc60), INTC_VECT(BBDMAC_4_7, 0xc80),
+	INTC_VECT(BBDMAC_8_10, 0xca0), INTC_VECT(BBDMAC_8_10, 0xcc0),
+	INTC_VECT(BBDMAC_8_10, 0xce0), INTC_VECT(BBDMAC_11_14, 0xd00),
+	INTC_VECT(BBDMAC_11_14, 0xd20), INTC_VECT(BBDMAC_11_14, 0xd40),
+	INTC_VECT(BBDMAC_11_14, 0xd60), INTC_VECT(BBDMAC_15_18, 0xd80),
+	INTC_VECT(BBDMAC_15_18, 0xda0), INTC_VECT(BBDMAC_15_18, 0xdc0),
+	INTC_VECT(BBDMAC_15_18, 0xde0), INTC_VECT(BBDMAC_19_22, 0xe00),
+	INTC_VECT(BBDMAC_19_22, 0xe20), INTC_VECT(BBDMAC_19_22, 0xe40),
+	INTC_VECT(BBDMAC_19_22, 0xe60), INTC_VECT(BBDMAC_23_26, 0xe80),
+	INTC_VECT(BBDMAC_23_26, 0xea0), INTC_VECT(BBDMAC_23_26, 0xec0),
+	INTC_VECT(BBDMAC_23_26, 0xee0), INTC_VECT(BBDMAC_27, 0xf00),
+	INTC_VECT(BBDMAC_28, 0xf20), INTC_VECT(BBDMAC_29, 0xf40),
+	INTC_VECT(BBDMAC_30, 0xf60), INTC_VECT(BBDMAC_31, 0xf80),
+};
+
+static struct intc_group groups[] __initdata = {
+	INTC_GROUP(TMU, TMU0, TMU1, TMU2, TMU2_TICPI, TMU3, TMU4, TMU5,
+		   TMU5_TICPI, TMU6, TMU7, TMU8),
+	INTC_GROUP(DMAC, DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2),
+	INTC_GROUP(I2S, I2S0, I2S1, I2S2, I2S3),
+	INTC_GROUP(SRC, SRC_RX, SRC_TX, SRC_SPDIF),
+	INTC_GROUP(GFX3D, GFX3D_MBX, GFX3D_DMAC),
+	INTC_GROUP(SPI, SPI0, SPI1),
+	INTC_GROUP(SCIF, SCIF089, SCIF1234, SCIF567),
+	INTC_GROUP(BBDMAC,
+		   BBDMAC_0_3, BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14,
+		   BBDMAC_15_18, BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27,
+		   BBDMAC_28, BBDMAC_29, BBDMAC_30, BBDMAC_31),
+};
+
+static struct intc_mask_reg mask_registers[] __initdata = {
+	{ 0xffe00040, 0xffe00044, 32, /* INT2MSKR / INT2MSKCR */
+	  { 0, BBDMAC, ADC, SCIF, SPI, EXBUS_ATA, GFX3D, GFX2D,
+	    GPS, CAN, ATAPI, USB, YUV, REMOTE, VIDEO_IN, DU, SRC, I2S,
+	    DMAC, I2C, HUDI, SPDIF, IPI, HAC, TMU, GPIO } },
+};
+
+static struct intc_prio_reg prio_registers[] __initdata = {
+	{ 0xffe00000, 0, 32, 8, /* INT2PRI0 */ { GPIO, TMU0, 0, HAC } },
+	{ 0xffe00004, 0, 32, 8, /* INT2PRI1 */ { IPI, SPDIF, HUDI, I2C } },
+	{ 0xffe00008, 0, 32, 8, /* INT2PRI2 */ { DMAC, I2S, SRC, DU } },
+	{ 0xffe0000c, 0, 32, 8, /* INT2PRI3 */ { VIDEO_IN, REMOTE, YUV, USB } },
+	{ 0xffe00010, 0, 32, 8, /* INT2PRI4 */ { ATAPI, CAN, GPS, GFX2D } },
+	{ 0xffe00014, 0, 32, 8, /* INT2PRI5 */ { 0, GFX3D, EXBUS_ATA, SPI } },
+	{ 0xffe00018, 0, 32, 8, /* INT2PRI6 */ { SCIF1234, SCIF567, SCIF089 } },
+	{ 0xffe0001c, 0, 32, 8, /* INT2PRI7 */ { ADC, 0, 0, BBDMAC_0_3 } },
+	{ 0xffe00020, 0, 32, 8, /* INT2PRI8 */
+	  { BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14, BBDMAC_15_18 } },
+	{ 0xffe00024, 0, 32, 8, /* INT2PRI9 */
+	  { BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27, BBDMAC_28 } },
+	{ 0xffe00028, 0, 32, 8, /* INT2PRI10 */
+	  { BBDMAC_29, BBDMAC_30, BBDMAC_31 } },
+	{ 0xffe0002c, 0, 32, 8, /* INT2PRI11 */
+	  { TMU1, TMU2, TMU2_TICPI, TMU3 } },
+	{ 0xffe00030, 0, 32, 8, /* INT2PRI12 */
+	  { TMU4, TMU5, TMU5_TICPI, TMU6 } },
+	{ 0xffe00034, 0, 32, 8, /* INT2PRI13 */
+	  { TMU7, TMU8 } },
+};
+
+static DECLARE_INTC_DESC(intc_desc, "sh7770", vectors, groups,
+			 mask_registers, prio_registers, NULL);
+
+/* Support for external interrupt pins in IRQ mode */
+static struct intc_vect irq_vectors[] __initdata = {
+	INTC_VECT(IRQ0, 0x240), INTC_VECT(IRQ1, 0x280),
+	INTC_VECT(IRQ2, 0x2c0), INTC_VECT(IRQ3, 0x300),
+	INTC_VECT(IRQ4, 0x340), INTC_VECT(IRQ5, 0x380),
+};
+
+static struct intc_mask_reg irq_mask_registers[] __initdata = {
+	{ 0xffd00044, 0xffd00064, 32, /* INTMSK0 / INTMSKCLR0 */
+	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, } },
+};
+
+static struct intc_prio_reg irq_prio_registers[] __initdata = {
+	{ 0xffd00010, 0, 32, 4, /* INTPRI */ { IRQ0, IRQ1, IRQ2, IRQ3,
+					       IRQ4, IRQ5, } },
+};
+
+static struct intc_sense_reg irq_sense_registers[] __initdata = {
+	{ 0xffd0001c, 32, 2, /* ICR1 */   { IRQ0, IRQ1, IRQ2, IRQ3,
+					    IRQ4, IRQ5, } },
+};
+
+static DECLARE_INTC_DESC(intc_irq_desc, "sh7770-irq", irq_vectors,
+			 NULL, irq_mask_registers, irq_prio_registers,
+			 irq_sense_registers);
+
+/* External interrupt pins in IRL mode */
+static struct intc_vect irl_vectors[] __initdata = {
+	INTC_VECT(IRL_LLLL, 0x200), INTC_VECT(IRL_LLLH, 0x220),
+	INTC_VECT(IRL_LLHL, 0x240), INTC_VECT(IRL_LLHH, 0x260),
+	INTC_VECT(IRL_LHLL, 0x280), INTC_VECT(IRL_LHLH, 0x2a0),
+	INTC_VECT(IRL_LHHL, 0x2c0), INTC_VECT(IRL_LHHH, 0x2e0),
+	INTC_VECT(IRL_HLLL, 0x300), INTC_VECT(IRL_HLLH, 0x320),
+	INTC_VECT(IRL_HLHL, 0x340), INTC_VECT(IRL_HLHH, 0x360),
+	INTC_VECT(IRL_HHLL, 0x380), INTC_VECT(IRL_HHLH, 0x3a0),
+	INTC_VECT(IRL_HHHL, 0x3c0),
+};
+
+static struct intc_mask_reg irl3210_mask_registers[] __initdata = {
+	{ 0xffd40080, 0xffd40084, 32, /* INTMSK2 / INTMSKCLR2 */
+	  { IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	    IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	    IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	    IRL_HHLL, IRL_HHLH, IRL_HHHL, } },
+};
+
+static struct intc_mask_reg irl7654_mask_registers[] __initdata = {
+	{ 0xffd40080, 0xffd40084, 32, /* INTMSK2 / INTMSKCLR2 */
+	  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+	    IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH,
+	    IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH,
+	    IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH,
+	    IRL_HHLL, IRL_HHLH, IRL_HHHL, } },
+};
+
+static DECLARE_INTC_DESC(intc_irl7654_desc, "sh7780-irl7654", irl_vectors,
+			 NULL, irl7654_mask_registers, NULL, NULL);
+
+static DECLARE_INTC_DESC(intc_irl3210_desc, "sh7780-irl3210", irl_vectors,
+			 NULL, irl3210_mask_registers, NULL, NULL);
+
+#define INTC_ICR0	0xffd00000
+#define INTC_INTMSK0	0xffd00044
+#define INTC_INTMSK1	0xffd00048
+#define INTC_INTMSK2	0xffd40080
+#define INTC_INTMSKCLR1	0xffd00068
+#define INTC_INTMSKCLR2	0xffd40084
+
 void __init plat_irq_setup(void)
 {
+	/* disable IRQ7-0 */
+	ctrl_outl(0xff000000, INTC_INTMSK0);
+
+	/* disable IRL3-0 + IRL7-4 */
+	ctrl_outl(0xc0000000, INTC_INTMSK1);
+	ctrl_outl(0xfffefffe, INTC_INTMSK2);
+
+	/* select IRL mode for IRL3-0 + IRL7-4 */
+	ctrl_outl(ctrl_inl(INTC_ICR0) & ~0x00c00000, INTC_ICR0);
+
+	/* disable holding function, ie enable "SH-4 Mode" */
+	ctrl_outl(ctrl_inl(INTC_ICR0) | 0x00200000, INTC_ICR0);
+
+	register_intc_controller(&intc_desc);
+}
+
+void __init plat_irq_setup_pins(int mode)
+{
+	switch (mode) {
+	case IRQ_MODE_IRQ:
+		/* select IRQ mode for IRL3-0 + IRL7-4 */
+		ctrl_outl(ctrl_inl(INTC_ICR0) | 0x00c00000, INTC_ICR0);
+		register_intc_controller(&intc_irq_desc);
+		break;
+	case IRQ_MODE_IRL7654:
+		/* enable IRL7-4 but don't provide any masking */
+		ctrl_outl(0x40000000, INTC_INTMSKCLR1);
+		ctrl_outl(0x0000fffe, INTC_INTMSKCLR2);
+		break;
+	case IRQ_MODE_IRL3210:
+		/* enable IRL0-3 but don't provide any masking */
+		ctrl_outl(0x80000000, INTC_INTMSKCLR1);
+		ctrl_outl(0xfffe0000, INTC_INTMSKCLR2);
+		break;
+	case IRQ_MODE_IRL7654_MASK:
+		/* enable IRL7-4 and mask using cpu intc controller */
+		ctrl_outl(0x40000000, INTC_INTMSKCLR1);
+		register_intc_controller(&intc_irl7654_desc);
+		break;
+	case IRQ_MODE_IRL3210_MASK:
+		/* enable IRL0-3 and mask using cpu intc controller */
+		ctrl_outl(0x80000000, INTC_INTMSKCLR1);
+		register_intc_controller(&intc_irl3210_desc);
+		break;
+	default:
+		BUG();
+	}
 }
-- 
GitLab


From 2fa3cdfb319055fd8b25abdafa413e16f00ad493 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 09:29:45 -0400
Subject: [PATCH 1133/2198] ext4: Merge ext4_da_get_block_write() into
 mpage_da_map_blocks()

The static function ext4_da_get_block_write() was only used by
mpage_da_map_blocks().  So to simplify the code, merge that function
into mpage_da_map_blocks().

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 110 +++++++++++++++++++-----------------------------
 1 file changed, 43 insertions(+), 67 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e6113c3a126f8..bfe50a22363b8 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2005,57 +2005,6 @@ static void ext4_print_free_blocks(struct inode *inode)
 	return;
 }
 
-/*
- * This function is used by mpage_da_map_blocks().  We separate it out
- * as a separate function just to make life easier, and because
- * mpage_da_map_blocks() used to be a generic function that took a
- * get_block_t.
- */
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-				   struct buffer_head *bh_result)
-{
-	int ret;
-	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-	loff_t disksize = EXT4_I(inode)->i_disksize;
-	handle_t *handle = NULL;
-
-	handle = ext4_journal_current_handle();
-	BUG_ON(!handle);
-	ret = ext4_get_blocks(handle, inode, iblock, max_blocks,
-			      bh_result, EXT4_GET_BLOCKS_CREATE|
-			      EXT4_GET_BLOCKS_DELALLOC_RESERVE);
-	if (ret <= 0)
-		return ret;
-
-	bh_result->b_size = (ret << inode->i_blkbits);
-
-	if (ext4_should_order_data(inode)) {
-		int retval;
-		retval = ext4_jbd2_file_inode(handle, inode);
-		if (retval)
-			/*
-			 * Failed to add inode for ordered mode. Don't
-			 * update file size
-			 */
-			return retval;
-	}
-
-	/*
-	 * Update on-disk size along with block allocation we don't
-	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
-	 * within already allocated block -bzzz
-	 */
-	disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-	if (disksize > i_size_read(inode))
-		disksize = i_size_read(inode);
-	if (disksize > EXT4_I(inode)->i_disksize) {
-		ext4_update_i_disksize(inode, disksize);
-		ret = ext4_mark_inode_dirty(handle, inode);
-		return ret;
-	}
-	return 0;
-}
-
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -2066,9 +2015,12 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
  */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-	int err = 0;
+	int err, blks;
 	struct buffer_head new;
-	sector_t next;
+	sector_t next = mpd->b_blocknr;
+	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
+	loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
+	handle_t *handle = NULL;
 
 	/*
 	 * We consider only non-mapped and non-allocated blocks
@@ -2077,6 +2029,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		!(mpd->b_state & (1 << BH_Delay)) &&
 		!(mpd->b_state & (1 << BH_Unwritten)))
 		return 0;
+
+	/*
+	 * If we didn't accumulate anything to write simply return
+	 */
+	if (!mpd->b_size)
+		return 0;
+
+	handle = ext4_journal_current_handle();
+	BUG_ON(!handle);
+
 	/*
 	 * We need to make sure the BH_Delay flag is passed down to
 	 * ext4_da_get_block_write(), since it calls ext4_get_blocks()
@@ -2092,18 +2054,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
 	 */
 	new.b_state = mpd->b_state & (1 << BH_Delay);
-	new.b_blocknr = 0;
-	new.b_size = mpd->b_size;
-	next = mpd->b_blocknr;
-	/*
-	 * If we didn't accumulate anything
-	 * to write simply return
-	 */
-	if (!new.b_size)
-		return 0;
-
-	err = ext4_da_get_block_write(mpd->inode, next, &new);
-	if (err) {
+	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+			       &new, EXT4_GET_BLOCKS_CREATE|
+			       EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+	if (blks < 0) {
+		err = blks;
 		/*
 		 * If get block returns with error we simply
 		 * return. Later writepage will redirty the page and
@@ -2136,12 +2091,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 		if (err == -ENOSPC) {
 			ext4_print_free_blocks(mpd->inode);
 		}
-		/* invlaidate all the pages */
+		/* invalidate all the pages */
 		ext4_da_block_invalidatepages(mpd, next,
 				mpd->b_size >> mpd->inode->i_blkbits);
 		return err;
 	}
-	BUG_ON(new.b_size == 0);
+	BUG_ON(blks == 0);
+
+	new.b_size = (blks << mpd->inode->i_blkbits);
 
 	if (buffer_new(&new))
 		__unmap_underlying_blocks(mpd->inode, &new);
@@ -2154,6 +2111,25 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	    (mpd->b_state & (1 << BH_Unwritten)))
 		mpage_put_bnr_to_bhs(mpd, next, &new);
 
+	if (ext4_should_order_data(mpd->inode)) {
+		err = ext4_jbd2_file_inode(handle, mpd->inode);
+		if (err)
+			return err;
+	}
+
+	/*
+	 * Update on-disk size along with block allocation we don't
+	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
+	 * within already allocated block -bzzz
+	 */
+	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
+	if (disksize > i_size_read(mpd->inode))
+		disksize = i_size_read(mpd->inode);
+	if (disksize > EXT4_I(mpd->inode)->i_disksize) {
+		ext4_update_i_disksize(mpd->inode, disksize);
+		return ext4_mark_inode_dirty(handle, mpd->inode);
+	}
+
 	return 0;
 }
 
-- 
GitLab


From 2ac3b6e00acb46406c993d57921f86a594aafe08 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 14 May 2009 13:57:08 -0400
Subject: [PATCH 1134/2198] ext4: Clean up ext4_get_blocks() so it does not
 depend on bh_result->b_state

The ext4_get_blocks() function was depending on the value of
bh_result->b_state as an input parameter to decide whether or not
update the delalloc accounting statistics by calling
ext4_da_update_reserve_space().  We now use a separate flag,
EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE, to requests this update, so that
all callers of ext4_get_blocks() can clear map_bh.b_state before
calling ext4_get_blocks() without worrying about any consistency
issues.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  | 15 ++++++++-----
 fs/ext4/inode.c | 56 +++++++++++++++++++++++++++----------------------
 2 files changed, 41 insertions(+), 30 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 17feb4ac633a1..d164f1294e5f7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -318,16 +318,21 @@ struct ext4_new_group_data {
  */
 	/* Allocate any needed blocks and/or convert an unitialized
 	   extent to be an initialized ext4 */
-#define EXT4_GET_BLOCKS_CREATE			1
+#define EXT4_GET_BLOCKS_CREATE			0x0001
 	/* Request the creation of an unitialized extent */
-#define EXT4_GET_BLOCKS_UNINIT_EXT		2
+#define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
 	/* Update the ext4_inode_info i_disksize field */
-#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		4
+#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		0x0004
 	/* Caller is from the delayed allocation writeout path,
-	   so the filesystem blocks have already been accounted for */
-#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	8
+	   so set the magic i_delalloc_reserve_flag after taking the 
+	   inode allocation semaphore for */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0008
+	/* Call ext4_da_update_reserve_space() after successfully 
+	   allocating the blocks */
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0010
+
 
 /*
  * ioctl commands
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index bfe50a22363b8..d7b7480682b90 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1234,16 +1234,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		}
 	}
 
-	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) {
+	if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
 		EXT4_I(inode)->i_delalloc_reserved_flag = 0;
-		/*
-		 * Update reserved blocks/metadata blocks
-		 * after successful block allocation
-		 * which were deferred till now
-		 */
-		if ((retval > 0) && buffer_delay(bh))
-			ext4_da_update_reserve_space(inode, retval);
-	}
+
+	/*
+	 * Update reserved blocks/metadata blocks after successful
+	 * block allocation which had been deferred till now.
+	 */
+	if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
+		ext4_da_update_reserve_space(inode, retval);
 
 	up_write((&EXT4_I(inode)->i_data_sem));
 	return retval;
@@ -2015,7 +2014,7 @@ static void ext4_print_free_blocks(struct inode *inode)
  */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-	int err, blks;
+	int err, blks, get_blocks_flags;
 	struct buffer_head new;
 	sector_t next = mpd->b_blocknr;
 	unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
@@ -2040,23 +2039,30 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	BUG_ON(!handle);
 
 	/*
-	 * We need to make sure the BH_Delay flag is passed down to
-	 * ext4_da_get_block_write(), since it calls ext4_get_blocks()
-	 * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.  This flag
-	 * causes ext4_get_blocks() to call
-	 * ext4_da_update_reserve_space() if the passed buffer head
-	 * has the BH_Delay flag set.  In the future, once we clean up
-	 * the interfaces to ext4_get_blocks(), we should pass in a
-	 * separate flag which requests that the delayed allocation
-	 * statistics should be updated, instead of depending on the
-	 * state information getting passed down via the map_bh's
-	 * state bitmasks plus the magic
-	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag.
+	 * Call ext4_get_blocks() to allocate any delayed allocation
+	 * blocks, or to convert an uninitialized extent to be
+	 * initialized (in the case where we have written into
+	 * one or more preallocated blocks).
+	 *
+	 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
+	 * indicate that we are on the delayed allocation path.  This
+	 * affects functions in many different parts of the allocation
+	 * call path.  This flag exists primarily because we don't
+	 * want to change *many* call functions, so ext4_get_blocks()
+	 * will set the magic i_delalloc_reserved_flag once the
+	 * inode's allocation semaphore is taken.
+	 *
+	 * If the blocks in questions were delalloc blocks, set
+	 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
+	 * variables are updated after the blocks have been allocated.
 	 */
-	new.b_state = mpd->b_state & (1 << BH_Delay);
+	new.b_state = 0;
+	get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+			    EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+	if (mpd->b_state & (1 << BH_Delay))
+		get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
 	blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
-			       &new, EXT4_GET_BLOCKS_CREATE|
-			       EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+			       &new, get_blocks_flags);
 	if (blks < 0) {
 		err = blks;
 		/*
-- 
GitLab


From f850a7c040d9faafb41bceb0a05d6bb7432c8c7a Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 12 May 2009 15:13:55 -0400
Subject: [PATCH 1135/2198] IMA: remove read permissions on the ima policy file

The IMA policy file does not implement read.  Trying to just open/read/close
the file will load a blank policy and you cannot then change the policy
without a reboot.  This removes the read permission from the file so one must
at least be attempting to write...

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_fs.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index ffbe259700b10..3305a9615863f 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -15,6 +15,7 @@
  *	implemenents security file system for reporting
  *	current measurement list and IMA statistics
  */
+#include <linux/fcntl.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/rculist.h>
@@ -283,6 +284,9 @@ static atomic_t policy_opencount = ATOMIC_INIT(1);
  */
 int ima_open_policy(struct inode * inode, struct file * filp)
 {
+	/* No point in being allowed to open it if you aren't going to write */
+	if (!(filp->f_flags & O_WRONLY))
+		return -EACCES;
 	if (atomic_dec_and_test(&policy_opencount))
 		return 0;
 	return -EBUSY;
@@ -349,7 +353,7 @@ int ima_fs_init(void)
 		goto out;
 
 	ima_policy = securityfs_create_file("policy",
-					    S_IRUSR | S_IRGRP | S_IWUSR,
+					    S_IWUSR,
 					    ima_dir, NULL,
 					    &ima_measure_policy_ops);
 	if (IS_ERR(ima_policy))
-- 
GitLab


From c3d20103d08e5c0b6738fbd0acf3ca004e5356c5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 12 May 2009 15:14:23 -0400
Subject: [PATCH 1136/2198] IMA: do not measure everything opened by root by
 default

The IMA default policy measures every single file opened by root.  This is
terrible for most users.  Consider a system (like mine) with virtual machine
images.  When those images are touched (which happens at boot for me) those
images are measured.  This is just way too much for the default case.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_policy.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index b168c1d595cea..dec6dcb1c8de5 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -61,8 +61,6 @@ static struct ima_measure_rule_entry default_rules[] = {
 	 .flags = IMA_FUNC | IMA_MASK},
 	{.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
 	 .flags = IMA_FUNC | IMA_MASK},
-	{.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0,
-	 .flags = IMA_FUNC | IMA_MASK | IMA_UID}
 };
 
 static LIST_HEAD(measure_default_rules);
-- 
GitLab


From bec36eca6f5d1d83a9c3733fc40ba173ad849df2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 15 May 2009 12:03:04 +0900
Subject: [PATCH 1137/2198] sh: hd64461: Fix up I/O base register offsets.

hd64461 is mapped in a fixed location, so the I/O base itself is fairly
meaningless as a configuration item. Additionally, this makes it
impossible to share hd64461 code alongside generic drivers (in the case
of sh_dac_audio), so simply make it commonly defined and permit the
mach_is_foo() logic to work out the proper semantics.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/cchips/Kconfig        |   5 --
 arch/sh/include/asm/hd64461.h | 146 +++++++++++++++++-----------------
 2 files changed, 74 insertions(+), 77 deletions(-)

diff --git a/arch/sh/cchips/Kconfig b/arch/sh/cchips/Kconfig
index f43d18373f224..a5ab2eccdaa65 100644
--- a/arch/sh/cchips/Kconfig
+++ b/arch/sh/cchips/Kconfig
@@ -34,11 +34,6 @@ config HD64461_IRQ
 
 	  Do not change this unless you know what you are doing.
 
-config HD64461_IOBASE
-	hex "HD64461 start address"
-	depends on HD64461
-	default "0xb0000000"
-
 config HD64461_ENABLER
 	bool "HD64461 PCMCIA enabler"
 	depends on HD64461
diff --git a/arch/sh/include/asm/hd64461.h b/arch/sh/include/asm/hd64461.h
index 52b4b62382777..c16d54563c932 100644
--- a/arch/sh/include/asm/hd64461.h
+++ b/arch/sh/include/asm/hd64461.h
@@ -13,13 +13,15 @@
 #define	HD64461_PCC_WINDOW	0x01000000
 
 /* Area 6 - Slot 0 - memory and/or IO card */
-#define	HD64461_PCC0_BASE	(CONFIG_HD64461_IOBASE + 0x8000000)
+#define HD64461_IOBASE		0xb0000000
+#define HD64461_IO_OFFSET(x)	(HD64461_IOBASE + (x))
+#define	HD64461_PCC0_BASE	HD64461_IO_OFFSET(0x8000000)
 #define	HD64461_PCC0_ATTR	(HD64461_PCC0_BASE)				/* 0xb80000000 */
 #define	HD64461_PCC0_COMM	(HD64461_PCC0_BASE+HD64461_PCC_WINDOW)		/* 0xb90000000 */
 #define	HD64461_PCC0_IO		(HD64461_PCC0_BASE+2*HD64461_PCC_WINDOW)	/* 0xba0000000 */
 
 /* Area 5 - Slot 1 - memory card only */
-#define	HD64461_PCC1_BASE	(CONFIG_HD64461_IOBASE + 0x4000000)
+#define	HD64461_PCC1_BASE	HD64461_IO_OFFSET(0x4000000)
 #define	HD64461_PCC1_ATTR	(HD64461_PCC1_BASE)				/* 0xb4000000 */
 #define	HD64461_PCC1_COMM	(HD64461_PCC1_BASE+HD64461_PCC_WINDOW)		/* 0xb5000000 */
 
@@ -41,19 +43,19 @@
 #define	HD64461_STBCR_SURTST		0x0001
 
 /* System Configuration Register */
-#define	HD64461_SYSCR		(CONFIG_HD64461_IOBASE + 0x02)
+#define	HD64461_SYSCR		HD64461_IO_OFFSET(0x02)
 
 /* CPU Data Bus Control Register */
-#define	HD64461_SCPUCR		(CONFIG_HD64461_IOBASE + 0x04)
+#define	HD64461_SCPUCR		HD64461_IO_OFFSET(0x04)
 
 /* Base Address Register */
-#define	HD64461_LCDCBAR		(CONFIG_HD64461_IOBASE + 0x1000)
+#define	HD64461_LCDCBAR		HD64461_IO_OFFSET(0x1000)
 
 /* Line increment address */
-#define	HD64461_LCDCLOR		(CONFIG_HD64461_IOBASE + 0x1002)
+#define	HD64461_LCDCLOR		HD64461_IO_OFFSET(0x1002)
 
 /* Controls LCD controller */
-#define	HD64461_LCDCCR		(CONFIG_HD64461_IOBASE + 0x1004)
+#define	HD64461_LCDCCR		HD64461_IO_OFFSET(0x1004)
 
 /* LCCDR control bits */
 #define	HD64461_LCDCCR_STBACK	0x0400	/* Standby Back */
@@ -64,30 +66,30 @@
 #define	HD64461_LCDCCR_SPON	0x0010	/* Start Power On */
 
 /* Controls LCD (1) */
-#define	HD64461_LDR1		(CONFIG_HD64461_IOBASE + 0x1010)
+#define	HD64461_LDR1		HD64461_IO_OFFSET(0x1010)
 #define	HD64461_LDR1_DON	0x01	/* Display On */
 #define	HD64461_LDR1_DINV	0x80	/* Display Invert */
 
 /* Controls LCD (2) */
-#define	HD64461_LDR2		(CONFIG_HD64461_IOBASE + 0x1012)
-#define	HD64461_LDHNCR		(CONFIG_HD64461_IOBASE + 0x1014)	/* Number of horizontal characters */
-#define	HD64461_LDHNSR		(CONFIG_HD64461_IOBASE + 0x1016)	/* Specify output start position + width of CL1 */
-#define	HD64461_LDVNTR		(CONFIG_HD64461_IOBASE + 0x1018)	/* Specify total vertical lines */
-#define	HD64461_LDVNDR		(CONFIG_HD64461_IOBASE + 0x101a)	/* specify number of display vertical lines */
-#define	HD64461_LDVSPR		(CONFIG_HD64461_IOBASE + 0x101c)	/* specify vertical synchronization pos and AC nr */
+#define	HD64461_LDR2		HD64461_IO_OFFSET(0x1012)
+#define	HD64461_LDHNCR		HD64461_IO_OFFSET(0x1014)	/* Number of horizontal characters */
+#define	HD64461_LDHNSR		HD64461_IO_OFFSET(0x1016)	/* Specify output start position + width of CL1 */
+#define	HD64461_LDVNTR		HD64461_IO_OFFSET(0x1018)	/* Specify total vertical lines */
+#define	HD64461_LDVNDR		HD64461_IO_OFFSET(0x101a)	/* specify number of display vertical lines */
+#define	HD64461_LDVSPR		HD64461_IO_OFFSET(0x101c)	/* specify vertical synchronization pos and AC nr */
 
 /* Controls LCD (3) */
-#define	HD64461_LDR3		(CONFIG_HD64461_IOBASE + 0x101e)
+#define	HD64461_LDR3		HD64461_IO_OFFSET(0x101e)
 
 /* Palette Registers */
-#define	HD64461_CPTWAR		(CONFIG_HD64461_IOBASE + 0x1030)	/* Color Palette Write Address Register */
-#define	HD64461_CPTWDR		(CONFIG_HD64461_IOBASE + 0x1032)	/* Color Palette Write Data Register */
-#define	HD64461_CPTRAR		(CONFIG_HD64461_IOBASE + 0x1034)	/* Color Palette Read Address Register */
-#define	HD64461_CPTRDR		(CONFIG_HD64461_IOBASE + 0x1036)	/* Color Palette Read Data Register */
+#define	HD64461_CPTWAR		HD64461_IO_OFFSET(0x1030)	/* Color Palette Write Address Register */
+#define	HD64461_CPTWDR		HD64461_IO_OFFSET(0x1032)	/* Color Palette Write Data Register */
+#define	HD64461_CPTRAR		HD64461_IO_OFFSET(0x1034)	/* Color Palette Read Address Register */
+#define	HD64461_CPTRDR		HD64461_IO_OFFSET(0x1036)	/* Color Palette Read Data Register */
 
-#define	HD64461_GRDOR		(CONFIG_HD64461_IOBASE + 0x1040)	/* Display Resolution Offset Register */
-#define	HD64461_GRSCR		(CONFIG_HD64461_IOBASE + 0x1042)	/* Solid Color Register */
-#define	HD64461_GRCFGR		(CONFIG_HD64461_IOBASE + 0x1044)	/* Accelerator Configuration Register */
+#define	HD64461_GRDOR		HD64461_IO_OFFSET(0x1040)	/* Display Resolution Offset Register */
+#define	HD64461_GRSCR		HD64461_IO_OFFSET(0x1042)	/* Solid Color Register */
+#define	HD64461_GRCFGR		HD64461_IO_OFFSET(0x1044)	/* Accelerator Configuration Register */
 
 #define	HD64461_GRCFGR_ACCSTATUS	0x10	/* Accelerator Status */
 #define	HD64461_GRCFGR_ACCRESET		0x08	/* Accelerator Reset */
@@ -97,41 +99,41 @@
 #define	HD64461_GRCFGR_COLORDEPTH8	0x01	/* Sets Colordepth 8 for Accelerator */
 
 /* Line Drawing Registers */
-#define	HD64461_LNSARH		(CONFIG_HD64461_IOBASE + 0x1046)	/* Line Start Address Register (H) */
-#define	HD64461_LNSARL		(CONFIG_HD64461_IOBASE + 0x1048)	/* Line Start Address Register (L) */
-#define	HD64461_LNAXLR		(CONFIG_HD64461_IOBASE + 0x104a)	/* Axis Pixel Length Register */
-#define	HD64461_LNDGR		(CONFIG_HD64461_IOBASE + 0x104c)	/* Diagonal Register */
-#define	HD64461_LNAXR		(CONFIG_HD64461_IOBASE + 0x104e)	/* Axial Register */
-#define	HD64461_LNERTR		(CONFIG_HD64461_IOBASE + 0x1050)	/* Start Error Term Register */
-#define	HD64461_LNMDR		(CONFIG_HD64461_IOBASE + 0x1052)	/* Line Mode Register */
+#define	HD64461_LNSARH		HD64461_IO_OFFSET(0x1046)	/* Line Start Address Register (H) */
+#define	HD64461_LNSARL		HD64461_IO_OFFSET(0x1048)	/* Line Start Address Register (L) */
+#define	HD64461_LNAXLR		HD64461_IO_OFFSET(0x104a)	/* Axis Pixel Length Register */
+#define	HD64461_LNDGR		HD64461_IO_OFFSET(0x104c)	/* Diagonal Register */
+#define	HD64461_LNAXR		HD64461_IO_OFFSET(0x104e)	/* Axial Register */
+#define	HD64461_LNERTR		HD64461_IO_OFFSET(0x1050)	/* Start Error Term Register */
+#define	HD64461_LNMDR		HD64461_IO_OFFSET(0x1052)	/* Line Mode Register */
 
 /* BitBLT Registers */
-#define	HD64461_BBTSSARH	(CONFIG_HD64461_IOBASE + 0x1054)	/* Source Start Address Register (H) */
-#define	HD64461_BBTSSARL	(CONFIG_HD64461_IOBASE + 0x1056)	/* Source Start Address Register (L) */
-#define	HD64461_BBTDSARH	(CONFIG_HD64461_IOBASE + 0x1058)	/* Destination Start Address Register (H) */
-#define	HD64461_BBTDSARL	(CONFIG_HD64461_IOBASE + 0x105a)	/* Destination Start Address Register (L) */
-#define	HD64461_BBTDWR		(CONFIG_HD64461_IOBASE + 0x105c)	/* Destination Block Width Register */
-#define	HD64461_BBTDHR		(CONFIG_HD64461_IOBASE + 0x105e)	/* Destination Block Height Register */
-#define	HD64461_BBTPARH		(CONFIG_HD64461_IOBASE + 0x1060)	/* Pattern Start Address Register (H) */
-#define	HD64461_BBTPARL		(CONFIG_HD64461_IOBASE + 0x1062)	/* Pattern Start Address Register (L) */
-#define	HD64461_BBTMARH		(CONFIG_HD64461_IOBASE + 0x1064)	/* Mask Start Address Register (H) */
-#define	HD64461_BBTMARL		(CONFIG_HD64461_IOBASE + 0x1066)	/* Mask Start Address Register (L) */
-#define	HD64461_BBTROPR		(CONFIG_HD64461_IOBASE + 0x1068)	/* ROP Register */
-#define	HD64461_BBTMDR		(CONFIG_HD64461_IOBASE + 0x106a)	/* BitBLT Mode Register */
+#define	HD64461_BBTSSARH	HD64461_IO_OFFSET(0x1054)	/* Source Start Address Register (H) */
+#define	HD64461_BBTSSARL	HD64461_IO_OFFSET(0x1056)	/* Source Start Address Register (L) */
+#define	HD64461_BBTDSARH	HD64461_IO_OFFSET(0x1058)	/* Destination Start Address Register (H) */
+#define	HD64461_BBTDSARL	HD64461_IO_OFFSET(0x105a)	/* Destination Start Address Register (L) */
+#define	HD64461_BBTDWR		HD64461_IO_OFFSET(0x105c)	/* Destination Block Width Register */
+#define	HD64461_BBTDHR		HD64461_IO_OFFSET(0x105e)	/* Destination Block Height Register */
+#define	HD64461_BBTPARH		HD64461_IO_OFFSET(0x1060)	/* Pattern Start Address Register (H) */
+#define	HD64461_BBTPARL		HD64461_IO_OFFSET(0x1062)	/* Pattern Start Address Register (L) */
+#define	HD64461_BBTMARH		HD64461_IO_OFFSET(0x1064)	/* Mask Start Address Register (H) */
+#define	HD64461_BBTMARL		HD64461_IO_OFFSET(0x1066)	/* Mask Start Address Register (L) */
+#define	HD64461_BBTROPR		HD64461_IO_OFFSET(0x1068)	/* ROP Register */
+#define	HD64461_BBTMDR		HD64461_IO_OFFSET(0x106a)	/* BitBLT Mode Register */
 
 /* PC Card Controller Registers */
 /* Maps to Physical Area 6 */
-#define	HD64461_PCC0ISR		(CONFIG_HD64461_IOBASE + 0x2000)	/* socket 0 interface status */
-#define	HD64461_PCC0GCR		(CONFIG_HD64461_IOBASE + 0x2002)	/* socket 0 general control */
-#define	HD64461_PCC0CSCR	(CONFIG_HD64461_IOBASE + 0x2004)	/* socket 0 card status change */
-#define	HD64461_PCC0CSCIER	(CONFIG_HD64461_IOBASE + 0x2006)	/* socket 0 card status change interrupt enable */
-#define	HD64461_PCC0SCR		(CONFIG_HD64461_IOBASE + 0x2008)	/* socket 0 software control */
+#define	HD64461_PCC0ISR		HD64461_IO_OFFSET(0x2000)	/* socket 0 interface status */
+#define	HD64461_PCC0GCR		HD64461_IO_OFFSET(0x2002)	/* socket 0 general control */
+#define	HD64461_PCC0CSCR	HD64461_IO_OFFSET(0x2004)	/* socket 0 card status change */
+#define	HD64461_PCC0CSCIER	HD64461_IO_OFFSET(0x2006)	/* socket 0 card status change interrupt enable */
+#define	HD64461_PCC0SCR		HD64461_IO_OFFSET(0x2008)	/* socket 0 software control */
 /* Maps to Physical Area 5 */
-#define	HD64461_PCC1ISR		(CONFIG_HD64461_IOBASE + 0x2010)	/* socket 1 interface status */
-#define	HD64461_PCC1GCR		(CONFIG_HD64461_IOBASE + 0x2012)	/* socket 1 general control */
-#define	HD64461_PCC1CSCR	(CONFIG_HD64461_IOBASE + 0x2014)	/* socket 1 card status change */
-#define	HD64461_PCC1CSCIER	(CONFIG_HD64461_IOBASE + 0x2016)	/* socket 1 card status change interrupt enable */
-#define	HD64461_PCC1SCR		(CONFIG_HD64461_IOBASE + 0x2018)	/* socket 1 software control */
+#define	HD64461_PCC1ISR		HD64461_IO_OFFSET(0x2010)	/* socket 1 interface status */
+#define	HD64461_PCC1GCR		HD64461_IO_OFFSET(0x2012)	/* socket 1 general control */
+#define	HD64461_PCC1CSCR	HD64461_IO_OFFSET(0x2014)	/* socket 1 card status change */
+#define	HD64461_PCC1CSCIER	HD64461_IO_OFFSET(0x2016)	/* socket 1 card status change interrupt enable */
+#define	HD64461_PCC1SCR		HD64461_IO_OFFSET(0x2018)	/* socket 1 software control */
 
 /* PCC Interface Status Register */
 #define	HD64461_PCCISR_READY		0x80	/* card ready */
@@ -189,41 +191,41 @@
 #define	HD64461_PCCSCR_SWP		0x01	/* write protect */
 
 /* PCC0 Output Pins Control Register */
-#define	HD64461_P0OCR		(CONFIG_HD64461_IOBASE + 0x202a)
+#define	HD64461_P0OCR		HD64461_IO_OFFSET(0x202a)
 
 /* PCC1 Output Pins Control Register */
-#define	HD64461_P1OCR		(CONFIG_HD64461_IOBASE + 0x202c)
+#define	HD64461_P1OCR		HD64461_IO_OFFSET(0x202c)
 
 /* PC Card General Control Register */
-#define	HD64461_PGCR		(CONFIG_HD64461_IOBASE + 0x202e)
+#define	HD64461_PGCR		HD64461_IO_OFFSET(0x202e)
 
 /* Port Control Registers */
-#define	HD64461_GPACR		(CONFIG_HD64461_IOBASE + 0x4000)	/* Port A - Handles IRDA/TIMER */
-#define	HD64461_GPBCR		(CONFIG_HD64461_IOBASE + 0x4002)	/* Port B - Handles UART */
-#define	HD64461_GPCCR		(CONFIG_HD64461_IOBASE + 0x4004)	/* Port C - Handles PCMCIA 1 */
-#define	HD64461_GPDCR		(CONFIG_HD64461_IOBASE + 0x4006)	/* Port D - Handles PCMCIA 1 */
+#define	HD64461_GPACR		HD64461_IO_OFFSET(0x4000)	/* Port A - Handles IRDA/TIMER */
+#define	HD64461_GPBCR		HD64461_IO_OFFSET(0x4002)	/* Port B - Handles UART */
+#define	HD64461_GPCCR		HD64461_IO_OFFSET(0x4004)	/* Port C - Handles PCMCIA 1 */
+#define	HD64461_GPDCR		HD64461_IO_OFFSET(0x4006)	/* Port D - Handles PCMCIA 1 */
 
 /* Port Control Data Registers */
-#define	HD64461_GPADR		(CONFIG_HD64461_IOBASE + 0x4010)	/* A */
-#define	HD64461_GPBDR		(CONFIG_HD64461_IOBASE + 0x4012)	/* B */
-#define	HD64461_GPCDR		(CONFIG_HD64461_IOBASE + 0x4014)	/* C */
-#define	HD64461_GPDDR		(CONFIG_HD64461_IOBASE + 0x4016)	/* D */
+#define	HD64461_GPADR		HD64461_IO_OFFSET(0x4010)	/* A */
+#define	HD64461_GPBDR		HD64461_IO_OFFSET(0x4012)	/* B */
+#define	HD64461_GPCDR		HD64461_IO_OFFSET(0x4014)	/* C */
+#define	HD64461_GPDDR		HD64461_IO_OFFSET(0x4016)	/* D */
 
 /* Interrupt Control Registers */
-#define	HD64461_GPAICR		(CONFIG_HD64461_IOBASE + 0x4020)	/* A */
-#define	HD64461_GPBICR		(CONFIG_HD64461_IOBASE + 0x4022)	/* B */
-#define	HD64461_GPCICR		(CONFIG_HD64461_IOBASE + 0x4024)	/* C */
-#define	HD64461_GPDICR		(CONFIG_HD64461_IOBASE + 0x4026)	/* D */
+#define	HD64461_GPAICR		HD64461_IO_OFFSET(0x4020)	/* A */
+#define	HD64461_GPBICR		HD64461_IO_OFFSET(0x4022)	/* B */
+#define	HD64461_GPCICR		HD64461_IO_OFFSET(0x4024)	/* C */
+#define	HD64461_GPDICR		HD64461_IO_OFFSET(0x4026)	/* D */
 
 /* Interrupt Status Registers */
-#define	HD64461_GPAISR		(CONFIG_HD64461_IOBASE + 0x4040)	/* A */
-#define	HD64461_GPBISR		(CONFIG_HD64461_IOBASE + 0x4042)	/* B */
-#define	HD64461_GPCISR		(CONFIG_HD64461_IOBASE + 0x4044)	/* C */
-#define	HD64461_GPDISR		(CONFIG_HD64461_IOBASE + 0x4046)	/* D */
+#define	HD64461_GPAISR		HD64461_IO_OFFSET(0x4040)	/* A */
+#define	HD64461_GPBISR		HD64461_IO_OFFSET(0x4042)	/* B */
+#define	HD64461_GPCISR		HD64461_IO_OFFSET(0x4044)	/* C */
+#define	HD64461_GPDISR		HD64461_IO_OFFSET(0x4046)	/* D */
 
 /* Interrupt Request Register & Interrupt Mask Register */
-#define	HD64461_NIRR		(CONFIG_HD64461_IOBASE + 0x5000)
-#define	HD64461_NIMR		(CONFIG_HD64461_IOBASE + 0x5002)
+#define	HD64461_NIRR		HD64461_IO_OFFSET(0x5000)
+#define	HD64461_NIMR		HD64461_IO_OFFSET(0x5002)
 
 #define	HD64461_IRQBASE		OFFCHIP_IRQ_BASE
 #define	OFFCHIP_IRQ_BASE	64
-- 
GitLab


From 639138a991aaf1a3f65cc66700d097edc5f602fe Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Fri, 15 May 2009 12:07:17 +0900
Subject: [PATCH 1138/2198] sound: oss: sh_dac_audio timer fixes.

This patch modifies sh_dac_audio in the following ways:

- use high resolution timer instead of TMU1
- fix cpu/dac.h include
- add future rewrite comment

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 sound/oss/Kconfig        |  2 +-
 sound/oss/sh_dac_audio.c | 85 +++++++++++++++-------------------------
 2 files changed, 33 insertions(+), 54 deletions(-)

diff --git a/sound/oss/Kconfig b/sound/oss/Kconfig
index 1ca7427c4b6d8..bcf2a0698d545 100644
--- a/sound/oss/Kconfig
+++ b/sound/oss/Kconfig
@@ -561,7 +561,7 @@ endif	# SOUND_OSS
 
 config SOUND_SH_DAC_AUDIO
 	tristate "SuperH DAC audio support"
-	depends on CPU_SH3
+	depends on CPU_SH3 && HIGH_RES_TIMERS
 
 config SOUND_SH_DAC_AUDIO_CHANNEL
 	int "DAC channel"
diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c
index 78cfb66e4c59c..b2ed8757542ac 100644
--- a/sound/oss/sh_dac_audio.c
+++ b/sound/oss/sh_dac_audio.c
@@ -18,47 +18,36 @@
 #include <linux/sound.h>
 #include <linux/soundcard.h>
 #include <linux/interrupt.h>
+#include <linux/hrtimer.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 #include <asm/irq.h>
 #include <asm/delay.h>
 #include <asm/clock.h>
-#include <asm/cpu/dac.h>
-#include <asm/cpu/timer.h>
+#include <cpu/dac.h>
 #include <asm/machvec.h>
 #include <mach/hp6xx.h>
 #include <asm/hd64461.h>
 
 #define MODNAME "sh_dac_audio"
 
-#define TMU_TOCR_INIT	0x00
-
-#define TMU1_TCR_INIT	0x0020	/* Clock/4, rising edge; interrupt on */
-#define TMU1_TSTR_INIT  0x02	/* Bit to turn on TMU1 */
-
 #define BUFFER_SIZE 48000
 
 static int rate;
 static int empty;
 static char *data_buffer, *buffer_begin, *buffer_end;
 static int in_use, device_major;
+static struct hrtimer hrtimer;
+static ktime_t wakeups_per_second;
 
 static void dac_audio_start_timer(void)
 {
-	u8 tstr;
-
-	tstr = ctrl_inb(TMU_TSTR);
-	tstr |= TMU1_TSTR_INIT;
-	ctrl_outb(tstr, TMU_TSTR);
+	hrtimer_start(&hrtimer, wakeups_per_second, HRTIMER_MODE_REL);
 }
 
 static void dac_audio_stop_timer(void)
 {
-	u8 tstr;
-
-	tstr = ctrl_inb(TMU_TSTR);
-	tstr &= ~TMU1_TSTR_INIT;
-	ctrl_outb(tstr, TMU_TSTR);
+	hrtimer_cancel(&hrtimer);
 }
 
 static void dac_audio_reset(void)
@@ -77,38 +66,30 @@ static void dac_audio_sync(void)
 static void dac_audio_start(void)
 {
 	if (mach_is_hp6xx()) {
-		u16 v = inw(HD64461_GPADR);
+		u16 v = __raw_readw(HD64461_GPADR);
 		v &= ~HD64461_GPADR_SPEAKER;
-		outw(v, HD64461_GPADR);
+		__raw_writew(v, HD64461_GPADR);
 	}
 
 	sh_dac_enable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
-	ctrl_outw(TMU1_TCR_INIT, TMU1_TCR);
 }
 static void dac_audio_stop(void)
 {
 	dac_audio_stop_timer();
 
 	if (mach_is_hp6xx()) {
-		u16 v = inw(HD64461_GPADR);
+		u16 v = __raw_readw(HD64461_GPADR);
 		v |= HD64461_GPADR_SPEAKER;
-		outw(v, HD64461_GPADR);
+		__raw_writew(v, HD64461_GPADR);
 	}
 
- 	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
+	sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 	sh_dac_disable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 }
 
 static void dac_audio_set_rate(void)
 {
-	unsigned long interval;
- 	struct clk *clk;
-
- 	clk = clk_get(NULL, "module_clk");
- 	interval = (clk_get_rate(clk) / 4) / rate;
- 	clk_put(clk);
-	ctrl_outl(interval, TMU1_TCOR);
-	ctrl_outl(interval, TMU1_TCNT);
+	wakeups_per_second = ktime_set(0, 1000000000 / rate);
 }
 
 static int dac_audio_ioctl(struct inode *inode, struct file *file,
@@ -265,32 +246,26 @@ const struct file_operations dac_audio_fops = {
       .release =	dac_audio_release,
 };
 
-static irqreturn_t timer1_interrupt(int irq, void *dev)
+static enum hrtimer_restart sh_dac_audio_timer(struct hrtimer *handle)
 {
-	unsigned long timer_status;
-
-	timer_status = ctrl_inw(TMU1_TCR);
-	timer_status &= ~0x100;
-	ctrl_outw(timer_status, TMU1_TCR);
-
 	if (!empty) {
 		sh_dac_output(*buffer_begin, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL);
 		buffer_begin++;
 
 		if (buffer_begin == data_buffer + BUFFER_SIZE)
 			buffer_begin = data_buffer;
-		if (buffer_begin == buffer_end) {
+		if (buffer_begin == buffer_end)
 			empty = 1;
-			dac_audio_stop_timer();
-		}
 	}
-	return IRQ_HANDLED;
+
+	if (!empty)
+		hrtimer_start(&hrtimer, wakeups_per_second, HRTIMER_MODE_REL);
+
+	return HRTIMER_NORESTART;
 }
 
 static int __init dac_audio_init(void)
 {
-	int retval;
-
 	if ((device_major = register_sound_dsp(&dac_audio_fops, -1)) < 0) {
 		printk(KERN_ERR "Cannot register dsp device");
 		return device_major;
@@ -306,21 +281,25 @@ static int __init dac_audio_init(void)
 	rate = 8000;
 	dac_audio_set_rate();
 
-	retval =
-	    request_irq(TIMER1_IRQ, timer1_interrupt, IRQF_DISABLED, MODNAME, 0);
-	if (retval < 0) {
-		printk(KERN_ERR "sh_dac_audio: IRQ %d request failed\n",
-		       TIMER1_IRQ);
-		return retval;
-	}
+	/* Today: High Resolution Timer driven DAC playback.
+	 * The timer callback gets called once per sample. Ouch.
+	 *
+	 * Future: A much better approach would be to use the
+	 * SH7720 CMT+DMAC+DAC hardware combination like this:
+	 * - Program sample rate using CMT0 or CMT1
+	 * - Program DMAC to use CMT for timing and output to DAC
+	 * - Play sound using DMAC, let CPU sleep.
+	 * - While at it, rewrite this driver to use ALSA.
+	 */
+
+	hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer.function = sh_dac_audio_timer;
 
 	return 0;
 }
 
 static void __exit dac_audio_exit(void)
 {
-	free_irq(TIMER1_IRQ, 0);
-
 	unregister_sound_dsp(device_major);
 	kfree((void *)data_buffer);
 }
-- 
GitLab


From 29a679754b1a2581ee456eada6c2de7ce95068bb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 May 2009 23:19:09 -0400
Subject: [PATCH 1139/2198] x86/stacktrace: return 0 instead of -1 for stack
 ops

If we return -1 in the ops->stack for the stacktrace saving, we end up
breaking out of the loop if the stack we are tracing is in the exception
stack. This causes traces like:

          <idle>-0     [002] 34263.745825: raise_softirq_irqoff <-__blk_complete_request
          <idle>-0     [002] 34263.745826:
 <= 0
 <= 0
 <= 0
 <= 0
 <= 0
 <= 0
 <= 0

By returning "0" instead, the irq stack is saved as well, and we see:

          <idle>-0     [003]   883.280992: raise_softirq_irqoff <-__hrtimer_star
t_range_ns
          <idle>-0     [003]   883.280992:
 <= hrtimer_start_range_ns
 <= tick_nohz_restart_sched_tick
 <= cpu_idle
 <= start_secondary
 <=
 <= 0
 <= 0

[ Impact: record stacks from interrupts ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 arch/x86/kernel/stacktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index f7bddc2e37d1b..4aaf7e48394fb 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
 
 static int save_stack_stack(void *data, char *name)
 {
-	return -1;
+	return 0;
 }
 
 static void save_stack_address(void *data, unsigned long addr, int reliable)
-- 
GitLab


From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 14 May 2009 23:40:06 -0400
Subject: [PATCH 1140/2198] tracing: stop stack trace on first empty entry

The stack tracer stores eight entries in the ring buffer when an event
traces the stack. The output outputs all eight entries regardless of
how many entries were recorded.

This patch breaks out of the loop when a null entry is discovered.

[ Impact: only print the stack that is recorded ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8bd9a2c1a46a0..489c0e8ada097 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 	trace_assign_type(field, iter->ent);
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		if (!field->caller[i])
+			break;
 		if (i) {
 			if (!trace_seq_puts(s, " <= "))
 				goto partial;
-- 
GitLab


From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 15 May 2009 11:07:27 +0800
Subject: [PATCH 1141/2198] tracing/filters: add missing unlock in a failure
 path

[ Impact: fix deadlock in a rare case we fail to allocate memory ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 85ad6a8939ad2..22c29984fe0e7 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 		return 0;
 	}
 
+	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		return -ENOMEM;
+		goto out_unlock;
 
 	filter_disable_preds(call);
 	replace_filter_string(call->filter, filter_string);
@@ -1101,7 +1102,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
-
+out_unlock:
 	mutex_unlock(&filter_mutex);
 
 	return err;
@@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 		return 0;
 	}
 
+	err = -ENOMEM;
 	ps = kzalloc(sizeof(*ps), GFP_KERNEL);
 	if (!ps)
-		return -ENOMEM;
+		goto out_unlock;
 
 	filter_free_subsystem_preds(system);
 	replace_filter_string(system->filter, filter_string);
@@ -1145,7 +1147,7 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
 	filter_opstack_clear(ps);
 	postfix_clear(ps);
 	kfree(ps);
-
+out_unlock:
 	mutex_unlock(&filter_mutex);
 
 	return err;
-- 
GitLab


From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 15 May 2009 11:07:56 +0800
Subject: [PATCH 1142/2198] tracing/filters: fix off-by-one bug

We should leave the last slot for the ending '\0'.

[ Impact: fix possible crash when the length of an operand is 128 ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events_filter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 22c29984fe0e7..a7430b16d2439 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps)
 
 static inline int append_operand_char(struct filter_parse_state *ps, char c)
 {
-	if (ps->operand.tail == MAX_FILTER_STR_VAL)
+	if (ps->operand.tail == MAX_FILTER_STR_VAL - 1)
 		return -EINVAL;
 
 	ps->operand.string[ps->operand.tail++] = c;
-- 
GitLab


From 1a853e36871b533ccc3f3c5bdd5cd0d867043a00 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 14 May 2009 22:50:46 -0300
Subject: [PATCH 1143/2198] perf record: Allow specifying a pid to record

Allow specifying a pid instead of always fork+exec'ing a command.

Because the PERF_EVENT_COMM and PERF_EVENT_MMAP events happened before
we connected, we must synthesize them so that 'perf report' can get what
it needs.

[ Impact: add new command line option ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090515015046.GA13664@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 163 ++++++++++++++++++--
 1 file changed, 146 insertions(+), 17 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 5f5e6df0260df..efb87595f3cb0 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -34,6 +34,9 @@
 
 #include "perf.h"
 
+#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
+#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= { };
 static int			default_interval = 100000;
@@ -47,6 +50,7 @@ static char 			*output_name			= "output.perf";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
+static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			nmi				= 1;
 
@@ -180,6 +184,7 @@ static void display_help(void)
 	" -c CNT    --count=CNT          # event period to sample\n"
 	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
 	" -o file   --output=<file>      # output file\n"
+	" -p pid    --pid=<pid>		 # record events on existing pid\n"
 	" -r prio   --realtime=<prio>    # use RT prio\n"
 	" -s        --system             # system wide profiling\n"
 	);
@@ -199,13 +204,14 @@ static void process_options(int argc, const char *argv[])
 			{"event",	required_argument,	NULL, 'e'},
 			{"mmap_pages",	required_argument,	NULL, 'm'},
 			{"output",	required_argument,	NULL, 'o'},
+			{"pid",		required_argument,	NULL, 'p'},
 			{"realtime",	required_argument,	NULL, 'r'},
 			{"system",	no_argument,		NULL, 's'},
 			{"inherit",	no_argument,		NULL, 'i'},
 			{"nmi",		no_argument,		NULL, 'n'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin",
+		int c = getopt_long(argc, argv, "+:c:e:m:o:p:r:sin",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -215,6 +221,7 @@ static void process_options(int argc, const char *argv[])
 		case 'e': error				= parse_events(optarg); break;
 		case 'm': mmap_pages			=   atoi(optarg); break;
 		case 'o': output_name			= strdup(optarg); break;
+		case 'p': target_pid			=   atoi(optarg); break;
 		case 'r': realtime_prio			=   atoi(optarg); break;
 		case 's': system_wide                   ^=             1; break;
 		case 'i': inherit			^=	       1; break;
@@ -223,7 +230,7 @@ static void process_options(int argc, const char *argv[])
 		}
 	}
 
-	if (argc - optind == 0)
+	if (argc - optind == 0 && target_pid == -1)
 		error = 1;
 
 	if (error)
@@ -350,15 +357,135 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 static int nr_poll;
 static int nr_cpu;
 
-static void open_counters(int cpu)
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid,tid;
+	char comm[16];
+};
+
+static pid_t pid_synthesize_comm_event(pid_t pid)
+{
+	char filename[PATH_MAX];
+	char bf[BUFSIZ];
+	struct comm_event comm_ev;
+	size_t size;
+	int fd;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
+
+	fd = open(filename, O_RDONLY);
+	if (fd < 0) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	if (read(fd, bf, sizeof(bf)) < 0) {
+		fprintf(stderr, "couldn't read %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	close(fd);
+
+	pid_t spid, ppid;
+	char state;
+	char comm[18];
+
+	memset(&comm_ev, 0, sizeof(comm_ev));
+        int nr = sscanf(bf, "%d %s %c %d %d ",
+			&spid, comm, &state, &ppid, &comm_ev.pid);
+	if (nr != 5) {
+		fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+			filename);
+		exit(EXIT_FAILURE);
+	}
+	comm_ev.header.type = PERF_EVENT_COMM;
+	comm_ev.tid = pid;
+	size = strlen(comm);
+	comm[--size] = '\0'; /* Remove the ')' at the end */
+	--size; /* Remove the '(' at the begin */
+	memcpy(comm_ev.comm, comm + 1, size);
+	size = ALIGN(size, sizeof(uint64_t));
+	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
+	int ret = write(output, &comm_ev, comm_ev.header.size);
+	if (ret < 0) {
+		perror("failed to write");
+		exit(-1);
+	}
+	return comm_ev.pid;
+}
+
+static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
+{
+	char filename[PATH_MAX];
+	FILE *fp;
+
+	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid);
+
+	fp = fopen(filename, "r");
+	if (fp == NULL) {
+		fprintf(stderr, "couldn't open %s\n", filename);
+		exit(EXIT_FAILURE);
+	}
+	while (1) {
+		char bf[BUFSIZ];
+		unsigned char vm_read, vm_write, vm_exec, vm_mayshare;
+		struct mmap_event mmap_ev = {
+			.header.type = PERF_EVENT_MMAP,
+		};
+		unsigned long ino;
+		int major, minor;
+		size_t size;
+		if (fgets(bf, sizeof(bf), fp) == NULL)
+			break;
+
+		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
+		sscanf(bf, "%llx-%llx %c%c%c%c %llx %x:%x %lu",
+			&mmap_ev.start, &mmap_ev.len,
+                        &vm_read, &vm_write, &vm_exec, &vm_mayshare,
+                        &mmap_ev.pgoff, &major, &minor, &ino);
+		if (vm_exec == 'x') {
+			char *execname = strrchr(bf, ' ');
+
+			if (execname == NULL || execname[1] != '/')
+				continue;
+
+			execname += 1;
+			size = strlen(execname);
+			execname[size - 1] = '\0'; /* Remove \n */
+			memcpy(mmap_ev.filename, execname, size);
+			size = ALIGN(size, sizeof(uint64_t));
+			mmap_ev.len -= mmap_ev.start;
+			mmap_ev.header.size = (sizeof(mmap_ev) -
+					       (sizeof(mmap_ev.filename) - size));
+			mmap_ev.pid = pgid;
+			mmap_ev.tid = pid;
+
+			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
+				perror("failed to write");
+				exit(-1);
+			}
+		}
+	}
+
+	fclose(fp);
+}
+
+static void open_counters(int cpu, pid_t pid)
 {
 	struct perf_counter_hw_event hw_event;
 	int counter, group_fd;
 	int track = 1;
-	pid_t pid = -1;
 
-	if (cpu < 0)
-		pid = 0;
+	if (pid > 0) {
+		pid_t pgid = pid_synthesize_comm_event(pid);
+		pid_synthesize_mmap_events(pid, pgid);
+	}
 
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++) {
@@ -435,22 +562,24 @@ int cmd_record(int argc, const char **argv)
 	argc -= optind;
 	argv += optind;
 
-	if (!system_wide)
-		open_counters(-1);
-	else for (i = 0; i < nr_cpus; i++)
-		open_counters(i);
+	if (!system_wide) {
+		open_counters(-1, target_pid != -1 ? target_pid : 0);
+	} else for (i = 0; i < nr_cpus; i++)
+		open_counters(i, target_pid);
 
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
-	pid = fork();
-	if (pid < 0)
-		perror("failed to fork");
+	if (target_pid == -1) {
+		pid = fork();
+		if (pid < 0)
+			perror("failed to fork");
 
-	if (!pid) {
-		if (execvp(argv[0], argv)) {
-			perror(argv[0]);
-			exit(-1);
+		if (!pid) {
+			if (execvp(argv[0], argv)) {
+				perror(argv[0]);
+				exit(-1);
+			}
 		}
 	}
 
-- 
GitLab


From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 09:45:19 +0200
Subject: [PATCH 1144/2198] perf_counter: x86: More accurate counter update

Take the counter width into account instead of assuming 32 bits.

In particular Nehalem has 44 bit wide counters, and all
arithmetics should happen on a 44-bit signed integer basis.

[ Impact: fix rare event imprecision, warning message on Nehalem ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f7772ff7936ed..3a92a2b2a80f5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -138,7 +138,9 @@ static u64
 x86_perf_counter_update(struct perf_counter *counter,
 			struct hw_perf_counter *hwc, int idx)
 {
-	u64 prev_raw_count, new_raw_count, delta;
+	int shift = 64 - x86_pmu.counter_bits;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
 
 	/*
 	 * Careful: an NMI might modify the previous counter value.
@@ -161,9 +163,10 @@ x86_perf_counter_update(struct perf_counter *counter,
 	 * (counter-)time and add that to the generic counter.
 	 *
 	 * Careful, not all hw sign-extends above the physical width
-	 * of the count, so we do that by clipping the delta to 32 bits:
+	 * of the count.
 	 */
-	delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+	delta = (new_raw_count << shift) - (prev_raw_count << shift);
+	delta >>= shift;
 
 	atomic64_add(delta, &counter->count);
 	atomic64_sub(delta, &hwc->period_left);
-- 
GitLab


From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 13 May 2009 12:54:01 +0200
Subject: [PATCH 1145/2198] perf_counter: x86: Fix throttling

If counters are disabled globally when a perfcounter IRQ/NMI hits,
and if we throttle in that case, we'll promote the '0' value to
the next lapic IRQ and disable all perfcounters at that point,
permanently ...

Fix it.

[ Impact: fix hung perfcounters under load ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3a92a2b2a80f5..88ae8cebf3c1c 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -765,8 +765,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	/*
 	 * Restore - do not reenable when global enable is off or throttled:
 	 */
-	if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS)
-		intel_pmu_restore_all(cpuc->throttle_ctrl);
+	if (cpuc->throttle_ctrl) {
+		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) {
+			intel_pmu_restore_all(cpuc->throttle_ctrl);
+		} else {
+			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id());
+		}
+	}
 
 	return ret;
 }
@@ -817,11 +822,16 @@ void perf_counter_unthrottle(void)
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		if (printk_ratelimit())
-			printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n");
+		pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id());
+
+		/*
+		 * Clear them before re-enabling irqs/NMIs again:
+		 */
+		cpuc->interrupts = 0;
 		hw_perf_restore(cpuc->throttle_ctrl);
+	} else {
+		cpuc->interrupts = 0;
 	}
-	cpuc->interrupts = 0;
 }
 
 void smp_perf_counter_interrupt(struct pt_regs *regs)
-- 
GitLab


From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 10:02:57 +0200
Subject: [PATCH 1146/2198] perf_counter: x86: Allow unpriviliged use of NMIs

Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter
creation instead of silently down-grading to regular interrupts.

[ Impact: allow wider perf-counter usage ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 88ae8cebf3c1c..c19e927b6979b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	 * If privileged enough, allow NMI events:
 	 */
 	hwc->nmi = 0;
-	if (capable(CAP_SYS_ADMIN) && hw_event->nmi)
+	if (hw_event->nmi) {
+		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
 		hwc->nmi = 1;
+	}
 
 	hwc->irq_period	= hw_event->irq_period;
 	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
-- 
GitLab


From 53020fe81eecd0b7be295868ce5850ef8f41074e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 21:26:19 +0200
Subject: [PATCH 1147/2198] perf_counter: Fix perf_output_copy() WARN to
 account for overflow

The simple reservation test in perf_output_copy() failed to take
unsigned int overflow into account, fix this.

[ Impact: fix false positive warning with more than 4GB of profiling data ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff166c11b69ae..985be0b662af9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1927,7 +1927,11 @@ static void perf_output_copy(struct perf_output_handle *handle,
 
 	handle->offset = offset;
 
-	WARN_ON_ONCE(handle->offset > handle->head);
+	/*
+	 * Check we didn't copy past our reservation window, taking the
+	 * possible unsigned int wrap into account.
+	 */
+	WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0);
 }
 
 #define perf_output_put(handle, x) \
-- 
GitLab


From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 13:21:36 +0200
Subject: [PATCH 1148/2198] perf_counter: x86: Fix up the amd NMI/INT throttle

perf_counter_unthrottle() restores throttle_ctrl, buts its never set.
Also, we fail to disable all counters when throttling.

[ Impact: fix rare stuck perf-counters when they are throttled ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++----------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c19e927b6979b..7601c014f8f64 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void)
 	 * right thing.
 	 */
 	barrier();
+	if (!enabled)
+		goto out;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
@@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void)
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
 
+out:
 	return enabled;
 }
 
@@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	int handled = 0;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int idx;
+	int idx, throttle = 0;
+
+	cpuc->throttle_ctrl = cpuc->enabled;
+	cpuc->enabled = 0;
+	barrier();
+
+	if (cpuc->throttle_ctrl) {
+		if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
+			throttle = 1;
+	}
 
-	++cpuc->interrupts;
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		int disable = 0;
+
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
+
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			continue;
+			goto next;
+
 		/* counter overflow */
 		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
-		if (perf_counter_overflow(counter, nmi, regs, 0))
-			amd_pmu_disable_counter(hwc, idx);
-		else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
-			/*
-			 * do not reenable when throttled, but reload
-			 * the register
-			 */
+		disable = perf_counter_overflow(counter, nmi, regs, 0);
+
+next:
+		if (disable || throttle)
 			amd_pmu_disable_counter(hwc, idx);
-		else if (counter->state == PERF_COUNTER_STATE_ACTIVE)
-			amd_pmu_enable_counter(hwc, idx);
 	}
+
+	if (cpuc->throttle_ctrl && !throttle)
+		cpuc->enabled = 1;
+
 	return handled;
 }
 
-- 
GitLab


From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 13 May 2009 16:21:38 +0200
Subject: [PATCH 1149/2198] perf_counter: Rework the perf counter
 disable/enable

The current disable/enable mechanism is:

	token = hw_perf_save_disable();
	...
	/* do bits */
	...
	hw_perf_restore(token);

This works well, provided that the use nests properly. Except we don't.

x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore
provide a reference counter disable/enable interface, where the first disable
disables the hardware, and the last enable enables the hardware again.

[ Impact: refactor, simplify the PMU disable/enable logic ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  24 +++---
 arch/x86/kernel/cpu/perf_counter.c | 113 +++++++++++------------------
 drivers/acpi/processor_idle.c      |   6 +-
 include/linux/perf_counter.h       |  10 ++-
 kernel/perf_counter.c              |  76 +++++++++++--------
 5 files changed, 109 insertions(+), 120 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 15cdc8e672296..bb1b463c1361b 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
  * Disable all counters to prevent PMU interrupts and to allow
  * counters to be added or removed.
  */
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long ret;
@@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void)
 		mb();
 	}
 	local_irq_restore(flags);
-	return ret;
 }
 
 /*
@@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void)
  * If we were previously disabled and counters were added, then
  * put the new config on the PMU.
  */
-void hw_perf_restore(u64 disable)
+void hw_perf_enable(void)
 {
 	struct perf_counter *counter;
 	struct cpu_hw_counters *cpuhw;
@@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable)
 	int n_lim;
 	int idx;
 
-	if (disable)
-		return;
 	local_irq_save(flags);
+	if (!cpuhw->disabled) {
+		local_irq_restore(flags);
+		return;
+	}
+
 	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	cpuhw->disabled = 0;
 
@@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader,
 /*
  * Add a counter to the PMU.
  * If all counters are not already frozen, then we disable and
- * re-enable the PMU in order to get hw_perf_restore to do the
+ * re-enable the PMU in order to get hw_perf_enable to do the
  * actual work of reconfiguring the PMU.
  */
 static int power_pmu_enable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	unsigned long flags;
-	u64 pmudis;
 	int n0;
 	int ret = -EAGAIN;
 
 	local_irq_save(flags);
-	pmudis = hw_perf_save_disable();
+	perf_disable();
 
 	/*
 	 * Add the counter to the list (if there is room)
@@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter)
 
 	ret = 0;
  out:
-	hw_perf_restore(pmudis);
+	perf_enable();
 	local_irq_restore(flags);
 	return ret;
 }
@@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter)
 {
 	struct cpu_hw_counters *cpuhw;
 	long i;
-	u64 pmudis;
 	unsigned long flags;
 
 	local_irq_save(flags);
-	pmudis = hw_perf_save_disable();
+	perf_disable();
 
 	power_pmu_read(counter);
 
@@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter)
 		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE);
 	}
 
-	hw_perf_restore(pmudis);
+	perf_enable();
 	local_irq_restore(flags);
 }
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7601c014f8f64..313638cecbb54 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -31,7 +31,6 @@ struct cpu_hw_counters {
 	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	unsigned long		interrupts;
-	u64			throttle_ctrl;
 	int			enabled;
 };
 
@@ -42,8 +41,8 @@ struct x86_pmu {
 	const char	*name;
 	int		version;
 	int		(*handle_irq)(struct pt_regs *, int);
-	u64		(*save_disable_all)(void);
-	void		(*restore_all)(u64);
+	void		(*disable_all)(void);
+	void		(*enable_all)(void);
 	void		(*enable)(struct hw_perf_counter *, int);
 	void		(*disable)(struct hw_perf_counter *, int);
 	unsigned	eventsel;
@@ -56,6 +55,7 @@ struct x86_pmu {
 	int		counter_bits;
 	u64		counter_mask;
 	u64		max_period;
+	u64		intel_ctrl;
 };
 
 static struct x86_pmu x86_pmu __read_mostly;
@@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
-static u64 intel_pmu_save_disable_all(void)
+static void intel_pmu_disable_all(void)
 {
-	u64 ctrl;
-
-	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
-
-	return ctrl;
 }
 
-static u64 amd_pmu_save_disable_all(void)
+static void amd_pmu_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	int enabled, idx;
+	int idx;
+
+	if (!cpuc->enabled)
+		return;
 
-	enabled = cpuc->enabled;
 	cpuc->enabled = 0;
 	/*
 	 * ensure we write the disable before we start disabling the
@@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void)
 	 * right thing.
 	 */
 	barrier();
-	if (!enabled)
-		goto out;
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
@@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void)
 		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
-
-out:
-	return enabled;
 }
 
-u64 hw_perf_save_disable(void)
+void hw_perf_disable(void)
 {
 	if (!x86_pmu_initialized())
-		return 0;
-	return x86_pmu.save_disable_all();
+		return;
+	return x86_pmu.disable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
-static void intel_pmu_restore_all(u64 ctrl)
+static void intel_pmu_enable_all(void)
 {
-	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl);
+	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 }
 
-static void amd_pmu_restore_all(u64 ctrl)
+static void amd_pmu_enable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
 	int idx;
 
-	cpuc->enabled = ctrl;
-	barrier();
-	if (!ctrl)
+	if (cpuc->enabled)
 		return;
 
+	cpuc->enabled = 1;
+	barrier();
+
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		u64 val;
 
@@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl)
 	}
 }
 
-void hw_perf_restore(u64 ctrl)
+void hw_perf_enable(void)
 {
 	if (!x86_pmu_initialized())
 		return;
-	x86_pmu.restore_all(ctrl);
+	x86_pmu.enable_all();
 }
-/*
- * Exported because of ACPI idle
- */
-EXPORT_SYMBOL_GPL(hw_perf_restore);
 
 static inline u64 intel_pmu_get_status(void)
 {
@@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	int bit, cpu = smp_processor_id();
 	u64 ack, status;
 	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-	int ret = 0;
-
-	cpuc->throttle_ctrl = intel_pmu_save_disable_all();
 
+	perf_disable();
 	status = intel_pmu_get_status();
-	if (!status)
-		goto out;
+	if (!status) {
+		perf_enable();
+		return 0;
+	}
 
-	ret = 1;
 again:
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
@@ -767,19 +751,11 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	status = intel_pmu_get_status();
 	if (status)
 		goto again;
-out:
-	/*
-	 * Restore - do not reenable when global enable is off or throttled:
-	 */
-	if (cpuc->throttle_ctrl) {
-		if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) {
-			intel_pmu_restore_all(cpuc->throttle_ctrl);
-		} else {
-			pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id());
-		}
-	}
 
-	return ret;
+	if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
+		perf_enable();
+
+	return 1;
 }
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
@@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	struct hw_perf_counter *hwc;
 	int idx, throttle = 0;
 
-	cpuc->throttle_ctrl = cpuc->enabled;
-	cpuc->enabled = 0;
-	barrier();
-
-	if (cpuc->throttle_ctrl) {
-		if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS)
-			throttle = 1;
+	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
+		throttle = 1;
+		__perf_disable();
+		cpuc->enabled = 0;
+		barrier();
 	}
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
@@ -824,9 +798,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
-	if (cpuc->throttle_ctrl && !throttle)
-		cpuc->enabled = 1;
-
 	return handled;
 }
 
@@ -839,13 +810,11 @@ void perf_counter_unthrottle(void)
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id());
-
 		/*
 		 * Clear them before re-enabling irqs/NMIs again:
 		 */
 		cpuc->interrupts = 0;
-		hw_perf_restore(cpuc->throttle_ctrl);
+		perf_enable();
 	} else {
 		cpuc->interrupts = 0;
 	}
@@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 static struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
-	.save_disable_all	= intel_pmu_save_disable_all,
-	.restore_all		= intel_pmu_restore_all,
+	.disable_all		= intel_pmu_disable_all,
+	.enable_all		= intel_pmu_enable_all,
 	.enable			= intel_pmu_enable_counter,
 	.disable		= intel_pmu_disable_counter,
 	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0,
@@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = {
 static struct x86_pmu amd_pmu = {
 	.name			= "AMD",
 	.handle_irq		= amd_pmu_handle_irq,
-	.save_disable_all	= amd_pmu_save_disable_all,
-	.restore_all		= amd_pmu_restore_all,
+	.disable_all		= amd_pmu_disable_all,
+	.enable_all		= amd_pmu_enable_all,
 	.enable			= amd_pmu_enable_counter,
 	.disable		= amd_pmu_disable_counter,
 	.eventsel		= MSR_K7_EVNTSEL0,
@@ -1003,6 +972,8 @@ static int intel_pmu_init(void)
 	x86_pmu.counter_bits = eax.split.bit_width;
 	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
 
+	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
+
 	return 0;
 }
 
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index d2830f39d46b4..9645758c04721 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -763,11 +763,9 @@ static int acpi_idle_bm_check(void)
  */
 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
-	u64 perf_flags;
-
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -782,7 +780,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	hw_perf_restore(perf_flags);
+	perf_enable();
 	start_critical_timings();
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 614f921d616ab..e543ecc129f12 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void perf_counter_unthrottle(void);
-extern u64 hw_perf_save_disable(void);
-extern void hw_perf_restore(u64 ctrl);
+extern void __perf_disable(void);
+extern bool __perf_enable(void);
+extern void perf_disable(void);
+extern void perf_enable(void);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
 extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
@@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_counter_unthrottle(void)			{ }
-static inline void hw_perf_restore(u64 ctrl)				{ }
-static inline u64 hw_perf_save_disable(void)		      { return 0; }
+static inline void perf_disable(void)					{ }
+static inline void perf_enable(void)					{ }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
 static inline int perf_counter_task_enable(void)	{ return -EINVAL; }
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 985be0b662af9..e814ff04d7ca0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte
 	return NULL;
 }
 
-u64 __weak hw_perf_save_disable(void)		{ return 0; }
-void __weak hw_perf_restore(u64 ctrl)		{ barrier(); }
+void __weak hw_perf_disable(void)		{ barrier(); }
+void __weak hw_perf_enable(void)		{ barrier(); }
+
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
 int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
@@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
 
 void __weak perf_counter_print_debug(void)	{ }
 
+static DEFINE_PER_CPU(int, disable_count);
+
+void __perf_disable(void)
+{
+	__get_cpu_var(disable_count)++;
+}
+
+bool __perf_enable(void)
+{
+	return !--__get_cpu_var(disable_count);
+}
+
+void perf_disable(void)
+{
+	__perf_disable();
+	hw_perf_disable();
+}
+EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */
+
+void perf_enable(void)
+{
+	if (__perf_enable())
+		hw_perf_enable();
+}
+EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */
+
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
-	u64 perf_flags;
 
 	/*
 	 * If this is a task context, we need to check whether it is
@@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 	list_del_counter(counter, ctx);
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	if (!ctx->task) {
 		/*
@@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter *leader = counter->group_leader;
 	int cpu = smp_processor_id();
 	unsigned long flags;
-	u64 perf_flags;
 	int err;
 
 	/*
@@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info)
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level. NOP for non NMI based counters.
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 
 	add_counter_to_ctx(counter, ctx);
 
@@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info)
 		cpuctx->max_pertask--;
 
  unlock:
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
 }
@@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
-	unsigned long pmuflags;
 	unsigned long flags;
 	int err;
 
@@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info)
 	if (!group_can_go_on(counter, cpuctx, 1)) {
 		err = -EEXIST;
 	} else {
-		pmuflags = hw_perf_save_disable();
+		perf_disable();
 		if (counter == leader)
 			err = group_sched_in(counter, cpuctx, ctx,
 					     smp_processor_id());
 		else
 			err = counter_sched_in(counter, cpuctx, ctx,
 					       smp_processor_id());
-		hw_perf_restore(pmuflags);
+		perf_enable();
 	}
 
 	if (err) {
@@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 			      struct perf_cpu_context *cpuctx)
 {
 	struct perf_counter *counter;
-	u64 flags;
 
 	spin_lock(&ctx->lock);
 	ctx->is_active = 0;
@@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 		goto out;
 	update_context_time(ctx);
 
-	flags = hw_perf_save_disable();
+	perf_disable();
 	if (ctx->nr_active) {
 		list_for_each_entry(counter, &ctx->counter_list, list_entry)
 			group_sched_out(counter, cpuctx, ctx);
 	}
-	hw_perf_restore(flags);
+	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
 }
@@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 			struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct perf_counter *counter;
-	u64 flags;
 	int can_add_hw = 1;
 
 	spin_lock(&ctx->lock);
@@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 
 	ctx->timestamp = perf_clock();
 
-	flags = hw_perf_save_disable();
+	perf_disable();
 
 	/*
 	 * First go through the list and put on any pinned groups
@@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 				can_add_hw = 0;
 		}
 	}
-	hw_perf_restore(flags);
+	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
 }
@@ -955,7 +977,6 @@ int perf_counter_task_disable(void)
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
 	unsigned long flags;
-	u64 perf_flags;
 
 	if (likely(!ctx->nr_counters))
 		return 0;
@@ -969,7 +990,7 @@ int perf_counter_task_disable(void)
 	/*
 	 * Disable all the counters:
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (counter->state != PERF_COUNTER_STATE_ERROR) {
@@ -978,7 +999,7 @@ int perf_counter_task_disable(void)
 		}
 	}
 
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
 
@@ -991,7 +1012,6 @@ int perf_counter_task_enable(void)
 	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
 	struct perf_counter *counter;
 	unsigned long flags;
-	u64 perf_flags;
 	int cpu;
 
 	if (likely(!ctx->nr_counters))
@@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void)
 	/*
 	 * Disable all the counters:
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (counter->state > PERF_COUNTER_STATE_OFF)
@@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void)
 			ctx->time - counter->total_time_enabled;
 		counter->hw_event.disabled = 0;
 	}
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock(&ctx->lock);
 
@@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void)
 static void rotate_ctx(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
-	u64 perf_flags;
 
 	if (!ctx->nr_counters)
 		return;
@@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx)
 	/*
 	 * Rotate the first entry last (works just fine for group counters too):
 	 */
-	perf_flags = hw_perf_save_disable();
+	perf_disable();
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		list_move_tail(&counter->list_entry, &ctx->counter_list);
 		break;
 	}
-	hw_perf_restore(perf_flags);
+	perf_enable();
 
 	spin_unlock(&ctx->lock);
 }
@@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child,
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
-		u64 perf_flags;
 
 		/*
 		 * Disable and unlink this counter.
@@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		 * could still be processing it:
 		 */
 		local_irq_save(flags);
-		perf_flags = hw_perf_save_disable();
+		perf_disable();
 
 		cpuctx = &__get_cpu_var(perf_cpu_context);
 
@@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child,
 
 		child_ctx->nr_counters--;
 
-		hw_perf_restore(perf_flags);
+		perf_enable();
 		local_irq_restore(flags);
 	}
 
-- 
GitLab


From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 14 May 2009 14:52:17 +0200
Subject: [PATCH 1150/2198] perf_counter: x86: Robustify interrupt handling

Two consecutive NMIs could daze and confuse the machine when the
first would handle the overflow of both counters.

[ Impact: fix false-positive syslog messages under multi-session profiling ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 313638cecbb54..1dcf67057f161 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
+
+		if (counter->hw_event.nmi != nmi)
+			goto next;
+
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			goto next;
@@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self,
 {
 	struct die_args *args = __args;
 	struct pt_regs *regs;
-	int ret;
 
 	if (!atomic_read(&active_counters))
 		return NOTIFY_DONE;
@@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	regs = args->regs;
 
 	apic_write(APIC_LVTPC, APIC_DM_NMI);
-	ret = x86_pmu.handle_irq(regs, 1);
+	/*
+	 * Can't rely on the handled return value to say it was our NMI, two
+	 * counters could trigger 'simultaneously' raising two back-to-back NMIs.
+	 *
+	 * If the first NMI handles both, the latter will be empty and daze
+	 * the CPU.
+	 */
+	x86_pmu.handle_irq(regs, 1);
 
-	return ret ? NOTIFY_STOP : NOTIFY_OK;
+	return NOTIFY_STOP;
 }
 
 static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
-- 
GitLab


From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 08:25:22 +0200
Subject: [PATCH 1151/2198] perf_counter: x86: Disallow interval of 1

On certain CPUs i have observed a stuck PMU if interval was set to
1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS,
but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL,
and the NMI loop got stuck infinitely.

[ Impact: fix rare hangs during high perfcounter load ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1dcf67057f161..46a82d1e4cbec 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 		left += period;
 		atomic64_set(&hwc->period_left, left);
 	}
+	/*
+	 * Quirk: certain CPUs dont like it if just 1 event is left:
+	 */
+	if (unlikely(left < 2))
+		left = 2;
 
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
-- 
GitLab


From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 08:26:20 +0200
Subject: [PATCH 1152/2198] perf_counter: x86: Protect against infinite loops
 in intel_pmu_handle_irq()

intel_pmu_handle_irq() can lock up in an infinite loop if the hardware
does not allow the acking of irqs. Alas, this happened in testing so
make this robust and emit a warning if it happens in the future.

Also, clean up the IRQ handlers a bit.

[ Impact: improve perfcounter irq/nmi handling robustness ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 46a82d1e4cbec..5a7f718eb1e15 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int bit, cpu = smp_processor_id();
+	struct cpu_hw_counters *cpuc;
+	struct cpu_hw_counters;
+	int bit, cpu, loops;
 	u64 ack, status;
-	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	perf_disable();
 	status = intel_pmu_get_status();
@@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		return 0;
 	}
 
+	loops = 0;
 again:
+	if (++loops > 100) {
+		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		return 1;
+	}
+
 	inc_irq_stat(apic_perf_irqs);
 	ack = status;
 	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
@@ -765,13 +775,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int cpu = smp_processor_id();
-	struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu);
-	u64 val;
-	int handled = 0;
+	int cpu, idx, throttle = 0, handled = 0;
+	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int idx, throttle = 0;
+	u64 val;
+
+	cpu = smp_processor_id();
+	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
 	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
 		throttle = 1;
-- 
GitLab


From 251e8e3c7235f5944805a64f24c79fc4696793f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 May 2009 05:16:59 +0200
Subject: [PATCH 1153/2198] perf_counter: Remove ACPI quirk

We had a disable/enable around acpi_idle_do_entry() due to an erratum
in an early prototype CPU i had access to. That erratum has been fixed
in the BIOS so remove the quirk.

The quirk also kept us from profiling interrupts that hit the ACPI idle
instruction - so this is an improvement as well, beyond a cleanup and
a micro-optimization.

[ Impact: improve profiling scope, cleanup, micro-optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/acpi/processor_idle.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index 9645758c04721..f7ca8c55956bd 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -765,7 +765,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 {
 	/* Don't trace irqs off for idle */
 	stop_critical_timings();
-	perf_disable();
 	if (cx->entry_method == ACPI_CSTATE_FFH) {
 		/* Call into architectural FFH based C-state */
 		acpi_processor_ffh_cstate_enter(cx);
@@ -780,7 +779,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
 		   gets asserted in time to freeze execution properly. */
 		unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
 	}
-	perf_enable();
 	start_critical_timings();
 }
 
-- 
GitLab


From 58d7e993b16b62d30f8ef27757614056fe4def11 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 15 May 2009 11:03:23 +0200
Subject: [PATCH 1154/2198] perf stat: handle Ctrl-C

Before this change, if a long-running perf stat workload was Ctrl-C-ed,
the utility exited without displaying statistics.

After the change, the Ctrl-C gets propagated into the workload (and
causes its early exit there), but perf stat itself will still continue
to run and will display counter results.

This is useful to run open-ended workloads, let them run for
a while, then Ctrl-C them to get the stats.

[ Impact: extend perf stat with new functionality ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index cf575c305a6c5..03518d75d864d 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -538,8 +538,14 @@ static void process_options(int argc, char **argv)
 	}
 }
 
+static void skip_signal(int signo)
+{
+}
+
 int cmd_stat(int argc, char **argv, const char *prefix)
 {
+	sigset_t blocked;
+
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	process_options(argc, argv);
@@ -548,5 +554,15 @@ int cmd_stat(int argc, char **argv, const char *prefix)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
+	/*
+	 * We dont want to block the signals - that would cause
+	 * child tasks to inherit that and Ctrl-C would not work.
+	 * What we want is for Ctrl-C to work in the exec()-ed
+	 * task, but being ignored by perf stat itself:
+	 */
+	signal(SIGINT,  skip_signal);
+	signal(SIGALRM, skip_signal);
+	signal(SIGABRT, skip_signal);
+
 	return do_perfstat(argc, argv);
 }
-- 
GitLab


From f1a11e0576c7a73d759d05d776692b2b2d37172b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 5 May 2009 19:21:40 +0200
Subject: [PATCH 1155/2198] futex: remove the wait queue

The waitqueue which is used in struct futex_q is a leftover from the
futexfd implementation. There is no need to use a waitqueue at all, as
the waiting task is the only user of it. The waitqueue just adds
additional locking and a loop in the wake up path which both can be
avoided.

We have already a task reference in struct futex_q which is used for
PI futexes. Use it for normal futexes as well and just wake up the
task directly.

The logic of signalling the futex wakeup via setting q->lock_ptr to
NULL is kept with the difference that we set it NULL before doing the
wakeup. This opens an exit race window vs. a non futex wake up of the
to be woken up task, which we prevent with get_task_struct /
put_task_struct on the waiter.

[ Impact: simplification ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 58 ++++++++++++++++++++++----------------------------
 1 file changed, 25 insertions(+), 33 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index aec8bf89bf4e7..157bfcd725b88 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -100,8 +100,8 @@ struct futex_pi_state {
  */
 struct futex_q {
 	struct plist_node list;
-	/* There can only be a single waiter */
-	wait_queue_head_t waiter;
+	/* Waiter reference */
+	struct task_struct *task;
 
 	/* Which hash list lock to use: */
 	spinlock_t *lock_ptr;
@@ -111,7 +111,6 @@ struct futex_q {
 
 	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
-	struct task_struct *task;
 
 	/* rt_waiter storage for requeue_pi: */
 	struct rt_mutex_waiter *rt_waiter;
@@ -694,22 +693,29 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
  */
 static void wake_futex(struct futex_q *q)
 {
-	plist_del(&q->list, &q->list.plist);
+	struct task_struct *p = q->task;
+
 	/*
-	 * The lock in wake_up_all() is a crucial memory barrier after the
-	 * plist_del() and also before assigning to q->lock_ptr.
+	 * We set q->lock_ptr = NULL _before_ we wake up the task. If
+	 * a non futex wake up happens on another CPU then the task
+	 * might exit and p would dereference a non existing task
+	 * struct. Prevent this by holding a reference on p across the
+	 * wake up.
 	 */
-	wake_up(&q->waiter);
+	get_task_struct(p);
+
+	plist_del(&q->list, &q->list.plist);
 	/*
-	 * The waiting task can free the futex_q as soon as this is written,
-	 * without taking any locks.  This must come last.
-	 *
-	 * A memory barrier is required here to prevent the following store to
-	 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
-	 * end of wake_up() does not prevent this store from moving.
+	 * The waiting task can free the futex_q as soon as
+	 * q->lock_ptr = NULL is written, without taking any locks. A
+	 * memory barrier is required here to prevent the following
+	 * store to lock_ptr from getting ahead of the plist_del.
 	 */
 	smp_wmb();
 	q->lock_ptr = NULL;
+
+	wake_up_state(p, TASK_NORMAL);
+	put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -1003,7 +1009,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
 	WARN_ON(!q->rt_waiter);
 	q->rt_waiter = NULL;
 
-	wake_up(&q->waiter);
+	wake_up_state(q->task, TASK_NORMAL);
 }
 
 /**
@@ -1280,8 +1286,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
 	struct futex_hash_bucket *hb;
 
-	init_waitqueue_head(&q->waiter);
-
 	get_futex_key_refs(&q->key);
 	hb = hash_futex(&q->key);
 	q->lock_ptr = &hb->lock;
@@ -1575,11 +1579,9 @@ static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
  * @hb:		the futex hash bucket, must be locked by the caller
  * @q:		the futex_q to queue up on
  * @timeout:	the prepared hrtimer_sleeper, or null for no timeout
- * @wait:	the wait_queue to add to the futex_q after queueing in the hb
  */
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
-				struct hrtimer_sleeper *timeout,
-				wait_queue_t *wait)
+				struct hrtimer_sleeper *timeout)
 {
 	queue_me(q, hb);
 
@@ -1587,19 +1589,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	 * There might have been scheduling since the queue_me(), as we
 	 * cannot hold a spinlock across the get_user() in case it
 	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-	 * queueing ourselves into the futex hash.  This code thus has to
+	 * queueing ourselves into the futex hash. This code thus has to
 	 * rely on the futex_wake() code removing us from hash when it
 	 * wakes us up.
 	 */
-
-	/* add_wait_queue is the barrier after __set_current_state. */
-	__set_current_state(TASK_INTERRUPTIBLE);
-
-	/*
-	 * Add current as the futex_q waiter.  We don't remove ourselves from
-	 * the wait_queue because we are the only user of it.
-	 */
-	add_wait_queue(&q->waiter, wait);
+	set_current_state(TASK_INTERRUPTIBLE);
 
 	/* Arm the timer */
 	if (timeout) {
@@ -1704,7 +1698,6 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
-	DECLARE_WAITQUEUE(wait, current);
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
@@ -1733,7 +1726,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		goto out;
 
 	/* queue_me and wait for wakeup, timeout, or a signal. */
-	futex_wait_queue_me(hb, &q, to, &wait);
+	futex_wait_queue_me(hb, &q, to);
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
 	ret = 0;
@@ -2147,7 +2140,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct rt_mutex *pi_mutex = NULL;
-	DECLARE_WAITQUEUE(wait, current);
 	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
 	union futex_key key2;
@@ -2191,7 +2183,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	}
 
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
-	futex_wait_queue_me(hb, &q, to, &wait);
+	futex_wait_queue_me(hb, &q, to);
 
 	spin_lock(&hb->lock);
 	ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
-- 
GitLab


From 548e1ddf255b4ebfb4ef20c08936fd8d4deb3bd9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:26 +0200
Subject: [PATCH 1156/2198] perf_counter: remove perf_disable/enable exports

Now that ACPI idle doesn't use it anymore, remove the exports.

[ Impact: remove dead code/data ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.429826617@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e814ff04d7ca0..0173738dd548e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -90,14 +90,12 @@ void perf_disable(void)
 	__perf_disable();
 	hw_perf_disable();
 }
-EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */
 
 void perf_enable(void)
 {
 	if (__perf_enable())
 		hw_perf_enable();
 }
-EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */
 
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
-- 
GitLab


From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:27 +0200
Subject: [PATCH 1157/2198] perf_counter: per user mlock gift

Instead of a per-process mlock gift for perf-counters, use a
per-user gift so that there is less of a DoS potential.

[ Impact: allow less worst-case unprivileged memory consumption ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.496182835@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  4 ++++
 kernel/perf_counter.c | 22 +++++++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d1857580a1328..ff59d12315192 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -674,6 +674,10 @@ struct user_struct {
 	struct work_struct work;
 #endif
 #endif
+
+#ifdef CONFIG_PERF_COUNTERS
+	atomic_long_t locked_vm;
+#endif
 };
 
 extern int uids_sysfs_init(void);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0173738dd548e..93f4a0e4b8739 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
-int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -1522,6 +1522,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
+		struct user_struct *user = current_user();
+
+		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
 		vma->vm_mm->locked_vm -= counter->data->nr_locked;
 		perf_mmap_data_free(counter);
 		mutex_unlock(&counter->mmap_mutex);
@@ -1537,11 +1540,13 @@ static struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
+	struct user_struct *user = current_user();
 	unsigned long vma_size;
 	unsigned long nr_pages;
+	unsigned long user_locked, user_lock_limit;
 	unsigned long locked, lock_limit;
+	long user_extra, extra;
 	int ret = 0;
-	long extra;
 
 	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
 		return -EINVAL;
@@ -1569,15 +1574,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 	}
 
-	extra = nr_pages /* + 1 only account the data pages */;
-	extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
-	if (extra < 0)
-		extra = 0;
+	user_extra = nr_pages + 1;
+	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-	locked = vma->vm_mm->locked_vm + extra;
+	extra = 0;
+	if (user_locked > user_lock_limit)
+		extra = user_locked - user_lock_limit;
 
 	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
 	lock_limit >>= PAGE_SHIFT;
+	locked = vma->vm_mm->locked_vm + extra;
 
 	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
@@ -1590,6 +1597,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 		goto unlock;
 
 	atomic_set(&counter->mmap_count, 1);
+	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
 	counter->data->nr_locked = extra;
 unlock:
-- 
GitLab


From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:28 +0200
Subject: [PATCH 1158/2198] perf_counter: frequency based adaptive irq_period

Instead of specifying the irq_period for a counter, provide a target interrupt
frequency and dynamically adapt the irq_period to match this frequency.

[ Impact: new perf-counter attribute/feature ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.646195868@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 13 +++---
 arch/x86/kernel/cpu/perf_counter.c |  9 ++---
 include/linux/perf_counter.h       | 10 ++++-
 kernel/perf_counter.c              | 63 ++++++++++++++++++++++++------
 4 files changed, 68 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index bb1b463c1361b..db8d5cafc1595 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -534,7 +534,7 @@ void hw_perf_enable(void)
 			continue;
 		}
 		val = 0;
-		if (counter->hw_event.irq_period) {
+		if (counter->hw.irq_period) {
 			left = atomic64_read(&counter->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
@@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if ((s64)counter->hw_event.irq_period < 0)
-		return ERR_PTR(-EINVAL);
 	if (!perf_event_raw(&counter->hw_event)) {
 		ev = perf_event_id(&counter->hw_event);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
@@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period);
+	atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
+	u64 period = counter->hw.irq_period;
 	s64 prev, delta, left;
 	int record = 0;
 
@@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	val = 0;
 	left = atomic64_read(&counter->hw.period_left) - delta;
-	if (counter->hw_event.irq_period) {
+	if (period) {
 		if (left <= 0) {
-			left += counter->hw_event.irq_period;
+			left += period;
 			if (left <= 0)
-				left = counter->hw_event.irq_period;
+				left = period;
 			record = 1;
 		}
 		if (left < 0x80000000L)
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5a7f718eb1e15..886dcf334bc35 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
-	hwc->irq_period	= hw_event->irq_period;
-	if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period)
-		hwc->irq_period = x86_pmu.max_period;
-
-	atomic64_set(&hwc->period_left, hwc->irq_period);
+	atomic64_set(&hwc->period_left,
+			min(x86_pmu.max_period, hwc->irq_period));
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->irq_period;
+	s64 period = min(x86_pmu.max_period, hwc->irq_period);
 	int err;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e543ecc129f12..004b6e162b96c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -130,7 +130,11 @@ struct perf_counter_hw_event {
 	 */
 	__u64			config;
 
-	__u64			irq_period;
+	union {
+		__u64		irq_period;
+		__u64		irq_freq;
+	};
+
 	__u32			record_type;
 	__u32			read_format;
 
@@ -146,8 +150,9 @@ struct perf_counter_hw_event {
 				mmap           :  1, /* include mmap data     */
 				munmap         :  1, /* include munmap data   */
 				comm	       :  1, /* include comm data     */
+				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 52;
+				__reserved_1   : 51;
 
 	__u32			extra_config_len;
 	__u32			wakeup_events;	/* wakeup every n events */
@@ -337,6 +342,7 @@ struct hw_perf_counter {
 	atomic64_t			prev_count;
 	u64				irq_period;
 	atomic64_t			period_left;
+	u64				interrupts;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 93f4a0e4b8739..0ad1db4f3d651 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void)
 	return 0;
 }
 
+void perf_adjust_freq(struct perf_counter_context *ctx)
+{
+	struct perf_counter *counter;
+	u64 irq_period;
+	u64 events, period;
+	s64 delta;
+
+	spin_lock(&ctx->lock);
+	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+			continue;
+
+		if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
+			continue;
+
+		events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+		period = div64_u64(events, counter->hw_event.irq_freq);
+
+		delta = (s64)(1 + period - counter->hw.irq_period);
+		delta >>= 1;
+
+		irq_period = counter->hw.irq_period + delta;
+
+		if (!irq_period)
+			irq_period = 1;
+
+		counter->hw.irq_period = irq_period;
+		counter->hw.interrupts = 0;
+	}
+	spin_unlock(&ctx->lock);
+}
+
 /*
  * Round-robin a context's counters:
  */
@@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 	ctx = &curr->perf_counter_ctx;
 
+	perf_adjust_freq(&cpuctx->ctx);
+	perf_adjust_freq(ctx);
+
 	perf_counter_cpu_sched_out(cpuctx);
 	__perf_counter_task_sched_out(ctx);
 
@@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter,
 	int events = atomic_read(&counter->event_limit);
 	int ret = 0;
 
+	counter->hw.interrupts++;
+
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * counters
@@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_counter *counter;
 	struct pt_regs *regs;
+	u64 period;
 
 	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
 	counter->pmu->read(counter);
@@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 			ret = HRTIMER_NORESTART;
 	}
 
-	hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
+	period = max_t(u64, 10000, counter->hw.irq_period);
+	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
 	return ret;
 }
@@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
+		u64 period = max_t(u64, 10000, hwc->irq_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(hwc->irq_period), 0,
+				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
 	}
 
@@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
 	if (hwc->irq_period) {
+		u64 period = max_t(u64, 10000, hwc->irq_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(hwc->irq_period), 0,
+				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
 	}
 
@@ -2811,9 +2852,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 
 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
-	struct perf_counter_hw_event *hw_event = &counter->hw_event;
 	const struct pmu *pmu = NULL;
-	struct hw_perf_counter *hwc = &counter->hw;
 
 	/*
 	 * Software counters (currently) can't in general distinguish
@@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
-		if (hw_event->irq_period && hw_event->irq_period < 10000)
-			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_TASK_CLOCK:
 		/*
@@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 		else
 			pmu = &perf_ops_cpu_clock;
 
-		if (hw_event->irq_period && hw_event->irq_period < 10000)
-			hw_event->irq_period = 10000;
 		break;
 	case PERF_COUNT_PAGE_FAULTS:
 	case PERF_COUNT_PAGE_FAULTS_MIN:
@@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 		break;
 	}
 
-	if (pmu)
-		hwc->irq_period = hw_event->irq_period;
-
 	return pmu;
 }
 
@@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 {
 	const struct pmu *pmu;
 	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
 	long err;
 
 	counter = kzalloc(sizeof(*counter), gfpflags);
@@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	pmu = NULL;
 
+	hwc = &counter->hw;
+	if (hw_event->freq && hw_event->irq_freq)
+		hwc->irq_period = TICK_NSEC / hw_event->irq_freq;
+	else
+		hwc->irq_period = hw_event->irq_period;
+
 	/*
 	 * we currently do not support PERF_RECORD_GROUP on inherited counters
 	 */
-- 
GitLab


From f5456a6b056b709282e87a68b4c1b81ac2e866fa Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:19:29 +0200
Subject: [PATCH 1159/2198] perf top: update to use the new freq interface

Provide perf top -F as alternative to -c.

[ Impact: new 'perf top' feature ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090515132018.707922166@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index b1549dd277200..814b2e4925e30 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -98,6 +98,7 @@ static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
 static int			use_mmap			= 0;
 static int			use_munmap			= 0;
+static int			freq				= 0;
 
 static char			*vmlinux;
 
@@ -846,9 +847,10 @@ static void process_options(int argc, char **argv)
 			{"stat",	no_argument,		NULL, 'S'},
 			{"vmlinux",	required_argument,	NULL, 'x'},
 			{"zero",	no_argument,		NULL, 'z'},
+			{"freq",	required_argument,	NULL, 'F'},
 			{NULL,		0,			NULL,  0 }
 		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
+		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMUF:",
 				    long_options, &option_index);
 		if (c == -1)
 			break;
@@ -889,6 +891,7 @@ static void process_options(int argc, char **argv)
 		case 'm': mmap_pages			=   atoi(optarg); break;
 		case 'M': use_mmap			=              1; break;
 		case 'U': use_munmap			=              1; break;
+		case 'F': freq = 1; default_interval	=   atoi(optarg); break;
 		default: error = 1; break;
 		}
 	}
@@ -1075,6 +1078,7 @@ int cmd_top(int argc, char **argv, const char *prefix)
 			hw_event.nmi		= nmi;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
+			hw_event.freq		= freq;
 
 			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
-- 
GitLab


From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 11 Apr 2009 10:43:41 +0200
Subject: [PATCH 1160/2198] sched, timers: move calc_load() to scheduler

Dimitri Sivanich noticed that xtime_lock is held write locked across
calc_load() which iterates over all online CPUs. That can cause long
latencies for xtime_lock readers on large SMP systems.

The load average calculation is an rough estimate anyway so there is
no real need to protect the readers vs. the update. It's not a problem
when the avenrun array is updated while a reader copies the values.

Instead of iterating over all online CPUs let the scheduler_tick code
update the number of active tasks shortly before the avenrun update
happens. The avenrun update itself is handled by the CPU which calls
do_timer().

[ Impact: reduce xtime_lock write locked section ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/sched.h     |  2 +-
 kernel/sched.c            | 84 ++++++++++++++++++++++++++++++++++-----
 kernel/sched_idletask.c   |  3 +-
 kernel/time/timekeeping.c |  2 +-
 kernel/timer.c            | 54 +------------------------
 5 files changed, 80 insertions(+), 65 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index b4c38bc8049cb..6eb4892efe453 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 8908d190a348e..f4eb88153bd18 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -630,6 +630,10 @@ struct rq {
 	struct list_head migration_queue;
 #endif
 
+	/* calc_load related fields */
+	unsigned long calc_load_update;
+	long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int hrtick_csd_pending;
@@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void)
 	return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	unsigned long i, running = 0, uninterruptible = 0;
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	return load >> FSHIFT;
+}
 
-	for_each_online_cpu(i) {
-		running += cpu_rq(i)->nr_running;
-		uninterruptible += cpu_rq(i)->nr_uninterruptible;
-	}
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+	unsigned long upd = calc_load_update + 10;
+	long active;
+
+	if (time_before(jiffies, upd))
+		return;
 
-	if (unlikely((long)uninterruptible < 0))
-		uninterruptible = 0;
+	active = atomic_long_read(&calc_load_tasks);
+	active = active > 0 ? active * FIXED_1 : 0;
 
-	return running + uninterruptible;
+	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+	avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+	calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+	long nr_active, delta;
+
+	nr_active = this_rq->nr_running;
+	nr_active += (long) this_rq->nr_uninterruptible;
+
+	if (nr_active != this_rq->calc_load_active) {
+		delta = nr_active - this_rq->calc_load_active;
+		this_rq->calc_load_active = nr_active;
+		atomic_long_add(delta, &calc_load_tasks);
+	}
 }
 
 /*
@@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq)
 			new_load += scale-1;
 		this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
 	}
+
+	if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+		this_rq->calc_load_update += LOAD_FREQ;
+		calc_load_account_active(this_rq);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
 	}
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+	atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Update our root-domain */
 		rq = cpu_rq(cpu);
 		spin_lock_irqsave(&rq->lock, flags);
+		rq->calc_load_update = calc_load_update;
+		rq->calc_load_active = 0;
 		if (rq->rd) {
 			BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		cpuset_unlock();
 		migrate_nr_uninterruptible(rq);
 		BUG_ON(rq->nr_running != 0);
-
+		calc_global_load_remove(rq);
 		/*
 		 * No need to migrate the tasks: it was best-effort if
 		 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -9059,6 +9118,8 @@ void __init sched_init(void)
 		rq = cpu_rq(i);
 		spin_lock_init(&rq->lock);
 		rq->nr_running = 0;
+		rq->calc_load_active = 0;
+		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs, rq);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9166,6 +9227,9 @@ void __init sched_init(void)
 	 * when this runqueue becomes "idle".
 	 */
 	init_idle(current, smp_processor_id());
+
+	calc_load_update = jiffies + LOAD_FREQ;
+
 	/*
 	 * During early bootup we pretend to be a normal task:
 	 */
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 8a21a2e28c13a..499672c10cbd6 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
-
+	/* adjust the active tasks as we might go into a long sleep */
+	calc_load_account_active(rq);
 	return rq->idle;
 }
 
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 687dff49f6e7d..52a8bf8931f39 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
diff --git a/kernel/timer.c b/kernel/timer.c
index cffffad01c31c..6a21d7af96209 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1122,47 +1122,6 @@ void update_process_times(int user_tick)
 	run_posix_cpu_timers(p);
 }
 
-/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-	return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-	unsigned long active_tasks; /* fixed-point */
-	static int count = LOAD_FREQ;
-
-	count -= ticks;
-	if (unlikely(count < 0)) {
-		active_tasks = count_active_tasks();
-		do {
-			CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-			CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-			CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-			count += LOAD_FREQ;
-		} while (count < 0);
-	}
-}
-
 /*
  * This function runs timers and the timer-tq in bottom half context.
  */
@@ -1186,16 +1145,6 @@ void run_local_timers(void)
 	softlockup_tick();
 }
 
-/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-	update_wall_time();
-	calc_load(ticks);
-}
-
 /*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
 	jiffies_64 += ticks;
-	update_times(ticks);
+	update_wall_time();
+	calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
-- 
GitLab


From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 2 May 2009 20:08:52 +0200
Subject: [PATCH 1161/2198] sched, timers: cleanup avenrun users

avenrun is an rough estimate so we don't have to worry about
consistency of the three avenrun values. Remove the xtime lock
dependency and provide a function to scale the values. Cleanup the
users.

[ Impact: cleanup ]

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
---
 fs/proc/loadavg.c     | 18 ++++++------------
 include/linux/sched.h |  1 +
 kernel/sched.c        | 15 +++++++++++++++
 kernel/timer.c        | 32 ++++++--------------------------
 4 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 9bca39cf99eeb..1afa4dd4cae24 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -12,20 +12,14 @@
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-	int a, b, c;
-	unsigned long seq;
+	unsigned long avnrun[3];
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		a = avenrun[0] + (FIXED_1/200);
-		b = avenrun[1] + (FIXED_1/200);
-		c = avenrun[2] + (FIXED_1/200);
-	} while (read_seqretry(&xtime_lock, seq));
+	get_avenrun(avnrun, FIXED_1/200, 0);
 
-	seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-		LOAD_INT(a), LOAD_FRAC(a),
-		LOAD_INT(b), LOAD_FRAC(b),
-		LOAD_INT(c), LOAD_FRAC(c),
+	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
 		nr_running(), nr_threads,
 		task_active_pid_ns(current)->last_pid);
 	return 0;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6eb4892efe453..de7b3b2177729 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -116,6 +116,7 @@ struct fs_struct;
  *    11 bit fractions.
  */
 extern unsigned long avenrun[];		/* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
diff --git a/kernel/sched.c b/kernel/sched.c
index f4eb88153bd18..497c09ba61e77 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2868,6 +2868,21 @@ static unsigned long calc_load_update;
 unsigned long avenrun[3];
 EXPORT_SYMBOL(avenrun);
 
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
+
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 6a21d7af96209..a26ed294f9387 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1356,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
-	unsigned long seq;
+	struct timespec tp;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
-	do {
-		struct timespec tp;
-		seq = read_seqbegin(&xtime_lock);
-
-		/*
-		 * This is annoying.  The below is the same thing
-		 * posix_get_clock_monotonic() does, but it wants to
-		 * take the lock which we want to cover the loads stuff
-		 * too.
-		 */
-
-		getnstimeofday(&tp);
-		tp.tv_sec += wall_to_monotonic.tv_sec;
-		tp.tv_nsec += wall_to_monotonic.tv_nsec;
-		monotonic_to_bootbased(&tp);
-		if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-			tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-			tp.tv_sec++;
-		}
-		info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+	ktime_get_ts(&tp);
+	monotonic_to_bootbased(&tp);
+	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-		info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-		info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-		info->procs = nr_threads;
-	} while (read_seqretry(&xtime_lock, seq));
+	info->procs = nr_threads;
 
 	si_meminfo(info);
 	si_swapinfo(info);
-- 
GitLab


From 2e569d36729c8105ae066a9b105068305442cc77 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 15:37:47 +0200
Subject: [PATCH 1162/2198] perf_counter: frequency based adaptive irq_period,
 32-bit fix

fix:

  kernel/built-in.o: In function `perf_counter_alloc':
  perf_counter.c:(.text+0x7ddc7): undefined reference to `__udivdi3'

[ Impact: build fix on 32-bit systems ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <1242394667.6642.1887.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0ad1db4f3d651..728a595399b00 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2942,7 +2942,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	hwc = &counter->hw;
 	if (hw_event->freq && hw_event->irq_freq)
-		hwc->irq_period = TICK_NSEC / hw_event->irq_freq;
+		hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
 	else
 		hwc->irq_period = hw_event->irq_period;
 
-- 
GitLab


From ef923214a4816c289e4af2d67a9ebb1a31e4ac61 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 13:29:14 +1000
Subject: [PATCH 1163/2198] perf_counter: powerpc: use u64 for event codes
 internally

Although the perf_counter API allows 63-bit raw event codes,
internally in the powerpc back-end we had been using 32-bit
event codes.  This expands them to 64 bits so that we can add
bits for specifying threshold start/stop events and instruction
sampling modes later.

This also corrects the return value of can_go_on_limited_pmc;
we were returning an event code rather than just a 0/1 value in
some circumstances. That didn't particularly matter while event
codes were 32-bit, but now that event codes are 64-bit it
might, so this fixes it.

[ Impact: extend PowerPC perfcounter interfaces from u32 to u64 ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18955.36874.472452.353104@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h | 10 +++++-----
 arch/powerpc/kernel/perf_counter.c      | 26 ++++++++++++-------------
 arch/powerpc/kernel/power4-pmu.c        |  9 ++++-----
 arch/powerpc/kernel/power5+-pmu.c       | 14 ++++++-------
 arch/powerpc/kernel/power5-pmu.c        | 16 +++++++--------
 arch/powerpc/kernel/power6-pmu.c        | 16 +++++++--------
 arch/powerpc/kernel/ppc970-pmu.c        |  9 ++++-----
 7 files changed, 48 insertions(+), 52 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 56d66c38143bc..ceea76a48e3d9 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -23,13 +23,13 @@ struct power_pmu {
 	int	max_alternatives;
 	u64	add_fields;
 	u64	test_adder;
-	int	(*compute_mmcr)(unsigned int events[], int n_ev,
+	int	(*compute_mmcr)(u64 events[], int n_ev,
 				unsigned int hwc[], u64 mmcr[]);
-	int	(*get_constraint)(unsigned int event, u64 *mskp, u64 *valp);
-	int	(*get_alternatives)(unsigned int event, unsigned int flags,
-				    unsigned int alt[]);
+	int	(*get_constraint)(u64 event, u64 *mskp, u64 *valp);
+	int	(*get_alternatives)(u64 event, unsigned int flags,
+				    u64 alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
-	int	(*limited_pmc_event)(unsigned int event);
+	int	(*limited_pmc_event)(u64 event);
 	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
 	int	n_generic;
 	int	*generic_events;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index db8d5cafc1595..8d4cafc84b823 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -26,7 +26,7 @@ struct cpu_hw_counters {
 	int n_limited;
 	u8  pmcs_enabled;
 	struct perf_counter *counter[MAX_HWCOUNTERS];
-	unsigned int events[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
 	unsigned int flags[MAX_HWCOUNTERS];
 	u64 mmcr[3];
 	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS];
@@ -131,11 +131,11 @@ static void write_pmc(int idx, unsigned long val)
  * and see if any combination of alternative codes is feasible.
  * The feasible set is returned in event[].
  */
-static int power_check_constraints(unsigned int event[], unsigned int cflags[],
+static int power_check_constraints(u64 event[], unsigned int cflags[],
 				   int n_ev)
 {
 	u64 mask, value, nv;
-	unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
+	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES];
 	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS];
@@ -564,7 +564,7 @@ void hw_perf_enable(void)
 }
 
 static int collect_events(struct perf_counter *group, int max_count,
-			  struct perf_counter *ctrs[], unsigned int *events,
+			  struct perf_counter *ctrs[], u64 *events,
 			  unsigned int *flags)
 {
 	int n = 0;
@@ -752,11 +752,11 @@ struct pmu power_pmu = {
  * that a limited PMC can count, doesn't require interrupts, and
  * doesn't exclude any processor mode.
  */
-static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
+static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 				 unsigned int flags)
 {
 	int n;
-	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	u64 alt[MAX_EVENT_ALTERNATIVES];
 
 	if (counter->hw_event.exclude_user
 	    || counter->hw_event.exclude_kernel
@@ -776,10 +776,8 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
 
 	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD;
 	n = ppmu->get_alternatives(ev, flags, alt);
-	if (n)
-		return alt[0];
 
-	return 0;
+	return n > 0;
 }
 
 /*
@@ -787,10 +785,9 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev,
  * and return the event code, or 0 if there is no such alternative.
  * (Note: event code 0 is "don't count" on all machines.)
  */
-static unsigned long normal_pmc_alternative(unsigned long ev,
-					    unsigned long flags)
+static u64 normal_pmc_alternative(u64 ev, unsigned long flags)
 {
-	unsigned int alt[MAX_EVENT_ALTERNATIVES];
+	u64 alt[MAX_EVENT_ALTERNATIVES];
 	int n;
 
 	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD);
@@ -820,9 +817,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
-	unsigned long ev, flags;
+	u64 ev;
+	unsigned long flags;
 	struct perf_counter *ctrs[MAX_HWCOUNTERS];
-	unsigned int events[MAX_HWCOUNTERS];
+	u64 events[MAX_HWCOUNTERS];
 	unsigned int cflags[MAX_HWCOUNTERS];
 	int n;
 	int err;
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 744a2756958e9..836fa118eb1e6 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -213,7 +213,7 @@ static unsigned char direct_marked_event[8] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int p4_marked_instr_event(unsigned int event)
+static int p4_marked_instr_event(u64 event)
 {
 	int pmc, psel, unit, byte, bit;
 	unsigned int mask;
@@ -249,7 +249,7 @@ static int p4_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, lower, sh;
 	u64 mask = 0, value = 0;
@@ -320,8 +320,7 @@ static unsigned int ppc_inst_cmpl[] = {
 	0x1001, 0x4001, 0x6001, 0x7001, 0x8001
 };
 
-static int p4_get_alternatives(unsigned int event, unsigned int flags,
-			       unsigned int alt[])
+static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, na;
 
@@ -353,7 +352,7 @@ static int p4_get_alternatives(unsigned int event, unsigned int flags,
 	return na;
 }
 
-static int p4_compute_mmcr(unsigned int event[], int n_ev,
+static int p4_compute_mmcr(u64 event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 8154eaa2404f8..3ac0654372abe 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -135,7 +135,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull },
 };
 
-static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
@@ -188,7 +188,7 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int power5p_limited_pmc_event(unsigned int event)
+static int power5p_limited_pmc_event(u64 event)
 {
 	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 
@@ -273,11 +273,11 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5p_get_alternatives(unsigned int event, unsigned int flags,
-				    unsigned int alt[])
+static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
-	int i, j, ae, nalt = 1;
+	int i, j, nalt = 1;
 	int nlim;
+	u64 ae;
 
 	alt[0] = event;
 	nalt = 1;
@@ -402,7 +402,7 @@ static unsigned char direct_event_is_marked[0x28] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power5p_marked_instr_event(unsigned int event)
+static int power5p_marked_instr_event(u64 event)
 {
 	int pmc, psel;
 	int bit, byte, unit;
@@ -451,7 +451,7 @@ static int power5p_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int power5p_compute_mmcr(unsigned int event[], int n_ev,
+static int power5p_compute_mmcr(u64 event[], int n_ev,
 				unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 6e667dc86470b..d5344968ee9cb 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -139,7 +139,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull },
 };
 
-static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh;
 	int bit, fmask;
@@ -224,7 +224,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
  * Scan the alternatives table for a match and return the
  * index into the alternatives table if found, else -1.
  */
-static int find_alternative(unsigned int event)
+static int find_alternative(u64 event)
 {
 	int i, j;
 
@@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * PMCSEL values on other counters.  This returns the alternative
  * event code for those that do, or -1 otherwise.
  */
-static int find_alternative_bdecode(unsigned int event)
+static u64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -269,10 +269,10 @@ static int find_alternative_bdecode(unsigned int event)
 	return -1;
 }
 
-static int power5_get_alternatives(unsigned int event, unsigned int flags,
-				   unsigned int alt[])
+static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
-	int i, j, ae, nalt = 1;
+	int i, j, nalt = 1;
+	u64 ae;
 
 	alt[0] = event;
 	nalt = 1;
@@ -338,7 +338,7 @@ static unsigned char direct_event_is_marked[0x28] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power5_marked_instr_event(unsigned int event)
+static int power5_marked_instr_event(u64 event)
 {
 	int pmc, psel;
 	int bit, byte, unit;
@@ -382,7 +382,7 @@ static int power5_marked_instr_event(unsigned int event)
 	return (mask >> (byte * 8 + bit)) & 1;
 }
 
-static int power5_compute_mmcr(unsigned int event[], int n_ev,
+static int power5_compute_mmcr(u64 event[], int n_ev,
 			       unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index d44049f0ae277..ab7c615c458db 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -134,7 +134,7 @@ static u32 marked_bus_events[16] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int power6_marked_instr_event(unsigned int event)
+static int power6_marked_instr_event(u64 event)
 {
 	int pmc, psel, ptype;
 	int bit, byte, unit;
@@ -172,7 +172,7 @@ static int power6_marked_instr_event(unsigned int event)
 /*
  * Assign PMC numbers and compute MMCR1 value for a set of events
  */
-static int p6_compute_mmcr(unsigned int event[], int n_ev,
+static int p6_compute_mmcr(u64 event[], int n_ev,
 			   unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr1 = 0;
@@ -265,7 +265,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev,
  *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3
  *	32-34	select field: nest (subunit) event selector
  */
-static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, sh, subunit;
 	u64 mask = 0, value = 0;
@@ -298,7 +298,7 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p6_limited_pmc_event(unsigned int event)
+static int p6_limited_pmc_event(u64 event)
 {
 	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
 
@@ -337,7 +337,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
  * This could be made more efficient with a binary search on
  * a presorted list, if necessary
  */
-static int find_alternatives_list(unsigned int event)
+static int find_alternatives_list(u64 event)
 {
 	int i, j;
 	unsigned int alt;
@@ -356,12 +356,12 @@ static int find_alternatives_list(unsigned int event)
 	return -1;
 }
 
-static int p6_get_alternatives(unsigned int event, unsigned int flags,
-			       unsigned int alt[])
+static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nlim;
-	unsigned int aevent, psel, pmc;
+	unsigned int psel, pmc;
 	unsigned int nalt = 1;
+	u64 aevent;
 
 	alt[0] = event;
 	nlim = p6_limited_pmc_event(event);
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index af2d1884058c1..eed47c4523f17 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -147,7 +147,7 @@ static unsigned char direct_marked_event[8] = {
  * Returns 1 if event counts things relating to marked instructions
  * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
  */
-static int p970_marked_instr_event(unsigned int event)
+static int p970_marked_instr_event(u64 event)
 {
 	int pmc, psel, unit, byte, bit;
 	unsigned int mask;
@@ -192,7 +192,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = {
 	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
 };
 
-static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
 {
 	int pmc, byte, unit, sh, spcsel;
 	u64 mask = 0, value = 0;
@@ -243,8 +243,7 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp)
 	return 0;
 }
 
-static int p970_get_alternatives(unsigned int event, unsigned int flags,
-				 unsigned int alt[])
+static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	alt[0] = event;
 
@@ -257,7 +256,7 @@ static int p970_get_alternatives(unsigned int event, unsigned int flags,
 	return 1;
 }
 
-static int p970_compute_mmcr(unsigned int event[], int n_ev,
+static int p970_compute_mmcr(u64 event[], int n_ev,
 			     unsigned int hwc[], u64 mmcr[])
 {
 	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0;
-- 
GitLab


From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 21:48:08 +1000
Subject: [PATCH 1164/2198] perf_counter: allow arch to supply event misc flags
 and instruction pointer

At present the values we put in overflow events for the misc
flags indicating processor mode and the instruction pointer are
obtained using the standard user_mode() and
instruction_pointer() functions. Those functions tell you where
the performance monitor interrupt was taken, which might not be
exactly where the counter overflow occurred, for example
because interrupts were disabled at the point where the
overflow occurred, or because the processor had many
instructions in flight and chose to complete some more
instructions beyond the one that caused the counter overflow.

Some architectures (e.g. powerpc) can supply more precise
information about where the counter overflow occurred and the
processor mode at that point.  This introduces new functions,
perf_misc_flags() and perf_instruction_pointer(), which arch
code can override to provide more precise information if
available.  They have default implementations which are
identical to the existing code.

This also adds a new misc flag value,
PERF_EVENT_MISC_HYPERVISOR, for the case where a counter
overflow occurred in the hypervisor.  We encode the processor
mode in the 2 bits previously used to indicate user or kernel
mode; the values for user and kernel mode are unchanged and
hypervisor mode is indicated by both bits being set.

[ Impact: generalize perfcounter core facilities ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 11 ++++++++++-
 kernel/perf_counter.c        |  5 ++---
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 004b6e162b96c..c8c1dfc22c938 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -215,8 +215,11 @@ struct perf_counter_mmap_page {
 	__u32   data_head;		/* head in the data section */
 };
 
+#define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN	(0 << 0)
 #define PERF_EVENT_MISC_KERNEL		(1 << 0)
-#define PERF_EVENT_MISC_USER		(1 << 1)
+#define PERF_EVENT_MISC_USER		(2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR	(3 << 0)
 #define PERF_EVENT_MISC_OVERFLOW	(1 << 2)
 
 struct perf_event_header {
@@ -596,6 +599,12 @@ extern int sysctl_perf_counter_mlock;
 
 extern void perf_counter_init(void);
 
+#ifndef perf_misc_flags
+#define perf_misc_flags(regs)	(user_mode(regs) ? PERF_EVENT_MISC_USER : \
+				 PERF_EVENT_MISC_KERNEL)
+#define perf_instruction_pointer(regs)	instruction_pointer(regs)
+#endif
+
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 728a595399b00..57840a94b1634 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2042,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.size = sizeof(header);
 
 	header.misc = PERF_EVENT_MISC_OVERFLOW;
-	header.misc |= user_mode(regs) ?
-		PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+	header.misc |= perf_misc_flags(regs);
 
 	if (record_type & PERF_RECORD_IP) {
-		ip = instruction_pointer(regs);
+		ip = perf_instruction_pointer(regs);
 		header.type |= PERF_RECORD_IP;
 		header.size += sizeof(ip);
 	}
-- 
GitLab


From 0bbd0d4be8d5d3676c126e06e3c75c16def00441 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 14 May 2009 13:31:48 +1000
Subject: [PATCH 1165/2198] perf_counter: powerpc: supply more precise
 information on counter overflow events

This uses values from the MMCRA, SIAR and SDAR registers on
powerpc to supply more precise information for overflow events,
including a data address when PERF_RECORD_ADDR is specified.

Since POWER6 uses different bit positions in MMCRA from earlier
processors, this converts the struct power_pmu limited_pmc5_6
field, which only had 0/1 values, into a flags field and
defines bit values for its previous use (PPMU_LIMITED_PMC5_6)
and a new flag (PPMU_ALT_SIPR) to indicate that the processor
uses the POWER6 bit positions rather than the earlier
positions.  It also adds definitions in reg.h for the new and
old positions of the bit that indicates that the SIAR and SDAR
values come from the same instruction.

For the data address, the SDAR value is supplied if we are not
doing instruction sampling.  In that case there is no guarantee
that the address given in the PERF_RECORD_ADDR subrecord will
correspond to the instruction whose address is given in the
PERF_RECORD_IP subrecord.

If instruction sampling is enabled (e.g. because this counter
is counting a marked instruction event), then we only supply
the SDAR value for the PERF_RECORD_ADDR subrecord if it
corresponds to the instruction whose address is in the
PERF_RECORD_IP subrecord.  Otherwise we supply 0.

[ Impact: support more PMU hardware features on PowerPC ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18955.37028.48861.555309@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h | 14 ++++-
 arch/powerpc/include/asm/reg.h          |  2 +
 arch/powerpc/kernel/perf_counter.c      | 84 +++++++++++++++++++++++--
 arch/powerpc/kernel/power5+-pmu.c       |  2 +-
 arch/powerpc/kernel/power6-pmu.c        |  2 +-
 5 files changed, 97 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index ceea76a48e3d9..1c60f0ca79201 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -30,13 +30,19 @@ struct power_pmu {
 				    u64 alt[]);
 	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]);
 	int	(*limited_pmc_event)(u64 event);
-	int	limited_pmc5_6;	/* PMC5 and PMC6 have limited function */
+	u32	flags;
 	int	n_generic;
 	int	*generic_events;
 };
 
 extern struct power_pmu *ppmu;
 
+/*
+ * Values for power_pmu.flags
+ */
+#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */
+#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */
+
 /*
  * Values for flags to get_alternatives()
  */
@@ -44,6 +50,12 @@ extern struct power_pmu *ppmu;
 #define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */
 #define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */
 
+struct pt_regs;
+extern unsigned long perf_misc_flags(struct pt_regs *regs);
+#define perf_misc_flags(regs)	perf_misc_flags(regs)
+
+extern unsigned long perf_instruction_pointer(struct pt_regs *regs);
+
 /*
  * The power_pmu.get_constraint function returns a 64-bit value and
  * a 64-bit mask that express the constraints between this event and
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index e8018d540e87d..fb359b0a6937d 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -492,11 +492,13 @@
 #define   MMCR0_FCHV	0x00000001UL /* freeze conditions in hypervisor mode */
 #define SPRN_MMCR1	798
 #define SPRN_MMCRA	0x312
+#define   MMCRA_SDSYNC	0x80000000UL /* SDAR synced with SIAR */
 #define   MMCRA_SIHV	0x10000000UL /* state of MSR HV when SIAR set */
 #define   MMCRA_SIPR	0x08000000UL /* state of MSR PR when SIAR set */
 #define   MMCRA_SLOT	0x07000000UL /* SLOT bits (37-39) */
 #define   MMCRA_SLOT_SHIFT	24
 #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */
+#define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL	/* SDAR/SIAR synced */
 #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL
 #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL
 #define   POWER6_MMCRA_THRM	0x00000020UL
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 8d4cafc84b823..6baae5a5c3311 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -17,6 +17,7 @@
 #include <asm/pmc.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
+#include <asm/ptrace.h>
 
 struct cpu_hw_counters {
 	int n_counters;
@@ -310,7 +311,8 @@ static void power_pmu_read(struct perf_counter *counter)
  */
 static int is_limited_pmc(int pmcnum)
 {
-	return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6);
+	return (ppmu->flags & PPMU_LIMITED_PMC5_6)
+		&& (pmcnum == 5 || pmcnum == 6);
 }
 
 static void freeze_limited_counters(struct cpu_hw_counters *cpuhw,
@@ -860,7 +862,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 * If this machine has limited counters, check whether this
 	 * event could go on a limited counter.
 	 */
-	if (ppmu->limited_pmc5_6) {
+	if (ppmu->flags & PPMU_LIMITED_PMC5_6) {
 		if (can_go_on_limited_pmc(counter, ev, flags)) {
 			flags |= PPMU_LIMITED_PMC_OK;
 		} else if (ppmu->limited_pmc_event(ev)) {
@@ -933,6 +935,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	u64 period = counter->hw.irq_period;
 	s64 prev, delta, left;
 	int record = 0;
+	u64 addr, mmcra, sdsync;
 
 	/* we don't have to worry about interrupts here */
 	prev = atomic64_read(&counter->hw.prev_count);
@@ -963,8 +966,76 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	/*
 	 * Finally record data if requested.
 	 */
-	if (record)
-		perf_counter_overflow(counter, nmi, regs, 0);
+	if (record) {
+		addr = 0;
+		if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
+			/*
+			 * The user wants a data address recorded.
+			 * If we're not doing instruction sampling,
+			 * give them the SDAR (sampled data address).
+			 * If we are doing instruction sampling, then only
+			 * give them the SDAR if it corresponds to the
+			 * instruction pointed to by SIAR; this is indicated
+			 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA.
+			 */
+			mmcra = regs->dsisr;
+			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
+				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
+			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
+				addr = mfspr(SPRN_SDAR);
+		}
+		perf_counter_overflow(counter, nmi, regs, addr);
+	}
+}
+
+/*
+ * Called from generic code to get the misc flags (i.e. processor mode)
+ * for an event.
+ */
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+
+	if (TRAP(regs) != 0xf00) {
+		/* not a PMU interrupt */
+		return user_mode(regs) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+
+	mmcra = regs->dsisr;
+	if (ppmu->flags & PPMU_ALT_SIPR) {
+		if (mmcra & POWER6_MMCRA_SIHV)
+			return PERF_EVENT_MISC_HYPERVISOR;
+		return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+	}
+	if (mmcra & MMCRA_SIHV)
+		return PERF_EVENT_MISC_HYPERVISOR;
+	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER :
+			PERF_EVENT_MISC_KERNEL;
+}
+
+/*
+ * Called from generic code to get the instruction pointer
+ * for an event.
+ */
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
+{
+	unsigned long mmcra;
+	unsigned long ip;
+	unsigned long slot;
+
+	if (TRAP(regs) != 0xf00)
+		return regs->nip;	/* not a PMU interrupt */
+
+	ip = mfspr(SPRN_SIAR);
+	mmcra = regs->dsisr;
+	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) {
+		slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT;
+		if (slot > 1)
+			ip += 4 * (slot - 1);
+	}
+	return ip;
 }
 
 /*
@@ -983,6 +1054,11 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5),
 					mfspr(SPRN_PMC6));
 
+	/*
+	 * Overload regs->dsisr to store MMCRA so we only need to read it once.
+	 */
+	regs->dsisr = mfspr(SPRN_MMCRA);
+
 	/*
 	 * If interrupts were soft-disabled when this PMU interrupt
 	 * occurred, treat it as an NMI.
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 3ac0654372abe..c6cdfc165d6ef 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -625,6 +625,6 @@ struct power_pmu power5p_pmu = {
 	.disable_pmc = power5p_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
-	.limited_pmc5_6 = 1,
+	.flags = PPMU_LIMITED_PMC5_6,
 	.limited_pmc_event = power5p_limited_pmc_event,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index ab7c615c458db..cd4fbe06c35d4 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -485,6 +485,6 @@ struct power_pmu power6_pmu = {
 	.disable_pmc = p6_disable_pmc,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.limited_pmc5_6 = 1,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.limited_pmc_event = p6_limited_pmc_event,
 };
-- 
GitLab


From d9bcc01d58d18ed287091707b0b45c6ac888a11a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:06:12 +0530
Subject: [PATCH 1166/2198] x86, mtrr: replace MTRRcap_MSR with msr-index's
 MSR_MTRRcap

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 4 ++--
 arch/x86/kernel/cpu/mtrr/main.c    | 2 +-
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 0b776c09aff38..de9c20ffa0dbc 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -306,7 +306,7 @@ void __init get_mtrr_state(void)
 
 	vrs = mtrr_state.var_ranges;
 
-	rdmsr(MTRRcap_MSR, lo, dummy);
+	rdmsr(MSR_MTRRcap, lo, dummy);
 	mtrr_state.have_fixed = (lo >> 8) & 1;
 
 	for (i = 0; i < num_var_ranges; i++)
@@ -703,7 +703,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 static int generic_have_wrcomb(void)
 {
 	unsigned long config, dummy;
-	rdmsr(MTRRcap_MSR, config, dummy);
+	rdmsr(MSR_MTRRcap, config, dummy);
 	return (config & (1 << 10));
 }
 
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 03cda01f57c71..8fc248b5aeafe 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
 	unsigned long config = 0, dummy;
 
 	if (use_intel()) {
-		rdmsr(MTRRcap_MSR, config, dummy);
+		rdmsr(MSR_MTRRcap, config, dummy);
 	} else if (is_cpu(AMD))
 		config = 2;
 	else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 77f67f7b347a3..5d37fb1452331 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,7 +5,6 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 
-#define MTRRcap_MSR     0x0fe
 #define MTRRdefType_MSR 0x2ff
 
 #define MTRRfix64K_00000_MSR 0x250
-- 
GitLab


From a036c7a358cc9d7ed28a188480b9a4d709e09b24 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:10:43 +0530
Subject: [PATCH 1167/2198] x86, mtrr: replace MTRRfix64K_00000_MSR with
 msr-index's MSR_MTRRfix64K_00000

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 4 ++--
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index de9c20ffa0dbc..8b115c0e5900f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -20,7 +20,7 @@ struct fixed_range_block {
 };
 
 static struct fixed_range_block fixed_range_blocks[] = {
-	{ MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */
+	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
 	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
 	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
 	{}
@@ -194,7 +194,7 @@ get_fixed_ranges(mtrr_type * frs)
 
 	k8_check_syscfg_dram_mod_en();
 
-	rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+	rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
 
 	for (i = 0; i < 2; i++)
 		rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5d37fb1452331..7f23caede71ba 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,7 +7,6 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix64K_00000_MSR 0x250
 #define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
 #define MTRRfix4K_C0000_MSR 0x268
-- 
GitLab


From 7d9d55e449089df8463bca2045d702ae6cda64a2 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:15:32 +0530
Subject: [PATCH 1168/2198] x86, mtrr: replace MTRRfix16K_80000_MSR with
 msr-index's MSR_MTRRfix16K_80000

Use standard msr-index.h's MSR declaration and no need to declare again

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 4 ++--
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 1 -
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 8b115c0e5900f..00437c2e8dd34 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -21,7 +21,7 @@ struct fixed_range_block {
 
 static struct fixed_range_block fixed_range_blocks[] = {
 	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
-	{ MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
+	{ MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */
 	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
 	{}
 };
@@ -197,7 +197,7 @@ get_fixed_ranges(mtrr_type * frs)
 	rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
 
 	for (i = 0; i < 2; i++)
-		rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+		rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
 	for (i = 0; i < 8; i++)
 		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
 }
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 7f23caede71ba..712b60524e63d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,7 +7,6 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix16K_80000_MSR 0x258
 #define MTRRfix16K_A0000_MSR 0x259
 #define MTRRfix4K_C0000_MSR 0x268
 #define MTRRfix4K_C8000_MSR 0x269
-- 
GitLab


From 654ac05801ae806661c8fdeb3b5c420a31cbc5b1 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:21:54 +0530
Subject: [PATCH 1169/2198] x86, mtrr: remove mtrr MSRs double declaration

Removed MTRR MSR from mtrr/mtrr.h as these are already declared in
msr-index.h and nobody is using them:
 MTRRfix16K_A0000_MSR
 MTRRfix4K_C8000_MSR
 MTRRfix4K_D0000_MSR
 MTRRfix4K_D8000_MSR
 MTRRfix4K_E0000_MSR
 MTRRfix4K_E8000_MSR
 MTRRfix4K_F0000_MSR
 MTRRfix4K_F8000_MSR

Use standard msr-index.h's MSR declaration and no need to declare again

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/mtrr.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 712b60524e63d..5053793f91244 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,15 +7,7 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix16K_A0000_MSR 0x259
 #define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
 
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
-- 
GitLab


From ba5673ff1ff5f428256db4cedd4b05b7be008bb6 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:29:25 +0530
Subject: [PATCH 1170/2198] x86, mtrr: replace MTRRfix4K_C0000_MSR with
 msr-index's MSR_MTRRfix4K_C0000

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/generic.c | 4 ++--
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 00437c2e8dd34..3cf58e2653459 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -22,7 +22,7 @@ struct fixed_range_block {
 static struct fixed_range_block fixed_range_blocks[] = {
 	{ MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
 	{ MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */
-	{ MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
+	{ MSR_MTRRfix4K_C0000,  8 }, /* eight 4k MTRRs */
 	{}
 };
 
@@ -199,7 +199,7 @@ get_fixed_ranges(mtrr_type * frs)
 	for (i = 0; i < 2; i++)
 		rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
 	for (i = 0; i < 8; i++)
-		rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+		rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
 }
 
 void mtrr_save_fixed_ranges(void *info)
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index 5053793f91244..e5ee686d2c39d 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -7,8 +7,6 @@
 
 #define MTRRdefType_MSR 0x2ff
 
-#define MTRRfix4K_C0000_MSR 0x268
-
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
-- 
GitLab


From 52650257ea06bb15c2e2bbe854cbdf463726141a Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Thu, 14 May 2009 12:35:46 +0530
Subject: [PATCH 1171/2198] x86, mtrr: replace MTRRdefType_MSR with msr-index's
 MSR_MTRRdefType

Use standard msr-index.h's MSR declaration and no need to declare again.

[ Impact: cleanup, no object code change ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/cpu/mtrr/cleanup.c | 4 ++--
 arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++----
 arch/x86/kernel/cpu/mtrr/mtrr.h    | 2 --
 arch/x86/kernel/cpu/mtrr/state.c   | 6 +++---
 4 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index ce0fe4b5c04f6..1d584a18a50da 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
 
 	if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
 		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
+	rdmsr(MSR_MTRRdefType, def, dummy);
 	def &= 0xff;
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
 	 */
 	if (!is_cpu(INTEL) || disable_mtrr_trim)
 		return 0;
-	rdmsr(MTRRdefType_MSR, def, dummy);
+	rdmsr(MSR_MTRRdefType, def, dummy);
 	def &= 0xff;
 	if (def != MTRR_TYPE_UNCACHABLE)
 		return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 3cf58e2653459..e930a3117700f 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -314,7 +314,7 @@ void __init get_mtrr_state(void)
 	if (mtrr_state.have_fixed)
 		get_fixed_ranges(mtrr_state.fixed_ranges);
 
-	rdmsr(MTRRdefType_MSR, lo, dummy);
+	rdmsr(MSR_MTRRdefType, lo, dummy);
 	mtrr_state.def_type = (lo & 0xff);
 	mtrr_state.enabled = (lo & 0xc00) >> 10;
 
@@ -579,10 +579,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
 	__flush_tlb();
 
 	/*  Save MTRR state */
-	rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+	rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
 
 	/*  Disable MTRRs, and set the default type to uncached  */
-	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
 }
 
 static void post_set(void) __releases(set_atomicity_lock)
@@ -591,7 +591,7 @@ static void post_set(void) __releases(set_atomicity_lock)
 	__flush_tlb();
 
 	/* Intel (P6) standard MTRRs */
-	mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+	mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
 		
 	/*  Enable caches  */
 	write_cr0(read_cr0() & 0xbfffffff);
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index e5ee686d2c39d..7538b767f2060 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -5,8 +5,6 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 
-#define MTRRdefType_MSR 0x2ff
-
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
index 7f7e2753685bc..1f5fb1588d1fb 100644
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ b/arch/x86/kernel/cpu/mtrr/state.c
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
 
 		if (use_intel())
 			/*  Save MTRR state */
-			rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+			rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
 		else
 			/* Cyrix ARRs - everything else were excluded at the top */
 			ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
 {
 	if (use_intel())
 		/*  Disable MTRRs, and set the default type to uncached  */
-		mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+		mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
 		      ctxt->deftype_hi);
 	else if (is_cpu(CYRIX))
 		/* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
 		/*  Restore MTRRdefType  */
 		if (use_intel())
 			/* Intel (P6) standard MTRRs */
-			mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+			mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
 		else
 			/* Cyrix ARRs - everything else was excluded at the top */
 			setCx86(CX86_CCR3, ctxt->ccr3);
-- 
GitLab


From 8bc2095951517e2c74b8aeeca4685ddd6b16ed4b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 20:45:59 +0200
Subject: [PATCH 1172/2198] perf_counter: Fix inheritance cleanup code

Clean up code that open-coded the list_{add,del}_counter() code in
__perf_counter_exit_task() which consequently diverged. This could
lead to software counter crashes.

Also, fold the ctx->nr_counter inc/dec into those functions and clean
up some of the related code.

[ Impact: fix potential sw counter crash, cleanup ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 57840a94b1634..59a926d04bafd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	}
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
+	ctx->nr_counters++;
 }
 
 static void
@@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
 	struct perf_counter *sibling, *tmp;
 
+	ctx->nr_counters--;
+
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
 
@@ -209,7 +212,6 @@ static void __perf_counter_remove_from_context(void *info)
 	counter_sched_out(counter, cpuctx, ctx);
 
 	counter->task = NULL;
-	ctx->nr_counters--;
 
 	/*
 	 * Protect the list operation against NMI by disabling the
@@ -276,7 +278,6 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	 * succeed.
 	 */
 	if (!list_empty(&counter->list_entry)) {
-		ctx->nr_counters--;
 		list_del_counter(counter, ctx);
 		counter->task = NULL;
 	}
@@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 			       struct perf_counter_context *ctx)
 {
 	list_add_counter(counter, ctx);
-	ctx->nr_counters++;
 	counter->prev_state = PERF_COUNTER_STATE_OFF;
 	counter->tstamp_enabled = ctx->time;
 	counter->tstamp_running = ctx->time;
@@ -3206,9 +3206,8 @@ static int inherit_group(struct perf_counter *parent_counter,
 static void sync_child_counter(struct perf_counter *child_counter,
 			       struct perf_counter *parent_counter)
 {
-	u64 parent_val, child_val;
+	u64 child_val;
 
-	parent_val = atomic64_read(&parent_counter->count);
 	child_val = atomic64_read(&child_counter->count);
 
 	/*
@@ -3240,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child,
 			 struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *parent_counter;
-	struct perf_counter *sub, *tmp;
 
 	/*
 	 * If we do not self-reap then we have to wait for the
@@ -3252,8 +3250,8 @@ __perf_counter_exit_task(struct task_struct *child,
 	 */
 	if (child != current) {
 		wait_task_inactive(child, 0);
-		list_del_init(&child_counter->list_entry);
 		update_counter_times(child_counter);
+		list_del_counter(child_counter, child_ctx);
 	} else {
 		struct perf_cpu_context *cpuctx;
 		unsigned long flags;
@@ -3272,9 +3270,7 @@ __perf_counter_exit_task(struct task_struct *child,
 		group_sched_out(child_counter, cpuctx, child_ctx);
 		update_counter_times(child_counter);
 
-		list_del_init(&child_counter->list_entry);
-
-		child_ctx->nr_counters--;
+		list_del_counter(child_counter, child_ctx);
 
 		perf_enable();
 		local_irq_restore(flags);
@@ -3288,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child,
 	 */
 	if (parent_counter) {
 		sync_child_counter(child_counter, parent_counter);
-		list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
-					 list_entry) {
-			if (sub->parent) {
-				sync_child_counter(sub, sub->parent);
-				free_counter(sub);
-			}
-		}
 		free_counter(child_counter);
 	}
 }
@@ -3315,9 +3304,18 @@ void perf_counter_exit_task(struct task_struct *child)
 	if (likely(!child_ctx->nr_counters))
 		return;
 
+again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
 		__perf_counter_exit_task(child, child_counter, child_ctx);
+
+	/*
+	 * If the last counter was a group counter, it will have appended all
+	 * its siblings to the list, but we obtained 'tmp' before that which
+	 * will still point to the list head terminating the iteration.
+	 */
+	if (!list_empty(&child_ctx->counter_list))
+		goto again;
 }
 
 /*
-- 
GitLab


From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 15 May 2009 20:45:59 +0200
Subject: [PATCH 1173/2198] perf_counter: Fix counter inheritance

Srivatsa Vaddagiri reported that a Java workload triggers this
warning in kernel/exit.c:

   WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));

Add the inherited counter propagation on self-detach, this could
cause counter leaks and incomplete stats in threaded code like
the below:

  #include <pthread.h>
  #include <unistd.h>

  void *thread(void *arg)
  {
          sleep(5);
          return NULL;
  }

  void main(void)
  {
          pthread_t thr;
          pthread_create(&thr, NULL, thread, NULL);
  }

Reported-by: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/exit.c b/kernel/exit.c
index 4741376c8decc..16d74f13a3e79 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	__unhash_process(tsk);
 
 	/*
-- 
GitLab


From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:24:08 +0200
Subject: [PATCH 1174/2198] perf_counter: fix threaded task exit

Flushing counters in __exit_signal() with irqs disabled is not
a good idea as perf_counter_exit_task() acquires mutexes. So
flush it before acquiring the tasklist lock.

(Note, we still need a fix for when the PID has been unhashed.)

[ Impact: fix crash with inherited counters ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 16d74f13a3e79..73affd35e76d1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk)
 		sig = NULL; /* Marker for below. */
 	}
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(tsk);
-
 	__unhash_process(tsk);
 
 	/*
@@ -183,6 +177,13 @@ void release_task(struct task_struct * p)
 	atomic_dec(&__task_cred(p)->user->processes);
 
 	proc_flush_task(p);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(p);
+
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
-- 
GitLab


From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 10:04:45 +0200
Subject: [PATCH 1175/2198] perf_counter, x86: fix zero irq_period counters

The quirk to irq_period unearthed an unrobustness we had in the
hw_counter initialization sequence: we left irq_period at 0, which
was then quirked up to 2 ... which then generated a _lot_ of
interrupts during 'perf stat' runs, slowed them down and skewed
the counter results in general.

Initialize irq_period to the maximum instead.

[ Impact: fix perf stat results ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 886dcf334bc35..5bfd30ab39205 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->nmi = 1;
 	}
 
+	if (!hwc->irq_period)
+		hwc->irq_period = x86_pmu.max_period;
+
 	atomic64_set(&hwc->period_left,
 			min(x86_pmu.max_period, hwc->irq_period));
 
-- 
GitLab


From e8e7526c3c0863be25ab03a0871ee0978de5ba50 Mon Sep 17 00:00:00 2001
From: Mark de Wever <koraq@xs4all.nl>
Date: Sun, 17 May 2009 17:22:53 +0200
Subject: [PATCH 1176/2198] ide-tape: fix debug call

This error only occurs when IDETAPE_DEBUG_LOG is enabled.

Signed-off-by: Mark de Wever <koraq@xs4all.nl>
Signed-off-by: Borislav Petkov <petkovbb@gmail.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 drivers/ide/ide-tape.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index 8dfc68892d6ae..203bbeac182f5 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -617,7 +617,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
 	struct ide_cmd cmd;
 	u8 stat;
 
-	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n"
+	debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n",
 		  (unsigned long long)rq->sector, rq->nr_sectors);
 
 	if (!(blk_special_request(rq) || blk_sense_request(rq))) {
-- 
GitLab


From 6fd058f7791087648c683eb8572edf3be3c4c23c Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 17 May 2009 15:38:01 -0400
Subject: [PATCH 1177/2198] ext4: Add a comprehensive block validity check to
 ext4_get_blocks()

To catch filesystem bugs or corruption which could lead to the
filesystem getting severly damaged, this patch adds a facility for
tracking all of the filesystem metadata blocks by contiguous regions
in a red-black tree.  This allows quick searching of the tree to
locate extents which might overlap with filesystem metadata blocks.

This facility is also used by the multi-block allocator to assure that
it is not allocating blocks out of the system zone, as well as by the
routines used when reading indirect blocks and extents information
from disk to make sure their contents are valid.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/Makefile         |   4 +-
 fs/ext4/block_validity.c | 244 +++++++++++++++++++++++++++++++++++++++
 fs/ext4/ext4.h           |  11 ++
 fs/ext4/extents.c        |  22 +---
 fs/ext4/inode.c          |  47 ++++++--
 fs/ext4/mballoc.c        |  11 +-
 fs/ext4/super.c          |  32 ++++-
 7 files changed, 332 insertions(+), 39 deletions(-)
 create mode 100644 fs/ext4/block_validity.c

diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile
index a8ff003a00f70..8a34710ecf40e 100644
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -5,8 +5,8 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-		   ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-		   ext4_jbd2.o migrate.o mballoc.o
+		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+		ext4_jbd2.o migrate.o mballoc.o block_validity.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)		+= xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644
index 0000000000000..50784ef075631
--- /dev/null
+++ b/fs/ext4/block_validity.c
@@ -0,0 +1,244 @@
+/*
+ *  linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4.h"
+
+struct ext4_system_zone {
+	struct rb_node	node;
+	ext4_fsblk_t	start_blk;
+	unsigned int	count;
+};
+
+static struct kmem_cache *ext4_system_zone_cachep;
+
+int __init init_ext4_system_zone(void)
+{
+	ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone,
+					     SLAB_RECLAIM_ACCOUNT);
+	if (ext4_system_zone_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+void exit_ext4_system_zone(void)
+{
+	kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+static inline int can_merge(struct ext4_system_zone *entry1,
+		     struct ext4_system_zone *entry2)
+{
+	if ((entry1->start_blk + entry1->count) == entry2->start_blk)
+		return 1;
+	return 0;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+			   ext4_fsblk_t start_blk,
+			   unsigned int count)
+{
+	struct ext4_system_zone *new_entry = NULL, *entry;
+	struct rb_node **n = &sbi->system_blks.rb_node, *node;
+	struct rb_node *parent = NULL, *new_node = NULL;
+
+	while (*n) {
+		parent = *n;
+		entry = rb_entry(parent, struct ext4_system_zone, node);
+		if (start_blk < entry->start_blk)
+			n = &(*n)->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = &(*n)->rb_right;
+		else {
+			if (start_blk + count > (entry->start_blk + 
+						 entry->count))
+				entry->count = (start_blk + count - 
+						entry->start_blk);
+			new_node = *n;
+			new_entry = rb_entry(new_node, struct ext4_system_zone,
+					     node);
+			break;
+		}
+	}
+
+	if (!new_entry) {
+		new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+					     GFP_KERNEL);
+		if (!new_entry)
+			return -ENOMEM;
+		new_entry->start_blk = start_blk;
+		new_entry->count = count;
+		new_node = &new_entry->node;
+
+		rb_link_node(new_node, parent, n);
+		rb_insert_color(new_node, &sbi->system_blks);
+	}
+
+	/* Can we merge to the left? */
+	node = rb_prev(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(entry, new_entry)) {
+			new_entry->start_blk = entry->start_blk;
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+
+	/* Can we merge to the right? */
+	node = rb_next(new_node);
+	if (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		if (can_merge(new_entry, entry)) {
+			new_entry->count += entry->count;
+			rb_erase(node, &sbi->system_blks);
+			kmem_cache_free(ext4_system_zone_cachep, entry);
+		}
+	}
+	return 0;
+}
+
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+	struct rb_node *node;
+	struct ext4_system_zone *entry;
+	int first = 1;
+
+	printk(KERN_INFO "System zones: ");
+	node = rb_first(&sbi->system_blks);
+	while (node) {
+		entry = rb_entry(node, struct ext4_system_zone, node);
+		printk("%s%llu-%llu", first ? "" : ", ",
+		       entry->start_blk, entry->start_blk + entry->count - 1);
+		first = 0;
+		node = rb_next(node);
+	}
+	printk("\n");
+}
+
+int ext4_setup_system_zone(struct super_block *sb)
+{
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_group_desc *gdp;
+	ext4_group_t i;
+	int flex_size = ext4_flex_bg_size(sbi);
+	int ret;
+
+	if (!test_opt(sb, BLOCK_VALIDITY)) {
+		if (EXT4_SB(sb)->system_blks.rb_node)
+			ext4_release_system_zone(sb);
+		return 0;
+	}
+	if (EXT4_SB(sb)->system_blks.rb_node)
+		return 0;
+
+	for (i=0; i < ngroups; i++) {
+		if (ext4_bg_has_super(sb, i) &&
+		    ((i < 5) || ((i % flex_size) == 0)))
+			add_system_zone(sbi, ext4_group_first_block_no(sb, i),
+					sbi->s_gdb_count + 1);
+		gdp = ext4_get_group_desc(sb, i, NULL);
+		ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+		if (ret)
+			return ret;
+		ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+				sbi->s_itb_per_group);
+		if (ret)
+			return ret;
+	}
+
+	if (test_opt(sb, DEBUG))
+		debug_print_tree(EXT4_SB(sb));
+	return 0;
+}
+
+/* Called when the filesystem is unmounted */
+void ext4_release_system_zone(struct super_block *sb)
+{
+	struct rb_node	*n = EXT4_SB(sb)->system_blks.rb_node;
+	struct rb_node	*parent;
+	struct ext4_system_zone	*entry;
+
+	while (n) {
+		/* Do the node's children first */
+		if (n->rb_left) {
+			n = n->rb_left;
+			continue;
+		}
+		if (n->rb_right) {
+			n = n->rb_right;
+			continue;
+		}
+		/*
+		 * The node has no children; free it, and then zero
+		 * out parent's link to it.  Finally go to the
+		 * beginning of the loop and try to free the parent
+		 * node.
+		 */
+		parent = rb_parent(n);
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		kmem_cache_free(ext4_system_zone_cachep, entry);
+		if (!parent)
+			EXT4_SB(sb)->system_blks.rb_node = NULL;
+		else if (parent->rb_left == n)
+			parent->rb_left = NULL;
+		else if (parent->rb_right == n)
+			parent->rb_right = NULL;
+		n = parent;
+	}
+	EXT4_SB(sb)->system_blks.rb_node = NULL;
+}
+
+/*
+ * Returns 1 if the passed-in block region (start_blk,
+ * start_blk+count) is valid; 0 if some part of the block region
+ * overlaps with filesystem metadata blocks.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+			  unsigned int count)
+{
+	struct ext4_system_zone *entry;
+	struct rb_node *n = sbi->system_blks.rb_node;
+
+	if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+	    (start_blk + count > ext4_blocks_count(sbi->s_es)))
+		return 0;
+	while (n) {
+		entry = rb_entry(n, struct ext4_system_zone, node);
+		if (start_blk + count - 1 < entry->start_blk)
+			n = n->rb_left;
+		else if (start_blk >= (entry->start_blk + entry->count))
+			n = n->rb_right;
+		else
+			return 0;
+	}
+	return 1;
+}
+
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index d164f1294e5f7..4311cc85b5340 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -696,6 +696,7 @@ struct ext4_inode_info {
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY	0x20000000 /* Block validity checking */
 
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -887,6 +888,7 @@ struct ext4_sb_info {
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
 	unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+	struct rb_root system_blks;
 
 #ifdef EXTENTS_STATS
 	/* ext4 extents stats */
@@ -1618,6 +1620,15 @@ extern struct dentry *ext4_get_parent(struct dentry *child);
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
 
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init init_ext4_system_zone(void);
+extern void exit_ext4_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+				 ext4_fsblk_t start_blk,
+				 unsigned int count);
+
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 27c383c7b43c6..d04b779b780ec 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-	ext4_fsblk_t block = ext_pblock(ext), valid_block;
+	ext4_fsblk_t block = ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
-	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
-	valid_block = le32_to_cpu(es->s_first_data_block) +
-		EXT4_SB(inode->i_sb)->s_gdb_count;
-	if (unlikely(block <= valid_block ||
-		     ((block + len) > ext4_blocks_count(es))))
-		return 0;
-	else
-		return 1;
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
 
 static int ext4_valid_extent_idx(struct inode *inode,
 				struct ext4_extent_idx *ext_idx)
 {
-	ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
-	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+	ext4_fsblk_t block = idx_pblock(ext_idx);
 
-	valid_block = le32_to_cpu(es->s_first_data_block) +
-		EXT4_SB(inode->i_sb)->s_gdb_count;
-	if (unlikely(block <= valid_block ||
-		     (block >= ext4_blocks_count(es))))
-		return 0;
-	else
-		return 1;
+	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
 
 static int ext4_valid_extent_entries(struct inode *inode,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d7b7480682b90..dadd3f995db5e 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode,
 }
 
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-				 __le32 *p, unsigned int max) {
-
-	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+				 __le32 *p, unsigned int max)
+{
 	__le32 *bref = p;
+	unsigned int blk;
+
 	while (bref < p+max) {
-		if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
+		blk = le32_to_cpu(*bref++);
+		if (blk && 
+		    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 
+						    blk, 1))) {
 			ext4_error(inode->i_sb, function,
-				   "block reference %u >= max (%u) "
-				   "in inode #%lu, offset=%d",
-				   le32_to_cpu(*bref), maxblocks,
-				   inode->i_ino, (int)(bref-p));
+				   "invalid block reference %u "
+				   "in inode #%lu", blk, inode->i_ino);
  			return -EIO;
  		}
-		bref++;
  	}
  	return 0;
 }
@@ -1125,6 +1126,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
 		ext4_discard_preallocations(inode);
 }
 
+static int check_block_validity(struct inode *inode, sector_t logical,
+				sector_t phys, int len)
+{
+	if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+		ext4_error(inode->i_sb, "check_block_validity",
+			   "inode #%lu logical block %llu mapped to %llu "
+			   "(size %d)", inode->i_ino,
+			   (unsigned long long) logical,
+			   (unsigned long long) phys, len);
+		WARN_ON(1);
+		return -EIO;
+	}
+	return 0;
+}
+
 /*
  * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
@@ -1170,6 +1186,13 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 	}
 	up_read((&EXT4_I(inode)->i_data_sem));
 
+	if (retval > 0 && buffer_mapped(bh)) {
+		int ret = check_block_validity(inode, block, 
+					       bh->b_blocknr, retval);
+		if (ret != 0)
+			return ret;
+	}
+
 	/* If it is only a block(s) look up */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
 		return retval;
@@ -1245,6 +1268,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 		ext4_da_update_reserve_space(inode, retval);
 
 	up_write((&EXT4_I(inode)->i_data_sem));
+	if (retval > 0 && buffer_mapped(bh)) {
+		int ret = check_block_validity(inode, block, 
+					       bh->b_blocknr, retval);
+		if (ret != 0)
+			return ret;
+	}
 	return retval;
 }
 
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 541bd9adffa20..ed8482e22c0ea 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2961,15 +2961,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
 		+ le32_to_cpu(es->s_first_data_block);
 
 	len = ac->ac_b_ex.fe_len;
-	if (in_range(ext4_block_bitmap(sb, gdp), block, len) ||
-	    in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
-	    in_range(block, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group) ||
-	    in_range(block + len - 1, ext4_inode_table(sb, gdp),
-		     EXT4_SB(sb)->s_itb_per_group)) {
+	if (!ext4_data_block_valid(sbi, block, len)) {
 		ext4_error(sb, __func__,
-			   "Allocating block %llu in system zone of %d group\n",
-			   block, ac->ac_b_ex.fe_group);
+			   "Allocating blocks %llu-%llu which overlap "
+			   "fs metadata\n", block, block+len);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index dc34ed3d13279..600b7ad699b53 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -568,6 +568,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
 	ext4_xattr_put_super(sb);
@@ -1055,6 +1056,7 @@ enum {
 	Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
 	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+	Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
@@ -1114,6 +1116,8 @@ static const match_table_t tokens = {
 	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
+	{Opt_block_validity, "block_validity"},
+	{Opt_noblock_validity, "noblock_validity"},
 	{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
 	{Opt_journal_ioprio, "journal_ioprio=%u"},
 	{Opt_auto_da_alloc, "auto_da_alloc=%u"},
@@ -1508,6 +1512,12 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_delalloc:
 			set_opt(sbi->s_mount_opt, DELALLOC);
 			break;
+		case Opt_block_validity:
+			set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
+		case Opt_noblock_validity:
+			clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+			break;
 		case Opt_inode_readahead_blks:
 			if (match_int(&args[0], &option))
 				return 0;
@@ -2826,6 +2836,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	} else if (test_opt(sb, DELALLOC))
 		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
 
+	err = ext4_setup_system_zone(sb);
+	if (err) {
+		printk(KERN_ERR "EXT4-fs: failed to initialize system "
+		       "zone (%d)\n", err);
+		goto failed_mount4;
+	}
+
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
@@ -2875,6 +2892,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 failed_mount4:
 	printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+	ext4_release_system_zone(sb);
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -3515,6 +3533,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				sb->s_flags &= ~MS_RDONLY;
 		}
 	}
+	ext4_setup_system_zone(sb);
 	if (sbi->s_journal == NULL)
 		ext4_commit_super(sb, 1);
 
@@ -3927,13 +3946,16 @@ static int __init init_ext4_fs(void)
 {
 	int err;
 
+	err = init_ext4_system_zone();
+	if (err)
+		return err;
 	ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
 	if (!ext4_kset)
-		return -ENOMEM;
+		goto out4;
 	ext4_proc_root = proc_mkdir("fs/ext4", NULL);
 	err = init_ext4_mballoc();
 	if (err)
-		return err;
+		goto out3;
 
 	err = init_ext4_xattr();
 	if (err)
@@ -3958,6 +3980,11 @@ static int __init init_ext4_fs(void)
 	exit_ext4_xattr();
 out2:
 	exit_ext4_mballoc();
+out3:
+	remove_proc_entry("fs/ext4", NULL);
+	kset_unregister(ext4_kset);
+out4:
+	exit_ext4_system_zone();
 	return err;
 }
 
@@ -3972,6 +3999,7 @@ static void __exit exit_ext4_fs(void)
 	exit_ext4_mballoc();
 	remove_proc_entry("fs/ext4", NULL);
 	kset_unregister(ext4_kset);
+	exit_ext4_system_zone();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
-- 
GitLab


From 0568c518937ee3a9b6a94d18bae9c150fe5d6832 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Sun, 17 May 2009 23:31:23 -0400
Subject: [PATCH 1178/2198] ext4: down i_data_sem only for read when walking
 tree for fiemap

Not sure why I put this in as down_write originally; all we are
doing is walking the tree, nothing will change under us and
concurrent reads should be no problem.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d04b779b780ec..d4e99e96fddb2 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3312,10 +3312,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 * Walk the extent tree gathering extent information.
 		 * ext4_ext_fiemap_cb will push extents back to user.
 		 */
-		down_write(&EXT4_I(inode)->i_data_sem);
+		down_read(&EXT4_I(inode)->i_data_sem);
 		error = ext4_ext_walk_space(inode, start_blk, len_blks,
 					  ext4_ext_fiemap_cb, fieinfo);
-		up_write(&EXT4_I(inode)->i_data_sem);
+		up_read(&EXT4_I(inode)->i_data_sem);
 	}
 
 	return error;
-- 
GitLab


From f68301656b5f5d2de104f2687add6beeb8f3c3b9 Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 17 May 2009 23:52:44 -0400
Subject: [PATCH 1179/2198] ext4: Fix memory leak in ext4_fill_super() in case
 of a failed mount

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 600b7ad699b53..eca6c057b1197 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2924,6 +2924,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	brelse(bh);
 out_fail:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
 	return ret;
-- 
GitLab


From de5ce037304f2c88a319b1c3b808ab0c4c618c1c Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 17 May 2009 23:52:47 -0400
Subject: [PATCH 1180/2198] ext3: Fix memory leak in ext3_fill_super() in case
 of a failed mount

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext3/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c39..d8b73d4abe3eb 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2021,6 +2021,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 	brelse(bh);
 out_fail:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
 	return ret;
-- 
GitLab


From 0f7ee7c17241915fdaff49d1a36f5aafd80a7dce Mon Sep 17 00:00:00 2001
From: Manish Katiyar <mkatiyar@gmail.com>
Date: Sun, 17 May 2009 23:52:51 -0400
Subject: [PATCH 1181/2198] ext2: Fix memory leak in ext2_fill_super() in case
 of a failed mount

Signed-off-by: Manish Katiyar <mkatiyar@gmail.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext2/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 5c4afe652245a..e3c748faf2dbc 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1093,6 +1093,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
 	brelse(bh);
 failed_sbi:
 	sb->s_fs_info = NULL;
+	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	return ret;
 }
-- 
GitLab


From c0daaf3f1f672defa3a45ca449b76d0e86c55892 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 18 May 2009 14:02:12 +1000
Subject: [PATCH 1182/2198] perf_counter: powerpc: initialize cpuhw pointer
 before use

Commit 9e35ad38 ("perf_counter: Rework the perf counter
disable/enable") added code to the powerpc hw_perf_enable (renamed
from hw_perf_restore) to test cpuhw->disabled and return immediately
if it is not set (i.e. if the PMU is already enabled).

Unfortunately the test got added before cpuhw was initialized,
resulting in an oops the first time hw_perf_enable got called.
This fixes it by moving the initialization of cpuhw to before
cpuhw->disabled is tested.

[ Impact: fix oops-causing bug on powerpc ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18960.56772.869734.304631@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 6baae5a5c3311..fe21b2440f282 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -450,12 +450,11 @@ void hw_perf_enable(void)
 	int idx;
 
 	local_irq_save(flags);
+	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	if (!cpuhw->disabled) {
 		local_irq_restore(flags);
 		return;
 	}
-
-	cpuhw = &__get_cpu_var(cpu_hw_counters);
 	cpuhw->disabled = 0;
 
 	/*
-- 
GitLab


From e5198075c67a22ec9a09565b1ce88d3d3f5ba855 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: [PATCH 1183/2198] x86, apic: introduce io_apic_irq_attr

according to Ingo, io_apic irq-setup related functions have too many
parameters with a repetitive signature.

So reduce related funcs to get less params by passing a pointer
to a newly defined io_apic_irq_attr structure.

v2: io_apic_irq ==> irq_attr
    triggering ==> trigger

v3: add set_io_apic_irq_attr

[ Impact: cleanup ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A08ACD3.2070401@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/hw_irq.h     | 21 +++++++++++++--
 arch/x86/include/asm/io_apic.h    |  5 ++--
 arch/x86/kernel/acpi/boot.c       | 22 +++++++++-------
 arch/x86/kernel/apic/io_apic.c    | 43 ++++++++++++++++++-------------
 arch/x86/pci/irq.c                | 16 ++++--------
 drivers/pci/hotplug/ibmphp_core.c |  6 ++---
 6 files changed, 66 insertions(+), 47 deletions(-)

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 26a40ab701313..a7d14bbae1103 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -63,9 +63,26 @@ extern unsigned long io_apic_irqs;
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
+
+struct io_apic_irq_attr {
+	int ioapic;
+	int ioapic_pin;
+	int trigger;
+	int polarity;
+};
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+					int ioapic, int ioapic_pin,
+					int trigger, int polarity)
+{
+	irq_attr->ioapic     = ioapic;
+	irq_attr->ioapic_pin = ioapic_pin;
+	irq_attr->trigger    = trigger;
+	irq_attr->polarity   = polarity;
+}
+
 extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
-					int *ioapic, int *ioapic_pin,
-					int *trigger, int *polarity);
+					struct io_apic_irq_attr *irq_attr);
 extern void setup_ioapic_dest(void);
 
 extern void enable_IO_APIC(void);
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 6fd99f96eb0ad..daf866ed06124 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -156,8 +156,9 @@ extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
 #endif /* CONFIG_ACPI */
 
-extern int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin,
-				  int irq, int edge_level, int active_high_low);
+struct io_apic_irq_attr;
+extern int io_apic_set_pci_routing(struct device *dev, int irq,
+		 struct io_apic_irq_attr *irq_attr);
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index dcfbc3ab9e46e..4af63dfb0f06f 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -523,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
 	unsigned int irq;
 	unsigned int plat_gsi = gsi;
@@ -533,14 +533,14 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
 	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-		if (triggering == ACPI_LEVEL_SENSITIVE)
+		if (trigger == ACPI_LEVEL_SENSITIVE)
 			eisa_set_level_irq(gsi);
 	}
 #endif
 
 #ifdef CONFIG_X86_IO_APIC
 	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-		plat_gsi = mp_register_gsi(dev, gsi, triggering, polarity);
+		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 	}
 #endif
 	acpi_gsi_to_irq(plat_gsi, &irq);
@@ -1156,7 +1156,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 	}
 }
 
-static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 			int polarity)
 {
 #ifdef CONFIG_X86_MPPARSE
@@ -1181,7 +1181,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
 	/* print the entry should happen on mptable identically */
 	mp_irq.type = MP_INTSRC;
 	mp_irq.irqtype = mp_INT;
-	mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+	mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
 				(polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
 	mp_irq.srcbus = number;
 	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
@@ -1194,10 +1194,11 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int triggering,
 	return 0;
 }
 
-int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
 	int ioapic;
 	int ioapic_pin;
+	struct io_apic_irq_attr irq_attr;
 
 	if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
 		return gsi;
@@ -1225,11 +1226,12 @@ int mp_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 		       ioapic_pin);
 		return gsi;
 	}
-	mp_config_acpi_gsi(dev, gsi, triggering, polarity);
+	mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 
-	io_apic_set_pci_routing(dev, ioapic, ioapic_pin, gsi,
-				triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-				polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+			     polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+	io_apic_set_pci_routing(dev, gsi, &irq_attr);
 
 	return gsi;
 }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 74d2b480a20bf..ce1ac74baa738 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1096,8 +1096,7 @@ static int pin_2_irq(int idx, int apic, int pin)
  * Not an __init, possibly needed by modules
  */
 int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
-				int *ioapic, int *ioapic_pin,
-				int *trigger, int *polarity)
+				struct io_apic_irq_attr *irq_attr)
 {
 	int apic, i, best_guess = -1;
 
@@ -1127,10 +1126,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 				continue;
 
 			if (pin == (mp_irqs[i].srcbusirq & 3)) {
-				*ioapic = apic;
-				*ioapic_pin = mp_irqs[i].dstirq;
-				*trigger = irq_trigger(i);
-				*polarity = irq_polarity(i);
+				set_io_apic_irq_attr(irq_attr, apic,
+						     mp_irqs[i].dstirq,
+						     irq_trigger(i),
+						     irq_polarity(i));
 				return irq;
 			}
 			/*
@@ -1138,10 +1137,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
 			 * best-guess fuzzy result for broken mptables.
 			 */
 			if (best_guess < 0) {
-				*ioapic = apic;
-				*ioapic_pin = mp_irqs[i].dstirq;
-				*trigger = irq_trigger(i);
-				*polarity = irq_polarity(i);
+				set_io_apic_irq_attr(irq_attr, apic,
+						     mp_irqs[i].dstirq,
+						     irq_trigger(i),
+						     irq_polarity(i));
 				best_guess = irq;
 			}
 		}
@@ -3865,13 +3864,16 @@ int __init arch_probe_nr_irqs(void)
 }
 #endif
 
-static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+				struct io_apic_irq_attr *irq_attr)
 {
 	struct irq_desc *desc;
 	struct irq_cfg *cfg;
 	int node;
+	int ioapic, pin;
+	int trigger, polarity;
 
+	ioapic = irq_attr->ioapic;
 	if (!IO_APIC_IRQ(irq)) {
 		apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
 			ioapic);
@@ -3889,6 +3891,10 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in
 		return 0;
 	}
 
+	pin = irq_attr->ioapic_pin;
+	trigger = irq_attr->trigger;
+	polarity = irq_attr->polarity;
+
 	/*
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
@@ -3897,20 +3903,22 @@ static int __io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, in
 		add_pin_to_irq_node(cfg, node, ioapic, pin);
 	}
 
-	setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
+	setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
 
 	return 0;
 }
 
-int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
-				 int triggering, int polarity)
+int io_apic_set_pci_routing(struct device *dev, int irq,
+				struct io_apic_irq_attr *irq_attr)
 {
-
+	int ioapic, pin;
 	/*
 	 * Avoid pin reprogramming.  PRTs typically include entries
 	 * with redundant pin->gsi mappings (but unique PCI devices);
 	 * we only program the IOAPIC on the first.
 	 */
+	ioapic = irq_attr->ioapic;
+	pin = irq_attr->ioapic_pin;
 	if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
 		pr_debug("Pin %d-%d already programmed\n",
 			 mp_ioapics[ioapic].apicid, pin);
@@ -3918,8 +3926,7 @@ int io_apic_set_pci_routing(struct device *dev, int ioapic, int pin, int irq,
 	}
 	set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
 
-	return __io_apic_set_pci_routing(dev, ioapic, pin, irq,
-					 triggering, polarity);
+	return __io_apic_set_pci_routing(dev, irq, irq_attr);
 }
 
 /* --------------------------------------------------------------------------
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 2f3e192615c0e..0696d506c4ade 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1200,14 +1200,11 @@ static int pirq_enable_irq(struct pci_dev *dev)
 #ifdef CONFIG_X86_IO_APIC
 			struct pci_dev *temp_dev;
 			int irq;
-			int ioapic = -1, ioapic_pin = -1;
-			int triggering, polarity;
+			struct io_apic_irq_attr irq_attr;
 
 			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
 						PCI_SLOT(dev->devfn),
-						pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
+						pin - 1, &irq_attr);
 			/*
 			 * Busses behind bridges are typically not listed in the MP-table.
 			 * In this case we have to look up the IRQ based on the parent bus,
@@ -1221,9 +1218,7 @@ static int pirq_enable_irq(struct pci_dev *dev)
 				pin = pci_swizzle_interrupt_pin(dev, pin);
 				irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
 						PCI_SLOT(bridge->devfn),
-						pin - 1,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
+						pin - 1, &irq_attr);
 				if (irq >= 0)
 					dev_warn(&dev->dev, "using bridge %s "
 						 "INT %c to get IRQ %d\n",
@@ -1233,9 +1228,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
 			}
 			dev = temp_dev;
 			if (irq >= 0) {
-				io_apic_set_pci_routing(&dev->dev, ioapic,
-							ioapic_pin, irq,
-							triggering, polarity);
+				io_apic_set_pci_routing(&dev->dev, irq,
+							 &irq_attr);
 				dev->irq = irq;
 				dev_info(&dev->dev, "PCI->APIC IRQ transform: "
 					 "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
diff --git a/drivers/pci/hotplug/ibmphp_core.c b/drivers/pci/hotplug/ibmphp_core.c
index 79901a0db885e..42e4260c3b123 100644
--- a/drivers/pci/hotplug/ibmphp_core.c
+++ b/drivers/pci/hotplug/ibmphp_core.c
@@ -155,15 +155,13 @@ int ibmphp_init_devno(struct slot **cur_slot)
 	for (loop = 0; loop < len; loop++) {
 		if ((*cur_slot)->number == rtable->slots[loop].slot &&
 		    (*cur_slot)->bus == rtable->slots[loop].bus) {
-			int ioapic = -1, ioapic_pin = -1;
-			int triggering, polarity;
+			struct io_apic_irq_attr irq_attr;
 
 			(*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
 			for (i = 0; i < 4; i++)
 				(*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
 						(int) (*cur_slot)->device, i,
-						&ioapic, &ioapic_pin,
-						&triggering, &polarity);
+						&irq_attr);
 
 			debug("(*cur_slot)->irq[0] = %x\n",
 					(*cur_slot)->irq[0]);
-- 
GitLab


From 2759c3287de27266e06f1f4e82cbd2d65f6a044c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: [PATCH 1184/2198] x86: don't call read_apic_id if !cpu_has_apic

should not call that if apic is disabled.

[ Impact: fix crash on certain UP configs ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
LKML-Reference: <4A09CCBB.2000306@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic_flat_64.c | 2 +-
 arch/x86/kernel/cpu/amd.c           | 2 +-
 arch/x86/kernel/cpu/common.c        | 6 ++++++
 arch/x86/kernel/cpu/intel.c         | 6 +++---
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 744e6d8af27b8..d0c99abc26c32 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
 
 static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 {
-	return hard_smp_processor_id() >> index_msb;
+	return initial_apic_id >> index_msb;
 }
 
 struct apic apic_flat =  {
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa644..728b3750a3e8b 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -272,7 +272,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
 	int cpu = smp_processor_id();
 	int node;
-	unsigned apicid = hard_smp_processor_id();
+	unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
 	node = c->phys_proc_id;
 	if (apicid_to_node[apicid] != NUMA_NO_NODE)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c1caefc82e625..017c600e05abb 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -761,6 +761,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	if (this_cpu->c_identify)
 		this_cpu->c_identify(c);
 
+	/* Clear/Set all flags overriden by options, after probe */
+	for (i = 0; i < NCAPINTS; i++) {
+		c->x86_capability[i] &= ~cpu_caps_cleared[i];
+		c->x86_capability[i] |= cpu_caps_set[i];
+	}
+
 #ifdef CONFIG_X86_64
 	c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
 #endif
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 7437fa133c02d..daed39ba2614d 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 }
 #endif
 
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 {
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
 	unsigned node;
 	int cpu = smp_processor_id();
-	int apicid = hard_smp_processor_id();
+	int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
 	/* Don't do the funky fallback heuristics the AMD version employs
 	   for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
 	}
 
 	/* Work around errata */
-	srat_detect_node();
+	srat_detect_node(c);
 
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);
-- 
GitLab


From 888a589f6be07d624e21e2174d98375e9f95911b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:59:37 -0700
Subject: [PATCH 1185/2198] mm, x86: remove MEMORY_HOTPLUG_RESERVE related code

after:

 | commit b263295dbffd33b0fbff670720fa178c30e3392a
 | Author: Christoph Lameter <clameter@sgi.com>
 | Date:   Wed Jan 30 13:30:47 2008 +0100
 |
 |    x86: 64-bit, make sparsemem vmemmap the only memory model

we don't have MEMORY_HOTPLUG_RESERVE anymore.

Historically, x86-64 had an architecture-specific method for memory hotplug
whereby it scanned the SRAT for physical memory ranges that could be
potentially used for memory hot-add later. By reserving those ranges
without physical memory, the memmap would be allocated and left dormant
until needed. This depended on the DISCONTIG memory model which has been
removed so the code implementing HOTPLUG_RESERVE is now dead.

This patch removes the dead code used by MEMORY_HOTPLUG_RESERVE.

(Changelog authored by Mel.)

v2: updated changelog, and remove hotadd= in doc

[ Impact: remove dead code ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Reviewed-by: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Mel Gorman <mel@csn.ul.ie>
Workflow-found-OK-by: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <4A0C4910.7090508@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/x86/x86_64/boot-options.txt |  5 --
 arch/x86/include/asm/numa_64.h            |  3 -
 arch/x86/mm/numa_64.c                     |  5 --
 arch/x86/mm/srat_64.c                     | 63 ++++-----------------
 include/linux/mm.h                        |  2 -
 mm/page_alloc.c                           | 69 -----------------------
 6 files changed, 12 insertions(+), 135 deletions(-)

diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt
index 34c13040a718f..2db5893d6c972 100644
--- a/Documentation/x86/x86_64/boot-options.txt
+++ b/Documentation/x86/x86_64/boot-options.txt
@@ -150,11 +150,6 @@ NUMA
 		Otherwise, the remaining system RAM is allocated to an
 		additional node.
 
-  numa=hotadd=percent
-		Only allow hotadd memory to preallocate page structures upto
-		percent of already available memory.
-		numa=hotadd=0 will disable hotadd memory.
-
 ACPI
 
   acpi=off	Don't enable ACPI
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 064ed6df4cbec..7feff0648d743 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 extern void numa_init_array(void);
 extern int numa_off;
 
-extern void srat_reserve_add_area(int nodeid);
-extern int hotadd_percent;
-
 extern s16 apicid_to_node[MAX_LOCAL_APIC];
 
 extern unsigned long numa_free_all_bootmem(void);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index fb61d81a656f7..a6a93c395231a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -272,9 +272,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
 		reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
 				 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
 
-#ifdef CONFIG_ACPI_NUMA
-	srat_reserve_add_area(nodeid);
-#endif
 	node_set_online(nodeid);
 }
 
@@ -591,8 +588,6 @@ static __init int numa_setup(char *opt)
 #ifdef CONFIG_ACPI_NUMA
 	if (!strncmp(opt, "noacpi", 6))
 		acpi_numa = -1;
-	if (!strncmp(opt, "hotadd=", 7))
-		hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 #endif
 	return 0;
 }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 87b45bff250dd..b0dbbd48e58a3 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -31,8 +31,6 @@ static nodemask_t nodes_parsed __initdata;
 static nodemask_t cpu_nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
 
 static int num_node_memblks __initdata;
 static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
@@ -66,9 +64,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
 	struct bootnode *nd = &nodes[i];
 
-	if (found_add_area)
-		return;
-
 	if (nd->start < start) {
 		nd->start = start;
 		if (nd->end < nd->start)
@@ -86,7 +81,6 @@ static __init void bad_srat(void)
 	int i;
 	printk(KERN_ERR "SRAT: SRAT not used.\n");
 	acpi_numa = -1;
-	found_add_area = 0;
 	for (i = 0; i < MAX_LOCAL_APIC; i++)
 		apicid_to_node[i] = NUMA_NO_NODE;
 	for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +176,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	       pxm, apic_id, node);
 }
 
-static int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static inline int save_add_info(void) {return 1;}
 #else
 static inline int save_add_info(void) {return 0;}
 #endif
 /*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add information.
- * This code supports one contiguous hot add area per node.
+ * Update nodes_add[]
+ * This code supports one contiguous hot add area per node
  */
-static int __init
-reserve_hotadd(int node, unsigned long start, unsigned long end)
+static void __init
+update_nodes_add(int node, unsigned long start, unsigned long end)
 {
 	unsigned long s_pfn = start >> PAGE_SHIFT;
 	unsigned long e_pfn = end >> PAGE_SHIFT;
-	int ret = 0, changed = 0;
+	int changed = 0;
 	struct bootnode *nd = &nodes_add[node];
 
 	/* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +201,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
 	   mistakes */
 	if ((signed long)(end - start) < NODE_MIN_SIZE) {
 		printk(KERN_ERR "SRAT: Hotplug area too small\n");
-		return -1;
+		return;
 	}
 
 	/* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +209,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
 		printk(KERN_ERR
 			"SRAT: Hotplug area %lu -> %lu has existing memory\n",
 			s_pfn, e_pfn);
-		return -1;
-	}
-
-	if (!hotadd_enough_memory(&nodes_add[node]))  {
-		printk(KERN_ERR "SRAT: Hotplug area too large\n");
-		return -1;
+		return;
 	}
 
 	/* Looks good */
@@ -245,11 +231,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
 			printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
 	}
 
-	ret = update_end_of_memory(nd->end);
-
 	if (changed)
-	 	printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
-	return ret;
+		printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+				 nd->start, nd->end);
 }
 
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +294,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	       start, end);
 	e820_register_active_regions(node, start >> PAGE_SHIFT,
 				     end >> PAGE_SHIFT);
-	push_node_boundaries(node, nd->start >> PAGE_SHIFT,
-						nd->end >> PAGE_SHIFT);
 
-	if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
-	    (reserve_hotadd(node, start, end) < 0)) {
-		/* Ignore hotadd region. Undo damage */
-		printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+	if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+		update_nodes_add(node, start, end);
+		/* restore nodes[node] */
 		*nd = oldnode;
 		if ((nd->start | nd->end) == 0)
 			node_clear(node, nodes_parsed);
@@ -510,26 +491,6 @@ static int null_slit_node_compare(int a, int b)
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init srat_reserve_add_area(int nodeid)
-{
-	if (found_add_area && nodes_add[nodeid].end) {
-		u64 total_mb;
-
-		printk(KERN_INFO "SRAT: Reserving hot-add memory space "
-				"for node %d at %Lx-%Lx\n",
-			nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
-		total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
-					>> PAGE_SHIFT;
-		total_mb *= sizeof(struct page);
-		total_mb >>= 20;
-		printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
-				"pre-allocated memory.\n", (unsigned long long)total_mb);
-		reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
-			       nodes_add[nodeid].end - nodes_add[nodeid].start,
-			       BOOTMEM_DEFAULT);
-	}
-}
-
 int __node_distance(int a, int b)
 {
 	int index;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d475c75..511b098670964 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1031,8 +1031,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
 extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
 					unsigned long end_pfn);
-extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
-					unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
 						unsigned long end_pfn);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index fe753ecf2aa5f..474c7e9dd51ac 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve;
   static int __meminitdata nr_nodemap_entries;
   static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
   static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
-  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
   static unsigned long __initdata required_kernelcore;
   static unsigned long __initdata required_movablecore;
   static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -3102,64 +3098,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
 				early_node_map[i].end_pfn);
 }
 
-/**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
-		unsigned long start_pfn, unsigned long end_pfn)
-{
-	mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-			"Entering push_node_boundaries(%u, %lu, %lu)\n",
-			nid, start_pfn, end_pfn);
-
-	/* Initialise the boundary for this node if necessary */
-	if (node_boundary_end_pfn[nid] == 0)
-		node_boundary_start_pfn[nid] = -1UL;
-
-	/* Update the boundaries */
-	if (node_boundary_start_pfn[nid] > start_pfn)
-		node_boundary_start_pfn[nid] = start_pfn;
-	if (node_boundary_end_pfn[nid] < end_pfn)
-		node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __meminit account_node_boundary(unsigned int nid,
-		unsigned long *start_pfn, unsigned long *end_pfn)
-{
-	mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-			"Entering account_node_boundary(%u, %lu, %lu)\n",
-			nid, *start_pfn, *end_pfn);
-
-	/* Return if boundary information has not been provided */
-	if (node_boundary_end_pfn[nid] == 0)
-		return;
-
-	/* Check the boundaries and update if necessary */
-	if (node_boundary_start_pfn[nid] < *start_pfn)
-		*start_pfn = node_boundary_start_pfn[nid];
-	if (node_boundary_end_pfn[nid] > *end_pfn)
-		*end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
-		unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __meminit account_node_boundary(unsigned int nid,
-		unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
 /**
  * get_pfn_range_for_nid - Return the start and end page frames for a node
  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
@@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
 
 	if (*start_pfn == -1UL)
 		*start_pfn = 0;
-
-	/* Push the node boundaries out if requested */
-	account_node_boundary(nid, start_pfn, end_pfn);
 }
 
 /*
@@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void)
 {
 	memset(early_node_map, 0, sizeof(early_node_map));
 	nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-	memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
-	memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 }
 
 /* Compare two active node_active_regions */
-- 
GitLab


From 7c43769a9776141ec23ca81a1bdd5a9c0512f165 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:59:37 -0700
Subject: [PATCH 1186/2198] x86, mm: Fix node_possible_map logic

Recently there were some changes to the meaning of node_possible_map,
and it is quite strange:

- the node without memory would be set in node_possible_map
- but some node with less NODE_MIN_SIZE will be kicked out of node_possible_map.

fix it by adding strict_setup_node_bootmem().

Also, remove unparse_node().

so result will be:

1. cpu_to_node() will return online node only (nearest one)
2. apicid_to_node() still returns the node that could be not online but is set
   in node_possible_map.
3. node_possible_map will include nodes that mem on it are less NODE_MIN_SIZE

v2: after move_cpus_to_node change.

[ Impact: get node_possible_map right ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Tested-by: Jack Steiner <steiner@sgi.com>
LKML-Reference: <4A0C49BE.6080800@kernel.org>
[ v3: various small cleanups and comment clarifications ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/numa_64.h |  7 +++++++
 arch/x86/mm/numa_64.c          | 13 ++++++++++---
 arch/x86/mm/srat_64.c          | 29 ++---------------------------
 3 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 7feff0648d743..c4ae822e415f9 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -24,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
 			       unsigned long end);
 
 #ifdef CONFIG_NUMA
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
 extern void __init init_cpu_to_node(void);
 extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a6a93c395231a..459913beac71d 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 }
 
 /* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start,
-			       unsigned long end)
+void __init
+setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 {
 	unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 	unsigned long bootmap_start, nodedata_phys;
 	void *bootmap;
-	const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
 	int nid;
 
 	if (!end)
 		return;
 
+	/*
+	 * Don't confuse VM with a node that doesn't have the
+	 * minimum amount of memory:
+	 */
+	if (end && (end - start) < NODE_MIN_SIZE)
+		return;
+
 	start = roundup(start, ZONE_ALIGN);
 
 	printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index b0dbbd48e58a3..2dfcbf9df2ae8 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -36,10 +36,6 @@ static int num_node_memblks __initdata;
 static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
 static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
 
-/* Too small nodes confuse the VM badly. Usually they result
-   from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
 static __init int setup_node(int pxm)
 {
 	return acpi_map_pxm_to_node(pxm);
@@ -338,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
 	return 1;
 }
 
-static void __init unparse_node(int node)
-{
-	int i;
-	node_clear(node, nodes_parsed);
-	node_clear(node, cpu_nodes_parsed);
-	for (i = 0; i < MAX_LOCAL_APIC; i++) {
-		if (apicid_to_node[i] == node)
-			apicid_to_node[i] = NUMA_NO_NODE;
-	}
-}
-
 void __init acpi_numa_arch_fixup(void) {}
 
 /* Use the information discovered above to actually set up the nodes. */
@@ -360,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 		return -1;
 
 	/* First clean up the node list */
-	for (i = 0; i < MAX_NUMNODES; i++) {
+	for (i = 0; i < MAX_NUMNODES; i++)
 		cutoff_node(i, start, end);
-		/*
-		 * don't confuse VM with a node that doesn't have the
-		 * minimum memory.
-		 */
-		if (nodes[i].end &&
-			(nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
-			unparse_node(i);
-			node_set_offline(i);
-		}
-	}
 
 	if (!nodes_cover_memory(nodes)) {
 		bad_srat();
@@ -404,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
 		if (node == NUMA_NO_NODE)
 			continue;
-		if (!node_isset(node, node_possible_map))
+		if (!node_online(node))
 			numa_clear_node(i);
 	}
 	numa_init_array();
-- 
GitLab


From 35d5a9a61490bf39d2e48d7f499c8c801a39ebe9 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:59:37 -0700
Subject: [PATCH 1187/2198] x86: fix system without memory on node0

Jack found a boot crash on a system which doesn't have memory on node0.

It turns out with recent per_cpu changes, node_number for BSP will always
be 0, and it is not consistent to cpu_to_node() that might set it to a
different (nearer) node already.

aka when numa_set_node() for node0 is called early before per_cpu area is
setup:

two places touched that per_cpu(node_number,):

1. in cpu/common.c::cpu_init() and it is not for BP
| #ifdef CONFIG_NUMA
|        if (cpu != 0 && percpu_read(node_number) == 0 &&
|            cpu_to_node(cpu) != NUMA_NO_NODE)
|                percpu_write(node_number, cpu_to_node(cpu));
| #endif
for BP: traps_init ==> cpu_init
for AP: start_secondary ==> cpu_init

2. cpu/intel.c or amd.c::srat_detect_node via numa_set_node()
for BP: check_bugs ==> identify_boot_cpu ==> identify_cpu()
	 that is rather later before numa_node_id() is used for BP...
for AP: start_secondary => smp_callin => smp_store_cpu_info() =>
	=> identify_secondary_cpu => identify_cpu()

so try to set that for BP earlier in setup_per_cpu_areas(), and
don't bother to set that for APs there (it will be updated later
and will be used later)

(and don't mess the 0 before the copying BP per_cpu data to APs)

[ Impact: fix boot crash on memoryless node-0 ]

Reported-and-tested-by: Jack Steiner <steiner@sgi.com>
Cc: Tejun Heo <htejun@gmail.com>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4A0C4A02.7050401@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/setup_percpu.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 3a97a4cf18724..3b5f3271e7308 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -423,6 +423,14 @@ void __init setup_per_cpu_areas(void)
 	early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+	/*
+	 * make sure boot cpu node_number is right, when boot cpu is on the
+	 * node that doesn't have mem installed
+	 */
+	per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
 	/* Setup node to cpumask map */
 	setup_node_to_cpumask_map();
 
-- 
GitLab


From 629e15d245f46bef9d26199b450f882f9437a8fe Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: [PATCH 1188/2198] x86, irq: update_mptable needs pci_routeirq

To get all device irq routing and to save them.

This is basically an implicit pci=routeirq enablement if (and on if)
the update_mptable boot option (which is off by default) has been
specified.

[ Impact: extend the update_mptable boot opion's scope ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Jesse Barnes <jbarnes@virtuousgeek.org>
LKML-Reference: <4A0DB7B4.4060702@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/mpparse.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index cd2a41a7c45c3..e6bf9d08e5033 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -17,6 +17,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/smp.h>
+#include <linux/pci.h>
 
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -961,6 +962,9 @@ static int __initdata enable_update_mptable;
 static int __init update_mptable_setup(char *str)
 {
 	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
 	return 0;
 }
 early_param("update_mptable", update_mptable_setup);
@@ -973,6 +977,9 @@ static int __initdata alloc_mptable;
 static int __init parse_alloc_mptable_opt(char *p)
 {
 	enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+	pci_routeirq = 1;
+#endif
 	alloc_mptable = 1;
 	if (!p)
 		return 0;
-- 
GitLab


From f1bdb523880c7f6990e9e8e50b0fc972ca475e84 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Fri, 15 May 2009 13:05:16 -0700
Subject: [PATCH 1189/2198] x86, irq: don't call mp_config_acpi_gsi() if
 update_mptable is not enabled

Len expressed concern that the update_mptable feature has
side-effects on the ACPI code.

Make it sure explicitly that the code only ever gets called if
the (default disabled) update_mptable boot quirk option is
disabled.

[ Impact: isolate the update_mptable feature from ACPI code more ]

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Cc: Len Brown <lenb@kernel.org>
LKML-Reference: <4A0DC832.5090200@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/mpspec.h | 9 +++++++++
 arch/x86/kernel/acpi/boot.c   | 4 +++-
 arch/x86/kernel/mpparse.c     | 2 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c34961a45ec00..3dcbaaaa363e7 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -87,6 +87,15 @@ static inline int acpi_probe_gsi(void)
 }
 #endif /* CONFIG_ACPI */
 
+#ifdef CONFIG_X86_MPPARSE
+extern int enable_update_mptable;
+#else
+static inline int enable_update_mptable(void)
+{
+	return 0;
+}
+#endif
+
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
 
 struct physid_mask {
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 4af63dfb0f06f..844e5e25213ba 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1226,7 +1226,9 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 		       ioapic_pin);
 		return gsi;
 	}
-	mp_config_acpi_gsi(dev, gsi, trigger, polarity);
+
+	if (enable_update_mptable)
+		mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 
 	set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
 			     trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e6bf9d08e5033..651c93b28862a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -957,7 +957,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
 	return 0;
 }
 
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
 
 static int __init update_mptable_setup(char *str)
 {
-- 
GitLab


From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 19:37:25 +0200
Subject: [PATCH 1190/2198] perf_counter, x86: speed up the scheduling
 fast-path

We have to set up the LVT entry only at counter init time, not at
every switch-in time.

There's friction between NMI and non-NMI use here - we'll probably
remove the per counter configurability of it - but until then, dont
slow down things ...

[ Impact: micro-optimization ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 5bfd30ab39205..c109819c2cb95 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
+	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -603,8 +604,6 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
-
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init(1);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
GitLab


From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Sun, 17 May 2009 15:31:38 +0800
Subject: [PATCH 1191/2198] tracing: fix check for return value of
 register_module_notifier

return zero should be correct, so fix it.

[ Impact: eliminate incorrect syslog message ]

Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: rostedt@goodmis.org
LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5b606f45b6c43..140699a9a8a72 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2826,7 +2826,7 @@ void __init ftrace_init(void)
 				  __stop_mcount_loc);
 
 	ret = register_module_notifier(&ftrace_module_nb);
-	if (!ret)
+	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
 	return;
-- 
GitLab


From 6c3b46f74587d46e71b8c2b78fdca626a3aca280 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 18 May 2009 14:38:28 +0200
Subject: [PATCH 1192/2198] virtio_blk: don't blindly derefence req->rq_disk

request->rq_disk is only set for FS requests or BLOCK_PC requests
originating from the generic block layer scsi ioctls.  It's not set
for requests origination from other soures or internal cache flush
commands implemented by the patch I'll send after this.

So instead of using it to get at the private data in do_virtblk_request
setup queue->queuedata and use it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/virtio_blk.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 29a9daf48621f..dfe9ee5f1696b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -124,12 +124,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 
 static void do_virtblk_request(struct request_queue *q)
 {
-	struct virtio_blk *vblk = NULL;
+	struct virtio_blk *vblk = q->queuedata;
 	struct request *req;
 	unsigned int issued = 0;
 
 	while ((req = blk_peek_request(q)) != NULL) {
-		vblk = req->rq_disk->private_data;
 		BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
 
 		/* If this request fails, stop queue and wait for something to
@@ -249,6 +248,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 		goto out_put_disk;
 	}
 
+	vblk->disk->queue->queuedata = vblk;
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, vblk->disk->queue);
 
 	if (index < 26) {
-- 
GitLab


From 1cde26f928863d90e9e7c1217880c8450464d305 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 18 May 2009 14:41:30 +0200
Subject: [PATCH 1193/2198] virtio_blk: SG_IO passthru support

Add support for SG_IO passthru to virtio_blk.  We add the scsi command
block after the normal outhdr, and the scsi inhdr with full status
information aswell as the sense buffer before the regular inhdr.

[hch: forward ported, added the VIRTIO_BLK_F_SCSI flags, some comments
 and tested the whole beast]
[axboe: updated to use ->resid and not dual-path the byte count]

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (+ checkpatch.pl tweak)
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/virtio_blk.c | 64 +++++++++++++++++++++++++++++---------
 include/linux/virtio_blk.h |  8 +++++
 2 files changed, 58 insertions(+), 14 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index dfe9ee5f1696b..62275dbdf2eb2 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -37,6 +37,7 @@ struct virtblk_req
 	struct list_head list;
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
+	struct virtio_scsi_inhdr in_hdr;
 	u8 status;
 };
 
@@ -49,7 +50,9 @@ static void blk_done(struct virtqueue *vq)
 
 	spin_lock_irqsave(&vblk->lock, flags);
 	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
+		unsigned int nr_bytes;
 		int error;
+
 		switch (vbr->status) {
 		case VIRTIO_BLK_S_OK:
 			error = 0;
@@ -62,6 +65,12 @@ static void blk_done(struct virtqueue *vq)
 			break;
 		}
 
+		if (blk_pc_request(vbr->req)) {
+			vbr->req->resid_len = vbr->in_hdr.residual;
+			vbr->req->sense_len = vbr->in_hdr.sense_len;
+			vbr->req->errors = vbr->in_hdr.errors;
+		}
+
 		__blk_end_request_all(vbr->req, error);
 		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
@@ -74,7 +83,7 @@ static void blk_done(struct virtqueue *vq)
 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		   struct request *req)
 {
-	unsigned long num, out, in;
+	unsigned long num, out = 0, in = 0;
 	struct virtblk_req *vbr;
 
 	vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
@@ -99,18 +108,36 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 	if (blk_barrier_rq(vbr->req))
 		vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
 
-	sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr));
-	num = blk_rq_map_sg(q, vbr->req, vblk->sg+1);
-	sg_set_buf(&vblk->sg[num+1], &vbr->status, sizeof(vbr->status));
+	sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
 
-	if (rq_data_dir(vbr->req) == WRITE) {
-		vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
-		out = 1 + num;
-		in = 1;
-	} else {
-		vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
-		out = 1;
-		in = 1 + num;
+	/*
+	 * If this is a packet command we need a couple of additional headers.
+	 * Behind the normal outhdr we put a segment with the scsi command
+	 * block, and before the normal inhdr we put the sense data and the
+	 * inhdr with additional status information before the normal inhdr.
+	 */
+	if (blk_pc_request(vbr->req))
+		sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
+
+	num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
+
+	if (blk_pc_request(vbr->req)) {
+		sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
+		sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
+			   sizeof(vbr->in_hdr));
+	}
+
+	sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
+		   sizeof(vbr->status));
+
+	if (num) {
+		if (rq_data_dir(vbr->req) == WRITE) {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
+			out += num;
+		} else {
+			vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
+			in += num;
+		}
 	}
 
 	if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) {
@@ -148,8 +175,16 @@ static void do_virtblk_request(struct request_queue *q)
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
-	return scsi_cmd_ioctl(bdev->bd_disk->queue,
-			      bdev->bd_disk, mode, cmd,
+	struct gendisk *disk = bdev->bd_disk;
+	struct virtio_blk *vblk = disk->private_data;
+
+	/*
+	 * Only allow the generic SCSI ioctls if the host can support it.
+	 */
+	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
+		return -ENOIOCTLCMD;
+
+	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
 			      (void __user *)data);
 }
 
@@ -356,6 +391,7 @@ static struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
 	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
+	VIRTIO_BLK_F_SCSI,
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 94c56d29869df..4dbcbc1c3481c 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -15,6 +15,7 @@
 #define VIRTIO_BLK_F_GEOMETRY	4	/* Legacy geometry available  */
 #define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
+#define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
 
 struct virtio_blk_config
 {
@@ -55,6 +56,13 @@ struct virtio_blk_outhdr
 	__u64 sector;
 };
 
+struct virtio_scsi_inhdr {
+	__u32 errors;
+	__u32 data_len;
+	__u32 sense_len;
+	__u32 residual;
+};
+
 /* And this is the final byte of the write scatter-gather list. */
 #define VIRTIO_BLK_S_OK		0
 #define VIRTIO_BLK_S_IOERR	1
-- 
GitLab


From f831cc03490c78a76e2d35ad77ec2292d0323728 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 18 May 2009 14:44:45 +0200
Subject: [PATCH 1194/2198] virtio_blk: get rid of unused variable

drivers/block/virtio_blk.c: In function 'blk_done':
drivers/block/virtio_blk.c:53: warning: unused variable 'nr_bytes'

Leftover from commit 1cde26f928863d90e9e7c1217880c8450464d305

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/virtio_blk.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 62275dbdf2eb2..511d4ae2d1764 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -50,7 +50,6 @@ static void blk_done(struct virtqueue *vq)
 
 	spin_lock_irqsave(&vblk->lock, flags);
 	while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) {
-		unsigned int nr_bytes;
 		int error;
 
 		switch (vbr->status) {
-- 
GitLab


From 75834fc3b6fcff00327f5d2a18760c1e8e0179c5 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 18 May 2009 10:26:10 -0400
Subject: [PATCH 1195/2198] SELinux: move SELINUX_MAGIC into magic.h

The selinuxfs superblock magic is used inside the IMA code, but is being
defined in two places and could someday get out of sync.  This patch moves the
declaration into magic.h so it is only done once.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/magic.h               | 1 +
 security/integrity/ima/ima_policy.c | 8 +++-----
 security/selinux/include/security.h | 3 +--
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/include/linux/magic.h b/include/linux/magic.h
index 5b4e28bcb788d..927138cf30502 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -9,6 +9,7 @@
 #define DEBUGFS_MAGIC          0x64626720
 #define SYSFS_MAGIC		0x62656572
 #define SECURITYFS_MAGIC	0x73636673
+#define SELINUX_MAGIC		0xf97cff8c
 #define TMPFS_MAGIC		0x01021994
 #define SQUASHFS_MAGIC		0x73717368
 #define EFS_SUPER_MAGIC		0x414A53
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index dec6dcb1c8de5..31d677f7c65f5 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -49,14 +49,12 @@ struct ima_measure_rule_entry {
  * written in terms of .action, .func, .mask, .fsmagic, and .uid
  */
 static struct ima_measure_rule_entry default_rules[] = {
-	{.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,
-	 .flags = IMA_FSMAGIC},
+	{.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC},
-	{.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,
-	 .flags = IMA_FSMAGIC},
-	{.action = DONT_MEASURE,.fsmagic = 0xF97CFF8C,.flags = IMA_FSMAGIC},
+	{.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC},
+	{.action = DONT_MEASURE,.fsmagic = SELINUX_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = MEASURE,.func = FILE_MMAP,.mask = MAY_EXEC,
 	 .flags = IMA_FUNC | IMA_MASK},
 	{.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h
index a7be3f01fb081..ca835795a8b32 100644
--- a/security/selinux/include/security.h
+++ b/security/selinux/include/security.h
@@ -8,14 +8,13 @@
 #ifndef _SELINUX_SECURITY_H_
 #define _SELINUX_SECURITY_H_
 
+#include <linux/magic.h>
 #include "flask.h"
 
 #define SECSID_NULL			0x00000000 /* unspecified SID */
 #define SECSID_WILD			0xffffffff /* wildcard SID */
 #define SECCLASS_NULL			0x0000 /* no class */
 
-#define SELINUX_MAGIC 0xf97cff8c
-
 /* Identify specific policy version changes */
 #define POLICYDB_VERSION_BASE		15
 #define POLICYDB_VERSION_BOOL		16
-- 
GitLab


From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 19 May 2009 09:22:19 +0200
Subject: [PATCH 1196/2198] sched: properly define the sched_group::cpumask and
 sched_domain::span fields

Properly document the variable-size structure tricks we are doing
wrt. struct sched_group and sched_domain, and use the field[0] GCC
extension instead of defining a vla array.

Dont use unions for this, as pointed out by Linus.

[ Impact: cleanup, un-confuse Sparse and LLVM ]

Reported-by: Jeff Garzik <jeff@garzik.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <alpine.LFD.2.01.0905180850110.3301@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 25 ++++++++++++++++++++++---
 kernel/sched.c        |  5 +++--
 2 files changed, 25 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index de7b3b2177729..dbb1043e86569 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -839,7 +839,17 @@ struct sched_group {
 	 */
 	u32 reciprocal_cpu_power;
 
-	unsigned long cpumask[];
+	/*
+	 * The CPUs this group covers.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It is also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_group' in kernel/sched.c)
+	 */
+	unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -925,8 +935,17 @@ struct sched_domain {
 	char *name;
 #endif
 
-	/* span of all CPUs in this domain */
-	unsigned long span[];
+	/*
+	 * Span of all CPUs in this domain.
+	 *
+	 * NOTE: this field is variable length. (Allocated dynamically
+	 * by attaching extra space to the end of the structure,
+	 * depending on how many CPUs the kernel has booted up with)
+	 *
+	 * It is also be embedded into static data structures at build
+	 * time. (See 'struct static_sched_domain' in kernel/sched.c)
+	 */
+	unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
diff --git a/kernel/sched.c b/kernel/sched.c
index 497c09ba61e77..228acae8821f6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
 	struct sched_group sg;
-- 
GitLab


From 143c145e3a475065a4be661468d0df1bd0b25f74 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 19 May 2009 14:43:15 +0800
Subject: [PATCH 1197/2198] tracing/events: Documentation updates

- fix some typos
- document the difference between '>' and '>>'
- document the 'enable' toggle
- remove section "Defining an event-enabled tracepoint", since it's
  out-dated and sample/trace_events/ already serves this purpose.

v2: add "Updated by Li Zefan"

[ Impact: make documentation up-to-date ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
LKML-Reference: <4A125503.5060406@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/events.txt | 159 ++++++++++++---------------------
 1 file changed, 57 insertions(+), 102 deletions(-)

diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index abdee664c0f66..f157d7594ea7c 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -1,9 +1,10 @@
 			     Event Tracing
 
 		Documentation written by Theodore Ts'o
+			Updated by Li Zefan
 
-Introduction
-============
+1. Introduction
+===============
 
 Tracepoints (see Documentation/trace/tracepoints.txt) can be used
 without creating custom kernel modules to register probe functions
@@ -12,30 +13,37 @@ using the event tracing infrastructure.
 Not all tracepoints can be traced using the event tracing system;
 the kernel developer must provide code snippets which define how the
 tracing information is saved into the tracing buffer, and how the
-the tracing information should be printed.
+tracing information should be printed.
 
-Using Event Tracing
-===================
+2. Using Event Tracing
+======================
+
+2.1 Via the 'set_event' interface
+---------------------------------
 
 The events which are available for tracing can be found in the file
-/sys/kernel/debug/tracing/available_events.
+/debug/tracing/available_events.
 
 To enable a particular event, such as 'sched_wakeup', simply echo it
-to /sys/debug/tracing/set_event. For example:
+to /debug/tracing/set_event. For example:
 
-	# echo sched_wakeup > /sys/kernel/debug/tracing/set_event
+	# echo sched_wakeup >> /debug/tracing/set_event
 
-[ Note: events can also be enabled/disabled via the 'enabled' toggle
-  found in the /sys/kernel/tracing/events/ hierarchy of directories. ]
+[ Note: '>>' is necessary, otherwise it will firstly disable
+  all the events. ]
 
 To disable an event, echo the event name to the set_event file prefixed
 with an exclamation point:
 
-	# echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event
+	# echo '!sched_wakeup' >> /debug/tracing/set_event
+
+To disable all events, echo an empty line to the set_event file:
+
+	# echo > /debug/tracing/set_event
 
-To disable events, echo an empty line to the set_event file:
+To enable all events, echo '*:*' or '*:' to the set_event file:
 
-	# echo > /sys/kernel/debug/tracing/set_event
+	# echo *:* > /debug/tracing/set_event
 
 The events are organized into subsystems, such as ext4, irq, sched,
 etc., and a full event name looks like this: <subsystem>:<event>.  The
@@ -44,92 +52,39 @@ file.  All of the events in a subsystem can be specified via the syntax
 "<subsystem>:*"; for example, to enable all irq events, you can use the
 command:
 
-	# echo 'irq:*' > /sys/kernel/debug/tracing/set_event
-
-Defining an event-enabled tracepoint
-------------------------------------
-
-A kernel developer which wishes to define an event-enabled tracepoint
-must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE.
-This is done via two header files in include/trace.  For example, to
-event-enable the jbd2 subsystem, we must create two files,
-include/trace/jbd2.h and include/trace/jbd2_event_types.h.  The
-include/trace/jbd2.h file should be included by kernel source files that
-will have a tracepoint inserted, and might look like this:
-
-#ifndef _TRACE_JBD2_H
-#define _TRACE_JBD2_H
-
-#include <linux/jbd2.h>
-#include <linux/tracepoint.h>
-
-#include <trace/jbd2_event_types.h>
-
-#endif
-
-In a file that utilizes a jbd2 tracepoint, this header file would be
-included.  Note that you still have to use DEFINE_TRACE().  So for
-example, if fs/jbd2/commit.c planned to use the jbd2_start_commit
-tracepoint, it would have the following near the beginning of the file:
-
-#include <trace/jbd2.h>
-
-DEFINE_TRACE(jbd2_start_commit);
-
-Then in the function that would call the tracepoint, it would call the
-tracepoint function.  (For more information, please see the tracepoint
-documentation in Documentation/trace/tracepoints.txt):
-
-	trace_jbd2_start_commit(journal, commit_transaction);
-
-The code snippets which allow jbd2_start_commit to be an event-enabled
-tracepoint are placed in the file include/trace/jbd2_event_types.h:
-
-/* use <trace/jbd2.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM jbd2
-
-#include <linux/jbd2.h>
-
-TRACE_EVENT(jbd2_start_commit,
-	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
-	TP_ARGS(journal, commit_transaction),
-	TP_STRUCT__entry(
-		__array(	char,	devname, BDEVNAME_SIZE+24 )
-		__field(	int,	transaction		  )
-	),
-	TP_fast_assign(
-		memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24);
-		__entry->transaction	= commit_transaction->t_tid;
-	),
-	TP_printk("dev %s transaction %d",
-		  __entry->devname, __entry->transaction)
-);
-
-The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE.  The new
-arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and
-TP_printk.
-
-TP_STRUCT__entry defines the data structure which will be stored in the
-trace buffer.  Normally, fields in __entry will be arrays or simple
-types.  It is possible to place data structures in __entry --- however,
-pointers in the data structure can not be trusted, since they will be
-accessed sometime later by TP_printk, and if the data structure contains
-fields that will not or cannot be used by TP_printk, this will waste
-space in the trace buffer.  In general, data structures should be
-avoided, unless they do only contain non-pointer types and all of the
-fields will be used by TP_printk.
-
-TP_fast_assign defines the code snippet which saves information into the
-__entry data structure, using the passed-in arguments defined in
-TP_PROTO and TP_ARGS.
-
-Finally, TP_printk will print the __entry data structure.  At the time
-when the code snippet defined by TP_printk is executed, it will not have
-access to the TP_ARGS arguments; it can only use the information saved
-in the __entry data structure.
+	# echo 'irq:*' > /debug/tracing/set_event
+
+2.2 Via the 'enable' toggle
+---------------------------
+
+The events available are also listed in /debug/tracing/events/ hierarchy
+of directories.
+
+To enable event 'sched_wakeup':
+
+	# echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To disable it:
+
+	# echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To enable all events in sched subsystem:
+
+	# echo 1 > /debug/tracing/events/sched/enable
+
+To eanble all events:
+
+	# echo 1 > /debug/tracing/events/enable
+
+When reading one of these enable files, there are four results:
+
+ 0 - all events this file affects are disabled
+ 1 - all events this file affects are enabled
+ X - there is a mixture of events enabled and disabled
+ ? - this file does not affect any event
+
+3. Defining an event-enabled tracepoint
+=======================================
+
+See The example provided in samples/trace_events
+
-- 
GitLab


From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001
From: Stefan Raspl <raspl@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 09:59:08 +0200
Subject: [PATCH 1198/2198] blktrace: remove debugfs entries on bad path

debugfs directory entries for devices are not removed on some
of the failure pathes in do_blk_trace_setup().
One way to reproduce is to start blktrace on multiple devices
with insufficient Vmalloc space: Devices will fail with
a message like this:

	BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error

If so, the respective entries in debugfs
(e.g. /sys/kernel/debug/block/sdu) will remain and subsequent
attempts to start blktrace on the respective devices will not
succeed due to existing directories.

[ Impact: fix /debug/tracing file cleanup corner case ]

Signed-off-by: Stefan Raspl <stefan.raspl@linux.vnet.ibm.com>
Acked-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: schwidefsky@de.ibm.com
Cc: heiko.carstens@de.ibm.com
LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 05b4747fd8732..e3abf55bc8e57 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
+	debugfs_remove(bt->dir);
 	relay_close(bt->rchan);
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
-- 
GitLab


From fe64d517df0970a68417184a12fcd4ba0589cc28 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 19 May 2009 10:01:18 +0100
Subject: [PATCH 1199/2198] GFS2: Umount recovery race fix

This patch fixes a race condition where we can receive recovery
requests part way through processing a umount. This was causing
problems since the recovery thread had already gone away.

Looking in more detail at the recovery code, it was really trying
to implement a slight variation on a work queue, and that happens to
align nicely with the recently introduced slow-work subsystem. As a
result I've updated the code to use slow-work, rather than its own home
grown variety of work queue.

When using the wait_on_bit() function, I noticed that the wait function
that was supplied as an argument was appearing in the WCHAN field, so
I've updated the function names in order to produce more meaningful
output.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Kconfig      |   1 +
 fs/gfs2/glock.c      |  21 +++++++--
 fs/gfs2/incore.h     |  14 +++---
 fs/gfs2/main.c       |   8 ++++
 fs/gfs2/ops_fstype.c |  20 +++------
 fs/gfs2/ops_super.c  |  25 ++++++++++-
 fs/gfs2/recovery.c   | 102 ++++++++++++++-----------------------------
 fs/gfs2/recovery.h   |   2 +-
 fs/gfs2/sys.c        |  53 +++++++++++-----------
 9 files changed, 122 insertions(+), 124 deletions(-)

diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index 3a981b7f64caf..cad957cdb1e5a 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -7,6 +7,7 @@ config GFS2_FS
 	select IP_SCTP if DLM_SCTP
 	select FS_POSIX_ACL
 	select CRC32
+	select SLOW_WORK
 	help
 	  A cluster filesystem.
 
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index ff49810904897..2bf62bcc51817 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -796,22 +796,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh)
 	gh->gh_ip = 0;
 }
 
-static int just_schedule(void *word)
+/**
+ * gfs2_glock_holder_wait
+ * @word: unused
+ *
+ * This function and gfs2_glock_demote_wait both show up in the WCHAN
+ * field. Thus I've separated these otherwise identical functions in
+ * order to be more informative to the user.
+ */
+
+static int gfs2_glock_holder_wait(void *word)
 {
         schedule();
         return 0;
 }
 
+static int gfs2_glock_demote_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
 static void wait_on_holder(struct gfs2_holder *gh)
 {
 	might_sleep();
-	wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE);
 }
 
 static void wait_on_demote(struct gfs2_glock *gl)
 {
 	might_sleep();
-	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE);
 }
 
 /**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 65f438e9537a0..0060e9564bb92 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -12,6 +12,7 @@
 
 #include <linux/fs.h>
 #include <linux/workqueue.h>
+#include <linux/slow-work.h>
 #include <linux/dlm.h>
 #include <linux/buffer_head.h>
 
@@ -376,11 +377,11 @@ struct gfs2_journal_extent {
 struct gfs2_jdesc {
 	struct list_head jd_list;
 	struct list_head extent_list;
-
+	struct slow_work jd_work;
 	struct inode *jd_inode;
+	unsigned long jd_flags;
+#define JDF_RECOVERY 1
 	unsigned int jd_jid;
-	int jd_dirty;
-
 	unsigned int jd_blocks;
 };
 
@@ -390,9 +391,6 @@ struct gfs2_statfs_change_host {
 	s64 sc_dinodes;
 };
 
-#define GFS2_GLOCKD_DEFAULT	1
-#define GFS2_GLOCKD_MAX		16
-
 #define GFS2_QUOTA_DEFAULT	GFS2_QUOTA_OFF
 #define GFS2_QUOTA_OFF		0
 #define GFS2_QUOTA_ACCOUNT	1
@@ -427,7 +425,6 @@ struct gfs2_tune {
 	unsigned int gt_incore_log_blocks;
 	unsigned int gt_log_flush_secs;
 
-	unsigned int gt_recoverd_secs;
 	unsigned int gt_logd_secs;
 
 	unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */
@@ -448,6 +445,7 @@ enum {
 	SDF_JOURNAL_LIVE	= 1,
 	SDF_SHUTDOWN		= 2,
 	SDF_NOBARRIERS		= 3,
+	SDF_NORECOVERY		= 4,
 };
 
 #define GFS2_FSNAME_LEN		256
@@ -494,7 +492,6 @@ struct lm_lockstruct {
 	unsigned long ls_flags;
 	dlm_lockspace_t *ls_dlm;
 
-	int ls_recover_jid;
 	int ls_recover_jid_done;
 	int ls_recover_jid_status;
 };
@@ -583,7 +580,6 @@ struct gfs2_sbd {
 
 	/* Daemon stuff */
 
-	struct task_struct *sd_recoverd_process;
 	struct task_struct *sd_logd_process;
 	struct task_struct *sd_quotad_process;
 
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index a6892ed0840a3..eacd78a5d0827 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -15,6 +15,7 @@
 #include <linux/init.h>
 #include <linux/gfs2_ondisk.h>
 #include <asm/atomic.h>
+#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
+	error = slow_work_register_user();
+	if (error)
+		goto fail_slow;
+
 	gfs2_register_debugfs();
 
 	printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__);
 
 	return 0;
 
+fail_slow:
+	unregister_filesystem(&gfs2meta_fs_type);
 fail_unregister:
 	unregister_filesystem(&gfs2_fs_type);
 fail:
@@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
+	slow_work_unregister_user();
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 7981fbc9fc3be..2cd1164c88d7f 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -17,6 +17,7 @@
 #include <linux/namei.h>
 #include <linux/mount.h>
 #include <linux/gfs2_ondisk.h>
+#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -55,7 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 	spin_lock_init(&gt->gt_spin);
 
 	gt->gt_incore_log_blocks = 1024;
-	gt->gt_recoverd_secs = 60;
 	gt->gt_logd_secs = 1;
 	gt->gt_quota_simul_sync = 64;
 	gt->gt_quota_warn_period = 10;
@@ -675,6 +675,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh)
 			break;
 
 		INIT_LIST_HEAD(&jd->extent_list);
+		slow_work_init(&jd->jd_work, &gfs2_recover_ops);
 		jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1);
 		if (!jd->jd_inode || IS_ERR(jd->jd_inode)) {
 			if (!jd->jd_inode)
@@ -700,14 +701,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 {
 	struct inode *master = sdp->sd_master_dir->d_inode;
 	struct gfs2_holder ji_gh;
-	struct task_struct *p;
 	struct gfs2_inode *ip;
 	int jindex = 1;
 	int error = 0;
 
 	if (undo) {
 		jindex = 0;
-		goto fail_recoverd;
+		goto fail_jinode_gh;
 	}
 
 	sdp->sd_jindex = gfs2_lookup_simple(master, "jindex");
@@ -800,18 +800,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	gfs2_glock_dq_uninit(&ji_gh);
 	jindex = 0;
 
-	p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd");
-	error = IS_ERR(p);
-	if (error) {
-		fs_err(sdp, "can't start recoverd thread: %d\n", error);
-		goto fail_jinode_gh;
-	}
-	sdp->sd_recoverd_process = p;
-
 	return 0;
 
-fail_recoverd:
-	kthread_stop(sdp->sd_recoverd_process);
 fail_jinode_gh:
 	if (!sdp->sd_args.ar_spectator)
 		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
@@ -1172,8 +1162,10 @@ static int fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 	}
 
-	if (sdp->sd_args.ar_spectator)
+	if (sdp->sd_args.ar_spectator) {
                 sb->s_flags |= MS_RDONLY;
+		set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	}
 	if (sdp->sd_args.ar_posix_acl)
 		sb->s_flags |= MS_POSIXACL;
 
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 0677a83785600..a3c2272e7cade 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -121,6 +121,12 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	return error;
 }
 
+static int gfs2_umount_recovery_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
 /**
  * gfs2_put_super - Unmount the filesystem
  * @sb: The VFS superblock
@@ -131,6 +137,7 @@ static void gfs2_put_super(struct super_block *sb)
 {
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	int error;
+	struct gfs2_jdesc *jd;
 
 	/*  Unfreeze the filesystem, if we need to  */
 
@@ -139,9 +146,25 @@ static void gfs2_put_super(struct super_block *sb)
 		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
 	mutex_unlock(&sdp->sd_freeze_lock);
 
+	/* No more recovery requests */
+	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	smp_mb();
+
+	/* Wait on outstanding recovery */
+restart:
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
+			continue;
+		spin_unlock(&sdp->sd_jindex_spin);
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+			    gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
+		goto restart;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
 	kthread_stop(sdp->sd_quotad_process);
 	kthread_stop(sdp->sd_logd_process);
-	kthread_stop(sdp->sd_recoverd_process);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		error = gfs2_make_fs_ro(sdp);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 247e8f7d6b3d5..59d2695509d30 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -13,8 +13,7 @@
 #include <linux/buffer_head.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/crc32.h>
-#include <linux/kthread.h>
-#include <linux/freezer.h>
+#include <linux/slow-work.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
         kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
 }
 
-/**
- * gfs2_recover_journal - recover a given journal
- * @jd: the struct gfs2_jdesc describing the journal
- *
- * Acquire the journal's lock, check to see if the journal is clean, and
- * do recovery if necessary.
- *
- * Returns: errno
- */
+static int gfs2_recover_get_ref(struct slow_work *work)
+{
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
+	if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags))
+		return -EBUSY;
+	return 0;
+}
 
-int gfs2_recover_journal(struct gfs2_jdesc *jd)
+static void gfs2_recover_put_ref(struct slow_work *work)
+{
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
+	clear_bit(JDF_RECOVERY, &jd->jd_flags);
+	smp_mb__after_clear_bit();
+	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
+}
+
+static void gfs2_recover_work(struct slow_work *work)
 {
+	struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work);
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 	struct gfs2_log_header_host head;
@@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 		gfs2_glock_dq_uninit(&j_gh);
 
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
-	return 0;
+	return;
 
 fail_gunlock_tr:
 	gfs2_glock_dq_uninit(&t_gh);
@@ -584,70 +590,28 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd)
 
 fail:
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
-	return error;
 }
 
-static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp)
-{
-	struct gfs2_jdesc *jd;
-	int found = 0;
-
-	spin_lock(&sdp->sd_jindex_spin);
+struct slow_work_ops gfs2_recover_ops = {
+	.get_ref = gfs2_recover_get_ref,
+	.put_ref = gfs2_recover_put_ref,
+	.execute = gfs2_recover_work,
+};
 
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (jd->jd_dirty) {
-			jd->jd_dirty = 0;
-			found = 1;
-			break;
-		}
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	if (!found)
-		jd = NULL;
 
-	return jd;
-}
-
-/**
- * gfs2_check_journals - Recover any dirty journals
- * @sdp: the filesystem
- *
- */
-
-static void gfs2_check_journals(struct gfs2_sbd *sdp)
+static int gfs2_recovery_wait(void *word)
 {
-	struct gfs2_jdesc *jd;
-
-	for (;;) {
-		jd = gfs2_jdesc_find_dirty(sdp);
-		if (!jd)
-			break;
-
-		if (jd != sdp->sd_jdesc)
-			gfs2_recover_journal(jd);
-	}
+	schedule();
+	return 0;
 }
 
-/**
- * gfs2_recoverd - Recover dead machine's journals
- * @sdp: Pointer to GFS2 superblock
- *
- */
-
-int gfs2_recoverd(void *data)
+int gfs2_recover_journal(struct gfs2_jdesc *jd)
 {
-	struct gfs2_sbd *sdp = data;
-	unsigned long t;
-
-	while (!kthread_should_stop()) {
-		gfs2_check_journals(sdp);
-		t = gfs2_tune_get(sdp,  gt_recoverd_secs) * HZ;
-		if (freezing(current))
-			refrigerator();
-		schedule_timeout_interruptible(t);
-	}
-
+	int rv;
+	rv = slow_work_enqueue(&jd->jd_work);
+	if (rv)
+		return rv;
+	wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE);
 	return 0;
 }
 
diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h
index a8218ea15b57d..1616ac22569a9 100644
--- a/fs/gfs2/recovery.h
+++ b/fs/gfs2/recovery.h
@@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp);
 extern int gfs2_find_jhead(struct gfs2_jdesc *jd,
 		    struct gfs2_log_header_host *head);
 extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd);
-extern int gfs2_recoverd(void *data);
+extern struct slow_work_ops gfs2_recover_ops;
 
 #endif /* __RECOVERY_DOT_H__ */
 
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 894bf773ec930..9f6d48b75fd2d 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -356,34 +356,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_first_done);
 }
 
-static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	return sprintf(buf, "%d\n", ls->ls_recover_jid);
-}
-
-static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid)
+static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 {
+	unsigned jid;
 	struct gfs2_jdesc *jd;
+	int rv;
+
+	rv = sscanf(buf, "%u", &jid);
+	if (rv != 1)
+		return -EINVAL;
 
+	rv = -ESHUTDOWN;
 	spin_lock(&sdp->sd_jindex_spin);
+	if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
+		goto out;
+	rv = -EBUSY;
+	if (sdp->sd_jdesc->jd_jid == jid)
+		goto out;
+	rv = -ENOENT;
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		if (jd->jd_jid != jid)
 			continue;
-		jd->jd_dirty = 1;
+		rv = slow_work_enqueue(&jd->jd_work);
 		break;
 	}
+out:
 	spin_unlock(&sdp->sd_jindex_spin);
-}
-
-static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
-{
-	struct lm_lockstruct *ls = &sdp->sd_lockstruct;
-	ls->ls_recover_jid = simple_strtol(buf, NULL, 0);
-	gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid);
-	if (sdp->sd_recoverd_process)
-		wake_up_process(sdp->sd_recoverd_process);
-	return len;
+	return rv ? rv : len;
 }
 
 static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf)
@@ -401,15 +400,15 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
-GDLM_ATTR(proto_name,     0444, proto_name_show,     NULL);
-GDLM_ATTR(block,          0644, block_show,          block_store);
-GDLM_ATTR(withdraw,       0644, withdraw_show,       withdraw_store);
-GDLM_ATTR(id,             0444, lkid_show,           NULL);
-GDLM_ATTR(first,          0444, lkfirst_show,        NULL);
-GDLM_ATTR(first_done,     0444, first_done_show,     NULL);
-GDLM_ATTR(recover,        0644, recover_show,        recover_store);
-GDLM_ATTR(recover_done,   0444, recover_done_show,   NULL);
-GDLM_ATTR(recover_status, 0444, recover_status_show, NULL);
+GDLM_ATTR(proto_name,     0444, proto_name_show,	NULL);
+GDLM_ATTR(block,          0644, block_show,		block_store);
+GDLM_ATTR(withdraw,       0644, withdraw_show,		withdraw_store);
+GDLM_ATTR(id,             0444, lkid_show,		NULL);
+GDLM_ATTR(first,          0444, lkfirst_show,		NULL);
+GDLM_ATTR(first_done,     0444, first_done_show,	NULL);
+GDLM_ATTR(recover,        0200, NULL,			recover_store);
+GDLM_ATTR(recover_done,   0444, recover_done_show,	NULL);
+GDLM_ATTR(recover_status, 0444, recover_status_show,	NULL);
 
 static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_proto_name.attr,
-- 
GitLab


From e9ccb73ab57ada469602506496c42e5b4468ac3e Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 19 May 2009 10:23:23 +0100
Subject: [PATCH 1200/2198] GFS2: Update docs

Update a few things which were out of date, and fix a typo.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 Documentation/filesystems/gfs2-glocks.txt |  2 +-
 Documentation/filesystems/gfs2.txt        | 19 +++++++++++--------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt
index 4dae9a3840bf9..0494f78d87e40 100644
--- a/Documentation/filesystems/gfs2-glocks.txt
+++ b/Documentation/filesystems/gfs2-glocks.txt
@@ -60,7 +60,7 @@ go_lock          | Called for the first local holder of a lock
 go_unlock        | Called on the final local unlock of a lock
 go_dump          | Called to print content of object for debugfs file, or on
                  | error to dump glock to the log.
-go_type;         | The type of the glock, LM_TYPE_.....
+go_type          | The type of the glock, LM_TYPE_.....
 go_min_hold_time | The minimum hold time
 
 The minimum hold time for each lock is the time after a remote lock
diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt
index 593004b6bbaba..5e3ab8f3beff1 100644
--- a/Documentation/filesystems/gfs2.txt
+++ b/Documentation/filesystems/gfs2.txt
@@ -11,18 +11,15 @@ their I/O so file system consistency is maintained.  One of the nifty
 features of GFS is perfect consistency -- changes made to the file system
 on one machine show up immediately on all other machines in the cluster.
 
-GFS uses interchangable inter-node locking mechanisms.  Different lock
-modules can plug into GFS and each file system selects the appropriate
-lock module at mount time.  Lock modules include:
+GFS uses interchangable inter-node locking mechanisms, the currently
+supported mechanisms are:
 
   lock_nolock -- allows gfs to be used as a local file system
 
   lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking
   The dlm is found at linux/fs/dlm/
 
-In addition to interfacing with an external locking manager, a gfs lock
-module is responsible for interacting with external cluster management
-systems.  Lock_dlm depends on user space cluster management systems found
+Lock_dlm depends on user space cluster management systems found
 at the URL above.
 
 To use gfs as a local file system, no external clustering systems are
@@ -31,13 +28,19 @@ needed, simply:
   $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device
   $ mount -t gfs2 /dev/block_device /dir
 
-GFS2 is not on-disk compatible with previous versions of GFS.
+If you are using Fedora, you need to install the gfs2-utils package
+and, for lock_dlm, you will also need to install the cman package
+and write a cluster.conf as per the documentation.
+
+GFS2 is not on-disk compatible with previous versions of GFS, but it
+is pretty close.
 
 The following man pages can be found at the URL above:
-  gfs2_fsck	to repair a filesystem
+  fsck.gfs2	to repair a filesystem
   gfs2_grow	to expand a filesystem online
   gfs2_jadd	to add journals to a filesystem online
   gfs2_tool	to manipulate, examine and tune a filesystem
   gfs2_quota	to examine and change quota values in a filesystem
+  gfs2_convert	to convert a gfs filesystem to gfs2 in-place
   mount.gfs2	to help mount(8) mount a filesystem
   mkfs.gfs2	to make a filesystem
-- 
GitLab


From 3755100dd5f66761aaaa7ae44c70b319a7c78a56 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 May 2009 18:33:04 +0900
Subject: [PATCH 1201/2198] ub: use __blk_end_request_all()

ub_end_rq() always tries to complete full request.  The @cmd_len
parameter was there because rq->data_len used to be overwritten with
residue count.  Drop @cmd_len and use __blk_end_request_all().

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/ub.c | 24 +++++++-----------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 178f459a50ed8..f32781cff4566 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -360,8 +360,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun,
     struct ub_scsi_cmd *cmd, struct ub_request *urq);
 static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
-static void ub_end_rq(struct request *rq, unsigned int status,
-    unsigned int cmd_len);
+static void ub_end_rq(struct request *rq, unsigned int status);
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
     struct ub_request *urq, struct ub_scsi_cmd *cmd);
 static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd);
@@ -644,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 
 	if (atomic_read(&sc->poison)) {
 		blk_start_request(rq);
-		ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq));
+		ub_end_rq(rq, DID_NO_CONNECT << 16);
 		return 0;
 	}
 
 	if (lun->changed && !blk_pc_request(rq)) {
 		blk_start_request(rq);
-		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq));
+		ub_end_rq(rq, SAM_STAT_CHECK_CONDITION);
 		return 0;
 	}
 
@@ -702,7 +701,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq)
 
 drop:
 	ub_put_cmd(lun, cmd);
-	ub_end_rq(rq, DID_ERROR << 16, blk_rq_bytes(rq));
+	ub_end_rq(rq, DID_ERROR << 16);
 	return 0;
 }
 
@@ -777,7 +776,6 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 	struct ub_request *urq = cmd->back;
 	struct request *rq;
 	unsigned int scsi_status;
-	unsigned int cmd_len;
 
 	rq = urq->rq;
 
@@ -816,17 +814,14 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	urq->rq = NULL;
 
-	cmd_len = cmd->len;
 	ub_put_cmd(lun, cmd);
-	ub_end_rq(rq, scsi_status, cmd_len);
+	ub_end_rq(rq, scsi_status);
 	blk_start_queue(lun->disk->queue);
 }
 
-static void ub_end_rq(struct request *rq, unsigned int scsi_status,
-    unsigned int cmd_len)
+static void ub_end_rq(struct request *rq, unsigned int scsi_status)
 {
 	int error;
-	long rqlen;
 
 	if (scsi_status == 0) {
 		error = 0;
@@ -834,12 +829,7 @@ static void ub_end_rq(struct request *rq, unsigned int scsi_status,
 		error = -EIO;
 		rq->errors = scsi_status;
 	}
-	rqlen = blk_rq_bytes(rq);    /* Oddly enough, this is the residue. */
-	if (__blk_end_request(rq, error, cmd_len)) {
-		printk(KERN_WARNING DRV_NAME
-		    ": __blk_end_request blew, %s-cmd total %u rqlen %ld\n",
-		    blk_pc_request(rq)? "pc": "fs", cmd_len, rqlen);
-	}
+	__blk_end_request_all(rq, error);
 }
 
 static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun,
-- 
GitLab


From 5f49f63178360b07a095bd33b0d850d60edf7590 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 May 2009 18:33:05 +0900
Subject: [PATCH 1202/2198] block: set rq->resid_len to blk_rq_bytes() on issue

In commit c3a4d78c580de4edc9ef0f7c59812fb02ceb037f, while introducing
rq->resid_len, the default value of residue count was changed from
full count to zero.  The conversion was done under the assumption that
when a request fails residue count wasn't defined.  However, Boaz and
James pointed out that this wasn't true and the residue count should
be preserved for failed requests too.

This patchset restores the original behavior by setting rq->resid_len
to blk_rq_bytes(rq) on request start and restoring explicit clearing
in affected drivers.  While at it, take advantage of the fact that
rq->resid_len is set to full count where applicable.

* ide-cd: rq->resid_len cleared on pc success

* mptsas: req->resid_len cleared on success

* sas_expander: rsp/req->resid_len cleared on success

* mpt2sas_transport: req->resid_len cleared on success

* ide-cd, ide-tape, mptsas, sas_host_smp, mpt2sas_transport, ub: take
  advantage of initial full count to simplify code

Boaz Harrosh spotted bug in resid_len initialization.  Fixed as
suggested.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Borislav Petkov <petkovbb@googlemail.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Cc: Pete Zaitcev <zaitcev@redhat.com>
Cc: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Cc: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Cc: Eric Moore <Eric.Moore@lsi.com>
Cc: Darrick J. Wong <djwong@us.ibm.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c                         | 5 +++--
 drivers/block/ub.c                       | 6 ++++--
 drivers/ide/ide-cd.c                     | 4 ++--
 drivers/ide/ide-tape.c                   | 2 +-
 drivers/message/fusion/mptsas.c          | 3 ++-
 drivers/scsi/libsas/sas_expander.c       | 4 ++++
 drivers/scsi/libsas/sas_host_smp.c       | 3 ---
 drivers/scsi/mpt2sas/mpt2sas_transport.c | 4 ++--
 8 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index a2d97de1a12c3..e3f7e6a3a0957 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1783,9 +1783,10 @@ void blk_start_request(struct request *req)
 	blk_dequeue_request(req);
 
 	/*
-	 * We are now handing the request to the hardware, add the
-	 * timeout handler.
+	 * We are now handing the request to the hardware, initialize
+	 * resid_len to full count and add the timeout handler.
 	 */
+	req->resid_len = blk_rq_bytes(req);
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index f32781cff4566..e67bbae9547d1 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -781,8 +781,10 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd)
 
 	if (cmd->error == 0) {
 		if (blk_pc_request(rq)) {
-			if (cmd->act_len < blk_rq_bytes(rq))
-				rq->resid_len = blk_rq_bytes(rq) - cmd->act_len;
+			if (cmd->act_len >= rq->resid_len)
+				rq->resid_len = 0;
+			else
+				rq->resid_len -= cmd->act_len;
 			scsi_status = 0;
 		} else {
 			if (cmd->act_len != cmd->len) {
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 4c7792fd5f934..081aed6781cc1 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -699,6 +699,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 out_end:
 	if (blk_pc_request(rq) && rc == 0) {
+		rq->resid_len = 0;
 		blk_end_request_all(rq, 0);
 		hwif->rq = NULL;
 	} else {
@@ -718,8 +719,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive)
 
 		/* make sure it's fully ended */
 		if (blk_fs_request(rq) == 0) {
-			rq->resid_len = blk_rq_bytes(rq) -
-				(cmd->nbytes - cmd->nleft);
+			rq->resid_len -= cmd->nbytes - cmd->nleft;
 			if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE))
 				rq->resid_len += cmd->last_xfer_len;
 		}
diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c
index e16604562f281..683ff37d40794 100644
--- a/drivers/ide/ide-tape.c
+++ b/drivers/ide/ide-tape.c
@@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
 		}
 
 		tape->first_frame += blocks;
-		rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size;
+		rq->resid_len -= blocks * tape->blk_size;
 
 		if (pc->error) {
 			uptodate = 0;
diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c
index 4e6fcf06a6f2d..79f5433359f9b 100644
--- a/drivers/message/fusion/mptsas.c
+++ b/drivers/message/fusion/mptsas.c
@@ -1357,7 +1357,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 		smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply;
 		memcpy(req->sense, smprep, sizeof(*smprep));
 		req->sense_len = sizeof(*smprep);
-		rsp->resid_len = blk_rq_bytes(rsp) - smprep->ResponseDataLength;
+		req->resid_len = 0;
+		rsp->resid_len -= smprep->ResponseDataLength;
 	} else {
 		printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n",
 		    ioc->name, __func__);
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 531af9ed71991..54fa1e42dc4d2 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -1937,7 +1937,11 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 	if (ret > 0) {
 		/* positive number is the untransferred residual */
 		rsp->resid_len = ret;
+		req->resid_len = 0;
 		ret = 0;
+	} else if (ret == 0) {
+		rsp->resid_len = 0;
+		req->resid_len = 0;
 	}
 
 	return ret;
diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c
index be9a951b977d8..1bc3b75679947 100644
--- a/drivers/scsi/libsas/sas_host_smp.c
+++ b/drivers/scsi/libsas/sas_host_smp.c
@@ -176,9 +176,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req,
 	resp_data[1] = req_data[1];
 	resp_data[2] = SMP_RESP_FUNC_UNK;
 
-	req->resid_len = blk_rq_bytes(req);
-	rsp->resid_len = blk_rq_bytes(rsp);
-
 	switch (req_data[1]) {
 	case SMP_REPORT_GENERAL:
 		req->resid_len -= 8;
diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c
index af95a449930ea..5c65da519e39a 100644
--- a/drivers/scsi/mpt2sas/mpt2sas_transport.c
+++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c
@@ -1170,8 +1170,8 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
 		memcpy(req->sense, mpi_reply, sizeof(*mpi_reply));
 		req->sense_len = sizeof(*mpi_reply);
-		rsp->resid_len = blk_rq_bytes(rsp) -
-				 mpi_reply->ResponseDataLength;
+		req->resid_len = 0;
+		rsp->resid_len -= mpi_reply->ResponseDataLength;
 	} else {
 		dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT
 		    "%s - no reply\n", ioc->name, __func__));
-- 
GitLab


From 4fc981ef9e7c0953d5c4896ce088b19c50cb018f Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 19 May 2009 18:33:06 +0900
Subject: [PATCH 1203/2198] bio: always copy back data for copied kernel
 requests

When a read bio_copy_kern() request fails, the content of the bounce
buffer is not copied back.  However, as request failure doesn't
necessarily mean complete failure, the buffer state can be useful.
This behavior is also inconsistent with the user map counterpart and
causes the subtle difference between bounced and unbounced IO causes
confusion.

This patch makes bio_copy_kern_endio() ignore @err and always copy
back data on request completion.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Cc: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/bio.c b/fs/bio.c
index 7bbc98f0eda15..ee3bc67833d2b 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1198,7 +1198,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
 		char *addr = page_address(bvec->bv_page);
 		int len = bmd->iovecs[i].bv_len;
 
-		if (read && !err)
+		if (read)
 			memcpy(p, addr, len);
 
 		__free_page(bvec->bv_page);
-- 
GitLab


From b2858d7d1639c04ca3c54988d76c5f7300b76f1c Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Tue, 19 May 2009 11:37:46 +0200
Subject: [PATCH 1204/2198] splice: fix kmaps in default_file_splice_write()

Unfortunately multiple kmap() within a single thread are deadlockable,
so writing out multiple buffers with writev() isn't possible.

Change the implementation so that it does a separate write() for each
buffer.  This actually simplifies the code a lot since the
splice_from_pipe() helper can be used.

This limitation is caused by HIGHMEM pages, and so only affects a
subset of architectures and configurations.  In the future it may be
worth to implement default_file_splice_write() in a more efficient way
on configs that allow it.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/splice.c | 130 +++++++++-------------------------------------------
 1 file changed, 22 insertions(+), 108 deletions(-)

diff --git a/fs/splice.c b/fs/splice.c
index 41179c0a655bb..73766d24f97b9 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -535,8 +535,8 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec,
 	return res;
 }
 
-static ssize_t kernel_writev(struct file *file, const struct iovec *vec,
-			    unsigned long vlen, loff_t *ppos)
+static ssize_t kernel_write(struct file *file, const char *buf, size_t count,
+			    loff_t pos)
 {
 	mm_segment_t old_fs;
 	ssize_t res;
@@ -544,7 +544,7 @@ static ssize_t kernel_writev(struct file *file, const struct iovec *vec,
 	old_fs = get_fs();
 	set_fs(get_ds());
 	/* The cast to a user pointer is valid due to the set_fs() */
-	res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos);
+	res = vfs_write(file, (const char __user *)buf, count, &pos);
 	set_fs(old_fs);
 
 	return res;
@@ -1003,120 +1003,34 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
-static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n)
+static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+			  struct splice_desc *sd)
 {
-	return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS];
+	int ret;
+	void *data;
+
+	ret = buf->ops->confirm(pipe, buf);
+	if (ret)
+		return ret;
+
+	data = buf->ops->map(pipe, buf, 0);
+	ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos);
+	buf->ops->unmap(pipe, buf, data);
+
+	return ret;
 }
 
 static ssize_t default_file_splice_write(struct pipe_inode_info *pipe,
 					 struct file *out, loff_t *ppos,
 					 size_t len, unsigned int flags)
 {
-	ssize_t ret = 0;
-	ssize_t total_len = 0;
-	int do_wakeup = 0;
-
-	pipe_lock(pipe);
-	while (len) {
-		struct pipe_buffer *buf;
-		void *data[PIPE_BUFFERS];
-		struct iovec vec[PIPE_BUFFERS];
-		unsigned int nr_pages = 0;
-		unsigned int write_len = 0;
-		unsigned int now_len = len;
-		unsigned int this_len;
-		int i;
-
-		BUG_ON(pipe->nrbufs > PIPE_BUFFERS);
-		for (i = 0; i < pipe->nrbufs && now_len; i++) {
-			buf = nth_pipe_buf(pipe, i);
-
-			ret = buf->ops->confirm(pipe, buf);
-			if (ret)
-				break;
-
-			data[i] = buf->ops->map(pipe, buf, 0);
-			this_len = min(buf->len, now_len);
-			vec[i].iov_base = (void __user *) data[i] + buf->offset;
-			vec[i].iov_len = this_len;
-			now_len -= this_len;
-			write_len += this_len;
-			nr_pages++;
-		}
-
-		if (nr_pages) {
-			ret = kernel_writev(out, vec, nr_pages, ppos);
-			if (ret == 0)
-				ret = -EIO;
-			if (ret > 0) {
-				len -= ret;
-				total_len += ret;
-			}
-		}
-
-		for (i = 0; i < nr_pages; i++) {
-			buf = nth_pipe_buf(pipe, i);
-			buf->ops->unmap(pipe, buf, data[i]);
-
-			if (ret > 0) {
-				this_len = min_t(unsigned, vec[i].iov_len, ret);
-				buf->offset += this_len;
-				buf->len -= this_len;
-				ret -= this_len;
-			}
-		}
-
-		if (ret < 0)
-			break;
-
-		while (pipe->nrbufs) {
-			const struct pipe_buf_operations *ops;
-
-			buf = nth_pipe_buf(pipe, 0);
-			if (buf->len)
-				break;
-
-			ops = buf->ops;
-			buf->ops = NULL;
-			ops->release(pipe, buf);
-			pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS;
-			pipe->nrbufs--;
-			if (pipe->inode)
-				do_wakeup = 1;
-		}
-
-		if (pipe->nrbufs)
-			continue;
-		if (!pipe->writers)
-			break;
-		if (!pipe->waiting_writers) {
-			if (total_len)
-				break;
-		}
-
-		if (flags & SPLICE_F_NONBLOCK) {
-			ret = -EAGAIN;
-			break;
-		}
-
-		if (signal_pending(current)) {
-			ret = -ERESTARTSYS;
-			break;
-		}
-
-		if (do_wakeup) {
-			wakeup_pipe_writers(pipe);
-			do_wakeup = 0;
-		}
-
-		pipe_wait(pipe);
-	}
-	pipe_unlock(pipe);
+	ssize_t ret;
 
-	if (do_wakeup)
-		wakeup_pipe_writers(pipe);
+	ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf);
+	if (ret > 0)
+		*ppos += ret;
 
-	return total_len ? total_len : ret;
+	return ret;
 }
 
 /**
-- 
GitLab


From 3a5a39276d2a32b05b1ee25b384516805b17cf87 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sun, 17 May 2009 18:55:18 +0300
Subject: [PATCH 1205/2198] block: allow blk_rq_map_kern to append to requests

Use blk_rq_append_bio() internally instead of blk_rq_bio_prep()
so blk_rq_map_kern can be called multiple times, to map multiple
buffers.

This is in the effort to un-export blk_rq_append_bio()

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index 56082bea45041..caa05a6677431 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -282,7 +282,8 @@ EXPORT_SYMBOL(blk_rq_unmap_user);
  *
  * Description:
  *    Data will be mapped directly if possible. Otherwise a bounce
- *    buffer is used.
+ *    buffer is used. Can be called multple times to append multple
+ *    buffers.
  */
 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 		    unsigned int len, gfp_t gfp_mask)
@@ -290,6 +291,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	int reading = rq_data_dir(rq) == READ;
 	int do_copy = 0;
 	struct bio *bio;
+	int ret;
 
 	if (len > (q->max_hw_sectors << 9))
 		return -EINVAL;
@@ -311,7 +313,13 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	if (do_copy)
 		rq->cmd_flags |= REQ_COPY_USER;
 
-	blk_rq_bio_prep(q, rq, bio);
+	ret = blk_rq_append_bio(q, rq, bio);
+	if (unlikely(ret)) {
+		/* request is too big */
+		bio_put(bio);
+		return ret;
+	}
+
 	blk_queue_bounce(q, &rq->bio);
 	rq->buffer = NULL;
 	return 0;
-- 
GitLab


From bc38bf106c967389a465d926be22c7371abba69d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 18:56:17 +0300
Subject: [PATCH 1206/2198] libosd: Use new blk_rq_map_kern

Now that blk_rq_map_kern will append the buffer onto the
request we can use it easily for adding extra segments
(eg. attributes)

This patch is dependent on a block layer patch titled:
   [BLOCK] allow blk_rq_map_kern to append to requests

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/osd/osd_initiator.c | 24 ++----------------------
 1 file changed, 2 insertions(+), 22 deletions(-)

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index d178a8b799ec0..bf66e301866f1 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -826,26 +826,6 @@ int osd_req_add_set_attr_list(struct osd_request *or,
 }
 EXPORT_SYMBOL(osd_req_add_set_attr_list);
 
-static int _append_map_kern(struct request *req,
-	void *buff, unsigned len, gfp_t flags)
-{
-	struct bio *bio;
-	int ret;
-
-	bio = bio_map_kern(req->q, buff, len, flags);
-	if (IS_ERR(bio)) {
-		OSD_ERR("Failed bio_map_kern(%p, %d) => %ld\n", buff, len,
-			PTR_ERR(bio));
-		return PTR_ERR(bio);
-	}
-	ret = blk_rq_append_bio(req->q, req, bio);
-	if (ret) {
-		OSD_ERR("Failed blk_rq_append_bio(%p) => %d\n", bio, ret);
-		bio_put(bio);
-	}
-	return ret;
-}
-
 static int _req_append_segment(struct osd_request *or,
 	unsigned padding, struct _osd_req_data_segment *seg,
 	struct _osd_req_data_segment *last_seg, struct _osd_io_info *io)
@@ -861,14 +841,14 @@ static int _req_append_segment(struct osd_request *or,
 		else
 			pad_buff = io->pad_buff;
 
-		ret = _append_map_kern(io->req, pad_buff, padding,
+		ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding,
 				       or->alloc_flags);
 		if (ret)
 			return ret;
 		io->total_bytes += padding;
 	}
 
-	ret = _append_map_kern(io->req, seg->buff, seg->total_bytes,
+	ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes,
 			       or->alloc_flags);
 	if (ret)
 		return ret;
-- 
GitLab


From 79eb63e9e5875b84341a3a05f8e6ae9cdb4bb6f6 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 18:57:15 +0300
Subject: [PATCH 1207/2198] block: Add blk_make_request(), takes bio, returns a
 request

New block API:
given a struct bio allocates a new request. This is the parallel of
generic_make_request for BLOCK_PC commands users.

The passed bio may be a chained-bio. The bio is bounced if needed
inside the call to this member.

This is in the effort of un-exporting blk_rq_append_bio().

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
CC: Jeff Garzik <jeff@garzik.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 45 ++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |  2 ++
 2 files changed, 47 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index e3f7e6a3a0957..bec1d69952d07 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -890,6 +890,51 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
 }
 EXPORT_SYMBOL(blk_get_request);
 
+/**
+ * blk_make_request - given a bio, allocate a corresponding struct request.
+ *
+ * @bio:  The bio describing the memory mappings that will be submitted for IO.
+ *        It may be a chained-bio properly constructed by block/bio layer.
+ *
+ * blk_make_request is the parallel of generic_make_request for BLOCK_PC
+ * type commands. Where the struct request needs to be farther initialized by
+ * the caller. It is passed a &struct bio, which describes the memory info of
+ * the I/O transfer.
+ *
+ * The caller of blk_make_request must make sure that bi_io_vec
+ * are set to describe the memory buffers. That bio_data_dir() will return
+ * the needed direction of the request. (And all bio's in the passed bio-chain
+ * are properly set accordingly)
+ *
+ * If called under none-sleepable conditions, mapped bio buffers must not
+ * need bouncing, by calling the appropriate masked or flagged allocator,
+ * suitable for the target device. Otherwise the call to blk_queue_bounce will
+ * BUG.
+ */
+struct request *blk_make_request(struct request_queue *q, struct bio *bio,
+				 gfp_t gfp_mask)
+{
+	struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
+
+	if (unlikely(!rq))
+		return ERR_PTR(-ENOMEM);
+
+	for_each_bio(bio) {
+		struct bio *bounce_bio = bio;
+		int ret;
+
+		blk_queue_bounce(q, &bounce_bio);
+		ret = blk_rq_append_bio(q, rq, bounce_bio);
+		if (unlikely(ret)) {
+			blk_put_request(rq);
+			return ERR_PTR(ret);
+		}
+	}
+
+	return rq;
+}
+EXPORT_SYMBOL(blk_make_request);
+
 /**
  * blk_requeue_request - put a request back on queue
  * @q:		request queue where request should be inserted
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index f9d60a78c08a4..88a83e112c954 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -740,6 +740,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
+extern struct request *blk_make_request(struct request_queue *, struct bio *,
+					gfp_t);
 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
-- 
GitLab


From c29b70f6ee4f2fa3ef07f55bc9082945861e5391 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 18:58:41 +0300
Subject: [PATCH 1208/2198] libosd: Use of new blk_make_request

Use new blk_make_request() to allocate a request from bio
and avoid using deprecated blk_rq_append_bio().

This patch is dependent on a block layer patch titled:
    [BLOCK] New blk_make_request() takes bio returns request

This is the last usage of blk_rq_append_bio in osd, it can now
be un-exported.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
CC: Jeff Garzik <jeff@garzik.org>
CC: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/osd/osd_initiator.c | 48 +++++++++++++++-----------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index bf66e301866f1..865ec0f4aa804 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -1200,6 +1200,21 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or,
 /*
  * osd_finalize_request and helpers
  */
+static struct request *_make_request(struct request_queue *q, bool has_write,
+			      struct _osd_io_info *oii, gfp_t flags)
+{
+	if (oii->bio)
+		return blk_make_request(q, oii->bio, flags);
+	else {
+		struct request *req;
+
+		req = blk_get_request(q, has_write ? WRITE : READ, flags);
+		if (unlikely(!req))
+			return ERR_PTR(-ENOMEM);
+
+		return req;
+	}
+}
 
 static int _init_blk_request(struct osd_request *or,
 	bool has_in, bool has_out)
@@ -1208,11 +1223,13 @@ static int _init_blk_request(struct osd_request *or,
 	struct scsi_device *scsi_device = or->osd_dev->scsi_device;
 	struct request_queue *q = scsi_device->request_queue;
 	struct request *req;
-	int ret = -ENOMEM;
+	int ret;
 
-	req = blk_get_request(q, has_out, flags);
-	if (!req)
+	req = _make_request(q, has_out, has_out ? &or->out : &or->in, flags);
+	if (IS_ERR(req)) {
+		ret = PTR_ERR(req);
 		goto out;
+	}
 
 	or->request = req;
 	req->cmd_type = REQ_TYPE_BLOCK_PC;
@@ -1225,9 +1242,10 @@ static int _init_blk_request(struct osd_request *or,
 		or->out.req = req;
 		if (has_in) {
 			/* allocate bidi request */
-			req = blk_get_request(q, READ, flags);
-			if (!req) {
+			req = _make_request(q, false, &or->in, flags);
+			if (IS_ERR(req)) {
 				OSD_DEBUG("blk_get_request for bidi failed\n");
+				ret = PTR_ERR(req);
 				goto out;
 			}
 			req->cmd_type = REQ_TYPE_BLOCK_PC;
@@ -1271,26 +1289,6 @@ int osd_finalize_request(struct osd_request *or,
 		return ret;
 	}
 
-	if (or->out.bio) {
-		ret = blk_rq_append_bio(or->request->q, or->out.req,
-					or->out.bio);
-		if (ret) {
-			OSD_DEBUG("blk_rq_append_bio out failed\n");
-			return ret;
-		}
-		OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->out.total_bytes), blk_rq_bytes(or->out.req));
-	}
-	if (or->in.bio) {
-		ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio);
-		if (ret) {
-			OSD_DEBUG("blk_rq_append_bio in failed\n");
-			return ret;
-		}
-		OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n",
-			_LLU(or->in.total_bytes), blk_rq_bytes(or->in.req));
-	}
-
 	or->out.pad_buff = sg_out_pad_buffer;
 	or->in.pad_buff = sg_in_pad_buffer;
 
-- 
GitLab


From a411f4bbb89f1f08687b344064d6775bce1e4658 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 17 May 2009 19:00:01 +0300
Subject: [PATCH 1209/2198] block: Un-export blk_rq_append_bio

OSD was the last in-tree user of blk_rq_append_bio(). Now
that it is fixed blk_rq_append_bio is un-exported and
is only used internally by block layer.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-map.c        | 1 -
 block/blk.h            | 2 ++
 include/linux/blkdev.h | 6 ------
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index caa05a6677431..ef2492adca7e3 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -24,7 +24,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq,
 	}
 	return 0;
 }
-EXPORT_SYMBOL(blk_rq_append_bio);
 
 static int __blk_rq_unmap_user(struct bio *bio)
 {
diff --git a/block/blk.h b/block/blk.h
index 9e0042ca94959..c863ec2281e05 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -13,6 +13,8 @@ extern struct kobj_type blk_queue_ktype;
 void init_request_from_bio(struct request *req, struct bio *bio);
 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
 			struct bio *bio);
+int blk_rq_append_bio(struct request_queue *q, struct request *rq,
+		      struct bio *bio);
 void blk_dequeue_request(struct request *rq);
 void __blk_queue_free_tags(struct request_queue *q);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 88a83e112c954..564445be7a6d9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -757,12 +757,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 
-/*
- * Temporary export, until SCSI gets fixed up.
- */
-extern int blk_rq_append_bio(struct request_queue *q, struct request *rq,
-			     struct bio *bio);
-
 /*
  * A queue has just exitted congestion.  Note this in the global counter of
  * congested queues, and wake up anyone who was waiting for requests to be
-- 
GitLab


From 4aee2ad461889132bfb5a1518a9580d00e17008c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Tue, 19 May 2009 17:07:01 +0530
Subject: [PATCH 1210/2198] x86: asm/processor.h: remove double declaration

Remove double declaration of:

 extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
 extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
 extern unsigned short num_cache_leaves;

they are already defined in the same file.

[ Impact: cleanup ]

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1242733021.3377.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/processor.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 34c52370f2fe8..85628ea9d9b5e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -403,9 +403,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
 extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
 
 struct thread_struct {
 	/* Cached TLS descriptors: */
-- 
GitLab


From 4c6f18fc81565967da20f2d4a3922cdba33f8e2b Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 18 May 2009 10:23:28 -0700
Subject: [PATCH 1211/2198] x86, io-apic: Don't mark pin_programmed early

Peter bisected that:

| commit b9c61b70075c87a8612624736faf4a2de5b1ed30
| Date:   Wed May 6 10:10:06 2009 -0700
|
|     x86/pci: update pirq_enable_irq() to setup io apic routing
|
|     So we can set io apic routing only when enabling the device irq.

wrecked his opteron box, ata1 interrupts fail to get through.

ata1 is using irq 11:

[    1.451839] sata_svw 0000:01:0e.0: version 2.3
[    1.456333] sata_svw 0000:01:0e.0: PCI INT A -> GSI 11 (level, low) -> IRQ 11
[    1.463639] scsi0 : sata_svw
[    1.466949] scsi1 : sata_svw
[    1.470022] scsi2 : sata_svw
[    1.473090] scsi3 : sata_svw
[    1.476112] ata1: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe000 irq 11
[    1.483490] ata2: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe100 irq 11
[    1.490870] ata3: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe200 irq 11
[    1.498247] ata4: SATA max UDMA/133 mmio m8192@0xff3fe000 port 0xff3fe300 irq 11

that pin is overlapped with pin with legacy ones.

We should not set bits in pin_programmed here, so that those bit could
be set later via io_apic_set_pci_routing().

[ Impact: fix boot hang on certain systems ]

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Yinghai Lu <yinghai.lu@kernel.org>
Tested-by: Peter Zijlstra <peterz@infradead.org>
Cc: Jack Steiner <steiner@sgi.com>
LKML-Reference: <4A119990.9020606@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/io_apic.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ce1ac74baa738..ac7f3b6ad583e 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -1537,7 +1537,10 @@ static void __init setup_IO_APIC_irqs(void)
 		}
 		cfg = desc->chip_data;
 		add_pin_to_irq_node(cfg, node, apic_id, pin);
-		set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
+		/*
+		 * don't mark it in pin_programmed, so later acpi could
+		 * set it correctly when irq < 16
+		 */
 		setup_IO_APIC_irq(apic_id, pin, irq, desc,
 				irq_trigger(idx), irq_polarity(idx));
 	}
-- 
GitLab


From ef9e8b14a5c1d0afbaf12b4c3b271188ddfc52a4 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 19 May 2009 14:25:16 +0100
Subject: [PATCH 1212/2198] GFS2: Don't warn when delete inode fails on ro
 filesystem

If the filesystem is read-only, then we expect that delete inode
will fail, so there is no need to warn about it.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index a3c2272e7cade..2fd1dcbcc5b74 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -714,7 +714,7 @@ static void gfs2_delete_inode(struct inode *inode)
 		gfs2_glock_dq(&ip->i_iopen_gh);
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
-	if (error && error != GLR_TRYFAILED)
+	if (error && error != GLR_TRYFAILED && error != -EROFS)
 		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
 out:
 	truncate_inode_pages(&inode->i_data, 0);
-- 
GitLab


From c5642f4bbae30122beb696e723f6da273caa570e Mon Sep 17 00:00:00 2001
From: Stephen Smalley <sds@tycho.nsa.gov>
Date: Tue, 19 May 2009 09:02:23 -0400
Subject: [PATCH 1213/2198] selinux: remove obsolete read buffer limit from
 sel_read_bool

On Tue, 2009-05-19 at 00:05 -0400, Eamon Walsh wrote:
> Recent versions of coreutils have bumped the read buffer size from 4K to
> 32K in several of the utilities.
>
> This means that "cat /selinux/booleans/xserver_object_manager" no longer
> works, it returns "Invalid argument" on F11.  getsebool works fine.
>
> sel_read_bool has a check for "count > PAGE_SIZE" that doesn't seem to
> be present in the other read functions.  Maybe it could be removed?

Yes, that check is obsoleted by the conversion of those functions to
using simple_read_from_buffer(), which will reduce count if necessary to
what is available in the buffer.

Signed-off-by:  Stephen Smalley <sds@tycho.nsa.gov>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/selinux/selinuxfs.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c
index 8d4007fbe0e96..b4fc506e7a87c 100644
--- a/security/selinux/selinuxfs.c
+++ b/security/selinux/selinuxfs.c
@@ -803,10 +803,6 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
 		goto out;
 	}
 
-	if (count > PAGE_SIZE) {
-		ret = -EINVAL;
-		goto out;
-	}
 	page = (char *)get_zeroed_page(GFP_KERNEL);
 	if (!page) {
 		ret = -ENOMEM;
-- 
GitLab


From 53674ac5a997a8eedbb2dfdc308b895170746c8b Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 19 May 2009 19:52:35 +0200
Subject: [PATCH 1214/2198] block: add warning to blk_make_request()

Add a note about how one needs to be careful when setting up these bio
chains.

Extracted from Boaz's updated patch.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index bec1d69952d07..49065075d4622 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -910,6 +910,15 @@ EXPORT_SYMBOL(blk_get_request);
  * need bouncing, by calling the appropriate masked or flagged allocator,
  * suitable for the target device. Otherwise the call to blk_queue_bounce will
  * BUG.
+ *
+ * WARNING: When allocating/cloning a bio-chain, careful consideration should be
+ * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
+ * anything but the first bio in the chain. Otherwise you risk waiting for IO
+ * completion of a bio that hasn't been submitted yet, thus resulting in a
+ * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
+ * of bio_alloc(), as that avoids the mempool deadlock.
+ * If possible a big IO should be split into smaller parts when allocation
+ * fails. Partial allocation should not be an error, or you risk a live-lock.
  */
 struct request *blk_make_request(struct request_queue *q, struct bio *bio,
 				 gfp_t gfp_mask)
-- 
GitLab


From ac36552a52a6ec8563ac0a109e2a0935673f4abb Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 19 May 2009 19:54:09 +0200
Subject: [PATCH 1215/2198] scsi_lib: remove unused variable

The last request completion cleanup in scsi_lib left an unused
this_count variable in scsi_io_completion().
(It was used before in a code segment that now uses blk_end_request_all())

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/scsi/scsi_lib.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index e410d667910de..d7c6c752e0a65 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -706,7 +706,6 @@ EXPORT_SYMBOL(scsi_release_buffers);
 void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 {
 	int result = cmd->result;
-	int this_count;
 	struct request_queue *q = cmd->device->request_queue;
 	struct request *req = cmd->request;
 	int error = 0;
@@ -789,7 +788,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes)
 	 */
 	if (scsi_end_request(cmd, error, good_bytes, result == 0) == NULL)
 		return;
-	this_count = blk_rq_bytes(req);
 
 	error = -EIO;
 
-- 
GitLab


From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:08:41 +0200
Subject: [PATCH 1216/2198] perf_counter: fix counter freeing logic

Fix counter lifetime bugs which explain the crashes reported by
Marcelo Tosatti and Arnaldo Carvalho de Melo.

The new rule is: flushing + freeing is only done for a task's
own counters, never for other tasks.

[ Impact: fix crashes/lockups with inherited counters ]

Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/exit.c         | 19 +++++++------------
 kernel/perf_counter.c |  2 ++
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/kernel/exit.c b/kernel/exit.c
index 73affd35e76d1..f9dfedd94af0c 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -178,12 +178,6 @@ void release_task(struct task_struct * p)
 
 	proc_flush_task(p);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	write_lock_irq(&tasklist_lock);
 	tracehook_finish_release_task(p);
 	__exit_signal(p);
@@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code)
 		module_put(tsk->binfmt->module);
 
 	proc_exit_connector(tsk);
+
+	/*
+	 * Flush inherited counters to the parent - before the parent
+	 * gets woken up by child-exit notifications.
+	 */
+	perf_counter_exit_task(tsk);
+
 	exit_notify(tsk, group_dead);
 #ifdef CONFIG_NUMA
 	mpol_put(tsk->mempolicy);
@@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	 */
 	read_unlock(&tasklist_lock);
 
-	/*
-	 * Flush inherited counters to the parent - before the parent
-	 * gets woken up by child-exit notifications.
-	 */
-	perf_counter_exit_task(p);
-
 	retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
 	status = (p->signal->flags & SIGNAL_GROUP_EXIT)
 		? p->signal->group_exit_code : p->exit_code;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 59a926d04bafd..7af16d1c480fb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter *child_counter, *tmp;
 	struct perf_counter_context *child_ctx;
 
+	WARN_ON_ONCE(child != current);
+
 	child_ctx = &child->perf_counter_ctx;
 
 	if (likely(!child_ctx->nr_counters))
-- 
GitLab


From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:24:08 +0200
Subject: [PATCH 1217/2198] perf_counter: fix counter inheritance race

Context rotation should not occur when we are in the middle of
walking the counter list when inheriting counters ...

[ Impact: fix occasionally incorrect perf stat results ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  1 +
 kernel/perf_counter.c        | 10 +++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c8c1dfc22c938..13cb2fbbf3342 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -508,6 +508,7 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
+	int			rr_allowed;
 	struct task_struct	*task;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7af16d1c480fb..4d8f97375f3af 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	rotate_ctx(ctx);
+	if (ctx->rr_allowed)
+		rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
@@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	INIT_LIST_HEAD(&ctx->event_list);
+	ctx->rr_allowed = 1;
 	ctx->task = task;
 }
 
@@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child)
 	 */
 	mutex_lock(&parent_ctx->mutex);
 
+	parent_ctx->rr_allowed = 0;
+	barrier(); /* irqs */
+
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
@@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child)
 			break;
 	}
 
+	barrier(); /* irqs */
+	parent_ctx->rr_allowed = 1;
+
 	mutex_unlock(&parent_ctx->mutex);
 }
 
-- 
GitLab


From 62669e61a5f559826b1d2e863649a6005eee629b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 20 May 2009 11:27:13 +0900
Subject: [PATCH 1218/2198] sh: mach-hp6xx: Fix up the hp6xx build for hd64461
 changes.

Fixes several compile errors due to the recent hd64461 I/O base changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/cchips/hd6446x/hd64461.c | 2 +-
 arch/sh/include/asm/hd64461.h    | 2 +-
 drivers/video/hitfb.c            | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/sh/cchips/hd6446x/hd64461.c b/arch/sh/cchips/hd6446x/hd64461.c
index 25ef910615217..50aa0c1f76ea7 100644
--- a/arch/sh/cchips/hd6446x/hd64461.c
+++ b/arch/sh/cchips/hd6446x/hd64461.c
@@ -80,7 +80,7 @@ int __init setup_hd64461(void)
 
 	printk(KERN_INFO
 	       "HD64461 configured at 0x%x on irq %d(mapped into %d to %d)\n",
-	       CONFIG_HD64461_IOBASE, CONFIG_HD64461_IRQ, HD64461_IRQBASE,
+	       HD64461_IOBASE, CONFIG_HD64461_IRQ, HD64461_IRQBASE,
 	       HD64461_IRQBASE + 15);
 
 /* Should be at processor specific part.. */
diff --git a/arch/sh/include/asm/hd64461.h b/arch/sh/include/asm/hd64461.h
index c16d54563c932..977355f0a4836 100644
--- a/arch/sh/include/asm/hd64461.h
+++ b/arch/sh/include/asm/hd64461.h
@@ -26,7 +26,7 @@
 #define	HD64461_PCC1_COMM	(HD64461_PCC1_BASE+HD64461_PCC_WINDOW)		/* 0xb5000000 */
 
 /* Standby Control Register for HD64461 */
-#define	HD64461_STBCR			CONFIG_HD64461_IOBASE
+#define	HD64461_STBCR			HD64461_IO_OFFSET(0x00000000)
 #define	HD64461_STBCR_CKIO_STBY		0x2000
 #define	HD64461_STBCR_SAFECKE_IST	0x1000
 #define	HD64461_STBCR_SLCKE_IST		0x0800
diff --git a/drivers/video/hitfb.c b/drivers/video/hitfb.c
index e6467cf9f19f1..020db7fc9153a 100644
--- a/drivers/video/hitfb.c
+++ b/drivers/video/hitfb.c
@@ -335,9 +335,9 @@ static int __init hitfb_probe(struct platform_device *dev)
 	if (fb_get_options("hitfb", NULL))
 		return -ENODEV;
 
-	hitfb_fix.mmio_start = CONFIG_HD64461_IOBASE+0x1000;
+	hitfb_fix.mmio_start = HD64461_IO_OFFSET(0x1000);
 	hitfb_fix.mmio_len = 0x1000;
-	hitfb_fix.smem_start = CONFIG_HD64461_IOBASE + 0x02000000;
+	hitfb_fix.smem_start = HD64461_IO_OFFSET(0x02000000);
 	hitfb_fix.smem_len = 512 * 1024;
 
 	lcdclor = fb_readw(HD64461_LCDCLOR);
-- 
GitLab


From 0a7ae2ff0d29bb3b327edff4c8ab67b3834fa811 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 20 May 2009 08:54:31 +0200
Subject: [PATCH 1219/2198] block: change the tag sync vs async restriction
 logic

Make them fully share the tag space, but disallow async requests using
the last any two slots.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c    |  2 +-
 block/blk-core.c       |  2 +-
 block/blk-tag.c        | 15 +++++++++------
 block/elevator.c       |  8 ++++----
 include/linux/blkdev.h |  7 ++++++-
 5 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 0ab81a0a75023..0d98054cdbd77 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -218,7 +218,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp)
 	} else
 		skip |= QUEUE_ORDSEQ_PREFLUSH;
 
-	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight)
+	if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q))
 		rq = NULL;
 	else
 		skip |= QUEUE_ORDSEQ_DRAIN;
diff --git a/block/blk-core.c b/block/blk-core.c
index 49065075d4622..1c74840388291 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1815,7 +1815,7 @@ void blk_dequeue_request(struct request *rq)
 	 * the driver side.
 	 */
 	if (blk_account_rq(rq))
-		q->in_flight++;
+		q->in_flight[rq_is_sync(rq)]++;
 }
 
 /**
diff --git a/block/blk-tag.c b/block/blk-tag.c
index c260f7c30ddae..2e5cfeb59333e 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(blk_queue_end_tag);
 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 {
 	struct blk_queue_tag *bqt = q->queue_tags;
-	unsigned max_depth, offset;
+	unsigned max_depth;
 	int tag;
 
 	if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
@@ -355,13 +355,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq)
 	 * to starve sync IO on behalf of flooding async IO.
 	 */
 	max_depth = bqt->max_depth;
-	if (rq_is_sync(rq))
-		offset = 0;
-	else
-		offset = max_depth >> 2;
+	if (!rq_is_sync(rq) && max_depth > 1) {
+		max_depth -= 2;
+		if (!max_depth)
+			max_depth = 1;
+		if (q->in_flight[0] > max_depth)
+			return 1;
+	}
 
 	do {
-		tag = find_next_zero_bit(bqt->tag_map, max_depth, offset);
+		tag = find_first_zero_bit(bqt->tag_map, max_depth);
 		if (tag >= max_depth)
 			return 1;
 
diff --git a/block/elevator.c b/block/elevator.c
index 918920056e42b..ebee948293ebc 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -546,7 +546,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 	 * in_flight count again
 	 */
 	if (blk_account_rq(rq)) {
-		q->in_flight--;
+		q->in_flight[rq_is_sync(rq)]--;
 		if (blk_sorted_rq(rq))
 			elv_deactivate_rq(q, rq);
 	}
@@ -685,7 +685,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
 
 	if (unplug_it && blk_queue_plugged(q)) {
 		int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC]
-			- q->in_flight;
+				- queue_in_flight(q);
 
 		if (nrq >= q->unplug_thresh)
 			__generic_unplug_device(q);
@@ -823,7 +823,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 	 * request is released from the driver, io must be done
 	 */
 	if (blk_account_rq(rq)) {
-		q->in_flight--;
+		q->in_flight[rq_is_sync(rq)]--;
 		if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
 			e->ops->elevator_completed_req_fn(q, rq);
 	}
@@ -838,7 +838,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq)
 		if (!list_empty(&q->queue_head))
 			next = list_entry_rq(q->queue_head.next);
 
-		if (!q->in_flight &&
+		if (!queue_in_flight(q) &&
 		    blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
 		    (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) {
 			blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 564445be7a6d9..a967dd775dbd1 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -404,7 +404,7 @@ struct request_queue
 	struct list_head	tag_busy_list;
 
 	unsigned int		nr_sorted;
-	unsigned int		in_flight;
+	unsigned int		in_flight[2];
 
 	unsigned int		rq_timeout;
 	struct timer_list	timeout;
@@ -511,6 +511,11 @@ static inline void queue_flag_clear_unlocked(unsigned int flag,
 	__clear_bit(flag, &q->queue_flags);
 }
 
+static inline int queue_in_flight(struct request_queue *q)
+{
+	return q->in_flight[0] + q->in_flight[1];
+}
+
 static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
 	WARN_ON_ONCE(!queue_is_locked(q));
-- 
GitLab


From c8b15a706d921baed3195407e4f55270112bb3c6 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 May 2009 09:18:50 +0200
Subject: [PATCH 1220/2198] futex: cleanup error exit

Reuse the put_key_ref(key2) call in the exit path.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 476603afd1478..381125a9f1e0b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2185,10 +2185,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 
 	/* Prepare to wait on uaddr. */
 	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
-	if (ret) {
-		put_futex_key(fshared, &key2);
-		goto out;
-	}
+	if (ret)
+		goto out_key2;
 
 	/* Queue the futex_q, drop the hb lock, wait for wakeup. */
 	futex_wait_queue_me(hb, &q, to);
@@ -2282,6 +2280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 
 out_put_keys:
 	put_futex_key(fshared, &q.key);
+out_key2:
 	put_futex_key(fshared, &key2);
 
 out:
-- 
GitLab


From 1c840c14906d4ddf66c1f4f5daea059aad951c82 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 20 May 2009 09:22:40 +0200
Subject: [PATCH 1221/2198] futex: fix restart for early wakeup in
 futex_wait_requeue_pi()

The futex_wait_requeue_pi op should restart unconditionally like
futex_lock_pi. The user of that function e.g. pthread_cond_wait can
not be interrupted so we do not care about the SA_RESTART flag of the
signal. Clean up the FIXMEs.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 381125a9f1e0b..2aa216e5b594f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2060,7 +2060,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
  *
  * Returns
  *  0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
  */
 static inline
 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2087,15 +2087,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 
 		if (timeout && !timeout->task)
 			ret = -ETIMEDOUT;
-		else {
-			/*
-			 * We expect signal_pending(current), but another
-			 * thread may have handled it for us already.
-			 */
-			/* FIXME: ERESTARTSYS or ERESTARTNOINTR?  Do we care if
-			 * the user specified SA_RESTART or not? */
-			ret = -ERESTARTSYS;
-		}
+		else
+			ret = -ERESTARTNOINTR;
 	}
 	return ret;
 }
-- 
GitLab


From 2070887fdeacd9c13f3e805e3f0086c9f22a4d93 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 19 May 2009 23:04:59 +0200
Subject: [PATCH 1222/2198] futex: fix restart in wait_requeue_pi

If the waiter has been requeued to the outer PI futex and is
interrupted by a signal and the thread handles the signal then
ERESTART_RESTARTBLOCK is changed to EINTR and the restart block is
discarded. That way we return an unexcpected EINTR to user space
instead of ending up in futex_lock_pi_restart.

But we do not need to restart the syscall because we know that the
condition has changed since we have been requeued. If we would simply
restart the syscall then we would drop out via the comparison of the
user space value with EWOULDBLOCK.

The user space side needs to handle EWOULDBLOCK anyway as the
enqueueing on the inner futex can race with a requeue/wake. So we can
simply return EWOULDBLOCK to user space which also signals that we did
not take the outer futex and let user space handle it in the same way
it has to handle the requeue/wake race.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 49 +++++++++----------------------------------------
 1 file changed, 9 insertions(+), 40 deletions(-)

diff --git a/kernel/futex.c b/kernel/futex.c
index 2aa216e5b594f..80b5ce716596a 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1507,7 +1507,6 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 #define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
-static long futex_lock_pi_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
@@ -1930,21 +1929,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	goto retry;
 }
 
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
-	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-	ktime_t t, *tp = NULL;
-	int fshared = restart->futex.flags & FLAGS_SHARED;
-
-	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-		t.tv64 = restart->futex.time;
-		tp = &t;
-	}
-	restart->fn = do_no_restart_syscall;
-
-	return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
-}
-
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -2141,12 +2125,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
 	struct rt_mutex *pi_mutex = NULL;
-	struct restart_block *restart;
 	struct futex_hash_bucket *hb;
 	union futex_key key2;
 	struct futex_q q;
 	int res, ret;
-	u32 uval;
 
 	if (!bitset)
 		return -EINVAL;
@@ -2245,30 +2227,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 		if (rt_mutex_owner(pi_mutex) == current)
 			rt_mutex_unlock(pi_mutex);
 	} else if (ret == -EINTR) {
-		ret = -EFAULT;
-		if (get_user(uval, uaddr2))
-			goto out_put_keys;
-
 		/*
-		 * We've already been requeued, so restart by calling
-		 * futex_lock_pi() directly, rather then returning to this
-		 * function.
+		 * We've already been requeued, but we have no way to
+		 * restart by calling futex_lock_pi() directly. We
+		 * could restart the syscall, but that will look at
+		 * the user space value and return right away. So we
+		 * drop back with EWOULDBLOCK to tell user space that
+		 * "val" has been changed. That's the same what the
+		 * restart of the syscall would do in
+		 * futex_wait_setup().
 		 */
-		ret = -ERESTART_RESTARTBLOCK;
-		restart = &current_thread_info()->restart_block;
-		restart->fn = futex_lock_pi_restart;
-		restart->futex.uaddr = (u32 *)uaddr2;
-		restart->futex.val = uval;
-		restart->futex.flags = 0;
-		if (abs_time) {
-			restart->futex.flags |= FLAGS_HAS_TIMEOUT;
-			restart->futex.time = abs_time->tv64;
-		}
-
-		if (fshared)
-			restart->futex.flags |= FLAGS_SHARED;
-		if (clockrt)
-			restart->futex.flags |= FLAGS_CLOCKRT;
+		ret = -EWOULDBLOCK;
 	}
 
 out_put_keys:
-- 
GitLab


From 09010978345e8883003bf411bb99753710eb5a3a Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 20 May 2009 10:48:47 +0100
Subject: [PATCH 1223/2198] GFS2: Improve resource group error handling

This patch improves the error handling in the case where we
discover that the summary information in the resource group
doesn't match the bitmap information while in the process of
allocating blocks. Originally this resulted in a kernel bug,
but this patch changes that so that we return -EIO and print
some messages explaining what went wrong, and how to fix it.

We also remember locally not to try and allocate from the
same rgrp again, so that a subsequent allocation in a
different rgrp should succeed.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c   |  9 ++++++--
 fs/gfs2/dir.c    | 11 +++++++--
 fs/gfs2/eattr.c  | 14 +++++++++---
 fs/gfs2/glops.c  | 20 +----------------
 fs/gfs2/incore.h |  7 +++---
 fs/gfs2/rgrp.c   | 58 +++++++++++++++++++++++++++++++++++-------------
 fs/gfs2/rgrp.h   | 47 ++++++++++++++++++++-------------------
 7 files changed, 99 insertions(+), 67 deletions(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 3a5d3f883e101..253e1a39f8419 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
 		   and write it out to disk */
 
 		unsigned int n = 1;
-		block = gfs2_alloc_block(ip, &n);
+		error = gfs2_alloc_block(ip, &block, &n);
+		if (error)
+			goto out_brelse;
 		if (isdir) {
 			gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
 			error = gfs2_dir_get_new_buffer(ip, block, &bh);
@@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 	blks = dblks + iblks;
 	i = sheight;
 	do {
+		int error;
 		n = blks - alloced;
-		bn = gfs2_alloc_block(ip, &n);
+		error = gfs2_alloc_block(ip, &bn, &n);
+		if (error)
+			return error;
 		alloced += n;
 		if (state != ALLOC_DATA || gfs2_is_jdata(ip))
 			gfs2_trans_add_unrevoke(sdp, bn, n);
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index aef4d0c067488..297d7e5cebad8 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh,
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	unsigned int n = 1;
-	u64 bn = gfs2_alloc_block(ip, &n);
-	struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
+	u64 bn;
+	int error;
+	struct buffer_head *bh;
 	struct gfs2_leaf *leaf;
 	struct gfs2_dirent *dent;
 	struct qstr name = { .name = "", .len = 0, .hash = 0 };
+
+	error = gfs2_alloc_block(ip, &bn, &n);
+	if (error)
+		return NULL;
+	bh = gfs2_meta_new(ip->i_gl, bn);
 	if (!bh)
 		return NULL;
+
 	gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
 	gfs2_trans_add_bh(ip->i_gl, bh, 1);
 	gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index 899763aed2174..07ea9529adda1 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
 	struct gfs2_ea_header *ea;
 	unsigned int n = 1;
 	u64 block;
+	int error;
 
-	block = gfs2_alloc_block(ip, &n);
+	error = gfs2_alloc_block(ip, &block, &n);
+	if (error)
+		return error;
 	gfs2_trans_add_unrevoke(sdp, block, 1);
 	*bhp = gfs2_meta_new(ip->i_gl, block);
 	gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
@@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		    struct gfs2_ea_request *er)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+	int error;
 
 	ea->ea_data_len = cpu_to_be32(er->er_data_len);
 	ea->ea_name_len = er->er_name_len;
@@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 			int mh_size = sizeof(struct gfs2_meta_header);
 			unsigned int n = 1;
 
-			block = gfs2_alloc_block(ip, &n);
+			error = gfs2_alloc_block(ip, &block, &n);
+			if (error)
+				return error;
 			gfs2_trans_add_unrevoke(sdp, block, 1);
 			bh = gfs2_meta_new(ip->i_gl, block);
 			gfs2_trans_add_bh(ip->i_gl, bh, 1);
@@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	} else {
 		u64 blk;
 		unsigned int n = 1;
-		blk = gfs2_alloc_block(ip, &n);
+		error = gfs2_alloc_block(ip, &blk, &n);
+		if (error)
+			return error;
 		gfs2_trans_add_unrevoke(sdp, blk, 1);
 		indbh = gfs2_meta_new(ip->i_gl, blk);
 		gfs2_trans_add_bh(ip->i_gl, indbh, 1);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 70f87f43afa2b..d5e4ab155ca0a 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -309,24 +309,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh)
 	gfs2_rgrp_bh_put(gh->gh_gl->gl_object);
 }
 
-/**
- * rgrp_go_dump - print out an rgrp
- * @seq: The iterator
- * @gl: The glock in question
- *
- */
-
-static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
-{
-	const struct gfs2_rgrpd *rgd = gl->gl_object;
-	if (rgd == NULL)
-		return 0;
-	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
-		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
-		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
-	return 0;
-}
-
 /**
  * trans_go_sync - promote/demote the transaction glock
  * @gl: the glock
@@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_demote_ok = rgrp_go_demote_ok,
 	.go_lock = rgrp_go_lock,
 	.go_unlock = rgrp_go_unlock,
-	.go_dump = rgrp_go_dump,
+	.go_dump = gfs2_rgrp_dump,
 	.go_type = LM_TYPE_RGRP,
 	.go_min_hold_time = HZ / 5,
 };
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 0060e9564bb92..de50d86fec125 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -92,9 +92,10 @@ struct gfs2_rgrpd {
 	unsigned int rd_bh_count;
 	u32 rd_last_alloc;
 	unsigned char rd_flags;
-#define GFS2_RDF_CHECK        0x01      /* Need to check for unlinked inodes */
-#define GFS2_RDF_NOALLOC      0x02      /* rg prohibits allocation */
-#define GFS2_RDF_UPTODATE     0x04      /* rg is up to date */
+#define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
+#define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
+#define GFS2_RDF_ERROR		0x40000000 /* error in rg */
+#define GFS2_RDF_MASK		0xf0000000 /* mask for internal flags */
 };
 
 enum gfs2_state_bits {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2b..fbacf09ee34e9 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -701,10 +701,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 	u32 rg_flags;
 
 	rg_flags = be32_to_cpu(str->rg_flags);
-	if (rg_flags & GFS2_RGF_NOALLOC)
-		rgd->rd_flags |= GFS2_RDF_NOALLOC;
-	else
-		rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
+	rg_flags &= ~GFS2_RDF_MASK;
 	rgd->rd_free = be32_to_cpu(str->rg_free);
 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -713,11 +710,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
 {
 	struct gfs2_rgrp *str = buf;
-	u32 rg_flags = 0;
 
-	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
-		rg_flags |= GFS2_RGF_NOALLOC;
-	str->rg_flags = cpu_to_be32(rg_flags);
+	str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK);
 	str->rg_free = cpu_to_be32(rgd->rd_free);
 	str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes);
 	str->__pad = cpu_to_be32(0);
@@ -942,7 +936,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
 	struct gfs2_sbd *sdp = rgd->rd_sbd;
 	int ret = 0;
 
-	if (rgd->rd_flags & GFS2_RDF_NOALLOC)
+	if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
 		return 0;
 
 	spin_lock(&sdp->sd_rindex_spin);
@@ -1435,13 +1429,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
 }
 
 /**
- * gfs2_alloc_block - Allocate a block
+ * gfs2_rgrp_dump - print out an rgrp
+ * @seq: The iterator
+ * @gl: The glock in question
+ *
+ */
+
+int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
+{
+	const struct gfs2_rgrpd *rgd = gl->gl_object;
+	if (rgd == NULL)
+		return 0;
+	gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n",
+		       (unsigned long long)rgd->rd_addr, rgd->rd_flags,
+		       rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes);
+	return 0;
+}
+
+/**
+ * gfs2_alloc_block - Allocate one or more blocks
  * @ip: the inode to allocate the block for
+ * @bn: Used to return the starting block number
+ * @n: requested number of blocks/extent length (value/result)
  *
- * Returns: the allocated block
+ * Returns: 0 or error
  */
 
-u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
+int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct buffer_head *dibh;
@@ -1457,7 +1471,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 		goal = rgd->rd_last_alloc;
 
 	blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
-	BUG_ON(blk == BFITNOENT);
+
+	/* Since all blocks are reserved in advance, this shouldn't happen */
+	if (blk == BFITNOENT)
+		goto rgrp_error;
 
 	rgd->rd_last_alloc = blk;
 	block = rgd->rd_data0 + blk;
@@ -1469,7 +1486,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 		di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal);
 		brelse(dibh);
 	}
-	gfs2_assert_withdraw(sdp, rgd->rd_free >= *n);
+	if (rgd->rd_free < *n)
+		goto rgrp_error;
+
 	rgd->rd_free -= *n;
 
 	gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
@@ -1484,7 +1503,16 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
 	rgd->rd_free_clone -= *n;
 	spin_unlock(&sdp->sd_rindex_spin);
 
-	return block;
+	*bn = block;
+	return 0;
+
+rgrp_error:
+	fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n",
+	        (unsigned long long)rgd->rd_addr);
+	fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n");
+	gfs2_rgrp_dump(NULL, rgd->rd_gl);
+	rgd->rd_flags |= GFS2_RDF_ERROR;
+	return -EIO;
 }
 
 /**
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 3181c7e624bff..1e76ff0f3e00c 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -14,22 +14,22 @@ struct gfs2_rgrpd;
 struct gfs2_sbd;
 struct gfs2_holder;
 
-void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd);
 
 struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk);
 struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp);
 struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd);
 
-void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
-int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
+extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp);
+extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh);
 
-int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
-void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
+extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd);
 
-void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
+extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd);
 
-struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
+extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
 static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 {
 	BUG_ON(ip->i_alloc == NULL);
@@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip)
 	ip->i_alloc = NULL;
 }
 
-int gfs2_inplace_reserve_i(struct gfs2_inode *ip,
-			 char *file, unsigned int line);
+extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file,
+				  unsigned int line);
 #define gfs2_inplace_reserve(ip) \
 gfs2_inplace_reserve_i((ip), __FILE__, __LINE__)
 
-void gfs2_inplace_release(struct gfs2_inode *ip);
+extern void gfs2_inplace_release(struct gfs2_inode *ip);
 
-unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
+extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
 
-u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
-u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
+extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n);
+extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
 
-void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
-void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
-void gfs2_unlink_di(struct inode *inode);
+extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
+extern void gfs2_unlink_di(struct inode *inode);
 
 struct gfs2_rgrp_list {
 	unsigned int rl_rgrps;
@@ -61,10 +61,11 @@ struct gfs2_rgrp_list {
 	struct gfs2_holder *rl_ghs;
 };
 
-void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
-		    u64 block);
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
-void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
-u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
+			   u64 block);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
+extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
+extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
+extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
 
 #endif /* __RGRP_DOT_H__ */
-- 
GitLab


From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:19 +0200
Subject: [PATCH 1224/2198] perf_counter: Solve the rotate_ctx vs inherit race
 differently

Instead of disabling RR scheduling of the counters, use a different list
that does not get rotated to iterate the counters on inheritance.

[ Impact: cleanup, optimization ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.237504544@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  1 -
 kernel/perf_counter.c        | 15 +++++----------
 2 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 13cb2fbbf3342..c8c1dfc22c938 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -508,7 +508,6 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
-	int			rr_allowed;
 	struct task_struct	*task;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4d8f97375f3af..64113e6d1942f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1120,8 +1120,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	if (ctx->rr_allowed)
-		rotate_ctx(ctx);
+	rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
 	perf_counter_task_sched_in(curr, cpu);
@@ -3109,7 +3108,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	mutex_init(&ctx->mutex);
 	INIT_LIST_HEAD(&ctx->counter_list);
 	INIT_LIST_HEAD(&ctx->event_list);
-	ctx->rr_allowed = 1;
 	ctx->task = task;
 }
 
@@ -3350,14 +3348,14 @@ void perf_counter_init_task(struct task_struct *child)
 	 */
 	mutex_lock(&parent_ctx->mutex);
 
-	parent_ctx->rr_allowed = 0;
-	barrier(); /* irqs */
-
 	/*
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) {
+	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) {
+		if (counter != counter->group_leader)
+			continue;
+
 		if (!counter->hw_event.inherit)
 			continue;
 
@@ -3366,9 +3364,6 @@ void perf_counter_init_task(struct task_struct *child)
 			break;
 	}
 
-	barrier(); /* irqs */
-	parent_ctx->rr_allowed = 1;
-
 	mutex_unlock(&parent_ctx->mutex);
 }
 
-- 
GitLab


From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:20 +0200
Subject: [PATCH 1225/2198] perf_counter: Log irq_period changes

For the dynamic irq_period code, log whenever we change the period so that
analyzing code can normalize the event flow.

[ Impact: add new feature to allow more precise profiling ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.298769743@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  8 ++++++++
 kernel/perf_counter.c        | 40 +++++++++++++++++++++++++++++++++++-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c8c1dfc22c938..f612941ef46ed 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -257,6 +257,14 @@ enum perf_event_type {
 	 */
 	PERF_EVENT_COMM			= 3,
 
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u64				irq_period;
+	 * };
+	 */
+	PERF_EVENT_PERIOD		= 4,
+
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 64113e6d1942f..db02eb16c777f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1046,7 +1046,9 @@ int perf_counter_task_enable(void)
 	return 0;
 }
 
-void perf_adjust_freq(struct perf_counter_context *ctx)
+static void perf_log_period(struct perf_counter *counter, u64 period);
+
+static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
 	u64 irq_period;
@@ -1072,6 +1074,8 @@ void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (!irq_period)
 			irq_period = 1;
 
+		perf_log_period(counter, irq_period);
+
 		counter->hw.irq_period = irq_period;
 		counter->hw.interrupts = 0;
 	}
@@ -2406,6 +2410,40 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 	perf_counter_mmap_event(&mmap_event);
 }
 
+/*
+ *
+ */
+
+static void perf_log_period(struct perf_counter *counter, u64 period)
+{
+	struct perf_output_handle handle;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				time;
+		u64				period;
+	} freq_event = {
+		.header = {
+			.type = PERF_EVENT_PERIOD,
+			.misc = 0,
+			.size = sizeof(freq_event),
+		},
+		.time = sched_clock(),
+		.period = period,
+	};
+
+	if (counter->hw.irq_period == period)
+		return;
+
+	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, freq_event);
+	perf_output_end(&handle);
+}
+
 /*
  * Generic counter overflow handling.
  */
-- 
GitLab


From b986d7ec0f8b7ea3cc7366d80a137fbe839df227 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:21 +0200
Subject: [PATCH 1226/2198] perf_counter: Optimize disable of time based sw
 counters

Currently we call hrtimer_cancel() unconditionally on disable of time based
software counters. Avoid when possible.

[ Impact: micro-optimize the code ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.388185031@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index db02eb16c777f..473ed2cafbfca 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2716,7 +2716,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	hrtimer_cancel(&counter->hw.hrtimer);
+	if (counter->hw.irq_period)
+		hrtimer_cancel(&counter->hw.hrtimer);
 	cpu_clock_perf_counter_update(counter);
 }
 
@@ -2767,7 +2768,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	hrtimer_cancel(&counter->hw.hrtimer);
+	if (counter->hw.irq_period)
+		hrtimer_cancel(&counter->hw.hrtimer);
 	task_clock_perf_counter_update(counter, counter->ctx->time);
 
 }
-- 
GitLab


From afedadf23a2c90f3ba0d963282cbe6a6be129494 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 20 May 2009 12:21:22 +0200
Subject: [PATCH 1227/2198] perf_counter: Optimize sched in/out of counters

Avoid a function call for !group counters by directly calling the counter
function.

[ Impact: micro-optimize the code ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090520102553.511933670@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 473ed2cafbfca..69d4de8159631 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -826,8 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 
 	perf_disable();
 	if (ctx->nr_active) {
-		list_for_each_entry(counter, &ctx->counter_list, list_entry)
-			group_sched_out(counter, cpuctx, ctx);
+		list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+			if (counter != counter->group_leader)
+				counter_sched_out(counter, cpuctx, ctx);
+			else
+				group_sched_out(counter, cpuctx, ctx);
+		}
 	}
 	perf_enable();
  out:
@@ -903,8 +907,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		if (group_can_go_on(counter, cpuctx, 1))
-			group_sched_in(counter, cpuctx, ctx, cpu);
+		if (counter != counter->group_leader)
+			counter_sched_in(counter, cpuctx, ctx, cpu);
+		else {
+			if (group_can_go_on(counter, cpuctx, 1))
+				group_sched_in(counter, cpuctx, ctx, cpu);
+		}
 
 		/*
 		 * If this pinned group hasn't been scheduled,
@@ -932,9 +940,14 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
 
-		if (group_can_go_on(counter, cpuctx, can_add_hw)) {
-			if (group_sched_in(counter, cpuctx, ctx, cpu))
+		if (counter != counter->group_leader) {
+			if (counter_sched_in(counter, cpuctx, ctx, cpu))
 				can_add_hw = 0;
+		} else {
+			if (group_can_go_on(counter, cpuctx, can_add_hw)) {
+				if (group_sched_in(counter, cpuctx, ctx, cpu))
+					can_add_hw = 0;
+			}
 		}
 	}
 	perf_enable();
-- 
GitLab


From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001
From: Ming Lei <tom.leiming@gmail.com>
Date: Mon, 18 May 2009 23:04:46 +0800
Subject: [PATCH 1228/2198] ftrace: fix check for return value of
 register_module_notifier in event_trace_init

register_module_notifier() returns zero in the success case.
So fix the inverted fail case check in trace events modules
handler.

[ Impact: fix spurious warning on ftrace initialization]

Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 0eec0c55dd87d..9e91c4ad7c8b3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1174,7 +1174,7 @@ static __init int event_trace_init(void)
 	}
 
 	ret = register_module_notifier(&trace_module_nb);
-	if (!ret)
+	if (ret)
 		pr_warning("Failed to register trace events module notifier\n");
 
 	return 0;
-- 
GitLab


From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 May 2009 20:13:28 +0200
Subject: [PATCH 1229/2198] perf_counter: Fix context removal deadlock

Disable the PMU globally before removing a counter from a
context. This fixes the following lockup:

[22081.741922] ------------[ cut here ]------------
[22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e()
[22081.755624] Hardware name: X8DTN
[22081.758903] perfcounters: irq loop stuck!
[22081.762985] Modules linked in:
[22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226
[22081.772432] Call Trace:
[22081.774940]  <NMI>  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.781993]  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.788368]  [<ffffffff8104505c>] ? warn_slowpath_common+0x77/0xa3
[22081.794649]  [<ffffffff810450d3>] ? warn_slowpath_fmt+0x40/0x45
[22081.800696]  [<ffffffff81019aed>] ? intel_pmu_handle_irq+0x9b/0x24e
[22081.807080]  [<ffffffff814d1a72>] ? perf_counter_nmi_handler+0x3f/0x4a
[22081.813751]  [<ffffffff814d2d09>] ? notifier_call_chain+0x58/0x86
[22081.819951]  [<ffffffff8105b250>] ? notify_die+0x2d/0x32
[22081.825392]  [<ffffffff814d1414>] ? do_nmi+0x8e/0x242
[22081.830538]  [<ffffffff814d0f0a>] ? nmi+0x1a/0x20
[22081.835342]  [<ffffffff8117e102>] ? selinux_file_free_security+0x0/0x1a
[22081.842105]  [<ffffffff81018793>] ? x86_pmu_disable_counter+0x15/0x41
[22081.848673]  <<EOE>>  [<ffffffff81018f3d>] ? x86_pmu_disable+0x86/0x103
[22081.855512]  [<ffffffff8108fedd>] ? __perf_counter_remove_from_context+0x0/0xfe
[22081.862926]  [<ffffffff8108fcbc>] ? counter_sched_out+0x30/0xce
[22081.868909]  [<ffffffff8108ff36>] ? __perf_counter_remove_from_context+0x59/0xfe
[22081.876382]  [<ffffffff8106808a>] ? smp_call_function_single+0x6c/0xe6
[22081.882955]  [<ffffffff81091b96>] ? perf_release+0x86/0x14c
[22081.888600]  [<ffffffff810c4c84>] ? __fput+0xe7/0x195
[22081.893718]  [<ffffffff810c213e>] ? filp_close+0x5b/0x62
[22081.899107]  [<ffffffff81046a70>] ? put_files_struct+0x64/0xc2
[22081.905031]  [<ffffffff8104841a>] ? do_exit+0x1e2/0x6ef
[22081.910360]  [<ffffffff814d0a60>] ? _spin_lock_irqsave+0x9/0xe
[22081.916292]  [<ffffffff8104898e>] ? do_group_exit+0x67/0x93
[22081.921953]  [<ffffffff810489cc>] ? sys_exit_group+0x12/0x16
[22081.927759]  [<ffffffff8100baab>] ? system_call_fastpath+0x16/0x1b
[22081.934076] ---[ end trace 3a3936ce3e1b4505 ]---

And could potentially also fix the lockup reported by Marcelo Tosatti.

Also, print more debug info in case of a detected lockup.

[ Impact: fix lockup ]

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  1 +
 kernel/perf_counter.c              | 12 ++++++------
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c109819c2cb95..6cc1660db8d67 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 again:
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
+		perf_counter_print_debug();
 		return 1;
 	}
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 69d4de8159631..08584c16049fc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -208,18 +208,17 @@ static void __perf_counter_remove_from_context(void *info)
 		return;
 
 	spin_lock_irqsave(&ctx->lock, flags);
+	/*
+	 * Protect the list operation against NMI by disabling the
+	 * counters on a global level.
+	 */
+	perf_disable();
 
 	counter_sched_out(counter, cpuctx, ctx);
 
 	counter->task = NULL;
 
-	/*
-	 * Protect the list operation against NMI by disabling the
-	 * counters on a global level. NOP for non NMI based counters.
-	 */
-	perf_disable();
 	list_del_counter(counter, ctx);
-	perf_enable();
 
 	if (!ctx->task) {
 		/*
@@ -231,6 +230,7 @@ static void __perf_counter_remove_from_context(void *info)
 			    perf_max_counters - perf_reserved_percpu);
 	}
 
+	perf_enable();
 	spin_unlock_irqrestore(&ctx->lock, flags);
 }
 
-- 
GitLab


From c6ac4c18fbc92a26df71ece609b082bc3099676b Mon Sep 17 00:00:00 2001
From: "H. Peter Anvin" <hpa@zytor.com>
Date: Wed, 20 May 2009 11:26:09 -0700
Subject: [PATCH 1230/2198] x86, boot: correct the calculation of ZO_INIT_SIZE

Correct the calculation of ZO_INIT_SIZE (the amount of memory we need
during decompression).  One symbol (ZO_startup_32) was missing from
zoffset.h, and another (ZO_z_extract_offset) was misspelled.

[ Impact: build fix ]

Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/Makefile | 2 +-
 arch/x86/boot/header.S | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 75e0301fc69a9..619d297aa2bae 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -95,7 +95,7 @@ targets += voffset.h
 $(obj)/voffset.h: vmlinux FORCE
 	$(call if_changed,voffset)
 
-sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
 
 quiet_cmd_zoffset = ZOFFSET $@
       cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index 68c3bfbaff240..1040f6e8010cd 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -226,7 +226,7 @@ setup_data:		.quad 0			# 64-bit physical pointer to
 
 pref_address:		.quad LOAD_PHYSICAL_ADDR	# preferred load addr
 
-#define ZO_INIT_SIZE	(ZO__end - ZO_startup_32 + ZO_extract_offset)
+#define ZO_INIT_SIZE	(ZO__end - ZO_startup_32 + ZO_z_extract_offset)
 #define VO_INIT_SIZE	(VO__end - VO__text)
 #if ZO_INIT_SIZE > VO_INIT_SIZE
 #define INIT_SIZE ZO_INIT_SIZE
-- 
GitLab


From 60a0b8f93664621a07b93273fc8ebc29590c62f5 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 21 May 2009 12:23:12 +0100
Subject: [PATCH 1231/2198] GFS2: Add a rgrp bitmap full flag

During block allocation, it is useful to know if sections of disk
are full on a finer grained basis than a single resource group.
This can make a performance difference when resource groups have
larger numbers of bitmap blocks, since we no longer have to search
them all block by block in each individual bitmap.

The full flag is set on a per-bitmap basis when it has been
searched and found to have no free space. It is then skipped in
subsequent searches until the flag is reset. The resetting
occurs if we have to drop the glock on the resource group for any
reason, or if we deallocate some blocks within that resource
group and thus free up some space.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h |  3 ++
 fs/gfs2/rgrp.c   | 77 +++++++++++++++++++++++++++++-------------------
 2 files changed, 50 insertions(+), 30 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index de50d86fec125..dd87379b61e67 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -64,9 +64,12 @@ struct gfs2_log_element {
 	const struct gfs2_log_operations *le_ops;
 };
 
+#define GBF_FULL 1
+
 struct gfs2_bitmap {
 	struct buffer_head *bi_bh;
 	char *bi_clone;
+	unsigned long bi_flags;
 	u32 bi_offset;
 	u32 bi_start;
 	u32 bi_len;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fbacf09ee34e9..23637b9d1c739 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
 
+		bi->bi_flags = 0;
 		/* small rgrp; bitmap stored completely in header block */
 		if (length == 1) {
 			bytes = bytes_left;
@@ -769,6 +770,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 	}
 
 	if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
+		for (x = 0; x < length; x++)
+			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
 		rgd->rd_flags |= GFS2_RDF_UPTODATE;
 	}
@@ -897,6 +900,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd)
 			continue;
 		if (sdp->sd_args.ar_discard)
 			gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi);
+		clear_bit(GBF_FULL, &bi->bi_flags);
 		memcpy(bi->bi_clone + bi->bi_offset,
 		       bi->bi_bh->b_data + bi->bi_offset, bi->bi_len);
 	}
@@ -1309,30 +1313,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 {
 	struct gfs2_bitmap *bi = NULL;
 	const u32 length = rgd->rd_length;
-	u32 blk = 0;
+	u32 blk = BFITNOENT;
 	unsigned int buf, x;
 	const unsigned int elen = *n;
-	const u8 *buffer;
+	const u8 *buffer = NULL;
 
 	*n = 0;
 	/* Find bitmap block that contains bits for goal block */
 	for (buf = 0; buf < length; buf++) {
 		bi = rgd->rd_bits + buf;
-		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY)
-			break;
+		/* Convert scope of "goal" from rgrp-wide to within found bit block */
+		if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
+			goal -= bi->bi_start * GFS2_NBBY;
+			goto do_search;
+		}
 	}
+	buf = 0;
+	goal = 0;
 
-	gfs2_assert(rgd->rd_sbd, buf < length);
-
-	/* Convert scope of "goal" from rgrp-wide to within found bit block */
-	goal -= bi->bi_start * GFS2_NBBY;
-
+do_search:
 	/* Search (up to entire) bitmap in this rgrp for allocatable block.
 	   "x <= length", instead of "x < length", because we typically start
 	   the search in the middle of a bit block, but if we can't find an
 	   allocatable block anywhere else, we want to be able wrap around and
 	   search in the first part of our first-searched bit block.  */
 	for (x = 0; x <= length; x++) {
+		bi = rgd->rd_bits + buf;
+
+		if (test_bit(GBF_FULL, &bi->bi_flags) &&
+		    (old_state == GFS2_BLKST_FREE))
+			goto skip;
+
 		/* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
 		   bitmaps, so we must search the originals for that. */
 		buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1343,33 +1354,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
 		if (blk != BFITNOENT)
 			break;
 
+		if ((goal == 0) && (old_state == GFS2_BLKST_FREE))
+			set_bit(GBF_FULL, &bi->bi_flags);
+
 		/* Try next bitmap block (wrap back to rgrp header if at end) */
-		buf = (buf + 1) % length;
-		bi = rgd->rd_bits + buf;
+skip:
+		buf++;
+		buf %= length;
 		goal = 0;
 	}
 
-	if (blk != BFITNOENT && old_state != new_state) {
-		*n = 1;
-		gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+	if (blk == BFITNOENT)
+		return blk;
+	*n = 1;
+	if (old_state == new_state)
+		goto out;
+
+	gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
+	gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
+		    bi->bi_len, blk, new_state);
+	goal = blk;
+	while (*n < elen) {
+		goal++;
+		if (goal >= (bi->bi_len * GFS2_NBBY))
+			break;
+		if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
+		    GFS2_BLKST_FREE)
+			break;
 		gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
-			    bi->bi_len, blk, new_state);
-		goal = blk;
-		while (*n < elen) {
-			goal++;
-			if (goal >= (bi->bi_len * GFS2_NBBY))
-				break;
-			if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
-			    GFS2_BLKST_FREE)
-				break;
-			gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
-				    bi->bi_offset, bi->bi_len, goal,
-				    new_state);
-			(*n)++;
-		}
+			    bi->bi_len, goal, new_state);
+		(*n)++;
 	}
-
-	return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
+out:
+	return (bi->bi_start * GFS2_NBBY) + blk;
 }
 
 /**
-- 
GitLab


From 1ce97e564b628bee30b8dbb64e5e653a484308f6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Thu, 21 May 2009 15:18:19 +0100
Subject: [PATCH 1232/2198] GFS2: Be more aggressive in reclaiming unlinked
 inodes

This patch increases the frequency with which gfs2 looks
for unlinked, but still allocated inodes. Its the equivalent
operation to ext3's orphan list, but done with bitmaps in
the resource groups.

This also fixes a bug where a field in the rgrp was too small.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 2 +-
 fs/gfs2/rgrp.c   | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dd87379b61e67..225347fbff3c2 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -94,7 +94,7 @@ struct gfs2_rgrpd {
 	struct gfs2_sbd *rd_sbd;
 	unsigned int rd_bh_count;
 	u32 rd_last_alloc;
-	unsigned char rd_flags;
+	u32 rd_flags;
 #define GFS2_RDF_CHECK		0x10000000 /* check for unlinked inodes */
 #define GFS2_RDF_UPTODATE	0x20000000 /* rg is up to date */
 #define GFS2_RDF_ERROR		0x40000000 /* error in rg */
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 23637b9d1c739..ee3d5c1876a32 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -581,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip,
 
 	rgd->rd_gl->gl_object = rgd;
 	rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
-	rgd->rd_flags |= GFS2_RDF_CHECK;
 	return error;
 }
 
@@ -703,6 +702,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
 
 	rg_flags = be32_to_cpu(str->rg_flags);
 	rg_flags &= ~GFS2_RDF_MASK;
+	rgd->rd_flags &= GFS2_RDF_MASK;
+	rgd->rd_flags |= rg_flags;
 	rgd->rd_free = be32_to_cpu(str->rg_free);
 	rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes);
 	rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration);
@@ -773,7 +774,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 		for (x = 0; x < length; x++)
 			clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags);
 		gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
-		rgd->rd_flags |= GFS2_RDF_UPTODATE;
+		rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
 	}
 
 	spin_lock(&sdp->sd_rindex_spin);
-- 
GitLab


From 55620c86ebc29013c0d26c5b3bc423faea299fee Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:16:17 +0900
Subject: [PATCH 1233/2198] sh: irq: Rework the SR.IMASK bitmap handling.

This tidies up how the SR.IMASK bitmap is managed, using the bitmap API
directly instead. At the same time, tidy up the irq_chip conversion a
bit.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/irq/imask.c | 64 +++++++++-------------------------
 1 file changed, 16 insertions(+), 48 deletions(-)

diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index f43753b54e1d6..3bef3c2629ad2 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -18,38 +18,17 @@
 #include <linux/spinlock.h>
 #include <linux/cache.h>
 #include <linux/irq.h>
+#include <linux/bitmap.h>
 #include <asm/system.h>
 #include <asm/irq.h>
 
 /* Bitmap of IRQ masked */
-static unsigned long imask_mask = 0x7fff;
-static int interrupt_priority = 0;
-
-static void enable_imask_irq(unsigned int irq);
-static void disable_imask_irq(unsigned int irq);
-static void shutdown_imask_irq(unsigned int irq);
-static void mask_and_ack_imask(unsigned int);
-static void end_imask_irq(unsigned int irq);
-
 #define IMASK_PRIORITY	15
 
-static unsigned int startup_imask_irq(unsigned int irq)
-{
-	/* Nothing to do */
-	return 0; /* never anything pending */
-}
+static DECLARE_BITMAP(imask_mask, IMASK_PRIORITY);
+static int interrupt_priority;
 
-static struct irq_chip imask_irq_type = {
-	.typename = "SR.IMASK",
-	.startup = startup_imask_irq,
-	.shutdown = shutdown_imask_irq,
-	.enable = enable_imask_irq,
-	.disable = disable_imask_irq,
-	.ack = mask_and_ack_imask,
-	.end = end_imask_irq
-};
-
-void static inline set_interrupt_registers(int ip)
+static inline void set_interrupt_registers(int ip)
 {
 	unsigned long __dummy;
 
@@ -72,42 +51,31 @@ void static inline set_interrupt_registers(int ip)
 		     : "t");
 }
 
-static void disable_imask_irq(unsigned int irq)
+static void mask_imask_irq(unsigned int irq)
 {
 	clear_bit(irq, &imask_mask);
 	if (interrupt_priority < IMASK_PRIORITY - irq)
 		interrupt_priority = IMASK_PRIORITY - irq;
-
 	set_interrupt_registers(interrupt_priority);
 }
 
-static void enable_imask_irq(unsigned int irq)
+static void unmask_imask_irq(unsigned int irq)
 {
 	set_bit(irq, &imask_mask);
-	interrupt_priority = IMASK_PRIORITY - ffz(imask_mask);
-
+	interrupt_priority = IMASK_PRIORITY -
+		find_first_zero_bit(imask_mask, IMASK_PRIORITY);
 	set_interrupt_registers(interrupt_priority);
 }
 
-static void mask_and_ack_imask(unsigned int irq)
-{
-	disable_imask_irq(irq);
-}
-
-static void end_imask_irq(unsigned int irq)
-{
-	if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)))
-		enable_imask_irq(irq);
-}
-
-static void shutdown_imask_irq(unsigned int irq)
-{
-	/* Nothing to do */
-}
+static struct irq_chip imask_irq_chip = {
+	.typename	= "SR.IMASK",
+	.mask		= mask_imask_irq,
+	.unmask		= unmask_imask_irq,
+	.mask_ack	= mask_imask_irq,
+};
 
 void make_imask_irq(unsigned int irq)
 {
-	disable_irq_nosync(irq);
-	irq_desc[irq].chip = &imask_irq_type;
-	enable_irq(irq);
+	set_irq_chip_and_handler_name(irq, &imask_irq_chip,
+				      handle_level_irq, "level");
 }
-- 
GitLab


From fa1d43ab451084785153d37ae559c4fdd1546a5b Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:26:16 +0900
Subject: [PATCH 1234/2198] sh: irq: Convert from irq_desc[] to irq_to_desc().

This converts a few places that were using the old irq_desc[] array over
to the shiny new irq_to_desc() helper. Preperatory work for sparse irq
support.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/irq/intc-sh5.c | 10 +----
 arch/sh/kernel/irq.c              | 69 +++++++++++++++++++++----------
 arch/sh/kernel/sh_ksyms_32.c      |  8 ----
 3 files changed, 48 insertions(+), 39 deletions(-)

diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c
index a619eaf6e6b4a..6c092f1f55579 100644
--- a/arch/sh/kernel/cpu/irq/intc-sh5.c
+++ b/arch/sh/kernel/cpu/irq/intc-sh5.c
@@ -152,14 +152,6 @@ static void end_intc_irq(unsigned int irq)
 	enable_intc_irq(irq);
 }
 
-/* For future use, if we ever support IRLM=0) */
-void make_intc_irq(unsigned int irq)
-{
-	disable_irq_nosync(irq);
-	irq_desc[irq].chip = &intc_irq_type;
-	disable_intc_irq(irq);
-}
-
 void __init plat_irq_setup(void)
 {
 	unsigned long long __dummy0, __dummy1=~0x00000000100000f0;
@@ -174,7 +166,7 @@ void __init plat_irq_setup(void)
 
 	/* Set default: per-line enable/disable, priority driven ack/eoi */
 	for (i = 0; i < NR_INTC_IRQS; i++)
-		irq_desc[i].chip = &intc_irq_type;
+		set_irq_chip_and_handler(i, &intc_irq_type, handle_level_irq);
 
 
 	/* Disable all interrupts and set all priorities to 0 to avoid trouble */
diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 3f1372eb0091f..878532b617625 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -31,39 +31,64 @@ void ack_bad_irq(unsigned int irq)
 }
 
 #if defined(CONFIG_PROC_FS)
+/*
+ * /proc/interrupts printing:
+ */
+static int show_other_interrupts(struct seq_file *p, int prec)
+{
+	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
+	return 0;
+}
+
 int show_interrupts(struct seq_file *p, void *v)
 {
-	int i = *(loff_t *) v, j;
-	struct irqaction * action;
-	unsigned long flags;
+	unsigned long flags, any_count = 0;
+	int i = *(loff_t *)v, j, prec;
+	struct irqaction *action;
+	struct irq_desc *desc;
+
+	if (i > nr_irqs)
+		return 0;
+
+	for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+		j *= 10;
+
+	if (i == nr_irqs)
+		return show_other_interrupts(p, prec);
 
 	if (i == 0) {
-		seq_puts(p, "           ");
+		seq_printf(p, "%*s", prec + 8, "");
 		for_each_online_cpu(j)
-			seq_printf(p, "CPU%d       ",j);
+			seq_printf(p, "CPU%-8d", j);
 		seq_putc(p, '\n');
 	}
 
-	if (i < sh_mv.mv_nr_irqs) {
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
-		if (!action)
-			goto unlock;
-		seq_printf(p, "%3d: ",i);
-		for_each_online_cpu(j)
-			seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
-		seq_printf(p, " %14s", irq_desc[i].chip->name);
-		seq_printf(p, "-%-8s", irq_desc[i].name);
-		seq_printf(p, "  %s", action->name);
+	desc = irq_to_desc(i);
+	if (!desc)
+		return 0;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	for_each_online_cpu(j)
+		any_count |= kstat_irqs_cpu(i, j);
+	action = desc->action;
+	if (!action && !any_count)
+		goto out;
 
-		for (action=action->next; action; action = action->next)
+	seq_printf(p, "%*d: ", prec, i);
+	for_each_online_cpu(j)
+		seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+	seq_printf(p, " %14s", desc->chip->name);
+	seq_printf(p, "-%-8s", desc->name);
+
+	if (action) {
+		seq_printf(p, "  %s", action->name);
+		while ((action = action->next) != NULL)
 			seq_printf(p, ", %s", action->name);
-		seq_putc(p, '\n');
-unlock:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
-	} else if (i == sh_mv.mv_nr_irqs)
-		seq_printf(p, "Err: %10u\n", atomic_read(&irq_err_count));
+	}
 
+	seq_putc(p, '\n');
+out:
+	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
 }
 #endif
diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c
index 2d868c60aa6e3..fcc5de31f83b8 100644
--- a/arch/sh/kernel/sh_ksyms_32.c
+++ b/arch/sh/kernel/sh_ksyms_32.c
@@ -23,9 +23,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *);
 /* platform dependent support */
 EXPORT_SYMBOL(dump_fpu);
 EXPORT_SYMBOL(kernel_thread);
-EXPORT_SYMBOL(irq_desc);
-EXPORT_SYMBOL(no_irq_chip);
-
 EXPORT_SYMBOL(strlen);
 
 /* PCI exports */
@@ -40,11 +37,6 @@ EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memmove);
 EXPORT_SYMBOL(__copy_user);
-
-#ifdef CONFIG_MMU
-EXPORT_SYMBOL(get_vm_area);
-#endif
-
 EXPORT_SYMBOL(__udelay);
 EXPORT_SYMBOL(__ndelay);
 EXPORT_SYMBOL(__const_udelay);
-- 
GitLab


From 05ff3004d278b54760abd71530506d803182c71d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:28:33 +0900
Subject: [PATCH 1235/2198] sh: irq: Teach ipr and intc about dynamically
 allocating irq_descs.

This hooks in irq_to_desc_alloc_cpu() to the necessary code paths in the
intc and ipr controller registration paths. As these are the primary call
paths for all SH CPUs, this alone will make all CPUs sparse IRQ ready.

There is the added benefit now that each CPU contains specific IPR and
INTC tables, so only the vectors with interrupt sources backing them will
ever see an irq_desc instantiation. This effectively packs irq_desc
down to match the CPU, rather than padding NR_IRQS out to cover the valid
vector range.

Boards with extra sources will still have to fiddle with the nr_irqs
setting, but they can continue doing so through the machvec as before.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/irq/ipr.c |  8 ++++++++
 drivers/sh/intc.c            | 12 ++++++++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index 3eb17ee5540e8..fa0c8467a280e 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -59,10 +59,18 @@ void register_ipr_controller(struct ipr_desc *desc)
 
 	for (i = 0; i < desc->nr_irqs; i++) {
 		struct ipr_data *p = desc->ipr_data + i;
+		struct irq_desc *irq_desc;
 
 		BUG_ON(p->ipr_idx >= desc->nr_offsets);
 		BUG_ON(!desc->ipr_offsets[p->ipr_idx]);
 
+		irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id());
+		if (unlikely(!irq_desc)) {
+			printk(KERN_INFO "can not get irq_desc for %d\n",
+			       p->irq);
+			continue;
+		}
+
 		disable_irq_nosync(p->irq);
 		set_irq_chip_and_handler_name(p->irq, &desc->chip,
 				      handle_level_irq, "level");
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index 12d13d99b6f09..098b767e9afda 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -671,7 +671,7 @@ unsigned int intc_evt2irq(unsigned int vector)
 
 void __init register_intc_controller(struct intc_desc *desc)
 {
-	unsigned int i, k, smp;
+	unsigned int i, k, smp, cpu = smp_processor_id();
 	struct intc_desc_int *d;
 
 	d = alloc_bootmem(sizeof(*d));
@@ -770,11 +770,19 @@ void __init register_intc_controller(struct intc_desc *desc)
 	/* register the vectors one by one */
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
+		unsigned int irq = evt2irq(vect->vect);
+		struct irq_desc *irq_desc;
 
 		if (!vect->enum_id)
 			continue;
 
-		intc_register_irq(desc, d, vect->enum_id, evt2irq(vect->vect));
+		irq_desc = irq_to_desc_alloc_cpu(irq, cpu);
+		if (unlikely(!irq_desc)) {
+			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+			continue;
+		}
+
+		intc_register_irq(desc, d, vect->enum_id, irq);
 	}
 }
 
-- 
GitLab


From d8586ba6e1415150e1bab89f0a05447bb6f2d6d5 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 01:36:13 +0900
Subject: [PATCH 1236/2198] sh: irq: Provide an arch_probe_nr_irqs() that wraps
 the machvec def.

This is just a simple arch_probe_nr_irqs() stub that wraps to the
platform defined number of IRQs. This can be made gradually more
intelligent based on what we can infer from the INTC tables and so on.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/irq.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c
index 878532b617625..3d09062f46827 100644
--- a/arch/sh/kernel/irq.c
+++ b/arch/sh/kernel/irq.c
@@ -279,3 +279,11 @@ void __init init_IRQ(void)
 
 	irq_ctx_init(smp_processor_id());
 }
+
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+	nr_irqs = sh_mv.mv_nr_irqs;
+	return 0;
+}
+#endif
-- 
GitLab


From 30cff215b54b20d201a2bd7d50361c4c14700281 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 20 May 2009 13:44:40 +0000
Subject: [PATCH 1237/2198] sh: clkfwk branch compile fix for clock-sh7722

Fix clkfwk branch compile error in clock-sh7722.c.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index a98d7da67b97b..ae78efd0582dc 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -892,7 +892,7 @@ int __init arch_clk_init(void)
 	struct clk *clk;
 	int i;
 
-	clk_cpg_init();
+	cpg_clk_init();
 
 	clk = clk_get(NULL, "master_clk");
 	for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) {
-- 
GitLab


From 5789ba3bd0a3cd20df5980ebf03358f2eb44fd67 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 15:47:06 -0400
Subject: [PATCH 1238/2198] IMA: Minimal IMA policy and boot param for TCB IMA
 policy

The IMA TCB policy is dangerous.  A normal use can use all of a system's
memory (which cannot be freed) simply by building and running lots of
executables.  The TCB policy is also nearly useless because logging in as root
often causes a policy violation when dealing with utmp, thus rendering the
measurements meaningless.

There is no good fix for this in the kernel.  A full TCB policy would need to
be loaded in userspace using LSM rule matching to get both a protected and
useful system.  But, if too little is measured before userspace can load a real
policy one again ends up with a meaningless set of measurements.  One option
would be to put the policy load inside the initrd in order to get it early
enough in the boot sequence to be useful, but this runs into trouble with the
LSM.  For IMA to measure the LSM policy and the LSM policy loading mechanism
it needs rules to do so, but we already talked about problems with defaulting
to such broad rules....

IMA also depends on the files being measured to be on an FS which implements
and supports i_version.  Since the only FS with this support (ext4) doesn't
even use it by default it seems silly to have any IMA rules by default.

This should reduce the performance overhead of IMA to near 0 while still
letting users who choose to configure their machine as such to inclue the
ima_tcb kernel paramenter and get measurements during boot before they can
load a customized, reasonable policy in userspace.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 Documentation/kernel-parameters.txt |  6 ++++++
 security/integrity/ima/ima_policy.c | 30 ++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75e..d9a24a04cfb17 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -914,6 +914,12 @@ and is between 256 and 4096 characters. It is defined in the file
 			Formt: { "sha1" | "md5" }
 			default: "sha1"
 
+	ima_tcb		[IMA]
+			Load a policy which meets the needs of the Trusted
+			Computing Base.  This means IMA will measure all
+			programs exec'd, files mmap'd for exec, and all files
+			opened for read by uid=0.
+
 	in2000=		[HW,SCSI]
 			See header of drivers/scsi/in2000.c.
 
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 31d677f7c65f5..4719bbf1641a7 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -45,9 +45,17 @@ struct ima_measure_rule_entry {
 	} lsm[MAX_LSM_RULES];
 };
 
-/* Without LSM specific knowledge, the default policy can only be
+/*
+ * Without LSM specific knowledge, the default policy can only be
  * written in terms of .action, .func, .mask, .fsmagic, and .uid
  */
+
+/*
+ * The minimum rule set to allow for full TCB coverage.  Measures all files
+ * opened or mmap for exec and everything read by root.  Dangerous because
+ * normal users can easily run the machine out of memory simply building
+ * and running executables.
+ */
 static struct ima_measure_rule_entry default_rules[] = {
 	{.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC},
 	{.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC},
@@ -59,6 +67,8 @@ static struct ima_measure_rule_entry default_rules[] = {
 	 .flags = IMA_FUNC | IMA_MASK},
 	{.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
 	 .flags = IMA_FUNC | IMA_MASK},
+	{.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0,
+	 .flags = IMA_FUNC | IMA_MASK | IMA_UID},
 };
 
 static LIST_HEAD(measure_default_rules);
@@ -67,6 +77,14 @@ static struct list_head *ima_measure;
 
 static DEFINE_MUTEX(ima_measure_mutex);
 
+static bool ima_use_tcb __initdata;
+static int __init default_policy_setup(char *str)
+{
+	ima_use_tcb = 1;
+	return 1;
+}
+__setup("ima_tcb", default_policy_setup);
+
 /**
  * ima_match_rules - determine whether an inode matches the measure rule.
  * @rule: a pointer to a rule
@@ -162,9 +180,15 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask)
  */
 void ima_init_policy(void)
 {
-	int i;
+	int i, entries;
+
+	/* if !ima_use_tcb set entries = 0 so we load NO default rules */
+	if (ima_use_tcb)
+		entries = ARRAY_SIZE(default_rules);
+	else
+		entries = 0;
 
-	for (i = 0; i < ARRAY_SIZE(default_rules); i++)
+	for (i = 0; i < entries; i++)
 		list_add_tail(&default_rules[i].list, &measure_default_rules);
 	ima_measure = &measure_default_rules;
 }
-- 
GitLab


From 932995f0ce52525b32ff5127b522c2c164de3810 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 15:43:32 -0400
Subject: [PATCH 1239/2198] IMA: Add __init notation to ima functions

A number of IMA functions only used during init are not marked with __init.
Add those notations so they are freed automatically.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_crypto.c | 4 ++--
 security/integrity/ima/ima_fs.c     | 2 +-
 security/integrity/ima/ima_iint.c   | 2 +-
 security/integrity/ima/ima_init.c   | 4 ++--
 security/integrity/ima/ima_policy.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c
index 50d572b74caff..63003a63aaeed 100644
--- a/security/integrity/ima/ima_crypto.c
+++ b/security/integrity/ima/ima_crypto.c
@@ -103,7 +103,7 @@ int ima_calc_template_hash(int template_len, void *template, char *digest)
 	return rc;
 }
 
-static void ima_pcrread(int idx, u8 *pcr)
+static void __init ima_pcrread(int idx, u8 *pcr)
 {
 	if (!ima_used_chip)
 		return;
@@ -115,7 +115,7 @@ static void ima_pcrread(int idx, u8 *pcr)
 /*
  * Calculate the boot aggregate hash
  */
-int ima_calc_boot_aggregate(char *digest)
+int __init ima_calc_boot_aggregate(char *digest)
 {
 	struct hash_desc desc;
 	struct scatterlist sg;
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index 3305a9615863f..7039b14e1f731 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -319,7 +319,7 @@ static struct file_operations ima_measure_policy_ops = {
 	.release = ima_release_policy
 };
 
-int ima_fs_init(void)
+int __init ima_fs_init(void)
 {
 	ima_dir = securityfs_create_dir("ima", NULL);
 	if (IS_ERR(ima_dir))
diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c
index ec79f1ee992cb..b8dd693f8790a 100644
--- a/security/integrity/ima/ima_iint.c
+++ b/security/integrity/ima/ima_iint.c
@@ -196,7 +196,7 @@ static void init_once(void *foo)
 	kref_set(&iint->refcount, 1);
 }
 
-void ima_iintcache_init(void)
+void __init ima_iintcache_init(void)
 {
 	iint_cache =
 	    kmem_cache_create("iint_cache", sizeof(struct ima_iint_cache), 0,
diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c
index 0b0bb8c978cc8..a40da7ae59002 100644
--- a/security/integrity/ima/ima_init.c
+++ b/security/integrity/ima/ima_init.c
@@ -38,7 +38,7 @@ int ima_used_chip;
  * a different value.) Violations add a zero entry to the measurement
  * list and extend the aggregate PCR value with ff...ff's.
  */
-static void ima_add_boot_aggregate(void)
+static void __init ima_add_boot_aggregate(void)
 {
 	struct ima_template_entry *entry;
 	const char *op = "add_boot_aggregate";
@@ -71,7 +71,7 @@ static void ima_add_boot_aggregate(void)
 			    audit_cause, result, 0);
 }
 
-int ima_init(void)
+int __init ima_init(void)
 {
 	u8 pcr_i[IMA_DIGEST_SIZE];
 	int rc;
diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c
index 4719bbf1641a7..e1278399b3454 100644
--- a/security/integrity/ima/ima_policy.c
+++ b/security/integrity/ima/ima_policy.c
@@ -178,7 +178,7 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask)
  * ima_measure points to either the measure_default_rules or the
  * the new measure_policy_rules.
  */
-void ima_init_policy(void)
+void __init ima_init_policy(void)
 {
 	int i, entries;
 
-- 
GitLab


From b9fc745db833bbf74b4988493b8cd902a84c9415 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 13:25:57 -0400
Subject: [PATCH 1240/2198] integrity: path_check update

- Add support in ima_path_check() for integrity checking without
incrementing the counts. (Required for nfsd.)
- rename and export opencount_get to ima_counts_get
- replace ima_shm_check calls with ima_counts_get
- export ima_path_check

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/exec.c                         |  5 ++--
 fs/namei.c                        |  6 ++--
 include/linux/ima.h               | 11 ++++---
 ipc/shm.c                         |  4 +--
 mm/shmem.c                        |  2 +-
 security/integrity/ima/ima_main.c | 48 +++++++++++++++++++------------
 6 files changed, 46 insertions(+), 30 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 998e856c3079c..618d6d1e2c52f 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -130,7 +130,8 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 				 MAY_READ | MAY_EXEC | MAY_OPEN);
 	if (error)
 		goto exit;
-	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN);
+	error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN,
+			       IMA_COUNT_UPDATE);
 	if (error)
 		goto exit;
 
@@ -680,7 +681,7 @@ struct file *open_exec(const char *name)
 	err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
 	if (err)
 		goto out_path_put;
-	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN);
+	err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN, IMA_COUNT_UPDATE);
 	if (err)
 		goto out_path_put;
 
diff --git a/fs/namei.c b/fs/namei.c
index 78f253cd2d4fe..b05a2b1dea64a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
 			err = inode_permission(nd->path.dentry->d_inode,
 					       MAY_EXEC);
 		if (!err)
-			err = ima_path_check(&nd->path, MAY_EXEC);
+			err = ima_path_check(&nd->path, MAY_EXEC,
+				             IMA_COUNT_UPDATE);
  		if (err)
 			break;
 
@@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag)
 		return error;
 
 	error = ima_path_check(path,
-			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+			       acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+			       IMA_COUNT_UPDATE);
 	if (error)
 		return error;
 	/*
diff --git a/include/linux/ima.h b/include/linux/ima.h
index 0e2aa45cb0cef..b1b827d091a99 100644
--- a/include/linux/ima.h
+++ b/include/linux/ima.h
@@ -13,14 +13,17 @@
 #include <linux/fs.h>
 struct linux_binprm;
 
+#define IMA_COUNT_UPDATE 1
+#define IMA_COUNT_LEAVE 0
+
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_inode_alloc(struct inode *inode);
 extern void ima_inode_free(struct inode *inode);
-extern int ima_path_check(struct path *path, int mask);
+extern int ima_path_check(struct path *path, int mask, int update_counts);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
-extern void ima_shm_check(struct file *file);
+extern void ima_counts_get(struct file *file);
 
 #else
 static inline int ima_bprm_check(struct linux_binprm *bprm)
@@ -38,7 +41,7 @@ static inline void ima_inode_free(struct inode *inode)
 	return;
 }
 
-static inline int ima_path_check(struct path *path, int mask)
+static inline int ima_path_check(struct path *path, int mask, int update_counts)
 {
 	return 0;
 }
@@ -53,7 +56,7 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot)
 	return 0;
 }
 
-static inline void ima_shm_check(struct file *file)
+static inline void ima_counts_get(struct file *file)
 {
 	return;
 }
diff --git a/ipc/shm.c b/ipc/shm.c
index faa46da99ebed..47b464229cd59 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -384,7 +384,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto no_file;
-	ima_shm_check(file);
+	ima_counts_get(file);
 
 	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 	if (id < 0) {
@@ -891,7 +891,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
 	file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
 	if (!file)
 		goto out_free;
-	ima_shm_check(file);
+	ima_counts_get(file);
 
 	file->private_data = sfd;
 	file->f_mapping = shp->shm_file->f_mapping;
diff --git a/mm/shmem.c b/mm/shmem.c
index b25f95ce3db76..a817f75f1441f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2684,7 +2684,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ima_shm_check(file);
+	ima_counts_get(file);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	vma->vm_file = file;
diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index c4228c0eb2d0f..a2eb23310eaf1 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -125,6 +125,15 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
 	return rc;
 }
 
+static void ima_update_counts(struct ima_iint_cache *iint, int mask)
+{
+	iint->opencount++;
+	if ((mask & MAY_WRITE) || (mask == 0))
+		iint->writecount++;
+	else if (mask & (MAY_READ | MAY_EXEC))
+		iint->readcount++;
+}
+
 /**
  * ima_path_check - based on policy, collect/store measurement.
  * @path: contains a pointer to the path to be measured
@@ -143,7 +152,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
  * Return 0 on success, an error code on failure.
  * (Based on the results of appraise_measurement().)
  */
-int ima_path_check(struct path *path, int mask)
+int ima_path_check(struct path *path, int mask, int update_counts)
 {
 	struct inode *inode = path->dentry->d_inode;
 	struct ima_iint_cache *iint;
@@ -157,11 +166,8 @@ int ima_path_check(struct path *path, int mask)
 		return 0;
 
 	mutex_lock(&iint->mutex);
-	iint->opencount++;
-	if ((mask & MAY_WRITE) || (mask == 0))
-		iint->writecount++;
-	else if (mask & (MAY_READ | MAY_EXEC))
-		iint->readcount++;
+	if (update_counts)
+		ima_update_counts(iint, mask);
 
 	rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK);
 	if (rc < 0)
@@ -197,6 +203,7 @@ int ima_path_check(struct path *path, int mask)
 	kref_put(&iint->refcount, iint_free);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(ima_path_check);
 
 static int process_measurement(struct file *file, const unsigned char *filename,
 			       int mask, int function)
@@ -225,7 +232,16 @@ static int process_measurement(struct file *file, const unsigned char *filename,
 	return rc;
 }
 
-static void opencount_get(struct file *file)
+/*
+ * ima_opens_get - increment file counts
+ *
+ * - for IPC shm and shmat file.
+ * - for nfsd exported files.
+ *
+ * Increment the counts for these files to prevent unnecessary
+ * imbalance messages.
+ */
+void ima_counts_get(struct file *file)
 {
 	struct inode *inode = file->f_dentry->d_inode;
 	struct ima_iint_cache *iint;
@@ -237,8 +253,14 @@ static void opencount_get(struct file *file)
 		return;
 	mutex_lock(&iint->mutex);
 	iint->opencount++;
+	if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+		iint->readcount++;
+
+	if (file->f_mode & FMODE_WRITE)
+		iint->writecount++;
 	mutex_unlock(&iint->mutex);
 }
+EXPORT_SYMBOL_GPL(ima_counts_get);
 
 /**
  * ima_file_mmap - based on policy, collect/store measurement.
@@ -263,18 +285,6 @@ int ima_file_mmap(struct file *file, unsigned long prot)
 	return 0;
 }
 
-/*
- * ima_shm_check - IPC shm and shmat create/fput a file
- *
- * Maintain the opencount for these files to prevent unnecessary
- * imbalance messages.
- */
-void ima_shm_check(struct file *file)
-{
-	opencount_get(file);
-	return;
-}
-
 /**
  * ima_bprm_check - based on policy, collect/store measurement.
  * @bprm: contains the linux_binprm structure
-- 
GitLab


From c9d9ac525a0285a5b5ad9c3f9aa8b7c1753e6121 Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Tue, 19 May 2009 13:25:58 -0400
Subject: [PATCH 1241/2198] integrity: move ima_counts_get

Based on discussion on lkml (Andrew Morton and Eric Paris),
move ima_counts_get down a layer into shmem/hugetlb__file_setup().
Resolves drm shmem_file_setup() usage case as well.

HD comment:
  I still think you're doing this at the wrong level, but recognize
  that you probably won't be persuaded until a few more users of
  alloc_file() emerge, all wanting your ima_counts_get().

  Resolving GEM's shmem_file_setup() is an improvement, so I'll say

Acked-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/hugetlbfs/inode.c | 2 ++
 ipc/shm.c            | 1 -
 mm/shmem.c           | 2 +-
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 153d9681192b6..ccc62de96df8d 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -30,6 +30,7 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -997,6 +998,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 			&hugetlbfs_file_operations);
 	if (!file)
 		goto out_dentry; /* inode is already attached */
+	ima_counts_get(file);
 
 	return file;
 
diff --git a/ipc/shm.c b/ipc/shm.c
index 47b464229cd59..5608183535990 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -384,7 +384,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 	error = PTR_ERR(file);
 	if (IS_ERR(file))
 		goto no_file;
-	ima_counts_get(file);
 
 	id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
 	if (id < 0) {
diff --git a/mm/shmem.c b/mm/shmem.c
index a817f75f1441f..0132fbd45a238 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2659,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
 	if (error)
 		goto close_file;
 #endif
+	ima_counts_get(file);
 	return file;
 
 close_file:
@@ -2684,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 	if (IS_ERR(file))
 		return PTR_ERR(file);
 
-	ima_counts_get(file);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	vma->vm_file = file;
-- 
GitLab


From 6470c077cae12227318f40f3e6d756caadcce4b0 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 21 May 2009 18:42:54 +0200
Subject: [PATCH 1242/2198] smack: do not beyond ARRAY_SIZE of data

Do not go beyond ARRAY_SIZE of data

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/smack/smackfs.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c
index 904af3483286f..8d3c2a051c7b1 100644
--- a/security/smack/smackfs.c
+++ b/security/smack/smackfs.c
@@ -776,7 +776,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
 	struct sockaddr_in newname;
 	char smack[SMK_LABELLEN];
 	char *sp;
-	char data[SMK_NETLBLADDRMAX];
+	char data[SMK_NETLBLADDRMAX + 1];
 	char *host = (char *)&newname.sin_addr.s_addr;
 	int rc;
 	struct netlbl_audit audit_info;
-- 
GitLab


From 7fc1e5c15fde0fa9d2c08441f6898a9e51593d47 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Thu, 21 May 2009 18:25:23 +0000
Subject: [PATCH 1243/2198] sh: clkfwk: beyond ARRAY_SIZE of onchip_ops for
 sh7722.

Do not go beyond ARRAY_SIZE of onchip_ops

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index ae78efd0582dc..c090c9a373f66 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -883,7 +883,7 @@ struct clk_ops *onchip_ops[] = {
 void __init
 arch_init_clk_ops(struct clk_ops **ops, int type)
 {
-	BUG_ON(type < 0 || type > ARRAY_SIZE(onchip_ops));
+	BUG_ON(type < 0 || type >= ARRAY_SIZE(onchip_ops));
 	*ops = onchip_ops[type];
 }
 
-- 
GitLab


From 2f3ed17e010e8c0873094016f93c1afbb4adb666 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 13:47:52 +0900
Subject: [PATCH 1244/2198] sh: Wrap irq_to_desc_alloc_cpu() around
 CONFIG_SPARSE_IRQ temporarily.

irq_to_desc_alloc_cpu() has been renamed to irq_to_desc_alloc_node() in
-next, but as we can not presently enable SPARSE_IRQ without the early
irq_desc alloc patch, protect it with an ifdef until the interface has
settled and we are ready to enable it system-wide.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/irq/ipr.c | 4 ++++
 drivers/sh/intc.c            | 5 ++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index fa0c8467a280e..6ad40dbad8812 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -59,17 +59,21 @@ void register_ipr_controller(struct ipr_desc *desc)
 
 	for (i = 0; i < desc->nr_irqs; i++) {
 		struct ipr_data *p = desc->ipr_data + i;
+#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
+#endif
 
 		BUG_ON(p->ipr_idx >= desc->nr_offsets);
 		BUG_ON(!desc->ipr_offsets[p->ipr_idx]);
 
+#ifdef CONFIG_SPARSE_IRQ
 		irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id());
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n",
 			       p->irq);
 			continue;
 		}
+#endif
 
 		disable_irq_nosync(p->irq);
 		set_irq_chip_and_handler_name(p->irq, &desc->chip,
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index 098b767e9afda..caf0656940424 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -771,16 +771,19 @@ void __init register_intc_controller(struct intc_desc *desc)
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
 		unsigned int irq = evt2irq(vect->vect);
+#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
-
+#endif
 		if (!vect->enum_id)
 			continue;
 
+#ifdef CONFIG_SPARSE_IRQ
 		irq_desc = irq_to_desc_alloc_cpu(irq, cpu);
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 			continue;
 		}
+#endif
 
 		intc_register_irq(desc, d, vect->enum_id, irq);
 	}
-- 
GitLab


From 62fad39be0662a924b60e4354b802525ceda0bb1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 13:50:18 +0900
Subject: [PATCH 1245/2198] sh: Add a NR_IRQS_LEGACY for external IRQ0-7.

This adds a NR_IRQS_LEGACY definition, which will be used by sparse irq.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/irq.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h
index d319baaf4fbdf..a2b8c99cc06f1 100644
--- a/arch/sh/include/asm/irq.h
+++ b/arch/sh/include/asm/irq.h
@@ -8,7 +8,8 @@
  * advised to cap this at the hard limit that they're interested in
  * through the machvec.
  */
-#define NR_IRQS 256
+#define NR_IRQS			256
+#define NR_IRQS_LEGACY		8	/* Legacy external IRQ0-7 */
 
 /*
  * Convert back and forth between INTEVT and IRQ values.
-- 
GitLab


From 36aa1e32f451b664adaf3fc9a77d8279b7a833b2 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 14:00:34 +0900
Subject: [PATCH 1246/2198] sh: clkfwk: Make clock-cpg usable for non-legacy
 platforms.

This adds a new SH_CLK_CPG for parts that have CPG support.
SH_CLK_CPG_LEGACY is made to depend on this, and still needs to be set
for platforms that want clock-cpg to register the legacy clocks. With
this new config item in place, it is now possible to start layering more
generic CPG code in place while other platforms transition off of the
legacy clocks.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                | 4 ++++
 arch/sh/kernel/cpu/Makefile    | 2 +-
 arch/sh/kernel/cpu/clock-cpg.c | 2 ++
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index df764c56b0508..c815975b8d75e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -513,7 +513,11 @@ config SH_PCLK_FREQ
 	  This is necessary for determining the reference clock value on
 	  platforms lacking an RTC.
 
+config SH_CLK_CPG
+	def_bool y
+
 config SH_CLK_CPG_LEGACY
+	depends on SH_CLK_CPG
 	def_bool y if !CPU_SUBTYPE_SH7785
 
 config SH_CLK_MD
diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile
index 05105b980c21b..eecad7cbd61e9 100644
--- a/arch/sh/kernel/cpu/Makefile
+++ b/arch/sh/kernel/cpu/Makefile
@@ -17,6 +17,6 @@ obj-$(CONFIG_ARCH_SHMOBILE)	+= shmobile/
 
 obj-$(CONFIG_UBC_WAKEUP)	+= ubc.o
 obj-$(CONFIG_SH_ADC)		+= adc.o
-obj-$(CONFIG_SH_CLK_CPG_LEGACY)	+= clock-cpg.o
+obj-$(CONFIG_SH_CLK_CPG)	+= clock-cpg.o
 
 obj-y	+= irq/ init.o clock.o
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index b4ca20048446e..b78c237ab3666 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -2,6 +2,7 @@
 #include <linux/compiler.h>
 #include <asm/clock.h>
 
+#ifdef CONFIG_SH_CLK_CPG_LEGACY
 static struct clk master_clk = {
 	.name		= "master_clk",
 	.flags		= CLK_ENABLE_ON_INIT,
@@ -58,3 +59,4 @@ int __init __weak arch_clk_init(void)
 {
 	return cpg_clk_init();
 }
+#endif /* CONFIG_SH_CPG_CLK_LEGACY */
-- 
GitLab


From 8fc40238b4ebf07cd11ca9707843338be22af72f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 14:21:03 +0900
Subject: [PATCH 1247/2198] sh: Prefer slab_is_available() over after_bootmem.

This kills off after_bootmem and switches to using slab_is_available()
instead. Presently the only place this is used is by the sh64 ioremap,
and there's not much point in keeping the reference around otherwise.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/mm/init.c       | 3 ---
 arch/sh/mm/ioremap_64.c | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 3edf297c829bd..ee8e6bbe882ca 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -184,7 +184,6 @@ void __init paging_init(void)
 }
 
 static struct kcore_list kcore_mem, kcore_vmalloc;
-int after_bootmem = 0;
 
 void __init mem_init(void)
 {
@@ -217,8 +216,6 @@ void __init mem_init(void)
 	memset(empty_zero_page, 0, PAGE_SIZE);
 	__flush_wback_region(empty_zero_page, PAGE_SIZE);
 
-	after_bootmem = 1;
-
 	codesize =  (unsigned long) &_etext - (unsigned long) &_text;
 	datasize =  (unsigned long) &_edata - (unsigned long) &_etext;
 	initsize =  (unsigned long) &__init_end - (unsigned long) &__init_begin;
diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c
index 2331229f81266..828c8597219da 100644
--- a/arch/sh/mm/ioremap_64.c
+++ b/arch/sh/mm/ioremap_64.c
@@ -20,6 +20,7 @@
 #include <linux/io.h>
 #include <linux/bootmem.h>
 #include <linux/proc_fs.h>
+#include <linux/slab.h>
 #include <asm/page.h>
 #include <asm/pgalloc.h>
 #include <asm/addrspace.h>
@@ -175,7 +176,7 @@ static __init_refok void *sh64_get_page(void)
 {
 	void *page;
 
-	if (after_bootmem)
+	if (slab_is_available())
 		page = (void *)get_zeroed_page(GFP_KERNEL);
 	else
 		page = alloc_bootmem_pages(PAGE_SIZE);
-- 
GitLab


From b9ed7252d219c1c663944bf03846eabb515dbe75 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Fri, 22 May 2009 09:25:32 +0200
Subject: [PATCH 1248/2198] xen-blkfront: beyond ARRAY_SIZE of info->shadow

Do not go beyond ARRAY_SIZE of info->shadow
Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Acked-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/xen-blkfront.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 6d4ac76c2806a..6d5950839bd04 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(blkif_io_lock);
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
-	BUG_ON(free > BLK_RING_SIZE);
+	BUG_ON(free >= BLK_RING_SIZE);
 	info->shadow_free = info->shadow[free].req.id;
 	info->shadow[free].req.id = 0x0fffffee; /* debug */
 	return free;
-- 
GitLab


From b1e71b0622974953e46a284aa986504a90869a9b Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:01:55 +0100
Subject: [PATCH 1249/2198] GFS2: Clean up some file names

This patch renames the ops_*.c files which have no counterpart
without the ops_ prefix in order to shorten the name and make
it more readable. In addition, ops_address.h (which was very
small) is moved into inode.h and inode.h is cleaned up by
adding extern where required.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile                   |  2 +-
 fs/gfs2/{ops_address.c => aops.c}  |  1 -
 fs/gfs2/bmap.c                     |  1 -
 fs/gfs2/{ops_dentry.c => dentry.c} |  0
 fs/gfs2/{ops_export.c => export.c} |  0
 fs/gfs2/{ops_file.c => file.c}     |  1 -
 fs/gfs2/inode.c                    |  1 -
 fs/gfs2/inode.h                    | 57 +++++++++++++++++-------------
 fs/gfs2/meta_io.c                  |  1 -
 fs/gfs2/ops_address.h              | 23 ------------
 fs/gfs2/quota.c                    |  1 -
 fs/gfs2/rgrp.c                     |  1 -
 12 files changed, 34 insertions(+), 55 deletions(-)
 rename fs/gfs2/{ops_address.c => aops.c} (99%)
 rename fs/gfs2/{ops_dentry.c => dentry.c} (100%)
 rename fs/gfs2/{ops_export.c => export.c} (100%)
 rename fs/gfs2/{ops_file.c => file.c} (99%)
 delete mode 100644 fs/gfs2/ops_address.h

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index a851ea4bdf703..4f7332c7682fd 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,7 +1,7 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
-	mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
+	mount.o aops.o dentry.o export.o file.o \
 	ops_fstype.o ops_inode.o ops_super.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c
similarity index 99%
rename from fs/gfs2/ops_address.c
rename to fs/gfs2/aops.c
index e5664210f0d88..03ebb439ace07 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/aops.c
@@ -28,7 +28,6 @@
 #include "inode.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_address.h"
 #include "quota.h"
 #include "trans.h"
 #include "rgrp.h"
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 253e1a39f8419..1153a078920c2 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -25,7 +25,6 @@
 #include "trans.h"
 #include "dir.h"
 #include "util.h"
-#include "ops_address.h"
 
 /* This doesn't need to be that large as max 64 bit pointers in a 4k
  * block is 512, so __u16 is fine for that. It saves stack space to
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c
similarity index 100%
rename from fs/gfs2/ops_dentry.c
rename to fs/gfs2/dentry.c
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c
similarity index 100%
rename from fs/gfs2/ops_export.c
rename to fs/gfs2/export.c
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c
similarity index 99%
rename from fs/gfs2/ops_file.c
rename to fs/gfs2/file.c
index 0ee7bd287c5a6..73b6f552f06dd 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/file.c
@@ -39,7 +39,6 @@
 #include "trans.h"
 #include "util.h"
 #include "eaops.h"
-#include "ops_address.h"
 
 /**
  * gfs2_llseek - seek to a location in a file
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 5a31d426116fb..c03a1a384e72b 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -30,7 +30,6 @@
 #include "inode.h"
 #include "log.h"
 #include "meta_io.h"
-#include "ops_address.h"
 #include "quota.h"
 #include "rgrp.h"
 #include "trans.h"
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index c30be2b66580f..2c3ec072d60e2 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -11,8 +11,16 @@
 #define __INODE_DOT_H__
 
 #include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/mm.h>
 #include "util.h"
 
+extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
+extern int gfs2_internal_read(struct gfs2_inode *ip,
+			      struct file_ra_state *ra_state,
+			      char *buf, loff_t *pos, unsigned size);
+extern void gfs2_set_aops(struct inode *inode);
+
 static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
 {
 	return !ip->i_height;
@@ -73,30 +81,31 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 }
 
 
-void gfs2_set_iop(struct inode *inode);
-struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
-				u64 no_addr, u64 no_formal_ino,
-				int skip_freeing);
-struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
-
-int gfs2_inode_refresh(struct gfs2_inode *ip);
-
-int gfs2_dinode_dealloc(struct gfs2_inode *inode);
-int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
-struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
-			   int is_root);
-struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
-			   unsigned int mode, dev_t dev);
-int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		struct gfs2_inode *ip);
-int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-		   const struct gfs2_inode *ip);
-int gfs2_permission(struct inode *inode, int mask);
-int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
-int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
-struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
-void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
-void gfs2_dinode_print(const struct gfs2_inode *ip);
+extern void gfs2_set_iop(struct inode *inode);
+extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
+				       u64 no_addr, u64 no_formal_ino,
+				       int skip_freeing);
+extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
+
+extern int gfs2_inode_refresh(struct gfs2_inode *ip);
+
+extern int gfs2_dinode_dealloc(struct gfs2_inode *inode);
+extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff);
+extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
+				  int is_root);
+extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
+				  const struct qstr *name,
+				  unsigned int mode, dev_t dev);
+extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+		       struct gfs2_inode *ip);
+extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+			  const struct gfs2_inode *ip);
+extern int gfs2_permission(struct inode *inode, int mask);
+extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
+extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
+extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
+extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
+extern void gfs2_dinode_print(const struct gfs2_inode *ip);
 
 extern const struct inode_operations gfs2_file_iops;
 extern const struct inode_operations gfs2_dir_iops;
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 78a5f43126677..cb8d7a93d5ec2 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -31,7 +31,6 @@
 #include "rgrp.h"
 #include "trans.h"
 #include "util.h"
-#include "ops_address.h"
 
 static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc)
 {
diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h
deleted file mode 100644
index 5da21285bba45..0000000000000
--- a/fs/gfs2/ops_address.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#ifndef __OPS_ADDRESS_DOT_H__
-#define __OPS_ADDRESS_DOT_H__
-
-#include <linux/fs.h>
-#include <linux/buffer_head.h>
-#include <linux/mm.h>
-
-extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask);
-extern int gfs2_internal_read(struct gfs2_inode *ip,
-			      struct file_ra_state *ra_state,
-			      char *buf, loff_t *pos, unsigned size);
-extern void gfs2_set_aops(struct inode *inode);
-
-#endif /* __OPS_ADDRESS_DOT_H__ */
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 152e6c4a0dca2..2e9b9326bfc96 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -60,7 +60,6 @@
 #include "super.h"
 #include "trans.h"
 #include "inode.h"
-#include "ops_address.h"
 #include "util.h"
 
 #define QUOTA_USER 1
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index ee3d5c1876a32..6122c7ee3648a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -29,7 +29,6 @@
 #include "util.h"
 #include "log.h"
 #include "inode.h"
-#include "ops_address.h"
 
 #define BFITNOENT ((u32)~0)
 #define NO_BLOCK ((u64)~0)
-- 
GitLab


From 9e6e0a128bca0a151d8d3fbd9459b22fc21cfebb Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:36:01 +0100
Subject: [PATCH 1250/2198] GFS2: Merge mount.c and ops_super.c into super.c

mount.c only contained a single function, so is not really
worth retaining on its own. All of the super related code
is now either in super.c or ops_fstype.c

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/Makefile    |   4 +-
 fs/gfs2/mount.c     | 195 ----------
 fs/gfs2/ops_super.c | 757 -------------------------------------
 fs/gfs2/super.c     | 903 +++++++++++++++++++++++++++++++++++++++++++-
 4 files changed, 903 insertions(+), 956 deletions(-)
 delete mode 100644 fs/gfs2/mount.c
 delete mode 100644 fs/gfs2/ops_super.c

diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 4f7332c7682fd..d53a9bea1c2f2 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,8 +1,8 @@
 obj-$(CONFIG_GFS2_FS) += gfs2.o
 gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \
 	glops.o inode.o log.o lops.o main.o meta_io.o \
-	mount.o aops.o dentry.o export.o file.o \
-	ops_fstype.o ops_inode.o ops_super.o quota.o \
+	aops.o dentry.o export.o file.o \
+	ops_fstype.o ops_inode.o quota.o \
 	recovery.o rgrp.o super.o sys.o trans.o util.o
 
 gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c
deleted file mode 100644
index 947af151fa244..0000000000000
--- a/fs/gfs2/mount.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/parser.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "super.h"
-#include "sys.h"
-#include "util.h"
-
-enum {
-	Opt_lockproto,
-	Opt_locktable,
-	Opt_hostdata,
-	Opt_spectator,
-	Opt_ignore_local_fs,
-	Opt_localflocks,
-	Opt_localcaching,
-	Opt_debug,
-	Opt_nodebug,
-	Opt_upgrade,
-	Opt_acl,
-	Opt_noacl,
-	Opt_quota_off,
-	Opt_quota_account,
-	Opt_quota_on,
-	Opt_quota,
-	Opt_noquota,
-	Opt_suiddir,
-	Opt_nosuiddir,
-	Opt_data_writeback,
-	Opt_data_ordered,
-	Opt_meta,
-	Opt_discard,
-	Opt_nodiscard,
-	Opt_commit,
-	Opt_err,
-};
-
-static const match_table_t tokens = {
-	{Opt_lockproto, "lockproto=%s"},
-	{Opt_locktable, "locktable=%s"},
-	{Opt_hostdata, "hostdata=%s"},
-	{Opt_spectator, "spectator"},
-	{Opt_ignore_local_fs, "ignore_local_fs"},
-	{Opt_localflocks, "localflocks"},
-	{Opt_localcaching, "localcaching"},
-	{Opt_debug, "debug"},
-	{Opt_nodebug, "nodebug"},
-	{Opt_upgrade, "upgrade"},
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-	{Opt_quota_off, "quota=off"},
-	{Opt_quota_account, "quota=account"},
-	{Opt_quota_on, "quota=on"},
-	{Opt_quota, "quota"},
-	{Opt_noquota, "noquota"},
-	{Opt_suiddir, "suiddir"},
-	{Opt_nosuiddir, "nosuiddir"},
-	{Opt_data_writeback, "data=writeback"},
-	{Opt_data_ordered, "data=ordered"},
-	{Opt_meta, "meta"},
-	{Opt_discard, "discard"},
-	{Opt_nodiscard, "nodiscard"},
-	{Opt_commit, "commit=%d"},
-	{Opt_err, NULL}
-};
-
-/**
- * gfs2_mount_args - Parse mount options
- * @sdp:
- * @data:
- *
- * Return: errno
- */
-
-int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
-{
-	char *o;
-	int token;
-	substring_t tmp[MAX_OPT_ARGS];
-	int rv;
-
-	/* Split the options into tokens with the "," character and
-	   process them */
-
-	while (1) {
-		o = strsep(&options, ",");
-		if (o == NULL)
-			break;
-		if (*o == '\0')
-			continue;
-
-		token = match_token(o, tokens, tmp);
-		switch (token) {
-		case Opt_lockproto:
-			match_strlcpy(args->ar_lockproto, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_locktable:
-			match_strlcpy(args->ar_locktable, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_hostdata:
-			match_strlcpy(args->ar_hostdata, &tmp[0],
-				      GFS2_LOCKNAME_LEN);
-			break;
-		case Opt_spectator:
-			args->ar_spectator = 1;
-			break;
-		case Opt_ignore_local_fs:
-			args->ar_ignore_local_fs = 1;
-			break;
-		case Opt_localflocks:
-			args->ar_localflocks = 1;
-			break;
-		case Opt_localcaching:
-			args->ar_localcaching = 1;
-			break;
-		case Opt_debug:
-			args->ar_debug = 1;
-			break;
-		case Opt_nodebug:
-			args->ar_debug = 0;
-			break;
-		case Opt_upgrade:
-			args->ar_upgrade = 1;
-			break;
-		case Opt_acl:
-			args->ar_posix_acl = 1;
-			break;
-		case Opt_noacl:
-			args->ar_posix_acl = 0;
-			break;
-		case Opt_quota_off:
-		case Opt_noquota:
-			args->ar_quota = GFS2_QUOTA_OFF;
-			break;
-		case Opt_quota_account:
-			args->ar_quota = GFS2_QUOTA_ACCOUNT;
-			break;
-		case Opt_quota_on:
-		case Opt_quota:
-			args->ar_quota = GFS2_QUOTA_ON;
-			break;
-		case Opt_suiddir:
-			args->ar_suiddir = 1;
-			break;
-		case Opt_nosuiddir:
-			args->ar_suiddir = 0;
-			break;
-		case Opt_data_writeback:
-			args->ar_data = GFS2_DATA_WRITEBACK;
-			break;
-		case Opt_data_ordered:
-			args->ar_data = GFS2_DATA_ORDERED;
-			break;
-		case Opt_meta:
-			args->ar_meta = 1;
-			break;
-		case Opt_discard:
-			args->ar_discard = 1;
-			break;
-		case Opt_nodiscard:
-			args->ar_discard = 0;
-			break;
-		case Opt_commit:
-			rv = match_int(&tmp[0], &args->ar_commit);
-			if (rv || args->ar_commit <= 0) {
-				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
-				return rv ? rv : -EINVAL;
-			}
-			break;
-		case Opt_err:
-		default:
-			fs_info(sdp, "invalid mount option: %s\n", o);
-			return -EINVAL;
-		}
-	}
-
-	return 0;
-}
-
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
deleted file mode 100644
index 2fd1dcbcc5b74..0000000000000
--- a/fs/gfs2/ops_super.c
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
- * Copyright (C) 2004-2008 Red Hat, Inc.  All rights reserved.
- *
- * This copyrighted material is made available to anyone wishing to use,
- * modify, copy, or redistribute it subject to the terms and conditions
- * of the GNU General Public License version 2.
- */
-
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/spinlock.h>
-#include <linux/completion.h>
-#include <linux/buffer_head.h>
-#include <linux/statfs.h>
-#include <linux/seq_file.h>
-#include <linux/mount.h>
-#include <linux/kthread.h>
-#include <linux/delay.h>
-#include <linux/gfs2_ondisk.h>
-#include <linux/crc32.h>
-#include <linux/time.h>
-
-#include "gfs2.h"
-#include "incore.h"
-#include "glock.h"
-#include "inode.h"
-#include "log.h"
-#include "quota.h"
-#include "recovery.h"
-#include "rgrp.h"
-#include "super.h"
-#include "sys.h"
-#include "util.h"
-#include "trans.h"
-#include "dir.h"
-#include "eattr.h"
-#include "bmap.h"
-#include "meta_io.h"
-
-#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
-
-/**
- * gfs2_write_inode - Make sure the inode is stable on the disk
- * @inode: The inode
- * @sync: synchronous write flag
- *
- * Returns: errno
- */
-
-static int gfs2_write_inode(struct inode *inode, int sync)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_holder gh;
-	struct buffer_head *bh;
-	struct timespec atime;
-	struct gfs2_dinode *di;
-	int ret = 0;
-
-	/* Check this is a "normal" inode, etc */
-	if (!test_bit(GIF_USER, &ip->i_flags) ||
-	    (current->flags & PF_MEMALLOC))
-		return 0;
-	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (ret)
-		goto do_flush;
-	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
-	if (ret)
-		goto do_unlock;
-	ret = gfs2_meta_inode_buffer(ip, &bh);
-	if (ret == 0) {
-		di = (struct gfs2_dinode *)bh->b_data;
-		atime.tv_sec = be64_to_cpu(di->di_atime);
-		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
-		if (timespec_compare(&inode->i_atime, &atime) > 0) {
-			gfs2_trans_add_bh(ip->i_gl, bh, 1);
-			gfs2_dinode_out(ip, bh->b_data);
-		}
-		brelse(bh);
-	}
-	gfs2_trans_end(sdp);
-do_unlock:
-	gfs2_glock_dq_uninit(&gh);
-do_flush:
-	if (sync != 0)
-		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
-	return ret;
-}
-
-/**
- * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
- * @sdp: the filesystem
- *
- * Returns: errno
- */
-
-static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
-{
-	struct gfs2_holder t_gh;
-	int error;
-
-	gfs2_quota_sync(sdp);
-	gfs2_statfs_sync(sdp);
-
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-				   &t_gh);
-	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return error;
-
-	gfs2_meta_syncfs(sdp);
-	gfs2_log_shutdown(sdp);
-
-	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-
-	if (t_gh.gh_gl)
-		gfs2_glock_dq_uninit(&t_gh);
-
-	gfs2_quota_cleanup(sdp);
-
-	return error;
-}
-
-static int gfs2_umount_recovery_wait(void *word)
-{
-	schedule();
-	return 0;
-}
-
-/**
- * gfs2_put_super - Unmount the filesystem
- * @sb: The VFS superblock
- *
- */
-
-static void gfs2_put_super(struct super_block *sb)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	int error;
-	struct gfs2_jdesc *jd;
-
-	/*  Unfreeze the filesystem, if we need to  */
-
-	mutex_lock(&sdp->sd_freeze_lock);
-	if (sdp->sd_freeze_count)
-		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
-	mutex_unlock(&sdp->sd_freeze_lock);
-
-	/* No more recovery requests */
-	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
-	smp_mb();
-
-	/* Wait on outstanding recovery */
-restart:
-	spin_lock(&sdp->sd_jindex_spin);
-	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
-		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
-			continue;
-		spin_unlock(&sdp->sd_jindex_spin);
-		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
-			    gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
-		goto restart;
-	}
-	spin_unlock(&sdp->sd_jindex_spin);
-
-	kthread_stop(sdp->sd_quotad_process);
-	kthread_stop(sdp->sd_logd_process);
-
-	if (!(sb->s_flags & MS_RDONLY)) {
-		error = gfs2_make_fs_ro(sdp);
-		if (error)
-			gfs2_io_error(sdp);
-	}
-	/*  At this point, we're through modifying the disk  */
-
-	/*  Release stuff  */
-
-	iput(sdp->sd_jindex);
-	iput(sdp->sd_inum_inode);
-	iput(sdp->sd_statfs_inode);
-	iput(sdp->sd_rindex);
-	iput(sdp->sd_quota_inode);
-
-	gfs2_glock_put(sdp->sd_rename_gl);
-	gfs2_glock_put(sdp->sd_trans_gl);
-
-	if (!sdp->sd_args.ar_spectator) {
-		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
-		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
-		iput(sdp->sd_ir_inode);
-		iput(sdp->sd_sc_inode);
-		iput(sdp->sd_qc_inode);
-	}
-
-	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
-	gfs2_clear_rgrpd(sdp);
-	gfs2_jindex_free(sdp);
-	/*  Take apart glock structures and buffer lists  */
-	gfs2_gl_hash_clear(sdp);
-	/*  Unmount the locking protocol  */
-	gfs2_lm_unmount(sdp);
-
-	/*  At this point, we're through participating in the lockspace  */
-	gfs2_sys_fs_del(sdp);
-}
-
-/**
- * gfs2_write_super
- * @sb: the superblock
- *
- */
-
-static void gfs2_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
-/**
- * gfs2_sync_fs - sync the filesystem
- * @sb: the superblock
- *
- * Flushes the log to disk.
- */
-
-static int gfs2_sync_fs(struct super_block *sb, int wait)
-{
-	sb->s_dirt = 0;
-	if (wait && sb->s_fs_info)
-		gfs2_log_flush(sb->s_fs_info, NULL);
-	return 0;
-}
-
-/**
- * gfs2_freeze - prevent further writes to the filesystem
- * @sb: the VFS structure for the filesystem
- *
- */
-
-static int gfs2_freeze(struct super_block *sb)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	int error;
-
-	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return -EINVAL;
-
-	for (;;) {
-		error = gfs2_freeze_fs(sdp);
-		if (!error)
-			break;
-
-		switch (error) {
-		case -EBUSY:
-			fs_err(sdp, "waiting for recovery before freeze\n");
-			break;
-
-		default:
-			fs_err(sdp, "error freezing FS: %d\n", error);
-			break;
-		}
-
-		fs_err(sdp, "retrying...\n");
-		msleep(1000);
-	}
-	return 0;
-}
-
-/**
- * gfs2_unfreeze - reallow writes to the filesystem
- * @sb: the VFS structure for the filesystem
- *
- */
-
-static int gfs2_unfreeze(struct super_block *sb)
-{
-	gfs2_unfreeze_fs(sb->s_fs_info);
-	return 0;
-}
-
-/**
- * statfs_fill - fill in the sg for a given RG
- * @rgd: the RG
- * @sc: the sc structure
- *
- * Returns: 0 on success, -ESTALE if the LVB is invalid
- */
-
-static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
-			    struct gfs2_statfs_change_host *sc)
-{
-	gfs2_rgrp_verify(rgd);
-	sc->sc_total += rgd->rd_data;
-	sc->sc_free += rgd->rd_free;
-	sc->sc_dinodes += rgd->rd_dinodes;
-	return 0;
-}
-
-/**
- * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
- * @sdp: the filesystem
- * @sc: the sc info that will be returned
- *
- * Any error (other than a signal) will cause this routine to fall back
- * to the synchronous version.
- *
- * FIXME: This really shouldn't busy wait like this.
- *
- * Returns: errno
- */
-
-static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_holder ri_gh;
-	struct gfs2_rgrpd *rgd_next;
-	struct gfs2_holder *gha, *gh;
-	unsigned int slots = 64;
-	unsigned int x;
-	int done;
-	int error = 0, err;
-
-	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
-	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
-	if (!gha)
-		return -ENOMEM;
-
-	error = gfs2_rindex_hold(sdp, &ri_gh);
-	if (error)
-		goto out;
-
-	rgd_next = gfs2_rgrpd_get_first(sdp);
-
-	for (;;) {
-		done = 1;
-
-		for (x = 0; x < slots; x++) {
-			gh = gha + x;
-
-			if (gh->gh_gl && gfs2_glock_poll(gh)) {
-				err = gfs2_glock_wait(gh);
-				if (err) {
-					gfs2_holder_uninit(gh);
-					error = err;
-				} else {
-					if (!error)
-						error = statfs_slow_fill(
-							gh->gh_gl->gl_object, sc);
-					gfs2_glock_dq_uninit(gh);
-				}
-			}
-
-			if (gh->gh_gl)
-				done = 0;
-			else if (rgd_next && !error) {
-				error = gfs2_glock_nq_init(rgd_next->rd_gl,
-							   LM_ST_SHARED,
-							   GL_ASYNC,
-							   gh);
-				rgd_next = gfs2_rgrpd_get_next(rgd_next);
-				done = 0;
-			}
-
-			if (signal_pending(current))
-				error = -ERESTARTSYS;
-		}
-
-		if (done)
-			break;
-
-		yield();
-	}
-
-	gfs2_glock_dq_uninit(&ri_gh);
-
-out:
-	kfree(gha);
-	return error;
-}
-
-/**
- * gfs2_statfs_i - Do a statfs
- * @sdp: the filesystem
- * @sg: the sg structure
- *
- * Returns: errno
- */
-
-static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
-{
-	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
-	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
-
-	spin_lock(&sdp->sd_statfs_spin);
-
-	*sc = *m_sc;
-	sc->sc_total += l_sc->sc_total;
-	sc->sc_free += l_sc->sc_free;
-	sc->sc_dinodes += l_sc->sc_dinodes;
-
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	if (sc->sc_free < 0)
-		sc->sc_free = 0;
-	if (sc->sc_free > sc->sc_total)
-		sc->sc_free = sc->sc_total;
-	if (sc->sc_dinodes < 0)
-		sc->sc_dinodes = 0;
-
-	return 0;
-}
-
-/**
- * gfs2_statfs - Gather and return stats about the filesystem
- * @sb: The superblock
- * @statfsbuf: The buffer
- *
- * Returns: 0 on success or error code
- */
-
-static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	struct super_block *sb = dentry->d_inode->i_sb;
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_statfs_change_host sc;
-	int error;
-
-	if (gfs2_tune_get(sdp, gt_statfs_slow))
-		error = gfs2_statfs_slow(sdp, &sc);
-	else
-		error = gfs2_statfs_i(sdp, &sc);
-
-	if (error)
-		return error;
-
-	buf->f_type = GFS2_MAGIC;
-	buf->f_bsize = sdp->sd_sb.sb_bsize;
-	buf->f_blocks = sc.sc_total;
-	buf->f_bfree = sc.sc_free;
-	buf->f_bavail = sc.sc_free;
-	buf->f_files = sc.sc_dinodes + sc.sc_free;
-	buf->f_ffree = sc.sc_free;
-	buf->f_namelen = GFS2_FNAMESIZE;
-
-	return 0;
-}
-
-/**
- * gfs2_remount_fs - called when the FS is remounted
- * @sb:  the filesystem
- * @flags:  the remount flags
- * @data:  extra data passed in (not used right now)
- *
- * Returns: errno
- */
-
-static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	struct gfs2_sbd *sdp = sb->s_fs_info;
-	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
-	struct gfs2_tune *gt = &sdp->sd_tune;
-	int error;
-
-	spin_lock(&gt->gt_spin);
-	args.ar_commit = gt->gt_log_flush_secs;
-	spin_unlock(&gt->gt_spin);
-	error = gfs2_mount_args(sdp, &args, data);
-	if (error)
-		return error;
-
-	/* Not allowed to change locking details */
-	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
-	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
-	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
-		return -EINVAL;
-
-	/* Some flags must not be changed */
-	if (args_neq(&args, &sdp->sd_args, spectator) ||
-	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
-	    args_neq(&args, &sdp->sd_args, localflocks) ||
-	    args_neq(&args, &sdp->sd_args, localcaching) ||
-	    args_neq(&args, &sdp->sd_args, meta))
-		return -EINVAL;
-
-	if (sdp->sd_args.ar_spectator)
-		*flags |= MS_RDONLY;
-
-	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
-		if (*flags & MS_RDONLY)
-			error = gfs2_make_fs_ro(sdp);
-		else
-			error = gfs2_make_fs_rw(sdp);
-		if (error)
-			return error;
-	}
-
-	sdp->sd_args = args;
-	if (sdp->sd_args.ar_posix_acl)
-		sb->s_flags |= MS_POSIXACL;
-	else
-		sb->s_flags &= ~MS_POSIXACL;
-	spin_lock(&gt->gt_spin);
-	gt->gt_log_flush_secs = args.ar_commit;
-	spin_unlock(&gt->gt_spin);
-
-	return 0;
-}
-
-/**
- * gfs2_drop_inode - Drop an inode (test for remote unlink)
- * @inode: The inode to drop
- *
- * If we've received a callback on an iopen lock then its because a
- * remote node tried to deallocate the inode but failed due to this node
- * still having the inode open. Here we mark the link count zero
- * since we know that it must have reached zero if the GLF_DEMOTE flag
- * is set on the iopen glock. If we didn't do a disk read since the
- * remote node removed the final link then we might otherwise miss
- * this event. This check ensures that this node will deallocate the
- * inode's blocks, or alternatively pass the baton on to another
- * node for later deallocation.
- */
-
-static void gfs2_drop_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
-		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
-		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
-			clear_nlink(inode);
-	}
-	generic_drop_inode(inode);
-}
-
-/**
- * gfs2_clear_inode - Deallocate an inode when VFS is done with it
- * @inode: The VFS inode
- *
- */
-
-static void gfs2_clear_inode(struct inode *inode)
-{
-	struct gfs2_inode *ip = GFS2_I(inode);
-
-	/* This tells us its a "real" inode and not one which only
-	 * serves to contain an address space (see rgrp.c, meta_io.c)
-	 * which therefore doesn't have its own glocks.
-	 */
-	if (test_bit(GIF_USER, &ip->i_flags)) {
-		ip->i_gl->gl_object = NULL;
-		gfs2_glock_put(ip->i_gl);
-		ip->i_gl = NULL;
-		if (ip->i_iopen_gh.gh_gl) {
-			ip->i_iopen_gh.gh_gl->gl_object = NULL;
-			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		}
-	}
-}
-
-static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
-{
-	do {
-		if (d1 == d2)
-			return 1;
-		d1 = d1->d_parent;
-	} while (!IS_ROOT(d1));
-	return 0;
-}
-
-/**
- * gfs2_show_options - Show mount options for /proc/mounts
- * @s: seq_file structure
- * @mnt: vfsmount
- *
- * Returns: 0 on success or error code
- */
-
-static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
-{
-	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
-	struct gfs2_args *args = &sdp->sd_args;
-	int lfsecs;
-
-	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
-		seq_printf(s, ",meta");
-	if (args->ar_lockproto[0])
-		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
-	if (args->ar_locktable[0])
-		seq_printf(s, ",locktable=%s", args->ar_locktable);
-	if (args->ar_hostdata[0])
-		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
-	if (args->ar_spectator)
-		seq_printf(s, ",spectator");
-	if (args->ar_ignore_local_fs)
-		seq_printf(s, ",ignore_local_fs");
-	if (args->ar_localflocks)
-		seq_printf(s, ",localflocks");
-	if (args->ar_localcaching)
-		seq_printf(s, ",localcaching");
-	if (args->ar_debug)
-		seq_printf(s, ",debug");
-	if (args->ar_upgrade)
-		seq_printf(s, ",upgrade");
-	if (args->ar_posix_acl)
-		seq_printf(s, ",acl");
-	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
-		char *state;
-		switch (args->ar_quota) {
-		case GFS2_QUOTA_OFF:
-			state = "off";
-			break;
-		case GFS2_QUOTA_ACCOUNT:
-			state = "account";
-			break;
-		case GFS2_QUOTA_ON:
-			state = "on";
-			break;
-		default:
-			state = "unknown";
-			break;
-		}
-		seq_printf(s, ",quota=%s", state);
-	}
-	if (args->ar_suiddir)
-		seq_printf(s, ",suiddir");
-	if (args->ar_data != GFS2_DATA_DEFAULT) {
-		char *state;
-		switch (args->ar_data) {
-		case GFS2_DATA_WRITEBACK:
-			state = "writeback";
-			break;
-		case GFS2_DATA_ORDERED:
-			state = "ordered";
-			break;
-		default:
-			state = "unknown";
-			break;
-		}
-		seq_printf(s, ",data=%s", state);
-	}
-	if (args->ar_discard)
-		seq_printf(s, ",discard");
-	lfsecs = sdp->sd_tune.gt_log_flush_secs;
-	if (lfsecs != 60)
-		seq_printf(s, ",commit=%d", lfsecs);
-	return 0;
-}
-
-/*
- * We have to (at the moment) hold the inodes main lock to cover
- * the gap between unlocking the shared lock on the iopen lock and
- * taking the exclusive lock. I'd rather do a shared -> exclusive
- * conversion on the iopen lock, but we can change that later. This
- * is safe, just less efficient.
- */
-
-static void gfs2_delete_inode(struct inode *inode)
-{
-	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_holder gh;
-	int error;
-
-	if (!test_bit(GIF_USER, &ip->i_flags))
-		goto out;
-
-	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
-	if (unlikely(error)) {
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
-		goto out;
-	}
-
-	gfs2_glock_dq_wait(&ip->i_iopen_gh);
-	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
-	error = gfs2_glock_nq(&ip->i_iopen_gh);
-	if (error)
-		goto out_truncate;
-
-	if (S_ISDIR(inode->i_mode) &&
-	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
-		error = gfs2_dir_exhash_dealloc(ip);
-		if (error)
-			goto out_unlock;
-	}
-
-	if (ip->i_eattr) {
-		error = gfs2_ea_dealloc(ip);
-		if (error)
-			goto out_unlock;
-	}
-
-	if (!gfs2_is_stuffed(ip)) {
-		error = gfs2_file_dealloc(ip);
-		if (error)
-			goto out_unlock;
-	}
-
-	error = gfs2_dinode_dealloc(ip);
-	if (error)
-		goto out_unlock;
-
-out_truncate:
-	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
-	if (error)
-		goto out_unlock;
-	/* Needs to be done before glock release & also in a transaction */
-	truncate_inode_pages(&inode->i_data, 0);
-	gfs2_trans_end(sdp);
-
-out_unlock:
-	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
-		gfs2_glock_dq(&ip->i_iopen_gh);
-	gfs2_holder_uninit(&ip->i_iopen_gh);
-	gfs2_glock_dq_uninit(&gh);
-	if (error && error != GLR_TRYFAILED && error != -EROFS)
-		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
-out:
-	truncate_inode_pages(&inode->i_data, 0);
-	clear_inode(inode);
-}
-
-static struct inode *gfs2_alloc_inode(struct super_block *sb)
-{
-	struct gfs2_inode *ip;
-
-	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
-	if (ip) {
-		ip->i_flags = 0;
-		ip->i_gl = NULL;
-	}
-	return &ip->i_inode;
-}
-
-static void gfs2_destroy_inode(struct inode *inode)
-{
-	kmem_cache_free(gfs2_inode_cachep, inode);
-}
-
-const struct super_operations gfs2_super_ops = {
-	.alloc_inode		= gfs2_alloc_inode,
-	.destroy_inode		= gfs2_destroy_inode,
-	.write_inode		= gfs2_write_inode,
-	.delete_inode		= gfs2_delete_inode,
-	.put_super		= gfs2_put_super,
-	.write_super		= gfs2_write_super,
-	.sync_fs		= gfs2_sync_fs,
-	.freeze_fs 		= gfs2_freeze,
-	.unfreeze_fs		= gfs2_unfreeze,
-	.statfs			= gfs2_statfs,
-	.remount_fs		= gfs2_remount_fs,
-	.clear_inode		= gfs2_clear_inode,
-	.drop_inode		= gfs2_drop_inode,
-	.show_options		= gfs2_show_options,
-};
-
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 601913e0a4829..40bcc37e5a700 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -7,14 +7,20 @@
  * of the GNU General Public License version 2.
  */
 
+#include <linux/bio.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/completion.h>
 #include <linux/buffer_head.h>
-#include <linux/crc32.h>
+#include <linux/statfs.h>
+#include <linux/seq_file.h>
+#include <linux/mount.h>
+#include <linux/kthread.h>
+#include <linux/delay.h>
 #include <linux/gfs2_ondisk.h>
-#include <linux/bio.h>
+#include <linux/crc32.h>
+#include <linux/time.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -31,6 +37,183 @@
 #include "super.h"
 #include "trans.h"
 #include "util.h"
+#include "sys.h"
+#include "eattr.h"
+
+#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x)
+
+enum {
+	Opt_lockproto,
+	Opt_locktable,
+	Opt_hostdata,
+	Opt_spectator,
+	Opt_ignore_local_fs,
+	Opt_localflocks,
+	Opt_localcaching,
+	Opt_debug,
+	Opt_nodebug,
+	Opt_upgrade,
+	Opt_acl,
+	Opt_noacl,
+	Opt_quota_off,
+	Opt_quota_account,
+	Opt_quota_on,
+	Opt_quota,
+	Opt_noquota,
+	Opt_suiddir,
+	Opt_nosuiddir,
+	Opt_data_writeback,
+	Opt_data_ordered,
+	Opt_meta,
+	Opt_discard,
+	Opt_nodiscard,
+	Opt_commit,
+	Opt_error,
+};
+
+static const match_table_t tokens = {
+	{Opt_lockproto, "lockproto=%s"},
+	{Opt_locktable, "locktable=%s"},
+	{Opt_hostdata, "hostdata=%s"},
+	{Opt_spectator, "spectator"},
+	{Opt_ignore_local_fs, "ignore_local_fs"},
+	{Opt_localflocks, "localflocks"},
+	{Opt_localcaching, "localcaching"},
+	{Opt_debug, "debug"},
+	{Opt_nodebug, "nodebug"},
+	{Opt_upgrade, "upgrade"},
+	{Opt_acl, "acl"},
+	{Opt_noacl, "noacl"},
+	{Opt_quota_off, "quota=off"},
+	{Opt_quota_account, "quota=account"},
+	{Opt_quota_on, "quota=on"},
+	{Opt_quota, "quota"},
+	{Opt_noquota, "noquota"},
+	{Opt_suiddir, "suiddir"},
+	{Opt_nosuiddir, "nosuiddir"},
+	{Opt_data_writeback, "data=writeback"},
+	{Opt_data_ordered, "data=ordered"},
+	{Opt_meta, "meta"},
+	{Opt_discard, "discard"},
+	{Opt_nodiscard, "nodiscard"},
+	{Opt_commit, "commit=%d"},
+	{Opt_error, NULL}
+};
+
+/**
+ * gfs2_mount_args - Parse mount options
+ * @sdp:
+ * @data:
+ *
+ * Return: errno
+ */
+
+int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options)
+{
+	char *o;
+	int token;
+	substring_t tmp[MAX_OPT_ARGS];
+	int rv;
+
+	/* Split the options into tokens with the "," character and
+	   process them */
+
+	while (1) {
+		o = strsep(&options, ",");
+		if (o == NULL)
+			break;
+		if (*o == '\0')
+			continue;
+
+		token = match_token(o, tokens, tmp);
+		switch (token) {
+		case Opt_lockproto:
+			match_strlcpy(args->ar_lockproto, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_locktable:
+			match_strlcpy(args->ar_locktable, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_hostdata:
+			match_strlcpy(args->ar_hostdata, &tmp[0],
+				      GFS2_LOCKNAME_LEN);
+			break;
+		case Opt_spectator:
+			args->ar_spectator = 1;
+			break;
+		case Opt_ignore_local_fs:
+			args->ar_ignore_local_fs = 1;
+			break;
+		case Opt_localflocks:
+			args->ar_localflocks = 1;
+			break;
+		case Opt_localcaching:
+			args->ar_localcaching = 1;
+			break;
+		case Opt_debug:
+			args->ar_debug = 1;
+			break;
+		case Opt_nodebug:
+			args->ar_debug = 0;
+			break;
+		case Opt_upgrade:
+			args->ar_upgrade = 1;
+			break;
+		case Opt_acl:
+			args->ar_posix_acl = 1;
+			break;
+		case Opt_noacl:
+			args->ar_posix_acl = 0;
+			break;
+		case Opt_quota_off:
+		case Opt_noquota:
+			args->ar_quota = GFS2_QUOTA_OFF;
+			break;
+		case Opt_quota_account:
+			args->ar_quota = GFS2_QUOTA_ACCOUNT;
+			break;
+		case Opt_quota_on:
+		case Opt_quota:
+			args->ar_quota = GFS2_QUOTA_ON;
+			break;
+		case Opt_suiddir:
+			args->ar_suiddir = 1;
+			break;
+		case Opt_nosuiddir:
+			args->ar_suiddir = 0;
+			break;
+		case Opt_data_writeback:
+			args->ar_data = GFS2_DATA_WRITEBACK;
+			break;
+		case Opt_data_ordered:
+			args->ar_data = GFS2_DATA_ORDERED;
+			break;
+		case Opt_meta:
+			args->ar_meta = 1;
+			break;
+		case Opt_discard:
+			args->ar_discard = 1;
+			break;
+		case Opt_nodiscard:
+			args->ar_discard = 0;
+			break;
+		case Opt_commit:
+			rv = match_int(&tmp[0], &args->ar_commit);
+			if (rv || args->ar_commit <= 0) {
+				fs_info(sdp, "commit mount option requires a positive numeric argument\n");
+				return rv ? rv : -EINVAL;
+			}
+			break;
+		case Opt_error:
+		default:
+			fs_info(sdp, "invalid mount option: %s\n", o);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
 
 /**
  * gfs2_jindex_free - Clear all the journal index information
@@ -436,3 +619,719 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp)
 	mutex_unlock(&sdp->sd_freeze_lock);
 }
 
+
+/**
+ * gfs2_write_inode - Make sure the inode is stable on the disk
+ * @inode: The inode
+ * @sync: synchronous write flag
+ *
+ * Returns: errno
+ */
+
+static int gfs2_write_inode(struct inode *inode, int sync)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
+	struct gfs2_holder gh;
+	struct buffer_head *bh;
+	struct timespec atime;
+	struct gfs2_dinode *di;
+	int ret = 0;
+
+	/* Check this is a "normal" inode, etc */
+	if (!test_bit(GIF_USER, &ip->i_flags) ||
+	    (current->flags & PF_MEMALLOC))
+		return 0;
+	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (ret)
+		goto do_flush;
+	ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
+	if (ret)
+		goto do_unlock;
+	ret = gfs2_meta_inode_buffer(ip, &bh);
+	if (ret == 0) {
+		di = (struct gfs2_dinode *)bh->b_data;
+		atime.tv_sec = be64_to_cpu(di->di_atime);
+		atime.tv_nsec = be32_to_cpu(di->di_atime_nsec);
+		if (timespec_compare(&inode->i_atime, &atime) > 0) {
+			gfs2_trans_add_bh(ip->i_gl, bh, 1);
+			gfs2_dinode_out(ip, bh->b_data);
+		}
+		brelse(bh);
+	}
+	gfs2_trans_end(sdp);
+do_unlock:
+	gfs2_glock_dq_uninit(&gh);
+do_flush:
+	if (sync != 0)
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+	return ret;
+}
+
+/**
+ * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one
+ * @sdp: the filesystem
+ *
+ * Returns: errno
+ */
+
+static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
+{
+	struct gfs2_holder t_gh;
+	int error;
+
+	gfs2_quota_sync(sdp);
+	gfs2_statfs_sync(sdp);
+
+	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
+				   &t_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	gfs2_meta_syncfs(sdp);
+	gfs2_log_shutdown(sdp);
+
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
+	if (t_gh.gh_gl)
+		gfs2_glock_dq_uninit(&t_gh);
+
+	gfs2_quota_cleanup(sdp);
+
+	return error;
+}
+
+static int gfs2_umount_recovery_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
+/**
+ * gfs2_put_super - Unmount the filesystem
+ * @sb: The VFS superblock
+ *
+ */
+
+static void gfs2_put_super(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+	struct gfs2_jdesc *jd;
+
+	/*  Unfreeze the filesystem, if we need to  */
+
+	mutex_lock(&sdp->sd_freeze_lock);
+	if (sdp->sd_freeze_count)
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	mutex_unlock(&sdp->sd_freeze_lock);
+
+	/* No more recovery requests */
+	set_bit(SDF_NORECOVERY, &sdp->sd_flags);
+	smp_mb();
+
+	/* Wait on outstanding recovery */
+restart:
+	spin_lock(&sdp->sd_jindex_spin);
+	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
+		if (!test_bit(JDF_RECOVERY, &jd->jd_flags))
+			continue;
+		spin_unlock(&sdp->sd_jindex_spin);
+		wait_on_bit(&jd->jd_flags, JDF_RECOVERY,
+			    gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE);
+		goto restart;
+	}
+	spin_unlock(&sdp->sd_jindex_spin);
+
+	kthread_stop(sdp->sd_quotad_process);
+	kthread_stop(sdp->sd_logd_process);
+
+	if (!(sb->s_flags & MS_RDONLY)) {
+		error = gfs2_make_fs_ro(sdp);
+		if (error)
+			gfs2_io_error(sdp);
+	}
+	/*  At this point, we're through modifying the disk  */
+
+	/*  Release stuff  */
+
+	iput(sdp->sd_jindex);
+	iput(sdp->sd_inum_inode);
+	iput(sdp->sd_statfs_inode);
+	iput(sdp->sd_rindex);
+	iput(sdp->sd_quota_inode);
+
+	gfs2_glock_put(sdp->sd_rename_gl);
+	gfs2_glock_put(sdp->sd_trans_gl);
+
+	if (!sdp->sd_args.ar_spectator) {
+		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_jinode_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_ir_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_sc_gh);
+		gfs2_glock_dq_uninit(&sdp->sd_qc_gh);
+		iput(sdp->sd_ir_inode);
+		iput(sdp->sd_sc_inode);
+		iput(sdp->sd_qc_inode);
+	}
+
+	gfs2_glock_dq_uninit(&sdp->sd_live_gh);
+	gfs2_clear_rgrpd(sdp);
+	gfs2_jindex_free(sdp);
+	/*  Take apart glock structures and buffer lists  */
+	gfs2_gl_hash_clear(sdp);
+	/*  Unmount the locking protocol  */
+	gfs2_lm_unmount(sdp);
+
+	/*  At this point, we're through participating in the lockspace  */
+	gfs2_sys_fs_del(sdp);
+}
+
+/**
+ * gfs2_write_super
+ * @sb: the superblock
+ *
+ */
+
+static void gfs2_write_super(struct super_block *sb)
+{
+	sb->s_dirt = 0;
+}
+
+/**
+ * gfs2_sync_fs - sync the filesystem
+ * @sb: the superblock
+ *
+ * Flushes the log to disk.
+ */
+
+static int gfs2_sync_fs(struct super_block *sb, int wait)
+{
+	sb->s_dirt = 0;
+	if (wait && sb->s_fs_info)
+		gfs2_log_flush(sb->s_fs_info, NULL);
+	return 0;
+}
+
+/**
+ * gfs2_freeze - prevent further writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_freeze(struct super_block *sb)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	int error;
+
+	if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return -EINVAL;
+
+	for (;;) {
+		error = gfs2_freeze_fs(sdp);
+		if (!error)
+			break;
+
+		switch (error) {
+		case -EBUSY:
+			fs_err(sdp, "waiting for recovery before freeze\n");
+			break;
+
+		default:
+			fs_err(sdp, "error freezing FS: %d\n", error);
+			break;
+		}
+
+		fs_err(sdp, "retrying...\n");
+		msleep(1000);
+	}
+	return 0;
+}
+
+/**
+ * gfs2_unfreeze - reallow writes to the filesystem
+ * @sb: the VFS structure for the filesystem
+ *
+ */
+
+static int gfs2_unfreeze(struct super_block *sb)
+{
+	gfs2_unfreeze_fs(sb->s_fs_info);
+	return 0;
+}
+
+/**
+ * statfs_fill - fill in the sg for a given RG
+ * @rgd: the RG
+ * @sc: the sc structure
+ *
+ * Returns: 0 on success, -ESTALE if the LVB is invalid
+ */
+
+static int statfs_slow_fill(struct gfs2_rgrpd *rgd,
+			    struct gfs2_statfs_change_host *sc)
+{
+	gfs2_rgrp_verify(rgd);
+	sc->sc_total += rgd->rd_data;
+	sc->sc_free += rgd->rd_free;
+	sc->sc_dinodes += rgd->rd_dinodes;
+	return 0;
+}
+
+/**
+ * gfs2_statfs_slow - Stat a filesystem using asynchronous locking
+ * @sdp: the filesystem
+ * @sc: the sc info that will be returned
+ *
+ * Any error (other than a signal) will cause this routine to fall back
+ * to the synchronous version.
+ *
+ * FIXME: This really shouldn't busy wait like this.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_holder ri_gh;
+	struct gfs2_rgrpd *rgd_next;
+	struct gfs2_holder *gha, *gh;
+	unsigned int slots = 64;
+	unsigned int x;
+	int done;
+	int error = 0, err;
+
+	memset(sc, 0, sizeof(struct gfs2_statfs_change_host));
+	gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL);
+	if (!gha)
+		return -ENOMEM;
+
+	error = gfs2_rindex_hold(sdp, &ri_gh);
+	if (error)
+		goto out;
+
+	rgd_next = gfs2_rgrpd_get_first(sdp);
+
+	for (;;) {
+		done = 1;
+
+		for (x = 0; x < slots; x++) {
+			gh = gha + x;
+
+			if (gh->gh_gl && gfs2_glock_poll(gh)) {
+				err = gfs2_glock_wait(gh);
+				if (err) {
+					gfs2_holder_uninit(gh);
+					error = err;
+				} else {
+					if (!error)
+						error = statfs_slow_fill(
+							gh->gh_gl->gl_object, sc);
+					gfs2_glock_dq_uninit(gh);
+				}
+			}
+
+			if (gh->gh_gl)
+				done = 0;
+			else if (rgd_next && !error) {
+				error = gfs2_glock_nq_init(rgd_next->rd_gl,
+							   LM_ST_SHARED,
+							   GL_ASYNC,
+							   gh);
+				rgd_next = gfs2_rgrpd_get_next(rgd_next);
+				done = 0;
+			}
+
+			if (signal_pending(current))
+				error = -ERESTARTSYS;
+		}
+
+		if (done)
+			break;
+
+		yield();
+	}
+
+	gfs2_glock_dq_uninit(&ri_gh);
+
+out:
+	kfree(gha);
+	return error;
+}
+
+/**
+ * gfs2_statfs_i - Do a statfs
+ * @sdp: the filesystem
+ * @sg: the sg structure
+ *
+ * Returns: errno
+ */
+
+static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc)
+{
+	struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master;
+	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
+
+	spin_lock(&sdp->sd_statfs_spin);
+
+	*sc = *m_sc;
+	sc->sc_total += l_sc->sc_total;
+	sc->sc_free += l_sc->sc_free;
+	sc->sc_dinodes += l_sc->sc_dinodes;
+
+	spin_unlock(&sdp->sd_statfs_spin);
+
+	if (sc->sc_free < 0)
+		sc->sc_free = 0;
+	if (sc->sc_free > sc->sc_total)
+		sc->sc_free = sc->sc_total;
+	if (sc->sc_dinodes < 0)
+		sc->sc_dinodes = 0;
+
+	return 0;
+}
+
+/**
+ * gfs2_statfs - Gather and return stats about the filesystem
+ * @sb: The superblock
+ * @statfsbuf: The buffer
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_inode->i_sb;
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_statfs_change_host sc;
+	int error;
+
+	if (gfs2_tune_get(sdp, gt_statfs_slow))
+		error = gfs2_statfs_slow(sdp, &sc);
+	else
+		error = gfs2_statfs_i(sdp, &sc);
+
+	if (error)
+		return error;
+
+	buf->f_type = GFS2_MAGIC;
+	buf->f_bsize = sdp->sd_sb.sb_bsize;
+	buf->f_blocks = sc.sc_total;
+	buf->f_bfree = sc.sc_free;
+	buf->f_bavail = sc.sc_free;
+	buf->f_files = sc.sc_dinodes + sc.sc_free;
+	buf->f_ffree = sc.sc_free;
+	buf->f_namelen = GFS2_FNAMESIZE;
+
+	return 0;
+}
+
+/**
+ * gfs2_remount_fs - called when the FS is remounted
+ * @sb:  the filesystem
+ * @flags:  the remount flags
+ * @data:  extra data passed in (not used right now)
+ *
+ * Returns: errno
+ */
+
+static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+	struct gfs2_sbd *sdp = sb->s_fs_info;
+	struct gfs2_args args = sdp->sd_args; /* Default to current settings */
+	struct gfs2_tune *gt = &sdp->sd_tune;
+	int error;
+
+	spin_lock(&gt->gt_spin);
+	args.ar_commit = gt->gt_log_flush_secs;
+	spin_unlock(&gt->gt_spin);
+	error = gfs2_mount_args(sdp, &args, data);
+	if (error)
+		return error;
+
+	/* Not allowed to change locking details */
+	if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) ||
+	    strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) ||
+	    strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata))
+		return -EINVAL;
+
+	/* Some flags must not be changed */
+	if (args_neq(&args, &sdp->sd_args, spectator) ||
+	    args_neq(&args, &sdp->sd_args, ignore_local_fs) ||
+	    args_neq(&args, &sdp->sd_args, localflocks) ||
+	    args_neq(&args, &sdp->sd_args, localcaching) ||
+	    args_neq(&args, &sdp->sd_args, meta))
+		return -EINVAL;
+
+	if (sdp->sd_args.ar_spectator)
+		*flags |= MS_RDONLY;
+
+	if ((sb->s_flags ^ *flags) & MS_RDONLY) {
+		if (*flags & MS_RDONLY)
+			error = gfs2_make_fs_ro(sdp);
+		else
+			error = gfs2_make_fs_rw(sdp);
+		if (error)
+			return error;
+	}
+
+	sdp->sd_args = args;
+	if (sdp->sd_args.ar_posix_acl)
+		sb->s_flags |= MS_POSIXACL;
+	else
+		sb->s_flags &= ~MS_POSIXACL;
+	spin_lock(&gt->gt_spin);
+	gt->gt_log_flush_secs = args.ar_commit;
+	spin_unlock(&gt->gt_spin);
+
+	return 0;
+}
+
+/**
+ * gfs2_drop_inode - Drop an inode (test for remote unlink)
+ * @inode: The inode to drop
+ *
+ * If we've received a callback on an iopen lock then its because a
+ * remote node tried to deallocate the inode but failed due to this node
+ * still having the inode open. Here we mark the link count zero
+ * since we know that it must have reached zero if the GLF_DEMOTE flag
+ * is set on the iopen glock. If we didn't do a disk read since the
+ * remote node removed the final link then we might otherwise miss
+ * this event. This check ensures that this node will deallocate the
+ * inode's blocks, or alternatively pass the baton on to another
+ * node for later deallocation.
+ */
+
+static void gfs2_drop_inode(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) {
+		struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl;
+		if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags))
+			clear_nlink(inode);
+	}
+	generic_drop_inode(inode);
+}
+
+/**
+ * gfs2_clear_inode - Deallocate an inode when VFS is done with it
+ * @inode: The VFS inode
+ *
+ */
+
+static void gfs2_clear_inode(struct inode *inode)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+
+	/* This tells us its a "real" inode and not one which only
+	 * serves to contain an address space (see rgrp.c, meta_io.c)
+	 * which therefore doesn't have its own glocks.
+	 */
+	if (test_bit(GIF_USER, &ip->i_flags)) {
+		ip->i_gl->gl_object = NULL;
+		gfs2_glock_put(ip->i_gl);
+		ip->i_gl = NULL;
+		if (ip->i_iopen_gh.gh_gl) {
+			ip->i_iopen_gh.gh_gl->gl_object = NULL;
+			gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		}
+	}
+}
+
+static int is_ancestor(const struct dentry *d1, const struct dentry *d2)
+{
+	do {
+		if (d1 == d2)
+			return 1;
+		d1 = d1->d_parent;
+	} while (!IS_ROOT(d1));
+	return 0;
+}
+
+/**
+ * gfs2_show_options - Show mount options for /proc/mounts
+ * @s: seq_file structure
+ * @mnt: vfsmount
+ *
+ * Returns: 0 on success or error code
+ */
+
+static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
+{
+	struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info;
+	struct gfs2_args *args = &sdp->sd_args;
+	int lfsecs;
+
+	if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir))
+		seq_printf(s, ",meta");
+	if (args->ar_lockproto[0])
+		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
+	if (args->ar_locktable[0])
+		seq_printf(s, ",locktable=%s", args->ar_locktable);
+	if (args->ar_hostdata[0])
+		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
+	if (args->ar_spectator)
+		seq_printf(s, ",spectator");
+	if (args->ar_ignore_local_fs)
+		seq_printf(s, ",ignore_local_fs");
+	if (args->ar_localflocks)
+		seq_printf(s, ",localflocks");
+	if (args->ar_localcaching)
+		seq_printf(s, ",localcaching");
+	if (args->ar_debug)
+		seq_printf(s, ",debug");
+	if (args->ar_upgrade)
+		seq_printf(s, ",upgrade");
+	if (args->ar_posix_acl)
+		seq_printf(s, ",acl");
+	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
+		char *state;
+		switch (args->ar_quota) {
+		case GFS2_QUOTA_OFF:
+			state = "off";
+			break;
+		case GFS2_QUOTA_ACCOUNT:
+			state = "account";
+			break;
+		case GFS2_QUOTA_ON:
+			state = "on";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",quota=%s", state);
+	}
+	if (args->ar_suiddir)
+		seq_printf(s, ",suiddir");
+	if (args->ar_data != GFS2_DATA_DEFAULT) {
+		char *state;
+		switch (args->ar_data) {
+		case GFS2_DATA_WRITEBACK:
+			state = "writeback";
+			break;
+		case GFS2_DATA_ORDERED:
+			state = "ordered";
+			break;
+		default:
+			state = "unknown";
+			break;
+		}
+		seq_printf(s, ",data=%s", state);
+	}
+	if (args->ar_discard)
+		seq_printf(s, ",discard");
+	lfsecs = sdp->sd_tune.gt_log_flush_secs;
+	if (lfsecs != 60)
+		seq_printf(s, ",commit=%d", lfsecs);
+	return 0;
+}
+
+/*
+ * We have to (at the moment) hold the inodes main lock to cover
+ * the gap between unlocking the shared lock on the iopen lock and
+ * taking the exclusive lock. I'd rather do a shared -> exclusive
+ * conversion on the iopen lock, but we can change that later. This
+ * is safe, just less efficient.
+ */
+
+static void gfs2_delete_inode(struct inode *inode)
+{
+	struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_holder gh;
+	int error;
+
+	if (!test_bit(GIF_USER, &ip->i_flags))
+		goto out;
+
+	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
+	if (unlikely(error)) {
+		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		goto out;
+	}
+
+	gfs2_glock_dq_wait(&ip->i_iopen_gh);
+	gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh);
+	error = gfs2_glock_nq(&ip->i_iopen_gh);
+	if (error)
+		goto out_truncate;
+
+	if (S_ISDIR(inode->i_mode) &&
+	    (ip->i_diskflags & GFS2_DIF_EXHASH)) {
+		error = gfs2_dir_exhash_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (ip->i_eattr) {
+		error = gfs2_ea_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	if (!gfs2_is_stuffed(ip)) {
+		error = gfs2_file_dealloc(ip);
+		if (error)
+			goto out_unlock;
+	}
+
+	error = gfs2_dinode_dealloc(ip);
+	if (error)
+		goto out_unlock;
+
+out_truncate:
+	error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
+	if (error)
+		goto out_unlock;
+	/* Needs to be done before glock release & also in a transaction */
+	truncate_inode_pages(&inode->i_data, 0);
+	gfs2_trans_end(sdp);
+
+out_unlock:
+	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
+		gfs2_glock_dq(&ip->i_iopen_gh);
+	gfs2_holder_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_uninit(&gh);
+	if (error && error != GLR_TRYFAILED && error != -EROFS)
+		fs_warn(sdp, "gfs2_delete_inode: %d\n", error);
+out:
+	truncate_inode_pages(&inode->i_data, 0);
+	clear_inode(inode);
+}
+
+static struct inode *gfs2_alloc_inode(struct super_block *sb)
+{
+	struct gfs2_inode *ip;
+
+	ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL);
+	if (ip) {
+		ip->i_flags = 0;
+		ip->i_gl = NULL;
+	}
+	return &ip->i_inode;
+}
+
+static void gfs2_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(gfs2_inode_cachep, inode);
+}
+
+const struct super_operations gfs2_super_ops = {
+	.alloc_inode		= gfs2_alloc_inode,
+	.destroy_inode		= gfs2_destroy_inode,
+	.write_inode		= gfs2_write_inode,
+	.delete_inode		= gfs2_delete_inode,
+	.put_super		= gfs2_put_super,
+	.write_super		= gfs2_write_super,
+	.sync_fs		= gfs2_sync_fs,
+	.freeze_fs 		= gfs2_freeze,
+	.unfreeze_fs		= gfs2_unfreeze,
+	.statfs			= gfs2_statfs,
+	.remount_fs		= gfs2_remount_fs,
+	.clear_inode		= gfs2_clear_inode,
+	.drop_inode		= gfs2_drop_inode,
+	.show_options		= gfs2_show_options,
+};
+
-- 
GitLab


From 2286dbfad1fb622ee2691537e5caaedee4618860 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:45:09 +0100
Subject: [PATCH 1251/2198] GFS2: Move gfs2_rmdiri into ops_inode.c

Move gfs2_rmdiri() into ops_inode.c and make it static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c     | 52 --------------------------------------------
 fs/gfs2/inode.h     |  2 --
 fs/gfs2/ops_inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 53 insertions(+), 54 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index c03a1a384e72b..9b17447a0f957 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1046,58 +1046,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	return ERR_PTR(error);
 }
 
-/**
- * gfs2_rmdiri - Remove a directory
- * @dip: The parent directory of the directory to be removed
- * @name: The name of the directory to be removed
- * @ip: The GFS2 inode of the directory to be removed
- *
- * Assumes Glocks on dip and ip are held
- *
- * Returns: errno
- */
-
-int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		struct gfs2_inode *ip)
-{
-	struct qstr dotname;
-	int error;
-
-	if (ip->i_entries != 2) {
-		if (gfs2_consist_inode(ip))
-			gfs2_dinode_print(ip);
-		return -EIO;
-	}
-
-	error = gfs2_dir_del(dip, name);
-	if (error)
-		return error;
-
-	error = gfs2_change_nlink(dip, -1);
-	if (error)
-		return error;
-
-	gfs2_str2qstr(&dotname, ".");
-	error = gfs2_dir_del(ip, &dotname);
-	if (error)
-		return error;
-
-	gfs2_str2qstr(&dotname, "..");
-	error = gfs2_dir_del(ip, &dotname);
-	if (error)
-		return error;
-
-	/* It looks odd, but it really should be done twice */
-	error = gfs2_change_nlink(ip, -1);
-	if (error)
-		return error;
-
-	error = gfs2_change_nlink(ip, -1);
-	if (error)
-		return error;
-
-	return error;
-}
 
 /*
  * gfs2_unlink_ok - check to see that a inode is still in a directory
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 2c3ec072d60e2..6cd39284eb08b 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 				  const struct qstr *name,
 				  unsigned int mode, dev_t dev);
-extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
-		       struct gfs2_inode *ip);
 extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 			  const struct gfs2_inode *ip);
 extern int gfs2_permission(struct inode *inode, int mask);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 1c70fa5168d6a..5dacd647ff0dc 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -472,6 +472,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return 0;
 }
 
+/**
+ * gfs2_rmdiri - Remove a directory
+ * @dip: The parent directory of the directory to be removed
+ * @name: The name of the directory to be removed
+ * @ip: The GFS2 inode of the directory to be removed
+ *
+ * Assumes Glocks on dip and ip are held
+ *
+ * Returns: errno
+ */
+
+static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name,
+		       struct gfs2_inode *ip)
+{
+	struct qstr dotname;
+	int error;
+
+	if (ip->i_entries != 2) {
+		if (gfs2_consist_inode(ip))
+			gfs2_dinode_print(ip);
+		return -EIO;
+	}
+
+	error = gfs2_dir_del(dip, name);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(dip, -1);
+	if (error)
+		return error;
+
+	gfs2_str2qstr(&dotname, ".");
+	error = gfs2_dir_del(ip, &dotname);
+	if (error)
+		return error;
+
+	gfs2_str2qstr(&dotname, "..");
+	error = gfs2_dir_del(ip, &dotname);
+	if (error)
+		return error;
+
+	/* It looks odd, but it really should be done twice */
+	error = gfs2_change_nlink(ip, -1);
+	if (error)
+		return error;
+
+	error = gfs2_change_nlink(ip, -1);
+	if (error)
+		return error;
+
+	return error;
+}
+
 /**
  * gfs2_rmdir - Remove a directory
  * @dir: The parent directory of the directory to be removed
-- 
GitLab


From 536baf02f650f4547f105386878b4736fbc181e8 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:48:59 +0100
Subject: [PATCH 1252/2198] GFS2: Move gfs2_readlinki into ops_inode.c

Move gfs2_readlinki into ops_inode.c and make it static

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c     | 58 +--------------------------------------------
 fs/gfs2/inode.h     |  1 -
 fs/gfs2/ops_inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 56 insertions(+), 58 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9b17447a0f957..676e750fc84c7 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1085,63 +1085,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 	return 0;
 }
 
-/**
- * gfs2_readlinki - return the contents of a symlink
- * @ip: the symlink's inode
- * @buf: a pointer to the buffer to be filled
- * @len: a pointer to the length of @buf
- *
- * If @buf is too small, a piece of memory is kmalloc()ed and needs
- * to be freed by the caller.
- *
- * Returns: errno
- */
-
-int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
-{
-	struct gfs2_holder i_gh;
-	struct buffer_head *dibh;
-	unsigned int x;
-	int error;
-
-	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
-	error = gfs2_glock_nq(&i_gh);
-	if (error) {
-		gfs2_holder_uninit(&i_gh);
-		return error;
-	}
-
-	if (!ip->i_disksize) {
-		gfs2_consist_inode(ip);
-		error = -EIO;
-		goto out;
-	}
-
-	error = gfs2_meta_inode_buffer(ip, &dibh);
-	if (error)
-		goto out;
-
-	x = ip->i_disksize + 1;
-	if (x > *len) {
-		*buf = kmalloc(x, GFP_NOFS);
-		if (!*buf) {
-			error = -ENOMEM;
-			goto out_brelse;
-		}
-	}
-
-	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
-	*len = x;
-
-out_brelse:
-	brelse(dibh);
-out:
-	gfs2_glock_dq_uninit(&i_gh);
-	return error;
-}
-
-static int
-__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
+static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
 	struct buffer_head *dibh;
 	int error;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index 6cd39284eb08b..fc9a08f45be70 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -99,7 +99,6 @@ extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
 			  const struct gfs2_inode *ip);
 extern int gfs2_permission(struct inode *inode, int mask);
-extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
 extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 5dacd647ff0dc..f607f0908cffa 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -937,6 +937,61 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	return error;
 }
 
+/**
+ * gfs2_readlinki - return the contents of a symlink
+ * @ip: the symlink's inode
+ * @buf: a pointer to the buffer to be filled
+ * @len: a pointer to the length of @buf
+ *
+ * If @buf is too small, a piece of memory is kmalloc()ed and needs
+ * to be freed by the caller.
+ *
+ * Returns: errno
+ */
+
+static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
+{
+	struct gfs2_holder i_gh;
+	struct buffer_head *dibh;
+	unsigned int x;
+	int error;
+
+	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
+	error = gfs2_glock_nq(&i_gh);
+	if (error) {
+		gfs2_holder_uninit(&i_gh);
+		return error;
+	}
+
+	if (!ip->i_disksize) {
+		gfs2_consist_inode(ip);
+		error = -EIO;
+		goto out;
+	}
+
+	error = gfs2_meta_inode_buffer(ip, &dibh);
+	if (error)
+		goto out;
+
+	x = ip->i_disksize + 1;
+	if (x > *len) {
+		*buf = kmalloc(x, GFP_NOFS);
+		if (!*buf) {
+			error = -ENOMEM;
+			goto out_brelse;
+		}
+	}
+
+	memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x);
+	*len = x;
+
+out_brelse:
+	brelse(dibh);
+out:
+	gfs2_glock_dq_uninit(&i_gh);
+	return error;
+}
+
 /**
  * gfs2_readlink - Read the value of a symlink
  * @dentry: the symlink
-- 
GitLab


From 87ec21741138bb42e7f943bb142b1d8567c10925 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 22 May 2009 10:54:50 +0100
Subject: [PATCH 1253/2198] GFS2: Move gfs2_unlink_ok into ops_inode.c

Another function which is only called from one ops_inode.c so
we move it and make it static.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c     | 39 ---------------------------------------
 fs/gfs2/inode.h     |  2 --
 fs/gfs2/ops_inode.c | 38 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 41 deletions(-)

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 676e750fc84c7..2f94bd723698a 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1046,45 +1046,6 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name,
 	return ERR_PTR(error);
 }
 
-
-/*
- * gfs2_unlink_ok - check to see that a inode is still in a directory
- * @dip: the directory
- * @name: the name of the file
- * @ip: the inode
- *
- * Assumes that the lock on (at least) @dip is held.
- *
- * Returns: 0 if the parent/child relationship is correct, errno if it isn't
- */
-
-int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-		   const struct gfs2_inode *ip)
-{
-	int error;
-
-	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
-		return -EPERM;
-
-	if ((dip->i_inode.i_mode & S_ISVTX) &&
-	    dip->i_inode.i_uid != current_fsuid() &&
-	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
-		return -EPERM;
-
-	if (IS_APPEND(&dip->i_inode))
-		return -EPERM;
-
-	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
-	if (error)
-		return error;
-
-	error = gfs2_dir_check(&dip->i_inode, name, ip);
-	if (error)
-		return error;
-
-	return 0;
-}
-
 static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr)
 {
 	struct buffer_head *dibh;
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index fc9a08f45be70..c341aaf67adb2 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
 extern struct inode *gfs2_createi(struct gfs2_holder *ghs,
 				  const struct qstr *name,
 				  unsigned int mode, dev_t dev);
-extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
-			  const struct gfs2_inode *ip);
 extern int gfs2_permission(struct inode *inode, int mask);
 extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr);
 extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index f607f0908cffa..f8bd20baf99c4 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -262,6 +262,44 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	return error;
 }
 
+/*
+ * gfs2_unlink_ok - check to see that a inode is still in a directory
+ * @dip: the directory
+ * @name: the name of the file
+ * @ip: the inode
+ *
+ * Assumes that the lock on (at least) @dip is held.
+ *
+ * Returns: 0 if the parent/child relationship is correct, errno if it isn't
+ */
+
+static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name,
+			  const struct gfs2_inode *ip)
+{
+	int error;
+
+	if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode))
+		return -EPERM;
+
+	if ((dip->i_inode.i_mode & S_ISVTX) &&
+	    dip->i_inode.i_uid != current_fsuid() &&
+	    ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER))
+		return -EPERM;
+
+	if (IS_APPEND(&dip->i_inode))
+		return -EPERM;
+
+	error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC);
+	if (error)
+		return error;
+
+	error = gfs2_dir_check(&dip->i_inode, name, ip);
+	if (error)
+		return error;
+
+	return 0;
+}
+
 /**
  * gfs2_unlink - Unlink a file
  * @dir: The inode of the directory containing the file to unlink
-- 
GitLab


From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 22 May 2009 14:17:31 +1000
Subject: [PATCH 1254/2198] perf_counter: Dynamically allocate tasks'
 perf_counter_context struct

This replaces the struct perf_counter_context in the task_struct with
a pointer to a dynamically allocated perf_counter_context struct.  The
main reason for doing is this is to allow us to transfer a
perf_counter_context from one task to another when we do lazy PMU
switching in a later patch.

This has a few side-benefits: the task_struct becomes a little smaller,
we save some memory because only tasks that have perf_counters attached
get a perf_counter_context allocated for them, and we can remove the
inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't
end up recompiling nearly everything whenever perf_counter.h changes.

The perf_counter_context structures are reference-counted and freed
when the last reference is dropped.  A context can have references
from its task and the counters on its task.  Counters can outlive the
task so it is possible that a context will be freed well after its
task has exited.

Contexts are allocated on fork if the parent had a context, or
otherwise the first time that a per-task counter is created on a task.
In the latter case, we set the context pointer in the task struct
locklessly using an atomic compare-and-exchange operation in case we
raced with some other task in creating a context for the subject task.

This also removes the task pointer from the perf_counter struct.  The
task pointer was not used anywhere and would make it harder to move a
context from one task to another.  Anything that needed to know which
task a counter was attached to was already using counter->ctx->task.

The __perf_counter_init_context function moves up in perf_counter.c
so that it can be called from find_get_context, and now initializes
the refcount, but is otherwise unchanged.

We were potentially calling list_del_counter twice: once from
__perf_counter_exit_task when the task exits and once from
__perf_counter_remove_from_context when the counter's fd gets closed.
This adds a check in list_del_counter so it doesn't do anything if
the counter has already been removed from the lists.

Since perf_counter_task_sched_in doesn't do anything if the task doesn't
have a context, and leaves cpuctx->task_ctx = NULL, this adds code to
__perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in
the case where the current task adds the first counter to itself and
thus creates a context for itself.

This also adds similar code to __perf_counter_enable to handle a
similar situation which can arise when the counters have been disabled
using prctl; that also leaves cpuctx->task_ctx = NULL.

[ Impact: refactor counter context management to prepare for new feature ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c  |   1 +
 include/linux/init_task.h    |  13 ---
 include/linux/perf_counter.h |   4 +-
 include/linux/sched.h        |   6 +-
 kernel/exit.c                |   3 +-
 kernel/fork.c                |   1 +
 kernel/perf_counter.c        | 218 ++++++++++++++++++++++-------------
 7 files changed, 145 insertions(+), 101 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e9021a9080206..b4f64402a82a4 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -14,6 +14,7 @@
  *	Mikael Pettersson	:	PM converted to driver model.
  */
 
+#include <linux/perf_counter.h>
 #include <linux/kernel_stat.h>
 #include <linux/mc146818rtc.h>
 #include <linux/acpi_pmtmr.h>
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 503afaa0afa7b..d87247d2641f5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,18 +108,6 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
-#ifdef CONFIG_PERF_COUNTERS
-# define INIT_PERF_COUNTERS(tsk)					\
-	.perf_counter_ctx.counter_list =				\
-		LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list),	\
-	.perf_counter_ctx.event_list =					\
-		LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list),	\
-	.perf_counter_ctx.lock =					\
-		__SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock),
-#else
-# define INIT_PERF_COUNTERS(tsk)
-#endif
-
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -183,7 +171,6 @@ extern struct cred init_cred;
 	},								\
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
-	INIT_PERF_COUNTERS(tsk)						\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f612941ef46ed..071309005468a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -449,7 +449,6 @@ struct perf_counter {
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
-	struct task_struct		*task;
 	struct file			*filp;
 
 	struct perf_counter		*parent;
@@ -498,7 +497,6 @@ struct perf_counter {
  * Used as a container for task counters and CPU counters as well:
  */
 struct perf_counter_context {
-#ifdef CONFIG_PERF_COUNTERS
 	/*
 	 * Protect the states of the counters in the list,
 	 * nr_active, and the list:
@@ -516,6 +514,7 @@ struct perf_counter_context {
 	int			nr_counters;
 	int			nr_active;
 	int			is_active;
+	atomic_t		refcount;
 	struct task_struct	*task;
 
 	/*
@@ -523,7 +522,6 @@ struct perf_counter_context {
 	 */
 	u64			time;
 	u64			timestamp;
-#endif
 };
 
 /**
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ff59d12315192..9714d450f4178 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -71,7 +71,6 @@ struct sched_param {
 #include <linux/path.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
-#include <linux/perf_counter.h>
 #include <linux/pid.h>
 #include <linux/percpu.h>
 #include <linux/topology.h>
@@ -99,6 +98,7 @@ struct robust_list_head;
 struct bio;
 struct bts_tracer;
 struct fs_struct;
+struct perf_counter_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -1387,7 +1387,9 @@ struct task_struct {
 	struct list_head pi_state_list;
 	struct futex_pi_state *pi_state_cache;
 #endif
-	struct perf_counter_context perf_counter_ctx;
+#ifdef CONFIG_PERF_COUNTERS
+	struct perf_counter_context *perf_counter_ctxp;
+#endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
 	short il_next;
diff --git a/kernel/exit.c b/kernel/exit.c
index f9dfedd94af0c..99ad4063ee4a0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,6 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
+#include <linux/perf_counter.h>
 #include <trace/sched.h>
 
 #include <asm/uaccess.h>
@@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
 	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
 
 #ifdef CONFIG_PERF_COUNTERS
-	WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
+	WARN_ON_ONCE(tsk->perf_counter_ctxp);
 #endif
 	trace_sched_process_free(tsk);
 	put_task_struct(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index d32fef4d38e5c..e72a09f5355bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -63,6 +63,7 @@
 #include <linux/fs_struct.h>
 #include <trace/sched.h>
 #include <linux/magic.h>
+#include <linux/perf_counter.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 08584c16049fc..06ea3eae886e3 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -97,6 +97,17 @@ void perf_enable(void)
 		hw_perf_enable();
 }
 
+static void get_ctx(struct perf_counter_context *ctx)
+{
+	atomic_inc(&ctx->refcount);
+}
+
+static void put_ctx(struct perf_counter_context *ctx)
+{
+	if (atomic_dec_and_test(&ctx->refcount))
+		kfree(ctx);
+}
+
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	ctx->nr_counters++;
 }
 
+/*
+ * Remove a counter from the lists for its context.
+ * Must be called with counter->mutex and ctx->mutex held.
+ */
 static void
 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
 	struct perf_counter *sibling, *tmp;
 
+	if (list_empty(&counter->list_entry))
+		return;
 	ctx->nr_counters--;
 
 	list_del_init(&counter->list_entry);
@@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info)
 
 	counter_sched_out(counter, cpuctx, ctx);
 
-	counter->task = NULL;
-
 	list_del_counter(counter, ctx);
 
 	if (!ctx->task) {
@@ -279,7 +294,6 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	 */
 	if (!list_empty(&counter->list_entry)) {
 		list_del_counter(counter, ctx);
-		counter->task = NULL;
 	}
 	spin_unlock_irq(&ctx->lock);
 }
@@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info)
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
 	 * scheduled out before the smp call arrived.
+	 * Or possibly this is the right context but it isn't
+	 * on this cpu because it had no counters.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
 
 	spin_lock_irqsave(&ctx->lock, flags);
+	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	/*
@@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx,
 		return;
 	}
 
-	counter->task = task;
 retry:
 	task_oncpu_function_call(task, __perf_install_in_context,
 				 counter);
@@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info)
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
-		return;
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		if (cpuctx->task_ctx || ctx->task != current)
+			return;
+		cpuctx->task_ctx = ctx;
+	}
 
 	spin_lock_irqsave(&ctx->lock, flags);
+	ctx->is_active = 1;
 	update_context_time(ctx);
 
 	counter->prev_state = counter->state;
@@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
 	struct pt_regs *regs;
 
-	if (likely(!cpuctx->task_ctx))
+	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
 
 	update_context_time(ctx);
@@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 
+	if (!cpuctx->task_ctx)
+		return;
 	__perf_counter_sched_out(ctx, cpuctx);
 	cpuctx->task_ctx = NULL;
 }
@@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-	struct perf_counter_context *ctx = &task->perf_counter_ctx;
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
 
+	if (likely(!ctx))
+		return;
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 	cpuctx->task_ctx = ctx;
 }
@@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 int perf_counter_task_disable(void)
 {
 	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
 	unsigned long flags;
 
-	if (likely(!ctx->nr_counters))
+	if (!ctx || !ctx->nr_counters)
 		return 0;
 
 	local_irq_save(flags);
@@ -1020,12 +1047,12 @@ int perf_counter_task_disable(void)
 int perf_counter_task_enable(void)
 {
 	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
 	unsigned long flags;
 	int cpu;
 
-	if (likely(!ctx->nr_counters))
+	if (!ctx || !ctx->nr_counters)
 		return 0;
 
 	local_irq_save(flags);
@@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 		return;
 
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
-	ctx = &curr->perf_counter_ctx;
+	ctx = curr->perf_counter_ctxp;
 
 	perf_adjust_freq(&cpuctx->ctx);
-	perf_adjust_freq(ctx);
+	if (ctx)
+		perf_adjust_freq(ctx);
 
 	perf_counter_cpu_sched_out(cpuctx);
-	__perf_counter_task_sched_out(ctx);
+	if (ctx)
+		__perf_counter_task_sched_out(ctx);
 
 	rotate_ctx(&cpuctx->ctx);
-	rotate_ctx(ctx);
+	if (ctx)
+		rotate_ctx(ctx);
 
 	perf_counter_cpu_sched_in(cpuctx, cpu);
-	perf_counter_task_sched_in(curr, cpu);
+	if (ctx)
+		perf_counter_task_sched_in(curr, cpu);
 }
 
 /*
@@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter)
 	return atomic64_read(&counter->count);
 }
 
+/*
+ * Initialize the perf_counter context in a task_struct:
+ */
+static void
+__perf_counter_init_context(struct perf_counter_context *ctx,
+			    struct task_struct *task)
+{
+	memset(ctx, 0, sizeof(*ctx));
+	spin_lock_init(&ctx->lock);
+	mutex_init(&ctx->mutex);
+	INIT_LIST_HEAD(&ctx->counter_list);
+	INIT_LIST_HEAD(&ctx->event_list);
+	atomic_set(&ctx->refcount, 1);
+	ctx->task = task;
+}
+
 static void put_context(struct perf_counter_context *ctx)
 {
 	if (ctx->task)
@@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
+	struct perf_counter_context *tctx;
 	struct task_struct *task;
 
 	/*
@@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
-	ctx = &task->perf_counter_ctx;
-	ctx->task = task;
-
 	/* Reuse ptrace permission checks for now. */
 	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
-		put_context(ctx);
+		put_task_struct(task);
 		return ERR_PTR(-EACCES);
 	}
 
+	ctx = task->perf_counter_ctxp;
+	if (!ctx) {
+		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+		if (!ctx) {
+			put_task_struct(task);
+			return ERR_PTR(-ENOMEM);
+		}
+		__perf_counter_init_context(ctx, task);
+		/*
+		 * Make sure other cpus see correct values for *ctx
+		 * once task->perf_counter_ctxp is visible to them.
+		 */
+		smp_wmb();
+		tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
+		if (tctx) {
+			/*
+			 * We raced with some other task; use
+			 * the context they set.
+			 */
+			kfree(ctx);
+			ctx = tctx;
+		}
+	}
+
 	return ctx;
 }
 
@@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head)
 	struct perf_counter *counter;
 
 	counter = container_of(head, struct perf_counter, rcu_head);
+	put_ctx(counter->ctx);
 	kfree(counter);
 }
 
@@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
 
-	perf_counter_comm_ctx(&current->perf_counter_ctx, comm_event);
+	perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event);
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task)
 
 	if (!atomic_read(&nr_comm_tracking))
 		return;
-       
+	if (!current->perf_counter_ctxp)
+		return;
+
 	comm_event = (struct perf_comm_event){
 		.task	= task,
 		.event  = {
@@ -2372,7 +2444,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
-	perf_counter_mmap_ctx(&current->perf_counter_ctx, mmap_event);
+	perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event);
 
 	kfree(buf);
 }
@@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 
 	if (!atomic_read(&nr_mmap_tracking))
 		return;
+	if (!current->perf_counter_ctxp)
+		return;
 
 	mmap_event = (struct perf_mmap_event){
 		.file   = file,
@@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->pmu			= NULL;
 	counter->ctx			= ctx;
+	get_ctx(ctx);
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
@@ -3149,21 +3224,6 @@ SYSCALL_DEFINE5(perf_counter_open,
 	goto out_fput;
 }
 
-/*
- * Initialize the perf_counter context in a task_struct:
- */
-static void
-__perf_counter_init_context(struct perf_counter_context *ctx,
-			    struct task_struct *task)
-{
-	memset(ctx, 0, sizeof(*ctx));
-	spin_lock_init(&ctx->lock);
-	mutex_init(&ctx->mutex);
-	INIT_LIST_HEAD(&ctx->counter_list);
-	INIT_LIST_HEAD(&ctx->event_list);
-	ctx->task = task;
-}
-
 /*
  * inherit a counter from parent task to child task:
  */
@@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link it up in the child's context:
 	 */
-	child_counter->task = child;
 	add_counter_to_ctx(child_counter, child_ctx);
 
 	child_counter->parent = parent_counter;
@@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child,
 	struct perf_counter *parent_counter;
 
 	/*
-	 * If we do not self-reap then we have to wait for the
-	 * child task to unschedule (it will happen for sure),
-	 * so that its counter is at its final count. (This
-	 * condition triggers rarely - child tasks usually get
-	 * off their CPU before the parent has a chance to
-	 * get this far into the reaping action)
+	 * Protect against concurrent operations on child_counter
+	 * due its fd getting closed, etc.
 	 */
-	if (child != current) {
-		wait_task_inactive(child, 0);
-		update_counter_times(child_counter);
-		list_del_counter(child_counter, child_ctx);
-	} else {
-		struct perf_cpu_context *cpuctx;
-		unsigned long flags;
-
-		/*
-		 * Disable and unlink this counter.
-		 *
-		 * Be careful about zapping the list - IRQ/NMI context
-		 * could still be processing it:
-		 */
-		local_irq_save(flags);
-		perf_disable();
-
-		cpuctx = &__get_cpu_var(perf_cpu_context);
+	mutex_lock(&child_counter->mutex);
 
-		group_sched_out(child_counter, cpuctx, child_ctx);
-		update_counter_times(child_counter);
+	update_counter_times(child_counter);
+	list_del_counter(child_counter, child_ctx);
 
-		list_del_counter(child_counter, child_ctx);
-
-		perf_enable();
-		local_irq_restore(flags);
-	}
+	mutex_unlock(&child_counter->mutex);
 
 	parent_counter = child_counter->parent;
 	/*
@@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child,
  *
  * Note: we may be running in child context, but the PID is not hashed
  * anymore so new counters will not be added.
+ * (XXX not sure that is true when we get called from flush_old_exec.
+ *  -- paulus)
  */
 void perf_counter_exit_task(struct task_struct *child)
 {
 	struct perf_counter *child_counter, *tmp;
 	struct perf_counter_context *child_ctx;
+	unsigned long flags;
 
 	WARN_ON_ONCE(child != current);
 
-	child_ctx = &child->perf_counter_ctx;
+	child_ctx = child->perf_counter_ctxp;
 
-	if (likely(!child_ctx->nr_counters))
+	if (likely(!child_ctx))
 		return;
 
+	local_irq_save(flags);
+	__perf_counter_task_sched_out(child_ctx);
+	child->perf_counter_ctxp = NULL;
+	local_irq_restore(flags);
+
+	mutex_lock(&child_ctx->mutex);
+
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
@@ -3371,6 +3415,10 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	if (!list_empty(&child_ctx->counter_list))
 		goto again;
+
+	mutex_unlock(&child_ctx->mutex);
+
+	put_ctx(child_ctx);
 }
 
 /*
@@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child)
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 
-	child_ctx  =  &child->perf_counter_ctx;
-	parent_ctx = &parent->perf_counter_ctx;
-
-	__perf_counter_init_context(child_ctx, child);
+	child->perf_counter_ctxp = NULL;
 
 	/*
 	 * This is executed from the parent task context, so inherit
-	 * counters that have been marked for cloning:
+	 * counters that have been marked for cloning.
+	 * First allocate and initialize a context for the child.
 	 */
 
-	if (likely(!parent_ctx->nr_counters))
+	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
+	if (!child_ctx)
+		return;
+
+	parent_ctx = parent->perf_counter_ctxp;
+	if (likely(!parent_ctx || !parent_ctx->nr_counters))
 		return;
 
+	__perf_counter_init_context(child_ctx, child);
+	child->perf_counter_ctxp = child_ctx;
+
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
-- 
GitLab


From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 22 May 2009 14:27:22 +1000
Subject: [PATCH 1255/2198] perf_counter: Optimize context switch between
 identical inherited contexts

When monitoring a process and its descendants with a set of inherited
counters, we can often get the situation in a context switch where
both the old (outgoing) and new (incoming) process have the same set
of counters, and their values are ultimately going to be added together.
In that situation it doesn't matter which set of counters are used to
count the activity for the new process, so there is really no need to
go through the process of reading the hardware counters and updating
the old task's counters and then setting up the PMU for the new task.

This optimizes the context switch in this situation.  Instead of
scheduling out the perf_counter_context for the old task and
scheduling in the new context, we simply transfer the old context
to the new task and keep using it without interruption.  The new
context gets transferred to the old task.  This means that both
tasks still have a valid perf_counter_context, so no special case
is introduced when the old task gets scheduled in again, either on
this CPU or another CPU.

The equivalence of contexts is detected by keeping a pointer in
each cloned context pointing to the context it was cloned from.
To cope with the situation where a context is changed by adding
or removing counters after it has been cloned, we also keep a
generation number on each context which is incremented every time
a context is changed.  When a context is cloned we take a copy
of the parent's generation number, and two cloned contexts are
equivalent only if they have the same parent and the same
generation number.  In order that the parent context pointer
remains valid (and is not reused), we increment the parent
context's reference count for each context cloned from it.

Since we don't have individual fds for the counters in a cloned
context, the only thing that can make two clones of a given parent
different after they have been cloned is enabling or disabling all
counters with prctl.  To account for this, we keep a count of the
number of enabled counters in each context.  Two contexts must have
the same number of enabled counters to be considered equivalent.

Here are some measurements of the context switch time as measured with
the lat_ctx benchmark from lmbench, comparing the times obtained with
and without this patch series:

		-----Unmodified-----		With this patch series
Counters:	none	2 HW	4H+4S	none	2 HW	4H+4S

2 processes:
Average		3.44	6.45	11.24	3.12	3.39	3.60
St dev		0.04	0.04	0.13	0.05	0.17	0.19

8 processes:
Average		6.45	8.79	14.00	5.57	6.23	7.57
St dev		1.27	1.04	0.88	1.42	1.46	1.42

32 processes:
Average		5.56	8.43	13.78	5.28	5.55	7.15
St dev		0.41	0.47	0.53	0.54	0.57	0.81

The numbers are the mean and standard deviation of 20 runs of
lat_ctx.  The "none" columns are lat_ctx run directly without any
counters.  The "2 HW" columns are with lat_ctx run under perfstat,
counting cycles and instructions.  The "4H+4S" columns are lat_ctx run
under perfstat with 4 hardware counters and 4 software counters
(cycles, instructions, cache references, cache misses, task
clock, context switch, cpu migrations, and page faults).

[ Impact: performance optimization of counter context-switches ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  12 +++-
 kernel/perf_counter.c        | 109 ++++++++++++++++++++++++++++++-----
 kernel/sched.c               |   2 +-
 3 files changed, 107 insertions(+), 16 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 071309005468a..4cae01a504508 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -513,6 +513,7 @@ struct perf_counter_context {
 	struct list_head	event_list;
 	int			nr_counters;
 	int			nr_active;
+	int			nr_enabled;
 	int			is_active;
 	atomic_t		refcount;
 	struct task_struct	*task;
@@ -522,6 +523,14 @@ struct perf_counter_context {
 	 */
 	u64			time;
 	u64			timestamp;
+
+	/*
+	 * These fields let us detect when two contexts have both
+	 * been cloned (inherited) from a common ancestor.
+	 */
+	struct perf_counter_context *parent_ctx;
+	u32			parent_gen;
+	u32			generation;
 };
 
 /**
@@ -552,7 +561,8 @@ extern int perf_max_counters;
 extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter);
 
 extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
-extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
+extern void perf_counter_task_sched_out(struct task_struct *task,
+					struct task_struct *next, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 06ea3eae886e3..c10055416dea5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx)
 
 static void put_ctx(struct perf_counter_context *ctx)
 {
-	if (atomic_dec_and_test(&ctx->refcount))
+	if (atomic_dec_and_test(&ctx->refcount)) {
+		if (ctx->parent_ctx)
+			put_ctx(ctx->parent_ctx);
 		kfree(ctx);
+	}
 }
 
 static void
@@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 	ctx->nr_counters++;
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		ctx->nr_enabled++;
 }
 
 /*
@@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	if (list_empty(&counter->list_entry))
 		return;
 	ctx->nr_counters--;
+	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		ctx->nr_enabled--;
 
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
@@ -203,6 +210,22 @@ group_sched_out(struct perf_counter *group_counter,
 		cpuctx->exclusive = 0;
 }
 
+/*
+ * Mark this context as not being a clone of another.
+ * Called when counters are added to or removed from this context.
+ * We also increment our generation number so that anything that
+ * was cloned from this context before this will not match anything
+ * cloned from this context after this.
+ */
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+	++ctx->generation;
+	if (!ctx->parent_ctx)
+		return;
+	put_ctx(ctx->parent_ctx);
+	ctx->parent_ctx = NULL;
+}
+
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter)
 	struct perf_counter_context *ctx = counter->ctx;
 	struct task_struct *task = ctx->task;
 
+	unclone_ctx(ctx);
 	if (!task) {
 		/*
 		 * Per cpu counters are removed via an smp call and
@@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info)
 		else
 			counter_sched_out(counter, cpuctx, ctx);
 		counter->state = PERF_COUNTER_STATE_OFF;
+		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
@@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
+		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irq(&ctx->lock);
@@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
+	ctx->nr_enabled++;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter)
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->tstamp_enabled =
 			ctx->time - counter->total_time_enabled;
+		ctx->nr_enabled++;
 	}
  out:
 	spin_unlock_irq(&ctx->lock);
@@ -861,6 +889,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
 	spin_unlock(&ctx->lock);
 }
 
+/*
+ * Test whether two contexts are equivalent, i.e. whether they
+ * have both been cloned from the same version of the same context
+ * and they both have the same number of enabled counters.
+ * If the number of enabled counters is the same, then the set
+ * of enabled counters should be the same, because these are both
+ * inherited contexts, therefore we can't access individual counters
+ * in them directly with an fd; we can only enable/disable all
+ * counters via prctl, or enable/disable all counters in a family
+ * via ioctl, which will have the same effect on both contexts.
+ */
+static int context_equiv(struct perf_counter_context *ctx1,
+			 struct perf_counter_context *ctx2)
+{
+	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& ctx1->nr_enabled == ctx2->nr_enabled;
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx,
  * accessing the counter control register. If a NMI hits, then it will
  * not restart the counter.
  */
-void perf_counter_task_sched_out(struct task_struct *task, int cpu)
+void perf_counter_task_sched_out(struct task_struct *task,
+				 struct task_struct *next, int cpu)
 {
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+	struct perf_counter_context *next_ctx;
 	struct pt_regs *regs;
 
 	if (likely(!ctx || !cpuctx->task_ctx))
@@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 
 	regs = task_pt_regs(task);
 	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
+	next_ctx = next->perf_counter_ctxp;
+	if (next_ctx && context_equiv(ctx, next_ctx)) {
+		task->perf_counter_ctxp = next_ctx;
+		next->perf_counter_ctxp = ctx;
+		ctx->task = next;
+		next_ctx->task = task;
+		return;
+	}
+
 	__perf_counter_sched_out(ctx, cpuctx);
 
 	cpuctx->task_ctx = NULL;
@@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
 
 	if (likely(!ctx))
 		return;
+	if (cpuctx->task_ctx == ctx)
+		return;
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 	cpuctx->task_ctx = ctx;
 }
@@ -3252,6 +3313,16 @@ inherit_counter(struct perf_counter *parent_counter,
 	if (IS_ERR(child_counter))
 		return child_counter;
 
+	/*
+	 * Make the child state follow the state of the parent counter,
+	 * not its hw_event.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_counter_{en,dis}able_family.
+	 */
+	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
+		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+	else
+		child_counter->state = PERF_COUNTER_STATE_OFF;
+
 	/*
 	 * Link it up in the child's context:
 	 */
@@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter,
 	mutex_lock(&parent_counter->mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 
-	/*
-	 * Make the child state follow the state of the parent counter,
-	 * not its hw_event.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_counter_{en,dis}able_family.
-	 */
-	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-	else
-		child_counter->state = PERF_COUNTER_STATE_OFF;
-
 	mutex_unlock(&parent_counter->mutex);
 
 	return child_counter;
@@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx, *parent_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
+	int inherited_all = 1;
 
 	child->perf_counter_ctxp = NULL;
 
@@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child)
 		if (counter != counter->group_leader)
 			continue;
 
-		if (!counter->hw_event.inherit)
+		if (!counter->hw_event.inherit) {
+			inherited_all = 0;
 			continue;
+		}
 
 		if (inherit_group(counter, parent,
-				  parent_ctx, child, child_ctx))
+				  parent_ctx, child, child_ctx)) {
+			inherited_all = 0;
 			break;
+		}
+	}
+
+	if (inherited_all) {
+		/*
+		 * Mark the child context as a clone of the parent
+		 * context, or of whatever the parent is a clone of.
+		 */
+		if (parent_ctx->parent_ctx) {
+			child_ctx->parent_ctx = parent_ctx->parent_ctx;
+			child_ctx->parent_gen = parent_ctx->parent_gen;
+		} else {
+			child_ctx->parent_ctx = parent_ctx;
+			child_ctx->parent_gen = parent_ctx->generation;
+		}
+		get_ctx(child_ctx->parent_ctx);
 	}
 
 	mutex_unlock(&parent_ctx->mutex);
diff --git a/kernel/sched.c b/kernel/sched.c
index 419a39d0988fd..4c0d58bce6b28 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5091,7 +5091,7 @@ asmlinkage void __sched __schedule(void)
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
-		perf_counter_task_sched_out(prev, cpu);
+		perf_counter_task_sched_out(prev, next, cpu);
 
 		rq->nr_switches++;
 		rq->curr = next;
-- 
GitLab


From 910431c7f2e963017d767b29c80ae706421e569f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 22 May 2009 12:32:15 +0200
Subject: [PATCH 1256/2198] perf_counter: fix !PERF_COUNTERS build failure

Update the !CONFIG_PERF_COUNTERS prototype too, for
perf_counter_task_sched_out().

[ Impact: build fix ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4cae01a504508..2eedae8498d3a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -625,7 +625,8 @@ extern void perf_counter_init(void);
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
 static inline void
-perf_counter_task_sched_out(struct task_struct *task, int cpu)		{ }
+perf_counter_task_sched_out(struct task_struct *task,
+			    struct task_struct *next, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline void perf_counter_init_task(struct task_struct *child)	{ }
-- 
GitLab


From c6eb13847ba081552d2af644219bddeff7110caf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 22 May 2009 18:18:28 +0200
Subject: [PATCH 1257/2198] perf_counter tools: increase limits

I tried to run with 300 active counters and the tools bailed out
because our limit was at 64. So increase the counter limit to 1024
and the CPU limit to 4096.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 6fa3656399f44..81a737444c8ff 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -54,8 +54,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
 		       group_fd, flags);
 }
 
-#define MAX_COUNTERS			64
-#define MAX_NR_CPUS			256
+#define MAX_COUNTERS			1024
+#define MAX_NR_CPUS			4096
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-- 
GitLab


From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:49 -0400
Subject: [PATCH 1258/2198] block: Do away with the notion of hardsect_size

Until now we have had a 1:1 mapping between storage device physical
block size and the logical block sized used when addressing the device.
With SATA 4KB drives coming out that will no longer be the case.  The
sector size will be 4KB but the logical block size will remain
512-bytes.  Hence we need to distinguish between the physical block size
and the logical ditto.

This patch renames hardsect_size to logical_block_size.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/powerpc/sysdev/axonram.c       |  2 +-
 block/blk-integrity.c               |  2 +-
 block/blk-settings.c                | 21 ++++++++++-----------
 block/blk-sysfs.c                   | 12 +++++++++---
 block/compat_ioctl.c                |  2 +-
 block/ioctl.c                       |  2 +-
 drivers/block/cciss.c               |  6 +++---
 drivers/block/cpqarray.c            |  4 ++--
 drivers/block/hd.c                  |  2 +-
 drivers/block/mg_disk.c             |  2 +-
 drivers/block/pktcdvd.c             |  2 +-
 drivers/block/ps3disk.c             |  2 +-
 drivers/block/ub.c                  |  6 +++---
 drivers/block/virtio_blk.c          |  2 +-
 drivers/block/xen-blkfront.c        |  2 +-
 drivers/block/xsysace.c             |  2 +-
 drivers/cdrom/gdrom.c               |  2 +-
 drivers/cdrom/viocd.c               |  4 ++--
 drivers/char/raw.c                  |  2 +-
 drivers/ide/ide-cd.c                | 12 ++++++------
 drivers/md/bitmap.c                 |  4 ++--
 drivers/md/dm-exception-store.c     |  2 +-
 drivers/md/dm-log.c                 |  3 ++-
 drivers/md/dm-snap-persistent.c     |  2 +-
 drivers/md/dm-table.c               | 12 +++++++-----
 drivers/md/md.c                     |  2 +-
 drivers/memstick/core/mspro_block.c |  2 +-
 drivers/message/i2o/i2o_block.c     |  5 +++--
 drivers/mmc/card/block.c            |  2 +-
 drivers/mtd/mtd_blkdevs.c           |  2 +-
 drivers/s390/block/dasd.c           |  2 +-
 drivers/s390/block/dcssblk.c        |  2 +-
 drivers/s390/block/xpram.c          |  2 +-
 drivers/s390/char/tape_block.c      |  2 +-
 drivers/scsi/sd.c                   |  2 +-
 drivers/scsi/sr.c                   |  2 +-
 fs/bio.c                            |  3 ++-
 fs/block_dev.c                      |  6 +++---
 fs/buffer.c                         |  6 +++---
 fs/direct-io.c                      |  2 +-
 fs/ext3/super.c                     |  4 ++--
 fs/ext4/super.c                     |  2 +-
 fs/gfs2/ops_fstype.c                |  4 ++--
 fs/gfs2/rgrp.c                      |  2 +-
 fs/nilfs2/the_nilfs.c               |  2 +-
 fs/ntfs/super.c                     |  6 +++---
 fs/ocfs2/cluster/heartbeat.c        |  2 +-
 fs/ocfs2/super.c                    |  2 +-
 fs/partitions/ibm.c                 |  2 +-
 fs/partitions/msdos.c               |  4 ++--
 fs/udf/super.c                      |  2 +-
 fs/xfs/linux-2.6/xfs_buf.c          |  2 +-
 include/linux/blkdev.h              | 14 +++++++-------
 include/linux/device-mapper.h       |  2 +-
 54 files changed, 108 insertions(+), 98 deletions(-)

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 9e105cbc5e5f4..a4779912a5caa 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -250,7 +250,7 @@ axon_ram_probe(struct of_device *device, const struct of_device_id *device_id)
 
 	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
 	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
-	blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
+	blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
 	add_disk(bank->disk);
 
 	bank->irq_id = irq_of_parse_and_map(device->node, 0);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 91fa8e06b6a57..73e28d3556884 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -340,7 +340,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 		kobject_uevent(&bi->kobj, KOBJ_ADD);
 
 		bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE;
-		bi->sector_size = disk->queue->hardsect_size;
+		bi->sector_size = queue_logical_block_size(disk->queue);
 		disk->integrity = bi;
 	} else
 		bi = disk->integrity;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 57af728d94bb0..15c3164537b8f 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -134,7 +134,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
 	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
-	blk_queue_hardsect_size(q, 512);
+	blk_queue_logical_block_size(q, 512);
 	blk_queue_dma_alignment(q, 511);
 	blk_queue_congestion_threshold(q);
 	q->nr_batching = BLK_BATCH_REQ;
@@ -288,21 +288,20 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 EXPORT_SYMBOL(blk_queue_max_segment_size);
 
 /**
- * blk_queue_hardsect_size - set hardware sector size for the queue
+ * blk_queue_logical_block_size - set logical block size for the queue
  * @q:  the request queue for the device
- * @size:  the hardware sector size, in bytes
+ * @size:  the logical block size, in bytes
  *
  * Description:
- *   This should typically be set to the lowest possible sector size
- *   that the hardware can operate on (possible without reverting to
- *   even internal read-modify-write operations). Usually the default
- *   of 512 covers most hardware.
+ *   This should be set to the lowest possible block size that the
+ *   storage device can address.  The default of 512 covers most
+ *   hardware.
  **/
-void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
+void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
-	q->hardsect_size = size;
+	q->logical_block_size = size;
 }
-EXPORT_SYMBOL(blk_queue_hardsect_size);
+EXPORT_SYMBOL(blk_queue_logical_block_size);
 
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
@@ -324,7 +323,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
 	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
 	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
-	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
+	t->logical_block_size = max(t->logical_block_size, b->logical_block_size);
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
 	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ff9bba3379a8..13d38b7e4d0fa 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -100,9 +100,9 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 	return queue_var_show(max_sectors_kb, (page));
 }
 
-static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
+static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(q->hardsect_size, page);
+	return queue_var_show(queue_logical_block_size(q), page);
 }
 
 static ssize_t
@@ -249,7 +249,12 @@ static struct queue_sysfs_entry queue_iosched_entry = {
 
 static struct queue_sysfs_entry queue_hw_sector_size_entry = {
 	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
-	.show = queue_hw_sector_size_show,
+	.show = queue_logical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_logical_block_size_entry = {
+	.attr = {.name = "logical_block_size", .mode = S_IRUGO },
+	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_nonrot_entry = {
@@ -283,6 +288,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_sectors_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
+	&queue_logical_block_size_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f87615dea46bb..9eaa1940273a9 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -763,7 +763,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
 		return compat_put_int(arg, block_size(bdev));
 	case BLKSSZGET: /* get block device hardware sector size */
-		return compat_put_int(arg, bdev_hardsect_size(bdev));
+		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
 					 bdev_get_queue(bdev)->max_sectors);
diff --git a/block/ioctl.c b/block/ioctl.c
index ad474d4bbccee..7aa97f65da82e 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -311,7 +311,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
 		return put_int(arg, block_size(bdev));
 	case BLKSSZGET: /* get block device hardware sector size */
-		return put_int(arg, bdev_hardsect_size(bdev));
+		return put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
 	case BLKRASET:
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e714e7cce6f27..94474f5f8bcee 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1389,8 +1389,8 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 
 	disk->queue->queuedata = h;
 
-	blk_queue_hardsect_size(disk->queue,
-				h->drv[drv_index].block_size);
+	blk_queue_logical_block_size(disk->queue,
+				     h->drv[drv_index].block_size);
 
 	/* Make sure all queue data is written out before */
 	/* setting h->drv[drv_index].queue, as setting this */
@@ -2298,7 +2298,7 @@ static int cciss_revalidate(struct gendisk *disk)
 	cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
 			       inq_buff, drv);
 
-	blk_queue_hardsect_size(drv->queue, drv->block_size);
+	blk_queue_logical_block_size(drv->queue, drv->block_size);
 	set_capacity(disk, drv->nr_blocks);
 
 	kfree(inq_buff);
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index a02dcfc00f134..44fa2018f6b02 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -474,7 +474,7 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
 		disk->fops = &ida_fops;
 		if (j && !drv->nr_blks)
 			continue;
-		blk_queue_hardsect_size(hba[i]->queue, drv->blk_size);
+		blk_queue_logical_block_size(hba[i]->queue, drv->blk_size);
 		set_capacity(disk, drv->nr_blks);
 		disk->queue = hba[i]->queue;
 		disk->private_data = drv;
@@ -1546,7 +1546,7 @@ static int revalidate_allvol(ctlr_info_t *host)
 		drv_info_t *drv = &host->drv[i];
 		if (i && !drv->nr_blks)
 			continue;
-		blk_queue_hardsect_size(host->queue, drv->blk_size);
+		blk_queue_logical_block_size(host->queue, drv->blk_size);
 		set_capacity(disk, drv->nr_blks);
 		disk->queue = host->queue;
 		disk->private_data = drv;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 961de56d00a94..f65b3f369eb0c 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -724,7 +724,7 @@ static int __init hd_init(void)
 	blk_queue_max_sectors(hd_queue, 255);
 	init_timer(&device_timer);
 	device_timer.function = hd_times_out;
-	blk_queue_hardsect_size(hd_queue, 512);
+	blk_queue_logical_block_size(hd_queue, 512);
 
 	if (!NR_HD) {
 		/*
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index c0cd0a03f6985..60de5a01e71e6 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -996,7 +996,7 @@ static int mg_probe(struct platform_device *plat_dev)
 		goto probe_err_6;
 	}
 	blk_queue_max_sectors(host->breq, MG_MAX_SECTS);
-	blk_queue_hardsect_size(host->breq, MG_SECTOR_SIZE);
+	blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
 
 	init_timer(&host->timer);
 	host->timer.function = mg_times_out;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index dc7a8c352da2e..293f5858921d4 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2657,7 +2657,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd)
 	struct request_queue *q = pd->disk->queue;
 
 	blk_queue_make_request(q, pkt_make_request);
-	blk_queue_hardsect_size(q, CD_FRAMESIZE);
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 	blk_queue_max_sectors(q, PACKET_MAX_SECTORS);
 	blk_queue_merge_bvec(q, pkt_merge_bvec);
 	q->queuedata = pd;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 338cee4cc0ba4..aaeeb544228a3 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -477,7 +477,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_max_sectors(queue, dev->bounce_size >> 9);
 	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
-	blk_queue_hardsect_size(queue, dev->blk_size);
+	blk_queue_logical_block_size(queue, dev->blk_size);
 
 	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
 			  ps3disk_prepare_flush);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index e67bbae9547d1..cc54473b8e77f 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -722,7 +722,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	/*
 	 * build the command
 	 *
-	 * The call to blk_queue_hardsect_size() guarantees that request
+	 * The call to blk_queue_logical_block_size() guarantees that request
 	 * is aligned, but it is given in terms of 512 byte units, always.
 	 */
 	block = blk_rq_pos(rq) >> lun->capacity.bshift;
@@ -1749,7 +1749,7 @@ static int ub_bd_revalidate(struct gendisk *disk)
 	ub_revalidate(lun->udev, lun);
 
 	/* XXX Support sector size switching like in sr.c */
-	blk_queue_hardsect_size(disk->queue, lun->capacity.bsize);
+	blk_queue_logical_block_size(disk->queue, lun->capacity.bsize);
 	set_capacity(disk, lun->capacity.nsec);
 	// set_disk_ro(sdkp->disk, lun->readonly);
 
@@ -2324,7 +2324,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
 	blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
 	blk_queue_segment_boundary(q, 0xffffffff);	/* Dubious. */
 	blk_queue_max_sectors(q, UB_MAX_SECTORS);
-	blk_queue_hardsect_size(q, lun->capacity.bsize);
+	blk_queue_logical_block_size(q, lun->capacity.bsize);
 
 	lun->disk = disk;
 	q->queuedata = lun;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 511d4ae2d1764..c4845b1694640 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -347,7 +347,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 				offsetof(struct virtio_blk_config, blk_size),
 				&blk_size);
 	if (!err)
-		blk_queue_hardsect_size(vblk->disk->queue, blk_size);
+		blk_queue_logical_block_size(vblk->disk->queue, blk_size);
 
 	add_disk(vblk->disk);
 	return 0;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 132120ae4bde5..c1996829d5ecb 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -344,7 +344,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
-	blk_queue_hardsect_size(rq, sector_size);
+	blk_queue_logical_block_size(rq, sector_size);
 	blk_queue_max_sectors(rq, 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 3a4397edab71a..f08491a3a813b 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -984,7 +984,7 @@ static int __devinit ace_setup(struct ace_device *ace)
 	ace->queue = blk_init_queue(ace_request, &ace->lock);
 	if (ace->queue == NULL)
 		goto err_blk_initq;
-	blk_queue_hardsect_size(ace->queue, 512);
+	blk_queue_logical_block_size(ace->queue, 512);
 
 	/*
 	 * Allocate and initialize GD structure
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 1e366ad8f6806..b5621f27c4be8 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -739,7 +739,7 @@ static void __devinit probe_gdrom_setupdisk(void)
 
 static int __devinit probe_gdrom_setupqueue(void)
 {
-	blk_queue_hardsect_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
+	blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
 	/* using DMA so memory will need to be contiguous */
 	blk_queue_max_hw_segments(gd.gdrom_rq, 1);
 	/* set a large max size to get most from DMA */
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index f177c2d4017f4..0fff646cc2f09 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -469,8 +469,8 @@ static void vio_handle_cd_event(struct HvLpEvent *event)
 	case viocdopen:
 		if (event->xRc == 0) {
 			di = &viocd_diskinfo[bevent->disk];
-			blk_queue_hardsect_size(di->viocd_disk->queue,
-					bevent->block_size);
+			blk_queue_logical_block_size(di->viocd_disk->queue,
+						     bevent->block_size);
 			set_capacity(di->viocd_disk,
 					bevent->media_size *
 					bevent->block_size / 512);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 20d90e6a6e50f..db32f0e4c7dd4 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp)
 	err = bd_claim(bdev, raw_open);
 	if (err)
 		goto out1;
-	err = set_blocksize(bdev, bdev_hardsect_size(bdev));
+	err = set_blocksize(bdev, bdev_logical_block_size(bdev));
 	if (err)
 		goto out2;
 	filp->f_flags |= O_DIRECT;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 1799328decfb1..424140c6c400e 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -182,7 +182,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 				 (sense->information[2] <<  8) |
 				 (sense->information[3]);
 
-			if (drive->queue->hardsect_size == 2048)
+			if (queue_logical_block_size(drive->queue) == 2048)
 				/* device sector size is 2K */
 				sector <<= 2;
 
@@ -737,7 +737,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	struct request_queue *q = drive->queue;
 	int write = rq_data_dir(rq) == WRITE;
 	unsigned short sectors_per_frame =
-		queue_hardsect_size(q) >> SECTOR_BITS;
+		queue_logical_block_size(q) >> SECTOR_BITS;
 
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
 				  "secs_per_frame: %u",
@@ -1021,8 +1021,8 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
 	/* save a private copy of the TOC capacity for error handling */
 	drive->probed_capacity = toc->capacity * sectors_per_frame;
 
-	blk_queue_hardsect_size(drive->queue,
-				sectors_per_frame << SECTOR_BITS);
+	blk_queue_logical_block_size(drive->queue,
+				     sectors_per_frame << SECTOR_BITS);
 
 	/* first read just the header, so we know how long the TOC is */
 	stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
@@ -1338,7 +1338,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 /* standard prep_rq_fn that builds 10 byte cmds */
 static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 {
-	int hard_sect = queue_hardsect_size(q);
+	int hard_sect = queue_logical_block_size(q);
 	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
 	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
 
@@ -1543,7 +1543,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 
 	nslots = ide_cdrom_probe_capabilities(drive);
 
-	blk_queue_hardsect_size(q, CD_FRAMESIZE);
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 
 	if (ide_cdrom_register(drive, nslots)) {
 		printk(KERN_ERR PFX "%s: %s failed to register device with the"
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 47c68bc75a178..06b0ded1ce230 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target,
-				 roundup(size, bdev_hardsect_size(rdev->bdev)),
+				 roundup(size, bdev_logical_block_size(rdev->bdev)),
 				 page, READ)) {
 			page->index = index;
 			attach_page_buffers(page, NULL); /* so that free_buffer will
@@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 			int size = PAGE_SIZE;
 			if (page->index == bitmap->file_pages-1)
 				size = roundup(bitmap->last_page_size,
-					       bdev_hardsect_size(rdev->bdev));
+					       bdev_logical_block_size(rdev->bdev));
 			/* Just make sure we aren't corrupting data or
 			 * metadata
 			 */
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index a2e26c2421415..75d8081a90417 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store,
 	}
 
 	/* Validate the chunk size against the device block size */
-	if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+	if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
 		*error = "Chunk size is not a multiple of device blocksize";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index be233bc4d9178..6fa8ccf91c70e 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -413,7 +413,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		 * Buffer holds both header and bitset.
 		 */
 		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
-				       bitset_size, ti->limits.hardsect_size);
+				       bitset_size,
+				       ti->limits.logical_block_size);
 
 		if (buf_size > dev->bdev->bd_inode->i_size) {
 			DMWARN("log device %s too small: need %llu bytes",
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index e75c6dd76a9ad..2662a41337e78 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	 */
 	if (!ps->store->chunk_size) {
 		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+		    bdev_logical_block_size(ps->store->cow->bdev) >> 9);
 		ps->store->chunk_mask = ps->store->chunk_size - 1;
 		ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
 		chunk_size_supplied = 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 429b50b975d5a..65e2d9759857d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -108,7 +108,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs,
 	lhs->max_hw_segments =
 		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
 
-	lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size);
+	lhs->logical_block_size = max(lhs->logical_block_size,
+				      rhs->logical_block_size);
 
 	lhs->max_segment_size =
 		min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
@@ -529,7 +530,8 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	rs->max_hw_segments =
 		min_not_zero(rs->max_hw_segments, q->max_hw_segments);
 
-	rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
+	rs->logical_block_size = max(rs->logical_block_size,
+				     queue_logical_block_size(q));
 
 	rs->max_segment_size =
 		min_not_zero(rs->max_segment_size, q->max_segment_size);
@@ -683,8 +685,8 @@ static void check_for_valid_limits(struct io_restrictions *rs)
 		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
 	if (!rs->max_hw_segments)
 		rs->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!rs->hardsect_size)
-		rs->hardsect_size = 1 << SECTOR_SHIFT;
+	if (!rs->logical_block_size)
+		rs->logical_block_size = 1 << SECTOR_SHIFT;
 	if (!rs->max_segment_size)
 		rs->max_segment_size = MAX_SEGMENT_SIZE;
 	if (!rs->seg_boundary_mask)
@@ -914,7 +916,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_sectors(q, t->limits.max_sectors);
 	q->max_phys_segments = t->limits.max_phys_segments;
 	q->max_hw_segments = t->limits.max_hw_segments;
-	q->hardsect_size = t->limits.hardsect_size;
+	q->logical_block_size = t->limits.logical_block_size;
 	q->max_segment_size = t->limits.max_segment_size;
 	q->max_hw_sectors = t->limits.max_hw_sectors;
 	q->seg_boundary_mask = t->limits.seg_boundary_mask;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fccc8343a2501..4cbc19f5c3047 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1202,7 +1202,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
-	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
 	if (rdev->sb_size & bmask)
 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c0bebc6a2f2cc..7847bbc1440d6 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1242,7 +1242,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 
 	sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
 
-	blk_queue_hardsect_size(msb->queue, msb->page_size);
+	blk_queue_logical_block_size(msb->queue, msb->page_size);
 
 	capacity = be16_to_cpu(sys_info->user_block_count);
 	capacity *= be16_to_cpu(sys_info->block_size);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 6573ef4408f1d..335d4c78a7751 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -794,8 +794,9 @@ static int i2o_block_transfer(struct request *req)
 	if (c->adaptec) {
 		u8 cmd[10];
 		u32 scsi_flags;
-		u16 hwsec = queue_hardsect_size(req->q) >> KERNEL_SECTOR_SHIFT;
+		u16 hwsec;
 
+		hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT;
 		memset(cmd, 0, 10);
 
 		sgl_offset = SGL_OFFSET_12;
@@ -1078,7 +1079,7 @@ static int i2o_block_probe(struct device *dev)
 	 */
 	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
 	    !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
-		blk_queue_hardsect_size(queue, le32_to_cpu(blocksize));
+		blk_queue_logical_block_size(queue, le32_to_cpu(blocksize));
 	} else
 		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
 
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index c5df86546458d..98ffc41eaf2c6 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -521,7 +521,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 
 	sprintf(md->disk->disk_name, "mmcblk%d", devidx);
 
-	blk_queue_hardsect_size(md->queue.queue, 512);
+	blk_queue_logical_block_size(md->queue.queue, 512);
 
 	if (!mmc_card_sd(card) && mmc_card_blockaddr(card)) {
 		/*
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 502622f628bc0..aaac3b6800b73 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -378,7 +378,7 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 	}
 
 	tr->blkcore_priv->rq->queuedata = tr;
-	blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+	blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize);
 	if (tr->discard)
 		blk_queue_set_discard(tr->blkcore_priv->rq,
 				      blktrans_discard_request);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index e64f62d5e0fc5..27a1be0cd4d46 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1990,7 +1990,7 @@ static void dasd_setup_queue(struct dasd_block *block)
 {
 	int max;
 
-	blk_queue_hardsect_size(block->request_queue, block->bp_block);
+	blk_queue_logical_block_size(block->request_queue, block->bp_block);
 	max = block->base->discipline->max_blocks << block->s2b_shift;
 	blk_queue_max_sectors(block->request_queue, max);
 	blk_queue_max_phys_segments(block->request_queue, -1L);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index cfdcf1aed33c2..a4c7ffcd9987a 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -602,7 +602,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	dev_info->gd->private_data = dev_info;
 	dev_info->gd->driverfs_dev = &dev_info->dev;
 	blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
-	blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096);
+	blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096);
 
 	seg_byte_size = (dev_info->end - dev_info->start + 1);
 	set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 76814f3e898a9..0ae0c83ef8790 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -343,7 +343,7 @@ static int __init xpram_setup_blkdev(void)
 			goto out;
 		}
 		blk_queue_make_request(xpram_queues[i], xpram_make_request);
-		blk_queue_hardsect_size(xpram_queues[i], 4096);
+		blk_queue_logical_block_size(xpram_queues[i], 4096);
 	}
 
 	/*
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1e79676759805..47ff695255ea6 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -222,7 +222,7 @@ tapeblock_setup_device(struct tape_device * device)
 	if (rc)
 		goto cleanup_queue;
 
-	blk_queue_hardsect_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
+	blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
 	blk_queue_max_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC);
 	blk_queue_max_phys_segments(blkdat->request_queue, -1L);
 	blk_queue_max_hw_segments(blkdat->request_queue, -1L);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 40d2860f235a3..bcf3bd40bbd5f 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1510,7 +1510,7 @@ sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer)
 		 */
 		sector_size = 512;
 	}
-	blk_queue_hardsect_size(sdp->request_queue, sector_size);
+	blk_queue_logical_block_size(sdp->request_queue, sector_size);
 
 	{
 		char cap_str_2[10], cap_str_10[10];
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index fddba53c7fe51..cd350dfc1216f 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -727,7 +727,7 @@ static void get_sectorsize(struct scsi_cd *cd)
 	}
 
 	queue = cd->device->request_queue;
-	blk_queue_hardsect_size(queue, sector_size);
+	blk_queue_logical_block_size(queue, sector_size);
 
 	return;
 }
diff --git a/fs/bio.c b/fs/bio.c
index 81dc93e72535f..4445c38217304 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 sector_t bio_sector_offset(struct bio *bio, unsigned short index,
 			   unsigned int offset)
 {
-	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+	unsigned int sector_sz;
 	struct bio_vec *bv;
 	sector_t sectors;
 	int i;
 
+	sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
 	sectors = 0;
 
 	if (index >= bio->bi_idx)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a85fe310fc6fe..a29b4dcc1bca2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Size cannot be smaller than the size supported by the device */
-	if (size < bdev_hardsect_size(bdev))
+	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
 	/* Don't change the size if it is same as current */
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
 
 int sb_min_blocksize(struct super_block *sb, int size)
 {
-	int minsize = bdev_hardsect_size(sb->s_bdev);
+	int minsize = bdev_logical_block_size(sb->s_bdev);
 	if (size < minsize)
 		size = minsize;
 	return sb_set_blocksize(sb, size);
@@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change);
 
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
-	unsigned bsize = bdev_hardsect_size(bdev);
+	unsigned bsize = bdev_logical_block_size(bdev);
 
 	bdev->bd_inode->i_size = size;
 	while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/buffer.c b/fs/buffer.c
index aed297739eb07..36e2bbc60ec75 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
 	/* Size must be multiple of hard sectorsize */
-	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
+	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
 					size);
-		printk(KERN_ERR "hardsect size: %d\n",
-					bdev_hardsect_size(bdev));
+		printk(KERN_ERR "logical block size: %d\n",
+					bdev_logical_block_size(bdev));
 
 		dump_stack();
 		return NULL;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bbc2050b..8b10b87dc01a2 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		rw = WRITE_ODIRECT;
 
 	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
+		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
 
 	if (offset & blocksize_mask) {
 		if (bdev)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c39..acbb94fdf9037 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	hblock = bdev_hardsect_size(sb->s_bdev);
+	hblock = bdev_logical_block_size(sb->s_bdev);
 	if (sb->s_blocksize != blocksize) {
 		/*
 		 * Make sure the blocksize for the filesystem is larger
@@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	}
 
 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT3-fs: blocksize too small for journal device.\n");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222a..a30549f7a3053 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	}
 
 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT4-fs: blocksize too small for journal device.\n");
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea7531..a3b2ac989fc35 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
 	}
 
 	/* Set up the buffer cache and SB for real */
-	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+	if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
 		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too small for device "
 		       "block size (%u)\n",
-		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+		       sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
 		goto out;
 	}
 	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2b..a971d24e10ce5 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 	struct super_block *sb = sdp->sd_vfs;
 	struct block_device *bdev = sb->s_bdev;
 	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
-					   bdev_hardsect_size(sb->s_bdev);
+					   bdev_logical_block_size(sb->s_bdev);
 	u64 blk;
 	sector_t start = 0;
 	sector_t nr_sects = 0;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3be4aa91..a91f15b8673c3 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
 	if (sb->s_blocksize != blocksize) {
-		int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
+		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
 		if (blocksize < hw_blocksize) {
 			printk(KERN_ERR
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951dcd4a6e..6aa7c47135366 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>	/* For bdev_hardsect_size(). */
+#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
@@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto err_out_now;
 
 	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
-	if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
 		if (!silent)
 			ntfs_error(sb, "Device has unsupported sector size "
 					"(%i).  The maximum supported sector "
 					"size on this architecture is %lu "
 					"bytes.",
-					bdev_hardsect_size(sb->s_bdev),
+					bdev_logical_block_size(sb->s_bdev),
 					PAGE_CACHE_SIZE);
 		goto err_out_now;
 	}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85eceab3760..09cc25d04611b 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 	bdevname(reg->hr_bdev, reg->hr_dev_name);
 
-	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	sectsize = bdev_logical_block_size(reg->hr_bdev);
 	if (sectsize != reg->hr_block_bytes) {
 		mlog(ML_ERROR,
 		     "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9d37e08..5c6163f550393 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 	*bh = NULL;
 
 	/* may be > 512 */
-	*sector_size = bdev_hardsect_size(sb->s_bdev);
+	*sector_size = bdev_logical_block_size(sb->s_bdev);
 	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
 		mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
 		     *sector_size, OCFS2_MAX_BLOCKSIZE);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 46297683cd34b..fc71aab084603 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	Sector sect;
 
 	res = 0;
-	blocksize = bdev_hardsect_size(bdev);
+	blocksize = bdev_logical_block_size(bdev);
 	if (blocksize <= 0)
 		goto out_exit;
 	i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 796511886f285..0028d2ef0662b 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	Sector sect;
 	unsigned char *data;
 	u32 this_sector, this_size;
-	int sector_size = bdev_hardsect_size(bdev) / 512;
+	int sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -415,7 +415,7 @@ static struct {
  
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_hardsect_size(bdev) / 512;
+	int sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc855a45..0ba44107d8f10 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 	} else {
-		uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+		uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 		if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
 			if (!silent)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a9f2b53..1418b916fc275 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
 	struct block_device	*bdev)
 {
 	return xfs_setsize_buftarg_flags(btp,
-			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
+			PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
 }
 
 int
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56ce53fce72ee..872b78b7a1014 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -391,7 +391,7 @@ struct request_queue
 	unsigned int		max_hw_sectors;
 	unsigned short		max_phys_segments;
 	unsigned short		max_hw_segments;
-	unsigned short		hardsect_size;
+	unsigned short		logical_block_size;
 	unsigned int		max_segment_size;
 
 	unsigned long		seg_boundary_mask;
@@ -901,7 +901,7 @@ extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
-extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
+extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
@@ -988,19 +988,19 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
-static inline int queue_hardsect_size(struct request_queue *q)
+static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
 
-	if (q && q->hardsect_size)
-		retval = q->hardsect_size;
+	if (q && q->logical_block_size)
+		retval = q->logical_block_size;
 
 	return retval;
 }
 
-static inline int bdev_hardsect_size(struct block_device *bdev)
+static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
-	return queue_hardsect_size(bdev_get_queue(bdev));
+	return queue_logical_block_size(bdev_get_queue(bdev));
 }
 
 static inline int queue_dma_alignment(struct request_queue *q)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ded2d7c42668c..49c2362977fde 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -149,7 +149,7 @@ struct io_restrictions {
 	unsigned max_hw_sectors;
 	unsigned max_sectors;
 	unsigned max_segment_size;
-	unsigned short hardsect_size;
+	unsigned short logical_block_size;
 	unsigned short max_hw_segments;
 	unsigned short max_phys_segments;
 	unsigned char no_cluster; /* inverted so that 0 is default */
-- 
GitLab


From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:50 -0400
Subject: [PATCH 1259/2198] block: Use accessor functions for queue limits

Convert all external users of queue limits to using wrapper functions
instead of poking the request queue variables directly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c            |  8 ++++----
 block/blk-core.c               | 16 +++++++--------
 block/blk-map.c                |  4 ++--
 block/blk-merge.c              | 27 +++++++++++++------------
 block/blk-settings.c           | 15 +++++++++++---
 block/blk-sysfs.c              |  8 ++++----
 block/compat_ioctl.c           |  2 +-
 block/ioctl.c                  | 10 +++++-----
 block/scsi_ioctl.c             |  8 ++++----
 drivers/block/pktcdvd.c        |  6 ++++--
 drivers/cdrom/cdrom.c          |  4 ++--
 drivers/md/dm-table.c          | 28 +++++++++++++-------------
 drivers/md/linear.c            |  2 +-
 drivers/md/multipath.c         |  4 ++--
 drivers/md/raid0.c             |  2 +-
 drivers/md/raid1.c             |  4 ++--
 drivers/md/raid10.c            |  8 ++++----
 drivers/md/raid5.c             |  4 ++--
 drivers/scsi/sg.c              | 15 +++++++-------
 drivers/scsi/st.c              |  4 ++--
 drivers/usb/storage/scsiglue.c |  4 ++--
 fs/bio.c                       | 19 +++++++++---------
 include/linux/bio.h            |  2 +-
 include/linux/blkdev.h         | 36 ++++++++++++++++++++++++++++++++++
 mm/bounce.c                    |  4 ++--
 25 files changed, 147 insertions(+), 97 deletions(-)

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 0d98054cdbd77..30022b4e2f630 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -388,10 +388,10 @@ int blkdev_issue_discard(struct block_device *bdev,
 
 		bio->bi_sector = sector;
 
-		if (nr_sects > q->max_hw_sectors) {
-			bio->bi_size = q->max_hw_sectors << 9;
-			nr_sects -= q->max_hw_sectors;
-			sector += q->max_hw_sectors;
+		if (nr_sects > queue_max_hw_sectors(q)) {
+			bio->bi_size = queue_max_hw_sectors(q) << 9;
+			nr_sects -= queue_max_hw_sectors(q);
+			sector += queue_max_hw_sectors(q);
 		} else {
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 59c4af5231121..7a4c40184a64c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1437,11 +1437,11 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		if (unlikely(nr_sectors > q->max_hw_sectors)) {
+		if (unlikely(nr_sectors > queue_max_hw_sectors(q))) {
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-				bdevname(bio->bi_bdev, b),
-				bio_sectors(bio),
-				q->max_hw_sectors);
+			       bdevname(bio->bi_bdev, b),
+			       bio_sectors(bio),
+			       queue_max_hw_sectors(q));
 			goto end_io;
 		}
 
@@ -1608,8 +1608,8 @@ EXPORT_SYMBOL(submit_bio);
  */
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
-	if (blk_rq_sectors(rq) > q->max_sectors ||
-	    blk_rq_bytes(rq) > q->max_hw_sectors << 9) {
+	if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
+	    blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
 	}
@@ -1621,8 +1621,8 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 	 * limitation.
 	 */
 	blk_recalc_rq_segments(rq);
-	if (rq->nr_phys_segments > q->max_phys_segments ||
-	    rq->nr_phys_segments > q->max_hw_segments) {
+	if (rq->nr_phys_segments > queue_max_phys_segments(q) ||
+	    rq->nr_phys_segments > queue_max_hw_segments(q)) {
 		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
 		return -EIO;
 	}
diff --git a/block/blk-map.c b/block/blk-map.c
index ef2492adca7e3..9083cf0180cc8 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -115,7 +115,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 	struct bio *bio = NULL;
 	int ret;
 
-	if (len > (q->max_hw_sectors << 9))
+	if (len > (queue_max_hw_sectors(q) << 9))
 		return -EINVAL;
 	if (!len)
 		return -EINVAL;
@@ -292,7 +292,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	struct bio *bio;
 	int ret;
 
-	if (len > (q->max_hw_sectors << 9))
+	if (len > (queue_max_hw_sectors(q) << 9))
 		return -EINVAL;
 	if (!len || !kbuf)
 		return -EINVAL;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4974dd5767e51..39ce64432ba6d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -32,11 +32,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 			 * never considered part of another segment, since that
 			 * might change with the bounce page.
 			 */
-			high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+			high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q);
 			if (high || highprv)
 				goto new_segment;
 			if (cluster) {
-				if (seg_size + bv->bv_len > q->max_segment_size)
+				if (seg_size + bv->bv_len
+				    > queue_max_segment_size(q))
 					goto new_segment;
 				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
 					goto new_segment;
@@ -91,7 +92,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 		return 0;
 
 	if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
-	    q->max_segment_size)
+	    queue_max_segment_size(q))
 		return 0;
 
 	if (!bio_has_data(bio))
@@ -134,7 +135,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 		int nbytes = bvec->bv_len;
 
 		if (bvprv && cluster) {
-			if (sg->length + nbytes > q->max_segment_size)
+			if (sg->length + nbytes > queue_max_segment_size(q))
 				goto new_segment;
 
 			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
@@ -205,8 +206,8 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
-	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) ||
+	    req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -227,9 +228,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 	unsigned short max_sectors;
 
 	if (unlikely(blk_pc_request(req)))
-		max_sectors = q->max_hw_sectors;
+		max_sectors = queue_max_hw_sectors(q);
 	else
-		max_sectors = q->max_sectors;
+		max_sectors = queue_max_sectors(q);
 
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -251,9 +252,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 	unsigned short max_sectors;
 
 	if (unlikely(blk_pc_request(req)))
-		max_sectors = q->max_hw_sectors;
+		max_sectors = queue_max_hw_sectors(q);
 	else
-		max_sectors = q->max_sectors;
+		max_sectors = queue_max_sectors(q);
 
 
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
@@ -287,7 +288,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	/*
 	 * Will it become too large?
 	 */
-	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors)
+	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q))
 		return 0;
 
 	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -299,10 +300,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		total_phys_segments--;
 	}
 
-	if (total_phys_segments > q->max_phys_segments)
+	if (total_phys_segments > queue_max_phys_segments(q))
 		return 0;
 
-	if (total_phys_segments > q->max_hw_segments)
+	if (total_phys_segments > queue_max_hw_segments(q))
 		return 0;
 
 	/* Merge is OK... */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 15c3164537b8f..0b32f984eed24 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -219,6 +219,15 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
 }
 EXPORT_SYMBOL(blk_queue_max_sectors);
 
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
+{
+	if (BLK_DEF_MAX_SECTORS > max_sectors)
+		q->max_hw_sectors = BLK_DEF_MAX_SECTORS;
+	else
+		q->max_hw_sectors = max_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_hw_sectors);
+
 /**
  * blk_queue_max_phys_segments - set max phys segments for a request for this queue
  * @q:  the request queue for the device
@@ -395,11 +404,11 @@ int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size)
 {
-	if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
+	if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2)
 		return -EINVAL;
 	/* make room for appending the drain */
-	--q->max_hw_segments;
-	--q->max_phys_segments;
+	blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1);
+	blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
 	q->dma_drain_needed = dma_drain_needed;
 	q->dma_drain_buffer = buf;
 	q->dma_drain_size = size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 13d38b7e4d0fa..142a4acddd432 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -95,7 +95,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 {
-	int max_sectors_kb = q->max_sectors >> 1;
+	int max_sectors_kb = queue_max_sectors(q) >> 1;
 
 	return queue_var_show(max_sectors_kb, (page));
 }
@@ -109,7 +109,7 @@ static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
 	unsigned long max_sectors_kb,
-			max_hw_sectors_kb = q->max_hw_sectors >> 1,
+		max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
 			page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
 	ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
 
@@ -117,7 +117,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 		return -EINVAL;
 
 	spin_lock_irq(q->queue_lock);
-	q->max_sectors = max_sectors_kb << 1;
+	blk_queue_max_sectors(q, max_sectors_kb << 1);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;
@@ -125,7 +125,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
 {
-	int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+	int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
 
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9eaa1940273a9..df18a156d011b 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -766,7 +766,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
-					 bdev_get_queue(bdev)->max_sectors);
+					 queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET: /* compatible, but no compat_ptr (!) */
 	case BLKFRASET:
 		if (!capable(CAP_SYS_ADMIN))
diff --git a/block/ioctl.c b/block/ioctl.c
index 7aa97f65da82e..500e4c73cc52b 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -152,10 +152,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 		bio->bi_private = &wait;
 		bio->bi_sector = start;
 
-		if (len > q->max_hw_sectors) {
-			bio->bi_size = q->max_hw_sectors << 9;
-			len -= q->max_hw_sectors;
-			start += q->max_hw_sectors;
+		if (len > queue_max_hw_sectors(q)) {
+			bio->bi_size = queue_max_hw_sectors(q) << 9;
+			len -= queue_max_hw_sectors(q);
+			start += queue_max_hw_sectors(q);
 		} else {
 			bio->bi_size = len << 9;
 			len = 0;
@@ -313,7 +313,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKSSZGET: /* get block device hardware sector size */
 		return put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
-		return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
+		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET:
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a9670dd4b5de0..5f8e798ede4ee 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -75,7 +75,7 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
 
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-	unsigned val = min(q->sg_reserved_size, q->max_sectors << 9);
+	unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
 
 	return put_user(val, p);
 }
@@ -89,8 +89,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
 	if (size < 0)
 		return -EINVAL;
-	if (size > (q->max_sectors << 9))
-		size = q->max_sectors << 9;
+	if (size > (queue_max_sectors(q) << 9))
+		size = queue_max_sectors(q) << 9;
 
 	q->sg_reserved_size = size;
 	return 0;
@@ -264,7 +264,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	if (hdr->cmd_len > BLK_MAX_CDB)
 		return -EINVAL;
 
-	if (hdr->dxfer_len > (q->max_hw_sectors << 9))
+	if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
 		return -EIO;
 
 	if (hdr->dxfer_len)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 293f5858921d4..d57f11759480c 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -991,13 +991,15 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
  */
 static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
 {
-	if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) {
+	if ((pd->settings.size << 9) / CD_FRAMESIZE
+	    <= queue_max_phys_segments(q)) {
 		/*
 		 * The cdrom device can handle one segment/frame
 		 */
 		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
 		return 0;
-	} else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) {
+	} else if ((pd->settings.size << 9) / PAGE_SIZE
+		   <= queue_max_phys_segments(q)) {
 		/*
 		 * We can handle this case at the expense of some extra memory
 		 * copies during write operations
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index cceace61ef286..71d1b9bab70b5 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2101,8 +2101,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		nr = nframes;
 		if (cdi->cdda_method == CDDA_BPC_SINGLE)
 			nr = 1;
-		if (nr * CD_FRAMESIZE_RAW > (q->max_sectors << 9))
-			nr = (q->max_sectors << 9) / CD_FRAMESIZE_RAW;
+		if (nr * CD_FRAMESIZE_RAW > (queue_max_sectors(q) << 9))
+			nr = (queue_max_sectors(q) << 9) / CD_FRAMESIZE_RAW;
 
 		len = nr * CD_FRAMESIZE_RAW;
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 65e2d9759857d..e9a73bb242b09 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -510,7 +510,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 *        combine_restrictions_low()
 	 */
 	rs->max_sectors =
-		min_not_zero(rs->max_sectors, q->max_sectors);
+		min_not_zero(rs->max_sectors, queue_max_sectors(q));
 
 	/*
 	 * Check if merge fn is supported.
@@ -525,25 +525,25 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 
 	rs->max_phys_segments =
 		min_not_zero(rs->max_phys_segments,
-			     q->max_phys_segments);
+			     queue_max_phys_segments(q));
 
 	rs->max_hw_segments =
-		min_not_zero(rs->max_hw_segments, q->max_hw_segments);
+		min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
 
 	rs->logical_block_size = max(rs->logical_block_size,
 				     queue_logical_block_size(q));
 
 	rs->max_segment_size =
-		min_not_zero(rs->max_segment_size, q->max_segment_size);
+		min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
 
 	rs->max_hw_sectors =
-		min_not_zero(rs->max_hw_sectors, q->max_hw_sectors);
+		min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
 
 	rs->seg_boundary_mask =
 		min_not_zero(rs->seg_boundary_mask,
-			     q->seg_boundary_mask);
+			     queue_segment_boundary(q));
 
-	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn);
+	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
 
 	rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 }
@@ -914,13 +914,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	 * restrictions.
 	 */
 	blk_queue_max_sectors(q, t->limits.max_sectors);
-	q->max_phys_segments = t->limits.max_phys_segments;
-	q->max_hw_segments = t->limits.max_hw_segments;
-	q->logical_block_size = t->limits.logical_block_size;
-	q->max_segment_size = t->limits.max_segment_size;
-	q->max_hw_sectors = t->limits.max_hw_sectors;
-	q->seg_boundary_mask = t->limits.seg_boundary_mask;
-	q->bounce_pfn = t->limits.bounce_pfn;
+	blk_queue_max_phys_segments(q, t->limits.max_phys_segments);
+	blk_queue_max_hw_segments(q, t->limits.max_hw_segments);
+	blk_queue_logical_block_size(q, t->limits.logical_block_size);
+	blk_queue_max_segment_size(q, t->limits.max_segment_size);
+	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
+	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
+	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 7a36e38393a1e..64f1f3e046e08 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->num_sectors = rdev->sectors;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 41ced0cbe823c..4ee31aa13c407 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		 * merge_bvec_fn will be involved in multipath.)
 		 */
 			if (q->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			    queue_max_sectors(q) > (PAGE_SIZE>>9))
 				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			conf->working_disks++;
@@ -467,7 +467,7 @@ static int multipath_run (mddev_t *mddev)
 		 * violating it, not that we ever expect a device with
 		 * a merge_bvec_fn to be involved in multipath */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		if (!test_bit(Faulty, &rdev->flags))
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d7559be553..925507e7d673b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -144,7 +144,7 @@ static int create_strip_zones (mddev_t *mddev)
 		 */
 
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		if (!smallest || (rdev1->sectors < smallest->sectors))
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36df9109cde18..e23758b4a34e2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 * a one page request is never in violation.
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
@@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 499620afb44b2..750550c1166fe 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 * a one page request is never in violation.
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				mddev->queue->max_sectors = (PAGE_SIZE>>9);
+			    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -2145,8 +2145,8 @@ static int run(mddev_t *mddev)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			mddev->queue->max_sectors = (PAGE_SIZE>>9);
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4616bc3a6e713..7970dc8c522eb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3463,10 +3463,10 @@ static int bio_fits_rdev(struct bio *bi)
 {
 	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-	if ((bi->bi_size>>9) > q->max_sectors)
+	if ((bi->bi_size>>9) > queue_max_sectors(q))
 		return 0;
 	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > q->max_phys_segments)
+	if (bi->bi_phys_segments > queue_max_phys_segments(q))
 		return 0;
 
 	if (q->merge_bvec_fn)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 0fc2c0ae7691b..9bd407fa98e4f 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -289,8 +289,8 @@ sg_open(struct inode *inode, struct file *filp)
 	if (list_empty(&sdp->sfds)) {	/* no existing opens on this device */
 		sdp->sgdebug = 0;
 		q = sdp->device->request_queue;
-		sdp->sg_tablesize = min(q->max_hw_segments,
-					q->max_phys_segments);
+		sdp->sg_tablesize = min(queue_max_hw_segments(q),
+					queue_max_phys_segments(q));
 	}
 	if ((sfp = sg_add_sfp(sdp, dev)))
 		filp->private_data = sfp;
@@ -909,7 +909,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
                 if (val < 0)
                         return -EINVAL;
 		val = min_t(int, val,
-				sdp->device->request_queue->max_sectors * 512);
+			    queue_max_sectors(sdp->device->request_queue) * 512);
 		if (val != sfp->reserve.bufflen) {
 			if (sg_res_in_use(sfp) || sfp->mmap_called)
 				return -EBUSY;
@@ -919,7 +919,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 		return 0;
 	case SG_GET_RESERVED_SIZE:
 		val = min_t(int, sfp->reserve.bufflen,
-				sdp->device->request_queue->max_sectors * 512);
+			    queue_max_sectors(sdp->device->request_queue) * 512);
 		return put_user(val, ip);
 	case SG_SET_COMMAND_Q:
 		result = get_user(val, ip);
@@ -1059,7 +1059,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 			return -ENODEV;
 		return scsi_ioctl(sdp->device, cmd_in, p);
 	case BLKSECTGET:
-		return put_user(sdp->device->request_queue->max_sectors * 512,
+		return put_user(queue_max_sectors(sdp->device->request_queue) * 512,
 				ip);
 	case BLKTRACESETUP:
 		return blk_trace_setup(sdp->device->request_queue,
@@ -1377,7 +1377,8 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
 	sdp->device = scsidp;
 	INIT_LIST_HEAD(&sdp->sfds);
 	init_waitqueue_head(&sdp->o_excl_wait);
-	sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments);
+	sdp->sg_tablesize = min(queue_max_hw_segments(q),
+				queue_max_phys_segments(q));
 	sdp->index = k;
 	kref_init(&sdp->d_ref);
 
@@ -2055,7 +2056,7 @@ sg_add_sfp(Sg_device * sdp, int dev)
 		sg_big_buff = def_reserved_size;
 
 	bufflen = min_t(int, sg_big_buff,
-			sdp->device->request_queue->max_sectors * 512);
+			queue_max_sectors(sdp->device->request_queue) * 512);
 	sg_build_reserve(sfp, bufflen);
 	SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp:   bufflen=%d, k_use_sg=%d\n",
 			   sfp->reserve.bufflen, sfp->reserve.k_use_sg));
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 8681b708344f0..89bd438e1fe30 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3983,8 +3983,8 @@ static int st_probe(struct device *dev)
 		return -ENODEV;
 	}
 
-	i = min(SDp->request_queue->max_hw_segments,
-		SDp->request_queue->max_phys_segments);
+	i = min(queue_max_hw_segments(SDp->request_queue),
+		queue_max_phys_segments(SDp->request_queue));
 	if (st_max_sg_segs < i)
 		i = st_max_sg_segs;
 	buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i);
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 4ca3b58606437..cfa26d56ce60c 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -132,7 +132,7 @@ static int slave_configure(struct scsi_device *sdev)
 
 		if (us->fflags & US_FL_MAX_SECTORS_MIN)
 			max_sectors = PAGE_CACHE_SIZE >> 9;
-		if (sdev->request_queue->max_sectors > max_sectors)
+		if (queue_max_sectors(sdev->request_queue) > max_sectors)
 			blk_queue_max_sectors(sdev->request_queue,
 					      max_sectors);
 	} else if (sdev->type == TYPE_TAPE) {
@@ -483,7 +483,7 @@ static ssize_t show_max_sectors(struct device *dev, struct device_attribute *att
 {
 	struct scsi_device *sdev = to_scsi_device(dev);
 
-	return sprintf(buf, "%u\n", sdev->request_queue->max_sectors);
+	return sprintf(buf, "%u\n", queue_max_sectors(sdev->request_queue));
 }
 
 /* Input routine for the sysfs max_sectors file */
diff --git a/fs/bio.c b/fs/bio.c
index 4445c38217304..ab423a1024ab8 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
 	struct request_queue *q = bdev_get_queue(bdev);
 	int nr_pages;
 
-	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > q->max_phys_segments)
-		nr_pages = q->max_phys_segments;
-	if (nr_pages > q->max_hw_segments)
-		nr_pages = q->max_hw_segments;
+	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (nr_pages > queue_max_phys_segments(q))
+		nr_pages = queue_max_phys_segments(q);
+	if (nr_pages > queue_max_hw_segments(q))
+		nr_pages = queue_max_hw_segments(q);
 
 	return nr_pages;
 }
@@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_phys_segments >= q->max_hw_segments) {
+	while (bio->bi_phys_segments >= queue_max_phys_segments(q)
+	       || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
 
 		if (retried_segments)
 			return 0;
@@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 		    unsigned int len, unsigned int offset)
 {
-	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
+	return __bio_add_page(q, bio, page, len, offset,
+			      queue_max_hw_sectors(q));
 }
 
 /**
@@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 		 unsigned int offset)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
+	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
 
 struct bio_map_data {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d30ec6f30dd7e..12737be586011 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -279,7 +279,7 @@ static inline int bio_has_allocated_vec(struct bio *bio)
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
 	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
-	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
+	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
 #define BIO_SEG_BOUNDARY(q, b1, b2) \
 	BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 872b78b7a1014..29b48f7b4ba82 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -898,6 +898,7 @@ extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
+extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
@@ -988,6 +989,41 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
+static inline unsigned long queue_bounce_pfn(struct request_queue *q)
+{
+	return q->bounce_pfn;
+}
+
+static inline unsigned long queue_segment_boundary(struct request_queue *q)
+{
+	return q->seg_boundary_mask;
+}
+
+static inline unsigned int queue_max_sectors(struct request_queue *q)
+{
+	return q->max_sectors;
+}
+
+static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
+{
+	return q->max_hw_sectors;
+}
+
+static inline unsigned short queue_max_hw_segments(struct request_queue *q)
+{
+	return q->max_hw_segments;
+}
+
+static inline unsigned short queue_max_phys_segments(struct request_queue *q)
+{
+	return q->max_phys_segments;
+}
+
+static inline unsigned int queue_max_segment_size(struct request_queue *q)
+{
+	return q->max_segment_size;
+}
+
 static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8f..8dcd4315e01c0 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -192,7 +192,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		/*
 		 * is destination page below bounce pfn?
 		 */
-		if (page_to_pfn(page) <= q->bounce_pfn)
+		if (page_to_pfn(page) <= queue_bounce_pfn(q))
 			continue;
 
 		/*
@@ -284,7 +284,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	 * don't waste time iterating over bio segments
 	 */
 	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (q->bounce_pfn >= blk_max_pfn)
+		if (queue_bounce_pfn(q) >= blk_max_pfn)
 			return;
 		pool = page_pool;
 	} else {
-- 
GitLab


From 025146e13b63483add912706c101fb0fb6f015cc Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:51 -0400
Subject: [PATCH 1260/2198] block: Move queue limits to an embedded struct

To accommodate stacking drivers that do not have an associated request
queue we're moving the limits to a separate, embedded structure.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 55 ++++++++++++++++++++++++++----------------
 include/linux/blkdev.h | 44 +++++++++++++++++++--------------
 2 files changed, 60 insertions(+), 39 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0b32f984eed24..b0f547cecfb87 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -179,16 +179,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 	 */
 	if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
 		dma = 1;
-	q->bounce_pfn = max_low_pfn;
+	q->limits.bounce_pfn = max_low_pfn;
 #else
 	if (b_pfn < blk_max_low_pfn)
 		dma = 1;
-	q->bounce_pfn = b_pfn;
+	q->limits.bounce_pfn = b_pfn;
 #endif
 	if (dma) {
 		init_emergency_isa_pool();
 		q->bounce_gfp = GFP_NOIO | GFP_DMA;
-		q->bounce_pfn = b_pfn;
+		q->limits.bounce_pfn = b_pfn;
 	}
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
@@ -211,10 +211,10 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
 	}
 
 	if (BLK_DEF_MAX_SECTORS > max_sectors)
-		q->max_hw_sectors = q->max_sectors = max_sectors;
+		q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors;
 	else {
-		q->max_sectors = BLK_DEF_MAX_SECTORS;
-		q->max_hw_sectors = max_sectors;
+		q->limits.max_sectors = BLK_DEF_MAX_SECTORS;
+		q->limits.max_hw_sectors = max_sectors;
 	}
 }
 EXPORT_SYMBOL(blk_queue_max_sectors);
@@ -222,9 +222,9 @@ EXPORT_SYMBOL(blk_queue_max_sectors);
 void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
 {
 	if (BLK_DEF_MAX_SECTORS > max_sectors)
-		q->max_hw_sectors = BLK_DEF_MAX_SECTORS;
+		q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS;
 	else
-		q->max_hw_sectors = max_sectors;
+		q->limits.max_hw_sectors = max_sectors;
 }
 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
 
@@ -247,7 +247,7 @@ void blk_queue_max_phys_segments(struct request_queue *q,
 		       __func__, max_segments);
 	}
 
-	q->max_phys_segments = max_segments;
+	q->limits.max_phys_segments = max_segments;
 }
 EXPORT_SYMBOL(blk_queue_max_phys_segments);
 
@@ -271,7 +271,7 @@ void blk_queue_max_hw_segments(struct request_queue *q,
 		       __func__, max_segments);
 	}
 
-	q->max_hw_segments = max_segments;
+	q->limits.max_hw_segments = max_segments;
 }
 EXPORT_SYMBOL(blk_queue_max_hw_segments);
 
@@ -292,7 +292,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 		       __func__, max_size);
 	}
 
-	q->max_segment_size = max_size;
+	q->limits.max_segment_size = max_size;
 }
 EXPORT_SYMBOL(blk_queue_max_segment_size);
 
@@ -308,7 +308,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
  **/
 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
-	q->logical_block_size = size;
+	q->limits.logical_block_size = size;
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 
@@ -325,14 +325,27 @@ EXPORT_SYMBOL(blk_queue_logical_block_size);
 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 {
 	/* zero is "infinity" */
-	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
-	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
-	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask);
-
-	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
-	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
-	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
-	t->logical_block_size = max(t->logical_block_size, b->logical_block_size);
+	t->limits.max_sectors = min_not_zero(queue_max_sectors(t),
+					     queue_max_sectors(b));
+
+	t->limits.max_hw_sectors = min_not_zero(queue_max_hw_sectors(t),
+						queue_max_hw_sectors(b));
+
+	t->limits.seg_boundary_mask = min_not_zero(queue_segment_boundary(t),
+						   queue_segment_boundary(b));
+
+	t->limits.max_phys_segments = min_not_zero(queue_max_phys_segments(t),
+						   queue_max_phys_segments(b));
+
+	t->limits.max_hw_segments = min_not_zero(queue_max_hw_segments(t),
+						 queue_max_hw_segments(b));
+
+	t->limits.max_segment_size = min_not_zero(queue_max_segment_size(t),
+						  queue_max_segment_size(b));
+
+	t->limits.logical_block_size = max(queue_logical_block_size(t),
+					   queue_logical_block_size(b));
+
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
 	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
@@ -430,7 +443,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
 		       __func__, mask);
 	}
 
-	q->seg_boundary_mask = mask;
+	q->limits.seg_boundary_mask = mask;
 }
 EXPORT_SYMBOL(blk_queue_segment_boundary);
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 29b48f7b4ba82..b7bb6fdba12c2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -307,6 +307,21 @@ struct blk_cmd_filter {
 	struct kobject kobj;
 };
 
+struct queue_limits {
+	unsigned long		bounce_pfn;
+	unsigned long		seg_boundary_mask;
+
+	unsigned int		max_hw_sectors;
+	unsigned int		max_sectors;
+	unsigned int		max_segment_size;
+
+	unsigned short		logical_block_size;
+	unsigned short		max_hw_segments;
+	unsigned short		max_phys_segments;
+
+	unsigned char		no_cluster;
+};
+
 struct request_queue
 {
 	/*
@@ -358,7 +373,6 @@ struct request_queue
 	/*
 	 * queue needs bounce pages for pages above this limit
 	 */
-	unsigned long		bounce_pfn;
 	gfp_t			bounce_gfp;
 
 	/*
@@ -387,14 +401,6 @@ struct request_queue
 	unsigned int		nr_congestion_off;
 	unsigned int		nr_batching;
 
-	unsigned int		max_sectors;
-	unsigned int		max_hw_sectors;
-	unsigned short		max_phys_segments;
-	unsigned short		max_hw_segments;
-	unsigned short		logical_block_size;
-	unsigned int		max_segment_size;
-
-	unsigned long		seg_boundary_mask;
 	void			*dma_drain_buffer;
 	unsigned int		dma_drain_size;
 	unsigned int		dma_pad_mask;
@@ -410,6 +416,8 @@ struct request_queue
 	struct timer_list	timeout;
 	struct list_head	timeout_list;
 
+	struct queue_limits	limits;
+
 	/*
 	 * sg stuff
 	 */
@@ -991,45 +999,45 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 static inline unsigned long queue_bounce_pfn(struct request_queue *q)
 {
-	return q->bounce_pfn;
+	return q->limits.bounce_pfn;
 }
 
 static inline unsigned long queue_segment_boundary(struct request_queue *q)
 {
-	return q->seg_boundary_mask;
+	return q->limits.seg_boundary_mask;
 }
 
 static inline unsigned int queue_max_sectors(struct request_queue *q)
 {
-	return q->max_sectors;
+	return q->limits.max_sectors;
 }
 
 static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
 {
-	return q->max_hw_sectors;
+	return q->limits.max_hw_sectors;
 }
 
 static inline unsigned short queue_max_hw_segments(struct request_queue *q)
 {
-	return q->max_hw_segments;
+	return q->limits.max_hw_segments;
 }
 
 static inline unsigned short queue_max_phys_segments(struct request_queue *q)
 {
-	return q->max_phys_segments;
+	return q->limits.max_phys_segments;
 }
 
 static inline unsigned int queue_max_segment_size(struct request_queue *q)
 {
-	return q->max_segment_size;
+	return q->limits.max_segment_size;
 }
 
 static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
 
-	if (q && q->logical_block_size)
-		retval = q->logical_block_size;
+	if (q && q->limits.logical_block_size)
+		retval = q->limits.logical_block_size;
 
 	return retval;
 }
-- 
GitLab


From cd43e26f071524647e660706b784ebcbefbd2e44 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:52 -0400
Subject: [PATCH 1261/2198] block: Expose stacked device queues in sysfs

Currently stacking devices do not have a queue directory in sysfs.
However, many of the I/O characteristics like sector size, maximum
request size, etc. are queue properties.

This patch enables the queue directory for MD/DM devices.  The elevator
code has been modified to deal with queues that do not have an I/O
scheduler.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-sysfs.c |  6 +++---
 block/elevator.c  | 13 ++++++++++++-
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 142a4acddd432..3ccdadb8e2044 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -395,9 +395,6 @@ int blk_register_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return -ENXIO;
 
-	if (!q->request_fn)
-		return 0;
-
 	ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
 			  "%s", "queue");
 	if (ret < 0)
@@ -405,6 +402,9 @@ int blk_register_queue(struct gendisk *disk)
 
 	kobject_uevent(&q->kobj, KOBJ_ADD);
 
+	if (!q->request_fn)
+		return 0;
+
 	ret = elv_register_queue(q);
 	if (ret) {
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
diff --git a/block/elevator.c b/block/elevator.c
index ebee948293ebc..bd78f693fcf28 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -575,6 +575,9 @@ void elv_drain_elevator(struct request_queue *q)
  */
 void elv_quiesce_start(struct request_queue *q)
 {
+	if (!q->elevator)
+		return;
+
 	queue_flag_set(QUEUE_FLAG_ELVSWITCH, q);
 
 	/*
@@ -1050,6 +1053,9 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 	char elevator_name[ELV_NAME_MAX];
 	struct elevator_type *e;
 
+	if (!q->elevator)
+		return count;
+
 	strlcpy(elevator_name, name, sizeof(elevator_name));
 	strstrip(elevator_name);
 
@@ -1073,10 +1079,15 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name,
 ssize_t elv_iosched_show(struct request_queue *q, char *name)
 {
 	struct elevator_queue *e = q->elevator;
-	struct elevator_type *elv = e->elevator_type;
+	struct elevator_type *elv;
 	struct elevator_type *__e;
 	int len = 0;
 
+	if (!q->elevator)
+		return sprintf(name, "none\n");
+
+	elv = e->elevator_type;
+
 	spin_lock(&elv_list_lock);
 	list_for_each_entry(__e, &elv_list, list) {
 		if (!strcmp(elv->elevator_name, __e->elevator_name))
-- 
GitLab


From c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:53 -0400
Subject: [PATCH 1262/2198] block: Export I/O topology for block devices and
 partitions

To support devices with physical block sizes bigger than 512 bytes we
need to ensure proper alignment.  This patch adds support for exposing
I/O topology characteristics as devices are stacked.

  logical_block_size is the smallest unit the device can address.

  physical_block_size indicates the smallest I/O the device can write
  without incurring a read-modify-write penalty.

  The io_min parameter is the smallest preferred I/O size reported by
  the device.  In many cases this is the same as the physical block
  size.  However, the io_min parameter can be scaled up when stacking
  (RAID5 chunk size > physical block size).

  The io_opt characteristic indicates the optimal I/O size reported by
  the device.  This is usually the stripe width for arrays.

  The alignment_offset parameter indicates the number of bytes the start
  of the device/partition is offset from the device's natural alignment.
  Partition tools and MD/DM utilities can use this to pad their offsets
  so filesystems start on proper boundaries.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/ABI/testing/sysfs-block |  59 ++++++++
 block/blk-settings.c                  | 186 ++++++++++++++++++++++++++
 block/blk-sysfs.c                     |  33 +++++
 block/genhd.c                         |  11 ++
 fs/partitions/check.c                 |  10 ++
 include/linux/blkdev.h                |  47 +++++++
 include/linux/genhd.h                 |   1 +
 7 files changed, 347 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block
index 44f52a4f59033..cbbd3e0699453 100644
--- a/Documentation/ABI/testing/sysfs-block
+++ b/Documentation/ABI/testing/sysfs-block
@@ -60,3 +60,62 @@ Description:
 		Indicates whether the block layer should automatically
 		generate checksums for write requests bound for
 		devices that support receiving integrity metadata.
+
+What:		/sys/block/<disk>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system).  This parameter
+		indicates how many bytes the beginning of the device is
+		offset from the disk's natural alignment.
+
+What:		/sys/block/<disk>/<partition>/alignment_offset
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a physical block size that is
+		bigger than the logical block size (for instance a drive
+		with 4KB physical sectors exposing 512-byte logical
+		blocks to the operating system).  This parameter
+		indicates how many bytes the beginning of the partition
+		is offset from the disk's natural alignment.
+
+What:		/sys/block/<disk>/queue/logical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		This is the smallest unit the storage device can
+		address.  It is typically 512 bytes.
+
+What:		/sys/block/<disk>/queue/physical_block_size
+Date:		May 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		This is the smallest unit the storage device can write
+		without resorting to read-modify-write operation.  It is
+		usually the same as the logical block size but may be
+		bigger.  One example is SATA drives with 4KB sectors
+		that expose a 512-byte logical block size to the
+		operating system.
+
+What:		/sys/block/<disk>/queue/minimum_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report a preferred minimum I/O size,
+		which is the smallest request the device can perform
+		without incurring a read-modify-write penalty.  For disk
+		drives this is often the physical block size.  For RAID
+		arrays it is often the stripe chunk size.
+
+What:		/sys/block/<disk>/queue/optimal_io_size
+Date:		April 2009
+Contact:	Martin K. Petersen <martin.petersen@oracle.com>
+Description:
+		Storage devices may report an optimal I/O size, which is
+		the device's preferred unit of receiving I/O.  This is
+		rarely reported for disk drives.  For RAID devices it is
+		usually the stripe width or the internal block size.
diff --git a/block/blk-settings.c b/block/blk-settings.c
index b0f547cecfb87..5649f34adb40d 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -309,9 +309,94 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
 	q->limits.logical_block_size = size;
+
+	if (q->limits.physical_block_size < size)
+		q->limits.physical_block_size = size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
 }
 EXPORT_SYMBOL(blk_queue_logical_block_size);
 
+/**
+ * blk_queue_physical_block_size - set physical block size for the queue
+ * @q:  the request queue for the device
+ * @size:  the physical block size, in bytes
+ *
+ * Description:
+ *   This should be set to the lowest possible sector size that the
+ *   hardware can operate on without reverting to read-modify-write
+ *   operations.
+ */
+void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
+{
+	q->limits.physical_block_size = size;
+
+	if (q->limits.physical_block_size < q->limits.logical_block_size)
+		q->limits.physical_block_size = q->limits.logical_block_size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_physical_block_size);
+
+/**
+ * blk_queue_alignment_offset - set physical block alignment offset
+ * @q:	the request queue for the device
+ * @alignment:	alignment offset in bytes
+ *
+ * Description:
+ *   Some devices are naturally misaligned to compensate for things like
+ *   the legacy DOS partition table 63-sector offset.  Low-level drivers
+ *   should call this function for devices whose first sector is not
+ *   naturally aligned.
+ */
+void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
+{
+	q->limits.alignment_offset =
+		offset & (q->limits.physical_block_size - 1);
+	q->limits.misaligned = 0;
+}
+EXPORT_SYMBOL(blk_queue_alignment_offset);
+
+/**
+ * blk_queue_io_min - set minimum request size for the queue
+ * @q:	the request queue for the device
+ * @io_min:  smallest I/O size in bytes
+ *
+ * Description:
+ *   Some devices have an internal block size bigger than the reported
+ *   hardware sector size.  This function can be used to signal the
+ *   smallest I/O the device can perform without incurring a performance
+ *   penalty.
+ */
+void blk_queue_io_min(struct request_queue *q, unsigned int min)
+{
+	q->limits.io_min = min;
+
+	if (q->limits.io_min < q->limits.logical_block_size)
+		q->limits.io_min = q->limits.logical_block_size;
+
+	if (q->limits.io_min < q->limits.physical_block_size)
+		q->limits.io_min = q->limits.physical_block_size;
+}
+EXPORT_SYMBOL(blk_queue_io_min);
+
+/**
+ * blk_queue_io_opt - set optimal request size for the queue
+ * @q:	the request queue for the device
+ * @io_opt:  optimal request size in bytes
+ *
+ * Description:
+ *   Drivers can call this function to set the preferred I/O request
+ *   size for devices that report such a value.
+ */
+void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
+{
+	q->limits.io_opt = opt;
+}
+EXPORT_SYMBOL(blk_queue_io_opt);
+
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
  */
@@ -357,6 +442,107 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 }
 EXPORT_SYMBOL(blk_queue_stack_limits);
 
+/**
+ * blk_stack_limits - adjust queue_limits for stacked devices
+ * @t:	the stacking driver limits (top)
+ * @bdev:  the underlying queue limits (bottom)
+ * @offset:  offset to beginning of data within component device
+ *
+ * Description:
+ *    Merges two queue_limit structs.  Returns 0 if alignment didn't
+ *    change.  Returns -1 if adding the bottom device caused
+ *    misalignment.
+ */
+int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+		     sector_t offset)
+{
+	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
+	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+
+	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
+					    b->seg_boundary_mask);
+
+	t->max_phys_segments = min_not_zero(t->max_phys_segments,
+					    b->max_phys_segments);
+
+	t->max_hw_segments = min_not_zero(t->max_hw_segments,
+					  b->max_hw_segments);
+
+	t->max_segment_size = min_not_zero(t->max_segment_size,
+					   b->max_segment_size);
+
+	t->logical_block_size = max(t->logical_block_size,
+				    b->logical_block_size);
+
+	t->physical_block_size = max(t->physical_block_size,
+				     b->physical_block_size);
+
+	t->io_min = max(t->io_min, b->io_min);
+	t->no_cluster |= b->no_cluster;
+
+	/* Bottom device offset aligned? */
+	if (offset &&
+	    (offset & (b->physical_block_size - 1)) != b->alignment_offset) {
+		t->misaligned = 1;
+		return -1;
+	}
+
+	/* If top has no alignment offset, inherit from bottom */
+	if (!t->alignment_offset)
+		t->alignment_offset =
+			b->alignment_offset & (b->physical_block_size - 1);
+
+	/* Top device aligned on logical block boundary? */
+	if (t->alignment_offset & (t->logical_block_size - 1)) {
+		t->misaligned = 1;
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * disk_stack_limits - adjust queue limits for stacked drivers
+ * @t:	MD/DM gendisk (top)
+ * @bdev:  the underlying block device (bottom)
+ * @offset:  offset to beginning of data within component device
+ *
+ * Description:
+ *    Merges the limits for two queues.  Returns 0 if alignment
+ *    didn't change.  Returns -1 if adding the bottom device caused
+ *    misalignment.
+ */
+void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+		       sector_t offset)
+{
+	struct request_queue *t = disk->queue;
+	struct request_queue *b = bdev_get_queue(bdev);
+
+	offset += get_start_sect(bdev) << 9;
+
+	if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) {
+		char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
+
+		disk_name(disk, 0, top);
+		bdevname(bdev, bottom);
+
+		printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
+		       top, bottom);
+	}
+
+	if (!t->queue_lock)
+		WARN_ON_ONCE(1);
+	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
+		unsigned long flags;
+
+		spin_lock_irqsave(t->queue_lock, flags);
+		if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
+			queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
+		spin_unlock_irqrestore(t->queue_lock, flags);
+	}
+}
+EXPORT_SYMBOL(disk_stack_limits);
+
 /**
  * blk_queue_dma_pad - set pad mask
  * @q:     the request queue for the device
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ccdadb8e2044..9337e17f9110d 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -105,6 +105,21 @@ static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page
 	return queue_var_show(queue_logical_block_size(q), page);
 }
 
+static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_physical_block_size(q), page);
+}
+
+static ssize_t queue_io_min_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_io_min(q), page);
+}
+
+static ssize_t queue_io_opt_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_io_opt(q), page);
+}
+
 static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -257,6 +272,21 @@ static struct queue_sysfs_entry queue_logical_block_size_entry = {
 	.show = queue_logical_block_size_show,
 };
 
+static struct queue_sysfs_entry queue_physical_block_size_entry = {
+	.attr = {.name = "physical_block_size", .mode = S_IRUGO },
+	.show = queue_physical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_io_min_entry = {
+	.attr = {.name = "minimum_io_size", .mode = S_IRUGO },
+	.show = queue_io_min_show,
+};
+
+static struct queue_sysfs_entry queue_io_opt_entry = {
+	.attr = {.name = "optimal_io_size", .mode = S_IRUGO },
+	.show = queue_io_opt_show,
+};
+
 static struct queue_sysfs_entry queue_nonrot_entry = {
 	.attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR },
 	.show = queue_nonrot_show,
@@ -289,6 +319,9 @@ static struct attribute *default_attrs[] = {
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
 	&queue_logical_block_size_entry.attr,
+	&queue_physical_block_size_entry.attr,
+	&queue_io_min_entry.attr,
+	&queue_io_opt_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/genhd.c b/block/genhd.c
index 1a4916e01732b..fe7ccc0a618f3 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -852,11 +852,21 @@ static ssize_t disk_capability_show(struct device *dev,
 	return sprintf(buf, "%x\n", disk->flags);
 }
 
+static ssize_t disk_alignment_offset_show(struct device *dev,
+					  struct device_attribute *attr,
+					  char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+
+	return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
+}
+
 static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL);
 static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL);
 static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -875,6 +885,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_removable.attr,
 	&dev_attr_ro.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment_offset.attr,
 	&dev_attr_capability.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 99e33ef40be47..0af36085eb28b 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev,
 	return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects);
 }
 
+ssize_t part_alignment_offset_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
+}
+
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
@@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev,
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
+static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL);
 static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL);
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 static struct device_attribute dev_attr_fail =
@@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = {
 	&dev_attr_partition.attr,
 	&dev_attr_start.attr,
 	&dev_attr_size.attr,
+	&dev_attr_alignment_offset.attr,
 	&dev_attr_stat.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
@@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno,
 	pdev = part_to_dev(p);
 
 	p->start_sect = start;
+	p->alignment_offset = queue_sector_alignment_offset(disk->queue, start);
 	p->nr_sects = len;
 	p->partno = partno;
 	p->policy = get_disk_ro(disk);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b7bb6fdba12c2..5e740a135e733 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -314,11 +314,16 @@ struct queue_limits {
 	unsigned int		max_hw_sectors;
 	unsigned int		max_sectors;
 	unsigned int		max_segment_size;
+	unsigned int		physical_block_size;
+	unsigned int		alignment_offset;
+	unsigned int		io_min;
+	unsigned int		io_opt;
 
 	unsigned short		logical_block_size;
 	unsigned short		max_hw_segments;
 	unsigned short		max_phys_segments;
 
+	unsigned char		misaligned;
 	unsigned char		no_cluster;
 };
 
@@ -911,6 +916,15 @@ extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
+extern void blk_queue_alignment_offset(struct request_queue *q,
+				       unsigned int alignment);
+extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
+extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
+extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
+			    sector_t offset);
+extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
+			      sector_t offset);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
@@ -1047,6 +1061,39 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 	return queue_logical_block_size(bdev_get_queue(bdev));
 }
 
+static inline unsigned int queue_physical_block_size(struct request_queue *q)
+{
+	return q->limits.physical_block_size;
+}
+
+static inline unsigned int queue_io_min(struct request_queue *q)
+{
+	return q->limits.io_min;
+}
+
+static inline unsigned int queue_io_opt(struct request_queue *q)
+{
+	return q->limits.io_opt;
+}
+
+static inline int queue_alignment_offset(struct request_queue *q)
+{
+	if (q && q->limits.misaligned)
+		return -1;
+
+	if (q && q->limits.alignment_offset)
+		return q->limits.alignment_offset;
+
+	return 0;
+}
+
+static inline int queue_sector_alignment_offset(struct request_queue *q,
+						sector_t sector)
+{
+	return ((sector << 9) - q->limits.alignment_offset)
+		& (q->limits.io_min - 1);
+}
+
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	return q ? q->dma_alignment : 511;
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index a1a28caed23dc..149fda264c86c 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -90,6 +90,7 @@ struct disk_stats {
 struct hd_struct {
 	sector_t start_sect;
 	sector_t nr_sects;
+	sector_t alignment_offset;
 	struct device __dev;
 	struct kobject *holder_dir;
 	int policy, partno;
-- 
GitLab


From 3b77f777b8f1c001b63e317c4ce317292ff0ff94 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Sat, 23 May 2009 08:23:16 +0200
Subject: [PATCH 1263/2198] ide-disk: fix missing max_sectors accessor function
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The recent move to accessor functions for querying queue limits
missed an entry in ide-disk.c:

drivers/ide/ide-disk.c: In function ‘ide_disk_setup’:
drivers/ide/ide-disk.c:642: error: ‘struct request_queue’ has no member named ‘max_sectors’

Fix it.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/ide/ide-disk.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c
index ad18e14043c57..c6f7fcfb9d672 100644
--- a/drivers/ide/ide-disk.c
+++ b/drivers/ide/ide-disk.c
@@ -639,7 +639,7 @@ static void ide_disk_setup(ide_drive_t *drive)
 	}
 
 	printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name,
-		q->max_sectors / 2);
+	       queue_max_sectors(q) / 2);
 
 	if (ata_id_is_ssd(id))
 		queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
-- 
GitLab


From 948cd52906baf1f92aeea2f9b5c515db1b2e592a Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Fri, 22 May 2009 10:40:09 +0900
Subject: [PATCH 1264/2198] sparseirq: Allow early irq_desc allocation

Presently non-legacy IRQs have their irq_desc allocated with
kzalloc_node(). This assumes that all callers of irq_to_desc_node_alloc()
will be sufficiently late in the boot process that kmalloc is available.

While porting sparseirq support to sh this blew up immediately, as at the
time that we register the CPU's interrupt vector map only bootmem is
available. Check slab_is_available() to work out which path to use.

[ Impact: fix SH early boot crash with sparseirq enabled ]

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Yinghai Lu <yinghai@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mel Gorman <mel@csn.ul.ie>
LKML-Reference: <20090522014008.GA2806@linux-sh.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a3c671e0f165c..18041a254d324 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
@@ -81,11 +82,16 @@ static struct irq_desc irq_desc_init = {
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
 
-void init_kstat_irqs(struct irq_desc *desc, int node, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
 	void *ptr;
 
-	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
+	if (slab_is_available())
+		ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+				   GFP_ATOMIC, node);
+	else
+		ptr = alloc_bootmem_node(NODE_DATA(node),
+				nr * sizeof(*desc->kstat_irqs));
 
 	/*
 	 * don't overwite if can not get new one
@@ -186,7 +192,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
 	return NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 {
 	struct irq_desc *desc;
 	unsigned long flags;
@@ -208,7 +214,11 @@ struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 	if (desc)
 		goto out_unlock;
 
-	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	if (slab_is_available())
+		desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+	else
+		desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+
 	printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
 	if (!desc) {
 		printk(KERN_ERR "can not alloc irq_desc\n");
-- 
GitLab


From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:55 +0200
Subject: [PATCH 1265/2198] perf_counter: Fix dynamic irq_period logging

We call perf_adjust_freq() from perf_counter_task_tick() which
is is called under the rq->lock causing lock recursion.
However, it's no longer required to be called under the
rq->lock, so remove it from under it.

Also, fix up some related comments.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.476197912@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 1 +
 kernel/perf_counter.c        | 3 ++-
 kernel/sched.c               | 3 ++-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2eedae8498d3a..23ddd29730f81 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -260,6 +260,7 @@ enum perf_event_type {
 	/*
 	 * struct {
 	 * 	struct perf_event_header	header;
+	 * 	u64				time;
 	 * 	u64				irq_period;
 	 * };
 	 */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c10055416dea5..2f410ea2cb399 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2559,7 +2559,8 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 }
 
 /*
- *
+ * Log irq_period changes so that analyzing tools can re-normalize the
+ * event flow.
  */
 
 static void perf_log_period(struct perf_counter *counter, u64 period)
diff --git a/kernel/sched.c b/kernel/sched.c
index 4c0d58bce6b28..ad079f07c9c8c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4875,9 +4875,10 @@ void scheduler_tick(void)
 	update_rq_clock(rq);
 	update_cpu_load(rq);
 	curr->sched_class->task_tick(rq, curr, 0);
-	perf_counter_task_tick(curr, cpu);
 	spin_unlock(&rq->lock);
 
+	perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
 	rq->idle_at_tick = idle_cpu(cpu);
 	trigger_load_balance(rq, cpu);
-- 
GitLab


From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:56 +0200
Subject: [PATCH 1266/2198] perf_counter: Sanitize counter->mutex

s/counter->mutex/counter->child_mutex/ and make sure its only
used to protect child_list.

The usage in __perf_counter_exit_task() doesn't appear to be
problematic since ctx->mutex also covers anything related to fd
tear-down.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.533186528@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  7 +++---
 kernel/perf_counter.c        | 47 +++++++++++++++---------------------
 2 files changed, 22 insertions(+), 32 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 23ddd29730f81..4ab8050eb9e8a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -452,9 +452,6 @@ struct perf_counter {
 	struct perf_counter_context	*ctx;
 	struct file			*filp;
 
-	struct perf_counter		*parent;
-	struct list_head		child_list;
-
 	/*
 	 * These accumulate total time (in nanoseconds) that children
 	 * counters have been enabled and running, respectively.
@@ -465,7 +462,9 @@ struct perf_counter {
 	/*
 	 * Protect attach/detach and child_list:
 	 */
-	struct mutex			mutex;
+	struct mutex			child_mutex;
+	struct list_head		child_list;
+	struct perf_counter		*parent;
 
 	int				oncpu;
 	int				cpu;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 2f410ea2cb399..679c3b5bb7d4c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -111,6 +111,10 @@ static void put_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+/*
+ * Add a counter from the lists for its context.
+ * Must be called with ctx->mutex and ctx->lock held.
+ */
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -136,7 +140,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 /*
  * Remove a counter from the lists for its context.
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex and ctx->lock held.
  */
 static void
 list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
@@ -276,7 +280,7 @@ static void __perf_counter_remove_from_context(void *info)
 /*
  * Remove the counter from a task's (or a CPU's) list of counters.
  *
- * Must be called with counter->mutex and ctx->mutex held.
+ * Must be called with ctx->mutex held.
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
@@ -1407,11 +1411,7 @@ static int perf_release(struct inode *inode, struct file *file)
 	file->private_data = NULL;
 
 	mutex_lock(&ctx->mutex);
-	mutex_lock(&counter->mutex);
-
 	perf_counter_remove_from_context(counter);
-
-	mutex_unlock(&counter->mutex);
 	mutex_unlock(&ctx->mutex);
 
 	free_counter(counter);
@@ -1437,7 +1437,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
-	mutex_lock(&counter->mutex);
+	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
 	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
@@ -1446,7 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
-	mutex_unlock(&counter->mutex);
+	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
 		return -EINVAL;
@@ -1510,11 +1510,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
-	mutex_lock(&counter->mutex);
+	mutex_lock(&counter->child_mutex);
 	func(counter);
 	list_for_each_entry(child, &counter->child_list, child_list)
 		func(child);
-	mutex_unlock(&counter->mutex);
+	mutex_unlock(&counter->child_mutex);
 }
 
 static void perf_counter_for_each(struct perf_counter *counter,
@@ -1522,11 +1522,11 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
-	mutex_lock(&counter->mutex);
+	mutex_lock(&counter->child_mutex);
 	perf_counter_for_each_sibling(counter, func);
 	list_for_each_entry(child, &counter->child_list, child_list)
 		perf_counter_for_each_sibling(child, func);
-	mutex_unlock(&counter->mutex);
+	mutex_unlock(&counter->child_mutex);
 }
 
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -3106,7 +3106,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	if (!group_leader)
 		group_leader = counter;
 
-	mutex_init(&counter->mutex);
+	mutex_init(&counter->child_mutex);
+	INIT_LIST_HEAD(&counter->child_list);
+
 	INIT_LIST_HEAD(&counter->list_entry);
 	INIT_LIST_HEAD(&counter->event_entry);
 	INIT_LIST_HEAD(&counter->sibling_list);
@@ -3114,8 +3116,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
 	mutex_init(&counter->mmap_mutex);
 
-	INIT_LIST_HEAD(&counter->child_list);
-
 	counter->cpu			= cpu;
 	counter->hw_event		= *hw_event;
 	counter->group_leader		= group_leader;
@@ -3346,10 +3346,9 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link this into the parent counter's child list
 	 */
-	mutex_lock(&parent_counter->mutex);
+	mutex_lock(&parent_counter->child_mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
-
-	mutex_unlock(&parent_counter->mutex);
+	mutex_unlock(&parent_counter->child_mutex);
 
 	return child_counter;
 }
@@ -3396,9 +3395,9 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	/*
 	 * Remove this counter from the parent's list
 	 */
-	mutex_lock(&parent_counter->mutex);
+	mutex_lock(&parent_counter->child_mutex);
 	list_del_init(&child_counter->child_list);
-	mutex_unlock(&parent_counter->mutex);
+	mutex_unlock(&parent_counter->child_mutex);
 
 	/*
 	 * Release the parent counter, if this was the last
@@ -3414,17 +3413,9 @@ __perf_counter_exit_task(struct task_struct *child,
 {
 	struct perf_counter *parent_counter;
 
-	/*
-	 * Protect against concurrent operations on child_counter
-	 * due its fd getting closed, etc.
-	 */
-	mutex_lock(&child_counter->mutex);
-
 	update_counter_times(child_counter);
 	list_del_counter(child_counter, child_ctx);
 
-	mutex_unlock(&child_counter->mutex);
-
 	parent_counter = child_counter->parent;
 	/*
 	 * It can happen that parent exits first, and has counters
-- 
GitLab


From 682076ae1de0aba9c2da509f7b19dc03e30a6e1f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:57 +0200
Subject: [PATCH 1267/2198] perf_counter: Sanitize context locking

Ensure we're consistent with the context locks.

 context->mutex
   context->lock
     list_{add,del}_counter();

so that either lock is sufficient to stabilize the context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.618790733@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 679c3b5bb7d4c..d162d2f0b2708 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -597,6 +597,8 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 
 /*
  * Cross CPU call to install and enable a performance counter
+ *
+ * Must be called with ctx->mutex held
  */
 static void __perf_install_in_context(void *info)
 {
@@ -1496,13 +1498,13 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *sibling;
 
-	spin_lock_irq(&ctx->lock);
+	mutex_lock(&ctx->mutex);
 	counter = counter->group_leader;
 
 	func(counter);
 	list_for_each_entry(sibling, &counter->sibling_list, list_entry)
 		func(sibling);
-	spin_unlock_irq(&ctx->lock);
+	mutex_unlock(&ctx->mutex);
 }
 
 static void perf_counter_for_each_child(struct perf_counter *counter,
@@ -3414,7 +3416,10 @@ __perf_counter_exit_task(struct task_struct *child,
 	struct perf_counter *parent_counter;
 
 	update_counter_times(child_counter);
+
+	spin_lock_irq(&child_ctx->lock);
 	list_del_counter(child_counter, child_ctx);
+	spin_unlock_irq(&child_ctx->lock);
 
 	parent_counter = child_counter->parent;
 	/*
-- 
GitLab


From 1a482f38c5aafeb3576079a38a5b21b46619f3d2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:58 +0200
Subject: [PATCH 1268/2198] perf_counter: Fix userspace build

recent userspace (F11) seems to already include the
linux/unistd.h bits which means we cannot include the version
in the kernel sources due to the header guards being the same.

Ensure we include the kernel version first.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.739756497@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c |  3 +-
 Documentation/perf_counter/builtin-stat.c   |  5 +---
 Documentation/perf_counter/builtin-top.c    |  5 +---
 Documentation/perf_counter/perf.h           | 31 +++++++++++++--------
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index efb87595f3cb0..1b19f187d3529 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,5 +1,6 @@
 
 
+#include "perf.h"
 #include "util/util.h"
 
 #include <sys/types.h>
@@ -30,9 +31,7 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#include "../../include/linux/perf_counter.h"
 
-#include "perf.h"
 
 #define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
 #define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 03518d75d864d..8ae01d51f2914 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -61,6 +61,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
+#include "perf.h"
 #include "util/util.h"
 
 #include <getopt.h>
@@ -83,10 +84,6 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#include "../../include/linux/perf_counter.h"
-
-#include "perf.h"
-
 #define EVENT_MASK_KERNEL		1
 #define EVENT_MASK_USER			2
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 814b2e4925e30..a3216a6018c49 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,6 +42,7 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
+#include "perf.h"
 #include "util/util.h"
 
 #include <getopt.h>
@@ -64,10 +65,6 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-#include "../../include/linux/perf_counter.h"
-
-#include "perf.h"
-
 static int			system_wide			=  0;
 
 static int			nr_counters			=  0;
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 81a737444c8ff..a517683fc661a 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -1,6 +1,25 @@
 #ifndef _PERF_PERF_H
 #define _PERF_PERF_H
 
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/asm/unistd.h"
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory");
+#endif
+
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../../include/linux/perf_counter.h"
+
 /*
  * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
  * counters in the current task.
@@ -26,18 +45,6 @@ static inline unsigned long long rdclock(void)
 #define __user
 #define asmlinkage
 
-#if defined(__x86_64__) || defined(__i386__)
-#include "../../arch/x86/include/asm/unistd.h"
-#define rmb()		asm volatile("lfence" ::: "memory")
-#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#include "../../arch/powerpc/include/asm/unistd.h"
-#define rmb()		asm volatile ("sync" ::: "memory")
-#define cpu_relax()	asm volatile ("" ::: "memory");
-#endif
-
 #define unlikely(x)	__builtin_expect(!!(x), 0)
 #define min(x, y) ({				\
 	typeof(x) _min1 = (x);			\
-- 
GitLab


From aa9c67f53d1969cf1db4c9c2db3a78c4ceb96469 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:28:59 +0200
Subject: [PATCH 1269/2198] perf_counter: Simplify context cleanup

Use perf_counter_remove_from_context() to remove counters from
the context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.796275849@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d162d2f0b2708..0e97f89613336 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3416,10 +3416,7 @@ __perf_counter_exit_task(struct task_struct *child,
 	struct perf_counter *parent_counter;
 
 	update_counter_times(child_counter);
-
-	spin_lock_irq(&child_ctx->lock);
-	list_del_counter(child_counter, child_ctx);
-	spin_unlock_irq(&child_ctx->lock);
+	perf_counter_remove_from_context(child_counter);
 
 	parent_counter = child_counter->parent;
 	/*
-- 
GitLab


From 79f52b77b89e8b7aa9fbe62135eea198a2ecbd5b Mon Sep 17 00:00:00 2001
From: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Date: Sat, 23 May 2009 20:28:41 -0500
Subject: [PATCH 1270/2198] jfs: Add missing mutex_unlock call to error path

Jan Kucera found an missing call to mutex_unlock() with his static code
checker.  It's an unlikely error path to hit in the real world, but it
should be fixed.

Signed-off-by: Dave Kleikamp <shaggy@linux.vnet.ibm.com>
Reported-by: Jan Kucera <kucera.jan.cz@gmail.com>
---
 fs/jfs/jfs_imap.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 346057218edcd..0fc30407f0391 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 
 			txAbort(tid, 0);
 			txEnd(tid);
+			mutex_unlock(&JFS_IP(ipimap)->commit_mutex);
 
 			/* release the inode map lock */
 			IWRITE_UNLOCK(ipimap);
-- 
GitLab


From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:29:00 +0200
Subject: [PATCH 1271/2198] perf_counter: Change pctrl() behaviour

Instead of en/dis-abling all counters acting on a particular
task, en/dis- able all counters we created.

[ v2: fix crash on first counter enable ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163012.916937244@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h    | 10 +++++
 include/linux/perf_counter.h |  3 ++
 include/linux/sched.h        |  2 +
 kernel/perf_counter.c        | 87 ++++++++++--------------------------
 4 files changed, 39 insertions(+), 63 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index d87247d2641f5..353c0ac7723a2 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -108,6 +108,15 @@ extern struct group_info init_groups;
 
 extern struct cred init_cred;
 
+#ifdef CONFIG_PERF_COUNTERS
+# define INIT_PERF_COUNTERS(tsk)					\
+	.perf_counter_mutex = 						\
+		 __MUTEX_INITIALIZER(tsk.perf_counter_mutex),		\
+	.perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list),
+#else
+# define INIT_PERF_COUNTERS(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -171,6 +180,7 @@ extern struct cred init_cred;
 	},								\
 	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\
 	INIT_IDS							\
+	INIT_PERF_COUNTERS(tsk)						\
 	INIT_TRACE_IRQFLAGS						\
 	INIT_LOCKDEP							\
 	INIT_FTRACE_GRAPH						\
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4ab8050eb9e8a..4159ee5940f8a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -469,6 +469,9 @@ struct perf_counter {
 	int				oncpu;
 	int				cpu;
 
+	struct list_head		owner_entry;
+	struct task_struct		*owner;
+
 	/* mmap bits */
 	struct mutex			mmap_mutex;
 	atomic_t			mmap_count;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9714d450f4178..bc9326dcdde18 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1389,6 +1389,8 @@ struct task_struct {
 #endif
 #ifdef CONFIG_PERF_COUNTERS
 	struct perf_counter_context *perf_counter_ctxp;
+	struct mutex perf_counter_mutex;
+	struct list_head perf_counter_list;
 #endif
 #ifdef CONFIG_NUMA
 	struct mempolicy *mempolicy;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0e97f89613336..4c86a63697647 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1076,79 +1076,26 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
-int perf_counter_task_disable(void)
+int perf_counter_task_enable(void)
 {
-	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
-	unsigned long flags;
-
-	if (!ctx || !ctx->nr_counters)
-		return 0;
-
-	local_irq_save(flags);
 
-	__perf_counter_task_sched_out(ctx);
-
-	spin_lock(&ctx->lock);
-
-	/*
-	 * Disable all the counters:
-	 */
-	perf_disable();
-
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state != PERF_COUNTER_STATE_ERROR) {
-			update_group_times(counter);
-			counter->state = PERF_COUNTER_STATE_OFF;
-		}
-	}
-
-	perf_enable();
-
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_enable(counter);
+	mutex_unlock(&current->perf_counter_mutex);
 
 	return 0;
 }
 
-int perf_counter_task_enable(void)
+int perf_counter_task_disable(void)
 {
-	struct task_struct *curr = current;
-	struct perf_counter_context *ctx = curr->perf_counter_ctxp;
 	struct perf_counter *counter;
-	unsigned long flags;
-	int cpu;
-
-	if (!ctx || !ctx->nr_counters)
-		return 0;
-
-	local_irq_save(flags);
-	cpu = smp_processor_id();
-
-	__perf_counter_task_sched_out(ctx);
-
-	spin_lock(&ctx->lock);
 
-	/*
-	 * Disable all the counters:
-	 */
-	perf_disable();
-
-	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
-		if (counter->state > PERF_COUNTER_STATE_OFF)
-			continue;
-		counter->state = PERF_COUNTER_STATE_INACTIVE;
-		counter->tstamp_enabled =
-			ctx->time - counter->total_time_enabled;
-		counter->hw_event.disabled = 0;
-	}
-	perf_enable();
-
-	spin_unlock(&ctx->lock);
-
-	perf_counter_task_sched_in(curr, cpu);
-
-	local_irq_restore(flags);
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_disable(counter);
+	mutex_unlock(&current->perf_counter_mutex);
 
 	return 0;
 }
@@ -1416,6 +1363,11 @@ static int perf_release(struct inode *inode, struct file *file)
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&ctx->mutex);
 
+	mutex_lock(&counter->owner->perf_counter_mutex);
+	list_del_init(&counter->owner_entry);
+	mutex_unlock(&counter->owner->perf_counter_mutex);
+	put_task_struct(counter->owner);
+
 	free_counter(counter);
 	put_context(ctx);
 
@@ -3272,6 +3224,12 @@ SYSCALL_DEFINE5(perf_counter_open,
 	perf_install_in_context(ctx, counter, cpu);
 	mutex_unlock(&ctx->mutex);
 
+	counter->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_counter_mutex);
+	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+	mutex_unlock(&current->perf_counter_mutex);
+
 	fput_light(counter_file, fput_needed2);
 
 out_fput:
@@ -3488,6 +3446,9 @@ void perf_counter_init_task(struct task_struct *child)
 
 	child->perf_counter_ctxp = NULL;
 
+	mutex_init(&child->perf_counter_mutex);
+	INIT_LIST_HEAD(&child->perf_counter_list);
+
 	/*
 	 * This is executed from the parent task context, so inherit
 	 * counters that have been marked for cloning.
-- 
GitLab


From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Sat, 23 May 2009 18:29:01 +0200
Subject: [PATCH 1272/2198] perf_counter: Remove
 perf_counter_context::nr_enabled

now that pctrl() no longer disables other people's counters,
remove the PMU cache code that deals with that.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090523163013.032998331@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  1 -
 kernel/perf_counter.c        | 11 +----------
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4159ee5940f8a..2ddf5e3c55186 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -516,7 +516,6 @@ struct perf_counter_context {
 	struct list_head	event_list;
 	int			nr_counters;
 	int			nr_active;
-	int			nr_enabled;
 	int			is_active;
 	atomic_t		refcount;
 	struct task_struct	*task;
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4c86a63697647..cb4062559b47c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -134,8 +134,6 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 
 	list_add_rcu(&counter->event_entry, &ctx->event_list);
 	ctx->nr_counters++;
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		ctx->nr_enabled++;
 }
 
 /*
@@ -150,8 +148,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 	if (list_empty(&counter->list_entry))
 		return;
 	ctx->nr_counters--;
-	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
-		ctx->nr_enabled--;
 
 	list_del_init(&counter->list_entry);
 	list_del_rcu(&counter->event_entry);
@@ -406,7 +402,6 @@ static void __perf_counter_disable(void *info)
 		else
 			counter_sched_out(counter, cpuctx, ctx);
 		counter->state = PERF_COUNTER_STATE_OFF;
-		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irqrestore(&ctx->lock, flags);
@@ -448,7 +443,6 @@ static void perf_counter_disable(struct perf_counter *counter)
 	if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
 		update_counter_times(counter);
 		counter->state = PERF_COUNTER_STATE_OFF;
-		ctx->nr_enabled--;
 	}
 
 	spin_unlock_irq(&ctx->lock);
@@ -759,7 +753,6 @@ static void __perf_counter_enable(void *info)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	counter->tstamp_enabled = ctx->time - counter->total_time_enabled;
-	ctx->nr_enabled++;
 
 	/*
 	 * If the counter is in a group and isn't the group leader,
@@ -850,7 +843,6 @@ static void perf_counter_enable(struct perf_counter *counter)
 		counter->state = PERF_COUNTER_STATE_INACTIVE;
 		counter->tstamp_enabled =
 			ctx->time - counter->total_time_enabled;
-		ctx->nr_enabled++;
 	}
  out:
 	spin_unlock_irq(&ctx->lock);
@@ -910,8 +902,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
 			 struct perf_counter_context *ctx2)
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen
-		&& ctx1->nr_enabled == ctx2->nr_enabled;
+		&& ctx1->parent_gen == ctx2->parent_gen;
 }
 
 /*
-- 
GitLab


From c2990a2a582d73562d4dcf2502c39892a19a691d Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Sun, 24 May 2009 08:35:49 +0200
Subject: [PATCH 1273/2198] perf top: fix segfault

c6eb13 increased stack usage such that perf-top now croaks on startup.

Take event_array and mmap_array off the stack to prevent segfault on boxen
with smallish ulimit -s setting.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a3216a6018c49..74021ac90f51d 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -1035,10 +1035,11 @@ static void mmap_read(struct mmap_data *md)
 	md->prev = old;
 }
 
+static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
+static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
 int cmd_top(int argc, char **argv, const char *prefix)
 {
-	struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-	struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 	struct perf_counter_hw_event hw_event;
 	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
-- 
GitLab


From a3862d3f814ce7dfca9eed56ac23d29db3aee8d5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 24 May 2009 09:02:37 +0200
Subject: [PATCH 1274/2198] perf_counter: Increase mmap limit

In a default 'perf top' run the tool will create a counter for
each online CPU. With enough CPUs this will eventually exhaust
the default limit.

So scale it up with the number of online CPUs.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index cb4062559b47c..6cdf8248eda21 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1704,6 +1704,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 	user_extra = nr_pages + 1;
 	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+
+	/*
+	 * Increase the limit linearly with more CPUs:
+	 */
+	user_lock_limit *= num_online_cpus();
+
 	user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
 	extra = 0;
-- 
GitLab


From 85a9f9200226ddffc2ea50dae6a8df04c033ecd4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 09:59:50 +0200
Subject: [PATCH 1275/2198] perf_counter tools: increase limits, fix

NR_CPUS and NR_COUNTERS goes up quadratic ... 1024x4096 was far
too ambitious upper limit - go for 256x256 which is still plenty.

[ Impact: reduce perf tool memory consumption ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index a517683fc661a..5a2520bb7e55a 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -61,8 +61,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
 		       group_fd, flags);
 }
 
-#define MAX_COUNTERS			1024
-#define MAX_NR_CPUS			4096
+#define MAX_COUNTERS			256
+#define MAX_NR_CPUS			256
 
 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
 
-- 
GitLab


From d94b943054721c346b0881865d645f000cd19880 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 25 May 2009 09:57:56 +0200
Subject: [PATCH 1276/2198] perf top: Reduce display overhead

Iterate over the symbol table once per display interval, and
copy/sort/tally/decay only those symbols which are active.

Before:

 top - 10:14:53 up  4:08, 17 users,  load average: 1.17, 1.53, 1.49
 Tasks: 273 total,   5 running, 268 sleeping,   0 stopped,   0 zombie
 Cpu(s):  6.9%us, 38.2%sy,  0.0%ni, 19.9%id,  0.0%wa,  0.0%hi, 35.0%si,  0.0%st

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 28504 root      20   0  1044  260  164 S   58  0.0   0:04.19 2 netserver
 28499 root      20   0  1040  412  316 R   51  0.0   0:04.15 0 netperf
 28500 root      20   0  1040  408  316 R   50  0.0   0:04.14 1 netperf
 28503 root      20   0  1044  260  164 S   50  0.0   0:04.01 1 netserver
 28501 root      20   0  1044  260  164 S   49  0.0   0:03.99 0 netserver
 28502 root      20   0  1040  412  316 S   43  0.0   0:03.96 2 netperf
 28468 root      20   0 1892m 325m  972 S   16 10.8   0:10.50 3 perf
 28467 root      20   0 1892m 325m  972 R    2 10.8   0:00.72 3 perf

After:

 top - 10:16:30 up  4:10, 17 users,  load average: 2.27, 1.88, 1.62
 Tasks: 273 total,   6 running, 267 sleeping,   0 stopped,   0 zombie
 Cpu(s):  2.5%us, 39.7%sy,  0.0%ni, 24.6%id,  0.0%wa,  0.0%hi, 33.3%si,  0.0%st

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  P COMMAND
 28590 root      20   0  1040  412  316 S   54  0.0   0:07.85 2 netperf
 28589 root      20   0  1044  260  164 R   54  0.0   0:07.84 0 netserver
 28588 root      20   0  1040  412  316 R   50  0.0   0:07.89 1 netperf
 28591 root      20   0  1044  256  164 S   50  0.0   0:07.82 1 netserver
 28587 root      20   0  1040  408  316 R   47  0.0   0:07.61 0 netperf
 28592 root      20   0  1044  260  164 R   47  0.0   0:07.85 2 netserver
 28378 root      20   0  8732 1300  860 R    2  0.0   0:01.81 3 top
 28577 root      20   0 1892m 165m  972 R    2  5.5   0:00.48 3 perf
 28578 root      20   0 1892m 165m  972 S    2  5.5   0:00.04 3 perf

[ Impact: optimization ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 54 ++++++++++++------------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 74021ac90f51d..4bed265926dd2 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -374,18 +374,26 @@ static struct sym_entry		tmp[MAX_SYMS];
 
 static void print_sym_table(void)
 {
-	int i, printed;
+	int i, j, active_count, printed;
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
 	float sum_kevents = 0.0;
 
 	events = userspace_events = 0;
-	memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count);
-	qsort(tmp, sym_table_count, sizeof(tmp[0]), compare);
 
-	for (i = 0; i < sym_table_count && tmp[i].count[0]; i++)
-		sum_kevents += tmp[i].count[0];
+	/* Iterate over symbol table and copy/tally/decay active symbols. */
+	for (i = 0, active_count = 0; i < sym_table_count; i++) {
+		if (sym_table[i].count[0]) {
+			tmp[active_count++] = sym_table[i];
+			sum_kevents += sym_table[i].count[0];
+
+			for (j = 0; j < nr_counters; j++)
+				sym_table[i].count[j] = zero ? 0 : sym_table[i].count[j] * 7 / 8;
+		}
+	}
+
+	qsort(tmp, active_count + 1, sizeof(tmp[0]), compare);
 
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
 
@@ -433,29 +441,23 @@ static void print_sym_table(void)
 	       	       "  ______     ______   _____   ________________   _______________\n\n"
 	);
 
-	for (i = 0, printed = 0; i < sym_table_count; i++) {
+	for (i = 0, printed = 0; i < active_count; i++) {
 		float pcnt;
-		int count;
 
-		if (printed <= 18 && tmp[i].count[0] >= count_filter) {
-			pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
-
-			if (nr_counters == 1)
-				printf("%19.2f - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					pcnt, tmp[i].addr, tmp[i].sym);
-			else
-				printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-					sym_weight(tmp + i),
-					tmp[i].count[0],
-					pcnt, tmp[i].addr, tmp[i].sym);
-			printed++;
-		}
-		/*
-		 * Add decay to the counts:
-		 */
-		for (count = 0; count < nr_counters; count++)
-			sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8;
+		if (++printed > 18 || tmp[i].count[0] < count_filter)
+			break;
+
+		pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+
+		if (nr_counters == 1)
+			printf("%19.2f - %4.1f%% - %016llx : %s\n",
+				sym_weight(tmp + i),
+				pcnt, tmp[i].addr, tmp[i].sym);
+		else
+			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
+				sym_weight(tmp + i),
+				tmp[i].count[0],
+				pcnt, tmp[i].addr, tmp[i].sym);
 	}
 
 	if (sym_filter_entry)
-- 
GitLab


From e4cbb4e3ac8b09fdb11e39e5a5611bfab0a7cd1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 19 May 2009 15:50:30 +0200
Subject: [PATCH 1277/2198] perf_counter: Move child perfcounter init to after
 scheduler init

Initialize a task's perfcounters (inherit from parent, etc.) after
the child task's scheduler fields have been initialized already.

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index e72a09f5355bf..675e01e9072a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -984,7 +984,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	rt_mutex_init_task(p);
-	perf_counter_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
 	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
@@ -1096,6 +1095,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
+	perf_counter_init_task(p);
 
 	if ((retval = audit_alloc(p)))
 		goto bad_fork_cleanup_policy;
-- 
GitLab


From d3f4b3855ba87caff8f35e738c7e7e3bad0a6ab1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 14:40:01 +0200
Subject: [PATCH 1278/2198] perf stat: flip around ':k' and ':u' flags

This output:

 $ perf stat -e 0:1:k -e 0:1:u ./hello
  Performance counter stats for './hello':
          140131  instructions         (events)
         1906968  instructions         (events)

Is quite confusing - as :k means "user instructions", :u means
"kernel instructions".

Flip them around - as the 'exclude' property is not intuitive in
the flag naming.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 8ae01d51f2914..88c70be990319 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -266,9 +266,9 @@ static __u64 match_event_symbols(char *str)
 
 	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
 		case 3:
-			if (strchr(mask_str, 'u'))
-				event_mask[nr_counters] |= EVENT_MASK_USER;
 			if (strchr(mask_str, 'k'))
+				event_mask[nr_counters] |= EVENT_MASK_USER;
+			if (strchr(mask_str, 'u'))
 				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
 		case 2:
 			return EID(type, id);
-- 
GitLab


From 266dfb0b58bc4181b6158ee63a0069abaa9f3a98 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:24 +0200
Subject: [PATCH 1279/2198] perf_counter: Fix perf-$cmd invokation

Fix:

  $ perf-top
  fatal: cannot handle -top internally

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124559.995591577@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 594d270be390d..1d6d7aa575a34 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -357,7 +357,7 @@ int main(int argc, const char **argv)
 	 * die if that one cannot handle it.
 	 */
 	if (!prefixcmp(cmd, "perf-")) {
-		cmd += 4;
+		cmd += 5;
 		argv[0] = cmd;
 		handle_internal_command(argc, argv);
 		die("cannot handle %s internally", cmd);
-- 
GitLab


From e527ea312f31e88a7fa5472b71db71c565b0d44f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:25 +0200
Subject: [PATCH 1280/2198] perf_counter: Remove unused ABI bits

extra_config_len isn't used for anything, remove it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.116035832@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2ddf5e3c55186..b1f2bac09f970 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -154,11 +154,11 @@ struct perf_counter_hw_event {
 
 				__reserved_1   : 51;
 
-	__u32			extra_config_len;
 	__u32			wakeup_events;	/* wakeup every n events */
+	__u32			__reserved_2;
 
-	__u64			__reserved_2;
 	__u64			__reserved_3;
+	__u64			__reserved_4;
 };
 
 /*
-- 
GitLab


From 771d7cde144d87f2d1fbee4da3c6234d61f7e42a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:26 +0200
Subject: [PATCH 1281/2198] perf_counter: Make pctrl() affect inherited
 counters too

Paul noted that the new ptcrl() didn't work on child counters.

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.203151469@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 48 +++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 6cdf8248eda21..217dbcce2ebdb 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1067,30 +1067,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
-int perf_counter_task_enable(void)
-{
-	struct perf_counter *counter;
-
-	mutex_lock(&current->perf_counter_mutex);
-	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-		perf_counter_enable(counter);
-	mutex_unlock(&current->perf_counter_mutex);
-
-	return 0;
-}
-
-int perf_counter_task_disable(void)
-{
-	struct perf_counter *counter;
-
-	mutex_lock(&current->perf_counter_mutex);
-	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
-		perf_counter_disable(counter);
-	mutex_unlock(&current->perf_counter_mutex);
-
-	return 0;
-}
-
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_freq(struct perf_counter_context *ctx)
@@ -1505,6 +1481,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return 0;
 }
 
+int perf_counter_task_enable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_enable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
+int perf_counter_task_disable(void)
+{
+	struct perf_counter *counter;
+
+	mutex_lock(&current->perf_counter_mutex);
+	list_for_each_entry(counter, &current->perf_counter_list, owner_entry)
+		perf_counter_for_each_child(counter, perf_counter_disable);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	return 0;
+}
+
 /*
  * Callers need to ensure there can be no nesting of this function, otherwise
  * the seqlock logic goes bad. We can not serialize this because the arch
-- 
GitLab


From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:27 +0200
Subject: [PATCH 1282/2198] perf_counter: Propagate inheritance failures down
 the fork() path

Fail fork() when we fail inheritance for some reason (-ENOMEM most likely).

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.324656474@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 ++--
 kernel/fork.c                |  6 +++++-
 kernel/perf_counter.c        | 20 ++++++++++++--------
 3 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index b1f2bac09f970..d3e85de9bf1e1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -566,7 +566,7 @@ extern void perf_counter_task_sched_in(struct task_struct *task, int cpu);
 extern void perf_counter_task_sched_out(struct task_struct *task,
 					struct task_struct *next, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
-extern void perf_counter_init_task(struct task_struct *child);
+extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
@@ -631,7 +631,7 @@ perf_counter_task_sched_out(struct task_struct *task,
 			    struct task_struct *next, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline void perf_counter_init_task(struct task_struct *child)	{ }
+static inline int perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
diff --git a/kernel/fork.c b/kernel/fork.c
index 675e01e9072a4..c07c3335ceac6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1095,7 +1095,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
 	sched_fork(p, clone_flags);
-	perf_counter_init_task(p);
+
+	retval = perf_counter_init_task(p);
+	if (retval)
+		goto bad_fork_cleanup_policy;
 
 	if ((retval = audit_alloc(p)))
 		goto bad_fork_cleanup_policy;
@@ -1295,6 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_policy:
+	perf_counter_exit_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 217dbcce2ebdb..7a7a144870efa 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3434,18 +3434,23 @@ void perf_counter_exit_task(struct task_struct *child)
 /*
  * Initialize the perf_counter context in task_struct
  */
-void perf_counter_init_task(struct task_struct *child)
+int perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
 
 	mutex_init(&child->perf_counter_mutex);
 	INIT_LIST_HEAD(&child->perf_counter_list);
 
+	parent_ctx = parent->perf_counter_ctxp;
+	if (likely(!parent_ctx || !parent_ctx->nr_counters))
+		return 0;
+
 	/*
 	 * This is executed from the parent task context, so inherit
 	 * counters that have been marked for cloning.
@@ -3454,11 +3459,7 @@ void perf_counter_init_task(struct task_struct *child)
 
 	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
 	if (!child_ctx)
-		return;
-
-	parent_ctx = parent->perf_counter_ctxp;
-	if (likely(!parent_ctx || !parent_ctx->nr_counters))
-		return;
+		return -ENOMEM;
 
 	__perf_counter_init_context(child_ctx, child);
 	child->perf_counter_ctxp = child_ctx;
@@ -3482,8 +3483,9 @@ void perf_counter_init_task(struct task_struct *child)
 			continue;
 		}
 
-		if (inherit_group(counter, parent,
-				  parent_ctx, child, child_ctx)) {
+		ret = inherit_group(counter, parent, parent_ctx,
+					     child, child_ctx);
+		if (ret) {
 			inherited_all = 0;
 			break;
 		}
@@ -3505,6 +3507,8 @@ void perf_counter_init_task(struct task_struct *child)
 	}
 
 	mutex_unlock(&parent_ctx->mutex);
+
+	return ret;
 }
 
 static void __cpuinit perf_counter_init_cpu(int cpu)
-- 
GitLab


From 10989fb2451763fae6f42d85fa6106c8fd010cf5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 14:45:28 +0200
Subject: [PATCH 1283/2198] perf_counter: Fix PERF_COUNTER_CONTEXT_SWITCHES for
 cpu counters

Ingo noticed that cpu counters had 0 context switches, even though
there was plenty scheduling on the cpu.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525124600.419025548@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7a7a144870efa..14b1fe984832f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -924,14 +924,13 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	struct perf_counter_context *next_ctx;
 	struct pt_regs *regs;
 
+	regs = task_pt_regs(task);
+	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+
 	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
 
 	update_context_time(ctx);
-
-	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
-
 	next_ctx = next->perf_counter_ctxp;
 	if (next_ctx && context_equiv(ctx, next_ctx)) {
 		task->perf_counter_ctxp = next_ctx;
-- 
GitLab


From 759d427aa5a9d88a81afd11817cdeb40aea85234 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 25 May 2009 11:51:00 -0400
Subject: [PATCH 1284/2198] ext4: remove unused function
 __ext4_write_dirty_metadata

The __ext4_write_dirty_metadata() function was introduced by commit
0390131b, "ext4: Allow ext4 to run without a journal", but nothing
ever used the function, either then or since.  So let's remove it and
save a bit of space.

Cc: Frank Mayhar <fmayhar@google.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dadd3f995db5e..14c00fff37137 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4790,25 +4790,6 @@ int ext4_write_inode(struct inode *inode, int wait)
 	return ext4_force_commit(inode->i_sb);
 }
 
-int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
-{
-	int err = 0;
-
-	mark_buffer_dirty(bh);
-	if (inode && inode_needs_sync(inode)) {
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh)) {
-			ext4_error(inode->i_sb, __func__,
-				   "IO error syncing inode, "
-				   "inode=%lu, block=%llu",
-				   inode->i_ino,
-				   (unsigned long long)bh->b_blocknr);
-			err = -EIO;
-		}
-	}
-	return err;
-}
-
 /*
  * ext4_setattr()
  *
-- 
GitLab


From 88b6edd17c62b7d346d21f4087893ce7d4ef828a Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Mon, 25 May 2009 11:50:39 -0400
Subject: [PATCH 1285/2198] ext4: Clean up calls to ext4_get_group_desc()

If the caller isn't planning on modifying the block group descriptors,
there's no need to pass in a pointer to a struct buffer_head.  Nuking
this saves a tiny amount of CPU time and stack space usage.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ialloc.c | 3 +--
 fs/ext4/super.c  | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 82f7d1d7eae0e..3743bd849bce8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -347,7 +347,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh;
 	struct flex_groups *flex_group = sbi->s_flex_groups;
 	ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
 	ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
@@ -402,7 +401,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 found_flexbg:
 	for (i = best_flex * flex_size; i < ngroups &&
 		     i < (best_flex + 1) * flex_size; i++) {
-		desc = ext4_get_group_desc(sb, i, &bh);
+		desc = ext4_get_group_desc(sb, i, NULL);
 		if (ext4_free_inodes_count(sb, desc)) {
 			*best_group = i;
 			goto out;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eca6c057b1197..91b98b58ccb9e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1660,7 +1660,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_group_desc *gdp = NULL;
-	struct buffer_head *bh;
 	ext4_group_t flex_group_count;
 	ext4_group_t flex_group;
 	int groups_per_flex = 0;
@@ -1693,7 +1692,7 @@ static int ext4_fill_flex_info(struct super_block *sb)
 	}
 
 	for (i = 0; i < sbi->s_groups_count; i++) {
-		gdp = ext4_get_group_desc(sb, i, &bh);
+		gdp = ext4_get_group_desc(sb, i, NULL);
 
 		flex_group = ext4_flex_group(sbi, i);
 		atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-- 
GitLab


From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:03 +0200
Subject: [PATCH 1286/2198] perf_counter: x86: Expose INV and EDGE bits

Expose the INV and EDGE bits of the PMU to raw configs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.494709027@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 6cc1660db8d67..c14437faf5d29 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
 #define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
 #define CORE_EVNTSEL_MASK 		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
 	 CORE_EVNTSEL_UNIT_MASK  |	\
+	 CORE_EVNTSEL_EDGE_MASK  |	\
+	 CORE_EVNTSEL_INV_MASK  |	\
 	 CORE_EVNTSEL_COUNTER_MASK)
 
 	return event & CORE_EVNTSEL_MASK;
@@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event)
 {
 #define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
 #define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
+#define K7_EVNTSEL_INV_MASK	0x000800000ULL
 #define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
 
 #define K7_EVNTSEL_MASK			\
 	(K7_EVNTSEL_EVENT_MASK |	\
 	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_EDGE_MASK  |	\
+	 K7_EVNTSEL_INV_MASK   |	\
 	 K7_EVNTSEL_COUNTER_MASK)
 
 	return event & K7_EVNTSEL_MASK;
-- 
GitLab


From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:04 +0200
Subject: [PATCH 1287/2198] perf_counter: x86: Remove interrupt throttle

remove the x86 specific interrupt throttle

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.616671838@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c        |  2 --
 arch/x86/kernel/cpu/perf_counter.c | 47 ++++--------------------------
 include/linux/perf_counter.h       |  2 --
 3 files changed, 5 insertions(+), 46 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b4f64402a82a4..89b63b5fad337 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void)
 	inc_irq_stat(apic_timer_irqs);
 
 	evt->event_handler(evt);
-
-	perf_counter_unthrottle();
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c14437faf5d29..8c8177f859fe1 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 		intel_pmu_enable_counter(hwc, idx);
 }
 
-/*
- * Maximum interrupt frequency of 100KHz per CPU
- */
-#define PERFMON_MAX_INTERRUPTS (100000/HZ)
-
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -775,15 +770,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	if (status)
 		goto again;
 
-	if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS)
-		perf_enable();
+	perf_enable();
 
 	return 1;
 }
 
 static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 {
-	int cpu, idx, throttle = 0, handled = 0;
+	int cpu, idx, handled = 0;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
@@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
-	if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) {
-		throttle = 1;
-		__perf_disable();
-		cpuc->enabled = 0;
-		barrier();
-	}
-
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
-		int disable = 0;
-
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
 
@@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		hwc = &counter->hw;
 
 		if (counter->hw_event.nmi != nmi)
-			goto next;
+			continue;
 
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
-			goto next;
+			continue;
 
 		/* counter overflow */
 		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
-		disable = perf_counter_overflow(counter, nmi, regs, 0);
-
-next:
-		if (disable || throttle)
+		if (perf_counter_overflow(counter, nmi, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
 	return handled;
 }
 
-void perf_counter_unthrottle(void)
-{
-	struct cpu_hw_counters *cpuc;
-
-	if (!x86_pmu_initialized())
-		return;
-
-	cpuc = &__get_cpu_var(cpu_hw_counters);
-	if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) {
-		/*
-		 * Clear them before re-enabling irqs/NMIs again:
-		 */
-		cpuc->interrupts = 0;
-		perf_enable();
-	} else {
-		cpuc->interrupts = 0;
-	}
-}
-
 void smp_perf_counter_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d3e85de9bf1e1..0c160be2078f1 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
-extern void perf_counter_unthrottle(void);
 extern void __perf_disable(void);
 extern bool __perf_enable(void);
 extern void perf_disable(void);
@@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child)	{ }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
-static inline void perf_counter_unthrottle(void)			{ }
 static inline void perf_disable(void)					{ }
 static inline void perf_enable(void)					{ }
 static inline int perf_counter_task_disable(void)	{ return -EINVAL; }
-- 
GitLab


From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 25 May 2009 17:39:05 +0200
Subject: [PATCH 1288/2198] perf_counter: Generic per counter interrupt
 throttle

Introduce a generic per counter interrupt throttle.

This uses the perf_counter_overflow() quick disable to throttle a specific
counter when its going too fast when a pmu->unthrottle() method is provided
which can undo the quick disable.

Power needs to implement both the quick disable and the unthrottle method.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 13 +++++++
 include/linux/perf_counter.h       | 11 ++++++
 kernel/perf_counter.c              | 59 ++++++++++++++++++++++++++++--
 kernel/sysctl.c                    |  8 ++++
 4 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8c8177f859fe1..c4b543d1a86fe 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -623,6 +623,18 @@ static int x86_pmu_enable(struct perf_counter *counter)
 	return 0;
 }
 
+static void x86_pmu_unthrottle(struct perf_counter *counter)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	struct hw_perf_counter *hwc = &counter->hw;
+
+	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
+				cpuc->counters[hwc->idx] != counter))
+		return;
+
+	x86_pmu.enable(hwc, hwc->idx);
+}
+
 void perf_counter_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
@@ -1038,6 +1050,7 @@ static const struct pmu pmu = {
 	.enable		= x86_pmu_enable,
 	.disable	= x86_pmu_disable,
 	.read		= x86_pmu_read,
+	.unthrottle	= x86_pmu_unthrottle,
 };
 
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0c160be2078f1..e3a7585d3e43b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -266,6 +266,15 @@ enum perf_event_type {
 	 */
 	PERF_EVENT_PERIOD		= 4,
 
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u64				time;
+	 * };
+	 */
+	PERF_EVENT_THROTTLE		= 5,
+	PERF_EVENT_UNTHROTTLE		= 6,
+
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
@@ -367,6 +376,7 @@ struct pmu {
 	int (*enable)			(struct perf_counter *counter);
 	void (*disable)			(struct perf_counter *counter);
 	void (*read)			(struct perf_counter *counter);
+	void (*unthrottle)		(struct perf_counter *counter);
 };
 
 /**
@@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_priv;
 extern int sysctl_perf_counter_mlock;
+extern int sysctl_perf_counter_limit;
 
 extern void perf_counter_init(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 14b1fe984832f..ec9c4007a7f99 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
+int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 	__perf_counter_sched_in(ctx, cpuctx, cpu);
 }
 
+#define MAX_INTERRUPTS (~0ULL)
+
+static void perf_log_throttle(struct perf_counter *counter, int enable);
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
-	u64 irq_period;
+	u64 interrupts, irq_period;
 	u64 events, period;
 	s64 delta;
 
@@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 			continue;
 
+		interrupts = counter->hw.interrupts;
+		counter->hw.interrupts = 0;
+
+		if (interrupts == MAX_INTERRUPTS) {
+			perf_log_throttle(counter, 1);
+			counter->pmu->unthrottle(counter);
+			interrupts = 2*sysctl_perf_counter_limit/HZ;
+		}
+
 		if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
 			continue;
 
-		events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+		events = HZ * interrupts * counter->hw.irq_period;
 		period = div64_u64(events, counter->hw_event.irq_freq);
 
 		delta = (s64)(1 + period - counter->hw.irq_period);
@@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		perf_log_period(counter, irq_period);
 
 		counter->hw.irq_period = irq_period;
-		counter->hw.interrupts = 0;
 	}
 	spin_unlock(&ctx->lock);
 }
@@ -2543,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 	perf_output_end(&handle);
 }
 
+/*
+ * IRQ throttle logging
+ */
+
+static void perf_log_throttle(struct perf_counter *counter, int enable)
+{
+	struct perf_output_handle handle;
+	int ret;
+
+	struct {
+		struct perf_event_header	header;
+		u64				time;
+	} throttle_event = {
+		.header = {
+			.type = PERF_EVENT_THROTTLE + 1,
+			.misc = 0,
+			.size = sizeof(throttle_event),
+		},
+		.time = sched_clock(),
+	};
+
+	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0);
+	if (ret)
+		return;
+
+	perf_output_put(&handle, throttle_event);
+	perf_output_end(&handle);
+}
+
 /*
  * Generic counter overflow handling.
  */
@@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter,
 			  int nmi, struct pt_regs *regs, u64 addr)
 {
 	int events = atomic_read(&counter->event_limit);
+	int throttle = counter->pmu->unthrottle != NULL;
 	int ret = 0;
 
-	counter->hw.interrupts++;
+	if (!throttle) {
+		counter->hw.interrupts++;
+	} else if (counter->hw.interrupts != MAX_INTERRUPTS) {
+		counter->hw.interrupts++;
+		if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
+			counter->hw.interrupts = MAX_INTERRUPTS;
+			perf_log_throttle(counter, 0);
+			ret = 1;
+		}
+	}
 
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3cb1849f59895..0c4bf863afa32 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "perf_counter_int_limit",
+		.data		= &sysctl_perf_counter_limit,
+		.maxlen		= sizeof(sysctl_perf_counter_limit),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 #endif
 /*
  * NOTE: do not add new entries to this table unless you have read
-- 
GitLab


From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 21:41:28 +0200
Subject: [PATCH 1289/2198] Revert "perf_counter, x86: speed up the scheduling
 fast-path"

This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740.

It is causing problems (stuck/stuttering profiling) - when mixed
NMI and non-NMI counters are used.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c4b543d1a86fe..189bf9d7cdabd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 			return -EACCES;
 		hwc->nmi = 1;
 	}
-	perf_counters_lapic_init(hwc->nmi);
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -612,6 +611,8 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
+	perf_counters_lapic_init(hwc->nmi);
+
 	x86_pmu.disable(hwc, idx);
 
 	cpuc->counters[idx] = counter;
@@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(1);
+	perf_counters_lapic_init(0);
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
GitLab


From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 25 May 2009 22:03:26 +0200
Subject: [PATCH 1290/2198] perf_counter: fix warning & lockup

 - remove bogus warning
 - fix wakeup from NMI path lockup
 - also fix up whitespace noise in perf_counter.h

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090525153931.703093461@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 78 ++++++++++++++++++------------------
 kernel/perf_counter.c        |  4 +-
 2 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index e3a7585d3e43b..2b16ed37b74cc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,7 +73,7 @@ enum sw_event_ids {
 	PERF_SW_EVENTS_MAX		= 7,
 };
 
-#define __PERF_COUNTER_MASK(name) 			\
+#define __PERF_COUNTER_MASK(name)			\
 	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
 	 PERF_COUNTER_##name##_SHIFT)
 
@@ -98,14 +98,14 @@ enum sw_event_ids {
  * in the overflow packets.
  */
 enum perf_counter_record_format {
-	PERF_RECORD_IP		= 1U << 0,
-	PERF_RECORD_TID		= 1U << 1,
-	PERF_RECORD_TIME	= 1U << 2,
-	PERF_RECORD_ADDR	= 1U << 3,
-	PERF_RECORD_GROUP	= 1U << 4,
-	PERF_RECORD_CALLCHAIN	= 1U << 5,
-	PERF_RECORD_CONFIG	= 1U << 6,
-	PERF_RECORD_CPU		= 1U << 7,
+	PERF_RECORD_IP			= 1U << 0,
+	PERF_RECORD_TID			= 1U << 1,
+	PERF_RECORD_TIME		= 1U << 2,
+	PERF_RECORD_ADDR		= 1U << 3,
+	PERF_RECORD_GROUP		= 1U << 4,
+	PERF_RECORD_CALLCHAIN		= 1U << 5,
+	PERF_RECORD_CONFIG		= 1U << 6,
+	PERF_RECORD_CPU			= 1U << 7,
 };
 
 /*
@@ -235,13 +235,13 @@ enum perf_event_type {
 	 * correlate userspace IPs to code. They have the following structure:
 	 *
 	 * struct {
-	 * 	struct perf_event_header	header;
+	 *	struct perf_event_header	header;
 	 *
-	 * 	u32				pid, tid;
-	 * 	u64				addr;
-	 * 	u64				len;
-	 * 	u64				pgoff;
-	 * 	char				filename[];
+	 *	u32				pid, tid;
+	 *	u64				addr;
+	 *	u64				len;
+	 *	u64				pgoff;
+	 *	char				filename[];
 	 * };
 	 */
 	PERF_EVENT_MMAP			= 1,
@@ -249,27 +249,27 @@ enum perf_event_type {
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
+	 *	struct perf_event_header	header;
 	 *
-	 * 	u32				pid, tid;
-	 * 	char				comm[];
+	 *	u32				pid, tid;
+	 *	char				comm[];
 	 * };
 	 */
 	PERF_EVENT_COMM			= 3,
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u64				time;
-	 * 	u64				irq_period;
+	 *	struct perf_event_header	header;
+	 *	u64				time;
+	 *	u64				irq_period;
 	 * };
 	 */
 	PERF_EVENT_PERIOD		= 4,
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u64				time;
+	 *	struct perf_event_header	header;
+	 *	u64				time;
 	 * };
 	 */
 	PERF_EVENT_THROTTLE		= 5,
@@ -280,23 +280,23 @@ enum perf_event_type {
 	 * will be PERF_RECORD_*
 	 *
 	 * struct {
-	 * 	struct perf_event_header	header;
+	 *	struct perf_event_header	header;
 	 *
-	 * 	{ u64			ip;	  } && PERF_RECORD_IP
-	 * 	{ u32			pid, tid; } && PERF_RECORD_TID
-	 * 	{ u64			time;     } && PERF_RECORD_TIME
-	 * 	{ u64			addr;     } && PERF_RECORD_ADDR
-	 * 	{ u64			config;   } && PERF_RECORD_CONFIG
-	 * 	{ u32			cpu, res; } && PERF_RECORD_CPU
+	 *	{ u64			ip;	  } && PERF_RECORD_IP
+	 *	{ u32			pid, tid; } && PERF_RECORD_TID
+	 *	{ u64			time;     } && PERF_RECORD_TIME
+	 *	{ u64			addr;     } && PERF_RECORD_ADDR
+	 *	{ u64			config;   } && PERF_RECORD_CONFIG
+	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
-	 * 	{ u64			nr;
-	 * 	  { u64 event, val; } 	cnt[nr];  } && PERF_RECORD_GROUP
+	 *	{ u64			nr;
+	 *	  { u64 event, val; }	cnt[nr];  } && PERF_RECORD_GROUP
 	 *
-	 * 	{ u16			nr,
-	 * 				hv,
-	 * 				kernel,
-	 * 				user;
-	 * 	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
+	 *	{ u16			nr,
+	 *				hv,
+	 *				kernel,
+	 *				user;
+	 *	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN
 	 * };
 	 */
 };
@@ -406,7 +406,7 @@ struct perf_mmap_data {
 	atomic_t			wakeup;		/* needs a wakeup    */
 
 	struct perf_counter_mmap_page   *user_page;
-	void 				*data_pages[0];
+	void				*data_pages[0];
 };
 
 struct perf_pending_entry {
@@ -422,7 +422,7 @@ struct perf_counter {
 	struct list_head		list_entry;
 	struct list_head		event_entry;
 	struct list_head		sibling_list;
-	int 				nr_siblings;
+	int				nr_siblings;
 	struct perf_counter		*group_leader;
 	const struct pmu		*pmu;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ec9c4007a7f99..070f92d3232a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2576,7 +2576,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		.time = sched_clock(),
 	};
 
-	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0);
+	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
 	if (ret)
 		return;
 
@@ -3449,8 +3449,6 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	WARN_ON_ONCE(child != current);
-
 	child_ctx = child->perf_counter_ctxp;
 
 	if (likely(!child_ctx))
-- 
GitLab


From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 18 May 2009 19:35:34 +0800
Subject: [PATCH 1291/2198] tracing: add trace_event_read_lock()

I found that there is nothing to protect event_hash in
ftrace_find_event(). Rcu protects the event hashlist
but not the event itself while we use it after its extraction
through ftrace_find_event().

This lack of a proper locking in this spot opens a race
window between any event dereferencing and module removal.

Eg:

--Task A--

print_trace_line(trace) {
  event = find_ftrace_event(trace)

--Task B--

trace_module_remove_events(mod) {
  list_trace_events_module(ev, mod) {
    unregister_ftrace_event(ev->event) {
      hlist_del(ev->event->node)
        list_del(....)
    }
  }
}
|--> module removed, the event has been dropped

--Task A--

  event->print(trace); // Dereferencing freed memory

If the event retrieved belongs to a module and this module
is concurrently removed, we may end up dereferencing a data
from a freed module.

RCU could solve this, but it would add latency to the kernel and
forbid tracers output callbacks to call any sleepable code.
So this fix converts 'trace_event_mutex' to a read/write semaphore,
and adds trace_event_read_lock() to protect ftrace_find_event().

[ Impact: fix possible freed memory dereference in ftrace ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A114806.7090302@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.c        |  8 ++++++++
 kernel/trace/trace_output.c | 25 ++++++++++++++++++-------
 kernel/trace/trace_output.h |  2 ++
 3 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dd40d23203462..02d32baa23ac9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 		p = s_next(m, p, &l);
 	}
 
+	trace_event_read_lock();
 	return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
+	trace_event_read_unlock();
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter)
 	return 1;
 }
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
 	enum print_line_t ret;
@@ -3008,6 +3011,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	       offsetof(struct trace_iterator, seq));
 	iter->pos = -1;
 
+	trace_event_read_lock();
 	while (find_next_entry_inc(iter) != NULL) {
 		enum print_line_t ret;
 		int len = iter->seq.len;
@@ -3024,6 +3028,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		if (iter->seq.len >= cnt)
 			break;
 	}
+	trace_event_read_unlock();
 
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		goto out_err;
 	}
 
+	trace_event_read_lock();
+
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
@@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_init(&iter->seq);
 	}
 
+	trace_event_read_unlock();
 	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 489c0e8ada097..7136420603aa0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DEFINE_MUTEX(trace_event_mutex);
+static DECLARE_RWSEM(trace_event_mutex);
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -466,6 +466,7 @@ static int task_state_char(unsigned long state)
  * @type: the type of event to look for
  *
  * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
  */
 struct trace_event *ftrace_find_event(int type)
 {
@@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type)
 
 	key = type & (EVENT_HASHSIZE - 1);
 
-	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+	hlist_for_each_entry(event, n, &event_hash[key], node) {
 		if (event->type == type)
 			return event;
 	}
@@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list)
 	return last + 1;
 }
 
+void trace_event_read_lock(void)
+{
+	down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+	up_read(&trace_event_mutex);
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event)
 	unsigned key;
 	int ret = 0;
 
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 
 	if (WARN_ON(!event))
 		goto out;
@@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	key = event->type & (EVENT_HASHSIZE - 1);
 
-	hlist_add_head_rcu(&event->node, &event_hash[key]);
+	hlist_add_head(&event->node, &event_hash[key]);
 
 	ret = event->type;
  out:
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return ret;
 }
@@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-	mutex_lock(&trace_event_mutex);
+	down_write(&trace_event_mutex);
 	hlist_del(&event->node);
 	list_del(&event->list);
-	mutex_unlock(&trace_event_mutex);
+	up_write(&trace_event_mutex);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 6e220a8e5706a..ac240e76eb012 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
-- 
GitLab


From b0aae68cc5508f3c2fbf728988c954db4c8b8a53 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 21 May 2009 13:59:18 +0800
Subject: [PATCH 1292/2198] tracing/events: change the type of __str_loc_item
 to unsigned short

When defining a dynamic size string, we add __str_loc_##item to the
trace entry, and it stores the location of the actual string in
entry->_str_data[]

'unsigned short' should be sufficient to store this information, thus
we save 2 bytes per dyn-size string in the ring buffer.

[ Impact: reduce memory occupied by dyn-size strings in ring buffer ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A14EDB6.2050507@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/ftrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index edb02bc9f8fff..b5ff2e8229ece 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -25,7 +25,7 @@
 #define __field(type, item)		type	item;
 
 #undef __string
-#define __string(item, src)		int	__str_loc_##item;
+#define __string(item, src)		unsigned short	__str_loc_##item;
 
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
-- 
GitLab


From 29fcefba8a2f0fea11e2b721fe174a1832801284 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Sun, 24 May 2009 11:13:17 +0300
Subject: [PATCH 1293/2198] kmemtrace: fix kernel parameter documentation

The kmemtrace.enable kernel parameter no longer works. To enable
kmemtrace at boot-time, you must pass "ftrace=kmemtrace" instead.

[ Impact: remove obsolete kernel parameter documentation ]

Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <alpine.DEB.2.00.0905241112190.10296@rocky>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 Documentation/kernel-parameters.txt | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75e..9243dd84f4d6c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -56,7 +56,6 @@ parameter is applicable:
 	ISAPNP	ISA PnP code is enabled.
 	ISDN	Appropriate ISDN support is enabled.
 	JOY	Appropriate joystick support is enabled.
-	KMEMTRACE kmemtrace is enabled.
 	LIBATA  Libata driver is enabled
 	LP	Printer support is enabled.
 	LOOP	Loopback device support is enabled.
@@ -1054,15 +1053,6 @@ and is between 256 and 4096 characters. It is defined in the file
 			use the HighMem zone if it exists, and the Normal
 			zone if it does not.
 
-	kmemtrace.enable=	[KNL,KMEMTRACE] Format: { yes | no }
-				Controls whether kmemtrace is enabled
-				at boot-time.
-
-	kmemtrace.subbufs=n	[KNL,KMEMTRACE] Overrides the number of
-			subbufs kmemtrace's relay channel has. Set this
-			higher than default (KMEMTRACE_N_SUBBUFS in code) if
-			you experience buffer overruns.
-
 	kgdboc=		[HW] kgdb over consoles.
 			Requires a tty driver that supports console polling.
 			(only serial suported for now)
-- 
GitLab


From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 25 May 2009 18:11:59 +0800
Subject: [PATCH 1294/2198] ftrace: Add task_comm support for trace_event

If we enable a trace event alone without any tracer running (such as
function tracer, sched switch tracer, etc...) it can't output enough
task command information.

We need to use the tracing_{start/stop}_cmdline_record() helpers
which are designed to keep track of cmdlines for any tasks that
were scheduled during the tracing.

Before this patch:
 # echo 1 > debugfs/tracing/events/sched/sched_switch/enable
 # cat debugfs/tracing/trace
 # tracer: nop
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
            <...>-2289  [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120]
            <...>-2287  [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120]
            <...>-2289  [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120]
            <...>-2287  [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140]
           <idle>-0     [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115]
            <...>-5     [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140]
 ...

After this patch:
 # tracer: nop
 #
 #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
 #              | |       |          |         |
             bash-2269  [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120]
             sshd-2267  [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120]
             bash-2269  [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120]
             sshd-2267  [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140]
           <idle>-0     [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115]
         events/0-5     [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140]
 ...

Changelog:
v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in
        ENABLE_EVENT_TRACING
v2->v3: v2 can solve problem that was caused by config EVENT_TRACING
        alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is
        selected by other config, compile fail happened again.
        This version solves it.

[ Impact: fix incomplete output of event tracing ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/Kconfig        | 9 +++++++--
 kernel/trace/trace_events.c | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index f61be30157832..a508b9d2adb8c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER
        default y
 
 config EVENT_TRACING
+	select CONTEXT_SWITCH_TRACER
+	bool
+
+config CONTEXT_SWITCH_TRACER
+	select MARKERS
 	bool
 
 config TRACING
@@ -176,10 +181,10 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config CONTEXT_SWITCH_TRACER
+config ENABLE_CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
 	select TRACING
-	select MARKERS
+	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9e91c4ad7c8b3..9b246eb01d5f6 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -85,6 +85,7 @@ static void ftrace_clear_events(void)
 
 		if (call->enabled) {
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 		}
 	}
@@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	case 0:
 		if (call->enabled) {
 			call->enabled = 0;
+			tracing_stop_cmdline_record();
 			call->unregfunc();
 		}
 		break;
 	case 1:
 		if (!call->enabled) {
 			call->enabled = 1;
+			tracing_start_cmdline_record();
 			call->regfunc();
 		}
 		break;
@@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod)
 			found = true;
 			if (call->enabled) {
 				call->enabled = 0;
+				tracing_stop_cmdline_record();
 				call->unregfunc();
 			}
 			if (call->event)
@@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void)
 		}
 
 		call->enabled = 1;
+		tracing_start_cmdline_record();
 		call->regfunc();
 
 		event_test_stuff();
 
 		call->unregfunc();
+		tracing_stop_cmdline_record();
 		call->enabled = 0;
 
 		pr_cont("OK\n");
-- 
GitLab


From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 25 May 2009 18:13:59 +0800
Subject: [PATCH 1295/2198] ftrace: clean up of using
 ftrace_event_enable_disable()

Always use ftrace_event_enable_disable() to enable/disable an event
so that we can factorize out the event toggling code.

[ Impact: factorize and cleanup event tracing code ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_events.c | 44 ++++++++++++-------------------------
 1 file changed, 14 insertions(+), 30 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 9b246eb01d5f6..6c81f9c21426f 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call)
 
 #endif /* CONFIG_MODULES */
 
-static void ftrace_clear_events(void)
-{
-	struct ftrace_event_call *call;
-
-	mutex_lock(&event_mutex);
-	list_for_each_entry(call, &ftrace_events, list) {
-
-		if (call->enabled) {
-			call->enabled = 0;
-			tracing_stop_cmdline_record();
-			call->unregfunc();
-		}
-	}
-	mutex_unlock(&event_mutex);
-}
-
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 					int enable)
 {
-
 	switch (enable) {
 	case 0:
 		if (call->enabled) {
@@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 	}
 }
 
+static void ftrace_clear_events(void)
+{
+	struct ftrace_event_call *call;
+
+	mutex_lock(&event_mutex);
+	list_for_each_entry(call, &ftrace_events, list) {
+		ftrace_event_enable_disable(call, 0);
+	}
+	mutex_unlock(&event_mutex);
+}
+
 /*
  * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
  */
@@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod)
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
-			if (call->enabled) {
-				call->enabled = 0;
-				tracing_stop_cmdline_record();
-				call->unregfunc();
-			}
+			ftrace_event_enable_disable(call, 0);
 			if (call->event)
 				unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
@@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void)
 			continue;
 		}
 
-		call->enabled = 1;
-		tracing_start_cmdline_record();
-		call->regfunc();
-
+		ftrace_event_enable_disable(call, 1);
 		event_test_stuff();
-
-		call->unregfunc();
-		tracing_stop_cmdline_record();
-		call->enabled = 0;
+		ftrace_event_enable_disable(call, 0);
 
 		pr_cont("OK\n");
 	}
-- 
GitLab


From 3709ab8dfa23cab553ea9ffea3372c8e0a28f332 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 13:52:28 +0900
Subject: [PATCH 1296/2198] sh: irq: Fix up imask build warnings.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/irq/imask.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c
index 3bef3c2629ad2..6b5d191eec3a9 100644
--- a/arch/sh/kernel/cpu/irq/imask.c
+++ b/arch/sh/kernel/cpu/irq/imask.c
@@ -53,7 +53,7 @@ static inline void set_interrupt_registers(int ip)
 
 static void mask_imask_irq(unsigned int irq)
 {
-	clear_bit(irq, &imask_mask);
+	clear_bit(irq, imask_mask);
 	if (interrupt_priority < IMASK_PRIORITY - irq)
 		interrupt_priority = IMASK_PRIORITY - irq;
 	set_interrupt_registers(interrupt_priority);
@@ -61,7 +61,7 @@ static void mask_imask_irq(unsigned int irq)
 
 static void unmask_imask_irq(unsigned int irq)
 {
-	set_bit(irq, &imask_mask);
+	set_bit(irq, imask_mask);
 	interrupt_priority = IMASK_PRIORITY -
 		find_first_zero_bit(imask_mask, IMASK_PRIORITY);
 	set_interrupt_registers(interrupt_priority);
-- 
GitLab


From 8a7b8cb91f26a671f22cedc7fd54508667f2d9b9 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 26 May 2009 16:27:59 +1000
Subject: [PATCH 1297/2198] perf_counter: powerpc: Implement interrupt
 throttling

This implements interrupt throttling on powerpc.  Since we don't have
individual count enable/disable or interrupt enable/disable controls
per counter, this simply sets the hardware counter to 0, meaning that
it will not interrupt again until it has counted 2^31 counts, which
will take at least 2^30 cycles assuming a maximum of 2 counts per
cycle.  Also, we set counter->hw.period_left to the maximum possible
value (2^63 - 1), so we won't report overflows for this counter for
the forseeable future.

The unthrottle operation restores counter->hw.period_left and the
hardware counter so that we will once again report a counter overflow
after counter->hw.irq_period counts.

[ Impact: new perfcounters robustness feature on PowerPC ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
LKML-Reference: <18971.35823.643362.446774@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 48 ++++++++++++++++++++++++++----
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index fe21b2440f282..f96d55f55bd60 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -740,10 +740,37 @@ static void power_pmu_disable(struct perf_counter *counter)
 	local_irq_restore(flags);
 }
 
+/*
+ * Re-enable interrupts on a counter after they were throttled
+ * because they were coming too fast.
+ */
+static void power_pmu_unthrottle(struct perf_counter *counter)
+{
+	s64 val, left;
+	unsigned long flags;
+
+	if (!counter->hw.idx || !counter->hw.irq_period)
+		return;
+	local_irq_save(flags);
+	perf_disable();
+	power_pmu_read(counter);
+	left = counter->hw.irq_period;
+	val = 0;
+	if (left < 0x80000000L)
+		val = 0x80000000L - left;
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
+	perf_enable();
+	local_irq_restore(flags);
+}
+
 struct pmu power_pmu = {
 	.enable		= power_pmu_enable,
 	.disable	= power_pmu_disable,
 	.read		= power_pmu_read,
+	.unthrottle	= power_pmu_unthrottle,
 };
 
 /*
@@ -957,10 +984,6 @@ static void record_and_restart(struct perf_counter *counter, long val,
 		if (left < 0x80000000L)
 			val = 0x80000000L - left;
 	}
-	write_pmc(counter->hw.idx, val);
-	atomic64_set(&counter->hw.prev_count, val);
-	atomic64_set(&counter->hw.period_left, left);
-	perf_counter_update_userpage(counter);
 
 	/*
 	 * Finally record data if requested.
@@ -983,8 +1006,23 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
 				addr = mfspr(SPRN_SDAR);
 		}
-		perf_counter_overflow(counter, nmi, regs, addr);
+		if (perf_counter_overflow(counter, nmi, regs, addr)) {
+			/*
+			 * Interrupts are coming too fast - throttle them
+			 * by setting the counter to 0, so it will be
+			 * at least 2^30 cycles until the next interrupt
+			 * (assuming each counter counts at most 2 counts
+			 * per cycle).
+			 */
+			val = 0;
+			left = ~0ULL >> 1;
+		}
 	}
+
+	write_pmc(counter->hw.idx, val);
+	atomic64_set(&counter->hw.prev_count, val);
+	atomic64_set(&counter->hw.period_left, left);
+	perf_counter_update_userpage(counter);
 }
 
 /*
-- 
GitLab


From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: [PATCH 1298/2198] perf_counter, x86: Fix APIC NMI programming

My Nehalem box locks up in certain situations (with an
always-asserted NMI causing a lockup) if the PMU LVT
entry is programmed between NMI and IRQ mode with a
high frequency.

Standardize exlusively on NMIs instead.

[ Impact: fix lockup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 +++-------------
 1 file changed, 3 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 189bf9d7cdabd..ece3813c7a3c8 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
 	/*
-	 * If privileged enough, allow NMI events:
+	 * Use NMI events all the time:
 	 */
-	hwc->nmi = 0;
-	if (hw_event->nmi) {
-		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-		hwc->nmi = 1;
-	}
+	hwc->nmi	= 1;
+	hw_event->nmi	= 1;
 
 	if (!hwc->irq_period)
 		hwc->irq_period = x86_pmu.max_period;
@@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 	if (!x86_pmu.num_counters_fixed)
 		return -1;
 
-	if (unlikely(hwc->nmi))
-		return -1;
-
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
 	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
@@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		counter = cpuc->counters[idx];
 		hwc = &counter->hw;
 
-		if (counter->hw_event.nmi != nmi)
-			continue;
-
 		val = x86_perf_counter_update(counter, hwc, idx);
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
-- 
GitLab


From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: [PATCH 1299/2198] perf_counter, x86: Make NMI lockups more robust

We have a debug check that detects stuck NMIs and returns with
the PMU disabled in the global ctrl MSR - but i managed to trigger
a situation where this was not enough to deassert the NMI.

So clear/reset the full PMU and keep the disable count balanced when
exiting from here. This way the box produces a debug warning but
stays up and is more debuggable.

[ Impact: in case of PMU related bugs, recover more gracefully ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ece3813c7a3c8..2eeaa99add1c7 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter)
 		intel_pmu_enable_counter(hwc, idx);
 }
 
+static void intel_pmu_reset(void)
+{
+	unsigned long flags;
+	int idx;
+
+	if (!x86_pmu.num_counters)
+		return;
+
+	local_irq_save(flags);
+
+	printk("clearing PMU state on CPU#%d\n", smp_processor_id());
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
+		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull);
+	}
+	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
+		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
+	}
+
+	local_irq_restore(flags);
+}
+
+
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
@@ -750,6 +774,8 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	if (++loops > 100) {
 		WARN_ONCE(1, "perfcounters: irq loop stuck!\n");
 		perf_counter_print_debug();
+		intel_pmu_reset();
+		perf_enable();
 		return 1;
 	}
 
-- 
GitLab


From 329d876d6fd326109f191ae0fb2798b8834fb70b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 08:10:00 +0200
Subject: [PATCH 1300/2198] perf_counter: Initialize ->oncpu properly

This shouldnt matter normally (and i have not seen any
misbehavior), because active counters always have a
proper ->oncpu value - but nevertheless initialize the
field properly to -1.

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 070f92d3232a8..367299f91aaff 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3122,6 +3122,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->group_leader		= group_leader;
 	counter->pmu			= NULL;
 	counter->ctx			= ctx;
+	counter->oncpu			= -1;
+
 	get_ctx(ctx);
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-- 
GitLab


From 287c129716c2d4b75f3d8dd6e68732a3cd326ee6 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 26 May 2009 07:04:52 +0000
Subject: [PATCH 1301/2198] sh: Add ms7724se (SH7724) board support

This adds preliminary support for the ms7724se solution engine board.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/Kconfig                |    9 +
 arch/sh/boards/mach-se/7724/Makefile  |   10 +
 arch/sh/boards/mach-se/7724/irq.c     |  139 +++
 arch/sh/boards/mach-se/7724/setup.c   |  448 +++++++
 arch/sh/boards/mach-se/Makefile       |    1 +
 arch/sh/configs/se7724_defconfig      | 1552 +++++++++++++++++++++++++
 arch/sh/include/mach-se/mach/se7724.h |   67 ++
 7 files changed, 2226 insertions(+)
 create mode 100644 arch/sh/boards/mach-se/7724/Makefile
 create mode 100644 arch/sh/boards/mach-se/7724/irq.c
 create mode 100644 arch/sh/boards/mach-se/7724/setup.c
 create mode 100644 arch/sh/configs/se7724_defconfig
 create mode 100644 arch/sh/include/mach-se/mach/se7724.h

diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig
index 99e2d47f79f56..1c91b1f565d54 100644
--- a/arch/sh/boards/Kconfig
+++ b/arch/sh/boards/Kconfig
@@ -46,6 +46,15 @@ config SH_7722_SOLUTION_ENGINE
 	  Select 7722 SolutionEngine if configuring for a Hitachi SH772
 	  evaluation board.
 
+config SH_7724_SOLUTION_ENGINE
+	bool "SolutionEngine7724"
+	select SOLUTION_ENGINE
+	depends on CPU_SUBTYPE_SH7724
+	select ARCH_REQUIRE_GPIOLIB
+	help
+	  Select 7724 SolutionEngine if configuring for a Hitachi SH7724
+	  evaluation board.
+
 config SH_7751_SOLUTION_ENGINE
 	bool "SolutionEngine7751"
 	select SOLUTION_ENGINE
diff --git a/arch/sh/boards/mach-se/7724/Makefile b/arch/sh/boards/mach-se/7724/Makefile
new file mode 100644
index 0000000000000..349cbd6ce82d9
--- /dev/null
+++ b/arch/sh/boards/mach-se/7724/Makefile
@@ -0,0 +1,10 @@
+#
+# Makefile for the HITACHI UL SolutionEngine 7724 specific parts of the kernel
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+#
+
+obj-y	 := setup.o irq.o
\ No newline at end of file
diff --git a/arch/sh/boards/mach-se/7724/irq.c b/arch/sh/boards/mach-se/7724/irq.c
new file mode 100644
index 0000000000000..f76cf3b49f237
--- /dev/null
+++ b/arch/sh/boards/mach-se/7724/irq.c
@@ -0,0 +1,139 @@
+/*
+ * linux/arch/sh/boards/se/7724/irq.c
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Based on  linux/arch/sh/boards/se/7722/irq.c
+ * Copyright (C) 2007  Nobuhiro Iwamatsu
+ *
+ * Hitachi UL SolutionEngine 7724 Support.
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+#include <linux/init.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <asm/irq.h>
+#include <asm/io.h>
+#include <mach-se/mach/se7724.h>
+
+struct fpga_irq {
+	unsigned long  sraddr;
+	unsigned long  mraddr;
+	unsigned short mask;
+	unsigned int   base;
+};
+
+static unsigned int fpga2irq(unsigned int irq)
+{
+	if (irq >= IRQ0_BASE &&
+	    irq <= IRQ0_END)
+		return IRQ0_IRQ;
+	else if (irq >= IRQ1_BASE &&
+		 irq <= IRQ1_END)
+		return IRQ1_IRQ;
+	else
+		return IRQ2_IRQ;
+}
+
+static struct fpga_irq get_fpga_irq(unsigned int irq)
+{
+	struct fpga_irq set;
+
+	switch (irq) {
+	case IRQ0_IRQ:
+		set.sraddr = IRQ0_SR;
+		set.mraddr = IRQ0_MR;
+		set.mask   = IRQ0_MASK;
+		set.base   = IRQ0_BASE;
+		break;
+	case IRQ1_IRQ:
+		set.sraddr = IRQ1_SR;
+		set.mraddr = IRQ1_MR;
+		set.mask   = IRQ1_MASK;
+		set.base   = IRQ1_BASE;
+		break;
+	default:
+		set.sraddr = IRQ2_SR;
+		set.mraddr = IRQ2_MR;
+		set.mask   = IRQ2_MASK;
+		set.base   = IRQ2_BASE;
+		break;
+	}
+
+	return set;
+}
+
+static void disable_se7724_irq(unsigned int irq)
+{
+	struct fpga_irq set = get_fpga_irq(fpga2irq(irq));
+	unsigned int bit = irq - set.base;
+	ctrl_outw(ctrl_inw(set.mraddr) | 0x0001 << bit, set.mraddr);
+}
+
+static void enable_se7724_irq(unsigned int irq)
+{
+	struct fpga_irq set = get_fpga_irq(fpga2irq(irq));
+	unsigned int bit = irq - set.base;
+	ctrl_outw(ctrl_inw(set.mraddr) & ~(0x0001 << bit), set.mraddr);
+}
+
+static struct irq_chip se7724_irq_chip __read_mostly = {
+	.name           = "SE7724-FPGA",
+	.mask           = disable_se7724_irq,
+	.unmask         = enable_se7724_irq,
+	.mask_ack       = disable_se7724_irq,
+};
+
+static void se7724_irq_demux(unsigned int irq, struct irq_desc *desc)
+{
+	struct fpga_irq set = get_fpga_irq(irq);
+	unsigned short intv = ctrl_inw(set.sraddr);
+	struct irq_desc *ext_desc;
+	unsigned int ext_irq = set.base;
+
+	intv &= set.mask;
+
+	while (intv) {
+		if (intv & 0x0001) {
+			ext_desc = irq_desc + ext_irq;
+			handle_level_irq(ext_irq, ext_desc);
+		}
+		intv >>= 1;
+		ext_irq++;
+	}
+}
+
+/*
+ * Initialize IRQ setting
+ */
+void __init init_se7724_IRQ(void)
+{
+	int i;
+
+	ctrl_outw(0xffff, IRQ0_MR);  /* mask all */
+	ctrl_outw(0xffff, IRQ1_MR);  /* mask all */
+	ctrl_outw(0xffff, IRQ2_MR);  /* mask all */
+	ctrl_outw(0x0000, IRQ0_SR);  /* clear irq */
+	ctrl_outw(0x0000, IRQ1_SR);  /* clear irq */
+	ctrl_outw(0x0000, IRQ2_SR);  /* clear irq */
+	ctrl_outw(0x002a, IRQ_MODE); /* set irq type */
+
+	for (i = 0; i < SE7724_FPGA_IRQ_NR; i++)
+		set_irq_chip_and_handler_name(SE7724_FPGA_IRQ_BASE + i,
+					      &se7724_irq_chip,
+					      handle_level_irq, "level");
+
+	set_irq_chained_handler(IRQ0_IRQ, se7724_irq_demux);
+	set_irq_type(IRQ0_IRQ, IRQ_TYPE_LEVEL_LOW);
+
+	set_irq_chained_handler(IRQ1_IRQ, se7724_irq_demux);
+	set_irq_type(IRQ1_IRQ, IRQ_TYPE_LEVEL_LOW);
+
+	set_irq_chained_handler(IRQ2_IRQ, se7724_irq_demux);
+	set_irq_type(IRQ2_IRQ, IRQ_TYPE_LEVEL_LOW);
+}
diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c
new file mode 100644
index 0000000000000..9cd04bd558b88
--- /dev/null
+++ b/arch/sh/boards/mach-se/7724/setup.c
@@ -0,0 +1,448 @@
+/*
+ * linux/arch/sh/boards/se/7724/setup.c
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+#include <linux/mtd/physmap.h>
+#include <linux/delay.h>
+#include <linux/smc91x.h>
+#include <linux/gpio.h>
+#include <linux/input.h>
+#include <video/sh_mobile_lcdc.h>
+#include <media/sh_mobile_ceu.h>
+#include <asm/io.h>
+#include <asm/heartbeat.h>
+#include <asm/sh_keysc.h>
+#include <cpu/sh7724.h>
+#include <mach-se/mach/se7724.h>
+
+/*
+ * SWx    1234 5678
+ * ------------------------------------
+ * SW31 : 1001 1100    : default
+ * SW32 : 0111 1111    : use on board flash
+ *
+ * SW41 : abxx xxxx  -> a = 0 : Analog  monitor
+ *                          1 : Digital monitor
+ *                      b = 0 : VGA
+ *                          1 : SVGA
+ */
+
+/* Heartbeat */
+static struct heartbeat_data heartbeat_data = {
+	.regsize = 16,
+};
+
+static struct resource heartbeat_resources[] = {
+	[0] = {
+		.start  = PA_LED,
+		.end    = PA_LED,
+		.flags  = IORESOURCE_MEM,
+	},
+};
+
+static struct platform_device heartbeat_device = {
+	.name           = "heartbeat",
+	.id             = -1,
+	.dev = {
+		.platform_data = &heartbeat_data,
+	},
+	.num_resources  = ARRAY_SIZE(heartbeat_resources),
+	.resource       = heartbeat_resources,
+};
+
+/* LAN91C111 */
+static struct smc91x_platdata smc91x_info = {
+	.flags = SMC91X_USE_16BIT | SMC91X_NOWAIT,
+};
+
+static struct resource smc91x_eth_resources[] = {
+	[0] = {
+		.name   = "SMC91C111" ,
+		.start  = 0x1a300300,
+		.end    = 0x1a30030f,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = IRQ0_SMC,
+		.flags  = IORESOURCE_IRQ | IORESOURCE_IRQ_HIGHLEVEL,
+	},
+};
+
+static struct platform_device smc91x_eth_device = {
+	.name	= "smc91x",
+	.num_resources  = ARRAY_SIZE(smc91x_eth_resources),
+	.resource       = smc91x_eth_resources,
+	.dev	= {
+		.platform_data	= &smc91x_info,
+	},
+};
+
+/* MTD */
+static struct mtd_partition nor_flash_partitions[] = {
+	{
+		.name = "uboot",
+		.offset = 0,
+		.size = (1 * 1024 * 1024),
+		.mask_flags = MTD_WRITEABLE,	/* Read-only */
+	}, {
+		.name = "kernel",
+		.offset = MTDPART_OFS_APPEND,
+		.size = (2 * 1024 * 1024),
+	}, {
+		.name = "free-area",
+		.offset = MTDPART_OFS_APPEND,
+		.size = MTDPART_SIZ_FULL,
+	},
+};
+
+static struct physmap_flash_data nor_flash_data = {
+	.width		= 2,
+	.parts		= nor_flash_partitions,
+	.nr_parts	= ARRAY_SIZE(nor_flash_partitions),
+};
+
+static struct resource nor_flash_resources[] = {
+	[0] = {
+		.name	= "NOR Flash",
+		.start	= 0x00000000,
+		.end	= 0x01ffffff,
+		.flags	= IORESOURCE_MEM,
+	}
+};
+
+static struct platform_device nor_flash_device = {
+	.name		= "physmap-flash",
+	.resource	= nor_flash_resources,
+	.num_resources	= ARRAY_SIZE(nor_flash_resources),
+	.dev		= {
+		.platform_data = &nor_flash_data,
+	},
+};
+
+/* LCDC */
+static struct sh_mobile_lcdc_info lcdc_info = {
+	.clock_source = LCDC_CLK_EXTERNAL,
+	.ch[0] = {
+		.chan = LCDC_CHAN_MAINLCD,
+		.bpp = 16,
+		.clock_divider = 1,
+		.lcd_cfg = {
+			.name = "LB070WV1",
+			.sync = 0, /* hsync and vsync are active low */
+		},
+		.lcd_size_cfg = { /* 7.0 inch */
+			.width = 152,
+			.height = 91,
+		},
+		.board_cfg = {
+		},
+	}
+};
+
+static struct resource lcdc_resources[] = {
+	[0] = {
+		.name	= "LCDC",
+		.start	= 0xfe940000,
+		.end	= 0xfe941fff,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start	= 106,
+		.flags	= IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device lcdc_device = {
+	.name		= "sh_mobile_lcdc_fb",
+	.num_resources	= ARRAY_SIZE(lcdc_resources),
+	.resource	= lcdc_resources,
+	.dev		= {
+		.platform_data	= &lcdc_info,
+	},
+};
+
+/* CEU0 */
+static struct sh_mobile_ceu_info sh_mobile_ceu0_info = {
+	.flags = SH_CEU_FLAG_USE_8BIT_BUS,
+};
+
+static struct resource ceu0_resources[] = {
+	[0] = {
+		.name	= "CEU0",
+		.start	= 0xfe910000,
+		.end	= 0xfe91009f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 52,
+		.flags  = IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device ceu0_device = {
+	.name		= "sh_mobile_ceu",
+	.id             = 0, /* "ceu0" clock */
+	.num_resources	= ARRAY_SIZE(ceu0_resources),
+	.resource	= ceu0_resources,
+	.dev	= {
+		.platform_data	= &sh_mobile_ceu0_info,
+	},
+};
+
+/* CEU1 */
+static struct sh_mobile_ceu_info sh_mobile_ceu1_info = {
+	.flags = SH_CEU_FLAG_USE_8BIT_BUS,
+};
+
+static struct resource ceu1_resources[] = {
+	[0] = {
+		.name	= "CEU1",
+		.start	= 0xfe914000,
+		.end	= 0xfe91409f,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = 63,
+		.flags  = IORESOURCE_IRQ,
+	},
+	[2] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device ceu1_device = {
+	.name		= "sh_mobile_ceu",
+	.id             = 1, /* "ceu1" clock */
+	.num_resources	= ARRAY_SIZE(ceu1_resources),
+	.resource	= ceu1_resources,
+	.dev	= {
+		.platform_data	= &sh_mobile_ceu1_info,
+	},
+};
+
+/* KEYSC */
+static struct sh_keysc_info keysc_info = {
+	.mode = SH_KEYSC_MODE_1,
+	.scan_timing = 10,
+	.delay = 50,
+	.keycodes = {
+		KEY_1, KEY_2, KEY_3, KEY_4, KEY_5,
+		KEY_6, KEY_7, KEY_8, KEY_9, KEY_A,
+		KEY_B, KEY_C, KEY_D, KEY_E, KEY_F,
+		KEY_G, KEY_H, KEY_I, KEY_K, KEY_L,
+		KEY_M, KEY_N, KEY_O, KEY_P, KEY_Q,
+		KEY_R, KEY_S, KEY_T, KEY_U, KEY_V,
+	},
+};
+
+static struct resource keysc_resources[] = {
+	[0] = {
+		.start  = 0x1a204000,
+		.end    = 0x1a20400f,
+		.flags  = IORESOURCE_MEM,
+	},
+	[1] = {
+		.start  = IRQ0_KEY,
+		.flags  = IORESOURCE_IRQ,
+	},
+};
+
+static struct platform_device keysc_device = {
+	.name           = "sh_keysc",
+	.id             = 0, /* "keysc0" clock */
+	.num_resources  = ARRAY_SIZE(keysc_resources),
+	.resource       = keysc_resources,
+	.dev	= {
+		.platform_data	= &keysc_info,
+	},
+};
+
+static struct platform_device *ms7724se_devices[] __initdata = {
+	&heartbeat_device,
+	&smc91x_eth_device,
+	&lcdc_device,
+	&nor_flash_device,
+	&ceu0_device,
+	&ceu1_device,
+	&keysc_device,
+};
+
+#define SW4140    0xBA201000
+#define FPGA_OUT  0xBA200400
+#define PORT_HIZA 0xA4050158
+
+#define SW41_A    0x0100
+#define SW41_B    0x0200
+#define SW41_C    0x0400
+#define SW41_D    0x0800
+#define SW41_E    0x1000
+#define SW41_F    0x2000
+#define SW41_G    0x4000
+#define SW41_H    0x8000
+static int __init devices_setup(void)
+{
+	u16 sw = ctrl_inw(SW4140); /* select camera, monitor */
+
+	/* Reset Release */
+	ctrl_outw(ctrl_inw(FPGA_OUT) &
+		  ~((1 << 1)  | /* LAN */
+		    (1 << 6)  | /* VIDEO DAC */
+		    (1 << 12)), /* USB0 */
+		  FPGA_OUT);
+
+	/* enable IRQ 0,1,2 */
+	gpio_request(GPIO_FN_INTC_IRQ0, NULL);
+	gpio_request(GPIO_FN_INTC_IRQ1, NULL);
+	gpio_request(GPIO_FN_INTC_IRQ2, NULL);
+
+	/* enable SCIFA3 */
+	gpio_request(GPIO_FN_SCIF3_I_SCK, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_RXD, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_TXD, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_CTS, NULL);
+	gpio_request(GPIO_FN_SCIF3_I_RTS, NULL);
+
+	/* enable LCDC */
+	gpio_request(GPIO_FN_LCDD23,   NULL);
+	gpio_request(GPIO_FN_LCDD22,   NULL);
+	gpio_request(GPIO_FN_LCDD21,   NULL);
+	gpio_request(GPIO_FN_LCDD20,   NULL);
+	gpio_request(GPIO_FN_LCDD19,   NULL);
+	gpio_request(GPIO_FN_LCDD18,   NULL);
+	gpio_request(GPIO_FN_LCDD17,   NULL);
+	gpio_request(GPIO_FN_LCDD16,   NULL);
+	gpio_request(GPIO_FN_LCDD15,   NULL);
+	gpio_request(GPIO_FN_LCDD14,   NULL);
+	gpio_request(GPIO_FN_LCDD13,   NULL);
+	gpio_request(GPIO_FN_LCDD12,   NULL);
+	gpio_request(GPIO_FN_LCDD11,   NULL);
+	gpio_request(GPIO_FN_LCDD10,   NULL);
+	gpio_request(GPIO_FN_LCDD9,    NULL);
+	gpio_request(GPIO_FN_LCDD8,    NULL);
+	gpio_request(GPIO_FN_LCDD7,    NULL);
+	gpio_request(GPIO_FN_LCDD6,    NULL);
+	gpio_request(GPIO_FN_LCDD5,    NULL);
+	gpio_request(GPIO_FN_LCDD4,    NULL);
+	gpio_request(GPIO_FN_LCDD3,    NULL);
+	gpio_request(GPIO_FN_LCDD2,    NULL);
+	gpio_request(GPIO_FN_LCDD1,    NULL);
+	gpio_request(GPIO_FN_LCDD0,    NULL);
+	gpio_request(GPIO_FN_LCDDISP,  NULL);
+	gpio_request(GPIO_FN_LCDHSYN,  NULL);
+	gpio_request(GPIO_FN_LCDDCK,   NULL);
+	gpio_request(GPIO_FN_LCDVSYN,  NULL);
+	gpio_request(GPIO_FN_LCDDON,   NULL);
+	gpio_request(GPIO_FN_LCDVEPWC, NULL);
+	gpio_request(GPIO_FN_LCDVCPWC, NULL);
+	gpio_request(GPIO_FN_LCDRD,    NULL);
+	gpio_request(GPIO_FN_LCDLCLK,  NULL);
+	ctrl_outw((ctrl_inw(PORT_HIZA) & ~0x0001), PORT_HIZA);
+
+	/* enable CEU0 */
+	gpio_request(GPIO_FN_VIO0_D15, NULL);
+	gpio_request(GPIO_FN_VIO0_D14, NULL);
+	gpio_request(GPIO_FN_VIO0_D13, NULL);
+	gpio_request(GPIO_FN_VIO0_D12, NULL);
+	gpio_request(GPIO_FN_VIO0_D11, NULL);
+	gpio_request(GPIO_FN_VIO0_D10, NULL);
+	gpio_request(GPIO_FN_VIO0_D9,  NULL);
+	gpio_request(GPIO_FN_VIO0_D8,  NULL);
+	gpio_request(GPIO_FN_VIO0_D7,  NULL);
+	gpio_request(GPIO_FN_VIO0_D6,  NULL);
+	gpio_request(GPIO_FN_VIO0_D5,  NULL);
+	gpio_request(GPIO_FN_VIO0_D4,  NULL);
+	gpio_request(GPIO_FN_VIO0_D3,  NULL);
+	gpio_request(GPIO_FN_VIO0_D2,  NULL);
+	gpio_request(GPIO_FN_VIO0_D1,  NULL);
+	gpio_request(GPIO_FN_VIO0_D0,  NULL);
+	gpio_request(GPIO_FN_VIO0_VD,  NULL);
+	gpio_request(GPIO_FN_VIO0_CLK, NULL);
+	gpio_request(GPIO_FN_VIO0_FLD, NULL);
+	gpio_request(GPIO_FN_VIO0_HD,  NULL);
+	platform_resource_setup_memory(&ceu0_device, "ceu", 4 << 20);
+
+	/* enable CEU1 */
+	gpio_request(GPIO_FN_VIO1_D7,  NULL);
+	gpio_request(GPIO_FN_VIO1_D6,  NULL);
+	gpio_request(GPIO_FN_VIO1_D5,  NULL);
+	gpio_request(GPIO_FN_VIO1_D4,  NULL);
+	gpio_request(GPIO_FN_VIO1_D3,  NULL);
+	gpio_request(GPIO_FN_VIO1_D2,  NULL);
+	gpio_request(GPIO_FN_VIO1_D1,  NULL);
+	gpio_request(GPIO_FN_VIO1_D0,  NULL);
+	gpio_request(GPIO_FN_VIO1_FLD, NULL);
+	gpio_request(GPIO_FN_VIO1_HD,  NULL);
+	gpio_request(GPIO_FN_VIO1_VD,  NULL);
+	gpio_request(GPIO_FN_VIO1_CLK, NULL);
+	platform_resource_setup_memory(&ceu1_device, "ceu", 4 << 20);
+
+	/* KEYSC */
+	gpio_request(GPIO_FN_KEYOUT5_IN5, NULL);
+	gpio_request(GPIO_FN_KEYOUT4_IN6, NULL);
+	gpio_request(GPIO_FN_KEYIN4,      NULL);
+	gpio_request(GPIO_FN_KEYIN3,      NULL);
+	gpio_request(GPIO_FN_KEYIN2,      NULL);
+	gpio_request(GPIO_FN_KEYIN1,      NULL);
+	gpio_request(GPIO_FN_KEYIN0,      NULL);
+	gpio_request(GPIO_FN_KEYOUT3,     NULL);
+	gpio_request(GPIO_FN_KEYOUT2,     NULL);
+	gpio_request(GPIO_FN_KEYOUT1,     NULL);
+	gpio_request(GPIO_FN_KEYOUT0,     NULL);
+
+	if (sw & SW41_B) {
+		/* SVGA */
+		lcdc_info.ch[0].lcd_cfg.xres         = 800;
+		lcdc_info.ch[0].lcd_cfg.yres         = 600;
+		lcdc_info.ch[0].lcd_cfg.left_margin  = 142;
+		lcdc_info.ch[0].lcd_cfg.right_margin = 52;
+		lcdc_info.ch[0].lcd_cfg.hsync_len    = 96;
+		lcdc_info.ch[0].lcd_cfg.upper_margin = 24;
+		lcdc_info.ch[0].lcd_cfg.lower_margin = 2;
+		lcdc_info.ch[0].lcd_cfg.vsync_len    = 2;
+	} else {
+		/* VGA */
+		lcdc_info.ch[0].lcd_cfg.xres         = 640;
+		lcdc_info.ch[0].lcd_cfg.yres         = 480;
+		lcdc_info.ch[0].lcd_cfg.left_margin  = 105;
+		lcdc_info.ch[0].lcd_cfg.right_margin = 50;
+		lcdc_info.ch[0].lcd_cfg.hsync_len    = 96;
+		lcdc_info.ch[0].lcd_cfg.upper_margin = 33;
+		lcdc_info.ch[0].lcd_cfg.lower_margin = 10;
+		lcdc_info.ch[0].lcd_cfg.vsync_len    = 2;
+	}
+
+	if (sw & SW41_A) {
+		/* Digital monitor */
+		lcdc_info.ch[0].interface_type = RGB18;
+		lcdc_info.ch[0].flags          = 0;
+	} else {
+		/* Analog monitor */
+		lcdc_info.ch[0].interface_type = RGB24;
+		lcdc_info.ch[0].flags          = LCDC_FLAGS_DWPOL;
+	}
+
+	return platform_add_devices(ms7724se_devices,
+				ARRAY_SIZE(ms7724se_devices));
+}
+device_initcall(devices_setup);
+
+static struct sh_machine_vector mv_ms7724se __initmv = {
+	.mv_name	= "ms7724se",
+	.mv_init_irq	= init_se7724_IRQ,
+	.mv_nr_irqs	= SE7724_FPGA_IRQ_BASE + SE7724_FPGA_IRQ_NR,
+};
diff --git a/arch/sh/boards/mach-se/Makefile b/arch/sh/boards/mach-se/Makefile
index 2de42bae4b4f8..b537e238c6bcd 100644
--- a/arch/sh/boards/mach-se/Makefile
+++ b/arch/sh/boards/mach-se/Makefile
@@ -7,3 +7,4 @@ obj-$(CONFIG_SH_7751_SOLUTION_ENGINE)	+= 7751/
 obj-$(CONFIG_SH_7780_SOLUTION_ENGINE)	+= 7780/
 obj-$(CONFIG_SH_7343_SOLUTION_ENGINE)	+= 7343/
 obj-$(CONFIG_SH_7721_SOLUTION_ENGINE)	+= 7721/
+obj-$(CONFIG_SH_7724_SOLUTION_ENGINE)	+= 7724/
diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig
new file mode 100644
index 0000000000000..a029d0fb11635
--- /dev/null
+++ b/arch/sh/configs/se7724_defconfig
@@ -0,0 +1,1552 @@
+#
+# Automatically generated make config: don't edit
+# Linux kernel version: 2.6.30-rc6
+# Tue May 26 13:18:09 2009
+#
+CONFIG_SUPERH=y
+CONFIG_SUPERH32=y
+# CONFIG_SUPERH64 is not set
+CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig"
+CONFIG_RWSEM_GENERIC_SPINLOCK=y
+CONFIG_GENERIC_BUG=y
+CONFIG_GENERIC_FIND_NEXT_BIT=y
+CONFIG_GENERIC_HWEIGHT=y
+CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
+CONFIG_GENERIC_IRQ_PROBE=y
+CONFIG_GENERIC_GPIO=y
+CONFIG_GENERIC_TIME=y
+CONFIG_GENERIC_CLOCKEVENTS=y
+CONFIG_ARCH_SUSPEND_POSSIBLE=y
+CONFIG_ARCH_HIBERNATION_POSSIBLE=y
+CONFIG_SYS_SUPPORTS_CMT=y
+CONFIG_SYS_SUPPORTS_TMU=y
+CONFIG_STACKTRACE_SUPPORT=y
+CONFIG_LOCKDEP_SUPPORT=y
+CONFIG_HAVE_LATENCYTOP_SUPPORT=y
+# CONFIG_ARCH_HAS_ILOG2_U32 is not set
+# CONFIG_ARCH_HAS_ILOG2_U64 is not set
+CONFIG_ARCH_NO_VIRT_TO_BUS=y
+CONFIG_ARCH_HAS_DEFAULT_IDLE=y
+CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
+
+#
+# General setup
+#
+CONFIG_EXPERIMENTAL=y
+CONFIG_BROKEN_ON_SMP=y
+CONFIG_LOCK_KERNEL=y
+CONFIG_INIT_ENV_ARG_LIMIT=32
+CONFIG_LOCALVERSION=""
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_SWAP=y
+CONFIG_SYSVIPC=y
+CONFIG_SYSVIPC_SYSCTL=y
+# CONFIG_POSIX_MQUEUE is not set
+CONFIG_BSD_PROCESS_ACCT=y
+# CONFIG_BSD_PROCESS_ACCT_V3 is not set
+# CONFIG_TASKSTATS is not set
+# CONFIG_AUDIT is not set
+
+#
+# RCU Subsystem
+#
+CONFIG_CLASSIC_RCU=y
+# CONFIG_TREE_RCU is not set
+# CONFIG_PREEMPT_RCU is not set
+# CONFIG_TREE_RCU_TRACE is not set
+# CONFIG_PREEMPT_RCU_TRACE is not set
+# CONFIG_IKCONFIG is not set
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_GROUP_SCHED=y
+CONFIG_FAIR_GROUP_SCHED=y
+# CONFIG_RT_GROUP_SCHED is not set
+CONFIG_USER_SCHED=y
+# CONFIG_CGROUP_SCHED is not set
+# CONFIG_CGROUPS is not set
+CONFIG_SYSFS_DEPRECATED=y
+CONFIG_SYSFS_DEPRECATED_V2=y
+# CONFIG_RELAY is not set
+# CONFIG_NAMESPACES is not set
+# CONFIG_BLK_DEV_INITRD is not set
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
+CONFIG_EMBEDDED=y
+CONFIG_UID16=y
+CONFIG_SYSCTL_SYSCALL=y
+# CONFIG_KALLSYMS is not set
+# CONFIG_STRIP_ASM_SYMS is not set
+CONFIG_HOTPLUG=y
+CONFIG_PRINTK=y
+CONFIG_BUG=y
+CONFIG_ELF_CORE=y
+CONFIG_BASE_FULL=y
+CONFIG_FUTEX=y
+CONFIG_EPOLL=y
+CONFIG_SIGNALFD=y
+CONFIG_TIMERFD=y
+CONFIG_EVENTFD=y
+CONFIG_SHMEM=y
+CONFIG_AIO=y
+CONFIG_VM_EVENT_COUNTERS=y
+CONFIG_COMPAT_BRK=y
+CONFIG_SLAB=y
+# CONFIG_SLUB is not set
+# CONFIG_SLOB is not set
+# CONFIG_PROFILING is not set
+# CONFIG_MARKERS is not set
+CONFIG_HAVE_OPROFILE=y
+CONFIG_HAVE_IOREMAP_PROT=y
+CONFIG_HAVE_KPROBES=y
+CONFIG_HAVE_KRETPROBES=y
+CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_CLK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
+CONFIG_HAVE_GENERIC_DMA_COHERENT=y
+CONFIG_SLABINFO=y
+CONFIG_RT_MUTEXES=y
+CONFIG_BASE_SMALL=0
+CONFIG_MODULES=y
+# CONFIG_MODULE_FORCE_LOAD is not set
+CONFIG_MODULE_UNLOAD=y
+# CONFIG_MODULE_FORCE_UNLOAD is not set
+# CONFIG_MODVERSIONS is not set
+# CONFIG_MODULE_SRCVERSION_ALL is not set
+CONFIG_BLOCK=y
+# CONFIG_LBD is not set
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_BLK_DEV_INTEGRITY is not set
+
+#
+# IO Schedulers
+#
+CONFIG_IOSCHED_NOOP=y
+CONFIG_IOSCHED_AS=y
+CONFIG_IOSCHED_DEADLINE=y
+CONFIG_IOSCHED_CFQ=y
+# CONFIG_DEFAULT_AS is not set
+# CONFIG_DEFAULT_DEADLINE is not set
+CONFIG_DEFAULT_CFQ=y
+# CONFIG_DEFAULT_NOOP is not set
+CONFIG_DEFAULT_IOSCHED="cfq"
+# CONFIG_FREEZER is not set
+
+#
+# System type
+#
+CONFIG_CPU_SH4=y
+CONFIG_CPU_SH4A=y
+CONFIG_CPU_SHX2=y
+CONFIG_ARCH_SHMOBILE=y
+# CONFIG_CPU_SUBTYPE_SH7619 is not set
+# CONFIG_CPU_SUBTYPE_SH7201 is not set
+# CONFIG_CPU_SUBTYPE_SH7203 is not set
+# CONFIG_CPU_SUBTYPE_SH7206 is not set
+# CONFIG_CPU_SUBTYPE_SH7263 is not set
+# CONFIG_CPU_SUBTYPE_MXG is not set
+# CONFIG_CPU_SUBTYPE_SH7705 is not set
+# CONFIG_CPU_SUBTYPE_SH7706 is not set
+# CONFIG_CPU_SUBTYPE_SH7707 is not set
+# CONFIG_CPU_SUBTYPE_SH7708 is not set
+# CONFIG_CPU_SUBTYPE_SH7709 is not set
+# CONFIG_CPU_SUBTYPE_SH7710 is not set
+# CONFIG_CPU_SUBTYPE_SH7712 is not set
+# CONFIG_CPU_SUBTYPE_SH7720 is not set
+# CONFIG_CPU_SUBTYPE_SH7721 is not set
+# CONFIG_CPU_SUBTYPE_SH7750 is not set
+# CONFIG_CPU_SUBTYPE_SH7091 is not set
+# CONFIG_CPU_SUBTYPE_SH7750R is not set
+# CONFIG_CPU_SUBTYPE_SH7750S is not set
+# CONFIG_CPU_SUBTYPE_SH7751 is not set
+# CONFIG_CPU_SUBTYPE_SH7751R is not set
+# CONFIG_CPU_SUBTYPE_SH7760 is not set
+# CONFIG_CPU_SUBTYPE_SH4_202 is not set
+# CONFIG_CPU_SUBTYPE_SH7723 is not set
+CONFIG_CPU_SUBTYPE_SH7724=y
+# CONFIG_CPU_SUBTYPE_SH7763 is not set
+# CONFIG_CPU_SUBTYPE_SH7770 is not set
+# CONFIG_CPU_SUBTYPE_SH7780 is not set
+# CONFIG_CPU_SUBTYPE_SH7785 is not set
+# CONFIG_CPU_SUBTYPE_SH7786 is not set
+# CONFIG_CPU_SUBTYPE_SHX3 is not set
+# CONFIG_CPU_SUBTYPE_SH7343 is not set
+# CONFIG_CPU_SUBTYPE_SH7722 is not set
+# CONFIG_CPU_SUBTYPE_SH7366 is not set
+
+#
+# Memory management options
+#
+CONFIG_QUICKLIST=y
+CONFIG_MMU=y
+CONFIG_PAGE_OFFSET=0x80000000
+CONFIG_FORCE_MAX_ZONEORDER=11
+CONFIG_MEMORY_START=0x08000000
+CONFIG_MEMORY_SIZE=0x08000000
+CONFIG_29BIT=y
+# CONFIG_X2TLB is not set
+CONFIG_VSYSCALL=y
+CONFIG_ARCH_FLATMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_ENABLE=y
+CONFIG_ARCH_SPARSEMEM_DEFAULT=y
+CONFIG_MAX_ACTIVE_REGIONS=1
+CONFIG_ARCH_POPULATES_NODE_MAP=y
+CONFIG_ARCH_SELECT_MEMORY_MODEL=y
+CONFIG_PAGE_SIZE_4KB=y
+# CONFIG_PAGE_SIZE_8KB is not set
+# CONFIG_PAGE_SIZE_16KB is not set
+# CONFIG_PAGE_SIZE_64KB is not set
+CONFIG_SELECT_MEMORY_MODEL=y
+CONFIG_FLATMEM_MANUAL=y
+# CONFIG_DISCONTIGMEM_MANUAL is not set
+# CONFIG_SPARSEMEM_MANUAL is not set
+CONFIG_FLATMEM=y
+CONFIG_FLAT_NODE_MEM_MAP=y
+CONFIG_SPARSEMEM_STATIC=y
+CONFIG_PAGEFLAGS_EXTENDED=y
+CONFIG_SPLIT_PTLOCK_CPUS=4
+# CONFIG_PHYS_ADDR_T_64BIT is not set
+CONFIG_ZONE_DMA_FLAG=0
+CONFIG_NR_QUICK=2
+CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
+
+#
+# Cache configuration
+#
+CONFIG_CACHE_WRITEBACK=y
+# CONFIG_CACHE_WRITETHROUGH is not set
+# CONFIG_CACHE_OFF is not set
+
+#
+# Processor features
+#
+CONFIG_CPU_LITTLE_ENDIAN=y
+# CONFIG_CPU_BIG_ENDIAN is not set
+CONFIG_SH_FPU=y
+# CONFIG_SH_STORE_QUEUES is not set
+CONFIG_CPU_HAS_INTEVT=y
+CONFIG_CPU_HAS_SR_RB=y
+CONFIG_CPU_HAS_PTEA=y
+CONFIG_CPU_HAS_FPU=y
+
+#
+# Board support
+#
+CONFIG_SOLUTION_ENGINE=y
+CONFIG_SH_7724_SOLUTION_ENGINE=y
+
+#
+# Timer and clock configuration
+#
+CONFIG_SH_TIMER_TMU=y
+# CONFIG_SH_TIMER_CMT is not set
+CONFIG_SH_PCLK_FREQ=41666666
+# CONFIG_NO_HZ is not set
+# CONFIG_HIGH_RES_TIMERS is not set
+CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
+
+#
+# CPU Frequency scaling
+#
+# CONFIG_CPU_FREQ is not set
+
+#
+# DMA support
+#
+# CONFIG_SH_DMA is not set
+
+#
+# Companion Chips
+#
+
+#
+# Additional SuperH Device Drivers
+#
+CONFIG_HEARTBEAT=y
+# CONFIG_PUSH_SWITCH is not set
+
+#
+# Kernel features
+#
+# CONFIG_HZ_100 is not set
+CONFIG_HZ_250=y
+# CONFIG_HZ_300 is not set
+# CONFIG_HZ_1000 is not set
+CONFIG_HZ=250
+# CONFIG_SCHED_HRTICK is not set
+# CONFIG_KEXEC is not set
+# CONFIG_CRASH_DUMP is not set
+CONFIG_SECCOMP=y
+# CONFIG_PREEMPT_NONE is not set
+# CONFIG_PREEMPT_VOLUNTARY is not set
+CONFIG_PREEMPT=y
+CONFIG_GUSA=y
+
+#
+# Boot options
+#
+CONFIG_ZERO_PAGE_OFFSET=0x00001000
+CONFIG_BOOT_LINK_OFFSET=0x00800000
+CONFIG_ENTRY_OFFSET=0x00001000
+CONFIG_CMDLINE_BOOL=y
+CONFIG_CMDLINE="console=tty1 console=ttySC3,115200 root=/dev/nfs ip=dhcp memchunk.vpu=4m"
+
+#
+# Bus options
+#
+# CONFIG_ARCH_SUPPORTS_MSI is not set
+# CONFIG_PCCARD is not set
+
+#
+# Executable file formats
+#
+CONFIG_BINFMT_ELF=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_HAVE_AOUT is not set
+# CONFIG_BINFMT_MISC is not set
+
+#
+# Power management options (EXPERIMENTAL)
+#
+# CONFIG_PM is not set
+# CONFIG_CPU_IDLE is not set
+CONFIG_NET=y
+
+#
+# Networking options
+#
+CONFIG_PACKET=y
+# CONFIG_PACKET_MMAP is not set
+CONFIG_UNIX=y
+# CONFIG_NET_KEY is not set
+CONFIG_INET=y
+# CONFIG_IP_MULTICAST is not set
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_ASK_IP_FIB_HASH=y
+# CONFIG_IP_FIB_TRIE is not set
+CONFIG_IP_FIB_HASH=y
+# CONFIG_IP_MULTIPLE_TABLES is not set
+# CONFIG_IP_ROUTE_MULTIPATH is not set
+# CONFIG_IP_ROUTE_VERBOSE is not set
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+# CONFIG_IP_PNP_BOOTP is not set
+# CONFIG_IP_PNP_RARP is not set
+# CONFIG_NET_IPIP is not set
+# CONFIG_NET_IPGRE is not set
+# CONFIG_ARPD is not set
+# CONFIG_SYN_COOKIES is not set
+# CONFIG_INET_AH is not set
+# CONFIG_INET_ESP is not set
+# CONFIG_INET_IPCOMP is not set
+# CONFIG_INET_XFRM_TUNNEL is not set
+# CONFIG_INET_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_TRANSPORT is not set
+# CONFIG_INET_XFRM_MODE_TUNNEL is not set
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG=y
+CONFIG_INET_TCP_DIAG=y
+# CONFIG_TCP_CONG_ADVANCED is not set
+CONFIG_TCP_CONG_CUBIC=y
+CONFIG_DEFAULT_TCP_CONG="cubic"
+# CONFIG_TCP_MD5SIG is not set
+# CONFIG_IPV6 is not set
+# CONFIG_NETWORK_SECMARK is not set
+# CONFIG_NETFILTER is not set
+# CONFIG_IP_DCCP is not set
+# CONFIG_IP_SCTP is not set
+# CONFIG_TIPC is not set
+# CONFIG_ATM is not set
+# CONFIG_BRIDGE is not set
+# CONFIG_NET_DSA is not set
+# CONFIG_VLAN_8021Q is not set
+# CONFIG_DECNET is not set
+# CONFIG_LLC2 is not set
+# CONFIG_IPX is not set
+# CONFIG_ATALK is not set
+# CONFIG_X25 is not set
+# CONFIG_LAPB is not set
+# CONFIG_ECONET is not set
+# CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
+# CONFIG_NET_SCHED is not set
+# CONFIG_DCB is not set
+
+#
+# Network testing
+#
+# CONFIG_NET_PKTGEN is not set
+# CONFIG_HAMRADIO is not set
+# CONFIG_CAN is not set
+# CONFIG_IRDA is not set
+# CONFIG_BT is not set
+# CONFIG_AF_RXRPC is not set
+CONFIG_WIRELESS=y
+# CONFIG_CFG80211 is not set
+# CONFIG_WIRELESS_OLD_REGULATORY is not set
+# CONFIG_WIRELESS_EXT is not set
+# CONFIG_LIB80211 is not set
+# CONFIG_MAC80211 is not set
+# CONFIG_WIMAX is not set
+# CONFIG_RFKILL is not set
+# CONFIG_NET_9P is not set
+
+#
+# Device Drivers
+#
+
+#
+# Generic Driver Options
+#
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_STANDALONE=y
+CONFIG_PREVENT_FIRMWARE_BUILD=y
+CONFIG_FW_LOADER=y
+CONFIG_FIRMWARE_IN_KERNEL=y
+CONFIG_EXTRA_FIRMWARE=""
+# CONFIG_SYS_HYPERVISOR is not set
+# CONFIG_CONNECTOR is not set
+CONFIG_MTD=y
+# CONFIG_MTD_DEBUG is not set
+CONFIG_MTD_CONCAT=y
+CONFIG_MTD_PARTITIONS=y
+# CONFIG_MTD_TESTS is not set
+# CONFIG_MTD_REDBOOT_PARTS is not set
+CONFIG_MTD_CMDLINE_PARTS=y
+# CONFIG_MTD_AR7_PARTS is not set
+
+#
+# User Modules And Translation Layers
+#
+CONFIG_MTD_CHAR=y
+CONFIG_MTD_BLKDEVS=y
+CONFIG_MTD_BLOCK=y
+# CONFIG_FTL is not set
+# CONFIG_NFTL is not set
+# CONFIG_INFTL is not set
+# CONFIG_RFD_FTL is not set
+# CONFIG_SSFDC is not set
+# CONFIG_MTD_OOPS is not set
+
+#
+# RAM/ROM/Flash chip drivers
+#
+CONFIG_MTD_CFI=y
+# CONFIG_MTD_JEDECPROBE is not set
+CONFIG_MTD_GEN_PROBE=y
+# CONFIG_MTD_CFI_ADV_OPTIONS is not set
+CONFIG_MTD_MAP_BANK_WIDTH_1=y
+CONFIG_MTD_MAP_BANK_WIDTH_2=y
+CONFIG_MTD_MAP_BANK_WIDTH_4=y
+# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set
+# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set
+CONFIG_MTD_CFI_I1=y
+CONFIG_MTD_CFI_I2=y
+# CONFIG_MTD_CFI_I4 is not set
+# CONFIG_MTD_CFI_I8 is not set
+# CONFIG_MTD_CFI_INTELEXT is not set
+CONFIG_MTD_CFI_AMDSTD=y
+# CONFIG_MTD_CFI_STAA is not set
+CONFIG_MTD_CFI_UTIL=y
+# CONFIG_MTD_RAM is not set
+# CONFIG_MTD_ROM is not set
+# CONFIG_MTD_ABSENT is not set
+
+#
+# Mapping drivers for chip access
+#
+# CONFIG_MTD_COMPLEX_MAPPINGS is not set
+CONFIG_MTD_PHYSMAP=y
+# CONFIG_MTD_PHYSMAP_COMPAT is not set
+# CONFIG_MTD_PLATRAM is not set
+
+#
+# Self-contained MTD device drivers
+#
+# CONFIG_MTD_DATAFLASH is not set
+# CONFIG_MTD_M25P80 is not set
+# CONFIG_MTD_SLRAM is not set
+# CONFIG_MTD_PHRAM is not set
+# CONFIG_MTD_MTDRAM is not set
+# CONFIG_MTD_BLOCK2MTD is not set
+
+#
+# Disk-On-Chip Device Drivers
+#
+# CONFIG_MTD_DOC2000 is not set
+# CONFIG_MTD_DOC2001 is not set
+# CONFIG_MTD_DOC2001PLUS is not set
+CONFIG_MTD_NAND=y
+# CONFIG_MTD_NAND_VERIFY_WRITE is not set
+# CONFIG_MTD_NAND_ECC_SMC is not set
+# CONFIG_MTD_NAND_MUSEUM_IDS is not set
+CONFIG_MTD_NAND_IDS=y
+# CONFIG_MTD_NAND_DISKONCHIP is not set
+# CONFIG_MTD_NAND_NANDSIM is not set
+# CONFIG_MTD_NAND_PLATFORM is not set
+# CONFIG_MTD_ALAUDA is not set
+# CONFIG_MTD_ONENAND is not set
+
+#
+# LPDDR flash memory drivers
+#
+# CONFIG_MTD_LPDDR is not set
+
+#
+# UBI - Unsorted block images
+#
+CONFIG_MTD_UBI=y
+CONFIG_MTD_UBI_WL_THRESHOLD=4096
+CONFIG_MTD_UBI_BEB_RESERVE=1
+# CONFIG_MTD_UBI_GLUEBI is not set
+
+#
+# UBI debugging options
+#
+# CONFIG_MTD_UBI_DEBUG is not set
+# CONFIG_PARPORT is not set
+CONFIG_BLK_DEV=y
+# CONFIG_BLK_DEV_COW_COMMON is not set
+# CONFIG_BLK_DEV_LOOP is not set
+# CONFIG_BLK_DEV_NBD is not set
+# CONFIG_BLK_DEV_UB is not set
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_COUNT=4
+CONFIG_BLK_DEV_RAM_SIZE=4096
+# CONFIG_BLK_DEV_XIP is not set
+# CONFIG_CDROM_PKTCDVD is not set
+# CONFIG_ATA_OVER_ETH is not set
+# CONFIG_BLK_DEV_HD is not set
+CONFIG_MISC_DEVICES=y
+# CONFIG_ICS932S401 is not set
+# CONFIG_ENCLOSURE_SERVICES is not set
+# CONFIG_ISL29003 is not set
+# CONFIG_C2PORT is not set
+
+#
+# EEPROM support
+#
+# CONFIG_EEPROM_AT24 is not set
+# CONFIG_EEPROM_AT25 is not set
+# CONFIG_EEPROM_LEGACY is not set
+# CONFIG_EEPROM_93CX6 is not set
+CONFIG_HAVE_IDE=y
+# CONFIG_IDE is not set
+
+#
+# SCSI device support
+#
+# CONFIG_RAID_ATTRS is not set
+CONFIG_SCSI=y
+CONFIG_SCSI_DMA=y
+# CONFIG_SCSI_TGT is not set
+# CONFIG_SCSI_NETLINK is not set
+CONFIG_SCSI_PROC_FS=y
+
+#
+# SCSI support type (disk, tape, CD-ROM)
+#
+CONFIG_BLK_DEV_SD=y
+# CONFIG_CHR_DEV_ST is not set
+# CONFIG_CHR_DEV_OSST is not set
+# CONFIG_BLK_DEV_SR is not set
+# CONFIG_CHR_DEV_SG is not set
+# CONFIG_CHR_DEV_SCH is not set
+
+#
+# Some SCSI devices (e.g. CD jukebox) support multiple LUNs
+#
+# CONFIG_SCSI_MULTI_LUN is not set
+# CONFIG_SCSI_CONSTANTS is not set
+# CONFIG_SCSI_LOGGING is not set
+# CONFIG_SCSI_SCAN_ASYNC is not set
+CONFIG_SCSI_WAIT_SCAN=m
+
+#
+# SCSI Transports
+#
+# CONFIG_SCSI_SPI_ATTRS is not set
+# CONFIG_SCSI_FC_ATTRS is not set
+# CONFIG_SCSI_ISCSI_ATTRS is not set
+# CONFIG_SCSI_SAS_LIBSAS is not set
+# CONFIG_SCSI_SRP_ATTRS is not set
+CONFIG_SCSI_LOWLEVEL=y
+# CONFIG_ISCSI_TCP is not set
+# CONFIG_LIBFC is not set
+# CONFIG_LIBFCOE is not set
+# CONFIG_SCSI_DEBUG is not set
+# CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
+# CONFIG_ATA is not set
+# CONFIG_MD is not set
+CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
+# CONFIG_DUMMY is not set
+# CONFIG_BONDING is not set
+# CONFIG_MACVLAN is not set
+# CONFIG_EQUALIZER is not set
+# CONFIG_TUN is not set
+# CONFIG_VETH is not set
+CONFIG_PHYLIB=y
+
+#
+# MII PHY device drivers
+#
+# CONFIG_MARVELL_PHY is not set
+# CONFIG_DAVICOM_PHY is not set
+# CONFIG_QSEMI_PHY is not set
+# CONFIG_LXT_PHY is not set
+# CONFIG_CICADA_PHY is not set
+# CONFIG_VITESSE_PHY is not set
+CONFIG_SMSC_PHY=y
+# CONFIG_BROADCOM_PHY is not set
+# CONFIG_ICPLUS_PHY is not set
+# CONFIG_REALTEK_PHY is not set
+# CONFIG_NATIONAL_PHY is not set
+# CONFIG_STE10XP is not set
+# CONFIG_LSI_ET1011C_PHY is not set
+# CONFIG_FIXED_PHY is not set
+CONFIG_MDIO_BITBANG=y
+# CONFIG_MDIO_GPIO is not set
+CONFIG_NET_ETHERNET=y
+CONFIG_MII=y
+# CONFIG_AX88796 is not set
+# CONFIG_STNIC is not set
+CONFIG_SMC91X=y
+# CONFIG_ENC28J60 is not set
+# CONFIG_ETHOC is not set
+# CONFIG_SMC911X is not set
+# CONFIG_SMSC911X is not set
+# CONFIG_DNET is not set
+# CONFIG_IBM_NEW_EMAC_ZMII is not set
+# CONFIG_IBM_NEW_EMAC_RGMII is not set
+# CONFIG_IBM_NEW_EMAC_TAH is not set
+# CONFIG_IBM_NEW_EMAC_EMAC4 is not set
+# CONFIG_IBM_NEW_EMAC_NO_FLOW_CTRL is not set
+# CONFIG_IBM_NEW_EMAC_MAL_CLR_ICINTSTAT is not set
+# CONFIG_IBM_NEW_EMAC_MAL_COMMON_ERR is not set
+# CONFIG_B44 is not set
+# CONFIG_NETDEV_1000 is not set
+# CONFIG_NETDEV_10000 is not set
+
+#
+# Wireless LAN
+#
+# CONFIG_WLAN_PRE80211 is not set
+# CONFIG_WLAN_80211 is not set
+
+#
+# Enable WiMAX (Networking options) to see the WiMAX drivers
+#
+
+#
+# USB Network Adapters
+#
+# CONFIG_USB_CATC is not set
+# CONFIG_USB_KAWETH is not set
+# CONFIG_USB_PEGASUS is not set
+# CONFIG_USB_RTL8150 is not set
+# CONFIG_USB_USBNET is not set
+# CONFIG_WAN is not set
+# CONFIG_PPP is not set
+# CONFIG_SLIP is not set
+# CONFIG_NETCONSOLE is not set
+# CONFIG_NETPOLL is not set
+# CONFIG_NET_POLL_CONTROLLER is not set
+# CONFIG_ISDN is not set
+# CONFIG_PHONE is not set
+
+#
+# Input device support
+#
+CONFIG_INPUT=y
+# CONFIG_INPUT_FF_MEMLESS is not set
+# CONFIG_INPUT_POLLDEV is not set
+
+#
+# Userland interfaces
+#
+# CONFIG_INPUT_MOUSEDEV is not set
+# CONFIG_INPUT_JOYDEV is not set
+CONFIG_INPUT_EVDEV=y
+# CONFIG_INPUT_EVBUG is not set
+
+#
+# Input Device Drivers
+#
+CONFIG_INPUT_KEYBOARD=y
+# CONFIG_KEYBOARD_ATKBD is not set
+# CONFIG_KEYBOARD_SUNKBD is not set
+# CONFIG_KEYBOARD_LKKBD is not set
+# CONFIG_KEYBOARD_XTKBD is not set
+# CONFIG_KEYBOARD_NEWTON is not set
+# CONFIG_KEYBOARD_STOWAWAY is not set
+# CONFIG_KEYBOARD_GPIO is not set
+CONFIG_KEYBOARD_SH_KEYSC=y
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_INPUT_JOYSTICK is not set
+# CONFIG_INPUT_TABLET is not set
+# CONFIG_INPUT_TOUCHSCREEN is not set
+# CONFIG_INPUT_MISC is not set
+
+#
+# Hardware I/O ports
+#
+# CONFIG_SERIO is not set
+# CONFIG_GAMEPORT is not set
+
+#
+# Character devices
+#
+CONFIG_VT=y
+CONFIG_CONSOLE_TRANSLATIONS=y
+CONFIG_VT_CONSOLE=y
+CONFIG_HW_CONSOLE=y
+CONFIG_VT_HW_CONSOLE_BINDING=y
+CONFIG_DEVKMEM=y
+# CONFIG_SERIAL_NONSTANDARD is not set
+
+#
+# Serial drivers
+#
+# CONFIG_SERIAL_8250 is not set
+
+#
+# Non-8250 serial port support
+#
+# CONFIG_SERIAL_MAX3100 is not set
+CONFIG_SERIAL_SH_SCI=y
+CONFIG_SERIAL_SH_SCI_NR_UARTS=6
+CONFIG_SERIAL_SH_SCI_CONSOLE=y
+CONFIG_SERIAL_CORE=y
+CONFIG_SERIAL_CORE_CONSOLE=y
+CONFIG_UNIX98_PTYS=y
+# CONFIG_DEVPTS_MULTIPLE_INSTANCES is not set
+CONFIG_LEGACY_PTYS=y
+CONFIG_LEGACY_PTY_COUNT=256
+# CONFIG_IPMI_HANDLER is not set
+CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
+# CONFIG_R3964 is not set
+# CONFIG_RAW_DRIVER is not set
+# CONFIG_TCG_TPM is not set
+CONFIG_I2C=y
+CONFIG_I2C_BOARDINFO=y
+CONFIG_I2C_CHARDEV=y
+CONFIG_I2C_HELPER_AUTO=y
+
+#
+# I2C Hardware Bus support
+#
+
+#
+# I2C system bus drivers (mostly embedded / system-on-chip)
+#
+# CONFIG_I2C_GPIO is not set
+# CONFIG_I2C_OCORES is not set
+CONFIG_I2C_SH_MOBILE=y
+# CONFIG_I2C_SIMTEC is not set
+
+#
+# External I2C/SMBus adapter drivers
+#
+# CONFIG_I2C_PARPORT_LIGHT is not set
+# CONFIG_I2C_TAOS_EVM is not set
+# CONFIG_I2C_TINY_USB is not set
+
+#
+# Other I2C/SMBus bus drivers
+#
+# CONFIG_I2C_PCA_PLATFORM is not set
+# CONFIG_I2C_STUB is not set
+
+#
+# Miscellaneous I2C Chip support
+#
+# CONFIG_DS1682 is not set
+# CONFIG_SENSORS_PCF8574 is not set
+# CONFIG_PCF8575 is not set
+# CONFIG_SENSORS_PCA9539 is not set
+# CONFIG_SENSORS_MAX6875 is not set
+# CONFIG_SENSORS_TSL2550 is not set
+# CONFIG_I2C_DEBUG_CORE is not set
+# CONFIG_I2C_DEBUG_ALGO is not set
+# CONFIG_I2C_DEBUG_BUS is not set
+# CONFIG_I2C_DEBUG_CHIP is not set
+CONFIG_SPI=y
+CONFIG_SPI_MASTER=y
+
+#
+# SPI Master Controller Drivers
+#
+CONFIG_SPI_BITBANG=y
+# CONFIG_SPI_GPIO is not set
+# CONFIG_SPI_SH_SCI is not set
+
+#
+# SPI Protocol Masters
+#
+# CONFIG_SPI_SPIDEV is not set
+# CONFIG_SPI_TLE62X0 is not set
+CONFIG_ARCH_REQUIRE_GPIOLIB=y
+CONFIG_GPIOLIB=y
+# CONFIG_GPIO_SYSFS is not set
+
+#
+# Memory mapped GPIO expanders:
+#
+
+#
+# I2C GPIO expanders:
+#
+# CONFIG_GPIO_MAX732X is not set
+# CONFIG_GPIO_PCA953X is not set
+# CONFIG_GPIO_PCF857X is not set
+
+#
+# PCI GPIO expanders:
+#
+
+#
+# SPI GPIO expanders:
+#
+# CONFIG_GPIO_MAX7301 is not set
+# CONFIG_GPIO_MCP23S08 is not set
+# CONFIG_W1 is not set
+# CONFIG_POWER_SUPPLY is not set
+# CONFIG_HWMON is not set
+# CONFIG_THERMAL is not set
+# CONFIG_THERMAL_HWMON is not set
+# CONFIG_WATCHDOG is not set
+CONFIG_SSB_POSSIBLE=y
+
+#
+# Sonics Silicon Backplane
+#
+# CONFIG_SSB is not set
+
+#
+# Multifunction device drivers
+#
+# CONFIG_MFD_CORE is not set
+# CONFIG_MFD_SM501 is not set
+# CONFIG_HTC_PASIC3 is not set
+# CONFIG_TPS65010 is not set
+# CONFIG_TWL4030_CORE is not set
+# CONFIG_MFD_TMIO is not set
+# CONFIG_PMIC_DA903X is not set
+# CONFIG_MFD_WM8400 is not set
+# CONFIG_MFD_WM8350_I2C is not set
+# CONFIG_MFD_PCF50633 is not set
+# CONFIG_REGULATOR is not set
+
+#
+# Multimedia devices
+#
+
+#
+# Multimedia core support
+#
+CONFIG_VIDEO_DEV=y
+CONFIG_VIDEO_V4L2_COMMON=y
+# CONFIG_VIDEO_ALLOW_V4L1 is not set
+CONFIG_VIDEO_V4L1_COMPAT=y
+# CONFIG_DVB_CORE is not set
+CONFIG_VIDEO_MEDIA=y
+
+#
+# Multimedia drivers
+#
+# CONFIG_MEDIA_ATTACH is not set
+CONFIG_MEDIA_TUNER=y
+# CONFIG_MEDIA_TUNER_CUSTOMISE is not set
+CONFIG_MEDIA_TUNER_SIMPLE=y
+CONFIG_MEDIA_TUNER_TDA8290=y
+CONFIG_MEDIA_TUNER_TDA9887=y
+CONFIG_MEDIA_TUNER_TEA5761=y
+CONFIG_MEDIA_TUNER_TEA5767=y
+CONFIG_MEDIA_TUNER_MT20XX=y
+CONFIG_MEDIA_TUNER_XC2028=y
+CONFIG_MEDIA_TUNER_XC5000=y
+CONFIG_MEDIA_TUNER_MC44S803=y
+CONFIG_VIDEO_V4L2=y
+CONFIG_VIDEOBUF_GEN=y
+CONFIG_VIDEOBUF_DMA_CONTIG=y
+CONFIG_VIDEO_CAPTURE_DRIVERS=y
+# CONFIG_VIDEO_ADV_DEBUG is not set
+# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set
+CONFIG_VIDEO_HELPER_CHIPS_AUTO=y
+# CONFIG_VIDEO_VIVI is not set
+# CONFIG_VIDEO_SAA5246A is not set
+# CONFIG_VIDEO_SAA5249 is not set
+CONFIG_SOC_CAMERA=y
+# CONFIG_SOC_CAMERA_MT9M001 is not set
+# CONFIG_SOC_CAMERA_MT9M111 is not set
+# CONFIG_SOC_CAMERA_MT9T031 is not set
+# CONFIG_SOC_CAMERA_MT9V022 is not set
+# CONFIG_SOC_CAMERA_TW9910 is not set
+# CONFIG_SOC_CAMERA_PLATFORM is not set
+CONFIG_SOC_CAMERA_OV772X=y
+CONFIG_VIDEO_SH_MOBILE_CEU=y
+CONFIG_V4L_USB_DRIVERS=y
+# CONFIG_USB_VIDEO_CLASS is not set
+CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y
+CONFIG_USB_GSPCA=m
+# CONFIG_USB_M5602 is not set
+# CONFIG_USB_STV06XX is not set
+# CONFIG_USB_GSPCA_CONEX is not set
+# CONFIG_USB_GSPCA_ETOMS is not set
+# CONFIG_USB_GSPCA_FINEPIX is not set
+# CONFIG_USB_GSPCA_MARS is not set
+# CONFIG_USB_GSPCA_MR97310A is not set
+# CONFIG_USB_GSPCA_OV519 is not set
+# CONFIG_USB_GSPCA_OV534 is not set
+# CONFIG_USB_GSPCA_PAC207 is not set
+# CONFIG_USB_GSPCA_PAC7311 is not set
+# CONFIG_USB_GSPCA_SONIXB is not set
+# CONFIG_USB_GSPCA_SONIXJ is not set
+# CONFIG_USB_GSPCA_SPCA500 is not set
+# CONFIG_USB_GSPCA_SPCA501 is not set
+# CONFIG_USB_GSPCA_SPCA505 is not set
+# CONFIG_USB_GSPCA_SPCA506 is not set
+# CONFIG_USB_GSPCA_SPCA508 is not set
+# CONFIG_USB_GSPCA_SPCA561 is not set
+# CONFIG_USB_GSPCA_SQ905 is not set
+# CONFIG_USB_GSPCA_SQ905C is not set
+# CONFIG_USB_GSPCA_STK014 is not set
+# CONFIG_USB_GSPCA_SUNPLUS is not set
+# CONFIG_USB_GSPCA_T613 is not set
+# CONFIG_USB_GSPCA_TV8532 is not set
+# CONFIG_USB_GSPCA_VC032X is not set
+# CONFIG_USB_GSPCA_ZC3XX is not set
+# CONFIG_VIDEO_PVRUSB2 is not set
+# CONFIG_VIDEO_HDPVR is not set
+# CONFIG_VIDEO_EM28XX is not set
+# CONFIG_VIDEO_CX231XX is not set
+# CONFIG_VIDEO_USBVISION is not set
+# CONFIG_USB_ET61X251 is not set
+# CONFIG_USB_SN9C102 is not set
+# CONFIG_USB_ZC0301 is not set
+CONFIG_USB_PWC_INPUT_EVDEV=y
+# CONFIG_USB_ZR364XX is not set
+# CONFIG_USB_STKWEBCAM is not set
+# CONFIG_USB_S2255 is not set
+# CONFIG_RADIO_ADAPTERS is not set
+# CONFIG_DAB is not set
+
+#
+# Graphics support
+#
+# CONFIG_VGASTATE is not set
+# CONFIG_VIDEO_OUTPUT_CONTROL is not set
+CONFIG_FB=y
+# CONFIG_FIRMWARE_EDID is not set
+# CONFIG_FB_DDC is not set
+# CONFIG_FB_BOOT_VESA_SUPPORT is not set
+# CONFIG_FB_CFB_FILLRECT is not set
+# CONFIG_FB_CFB_COPYAREA is not set
+# CONFIG_FB_CFB_IMAGEBLIT is not set
+# CONFIG_FB_CFB_REV_PIXELS_IN_BYTE is not set
+CONFIG_FB_SYS_FILLRECT=y
+CONFIG_FB_SYS_COPYAREA=y
+CONFIG_FB_SYS_IMAGEBLIT=y
+# CONFIG_FB_FOREIGN_ENDIAN is not set
+CONFIG_FB_SYS_FOPS=y
+CONFIG_FB_DEFERRED_IO=y
+# CONFIG_FB_SVGALIB is not set
+# CONFIG_FB_MACMODES is not set
+# CONFIG_FB_BACKLIGHT is not set
+# CONFIG_FB_MODE_HELPERS is not set
+# CONFIG_FB_TILEBLITTING is not set
+
+#
+# Frame buffer hardware drivers
+#
+# CONFIG_FB_S1D13XXX is not set
+CONFIG_FB_SH_MOBILE_LCDC=y
+# CONFIG_FB_VIRTUAL is not set
+# CONFIG_FB_METRONOME is not set
+# CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
+# CONFIG_BACKLIGHT_LCD_SUPPORT is not set
+
+#
+# Display device support
+#
+# CONFIG_DISPLAY_SUPPORT is not set
+
+#
+# Console display driver support
+#
+CONFIG_DUMMY_CONSOLE=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+# CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY is not set
+# CONFIG_FRAMEBUFFER_CONSOLE_ROTATION is not set
+# CONFIG_FONTS is not set
+CONFIG_FONT_8x8=y
+CONFIG_FONT_8x16=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+# CONFIG_LOGO_LINUX_CLUT224 is not set
+# CONFIG_LOGO_SUPERH_MONO is not set
+# CONFIG_LOGO_SUPERH_VGA16 is not set
+CONFIG_LOGO_SUPERH_CLUT224=y
+# CONFIG_SOUND is not set
+CONFIG_HID_SUPPORT=y
+CONFIG_HID=y
+# CONFIG_HID_DEBUG is not set
+# CONFIG_HIDRAW is not set
+
+#
+# USB Input Devices
+#
+CONFIG_USB_HID=y
+# CONFIG_HID_PID is not set
+# CONFIG_USB_HIDDEV is not set
+
+#
+# Special HID drivers
+#
+# CONFIG_HID_A4TECH is not set
+# CONFIG_HID_APPLE is not set
+# CONFIG_HID_BELKIN is not set
+# CONFIG_HID_CHERRY is not set
+# CONFIG_HID_CHICONY is not set
+# CONFIG_HID_CYPRESS is not set
+# CONFIG_DRAGONRISE_FF is not set
+# CONFIG_HID_EZKEY is not set
+# CONFIG_HID_KYE is not set
+# CONFIG_HID_GYRATION is not set
+# CONFIG_HID_KENSINGTON is not set
+# CONFIG_HID_LOGITECH is not set
+# CONFIG_HID_MICROSOFT is not set
+# CONFIG_HID_MONTEREY is not set
+# CONFIG_HID_NTRIG is not set
+# CONFIG_HID_PANTHERLORD is not set
+# CONFIG_HID_PETALYNX is not set
+# CONFIG_HID_SAMSUNG is not set
+# CONFIG_HID_SONY is not set
+# CONFIG_HID_SUNPLUS is not set
+# CONFIG_GREENASIA_FF is not set
+# CONFIG_HID_TOPSEED is not set
+# CONFIG_THRUSTMASTER_FF is not set
+# CONFIG_ZEROPLUS_FF is not set
+CONFIG_USB_SUPPORT=y
+CONFIG_USB_ARCH_HAS_HCD=y
+# CONFIG_USB_ARCH_HAS_OHCI is not set
+# CONFIG_USB_ARCH_HAS_EHCI is not set
+CONFIG_USB=y
+# CONFIG_USB_DEBUG is not set
+# CONFIG_USB_ANNOUNCE_NEW_DEVICES is not set
+
+#
+# Miscellaneous USB options
+#
+# CONFIG_USB_DEVICEFS is not set
+CONFIG_USB_DEVICE_CLASS=y
+# CONFIG_USB_DYNAMIC_MINORS is not set
+# CONFIG_USB_OTG is not set
+# CONFIG_USB_OTG_WHITELIST is not set
+# CONFIG_USB_OTG_BLACKLIST_HUB is not set
+CONFIG_USB_MON=y
+# CONFIG_USB_WUSB is not set
+# CONFIG_USB_WUSB_CBAF is not set
+
+#
+# USB Host Controller Drivers
+#
+# CONFIG_USB_C67X00_HCD is not set
+# CONFIG_USB_OXU210HP_HCD is not set
+# CONFIG_USB_ISP116X_HCD is not set
+# CONFIG_USB_ISP1760_HCD is not set
+# CONFIG_USB_SL811_HCD is not set
+CONFIG_USB_R8A66597_HCD=y
+# CONFIG_USB_HWA_HCD is not set
+
+#
+# USB Device Class drivers
+#
+# CONFIG_USB_ACM is not set
+# CONFIG_USB_PRINTER is not set
+# CONFIG_USB_WDM is not set
+# CONFIG_USB_TMC is not set
+
+#
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
+#
+
+#
+# also be needed; see USB_STORAGE Help for more info
+#
+CONFIG_USB_STORAGE=y
+# CONFIG_USB_STORAGE_DEBUG is not set
+# CONFIG_USB_STORAGE_DATAFAB is not set
+# CONFIG_USB_STORAGE_FREECOM is not set
+# CONFIG_USB_STORAGE_ISD200 is not set
+# CONFIG_USB_STORAGE_USBAT is not set
+# CONFIG_USB_STORAGE_SDDR09 is not set
+# CONFIG_USB_STORAGE_SDDR55 is not set
+# CONFIG_USB_STORAGE_JUMPSHOT is not set
+# CONFIG_USB_STORAGE_ALAUDA is not set
+# CONFIG_USB_STORAGE_ONETOUCH is not set
+# CONFIG_USB_STORAGE_KARMA is not set
+# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
+# CONFIG_USB_LIBUSUAL is not set
+
+#
+# USB Imaging devices
+#
+# CONFIG_USB_MDC800 is not set
+# CONFIG_USB_MICROTEK is not set
+
+#
+# USB port drivers
+#
+# CONFIG_USB_SERIAL is not set
+
+#
+# USB Miscellaneous drivers
+#
+# CONFIG_USB_EMI62 is not set
+# CONFIG_USB_EMI26 is not set
+# CONFIG_USB_ADUTUX is not set
+# CONFIG_USB_SEVSEG is not set
+# CONFIG_USB_RIO500 is not set
+# CONFIG_USB_LEGOTOWER is not set
+# CONFIG_USB_LCD is not set
+# CONFIG_USB_BERRY_CHARGE is not set
+# CONFIG_USB_LED is not set
+# CONFIG_USB_CYPRESS_CY7C63 is not set
+# CONFIG_USB_CYTHERM is not set
+# CONFIG_USB_IDMOUSE is not set
+# CONFIG_USB_FTDI_ELAN is not set
+# CONFIG_USB_APPLEDISPLAY is not set
+# CONFIG_USB_LD is not set
+# CONFIG_USB_TRANCEVIBRATOR is not set
+# CONFIG_USB_IOWARRIOR is not set
+# CONFIG_USB_ISIGHTFW is not set
+# CONFIG_USB_VST is not set
+# CONFIG_USB_GADGET is not set
+
+#
+# OTG and related infrastructure
+#
+# CONFIG_USB_GPIO_VBUS is not set
+# CONFIG_NOP_USB_XCEIV is not set
+CONFIG_MMC=y
+# CONFIG_MMC_DEBUG is not set
+# CONFIG_MMC_UNSAFE_RESUME is not set
+
+#
+# MMC/SD/SDIO Card Drivers
+#
+CONFIG_MMC_BLOCK=y
+CONFIG_MMC_BLOCK_BOUNCE=y
+# CONFIG_SDIO_UART is not set
+# CONFIG_MMC_TEST is not set
+
+#
+# MMC/SD/SDIO Host Controller Drivers
+#
+# CONFIG_MMC_SDHCI is not set
+CONFIG_MMC_SPI=y
+# CONFIG_MEMSTICK is not set
+# CONFIG_NEW_LEDS is not set
+# CONFIG_ACCESSIBILITY is not set
+CONFIG_RTC_LIB=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_HCTOSYS=y
+CONFIG_RTC_HCTOSYS_DEVICE="rtc0"
+# CONFIG_RTC_DEBUG is not set
+
+#
+# RTC interfaces
+#
+CONFIG_RTC_INTF_SYSFS=y
+CONFIG_RTC_INTF_PROC=y
+CONFIG_RTC_INTF_DEV=y
+# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set
+# CONFIG_RTC_DRV_TEST is not set
+
+#
+# I2C RTC drivers
+#
+# CONFIG_RTC_DRV_DS1307 is not set
+# CONFIG_RTC_DRV_DS1374 is not set
+# CONFIG_RTC_DRV_DS1672 is not set
+# CONFIG_RTC_DRV_MAX6900 is not set
+# CONFIG_RTC_DRV_RS5C372 is not set
+# CONFIG_RTC_DRV_ISL1208 is not set
+# CONFIG_RTC_DRV_X1205 is not set
+CONFIG_RTC_DRV_PCF8563=y
+# CONFIG_RTC_DRV_PCF8583 is not set
+# CONFIG_RTC_DRV_M41T80 is not set
+# CONFIG_RTC_DRV_S35390A is not set
+# CONFIG_RTC_DRV_FM3130 is not set
+# CONFIG_RTC_DRV_RX8581 is not set
+
+#
+# SPI RTC drivers
+#
+# CONFIG_RTC_DRV_M41T94 is not set
+# CONFIG_RTC_DRV_DS1305 is not set
+# CONFIG_RTC_DRV_DS1390 is not set
+# CONFIG_RTC_DRV_MAX6902 is not set
+# CONFIG_RTC_DRV_R9701 is not set
+# CONFIG_RTC_DRV_RS5C348 is not set
+# CONFIG_RTC_DRV_DS3234 is not set
+
+#
+# Platform RTC drivers
+#
+# CONFIG_RTC_DRV_DS1286 is not set
+# CONFIG_RTC_DRV_DS1511 is not set
+# CONFIG_RTC_DRV_DS1553 is not set
+# CONFIG_RTC_DRV_DS1742 is not set
+# CONFIG_RTC_DRV_STK17TA8 is not set
+# CONFIG_RTC_DRV_M48T86 is not set
+# CONFIG_RTC_DRV_M48T35 is not set
+# CONFIG_RTC_DRV_M48T59 is not set
+# CONFIG_RTC_DRV_BQ4802 is not set
+# CONFIG_RTC_DRV_V3020 is not set
+
+#
+# on-CPU RTC drivers
+#
+# CONFIG_RTC_DRV_SH is not set
+# CONFIG_RTC_DRV_GENERIC is not set
+# CONFIG_DMADEVICES is not set
+# CONFIG_AUXDISPLAY is not set
+CONFIG_UIO=y
+# CONFIG_UIO_PDRV is not set
+CONFIG_UIO_PDRV_GENIRQ=y
+# CONFIG_UIO_SMX is not set
+# CONFIG_UIO_SERCOS3 is not set
+# CONFIG_STAGING is not set
+
+#
+# File systems
+#
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+# CONFIG_EXT2_FS_XIP is not set
+CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
+CONFIG_EXT3_FS_XATTR=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+# CONFIG_EXT4_FS is not set
+CONFIG_JBD=y
+CONFIG_FS_MBCACHE=y
+# CONFIG_REISERFS_FS is not set
+# CONFIG_JFS_FS is not set
+CONFIG_FS_POSIX_ACL=y
+CONFIG_FILE_LOCKING=y
+# CONFIG_XFS_FS is not set
+# CONFIG_OCFS2_FS is not set
+# CONFIG_BTRFS_FS is not set
+CONFIG_DNOTIFY=y
+CONFIG_INOTIFY=y
+CONFIG_INOTIFY_USER=y
+# CONFIG_QUOTA is not set
+# CONFIG_AUTOFS_FS is not set
+# CONFIG_AUTOFS4_FS is not set
+# CONFIG_FUSE_FS is not set
+
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
+#
+# CD-ROM/DVD Filesystems
+#
+# CONFIG_ISO9660_FS is not set
+# CONFIG_UDF_FS is not set
+
+#
+# DOS/FAT/NT Filesystems
+#
+CONFIG_FAT_FS=y
+# CONFIG_MSDOS_FS is not set
+CONFIG_VFAT_FS=y
+CONFIG_FAT_DEFAULT_CODEPAGE=437
+CONFIG_FAT_DEFAULT_IOCHARSET="iso8859-1"
+# CONFIG_NTFS_FS is not set
+
+#
+# Pseudo filesystems
+#
+CONFIG_PROC_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_PROC_SYSCTL=y
+CONFIG_PROC_PAGE_MONITOR=y
+CONFIG_SYSFS=y
+CONFIG_TMPFS=y
+# CONFIG_TMPFS_POSIX_ACL is not set
+# CONFIG_HUGETLBFS is not set
+# CONFIG_HUGETLB_PAGE is not set
+# CONFIG_CONFIGFS_FS is not set
+CONFIG_MISC_FILESYSTEMS=y
+# CONFIG_ADFS_FS is not set
+# CONFIG_AFFS_FS is not set
+# CONFIG_HFS_FS is not set
+# CONFIG_HFSPLUS_FS is not set
+# CONFIG_BEFS_FS is not set
+# CONFIG_BFS_FS is not set
+# CONFIG_EFS_FS is not set
+# CONFIG_JFFS2_FS is not set
+# CONFIG_UBIFS_FS is not set
+# CONFIG_CRAMFS is not set
+# CONFIG_SQUASHFS is not set
+# CONFIG_VXFS_FS is not set
+# CONFIG_MINIX_FS is not set
+# CONFIG_OMFS_FS is not set
+# CONFIG_HPFS_FS is not set
+# CONFIG_QNX4FS_FS is not set
+# CONFIG_ROMFS_FS is not set
+# CONFIG_SYSV_FS is not set
+# CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
+CONFIG_NETWORK_FILESYSTEMS=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3=y
+# CONFIG_NFS_V3_ACL is not set
+# CONFIG_NFS_V4 is not set
+CONFIG_ROOT_NFS=y
+CONFIG_NFSD=y
+CONFIG_NFSD_V3=y
+# CONFIG_NFSD_V3_ACL is not set
+# CONFIG_NFSD_V4 is not set
+CONFIG_LOCKD=y
+CONFIG_LOCKD_V4=y
+CONFIG_EXPORTFS=y
+CONFIG_NFS_COMMON=y
+CONFIG_SUNRPC=y
+# CONFIG_RPCSEC_GSS_KRB5 is not set
+# CONFIG_RPCSEC_GSS_SPKM3 is not set
+# CONFIG_SMB_FS is not set
+# CONFIG_CIFS is not set
+# CONFIG_NCP_FS is not set
+# CONFIG_CODA_FS is not set
+# CONFIG_AFS_FS is not set
+
+#
+# Partition Types
+#
+# CONFIG_PARTITION_ADVANCED is not set
+CONFIG_MSDOS_PARTITION=y
+CONFIG_NLS=y
+CONFIG_NLS_DEFAULT="iso8859-1"
+CONFIG_NLS_CODEPAGE_437=y
+# CONFIG_NLS_CODEPAGE_737 is not set
+# CONFIG_NLS_CODEPAGE_775 is not set
+# CONFIG_NLS_CODEPAGE_850 is not set
+# CONFIG_NLS_CODEPAGE_852 is not set
+# CONFIG_NLS_CODEPAGE_855 is not set
+# CONFIG_NLS_CODEPAGE_857 is not set
+# CONFIG_NLS_CODEPAGE_860 is not set
+# CONFIG_NLS_CODEPAGE_861 is not set
+# CONFIG_NLS_CODEPAGE_862 is not set
+# CONFIG_NLS_CODEPAGE_863 is not set
+# CONFIG_NLS_CODEPAGE_864 is not set
+# CONFIG_NLS_CODEPAGE_865 is not set
+# CONFIG_NLS_CODEPAGE_866 is not set
+# CONFIG_NLS_CODEPAGE_869 is not set
+# CONFIG_NLS_CODEPAGE_936 is not set
+# CONFIG_NLS_CODEPAGE_950 is not set
+CONFIG_NLS_CODEPAGE_932=y
+# CONFIG_NLS_CODEPAGE_949 is not set
+# CONFIG_NLS_CODEPAGE_874 is not set
+# CONFIG_NLS_ISO8859_8 is not set
+# CONFIG_NLS_CODEPAGE_1250 is not set
+# CONFIG_NLS_CODEPAGE_1251 is not set
+# CONFIG_NLS_ASCII is not set
+CONFIG_NLS_ISO8859_1=y
+# CONFIG_NLS_ISO8859_2 is not set
+# CONFIG_NLS_ISO8859_3 is not set
+# CONFIG_NLS_ISO8859_4 is not set
+# CONFIG_NLS_ISO8859_5 is not set
+# CONFIG_NLS_ISO8859_6 is not set
+# CONFIG_NLS_ISO8859_7 is not set
+# CONFIG_NLS_ISO8859_9 is not set
+# CONFIG_NLS_ISO8859_13 is not set
+# CONFIG_NLS_ISO8859_14 is not set
+# CONFIG_NLS_ISO8859_15 is not set
+# CONFIG_NLS_KOI8_R is not set
+# CONFIG_NLS_KOI8_U is not set
+# CONFIG_NLS_UTF8 is not set
+# CONFIG_DLM is not set
+
+#
+# Kernel hacking
+#
+CONFIG_TRACE_IRQFLAGS_SUPPORT=y
+# CONFIG_PRINTK_TIME is not set
+CONFIG_ENABLE_WARN_DEPRECATED=y
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=1024
+# CONFIG_MAGIC_SYSRQ is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+# CONFIG_DEBUG_FS is not set
+# CONFIG_HEADERS_CHECK is not set
+# CONFIG_DEBUG_KERNEL is not set
+# CONFIG_DEBUG_BUGVERBOSE is not set
+# CONFIG_DEBUG_MEMORY_INIT is not set
+# CONFIG_RCU_CPU_STALL_DETECTOR is not set
+# CONFIG_LATENCYTOP is not set
+CONFIG_SYSCTL_SYSCALL_CHECK=y
+CONFIG_HAVE_FUNCTION_TRACER=y
+CONFIG_HAVE_DYNAMIC_FTRACE=y
+CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
+CONFIG_TRACING_SUPPORT=y
+
+#
+# Tracers
+#
+# CONFIG_FUNCTION_TRACER is not set
+# CONFIG_IRQSOFF_TRACER is not set
+# CONFIG_PREEMPT_TRACER is not set
+# CONFIG_SCHED_TRACER is not set
+# CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_BOOT_TRACER is not set
+# CONFIG_TRACE_BRANCH_PROFILING is not set
+# CONFIG_STACK_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+# CONFIG_BLK_DEV_IO_TRACE is not set
+# CONFIG_DMA_API_DEBUG is not set
+# CONFIG_SAMPLES is not set
+CONFIG_HAVE_ARCH_KGDB=y
+# CONFIG_SH_STANDARD_BIOS is not set
+# CONFIG_EARLY_SCIF_CONSOLE is not set
+
+#
+# Security options
+#
+# CONFIG_KEYS is not set
+# CONFIG_SECURITY is not set
+# CONFIG_SECURITYFS is not set
+# CONFIG_SECURITY_FILE_CAPABILITIES is not set
+CONFIG_CRYPTO=y
+
+#
+# Crypto core or helper
+#
+# CONFIG_CRYPTO_FIPS is not set
+CONFIG_CRYPTO_ALGAPI=y
+CONFIG_CRYPTO_ALGAPI2=y
+CONFIG_CRYPTO_AEAD2=y
+CONFIG_CRYPTO_BLKCIPHER=y
+CONFIG_CRYPTO_BLKCIPHER2=y
+CONFIG_CRYPTO_HASH2=y
+CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
+CONFIG_CRYPTO_MANAGER=y
+CONFIG_CRYPTO_MANAGER2=y
+# CONFIG_CRYPTO_GF128MUL is not set
+# CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
+# CONFIG_CRYPTO_CRYPTD is not set
+# CONFIG_CRYPTO_AUTHENC is not set
+# CONFIG_CRYPTO_TEST is not set
+
+#
+# Authenticated Encryption with Associated Data
+#
+# CONFIG_CRYPTO_CCM is not set
+# CONFIG_CRYPTO_GCM is not set
+# CONFIG_CRYPTO_SEQIV is not set
+
+#
+# Block modes
+#
+CONFIG_CRYPTO_CBC=y
+# CONFIG_CRYPTO_CTR is not set
+# CONFIG_CRYPTO_CTS is not set
+# CONFIG_CRYPTO_ECB is not set
+# CONFIG_CRYPTO_LRW is not set
+# CONFIG_CRYPTO_PCBC is not set
+# CONFIG_CRYPTO_XTS is not set
+
+#
+# Hash modes
+#
+# CONFIG_CRYPTO_HMAC is not set
+# CONFIG_CRYPTO_XCBC is not set
+
+#
+# Digest
+#
+# CONFIG_CRYPTO_CRC32C is not set
+# CONFIG_CRYPTO_MD4 is not set
+# CONFIG_CRYPTO_MD5 is not set
+# CONFIG_CRYPTO_MICHAEL_MIC is not set
+# CONFIG_CRYPTO_RMD128 is not set
+# CONFIG_CRYPTO_RMD160 is not set
+# CONFIG_CRYPTO_RMD256 is not set
+# CONFIG_CRYPTO_RMD320 is not set
+# CONFIG_CRYPTO_SHA1 is not set
+# CONFIG_CRYPTO_SHA256 is not set
+# CONFIG_CRYPTO_SHA512 is not set
+# CONFIG_CRYPTO_TGR192 is not set
+# CONFIG_CRYPTO_WP512 is not set
+
+#
+# Ciphers
+#
+# CONFIG_CRYPTO_AES is not set
+# CONFIG_CRYPTO_ANUBIS is not set
+# CONFIG_CRYPTO_ARC4 is not set
+# CONFIG_CRYPTO_BLOWFISH is not set
+# CONFIG_CRYPTO_CAMELLIA is not set
+# CONFIG_CRYPTO_CAST5 is not set
+# CONFIG_CRYPTO_CAST6 is not set
+# CONFIG_CRYPTO_DES is not set
+# CONFIG_CRYPTO_FCRYPT is not set
+# CONFIG_CRYPTO_KHAZAD is not set
+# CONFIG_CRYPTO_SALSA20 is not set
+# CONFIG_CRYPTO_SEED is not set
+# CONFIG_CRYPTO_SERPENT is not set
+# CONFIG_CRYPTO_TEA is not set
+# CONFIG_CRYPTO_TWOFISH is not set
+
+#
+# Compression
+#
+# CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
+# CONFIG_CRYPTO_LZO is not set
+
+#
+# Random Number Generation
+#
+# CONFIG_CRYPTO_ANSI_CPRNG is not set
+CONFIG_CRYPTO_HW=y
+# CONFIG_BINARY_PRINTF is not set
+
+#
+# Library routines
+#
+CONFIG_BITREVERSE=y
+CONFIG_GENERIC_FIND_LAST_BIT=y
+# CONFIG_CRC_CCITT is not set
+# CONFIG_CRC16 is not set
+CONFIG_CRC_T10DIF=y
+CONFIG_CRC_ITU_T=y
+CONFIG_CRC32=y
+CONFIG_CRC7=y
+# CONFIG_LIBCRC32C is not set
+CONFIG_HAS_IOMEM=y
+CONFIG_HAS_IOPORT=y
+CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
diff --git a/arch/sh/include/mach-se/mach/se7724.h b/arch/sh/include/mach-se/mach/se7724.h
new file mode 100644
index 0000000000000..74164b60d0dbb
--- /dev/null
+++ b/arch/sh/include/mach-se/mach/se7724.h
@@ -0,0 +1,67 @@
+#ifndef __ASM_SH_SE7724_H
+#define __ASM_SH_SE7724_H
+
+/*
+ * linux/include/asm-sh/se7724.h
+ *
+ * Copyright (C) 2009 Renesas Solutions Corp.
+ *
+ * Kuninori Morimoto <morimoto.kuninori@renesas.com>
+ *
+ * Hitachi UL SolutionEngine 7724 Support.
+ *
+ * Based on se7722.h
+ * Copyright (C) 2007  Nobuhiro Iwamatsu
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ */
+#include <asm/addrspace.h>
+
+#define PA_LED		(0xba203000)	/* 8bit LED */
+#define IRQ_MODE	(0xba200010)
+#define IRQ0_SR		(0xba200014)
+#define IRQ1_SR		(0xba200018)
+#define IRQ2_SR		(0xba20001c)
+#define IRQ0_MR		(0xba200020)
+#define IRQ1_MR		(0xba200024)
+#define IRQ2_MR		(0xba200028)
+
+/* IRQ */
+#define IRQ0_IRQ        32
+#define IRQ1_IRQ        33
+#define IRQ2_IRQ        34
+
+/* Bits in IRQ012 registers */
+#define SE7724_FPGA_IRQ_BASE	220
+
+/* IRQ0 */
+#define IRQ0_BASE	SE7724_FPGA_IRQ_BASE
+#define IRQ0_KEY	(IRQ0_BASE + 12)
+#define IRQ0_RMII	(IRQ0_BASE + 13)
+#define IRQ0_SMC	(IRQ0_BASE + 14)
+#define IRQ0_MASK	0x7fff
+#define IRQ0_END	IRQ0_SMC
+/* IRQ1 */
+#define IRQ1_BASE	(IRQ0_END + 1)
+#define IRQ1_TS		(IRQ1_BASE + 0)
+#define IRQ1_MASK	0x0001
+#define IRQ1_END	IRQ1_TS
+/* IRQ2 */
+#define IRQ2_BASE	(IRQ1_END + 1)
+#define IRQ2_USB0	(IRQ1_BASE + 0)
+#define IRQ2_USB1	(IRQ1_BASE + 1)
+#define IRQ2_MASK	0x0003
+#define IRQ2_END	IRQ2_USB1
+
+#define SE7724_FPGA_IRQ_NR	(IRQ2_END - IRQ0_BASE)
+
+/* arch/sh/boards/se/7724/irq.c */
+void init_se7724_IRQ(void);
+
+#define __IO_PREFIX		se7724
+#include <asm/io_generic.h>
+
+#endif  /* __ASM_SH_SE7724_H */
-- 
GitLab


From 5b25ab29bad3114f798b136b4147f255a5d5742f Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 17:02:18 +0900
Subject: [PATCH 1302/2198] sh: Record ms7724se in mach-types.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/tools/mach-types | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/tools/mach-types b/arch/sh/tools/mach-types
index 8477b5d884fd1..fec3a53b86504 100644
--- a/arch/sh/tools/mach-types
+++ b/arch/sh/tools/mach-types
@@ -23,6 +23,7 @@ HD64461			HD64461
 7619SE			SH_7619_SOLUTION_ENGINE
 7721SE			SH_7721_SOLUTION_ENGINE
 7722SE			SH_7722_SOLUTION_ENGINE
+7724SE			SH_7724_SOLUTION_ENGINE
 7751SE			SH_7751_SOLUTION_ENGINE
 7780SE			SH_7780_SOLUTION_ENGINE
 7751SYSTEMH		SH_7751_SYSTEMH
-- 
GitLab


From 69aa48ab82e17299efe2be6c21795945731a6c17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:02:27 +0200
Subject: [PATCH 1303/2198] perf record: Straighten out argv types

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 1b19f187d3529..f225efaff9f52 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -191,7 +191,7 @@ static void display_help(void)
 	exit(0);
 }
 
-static void process_options(int argc, const char *argv[])
+static void process_options(int argc, char * const argv[])
 {
 	int error = 0, counter;
 
@@ -538,7 +538,7 @@ static void open_counters(int cpu, pid_t pid)
 	nr_cpu++;
 }
 
-int cmd_record(int argc, const char **argv)
+int cmd_record(int argc, char * const argv[])
 {
 	int i, counter;
 	pid_t pid;
-- 
GitLab


From 4e97ddf09ee3ce715fc334399bae4cc0c0a13057 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 10:07:44 +0200
Subject: [PATCH 1304/2198] perf stat: Remove unused variable

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 88c70be990319..c1053d820c1fb 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -541,8 +541,6 @@ static void skip_signal(int signo)
 
 int cmd_stat(int argc, char **argv, const char *prefix)
 {
-	sigset_t blocked;
-
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	process_options(argc, argv);
-- 
GitLab


From c9904dd15922f349b5f06839e34b1723d4a75940 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:19 +0000
Subject: [PATCH 1305/2198] sh: add pll_clk to sh7785

This patch converts the sh7785 pll implementation from the
all-in-one code in frqmr_recalc() and frqmr_build_rate_table()
to a separate struct clk. This allows us to remove the processor
specific multiplier and use generic rate table functions.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 48 ++++++++++++++++++--------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index cf042b53b3aeb..7021ab0bfb88d 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -56,12 +56,7 @@ static unsigned long frqmr_recalc(struct clk *clk)
 
 	idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
 
-	/*
-	 * XXX: PLL1 multiplier is locked for the default clock mode,
-	 * when mode pin detection and configuration support is added,
-	 * select the multiplier dynamically.
-	 */
-	return clk->parent->rate * 36 / div2[idx];
+	return clk->parent->rate / div2[idx];
 }
 
 static void frqmr_build_rate_table(struct clk *clk)
@@ -75,7 +70,7 @@ static void frqmr_build_rate_table(struct clk *clk)
 
 		data->freq_table[entry].index = entry;
 		data->freq_table[entry].frequency =
-			clk->parent->rate * 36 / div2[i];
+			clk->parent->rate / div2[i];
 
 		entry++;
 	}
@@ -136,6 +131,20 @@ static struct clk_ops frqmr_clk_ops = {
 	.round_rate		= frqmr_round_rate,
 };
 
+static unsigned long pll_recalc(struct clk *clk)
+{
+	/*
+	 * XXX: PLL1 multiplier is locked for the default clock mode,
+	 * when mode pin detection and configuration support is added,
+	 * select the multiplier dynamically.
+	 */
+	return clk->parent->rate * 36;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
 /*
  * Default rate for the root input clock, reset this with clk_set_rate()
  * from the platform code.
@@ -146,11 +155,19 @@ static struct clk extal_clk = {
 	.rate		= 33333333,
 };
 
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.parent		= &extal_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
 static struct clk cpu_clk = {
 	.name		= "cpu_clk",		/* Ick */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &ifc_data,
 };
@@ -159,7 +176,7 @@ static struct clk shyway_clk = {
 	.name		= "shyway_clk",		/* SHck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &sfc_data,
 };
@@ -168,7 +185,7 @@ static struct clk peripheral_clk = {
 	.name		= "peripheral_clk",	/* Pck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &pfc_data,
 };
@@ -177,7 +194,7 @@ static struct clk ddr_clk = {
 	.name		= "ddr_clk",		/* DDRck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &mfc_data,
 };
@@ -186,7 +203,7 @@ static struct clk bus_clk = {
 	.name		= "bus_clk",		/* Bck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &bfc_data,
 };
@@ -195,7 +212,7 @@ static struct clk ga_clk = {
 	.name		= "ga_clk",		/* GAck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.priv		= &s2fc_data,
 };
 
@@ -203,7 +220,7 @@ static struct clk du_clk = {
 	.name		= "du_clk",		/* DUck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.priv		= &s3fc_data,
 };
 
@@ -211,13 +228,14 @@ static struct clk umem_clk = {
 	.name		= "umem_clk",		/* uck */
 	.id		= -1,
 	.ops		= &frqmr_clk_ops,
-	.parent		= &extal_clk,
+	.parent		= &pll_clk,
 	.flags		= CLK_ENABLE_ON_INIT,
 	.priv		= &ufc_data,
 };
 
 static struct clk *clks[] = {
 	&extal_clk,
+	&pll_clk,
 	&cpu_clk,
 	&shyway_clk,
 	&peripheral_clk,
-- 
GitLab


From c94a85746f7bdc13035acdf88c130d7b6fa41bde Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:28 +0000
Subject: [PATCH 1306/2198] sh: add shared clock framework frequency table code

Add SuperH-specific clock framework helper functions:
- clk_rate_table_build() - build cpufreq table from divisors/multipliers
- clk_rate_table_round() - use cpufreq table to find matching frequency

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h | 18 +++++++++
 arch/sh/kernel/cpu/clock.c  | 75 +++++++++++++++++++++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 64c93cb3d685a..60d5d2bc714a7 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -100,4 +100,22 @@ enum clk_sh_algo_id {
 	IP_N1,
 };
 
+struct clk_div_mult_table {
+	unsigned int *divisors;
+	unsigned int nr_divisors;
+	unsigned int *multipliers;
+	unsigned int nr_multipliers;
+};
+
+struct cpufreq_frequency_table;
+void clk_rate_table_build(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  int nr_freqs,
+			  struct clk_div_mult_table *src_table,
+			  unsigned long *bitmap);
+
+long clk_rate_table_round(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  unsigned long rate);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 012d23476a72c..59764d6c5f418 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -29,6 +29,7 @@
 #include <linux/err.h>
 #include <linux/platform_device.h>
 #include <linux/debugfs.h>
+#include <linux/cpufreq.h>
 #include <asm/clock.h>
 #include <asm/machvec.h>
 
@@ -36,6 +37,80 @@ static LIST_HEAD(clock_list);
 static DEFINE_SPINLOCK(clock_lock);
 static DEFINE_MUTEX(clock_list_sem);
 
+void clk_rate_table_build(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  int nr_freqs,
+			  struct clk_div_mult_table *src_table,
+			  unsigned long *bitmap)
+{
+	unsigned long mult, div;
+	unsigned long freq;
+	int i;
+
+	for (i = 0; i < nr_freqs; i++) {
+		div = 1;
+		mult = 1;
+
+		if (src_table->divisors && i < src_table->nr_divisors)
+			div = src_table->divisors[i];
+
+		if (src_table->multipliers && i < src_table->nr_multipliers)
+			mult = src_table->multipliers[i];
+
+		if (!div || !mult || (bitmap && !test_bit(i, bitmap)))
+			freq = CPUFREQ_ENTRY_INVALID;
+		else
+			freq = clk->parent->rate * mult / div;
+
+		freq_table[i].index = i;
+		freq_table[i].frequency = freq;
+	}
+
+	/* Termination entry */
+	freq_table[i].index = i;
+	freq_table[i].frequency = CPUFREQ_TABLE_END;
+}
+
+long clk_rate_table_round(struct clk *clk,
+			  struct cpufreq_frequency_table *freq_table,
+			  unsigned long rate)
+{
+	unsigned long rate_error, rate_error_prev = ~0UL;
+	unsigned long rate_best_fit = rate;
+	unsigned long highest, lowest;
+	int i;
+
+	highest = lowest = 0;
+
+	for (i = 0; freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		unsigned long freq = freq_table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		if (freq > highest)
+			highest = freq;
+		if (freq < lowest)
+			lowest = freq;
+
+		rate_error = abs(freq - rate);
+		if (rate_error < rate_error_prev) {
+			rate_best_fit = freq;
+			rate_error_prev = rate_error;
+		}
+
+		if (rate_error == 0)
+			break;
+	}
+
+	if (rate >= highest)
+		rate_best_fit = highest;
+	if (rate <= lowest)
+		rate_best_fit = lowest;
+
+	return rate_best_fit;
+}
+
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
-- 
GitLab


From df109e630f82de63ec82eebbfec2a57852517f28 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:36 +0000
Subject: [PATCH 1307/2198] sh: use shared frequency tables on sh7785

This patch converts the sh7785 clock code to make use
of clk_rate_table_build() and clk_rate_table_round().
The ->build_rate_table() callback is removed, the
table building is instead handled in ->recalc().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 77 ++++----------------------
 1 file changed, 12 insertions(+), 65 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 7021ab0bfb88d..a4a9bcbec6647 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -19,6 +19,12 @@
 
 static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
 			       24, 32, 36, 48 };
+
+static struct clk_div_mult_table cpg_div = {
+	.divisors = div2,
+	.nr_divisors = ARRAY_SIZE(div2),
+};
+
 struct clk_priv {
 	unsigned int			shift;
 
@@ -52,82 +58,23 @@ FRQMR_CLK_DATA(ifc, 28, 0x000e);
 static unsigned long frqmr_recalc(struct clk *clk)
 {
 	struct clk_priv *data = clk->priv;
-	unsigned int idx;
-
-	idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
-
-	return clk->parent->rate / div2[idx];
-}
-
-static void frqmr_build_rate_table(struct clk *clk)
-{
-	struct clk_priv *data = clk->priv;
-	int i, entry;
-
-	for (i = entry = 0; i < ARRAY_SIZE(div2); i++) {
-		if ((data->div_bitmap & (1 << i)) == 0)
-			continue;
+	unsigned int idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
 
-		data->freq_table[entry].index = entry;
-		data->freq_table[entry].frequency =
-			clk->parent->rate / div2[i];
-
-		entry++;
-	}
-
-	if (entry == 0) {
-		pr_warning("clkfwk: failed to build frequency table "
-			   "for \"%s\" clk!\n", clk->name);
-		return;
-	}
-
-	/* Termination entry */
-	data->freq_table[entry].index = entry;
-	data->freq_table[entry].frequency = CPUFREQ_TABLE_END;
+	clk_rate_table_build(clk, data->freq_table, ARRAY_SIZE(div2),
+			     &cpg_div, &data->div_bitmap);
+	
+	return data->freq_table[idx].frequency;
 }
 
 static long frqmr_round_rate(struct clk *clk, unsigned long rate)
 {
 	struct clk_priv *data = clk->priv;
-	unsigned long rate_error, rate_error_prev = ~0UL;
-	unsigned long rate_best_fit = rate;
-	unsigned long highest, lowest;
-	int i;
-
-	highest = lowest = 0;
-
-	for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
-		unsigned long freq = data->freq_table[i].frequency;
-
-		if (freq == CPUFREQ_ENTRY_INVALID)
-			continue;
-
-		if (freq > highest)
-			highest = freq;
-		if (freq < lowest)
-			lowest = freq;
-
-		rate_error = abs(freq - rate);
-		if (rate_error < rate_error_prev) {
-			rate_best_fit = freq;
-			rate_error_prev = rate_error;
-		}
-
-		if (rate_error == 0)
-			break;
-	}
-
-	if (rate >= highest)
-		rate_best_fit = highest;
-	if (rate <= lowest)
-		rate_best_fit = lowest;
 
-	return rate_best_fit;
+	return clk_rate_table_round(clk, data->freq_table, rate);
 }
 
 static struct clk_ops frqmr_clk_ops = {
 	.recalc			= frqmr_recalc,
-	.build_rate_table	= frqmr_build_rate_table,
 	.round_rate		= frqmr_round_rate,
 };
 
-- 
GitLab


From 61ce5393e4c8914c46ec99cbda76823515109709 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 25 May 2009 08:10:45 +0000
Subject: [PATCH 1308/2198] sh: remove clk_ops->build_rate_table()

This patch removes the ->build_rate_table() callback,
->recalc() may instead be used for this purpose.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h | 1 -
 arch/sh/kernel/cpu/clock.c  | 2 --
 2 files changed, 3 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 60d5d2bc714a7..aa9480d4aa059 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -16,7 +16,6 @@ struct clk_ops {
 	int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id);
 	int (*set_parent)(struct clk *clk, struct clk *parent);
 	long (*round_rate)(struct clk *clk, unsigned long rate);
-	void (*build_rate_table)(struct clk *clk);
 };
 
 struct clk {
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index 59764d6c5f418..aa0fd0893585b 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -138,8 +138,6 @@ void propagate_rate(struct clk *tclk)
 	list_for_each_entry(clkp, &tclk->children, sibling) {
 		if (clkp->ops && clkp->ops->recalc)
 			clkp->rate = clkp->ops->recalc(clkp);
-		if (clkp->ops && clkp->ops->build_rate_table)
-			clkp->ops->build_rate_table(clkp);
 
 		propagate_rate(clkp);
 	}
-- 
GitLab


From f9e2b97dc2cf832ccceea2a2e6eccc4bc9af72cf Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 17:12:20 +0900
Subject: [PATCH 1309/2198] sh: Add a KBUILD_DEFCONFIG for sh64.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Makefile | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index a47fbd260ba4e..0fe35cff90f6b 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -82,7 +82,6 @@ defaultimage-$(CONFIG_SH_7206_SOLUTION_ENGINE)	:= vmlinux
 defaultimage-$(CONFIG_SH_7619_SOLUTION_ENGINE)	:= vmlinux
 
 # Set some sensible Kbuild defaults
-KBUILD_DEFCONFIG	:= shx3_defconfig
 KBUILD_IMAGE		:= $(defaultimage-y)
 
 #
@@ -90,15 +89,17 @@ KBUILD_IMAGE		:= $(defaultimage-y)
 # error messages during linking.
 #
 ifdef CONFIG_SUPERH32
-UTS_MACHINE	:= sh
-BITS		:= 32
-LDFLAGS_vmlinux	+= -e _stext
+UTS_MACHINE		:= sh
+BITS			:= 32
+LDFLAGS_vmlinux		+= -e _stext
+KBUILD_DEFCONFIG	:= shx3_defconfig
 else
-UTS_MACHINE	:= sh64
-BITS		:= 64
-LDFLAGS_vmlinux	+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
-		   --defsym phys_stext_shmedia=phys_stext+1 \
-		   -e phys_stext_shmedia
+UTS_MACHINE		:= sh64
+BITS			:= 64
+LDFLAGS_vmlinux		+= --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \
+			   --defsym phys_stext_shmedia=phys_stext+1 \
+			   -e phys_stext_shmedia
+KBUILD_DEFCONFIG	:= cayman_defconfig
 endif
 
 ifneq ($(SUBARCH),$(ARCH))
-- 
GitLab


From 0e9b20b8a1cab6c6ab4f98f917a2d98783103969 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: [PATCH 1310/2198] perf record: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Git's library (util/parse-options.[ch]) constructs help texts and
error messages automatically, and has a number of other convenience
features as well.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 372 +++++++++-----------
 Documentation/perf_counter/builtin-top.c    |   3 +
 2 files changed, 177 insertions(+), 198 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index f225efaff9f52..f12a7822fcf17 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -2,6 +2,8 @@
 
 #include "perf.h"
 #include "util/util.h"
+#include "util/parse-options.h"
+#include "util/exec_cmd.h"
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -11,7 +13,6 @@
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
-#include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
 #include <stdio.h>
@@ -33,8 +34,8 @@
 
 
-#define ALIGN(x,a)		__ALIGN_MASK(x,(typeof(x))(a)-1)
-#define __ALIGN_MASK(x,mask)	(((x)+(mask))&~(mask))
+#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
+#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
 static int			nr_counters			=  0;
 static __u64			event_id[MAX_COUNTERS]		= { };
@@ -45,7 +46,7 @@ static int			nr_cpus				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 16;
 static int			output;
-static char 			*output_name			= "output.perf";
+static const char		*output_name			= "output.perf";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
@@ -62,192 +63,6 @@ const unsigned int default_count[] = {
 	  10000,
 };
 
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-record [<options>] <cmd>\n"
-	"perf-record Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -c CNT    --count=CNT          # event period to sample\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -o file   --output=<file>      # output file\n"
-	" -p pid    --pid=<pid>		 # record events on existing pid\n"
-	" -r prio   --realtime=<prio>    # use RT prio\n"
-	" -s        --system             # system wide profiling\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char * const argv[])
-{
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"output",	required_argument,	NULL, 'o'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"system",	no_argument,		NULL, 's'},
-			{"inherit",	no_argument,		NULL, 'i'},
-			{"nmi",		no_argument,		NULL, 'n'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:c:e:m:o:p:r:sin",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'e': error				= parse_events(optarg); break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'o': output_name			= strdup(optarg); break;
-		case 'p': target_pid			=   atoi(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		case 's': system_wide                   ^=             1; break;
-		case 'i': inherit			^=	       1; break;
-		case 'n': nmi				^=	       1; break;
-		default: error = 1; break;
-		}
-	}
-
-	if (argc - optind == 0 && target_pid == -1)
-		error = 1;
-
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
 struct mmap_data {
 	int counter;
 	void *base;
@@ -538,16 +353,13 @@ static void open_counters(int cpu, pid_t pid)
 	nr_cpu++;
 }
 
-int cmd_record(int argc, char * const argv[])
+static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
 	pid_t pid;
 	int ret;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
@@ -558,9 +370,6 @@ int cmd_record(int argc, char * const argv[])
 		exit(-1);
 	}
 
-	argc -= optind;
-	argv += optind;
-
 	if (!system_wide) {
 		open_counters(-1, target_pid != -1 ? target_pid : 0);
 	} else for (i = 0; i < nr_cpus; i++)
@@ -575,7 +384,7 @@ int cmd_record(int argc, char * const argv[])
 			perror("failed to fork");
 
 		if (!pid) {
-			if (execvp(argv[0], argv)) {
+			if (execvp(argv[0], (char **)argv)) {
 				perror(argv[0]);
 				exit(-1);
 			}
@@ -610,3 +419,170 @@ int cmd_record(int argc, char * const argv[])
 
 	return 0;
 }
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(const char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+static int parse_events(const struct option *opt, const char *str, int unset)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+static char events_help[100000];
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+
+
+static void create_events_help(void)
+{
+	unsigned int i;
+	char *str;
+	__u64 e;
+
+	str = events_help;
+
+	str += sprintf(str,
+	"event name: [");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		if (i)
+			str += sprintf(str, "|");
+
+		str += sprintf(str, "%s",
+				event_symbols[i].symbol);
+	}
+
+	str += sprintf(str, "|rNNN]");
+}
+
+static const char * const record_usage[] = {
+	"perf record [<options>] <command>",
+	NULL
+};
+
+const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     events_help, parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "record events on existing pid"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_END()
+};
+
+int cmd_record(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	create_events_help();
+
+	argc = parse_options(argc, argv, options, record_usage, 0);
+	if (!argc)
+		usage_with_options(record_usage, options);
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+
+	return __cmd_record(argc, argv);
+}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 4bed265926dd2..626b320764991 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,13 +42,16 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
+
 #include "perf.h"
 #include "util/util.h"
 
 #include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
+
 #include <stdio.h>
+
 #include <errno.h>
 #include <time.h>
 #include <sched.h>
-- 
GitLab


From 8ad8db3788fd9a449941fb2392ca85af4ee1cde1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 11:10:09 +0200
Subject: [PATCH 1311/2198] perf_counter tools: Librarize event string parsing

Extract the event string parser from builtin-record.c, and
librarize it - to be reused in other commands.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile           |   2 +
 Documentation/perf_counter/builtin-record.c   | 154 +-----------------
 .../perf_counter/util/parse-events.c          | 127 +++++++++++++++
 .../perf_counter/util/parse-events.h          |  10 ++
 4 files changed, 145 insertions(+), 148 deletions(-)
 create mode 100644 Documentation/perf_counter/util/parse-events.c
 create mode 100644 Documentation/perf_counter/util/parse-events.h

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 481e4c26cd453..45daa72facdc5 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -290,6 +290,7 @@ LIB_H += ../../include/linux/perf_counter.h
 LIB_H += perf.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
 LIB_H += util/quote.h
 LIB_H += util/util.h
 LIB_H += util/help.h
@@ -304,6 +305,7 @@ LIB_OBJS += util/exec_cmd.o
 LIB_OBJS += util/help.o
 LIB_OBJS += util/levenshtein.o
 LIB_OBJS += util/parse-options.o
+LIB_OBJS += util/parse-events.o
 LIB_OBJS += util/path.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index f12a7822fcf17..6fa6ed664950e 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -3,44 +3,17 @@
 #include "perf.h"
 #include "util/util.h"
 #include "util/parse-options.h"
+#include "util/parse-events.h"
 #include "util/exec_cmd.h"
 
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <time.h>
 #include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-
 
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= { };
 static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
+
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 static int			nr_cpus				=  0;
 static unsigned int		page_size;
@@ -420,131 +393,16 @@ static int __cmd_record(int argc, const char **argv)
 	return 0;
 }
 
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(const char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(const struct option *opt, const char *str, int unset)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-static char events_help[100000];
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-
-
-static void create_events_help(void)
-{
-	unsigned int i;
-	char *str;
-	__u64 e;
-
-	str = events_help;
-
-	str += sprintf(str,
-	"event name: [");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		if (i)
-			str += sprintf(str, "|");
-
-		str += sprintf(str, "%s",
-				event_symbols[i].symbol);
-	}
-
-	str += sprintf(str, "|rNNN]");
-}
-
 static const char * const record_usage[] = {
 	"perf record [<options>] <command>",
 	NULL
 };
 
+static char events_help_msg[EVENTS_HELP_MAX];
+
 const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help, parse_events),
+		     events_help_msg, parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
@@ -566,7 +424,7 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 {
 	int counter;
 
-	create_events_help();
+	create_events_help(events_help_msg);
 
 	argc = parse_options(argc, argv, options, record_usage, 0);
 	if (!argc)
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
new file mode 100644
index 0000000000000..77d0917d55d37
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -0,0 +1,127 @@
+
+#include "../perf.h"
+#include "util.h"
+#include "parse-options.h"
+#include "parse-events.h"
+#include "exec_cmd.h"
+
+int nr_counters;
+
+__u64			event_id[MAX_COUNTERS]		= { };
+
+struct event_symbol {
+	__u64 event;
+	char *symbol;
+};
+
+static struct event_symbol event_symbols[] = {
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
+	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
+
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
+	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+};
+
+/*
+ * Each event can have multiple symbolic names.
+ * Symbolic names are (almost) exactly matched.
+ */
+static __u64 match_event_symbols(const char *str)
+{
+	__u64 config, id;
+	int type;
+	unsigned int i;
+
+	if (sscanf(str, "r%llx", &config) == 1)
+		return config | PERF_COUNTER_RAW_MASK;
+
+	if (sscanf(str, "%d:%llu", &type, &id) == 2)
+		return EID(type, id);
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		if (!strncmp(str, event_symbols[i].symbol,
+			     strlen(event_symbols[i].symbol)))
+			return event_symbols[i].event;
+	}
+
+	return ~0ULL;
+}
+
+int parse_events(const struct option *opt, const char *str, int unset)
+{
+	__u64 config;
+
+again:
+	if (nr_counters == MAX_COUNTERS)
+		return -1;
+
+	config = match_event_symbols(str);
+	if (config == ~0ULL)
+		return -1;
+
+	event_id[nr_counters] = config;
+	nr_counters++;
+
+	str = strstr(str, ",");
+	if (str) {
+		str++;
+		goto again;
+	}
+
+	return 0;
+}
+
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+/*
+ * Create the help text for the event symbols:
+ */
+void create_events_help(char *events_help_msg)
+{
+	unsigned int i;
+	char *str;
+	__u64 e;
+
+	str = events_help_msg;
+
+	str += sprintf(str,
+	"event name: [");
+
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
+		int type, id;
+
+		e = event_symbols[i].event;
+		type = PERF_COUNTER_TYPE(e);
+		id = PERF_COUNTER_ID(e);
+
+		if (i)
+			str += sprintf(str, "|");
+
+		str += sprintf(str, "%s",
+				event_symbols[i].symbol);
+	}
+
+	str += sprintf(str, "|rNNN]");
+}
+
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
new file mode 100644
index 0000000000000..6e2ebe5ff7d71
--- /dev/null
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -0,0 +1,10 @@
+
+extern int nr_counters;
+extern __u64			event_id[MAX_COUNTERS];
+
+extern int parse_events(const struct option *opt, const char *str, int unset);
+
+#define EVENTS_HELP_MAX (128*1024)
+
+extern void create_events_help(char *help_msg);
+
-- 
GitLab


From 5242519b0296d128425368fc6ab17f541d5fa775 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: [PATCH 1312/2198] perf stat: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Extend the event parser library with the extensions that were in
perf-stat before.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c   |   3 +-
 Documentation/perf_counter/builtin-stat.c     | 414 +++---------------
 .../perf_counter/util/parse-events.c          |  82 +++-
 .../perf_counter/util/parse-events.h          |  10 +
 4 files changed, 145 insertions(+), 364 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 6fa6ed664950e..ec2b787b23bd0 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -4,7 +4,6 @@
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
-#include "util/exec_cmd.h"
 
 #include <sched.h>
 
@@ -400,7 +399,7 @@ static const char * const record_usage[] = {
 
 static char events_help_msg[EVENTS_HELP_MAX];
 
-const struct option options[] = {
+static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     events_help_msg, parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index c1053d820c1fb..e7cb9412212b5 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -1,35 +1,5 @@
 /*
- * kerneltop.c: show top kernel functions - performance counters showcase
-
-   Build with:
-
-     cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt
-
-   Sample output:
-
-------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
-------------------------------------------------------------------------------
-
-             weight         RIP          kernel function
-             ______   ________________   _______________
-
-              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
-              33.00 - ffffffff804cb740 : sock_alloc_send_skb
-              31.26 - ffffffff804ce808 : skb_push
-              22.43 - ffffffff80510004 : tcp_established_options
-              19.00 - ffffffff8027d250 : find_get_page
-              15.76 - ffffffff804e4fc9 : eth_type_trans
-              15.20 - ffffffff804d8baa : dst_release
-              14.86 - ffffffff804cf5d8 : skb_release_head_state
-              14.00 - ffffffff802217d5 : read_hpet
-              12.00 - ffffffff804ffb7f : __ip_local_out
-              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
-               8.54 - ffffffff805001a3 : ip_queue_xmit
- */
-
-/*
- * perfstat:  /usr/bin/time -alike performance counter statistics utility
+ * perf stat:  /usr/bin/time -alike performance counter statistics utility
 
           It summarizes the counter events of all tasks (and child tasks),
           covering all CPUs that the command (or workload) executes on.
@@ -38,59 +8,38 @@
 
    Sample output:
 
-   $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+   $ perf stat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
 
    Performance counter stats for 'ls':
 
            163516953 instructions
                 2295 cache-misses
              2855182 branch-misses
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
  */
 
- /*
-  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
-  *
-  * Improvements and fixes by:
-  *
-  *   Arjan van de Ven <arjan@linux.intel.com>
-  *   Yanmin Zhang <yanmin.zhang@intel.com>
-  *   Wu Fengguang <fengguang.wu@intel.com>
-  *   Mike Galbraith <efault@gmx.de>
-  *   Paul Mackerras <paulus@samba.org>
-  *
-  * Released under the GPL v2. (and only v2, not any later version)
-  */
-
 #include "perf.h"
 #include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
 
-#include <getopt.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <time.h>
-#include <sched.h>
-#include <pthread.h>
-
-#include <sys/syscall.h>
-#include <sys/ioctl.h>
-#include <sys/poll.h>
 #include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/uio.h>
-#include <sys/mman.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#define EVENT_MASK_KERNEL		1
-#define EVENT_MASK_USER			2
 
 static int			system_wide			=  0;
+static int			inherit				=  1;
 
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= {
+static __u64			default_event_id[MAX_COUNTERS]	= {
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
@@ -101,20 +50,15 @@ static __u64			event_id[MAX_COUNTERS]		= {
 	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
 	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
 };
+
 static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-static int			event_mask[MAX_COUNTERS];
 
-static int			tid				= -1;
-static int			profile_cpu			= -1;
+static int			target_pid			= -1;
 static int			nr_cpus				=  0;
-static int			nmi				=  1;
-static int			group				=  0;
 static unsigned int		page_size;
 
-static int			zero;
-
 static int			scale				=  1;
 
 static const unsigned int default_count[] = {
@@ -126,197 +70,6 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perfstat [<events...>] <cmd...>\n\n"
-	"PerfStat Options (up to %d event types can be specified):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -l                           # scale counter values\n"
-	" -a                           # system-wide collection\n");
-	exit(0);
-}
-
-static char *event_name(int ctr)
-{
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
-	static char buf[32];
-
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
-		return "unknown-hardware";
-
-	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-	char mask_str[4];
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
-		case 3:
-			if (strchr(mask_str, 'k'))
-				event_mask[nr_counters] |= EVENT_MASK_USER;
-			if (strchr(mask_str, 'u'))
-				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
-		case 2:
-			return EID(type, id);
-
-		default:
-			break;
-	}
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
-
-/*
- * perfstat
- */
-
-char fault_here[1000000];
-
 static void create_perfstat_counter(int counter)
 {
 	struct perf_counter_hw_event hw_event;
@@ -324,7 +77,7 @@ static void create_perfstat_counter(int counter)
 	memset(&hw_event, 0, sizeof(hw_event));
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= 0;
-	hw_event.nmi		= 0;
+	hw_event.nmi		= 1;
 	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
 	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
@@ -343,7 +96,7 @@ static void create_perfstat_counter(int counter)
 			}
 		}
 	} else {
-		hw_event.inherit	= 1;
+		hw_event.inherit	= inherit;
 		hw_event.disabled	= 1;
 
 		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
@@ -355,7 +108,7 @@ static void create_perfstat_counter(int counter)
 	}
 }
 
-int do_perfstat(int argc, char *argv[])
+int do_perfstat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
@@ -369,12 +122,6 @@ int do_perfstat(int argc, char *argv[])
 	for (counter = 0; counter < nr_counters; counter++)
 		create_perfstat_counter(counter);
 
-	argc -= optind;
-	argv += optind;
-
-	if (!argc)
-		display_help();
-
 	/*
 	 * Enable counters and exec the command:
 	 */
@@ -384,7 +131,7 @@ int do_perfstat(int argc, char *argv[])
 	if ((pid = fork()) < 0)
 		perror("failed to fork");
 	if (!pid) {
-		if (execvp(argv[0], argv)) {
+		if (execvp(argv[0], (char **)argv)) {
 			perror(argv[0]);
 			exit(-1);
 		}
@@ -458,70 +205,45 @@ int do_perfstat(int argc, char *argv[])
 	return 0;
 }
 
-static void process_options(int argc, char **argv)
+static void skip_signal(int signo)
 {
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"scale",	no_argument,		NULL, 'l'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"stat",	no_argument,		NULL, 'S'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a': system_wide			=	       1; break;
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-
-		case 'e': error				= parse_events(optarg); break;
-
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'l': scale				=	       1; break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
-			}
-			tid				=   atoi(optarg); break;
-		case 'z': zero				=              1; break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
+}
+
+static const char * const stat_usage[] = {
+	"perf stat [<options>] <command>",
+	NULL
+};
+
+static char events_help_msg[EVENTS_HELP_MAX];
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     events_help_msg, parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "stat events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('l', "scale", &scale,
+			    "scale/normalize counters"),
+	OPT_END()
+};
+
+int cmd_stat(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	create_events_help(events_help_msg);
+	memcpy(event_id, default_event_id, sizeof(default_event_id));
+
+	argc = parse_options(argc, argv, options, stat_usage, 0);
+	if (!argc)
+		usage_with_options(stat_usage, options);
 
 	if (!nr_counters) {
 		nr_counters = 8;
@@ -533,18 +255,6 @@ static void process_options(int argc, char **argv)
 
 		event_count[counter] = default_interval;
 	}
-}
-
-static void skip_signal(int signo)
-{
-}
-
-int cmd_stat(int argc, char **argv, const char *prefix)
-{
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 77d0917d55d37..88c903eb260a7 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -8,6 +8,7 @@
 int nr_counters;
 
 __u64			event_id[MAX_COUNTERS]		= { };
+int			event_mask[MAX_COUNTERS];
 
 struct event_symbol {
 	__u64 event;
@@ -37,6 +38,64 @@ static struct event_symbol event_symbols[] = {
 	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
 };
 
+#define __PERF_COUNTER_FIELD(config, name) \
+	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
+
+#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
+#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
+#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
+#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
+
+static char *hw_event_names[] = {
+	"CPU cycles",
+	"instructions",
+	"cache references",
+	"cache misses",
+	"branches",
+	"branch misses",
+	"bus cycles",
+};
+
+static char *sw_event_names[] = {
+	"cpu clock ticks",
+	"task clock ticks",
+	"pagefaults",
+	"context switches",
+	"CPU migrations",
+	"minor faults",
+	"major faults",
+};
+
+char *event_name(int ctr)
+{
+	__u64 config = event_id[ctr];
+	int type = PERF_COUNTER_TYPE(config);
+	int id = PERF_COUNTER_ID(config);
+	static char buf[32];
+
+	if (PERF_COUNTER_RAW(config)) {
+		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+		return buf;
+	}
+
+	switch (type) {
+	case PERF_TYPE_HARDWARE:
+		if (id < PERF_HW_EVENTS_MAX)
+			return hw_event_names[id];
+		return "unknown-hardware";
+
+	case PERF_TYPE_SOFTWARE:
+		if (id < PERF_SW_EVENTS_MAX)
+			return sw_event_names[id];
+		return "unknown-software";
+
+	default:
+		break;
+	}
+
+	return "unknown";
+}
+
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
@@ -46,12 +105,23 @@ static __u64 match_event_symbols(const char *str)
 	__u64 config, id;
 	int type;
 	unsigned int i;
+	char mask_str[4];
 
 	if (sscanf(str, "r%llx", &config) == 1)
 		return config | PERF_COUNTER_RAW_MASK;
 
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
+	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
+		case 3:
+			if (strchr(mask_str, 'k'))
+				event_mask[nr_counters] |= EVENT_MASK_USER;
+			if (strchr(mask_str, 'u'))
+				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
+		case 2:
+			return EID(type, id);
+
+		default:
+			break;
+	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
@@ -86,14 +156,6 @@ int parse_events(const struct option *opt, const char *str, int unset)
 	return 0;
 }
 
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
 /*
  * Create the help text for the event symbols:
  */
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
index 6e2ebe5ff7d71..0da306bb90281 100644
--- a/Documentation/perf_counter/util/parse-events.h
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -1,6 +1,16 @@
 
+/*
+ * Parse symbolic events/counts passed in as options:
+ */
+
 extern int nr_counters;
 extern __u64			event_id[MAX_COUNTERS];
+extern int			event_mask[MAX_COUNTERS];
+
+#define EVENT_MASK_KERNEL	1
+#define EVENT_MASK_USER		2
+
+extern char *event_name(int ctr);
 
 extern int parse_events(const struct option *opt, const char *str, int unset);
 
-- 
GitLab


From b456bae0ff4f3cf91639dd32b2bfc49b1c30b4b0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: [PATCH 1313/2198] perf top: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 559 +++++------------------
 1 file changed, 105 insertions(+), 454 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 626b320764991..87b925c8f8e8d 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -45,8 +45,10 @@
 
 #include "perf.h"
 #include "util/util.h"
+#include "util/util.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
 
-#include <getopt.h>
 #include <assert.h>
 #include <fcntl.h>
 
@@ -70,8 +72,7 @@
 
 static int			system_wide			=  0;
 
-static int			nr_counters			=  0;
-static __u64			event_id[MAX_COUNTERS]		= {
+static __u64			default_event_id[MAX_COUNTERS]		= {
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
 	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
@@ -88,7 +89,7 @@ static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static __u64			count_filter		       = 100;
 
-static int			tid				= -1;
+static int			target_pid				= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static int			nmi				=  1;
@@ -100,8 +101,6 @@ static int			use_mmap			= 0;
 static int			use_munmap			= 0;
 static int			freq				= 0;
 
-static char			*vmlinux;
-
 static char			*sym_filter;
 static unsigned long		filter_start;
 static unsigned long		filter_end;
@@ -110,18 +109,6 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
-static int			scale;
-
-struct source_line {
-	uint64_t		EIP;
-	unsigned long		count;
-	char			*line;
-	struct source_line	*next;
-};
-
-static struct source_line	*lines;
-static struct source_line	**lines_tail;
-
 static const unsigned int default_count[] = {
 	1000000,
 	1000000,
@@ -131,194 +118,6 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
-static char *hw_event_names[] = {
-	"CPU cycles",
-	"instructions",
-	"cache references",
-	"cache misses",
-	"branches",
-	"branch misses",
-	"bus cycles",
-};
-
-static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
-};
-
-struct event_symbol {
-	__u64 event;
-	char *symbol;
-};
-
-static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
-};
-
-#define __PERF_COUNTER_FIELD(config, name) \
-	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW)
-#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG)
-#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE)
-#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
-
-static void display_events_help(void)
-{
-	unsigned int i;
-	__u64 e;
-
-	printf(
-	" -e EVENT     --event=EVENT   #  symbolic-name        abbreviations");
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
-
-		printf("\n                             %d:%d: %-20s",
-				type, id, event_symbols[i].symbol);
-	}
-
-	printf("\n"
-	"                           rNNN: raw PMU events (eventsel+umask)\n\n");
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: kerneltop [<options>]\n"
-	"   Or: kerneltop -S [<options>] COMMAND [ARGS]\n\n"
-	"KernelTop Options (up to %d event types can be specified at once):\n\n",
-		 MAX_COUNTERS);
-
-	display_events_help();
-
-	printf(
-	" -c CNT    --count=CNT        # event period to sample\n\n"
-	" -C CPU    --cpu=CPU          # CPU (-1 for all)                 [default: -1]\n"
-	" -p PID    --pid=PID          # PID of sampled task (-1 for all) [default: -1]\n\n"
-	" -l                           # show scale factor for RR events\n"
-	" -d delay  --delay=<seconds>  # sampling/display delay           [default:  2]\n"
-	" -f CNT    --filter=CNT       # min-event-count filter          [default: 100]\n\n"
-	" -r prio   --realtime=<prio>  # event acquisition runs with SCHED_FIFO policy\n"
-	" -s symbol --symbol=<symbol>  # function to be showed annotated one-shot\n"
-	" -x path   --vmlinux=<path>   # the vmlinux binary, required for -s use\n"
-	" -z        --zero             # zero counts after display\n"
-	" -D        --dump_symtab      # dump symbol table to stderr on startup\n"
-	" -m pages  --mmap_pages=<pages> # number of mmap data pages\n"
-	" -M        --mmap_info        # print mmap info stream\n"
-	" -U        --munmap_info      # print munmap info stream\n"
-	);
-
-	exit(0);
-}
-
-static char *event_name(int ctr)
-{
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
-	static char buf[32];
-
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
-		return buf;
-	}
-
-	switch (type) {
-	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
-		return "unknown-hardware";
-
-	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
-		return "unknown-software";
-
-	default:
-		break;
-	}
-
-	return "unknown";
-}
-
-/*
- * Each event can have multiple symbolic names.
- * Symbolic names are (almost) exactly matched.
- */
-static __u64 match_event_symbols(char *str)
-{
-	__u64 config, id;
-	int type;
-	unsigned int i;
-
-	if (sscanf(str, "r%llx", &config) == 1)
-		return config | PERF_COUNTER_RAW_MASK;
-
-	if (sscanf(str, "%d:%llu", &type, &id) == 2)
-		return EID(type, id);
-
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
-	}
-
-	return ~0ULL;
-}
-
-static int parse_events(char *str)
-{
-	__u64 config;
-
-again:
-	if (nr_counters == MAX_COUNTERS)
-		return -1;
-
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
-
-	event_id[nr_counters] = config;
-	nr_counters++;
-
-	str = strstr(str, ",");
-	if (str) {
-		str++;
-		goto again;
-	}
-
-	return 0;
-}
-
 /*
  * Symbols
  */
@@ -331,7 +130,6 @@ struct sym_entry {
 	char			*sym;
 	unsigned long		count[MAX_COUNTERS];
 	int			skip;
-	struct source_line	*source;
 };
 
 #define MAX_SYMS		100000
@@ -342,8 +140,6 @@ struct sym_entry		*sym_filter_entry;
 
 static struct sym_entry		sym_table[MAX_SYMS];
 
-static void show_details(struct sym_entry *sym);
-
 /*
  * Ordering weight: count-1 * count-2 * ... / count-n
  */
@@ -419,15 +215,15 @@ static void print_sym_table(void)
 
 	printf( "], ");
 
-	if (tid != -1)
-		printf(" (tid: %d", tid);
+	if (target_pid != -1)
+		printf(" (target_pid: %d", target_pid);
 	else
 		printf(" (all");
 
 	if (profile_cpu != -1)
 		printf(", cpu: %d)\n", profile_cpu);
 	else {
-		if (tid != -1)
+		if (target_pid != -1)
 			printf(")\n");
 		else
 			printf(", %d CPUs)\n", nr_cpus);
@@ -463,9 +259,6 @@ static void print_sym_table(void)
 				pcnt, tmp[i].addr, tmp[i].sym);
 	}
 
-	if (sym_filter_entry)
-		show_details(sym_filter_entry);
-
 	{
 		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
 
@@ -628,134 +421,8 @@ static void parse_symbols(void)
 	}
 }
 
-/*
- * Source lines
- */
-
-static void parse_vmlinux(char *filename)
-{
-	FILE *file;
-	char command[PATH_MAX*2];
-	if (!filename)
-		return;
-
-	sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename);
-
-	file = popen(command, "r");
-	if (!file)
-		return;
-
-	lines_tail = &lines;
-	while (!feof(file)) {
-		struct source_line *src;
-		size_t dummy = 0;
-		char *c;
-
-		src = malloc(sizeof(struct source_line));
-		assert(src != NULL);
-		memset(src, 0, sizeof(struct source_line));
-
-		if (getline(&src->line, &dummy, file) < 0)
-			break;
-		if (!src->line)
-			break;
-
-		c = strchr(src->line, '\n');
-		if (c)
-			*c = 0;
-
-		src->next = NULL;
-		*lines_tail = src;
-		lines_tail = &src->next;
-
-		if (strlen(src->line)>8 && src->line[8] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-		if (strlen(src->line)>8 && src->line[16] == ':')
-			src->EIP = strtoull(src->line, NULL, 16);
-	}
-	pclose(file);
-}
-
-static void record_precise_ip(uint64_t ip)
-{
-	struct source_line *line;
-
-	for (line = lines; line; line = line->next) {
-		if (line->EIP == ip)
-			line->count++;
-		if (line->EIP > ip)
-			break;
-	}
-}
-
-static void lookup_sym_in_vmlinux(struct sym_entry *sym)
-{
-	struct source_line *line;
-	char pattern[PATH_MAX];
-	sprintf(pattern, "<%s>:", sym->sym);
-
-	for (line = lines; line; line = line->next) {
-		if (strstr(line->line, pattern)) {
-			sym->source = line;
-			break;
-		}
-	}
-}
-
-static void show_lines(struct source_line *line_queue, int line_queue_count)
-{
-	int i;
-	struct source_line *line;
-
-	line = line_queue;
-	for (i = 0; i < line_queue_count; i++) {
-		printf("%8li\t%s\n", line->count, line->line);
-		line = line->next;
-	}
-}
-
 #define TRACE_COUNT     3
 
-static void show_details(struct sym_entry *sym)
-{
-	struct source_line *line;
-	struct source_line *line_queue = NULL;
-	int displayed = 0;
-	int line_queue_count = 0;
-
-	if (!sym->source)
-		lookup_sym_in_vmlinux(sym);
-	if (!sym->source)
-		return;
-
-	printf("Showing details for %s\n", sym->sym);
-
-	line = sym->source;
-	while (line) {
-		if (displayed && strstr(line->line, ">:"))
-			break;
-
-		if (!line_queue_count)
-			line_queue = line;
-		line_queue_count ++;
-
-		if (line->count >= count_filter) {
-			show_lines(line_queue, line_queue_count);
-			line_queue_count = 0;
-			line_queue = NULL;
-		} else if (line_queue_count > TRACE_COUNT) {
-			line_queue = line_queue->next;
-			line_queue_count --;
-		}
-
-		line->count = 0;
-		displayed++;
-		if (displayed > 300)
-			break;
-		line = line->next;
-	}
-}
-
 /*
  * Binary search in the histogram table and record the hit:
  */
@@ -764,8 +431,6 @@ static void record_ip(uint64_t ip, int counter)
 	int left_idx, middle_idx, right_idx, idx;
 	unsigned long left, middle, right;
 
-	record_precise_ip(ip);
-
 	left_idx = 0;
 	right_idx = sym_table_count-1;
 	assert(ip <= max_ip && ip >= min_ip);
@@ -822,97 +487,6 @@ static void process_event(uint64_t ip, int counter)
 	record_ip(ip, counter);
 }
 
-static void process_options(int argc, char **argv)
-{
-	int error = 0, counter;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"count",	required_argument,	NULL, 'c'},
-			{"cpu",		required_argument,	NULL, 'C'},
-			{"delay",	required_argument,	NULL, 'd'},
-			{"dump_symtab",	no_argument,		NULL, 'D'},
-			{"event",	required_argument,	NULL, 'e'},
-			{"filter",	required_argument,	NULL, 'f'},
-			{"group",	required_argument,	NULL, 'g'},
-			{"help",	no_argument,		NULL, 'h'},
-			{"nmi",		required_argument,	NULL, 'n'},
-			{"mmap_info",	no_argument,		NULL, 'M'},
-			{"mmap_pages",	required_argument,	NULL, 'm'},
-			{"munmap_info",	no_argument,		NULL, 'U'},
-			{"pid",		required_argument,	NULL, 'p'},
-			{"realtime",	required_argument,	NULL, 'r'},
-			{"scale",	no_argument,		NULL, 'l'},
-			{"symbol",	required_argument,	NULL, 's'},
-			{"stat",	no_argument,		NULL, 'S'},
-			{"vmlinux",	required_argument,	NULL, 'x'},
-			{"zero",	no_argument,		NULL, 'z'},
-			{"freq",	required_argument,	NULL, 'F'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMUF:",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'a': system_wide			=	       1; break;
-		case 'c': default_interval		=   atoi(optarg); break;
-		case 'C':
-			/* CPU and PID are mutually exclusive */
-			if (tid != -1) {
-				printf("WARNING: CPU switch overriding PID\n");
-				sleep(1);
-				tid = -1;
-			}
-			profile_cpu			=   atoi(optarg); break;
-		case 'd': delay_secs			=   atoi(optarg); break;
-		case 'D': dump_symtab			=              1; break;
-
-		case 'e': error				= parse_events(optarg); break;
-
-		case 'f': count_filter			=   atoi(optarg); break;
-		case 'g': group				=   atoi(optarg); break;
-		case 'h':      				  display_help(); break;
-		case 'l': scale				=	       1; break;
-		case 'n': nmi				=   atoi(optarg); break;
-		case 'p':
-			/* CPU and PID are mutually exclusive */
-			if (profile_cpu != -1) {
-				printf("WARNING: PID switch overriding CPU\n");
-				sleep(1);
-				profile_cpu = -1;
-			}
-			tid				=   atoi(optarg); break;
-		case 'r': realtime_prio			=   atoi(optarg); break;
-		case 's': sym_filter			= strdup(optarg); break;
-		case 'x': vmlinux			= strdup(optarg); break;
-		case 'z': zero				=              1; break;
-		case 'm': mmap_pages			=   atoi(optarg); break;
-		case 'M': use_mmap			=              1; break;
-		case 'U': use_munmap			=              1; break;
-		case 'F': freq = 1; default_interval	=   atoi(optarg); break;
-		default: error = 1; break;
-		}
-	}
-	if (error)
-		display_help();
-
-	if (!nr_counters) {
-		nr_counters = 1;
-		event_id[0] = 0;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
-
-		event_count[counter] = default_interval;
-	}
-}
-
 struct mmap_data {
 	int counter;
 	void *base;
@@ -973,11 +547,11 @@ static void mmap_read(struct mmap_data *md)
 		struct ip_event {
 			struct perf_event_header header;
 			__u64 ip;
-			__u32 pid, tid;
+			__u32 pid, target_pid;
 		};
 		struct mmap_event {
 			struct perf_event_header header;
-			__u32 pid, tid;
+			__u32 pid, target_pid;
 			__u64 start;
 			__u64 len;
 			__u64 pgoff;
@@ -1043,7 +617,7 @@ static void mmap_read(struct mmap_data *md)
 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
-int cmd_top(int argc, char **argv, const char *prefix)
+static int __cmd_top(void)
 {
 	struct perf_counter_hw_event hw_event;
 	pthread_t thread;
@@ -1051,27 +625,12 @@ int cmd_top(int argc, char **argv, const char *prefix)
 	unsigned int cpu;
 	int ret;
 
-	page_size = sysconf(_SC_PAGE_SIZE);
-
-	process_options(argc, argv);
-
-	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
-	assert(nr_cpus <= MAX_NR_CPUS);
-	assert(nr_cpus >= 0);
-
-	if (tid != -1 || profile_cpu != -1)
-		nr_cpus = 1;
-
-	parse_symbols();
-	if (vmlinux && sym_filter_entry)
-		parse_vmlinux(vmlinux);
-
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
 		for (counter = 0; counter < nr_counters; counter++) {
 
 			cpu	= profile_cpu;
-			if (tid == -1 && profile_cpu == -1)
+			if (target_pid == -1 && profile_cpu == -1)
 				cpu = i;
 
 			memset(&hw_event, 0, sizeof(hw_event));
@@ -1083,7 +642,7 @@ int cmd_top(int argc, char **argv, const char *prefix)
 			hw_event.munmap		= use_munmap;
 			hw_event.freq		= freq;
 
-			fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0);
+			fd[i][counter] = sys_perf_counter_open(&hw_event, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
 				printf("kerneltop error: syscall returned with %d (%s)\n",
@@ -1147,3 +706,95 @@ int cmd_top(int argc, char **argv, const char *prefix)
 
 	return 0;
 }
+
+static const char * const top_usage[] = {
+	"perf top [<options>]",
+	NULL
+};
+
+static char events_help_msg[EVENTS_HELP_MAX];
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     events_help_msg, parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "profile events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_INTEGER('C', "CPU", &profile_cpu,
+		    "CPU to profile on"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_INTEGER('d', "delay", &realtime_prio,
+		    "number of seconds to delay between refreshes"),
+	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+			    "dump the symbol table used for profiling"),
+	OPT_INTEGER('f', "--count-filter", &count_filter,
+		    "only display functions with more events than this"),
+	OPT_BOOLEAN('g', "group", &group,
+			    "put the counters into a counter group"),
+	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
+		    "only display symbols matchig this pattern"),
+	OPT_BOOLEAN('z', "zero", &group,
+		    "zero history across updates"),
+	OPT_BOOLEAN('M', "use-mmap", &use_mmap,
+		    "track mmap events"),
+	OPT_BOOLEAN('U', "use-munmap", &use_munmap,
+		    "track munmap events"),
+	OPT_INTEGER('F', "--freq", &freq,
+		    "profile at this frequency"),
+	OPT_END()
+};
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	create_events_help(events_help_msg);
+	memcpy(event_id, default_event_id, sizeof(default_event_id));
+
+	argc = parse_options(argc, argv, options, top_usage, 0);
+	if (argc)
+		usage_with_options(top_usage, options);
+
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
+
+	/* CPU and PID are mutually exclusive */
+	if (target_pid != -1 && profile_cpu != -1) {
+		printf("WARNING: PID switch overriding CPU\n");
+		sleep(1);
+		profile_cpu = -1;
+	}
+
+	if (!nr_counters) {
+		nr_counters = 1;
+		event_id[0] = 0;
+	}
+
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (event_count[counter])
+			continue;
+
+		event_count[counter] = default_interval;
+	}
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (target_pid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	parse_symbols();
+
+	return __cmd_top();
+}
-- 
GitLab


From 8fa66bdcc81dd4fc2c91228074d0a4698120c5a2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 12:45:42 -0300
Subject: [PATCH 1314/2198] perf_counter: First part of 'perf report'
 conversion to C + elfutils

Integrate perf-report into 'perf', as builtin-report.c.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |   6 +-
 Documentation/perf_counter/builtin-report.c | 751 ++++++++++++++++++++
 Documentation/perf_counter/builtin.h        |   1 +
 Documentation/perf_counter/command-list.txt |   1 +
 Documentation/perf_counter/perf.c           |   1 +
 5 files changed, 755 insertions(+), 5 deletions(-)
 create mode 100644 Documentation/perf_counter/builtin-report.c

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 45daa72facdc5..49c601e10692d 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -228,7 +228,6 @@ COMPAT_CFLAGS =
 COMPAT_OBJS =
 LIB_H =
 LIB_OBJS =
-PROGRAMS = perf-report
 SCRIPT_PERL =
 SCRIPT_SH =
 TEST_PROGRAMS =
@@ -315,6 +314,7 @@ LIB_OBJS += util/wrapper.o
 
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-record.o
+BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
@@ -811,10 +811,6 @@ clean:
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
 	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
-# temporary hack:
-perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile
-	g++ -g -O2 -Wall -lrt -o $@ $<
-
 .PHONY: all install clean strip
 .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
 .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
new file mode 100644
index 0000000000000..864f68f06a9e8
--- /dev/null
+++ b/Documentation/perf_counter/builtin-report.c
@@ -0,0 +1,751 @@
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+#include <time.h>
+#include <getopt.h>
+#include <assert.h>
+#include <search.h>
+
+#include <sys/ioctl.h>
+#include <sys/poll.h>
+#include <sys/prctl.h>
+#include <sys/wait.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include <linux/unistd.h>
+#include <linux/types.h>
+
+#include "../../include/linux/perf_counter.h"
+#include "list.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char 		const *input_name = "output.perf";
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+static const char *perf_event_names[] = {
+	[PERF_EVENT_MMAP]   = " PERF_EVENT_MMAP",
+	[PERF_EVENT_MUNMAP] = " PERF_EVENT_MUNMAP",
+	[PERF_EVENT_COMM]   = " PERF_EVENT_COMM",
+};
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid,tid;
+	char comm[16];
+};
+
+typedef union event_union {
+	struct perf_event_header header;
+	struct ip_event ip;
+	struct mmap_event mmap;
+	struct comm_event comm;
+} event_t;
+
+struct section {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 offset;
+	char		 name[0];
+};
+
+static struct section *section__new(uint64_t start, uint64_t size,
+				    uint64_t offset, char *name)
+{
+	struct section *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		self->start  = start;
+		self->end    = start + size;
+		self->offset = offset;
+		strcpy(self->name, name);
+	}
+
+	return self;
+}
+
+static void section__delete(struct section *self)
+{
+	free(self);
+}
+
+struct symbol {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	char		 name[0];
+};
+
+static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
+{
+	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		self->start = start;
+		self->end   = start + len;
+		strcpy(self->name, name);
+	}
+
+	return self;
+}
+
+static void symbol__delete(struct symbol *self)
+{
+	free(self);
+}
+
+static size_t symbol__fprintf(struct symbol *self, FILE *fp)
+{
+	return fprintf(fp, " %lx-%lx %s\n",
+		       self->start, self->end, self->name);
+}
+
+struct dso {
+	struct list_head node;
+	struct list_head sections;
+	struct list_head syms;
+	char		 name[0];
+};
+
+static struct dso *dso__new(const char *name)
+{
+	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		strcpy(self->name, name);
+		INIT_LIST_HEAD(&self->sections);
+		INIT_LIST_HEAD(&self->syms);
+	}
+
+	return self;
+}
+
+static void dso__delete_sections(struct dso *self)
+{
+	struct section *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &self->sections, node)
+		section__delete(pos);
+}
+
+static void dso__delete_symbols(struct dso *self)
+{
+	struct symbol *pos, *n;
+
+	list_for_each_entry_safe(pos, n, &self->syms, node)
+		symbol__delete(pos);
+}
+
+static void dso__delete(struct dso *self)
+{
+	dso__delete_sections(self);
+	dso__delete_symbols(self);
+	free(self);
+}
+
+static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+{
+	list_add_tail(&sym->node, &self->syms);
+}
+
+static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+{
+	if (self == NULL)
+		return NULL;
+
+	struct symbol *pos;
+
+	list_for_each_entry(pos, &self->syms, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static int dso__load(struct dso *self)
+{
+	/* FIXME */
+	return 0;
+}
+
+static size_t dso__fprintf(struct dso *self, FILE *fp)
+{
+	struct symbol *pos;
+	size_t ret = fprintf(fp, "dso: %s\n", self->name);
+
+	list_for_each_entry(pos, &self->syms, node)
+		ret += symbol__fprintf(pos, fp);
+
+	return ret;
+}
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+
+	if (dso == NULL) {
+		dso = dso__new(name);
+		if (dso != NULL && dso__load(dso) < 0)
+			goto out_delete_dso;
+
+		dsos__add(dso);
+	}
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static int load_kallsyms(void)
+{
+	kernel_dso = dso__new("[kernel]");
+	if (kernel_dso == NULL)
+		return -1;
+
+	FILE *file = fopen("/proc/kallsyms", "r");
+
+	if (file == NULL)
+		goto out_delete_dso;
+
+	char *line = NULL;
+	size_t n;
+
+	while (!feof(file)) {
+		unsigned long long start;
+		char c, symbf[4096];
+
+		if (getline(&line, &n, file) < 0)
+			break;
+
+		if (!line)
+			goto out_delete_dso;
+
+		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
+			struct symbol *sym = symbol__new(start, 0x1000000, symbf);
+
+			if (sym == NULL)
+				goto out_delete_dso;
+
+			dso__insert_symbol(kernel_dso, sym);
+		}
+	}
+
+	dsos__add(kernel_dso);
+	free(line);
+	fclose(file);
+	return 0;
+
+out_delete_dso:
+	dso__delete(kernel_dso);
+	return -1;
+}
+
+struct map {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 pgoff;
+	struct dso	 *dso;
+};
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(event->filename);
+		if (self->dso == NULL)
+			goto out_delete;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %lx-%lx %lx %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+struct symhist {
+	struct list_head node;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint32_t	 count;
+	char		 level;
+};
+
+static struct symhist *symhist__new(struct symbol *sym, struct dso *dso,
+				    char level)
+{
+	struct symhist *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->sym   = sym;
+		self->dso   = dso;
+		self->level = level;
+		self->count = 0;
+	}
+
+	return self;
+}
+
+static void symhist__delete(struct symhist *self)
+{
+	free(self);
+}
+
+static bool symhist__equal(struct symhist *self, struct symbol *sym,
+			   struct dso *dso, char level)
+{
+	return self->level == level && self->sym == sym && self->dso == dso;
+}
+
+static void symhist__inc(struct symhist *self)
+{
+	++self->count;
+}
+
+static size_t symhist__fprintf(struct symhist *self, FILE *fp)
+{
+	size_t ret = fprintf(fp, "[%c] ", self->level);
+
+	if (self->level != '.')
+		ret += fprintf(fp, "%s", self->sym->name);
+	else
+		ret += fprintf(fp, "%s: %s",
+			       self->dso ? self->dso->name : "<unknown",
+			       self->sym ? self->sym->name : "<unknown>");
+	return ret + fprintf(fp, ": %u\n", self->count);
+}
+
+struct thread {
+	struct list_head node;
+	struct list_head maps;
+	struct list_head symhists;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = NULL;
+		INIT_LIST_HEAD(&self->maps);
+		INIT_LIST_HEAD(&self->symhists);
+	}
+
+	return self;
+}
+
+static void thread__insert_symhist(struct thread *self,
+				   struct symhist *symhist)
+{
+	list_add_tail(&symhist->node, &self->symhists);
+}
+
+static struct symhist *thread__symhists_find(struct thread *self,
+					     struct symbol *sym,
+					     struct dso *dso, char level)
+{
+	struct symhist *pos;
+
+	list_for_each_entry(pos, &self->symhists, node)
+		if (symhist__equal(pos, sym, dso, level))
+			return pos;
+
+	return NULL;
+}
+
+static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
+				 struct dso *dso, char level)
+{
+	struct symhist *symhist = thread__symhists_find(self, sym, dso, level);
+
+	if (symhist == NULL) {
+		symhist = symhist__new(sym, dso, level);
+		if (symhist == NULL)
+			goto out_error;
+		thread__insert_symhist(self, symhist);
+	}
+
+	symhist__inc(symhist);
+	return 0;
+out_error:
+	return -ENOMEM;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__maps_fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = 0;
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct symhist *pos;
+	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->symhists, node)
+		ret += symhist__fprintf(pos, fp);
+
+	return ret;
+}
+
+static LIST_HEAD(threads);
+
+static void threads__add(struct thread *thread)
+{
+	list_add_tail(&thread->node, &threads);
+}
+
+static struct thread *threads__find(pid_t pid)
+{
+	struct thread *pos;
+
+	list_for_each_entry(pos, &threads, node)
+		if (pos->pid == pid)
+			return pos;
+	return NULL;
+}
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct thread *thread = threads__find(pid);
+
+	if (thread == NULL) {
+		thread = thread__new(pid);
+		if (thread != NULL)
+			threads__add(thread);
+	}
+
+	return thread;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	list_add_tail(&map->node, &self->maps);
+}
+
+static struct map *thread__find_map(struct thread *self, uint64_t ip)
+{
+	if (self == NULL)
+		return NULL;
+
+	struct map *pos;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static void threads__fprintf(FILE *fp)
+{
+	struct thread *pos;
+
+	list_for_each_entry(pos, &threads, node)
+		thread__fprintf(pos, fp);
+}
+
+#if 0
+static std::string resolve_user_symbol(int pid, uint64_t ip)
+{
+	std::string sym = "<unknown>";
+
+	maps_t &m = maps[pid];
+	maps_t::const_iterator mi = m.upper_bound(map(ip));
+	if (mi == m.end())
+		return sym;
+
+	ip -= mi->start + mi->pgoff;
+
+	symbols_t &s = dsos[mi->dso].syms;
+	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
+
+	sym = mi->dso + ": <unknown>";
+
+	if (si == s.begin())
+		return sym;
+	si--;
+
+	if (si->start <= ip && ip < si->end)
+		sym = mi->dso + ": " + si->name;
+#if 0
+	else if (si->start <= ip)
+		sym = mi->dso + ": ?" + si->name;
+#endif
+
+	return sym;
+}
+#endif
+
+static void display_help(void)
+{
+	printf(
+	"Usage: perf-report [<options>]\n"
+	" -i file   --input=<file>      # input file\n"
+	);
+
+	exit(0);
+}
+
+static void process_options(int argc, char *argv[])
+{
+	int error = 0;
+
+	for (;;) {
+		int option_index = 0;
+		/** Options for getopt */
+		static struct option long_options[] = {
+			{"input",	required_argument,	NULL, 'i'},
+			{"no-user",	no_argument,		NULL, 'u'},
+			{"no-kernel",	no_argument,		NULL, 'k'},
+			{"no-hv",	no_argument,		NULL, 'h'},
+			{NULL,		0,			NULL,  0 }
+		};
+		int c = getopt_long(argc, argv, "+:i:kuh",
+				    long_options, &option_index);
+		if (c == -1)
+			break;
+
+		switch (c) {
+		case 'i': input_name			= strdup(optarg); break;
+		case 'k': show_mask &= ~SHOW_KERNEL; break;
+		case 'u': show_mask &= ~SHOW_USER; break;
+		case 'h': show_mask &= ~SHOW_HV; break;
+		default: error = 1; break;
+		}
+	}
+
+	if (error)
+		display_help();
+}
+
+int cmd_report(int argc, char **argv)
+{
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	char *buf;
+	event_t *event;
+	int ret, rc = EXIT_FAILURE;
+	unsigned long total = 0;
+
+	page_size = getpagesize();
+
+	process_options(argc, argv);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kallsyms() < 0) {
+		perror("failed to open kallsyms");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+
+	if (!event->header.size) {
+		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
+		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
+		goto done;
+	}
+
+	head += event->header.size;
+
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
+		char level;
+		int show = 0;
+		struct dso *dso = NULL;
+		struct thread *thread = threads__findnew(event->ip.pid);
+
+		if (thread == NULL)
+			goto done;
+
+		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+			show = SHOW_KERNEL;
+			level = 'k';
+			dso = kernel_dso;
+		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			show = SHOW_USER;
+			level = '.';
+			struct map *map = thread__find_map(thread, event->ip.ip);
+			if (map != NULL)
+				dso = map->dso;
+		} else {
+			show = SHOW_HV;
+			level = 'H';
+		}
+
+		if (show & show_mask) {
+			struct symbol *sym = dso__find_symbol(dso, event->ip.ip);
+
+			if (thread__symbol_incnew(thread, sym, dso, level))
+				goto done;
+		}
+		total++;
+	} else switch (event->header.type) {
+	case PERF_EVENT_MMAP: {
+		struct thread *thread = threads__findnew(event->mmap.pid);
+		struct map *map = map__new(&event->mmap);
+
+		if (thread == NULL || map == NULL )
+			goto done;
+		thread__insert_map(thread, map);
+		break;
+	}
+	case PERF_EVENT_COMM: {
+		struct thread *thread = threads__findnew(event->comm.pid);
+
+		if (thread == NULL ||
+		    thread__set_comm(thread, event->comm.comm))
+			goto done;
+		break;
+	}
+	}
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+done:
+	close(input);
+	//dsos__fprintf(stdout);
+	threads__fprintf(stdout);
+#if 0
+	std::map<std::string, int>::iterator hi = hist.begin();
+
+	while (hi != hist.end()) {
+		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
+		hist.erase(hi++);
+	}
+
+	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
+
+	while (ri != rev_hist.end()) {
+		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
+		ri++;
+	}
+#endif
+	return rc;
+}
+
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index d32318aed8cf1..5bfea57d33fe1 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -16,6 +16,7 @@ extern int check_pager_config(const char *cmd);
 
 extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
+extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index d15210aa0cae0..439029207770f 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,6 +1,7 @@
 # List of known perf commands.
 # command name				category [deprecated] [common]
 perf-record                             mainporcelain common
+perf-report                             mainporcelain common
 perf-stat                               mainporcelain common
 perf-top                                mainporcelain common
 
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 1d6d7aa575a34..e8a85842b4939 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -250,6 +250,7 @@ static void handle_internal_command(int argc, const char **argv)
 	static struct cmd_struct commands[] = {
 		{ "help", cmd_help, 0 },
 		{ "record", cmd_record, 0 },
+		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
 		{ "version", cmd_version, 0 },
-- 
GitLab


From fd4242bb35b70557eee8d0c79f82dacc3f3b89e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 May 2009 12:45:34 +0200
Subject: [PATCH 1315/2198] perf_counter tools: remove the standalone
 perf-report utility

With a built-in 'perf report' command now available, remove the
standalone implementation for good.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/perf-report.cc | 515 ----------------------
 1 file changed, 515 deletions(-)
 delete mode 100644 Documentation/perf_counter/perf-report.cc

diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc
deleted file mode 100644
index 8855107fe6b3c..0000000000000
--- a/Documentation/perf_counter/perf-report.cc
+++ /dev/null
@@ -1,515 +0,0 @@
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <getopt.h>
-#include <assert.h>
-
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
-
-#include <set>
-#include <map>
-#include <string>
-
-
-#define SHOW_KERNEL	1
-#define SHOW_USER	2
-#define SHOW_HV		4
-
-static char 		const *input_name = "output.perf";
-static int		input;
-static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
-
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
-
-struct ip_event {
-	struct perf_event_header header;
-	__u64 ip;
-	__u32 pid, tid;
-};
-struct mmap_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
-	char filename[PATH_MAX];
-};
-struct comm_event {
-	struct perf_event_header header;
-	__u32 pid,tid;
-	char comm[16];
-};
-
-typedef union event_union {
-	struct perf_event_header header;
-	struct ip_event ip;
-	struct mmap_event mmap;
-	struct comm_event comm;
-} event_t;
-
-struct section {
-	uint64_t start;
-	uint64_t end;
-
-	uint64_t offset;
-
-	std::string name;
-
-	section() { };
-
-	section(uint64_t stab) : end(stab) { };
-
-	section(uint64_t start, uint64_t size, uint64_t offset, std::string name) :
-		start(start), end(start + size), offset(offset), name(name)
-	{ };
-
-	bool operator < (const struct section &s) const {
-		return end < s.end;
-	};
-};
-
-typedef std::set<struct section> sections_t;
-
-struct symbol {
-	uint64_t start;
-	uint64_t end;
-
-	std::string name;
-
-	symbol() { };
-
-	symbol(uint64_t ip) : start(ip) { }
-
-	symbol(uint64_t start, uint64_t len, std::string name) :
-		start(start), end(start + len), name(name)
-	{ };
-
-	bool operator < (const struct symbol &s) const {
-		return start < s.start;
-	};
-};
-
-typedef std::set<struct symbol> symbols_t;
-
-struct dso {
-	sections_t sections;
-	symbols_t syms;
-};
-
-static std::map<std::string, struct dso> dsos;
-
-static void load_dso_sections(std::string dso_name)
-{
-	struct dso &dso = dsos[dso_name];
-
-	std::string cmd = "readelf -DSW " + dso_name;
-
-	FILE *file = popen(cmd.c_str(), "r");
-	if (!file) {
-		perror("failed to open pipe");
-		exit(-1);
-	}
-
-	char *line = NULL;
-	size_t n = 0;
-
-	while (!feof(file)) {
-		uint64_t addr, off, size;
-		char name[32];
-
-		if (getline(&line, &n, file) < 0)
-			break;
-		if (!line)
-			break;
-
-		if (sscanf(line, "  [%*2d] %16s %*14s %Lx %Lx %Lx",
-					name, &addr, &off, &size) == 4) {
-
-			dso.sections.insert(section(addr, size, addr - off, name));
-		}
-#if 0
-		/*
-		 * for reading readelf symbols (-s), however these don't seem
-		 * to include nearly everything, so use nm for that.
-		 */
-		if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s",
-			   &start, &size, &section, sym) == 4) {
-
-			start -= dso.section_offsets[section];
-
-			dso.syms.insert(symbol(start, size, std::string(sym)));
-		}
-#endif
-	}
-	pclose(file);
-}
-
-static void load_dso_symbols(std::string dso_name, std::string args)
-{
-	struct dso &dso = dsos[dso_name];
-
-	std::string cmd = "nm -nSC " + args + " " + dso_name;
-
-	FILE *file = popen(cmd.c_str(), "r");
-	if (!file) {
-		perror("failed to open pipe");
-		exit(-1);
-	}
-
-	char *line = NULL;
-	size_t n = 0;
-
-	while (!feof(file)) {
-		uint64_t start, size;
-		char c;
-		char sym[1024];
-
-		if (getline(&line, &n, file) < 0)
-			break;
-		if (!line)
-			break;
-
-
-		if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) {
-			sections_t::const_iterator si =
-				dso.sections.upper_bound(section(start));
-			if (si == dso.sections.end()) {
-				printf("symbol in unknown section: %s\n", sym);
-				continue;
-			}
-
-			start -= si->offset;
-
-			dso.syms.insert(symbol(start, size, sym));
-		}
-	}
-	pclose(file);
-}
-
-static void load_dso(std::string dso_name)
-{
-	load_dso_sections(dso_name);
-	load_dso_symbols(dso_name, "-D"); /* dynamic symbols */
-	load_dso_symbols(dso_name, "");   /* regular ones */
-}
-
-void load_kallsyms(void)
-{
-	struct dso &dso = dsos["[kernel]"];
-
-	FILE *file = fopen("/proc/kallsyms", "r");
-	if (!file) {
-		perror("failed to open kallsyms");
-		exit(-1);
-	}
-
-	char *line;
-	size_t n;
-
-	while (!feof(file)) {
-		uint64_t start;
-		char c;
-		char sym[1024000];
-
-		if (getline(&line, &n, file) < 0)
-			break;
-		if (!line)
-			break;
-
-		if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3)
-			dso.syms.insert(symbol(start, 0x1000000, std::string(sym)));
-	}
-	fclose(file);
-}
-
-struct map {
-	uint64_t start;
-	uint64_t end;
-	uint64_t pgoff;
-
-	std::string dso;
-
-	map() { };
-
-	map(uint64_t ip) : end(ip) { }
-
-	map(mmap_event *mmap) {
-		start = mmap->start;
-		end = mmap->start + mmap->len;
-		pgoff = mmap->pgoff;
-
-		dso = std::string(mmap->filename);
-
-		if (dsos.find(dso) == dsos.end())
-			load_dso(dso);
-	};
-
-	bool operator < (const struct map &m) const {
-		return end < m.end;
-	};
-};
-
-typedef std::set<struct map> maps_t;
-
-static std::map<int, maps_t> maps;
-
-static std::map<int, std::string> comms;
-
-static std::map<std::string, int> hist;
-static std::multimap<int, std::string> rev_hist;
-
-static std::string resolve_comm(int pid)
-{
-	std::string comm;
-
-	std::map<int, std::string>::const_iterator ci = comms.find(pid);
-	if (ci != comms.end()) {
-		comm = ci->second;
-	} else {
-		char pid_str[30];
-
-		sprintf(pid_str, ":%d", pid);
-		comm = pid_str;
-	}
-
-	return comm;
-}
-
-static std::string resolve_user_symbol(int pid, uint64_t ip)
-{
-	std::string sym = "<unknown>";
-
-	maps_t &m = maps[pid];
-	maps_t::const_iterator mi = m.upper_bound(map(ip));
-	if (mi == m.end())
-		return sym;
-
-	ip -= mi->start + mi->pgoff;
-
-	symbols_t &s = dsos[mi->dso].syms;
-	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
-
-	sym = mi->dso + ": <unknown>";
-
-	if (si == s.begin())
-		return sym;
-	si--;
-
-	if (si->start <= ip && ip < si->end)
-		sym = mi->dso + ": " + si->name;
-#if 0
-	else if (si->start <= ip)
-		sym = mi->dso + ": ?" + si->name;
-#endif
-
-	return sym;
-}
-
-static std::string resolve_kernel_symbol(uint64_t ip)
-{
-	std::string sym = "<unknown>";
-
-	symbols_t &s = dsos["[kernel]"].syms;
-	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
-
-	if (si == s.begin())
-		return sym;
-	si--;
-
-	if (si->start <= ip && ip < si->end)
-		sym = si->name;
-
-	return sym;
-}
-
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-report [<options>]\n"
-	" -i file   --input=<file>      # input file\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"input",	required_argument,	NULL, 'i'},
-			{"no-user",	no_argument,		NULL, 'u'},
-			{"no-kernel",	no_argument,		NULL, 'k'},
-			{"no-hv",	no_argument,		NULL, 'h'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:i:kuh",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'i': input_name			= strdup(optarg); break;
-		case 'k': show_mask &= ~SHOW_KERNEL; break;
-		case 'u': show_mask &= ~SHOW_USER; break;
-		case 'h': show_mask &= ~SHOW_HV; break;
-		default: error = 1; break;
-		}
-	}
-
-	if (error)
-		display_help();
-}
-
-int main(int argc, char *argv[])
-{
-	unsigned long offset = 0;
-	unsigned long head = 0;
-	struct stat stat;
-	char *buf;
-	event_t *event;
-	int ret;
-	unsigned long total = 0;
-
-	page_size = getpagesize();
-
-	process_options(argc, argv);
-
-	input = open(input_name, O_RDONLY);
-	if (input < 0) {
-		perror("failed to open file");
-		exit(-1);
-	}
-
-	ret = fstat(input, &stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
-	}
-
-	if (!stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
-	}
-
-	load_kallsyms();
-
-remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
-	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
-	}
-
-more:
-	event = (event_t *)(buf + head);
-
-	if (head + event->header.size >= page_size * mmap_window) {
-		unsigned long shift = page_size * (head / page_size);
-		int ret;
-
-		ret = munmap(buf, page_size * mmap_window);
-		assert(ret == 0);
-
-		offset += shift;
-		head -= shift;
-		goto remap;
-	}
-
-
-	if (!event->header.size) {
-		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
-		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
-		goto done;
-	}
-
-	head += event->header.size;
-
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-		std::string comm, sym, level;
-		int show = 0;
-		char output[1024];
-
-		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-			show |= SHOW_KERNEL;
-			level = " [k] ";
-			sym = resolve_kernel_symbol(event->ip.ip);
-		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-			show |= SHOW_USER;
-			level = " [.] ";
-			sym = resolve_user_symbol(event->ip.pid, event->ip.ip);
-		} else {
-			show |= SHOW_HV;
-			level = " [H] ";
-		}
-
-		if (show & show_mask) {
-			comm = resolve_comm(event->ip.pid);
-			snprintf(output, sizeof(output), "%16s %s %s",
-					comm.c_str(), level.c_str(), sym.c_str());
-			hist[output]++;
-		}
-
-		total++;
-
-	} else switch (event->header.type) {
-	case PERF_EVENT_MMAP:
-		maps[event->mmap.pid].insert(map(&event->mmap));
-		break;
-
-	case PERF_EVENT_COMM:
-		comms[event->comm.pid] = std::string(event->comm.comm);
-		break;
-	}
-
-	if (offset + head < stat.st_size)
-		goto more;
-
-done:
-
-	close(input);
-
-	std::map<std::string, int>::iterator hi = hist.begin();
-
-	while (hi != hist.end()) {
-		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
-		hist.erase(hi++);
-	}
-
-	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
-
-	while (ri != rev_hist.end()) {
-		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
-		ri++;
-	}
-
-	return 0;
-}
-
-- 
GitLab


From 62eb93905b3b43cea407cfbc061cc7b40ae1c6e9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 14:28:47 -0300
Subject: [PATCH 1316/2198] perf_counter: Implement dso__load using libelf

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |   2 +-
 Documentation/perf_counter/builtin-report.c | 122 +++++++++++++++++++-
 2 files changed, 121 insertions(+), 3 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 49c601e10692d..6bffa86af6be4 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -160,7 +160,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
 CFLAGS = -g -O2 -Wall
-LDFLAGS = -lpthread -lrt
+LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
 STRIP ?= strip
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 864f68f06a9e8..ad2f327a6576b 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -8,6 +8,9 @@
 #include <stdlib.h>
 #include <string.h>
 #include <limits.h>
+#include <gelf.h>
+#include <elf.h>
+#include <libelf.h>
 #include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
@@ -195,10 +198,123 @@ static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
 	return NULL;
 }
 
+/**
+ * elf_symtab__for_each_symbol - iterate thru all the symbols
+ *
+ * @self: struct elf_symtab instance to iterate
+ * @index: uint32_t index
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
+	for (index = 0, gelf_getsym(syms, index, &sym);\
+	     index < nr_syms; \
+	     index++, gelf_getsym(syms, index, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+	return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline bool elf_sym__is_function(const GElf_Sym *sym)
+{
+	return elf_sym__type(sym) == STT_FUNC &&
+	       sym->st_name != 0 &&
+	       sym->st_shndx != SHN_UNDEF;
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+					const Elf_Data *symstrs)
+{
+	return symstrs->d_buf + sym->st_name;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+				    GElf_Shdr *shp, const char *name,
+				    size_t *index)
+{
+	Elf_Scn *sec = NULL;
+	size_t cnt = 1;
+
+	while ((sec = elf_nextscn(elf, sec)) != NULL) {
+		char *str;
+
+		gelf_getshdr(sec, shp);
+		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+		if (!strcmp(name, str)) {
+			if (index)
+				*index = cnt;
+			break;
+		}
+		++cnt;
+	}
+
+	return sec;
+}
+
 static int dso__load(struct dso *self)
 {
-	/* FIXME */
-	return 0;
+	int fd = open(self->name, O_RDONLY), err = -1;
+
+	if (fd == -1)
+		return -1;
+
+	Elf *elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL) {
+		fprintf(stderr, "%s: cannot read %s ELF file.\n",
+			__func__, self->name);
+		goto out_close;
+	}
+
+	GElf_Ehdr ehdr;
+	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		goto out_elf_end;
+	}
+
+	GElf_Shdr shdr;
+	Elf_Scn *sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	if (sec == NULL)
+		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+
+	if (sec == NULL)
+		goto out_elf_end;
+
+	if (gelf_getshdr(sec, &shdr) == NULL)
+		goto out_elf_end;
+
+	Elf_Data *syms = elf_getdata(sec, NULL);
+	if (syms == NULL)
+		goto out_elf_end;
+
+	sec = elf_getscn(elf, shdr.sh_link);
+	if (sec == NULL)
+		goto out_elf_end;
+
+	Elf_Data *symstrs = elf_getdata(sec, NULL);
+	if (symstrs == NULL)
+		goto out_elf_end;
+
+	const uint32_t nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+	GElf_Sym sym;
+	uint32_t index;
+	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		if (!elf_sym__is_function(&sym))
+			continue;
+		struct symbol *f = symbol__new(sym.st_value, sym.st_size,
+					       elf_sym__name(&sym, symstrs));
+		if (f == NULL)
+			goto out_elf_end;
+
+		dso__insert_symbol(self, f);
+	}
+
+	err = 0;
+out_elf_end:
+	elf_end(elf);
+out_close:
+	close(fd);
+	return err;
 }
 
 static size_t dso__fprintf(struct dso *self, FILE *fp)
@@ -614,6 +730,8 @@ int cmd_report(int argc, char **argv)
 	int ret, rc = EXIT_FAILURE;
 	unsigned long total = 0;
 
+	elf_version(EV_CURRENT);
+
 	page_size = getpagesize();
 
 	process_options(argc, argv);
-- 
GitLab


From 35a50c8a20eea22c141e05c5667ac21c48b8b65d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 16:24:49 -0300
Subject: [PATCH 1317/2198] perf_counter: Use rb_trees in perf report

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |   3 +
 Documentation/perf_counter/builtin-report.c |  60 ++-
 Documentation/perf_counter/util/rbtree.c    | 383 ++++++++++++++++++++
 Documentation/perf_counter/util/rbtree.h    | 171 +++++++++
 4 files changed, 601 insertions(+), 16 deletions(-)
 create mode 100644 Documentation/perf_counter/util/rbtree.c
 create mode 100644 Documentation/perf_counter/util/rbtree.h

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 6bffa86af6be4..412dea15d0b7e 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -287,6 +287,8 @@ LIB_FILE=libperf.a
 
 LIB_H += ../../include/linux/perf_counter.h
 LIB_H += perf.h
+LIB_H += util/list.h
+LIB_H += util/rbtree.h
 LIB_H += util/levenshtein.h
 LIB_H += util/parse-options.h
 LIB_H += util/parse-events.h
@@ -306,6 +308,7 @@ LIB_OBJS += util/levenshtein.o
 LIB_OBJS += util/parse-options.o
 LIB_OBJS += util/parse-events.o
 LIB_OBJS += util/path.o
+LIB_OBJS += util/rbtree.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ad2f327a6576b..f63057fe2cd27 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -32,7 +32,8 @@
 #include <linux/types.h>
 
 #include "../../include/linux/perf_counter.h"
-#include "list.h"
+#include "util/list.h"
+#include "util/rbtree.h"
 
 #define SHOW_KERNEL	1
 #define SHOW_USER	2
@@ -106,10 +107,10 @@ static void section__delete(struct section *self)
 }
 
 struct symbol {
-	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	char		 name[0];
+	struct rb_node rb_node;
+	uint64_t       start;
+	uint64_t       end;
+	char	       name[0];
 };
 
 static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
@@ -139,7 +140,7 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 struct dso {
 	struct list_head node;
 	struct list_head sections;
-	struct list_head syms;
+	struct rb_root	 syms;
 	char		 name[0];
 };
 
@@ -150,7 +151,7 @@ static struct dso *dso__new(const char *name)
 	if (self != NULL) {
 		strcpy(self->name, name);
 		INIT_LIST_HEAD(&self->sections);
-		INIT_LIST_HEAD(&self->syms);
+		self->syms = RB_ROOT;
 	}
 
 	return self;
@@ -166,10 +167,14 @@ static void dso__delete_sections(struct dso *self)
 
 static void dso__delete_symbols(struct dso *self)
 {
-	struct symbol *pos, *n;
+	struct symbol *pos;
+	struct rb_node *next = rb_first(&self->syms);
 
-	list_for_each_entry_safe(pos, n, &self->syms, node)
+	while (next) {
+		pos = rb_entry(next, struct symbol, rb_node);
+		next = rb_next(&pos->rb_node);
 		symbol__delete(pos);
+	}
 }
 
 static void dso__delete(struct dso *self)
@@ -181,7 +186,21 @@ static void dso__delete(struct dso *self)
 
 static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 {
-	list_add_tail(&sym->node, &self->syms);
+	struct rb_node **p = &self->syms.rb_node;
+	struct rb_node *parent = NULL;
+	const uint64_t ip = sym->start;
+	struct symbol *s;
+
+	while (*p != NULL) {
+		parent = *p;
+		s = rb_entry(parent, struct symbol, rb_node);
+		if (ip < s->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&sym->rb_node, parent, p);
+	rb_insert_color(&sym->rb_node, &self->syms);
 }
 
 static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
@@ -189,11 +208,18 @@ static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
 	if (self == NULL)
 		return NULL;
 
-	struct symbol *pos;
+	struct rb_node *n = self->syms.rb_node;
 
-	list_for_each_entry(pos, &self->syms, node)
-		if (ip >= pos->start && ip <= pos->end)
-			return pos;
+	while (n) {
+		struct symbol *s = rb_entry(n, struct symbol, rb_node);
+
+		if (ip < s->start)
+			n = n->rb_left;
+		else if (ip > s->end)
+			n = n->rb_right;
+		else
+			return s;
+	}
 
 	return NULL;
 }
@@ -319,11 +345,13 @@ static int dso__load(struct dso *self)
 
 static size_t dso__fprintf(struct dso *self, FILE *fp)
 {
-	struct symbol *pos;
 	size_t ret = fprintf(fp, "dso: %s\n", self->name);
 
-	list_for_each_entry(pos, &self->syms, node)
+	struct rb_node *nd;
+	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
 		ret += symbol__fprintf(pos, fp);
+	}
 
 	return ret;
 }
diff --git a/Documentation/perf_counter/util/rbtree.c b/Documentation/perf_counter/util/rbtree.c
new file mode 100644
index 0000000000000..b15ba9c7cb3f2
--- /dev/null
+++ b/Documentation/perf_counter/util/rbtree.c
@@ -0,0 +1,383 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/lib/rbtree.c
+*/
+
+#include "rbtree.h"
+
+static void __rb_rotate_left(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *right = node->rb_right;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_right = right->rb_left))
+		rb_set_parent(right->rb_left, node);
+	right->rb_left = node;
+
+	rb_set_parent(right, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_left)
+			parent->rb_left = right;
+		else
+			parent->rb_right = right;
+	}
+	else
+		root->rb_node = right;
+	rb_set_parent(node, right);
+}
+
+static void __rb_rotate_right(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *left = node->rb_left;
+	struct rb_node *parent = rb_parent(node);
+
+	if ((node->rb_left = left->rb_right))
+		rb_set_parent(left->rb_right, node);
+	left->rb_right = node;
+
+	rb_set_parent(left, parent);
+
+	if (parent)
+	{
+		if (node == parent->rb_right)
+			parent->rb_right = left;
+		else
+			parent->rb_left = left;
+	}
+	else
+		root->rb_node = left;
+	rb_set_parent(node, left);
+}
+
+void rb_insert_color(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *parent, *gparent;
+
+	while ((parent = rb_parent(node)) && rb_is_red(parent))
+	{
+		gparent = rb_parent(parent);
+
+		if (parent == gparent->rb_left)
+		{
+			{
+				register struct rb_node *uncle = gparent->rb_right;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			if (parent->rb_right == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_left(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_right(gparent, root);
+		} else {
+			{
+				register struct rb_node *uncle = gparent->rb_left;
+				if (uncle && rb_is_red(uncle))
+				{
+					rb_set_black(uncle);
+					rb_set_black(parent);
+					rb_set_red(gparent);
+					node = gparent;
+					continue;
+				}
+			}
+
+			if (parent->rb_left == node)
+			{
+				register struct rb_node *tmp;
+				__rb_rotate_right(parent, root);
+				tmp = parent;
+				parent = node;
+				node = tmp;
+			}
+
+			rb_set_black(parent);
+			rb_set_red(gparent);
+			__rb_rotate_left(gparent, root);
+		}
+	}
+
+	rb_set_black(root->rb_node);
+}
+
+static void __rb_erase_color(struct rb_node *node, struct rb_node *parent,
+			     struct rb_root *root)
+{
+	struct rb_node *other;
+
+	while ((!node || rb_is_black(node)) && node != root->rb_node)
+	{
+		if (parent->rb_left == node)
+		{
+			other = parent->rb_right;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_left(parent, root);
+				other = parent->rb_right;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_right || rb_is_black(other->rb_right))
+				{
+					rb_set_black(other->rb_left);
+					rb_set_red(other);
+					__rb_rotate_right(other, root);
+					other = parent->rb_right;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				rb_set_black(other->rb_right);
+				__rb_rotate_left(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+		else
+		{
+			other = parent->rb_left;
+			if (rb_is_red(other))
+			{
+				rb_set_black(other);
+				rb_set_red(parent);
+				__rb_rotate_right(parent, root);
+				other = parent->rb_left;
+			}
+			if ((!other->rb_left || rb_is_black(other->rb_left)) &&
+			    (!other->rb_right || rb_is_black(other->rb_right)))
+			{
+				rb_set_red(other);
+				node = parent;
+				parent = rb_parent(node);
+			}
+			else
+			{
+				if (!other->rb_left || rb_is_black(other->rb_left))
+				{
+					rb_set_black(other->rb_right);
+					rb_set_red(other);
+					__rb_rotate_left(other, root);
+					other = parent->rb_left;
+				}
+				rb_set_color(other, rb_color(parent));
+				rb_set_black(parent);
+				rb_set_black(other->rb_left);
+				__rb_rotate_right(parent, root);
+				node = root->rb_node;
+				break;
+			}
+		}
+	}
+	if (node)
+		rb_set_black(node);
+}
+
+void rb_erase(struct rb_node *node, struct rb_root *root)
+{
+	struct rb_node *child, *parent;
+	int color;
+
+	if (!node->rb_left)
+		child = node->rb_right;
+	else if (!node->rb_right)
+		child = node->rb_left;
+	else
+	{
+		struct rb_node *old = node, *left;
+
+		node = node->rb_right;
+		while ((left = node->rb_left) != NULL)
+			node = left;
+		child = node->rb_right;
+		parent = rb_parent(node);
+		color = rb_color(node);
+
+		if (child)
+			rb_set_parent(child, parent);
+		if (parent == old) {
+			parent->rb_right = child;
+			parent = node;
+		} else
+			parent->rb_left = child;
+
+		node->rb_parent_color = old->rb_parent_color;
+		node->rb_right = old->rb_right;
+		node->rb_left = old->rb_left;
+
+		if (rb_parent(old))
+		{
+			if (rb_parent(old)->rb_left == old)
+				rb_parent(old)->rb_left = node;
+			else
+				rb_parent(old)->rb_right = node;
+		} else
+			root->rb_node = node;
+
+		rb_set_parent(old->rb_left, node);
+		if (old->rb_right)
+			rb_set_parent(old->rb_right, node);
+		goto color;
+	}
+
+	parent = rb_parent(node);
+	color = rb_color(node);
+
+	if (child)
+		rb_set_parent(child, parent);
+	if (parent)
+	{
+		if (parent->rb_left == node)
+			parent->rb_left = child;
+		else
+			parent->rb_right = child;
+	}
+	else
+		root->rb_node = child;
+
+ color:
+	if (color == RB_BLACK)
+		__rb_erase_color(child, parent, root);
+}
+
+/*
+ * This function returns the first node (in sort order) of the tree.
+ */
+struct rb_node *rb_first(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_left)
+		n = n->rb_left;
+	return n;
+}
+
+struct rb_node *rb_last(const struct rb_root *root)
+{
+	struct rb_node	*n;
+
+	n = root->rb_node;
+	if (!n)
+		return NULL;
+	while (n->rb_right)
+		n = n->rb_right;
+	return n;
+}
+
+struct rb_node *rb_next(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a right-hand child, go down and then left as far
+	   as we can. */
+	if (node->rb_right) {
+		node = node->rb_right; 
+		while (node->rb_left)
+			node=node->rb_left;
+		return (struct rb_node *)node;
+	}
+
+	/* No right-hand children.  Everything down and left is
+	   smaller than us, so any 'next' node must be in the general
+	   direction of our parent. Go up the tree; any time the
+	   ancestor is a right-hand child of its parent, keep going
+	   up. First time it's a left-hand child of its parent, said
+	   parent is our 'next' node. */
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a left-hand child, go down and then right as far
+	   as we can. */
+	if (node->rb_left) {
+		node = node->rb_left; 
+		while (node->rb_right)
+			node=node->rb_right;
+		return (struct rb_node *)node;
+	}
+
+	/* No left-hand children. Go up till we find an ancestor which
+	   is a right-hand child of its parent */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Set the surrounding nodes to point to the replacement */
+	if (parent) {
+		if (victim == parent->rb_left)
+			parent->rb_left = new;
+		else
+			parent->rb_right = new;
+	} else {
+		root->rb_node = new;
+	}
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+}
diff --git a/Documentation/perf_counter/util/rbtree.h b/Documentation/perf_counter/util/rbtree.h
new file mode 100644
index 0000000000000..6bdc488a47fba
--- /dev/null
+++ b/Documentation/perf_counter/util/rbtree.h
@@ -0,0 +1,171 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This will avoid us to use callbacks and to drop drammatically performances.
+  I know it's not the cleaner way,  but in C (not in C++) to get
+  performances and genericity...
+
+  Some example of insert and search follows here. The search is a plain
+  normal search over an ordered tree. The insert instead must be implemented
+  int two steps: as first thing the code must insert the element in
+  order as a red leaf in the tree, then the support library function
+  rb_insert_color() must be called. Such function will do the
+  not trivial work to rebalance the rbtree if necessary.
+
+-----------------------------------------------------------------------
+static inline struct page * rb_search_page_cache(struct inode * inode,
+						 unsigned long offset)
+{
+	struct rb_node * n = inode->i_rb_page_cache.rb_node;
+	struct page * page;
+
+	while (n)
+	{
+		page = rb_entry(n, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			n = n->rb_left;
+		else if (offset > page->offset)
+			n = n->rb_right;
+		else
+			return page;
+	}
+	return NULL;
+}
+
+static inline struct page * __rb_insert_page_cache(struct inode * inode,
+						   unsigned long offset,
+						   struct rb_node * node)
+{
+	struct rb_node ** p = &inode->i_rb_page_cache.rb_node;
+	struct rb_node * parent = NULL;
+	struct page * page;
+
+	while (*p)
+	{
+		parent = *p;
+		page = rb_entry(parent, struct page, rb_page_cache);
+
+		if (offset < page->offset)
+			p = &(*p)->rb_left;
+		else if (offset > page->offset)
+			p = &(*p)->rb_right;
+		else
+			return page;
+	}
+
+	rb_link_node(node, parent, p);
+
+	return NULL;
+}
+
+static inline struct page * rb_insert_page_cache(struct inode * inode,
+						 unsigned long offset,
+						 struct rb_node * node)
+{
+	struct page * ret;
+	if ((ret = __rb_insert_page_cache(inode, offset, node)))
+		goto out;
+	rb_insert_color(node, &inode->i_rb_page_cache);
+ out:
+	return ret;
+}
+-----------------------------------------------------------------------
+*/
+
+#ifndef	_LINUX_RBTREE_H
+#define	_LINUX_RBTREE_H
+
+#include <stddef.h>
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+	(type *)( (char *)__mptr - offsetof(type,member) );})
+
+struct rb_node
+{
+	unsigned long  rb_parent_color;
+#define	RB_RED		0
+#define	RB_BLACK	1
+	struct rb_node *rb_right;
+	struct rb_node *rb_left;
+} __attribute__((aligned(sizeof(long))));
+    /* The alignment might seem pointless, but allegedly CRIS needs it */
+
+struct rb_root
+{
+	struct rb_node *rb_node;
+};
+
+
+#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3))
+#define rb_color(r)   ((r)->rb_parent_color & 1)
+#define rb_is_red(r)   (!rb_color(r))
+#define rb_is_black(r) rb_color(r)
+#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0)
+#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0)
+
+static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p;
+}
+static inline void rb_set_color(struct rb_node *rb, int color)
+{
+	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color;
+}
+
+#define RB_ROOT	(struct rb_root) { NULL, }
+#define	rb_entry(ptr, type, member) container_of(ptr, type, member)
+
+#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL)
+#define RB_EMPTY_NODE(node)	(rb_parent(node) == node)
+#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node))
+
+extern void rb_insert_color(struct rb_node *, struct rb_root *);
+extern void rb_erase(struct rb_node *, struct rb_root *);
+
+/* Find logical next and previous nodes in a tree */
+extern struct rb_node *rb_next(const struct rb_node *);
+extern struct rb_node *rb_prev(const struct rb_node *);
+extern struct rb_node *rb_first(const struct rb_root *);
+extern struct rb_node *rb_last(const struct rb_root *);
+
+/* Fast replacement of a single node without remove/rebalance/add/rebalance */
+extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, 
+			    struct rb_root *root);
+
+static inline void rb_link_node(struct rb_node * node, struct rb_node * parent,
+				struct rb_node ** rb_link)
+{
+	node->rb_parent_color = (unsigned long )parent;
+	node->rb_left = node->rb_right = NULL;
+
+	*rb_link = node;
+}
+
+#endif	/* _LINUX_RBTREE_H */
-- 
GitLab


From 040e6034124c504d536736ce08e4643e640cd7c2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 18 May 2009 16:25:31 -0300
Subject: [PATCH 1318/2198] perf_counter: Add our private copy of list.h

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/list.h | 603 +++++++++++++++++++++++++
 1 file changed, 603 insertions(+)
 create mode 100644 Documentation/perf_counter/util/list.h

diff --git a/Documentation/perf_counter/util/list.h b/Documentation/perf_counter/util/list.h
new file mode 100644
index 0000000000000..e2548e8072cfb
--- /dev/null
+++ b/Documentation/perf_counter/util/list.h
@@ -0,0 +1,603 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+/*
+  Copyright (C) Cast of dozens, comes from the Linux kernel
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+*/
+
+#include <stddef.h>
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *)0x00100100)
+#define LIST_POISON2 ((void *)0x00200200)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_range - deletes range of entries from list.
+ * @beging: first element in the range to delete from the list.
+ * @beging: first element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries is in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+				  struct list_head *end)
+{
+	begin->prev->next = end->next;
+	end->next->prev = begin->prev;
+}
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitialize it.
+ * @entry: the element to delete from the list.
+ */
+static inline void list_del_init(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	INIT_LIST_HEAD(entry);
+}
+
+/**
+ * list_move - delete from one list and add as another's head
+ * @list: the entry to move
+ * @head: the head that will precede our entry
+ */
+static inline void list_move(struct list_head *list, struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add(list, head);
+}
+
+/**
+ * list_move_tail - delete from one list and add as another's tail
+ * @list: the entry to move
+ * @head: the head that will follow our entry
+ */
+static inline void list_move_tail(struct list_head *list,
+				  struct list_head *head)
+{
+        __list_del(list->prev, list->next);
+        list_add_tail(list, head);
+}
+
+/**
+ * list_is_last - tests whether @list is the last entry in list @head
+ * @list: the entry to test
+ * @head: the head of the list
+ */
+static inline int list_is_last(const struct list_head *list,
+				const struct list_head *head)
+{
+	return list->next == head;
+}
+
+/**
+ * list_empty - tests whether a list is empty
+ * @head: the list to test.
+ */
+static inline int list_empty(const struct list_head *head)
+{
+	return head->next == head;
+}
+
+/**
+ * list_empty_careful - tests whether a list is empty and not being modified
+ * @head: the list to test
+ *
+ * Description:
+ * tests whether a list is empty _and_ checks that no other CPU might be
+ * in the process of modifying either member (next or prev)
+ *
+ * NOTE: using list_empty_careful() without synchronization
+ * can only be safe if the only activity that can happen
+ * to the list entry is list_del_init(). Eg. it cannot be used
+ * if another CPU could re-list_add() it.
+ */
+static inline int list_empty_careful(const struct list_head *head)
+{
+	struct list_head *next = head->next;
+	return (next == head) && (next == head->prev);
+}
+
+static inline void __list_splice(struct list_head *list,
+				 struct list_head *head)
+{
+	struct list_head *first = list->next;
+	struct list_head *last = list->prev;
+	struct list_head *at = head->next;
+
+	first->prev = head;
+	head->next = first;
+
+	last->next = at;
+	at->prev = last;
+}
+
+/**
+ * list_splice - join two lists
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ */
+static inline void list_splice(struct list_head *list, struct list_head *head)
+{
+	if (!list_empty(list))
+		__list_splice(list, head);
+}
+
+/**
+ * list_splice_init - join two lists and reinitialise the emptied list.
+ * @list: the new list to add.
+ * @head: the place to add it in the first list.
+ *
+ * The list at @list is reinitialised
+ */
+static inline void list_splice_init(struct list_head *list,
+				    struct list_head *head)
+{
+	if (!list_empty(list)) {
+		__list_splice(list, head);
+		INIT_LIST_HEAD(list);
+	}
+}
+
+/**
+ * list_entry - get the struct for this entry
+ * @ptr:	the &struct list_head pointer.
+ * @type:	the type of the struct this is embedded in.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_entry(ptr, type, member) \
+	container_of(ptr, type, member)
+
+/**
+ * list_first_entry - get the first element from a list
+ * @ptr:       the list head to take the element from.
+ * @type:      the type of the struct this is embedded in.
+ * @member:    the name of the list_struct within the struct.
+ *
+ * Note, that list is expected to be not empty.
+ */
+#define list_first_entry(ptr, type, member) \
+	list_entry((ptr)->next, type, member)
+
+/**
+ * list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); \
+        	pos = pos->next)
+
+/**
+ * __list_for_each	-	iterate over a list
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ *
+ * This variant differs from list_for_each() in that it's the
+ * simplest possible list iteration code, no prefetching is done.
+ * Use this for code that knows the list to be very short (empty
+ * or 1 entry) most of the time.
+ */
+#define __list_for_each(pos, head) \
+	for (pos = (head)->next; pos != (head); pos = pos->next)
+
+/**
+ * list_for_each_prev	-	iterate over a list backwards
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @head:	the head for your list.
+ */
+#define list_for_each_prev(pos, head) \
+	for (pos = (head)->prev; pos != (head); \
+        	pos = pos->prev)
+
+/**
+ * list_for_each_safe - iterate over a list safe against removal of list entry
+ * @pos:	the &struct list_head to use as a loop cursor.
+ * @n:		another &struct list_head to use as temporary storage
+ * @head:	the head for your list.
+ */
+#define list_for_each_safe(pos, n, head) \
+	for (pos = (head)->next, n = pos->next; pos != (head); \
+		pos = n, n = pos->next)
+
+/**
+ * list_for_each_entry	-	iterate over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = list_entry((head)->next, typeof(*pos), member);	\
+	     &pos->member != (head); 	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_reverse - iterate backwards over list of given type.
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_reverse(pos, head, member)			\
+	for (pos = list_entry((head)->prev, typeof(*pos), member);	\
+	     &pos->member != (head); 	\
+	     pos = list_entry(pos->member.prev, typeof(*pos), member))
+
+/**
+ * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue
+ * @pos:	the type * to use as a start point
+ * @head:	the head of the list
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Prepares a pos entry for use as a start point in list_for_each_entry_continue.
+ */
+#define list_prepare_entry(pos, head, member) \
+	((pos) ? : list_entry(head, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_continue - continue iteration over list of given type
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Continue to iterate over list of given type, continuing after
+ * the current position.
+ */
+#define list_for_each_entry_continue(pos, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_from - iterate over list of given type from the current point
+ * @pos:	the type * to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing from current position.
+ */
+#define list_for_each_entry_from(pos, head, member) 			\
+	for (; &pos->member != (head);	\
+	     pos = list_entry(pos->member.next, typeof(*pos), member))
+
+/**
+ * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ */
+#define list_for_each_entry_safe(pos, n, head, member)			\
+	for (pos = list_entry((head)->next, typeof(*pos), member),	\
+		n = list_entry(pos->member.next, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_continue
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type, continuing after current point,
+ * safe against removal of list entry.
+ */
+#define list_for_each_entry_safe_continue(pos, n, head, member) 		\
+	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\
+		n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_from
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_from(pos, n, head, member) 			\
+	for (n = list_entry(pos->member.next, typeof(*pos), member);		\
+	     &pos->member != (head);						\
+	     pos = n, n = list_entry(n->member.next, typeof(*n), member))
+
+/**
+ * list_for_each_entry_safe_reverse
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_struct within the struct.
+ *
+ * Iterate backwards over list of given type, safe against removal
+ * of list entry.
+ */
+#define list_for_each_entry_safe_reverse(pos, n, head, member)		\
+	for (pos = list_entry((head)->prev, typeof(*pos), member),	\
+		n = list_entry(pos->member.prev, typeof(*pos), member);	\
+	     &pos->member != (head); 					\
+	     pos = n, n = list_entry(n->member.prev, typeof(*n), member))
+
+/*
+ * Double linked lists with a single pointer list head.
+ * Mostly useful for hash tables where the two pointer list head is
+ * too wasteful.
+ * You lose the ability to access the tail in O(1).
+ */
+
+struct hlist_head {
+	struct hlist_node *first;
+};
+
+struct hlist_node {
+	struct hlist_node *next, **pprev;
+};
+
+#define HLIST_HEAD_INIT { .first = NULL }
+#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL }
+#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL)
+static inline void INIT_HLIST_NODE(struct hlist_node *h)
+{
+	h->next = NULL;
+	h->pprev = NULL;
+}
+
+static inline int hlist_unhashed(const struct hlist_node *h)
+{
+	return !h->pprev;
+}
+
+static inline int hlist_empty(const struct hlist_head *h)
+{
+	return !h->first;
+}
+
+static inline void __hlist_del(struct hlist_node *n)
+{
+	struct hlist_node *next = n->next;
+	struct hlist_node **pprev = n->pprev;
+	*pprev = next;
+	if (next)
+		next->pprev = pprev;
+}
+
+static inline void hlist_del(struct hlist_node *n)
+{
+	__hlist_del(n);
+	n->next = LIST_POISON1;
+	n->pprev = LIST_POISON2;
+}
+
+static inline void hlist_del_init(struct hlist_node *n)
+{
+	if (!hlist_unhashed(n)) {
+		__hlist_del(n);
+		INIT_HLIST_NODE(n);
+	}
+}
+
+static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
+{
+	struct hlist_node *first = h->first;
+	n->next = first;
+	if (first)
+		first->pprev = &n->next;
+	h->first = n;
+	n->pprev = &h->first;
+}
+
+/* next must be != NULL */
+static inline void hlist_add_before(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	n->pprev = next->pprev;
+	n->next = next;
+	next->pprev = &n->next;
+	*(n->pprev) = n;
+}
+
+static inline void hlist_add_after(struct hlist_node *n,
+					struct hlist_node *next)
+{
+	next->next = n->next;
+	n->next = next;
+	next->pprev = &n->next;
+
+	if(next->next)
+		next->next->pprev  = &next->next;
+}
+
+#define hlist_entry(ptr, type, member) container_of(ptr,type,member)
+
+#define hlist_for_each(pos, head) \
+	for (pos = (head)->first; pos; \
+	     pos = pos->next)
+
+#define hlist_for_each_safe(pos, n, head) \
+	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \
+	     pos = n)
+
+/**
+ * hlist_for_each_entry	- iterate over list of given type
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry(tpos, pos, head, member)			 \
+	for (pos = (head)->first;					 \
+	     pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_continue - iterate over a hlist continuing after current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_continue(tpos, pos, member)		 \
+	for (pos = (pos)->next;						 \
+	     pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_from - iterate over a hlist continuing from current point
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_from(tpos, pos, member)			 \
+	for (; pos && 			 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = pos->next)
+
+/**
+ * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry
+ * @tpos:	the type * to use as a loop cursor.
+ * @pos:	the &struct hlist_node to use as a loop cursor.
+ * @n:		another &struct hlist_node to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the hlist_node within the struct.
+ */
+#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \
+	for (pos = (head)->first;					 \
+	     pos && ({ n = pos->next; 1; }) && 				 \
+		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
+	     pos = n)
+
+#endif
-- 
GitLab


From ce7e43653b08db094326f378958bc293a68e8e5b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 19 May 2009 09:30:23 -0300
Subject: [PATCH 1319/2198] perf_counter: Use rb_tree for symhists and threads
 in report

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 178 +++++++++-----------
 1 file changed, 75 insertions(+), 103 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index f63057fe2cd27..e857201e1e0f4 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -479,23 +479,25 @@ static size_t map__fprintf(struct map *self, FILE *fp)
 }
 
 struct symhist {
-	struct list_head node;
+	struct rb_node	 rb_node;
 	struct dso	 *dso;
 	struct symbol	 *sym;
+	uint64_t	 ip;
 	uint32_t	 count;
 	char		 level;
 };
 
-static struct symhist *symhist__new(struct symbol *sym, struct dso *dso,
-				    char level)
+static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
+				    struct dso *dso, char level)
 {
 	struct symhist *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
 		self->sym   = sym;
+		self->ip    = ip;
 		self->dso   = dso;
 		self->level = level;
-		self->count = 0;
+		self->count = 1;
 	}
 
 	return self;
@@ -506,12 +508,6 @@ static void symhist__delete(struct symhist *self)
 	free(self);
 }
 
-static bool symhist__equal(struct symhist *self, struct symbol *sym,
-			   struct dso *dso, char level)
-{
-	return self->level == level && self->sym == sym && self->dso == dso;
-}
-
 static void symhist__inc(struct symhist *self)
 {
 	++self->count;
@@ -519,7 +515,7 @@ static void symhist__inc(struct symhist *self)
 
 static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 {
-	size_t ret = fprintf(fp, "[%c] ", self->level);
+	size_t ret = fprintf(fp, "%#llx [%c] ", (unsigned long long)self->ip, self->level);
 
 	if (self->level != '.')
 		ret += fprintf(fp, "%s", self->sym->name);
@@ -531,9 +527,9 @@ static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 }
 
 struct thread {
-	struct list_head node;
+	struct rb_node	 rb_node;
 	struct list_head maps;
-	struct list_head symhists;
+	struct rb_root	 symhists;
 	pid_t		 pid;
 	char		 *comm;
 };
@@ -546,47 +542,43 @@ static struct thread *thread__new(pid_t pid)
 		self->pid = pid;
 		self->comm = NULL;
 		INIT_LIST_HEAD(&self->maps);
-		INIT_LIST_HEAD(&self->symhists);
+		self->symhists = RB_ROOT;
 	}
 
 	return self;
 }
 
-static void thread__insert_symhist(struct thread *self,
-				   struct symhist *symhist)
-{
-	list_add_tail(&symhist->node, &self->symhists);
-}
-
-static struct symhist *thread__symhists_find(struct thread *self,
-					     struct symbol *sym,
-					     struct dso *dso, char level)
+static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
+				 uint64_t ip, struct dso *dso, char level)
 {
-	struct symhist *pos;
+	struct rb_node **p = &self->symhists.rb_node;
+	struct rb_node *parent = NULL;
+	struct symhist *sh;
 
-	list_for_each_entry(pos, &self->symhists, node)
-		if (symhist__equal(pos, sym, dso, level))
-			return pos;
+	while (*p != NULL) {
+		parent = *p;
+		sh = rb_entry(parent, struct symhist, rb_node);
 
-	return NULL;
-}
+		if (sh->sym == sym || ip == sh->ip) {
+			symhist__inc(sh);
+			return 0;
+		}
 
-static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
-				 struct dso *dso, char level)
-{
-	struct symhist *symhist = thread__symhists_find(self, sym, dso, level);
+		/* Handle unresolved symbols too */
+		const uint64_t start = !sh->sym ? sh->ip : sh->sym->start;
 
-	if (symhist == NULL) {
-		symhist = symhist__new(sym, dso, level);
-		if (symhist == NULL)
-			goto out_error;
-		thread__insert_symhist(self, symhist);
+		if (ip < start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
 	}
 
-	symhist__inc(symhist);
+	sh = symhist__new(sym, ip, dso, level);
+	if (sh == NULL)
+		return -ENOMEM;
+	rb_link_node(&sh->rb_node, parent, p);
+	rb_insert_color(&sh->rb_node, &self->symhists);
 	return 0;
-out_error:
-	return -ENOMEM;
 }
 
 static int thread__set_comm(struct thread *self, const char *comm)
@@ -608,43 +600,44 @@ static size_t thread__maps_fprintf(struct thread *self, FILE *fp)
 
 static size_t thread__fprintf(struct thread *self, FILE *fp)
 {
-	struct symhist *pos;
 	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
+	struct rb_node *nd;
 
-	list_for_each_entry(pos, &self->symhists, node)
+	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
+		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
 		ret += symhist__fprintf(pos, fp);
+	}
 
 	return ret;
 }
 
-static LIST_HEAD(threads);
+static struct rb_root threads = RB_ROOT;
 
-static void threads__add(struct thread *thread)
-{
-	list_add_tail(&thread->node, &threads);
-}
-
-static struct thread *threads__find(pid_t pid)
+static struct thread *threads__findnew(pid_t pid)
 {
-	struct thread *pos;
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
 
-	list_for_each_entry(pos, &threads, node)
-		if (pos->pid == pid)
-			return pos;
-	return NULL;
-}
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
 
-static struct thread *threads__findnew(pid_t pid)
-{
-	struct thread *thread = threads__find(pid);
+		if (th->pid == pid)
+			return th;
 
-	if (thread == NULL) {
-		thread = thread__new(pid);
-		if (thread != NULL)
-			threads__add(thread);
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
 	}
 
-	return thread;
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+	}
+	return th;
 }
 
 static void thread__insert_map(struct thread *self, struct map *map)
@@ -668,44 +661,13 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 
 static void threads__fprintf(FILE *fp)
 {
-	struct thread *pos;
-
-	list_for_each_entry(pos, &threads, node)
+	struct rb_node *nd;
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
 		thread__fprintf(pos, fp);
+	}
 }
 
-#if 0
-static std::string resolve_user_symbol(int pid, uint64_t ip)
-{
-	std::string sym = "<unknown>";
-
-	maps_t &m = maps[pid];
-	maps_t::const_iterator mi = m.upper_bound(map(ip));
-	if (mi == m.end())
-		return sym;
-
-	ip -= mi->start + mi->pgoff;
-
-	symbols_t &s = dsos[mi->dso].syms;
-	symbols_t::const_iterator si = s.upper_bound(symbol(ip));
-
-	sym = mi->dso + ": <unknown>";
-
-	if (si == s.begin())
-		return sym;
-	si--;
-
-	if (si->start <= ip && ip < si->end)
-		sym = mi->dso + ": " + si->name;
-#if 0
-	else if (si->start <= ip)
-		sym = mi->dso + ": ?" + si->name;
-#endif
-
-	return sym;
-}
-#endif
-
 static void display_help(void)
 {
 	printf(
@@ -824,8 +786,11 @@ int cmd_report(int argc, char **argv)
 		struct dso *dso = NULL;
 		struct thread *thread = threads__findnew(event->ip.pid);
 
-		if (thread == NULL)
+		if (thread == NULL) {
+			fprintf(stderr, "problem processing %d event, bailing out\n",
+				event->header.type);
 			goto done;
+		}
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
 			show = SHOW_KERNEL;
@@ -845,8 +810,11 @@ int cmd_report(int argc, char **argv)
 		if (show & show_mask) {
 			struct symbol *sym = dso__find_symbol(dso, event->ip.ip);
 
-			if (thread__symbol_incnew(thread, sym, dso, level))
+			if (thread__symbol_incnew(thread, sym, event->ip.ip,
+						  dso, level)) {
+				fprintf(stderr, "problem incrementing symbol count, bailing out\n");
 				goto done;
+			}
 		}
 		total++;
 	} else switch (event->header.type) {
@@ -854,8 +822,10 @@ int cmd_report(int argc, char **argv)
 		struct thread *thread = threads__findnew(event->mmap.pid);
 		struct map *map = map__new(&event->mmap);
 
-		if (thread == NULL || map == NULL )
+		if (thread == NULL || map == NULL) {
+			fprintf(stderr, "problem processing PERF_EVENT_MMAP, bailing out\n");
 			goto done;
+		}
 		thread__insert_map(thread, map);
 		break;
 	}
@@ -863,8 +833,10 @@ int cmd_report(int argc, char **argv)
 		struct thread *thread = threads__findnew(event->comm.pid);
 
 		if (thread == NULL ||
-		    thread__set_comm(thread, event->comm.comm))
+		    thread__set_comm(thread, event->comm.comm)) {
+			fprintf(stderr, "problem processing PERF_EVENT_COMM, bailing out\n");
 			goto done;
+		}
 		break;
 	}
 	}
-- 
GitLab


From f3e08c5341c528284460530b546608f27232f737 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 22 May 2009 15:34:54 +0200
Subject: [PATCH 1320/2198] perf report: Fix segfault on unknown symbols

Ingo reported:

> Program received signal SIGSEGV, Segmentation fault.
> 0x0000003e25080f80 in strlen () from /lib64/libc.so.6
> Missing separate debuginfos, use: debuginfo-install elfutils.x86_64
> glibc.x86_64 zlib.x86_64
> (gdb) bt
> #0  0x0000003e25080f80 in strlen () from /lib64/libc.so.6
> #1  0x0000003e2506954e in fputs () from /lib64/libc.so.6
> #2  0x00000000004059e8 in cmd_report (argc=<value optimized out>,
>     argv=<value optimized out>) at builtin-report.c:521
> #3  0x0000000000402dad in handle_internal_command (argc=1, argv=0x7fffe1218e30)
>     at perf.c:226
> #4  0x0000000000402f6d in main (argc=1, argv=0x7fffe1218e30) at perf.c:324
> (gdb)

Signed-off-by Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e857201e1e0f4..21386a8c6f629 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -518,7 +518,7 @@ static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 	size_t ret = fprintf(fp, "%#llx [%c] ", (unsigned long long)self->ip, self->level);
 
 	if (self->level != '.')
-		ret += fprintf(fp, "%s", self->sym->name);
+		ret += fprintf(fp, "%s", self->sym ? self->sym->name: "<unknown>");
 	else
 		ret += fprintf(fp, "%s: %s",
 			       self->dso ? self->dso->name : "<unknown",
-- 
GitLab


From 53cb8bc2a3d976efd1a800c3de4640a7220afbb3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: [PATCH 1321/2198] perf record: Convert to Git option parsing

Remove getopt usage and use Git's much more advanced and more compact
command option library.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 126 ++++++--------------
 1 file changed, 38 insertions(+), 88 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 21386a8c6f629..9e59d6071ef3a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,52 +1,29 @@
-#define _GNU_SOURCE
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <unistd.h>
-#include <stdint.h>
-#include <stdbool.h>
-#include <stdlib.h>
-#include <string.h>
-#include <limits.h>
+#include "util/util.h"
+
+#include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
-#include <libelf.h>
-#include <fcntl.h>
-#include <stdio.h>
-#include <errno.h>
-#include <ctype.h>
-#include <time.h>
-#include <getopt.h>
-#include <assert.h>
-#include <search.h>
-
-#include <sys/ioctl.h>
-#include <sys/poll.h>
-#include <sys/prctl.h>
-#include <sys/wait.h>
-#include <sys/mman.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-
-#include <linux/unistd.h>
-#include <linux/types.h>
-
-#include "../../include/linux/perf_counter.h"
+
 #include "util/list.h"
 #include "util/rbtree.h"
 
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
 #define SHOW_KERNEL	1
 #define SHOW_USER	2
 #define SHOW_HV		4
 
-static char 		const *input_name = "output.perf";
+static char		const *input_name = "output.perf";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
-static const char *perf_event_names[] = {
+const char *perf_event_names[] = {
 	[PERF_EVENT_MMAP]   = " PERF_EVENT_MMAP",
 	[PERF_EVENT_MUNMAP] = " PERF_EVENT_MUNMAP",
 	[PERF_EVENT_COMM]   = " PERF_EVENT_COMM",
@@ -86,7 +63,7 @@ struct section {
 	char		 name[0];
 };
 
-static struct section *section__new(uint64_t start, uint64_t size,
+struct section *section__new(uint64_t start, uint64_t size,
 				    uint64_t offset, char *name)
 {
 	struct section *self = malloc(sizeof(*self) + strlen(name) + 1);
@@ -241,7 +218,7 @@ static inline uint8_t elf_sym__type(const GElf_Sym *sym)
 	return GELF_ST_TYPE(sym->st_info);
 }
 
-static inline bool elf_sym__is_function(const GElf_Sym *sym)
+static inline int elf_sym__is_function(const GElf_Sym *sym)
 {
 	return elf_sym__type(sym) == STT_FUNC &&
 	       sym->st_name != 0 &&
@@ -393,7 +370,7 @@ static struct dso *dsos__findnew(const char *name)
 	return NULL;
 }
 
-static void dsos__fprintf(FILE *fp)
+void dsos__fprintf(FILE *fp)
 {
 	struct dso *pos;
 
@@ -503,7 +480,7 @@ static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
 	return self;
 }
 
-static void symhist__delete(struct symhist *self)
+void symhist__delete(struct symhist *self)
 {
 	free(self);
 }
@@ -587,7 +564,7 @@ static int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
-static size_t thread__maps_fprintf(struct thread *self, FILE *fp)
+size_t thread__maps_fprintf(struct thread *self, FILE *fp)
 {
 	struct map *pos;
 	size_t ret = 0;
@@ -668,49 +645,7 @@ static void threads__fprintf(FILE *fp)
 	}
 }
 
-static void display_help(void)
-{
-	printf(
-	"Usage: perf-report [<options>]\n"
-	" -i file   --input=<file>      # input file\n"
-	);
-
-	exit(0);
-}
-
-static void process_options(int argc, char *argv[])
-{
-	int error = 0;
-
-	for (;;) {
-		int option_index = 0;
-		/** Options for getopt */
-		static struct option long_options[] = {
-			{"input",	required_argument,	NULL, 'i'},
-			{"no-user",	no_argument,		NULL, 'u'},
-			{"no-kernel",	no_argument,		NULL, 'k'},
-			{"no-hv",	no_argument,		NULL, 'h'},
-			{NULL,		0,			NULL,  0 }
-		};
-		int c = getopt_long(argc, argv, "+:i:kuh",
-				    long_options, &option_index);
-		if (c == -1)
-			break;
-
-		switch (c) {
-		case 'i': input_name			= strdup(optarg); break;
-		case 'k': show_mask &= ~SHOW_KERNEL; break;
-		case 'u': show_mask &= ~SHOW_USER; break;
-		case 'h': show_mask &= ~SHOW_HV; break;
-		default: error = 1; break;
-		}
-	}
-
-	if (error)
-		display_help();
-}
-
-int cmd_report(int argc, char **argv)
+static int __cmd_report(void)
 {
 	unsigned long offset = 0;
 	unsigned long head = 0;
@@ -720,12 +655,6 @@ int cmd_report(int argc, char **argv)
 	int ret, rc = EXIT_FAILURE;
 	unsigned long total = 0;
 
-	elf_version(EV_CURRENT);
-
-	page_size = getpagesize();
-
-	process_options(argc, argv);
-
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
 		perror("failed to open file");
@@ -867,3 +796,24 @@ int cmd_report(int argc, char **argv)
 	return rc;
 }
 
+static const char * const report_usage[] = {
+	"perf report [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_END()
+};
+
+int cmd_report(int argc, const char **argv, const char *prefix)
+{
+	elf_version(EV_CURRENT);
+
+	page_size = getpagesize();
+
+	parse_options(argc, argv, options, report_usage, 0);
+
+	return __cmd_report();
+}
-- 
GitLab


From 0bec253c813fbb067db4dfd9f5b6cec1bd2ef026 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 09:17:18 +0200
Subject: [PATCH 1322/2198] perf report: Add help/manpage

Add a (minimal) manpage for perf report.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../Documentation/perf-report.txt             | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 Documentation/perf_counter/Documentation/perf-report.txt

diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
new file mode 100644
index 0000000000000..64696a218105d
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-report.txt
@@ -0,0 +1,32 @@
+perf-report(1)
+==========
+
+NAME
+----
+perf-report - Read output.perf (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf report.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: output.perf)
+
+Configuration
+-------------
+
+EXAMPLES
+--------
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
-- 
GitLab


From f91183fe3780d44849110a1653dfe8af7bc67aa4 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 26 May 2009 15:25:34 +0200
Subject: [PATCH 1323/2198] perf top: Remove leftover NMI/IRQ bits

79202b removed IRQ/NMI mode selection, so remove it from
perf top as well.

[ Impact: cleanup ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 87b925c8f8e8d..cacaa3c2518d0 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -8,7 +8,7 @@
    Sample output:
 
 ------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [NMI, cache-misses/cache-refs],  (all, cpu: 2)
+ KernelTop:    2669 irqs/sec  [cache-misses/cache-refs],  (all, cpu: 2)
 ------------------------------------------------------------------------------
 
              weight         RIP          kernel function
@@ -92,7 +92,6 @@ static __u64			count_filter		       = 100;
 static int			target_pid				= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
-static int			nmi				=  1;
 static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
@@ -198,10 +197,9 @@ static void print_sym_table(void)
 
 	printf(
 "------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [%s, ",
+	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [",
 		events_per_sec,
-		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)),
-		nmi ? "NMI" : "IRQ");
+		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
 
 	if (nr_counters == 1)
 		printf("%d ", event_count[0]);
@@ -637,7 +635,7 @@ static int __cmd_top(void)
 			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= nmi;
+			hw_event.nmi		= 1;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
 			hw_event.freq		= freq;
-- 
GitLab


From db20c0031288ff524d82b1f240f35f85d4a052eb Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 26 May 2009 15:25:34 +0200
Subject: [PATCH 1324/2198] perf top: fix typo in -d option

Clean up copy/paste options parsing conversion error.

[ Impact: reactivate -d option ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index cacaa3c2518d0..6b1c66f99e4de 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -727,7 +727,7 @@ static const struct option options[] = {
 		    "number of mmap data pages"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
-	OPT_INTEGER('d', "delay", &realtime_prio,
+	OPT_INTEGER('d', "delay", &delay_secs,
 		    "number of seconds to delay between refreshes"),
 	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
 			    "dump the symbol table used for profiling"),
-- 
GitLab


From d076d2bd0d9379314df5f2ab8b9c83f617c70923 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 23:10:15 +0900
Subject: [PATCH 1325/2198] sh: pci-sh7751: Initialize io_map_base in
 controller definition.

As there is only a single controller and remapping has no impact for the
address range in question, just initialize it directly in the controller
definition. This fixes up boot time warnings about not having the field
initialized.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7751.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c
index c4fa0bb139761..70c1999a0ec4c 100644
--- a/arch/sh/drivers/pci/pci-sh7751.c
+++ b/arch/sh/drivers/pci/pci-sh7751.c
@@ -52,7 +52,7 @@ static struct resource sh7751_io_resource = {
 };
 
 static struct resource sh7751_mem_resource = {
-	.name	= "SH7785_mem",
+	.name	= "SH7751_mem",
 	.start	= SH7751_PCI_MEMORY_BASE,
 	.end	= SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1,
 	.flags	= IORESOURCE_MEM
@@ -64,6 +64,7 @@ static struct pci_channel sh7751_pci_controller = {
 	.mem_offset	= 0x00000000,
 	.io_resource	= &sh7751_io_resource,
 	.io_offset	= 0x00000000,
+	.io_map_base	= SH7751_PCI_IO_BASE,
 };
 
 static struct sh4_pci_address_map sh7751_pci_map = {
@@ -179,8 +180,6 @@ static int __init sh7751_pci_init(void)
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	__set_io_port_base(SH7751_PCI_IO_BASE);
-
 	register_pci_controller(chan);
 
 	return 0;
-- 
GitLab


From b7e2ac619465f1774b827d9997109ceef4a61851 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 26 May 2009 23:13:13 +0900
Subject: [PATCH 1326/2198] sh: pci: Disable MWI and make
 pci_dma_burst_advice() a bit more accurate.

None of the SH PCI controllers support MWI, it is always treated as a
direct memory write, so simply disable it outright. In the case of the
PCI cache line size, consult that for the pci_dma_burst_advice()
strategy, and switch over to PCI_DMA_BURST_MULTIPLE, as PPC64.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/pci.h | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h
index 5b2e0fcdfc22b..ae0da6f48b6dc 100644
--- a/arch/sh/include/asm/pci.h
+++ b/arch/sh/include/asm/pci.h
@@ -89,12 +89,28 @@ static inline void pcibios_penalize_isa_irq(int irq, int active)
 #endif
 
 #ifdef CONFIG_PCI
+/*
+ * None of the SH PCI controllers support MWI, it is always treated as a
+ * direct memory write.
+ */
+#define PCI_DISABLE_MWI
+
 static inline void pci_dma_burst_advice(struct pci_dev *pdev,
 					enum pci_dma_burst_strategy *strat,
 					unsigned long *strategy_parameter)
 {
-	*strat = PCI_DMA_BURST_INFINITY;
-	*strategy_parameter = ~0UL;
+	unsigned long cacheline_size;
+	u8 byte;
+
+	pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &byte);
+
+	if (byte == 0)
+		cacheline_size = L1_CACHE_BYTES;
+	else
+		cacheline_size = byte << 2;
+
+	*strat = PCI_DMA_BURST_MULTIPLE;
+	*strategy_parameter = cacheline_size;
 }
 #endif
 
-- 
GitLab


From f17e04afaff84b5cfd317da29ac4d764908ff833 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 26 May 2009 15:30:22 +0200
Subject: [PATCH 1327/2198] perf report: Fix ELF symbol parsing

[ Impact: fix DSO symbol output in perf report ]

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |  2 +-
 Documentation/perf_counter/builtin-report.c | 72 ++++++---------------
 2 files changed, 22 insertions(+), 52 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 412dea15d0b7e..10c13a6f2bc93 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -g -O2 -Wall
+CFLAGS = -ggdb3 -Wall
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9e59d6071ef3a..697f960495fc8 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -55,34 +55,6 @@ typedef union event_union {
 	struct comm_event comm;
 } event_t;
 
-struct section {
-	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 offset;
-	char		 name[0];
-};
-
-struct section *section__new(uint64_t start, uint64_t size,
-				    uint64_t offset, char *name)
-{
-	struct section *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		self->start  = start;
-		self->end    = start + size;
-		self->offset = offset;
-		strcpy(self->name, name);
-	}
-
-	return self;
-}
-
-static void section__delete(struct section *self)
-{
-	free(self);
-}
-
 struct symbol {
 	struct rb_node rb_node;
 	uint64_t       start;
@@ -116,7 +88,6 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 
 struct dso {
 	struct list_head node;
-	struct list_head sections;
 	struct rb_root	 syms;
 	char		 name[0];
 };
@@ -127,21 +98,12 @@ static struct dso *dso__new(const char *name)
 
 	if (self != NULL) {
 		strcpy(self->name, name);
-		INIT_LIST_HEAD(&self->sections);
 		self->syms = RB_ROOT;
 	}
 
 	return self;
 }
 
-static void dso__delete_sections(struct dso *self)
-{
-	struct section *pos, *n;
-
-	list_for_each_entry_safe(pos, n, &self->sections, node)
-		section__delete(pos);
-}
-
 static void dso__delete_symbols(struct dso *self)
 {
 	struct symbol *pos;
@@ -156,7 +118,6 @@ static void dso__delete_symbols(struct dso *self)
 
 static void dso__delete(struct dso *self)
 {
-	dso__delete_sections(self);
 	dso__delete_symbols(self);
 	free(self);
 }
@@ -282,9 +243,6 @@ static int dso__load(struct dso *self)
 	if (sec == NULL)
 		goto out_elf_end;
 
-	if (gelf_getshdr(sec, &shdr) == NULL)
-		goto out_elf_end;
-
 	Elf_Data *syms = elf_getdata(sec, NULL);
 	if (syms == NULL)
 		goto out_elf_end;
@@ -302,11 +260,21 @@ static int dso__load(struct dso *self)
 	GElf_Sym sym;
 	uint32_t index;
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		struct symbol *f;
+
 		if (!elf_sym__is_function(&sym))
 			continue;
-		struct symbol *f = symbol__new(sym.st_value, sym.st_size,
-					       elf_sym__name(&sym, symstrs));
-		if (f == NULL)
+
+		sec = elf_getscn(elf, sym.st_shndx);
+		if (!sec)
+			goto out_elf_end;
+
+		gelf_getshdr(sec, &shdr);
+		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+
+		f = symbol__new(sym.st_value, sym.st_size,
+				elf_sym__name(&sym, symstrs));
+		if (!f)
 			goto out_elf_end;
 
 		dso__insert_symbol(self, f);
@@ -498,7 +466,7 @@ static size_t symhist__fprintf(struct symhist *self, FILE *fp)
 		ret += fprintf(fp, "%s", self->sym ? self->sym->name: "<unknown>");
 	else
 		ret += fprintf(fp, "%s: %s",
-			       self->dso ? self->dso->name : "<unknown",
+			       self->dso ? self->dso->name : "<unknown>",
 			       self->sym ? self->sym->name : "<unknown>");
 	return ret + fprintf(fp, ": %u\n", self->count);
 }
@@ -714,6 +682,7 @@ static int __cmd_report(void)
 		int show = 0;
 		struct dso *dso = NULL;
 		struct thread *thread = threads__findnew(event->ip.pid);
+		uint64_t ip = event->ip.ip;
 
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, bailing out\n",
@@ -728,19 +697,20 @@ static int __cmd_report(void)
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
 			show = SHOW_USER;
 			level = '.';
-			struct map *map = thread__find_map(thread, event->ip.ip);
-			if (map != NULL)
+			struct map *map = thread__find_map(thread, ip);
+			if (map != NULL) {
 				dso = map->dso;
+				ip -= map->start + map->pgoff;
+			}
 		} else {
 			show = SHOW_HV;
 			level = 'H';
 		}
 
 		if (show & show_mask) {
-			struct symbol *sym = dso__find_symbol(dso, event->ip.ip);
+			struct symbol *sym = dso__find_symbol(dso, ip);
 
-			if (thread__symbol_incnew(thread, sym, event->ip.ip,
-						  dso, level)) {
+			if (thread__symbol_incnew(thread, sym, ip, dso, level)) {
 				fprintf(stderr, "problem incrementing symbol count, bailing out\n");
 				goto done;
 			}
-- 
GitLab


From 59d81029b6804c3d5895d07cad77d7dfddc6b5b2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 11:14:27 -0300
Subject: [PATCH 1328/2198] perf report: Fix kernel symbol resolution

kallsyms have just the symbol start, so we need to read two lines
to get the len.

[ Impact: fix incorrect kernel symbol display in perf report ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 26 +++++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 697f960495fc8..b19b893d4ff5d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -360,9 +360,17 @@ static int load_kallsyms(void)
 	char *line = NULL;
 	size_t n;
 
+	if (getline(&line, &n, file) < 0 || !line)
+		goto out_delete_dso;
+
+	unsigned long long previous_start;
+	char c, previous_symbf[4096];
+	if (sscanf(line, "%llx %c %s", &previous_start, &c, previous_symbf) != 3)
+		goto out_delete_line;
+
 	while (!feof(file)) {
 		unsigned long long start;
-		char c, symbf[4096];
+		char symbf[4096];
 
 		if (getline(&line, &n, file) < 0)
 			break;
@@ -371,12 +379,18 @@ static int load_kallsyms(void)
 			goto out_delete_dso;
 
 		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
-			struct symbol *sym = symbol__new(start, 0x1000000, symbf);
+			if (start > previous_start) {
+				struct symbol *sym = symbol__new(previous_start,
+								 start - previous_start,
+								 previous_symbf);
 
-			if (sym == NULL)
-				goto out_delete_dso;
+				if (sym == NULL)
+					goto out_delete_dso;
 
-			dso__insert_symbol(kernel_dso, sym);
+				dso__insert_symbol(kernel_dso, sym);
+				previous_start = start;
+				strcpy(previous_symbf, symbf);
+			}
 		}
 	}
 
@@ -385,6 +399,8 @@ static int load_kallsyms(void)
 	fclose(file);
 	return 0;
 
+out_delete_line:
+	free(line);
 out_delete_dso:
 	dso__delete(kernel_dso);
 	return -1;
-- 
GitLab


From e1b28aab5804aa477c33d19855d6747607a885fd Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 26 May 2009 15:41:27 +0100
Subject: [PATCH 1329/2198] GFS2: Remove lockstruct subdir from gfs2 sysfs
 files

The lockstruct sub directory contained two entries, both of
which are duplicated elsewhere in the gfs2 sysfs files as
well as being available via /proc/mounts. There is no userland program
using either of them, so this patch removes them.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 41 ++++++++---------------------------------
 1 file changed, 8 insertions(+), 33 deletions(-)

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 9f6d48b75fd2d..94bd59ec54e2c 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -276,25 +276,6 @@ static struct kobj_type gfs2_ktype = {
 	.sysfs_ops     = &gfs2_attr_ops,
 };
 
-/*
- * display struct lm_lockstruct fields
- */
-
-#define LOCKSTRUCT_ATTR(name, fmt)                                          \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \
-}                                                                           \
-static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name)
-
-LOCKSTRUCT_ATTR(jid,      "%u\n");
-LOCKSTRUCT_ATTR(first,    "%u\n");
-
-static struct attribute *lockstruct_attrs[] = {
-	&lockstruct_attr_jid.attr,
-	&lockstruct_attr_first.attr,
-	NULL,
-};
 
 /*
  * lock_module. Originally from lock_dlm
@@ -397,6 +378,11 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf)
 	return sprintf(buf, "%d\n", ls->ls_recover_jid_status);
 }
 
+static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf)
+{
+	return sprintf(buf, "%u\n", sdp->sd_lockstruct.ls_jid);
+}
+
 #define GDLM_ATTR(_name,_mode,_show,_store) \
 static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store)
 
@@ -404,6 +390,7 @@ GDLM_ATTR(proto_name,     0444, proto_name_show,	NULL);
 GDLM_ATTR(block,          0644, block_show,		block_store);
 GDLM_ATTR(withdraw,       0644, withdraw_show,		withdraw_store);
 GDLM_ATTR(id,             0444, lkid_show,		NULL);
+GDLM_ATTR(jid,		  0444, jid_show,		NULL);
 GDLM_ATTR(first,          0444, lkfirst_show,		NULL);
 GDLM_ATTR(first_done,     0444, first_done_show,	NULL);
 GDLM_ATTR(recover,        0200, NULL,			recover_store);
@@ -415,7 +402,7 @@ static struct attribute *lock_module_attrs[] = {
 	&gdlm_attr_block.attr,
 	&gdlm_attr_withdraw.attr,
 	&gdlm_attr_id.attr,
-	&lockstruct_attr_jid.attr,
+	&gdlm_attr_jid.attr,
 	&gdlm_attr_first.attr,
 	&gdlm_attr_first_done.attr,
 	&gdlm_attr_recover.attr,
@@ -558,11 +545,6 @@ static struct attribute *tune_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group lockstruct_group = {
-	.name = "lockstruct",
-	.attrs = lockstruct_attrs,
-};
-
 static struct attribute_group args_group = {
 	.name = "args",
 	.attrs = args_attrs,
@@ -588,13 +570,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &lockstruct_group);
-	if (error)
-		goto fail_reg;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
 	if (error)
-		goto fail_lockstruct;
+		goto fail_reg;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
@@ -611,8 +589,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 fail_args:
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-fail_lockstruct:
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 fail_reg:
 	kobject_put(&sdp->sd_kobj);
 fail:
@@ -624,7 +600,6 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
 	sysfs_remove_group(&sdp->sd_kobj, &args_group);
-	sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
-- 
GitLab


From f6eb53498ee8f725832f3a0fffca90566bb118a6 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Tue, 26 May 2009 15:50:25 +0100
Subject: [PATCH 1330/2198] GFS2: Remove args subdir from gfs2 sysfs files

Since we can cat /proc/mounts there is no need to have this
subdirectory in the gfs2 sysfs files. In fact this does not
reflect the full range of possible mount argumenmts, where
as /proc/mounts does.

There was only one userland user of this set of sysfs files
and it will function perfectly well without these files
being present (in fact that subcommand of gfs2_tool is
obsolete anyway).

The tune/* subdirectory is also considered mostly obsolete,
but there are a few uses of this until mount arguments can
be added for the last few functions for which there are no
equivalents currently. However the tune/* directory is still
in my sights and new code should avoid using it. Only the gfs2_quota
and gfs2_tool programs are know to use tune/* at the moment.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/sys.c | 52 +--------------------------------------------------
 1 file changed, 1 insertion(+), 51 deletions(-)

diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 94bd59ec54e2c..23419dc3027b6 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -411,44 +411,6 @@ static struct attribute *lock_module_attrs[] = {
 	NULL,
 };
 
-#define ARGS_ATTR(name, fmt)                                                \
-static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf)                 \
-{                                                                           \
-	return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name);       \
-}                                                                           \
-static struct gfs2_attr args_attr_##name = __ATTR_RO(name)
-
-ARGS_ATTR(lockproto,       "%s\n");
-ARGS_ATTR(locktable,       "%s\n");
-ARGS_ATTR(hostdata,        "%s\n");
-ARGS_ATTR(spectator,       "%d\n");
-ARGS_ATTR(ignore_local_fs, "%d\n");
-ARGS_ATTR(localcaching,    "%d\n");
-ARGS_ATTR(localflocks,     "%d\n");
-ARGS_ATTR(debug,           "%d\n");
-ARGS_ATTR(upgrade,         "%d\n");
-ARGS_ATTR(posix_acl,       "%d\n");
-ARGS_ATTR(quota,           "%u\n");
-ARGS_ATTR(suiddir,         "%d\n");
-ARGS_ATTR(data,            "%d\n");
-
-static struct attribute *args_attrs[] = {
-	&args_attr_lockproto.attr,
-	&args_attr_locktable.attr,
-	&args_attr_hostdata.attr,
-	&args_attr_spectator.attr,
-	&args_attr_ignore_local_fs.attr,
-	&args_attr_localcaching.attr,
-	&args_attr_localflocks.attr,
-	&args_attr_debug.attr,
-	&args_attr_upgrade.attr,
-	&args_attr_posix_acl.attr,
-	&args_attr_quota.attr,
-	&args_attr_suiddir.attr,
-	&args_attr_data.attr,
-	NULL,
-};
-
 /*
  * get and set struct gfs2_tune fields
  */
@@ -545,11 +507,6 @@ static struct attribute *tune_attrs[] = {
 	NULL,
 };
 
-static struct attribute_group args_group = {
-	.name = "args",
-	.attrs = args_attrs,
-};
-
 static struct attribute_group tune_group = {
 	.name = "tune",
 	.attrs = tune_attrs,
@@ -570,13 +527,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 	if (error)
 		goto fail;
 
-	error = sysfs_create_group(&sdp->sd_kobj, &args_group);
-	if (error)
-		goto fail_reg;
-
 	error = sysfs_create_group(&sdp->sd_kobj, &tune_group);
 	if (error)
-		goto fail_args;
+		goto fail_reg;
 
 	error = sysfs_create_group(&sdp->sd_kobj, &lock_module_group);
 	if (error)
@@ -587,8 +540,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 
 fail_tune:
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-fail_args:
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 fail_reg:
 	kobject_put(&sdp->sd_kobj);
 fail:
@@ -599,7 +550,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp)
 void gfs2_sys_fs_del(struct gfs2_sbd *sdp)
 {
 	sysfs_remove_group(&sdp->sd_kobj, &tune_group);
-	sysfs_remove_group(&sdp->sd_kobj, &args_group);
 	sysfs_remove_group(&sdp->sd_kobj, &lock_module_group);
 	kobject_put(&sdp->sd_kobj);
 }
-- 
GitLab


From 5582b0648de6248c67c0b47fa170e5fb15ab4bf1 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 27 May 2009 00:12:58 +0900
Subject: [PATCH 1331/2198] sh: pci-sh7780: Fix up for PCI_DISABLE_MWI changes.

This fixes a build error where references to pci_cache_line_size are
undefined, as this ceases to be exported when PCI_DISABLE_MWI is enabled,
as is now the default.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/drivers/pci/pci-sh7780.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c
index ae13ff925c61e..323b92d565fee 100644
--- a/arch/sh/drivers/pci/pci-sh7780.c
+++ b/arch/sh/drivers/pci/pci-sh7780.c
@@ -15,8 +15,6 @@
 #include <linux/delay.h>
 #include "pci-sh4.h"
 
-extern u8 pci_cache_line_size;
-
 static struct resource sh7785_io_resource = {
 	.name	= "SH7785_IO",
 	.start	= SH7780_PCI_IO_BASE,
@@ -37,6 +35,7 @@ static struct pci_channel sh7780_pci_controller = {
 	.mem_offset	= 0x00000000,
 	.io_resource	= &sh7785_io_resource,
 	.io_offset	= 0x00000000,
+	.io_map_base	= SH7780_PCI_IO_BASE,
 };
 
 static struct sh4_pci_address_map sh7780_pci_map = {
@@ -99,8 +98,6 @@ static int __init sh7780_pci_init(void)
 	__raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff,
 		     chan->reg_base + SH7780_PCISUB);
 
-	pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4;
-
 	/*
 	 * Set IO and Mem windows to local address
 	 * Make PCI and local address the same for easy 1 to 1 mapping
@@ -142,8 +139,6 @@ static int __init sh7780_pci_init(void)
 	word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO;
 	pci_write_reg(chan, word, SH4_PCICR);
 
-	__set_io_port_base(SH7780_PCI_IO_BASE);
-
 	register_pci_controller(chan);
 
 	return 0;
-- 
GitLab


From abd54f68629fa73ed4fa040d433196211a9bbed2 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 12:21:34 -0300
Subject: [PATCH 1332/2198] perf: Don't assume /proc/kallsyms is ordered

perf: Don't assume /proc/kallsyms is ordered

Since we _are_ ordering it by the symbol start, just traverse the
freshly built rbtree setting the prev->end members to curr->start - 1.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090526152134.GF4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 44 ++++++++++++---------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index b19b893d4ff5d..e17819001dd57 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -360,17 +360,9 @@ static int load_kallsyms(void)
 	char *line = NULL;
 	size_t n;
 
-	if (getline(&line, &n, file) < 0 || !line)
-		goto out_delete_dso;
-
-	unsigned long long previous_start;
-	char c, previous_symbf[4096];
-	if (sscanf(line, "%llx %c %s", &previous_start, &c, previous_symbf) != 3)
-		goto out_delete_line;
-
 	while (!feof(file)) {
 		unsigned long long start;
-		char symbf[4096];
+		char c, symbf[4096];
 
 		if (getline(&line, &n, file) < 0)
 			break;
@@ -379,21 +371,35 @@ static int load_kallsyms(void)
 			goto out_delete_dso;
 
 		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
-			if (start > previous_start) {
-				struct symbol *sym = symbol__new(previous_start,
-								 start - previous_start,
-								 previous_symbf);
+			/*
+			 * Well fix up the end later, when we have all sorted.
+			 */
+			struct symbol *sym = symbol__new(start, 0xdead, symbf);
 
-				if (sym == NULL)
-					goto out_delete_dso;
+			if (sym == NULL)
+				goto out_delete_dso;
 
-				dso__insert_symbol(kernel_dso, sym);
-				previous_start = start;
-				strcpy(previous_symbf, symbf);
-			}
+			dso__insert_symbol(kernel_dso, sym);
 		}
 	}
 
+	/*
+	 * Now that we have all sorted out, just set the ->end of all
+	 * symbols
+	 */
+	struct rb_node *nd, *prevnd = rb_first(&kernel_dso->syms);
+
+	if (prevnd == NULL)
+		goto out_delete_line;
+
+	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
+			      *curr = rb_entry(nd, struct symbol, rb_node);
+
+		prev->end = curr->start - 1;
+		prevnd = nd;
+	}
+
 	dsos__add(kernel_dso);
 	free(line);
 	fclose(file);
-- 
GitLab


From 97b07b699b11d4bd1218a841e5dfed16bd53de06 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 18:48:58 +0200
Subject: [PATCH 1333/2198] perf report: add --dump-raw-trace option

To help the inspection of various data files, implement an ASCII dump
method that just dumps the records as they are read in - then we exit.

[ Impact: new feature ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 39 ++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e17819001dd57..8ea8aaa05af57 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -20,6 +20,8 @@ static char		const *input_name = "output.perf";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
+static int		dump_trace = 0;
+
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
@@ -643,7 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
-	unsigned long total = 0;
+	unsigned long total = 0, total_mmap = 0, total_comm = 0;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -706,6 +708,13 @@ static int __cmd_report(void)
 		struct thread *thread = threads__findnew(event->ip.pid);
 		uint64_t ip = event->ip.ip;
 
+		if (dump_trace) {
+			fprintf(stderr, "PERF_EVENT (IP, %d): %d: %p\n",
+				event->header.misc,
+				event->ip.pid,
+				(void *)event->ip.ip);
+		}
+
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, bailing out\n",
 				event->header.type);
@@ -743,23 +752,40 @@ static int __cmd_report(void)
 		struct thread *thread = threads__findnew(event->mmap.pid);
 		struct map *map = map__new(&event->mmap);
 
+		if (dump_trace) {
+			fprintf(stderr, "PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+				(void *)event->mmap.start,
+				(void *)event->mmap.len,
+				(void *)event->mmap.pgoff,
+				event->mmap.filename);
+		}
 		if (thread == NULL || map == NULL) {
 			fprintf(stderr, "problem processing PERF_EVENT_MMAP, bailing out\n");
 			goto done;
 		}
 		thread__insert_map(thread, map);
+		total_mmap++;
 		break;
 	}
 	case PERF_EVENT_COMM: {
 		struct thread *thread = threads__findnew(event->comm.pid);
 
+		if (dump_trace) {
+			fprintf(stderr, "PERF_EVENT_COMM: %s:%d\n",
+				event->comm.comm, event->comm.pid);
+		}
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
 			fprintf(stderr, "problem processing PERF_EVENT_COMM, bailing out\n");
 			goto done;
 		}
+		total_comm++;
 		break;
 	}
+	default: {
+		fprintf(stderr, "skipping unknown header type: %d\n",
+			event->header.type);
+	}
 	}
 
 	if (offset + head < stat.st_size)
@@ -768,6 +794,15 @@ static int __cmd_report(void)
 	rc = EXIT_SUCCESS;
 done:
 	close(input);
+
+	if (dump_trace) {
+		fprintf(stderr, "   IP events: %10ld\n", total);
+		fprintf(stderr, " mmap events: %10ld\n", total_mmap);
+		fprintf(stderr, " comm events: %10ld\n", total_comm);
+
+		return 0;
+	}
+
 	//dsos__fprintf(stdout);
 	threads__fprintf(stdout);
 #if 0
@@ -796,6 +831,8 @@ static const char * const report_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
 	OPT_END()
 };
 
-- 
GitLab


From 3e70611460fe74ad32534fa9791774f6bbdd4159 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 18:53:17 +0200
Subject: [PATCH 1334/2198] perf report: add counter for unknown events

Add a counter for unknown event records.

[ Impact: improve debugging ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 8ea8aaa05af57..4b5ccc5bd0e61 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -645,7 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
-	unsigned long total = 0, total_mmap = 0, total_comm = 0;
+	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -785,6 +785,7 @@ static int __cmd_report(void)
 	default: {
 		fprintf(stderr, "skipping unknown header type: %d\n",
 			event->header.type);
+		total_unknown++;
 	}
 	}
 
@@ -796,9 +797,10 @@ static int __cmd_report(void)
 	close(input);
 
 	if (dump_trace) {
-		fprintf(stderr, "   IP events: %10ld\n", total);
-		fprintf(stderr, " mmap events: %10ld\n", total_mmap);
-		fprintf(stderr, " comm events: %10ld\n", total_comm);
+		fprintf(stderr, "      IP events: %10ld\n", total);
+		fprintf(stderr, "    mmap events: %10ld\n", total_mmap);
+		fprintf(stderr, "    comm events: %10ld\n", total_comm);
+		fprintf(stderr, " unknown events: %10ld\n", total_unknown);
 
 		return 0;
 	}
-- 
GitLab


From f49515b157e2d3ca3633eb0664fc46c42f6cb37e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 26 May 2009 19:03:36 +0200
Subject: [PATCH 1335/2198] perf report: add more debugging

Add the offset of the file we are analyzing, and the size of the record.

In case of problems it's easier to see where the parser lost track.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 22 ++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 4b5ccc5bd0e61..2d4e4cc655a38 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -645,7 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
-	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown;
+	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -699,8 +699,6 @@ static int __cmd_report(void)
 		goto done;
 	}
 
-	head += event->header.size;
-
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		char level;
 		int show = 0;
@@ -709,7 +707,9 @@ static int __cmd_report(void)
 		uint64_t ip = event->ip.ip;
 
 		if (dump_trace) {
-			fprintf(stderr, "PERF_EVENT (IP, %d): %d: %p\n",
+			fprintf(stderr, "%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+				(void *)(offset + head),
+				(void *)(long)(event->header.size),
 				event->header.misc,
 				event->ip.pid,
 				(void *)event->ip.ip);
@@ -753,7 +753,9 @@ static int __cmd_report(void)
 		struct map *map = map__new(&event->mmap);
 
 		if (dump_trace) {
-			fprintf(stderr, "PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+				(void *)(offset + head),
+				(void *)(long)(event->header.size),
 				(void *)event->mmap.start,
 				(void *)event->mmap.len,
 				(void *)event->mmap.pgoff,
@@ -771,7 +773,9 @@ static int __cmd_report(void)
 		struct thread *thread = threads__findnew(event->comm.pid);
 
 		if (dump_trace) {
-			fprintf(stderr, "PERF_EVENT_COMM: %s:%d\n",
+			fprintf(stderr, "%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+				(void *)(offset + head),
+				(void *)(long)(event->header.size),
 				event->comm.comm, event->comm.pid);
 		}
 		if (thread == NULL ||
@@ -783,12 +787,16 @@ static int __cmd_report(void)
 		break;
 	}
 	default: {
-		fprintf(stderr, "skipping unknown header type: %d\n",
+		fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
 			event->header.type);
 		total_unknown++;
 	}
 	}
 
+	head += event->header.size;
+
 	if (offset + head < stat.st_size)
 		goto more;
 
-- 
GitLab


From 6142f9ec108a4ddbf0d5904c3daa5fdcaa618792 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 26 May 2009 20:51:47 +0200
Subject: [PATCH 1336/2198] perf report: More robust error handling

Don't let funny events confuse us, stick to what we know and
try to find sensible data again.

If we find an unknown event, check we're still u64 aligned, and
increment by one u64. This ensures we're bound to happen upon a
valid event soon.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 27 +++++++++++++++------
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 2d4e4cc655a38..a58be7fee4256 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -645,6 +645,7 @@ static int __cmd_report(void)
 	char *buf;
 	event_t *event;
 	int ret, rc = EXIT_FAILURE;
+	uint32_t size;
 	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
 	input = open(input_name, O_RDONLY);
@@ -680,6 +681,10 @@ static int __cmd_report(void)
 more:
 	event = (event_t *)(buf + head);
 
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
 	if (head + event->header.size >= page_size * mmap_window) {
 		unsigned long shift = page_size * (head / page_size);
 		int ret;
@@ -692,12 +697,9 @@ static int __cmd_report(void)
 		goto remap;
 	}
 
-
-	if (!event->header.size) {
-		fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head);
-		fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head);
-		goto done;
-	}
+	size = event->header.size;
+	if (!size)
+		goto broken_event;
 
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		char level;
@@ -787,15 +789,26 @@ static int __cmd_report(void)
 		break;
 	}
 	default: {
+broken_event:
 		fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
 			(void *)(offset + head),
 			(void *)(long)(event->header.size),
 			event->header.type);
 		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
 	}
 	}
 
-	head += event->header.size;
+	head += size;
 
 	if (offset + head < stat.st_size)
 		goto more;
-- 
GitLab


From be74b73a57645cc253d881ab0c1014eb64b9cf22 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 26 May 2009 20:25:22 +0200
Subject: [PATCH 1337/2198] tracing: add __print_flags for events

Developers have been asking for the ability in the ftrace event tracer
to display names of bits in a flags variable.

Instead of printing out c2, it would be easier to read FOO|BAR|GOO,
assuming that FOO is bit 1, BAR is bit 6 and GOO is bit 7.

Some examples where this would be useful are the state flags in a context
switch, kmalloc flags, and even permision flags in accessing files.

[
  v2 changes include:

  Frederic Weisbecker's idea of using a mask instead of bits,
  thus we can output GFP_KERNEL instead of GPF_WAIT|GFP_IO|GFP_FS.

  Li Zefan's idea of allowing the caller of __print_flags to add their
  own delimiter (or no delimiter) where we can get for file permissions
  rwx instead of r|w|x.
]

[
  v3 changes:

   Christoph Hellwig's idea of using an array instead of va_args.
]

[ Impact: better displaying of flags in trace output ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/ftrace_event.h | 13 +++++++++++-
 include/trace/ftrace.h       | 14 +++++++++++++
 kernel/trace/trace_output.c  | 39 ++++++++++++++++++++++++++++++++++++
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bae51ddfabd35..4b58cf1a11c2c 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -3,12 +3,23 @@
 
 #include <linux/trace_seq.h>
 #include <linux/ring_buffer.h>
-
+#include <linux/percpu.h>
 
 struct trace_array;
 struct tracer;
 struct dentry;
 
+DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
+struct trace_print_flags {
+	unsigned long		mask;
+	const char		*name;
+};
+
+const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+				   unsigned long flags,
+				   const struct trace_print_flags *flag_array);
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b5ff2e8229ece..22c94719c5695 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -87,6 +87,7 @@
  *	struct trace_seq *s = &iter->seq;
  *	struct ftrace_raw_<call> *field; <-- defined in stage 1
  *	struct trace_entry *entry;
+ *	struct trace_seq *p;
  *	int ret;
  *
  *	entry = iter->ent;
@@ -98,7 +99,9 @@
  *
  *	field = (typeof(field))entry;
  *
+ *	p = get_cpu_var(ftrace_event_seq);
  *	ret = trace_seq_printf(s, <TP_printk> "\n");
+ *	put_cpu();
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -119,6 +122,14 @@
 #undef __get_str
 #define __get_str(field)	((char *)__entry + __entry->__str_loc_##field)
 
+#undef __print_flags
+#define __print_flags(flag, delim, flag_array...)			\
+	({								\
+		static const struct trace_print_flags flags[] =		\
+			{ flag_array, { -1, NULL }};			\
+		ftrace_print_flags_seq(p, delim, flag, flags);		\
+	})
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
@@ -127,6 +138,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	struct trace_seq *s = &iter->seq;				\
 	struct ftrace_raw_##call *field;				\
 	struct trace_entry *entry;					\
+	struct trace_seq *p;						\
 	int ret;							\
 									\
 	entry = iter->ent;						\
@@ -138,7 +150,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
+	p = &get_cpu_var(ftrace_event_seq);				\
 	ret = trace_seq_printf(s, #call ": " print);			\
+	put_cpu();							\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 7136420603aa0..a4840c260c891 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -15,6 +15,9 @@
 #define EVENT_HASHSIZE	128
 
 static DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
@@ -212,6 +215,42 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
 	return 0;
 }
 
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+		       unsigned long flags,
+		       const struct trace_print_flags *flag_array)
+{
+	unsigned long mask;
+	const char *str;
+	int i;
+
+	trace_seq_init(p);
+
+	for (i = 0;  flag_array[i].name && flags; i++) {
+
+		mask = flag_array[i].mask;
+		if ((flags & mask) != mask)
+			continue;
+
+		str = flag_array[i].name;
+		flags &= ~mask;
+		if (p->len && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_puts(p, str);
+	}
+
+	/* check for left over flags */
+	if (flags) {
+		if (p->len && delim)
+			trace_seq_puts(p, delim);
+		trace_seq_printf(p, "0x%lx", flags);
+	}
+
+	trace_seq_putc(p, 0);
+
+	return p->buffer;
+}
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
GitLab


From 937cdb9db7f59278d0cb1582e6e64e3dfd73b4fc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 15 May 2009 10:51:13 -0400
Subject: [PATCH 1338/2198] tracing: add previous task state info to sched
 switch event

It is useful to see the state of a task that is being switched out.
This patch adds the output of the state of the previous task in
the context switch event.

[ Impact: see state of switched out task in context switch ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/events/sched.h | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index dd4033cf5b091..24ab5bcff7b2b 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -156,6 +156,7 @@ TRACE_EVENT(sched_switch,
 		__array(	char,	prev_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	prev_pid			)
 		__field(	int,	prev_prio			)
+		__field(	long,	prev_state			)
 		__array(	char,	next_comm,	TASK_COMM_LEN	)
 		__field(	pid_t,	next_pid			)
 		__field(	int,	next_prio			)
@@ -165,13 +166,19 @@ TRACE_EVENT(sched_switch,
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
 		__entry->prev_prio	= prev->prio;
+		__entry->prev_state	= prev->state;
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;
 	),
 
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+	TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]",
 		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->prev_state ?
+		  __print_flags(__entry->prev_state, "|",
+				{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
+				{ 16, "Z" }, { 32, "X" }, { 64, "x" },
+				{ 128, "W" }) : "R",
 		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
-- 
GitLab


From 62ba180e80f4194a498585ac0e4c07daa8ca08d1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 15 May 2009 16:16:30 -0400
Subject: [PATCH 1339/2198] tracing: add flag output for kmem events

This patch changes the output for gfp_flags from being a simple hex value
to the actual names.

  gfp_flags=GFP_ATOMIC  instead of gfp_flags=00000020

And even

  gfp_flags=GFP_KERNEL instead of gfp_flags=000000d0

(Thanks to Frederic Weisbecker for pointing out that the first version
 had a bad order of GFP masks)

[ Impact: more human readable output from tracer ]

Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/events/kmem.h | 53 +++++++++++++++++++++++++++++++------
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
index c22c42f980b5e..9baba50d6512e 100644
--- a/include/trace/events/kmem.h
+++ b/include/trace/events/kmem.h
@@ -7,6 +7,43 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM kmem
 
+/*
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we wil get:
+ *
+ *  GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+#define show_gfp_flags(flags)						\
+	(flags) ? __print_flags(flags, "|",				\
+	{(unsigned long)GFP_HIGHUSER_MOVABLE,	"GFP_HIGHUSER_MOVABLE"}, \
+	{(unsigned long)GFP_HIGHUSER,		"GFP_HIGHUSER"},	\
+	{(unsigned long)GFP_USER,		"GFP_USER"},		\
+	{(unsigned long)GFP_TEMPORARY,		"GFP_TEMPORARY"},	\
+	{(unsigned long)GFP_KERNEL,		"GFP_KERNEL"},		\
+	{(unsigned long)GFP_NOFS,		"GFP_NOFS"},		\
+	{(unsigned long)GFP_ATOMIC,		"GFP_ATOMIC"},		\
+	{(unsigned long)GFP_NOIO,		"GFP_NOIO"},		\
+	{(unsigned long)__GFP_HIGH,		"GFP_HIGH"},		\
+	{(unsigned long)__GFP_WAIT,		"GFP_WAIT"},		\
+	{(unsigned long)__GFP_IO,		"GFP_IO"},		\
+	{(unsigned long)__GFP_COLD,		"GFP_COLD"},		\
+	{(unsigned long)__GFP_NOWARN,		"GFP_NOWARN"},		\
+	{(unsigned long)__GFP_REPEAT,		"GFP_REPEAT"},		\
+	{(unsigned long)__GFP_NOFAIL,		"GFP_NOFAIL"},		\
+	{(unsigned long)__GFP_NORETRY,		"GFP_NORETRY"},		\
+	{(unsigned long)__GFP_COMP,		"GFP_COMP"},		\
+	{(unsigned long)__GFP_ZERO,		"GFP_ZERO"},		\
+	{(unsigned long)__GFP_NOMEMALLOC,	"GFP_NOMEMALLOC"},	\
+	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
+	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
+	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
+	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"}		\
+	) : "GFP_NOWAIT"
+
 TRACE_EVENT(kmalloc,
 
 	TP_PROTO(unsigned long call_site,
@@ -33,12 +70,12 @@ TRACE_EVENT(kmalloc,
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags)
+		show_gfp_flags(__entry->gfp_flags))
 );
 
 TRACE_EVENT(kmem_cache_alloc,
@@ -67,12 +104,12 @@ TRACE_EVENT(kmem_cache_alloc,
 		__entry->gfp_flags	= gfp_flags;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags)
+		show_gfp_flags(__entry->gfp_flags))
 );
 
 TRACE_EVENT(kmalloc_node,
@@ -104,12 +141,12 @@ TRACE_EVENT(kmalloc_node,
 		__entry->node		= node;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags,
+		show_gfp_flags(__entry->gfp_flags),
 		__entry->node)
 );
 
@@ -142,12 +179,12 @@ TRACE_EVENT(kmem_cache_alloc_node,
 		__entry->node		= node;
 	),
 
-	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d",
+	TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
 		__entry->call_site,
 		__entry->ptr,
 		__entry->bytes_req,
 		__entry->bytes_alloc,
-		__entry->gfp_flags,
+		show_gfp_flags(__entry->gfp_flags),
 		__entry->node)
 );
 
-- 
GitLab


From 0f4fc29dd68dfab9c6ddd5d087d34a5b6818cb00 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 May 2009 19:21:47 -0400
Subject: [PATCH 1340/2198] tracing: add __print_symbolic to trace events

This patch adds __print_symbolic which is similar to __print_flags but
works for an enumeration type instead. That is, there is only a one to one
mapping between the values and the symbols. When a match is made, then
it is printed, otherwise the hex value is outputed.

[ Impact: add interface for showing symbol names in events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/linux/ftrace_event.h |  3 +++
 include/trace/ftrace.h       |  8 ++++++++
 kernel/trace/trace_output.c  | 25 +++++++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 4b58cf1a11c2c..bbf40f624fc8d 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -20,6 +20,9 @@ const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 				   unsigned long flags,
 				   const struct trace_print_flags *flag_array);
 
+const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+				     const struct trace_print_flags *symbol_array);
+
 /*
  * The trace entry - the most basic unit of tracing. This is what
  * is printed in the end as a single line in the trace output, such as:
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 22c94719c5695..87fc227c6fbe5 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -130,6 +130,14 @@
 		ftrace_print_flags_seq(p, delim, flag, flags);		\
 	})
 
+#undef __print_symbolic
+#define __print_symbolic(value, symbol_array...)			\
+	({								\
+		static const struct trace_print_flags symbols[] =	\
+			{ symbol_array, { -1, NULL }};			\
+		ftrace_print_symbols_seq(p, value, symbols);		\
+	})
+
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a4840c260c891..c12d95db2f562 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -251,6 +251,31 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 	return p->buffer;
 }
 
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+			 const struct trace_print_flags *symbol_array)
+{
+	int i;
+
+	trace_seq_init(p);
+
+	for (i = 0;  symbol_array[i].name; i++) {
+
+		if (val != symbol_array[i].mask)
+			continue;
+
+		trace_seq_puts(p, symbol_array[i].name);
+		break;
+	}
+
+	if (!p->len)
+		trace_seq_printf(p, "0x%lx", val);
+		
+	trace_seq_putc(p, 0);
+
+	return p->buffer;
+}
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
-- 
GitLab


From c2adae0970ca1db8adb92fb56ae3bcabd916e8bd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 20 May 2009 19:56:19 -0400
Subject: [PATCH 1341/2198] tracing: convert irq events to use __print_symbolic

The recording of the names at trace time is inefficient. This patch
implements the softirq event recording to only record the vector
and then use the __print_symbolic interface to print out the names.

[ Impact: faster recording of softirq events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/events/irq.h | 23 +++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 32a9f7ef432bd..683fb36a9943e 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -7,6 +7,19 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM irq
 
+#define softirq_name(sirq) { sirq, #sirq }
+#define show_softirq_name(val)						\
+	__print_symbolic(val,						\
+			 softirq_name(HI_SOFTIRQ),			\
+			 softirq_name(TIMER_SOFTIRQ),			\
+			 softirq_name(NET_TX_SOFTIRQ),			\
+			 softirq_name(NET_RX_SOFTIRQ),			\
+			 softirq_name(BLOCK_SOFTIRQ),			\
+			 softirq_name(TASKLET_SOFTIRQ),			\
+			 softirq_name(SCHED_SOFTIRQ),			\
+			 softirq_name(HRTIMER_SOFTIRQ),			\
+			 softirq_name(RCU_SOFTIRQ))
+
 /**
  * irq_handler_entry - called immediately before the irq action handler
  * @irq: irq number
@@ -87,15 +100,14 @@ TRACE_EVENT(softirq_entry,
 
 	TP_STRUCT__entry(
 		__field(	int,	vec			)
-		__string(	name,	softirq_to_name[h-vec]	)
 	),
 
 	TP_fast_assign(
 		__entry->vec = (int)(h - vec);
-		__assign_str(name, softirq_to_name[h-vec]);
 	),
 
-	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+	TP_printk("softirq=%d action=%s", __entry->vec,
+		  show_softirq_name(__entry->vec))
 );
 
 /**
@@ -117,15 +129,14 @@ TRACE_EVENT(softirq_exit,
 
 	TP_STRUCT__entry(
 		__field(	int,	vec			)
-		__string(	name,	softirq_to_name[h-vec]	)
 	),
 
 	TP_fast_assign(
 		__entry->vec = (int)(h - vec);
-		__assign_str(name, softirq_to_name[h-vec]);
 	),
 
-	TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name))
+	TP_printk("softirq=%d action=%s", __entry->vec,
+		  show_softirq_name(__entry->vec))
 );
 
 #endif /*  _TRACE_IRQ_H */
-- 
GitLab


From 3a4b8cc70b7473a0b9f26f5b4ddc6579b5e214be Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 16:19:04 -0300
Subject: [PATCH 1342/2198] perf report: Sort output by symbol usage

[acme@emilia ~]$ perf record find / > /dev/null 2>&1
[acme@emilia ~]$ perf stat perf report | head -20
 4.95          find [k] 0xffffffff81393d65 _spin_lock
 3.89          find [.] 0x000000000000af89 /usr/bin/find: <unknown>
 2.19          find [k] 0xffffffffa00518e0 ext3fs_dirhash
 1.87          find [k] 0xffffffff810a6cea __rcu_read_lock
 1.86          find [k] 0xffffffff811c7312 _atomic_dec_and_lock
 1.86          find [.] 0x00000000000782ab /lib64/libc-2.5.so: __GI_strlen
 1.85          find [k] 0xffffffff810fedfb __kmalloc
 1.62          find [.] 0x00000000000430ff /lib64/libc-2.5.so: vfprintf
 1.59          find [k] 0xffffffff810a6d6d __rcu_read_unlock
 1.55          find [k] 0xffffffff81119395 __d_lookup
 1.39          find [.] 0x0000000000071b40 /lib64/libc-2.5.so: _int_malloc
 1.30          find [k] 0xffffffffa031c4fc nfs_do_filldir
 1.21          find [k] 0xffffffff811876a5 avc_has_perm_noaudit
 1.15          find [k] 0xffffffff810fef62 kmem_cache_alloc
 1.07          find [k] 0xffffffff811d03fb copy_user_generic_string
 1.03          find [k] 0xffffffffa0043882 ext3_htree_store_dirent
 0.99          find [k] 0xffffffff81393ebb _spin_lock_bh
 0.98          find [k] 0xffffffffa03319a2 nfs3_decode_dirent
 0.97          find [k] 0xffffffff8100bf20 system_call
 0.92          find [k] 0xffffffff8139437e _spin_unlock

 Performance counter stats for 'perf':

     244.278972  task clock ticks     (msecs)
              8  context switches     (events)
              9  CPU migrations       (events)
           2104  pagefaults           (events)
       35329669  CPU cycles           (events)  (scaled from 75.40%)
       13740366  instructions         (events)  (scaled from 75.49%)
          59073  cache references     (events)  (scaled from 24.60%)
            196  cache misses         (events)  (scaled from 24.51%)

 Wall-clock time elapsed:   246.060717 msecs

[acme@emilia ~]$
[acme@emilia ~]$ grep "model name" /proc/cpuinfo | head -1
model name	: Intel(R) Xeon(R) CPU           E5405  @ 2.00GHz
[acme@emilia ~]$ grep "model name" /proc/cpuinfo | wc -l
8
[acme@emilia ~]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090526191904.GH4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 129 +++++++++++++++-----
 1 file changed, 99 insertions(+), 30 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a58be7fee4256..a4c6ffa96505e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -447,26 +447,33 @@ static size_t map__fprintf(struct map *self, FILE *fp)
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
+struct thread;
+
+static const char *thread__name(struct thread *self, char *bf, size_t size);
+
 struct symhist {
 	struct rb_node	 rb_node;
 	struct dso	 *dso;
 	struct symbol	 *sym;
+	struct thread	 *thread;
 	uint64_t	 ip;
 	uint32_t	 count;
 	char		 level;
 };
 
 static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
-				    struct dso *dso, char level)
+				    struct thread *thread, struct dso *dso,
+				    char level)
 {
 	struct symhist *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
-		self->sym   = sym;
-		self->ip    = ip;
-		self->dso   = dso;
-		self->level = level;
-		self->count = 1;
+		self->sym    = sym;
+		self->thread = thread;
+		self->ip     = ip;
+		self->dso    = dso;
+		self->level  = level;
+		self->count  = 1;
 	}
 
 	return self;
@@ -482,17 +489,29 @@ static void symhist__inc(struct symhist *self)
 	++self->count;
 }
 
-static size_t symhist__fprintf(struct symhist *self, FILE *fp)
+static size_t
+symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
 {
-	size_t ret = fprintf(fp, "%#llx [%c] ", (unsigned long long)self->ip, self->level);
+	char bf[32];
+	size_t ret;
+
+	if (total_samples)
+		ret = fprintf(fp, "%5.2f", (self->count * 100.0) / total_samples);
+	else
+		ret = fprintf(fp, "%12d", self->count);
+
+	ret += fprintf(fp, "%14s [%c] %#018llx ",
+		       thread__name(self->thread, bf, sizeof(bf)),
+		       self->level, (unsigned long long)self->ip);
 
 	if (self->level != '.')
-		ret += fprintf(fp, "%s", self->sym ? self->sym->name: "<unknown>");
+		ret += fprintf(fp, "%s\n",
+			       self->sym ? self->sym->name : "<unknown>");
 	else
-		ret += fprintf(fp, "%s: %s",
+		ret += fprintf(fp, "%s: %s\n",
 			       self->dso ? self->dso->name : "<unknown>",
 			       self->sym ? self->sym->name : "<unknown>");
-	return ret + fprintf(fp, ": %u\n", self->count);
+	return ret;
 }
 
 struct thread {
@@ -503,6 +522,15 @@ struct thread {
 	char		 *comm;
 };
 
+static const char *thread__name(struct thread *self, char *bf, size_t size)
+{
+	if (self->comm)
+		return self->comm;
+
+	snprintf(bf, sizeof(bf), ":%u", self->pid);
+	return bf;
+}
+
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = malloc(sizeof(*self));
@@ -542,7 +570,7 @@ static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
 			p = &(*p)->rb_right;
 	}
 
-	sh = symhist__new(sym, ip, dso, level);
+	sh = symhist__new(sym, ip, self, dso, level);
 	if (sh == NULL)
 		return -ENOMEM;
 	rb_link_node(&sh->rb_node, parent, p);
@@ -574,7 +602,7 @@ static size_t thread__fprintf(struct thread *self, FILE *fp)
 
 	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
 		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
-		ret += symhist__fprintf(pos, fp);
+		ret += symhist__fprintf(pos, 0, fp);
 	}
 
 	return ret;
@@ -628,7 +656,7 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
-static void threads__fprintf(FILE *fp)
+void threads__fprintf(FILE *fp)
 {
 	struct rb_node *nd;
 	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
@@ -637,6 +665,61 @@ static void threads__fprintf(FILE *fp)
 	}
 }
 
+static struct rb_root global_symhists = RB_ROOT;
+
+static void threads__insert_symhist(struct symhist *sh)
+{
+	struct rb_node **p = &global_symhists.rb_node;
+	struct rb_node *parent = NULL;
+	struct symhist *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct symhist, rb_node);
+
+		/* Reverse order */
+		if (sh->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&sh->rb_node, parent, p);
+	rb_insert_color(&sh->rb_node, &global_symhists);
+}
+
+static void threads__sort_symhists(void)
+{
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *thread = rb_entry(nd, struct thread, rb_node);
+		struct rb_node *next = rb_first(&thread->symhists);
+
+		while (next) {
+			struct symhist *n = rb_entry(next, struct symhist,
+						     rb_node);
+			next = rb_next(&n->rb_node);
+			rb_erase(&n->rb_node, &thread->symhists);
+			threads__insert_symhist(n);
+		}
+
+	}
+}
+
+static size_t threads__symhists_fprintf(uint64_t total_samples, FILE *fp)
+{
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	for (nd = rb_first(&global_symhists); nd; nd = rb_next(nd)) {
+		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
+		ret += symhist__fprintf(pos, total_samples, fp);
+	}
+
+	return ret;
+}
+
 static int __cmd_report(void)
 {
 	unsigned long offset = 0;
@@ -826,23 +909,9 @@ static int __cmd_report(void)
 		return 0;
 	}
 
-	//dsos__fprintf(stdout);
-	threads__fprintf(stdout);
-#if 0
-	std::map<std::string, int>::iterator hi = hist.begin();
-
-	while (hi != hist.end()) {
-		rev_hist.insert(std::pair<int, std::string>(hi->second, hi->first));
-		hist.erase(hi++);
-	}
-
-	std::multimap<int, std::string>::const_iterator ri = rev_hist.begin();
+	threads__sort_symhists();
+	threads__symhists_fprintf(total, stdout);
 
-	while (ri != rev_hist.end()) {
-		printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str());
-		ri++;
-	}
-#endif
 	return rc;
 }
 
-- 
GitLab


From 46a7574caf5bc533c24b315800ed323c187614f5 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 24 May 2009 18:45:17 -0400
Subject: [PATCH 1343/2198] cifs: fix artificial limit on reading symlinks

There's no reason to limit the size of a symlink that we can read to
4000 bytes. That may be nowhere near PATH_MAX if the server is sending
UCS2 strings. CIFS should be able to read in a symlink up to the size of
the buffer. The size of the header has already been accounted for when
creating the slabcache, so CIFSMaxBufSize should be the correct size to
pass in.

Fixes samba bug #6384.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifssmb.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index d06260251c304..aece2a8c1a7c3 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -2427,8 +2427,7 @@ CIFSSMBUnixQuerySymLink(const int xid, struct cifsTconInfo *tcon,
 	params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
 	pSMB->TotalDataCount = 0;
 	pSMB->MaxParameterCount = cpu_to_le16(2);
-	/* BB find exact max data count below from sess structure BB */
-	pSMB->MaxDataCount = cpu_to_le16(4000);
+	pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
 	pSMB->MaxSetupCount = 0;
 	pSMB->Reserved = 0;
 	pSMB->Flags = 0;
-- 
GitLab


From f55ed1a83d099f275c9560ad7d4c4700d1e54bdd Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 26 May 2009 16:28:11 -0400
Subject: [PATCH 1344/2198] cifs: tighten up default file_mode/dir_mode

The current default file mode is 02767 and dir mode is 0777. This is
extremely "loose". Given that CIFS is a single-user protocol, these
permissions allow anyone to use the mount -- in effect, giving anyone on
the machine access to the credentials used to mount the share.

Change this by making the default permissions restrict write access to
the default owner of the mount. Give read and execute permissions to
everyone else. These are the same permissions that VFAT mounts get by
default so there is some precedent here.

Note that this patch also removes the mandatory locking flags from the
default file_mode. After having looked at how these flags are used by
the kernel, I don't think that keeping them as the default offers any
real benefit. That flag combination makes it so that the kernel enforces
mandatory locking.

Since the server is going to do that for us anyway, I don't think we
want the client to enforce this by default on applications that just
want advisory locks. Anyone that does want this behavior can always
enable it by setting the file_mode appropriately.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 4aa81a507b741..f32c9036741e4 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -827,9 +827,9 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->target_rfc1001_name[0] = 0;
 	vol->linux_uid = current_uid();  /* use current_euid() instead? */
 	vol->linux_gid = current_gid();
-	vol->dir_mode = S_IRWXUGO;
-	/* 2767 perms indicate mandatory locking support */
-	vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
+
+	/* default to only allowing write access to owner of the mount */
+	vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
 
 	/* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
 	vol->rw = true;
-- 
GitLab


From 76b0187525f024cb391c8043adf2e359b2adb988 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 26 May 2009 14:16:31 +0900
Subject: [PATCH 1345/2198] rootplug: Remove redundant initialization.

We don't need to explicitly initialize to cap_* because
it will be filled by security_fixup_ops().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/root_plug.c | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/security/root_plug.c b/security/root_plug.c
index 40fb4f15e27b6..2f7ffa67c4d2d 100644
--- a/security/root_plug.c
+++ b/security/root_plug.c
@@ -71,18 +71,6 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm)
 }
 
 static struct security_operations rootplug_security_ops = {
-	/* Use the capability functions for some of the hooks */
-	.ptrace_may_access =		cap_ptrace_may_access,
-	.ptrace_traceme =		cap_ptrace_traceme,
-	.capget =			cap_capget,
-	.capset =			cap_capset,
-	.capable =			cap_capable,
-
-	.bprm_set_creds =		cap_bprm_set_creds,
-
-	.task_fix_setuid =		cap_task_fix_setuid,
-	.task_prctl =			cap_task_prctl,
-
 	.bprm_check_security =		rootplug_bprm_check_security,
 };
 
-- 
GitLab


From d8d1656ee15d3085e0085a87e70f9093a0a102c5 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 19:20:57 -0300
Subject: [PATCH 1346/2198] perf report: Use hex2long instead of sscanf

Before:

[acme@emilia ~]$ perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     245.414985  task clock ticks     (msecs)
              6  context switches     (events)
              6  CPU migrations       (events)
           2108  pagefaults           (events)
       37493013  CPU cycles           (events)  (scaled from 67.04%)
       13576789  instructions         (events)  (scaled from 66.76%)
          57931  cache references     (events)  (scaled from 21.96%)
          12263  cache misses         (events)  (scaled from 21.98%)

 Wall-clock time elapsed:   246.575587 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
12.15          perf [.] 0x000000000005432a /lib64/libc-2.5.so: _IO_vfscanf_internal
 9.38          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 8.53          perf [.] 0x00000000000056b8 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 6.61          perf [.] 0x00000000000057cb /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
 5.33          perf [k] 0xffffffff811ce082 number
 4.69          perf [.] 0x0000000000034829 /lib64/libc-2.5.so: ____strtoull_l_internal
 4.48          perf [.] 0x0000000000006505 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 3.41          perf [.] 0x000000000000fce6 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: rb_insert_color
 3.20          perf [k] 0xffffffff811cfc01 vsnprintf
 2.99          perf [k] 0xffffffff811ce5e8 format_decode

After:

[acme@emilia ~]$ perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     218.186805  task clock ticks     (msecs)
              4  context switches     (events)
              7  CPU migrations       (events)
           2133  pagefaults           (events)
       32735365  CPU cycles           (events)  (scaled from 67.04%)
       11952309  instructions         (events)  (scaled from 66.26%)
          50314  cache references     (events)  (scaled from 21.96%)
          13228  cache misses         (events)  (scaled from 21.98%)

 Wall-clock time elapsed:   218.810451 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
10.68          perf [.] 0x000000000000578d /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
 9.62          perf [.] 0x00000000000065f7 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 9.40          perf [.] 0x00000000000056b4 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 9.19          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 5.13          perf [.] 0x0000000000005ec7 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex2long
 4.49          perf [k] 0xffffffff81083808 kallsyms_expand_symbol
 3.85          perf [k] 0xffffffff811ce2c1 number
 3.63          perf [.] 0x0000000000005e81 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex
 2.99          perf [.] 0x000000000000fd5b /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: rb_insert_color
 2.99          perf [k] 0xffffffff811cf251 string
[acme@emilia ~]$

[ Impact: optimization ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222057.GI4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 63 ++++++++++++++++-----
 1 file changed, 50 insertions(+), 13 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a4c6ffa96505e..c517483fd614e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -348,6 +348,39 @@ void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2long(char *ptr, unsigned long *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
+
 static int load_kallsyms(void)
 {
 	kernel_dso = dso__new("[kernel]");
@@ -363,26 +396,30 @@ static int load_kallsyms(void)
 	size_t n;
 
 	while (!feof(file)) {
-		unsigned long long start;
-		char c, symbf[4096];
-
-		if (getline(&line, &n, file) < 0)
+		unsigned long start;
+		int line_len = getline(&line, &n, file);
+		if (line_len < 0)
 			break;
 
 		if (!line)
 			goto out_delete_dso;
 
-		if (sscanf(line, "%llx %c %s", &start, &c, symbf) == 3) {
-			/*
-			 * Well fix up the end later, when we have all sorted.
-			 */
-			struct symbol *sym = symbol__new(start, 0xdead, symbf);
+		line[--line_len] = '\0'; /* \n */
+
+		int len = hex2long(line, &start);
+		
+		len += 3; /* ' t ' */
+		if (len >= line_len)
+			continue;
+		/*
+		 * Well fix up the end later, when we have all sorted.
+		 */
+		struct symbol *sym = symbol__new(start, 0xdead, line + len);
 
-			if (sym == NULL)
-				goto out_delete_dso;
+		if (sym == NULL)
+			goto out_delete_dso;
 
-			dso__insert_symbol(kernel_dso, sym);
-		}
+		dso__insert_symbol(kernel_dso, sym);
 	}
 
 	/*
-- 
GitLab


From 03f6316d32738ec5eda2e6f628c12d1c01e61a87 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 19:21:55 -0300
Subject: [PATCH 1347/2198] perf report: Only load text symbols from kallsyms

Just like we do for userspace when reading the symtab, reducing the
number of entries we insert on the symbols rbtree.

Before:

[acme@emilia ~]$ rm -f perf_report.perf ; perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     218.138382  task clock ticks     (msecs)
              4  context switches     (events)
              8  CPU migrations       (events)
           2136  pagefaults           (events)
       32746212  CPU cycles           (events)  (scaled from 67.04%)
       11961102  instructions         (events)  (scaled from 66.19%)
          49841  cache references     (events)  (scaled from 21.96%)
          13777  cache misses         (events)  (scaled from 21.98%)

 Wall-clock time elapsed:   218.702477 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
11.06          perf [.] 0x00000000000057cb /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
 9.15          perf [.] 0x00000000000056a0 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 8.72          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 8.51          perf [.] 0x0000000000006672 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 3.83          perf [k] 0xffffffff811cfc5a vsnprintf
 3.40          perf [.] 0x0000000000005e33 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex
 3.40          perf [.] 0x0000000000005ec7 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex2long
 3.19          perf [k] 0xffffffff811ce1c1 number
 2.77          perf [.] 0x0000000000006869 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: threads__findnew
 2.77          perf [.] 0x000000000000fde3 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: rb_insert_color
[acme@emilia ~]$

After:

acme@emilia ~]$ rm -f perf_report.perf ; perf record -o perf_report.perf perf stat perf report > /dev/null

 Performance counter stats for 'perf':

     190.228511  task clock ticks     (msecs)
              4  context switches     (events)
              7  CPU migrations       (events)
           1625  pagefaults           (events)
       29578745  CPU cycles           (events)  (scaled from 66.92%)
       10516914  instructions         (events)  (scaled from 66.47%)
          44015  cache references     (events)  (scaled from 22.04%)
           8248  cache misses         (events)  (scaled from 22.07%)

 Wall-clock time elapsed:   190.816096 msecs

[acme@emilia ~]$ perf report -i perf_report.perf | head
15.99          perf [.] 0x00000000000057a9 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
10.87          perf [.] 0x000000000000674d /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: thread__symbol_incnew
 8.74          perf [k] 0xffffffff8101b1d2 intel_pmu_enable_all
 5.54          perf [.] 0x0000000000005e42 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex
 4.48          perf [.] 0x0000000000005ebe /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: hex2long
 4.48          perf [k] 0xffffffff811cfba0 vsnprintf
 3.84          perf [.] 0x00000000000056b4 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__insert_symbol
 3.62          perf [.] 0x00000000000068d0 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: threads__findnew
 3.20          perf [k] 0xffffffff811ce0b3 number
 2.56          perf [.] 0x0000000000006d78 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: __cmd_report
[acme@emilia ~]$

[ Impact: optimization ]

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222155.GJ4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index c517483fd614e..a55f15d7651d5 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -3,6 +3,7 @@
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
+#include <ctype.h>
 
 #include "util/list.h"
 #include "util/rbtree.h"
@@ -408,13 +409,20 @@ static int load_kallsyms(void)
 
 		int len = hex2long(line, &start);
 		
-		len += 3; /* ' t ' */
-		if (len >= line_len)
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		char symbol_type = line[len];
+		/*
+		 * We're interested only in code ('T'ext)
+		 */
+		if (toupper(symbol_type) != 'T')
 			continue;
 		/*
 		 * Well fix up the end later, when we have all sorted.
 		 */
-		struct symbol *sym = symbol__new(start, 0xdead, line + len);
+		struct symbol *sym = symbol__new(start, 0xdead, line + len + 2);
 
 		if (sym == NULL)
 			goto out_delete_dso;
-- 
GitLab


From af83632f98aefd1ae4d8ca3c7c285ccf6a7d3956 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 08:38:48 +0200
Subject: [PATCH 1348/2198] perf report: Only load text symbols from kallsyms,
 fix

- allow 'W' symbols too
 - Convert initializations to C99 style
 - whitespace cleanups

Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222155.GJ4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 30 ++++++++++++---------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a55f15d7651d5..ed3da9d61985a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -384,21 +384,26 @@ int hex2long(char *ptr, unsigned long *long_val)
 
 static int load_kallsyms(void)
 {
+	struct rb_node *nd, *prevnd;
+	char *line = NULL;
+	FILE *file;
+	size_t n;
+
 	kernel_dso = dso__new("[kernel]");
 	if (kernel_dso == NULL)
 		return -1;
 
-	FILE *file = fopen("/proc/kallsyms", "r");
-
+	file = fopen("/proc/kallsyms", "r");
 	if (file == NULL)
 		goto out_delete_dso;
 
-	char *line = NULL;
-	size_t n;
-
 	while (!feof(file)) {
 		unsigned long start;
-		int line_len = getline(&line, &n, file);
+		struct symbol *sym;
+		int line_len, len;
+		char symbol_type;
+
+		line_len = getline(&line, &n, file);
 		if (line_len < 0)
 			break;
 
@@ -407,22 +412,22 @@ static int load_kallsyms(void)
 
 		line[--line_len] = '\0'; /* \n */
 
-		int len = hex2long(line, &start);
-		
+		len = hex2long(line, &start);
+
 		len++;
 		if (len + 2 >= line_len)
 			continue;
 
-		char symbol_type = line[len];
+		symbol_type = toupper(line[len]);
 		/*
 		 * We're interested only in code ('T'ext)
 		 */
-		if (toupper(symbol_type) != 'T')
+		if (symbol_type != 'T' && symbol_type != 'W')
 			continue;
 		/*
 		 * Well fix up the end later, when we have all sorted.
 		 */
-		struct symbol *sym = symbol__new(start, 0xdead, line + len + 2);
+		sym = symbol__new(start, 0xdead, line + len + 2);
 
 		if (sym == NULL)
 			goto out_delete_dso;
@@ -434,7 +439,7 @@ static int load_kallsyms(void)
 	 * Now that we have all sorted out, just set the ->end of all
 	 * symbols
 	 */
-	struct rb_node *nd, *prevnd = rb_first(&kernel_dso->syms);
+	prevnd = rb_first(&kernel_dso->syms);
 
 	if (prevnd == NULL)
 		goto out_delete_line;
@@ -450,6 +455,7 @@ static int load_kallsyms(void)
 	dsos__add(kernel_dso);
 	free(line);
 	fclose(file);
+
 	return 0;
 
 out_delete_line:
-- 
GitLab


From 815e777f913ed54ddb449d2854015c65b4ecbfe3 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 26 May 2009 19:46:14 -0300
Subject: [PATCH 1349/2198] perf report: Show the IP only in --verbose mode

perf: report should show the IP only in --verbose mode

[acme@emilia ~]$ perf report | head
 4.95          find [k] _spin_lock
 2.19          find [k] ext3fs_dirhash	[ext3]
 1.87          find [k] __rcu_read_lock
 1.86          find [k] _atomic_dec_and_lock
 1.86          find [.] /lib64/libc-2.5.so: __GI_strlen
 1.85          find [k] __kmalloc
 1.62          find [.] /lib64/libc-2.5.so: vfprintf
 1.59          find [k] __rcu_read_unlock
 1.55          find [k] __d_lookup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526224614.GK4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ed3da9d61985a..2d65d9c12aad5 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -22,6 +22,7 @@ static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
+static int 		verbose;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -551,9 +552,12 @@ symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
 	else
 		ret = fprintf(fp, "%12d", self->count);
 
-	ret += fprintf(fp, "%14s [%c] %#018llx ",
+	ret += fprintf(fp, "%14s [%c] ",
 		       thread__name(self->thread, bf, sizeof(bf)),
-		       self->level, (unsigned long long)self->ip);
+		       self->level);
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
 
 	if (self->level != '.')
 		ret += fprintf(fp, "%s\n",
@@ -974,6 +978,8 @@ static const char * const report_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_END()
-- 
GitLab


From 16f762a2ac5ecf8a11f6f0332e46cc3459220da5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 09:10:38 +0200
Subject: [PATCH 1350/2198] perf_counter tools: Introduce stricter C code
 checking

Tighten up our C code requirements:

 - disallow warnings
 - disallow declarations-mixed-with-statements
 - require proper prototypes
 - require C99 (with gcc extensions)

Fix up a ton of problems these measures unearth:

 - unused functions
 - needlessly global functions
 - missing prototypes
 - code mixed with declarations

Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090526222155.GJ4424@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |   2 +-
 Documentation/perf_counter/builtin-help.c   |   2 +-
 Documentation/perf_counter/builtin-record.c |  38 ++++----
 Documentation/perf_counter/builtin-report.c | 103 ++++++++++----------
 Documentation/perf_counter/builtin-stat.c   |   3 +-
 Documentation/perf_counter/builtin-top.c    |   2 +-
 Documentation/perf_counter/util/abspath.c   |   2 +-
 Documentation/perf_counter/util/cache.h     |   2 +
 Documentation/perf_counter/util/util.h      |   2 +
 9 files changed, 82 insertions(+), 74 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 10c13a6f2bc93..efb05892db69d 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index 6616de0ef0535..d2bd3177b98c7 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -399,7 +399,7 @@ static void get_html_page_path(struct strbuf *page_path, const char *page)
  * HTML.
  */
 #ifndef open_html
-void open_html(const char *path)
+static void open_html(const char *path)
 {
 	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL);
 }
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index ec2b787b23bd0..68abfdf71d399 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,6 +1,7 @@
 
 
 #include "perf.h"
+#include "builtin.h"
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -144,26 +145,32 @@ static int nr_poll;
 static int nr_cpu;
 
 struct mmap_event {
-	struct perf_event_header header;
-	__u32 pid, tid;
-	__u64 start;
-	__u64 len;
-	__u64 pgoff;
-	char filename[PATH_MAX];
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	__u64				start;
+	__u64				len;
+	__u64				pgoff;
+	char				filename[PATH_MAX];
 };
+
 struct comm_event {
-	struct perf_event_header header;
-	__u32 pid,tid;
-	char comm[16];
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	char				comm[16];
 };
 
 static pid_t pid_synthesize_comm_event(pid_t pid)
 {
+	struct comm_event comm_ev;
 	char filename[PATH_MAX];
+	pid_t spid, ppid;
 	char bf[BUFSIZ];
-	struct comm_event comm_ev;
+	int fd, nr, ret;
+	char comm[18];
 	size_t size;
-	int fd;
+	char state;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
 
@@ -178,12 +185,8 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	}
 	close(fd);
 
-	pid_t spid, ppid;
-	char state;
-	char comm[18];
-
 	memset(&comm_ev, 0, sizeof(comm_ev));
-        int nr = sscanf(bf, "%d %s %c %d %d ",
+        nr = sscanf(bf, "%d %s %c %d %d ",
 			&spid, comm, &state, &ppid, &comm_ev.pid);
 	if (nr != 5) {
 		fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
@@ -198,7 +201,8 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	memcpy(comm_ev.comm, comm + 1, size);
 	size = ALIGN(size, sizeof(uint64_t));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
-	int ret = write(output, &comm_ev, comm_ev.header.size);
+
+	ret = write(output, &comm_ev, comm_ev.header.size);
 	if (ret < 0) {
 		perror("failed to write");
 		exit(-1);
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 2d65d9c12aad5..7f1255dcd2223 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,4 +1,5 @@
 #include "util/util.h"
+#include "builtin.h"
 
 #include <libelf.h>
 #include <gelf.h>
@@ -22,7 +23,7 @@ static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
-static int 		verbose;
+static int		verbose;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -60,10 +61,10 @@ typedef union event_union {
 } event_t;
 
 struct symbol {
-	struct rb_node rb_node;
-	uint64_t       start;
-	uint64_t       end;
-	char	       name[0];
+	struct rb_node		rb_node;
+	__u64			start;
+	__u64			end;
+	char			name[0];
 };
 
 static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
@@ -86,7 +87,7 @@ static void symbol__delete(struct symbol *self)
 
 static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 {
-	return fprintf(fp, " %lx-%lx %s\n",
+	return fprintf(fp, " %llx-%llx %s\n",
 		       self->start, self->end, self->name);
 }
 
@@ -147,10 +148,12 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 
 static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
 {
+	struct rb_node *n;
+
 	if (self == NULL)
 		return NULL;
 
-	struct rb_node *n = self->syms.rb_node;
+	n = self->syms.rb_node;
 
 	while (n) {
 		struct symbol *s = rb_entry(n, struct symbol, rb_node);
@@ -221,33 +224,42 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 
 static int dso__load(struct dso *self)
 {
-	int fd = open(self->name, O_RDONLY), err = -1;
+	Elf_Data *symstrs;
+	uint32_t nr_syms;
+	int fd, err = -1;
+	uint32_t index;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *syms;
+	GElf_Sym sym;
+	Elf_Scn *sec;
+	Elf *elf;
 
+
+	fd = open(self->name, O_RDONLY);
 	if (fd == -1)
 		return -1;
 
-	Elf *elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
 		fprintf(stderr, "%s: cannot read %s ELF file.\n",
 			__func__, self->name);
 		goto out_close;
 	}
 
-	GElf_Ehdr ehdr;
 	if (gelf_getehdr(elf, &ehdr) == NULL) {
 		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
 		goto out_elf_end;
 	}
 
-	GElf_Shdr shdr;
-	Elf_Scn *sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
 	if (sec == NULL)
 		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
 
 	if (sec == NULL)
 		goto out_elf_end;
 
-	Elf_Data *syms = elf_getdata(sec, NULL);
+	syms = elf_getdata(sec, NULL);
 	if (syms == NULL)
 		goto out_elf_end;
 
@@ -255,14 +267,12 @@ static int dso__load(struct dso *self)
 	if (sec == NULL)
 		goto out_elf_end;
 
-	Elf_Data *symstrs = elf_getdata(sec, NULL);
+	symstrs = elf_getdata(sec, NULL);
 	if (symstrs == NULL)
 		goto out_elf_end;
 
-	const uint32_t nr_syms = shdr.sh_size / shdr.sh_entsize;
+	nr_syms = shdr.sh_size / shdr.sh_entsize;
 
-	GElf_Sym sym;
-	uint32_t index;
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
 
@@ -342,7 +352,7 @@ static struct dso *dsos__findnew(const char *name)
 	return NULL;
 }
 
-void dsos__fprintf(FILE *fp)
+static void dsos__fprintf(FILE *fp)
 {
 	struct dso *pos;
 
@@ -365,7 +375,7 @@ static int hex(char ch)
  * While we find nice hex chars, build a long_val.
  * Return number of chars processed.
  */
-int hex2long(char *ptr, unsigned long *long_val)
+static int hex2long(char *ptr, unsigned long *long_val)
 {
 	const char *p = ptr;
 	*long_val = 0;
@@ -493,12 +503,6 @@ static struct map *map__new(struct mmap_event *event)
 	return NULL;
 }
 
-static size_t map__fprintf(struct map *self, FILE *fp)
-{
-	return fprintf(fp, " %lx-%lx %lx %s\n",
-		       self->start, self->end, self->pgoff, self->dso->name);
-}
-
 struct thread;
 
 static const char *thread__name(struct thread *self, char *bf, size_t size);
@@ -531,11 +535,6 @@ static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
 	return self;
 }
 
-void symhist__delete(struct symhist *self)
-{
-	free(self);
-}
-
 static void symhist__inc(struct symhist *self)
 {
 	++self->count;
@@ -608,6 +607,8 @@ static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
 	struct symhist *sh;
 
 	while (*p != NULL) {
+		uint64_t start;
+
 		parent = *p;
 		sh = rb_entry(parent, struct symhist, rb_node);
 
@@ -617,7 +618,7 @@ static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
 		}
 
 		/* Handle unresolved symbols too */
-		const uint64_t start = !sh->sym ? sh->ip : sh->sym->start;
+		start = !sh->sym ? sh->ip : sh->sym->start;
 
 		if (ip < start)
 			p = &(*p)->rb_left;
@@ -639,17 +640,6 @@ static int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
-size_t thread__maps_fprintf(struct thread *self, FILE *fp)
-{
-	struct map *pos;
-	size_t ret = 0;
-
-	list_for_each_entry(pos, &self->maps, node)
-		ret += map__fprintf(pos, fp);
-
-	return ret;
-}
-
 static size_t thread__fprintf(struct thread *self, FILE *fp)
 {
 	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
@@ -657,13 +647,14 @@ static size_t thread__fprintf(struct thread *self, FILE *fp)
 
 	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
 		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
+
 		ret += symhist__fprintf(pos, 0, fp);
 	}
 
 	return ret;
 }
 
-static struct rb_root threads = RB_ROOT;
+static struct rb_root threads;
 
 static struct thread *threads__findnew(pid_t pid)
 {
@@ -699,11 +690,11 @@ static void thread__insert_map(struct thread *self, struct map *map)
 
 static struct map *thread__find_map(struct thread *self, uint64_t ip)
 {
+	struct map *pos;
+
 	if (self == NULL)
 		return NULL;
 
-	struct map *pos;
-
 	list_for_each_entry(pos, &self->maps, node)
 		if (ip >= pos->start && ip <= pos->end)
 			return pos;
@@ -711,7 +702,7 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
-void threads__fprintf(FILE *fp)
+static void threads__fprintf(FILE *fp)
 {
 	struct rb_node *nd;
 	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
@@ -720,7 +711,7 @@ void threads__fprintf(FILE *fp)
 	}
 }
 
-static struct rb_root global_symhists = RB_ROOT;
+static struct rb_root global_symhists;
 
 static void threads__insert_symhist(struct symhist *sh)
 {
@@ -852,7 +843,7 @@ static int __cmd_report(void)
 				(void *)(long)(event->header.size),
 				event->header.misc,
 				event->ip.pid,
-				(void *)event->ip.ip);
+				(void *)(long)ip);
 		}
 
 		if (thread == NULL) {
@@ -866,9 +857,12 @@ static int __cmd_report(void)
 			level = 'k';
 			dso = kernel_dso;
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+			struct map *map;
+
 			show = SHOW_USER;
 			level = '.';
-			struct map *map = thread__find_map(thread, ip);
+
+			map = thread__find_map(thread, ip);
 			if (map != NULL) {
 				dso = map->dso;
 				ip -= map->start + map->pgoff;
@@ -896,9 +890,9 @@ static int __cmd_report(void)
 			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
 				(void *)(offset + head),
 				(void *)(long)(event->header.size),
-				(void *)event->mmap.start,
-				(void *)event->mmap.len,
-				(void *)event->mmap.pgoff,
+				(void *)(long)event->mmap.start,
+				(void *)(long)event->mmap.len,
+				(void *)(long)event->mmap.pgoff,
 				event->mmap.filename);
 		}
 		if (thread == NULL || map == NULL) {
@@ -964,6 +958,11 @@ static int __cmd_report(void)
 		return 0;
 	}
 
+	if (verbose >= 2) {
+		dsos__fprintf(stdout);
+		threads__fprintf(stdout);
+	}
+
 	threads__sort_symhists();
 	threads__symhists_fprintf(total, stdout);
 
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index e7cb9412212b5..ce661e2fa8ddd 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -30,6 +30,7 @@
  */
 
 #include "perf.h"
+#include "builtin.h"
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
@@ -108,7 +109,7 @@ static void create_perfstat_counter(int counter)
 	}
 }
 
-int do_perfstat(int argc, const char **argv)
+static int do_perfstat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 6b1c66f99e4de..a890872638c10 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -42,8 +42,8 @@
   * Released under the GPL v2. (and only v2, not any later version)
   */
 
-
 #include "perf.h"
+#include "builtin.h"
 #include "util/util.h"
 #include "util/util.h"
 #include "util/parse-options.h"
diff --git a/Documentation/perf_counter/util/abspath.c b/Documentation/perf_counter/util/abspath.c
index 649f34f83365d..61d33b81fc979 100644
--- a/Documentation/perf_counter/util/abspath.c
+++ b/Documentation/perf_counter/util/abspath.c
@@ -5,7 +5,7 @@
  * symlink to a directory, we do not want to say it is a directory when
  * dealing with tracked content in the working tree.
  */
-int is_directory(const char *path)
+static int is_directory(const char *path)
 {
 	struct stat st;
 	return (!stat(path, &st) && S_ISDIR(st.st_mode));
diff --git a/Documentation/perf_counter/util/cache.h b/Documentation/perf_counter/util/cache.h
index 71080512fa862..393d6146d13b3 100644
--- a/Documentation/perf_counter/util/cache.h
+++ b/Documentation/perf_counter/util/cache.h
@@ -104,6 +104,8 @@ char *strip_path_suffix(const char *path, const char *suffix);
 
 extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
 extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2)));
+/* perf_mkstemp() - create tmp file honoring TMPDIR variable */
+extern int perf_mkstemp(char *path, size_t len, const char *template);
 
 extern char *mksnpath(char *buf, size_t n, const char *fmt, ...)
 	__attribute__((format (printf, 3, 4)));
diff --git a/Documentation/perf_counter/util/util.h b/Documentation/perf_counter/util/util.h
index 36e40c38e0939..76590a16c2717 100644
--- a/Documentation/perf_counter/util/util.h
+++ b/Documentation/perf_counter/util/util.h
@@ -309,6 +309,8 @@ extern ssize_t xread(int fd, void *buf, size_t len);
 extern ssize_t xwrite(int fd, const void *buf, size_t len);
 extern int xdup(int fd);
 extern FILE *xfdopen(int fd, const char *mode);
+extern int xmkstemp(char *template);
+
 static inline size_t xsize_t(off_t len)
 {
 	return (size_t)len;
-- 
GitLab


From 23ac9cbed82b00ca3520bb81dbe9ea3b7a936a1b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 09:33:18 +0200
Subject: [PATCH 1351/2198] perf_counter tools: Rename output.perf to perf.data

output.perf is only output to perf-record - it's input to
perf-report. So change it to a more direction-neutral name.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Documentation/perf-record.txt | 4 ++--
 Documentation/perf_counter/Documentation/perf-report.txt | 4 ++--
 Documentation/perf_counter/builtin-record.c              | 2 +-
 Documentation/perf_counter/builtin-report.c              | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index d07700e35eb27..353db1bb98abf 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -3,7 +3,7 @@ perf-record(1)
 
 NAME
 ----
-perf-record - Run a command and record its profile into output.perf
+perf-record - Run a command and record its profile into perf.data
 
 SYNOPSIS
 --------
@@ -13,7 +13,7 @@ SYNOPSIS
 DESCRIPTION
 -----------
 This command runs a command and gathers a performance counter profile
-from it, into output.perf - without displaying anything.
+from it, into perf.data - without displaying anything.
 
 This file can then be inspected later on, using 'perf report'.
 
diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
index 64696a218105d..49efe16c95882 100644
--- a/Documentation/perf_counter/Documentation/perf-report.txt
+++ b/Documentation/perf_counter/Documentation/perf-report.txt
@@ -3,7 +3,7 @@ perf-report(1)
 
 NAME
 ----
-perf-report - Read output.perf (created by perf record) and display the profile
+perf-report - Read perf.data (created by perf record) and display the profile
 
 SYNOPSIS
 --------
@@ -19,7 +19,7 @@ OPTIONS
 -------
 -i::
 --input=::
-        Input file name. (default: output.perf)
+        Input file name. (default: perf.data)
 
 Configuration
 -------------
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 68abfdf71d399..431077a6fb772 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -19,7 +19,7 @@ static int			nr_cpus				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 16;
 static int			output;
-static const char		*output_name			= "output.perf";
+static const char		*output_name			= "perf.data";
 static int			group				= 0;
 static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 7f1255dcd2223..e2712cd063107 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -18,7 +18,7 @@
 #define SHOW_USER	2
 #define SHOW_HV		4
 
-static char		const *input_name = "output.perf";
+static char		const *input_name = "perf.data";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
-- 
GitLab


From a930d2c0d0a685ab955472b08baad041cc5edb4a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 09:50:13 +0200
Subject: [PATCH 1352/2198] perf_counter tools: Add built-in pager support

Add Git's pager.c (and sigchain) code. A command only
has to call setup_pager() to get paged interactive
output.

Non-interactive (redirected, command-piped, etc.) uses
are not affected.

Update perf-report to make use of this.

[ Impact: new feature ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile           |  4 +
 Documentation/perf_counter/builtin-report.c   |  3 +
 Documentation/perf_counter/util/environment.c |  8 ++
 Documentation/perf_counter/util/pager.c       | 99 +++++++++++++++++++
 Documentation/perf_counter/util/sigchain.c    | 52 ++++++++++
 Documentation/perf_counter/util/sigchain.h    | 11 +++
 6 files changed, 177 insertions(+)
 create mode 100644 Documentation/perf_counter/util/environment.c
 create mode 100644 Documentation/perf_counter/util/pager.c
 create mode 100644 Documentation/perf_counter/util/sigchain.c
 create mode 100644 Documentation/perf_counter/util/sigchain.h

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index efb05892db69d..51b13f989833e 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -297,11 +297,13 @@ LIB_H += util/util.h
 LIB_H += util/help.h
 LIB_H += util/strbuf.h
 LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
 LIB_OBJS += util/config.o
 LIB_OBJS += util/ctype.o
+LIB_OBJS += util/environment.o
 LIB_OBJS += util/exec_cmd.o
 LIB_OBJS += util/help.o
 LIB_OBJS += util/levenshtein.o
@@ -314,6 +316,8 @@ LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
 LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
+LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/pager.o
 
 BUILTIN_OBJS += builtin-help.o
 BUILTIN_OBJS += builtin-record.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e2712cd063107..9aef7c54483ec 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -7,6 +7,7 @@
 #include <ctype.h>
 
 #include "util/list.h"
+#include "util/cache.h"
 #include "util/rbtree.h"
 
 #include "perf.h"
@@ -992,5 +993,7 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	parse_options(argc, argv, options, report_usage, 0);
 
+	setup_pager();
+
 	return __cmd_report();
 }
diff --git a/Documentation/perf_counter/util/environment.c b/Documentation/perf_counter/util/environment.c
new file mode 100644
index 0000000000000..9b1c8199e7293
--- /dev/null
+++ b/Documentation/perf_counter/util/environment.c
@@ -0,0 +1,8 @@
+/*
+ * We put all the perf config variables in this same object
+ * file, so that programs can link against the config parser
+ * without having to link against all the rest of perf.
+ */
+#include "cache.h"
+
+const char *pager_program;
diff --git a/Documentation/perf_counter/util/pager.c b/Documentation/perf_counter/util/pager.c
new file mode 100644
index 0000000000000..a28bccae5458b
--- /dev/null
+++ b/Documentation/perf_counter/util/pager.c
@@ -0,0 +1,99 @@
+#include "cache.h"
+#include "run-command.h"
+#include "sigchain.h"
+
+/*
+ * This is split up from the rest of git so that we can do
+ * something different on Windows.
+ */
+
+static int spawned_pager;
+
+#ifndef __MINGW32__
+static void pager_preexec(void)
+{
+	/*
+	 * Work around bug in "less" by not starting it until we
+	 * have real input
+	 */
+	fd_set in;
+
+	FD_ZERO(&in);
+	FD_SET(0, &in);
+	select(1, &in, NULL, &in, NULL);
+
+	setenv("LESS", "FRSX", 0);
+}
+#endif
+
+static const char *pager_argv[] = { "sh", "-c", NULL, NULL };
+static struct child_process pager_process;
+
+static void wait_for_pager(void)
+{
+	fflush(stdout);
+	fflush(stderr);
+	/* signal EOF to pager */
+	close(1);
+	close(2);
+	finish_command(&pager_process);
+}
+
+static void wait_for_pager_signal(int signo)
+{
+	wait_for_pager();
+	sigchain_pop(signo);
+	raise(signo);
+}
+
+void setup_pager(void)
+{
+	const char *pager = getenv("PERF_PAGER");
+
+	if (!isatty(1))
+		return;
+	if (!pager) {
+		if (!pager_program)
+			perf_config(perf_default_config, NULL);
+		pager = pager_program;
+	}
+	if (!pager)
+		pager = getenv("PAGER");
+	if (!pager)
+		pager = "less";
+	else if (!*pager || !strcmp(pager, "cat"))
+		return;
+
+	spawned_pager = 1; /* means we are emitting to terminal */
+
+	/* spawn the pager */
+	pager_argv[2] = pager;
+	pager_process.argv = pager_argv;
+	pager_process.in = -1;
+#ifndef __MINGW32__
+	pager_process.preexec_cb = pager_preexec;
+#endif
+	if (start_command(&pager_process))
+		return;
+
+	/* original process continues, but writes to the pipe */
+	dup2(pager_process.in, 1);
+	if (isatty(2))
+		dup2(pager_process.in, 2);
+	close(pager_process.in);
+
+	/* this makes sure that the parent terminates after the pager */
+	sigchain_push_common(wait_for_pager_signal);
+	atexit(wait_for_pager);
+}
+
+int pager_in_use(void)
+{
+	const char *env;
+
+	if (spawned_pager)
+		return 1;
+
+	env = getenv("PERF_PAGER_IN_USE");
+	return env ? perf_config_bool("PERF_PAGER_IN_USE", env) : 0;
+}
diff --git a/Documentation/perf_counter/util/sigchain.c b/Documentation/perf_counter/util/sigchain.c
new file mode 100644
index 0000000000000..1118b99e57d33
--- /dev/null
+++ b/Documentation/perf_counter/util/sigchain.c
@@ -0,0 +1,52 @@
+#include "sigchain.h"
+#include "cache.h"
+
+#define SIGCHAIN_MAX_SIGNALS 32
+
+struct sigchain_signal {
+	sigchain_fun *old;
+	int n;
+	int alloc;
+};
+static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS];
+
+static void check_signum(int sig)
+{
+	if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS)
+		die("BUG: signal out of range: %d", sig);
+}
+
+int sigchain_push(int sig, sigchain_fun f)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+
+	ALLOC_GROW(s->old, s->n + 1, s->alloc);
+	s->old[s->n] = signal(sig, f);
+	if (s->old[s->n] == SIG_ERR)
+		return -1;
+	s->n++;
+	return 0;
+}
+
+int sigchain_pop(int sig)
+{
+	struct sigchain_signal *s = signals + sig;
+	check_signum(sig);
+	if (s->n < 1)
+		return 0;
+
+	if (signal(sig, s->old[s->n - 1]) == SIG_ERR)
+		return -1;
+	s->n--;
+	return 0;
+}
+
+void sigchain_push_common(sigchain_fun f)
+{
+	sigchain_push(SIGINT, f);
+	sigchain_push(SIGHUP, f);
+	sigchain_push(SIGTERM, f);
+	sigchain_push(SIGQUIT, f);
+	sigchain_push(SIGPIPE, f);
+}
diff --git a/Documentation/perf_counter/util/sigchain.h b/Documentation/perf_counter/util/sigchain.h
new file mode 100644
index 0000000000000..618083bce0c66
--- /dev/null
+++ b/Documentation/perf_counter/util/sigchain.h
@@ -0,0 +1,11 @@
+#ifndef SIGCHAIN_H
+#define SIGCHAIN_H
+
+typedef void (*sigchain_fun)(int);
+
+int sigchain_push(int sig, sigchain_fun f);
+int sigchain_pop(int sig);
+
+void sigchain_push_common(sigchain_fun f);
+
+#endif /* SIGCHAIN_H */
-- 
GitLab


From ef65b2a0b3a2f82850144df6e6a7796f6d66da6b Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Wed, 27 May 2009 10:10:51 +0200
Subject: [PATCH 1353/2198] perf record: Fix the profiling of existing pid or
 whole box

Perf record bails if no command argument is provided, so you can't use
naked -a or -p to profile a running task or the whole box.

Allow foreground profiling of an existing pid or the entire system.

[ Impact: fix command option handling bug ]

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 431077a6fb772..4a068664a32aa 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -354,7 +354,7 @@ static int __cmd_record(int argc, const char **argv)
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
-	if (target_pid == -1) {
+	if (target_pid == -1 && argc) {
 		pid = fork();
 		if (pid < 0)
 			perror("failed to fork");
@@ -430,7 +430,7 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 	create_events_help(events_help_msg);
 
 	argc = parse_options(argc, argv, options, record_usage, 0);
-	if (!argc)
+	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
 
 	if (!nr_counters) {
-- 
GitLab


From d716fba49c7445ec87c3f045c59624fac03ee3f2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 13:19:59 +0200
Subject: [PATCH 1354/2198] perf report: Remove <ctype.h> include

Pekka reported build failure in builtin-report.c:

    CC builtin-report.o
    In file included from builtin-report.c:7:
    /usr/include/ctype.h:102: error: expected expression before token

And observed:

| Removing #include <ctype.h> from builtin-report.c makes the problem
| go away. I am running Ubuntu 9.04 that has gcc 4.3.3 and libc 2.9.

Reported-by: Pekka J Enberg <penberg@cs.helsinki.fi>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9aef7c54483ec..6265bedcd9370 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -4,7 +4,6 @@
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
-#include <ctype.h>
 
 #include "util/list.h"
 #include "util/cache.h"
-- 
GitLab


From ba396a6c104682dfe5c8b4fbbf5974d5ac9f3687 Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Wed, 27 May 2009 14:17:08 +0200
Subject: [PATCH 1355/2198] block: fix oops with block tag queueing

commit e8939a50466fd963eb1ba9118c34b9ffb7ff6aa6
Author: Tejun Heo <tj@kernel.org>
Date:   Fri May 8 11:54:16 2009 +0900

    block: implement and enforce request peek/start/fetch

Added a BUG_ON(blk_queued_rq(req)) to the top of blk_finish_req().
Unfortunately, this checks whether req->queuelist is empty.  This list
is doing double duty both as the queue list and the tag list, so tagged
requests come in here with this not empty and boom (the tag list is
emptied by blk_queue_end_tag() lower down).

Fix this by moving the BUG_ON to below the end tag we also seem
vulnerable to this in blk_requeue_request() as well.  I think all uses
of blk_queued_rq() need auditing because the check is clearly wrong in
the tagged case.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 7a4c40184a64c..8b3b74e691841 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -956,8 +956,6 @@ EXPORT_SYMBOL(blk_make_request);
  */
 void blk_requeue_request(struct request_queue *q, struct request *rq)
 {
-	BUG_ON(blk_queued_rq(rq));
-
 	blk_delete_timer(rq);
 	blk_clear_rq_complete(rq);
 	trace_block_rq_requeue(q, rq);
@@ -965,6 +963,8 @@ void blk_requeue_request(struct request_queue *q, struct request *rq)
 	if (blk_rq_tagged(rq))
 		blk_queue_end_tag(q, rq);
 
+	BUG_ON(blk_queued_rq(rq));
+
 	elv_requeue_request(q, rq);
 }
 EXPORT_SYMBOL(blk_requeue_request);
@@ -2042,11 +2042,11 @@ static bool blk_update_bidi_request(struct request *rq, int error,
  */
 static void blk_finish_request(struct request *req, int error)
 {
-	BUG_ON(blk_queued_rq(req));
-
 	if (blk_rq_tagged(req))
 		blk_queue_end_tag(req->q, req);
 
+	BUG_ON(blk_queued_rq(req));
+
 	if (unlikely(laptop_mode) && blk_fs_request(req))
 		laptop_io_completion();
 
-- 
GitLab


From 3c4198e874cde694f5ea1463706873e7907bdb18 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Wed, 27 May 2009 14:50:02 +0200
Subject: [PATCH 1356/2198] block: fix no diskstat problem

The commit below in 2.6-block/for-2.6.31 causes no diskstat problem
because the blk_discard_rq() check was added with '&&'.
It should be 'blk_fs_request() || blk_discard_rq()'.
This patch does it and fixes the no diskstat problem.
Please review and apply.

------ /proc/diskstat without this patch -------------------------------------
   8       0 sda 0 0 0 0 0 0 0 0 0 0 0
------------------------------------------------------------------------------

----- /proc/diskstat with this patch applied ---------------------------------
   8       0 sda 4186 303 373621 61600 9578 3859 107468 169479 2 89755 231059
------------------------------------------------------------------------------

--------------------------------------------------------------------------
commit c69d48540c201394d08cb4d48b905e001313d9b8
Author: Jens Axboe <jens.axboe@oracle.com>
Date:   Fri Apr 24 08:12:19 2009 +0200

    block: include discard requests in IO accounting

    We currently don't do merging on discard requests, but we potentially
    could. If we do, then we need to include discard requests in the IO
    accounting, or merging would end up decrementing in_flight IO counters
    for an IO which never incremented them.

    So enable accounting for discard requests.

<snip>

 static inline int blk_do_io_stat(struct request *rq)
 {
-       return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq);
+       return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) &&
+               blk_discard_rq(rq);
 }
--------------------------------------------------------------------------

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/blk.h b/block/blk.h
index c863ec2281e05..3fae6add54301 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -156,12 +156,12 @@ static inline int blk_cpu_to_group(int cpu)
  *
  *	a) it's attached to a gendisk, and
  *	b) the queue had IO stats enabled when this request was started, and
- *	c) it's a file system request
+ *	c) it's a file system request or a discard request
  */
 static inline int blk_do_io_stat(struct request *rq)
 {
-	return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) &&
-		blk_discard_rq(rq);
+	return rq->rq_disk && blk_rq_io_stat(rq) &&
+	       (blk_fs_request(rq) || blk_discard_rq(rq));
 }
 
 #endif
-- 
GitLab


From b7a16eac5e679fb5f531b9eeff7db7952303e77d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 13:35:35 +0200
Subject: [PATCH 1357/2198] perf_counter: tools: /usr/lib/debug%s.debug support

Some distros seem to store debuginfo in weird places.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 94 +++++++++++++++++----
 1 file changed, 76 insertions(+), 18 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6265bedcd9370..a9ff49a4edea3 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -190,7 +190,8 @@ static inline int elf_sym__is_function(const GElf_Sym *sym)
 {
 	return elf_sym__type(sym) == STT_FUNC &&
 	       sym->st_name != 0 &&
-	       sym->st_shndx != SHN_UNDEF;
+	       sym->st_shndx != SHN_UNDEF &&
+	       sym->st_size != 0;
 }
 
 static inline const char *elf_sym__name(const GElf_Sym *sym,
@@ -222,11 +223,11 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return sec;
 }
 
-static int dso__load(struct dso *self)
+static int dso__load_sym(struct dso *self, int fd, char *name)
 {
 	Elf_Data *symstrs;
 	uint32_t nr_syms;
-	int fd, err = -1;
+	int err = -1;
 	uint32_t index;
 	GElf_Ehdr ehdr;
 	GElf_Shdr shdr;
@@ -234,16 +235,12 @@ static int dso__load(struct dso *self)
 	GElf_Sym sym;
 	Elf_Scn *sec;
 	Elf *elf;
-
-
-	fd = open(self->name, O_RDONLY);
-	if (fd == -1)
-		return -1;
+	int nr = 0;
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
 		fprintf(stderr, "%s: cannot read %s ELF file.\n",
-			__func__, self->name);
+			__func__, name);
 		goto out_close;
 	}
 
@@ -292,16 +289,63 @@ static int dso__load(struct dso *self)
 			goto out_elf_end;
 
 		dso__insert_symbol(self, f);
+
+		nr++;
 	}
 
-	err = 0;
+	err = nr;
 out_elf_end:
 	elf_end(elf);
 out_close:
-	close(fd);
 	return err;
 }
 
+static int dso__load(struct dso *self)
+{
+	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
+	char *name = malloc(size);
+	int variant = 0;
+	int ret = -1;
+	int fd;
+
+	if (!name)
+		return -1;
+
+more:
+	do {
+		switch (variant) {
+		case 0: /* Fedora */
+			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+			break;
+		case 1: /* Ubuntu */
+			snprintf(name, size, "/usr/lib/debug%s", self->name);
+			break;
+		case 2: /* Sane people */
+			snprintf(name, size, "%s", self->name);
+			break;
+
+		default:
+			goto out;
+		}
+		variant++;
+
+		fd = open(name, O_RDONLY);
+	} while (fd < 0);
+
+	ret = dso__load_sym(self, fd, name);
+	close(fd);
+
+	/*
+	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
+	 */
+	if (!ret)
+		goto more;
+
+out:
+	free(name);
+	return ret;
+}
+
 static size_t dso__fprintf(struct dso *self, FILE *fp)
 {
 	size_t ret = fprintf(fp, "dso: %s\n", self->name);
@@ -336,11 +380,23 @@ static struct dso *dsos__find(const char *name)
 static struct dso *dsos__findnew(const char *name)
 {
 	struct dso *dso = dsos__find(name);
+	int nr;
 
 	if (dso == NULL) {
 		dso = dso__new(name);
-		if (dso != NULL && dso__load(dso) < 0)
+		if (!dso)
+			goto out_delete_dso;
+
+		nr = dso__load(dso);
+		if (nr < 0) {
+			fprintf(stderr, "Failed to open: %s\n", name);
 			goto out_delete_dso;
+		}
+		if (!nr) {
+			fprintf(stderr,
+		"Failed to find debug symbols for: %s, maybe install a debug package?\n",
+					name);
+		}
 
 		dsos__add(dso);
 	}
@@ -547,9 +603,9 @@ symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
 	size_t ret;
 
 	if (total_samples)
-		ret = fprintf(fp, "%5.2f", (self->count * 100.0) / total_samples);
+		ret = fprintf(fp, "%5.2f%% ", (self->count * 100.0) / total_samples);
 	else
-		ret = fprintf(fp, "%12d", self->count);
+		ret = fprintf(fp, "%12d ", self->count);
 
 	ret += fprintf(fp, "%14s [%c] ",
 		       thread__name(self->thread, bf, sizeof(bf)),
@@ -922,10 +978,12 @@ static int __cmd_report(void)
 	}
 	default: {
 broken_event:
-		fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->header.type);
+		if (dump_trace)
+			fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
+					(void *)(offset + head),
+					(void *)(long)(event->header.size),
+					event->header.type);
+
 		total_unknown++;
 
 		/*
-- 
GitLab


From 450aaa2b2a1b006870ba68251fbb40b2387caade Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:23 +0200
Subject: [PATCH 1358/2198] perf_counter: tools: report: Add vmlinux support

Allow to use vmlinux instead of kallsyms.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182100.740018486@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 37 ++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a9ff49a4edea3..3e87cbd3045a4 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -19,6 +19,7 @@
 #define SHOW_HV		4
 
 static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
@@ -532,6 +533,39 @@ static int load_kallsyms(void)
 	return -1;
 }
 
+static int load_kernel(void)
+{
+	int fd, nr;
+
+	if (!vmlinux)
+		goto kallsyms;
+
+	fd = open(vmlinux, O_RDONLY);
+	if (fd < 0)
+		goto kallsyms;
+
+	kernel_dso = dso__new("[kernel]");
+	if (!kernel_dso)
+		goto fail_open;
+
+	nr = dso__load_sym(kernel_dso, fd, vmlinux);
+
+	if (nr <= 0)
+		goto fail_load;
+
+	dsos__add(kernel_dso);
+	close(fd);
+
+	return 0;
+
+fail_load:
+	dso__delete(kernel_dso);
+fail_open:
+	close(fd);
+kallsyms:
+	return load_kallsyms();
+}
+
 struct map {
 	struct list_head node;
 	uint64_t	 start;
@@ -850,7 +884,7 @@ static int __cmd_report(void)
 		exit(0);
 	}
 
-	if (load_kallsyms() < 0) {
+	if (load_kernel() < 0) {
 		perror("failed to open kallsyms");
 		return EXIT_FAILURE;
 	}
@@ -1039,6 +1073,7 @@ static const struct option options[] = {
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
 	OPT_END()
 };
 
-- 
GitLab


From e7fb08b1d06a6b37263c765205de5614a2273aeb Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:24 +0200
Subject: [PATCH 1359/2198] perf_counter: tools: report: Rework histogram code

In preparation for configurable sorting, rework the histgram code a bit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182100.796410098@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 294 ++++++++++----------
 1 file changed, 143 insertions(+), 151 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 3e87cbd3045a4..276256439b784 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -597,71 +597,9 @@ struct thread;
 
 static const char *thread__name(struct thread *self, char *bf, size_t size);
 
-struct symhist {
-	struct rb_node	 rb_node;
-	struct dso	 *dso;
-	struct symbol	 *sym;
-	struct thread	 *thread;
-	uint64_t	 ip;
-	uint32_t	 count;
-	char		 level;
-};
-
-static struct symhist *symhist__new(struct symbol *sym, uint64_t ip,
-				    struct thread *thread, struct dso *dso,
-				    char level)
-{
-	struct symhist *self = malloc(sizeof(*self));
-
-	if (self != NULL) {
-		self->sym    = sym;
-		self->thread = thread;
-		self->ip     = ip;
-		self->dso    = dso;
-		self->level  = level;
-		self->count  = 1;
-	}
-
-	return self;
-}
-
-static void symhist__inc(struct symhist *self)
-{
-	++self->count;
-}
-
-static size_t
-symhist__fprintf(struct symhist *self, uint64_t total_samples, FILE *fp)
-{
-	char bf[32];
-	size_t ret;
-
-	if (total_samples)
-		ret = fprintf(fp, "%5.2f%% ", (self->count * 100.0) / total_samples);
-	else
-		ret = fprintf(fp, "%12d ", self->count);
-
-	ret += fprintf(fp, "%14s [%c] ",
-		       thread__name(self->thread, bf, sizeof(bf)),
-		       self->level);
-
-	if (verbose)
-		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
-
-	if (self->level != '.')
-		ret += fprintf(fp, "%s\n",
-			       self->sym ? self->sym->name : "<unknown>");
-	else
-		ret += fprintf(fp, "%s: %s\n",
-			       self->dso ? self->dso->name : "<unknown>",
-			       self->sym ? self->sym->name : "<unknown>");
-	return ret;
-}
-
 struct thread {
 	struct rb_node	 rb_node;
 	struct list_head maps;
-	struct rb_root	 symhists;
 	pid_t		 pid;
 	char		 *comm;
 };
@@ -683,67 +621,17 @@ static struct thread *thread__new(pid_t pid)
 		self->pid = pid;
 		self->comm = NULL;
 		INIT_LIST_HEAD(&self->maps);
-		self->symhists = RB_ROOT;
 	}
 
 	return self;
 }
 
-static int thread__symbol_incnew(struct thread *self, struct symbol *sym,
-				 uint64_t ip, struct dso *dso, char level)
-{
-	struct rb_node **p = &self->symhists.rb_node;
-	struct rb_node *parent = NULL;
-	struct symhist *sh;
-
-	while (*p != NULL) {
-		uint64_t start;
-
-		parent = *p;
-		sh = rb_entry(parent, struct symhist, rb_node);
-
-		if (sh->sym == sym || ip == sh->ip) {
-			symhist__inc(sh);
-			return 0;
-		}
-
-		/* Handle unresolved symbols too */
-		start = !sh->sym ? sh->ip : sh->sym->start;
-
-		if (ip < start)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-
-	sh = symhist__new(sym, ip, self, dso, level);
-	if (sh == NULL)
-		return -ENOMEM;
-	rb_link_node(&sh->rb_node, parent, p);
-	rb_insert_color(&sh->rb_node, &self->symhists);
-	return 0;
-}
-
 static int thread__set_comm(struct thread *self, const char *comm)
 {
 	self->comm = strdup(comm);
 	return self->comm ? 0 : -ENOMEM;
 }
 
-static size_t thread__fprintf(struct thread *self, FILE *fp)
-{
-	int ret = fprintf(fp, "thread: %d %s\n", self->pid, self->comm);
-	struct rb_node *nd;
-
-	for (nd = rb_first(&self->symhists); nd; nd = rb_next(nd)) {
-		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
-
-		ret += symhist__fprintf(pos, 0, fp);
-	}
-
-	return ret;
-}
-
 static struct rb_root threads;
 
 static struct thread *threads__findnew(pid_t pid)
@@ -792,70 +680,172 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
-static void threads__fprintf(FILE *fp)
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint64_t	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
+	int cmp = right->thread->pid - left->thread->pid;
+
+	if (cmp)
+		return cmp;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, uint64_t ip, char level)
 {
-	struct rb_node *nd;
-	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
-		struct thread *pos = rb_entry(nd, struct thread, rb_node);
-		thread__fprintf(pos, fp);
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			he->count++;
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
 	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
 }
 
-static struct rb_root global_symhists;
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	char bf[32];
+	size_t ret;
+
+	if (total_samples) {
+		ret = fprintf(fp, "%5.2f%% ",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
 
-static void threads__insert_symhist(struct symhist *sh)
+	ret += fprintf(fp, "%14s [%c] ",
+		       thread__name(self->thread, bf, sizeof(bf)),
+		       self->level);
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
+
+	if (self->level != '.')
+		ret += fprintf(fp, "%s\n",
+			       self->sym ? self->sym->name : "<unknown>");
+	else
+		ret += fprintf(fp, "%s: %s\n",
+			       self->dso ? self->dso->name : "<unknown>",
+			       self->sym ? self->sym->name : "<unknown>");
+	return ret;
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
 {
-	struct rb_node **p = &global_symhists.rb_node;
+	struct rb_node **p = &output_hists.rb_node;
 	struct rb_node *parent = NULL;
-	struct symhist *iter;
+	struct hist_entry *iter;
 
 	while (*p != NULL) {
 		parent = *p;
-		iter = rb_entry(parent, struct symhist, rb_node);
+		iter = rb_entry(parent, struct hist_entry, rb_node);
 
-		/* Reverse order */
-		if (sh->count > iter->count)
+		if (he->count > iter->count)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
 	}
 
-	rb_link_node(&sh->rb_node, parent, p);
-	rb_insert_color(&sh->rb_node, &global_symhists);
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
 }
 
-static void threads__sort_symhists(void)
+static void output__resort(void)
 {
-	struct rb_node *nd;
-
-	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
-		struct thread *thread = rb_entry(nd, struct thread, rb_node);
-		struct rb_node *next = rb_first(&thread->symhists);
+	struct rb_node *next = rb_first(&hist);
+	struct hist_entry *n;
 
-		while (next) {
-			struct symhist *n = rb_entry(next, struct symhist,
-						     rb_node);
-			next = rb_next(&n->rb_node);
-			rb_erase(&n->rb_node, &thread->symhists);
-			threads__insert_symhist(n);
-		}
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
 
+		rb_erase(&n->rb_node, &hist);
+		output__insert_entry(n);
 	}
 }
 
-static size_t threads__symhists_fprintf(uint64_t total_samples, FILE *fp)
+static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 {
+	struct hist_entry *pos;
 	struct rb_node *nd;
 	size_t ret = 0;
 
-	for (nd = rb_first(&global_symhists); nd; nd = rb_next(nd)) {
-		struct symhist *pos = rb_entry(nd, struct symhist, rb_node);
-		ret += symhist__fprintf(pos, total_samples, fp);
+	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct hist_entry, rb_node);
+		ret += hist_entry__fprintf(fp, pos, total_samples);
 	}
 
 	return ret;
 }
 
+
 static int __cmd_report(void)
 {
 	unsigned long offset = 0;
@@ -926,6 +916,7 @@ static int __cmd_report(void)
 		struct dso *dso = NULL;
 		struct thread *thread = threads__findnew(event->ip.pid);
 		uint64_t ip = event->ip.ip;
+		struct map *map = NULL;
 
 		if (dump_trace) {
 			fprintf(stderr, "%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
@@ -945,9 +936,10 @@ static int __cmd_report(void)
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
 			show = SHOW_KERNEL;
 			level = 'k';
+
 			dso = kernel_dso;
+
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
-			struct map *map;
 
 			show = SHOW_USER;
 			level = '.';
@@ -957,6 +949,7 @@ static int __cmd_report(void)
 				dso = map->dso;
 				ip -= map->start + map->pgoff;
 			}
+
 		} else {
 			show = SHOW_HV;
 			level = 'H';
@@ -965,8 +958,9 @@ static int __cmd_report(void)
 		if (show & show_mask) {
 			struct symbol *sym = dso__find_symbol(dso, ip);
 
-			if (thread__symbol_incnew(thread, sym, ip, dso, level)) {
-				fprintf(stderr, "problem incrementing symbol count, bailing out\n");
+			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+				fprintf(stderr,
+		"problem incrementing symbol count, bailing out\n");
 				goto done;
 			}
 		}
@@ -1050,13 +1044,11 @@ static int __cmd_report(void)
 		return 0;
 	}
 
-	if (verbose >= 2) {
+	if (verbose >= 2)
 		dsos__fprintf(stdout);
-		threads__fprintf(stdout);
-	}
 
-	threads__sort_symhists();
-	threads__symhists_fprintf(total, stdout);
+	output__resort();
+	output__fprintf(stdout, total);
 
 	return rc;
 }
-- 
GitLab


From 1aa167382323eeeeb38368cab85cf17979793cbe Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:25 +0200
Subject: [PATCH 1360/2198] perf_counter: tools: report: Dynamic sort/print
 bits

Make the sorting and printing dynamic.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182100.921953817@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 141 +++++++++++++++-----
 1 file changed, 107 insertions(+), 34 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 276256439b784..856186fd2bd4c 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -699,14 +699,41 @@ struct hist_entry {
 	uint32_t	 count;
 };
 
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
 static int64_t
-hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	uint64_t ip_l, ip_r;
-	int cmp = right->thread->pid - left->thread->pid;
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	char bf[32];
+
+	return fprintf(fp, "%14s ",
+			thread__name(self->thread, bf, sizeof(bf)));
+}
 
-	if (cmp)
-		return cmp;
+static struct sort_entry sort_thread = {
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -717,6 +744,79 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	ret += fprintf(fp, "[%c] ", self->level);
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
+
+	if (self->level != '.')
+		ret += fprintf(fp, "%s ",
+			       self->sym ? self->sym->name : "<unknown>");
+	else
+		ret += fprintf(fp, "%s: %s ",
+			       self->dso ? self->dso->name : "<unknown>",
+			       self->sym ? self->sym->name : "<unknown>");
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static void setup_sorting(void)
+{
+	list_add_tail(&sort_thread.list, &hist_entry__sort_list);
+	list_add_tail(&sort_sym.list, &hist_entry__sort_list);
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	struct sort_entry *se;
+	size_t ret;
+
+	if (total_samples) {
+		ret = fprintf(fp, "%5.2f%% ",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
+
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		ret += se->print(fp, self);
+
+	ret += fprintf(fp, "\n");
+
+	return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		struct symbol *sym, uint64_t ip, char level)
@@ -762,35 +862,6 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	return 0;
 }
 
-static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
-{
-	char bf[32];
-	size_t ret;
-
-	if (total_samples) {
-		ret = fprintf(fp, "%5.2f%% ",
-				(self->count * 100.0) / total_samples);
-	} else
-		ret = fprintf(fp, "%12d ", self->count);
-
-	ret += fprintf(fp, "%14s [%c] ",
-		       thread__name(self->thread, bf, sizeof(bf)),
-		       self->level);
-
-	if (verbose)
-		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
-
-	if (self->level != '.')
-		ret += fprintf(fp, "%s\n",
-			       self->sym ? self->sym->name : "<unknown>");
-	else
-		ret += fprintf(fp, "%s: %s\n",
-			       self->dso ? self->dso->name : "<unknown>",
-			       self->sym ? self->sym->name : "<unknown>");
-	return ret;
-}
-
 /*
  * reverse the map, sort on count.
  */
@@ -1077,6 +1148,8 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	parse_options(argc, argv, options, report_usage, 0);
 
+	setup_sorting();
+
 	setup_pager();
 
 	return __cmd_report();
-- 
GitLab


From 37f440cba299bb479cf45d12eef923f0979dbcaf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:26 +0200
Subject: [PATCH 1361/2198] pref_counter: tools: report: Add --sort option

option parsing for dynamic sorting.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.041817692@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 43 ++++++++++++++++++++-
 1 file changed, 41 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 856186fd2bd4c..982abce0e7c15 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -20,6 +20,7 @@
 
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
+static char		*sort_order = "pid,symbol";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
@@ -770,12 +771,49 @@ static struct sort_entry sort_sym = {
 	.print	= sort__sym_print,
 };
 
+struct sort_dimension {
+	char *name;
+	struct sort_entry *entry;
+	int taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
 static LIST_HEAD(hist_entry__sort_list);
 
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strcmp(tok, sd->name))
+			continue;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
 static void setup_sorting(void)
 {
-	list_add_tail(&sort_thread.list, &hist_entry__sort_list);
-	list_add_tail(&sort_sym.list, &hist_entry__sort_list);
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp))
+		sort_dimension__add(tok);
+
+	free(str);
 }
 
 static int64_t
@@ -1137,6 +1175,7 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_STRING('s', "sort", &sort_order, "foo", "bar"),
 	OPT_END()
 };
 
-- 
GitLab


From 992444b173f35997f96f5cbb214f0de81d1b97ff Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:27 +0200
Subject: [PATCH 1362/2198] perf_counter: tools: report: Add comm sorting

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.129302022@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 30 +++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 982abce0e7c15..a634022bae079 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -731,6 +731,35 @@ static struct sort_entry sort_thread = {
 	.print	= sort__thread_print,
 };
 
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%20s ", self->thread->comm ?: "<unknown>");
+}
+
+static struct sort_entry sort_comm = {
+	.cmp	= sort__comm_cmp,
+	.print	= sort__comm_print,
+};
+
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -779,6 +808,7 @@ struct sort_dimension {
 
 static struct sort_dimension sort_dimensions[] = {
 	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
 	{ .name = "symbol",	.entry = &sort_sym,	},
 };
 
-- 
GitLab


From 55e5ec41a9de46b6ca06031f4fbdfdfc76dc24dc Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 27 May 2009 20:20:28 +0200
Subject: [PATCH 1363/2198] pref_counter: tools: report: Add dso sorting

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.229504802@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 30 +++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a634022bae079..30e12c7f7108b 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -760,6 +760,35 @@ static struct sort_entry sort_comm = {
 	.print	= sort__comm_print,
 };
 
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%64s ", self->dso ? self->dso->name : "<unknown>");
+}
+
+static struct sort_entry sort_dso = {
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -809,6 +838,7 @@ struct sort_dimension {
 static struct sort_dimension sort_dimensions[] = {
 	{ .name = "pid",	.entry = &sort_thread,	},
 	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
 	{ .name = "symbol",	.entry = &sort_sym,	},
 };
 
-- 
GitLab


From 2d65537ee7cd4a0818ea80a97ab7932368fff5cd Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 21:36:22 +0200
Subject: [PATCH 1364/2198] pref_counter: tools: report: Add header printout &
 prettify

Old default output:

 3.12%    perf-report [.] ./perf-report:       dsos__find
 2.44%    perf-report [k] kernel:              kallsyms_expand_symbol
 2.28%          :4483 [.] <unknown>:           <unknown>
 2.05%          :4174 [k] kernel:              _spin_lock_irqsave
 2.01%    perf-report [k] kernel:              vsnprintf
 1.92%    perf-report [k] kernel:              format_decode
 1.92%          :4438 [k] kernel:              _spin_lock

New default output:

 #
 # Overhead          Command       File: Symbol
 # ........          .......       ............
 #
      6.54%             perf  [k]  kernel: kallsyms_expand_symbol
      6.26%             perf  [.]  /home/mingo/tip/Documentation/perf_counter/perf: dso__insert_symbol
      4.76%             perf  [.]  /home/mingo/tip/Documentation/perf_counter/perf: hex2long
      4.55%             perf  [k]  kernel: number
      4.48%             perf  [k]  kernel: format_decode
      4.09%             perf  [k]  kernel: vsnprintf

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <20090527182101.229504802@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 40 +++++++++++++++------
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 30e12c7f7108b..6df95c2698c62 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -708,6 +708,7 @@ struct sort_entry {
 	struct list_head list;
 
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	size_t (*print_header)(FILE *fp);
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
@@ -722,7 +723,7 @@ sort__thread_print(FILE *fp, struct hist_entry *self)
 {
 	char bf[32];
 
-	return fprintf(fp, "%14s ",
+	return fprintf(fp, " %16s",
 			thread__name(self->thread, bf, sizeof(bf)));
 }
 
@@ -752,7 +753,7 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "%20s ", self->thread->comm ?: "<unknown>");
+	return fprintf(fp, " %16s", self->thread->comm ?: "<unknown>");
 }
 
 static struct sort_entry sort_comm = {
@@ -781,7 +782,7 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "%64s ", self->dso ? self->dso->name : "<unknown>");
+	return fprintf(fp, " %64s", self->dso ? self->dso->name : "<unknown>");
 }
 
 static struct sort_entry sort_dso = {
@@ -803,21 +804,33 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
+static size_t sort__sym_print_header(FILE *fp)
+{
+	size_t ret = 0;
+
+	ret += fprintf(fp, "#\n");
+	ret += fprintf(fp, "# Overhead          Command       File: Symbol\n");
+	ret += fprintf(fp, "# ........          .......       ............\n");
+	ret += fprintf(fp, "#\n");
+
+	return ret;
+}
+
 static size_t
 sort__sym_print(FILE *fp, struct hist_entry *self)
 {
 	size_t ret = 0;
 
-	ret += fprintf(fp, "[%c] ", self->level);
+	ret += fprintf(fp, "  [%c] ", self->level);
 
 	if (verbose)
-		ret += fprintf(fp, "%#018llx ", (unsigned long long)self->ip);
+		ret += fprintf(fp, " %#018llx", (unsigned long long)self->ip);
 
 	if (self->level != '.')
-		ret += fprintf(fp, "%s ",
+		ret += fprintf(fp, " kernel: %s",
 			       self->sym ? self->sym->name : "<unknown>");
 	else
-		ret += fprintf(fp, "%s: %s ",
+		ret += fprintf(fp, " %s: %s",
 			       self->dso ? self->dso->name : "<unknown>",
 			       self->sym ? self->sym->name : "<unknown>");
 
@@ -825,8 +838,9 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_sym = {
-	.cmp	= sort__sym_cmp,
-	.print	= sort__sym_print,
+	.cmp		= sort__sym_cmp,
+	.print_header	= sort__sym_print_header,
+	.print		= sort__sym_print,
 };
 
 struct sort_dimension {
@@ -898,7 +912,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	size_t ret;
 
 	if (total_samples) {
-		ret = fprintf(fp, "%5.2f%% ",
+		ret = fprintf(fp, "    %5.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
@@ -1003,9 +1017,15 @@ static void output__resort(void)
 static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 {
 	struct hist_entry *pos;
+	struct sort_entry *se;
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		if (se->print_header)
+			ret += se->print_header(fp);
+	}
+
 	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct hist_entry, rb_node);
 		ret += hist_entry__fprintf(fp, pos, total_samples);
-- 
GitLab


From 55717314c4e3a5180a54228a2f97e50f3496de4c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 27 May 2009 22:13:17 +0200
Subject: [PATCH 1365/2198] pref_counter: tools: report: Robustify in case of
 weird events

This error condition:

  aldebaran:~/linux/linux/Documentation/perf_counter> perf report
  dso__load_sym: cannot get elf header.
  failed to open: /etc/ld.so.cache
  problem processing PERF_EVENT_MMAP, bailing out

caused the profile to be very short - as the error was at the beginning
of the file and we bailed out completely.

Be more permissive and consider the event broken instead.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6df95c2698c62..5993c129d736e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1117,9 +1117,9 @@ static int __cmd_report(void)
 		}
 
 		if (thread == NULL) {
-			fprintf(stderr, "problem processing %d event, bailing out\n",
+			fprintf(stderr, "problem processing %d event, skipping it.\n",
 				event->header.type);
-			goto done;
+			goto broken_event;
 		}
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
@@ -1149,8 +1149,8 @@ static int __cmd_report(void)
 
 			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
 				fprintf(stderr,
-		"problem incrementing symbol count, bailing out\n");
-				goto done;
+		"problem incrementing symbol count, skipping event\n");
+				goto broken_event;
 			}
 		}
 		total++;
@@ -1169,8 +1169,8 @@ static int __cmd_report(void)
 				event->mmap.filename);
 		}
 		if (thread == NULL || map == NULL) {
-			fprintf(stderr, "problem processing PERF_EVENT_MMAP, bailing out\n");
-			goto done;
+			fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
+			goto broken_event;
 		}
 		thread__insert_map(thread, map);
 		total_mmap++;
@@ -1187,8 +1187,8 @@ static int __cmd_report(void)
 		}
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
-			fprintf(stderr, "problem processing PERF_EVENT_COMM, bailing out\n");
-			goto done;
+			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
+			goto broken_event;
 		}
 		total_comm++;
 		break;
@@ -1221,7 +1221,6 @@ static int __cmd_report(void)
 		goto more;
 
 	rc = EXIT_SUCCESS;
-done:
 	close(input);
 
 	if (dump_trace) {
-- 
GitLab


From 5b6045a906f48d37591365c5dcdd6d1d146bfd4a Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Tue, 26 May 2009 17:28:02 +0200
Subject: [PATCH 1366/2198] trace: disable preemption before taking raw
 spinlocks

s390 code uses smp_processor_id() in __raw_spin_lock() code which
reveals that a (raw) spinlock is taken without preemption disabled.
This can potentially deadlock.

To fix this explicitly disable and enable preemption.

BUG: using smp_processor_id() in preemptible [00000000] code: cat/2278
caller is trace_find_cmdline+0x40/0xfc
CPU: 0 Not tainted 2.6.30-rc7-dirty #39
Process cat (pid: 2278, task: 000000003faedb68, ksp: 000000003b33b988)
000000003b33b988 000000003b33bae0 0000000000000002 0000000000000000
       000000003b33bb80 000000003b33baf8 000000003b33baf8 00000000000175d6
       0000000000000001 000000003b33b988 000000003f9b0000 000000000000000b
       000000000000000c 000000003b33bb40 000000003b33bae0 0000000000000000
       0000000000000000 00000000000175d6 000000003b33bae0 000000003b33bb28
Call Trace:
([<00000000000174b2>] show_trace+0x112/0x170)
 [<0000000000017582>] show_stack+0x72/0x100
 [<0000000000441538>] dump_stack+0xc8/0xd8
 [<000000000025c350>] debug_smp_processor_id+0x114/0x130
 [<00000000000bf0e4>] trace_find_cmdline+0x40/0xfc
 [<00000000000c35d4>] trace_print_context+0x58/0xac
 [<00000000000bb676>] print_trace_line+0x416/0x470
 [<00000000000bc8fe>] s_show+0x4e/0x428
 [<000000000013834e>] seq_read+0x36a/0x5d4
 [<0000000000112a78>] vfs_read+0xc8/0x174
 [<0000000000112c58>] SyS_read+0x74/0xc4
 [<000000000002c7ae>] sysc_noemu+0x10/0x16
 [<000002000012436c>] 0x2000012436c
1 lock held by cat/2278:
 #0:  (&p->lock){+.+.+.}, at: [<0000000000138056>] seq_read+0x72/0x5d4

[ Impact: fix preempt-unsafe raw spinlock ]

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 02d32baa23ac9..a3a8a87d7e912 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -808,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
 		return;
 	}
 
+	preempt_disable();
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
 	if (map != NO_CMDLINE_MAP)
@@ -816,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
 		strcpy(comm, "<...>");
 
 	__raw_spin_unlock(&trace_cmdline_lock);
+	preempt_enable();
 }
 
 void tracing_record_cmdline(struct task_struct *tsk)
-- 
GitLab


From f2aebaee653a35b01c3665de2cbb1e31456b8ea8 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 27 May 2009 21:36:02 +0800
Subject: [PATCH 1367/2198] ftrace: don't convert function's local variable
 name in macro

"call" is an argument of macro, but it is also used as a local
variable name of function in macro.
We should keep this local variable name distinct from any
CPP macro parameter name if both are in the same macro scope,
although it hasn't caused any problem yet.

[ Impact: robustify macro ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/ftrace.h | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 87fc227c6fbe5..b4ec83ae711fe 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -397,19 +397,19 @@ static void ftrace_profile_##call(proto)				\
 	perf_tpcounter_event(event_##call.id);				\
 }									\
 									\
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
+static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
 {									\
 	int ret = 0;							\
 									\
-	if (!atomic_inc_return(&call->profile_count))			\
+	if (!atomic_inc_return(&event_call->profile_count))		\
 		ret = register_trace_##call(ftrace_profile_##call);	\
 									\
 	return ret;							\
 }									\
 									\
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
+static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 {									\
-	if (atomic_add_negative(-1, &call->profile_count))		\
+	if (atomic_add_negative(-1, &event_call->profile_count))	\
 		unregister_trace_##call(ftrace_profile_##call);		\
 }
 
@@ -433,9 +433,9 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
 #define __array(type, item, len)
 
 #undef __string
-#define __string(item, src)						       \
-	__str_offsets.item = __str_size +				       \
-			     offsetof(typeof(*entry), __str_data);	       \
+#define __string(item, src)						\
+	__str_offsets.item = __str_size +				\
+			     offsetof(typeof(*entry), __str_data);	\
 	__str_size += strlen(src) + 1;
 
 #undef __assign_str
@@ -451,8 +451,8 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
-	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;  \
-	struct ftrace_event_call *call = &event_##call;			\
+	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;	\
+	struct ftrace_event_call *event_call = &event_##call;		\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
@@ -473,7 +473,7 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	if (!filter_current_check_discard(call, entry, event))		\
+	if (!filter_current_check_discard(event_call, entry, event))	\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
 }									\
 									\
-- 
GitLab


From 14dba5331b90c20588ae6504fea8049c7283028d Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Wed, 27 May 2009 09:31:52 -0400
Subject: [PATCH 1368/2198] integrity: nfsd imbalance bug fix

An nfsd exported file is opened/closed by the kernel causing the
integrity imbalance message.

Before a file is opened, there normally is permission checking, which
is done in inode_permission().  However, as integrity checking requires
a dentry and mount point, which is not available in inode_permission(),
the integrity (permission) checking must be called separately.

In order to detect any missing integrity checking calls, we keep track
of file open/closes.  ima_path_check() increments these counts and
does the integrity (permission) checking. As a result, the number of
calls to ima_path_check()/ima_file_free() should be balanced.  An extra
call to fput(), indicates the file could have been accessed without first
calling ima_path_check().

In nfsv3 permission checking is done once, followed by multiple reads,
which do an open/close for each read.  The integrity (permission) checking
call should be in nfsd_permission() after the inode_permission() call, but
as there is no correlation between the number of permission checking and
open calls, the integrity checking call should not increment the counters,
but defer it to when the file is actually opened.

This patch adds:
- integrity (permission) checking for nfsd exported files in nfsd_permission().
- a call to increment counts for files opened by nfsd.

This patch has been updated to return the nfs error types.

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 fs/nfsd/vfs.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 6c68ffd6b4bb0..81ff0f4de4b74 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -55,6 +55,7 @@
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
 #include <linux/jhash.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 			    flags, cred);
 	if (IS_ERR(*filp))
 		host_err = PTR_ERR(*filp);
+	else
+		ima_counts_get(*filp);
 out_nfserr:
 	err = nfserrno(host_err);
 out:
@@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 					struct dentry *dentry, int acc)
 {
 	struct inode	*inode = dentry->d_inode;
+	struct path	path;
 	int		err;
 
 	if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
 	if (err == -EACCES && S_ISREG(inode->i_mode) &&
 	    acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
 		err = inode_permission(inode, MAY_EXEC);
+	if (err)
+		goto nfsd_out;
 
+	/* Do integrity (permission) checking now, but defer incrementing
+	 * IMA counts to the actual file open.
+	 */
+	path.mnt = exp->ex_path.mnt;
+	path.dentry = dentry;
+	err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
+			     IMA_COUNT_LEAVE);
+nfsd_out:
 	return err? nfserrno(err) : 0;
 }
 
-- 
GitLab


From abfe0af9813153bae8c85d9bac966bafcb8ddab1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 20 May 2009 00:37:40 -0700
Subject: [PATCH 1369/2198] x86: enable_update_mptable should be a macro

instead of declaring one variant as an inline function...
because other case is a variable

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <4A13B344.7030307@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/mpspec.h | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 3dcbaaaa363e7..e2a1bb6d71ea8 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
 #ifdef CONFIG_X86_MPPARSE
 extern void find_smp_config(void);
 extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
 #else
 static inline void find_smp_config(void) { }
 static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
 #endif
 
 void __cpuinit generic_processor_info(int apicid, int version);
@@ -87,15 +89,6 @@ static inline int acpi_probe_gsi(void)
 }
 #endif /* CONFIG_ACPI */
 
-#ifdef CONFIG_X86_MPPARSE
-extern int enable_update_mptable;
-#else
-static inline int enable_update_mptable(void)
-{
-	return 0;
-}
-#endif
-
 #define PHYSID_ARRAY_SIZE	BITS_TO_LONGS(MAX_APICS)
 
 struct physid_mask {
-- 
GitLab


From 13b297d943828c4594527a2bd9c30ecd04e37886 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 26 May 2009 14:18:07 +0900
Subject: [PATCH 1370/2198] smack: Remove redundant initialization.

We don't need to explicitly initialize to cap_* because
it will be filled by security_fixup_ops().

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Casey Schaufler <casey@schaufler-ca.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/smack/smack_lsm.c | 15 ---------------
 1 file changed, 15 deletions(-)

diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c
index 0d030b4513c8c..0023182078c72 100644
--- a/security/smack/smack_lsm.c
+++ b/security/smack/smack_lsm.c
@@ -3034,15 +3034,7 @@ struct security_operations smack_ops = {
 
 	.ptrace_may_access =		smack_ptrace_may_access,
 	.ptrace_traceme =		smack_ptrace_traceme,
-	.capget = 			cap_capget,
-	.capset = 			cap_capset,
-	.capable = 			cap_capable,
 	.syslog = 			smack_syslog,
-	.settime = 			cap_settime,
-	.vm_enough_memory = 		cap_vm_enough_memory,
-
-	.bprm_set_creds = 		cap_bprm_set_creds,
-	.bprm_secureexec = 		cap_bprm_secureexec,
 
 	.sb_alloc_security = 		smack_sb_alloc_security,
 	.sb_free_security = 		smack_sb_free_security,
@@ -3066,8 +3058,6 @@ struct security_operations smack_ops = {
 	.inode_post_setxattr = 		smack_inode_post_setxattr,
 	.inode_getxattr = 		smack_inode_getxattr,
 	.inode_removexattr = 		smack_inode_removexattr,
-	.inode_need_killpriv =		cap_inode_need_killpriv,
-	.inode_killpriv =		cap_inode_killpriv,
 	.inode_getsecurity = 		smack_inode_getsecurity,
 	.inode_setsecurity = 		smack_inode_setsecurity,
 	.inode_listsecurity = 		smack_inode_listsecurity,
@@ -3088,7 +3078,6 @@ struct security_operations smack_ops = {
 	.cred_commit =			smack_cred_commit,
 	.kernel_act_as =		smack_kernel_act_as,
 	.kernel_create_files_as =	smack_kernel_create_files_as,
-	.task_fix_setuid =		cap_task_fix_setuid,
 	.task_setpgid = 		smack_task_setpgid,
 	.task_getpgid = 		smack_task_getpgid,
 	.task_getsid = 			smack_task_getsid,
@@ -3102,7 +3091,6 @@ struct security_operations smack_ops = {
 	.task_kill = 			smack_task_kill,
 	.task_wait = 			smack_task_wait,
 	.task_to_inode = 		smack_task_to_inode,
-	.task_prctl =			cap_task_prctl,
 
 	.ipc_permission = 		smack_ipc_permission,
 	.ipc_getsecid =			smack_ipc_getsecid,
@@ -3129,9 +3117,6 @@ struct security_operations smack_ops = {
 	.sem_semctl = 			smack_sem_semctl,
 	.sem_semop = 			smack_sem_semop,
 
-	.netlink_send =			cap_netlink_send,
-	.netlink_recv = 		cap_netlink_recv,
-
 	.d_instantiate = 		smack_d_instantiate,
 
 	.getprocattr = 			smack_getprocattr,
-- 
GitLab


From d3e78ee3d015dac1794433abb6403b6fc8e70e10 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 May 2009 11:41:50 +0200
Subject: [PATCH 1371/2198] perf_counter: Fix perf_counter_init_task() on
 !CONFIG_PERF_COUNTERS

Pointed out by compiler warnings:

   tip/include/linux/perf_counter.h:644: warning: no return statement in function returning non-void

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 2b16ed37b74cc..a65ddc580514d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -641,7 +641,7 @@ perf_counter_task_sched_out(struct task_struct *task,
 			    struct task_struct *next, int cpu)		{ }
 static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
-static inline int perf_counter_init_task(struct task_struct *child)	{ }
+static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
-- 
GitLab


From ca8cdeef9ca2ff89ee8a21d6f6ff3dfb60286041 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 28 May 2009 11:08:33 +0200
Subject: [PATCH 1372/2198] perf_counter tools: report: Implement header output
 for --sort variants

Implement this style of header:

 #
 # Overhead          Command       File: Symbol
 # ........          .......       ............
 #

for the various --sort variants as well.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 70 +++++++++------------
 1 file changed, 28 insertions(+), 42 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 5993c129d736e..506cde437b785 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -596,8 +596,6 @@ static struct map *map__new(struct mmap_event *event)
 
 struct thread;
 
-static const char *thread__name(struct thread *self, char *bf, size_t size);
-
 struct thread {
 	struct rb_node	 rb_node;
 	struct list_head maps;
@@ -605,15 +603,6 @@ struct thread {
 	char		 *comm;
 };
 
-static const char *thread__name(struct thread *self, char *bf, size_t size)
-{
-	if (self->comm)
-		return self->comm;
-
-	snprintf(bf, sizeof(bf), ":%u", self->pid);
-	return bf;
-}
-
 static struct thread *thread__new(pid_t pid)
 {
 	struct thread *self = malloc(sizeof(*self));
@@ -707,8 +696,9 @@ struct hist_entry {
 struct sort_entry {
 	struct list_head list;
 
+	char *header;
+
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
-	size_t (*print_header)(FILE *fp);
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
@@ -721,13 +711,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	char bf[32];
-
-	return fprintf(fp, " %16s",
-			thread__name(self->thread, bf, sizeof(bf)));
+	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
+	.header = "         Command: Pid ",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -757,6 +745,7 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
+	.header = "         Command",
 	.cmp	= sort__comm_cmp,
 	.print	= sort__comm_print,
 };
@@ -786,6 +775,7 @@ sort__dso_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_dso = {
+	.header = "                                                    Shared Object",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -804,43 +794,25 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 	return (int64_t)(ip_r - ip_l);
 }
 
-static size_t sort__sym_print_header(FILE *fp)
-{
-	size_t ret = 0;
-
-	ret += fprintf(fp, "#\n");
-	ret += fprintf(fp, "# Overhead          Command       File: Symbol\n");
-	ret += fprintf(fp, "# ........          .......       ............\n");
-	ret += fprintf(fp, "#\n");
-
-	return ret;
-}
-
 static size_t
 sort__sym_print(FILE *fp, struct hist_entry *self)
 {
 	size_t ret = 0;
 
-	ret += fprintf(fp, "  [%c] ", self->level);
-
 	if (verbose)
 		ret += fprintf(fp, " %#018llx", (unsigned long long)self->ip);
 
-	if (self->level != '.')
-		ret += fprintf(fp, " kernel: %s",
-			       self->sym ? self->sym->name : "<unknown>");
-	else
-		ret += fprintf(fp, " %s: %s",
-			       self->dso ? self->dso->name : "<unknown>",
-			       self->sym ? self->sym->name : "<unknown>");
+	ret += fprintf(fp, " %s: %s",
+			self->dso ? self->dso->name : "<unknown>",
+			self->sym ? self->sym->name : "<unknown>");
 
 	return ret;
 }
 
 static struct sort_entry sort_sym = {
-	.cmp		= sort__sym_cmp,
-	.print_header	= sort__sym_print_header,
-	.print		= sort__sym_print,
+	.header = "Shared Object: Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
 };
 
 struct sort_dimension {
@@ -1021,10 +993,24 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	fprintf(fp, "#\n");
+
+	fprintf(fp, "# Overhead");
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		fprintf(fp, " %s", se->header);
+	fprintf(fp, "\n");
+
+	fprintf(fp, "# ........");
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		if (se->print_header)
-			ret += se->print_header(fp);
+		int i;
+
+		fprintf(fp, " ");
+		for (i = 0; i < strlen(se->header); i++)
+			fprintf(fp, ".");
 	}
+	fprintf(fp, "\n");
+
+	fprintf(fp, "#\n");
 
 	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct hist_entry, rb_node);
-- 
GitLab


From 63299f057fbce47da895e8865cba7e9c3eb01a20 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 28 May 2009 10:52:00 +0200
Subject: [PATCH 1373/2198] perf_counter tools: report: Add help text for
 --sort

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 506cde437b785..9fdf8224ee61f 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1240,7 +1240,8 @@ static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
-	OPT_STRING('s', "sort", &sort_order, "foo", "bar"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
 	OPT_END()
 };
 
-- 
GitLab


From 5d85d3247cc3555215d7d11c78576a396c98e4d9 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Thu, 28 May 2009 11:04:53 +0200
Subject: [PATCH 1374/2198] block: export blk_stack_limits()

DM needs to use blk_stack_limits(), so it needs to be exported.

Acked-by: Martin K. Petersen <martin.petersen@oracle.com>

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5649f34adb40d..8d33934928918 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -500,6 +500,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 
 	return 0;
 }
+EXPORT_SYMBOL(blk_stack_limits);
 
 /**
  * disk_stack_limits - adjust queue limits for stacked drivers
-- 
GitLab


From 7d96fd41cadc55f4e00231c8c71b8e25c779f122 Mon Sep 17 00:00:00 2001
From: Petr Tesarik <ptesarik@suse.cz>
Date: Mon, 25 May 2009 11:02:02 +0200
Subject: [PATCH 1375/2198] x86: move rdtsc_barrier() into the TSC vread method

The *fence instructions were moved to vsyscall_64.c by commit
cb9e35dce94a1b9c59d46224e8a94377d673e204.  But this breaks the
vDSO, because vread methods are also called from there.

Besides, the synchronization might be unnecessary for other
time sources than TSC.

[ Impact: fix potential time warp in VDSO ]

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
LKML-Reference: <9d0ea9ea0f866bdc1f4d76831221ae117f11ea67.1243241859.git.ptesarik@suse.cz>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@kernel.org>
---
 arch/x86/kernel/tsc.c         | 11 ++++++++++-
 arch/x86/kernel/vsyscall_64.c |  8 --------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index d57de05dc4309..cf8611d991e08 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
 #ifdef CONFIG_X86_64
 static cycle_t __vsyscall_fn vread_tsc(void)
 {
-	cycle_t ret = (cycle_t)vget_cycles();
+	cycle_t ret;
+
+	/*
+	 * Surround the RDTSC by barriers, to make sure it's not
+	 * speculated to outside the seqlock critical section and
+	 * does not cause time warps:
+	 */
+	rdtsc_barrier();
+	ret = (cycle_t)vget_cycles();
+	rdtsc_barrier();
 
 	return ret >= __vsyscall_gtod_data.clock.cycle_last ?
 		ret : __vsyscall_gtod_data.clock.cycle_last;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 44153afc90675..25ee06a80aad3 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
 			return;
 		}
 
-		/*
-		 * Surround the RDTSC by barriers, to make sure it's not
-		 * speculated to outside the seqlock critical section and
-		 * does not cause time warps:
-		 */
-		rdtsc_barrier();
 		now = vread();
-		rdtsc_barrier();
-
 		base = __vsyscall_gtod_data.clock.cycle_last;
 		mask = __vsyscall_gtod_data.clock.mask;
 		mult = __vsyscall_gtod_data.clock.mult;
-- 
GitLab


From c93f7669098eb97c5376e5396e3dfb734c17df4f Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 28 May 2009 22:18:17 +1000
Subject: [PATCH 1376/2198] perf_counter: Fix race in attaching counters to
 tasks and exiting

Commit 564c2b21 ("perf_counter: Optimize context switch between
identical inherited contexts") introduced a race where it is possible
that a counter being attached to a task could get attached to the
wrong task, if the task is one that has inherited its context from
another task via fork.  This happens because the optimized context
switch could switch the context to another task after find_get_context
has read task->perf_counter_ctxp.  In fact, it's possible that the
context could then get freed, if the other task then exits.

This fixes the problem by protecting both the context switch and the
critical code in find_get_context with spinlocks.  The context switch
locks the cxt->lock of both the outgoing and incoming contexts before
swapping them.  That means that once code such as find_get_context
has obtained the spinlock for the context associated with a task,
the context can't get swapped to another task.  However, the context
may have been swapped in the interval between reading
task->perf_counter_ctxp and getting the lock, so it is necessary to
check and retry.

To make sure that none of the contexts being looked at in
find_get_context can get freed, this changes the context freeing code
to use RCU.  Thus an rcu_read_lock() is sufficient to ensure that no
contexts can get freed.  This part of the patch is lifted from a patch
posted by Peter Zijlstra.

This also adds a check to make sure that we can't add a counter to a
task that is exiting.

There is also a race between perf_counter_exit_task and
find_get_context; this solves the race by moving the get_ctx that
was in perf_counter_alloc into the locked region in find_get_context,
so that once find_get_context has got the context for a task, it
won't get freed even if the task calls perf_counter_exit_task.  It
doesn't matter if new top-level (non-inherited) counters get attached
to the context after perf_counter_exit_task has detached the context
from the task.  They will just stay there and never get scheduled in
until the counters' fds get closed, and then perf_release will remove
them from the context and eventually free the context.

With this, we are now doing the unclone in find_get_context rather
than when a counter was added to or removed from a context (actually,
we were missing the unclone_ctx() call when adding a counter to a
context).  We don't need to unclone when removing a counter from a
context because we have no way to remove a counter from a cloned
context.

This also takes out the smp_wmb() in find_get_context, which Peter
Zijlstra pointed out was unnecessary because the cmpxchg implies a
full barrier anyway.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18974.33033.667187.273886@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |   5 +-
 kernel/perf_counter.c        | 205 +++++++++++++++++++++++++----------
 2 files changed, 148 insertions(+), 62 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index a65ddc580514d..717bf3b59ba40 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -541,8 +541,9 @@ struct perf_counter_context {
 	 * been cloned (inherited) from a common ancestor.
 	 */
 	struct perf_counter_context *parent_ctx;
-	u32			parent_gen;
-	u32			generation;
+	u64			parent_gen;
+	u64			generation;
+	struct rcu_head		rcu_head;
 };
 
 /**
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 367299f91aaff..52e5a15321d80 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -103,12 +103,22 @@ static void get_ctx(struct perf_counter_context *ctx)
 	atomic_inc(&ctx->refcount);
 }
 
+static void free_ctx(struct rcu_head *head)
+{
+	struct perf_counter_context *ctx;
+
+	ctx = container_of(head, struct perf_counter_context, rcu_head);
+	kfree(ctx);
+}
+
 static void put_ctx(struct perf_counter_context *ctx)
 {
 	if (atomic_dec_and_test(&ctx->refcount)) {
 		if (ctx->parent_ctx)
 			put_ctx(ctx->parent_ctx);
-		kfree(ctx);
+		if (ctx->task)
+			put_task_struct(ctx->task);
+		call_rcu(&ctx->rcu_head, free_ctx);
 	}
 }
 
@@ -211,22 +221,6 @@ group_sched_out(struct perf_counter *group_counter,
 		cpuctx->exclusive = 0;
 }
 
-/*
- * Mark this context as not being a clone of another.
- * Called when counters are added to or removed from this context.
- * We also increment our generation number so that anything that
- * was cloned from this context before this will not match anything
- * cloned from this context after this.
- */
-static void unclone_ctx(struct perf_counter_context *ctx)
-{
-	++ctx->generation;
-	if (!ctx->parent_ctx)
-		return;
-	put_ctx(ctx->parent_ctx);
-	ctx->parent_ctx = NULL;
-}
-
 /*
  * Cross CPU call to remove a performance counter
  *
@@ -281,13 +275,19 @@ static void __perf_counter_remove_from_context(void *info)
  *
  * CPU counters are removed with a smp call. For task counters we only
  * call when the task is on a CPU.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This is OK when called from perf_release since
+ * that only calls us on the top-level context, which can't be a clone.
+ * When called from perf_counter_exit_task, it's OK because the
+ * context has been detached from its task.
  */
 static void perf_counter_remove_from_context(struct perf_counter *counter)
 {
 	struct perf_counter_context *ctx = counter->ctx;
 	struct task_struct *task = ctx->task;
 
-	unclone_ctx(ctx);
 	if (!task) {
 		/*
 		 * Per cpu counters are removed via an smp call and
@@ -410,6 +410,16 @@ static void __perf_counter_disable(void *info)
 
 /*
  * Disable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisifed when called through
+ * perf_counter_for_each_child or perf_counter_for_each because they
+ * hold the top-level counter's child_mutex, so any descendant that
+ * goes to exit will block in sync_child_counter.
+ * When called from perf_pending_counter it's OK because counter->ctx
+ * is the current context on this CPU and preemption is disabled,
+ * hence we can't get into perf_counter_task_sched_out for this context.
  */
 static void perf_counter_disable(struct perf_counter *counter)
 {
@@ -794,6 +804,12 @@ static void __perf_counter_enable(void *info)
 
 /*
  * Enable a counter.
+ *
+ * If counter->ctx is a cloned context, callers must make sure that
+ * every task struct that counter->ctx->task could possibly point to
+ * remains valid.  This condition is satisfied when called through
+ * perf_counter_for_each_child or perf_counter_for_each as described
+ * for perf_counter_disable.
  */
 static void perf_counter_enable(struct perf_counter *counter)
 {
@@ -923,7 +939,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
 	struct perf_counter_context *ctx = task->perf_counter_ctxp;
 	struct perf_counter_context *next_ctx;
+	struct perf_counter_context *parent;
 	struct pt_regs *regs;
+	int do_switch = 1;
 
 	regs = task_pt_regs(task);
 	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
@@ -932,18 +950,39 @@ void perf_counter_task_sched_out(struct task_struct *task,
 		return;
 
 	update_context_time(ctx);
+
+	rcu_read_lock();
+	parent = rcu_dereference(ctx->parent_ctx);
 	next_ctx = next->perf_counter_ctxp;
-	if (next_ctx && context_equiv(ctx, next_ctx)) {
-		task->perf_counter_ctxp = next_ctx;
-		next->perf_counter_ctxp = ctx;
-		ctx->task = next;
-		next_ctx->task = task;
-		return;
+	if (parent && next_ctx &&
+	    rcu_dereference(next_ctx->parent_ctx) == parent) {
+		/*
+		 * Looks like the two contexts are clones, so we might be
+		 * able to optimize the context switch.  We lock both
+		 * contexts and check that they are clones under the
+		 * lock (including re-checking that neither has been
+		 * uncloned in the meantime).  It doesn't matter which
+		 * order we take the locks because no other cpu could
+		 * be trying to lock both of these tasks.
+		 */
+		spin_lock(&ctx->lock);
+		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
+		if (context_equiv(ctx, next_ctx)) {
+			task->perf_counter_ctxp = next_ctx;
+			next->perf_counter_ctxp = ctx;
+			ctx->task = next;
+			next_ctx->task = task;
+			do_switch = 0;
+		}
+		spin_unlock(&next_ctx->lock);
+		spin_unlock(&ctx->lock);
 	}
+	rcu_read_unlock();
 
-	__perf_counter_sched_out(ctx, cpuctx);
-
-	cpuctx->task_ctx = NULL;
+	if (do_switch) {
+		__perf_counter_sched_out(ctx, cpuctx);
+		cpuctx->task_ctx = NULL;
+	}
 }
 
 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
@@ -1215,18 +1254,13 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 	ctx->task = task;
 }
 
-static void put_context(struct perf_counter_context *ctx)
-{
-	if (ctx->task)
-		put_task_struct(ctx->task);
-}
-
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
-	struct perf_counter_context *tctx;
+	struct perf_counter_context *parent_ctx;
 	struct task_struct *task;
+	int err;
 
 	/*
 	 * If cpu is not a wildcard then this is a percpu counter:
@@ -1249,6 +1283,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 
 		cpuctx = &per_cpu(perf_cpu_context, cpu);
 		ctx = &cpuctx->ctx;
+		get_ctx(ctx);
 
 		return ctx;
 	}
@@ -1265,37 +1300,79 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	if (!task)
 		return ERR_PTR(-ESRCH);
 
+	/*
+	 * Can't attach counters to a dying task.
+	 */
+	err = -ESRCH;
+	if (task->flags & PF_EXITING)
+		goto errout;
+
 	/* Reuse ptrace permission checks for now. */
-	if (!ptrace_may_access(task, PTRACE_MODE_READ)) {
-		put_task_struct(task);
-		return ERR_PTR(-EACCES);
+	err = -EACCES;
+	if (!ptrace_may_access(task, PTRACE_MODE_READ))
+		goto errout;
+
+ retry_lock:
+	rcu_read_lock();
+ retry:
+	ctx = rcu_dereference(task->perf_counter_ctxp);
+	if (ctx) {
+		/*
+		 * If this context is a clone of another, it might
+		 * get swapped for another underneath us by
+		 * perf_counter_task_sched_out, though the
+		 * rcu_read_lock() protects us from any context
+		 * getting freed.  Lock the context and check if it
+		 * got swapped before we could get the lock, and retry
+		 * if so.  If we locked the right context, then it
+		 * can't get swapped on us any more and we can
+		 * unclone it if necessary.
+		 * Once it's not a clone things will be stable.
+		 */
+		spin_lock_irq(&ctx->lock);
+		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+			spin_unlock_irq(&ctx->lock);
+			goto retry;
+		}
+		parent_ctx = ctx->parent_ctx;
+		if (parent_ctx) {
+			put_ctx(parent_ctx);
+			ctx->parent_ctx = NULL;		/* no longer a clone */
+		}
+		++ctx->generation;
+		/*
+		 * Get an extra reference before dropping the lock so that
+		 * this context won't get freed if the task exits.
+		 */
+		get_ctx(ctx);
+		spin_unlock_irq(&ctx->lock);
 	}
+	rcu_read_unlock();
 
-	ctx = task->perf_counter_ctxp;
 	if (!ctx) {
 		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
-		if (!ctx) {
-			put_task_struct(task);
-			return ERR_PTR(-ENOMEM);
-		}
+		err = -ENOMEM;
+		if (!ctx)
+			goto errout;
 		__perf_counter_init_context(ctx, task);
-		/*
-		 * Make sure other cpus see correct values for *ctx
-		 * once task->perf_counter_ctxp is visible to them.
-		 */
-		smp_wmb();
-		tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx);
-		if (tctx) {
+		get_ctx(ctx);
+		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) {
 			/*
 			 * We raced with some other task; use
 			 * the context they set.
 			 */
 			kfree(ctx);
-			ctx = tctx;
+			goto retry_lock;
 		}
+		get_task_struct(task);
 	}
 
+	put_task_struct(task);
 	return ctx;
+
+ errout:
+	put_task_struct(task);
+	return ERR_PTR(err);
 }
 
 static void free_counter_rcu(struct rcu_head *head)
@@ -1303,7 +1380,6 @@ static void free_counter_rcu(struct rcu_head *head)
 	struct perf_counter *counter;
 
 	counter = container_of(head, struct perf_counter, rcu_head);
-	put_ctx(counter->ctx);
 	kfree(counter);
 }
 
@@ -1324,6 +1400,7 @@ static void free_counter(struct perf_counter *counter)
 	if (counter->destroy)
 		counter->destroy(counter);
 
+	put_ctx(counter->ctx);
 	call_rcu(&counter->rcu_head, free_counter_rcu);
 }
 
@@ -1347,7 +1424,6 @@ static int perf_release(struct inode *inode, struct file *file)
 	put_task_struct(counter->owner);
 
 	free_counter(counter);
-	put_context(ctx);
 
 	return 0;
 }
@@ -1437,6 +1513,12 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	mutex_unlock(&ctx->mutex);
 }
 
+/*
+ * Holding the top-level counter's child_mutex means that any
+ * descendant process that has inherited this counter will block
+ * in sync_child_counter if it goes to exit, thus satisfying the
+ * task existence requirements of perf_counter_enable/disable.
+ */
 static void perf_counter_for_each_child(struct perf_counter *counter,
 					void (*func)(struct perf_counter *))
 {
@@ -3124,8 +3206,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->ctx			= ctx;
 	counter->oncpu			= -1;
 
-	get_ctx(ctx);
-
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (hw_event->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
@@ -3290,7 +3370,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	kfree(counter);
 
 err_put_context:
-	put_context(ctx);
+	put_ctx(ctx);
 
 	goto out_fput;
 }
@@ -3322,6 +3402,7 @@ inherit_counter(struct perf_counter *parent_counter,
 					   group_leader, GFP_KERNEL);
 	if (IS_ERR(child_counter))
 		return child_counter;
+	get_ctx(child_ctx);
 
 	/*
 	 * Make the child state follow the state of the parent counter,
@@ -3439,11 +3520,6 @@ __perf_counter_exit_task(struct task_struct *child,
 
 /*
  * When a child task exits, feed back counter values to parent counters.
- *
- * Note: we may be running in child context, but the PID is not hashed
- * anymore so new counters will not be added.
- * (XXX not sure that is true when we get called from flush_old_exec.
- *  -- paulus)
  */
 void perf_counter_exit_task(struct task_struct *child)
 {
@@ -3458,7 +3534,15 @@ void perf_counter_exit_task(struct task_struct *child)
 
 	local_irq_save(flags);
 	__perf_counter_task_sched_out(child_ctx);
+
+	/*
+	 * Take the context lock here so that if find_get_context is
+	 * reading child->perf_counter_ctxp, we wait until it has
+	 * incremented the context's refcount before we do put_ctx below.
+	 */
+	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
+	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
 	mutex_lock(&child_ctx->mutex);
@@ -3513,6 +3597,7 @@ int perf_counter_init_task(struct task_struct *child)
 
 	__perf_counter_init_context(child_ctx, child);
 	child->perf_counter_ctxp = child_ctx;
+	get_task_struct(child);
 
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
-- 
GitLab


From 07119a4df8c8c77d888f2f46964ea9512ea84ff8 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: [PATCH 1377/2198] cifs: have cifs_NTtimeToUnix take a little-endian
 arg

...and just have the function call le64_to_cpu.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  2 +-
 fs/cifs/file.c      |  2 +-
 fs/cifs/inode.c     | 15 ++++++---------
 fs/cifs/netmisc.c   | 12 ++++++------
 fs/cifs/readdir.c   | 12 ++++++------
 5 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index fae083930eee2..8831f649720fb 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -90,7 +90,7 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
 						 struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
-extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
 extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
 extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 302ea15f02e61..06866841b97f1 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
 	/* BB need same check in cifs_create too? */
 	/* if not oplocked, invalidate inode pages if mtime or file
 	   size changed */
-	temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+	temp = cifs_NTtimeToUnix(buf->LastWriteTime);
 	if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
 			   (file->f_path.dentry->d_inode->i_size ==
 			    (loff_t)le64_to_cpu(buf->EndOfFile))) {
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 9c869a6dcba18..42d6e0fb6f310 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode,
 	__u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
 	__u64 end_of_file = le64_to_cpu(info->EndOfFile);
 
-	inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
+	inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime);
 	inode->i_mtime =
-		cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
-	inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
+		cifs_NTtimeToUnix(info->LastModificationTime);
+	inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
 	inode->i_mode = le64_to_cpu(info->Permissions);
 
 	/*
@@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode,
 
 	/* Linux can not store file creation time so ignore it */
 	if (pfindData->LastAccessTime)
-		inode->i_atime = cifs_NTtimeToUnix
-			(le64_to_cpu(pfindData->LastAccessTime));
+		inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime);
 	else /* do not need to use current_fs_time - time not stored */
 		inode->i_atime = CURRENT_TIME;
-	inode->i_mtime =
-		    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
-	inode->i_ctime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+	inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime);
+	inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime);
 	cFYI(DBG2, ("Attributes came in as 0x%x", attr));
 	if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
 		inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index e2fe998989a38..d3ba75ef014f5 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
 
 #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
 
-    /*
-     * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
-     * into Unix UTC (based 1970-01-01, in seconds).
-     */
+/*
+ * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
+ * into Unix UTC (based 1970-01-01, in seconds).
+ */
 struct timespec
-cifs_NTtimeToUnix(u64 ntutc)
+cifs_NTtimeToUnix(__le64 ntutc)
 {
 	struct timespec ts;
 	/* BB what about the timezone? BB */
@@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc)
 	/* Subtract the NTFS time offset, then convert to 1s intervals. */
 	u64 t;
 
-	t = ntutc - NTFS_TIME_OFFSET;
+	t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
 	ts.tv_nsec = do_div(t, 10000000) * 100;
 	ts.tv_sec = t;
 	return ts;
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 964e097c82035..79c46c2226c5a 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -150,11 +150,11 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		allocation_size = le64_to_cpu(pfindData->AllocationSize);
 		end_of_file = le64_to_cpu(pfindData->EndOfFile);
 		tmp_inode->i_atime =
-		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+			cifs_NTtimeToUnix(pfindData->LastAccessTime);
 		tmp_inode->i_mtime =
-		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+			cifs_NTtimeToUnix(pfindData->LastWriteTime);
 		tmp_inode->i_ctime =
-		      cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+			cifs_NTtimeToUnix(pfindData->ChangeTime);
 	} else { /* legacy, OS2 and DOS style */
 /*		struct timespec ts;*/
 		FIND_FILE_STANDARD_INFO *pfindData =
@@ -331,11 +331,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
 	local_size  = tmp_inode->i_size;
 
 	tmp_inode->i_atime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+	    cifs_NTtimeToUnix(pfindData->LastAccessTime);
 	tmp_inode->i_mtime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime));
+	    cifs_NTtimeToUnix(pfindData->LastModificationTime);
 	tmp_inode->i_ctime =
-	    cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
+	    cifs_NTtimeToUnix(pfindData->LastStatusChange);
 
 	tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
 	/* since we set the inode type below we need to mask off type
-- 
GitLab


From c4a2c08db7d976c2e23a97da5d69ec7c9701034d Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: [PATCH 1378/2198] cifs: make cnvrtDosUnixTm take a little-endian args
 and an offset

The callers primarily end up converting the args from le anyway. Also,
most of the callers end up needing to add an offset to the result. The
exception to these rules is cnvrtDosCifsTm, but there are no callers of
that function, so we might as well remove it.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsproto.h |  4 ++--
 fs/cifs/cifssmb.c   |  4 ++--
 fs/cifs/netmisc.c   | 12 ++++--------
 fs/cifs/readdir.c   | 32 ++++++++++----------------------
 4 files changed, 18 insertions(+), 34 deletions(-)

diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 8831f649720fb..d542cf1f69c31 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -92,8 +92,8 @@ extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
 extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
-extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
-extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+				      int offset);
 
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
 			   struct super_block *sb, int mode, int oflags,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index aece2a8c1a7c3..b84c61d5bca41 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
 			int val, seconds, remain, result;
 			struct timespec ts, utc;
 			utc = CURRENT_TIME;
-			ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
-						le16_to_cpu(rsp->SrvTime.Time));
+			ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+					    rsp->SrvTime.Time, 0);
 			cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
 				(int)ts.tv_sec, (int)utc.tv_sec,
 				(int)(utc.tv_sec - ts.tv_sec)));
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d3ba75ef014f5..32d6baa0a54fa 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t)
 static int total_days_of_prev_months[] =
 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
 
-
-__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
-{
-	return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
-}
-
-struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 {
 	struct timespec ts;
 	int sec, min, days, month, year;
+	u16 date = le16_to_cpu(le_date);
+	u16 time = le16_to_cpu(le_time);
 	SMB_TIME *st = (SMB_TIME *)&time;
 	SMB_DATE *sd = (SMB_DATE *)&date;
 
@@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
 		days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
 	sec += 24 * 60 * 60 * days;
 
-	ts.tv_sec = sec;
+	ts.tv_sec = sec + offset;
 
 	/* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
 
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 79c46c2226c5a..86d0055dc5299 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file,
 	return rc;
 }
 
-static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode)
-{
-	if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
-		inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
-		inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
-		inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
-	}
-	return;
-}
-
-
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 			  char *buf, unsigned int *pobject_type, int isNewInode)
 {
@@ -156,20 +145,19 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
 		tmp_inode->i_ctime =
 			cifs_NTtimeToUnix(pfindData->ChangeTime);
 	} else { /* legacy, OS2 and DOS style */
-/*		struct timespec ts;*/
+		int offset = cifs_sb->tcon->ses->server->timeAdj;
 		FIND_FILE_STANDARD_INFO *pfindData =
 			(FIND_FILE_STANDARD_INFO *)buf;
 
-		tmp_inode->i_mtime = cnvrtDosUnixTm(
-				le16_to_cpu(pfindData->LastWriteDate),
-				le16_to_cpu(pfindData->LastWriteTime));
-		tmp_inode->i_atime = cnvrtDosUnixTm(
-				le16_to_cpu(pfindData->LastAccessDate),
-				le16_to_cpu(pfindData->LastAccessTime));
-		tmp_inode->i_ctime = cnvrtDosUnixTm(
-				le16_to_cpu(pfindData->LastWriteDate),
-				le16_to_cpu(pfindData->LastWriteTime));
-		AdjustForTZ(cifs_sb->tcon, tmp_inode);
+		tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate,
+						    pfindData->LastWriteTime,
+						    offset);
+		tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate,
+						    pfindData->LastAccessTime,
+						    offset);
+		tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate,
+						    pfindData->LastWriteTime,
+						    offset);
 		attr = le16_to_cpu(pfindData->Attributes);
 		allocation_size = le32_to_cpu(pfindData->AllocationSize);
 		end_of_file = le32_to_cpu(pfindData->DataSize);
-- 
GitLab


From bd433d4cf4d8593a5f1764776b91f1794fce5a77 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 09:37:34 -0400
Subject: [PATCH 1379/2198] cifs: rename cifs_iget to cifs_root_iget

The current cifs_iget isn't suitable for anything but the root inode.
Rename it with a more appropriate name.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsfs.c | 2 +-
 fs/cifs/cifsfs.h | 2 +-
 fs/cifs/inode.c  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 5e6d35804d735..0a10a59b63928 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
 #endif
 	sb->s_blocksize = CIFS_MAX_MSGSIZE;
 	sb->s_blocksize_bits = 14;	/* default 2**14 = CIFS_MAX_MSGSIZE */
-	inode = cifs_iget(sb, ROOT_I);
+	inode = cifs_root_iget(sb, ROOT_I);
 
 	if (IS_ERR(inode)) {
 		rc = PTR_ERR(inode);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 051b71cfdea9b..3b6a85cd484d5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *);
 
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_iget(struct super_block *, unsigned long);
+extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
 extern int cifs_create(struct inode *, struct dentry *, int,
 		       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 42d6e0fb6f310..84b7bea736740 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -696,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 }
 
 /* gets root inode */
-struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
 	int xid;
 	struct cifs_sb_info *cifs_sb;
-- 
GitLab


From a0c9217f64ee3cd1e534966da8c5f05768e1ab09 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 27 May 2009 15:40:47 -0400
Subject: [PATCH 1380/2198] cifs: make serverino the default when mounting

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index f32c9036741e4..8ae563f028bc0 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -835,6 +835,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 	vol->rw = true;
 	/* default is always to request posix paths. */
 	vol->posix_paths = 1;
+	/* default to using server inode numbers where available */
+	vol->server_ino = 1;
 
 	if (!options)
 		return 1;
-- 
GitLab


From c5077ec42303e07c2c685b0f6cb8eee0f2c7751c Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Thu, 28 May 2009 15:09:04 +0000
Subject: [PATCH 1381/2198] [CIFS] Update readme to indicate change to default
 mount (serverino)

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES  | 8 ++++++++
 fs/cifs/README   | 7 ++++++-
 fs/cifs/cifsfs.h | 2 +-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index f20c4069c2200..227c681b816de 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -1,3 +1,9 @@
+Version 1.59
+------------
+Client uses server inode numbers (which are persistent) rather than
+client generated ones by default (mount option "serverino" turned
+on by default if server supports it).
+
 Version 1.58
 ------------
 Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +16,8 @@ we converted from).  Fix endianness of the vcnum field used during
 session setup to distinguish multiple mounts to same server from different
 userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
 flag to be set to 2, and mount must enable krb5 to turn on extended security).
+Performance of file create to Samba improved (posix create on lookup
+removes 1 of 2 network requests sent on file create)
  
 Version 1.57
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index db208ddb98991..6d1608fabde9d 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -388,8 +388,13 @@ A partial list of the supported mount options follows:
 		or the CIFS Unix Extensions equivalent and for those
 		this mount option will have no effect.  Exporting cifs mounts
 		under nfsd requires this mount option on the cifs mount.
+		This is now the default if server supports the 
+		required network operation.
   noserverino   Client generates inode numbers (rather than using the actual one
-		from the server) by default.
+		from the server). These inode numbers will vary after
+		unmount or reboot which can confuse some applications,
+		but not all server filesystems support unique inode
+		numbers.
   setuids       If the CIFS Unix extensions are negotiated with the server
 		the client will attempt to set the effective uid and gid of
 		the local process on newly created files, directories, and
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 3b6a85cd484d5..9570a0e8023f4 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.58"
+#define CIFS_VERSION   "1.59"
 #endif				/* _CIFSFS_H */
-- 
GitLab


From ed888aef427365d19f887c271a3a906d16422d24 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 17:16:04 +0200
Subject: [PATCH 1382/2198] dma-debug: re-add dma memory leak detection

This is basically a revert of commit 314eeac9 but now in a
fixed version.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index cdd205d6bf7c6..e47e1a08c3370 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -105,6 +105,11 @@ static const char *type2name[4] = { "single", "page",
 static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
 				   "DMA_FROM_DEVICE", "DMA_NONE" };
 
+/* little merge helper - remove it after the merge window */
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
+
 /*
  * The access to some variables in this macro is racy. We can't use atomic_t
  * here because all these variables are exported to debugfs. Some of them even
@@ -458,9 +463,60 @@ static int dma_debug_fs_init(void)
 	return -ENOMEM;
 }
 
+static int device_dma_allocations(struct device *dev)
+{
+	struct dma_debug_entry *entry;
+	unsigned long flags;
+	int count = 0, i;
+
+	for (i = 0; i < HASH_SIZE; ++i) {
+		spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
+		list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
+			if (entry->dev == dev)
+				count += 1;
+		}
+		spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
+	}
+
+	return count;
+}
+
+static int dma_debug_device_change(struct notifier_block *nb,
+				    unsigned long action, void *data)
+{
+	struct device *dev = data;
+	int count;
+
+
+	switch (action) {
+	case BUS_NOTIFY_UNBOUND_DRIVER:
+		count = device_dma_allocations(dev);
+		if (count == 0)
+			break;
+		err_printk(dev, NULL, "DMA-API: device driver has pending "
+				"DMA allocations while released from device "
+				"[count=%d]\n", count);
+		break;
+	default:
+		break;
+	}
+
+	return 0;
+}
+
 void dma_debug_add_bus(struct bus_type *bus)
 {
-	/* FIXME: register notifier */
+	struct notifier_block *nb;
+
+	nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+	if (nb == NULL) {
+		printk(KERN_ERR "dma_debug_add_bus: out of memory\n");
+		return;
+	}
+
+	nb->notifier_call = dma_debug_device_change;
+
+	bus_register_notifier(bus, nb);
 }
 
 /*
-- 
GitLab


From fefda117ddb324b872312f1f061230e627c9f5ee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 12:21:42 +0200
Subject: [PATCH 1383/2198] amd-iommu: add amd_iommu_dump parameter

This kernel parameter will be useful to get some AMD IOMMU related
information in dmesg that is not necessary for the default user but may
be helpful in debug situations.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h |  6 ++++++
 arch/x86/kernel/amd_iommu_init.c       | 10 ++++++++++
 2 files changed, 16 insertions(+)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b54..89dfb3793edd6 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -194,6 +194,12 @@
 #define PD_DMA_OPS_MASK		(1UL << 0) /* domain used for dma_ops */
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
+extern bool amd_iommu_dump;
+#define DUMP_printk(format, arg...)					\
+	do {								\
+		if (amd_iommu_dump)						\
+			printk(KERN_INFO "AMD IOMMU: " format, ## arg);	\
+	} while(0);
 
 /*
  * This structure contains generic data for  IOMMU protection domains
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dacb..57fb7a7cb6e85 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -115,6 +115,8 @@ struct ivmd_header {
 	u64 range_length;
 } __attribute__((packed));
 
+bool amd_iommu_dump;
+
 static int __initdata amd_iommu_detected;
 
 u16 amd_iommu_last_bdf;			/* largest PCI device id we have
@@ -1211,6 +1213,13 @@ void __init amd_iommu_detect(void)
  *
  ****************************************************************************/
 
+static int __init parse_amd_iommu_dump(char *str)
+{
+	amd_iommu_dump = true;
+
+	return 1;
+}
+
 static int __init parse_amd_iommu_options(char *str)
 {
 	for (; *str; ++str) {
@@ -1235,5 +1244,6 @@ static int __init parse_amd_iommu_size_options(char *str)
 	return 1;
 }
 
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
 __setup("amd_iommu=", parse_amd_iommu_options);
 __setup("amd_iommu_size=", parse_amd_iommu_size_options);
-- 
GitLab


From 9c72041f719e2864d4208a89341c36b316dbf893 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 13:53:57 +0200
Subject: [PATCH 1384/2198] amd-iommu: add dump for iommus described in ivrs
 table

Add information about IOMMU devices described in the IVRS ACPI table to
the kernel log if amd_iommu_dump was specified on the kernel command
line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 57fb7a7cb6e85..28165902ae25b 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -748,6 +748,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
 		h = (struct ivhd_header *)p;
 		switch (*p) {
 		case ACPI_IVHD_TYPE:
+
+			DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+				    "seg: %d flags: %01x info %04x\n",
+				    PCI_BUS(h->devid), PCI_SLOT(h->devid),
+				    PCI_FUNC(h->devid), h->cap_ptr,
+				    h->pci_seg, h->flags, h->info);
+			DUMP_printk("       mmio-addr: %016llx\n",
+				    h->mmio_phys);
+
 			iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
 			if (iommu == NULL)
 				return -ENOMEM;
-- 
GitLab


From 42a698f40a0946f5517308411b9e003ae031414d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 15:41:28 +0200
Subject: [PATCH 1385/2198] amd-iommu: print ivhd information to dmesg when
 requested

Add information about devices belonging to an IOMMU as described in the
IVRS ACPI table to the kernel log if amd_iommu_dump was specified on the
kernel command line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 73 ++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 28165902ae25b..fe3e6453cbf73 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -598,32 +598,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 	p += sizeof(struct ivhd_header);
 	end += h->length;
 
+
 	while (p < end) {
 		e = (struct ivhd_entry *)p;
 		switch (e->type) {
 		case IVHD_DEV_ALL:
+
+			DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+				    " last device %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS(iommu->first_device),
+				    PCI_SLOT(iommu->first_device),
+				    PCI_FUNC(iommu->first_device),
+				    PCI_BUS(iommu->last_device),
+				    PCI_SLOT(iommu->last_device),
+				    PCI_FUNC(iommu->last_device),
+				    e->flags);
+
 			for (dev_i = iommu->first_device;
 					dev_i <= iommu->last_device; ++dev_i)
 				set_dev_entry_from_acpi(iommu, dev_i,
 							e->flags, 0);
 			break;
 		case IVHD_DEV_SELECT:
+
+			DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
 			devid = e->devid;
 			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			break;
 		case IVHD_DEV_SELECT_RANGE_START:
+
+			DUMP_printk("  DEV_SELECT_RANGE_START\t "
+				    "devid: %02x:%02x.%x flags: %02x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags);
+
 			devid_start = e->devid;
 			flags = e->flags;
 			ext_flags = 0;
 			alias = false;
 			break;
 		case IVHD_DEV_ALIAS:
+
+			DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+				    "flags: %02x devid_to: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
 			devid = e->devid;
 			devid_to = e->ext >> 8;
 			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
 		case IVHD_DEV_ALIAS_RANGE:
+
+			DUMP_printk("  DEV_ALIAS_RANGE\t\t "
+				    "devid: %02x:%02x.%x flags: %02x "
+				    "devid_to: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags,
+				    PCI_BUS(e->ext >> 8),
+				    PCI_SLOT(e->ext >> 8),
+				    PCI_FUNC(e->ext >> 8));
+
 			devid_start = e->devid;
 			flags = e->flags;
 			devid_to = e->ext >> 8;
@@ -631,17 +682,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 			alias = true;
 			break;
 		case IVHD_DEV_EXT_SELECT:
+
+			DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+				    "flags: %02x ext: %08x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
 			devid = e->devid;
 			set_dev_entry_from_acpi(iommu, devid, e->flags,
 						e->ext);
 			break;
 		case IVHD_DEV_EXT_SELECT_RANGE:
+
+			DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
+				    "%02x:%02x.%x flags: %02x ext: %08x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid),
+				    e->flags, e->ext);
+
 			devid_start = e->devid;
 			flags = e->flags;
 			ext_flags = e->ext;
 			alias = false;
 			break;
 		case IVHD_DEV_RANGE_END:
+
+			DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+				    PCI_BUS(e->devid),
+				    PCI_SLOT(e->devid),
+				    PCI_FUNC(e->devid));
+
 			devid = e->devid;
 			for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
 				if (alias)
-- 
GitLab


From 02acc43a294098c2a4cd22cf24e9c988644f9f7f Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Wed, 20 May 2009 16:24:21 +0200
Subject: [PATCH 1386/2198] amd-iommu: print ivmd information to dmesg when
 requested

Add information about device memory mapping requirements for the IOMMU
as described in the IVRS ACPI table to the kernel log if amd_iommu_dump
was specified on the kernel command line.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index fe3e6453cbf73..b90a78cfdcb8e 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -983,6 +983,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
 static int __init init_unity_map_range(struct ivmd_header *m)
 {
 	struct unity_map_entry *e = 0;
+	char *s;
 
 	e = kzalloc(sizeof(*e), GFP_KERNEL);
 	if (e == NULL)
@@ -991,13 +992,16 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 	switch (m->type) {
 	default:
 	case ACPI_IVMD_TYPE:
+		s = "IVMD_TYPEi\t\t\t";
 		e->devid_start = e->devid_end = m->devid;
 		break;
 	case ACPI_IVMD_TYPE_ALL:
+		s = "IVMD_TYPE_ALL\t\t";
 		e->devid_start = 0;
 		e->devid_end = amd_iommu_last_bdf;
 		break;
 	case ACPI_IVMD_TYPE_RANGE:
+		s = "IVMD_TYPE_RANGE\t\t";
 		e->devid_start = m->devid;
 		e->devid_end = m->aux;
 		break;
@@ -1006,6 +1010,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 	e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
 	e->prot = m->flags >> 1;
 
+	DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+		    " range_start: %016llx range_end: %016llx flags: %x\n", s,
+		    PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+		    PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+		    PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+		    e->address_start, e->address_end, m->flags);
+
 	list_add_tail(&e->list, &amd_iommu_unity_map);
 
 	return 0;
-- 
GitLab


From b3b99ef8b4f80f3f093a72110e7697c2281ae45d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:02:48 +0200
Subject: [PATCH 1387/2198] amd-iommu: move protection domain printk to dump
 code

This information is only helpful for debugging. Don't print it anymore
unless explicitly requested.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52f..33565990164a8 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1009,8 +1009,9 @@ static int device_change_notifier(struct notifier_block *nb,
 		if (!dma_domain)
 			dma_domain = iommu->default_dom;
 		attach_device(iommu, &dma_domain->domain, devid);
-		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-		       "device %s\n", dma_domain->domain.id, dev_name(dev));
+		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
+			    "%d for device %s\n",
+			    dma_domain->domain.id, dev_name(dev));
 		break;
 	case BUS_NOTIFY_UNBIND_DRIVER:
 		if (!domain)
@@ -1133,8 +1134,9 @@ static int get_device_resources(struct device *dev,
 			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
 		attach_device(*iommu, *domain, *bdf);
-		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-				"device %s\n", (*domain)->id, dev_name(dev));
+		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
+				"%d for device %s\n",
+				(*domain)->id, dev_name(dev));
 	}
 
 	if (domain_for_device(_bdf) == NULL)
-- 
GitLab


From 2be69c79e9a46a554fc3ff57d886e65e7a73eb72 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:15:49 +0200
Subject: [PATCH 1388/2198] x86/iommu: add IOMMU_STRESS Kconfig entry

This Kconfig option is intended to enable various code paths or
parameters in IOMMU implementations to stress test the code and/or the
hardware. This can also be done by disabling optimizations in the code
when this option is switched on.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
---
 arch/x86/Kconfig.debug | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 5865712d105d5..33fac6bbe1c2f 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -159,6 +159,14 @@ config IOMMU_DEBUG
 	  options. See Documentation/x86_64/boot-options.txt for more
 	  details.
 
+config IOMMU_STRESS
+	bool "Enable IOMMU stress-test mode"
+	---help---
+	  This option disables various optimizations in IOMMU related
+	  code to do real stress testing of the IOMMU code. This option
+	  will cause a performance drop and should only be enabled for
+	  testing.
+
 config IOMMU_LEAK
 	bool "IOMMU leak tracing"
 	depends on IOMMU_DEBUG && DMA_API_DEBUG
-- 
GitLab


From 2e8b569614b89c9b1b85cba37db36daeeeff744e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:44:03 +0200
Subject: [PATCH 1389/2198] amd-iommu: disable device isolation with
 CONFIG_IOMMU_STRESS

With device isolation disabled we can test better for race conditions in
dma_ops related code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index b90a78cfdcb8e..66941129e9c75 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -124,8 +124,14 @@ u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
 					   we find in ACPI */
 unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
 bool amd_iommu_isolate = true;		/* if true, device isolation is
 					   enabled */
+#endif
+
 bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
 
 LIST_HEAD(amd_iommu_list);		/* list of all AMD IOMMUs in the
-- 
GitLab


From 421f909c803d1c397f6c66b75653f238696c39ee Mon Sep 17 00:00:00 2001
From: Neil Turton <nturton@solarflare.com>
Date: Thu, 14 May 2009 14:00:35 +0100
Subject: [PATCH 1390/2198] amd-iommu: fix an off-by-one error in the AMD IOMMU
 driver.

The variable amd_iommu_last_bdf holds the maximum bdf of any device
controlled by an IOMMU, so the number of device entries needed is
amd_iommu_last_bdf+1.  The function tbl_size used amd_iommu_last_bdf
instead.  This would be a problem if the last device were a large
enough power of 2.

[ Impact: fix amd_iommu_last_bdf off-by-one error ]

Signed-off-by: Neil Turton <nturton@solarflare.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dacb..35fc9654c7a8b 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -175,7 +175,7 @@ static inline void update_last_devid(u16 devid)
 static inline unsigned long tbl_size(int entry_size)
 {
 	unsigned shift = PAGE_SHIFT +
-			 get_order(amd_iommu_last_bdf * entry_size);
+			 get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
 
 	return 1UL << shift;
 }
-- 
GitLab


From 7455aab1f95f6464c5af3fbdee28744e73f38564 Mon Sep 17 00:00:00 2001
From: Neil Turton <nturton@solarflare.com>
Date: Thu, 14 May 2009 14:08:11 +0100
Subject: [PATCH 1391/2198] amd-iommu: fix the handling of device aliases in
 the AMD IOMMU driver.

The devid parameter to set_dev_entry_from_acpi is the requester ID
rather than the device ID since it is used to index the IOMMU device
table.  The handling of IVHD_DEV_ALIAS used to pass the device ID.
This patch fixes it to pass the requester ID.

[ Impact: fix setting the wrong req-id in acpi-table parsing ]

Signed-off-by: Neil Turton <nturton@solarflare.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 35fc9654c7a8b..53f93db54c4da 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -618,7 +618,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 		case IVHD_DEV_ALIAS:
 			devid = e->devid;
 			devid_to = e->ext >> 8;
-			set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+			set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
 			amd_iommu_alias_table[devid] = devid_to;
 			break;
 		case IVHD_DEV_ALIAS_RANGE:
-- 
GitLab


From 0bc252f430d6a3ac7836d40f00d0ae020593b11b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:48:05 +0200
Subject: [PATCH 1392/2198] amd-iommu: make sure only ivmd entries are parsed

The bug never triggered. But it should be fixed to protect against
broken ACPI tables in the future.

[ Impact: protect against broken ivrs acpi table ]

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 53f93db54c4da..a3a2b98bb39e4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -906,6 +906,8 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 
 	switch (m->type) {
 	default:
+		kfree(e);
+		return 0;
 	case ACPI_IVMD_TYPE:
 		e->devid_start = e->devid_end = m->devid;
 		break;
-- 
GitLab


From c1eee67b2d8464781f5868a34168df61e40e85a6 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Thu, 21 May 2009 00:56:58 -0700
Subject: [PATCH 1393/2198] amd iommu: properly detach from protection domain
 on ->remove

Some drivers may use the dma api during ->remove which will
cause a protection domain to get reattached to a device.  Delay the
detach until after the driver is completely unbound.

[ joro: added a little merge helper ]

[ Impact: fix too early device<->domain removal ]

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52f..d6898833c363b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -57,6 +57,10 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 static struct dma_ops_domain *find_protection_domain(u16 devid);
 
 
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
+
 #ifdef CONFIG_AMD_IOMMU_STATS
 
 /*
@@ -1012,7 +1016,7 @@ static int device_change_notifier(struct notifier_block *nb,
 		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
 		       "device %s\n", dma_domain->domain.id, dev_name(dev));
 		break;
-	case BUS_NOTIFY_UNBIND_DRIVER:
+	case BUS_NOTIFY_UNBOUND_DRIVER:
 		if (!domain)
 			goto out;
 		detach_device(domain, devid);
-- 
GitLab


From 3bd221724adb9d642270df0e78b0105fb61e4a1c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 15:06:20 +0200
Subject: [PATCH 1394/2198] amd-iommu: introduce for_each_iommu* macros

This patch introduces the for_each_iommu and for_each_iommu_safe macros
to simplify the developers life when having to iterate over all AMD
IOMMUs in the system.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h | 8 ++++++++
 arch/x86/kernel/amd_iommu.c            | 8 ++++----
 arch/x86/kernel/amd_iommu_init.c       | 8 ++++----
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b54..cf5ef172cfcae 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -195,6 +195,14 @@
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
 
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+	list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+	list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
 /*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52f..d9e9dc141a1e9 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -213,7 +213,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
 {
 	struct amd_iommu *iommu;
 
-	list_for_each_entry(iommu, &amd_iommu_list, list)
+	for_each_iommu(iommu)
 		iommu_poll_events(iommu);
 
 	return IRQ_HANDLED;
@@ -440,7 +440,7 @@ static void iommu_flush_domain(u16 domid)
 	__iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
 				      domid, 1, 1);
 
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		spin_lock_irqsave(&iommu->lock, flags);
 		__iommu_queue_command(iommu, &cmd);
 		__iommu_completion_wait(iommu);
@@ -1672,7 +1672,7 @@ int __init amd_iommu_init_dma_ops(void)
 	 * found in the system. Devices not assigned to any other
 	 * protection domain will be assigned to the default one.
 	 */
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
 		if (iommu->default_dom == NULL)
 			return -ENOMEM;
@@ -1710,7 +1710,7 @@ int __init amd_iommu_init_dma_ops(void)
 
 free_domains:
 
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		if (iommu->default_dom)
 			dma_ops_domain_free(iommu->default_dom);
 	}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dacb..675a4b642f704 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -679,7 +679,7 @@ static void __init free_iommu_all(void)
 {
 	struct amd_iommu *iommu, *next;
 
-	list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+	for_each_iommu_safe(iommu, next) {
 		list_del(&iommu->list);
 		free_iommu_one(iommu);
 		kfree(iommu);
@@ -779,7 +779,7 @@ static int __init iommu_setup_msix(struct amd_iommu *iommu)
 	struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
 	int nvec = 0, i;
 
-	list_for_each_entry(curr, &amd_iommu_list, list) {
+	for_each_iommu(curr) {
 		if (curr->dev == iommu->dev) {
 			entries[nvec].entry = curr->evt_msi_num;
 			entries[nvec].vector = 0;
@@ -818,7 +818,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 	int r;
 	struct amd_iommu *curr;
 
-	list_for_each_entry(curr, &amd_iommu_list, list) {
+	for_each_iommu(curr) {
 		if (curr->dev == iommu->dev)
 			curr->int_enabled = true;
 	}
@@ -971,7 +971,7 @@ static void __init enable_iommus(void)
 {
 	struct amd_iommu *iommu;
 
-	list_for_each_entry(iommu, &amd_iommu_list, list) {
+	for_each_iommu(iommu) {
 		iommu_set_exclusion_range(iommu);
 		iommu_init_msi(iommu);
 		iommu_enable_event_logging(iommu);
-- 
GitLab


From 58492e128892e3b55f1a6ef0cf3c3ab4ce7cc214 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 18:41:16 +0200
Subject: [PATCH 1395/2198] amd-iommu: consolidate hardware initialization to
 one function

This patch restructures the AMD IOMMU initialization code to initialize
all hardware registers with one single function call.
This is helpful for suspend/resume support.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 50 ++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 675a4b642f704..74f4f1fea9306 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,13 +252,6 @@ static void __init iommu_enable(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
-{
-	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
-}
-
 /*
  * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
  * the system has one.
@@ -413,25 +406,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 {
 	u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 			get_order(CMD_BUFFER_SIZE));
-	u64 entry;
 
 	if (cmd_buf == NULL)
 		return NULL;
 
 	iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 
-	entry = (u64)virt_to_phys(cmd_buf);
+	return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->cmd_buf == NULL);
+
+	entry = (u64)virt_to_phys(iommu->cmd_buf);
 	entry |= MMIO_CMD_SIZE_512;
+
 	memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-			&entry, sizeof(entry));
+		    &entry, sizeof(entry));
 
 	/* set head and tail to zero manually */
 	writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
 	writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 
 	iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
-	return cmd_buf;
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +447,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
 /* allocates the memory where the IOMMU will log its events to */
 static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
 {
-	u64 entry;
 	iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
 						get_order(EVT_BUFFER_SIZE));
 
 	if (iommu->evt_buf == NULL)
 		return NULL;
 
+	return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+	u64 entry;
+
+	BUG_ON(iommu->evt_buf == NULL);
+
 	entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
 	memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
 		    &entry, sizeof(entry));
 
-	iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-	return iommu->evt_buf;
+	iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
 }
 
 static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -710,7 +721,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
 	if (!iommu->mmio_base)
 		return -ENOMEM;
 
-	iommu_set_device_table(iommu);
 	iommu->cmd_buf = alloc_command_buffer(iommu);
 	if (!iommu->cmd_buf)
 		return -ENOMEM;
@@ -837,6 +847,8 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 		return 1;
 	}
 
+	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
 	return 0;
 }
 
@@ -972,9 +984,11 @@ static void __init enable_iommus(void)
 	struct amd_iommu *iommu;
 
 	for_each_iommu(iommu) {
+		iommu_set_device_table(iommu);
+		iommu_enable_command_buffer(iommu);
+		iommu_enable_event_buffer(iommu);
 		iommu_set_exclusion_range(iommu);
 		iommu_init_msi(iommu);
-		iommu_enable_event_logging(iommu);
 		iommu_enable(iommu);
 	}
 }
-- 
GitLab


From fab6afa30954a0684ef8ac1d9a606e74a6215ab6 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 18:46:34 +0200
Subject: [PATCH 1396/2198] amd-iommu: drop pointless iommu-loop in msi setup
 code

It is not necessary to loop again over all IOMMUs in this code. So drop
the loop.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 74f4f1fea9306..cc99f6092230d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -826,13 +826,6 @@ static int __init iommu_setup_msix(struct amd_iommu *iommu)
 static int __init iommu_setup_msi(struct amd_iommu *iommu)
 {
 	int r;
-	struct amd_iommu *curr;
-
-	for_each_iommu(curr) {
-		if (curr->dev == iommu->dev)
-			curr->int_enabled = true;
-	}
-
 
 	if (pci_enable_msi(iommu->dev))
 		return 1;
@@ -847,6 +840,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 		return 1;
 	}
 
+	iommu->int_enabled = true;
 	iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
 
 	return 0;
-- 
GitLab


From d91cecdd796c27df46339e80ed436a980c56fcad Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 4 May 2009 18:51:00 +0200
Subject: [PATCH 1397/2198] amd-iommu: remove support for msi-x

Current hardware uses msi instead of msi-x so this code it not necessary
and can not be tested. The best thing is to drop this code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 44 +-------------------------------
 1 file changed, 1 insertion(+), 43 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index cc99f6092230d..feee475e62647 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -783,46 +783,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
  *
  ****************************************************************************/
 
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
-	struct amd_iommu *curr;
-	struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
-	int nvec = 0, i;
-
-	for_each_iommu(curr) {
-		if (curr->dev == iommu->dev) {
-			entries[nvec].entry = curr->evt_msi_num;
-			entries[nvec].vector = 0;
-			curr->int_enabled = true;
-			nvec++;
-		}
-	}
-
-	if (pci_enable_msix(iommu->dev, entries, nvec)) {
-		pci_disable_msix(iommu->dev);
-		return 1;
-	}
-
-	for (i = 0; i < nvec; ++i) {
-		int r = request_irq(entries->vector, amd_iommu_int_handler,
-				    IRQF_SAMPLE_RANDOM,
-				    "AMD IOMMU",
-				    NULL);
-		if (r)
-			goto out_free;
-	}
-
-	return 0;
-
-out_free:
-	for (i -= 1; i >= 0; --i)
-		free_irq(entries->vector, NULL);
-
-	pci_disable_msix(iommu->dev);
-
-	return 1;
-}
-
 static int __init iommu_setup_msi(struct amd_iommu *iommu)
 {
 	int r;
@@ -851,9 +811,7 @@ static int __init iommu_init_msi(struct amd_iommu *iommu)
 	if (iommu->int_enabled)
 		return 0;
 
-	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
-		return iommu_setup_msix(iommu);
-	else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+	if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
 		return iommu_setup_msi(iommu);
 
 	return 1;
-- 
GitLab


From 92ac4320af6ed4294c2c221dd4ccbfd9026a3aa7 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 19:06:27 +0200
Subject: [PATCH 1398/2198] amd-iommu: add function to disable all iommus

This function is required for suspend/resume support with AMD IOMMU
enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index feee475e62647..ed10c0f5ff778 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -252,6 +252,11 @@ static void __init iommu_enable(struct amd_iommu *iommu)
 	iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
+static void iommu_disable(struct amd_iommu *iommu)
+{
+	iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
+}
+
 /*
  * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
  * the system has one.
@@ -945,6 +950,14 @@ static void __init enable_iommus(void)
 	}
 }
 
+static void disable_iommus(void)
+{
+	struct amd_iommu *iommu;
+
+	for_each_iommu(iommu)
+		iommu_disable(iommu);
+}
+
 /*
  * Suspend/Resume support
  * disable suspend until real resume implemented
-- 
GitLab


From bfd1be1857e5a3385bf146e02e6dc3dd4241bec1 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 5 May 2009 15:33:57 +0200
Subject: [PATCH 1399/2198] amd-iommu: add function to flush tlb for all
 domains

This function is required for suspend/resume support with AMD IOMMU
enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu.h |  1 +
 arch/x86/kernel/amd_iommu.c      | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index f712344329bc7..1750e1f85d3c5 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -27,6 +27,7 @@ extern int amd_iommu_init(void);
 extern int amd_iommu_init_dma_ops(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
 #else
 static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d9e9dc141a1e9..826ad079efc41 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -449,6 +449,17 @@ static void iommu_flush_domain(u16 domid)
 	}
 }
 
+void amd_iommu_flush_all_domains(void)
+{
+	int i;
+
+	for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+		if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+			continue;
+		iommu_flush_domain(i);
+	}
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
-- 
GitLab


From 7d7a110c6127b7fc683dc6d764555f2dbd22b054 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 5 May 2009 15:48:10 +0200
Subject: [PATCH 1400/2198] amd-iommu: add function to flush tlb for all
 devices

This function is required for suspend/resume support with AMD IOMMU
enabled.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu.h |  1 +
 arch/x86/kernel/amd_iommu.c      | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 1750e1f85d3c5..262e02820049a 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -28,6 +28,7 @@ extern int amd_iommu_init_dma_ops(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
 extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
 #else
 static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 826ad079efc41..92b0e1881e09f 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -460,6 +460,24 @@ void amd_iommu_flush_all_domains(void)
 	}
 }
 
+void amd_iommu_flush_all_devices(void)
+{
+	struct amd_iommu *iommu;
+	int i;
+
+	for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+		if (amd_iommu_pd_table[i] == NULL)
+			continue;
+
+		iommu = amd_iommu_rlookup_table[i];
+		if (!iommu)
+			continue;
+
+		iommu_queue_inv_dev_entry(iommu, i);
+		iommu_completion_wait(iommu);
+	}
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
-- 
GitLab


From 05f92db9f47f852ff48bbed1b063b8ab8ad00285 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 09:52:46 +0200
Subject: [PATCH 1401/2198] amd_iommu: un __init functions required for
 suspend/resume

This patch makes sure that no function required for suspend/resume of
AMD IOMMU driver is thrown away after boot.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index ed10c0f5ff778..330896ba6a9fe 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -193,7 +193,7 @@ static inline unsigned long tbl_size(int entry_size)
  * This function set the exclusion range in the IOMMU. DMA accesses to the
  * exclusion range are passed through untranslated
  */
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
 {
 	u64 start = iommu->exclusion_start & PAGE_MASK;
 	u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +225,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
 }
 
 /* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 {
 	u32 ctrl;
 
@@ -244,7 +244,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 }
 
 /* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
 {
 	printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
 	       dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -811,7 +811,7 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
 	return 0;
 }
 
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
 {
 	if (iommu->int_enabled)
 		return 0;
@@ -936,7 +936,7 @@ static void init_device_table(void)
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
  */
-static void __init enable_iommus(void)
+static void enable_iommus(void)
 {
 	struct amd_iommu *iommu;
 
-- 
GitLab


From 736501ee000757082a4f0832826ae1eda7ea106e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 09:56:12 +0200
Subject: [PATCH 1402/2198] amd-iommu: implement suspend/resume

This patch puts everything together and enables suspend/resume support
in the AMD IOMMU driver.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 330896ba6a9fe..4ca8fbfb68dcf 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -965,12 +965,31 @@ static void disable_iommus(void)
 
 static int amd_iommu_resume(struct sys_device *dev)
 {
+	/*
+	 * Disable IOMMUs before reprogramming the hardware registers.
+	 * IOMMU is still enabled from the resume kernel.
+	 */
+	disable_iommus();
+
+	/* re-load the hardware */
+	enable_iommus();
+
+	/*
+	 * we have to flush after the IOMMUs are enabled because a
+	 * disabled IOMMU will never execute the commands we send
+	 */
+	amd_iommu_flush_all_domains();
+	amd_iommu_flush_all_devices();
+
 	return 0;
 }
 
 static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
 {
-	return -EINVAL;
+	/* disable IOMMUs to go out of the way for BIOS */
+	disable_iommus();
+
+	return 0;
 }
 
 static struct sysdev_class amd_iommu_sysdev_class = {
-- 
GitLab


From c3239567a20e90e3026ac5453d5267506ef7b030 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 10:56:44 +0200
Subject: [PATCH 1403/2198] amd-iommu: introduce aperture_range structure

This is a preperation for extended address allocator.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h | 32 ++++++++++++------
 arch/x86/kernel/amd_iommu.c            | 46 ++++++++++++--------------
 2 files changed, 43 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 95c8cd9d22b54..4c64c9bc68395 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -195,6 +195,8 @@
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
 
+#define APERTURE_RANGE_SIZE	(128 * 1024 * 1024)
+
 /*
  * This structure contains generic data for  IOMMU protection domains
  * independent of their use.
@@ -209,6 +211,24 @@ struct protection_domain {
 	void *priv;		/* private data */
 };
 
+/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+	/* address allocation bitmap */
+	unsigned long *bitmap;
+
+	/*
+	 * Array of PTE pages for the aperture. In this array we save all the
+	 * leaf pages of the domain page table used for the aperture. This way
+	 * we don't need to walk the page table to find a specific PTE. We can
+	 * just calculate its address in constant time.
+	 */
+	u64 *pte_pages[64];
+};
+
 /*
  * Data container for a dma_ops specific protection domain
  */
@@ -224,16 +244,8 @@ struct dma_ops_domain {
 	/* address we start to search for free addresses */
 	unsigned long next_bit;
 
-	/* address allocation bitmap */
-	unsigned long *bitmap;
-
-	/*
-	 * Array of PTE pages for the aperture. In this array we save all the
-	 * leaf pages of the domain page table used for the aperture. This way
-	 * we don't need to walk the page table to find a specific PTE. We can
-	 * just calculate its address in constant time.
-	 */
-	u64 **pte_pages;
+	/* address space relevant data */
+	struct aperture_range aperture;
 
 	/* This will be set to true when TLB needs to be flushed */
 	bool need_flush;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a97db99dad52f..62acd09cd19f5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -595,7 +595,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 		 * as allocated in the aperture
 		 */
 		if (addr < dma_dom->aperture_size)
-			__set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+			__set_bit(addr >> PAGE_SHIFT,
+				  dma_dom->aperture.bitmap);
 	}
 
 	return 0;
@@ -656,11 +657,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 		dom->need_flush = true;
 	}
 
-	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-				   0 , boundary_size, align_mask);
+	address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit,
+				   pages, 0 , boundary_size, align_mask);
 	if (address == -1) {
-		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-				0, boundary_size, align_mask);
+		address = iommu_area_alloc(dom->aperture.bitmap, limit, 0,
+					   pages, 0, boundary_size,
+					   align_mask);
 		dom->need_flush = true;
 	}
 
@@ -685,7 +687,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned int pages)
 {
 	address >>= PAGE_SHIFT;
-	iommu_area_free(dom->bitmap, address, pages);
+	iommu_area_free(dom->aperture.bitmap, address, pages);
 
 	if (address >= dom->next_bit)
 		dom->need_flush = true;
@@ -741,7 +743,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 	if (start_page + pages > last_page)
 		pages = last_page - start_page;
 
-	iommu_area_reserve(dom->bitmap, start_page, pages);
+	iommu_area_reserve(dom->aperture.bitmap, start_page, pages);
 }
 
 static void free_pagetable(struct protection_domain *domain)
@@ -785,9 +787,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
 
 	free_pagetable(&dom->domain);
 
-	kfree(dom->pte_pages);
-
-	kfree(dom->bitmap);
+	free_page((unsigned long)dom->aperture.bitmap);
 
 	kfree(dom);
 }
@@ -826,16 +826,15 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.priv = dma_dom;
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
-	dma_dom->aperture_size = (1ULL << order);
-	dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
-				  GFP_KERNEL);
-	if (!dma_dom->bitmap)
+	dma_dom->aperture_size = APERTURE_RANGE_SIZE;
+	dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!dma_dom->aperture.bitmap)
 		goto free_dma_dom;
 	/*
 	 * mark the first page as allocated so we never return 0 as
 	 * a valid dma-address. So we can use 0 as error value
 	 */
-	dma_dom->bitmap[0] = 1;
+	dma_dom->aperture.bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
 	dma_dom->need_flush = false;
@@ -854,13 +853,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	/*
 	 * At the last step, build the page tables so we don't need to
 	 * allocate page table pages in the dma_ops mapping/unmapping
-	 * path.
+	 * path for the first 128MB of dma address space.
 	 */
 	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
-	dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
-			GFP_KERNEL);
-	if (!dma_dom->pte_pages)
-		goto free_dma_dom;
 
 	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
 	if (l2_pde == NULL)
@@ -869,10 +864,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
 
 	for (i = 0; i < num_pte_pages; ++i) {
-		dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!dma_dom->pte_pages[i])
+		u64 **pte_page = &dma_dom->aperture.pte_pages[i];
+		*pte_page = (u64 *)get_zeroed_page(GFP_KERNEL);
+		if (!*pte_page)
 			goto free_dma_dom;
-		address = virt_to_phys(dma_dom->pte_pages[i]);
+		address = virt_to_phys(*pte_page);
 		l2_pde[i] = IOMMU_L1_PDE(address);
 	}
 
@@ -1159,7 +1155,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 
 	paddr &= PAGE_MASK;
 
-	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
 	pte += IOMMU_PTE_L0_INDEX(address);
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
@@ -1192,7 +1188,7 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 
 	WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
 
-	pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
 	pte += IOMMU_PTE_L0_INDEX(address);
 
 	WARN_ON(!*pte);
-- 
GitLab


From 8bda3092bcfa68f786d94549ae026e8db1eff041 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 12:02:46 +0200
Subject: [PATCH 1404/2198] amd-iommu: move page table allocation code to
 seperate function

This patch makes page table allocation usable for dma_ops code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 86 ++++++++++++++++++++++++++-----------
 1 file changed, 61 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 62acd09cd19f5..ded79f7747c54 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -55,7 +55,9 @@ struct iommu_cmd {
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 			     struct unity_map_entry *e);
 static struct dma_ops_domain *find_protection_domain(u16 devid);
-
+static u64* alloc_pte(struct protection_domain *dom,
+		      unsigned long address, u64
+		      **pte_page, gfp_t gfp);
 
 #ifdef CONFIG_AMD_IOMMU_STATS
 
@@ -468,7 +470,7 @@ static int iommu_map_page(struct protection_domain *dom,
 			  unsigned long phys_addr,
 			  int prot)
 {
-	u64 __pte, *pte, *page;
+	u64 __pte, *pte;
 
 	bus_addr  = PAGE_ALIGN(bus_addr);
 	phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +479,7 @@ static int iommu_map_page(struct protection_domain *dom,
 	if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
 		return -EINVAL;
 
-	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
-	if (!IOMMU_PTE_PRESENT(*pte)) {
-		page = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-		*pte = IOMMU_L2_PDE(virt_to_phys(page));
-	}
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
-	if (!IOMMU_PTE_PRESENT(*pte)) {
-		page = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!page)
-			return -ENOMEM;
-		*pte = IOMMU_L1_PDE(virt_to_phys(page));
-	}
-
-	pte = IOMMU_PTE_PAGE(*pte);
-	pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+	pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
 
 	if (IOMMU_PTE_PRESENT(*pte))
 		return -EBUSY;
@@ -1139,6 +1121,61 @@ static int get_device_resources(struct device *dev,
 	return 1;
 }
 
+/*
+ * If the pte_page is not yet allocated this function is called
+ */
+static u64* alloc_pte(struct protection_domain *dom,
+		      unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+	u64 *pte, *page;
+
+	pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {
+		page = (u64 *)get_zeroed_page(gfp);
+		if (!page)
+			return NULL;
+		*pte = IOMMU_L2_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte)) {
+		page = (u64 *)get_zeroed_page(gfp);
+		if (!page)
+			return NULL;
+		*pte = IOMMU_L1_PDE(virt_to_phys(page));
+	}
+
+	pte = IOMMU_PTE_PAGE(*pte);
+
+	if (pte_page)
+		*pte_page = pte;
+
+	pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+	return pte;
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture
+ */
+static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
+			    unsigned long address)
+{
+	struct aperture_range *aperture = &dom->aperture;
+	u64 *pte, *pte_page;
+
+	pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	if (!pte) {
+		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+		aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page;
+	}
+
+	return pte;
+}
+
 /*
  * This is the generic map function. It maps one 4kb page at paddr to
  * the given address in the DMA address space for the domain.
@@ -1155,8 +1192,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 
 	paddr &= PAGE_MASK;
 
-	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
-	pte += IOMMU_PTE_L0_INDEX(address);
+	pte  = dma_ops_get_pte(dom, address);
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
-- 
GitLab


From 53812c115cda1f660b286c939669154a56976f6b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 12 May 2009 12:17:38 +0200
Subject: [PATCH 1405/2198] amd-iommu: handle page table allocation failures in
 dma_ops code

The code will be required when the aperture size increases dynamically
in the extended address allocator.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index ded79f7747c54..a467addb44b7b 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1193,6 +1193,8 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 	paddr &= PAGE_MASK;
 
 	pte  = dma_ops_get_pte(dom, address);
+	if (!pte)
+		return bad_dma_address;
 
 	__pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
@@ -1248,7 +1250,7 @@ static dma_addr_t __map_single(struct device *dev,
 			       u64 dma_mask)
 {
 	dma_addr_t offset = paddr & ~PAGE_MASK;
-	dma_addr_t address, start;
+	dma_addr_t address, start, ret;
 	unsigned int pages;
 	unsigned long align_mask = 0;
 	int i;
@@ -1271,7 +1273,10 @@ static dma_addr_t __map_single(struct device *dev,
 
 	start = address;
 	for (i = 0; i < pages; ++i) {
-		dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+		ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+		if (ret == bad_dma_address)
+			goto out_unmap;
+
 		paddr += PAGE_SIZE;
 		start += PAGE_SIZE;
 	}
@@ -1287,6 +1292,17 @@ static dma_addr_t __map_single(struct device *dev,
 
 out:
 	return address;
+
+out_unmap:
+
+	for (--i; i >= 0; --i) {
+		start -= PAGE_SIZE;
+		dma_ops_domain_unmap(iommu, dma_dom, start);
+	}
+
+	dma_ops_free_addresses(dma_dom, address, pages);
+
+	return bad_dma_address;
 }
 
 /*
-- 
GitLab


From 384de72910a7bf96a02a6d8023fe9e16d872beb2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 15 May 2009 12:30:05 +0200
Subject: [PATCH 1406/2198] amd-iommu: make address allocator aware of multiple
 aperture ranges

This patch changes the AMD IOMMU address allocator to allow up to 32
aperture ranges per dma_ops domain.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h |  11 +-
 arch/x86/kernel/amd_iommu.c            | 138 ++++++++++++++++++-------
 2 files changed, 110 insertions(+), 39 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 4c64c9bc68395..eca912931a85a 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -195,7 +195,12 @@
 #define PD_DEFAULT_MASK		(1UL << 1) /* domain is a default dma_ops
 					      domain for an IOMMU */
 
-#define APERTURE_RANGE_SIZE	(128 * 1024 * 1024)
+#define APERTURE_RANGE_SHIFT	27	/* 128 MB */
+#define APERTURE_RANGE_SIZE	(1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES	(APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES	32	/* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a)	((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a)	(((a) >> 21) & 0x3fULL)
 
 /*
  * This structure contains generic data for  IOMMU protection domains
@@ -227,6 +232,8 @@ struct aperture_range {
 	 * just calculate its address in constant time.
 	 */
 	u64 *pte_pages[64];
+
+	unsigned long offset;
 };
 
 /*
@@ -245,7 +252,7 @@ struct dma_ops_domain {
 	unsigned long next_bit;
 
 	/* address space relevant data */
-	struct aperture_range aperture;
+	struct aperture_range *aperture[APERTURE_MAX_RANGES];
 
 	/* This will be set to true when TLB needs to be flushed */
 	bool need_flush;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index a467addb44b7b..794163ae97b49 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -578,7 +578,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
 		 */
 		if (addr < dma_dom->aperture_size)
 			__set_bit(addr >> PAGE_SHIFT,
-				  dma_dom->aperture.bitmap);
+				  dma_dom->aperture[0]->bitmap);
 	}
 
 	return 0;
@@ -615,43 +615,74 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  ****************************************************************************/
 
 /*
- * The address allocator core function.
+ * The address allocator core functions.
  *
  * called with domain->lock held
  */
+
+static unsigned long dma_ops_area_alloc(struct device *dev,
+					struct dma_ops_domain *dom,
+					unsigned int pages,
+					unsigned long align_mask,
+					u64 dma_mask,
+					unsigned long start)
+{
+	unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES;
+	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+	int i = start >> APERTURE_RANGE_SHIFT;
+	unsigned long boundary_size;
+	unsigned long address = -1;
+	unsigned long limit;
+
+	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+			PAGE_SIZE) >> PAGE_SHIFT;
+
+	for (;i < max_index; ++i) {
+		unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+		if (dom->aperture[i]->offset >= dma_mask)
+			break;
+
+		limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+					       dma_mask >> PAGE_SHIFT);
+
+		address = iommu_area_alloc(dom->aperture[i]->bitmap,
+					   limit, next_bit, pages, 0,
+					    boundary_size, align_mask);
+		if (address != -1) {
+			address = dom->aperture[i]->offset +
+				  (address << PAGE_SHIFT);
+			dom->next_bit = (address >> PAGE_SHIFT) + pages;
+			break;
+		}
+
+		next_bit = 0;
+	}
+
+	return address;
+}
+
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
 					     unsigned int pages,
 					     unsigned long align_mask,
 					     u64 dma_mask)
 {
-	unsigned long limit;
 	unsigned long address;
-	unsigned long boundary_size;
+	unsigned long start = dom->next_bit << PAGE_SHIFT;
 
-	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-			PAGE_SIZE) >> PAGE_SHIFT;
-	limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
-				       dma_mask >> PAGE_SHIFT);
 
-	if (dom->next_bit >= limit) {
-		dom->next_bit = 0;
-		dom->need_flush = true;
-	}
+	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+				     dma_mask, start);
 
-	address = iommu_area_alloc(dom->aperture.bitmap, limit, dom->next_bit,
-				   pages, 0 , boundary_size, align_mask);
 	if (address == -1) {
-		address = iommu_area_alloc(dom->aperture.bitmap, limit, 0,
-					   pages, 0, boundary_size,
-					   align_mask);
+		dom->next_bit = 0;
+		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+					     dma_mask, 0);
 		dom->need_flush = true;
 	}
 
-	if (likely(address != -1)) {
-		dom->next_bit = address + pages;
-		address <<= PAGE_SHIFT;
-	} else
+	if (unlikely(address == -1))
 		address = bad_dma_address;
 
 	WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -668,11 +699,17 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 				   unsigned long address,
 				   unsigned int pages)
 {
-	address >>= PAGE_SHIFT;
-	iommu_area_free(dom->aperture.bitmap, address, pages);
+	unsigned i = address >> APERTURE_RANGE_SHIFT;
+	struct aperture_range *range = dom->aperture[i];
+
+	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
-	if (address >= dom->next_bit)
+	if ((address >> PAGE_SHIFT) >= dom->next_bit)
 		dom->need_flush = true;
+
+	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+	iommu_area_free(range->bitmap, address, pages);
+
 }
 
 /****************************************************************************
@@ -720,12 +757,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 				      unsigned long start_page,
 				      unsigned int pages)
 {
-	unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+	unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
 
 	if (start_page + pages > last_page)
 		pages = last_page - start_page;
 
-	iommu_area_reserve(dom->aperture.bitmap, start_page, pages);
+	for (i = start_page; i < start_page + pages; ++i) {
+		int index = i / APERTURE_RANGE_PAGES;
+		int page  = i % APERTURE_RANGE_PAGES;
+		__set_bit(page, dom->aperture[index]->bitmap);
+	}
 }
 
 static void free_pagetable(struct protection_domain *domain)
@@ -764,12 +805,19 @@ static void free_pagetable(struct protection_domain *domain)
  */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
+	int i;
+
 	if (!dom)
 		return;
 
 	free_pagetable(&dom->domain);
 
-	free_page((unsigned long)dom->aperture.bitmap);
+	for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+		if (!dom->aperture[i])
+			continue;
+		free_page((unsigned long)dom->aperture[i]->bitmap);
+		kfree(dom->aperture[i]);
+	}
 
 	kfree(dom);
 }
@@ -797,6 +845,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	if (!dma_dom)
 		return NULL;
 
+	dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range),
+				       GFP_KERNEL);
+	if (!dma_dom->aperture[0])
+		goto free_dma_dom;
+
 	spin_lock_init(&dma_dom->domain.lock);
 
 	dma_dom->domain.id = domain_id_alloc();
@@ -809,14 +862,14 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
 	dma_dom->aperture_size = APERTURE_RANGE_SIZE;
-	dma_dom->aperture.bitmap = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!dma_dom->aperture.bitmap)
+	dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!dma_dom->aperture[0]->bitmap)
 		goto free_dma_dom;
 	/*
 	 * mark the first page as allocated so we never return 0 as
 	 * a valid dma-address. So we can use 0 as error value
 	 */
-	dma_dom->aperture.bitmap[0] = 1;
+	dma_dom->aperture[0]->bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
 	dma_dom->need_flush = false;
@@ -846,7 +899,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
 
 	for (i = 0; i < num_pte_pages; ++i) {
-		u64 **pte_page = &dma_dom->aperture.pte_pages[i];
+		u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i];
 		*pte_page = (u64 *)get_zeroed_page(GFP_KERNEL);
 		if (!*pte_page)
 			goto free_dma_dom;
@@ -1164,14 +1217,19 @@ static u64* alloc_pte(struct protection_domain *dom,
 static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
 			    unsigned long address)
 {
-	struct aperture_range *aperture = &dom->aperture;
+	struct aperture_range *aperture;
 	u64 *pte, *pte_page;
 
-	pte = aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)];
+	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+	if (!aperture)
+		return NULL;
+
+	pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
 	if (!pte) {
 		pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
-		aperture->pte_pages[IOMMU_PTE_L1_INDEX(address)] = pte_page;
-	}
+		aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+	} else
+		pte += IOMMU_PTE_L0_INDEX(address);
 
 	return pte;
 }
@@ -1219,14 +1277,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
 				 struct dma_ops_domain *dom,
 				 unsigned long address)
 {
+	struct aperture_range *aperture;
 	u64 *pte;
 
 	if (address >= dom->aperture_size)
 		return;
 
-	WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+	aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+	if (!aperture)
+		return;
+
+	pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+	if (!pte)
+		return;
 
-	pte  = dom->aperture.pte_pages[IOMMU_PTE_L1_INDEX(address)];
 	pte += IOMMU_PTE_L0_INDEX(address);
 
 	WARN_ON(!*pte);
-- 
GitLab


From 803b8cb4d9a93b90c67aba2aab7f2c54d595b5b9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 18 May 2009 15:32:48 +0200
Subject: [PATCH 1407/2198] amd-iommu: change dma_dom->next_bit to
 dma_dom->next_address

Simplify the code a little bit by using the same unit for all address
space related state in the dma_ops domain structure.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/include/asm/amd_iommu_types.h |  2 +-
 arch/x86/kernel/amd_iommu.c            | 17 +++++++++--------
 2 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index eca912931a85a..4ff4cf1f08098 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -249,7 +249,7 @@ struct dma_ops_domain {
 	unsigned long aperture_size;
 
 	/* address we start to search for free addresses */
-	unsigned long next_bit;
+	unsigned long next_address;
 
 	/* address space relevant data */
 	struct aperture_range *aperture[APERTURE_MAX_RANGES];
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 794163ae97b49..c1a08b9119c9d 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -627,13 +627,15 @@ static unsigned long dma_ops_area_alloc(struct device *dev,
 					u64 dma_mask,
 					unsigned long start)
 {
-	unsigned long next_bit = dom->next_bit % APERTURE_RANGE_PAGES;
+	unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
 	int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
 	int i = start >> APERTURE_RANGE_SHIFT;
 	unsigned long boundary_size;
 	unsigned long address = -1;
 	unsigned long limit;
 
+	next_bit >>= PAGE_SHIFT;
+
 	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
 			PAGE_SIZE) >> PAGE_SHIFT;
 
@@ -652,7 +654,7 @@ static unsigned long dma_ops_area_alloc(struct device *dev,
 		if (address != -1) {
 			address = dom->aperture[i]->offset +
 				  (address << PAGE_SHIFT);
-			dom->next_bit = (address >> PAGE_SHIFT) + pages;
+			dom->next_address = address + (pages << PAGE_SHIFT);
 			break;
 		}
 
@@ -669,14 +671,12 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     u64 dma_mask)
 {
 	unsigned long address;
-	unsigned long start = dom->next_bit << PAGE_SHIFT;
-
 
 	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
-				     dma_mask, start);
+				     dma_mask, dom->next_address);
 
 	if (address == -1) {
-		dom->next_bit = 0;
+		dom->next_address = 0;
 		address = dma_ops_area_alloc(dev, dom, pages, align_mask,
 					     dma_mask, 0);
 		dom->need_flush = true;
@@ -704,10 +704,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 
 	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
-	if ((address >> PAGE_SHIFT) >= dom->next_bit)
+	if (address >= dom->next_address)
 		dom->need_flush = true;
 
 	address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
 	iommu_area_free(range->bitmap, address, pages);
 
 }
@@ -870,7 +871,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	 * a valid dma-address. So we can use 0 as error value
 	 */
 	dma_dom->aperture[0]->bitmap[0] = 1;
-	dma_dom->next_bit = 0;
+	dma_dom->next_address = 0;
 
 	dma_dom->need_flush = false;
 	dma_dom->target_dev = 0xffff;
-- 
GitLab


From 9cabe89b99773e682538a8809abc7d4000c77083 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 18 May 2009 16:38:55 +0200
Subject: [PATCH 1408/2198] amd-iommu: move aperture_range allocation code to
 seperate function

This patch prepares the dynamic increasement of dma_ops domain
apertures.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 95 +++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index c1a08b9119c9d..8ff02ee69e865 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -620,6 +620,59 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * called with domain->lock held
  */
 
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ */
+static int alloc_new_range(struct dma_ops_domain *dma_dom,
+			   bool populate, gfp_t gfp)
+{
+	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+
+	if (index >= APERTURE_MAX_RANGES)
+		return -ENOMEM;
+
+	dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+	if (!dma_dom->aperture[index])
+		return -ENOMEM;
+
+	dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+	if (!dma_dom->aperture[index]->bitmap)
+		goto out_free;
+
+	dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+	if (populate) {
+		unsigned long address = dma_dom->aperture_size;
+		int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+		u64 *pte, *pte_page;
+
+		for (i = 0; i < num_ptes; ++i) {
+			pte = alloc_pte(&dma_dom->domain, address,
+					&pte_page, gfp);
+			if (!pte)
+				goto out_free;
+
+			dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+			address += APERTURE_RANGE_SIZE / 64;
+		}
+	}
+
+	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+	return 0;
+
+out_free:
+	free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+	kfree(dma_dom->aperture[index]);
+	dma_dom->aperture[index] = NULL;
+
+	return -ENOMEM;
+}
+
 static unsigned long dma_ops_area_alloc(struct device *dev,
 					struct dma_ops_domain *dom,
 					unsigned int pages,
@@ -832,9 +885,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 						   unsigned order)
 {
 	struct dma_ops_domain *dma_dom;
-	unsigned i, num_pte_pages;
-	u64 *l2_pde;
-	u64 address;
 
 	/*
 	 * Currently the DMA aperture must be between 32 MB and 1GB in size
@@ -846,11 +896,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	if (!dma_dom)
 		return NULL;
 
-	dma_dom->aperture[0] = kzalloc(sizeof(struct aperture_range),
-				       GFP_KERNEL);
-	if (!dma_dom->aperture[0])
-		goto free_dma_dom;
-
 	spin_lock_init(&dma_dom->domain.lock);
 
 	dma_dom->domain.id = domain_id_alloc();
@@ -862,10 +907,13 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->domain.priv = dma_dom;
 	if (!dma_dom->domain.pt_root)
 		goto free_dma_dom;
-	dma_dom->aperture_size = APERTURE_RANGE_SIZE;
-	dma_dom->aperture[0]->bitmap = (void *)get_zeroed_page(GFP_KERNEL);
-	if (!dma_dom->aperture[0]->bitmap)
+
+	dma_dom->need_flush = false;
+	dma_dom->target_dev = 0xffff;
+
+	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
 		goto free_dma_dom;
+
 	/*
 	 * mark the first page as allocated so we never return 0 as
 	 * a valid dma-address. So we can use 0 as error value
@@ -873,9 +921,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->aperture[0]->bitmap[0] = 1;
 	dma_dom->next_address = 0;
 
-	dma_dom->need_flush = false;
-	dma_dom->target_dev = 0xffff;
-
 	/* Intialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
 	    iommu->exclusion_start < dma_dom->aperture_size) {
@@ -886,28 +931,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 	}
 
-	/*
-	 * At the last step, build the page tables so we don't need to
-	 * allocate page table pages in the dma_ops mapping/unmapping
-	 * path for the first 128MB of dma address space.
-	 */
-	num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
-
-	l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
-	if (l2_pde == NULL)
-		goto free_dma_dom;
-
-	dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
-	for (i = 0; i < num_pte_pages; ++i) {
-		u64 **pte_page = &dma_dom->aperture[0]->pte_pages[i];
-		*pte_page = (u64 *)get_zeroed_page(GFP_KERNEL);
-		if (!*pte_page)
-			goto free_dma_dom;
-		address = virt_to_phys(*pte_page);
-		l2_pde[i] = IOMMU_L1_PDE(address);
-	}
-
 	return dma_dom;
 
 free_dma_dom:
-- 
GitLab


From 00cd122ae5e5e7c60cce2af3c35b190d4c3f2d0d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 09:52:40 +0200
Subject: [PATCH 1409/2198] amd-iommu: handle exlusion ranges and unity
 mappings in alloc_new_range

This patch makes sure no reserved addresses are allocated in an dma_ops
domain when the aperture is increased dynamically.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 71 +++++++++++++++++++++++++++++++------
 1 file changed, 60 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8ff02ee69e865..59ee1b94a7ceb 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -58,6 +58,9 @@ static struct dma_ops_domain *find_protection_domain(u16 devid);
 static u64* alloc_pte(struct protection_domain *dom,
 		      unsigned long address, u64
 		      **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+				      unsigned long start_page,
+				      unsigned int pages);
 
 #ifdef CONFIG_AMD_IOMMU_STATS
 
@@ -620,15 +623,43 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * called with domain->lock held
  */
 
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+		      unsigned long address)
+{
+	u64 *pte;
+
+	pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return NULL;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+	if (!IOMMU_PTE_PRESENT(*pte))
+		return NULL;
+
+	pte = IOMMU_PTE_PAGE(*pte);
+	pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+	return pte;
+}
+
 /*
  * This function is used to add a new aperture range to an existing
  * aperture in case of dma_ops domain allocation or address allocation
  * failure.
  */
-static int alloc_new_range(struct dma_ops_domain *dma_dom,
+static int alloc_new_range(struct amd_iommu *iommu,
+			   struct dma_ops_domain *dma_dom,
 			   bool populate, gfp_t gfp)
 {
 	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+	int i;
 
 	if (index >= APERTURE_MAX_RANGES)
 		return -ENOMEM;
@@ -662,6 +693,33 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
 
 	dma_dom->aperture_size += APERTURE_RANGE_SIZE;
 
+	/* Intialize the exclusion range if necessary */
+	if (iommu->exclusion_start &&
+	    iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+	    iommu->exclusion_start < dma_dom->aperture_size) {
+		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+		int pages = iommu_num_pages(iommu->exclusion_start,
+					    iommu->exclusion_length,
+					    PAGE_SIZE);
+		dma_ops_reserve_addresses(dma_dom, startpage, pages);
+	}
+
+	/*
+	 * Check for areas already mapped as present in the new aperture
+	 * range and mark those pages as reserved in the allocator. Such
+	 * mappings may already exist as a result of requested unity
+	 * mappings for devices.
+	 */
+	for (i = dma_dom->aperture[index]->offset;
+	     i < dma_dom->aperture_size;
+	     i += PAGE_SIZE) {
+		u64 *pte = fetch_pte(&dma_dom->domain, i);
+		if (!pte || !IOMMU_PTE_PRESENT(*pte))
+			continue;
+
+		dma_ops_reserve_addresses(dma_dom, i << PAGE_SHIFT, 1);
+	}
+
 	return 0;
 
 out_free:
@@ -911,7 +969,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->need_flush = false;
 	dma_dom->target_dev = 0xffff;
 
-	if (alloc_new_range(dma_dom, true, GFP_KERNEL))
+	if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
 		goto free_dma_dom;
 
 	/*
@@ -921,15 +979,6 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->aperture[0]->bitmap[0] = 1;
 	dma_dom->next_address = 0;
 
-	/* Intialize the exclusion range if necessary */
-	if (iommu->exclusion_start &&
-	    iommu->exclusion_start < dma_dom->aperture_size) {
-		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
-		int pages = iommu_num_pages(iommu->exclusion_start,
-					    iommu->exclusion_length,
-					    PAGE_SIZE);
-		dma_ops_reserve_addresses(dma_dom, startpage, pages);
-	}
 
 	return dma_dom;
 
-- 
GitLab


From 11b83888ae729457b5cfb936dbd498481f6408df Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 10:23:15 +0200
Subject: [PATCH 1410/2198] amd-iommu: enlarge the aperture dynamically

By dynamically increasing the aperture the extended allocator is now
ready for use.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 20 ++++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 59ee1b94a7ceb..d129d8feba073 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1403,10 +1403,26 @@ static dma_addr_t __map_single(struct device *dev,
 	if (align)
 		align_mask = (1UL << get_order(size)) - 1;
 
+retry:
 	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
 					  dma_mask);
-	if (unlikely(address == bad_dma_address))
-		goto out;
+	if (unlikely(address == bad_dma_address)) {
+		/*
+		 * setting next_address here will let the address
+		 * allocator only scan the new allocated range in the
+		 * first run. This is a small optimization.
+		 */
+		dma_dom->next_address = dma_dom->aperture_size;
+
+		if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+			goto out;
+
+		/*
+		 * aperture was sucessfully enlarged by 128 MB, try
+		 * allocation again
+		 */
+		goto retry;
+	}
 
 	start = address;
 	for (i = 0; i < pages; ++i) {
-- 
GitLab


From d9cfed925448f097ec7faab80d903eb7e5f99712 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 19 May 2009 12:16:29 +0200
Subject: [PATCH 1411/2198] amd-iommu: remove amd_iommu_size kernel parameter

This parameter is not longer necessary when aperture increases
dynamically.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/kernel-parameters.txt |  5 -----
 arch/x86/kernel/amd_iommu.c         | 18 ++++--------------
 arch/x86/kernel/amd_iommu_init.c    | 15 ---------------
 3 files changed, 4 insertions(+), 34 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75e..5b776c6e7964c 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -329,11 +329,6 @@ and is between 256 and 4096 characters. It is defined in the file
 				    flushed before they will be reused, which
 				    is a lot of faster
 
-	amd_iommu_size= [HW,X86-64]
-			Define the size of the aperture for the AMD IOMMU
-			driver. Possible values are:
-			'32M', '64M' (default), '128M', '256M', '512M', '1G'
-
 	amijoy.map=	[HW,JOY] Amiga joystick support
 			Map of devices attached to JOY0DAT and JOY1DAT
 			Format: <a>,<b>
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d129d8feba073..31d56c36010ac 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -939,17 +939,10 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
  * It also intializes the page table and the address allocator data
  * structures required for the dma_ops interface
  */
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
-						   unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
 {
 	struct dma_ops_domain *dma_dom;
 
-	/*
-	 * Currently the DMA aperture must be between 32 MB and 1GB in size
-	 */
-	if ((order < 25) || (order > 30))
-		return NULL;
-
 	dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
 	if (!dma_dom)
 		return NULL;
@@ -1087,7 +1080,6 @@ static int device_change_notifier(struct notifier_block *nb,
 	struct protection_domain *domain;
 	struct dma_ops_domain *dma_domain;
 	struct amd_iommu *iommu;
-	int order = amd_iommu_aperture_order;
 	unsigned long flags;
 
 	if (devid > amd_iommu_last_bdf)
@@ -1126,7 +1118,7 @@ static int device_change_notifier(struct notifier_block *nb,
 		dma_domain = find_protection_domain(devid);
 		if (dma_domain)
 			goto out;
-		dma_domain = dma_ops_domain_alloc(iommu, order);
+		dma_domain = dma_ops_domain_alloc(iommu);
 		if (!dma_domain)
 			goto out;
 		dma_domain->target_dev = devid;
@@ -1826,7 +1818,6 @@ static void prealloc_protection_domains(void)
 	struct pci_dev *dev = NULL;
 	struct dma_ops_domain *dma_dom;
 	struct amd_iommu *iommu;
-	int order = amd_iommu_aperture_order;
 	u16 devid;
 
 	while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1839,7 +1830,7 @@ static void prealloc_protection_domains(void)
 		iommu = amd_iommu_rlookup_table[devid];
 		if (!iommu)
 			continue;
-		dma_dom = dma_ops_domain_alloc(iommu, order);
+		dma_dom = dma_ops_domain_alloc(iommu);
 		if (!dma_dom)
 			continue;
 		init_unity_mappings_for_device(dma_dom, devid);
@@ -1865,7 +1856,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
 int __init amd_iommu_init_dma_ops(void)
 {
 	struct amd_iommu *iommu;
-	int order = amd_iommu_aperture_order;
 	int ret;
 
 	/*
@@ -1874,7 +1864,7 @@ int __init amd_iommu_init_dma_ops(void)
 	 * protection domain will be assigned to the default one.
 	 */
 	list_for_each_entry(iommu, &amd_iommu_list, list) {
-		iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+		iommu->default_dom = dma_ops_domain_alloc(iommu);
 		if (iommu->default_dom == NULL)
 			return -ENOMEM;
 		iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 8c0be0902dacb..762a4eefec930 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -121,7 +121,6 @@ u16 amd_iommu_last_bdf;			/* largest PCI device id we have
 					   to handle */
 LIST_HEAD(amd_iommu_unity_map);		/* a list of required unity mappings
 					   we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
 bool amd_iommu_isolate = true;		/* if true, device isolation is
 					   enabled */
 bool amd_iommu_unmap_flush;		/* if true, flush on every unmap */
@@ -1137,9 +1136,6 @@ int __init amd_iommu_init(void)
 
 	enable_iommus();
 
-	printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
-			(1 << (amd_iommu_aperture_order-20)));
-
 	printk(KERN_INFO "AMD IOMMU: device isolation ");
 	if (amd_iommu_isolate)
 		printk("enabled\n");
@@ -1225,15 +1221,4 @@ static int __init parse_amd_iommu_options(char *str)
 	return 1;
 }
 
-static int __init parse_amd_iommu_size_options(char *str)
-{
-	unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
-	if ((order > 24) && (order < 31))
-		amd_iommu_aperture_order = order;
-
-	return 1;
-}
-
 __setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
-- 
GitLab


From fe16f088a88fb73161bba8784375c829f7e87b54 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:27:53 +0200
Subject: [PATCH 1412/2198] amd-iommu: disable round-robin allocator for
 CONFIG_IOMMU_STRESS

Disabling the round-robin allocator results in reusing the same
dma-addresses again very fast. This is a good test if the iotlb flushing
is working correctly.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 31d56c36010ac..543822b39a846 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -783,6 +783,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
 {
 	unsigned long address;
 
+#ifdef CONFIG_IOMMU_STRESS
+	dom->next_address = 0;
+	dom->need_flush = true;
+#endif
+
 	address = dma_ops_area_alloc(dev, dom, pages, align_mask,
 				     dma_mask, dom->next_address);
 
-- 
GitLab


From f5e9705c6429d24dee832b2edd7f4848d432ea03 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:31:53 +0200
Subject: [PATCH 1413/2198] amd-iommu: don't preallocate page tables with
 CONFIG_IOMMU_STRESS

This forces testing of on-demand page table allocation code.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 543822b39a846..33434c497a6a0 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -661,6 +661,10 @@ static int alloc_new_range(struct amd_iommu *iommu,
 	int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
 	int i;
 
+#ifdef CONFIG_IOMMU_STRESS
+	populate = false;
+#endif
+
 	if (index >= APERTURE_MAX_RANGES)
 		return -ENOMEM;
 
-- 
GitLab


From 47bccd6bb2b866449d3ecf2ba350ac1c7473b2b8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 12:40:54 +0200
Subject: [PATCH 1414/2198] amd-iommu: don't free dma adresses below 512MB with
 CONFIG_IOMMU_STRESS

This will test the automatic aperture enlargement code. This is
important because only very few devices will ever trigger this code
path. So force it under CONFIG_IOMMU_STRESS.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 33434c497a6a0..04ff5ec4ac0ed 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -824,6 +824,11 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
 
 	BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
 
+#ifdef CONFIG_IOMMU_STRESS
+	if (i < 4)
+		return;
+#endif
+
 	if (address >= dom->next_address)
 		dom->need_flush = true;
 
-- 
GitLab


From 1bf4072da67c14d6b02cfeef02212aa5a6211df2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: [PATCH 1415/2198] cifs: reorganize get_cifs_acl

Thus spake Christoph:

"But this whole set_cifs_acl function is a real mess anyway and needs
some splitting up."

With this change too, it's possible to call acl_to_uid_mode() with a
NULL inode pointer. That (or something close to it) will eventually be
necessary when cifs_get_inode_info is reorganized.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c   | 100 +++++++++++++++++++++++---------------------
 fs/cifs/cifsproto.h |   4 +-
 fs/cifs/inode.c     |   2 +-
 3 files changed, 55 insertions(+), 51 deletions(-)

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 57ecdc83c26f3..7f8e6c46d1165 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -552,67 +552,66 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
 	return rc;
 }
 
-
-/* Retrieve an ACL from the server */
-static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
-				       const char *path, const __u16 *pfid)
+static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+		__u16 fid, u32 *pacllen)
 {
-	struct cifsFileInfo *open_file = NULL;
-	bool unlock_file = false;
-	int xid;
-	int rc = -EIO;
-	__u16 fid;
-	struct super_block *sb;
-	struct cifs_sb_info *cifs_sb;
 	struct cifs_ntsd *pntsd = NULL;
+	int xid, rc;
 
-	cFYI(1, ("get mode from ACL for %s", path));
+	xid = GetXid();
+	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+	FreeXid(xid);
 
-	if (inode == NULL)
-		return NULL;
 
-	xid = GetXid();
-	if (pfid == NULL)
-		open_file = find_readable_file(CIFS_I(inode));
-	else
-		fid = *pfid;
+	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+	return pntsd;
+}
 
-	sb = inode->i_sb;
-	if (sb == NULL) {
-		FreeXid(xid);
-		return NULL;
-	}
-	cifs_sb = CIFS_SB(sb);
+static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
+		const char *path, u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	int oplock = 0;
+	int xid, rc;
+	__u16 fid;
 
-	if (open_file) {
-		unlock_file = true;
-		fid = open_file->netfid;
-	} else if (pfid == NULL) {
-		int oplock = 0;
-		/* open file */
-		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-				READ_CONTROL, 0, &fid, &oplock, NULL,
-				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != 0) {
-			cERROR(1, ("Unable to open file to get ACL"));
-			FreeXid(xid);
-			return NULL;
-		}
+	xid = GetXid();
+
+	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+			 &fid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc) {
+		cERROR(1, ("Unable to open file to get ACL"));
+		goto out;
 	}
 
 	rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
 	cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
-	if (unlock_file == true) /* find_readable_file increments ref count */
-		atomic_dec(&open_file->wrtPending);
-	else if (pfid == NULL) /* if opened above we have to close the handle */
-		CIFSSMBClose(xid, cifs_sb->tcon, fid);
-	/* else handle was passed in by caller */
 
+	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
 	FreeXid(xid);
 	return pntsd;
 }
 
+/* Retrieve an ACL from the server */
+static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+				      struct inode *inode, const char *path,
+				      u32 *pacllen)
+{
+	struct cifs_ntsd *pntsd = NULL;
+	struct cifsFileInfo *open_file = NULL;
+
+	if (inode)
+		open_file = find_readable_file(CIFS_I(inode));
+	if (!open_file)
+		return get_cifs_acl_by_path(cifs_sb, path, pacllen);
+
+	pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
+	atomic_dec(&open_file->wrtPending);
+	return pntsd;
+}
+
 /* Set an ACL on the server */
 static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 				struct inode *inode, const char *path)
@@ -668,14 +667,19 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
 }
 
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid)
+void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+		     const char *path, const __u16 *pfid)
 {
 	struct cifs_ntsd *pntsd = NULL;
 	u32 acllen = 0;
 	int rc = 0;
 
 	cFYI(DBG2, ("converting ACL to mode for %s", path));
-	pntsd = get_cifs_acl(&acllen, inode, path, pfid);
+
+	if (pfid)
+		pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
+	else
+		pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
 
 	/* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
 	if (pntsd)
@@ -698,7 +702,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
 	cFYI(DBG2, ("set ACL from mode for %s", path));
 
 	/* Get the security descriptor */
-	pntsd = get_cifs_acl(&secdesclen, inode, path, NULL);
+	pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
 
 	/* Add three ACEs for owner, group, everyone getting rid of
 	   other ACEs as chmod disables ACEs and set the security descriptor */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index d542cf1f69c31..f9452329bcce5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
 			const unsigned char *search_path,
 			struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *path,
-			    const __u16 *pfid);
+extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+			    const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 84b7bea736740..fad882b075ba5 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -626,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode,
 	/* fill in 0777 bits from ACL */
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
 		cFYI(1, ("Getting mode bits from ACL"));
-		acl_to_uid_mode(inode, full_path, pfid);
+		acl_to_uid_mode(cifs_sb, inode, full_path, pfid);
 	}
 #endif
 	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
-- 
GitLab


From b96d31a62f714566fa6420851b3bb3615c796322 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Wed, 27 May 2009 09:37:33 -0400
Subject: [PATCH 1416/2198] cifs: clean up set_cifs_acl interfaces

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Shirish Pargaonkar <shirishp@us.ibm.com>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifsacl.c | 78 +++++++++++++++++++++++++----------------------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index 7f8e6c46d1165..1403b5d86a739 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -612,57 +612,61 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
 	return pntsd;
 }
 
-/* Set an ACL on the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
-				struct inode *inode, const char *path)
+static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
+		struct cifs_ntsd *pnntsd, u32 acllen)
 {
-	struct cifsFileInfo *open_file;
-	bool unlock_file = false;
-	int xid;
-	int rc = -EIO;
-	__u16 fid;
-	struct super_block *sb;
-	struct cifs_sb_info *cifs_sb;
+	int xid, rc;
 
-	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+	xid = GetXid();
+	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+	FreeXid(xid);
 
-	if (!inode)
-		return rc;
+	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+	return rc;
+}
 
-	sb = inode->i_sb;
-	if (sb == NULL)
-		return rc;
+static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
+		struct cifs_ntsd *pnntsd, u32 acllen)
+{
+	int oplock = 0;
+	int xid, rc;
+	__u16 fid;
 
-	cifs_sb = CIFS_SB(sb);
 	xid = GetXid();
 
-	open_file = find_readable_file(CIFS_I(inode));
-	if (open_file) {
-		unlock_file = true;
-		fid = open_file->netfid;
-	} else {
-		int oplock = 0;
-		/* open file */
-		rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-				WRITE_DAC, 0, &fid, &oplock, NULL,
-				cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-					CIFS_MOUNT_MAP_SPECIAL_CHR);
-		if (rc != 0) {
-			cERROR(1, ("Unable to open file to set ACL"));
-			FreeXid(xid);
-			return rc;
-		}
+	rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+			 &fid, &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc) {
+		cERROR(1, ("Unable to open file to set ACL"));
+		goto out;
 	}
 
 	rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
 	cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
-	if (unlock_file)
-		atomic_dec(&open_file->wrtPending);
-	else
-		CIFSSMBClose(xid, cifs_sb->tcon, fid);
 
+	CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
 	FreeXid(xid);
+	return rc;
+}
 
+/* Set an ACL on the server */
+static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+				struct inode *inode, const char *path)
+{
+	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+	struct cifsFileInfo *open_file;
+	int rc;
+
+	cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+
+	open_file = find_readable_file(CIFS_I(inode));
+	if (!open_file)
+		return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
+
+	rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
+	atomic_dec(&open_file->wrtPending);
 	return rc;
 }
 
-- 
GitLab


From a2928c42a5d69328c3578b41bd4d72f6658cf7dc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:04 -0300
Subject: [PATCH 1417/2198] perf_counter tools: Move symbol resolution classes
 from report to libperf

Will be used by perf top as well.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175504.GC4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile         |   2 +
 Documentation/perf_counter/builtin-report.c | 458 +-------------------
 Documentation/perf_counter/util/symbol.c    | 421 ++++++++++++++++++
 Documentation/perf_counter/util/symbol.h    |  33 ++
 4 files changed, 471 insertions(+), 443 deletions(-)
 create mode 100644 Documentation/perf_counter/util/symbol.c
 create mode 100644 Documentation/perf_counter/util/symbol.h

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 51b13f989833e..bd29a5c001004 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -298,6 +298,7 @@ LIB_H += util/help.h
 LIB_H += util/strbuf.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
+LIB_H += util/symbol.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
@@ -317,6 +318,7 @@ LIB_OBJS += util/strbuf.o
 LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
+LIB_OBJS += util/symbol.o
 LIB_OBJS += util/pager.o
 
 BUILTIN_OBJS += builtin-help.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9fdf8224ee61f..04fc7ec0c3597 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,13 +1,10 @@
 #include "util/util.h"
 #include "builtin.h"
 
-#include <libelf.h>
-#include <gelf.h>
-#include <elf.h>
-
 #include "util/list.h"
 #include "util/cache.h"
 #include "util/rbtree.h"
+#include "util/symbol.h"
 
 #include "perf.h"
 
@@ -62,305 +59,6 @@ typedef union event_union {
 	struct comm_event comm;
 } event_t;
 
-struct symbol {
-	struct rb_node		rb_node;
-	__u64			start;
-	__u64			end;
-	char			name[0];
-};
-
-static struct symbol *symbol__new(uint64_t start, uint64_t len, const char *name)
-{
-	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		self->start = start;
-		self->end   = start + len;
-		strcpy(self->name, name);
-	}
-
-	return self;
-}
-
-static void symbol__delete(struct symbol *self)
-{
-	free(self);
-}
-
-static size_t symbol__fprintf(struct symbol *self, FILE *fp)
-{
-	return fprintf(fp, " %llx-%llx %s\n",
-		       self->start, self->end, self->name);
-}
-
-struct dso {
-	struct list_head node;
-	struct rb_root	 syms;
-	char		 name[0];
-};
-
-static struct dso *dso__new(const char *name)
-{
-	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
-
-	if (self != NULL) {
-		strcpy(self->name, name);
-		self->syms = RB_ROOT;
-	}
-
-	return self;
-}
-
-static void dso__delete_symbols(struct dso *self)
-{
-	struct symbol *pos;
-	struct rb_node *next = rb_first(&self->syms);
-
-	while (next) {
-		pos = rb_entry(next, struct symbol, rb_node);
-		next = rb_next(&pos->rb_node);
-		symbol__delete(pos);
-	}
-}
-
-static void dso__delete(struct dso *self)
-{
-	dso__delete_symbols(self);
-	free(self);
-}
-
-static void dso__insert_symbol(struct dso *self, struct symbol *sym)
-{
-	struct rb_node **p = &self->syms.rb_node;
-	struct rb_node *parent = NULL;
-	const uint64_t ip = sym->start;
-	struct symbol *s;
-
-	while (*p != NULL) {
-		parent = *p;
-		s = rb_entry(parent, struct symbol, rb_node);
-		if (ip < s->start)
-			p = &(*p)->rb_left;
-		else
-			p = &(*p)->rb_right;
-	}
-	rb_link_node(&sym->rb_node, parent, p);
-	rb_insert_color(&sym->rb_node, &self->syms);
-}
-
-static struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
-{
-	struct rb_node *n;
-
-	if (self == NULL)
-		return NULL;
-
-	n = self->syms.rb_node;
-
-	while (n) {
-		struct symbol *s = rb_entry(n, struct symbol, rb_node);
-
-		if (ip < s->start)
-			n = n->rb_left;
-		else if (ip > s->end)
-			n = n->rb_right;
-		else
-			return s;
-	}
-
-	return NULL;
-}
-
-/**
- * elf_symtab__for_each_symbol - iterate thru all the symbols
- *
- * @self: struct elf_symtab instance to iterate
- * @index: uint32_t index
- * @sym: GElf_Sym iterator
- */
-#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
-	for (index = 0, gelf_getsym(syms, index, &sym);\
-	     index < nr_syms; \
-	     index++, gelf_getsym(syms, index, &sym))
-
-static inline uint8_t elf_sym__type(const GElf_Sym *sym)
-{
-	return GELF_ST_TYPE(sym->st_info);
-}
-
-static inline int elf_sym__is_function(const GElf_Sym *sym)
-{
-	return elf_sym__type(sym) == STT_FUNC &&
-	       sym->st_name != 0 &&
-	       sym->st_shndx != SHN_UNDEF &&
-	       sym->st_size != 0;
-}
-
-static inline const char *elf_sym__name(const GElf_Sym *sym,
-					const Elf_Data *symstrs)
-{
-	return symstrs->d_buf + sym->st_name;
-}
-
-static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
-				    GElf_Shdr *shp, const char *name,
-				    size_t *index)
-{
-	Elf_Scn *sec = NULL;
-	size_t cnt = 1;
-
-	while ((sec = elf_nextscn(elf, sec)) != NULL) {
-		char *str;
-
-		gelf_getshdr(sec, shp);
-		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
-		if (!strcmp(name, str)) {
-			if (index)
-				*index = cnt;
-			break;
-		}
-		++cnt;
-	}
-
-	return sec;
-}
-
-static int dso__load_sym(struct dso *self, int fd, char *name)
-{
-	Elf_Data *symstrs;
-	uint32_t nr_syms;
-	int err = -1;
-	uint32_t index;
-	GElf_Ehdr ehdr;
-	GElf_Shdr shdr;
-	Elf_Data *syms;
-	GElf_Sym sym;
-	Elf_Scn *sec;
-	Elf *elf;
-	int nr = 0;
-
-	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
-	if (elf == NULL) {
-		fprintf(stderr, "%s: cannot read %s ELF file.\n",
-			__func__, name);
-		goto out_close;
-	}
-
-	if (gelf_getehdr(elf, &ehdr) == NULL) {
-		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
-		goto out_elf_end;
-	}
-
-	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
-	if (sec == NULL)
-		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
-
-	if (sec == NULL)
-		goto out_elf_end;
-
-	syms = elf_getdata(sec, NULL);
-	if (syms == NULL)
-		goto out_elf_end;
-
-	sec = elf_getscn(elf, shdr.sh_link);
-	if (sec == NULL)
-		goto out_elf_end;
-
-	symstrs = elf_getdata(sec, NULL);
-	if (symstrs == NULL)
-		goto out_elf_end;
-
-	nr_syms = shdr.sh_size / shdr.sh_entsize;
-
-	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
-		struct symbol *f;
-
-		if (!elf_sym__is_function(&sym))
-			continue;
-
-		sec = elf_getscn(elf, sym.st_shndx);
-		if (!sec)
-			goto out_elf_end;
-
-		gelf_getshdr(sec, &shdr);
-		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
-
-		f = symbol__new(sym.st_value, sym.st_size,
-				elf_sym__name(&sym, symstrs));
-		if (!f)
-			goto out_elf_end;
-
-		dso__insert_symbol(self, f);
-
-		nr++;
-	}
-
-	err = nr;
-out_elf_end:
-	elf_end(elf);
-out_close:
-	return err;
-}
-
-static int dso__load(struct dso *self)
-{
-	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
-	char *name = malloc(size);
-	int variant = 0;
-	int ret = -1;
-	int fd;
-
-	if (!name)
-		return -1;
-
-more:
-	do {
-		switch (variant) {
-		case 0: /* Fedora */
-			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
-			break;
-		case 1: /* Ubuntu */
-			snprintf(name, size, "/usr/lib/debug%s", self->name);
-			break;
-		case 2: /* Sane people */
-			snprintf(name, size, "%s", self->name);
-			break;
-
-		default:
-			goto out;
-		}
-		variant++;
-
-		fd = open(name, O_RDONLY);
-	} while (fd < 0);
-
-	ret = dso__load_sym(self, fd, name);
-	close(fd);
-
-	/*
-	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
-	 */
-	if (!ret)
-		goto more;
-
-out:
-	free(name);
-	return ret;
-}
-
-static size_t dso__fprintf(struct dso *self, FILE *fp)
-{
-	size_t ret = fprintf(fp, "dso: %s\n", self->name);
-
-	struct rb_node *nd;
-	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
-		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
-		ret += symbol__fprintf(pos, fp);
-	}
-
-	return ret;
-}
-
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 
@@ -418,153 +116,27 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-static int hex2long(char *ptr, unsigned long *long_val)
-{
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
-
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
-
-	return p - ptr;
-}
-
-static int load_kallsyms(void)
-{
-	struct rb_node *nd, *prevnd;
-	char *line = NULL;
-	FILE *file;
-	size_t n;
-
-	kernel_dso = dso__new("[kernel]");
-	if (kernel_dso == NULL)
-		return -1;
-
-	file = fopen("/proc/kallsyms", "r");
-	if (file == NULL)
-		goto out_delete_dso;
-
-	while (!feof(file)) {
-		unsigned long start;
-		struct symbol *sym;
-		int line_len, len;
-		char symbol_type;
-
-		line_len = getline(&line, &n, file);
-		if (line_len < 0)
-			break;
-
-		if (!line)
-			goto out_delete_dso;
-
-		line[--line_len] = '\0'; /* \n */
-
-		len = hex2long(line, &start);
-
-		len++;
-		if (len + 2 >= line_len)
-			continue;
-
-		symbol_type = toupper(line[len]);
-		/*
-		 * We're interested only in code ('T'ext)
-		 */
-		if (symbol_type != 'T' && symbol_type != 'W')
-			continue;
-		/*
-		 * Well fix up the end later, when we have all sorted.
-		 */
-		sym = symbol__new(start, 0xdead, line + len + 2);
-
-		if (sym == NULL)
-			goto out_delete_dso;
-
-		dso__insert_symbol(kernel_dso, sym);
-	}
-
-	/*
-	 * Now that we have all sorted out, just set the ->end of all
-	 * symbols
-	 */
-	prevnd = rb_first(&kernel_dso->syms);
-
-	if (prevnd == NULL)
-		goto out_delete_line;
-
-	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
-		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
-			      *curr = rb_entry(nd, struct symbol, rb_node);
-
-		prev->end = curr->start - 1;
-		prevnd = nd;
-	}
-
-	dsos__add(kernel_dso);
-	free(line);
-	fclose(file);
-
-	return 0;
-
-out_delete_line:
-	free(line);
-out_delete_dso:
-	dso__delete(kernel_dso);
-	return -1;
-}
-
 static int load_kernel(void)
 {
-	int fd, nr;
-
-	if (!vmlinux)
-		goto kallsyms;
-
-	fd = open(vmlinux, O_RDONLY);
-	if (fd < 0)
-		goto kallsyms;
+	int err = -1;
 
 	kernel_dso = dso__new("[kernel]");
 	if (!kernel_dso)
-		goto fail_open;
-
-	nr = dso__load_sym(kernel_dso, fd, vmlinux);
+		return -1;
 
-	if (nr <= 0)
-		goto fail_load;
+	if (vmlinux)
+		err = dso__load_vmlinux(kernel_dso, vmlinux);
 
-	dsos__add(kernel_dso);
-	close(fd);
+	if (err)
+		err = dso__load_kallsyms(kernel_dso);
 
-	return 0;
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
 
-fail_load:
-	dso__delete(kernel_dso);
-fail_open:
-	close(fd);
-kallsyms:
-	return load_kallsyms();
+	return err;
 }
 
 struct map {
@@ -1050,7 +622,7 @@ static int __cmd_report(void)
 	}
 
 	if (load_kernel() < 0) {
-		perror("failed to open kallsyms");
+		perror("failed to load kernel symbols");
 		return EXIT_FAILURE;
 	}
 
@@ -1247,7 +819,7 @@ static const struct option options[] = {
 
 int cmd_report(int argc, const char **argv, const char *prefix)
 {
-	elf_version(EV_CURRENT);
+	symbol__init();
 
 	page_size = getpagesize();
 
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
new file mode 100644
index 0000000000000..d06de2cfcdc64
--- /dev/null
+++ b/Documentation/perf_counter/util/symbol.c
@@ -0,0 +1,421 @@
+#include "util.h"
+#include "../perf.h"
+#include "symbol.h"
+
+#include <libelf.h>
+#include <gelf.h>
+#include <elf.h>
+
+static struct symbol *symbol__new(uint64_t start, uint64_t len,
+				  const char *name)
+{
+	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		self->start = start;
+		self->end   = start + len;
+		strcpy(self->name, name);
+	}
+
+	return self;
+}
+
+static void symbol__delete(struct symbol *self)
+{
+	free(self);
+}
+
+static size_t symbol__fprintf(struct symbol *self, FILE *fp)
+{
+	return fprintf(fp, " %llx-%llx %s\n",
+		       self->start, self->end, self->name);
+}
+
+struct dso *dso__new(const char *name)
+{
+	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
+
+	if (self != NULL) {
+		strcpy(self->name, name);
+		self->syms = RB_ROOT;
+	}
+
+	return self;
+}
+
+static void dso__delete_symbols(struct dso *self)
+{
+	struct symbol *pos;
+	struct rb_node *next = rb_first(&self->syms);
+
+	while (next) {
+		pos = rb_entry(next, struct symbol, rb_node);
+		next = rb_next(&pos->rb_node);
+		symbol__delete(pos);
+	}
+}
+
+void dso__delete(struct dso *self)
+{
+	dso__delete_symbols(self);
+	free(self);
+}
+
+static void dso__insert_symbol(struct dso *self, struct symbol *sym)
+{
+	struct rb_node **p = &self->syms.rb_node;
+	struct rb_node *parent = NULL;
+	const uint64_t ip = sym->start;
+	struct symbol *s;
+
+	while (*p != NULL) {
+		parent = *p;
+		s = rb_entry(parent, struct symbol, rb_node);
+		if (ip < s->start)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&sym->rb_node, parent, p);
+	rb_insert_color(&sym->rb_node, &self->syms);
+}
+
+struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+{
+	struct rb_node *n;
+
+	if (self == NULL)
+		return NULL;
+
+	n = self->syms.rb_node;
+
+	while (n) {
+		struct symbol *s = rb_entry(n, struct symbol, rb_node);
+
+		if (ip < s->start)
+			n = n->rb_left;
+		else if (ip > s->end)
+			n = n->rb_right;
+		else
+			return s;
+	}
+
+	return NULL;
+}
+
+size_t dso__fprintf(struct dso *self, FILE *fp)
+{
+	size_t ret = fprintf(fp, "dso: %s\n", self->name);
+
+	struct rb_node *nd;
+	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) {
+		struct symbol *pos = rb_entry(nd, struct symbol, rb_node);
+		ret += symbol__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+static int hex2long(char *ptr, unsigned long *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
+
+int dso__load_kallsyms(struct dso *self)
+{
+	struct rb_node *nd, *prevnd;
+	char *line = NULL;
+	size_t n;
+	FILE *file = fopen("/proc/kallsyms", "r");
+
+	if (file == NULL)
+		goto out_failure;
+
+	while (!feof(file)) {
+		unsigned long start;
+		struct symbol *sym;
+		int line_len, len;
+		char symbol_type;
+
+		line_len = getline(&line, &n, file);
+		if (line_len < 0)
+			break;
+
+		if (!line)
+			goto out_failure;
+
+		line[--line_len] = '\0'; /* \n */
+
+		len = hex2long(line, &start);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		symbol_type = toupper(line[len]);
+		/*
+		 * We're interested only in code ('T'ext)
+		 */
+		if (symbol_type != 'T' && symbol_type != 'W')
+			continue;
+		/*
+		 * Well fix up the end later, when we have all sorted.
+		 */
+		sym = symbol__new(start, 0xdead, line + len + 2);
+
+		if (sym == NULL)
+			goto out_delete_line;
+
+		dso__insert_symbol(self, sym);
+	}
+
+	/*
+	 * Now that we have all sorted out, just set the ->end of all
+	 * symbols
+	 */
+	prevnd = rb_first(&self->syms);
+
+	if (prevnd == NULL)
+		goto out_delete_line;
+
+	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) {
+		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node),
+			      *curr = rb_entry(nd, struct symbol, rb_node);
+
+		prev->end = curr->start - 1;
+		prevnd = nd;
+	}
+
+	free(line);
+	fclose(file);
+
+	return 0;
+
+out_delete_line:
+	free(line);
+out_failure:
+	return -1;
+}
+
+/**
+ * elf_symtab__for_each_symbol - iterate thru all the symbols
+ *
+ * @self: struct elf_symtab instance to iterate
+ * @index: uint32_t index
+ * @sym: GElf_Sym iterator
+ */
+#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \
+	for (index = 0, gelf_getsym(syms, index, &sym);\
+	     index < nr_syms; \
+	     index++, gelf_getsym(syms, index, &sym))
+
+static inline uint8_t elf_sym__type(const GElf_Sym *sym)
+{
+	return GELF_ST_TYPE(sym->st_info);
+}
+
+static inline int elf_sym__is_function(const GElf_Sym *sym)
+{
+	return elf_sym__type(sym) == STT_FUNC &&
+	       sym->st_name != 0 &&
+	       sym->st_shndx != SHN_UNDEF &&
+	       sym->st_size != 0;
+}
+
+static inline const char *elf_sym__name(const GElf_Sym *sym,
+					const Elf_Data *symstrs)
+{
+	return symstrs->d_buf + sym->st_name;
+}
+
+static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
+				    GElf_Shdr *shp, const char *name,
+				    size_t *index)
+{
+	Elf_Scn *sec = NULL;
+	size_t cnt = 1;
+
+	while ((sec = elf_nextscn(elf, sec)) != NULL) {
+		char *str;
+
+		gelf_getshdr(sec, shp);
+		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name);
+		if (!strcmp(name, str)) {
+			if (index)
+				*index = cnt;
+			break;
+		}
+		++cnt;
+	}
+
+	return sec;
+}
+
+static int dso__load_sym(struct dso *self, int fd, const char *name)
+{
+	Elf_Data *symstrs;
+	uint32_t nr_syms;
+	int err = -1;
+	uint32_t index;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *syms;
+	GElf_Sym sym;
+	Elf_Scn *sec;
+	Elf *elf;
+	int nr = 0;
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL) {
+		fprintf(stderr, "%s: cannot read %s ELF file.\n",
+			__func__, name);
+		goto out_close;
+	}
+
+	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		goto out_elf_end;
+	}
+
+	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	if (sec == NULL)
+		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+
+	if (sec == NULL)
+		goto out_elf_end;
+
+	syms = elf_getdata(sec, NULL);
+	if (syms == NULL)
+		goto out_elf_end;
+
+	sec = elf_getscn(elf, shdr.sh_link);
+	if (sec == NULL)
+		goto out_elf_end;
+
+	symstrs = elf_getdata(sec, NULL);
+	if (symstrs == NULL)
+		goto out_elf_end;
+
+	nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		struct symbol *f;
+
+		if (!elf_sym__is_function(&sym))
+			continue;
+
+		sec = elf_getscn(elf, sym.st_shndx);
+		if (!sec)
+			goto out_elf_end;
+
+		gelf_getshdr(sec, &shdr);
+		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
+
+		f = symbol__new(sym.st_value, sym.st_size,
+				elf_sym__name(&sym, symstrs));
+		if (!f)
+			goto out_elf_end;
+
+		dso__insert_symbol(self, f);
+
+		nr++;
+	}
+
+	err = nr;
+out_elf_end:
+	elf_end(elf);
+out_close:
+	return err;
+}
+
+int dso__load(struct dso *self)
+{
+	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
+	char *name = malloc(size);
+	int variant = 0;
+	int ret = -1;
+	int fd;
+
+	if (!name)
+		return -1;
+
+more:
+	do {
+		switch (variant) {
+		case 0: /* Fedora */
+			snprintf(name, size, "/usr/lib/debug%s.debug", self->name);
+			break;
+		case 1: /* Ubuntu */
+			snprintf(name, size, "/usr/lib/debug%s", self->name);
+			break;
+		case 2: /* Sane people */
+			snprintf(name, size, "%s", self->name);
+			break;
+
+		default:
+			goto out;
+		}
+		variant++;
+
+		fd = open(name, O_RDONLY);
+	} while (fd < 0);
+
+	ret = dso__load_sym(self, fd, name);
+	close(fd);
+
+	/*
+	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!?
+	 */
+	if (!ret)
+		goto more;
+
+out:
+	free(name);
+	return ret;
+}
+
+int dso__load_vmlinux(struct dso *self, const char *vmlinux)
+{
+	int err, fd = open(vmlinux, O_RDONLY);
+
+	if (fd < 0)
+		return -1;
+
+	err = dso__load_sym(self, fd, vmlinux);
+	close(fd);
+
+	return err;
+}
+
+void symbol__init(void)
+{
+	elf_version(EV_CURRENT);
+}
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
new file mode 100644
index 0000000000000..6dffe76a28f07
--- /dev/null
+++ b/Documentation/perf_counter/util/symbol.h
@@ -0,0 +1,33 @@
+#ifndef _PERF_SYMBOL_
+#define _PERF_SYMBOL_ 1
+
+#include <linux/types.h>
+#include "list.h"
+#include "rbtree.h"
+
+struct symbol {
+	struct rb_node	rb_node;
+	__u64		start;
+	__u64		end;
+	char		name[0];
+};
+
+struct dso {
+	struct list_head node;
+	struct rb_root	 syms;
+	char		 name[0];
+};
+
+struct dso *dso__new(const char *name);
+void dso__delete(struct dso *self);
+
+struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
+
+int dso__load_kallsyms(struct dso *self);
+int dso__load_vmlinux(struct dso *self, const char *vmlinux);
+int dso__load(struct dso *self);
+
+size_t dso__fprintf(struct dso *self, FILE *fp);
+
+void symbol__init(void);
+#endif /* _PERF_SYMBOL_ */
-- 
GitLab


From 0085c954140d27937ada29d139c16341075d45e4 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:13 -0300
Subject: [PATCH 1418/2198] perf_counter tools: struct symbol priv area

When creating a dso instance allow asking that all symbols in this dso
have a private area just before the symbol.

perf top will use this for its counters, etc.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175513.GD4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c |  4 ++--
 Documentation/perf_counter/util/symbol.c    | 26 ++++++++++++++-------
 Documentation/perf_counter/util/symbol.h    |  8 ++++++-
 3 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 04fc7ec0c3597..889067eb2b690 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -83,7 +83,7 @@ static struct dso *dsos__findnew(const char *name)
 	int nr;
 
 	if (dso == NULL) {
-		dso = dso__new(name);
+		dso = dso__new(name, 0);
 		if (!dso)
 			goto out_delete_dso;
 
@@ -120,7 +120,7 @@ static int load_kernel(void)
 {
 	int err = -1;
 
-	kernel_dso = dso__new("[kernel]");
+	kernel_dso = dso__new("[kernel]", 0);
 	if (!kernel_dso)
 		return -1;
 
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index d06de2cfcdc64..7088206244ac7 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -7,22 +7,27 @@
 #include <elf.h>
 
 static struct symbol *symbol__new(uint64_t start, uint64_t len,
-				  const char *name)
+				  const char *name, unsigned int priv_size)
 {
-	struct symbol *self = malloc(sizeof(*self) + strlen(name) + 1);
+	size_t namelen = strlen(name) + 1;
+	struct symbol *self = malloc(priv_size + sizeof(*self) + namelen);
 
 	if (self != NULL) {
+		if (priv_size) {
+			memset(self, 0, priv_size);
+			self = ((void *)self) + priv_size;
+		}
 		self->start = start;
 		self->end   = start + len;
-		strcpy(self->name, name);
+		memcpy(self->name, name, namelen);
 	}
 
 	return self;
 }
 
-static void symbol__delete(struct symbol *self)
+static void symbol__delete(struct symbol *self, unsigned int priv_size)
 {
-	free(self);
+	free(((void *)self) - priv_size);
 }
 
 static size_t symbol__fprintf(struct symbol *self, FILE *fp)
@@ -31,13 +36,14 @@ static size_t symbol__fprintf(struct symbol *self, FILE *fp)
 		       self->start, self->end, self->name);
 }
 
-struct dso *dso__new(const char *name)
+struct dso *dso__new(const char *name, unsigned int sym_priv_size)
 {
 	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1);
 
 	if (self != NULL) {
 		strcpy(self->name, name);
 		self->syms = RB_ROOT;
+		self->sym_priv_size = sym_priv_size;
 	}
 
 	return self;
@@ -51,7 +57,7 @@ static void dso__delete_symbols(struct dso *self)
 	while (next) {
 		pos = rb_entry(next, struct symbol, rb_node);
 		next = rb_next(&pos->rb_node);
-		symbol__delete(pos);
+		symbol__delete(pos, self->sym_priv_size);
 	}
 }
 
@@ -189,7 +195,8 @@ int dso__load_kallsyms(struct dso *self)
 		/*
 		 * Well fix up the end later, when we have all sorted.
 		 */
-		sym = symbol__new(start, 0xdead, line + len + 2);
+		sym = symbol__new(start, 0xdead, line + len + 2,
+				  self->sym_priv_size);
 
 		if (sym == NULL)
 			goto out_delete_line;
@@ -340,7 +347,8 @@ static int dso__load_sym(struct dso *self, int fd, const char *name)
 		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
 
 		f = symbol__new(sym.st_value, sym.st_size,
-				elf_sym__name(&sym, symstrs));
+				elf_sym__name(&sym, symstrs),
+				self->sym_priv_size);
 		if (!f)
 			goto out_elf_end;
 
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index 6dffe76a28f07..9e120af9c71f9 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -15,12 +15,18 @@ struct symbol {
 struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
+	unsigned int	 sym_priv_size;
 	char		 name[0];
 };
 
-struct dso *dso__new(const char *name);
+struct dso *dso__new(const char *name, unsigned int sym_priv_size);
 void dso__delete(struct dso *self);
 
+static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
+{
+	return ((void *)sym) - self->sym_priv_size;
+}
+
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
 int dso__load_kallsyms(struct dso *self);
-- 
GitLab


From a827c875f2ebe69c6e6db5e7f112d4beb4d80f01 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:19 -0300
Subject: [PATCH 1419/2198] perf_counter tools: Consolidate dso methods to load
 kernel symbols

Now one has just to use dso__load_kernel() optionally passing a vmlinux
filename.

Will make things easier for perf top that will want to pass a callback
to filter some symbols.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c |  9 ++-------
 Documentation/perf_counter/util/symbol.c    | 17 +++++++++++++++--
 Documentation/perf_counter/util/symbol.h    |  3 +--
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 889067eb2b690..4e9f2bc104527 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -118,18 +118,13 @@ static void dsos__fprintf(FILE *fp)
 
 static int load_kernel(void)
 {
-	int err = -1;
+	int err;
 
 	kernel_dso = dso__new("[kernel]", 0);
 	if (!kernel_dso)
 		return -1;
 
-	if (vmlinux)
-		err = dso__load_vmlinux(kernel_dso, vmlinux);
-
-	if (err)
-		err = dso__load_kallsyms(kernel_dso);
-
+	err = dso__load_kernel(kernel_dso, vmlinux);
 	if (err) {
 		dso__delete(kernel_dso);
 		kernel_dso = NULL;
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 7088206244ac7..504ac3132019e 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -155,7 +155,7 @@ static int hex2long(char *ptr, unsigned long *long_val)
 	return p - ptr;
 }
 
-int dso__load_kallsyms(struct dso *self)
+static int dso__load_kallsyms(struct dso *self)
 {
 	struct rb_node *nd, *prevnd;
 	char *line = NULL;
@@ -410,7 +410,7 @@ int dso__load(struct dso *self)
 	return ret;
 }
 
-int dso__load_vmlinux(struct dso *self, const char *vmlinux)
+static int dso__load_vmlinux(struct dso *self, const char *vmlinux)
 {
 	int err, fd = open(vmlinux, O_RDONLY);
 
@@ -423,6 +423,19 @@ int dso__load_vmlinux(struct dso *self, const char *vmlinux)
 	return err;
 }
 
+int dso__load_kernel(struct dso *self, const char *vmlinux)
+{
+	int err = -1;
+
+	if (vmlinux)
+		err = dso__load_vmlinux(self, vmlinux);
+
+	if (err)
+		err = dso__load_kallsyms(self);
+
+	return err;
+}
+
 void symbol__init(void)
 {
 	elf_version(EV_CURRENT);
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index 9e120af9c71f9..db2fdf9f70a20 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -29,8 +29,7 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
-int dso__load_kallsyms(struct dso *self);
-int dso__load_vmlinux(struct dso *self, const char *vmlinux);
+int dso__load_kernel(struct dso *self, const char *vmlinux);
 int dso__load(struct dso *self);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
-- 
GitLab


From 69ee69f63c82e63d9c6c6081d12673af4933c51e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:26 -0300
Subject: [PATCH 1420/2198] perf_counter tools: Optionally pass a symbol filter
 to the dso load routines

Will be used by perf top.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175526.GF4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c |  4 +--
 Documentation/perf_counter/util/symbol.c    | 34 +++++++++++++--------
 Documentation/perf_counter/util/symbol.h    |  7 +++--
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 4e9f2bc104527..412d524dd6581 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -87,7 +87,7 @@ static struct dso *dsos__findnew(const char *name)
 		if (!dso)
 			goto out_delete_dso;
 
-		nr = dso__load(dso);
+		nr = dso__load(dso, NULL);
 		if (nr < 0) {
 			fprintf(stderr, "Failed to open: %s\n", name);
 			goto out_delete_dso;
@@ -124,7 +124,7 @@ static int load_kernel(void)
 	if (!kernel_dso)
 		return -1;
 
-	err = dso__load_kernel(kernel_dso, vmlinux);
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL);
 	if (err) {
 		dso__delete(kernel_dso);
 		kernel_dso = NULL;
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 504ac3132019e..47281210443de 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -155,7 +155,7 @@ static int hex2long(char *ptr, unsigned long *long_val)
 	return p - ptr;
 }
 
-static int dso__load_kallsyms(struct dso *self)
+static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 {
 	struct rb_node *nd, *prevnd;
 	char *line = NULL;
@@ -201,7 +201,10 @@ static int dso__load_kallsyms(struct dso *self)
 		if (sym == NULL)
 			goto out_delete_line;
 
-		dso__insert_symbol(self, sym);
+		if (filter && filter(self, sym))
+			symbol__delete(sym, self->sym_priv_size);
+		else
+			dso__insert_symbol(self, sym);
 	}
 
 	/*
@@ -286,7 +289,8 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return sec;
 }
 
-static int dso__load_sym(struct dso *self, int fd, const char *name)
+static int dso__load_sym(struct dso *self, int fd, const char *name,
+			 symbol_filter_t filter)
 {
 	Elf_Data *symstrs;
 	uint32_t nr_syms;
@@ -352,9 +356,12 @@ static int dso__load_sym(struct dso *self, int fd, const char *name)
 		if (!f)
 			goto out_elf_end;
 
-		dso__insert_symbol(self, f);
-
-		nr++;
+		if (filter && filter(self, f))
+			symbol__delete(f, self->sym_priv_size);
+		else {
+			dso__insert_symbol(self, f);
+			nr++;
+		}
 	}
 
 	err = nr;
@@ -364,7 +371,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name)
 	return err;
 }
 
-int dso__load(struct dso *self)
+int dso__load(struct dso *self, symbol_filter_t filter)
 {
 	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
 	char *name = malloc(size);
@@ -396,7 +403,7 @@ int dso__load(struct dso *self)
 		fd = open(name, O_RDONLY);
 	} while (fd < 0);
 
-	ret = dso__load_sym(self, fd, name);
+	ret = dso__load_sym(self, fd, name, filter);
 	close(fd);
 
 	/*
@@ -410,28 +417,29 @@ int dso__load(struct dso *self)
 	return ret;
 }
 
-static int dso__load_vmlinux(struct dso *self, const char *vmlinux)
+static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
+			     symbol_filter_t filter)
 {
 	int err, fd = open(vmlinux, O_RDONLY);
 
 	if (fd < 0)
 		return -1;
 
-	err = dso__load_sym(self, fd, vmlinux);
+	err = dso__load_sym(self, fd, vmlinux, filter);
 	close(fd);
 
 	return err;
 }
 
-int dso__load_kernel(struct dso *self, const char *vmlinux)
+int dso__load_kernel(struct dso *self, const char *vmlinux, symbol_filter_t filter)
 {
 	int err = -1;
 
 	if (vmlinux)
-		err = dso__load_vmlinux(self, vmlinux);
+		err = dso__load_vmlinux(self, vmlinux, filter);
 
 	if (err)
-		err = dso__load_kallsyms(self);
+		err = dso__load_kallsyms(self, filter);
 
 	return err;
 }
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index db2fdf9f70a20..b0299bc0cf50e 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -19,6 +19,8 @@ struct dso {
 	char		 name[0];
 };
 
+typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
+
 struct dso *dso__new(const char *name, unsigned int sym_priv_size);
 void dso__delete(struct dso *self);
 
@@ -29,8 +31,9 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
-int dso__load_kernel(struct dso *self, const char *vmlinux);
-int dso__load(struct dso *self);
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+		     symbol_filter_t filter);
+int dso__load(struct dso *self, symbol_filter_t filter);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
 
-- 
GitLab


From de04687f868bf98e4ef644af91ed85a3bc212ce8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 28 May 2009 14:55:41 -0300
Subject: [PATCH 1421/2198] perf_counter tools: Convert builtin-top to use
 libperf symbol routines

Now both perf top and report use the same routines.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090528175541.GG4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 303 +++++++++--------------
 1 file changed, 117 insertions(+), 186 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a890872638c10..52ba9f4216cf5 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -44,8 +44,9 @@
 
 #include "perf.h"
 #include "builtin.h"
+#include "util/symbol.h"
 #include "util/util.h"
-#include "util/util.h"
+#include "util/rbtree.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
 
@@ -125,19 +126,21 @@ static uint64_t			min_ip;
 static uint64_t			max_ip = -1ll;
 
 struct sym_entry {
-	unsigned long long	addr;
-	char			*sym;
+	struct rb_node		rb_node;
+	struct list_head	node;
 	unsigned long		count[MAX_COUNTERS];
 	int			skip;
 };
 
-#define MAX_SYMS		100000
-
-static int sym_table_count;
-
 struct sym_entry		*sym_filter_entry;
 
-static struct sym_entry		sym_table[MAX_SYMS];
+struct dso *kernel_dso;
+
+/*
+ * Symbols will be added here in record_ip and will get out
+ * after decayed.
+ */
+static LIST_HEAD(active_symbols);
 
 /*
  * Ordering weight: count-1 * count-2 * ... / count-n
@@ -157,42 +160,60 @@ static double sym_weight(const struct sym_entry *sym)
 	return weight;
 }
 
-static int compare(const void *__sym1, const void *__sym2)
-{
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
-
-	return sym_weight(sym1) < sym_weight(sym2);
-}
-
 static long			events;
 static long			userspace_events;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
-static struct sym_entry		tmp[MAX_SYMS];
+static void list_insert_active_sym(struct sym_entry *syme)
+{
+	list_add(&syme->node, &active_symbols);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+	struct rb_node **p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct sym_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_entry, rb_node);
+
+		if (sym_weight(se) > sym_weight(iter))
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&se->rb_node, parent, p);
+	rb_insert_color(&se->rb_node, tree);
+}
 
 static void print_sym_table(void)
 {
-	int i, j, active_count, printed;
+	int printed, j;
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
 	float sum_kevents = 0.0;
+	struct sym_entry *syme, *n;
+	struct rb_root tmp = RB_ROOT;
+	struct rb_node *nd;
 
 	events = userspace_events = 0;
 
-	/* Iterate over symbol table and copy/tally/decay active symbols. */
-	for (i = 0, active_count = 0; i < sym_table_count; i++) {
-		if (sym_table[i].count[0]) {
-			tmp[active_count++] = sym_table[i];
-			sum_kevents += sym_table[i].count[0];
+	/* Sort the active symbols */
+	list_for_each_entry_safe(syme, n, &active_symbols, node) {
+		if (syme->count[0] != 0) {
+			rb_insert_active_sym(&tmp, syme);
+			sum_kevents += syme->count[0];
 
 			for (j = 0; j < nr_counters; j++)
-				sym_table[i].count[j] = zero ? 0 : sym_table[i].count[j] * 7 / 8;
-		}
+				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
+		} else
+			list_del_init(&syme->node);
 	}
 
-	qsort(tmp, active_count + 1, sizeof(tmp[0]), compare);
-
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
 
 	printf(
@@ -238,23 +259,25 @@ static void print_sym_table(void)
 	       	       "  ______     ______   _____   ________________   _______________\n\n"
 	);
 
-	for (i = 0, printed = 0; i < active_count; i++) {
+	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
+		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
+		struct symbol *sym = (struct symbol *)(syme + 1);
 		float pcnt;
 
-		if (++printed > 18 || tmp[i].count[0] < count_filter)
+		if (++printed > 18 || syme->count[0] < count_filter)
 			break;
 
-		pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents));
+		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->count[0]) /
+					 sum_kevents));
 
 		if (nr_counters == 1)
 			printf("%19.2f - %4.1f%% - %016llx : %s\n",
-				sym_weight(tmp + i),
-				pcnt, tmp[i].addr, tmp[i].sym);
+				sym_weight(syme),
+				pcnt, sym->start, sym->name);
 		else
 			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-				sym_weight(tmp + i),
-				tmp[i].count[0],
-				pcnt, tmp[i].addr, tmp[i].sym);
+				sym_weight(syme), syme->count[0],
+				pcnt, sym->start, sym->name);
 	}
 
 	{
@@ -277,146 +300,85 @@ static void *display_thread(void *arg)
 	return NULL;
 }
 
-static int read_symbol(FILE *in, struct sym_entry *s)
+static int symbol_filter(struct dso *self, struct symbol *sym)
 {
-	static int filter_match = 0;
-	char *sym, stype;
-	char str[500];
-	int rc, pos;
-
-	rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str);
-	if (rc == EOF)
-		return -1;
-
-	assert(rc == 3);
-
-	/* skip until end of line: */
-	pos = strlen(str);
-	do {
-		rc = fgetc(in);
-		if (rc == '\n' || rc == EOF || pos >= 499)
-			break;
-		str[pos] = rc;
-		pos++;
-	} while (1);
-	str[pos] = 0;
-
-	sym = str;
-
-	/* Filter out known duplicates and non-text symbols. */
-	if (!strcmp(sym, "_text"))
-		return 1;
-	if (!min_ip && !strcmp(sym, "_stext"))
-		return 1;
-	if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext"))
-		return 1;
-	if (stype != 'T' && stype != 't')
-		return 1;
-	if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14))
-		return 1;
-	if (strstr(sym, "_text_start") || strstr(sym, "_text_end"))
+	static int filter_match;
+	struct sym_entry *syme;
+	const char *name = sym->name;
+
+	if (!strcmp(name, "_text") ||
+	    !strcmp(name, "_etext") ||
+	    !strcmp(name, "_sinittext") ||
+	    !strncmp("init_module", name, 11) ||
+	    !strncmp("cleanup_module", name, 14) ||
+	    strstr(name, "_text_start") ||
+	    strstr(name, "_text_end"))
 		return 1;
 
-	s->sym = malloc(strlen(str)+1);
-	assert(s->sym);
-
-	strcpy((char *)s->sym, str);
-	s->skip = 0;
-
+	syme = dso__sym_priv(self, sym);
 	/* Tag events to be skipped. */
-	if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym))
-		s->skip = 1;
-	else if (!strcmp("mwait_idle", s->sym))
-		s->skip = 1;
+	if (!strcmp("default_idle", name) ||
+	    !strcmp("cpu_idle", name) ||
+	    !strcmp("enter_idle", name) ||
+	    !strcmp("exit_idle", name) ||
+	    !strcmp("mwait_idle", name))
+		syme->skip = 1;
 
 	if (filter_match == 1) {
-		filter_end = s->addr;
+		filter_end = sym->start;
 		filter_match = -1;
 		if (filter_end - filter_start > 10000) {
-			printf("hm, too large filter symbol <%s> - skipping.\n",
+			fprintf(stderr,
+				"hm, too large filter symbol <%s> - skipping.\n",
 				sym_filter);
-			printf("symbol filter start: %016lx\n", filter_start);
-			printf("                end: %016lx\n", filter_end);
+			fprintf(stderr, "symbol filter start: %016lx\n",
+				filter_start);
+			fprintf(stderr, "                end: %016lx\n",
+				filter_end);
 			filter_end = filter_start = 0;
 			sym_filter = NULL;
 			sleep(1);
 		}
 	}
-	if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) {
+
+	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) {
 		filter_match = 1;
-		filter_start = s->addr;
+		filter_start = sym->start;
 	}
 
+
 	return 0;
 }
 
-static int compare_addr(const void *__sym1, const void *__sym2)
+static int parse_symbols(void)
 {
-	const struct sym_entry *sym1 = __sym1, *sym2 = __sym2;
+	struct rb_node *node;
+	struct symbol  *sym;
 
-	return sym1->addr > sym2->addr;
-}
-
-static void sort_symbol_table(void)
-{
-	int i, dups;
-
-	do {
-		qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr);
-		for (i = 0, dups = 0; i < sym_table_count; i++) {
-			if (sym_table[i].addr == sym_table[i+1].addr) {
-				sym_table[i+1].addr = -1ll;
-				dups++;
-			}
-		}
-		sym_table_count -= dups;
-	} while(dups);
-}
-
-static void parse_symbols(void)
-{
-	struct sym_entry *last;
+	kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry));
+	if (kernel_dso == NULL)
+		return -1;
 
-	FILE *kallsyms = fopen("/proc/kallsyms", "r");
+	if (dso__load_kernel(kernel_dso, NULL, symbol_filter) != 0)
+		goto out_delete_dso;
 
-	if (!kallsyms) {
-		printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n");
-		exit(-1);
-	}
+	node = rb_first(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	min_ip = sym->start;
 
-	while (!feof(kallsyms)) {
-		if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) {
-			sym_table_count++;
-			assert(sym_table_count <= MAX_SYMS);
-		}
-	}
+	node = rb_last(&kernel_dso->syms);
+	sym = rb_entry(node, struct symbol, rb_node);
+	max_ip = sym->start;
 
-	sort_symbol_table();
-	min_ip = sym_table[0].addr;
-	max_ip = sym_table[sym_table_count-1].addr;
-	last = sym_table + sym_table_count++;
+	if (dump_symtab)
+		dso__fprintf(kernel_dso, stdout);
 
-	last->addr = -1ll;
-	last->sym = "<end>";
-
-	if (filter_end) {
-		int count;
-		for (count=0; count < sym_table_count; count ++) {
-			if (!strcmp(sym_table[count].sym, sym_filter)) {
-				sym_filter_entry = &sym_table[count];
-				break;
-			}
-		}
-	}
-	if (dump_symtab) {
-		int i;
+	return 0;
 
-		for (i = 0; i < sym_table_count; i++)
-			fprintf(stderr, "%llx %s\n",
-				sym_table[i].addr, sym_table[i].sym);
-	}
+out_delete_dso:
+	dso__delete(kernel_dso);
+	kernel_dso = NULL;
+	return -1;
 }
 
 #define TRACE_COUNT     3
@@ -426,51 +388,20 @@ static void parse_symbols(void)
  */
 static void record_ip(uint64_t ip, int counter)
 {
-	int left_idx, middle_idx, right_idx, idx;
-	unsigned long left, middle, right;
-
-	left_idx = 0;
-	right_idx = sym_table_count-1;
-	assert(ip <= max_ip && ip >= min_ip);
-
-	while (left_idx + 1 < right_idx) {
-		middle_idx = (left_idx + right_idx) / 2;
+	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
 
-		left   = sym_table[  left_idx].addr;
-		middle = sym_table[middle_idx].addr;
-		right  = sym_table[ right_idx].addr;
+	if (sym != NULL) {
+		struct sym_entry *syme = dso__sym_priv(kernel_dso, sym);
 
-		if (!(left <= middle && middle <= right)) {
-			printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right);
-			printf("%d %d %d\n", left_idx, middle_idx, right_idx);
+		if (!syme->skip) {
+			syme->count[counter]++;
+			if (list_empty(&syme->node) || !syme->node.next)
+				list_insert_active_sym(syme);
+			return;
 		}
-		assert(left <= middle && middle <= right);
-		if (!(left <= ip && ip <= right)) {
-			printf(" left: %016lx\n", left);
-			printf("   ip: %016lx\n", (unsigned long)ip);
-			printf("right: %016lx\n", right);
-		}
-		assert(left <= ip && ip <= right);
-		/*
-		 * [ left .... target .... middle .... right ]
-		 *   => right := middle
-		 */
-		if (ip < middle) {
-			right_idx = middle_idx;
-			continue;
-		}
-		/*
-		 * [ left .... middle ... target ... right ]
-		 *   => left := middle
-		 */
-		left_idx = middle_idx;
 	}
 
-	idx = left_idx;
-
-	if (!sym_table[idx].skip)
-		sym_table[idx].count[counter]++;
-	else events--;
+	events--;
 }
 
 static void process_event(uint64_t ip, int counter)
-- 
GitLab


From 9e09675366695405412b709e91709c1ce2925c90 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 28 May 2009 16:25:34 +0200
Subject: [PATCH 1422/2198] perf_counter tools: Document '--' option parsing
 terminator

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Documentation/perf-record.txt | 1 +
 Documentation/perf_counter/Documentation/perf-stat.txt   | 1 +
 Documentation/perf_counter/builtin-record.c              | 3 ++-
 Documentation/perf_counter/builtin-stat.c                | 1 +
 4 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index 353db1bb98abf..a93d2ec6176a3 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -9,6 +9,7 @@ SYNOPSIS
 --------
 [verse]
 'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
 
 DESCRIPTION
 -----------
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index 7fcab271e570a..828c59ff5f5d1 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -9,6 +9,7 @@ SYNOPSIS
 --------
 [verse]
 'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
 
 DESCRIPTION
 -----------
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 4a068664a32aa..23d1224ce98c2 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -397,7 +397,8 @@ static int __cmd_record(int argc, const char **argv)
 }
 
 static const char * const record_usage[] = {
-	"perf record [<options>] <command>",
+	"perf record [<options>] [<command>]",
+	"perf record [<options>] -- <command> [<options>]",
 	NULL
 };
 
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index ce661e2fa8ddd..ac14086d9a706 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -212,6 +212,7 @@ static void skip_signal(int signo)
 
 static const char * const stat_usage[] = {
 	"perf stat [<options>] <command>",
+	"perf stat [<options>] -- <command> [<options>]",
 	NULL
 };
 
-- 
GitLab


From a3ec8d70f1a55acccc4874fe9b4dadbbb9454a0f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 29 May 2009 06:46:46 +0200
Subject: [PATCH 1423/2198] perf_counter tools: Fix top symbol table dump typo

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 52ba9f4216cf5..0d100f52b70d9 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -371,7 +371,7 @@ static int parse_symbols(void)
 	max_ip = sym->start;
 
 	if (dump_symtab)
-		dso__fprintf(kernel_dso, stdout);
+		dso__fprintf(kernel_dso, stderr);
 
 	return 0;
 
-- 
GitLab


From da417a7537cbf4beb28a08a49adf915f2358040c Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 29 May 2009 08:23:16 +0200
Subject: [PATCH 1424/2198] perf_counter tools: Fix top symbol table max_ip
 typo

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 0d100f52b70d9..ebe8bec1a0e8d 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -368,7 +368,7 @@ static int parse_symbols(void)
 
 	node = rb_last(&kernel_dso->syms);
 	sym = rb_entry(node, struct symbol, rb_node);
-	max_ip = sym->start;
+	max_ip = sym->end;
 
 	if (dump_symtab)
 		dso__fprintf(kernel_dso, stderr);
-- 
GitLab


From c323d95fa4dbe0b6bf6d59e24a0b7db067dd08a7 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Fri, 29 May 2009 13:28:35 +0800
Subject: [PATCH 1425/2198] perf_counter/x86: Always use NMI for
 performance-monitoring interrupt

Always use NMI for performance-monitoring interrupt as there could be
racy situations if we switch between irq and nmi mode frequently.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
LKML-Reference: <20090529052835.GA13657@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/perf_counter.h |  4 ++--
 arch/x86/kernel/apic/apic.c         |  2 +-
 arch/x86/kernel/cpu/perf_counter.c  | 19 +++++--------------
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index d08dd52cb8ff1..876ed97147b30 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -91,10 +91,10 @@ extern void set_perf_counter_pending(void);
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
-extern void perf_counters_lapic_init(int nmi);
+extern void perf_counters_lapic_init(void);
 #else
 static inline void init_hw_perf_counters(void)		{ }
-static inline void perf_counters_lapic_init(int nmi)	{ }
+static inline void perf_counters_lapic_init(void)	{ }
 #endif
 
 #endif /* _ASM_X86_PERF_COUNTER_H */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 89b63b5fad337..60df2efd7c80c 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1135,7 +1135,7 @@ void __cpuinit setup_local_APIC(void)
 		apic_write(APIC_ESR, 0);
 	}
 #endif
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init();
 
 	preempt_disable();
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 2eeaa99add1c7..316b0c995f387 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -604,7 +604,7 @@ static int x86_pmu_enable(struct perf_counter *counter)
 		hwc->counter_base = x86_pmu.perfctr;
 	}
 
-	perf_counters_lapic_init(hwc->nmi);
+	perf_counters_lapic_init();
 
 	x86_pmu.disable(hwc, idx);
 
@@ -863,24 +863,15 @@ void set_perf_counter_pending(void)
 	apic->send_IPI_self(LOCAL_PENDING_VECTOR);
 }
 
-void perf_counters_lapic_init(int nmi)
+void perf_counters_lapic_init(void)
 {
-	u32 apic_val;
-
 	if (!x86_pmu_initialized())
 		return;
 
 	/*
-	 * Enable the performance counter vector in the APIC LVT:
+	 * Always use NMI for PMU
 	 */
-	apic_val = apic_read(APIC_LVTERR);
-
-	apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED);
-	if (nmi)
-		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	else
-		apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-	apic_write(APIC_LVTERR, apic_val);
+	apic_write(APIC_LVTPC, APIC_DM_NMI);
 }
 
 static int __kprobes
@@ -1054,7 +1045,7 @@ void __init init_hw_perf_counters(void)
 
 	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
 
-	perf_counters_lapic_init(0);
+	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
 }
 
-- 
GitLab


From c04f5e5d7b523f90ee3cdd70a68c4002aaecd3fa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 09:10:54 +0200
Subject: [PATCH 1426/2198] perf_counter tools: Clean up builtin-stat.c's
 do_perfstat()

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 116 +++++++++++++---------
 1 file changed, 67 insertions(+), 49 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index ac14086d9a706..6a2936150f288 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -109,11 +109,75 @@ static void create_perfstat_counter(int counter)
 	}
 }
 
+/*
+ * Does the counter have nsecs as a unit?
+ */
+static inline int nsec_counter(int counter)
+{
+	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK))
+		return 1;
+	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 count[3], single_count[3];
+	ssize_t res;
+	int cpu, nv;
+	int scaled;
+
+	count[0] = count[1] = count[2] = 0;
+	nv = scale ? 3 : 1;
+	for (cpu = 0; cpu < nr_cpus; cpu ++) {
+		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
+		assert(res == nv * sizeof(__u64));
+
+		count[0] += single_count[0];
+		if (scale) {
+			count[1] += single_count[1];
+			count[2] += single_count[2];
+		}
+	}
+
+	scaled = 0;
+	if (scale) {
+		if (count[2] == 0) {
+			fprintf(stderr, " %14s  %-20s\n",
+				"<not counted>", event_name(counter));
+			return;
+		}
+		if (count[2] < count[1]) {
+			scaled = 1;
+			count[0] = (unsigned long long)
+				((double)count[0] * count[1] / count[2] + 0.5);
+		}
+	}
+
+	if (nsec_counter(counter)) {
+		double msecs = (double)count[0] / 1000000;
+
+		fprintf(stderr, " %14.6f  %-20s (msecs)",
+			msecs, event_name(counter));
+	} else {
+		fprintf(stderr, " %14Ld  %-20s (events)",
+			count[0], event_name(counter));
+	}
+	if (scaled)
+		fprintf(stderr, "  (scaled from %.2f%%)",
+			(double) count[2] / count[1] * 100);
+	fprintf(stderr, "\n");
+}
+
 static int do_perfstat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
-	ssize_t res;
 	int status;
 	int pid;
 
@@ -149,55 +213,10 @@ static int do_perfstat(int argc, const char **argv)
 		argv[0]);
 	fprintf(stderr, "\n");
 
-	for (counter = 0; counter < nr_counters; counter++) {
-		int cpu, nv;
-		__u64 count[3], single_count[3];
-		int scaled;
-
-		count[0] = count[1] = count[2] = 0;
-		nv = scale ? 3 : 1;
-		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			res = read(fd[cpu][counter],
-				   single_count, nv * sizeof(__u64));
-			assert(res == nv * sizeof(__u64));
-
-			count[0] += single_count[0];
-			if (scale) {
-				count[1] += single_count[1];
-				count[2] += single_count[2];
-			}
-		}
-
-		scaled = 0;
-		if (scale) {
-			if (count[2] == 0) {
-				fprintf(stderr, " %14s  %-20s\n",
-					"<not counted>", event_name(counter));
-				continue;
-			}
-			if (count[2] < count[1]) {
-				scaled = 1;
-				count[0] = (unsigned long long)
-					((double)count[0] * count[1] / count[2] + 0.5);
-			}
-		}
-
-		if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) ||
-		    event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+	for (counter = 0; counter < nr_counters; counter++)
+		print_counter(counter);
 
-			double msecs = (double)count[0] / 1000000;
 
-			fprintf(stderr, " %14.6f  %-20s (msecs)",
-				msecs, event_name(counter));
-		} else {
-			fprintf(stderr, " %14Ld  %-20s (events)",
-				count[0], event_name(counter));
-		}
-		if (scaled)
-			fprintf(stderr, "  (scaled from %.2f%%)",
-				(double) count[2] / count[1] * 100);
-		fprintf(stderr, "\n");
-	}
 	fprintf(stderr, "\n");
 	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n",
 			(double)(t1-t0)/1e6);
@@ -212,7 +231,6 @@ static void skip_signal(int signo)
 
 static const char * const stat_usage[] = {
 	"perf stat [<options>] <command>",
-	"perf stat [<options>] -- <command> [<options>]",
 	NULL
 };
 
-- 
GitLab


From 2996f5ddb7ba8889caeeac65edafe48845106eaa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 09:10:54 +0200
Subject: [PATCH 1427/2198] perf_counter tools: Split display into reading and
 printing

We introduce the extra pass to allow the print-out to possibly
rely on already read counters.

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 40 +++++++++++++++++++----
 1 file changed, 34 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 6a2936150f288..0c92eb7255261 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -71,6 +71,9 @@ static const unsigned int default_count[] = {
 	  10000,
 };
 
+static __u64			event_res[MAX_COUNTERS][3];
+static __u64			event_scaled[MAX_COUNTERS];
+
 static void create_perfstat_counter(int counter)
 {
 	struct perf_counter_hw_event hw_event;
@@ -123,16 +126,19 @@ static inline int nsec_counter(int counter)
 }
 
 /*
- * Print out the results of a single counter:
+ * Read out the results of a single counter:
  */
-static void print_counter(int counter)
+static void read_counter(int counter)
 {
-	__u64 count[3], single_count[3];
+	__u64 *count, single_count[3];
 	ssize_t res;
 	int cpu, nv;
 	int scaled;
 
+	count = event_res[counter];
+
 	count[0] = count[1] = count[2] = 0;
+
 	nv = scale ? 3 : 1;
 	for (cpu = 0; cpu < nr_cpus; cpu ++) {
 		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
@@ -148,16 +154,35 @@ static void print_counter(int counter)
 	scaled = 0;
 	if (scale) {
 		if (count[2] == 0) {
-			fprintf(stderr, " %14s  %-20s\n",
-				"<not counted>", event_name(counter));
+			event_scaled[counter] = -1;
+			count[0] = 0;
 			return;
 		}
+
 		if (count[2] < count[1]) {
-			scaled = 1;
+			event_scaled[counter] = 1;
 			count[0] = (unsigned long long)
 				((double)count[0] * count[1] / count[2] + 0.5);
 		}
 	}
+}
+
+/*
+ * Print out the results of a single counter:
+ */
+static void print_counter(int counter)
+{
+	__u64 *count;
+	int scaled;
+
+	count = event_res[counter];
+	scaled = event_scaled[counter];
+
+	if (scaled == -1) {
+		fprintf(stderr, " %14s  %-20s\n",
+			"<not counted>", event_name(counter));
+		return;
+	}
 
 	if (nsec_counter(counter)) {
 		double msecs = (double)count[0] / 1000000;
@@ -213,6 +238,9 @@ static int do_perfstat(int argc, const char **argv)
 		argv[0]);
 	fprintf(stderr, "\n");
 
+	for (counter = 0; counter < nr_counters; counter++)
+		read_counter(counter);
+
 	for (counter = 0; counter < nr_counters; counter++)
 		print_counter(counter);
 
-- 
GitLab


From be1ac0d81d0e3ab655f8c8ade31fb860ef6aa186 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 09:10:54 +0200
Subject: [PATCH 1428/2198] perf_counter tools: Also display time-normalized
 stat results

Add new column that normalizes counter results by
'nanoseconds spent running' unit.

Before:

 Performance counter stats for '/home/mingo/hackbench':

   10469.403605  task clock ticks     (msecs)
          75502  context switches     (events)
           9501  CPU migrations       (events)
          36158  pagefaults           (events)
    31975676185  CPU cycles           (events)
    26257738659  instructions         (events)
      108740581  cache references     (events)
       54606088  cache misses         (events)

 Wall-clock time elapsed:   810.514504 msecs

After:

 Performance counter stats for '/home/mingo/hackbench':

   10469.403605  task clock ticks     (msecs)
          75502  context switches     #        0.007 M/sec
           9501  CPU migrations       #        0.001 M/sec
          36158  pagefaults           #        0.003 M/sec
    31975676185  CPU cycles           #     3054.202 M/sec
    26257738659  instructions         #     2508.045 M/sec
      108740581  cache references     #       10.387 M/sec
       54606088  cache misses         #        5.216 M/sec

 Wall-clock time elapsed:   810.514504 msecs

The advantage of that column is that it is characteristic of the
execution workflow, regardless of runtime. Hence 'hackbench 10'
will look similar to 'hackbench 15' - while the absolute counter
values are very different.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 0c92eb7255261..ef7e0e1192cbf 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -74,6 +74,8 @@ static const unsigned int default_count[] = {
 static __u64			event_res[MAX_COUNTERS][3];
 static __u64			event_scaled[MAX_COUNTERS];
 
+static __u64			runtime_nsecs;
+
 static void create_perfstat_counter(int counter)
 {
 	struct perf_counter_hw_event hw_event;
@@ -165,6 +167,11 @@ static void read_counter(int counter)
 				((double)count[0] * count[1] / count[2] + 0.5);
 		}
 	}
+	/*
+	 * Save the full runtime - to allow normalization during printout:
+	 */
+	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+		runtime_nsecs = count[0];
 }
 
 /*
@@ -190,8 +197,11 @@ static void print_counter(int counter)
 		fprintf(stderr, " %14.6f  %-20s (msecs)",
 			msecs, event_name(counter));
 	} else {
-		fprintf(stderr, " %14Ld  %-20s (events)",
+		fprintf(stderr, " %14Ld  %-20s",
 			count[0], event_name(counter));
+		if (runtime_nsecs)
+			fprintf(stderr, " # %12.3f M/sec",
+				(double)count[0]/runtime_nsecs*1000.0);
 	}
 	if (scaled)
 		fprintf(stderr, "  (scaled from %.2f%%)",
-- 
GitLab


From ad3a37de81c45f6c20d410ece86004b98f7b6d84 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 29 May 2009 16:06:20 +1000
Subject: [PATCH 1429/2198] perf_counter: Don't swap contexts containing locked
 mutex

Peter Zijlstra pointed out that under some circumstances, we can take
the mutex in a context or a counter and then swap that context or
counter to another task, potentially leading to lock order inversions
or the mutexes not protecting what they are supposed to protect.

This fixes the problem by making sure that we never take a mutex in a
context or counter which could get swapped to another task.  Most of
the cases where we take a mutex is on a top-level counter or context,
i.e. a counter which has an fd associated with it or a context that
contains such a counter.  This adds WARN_ON_ONCE statements to verify
that.

The two cases where we need to take the mutex on a context that is a
clone of another are in perf_counter_exit_task and
perf_counter_init_task.  The perf_counter_exit_task case is solved by
uncloning the context before starting to remove the counters from it.
The perf_counter_init_task is a little trickier; we temporarily
disable context swapping for the parent (forking) task by setting its
ctx->parent_gen to the all-1s value after locking the context, if it
is a cloned context, and restore the ctx->parent_gen value at the end
if the context didn't get uncloned in the meantime.

This also moves the increment of the context generation count to be
within the same critical section, protected by the context mutex, that
adds the new counter to the context.  That way, taking the mutex is
sufficient to ensure that both the counter list and the generation
count are stable.

[ Impact: fix hangs, races with inherited and PID counters ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18975.31580.520676.619896@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 89 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 79 insertions(+), 10 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 52e5a15321d80..db843f812a607 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -919,7 +919,8 @@ static int context_equiv(struct perf_counter_context *ctx1,
 			 struct perf_counter_context *ctx2)
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
-		&& ctx1->parent_gen == ctx2->parent_gen;
+		&& ctx1->parent_gen == ctx2->parent_gen
+		&& ctx1->parent_gen != ~0ull;
 }
 
 /*
@@ -1339,7 +1340,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			put_ctx(parent_ctx);
 			ctx->parent_ctx = NULL;		/* no longer a clone */
 		}
-		++ctx->generation;
 		/*
 		 * Get an extra reference before dropping the lock so that
 		 * this context won't get freed if the task exits.
@@ -1414,6 +1414,7 @@ static int perf_release(struct inode *inode, struct file *file)
 
 	file->private_data = NULL;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_counter_remove_from_context(counter);
 	mutex_unlock(&ctx->mutex);
@@ -1445,6 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->state == PERF_COUNTER_STATE_ERROR)
 		return 0;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
@@ -1504,6 +1506,7 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter,
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *sibling;
 
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	counter = counter->group_leader;
 
@@ -1524,6 +1527,7 @@ static void perf_counter_for_each_child(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	func(counter);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1536,6 +1540,7 @@ static void perf_counter_for_each(struct perf_counter *counter,
 {
 	struct perf_counter *child;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->child_mutex);
 	perf_counter_for_each_sibling(counter, func);
 	list_for_each_entry(child, &counter->child_list, child_list)
@@ -1741,6 +1746,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = vma->vm_file->private_data;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
 				      &counter->mmap_mutex)) {
 		struct user_struct *user = current_user();
@@ -1788,6 +1794,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_pgoff != 0)
 		return -EINVAL;
 
+	WARN_ON_ONCE(counter->ctx->parent_ctx);
 	mutex_lock(&counter->mmap_mutex);
 	if (atomic_inc_not_zero(&counter->mmap_count)) {
 		if (nr_pages != counter->data->nr_pages)
@@ -3349,8 +3356,10 @@ SYSCALL_DEFINE5(perf_counter_open,
 		goto err_free_put_context;
 
 	counter->filp = counter_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
 	mutex_lock(&ctx->mutex);
 	perf_install_in_context(ctx, counter, cpu);
+	++ctx->generation;
 	mutex_unlock(&ctx->mutex);
 
 	counter->owner = current;
@@ -3436,6 +3445,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Link this into the parent counter's child list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_add_tail(&child_counter->child_list, &parent_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3485,6 +3495,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 	/*
 	 * Remove this counter from the parent's list
 	 */
+	WARN_ON_ONCE(parent_counter->ctx->parent_ctx);
 	mutex_lock(&parent_counter->child_mutex);
 	list_del_init(&child_counter->child_list);
 	mutex_unlock(&parent_counter->child_mutex);
@@ -3527,12 +3538,17 @@ void perf_counter_exit_task(struct task_struct *child)
 	struct perf_counter_context *child_ctx;
 	unsigned long flags;
 
-	child_ctx = child->perf_counter_ctxp;
-
-	if (likely(!child_ctx))
+	if (likely(!child->perf_counter_ctxp))
 		return;
 
 	local_irq_save(flags);
+	/*
+	 * We can't reschedule here because interrupts are disabled,
+	 * and either child is current or it is a task that can't be
+	 * scheduled, so we are now safe from rescheduling changing
+	 * our context.
+	 */
+	child_ctx = child->perf_counter_ctxp;
 	__perf_counter_task_sched_out(child_ctx);
 
 	/*
@@ -3542,6 +3558,15 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
+	if (child_ctx->parent_ctx) {
+		/*
+		 * This context is a clone; unclone it so it can't get
+		 * swapped to another process while we're removing all
+		 * the counters from it.
+		 */
+		put_ctx(child_ctx->parent_ctx);
+		child_ctx->parent_ctx = NULL;
+	}
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
@@ -3571,9 +3596,11 @@ void perf_counter_exit_task(struct task_struct *child)
 int perf_counter_init_task(struct task_struct *child)
 {
 	struct perf_counter_context *child_ctx, *parent_ctx;
+	struct perf_counter_context *cloned_ctx;
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
+	u64 cloned_gen;
 	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
@@ -3581,8 +3608,7 @@ int perf_counter_init_task(struct task_struct *child)
 	mutex_init(&child->perf_counter_mutex);
 	INIT_LIST_HEAD(&child->perf_counter_list);
 
-	parent_ctx = parent->perf_counter_ctxp;
-	if (likely(!parent_ctx || !parent_ctx->nr_counters))
+	if (likely(!parent->perf_counter_ctxp))
 		return 0;
 
 	/*
@@ -3599,6 +3625,34 @@ int perf_counter_init_task(struct task_struct *child)
 	child->perf_counter_ctxp = child_ctx;
 	get_task_struct(child);
 
+	/*
+	 * If the parent's context is a clone, temporarily set its
+	 * parent_gen to an impossible value (all 1s) so it won't get
+	 * swapped under us.  The rcu_read_lock makes sure that
+	 * parent_ctx continues to exist even if it gets swapped to
+	 * another process and then freed while we are trying to get
+	 * its lock.
+	 */
+	rcu_read_lock();
+ retry:
+	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+	/*
+	 * No need to check if parent_ctx != NULL here; since we saw
+	 * it non-NULL earlier, the only reason for it to become NULL
+	 * is if we exit, and since we're currently in the middle of
+	 * a fork we can't be exiting at the same time.
+	 */
+	spin_lock_irq(&parent_ctx->lock);
+	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
+		spin_unlock_irq(&parent_ctx->lock);
+		goto retry;
+	}
+	cloned_gen = parent_ctx->parent_gen;
+	if (parent_ctx->parent_ctx)
+		parent_ctx->parent_gen = ~0ull;
+	spin_unlock_irq(&parent_ctx->lock);
+	rcu_read_unlock();
+
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
 	 * hashed yet and not running, so nobody can access it.
@@ -3630,10 +3684,15 @@ int perf_counter_init_task(struct task_struct *child)
 		/*
 		 * Mark the child context as a clone of the parent
 		 * context, or of whatever the parent is a clone of.
+		 * Note that if the parent is a clone, it could get
+		 * uncloned at any point, but that doesn't matter
+		 * because the list of counters and the generation
+		 * count can't have changed since we took the mutex.
 		 */
-		if (parent_ctx->parent_ctx) {
-			child_ctx->parent_ctx = parent_ctx->parent_ctx;
-			child_ctx->parent_gen = parent_ctx->parent_gen;
+		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
+		if (cloned_ctx) {
+			child_ctx->parent_ctx = cloned_ctx;
+			child_ctx->parent_gen = cloned_gen;
 		} else {
 			child_ctx->parent_ctx = parent_ctx;
 			child_ctx->parent_gen = parent_ctx->generation;
@@ -3643,6 +3702,16 @@ int perf_counter_init_task(struct task_struct *child)
 
 	mutex_unlock(&parent_ctx->mutex);
 
+	/*
+	 * Restore the clone status of the parent.
+	 */
+	if (parent_ctx->parent_ctx) {
+		spin_lock_irq(&parent_ctx->lock);
+		if (parent_ctx->parent_ctx)
+			parent_ctx->parent_gen = cloned_gen;
+		spin_unlock_irq(&parent_ctx->lock);
+	}
+
 	return ret;
 }
 
-- 
GitLab


From 15aedea439c4d7dbec17c99b5e1594c01b979833 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 27 May 2009 09:43:01 +0900
Subject: [PATCH 1430/2198] dma-debug: use sg_dma_address accessor instead of
 using dma_address directly

Architectures might not have dma_address in struct scatterlist (PARISC
doesn't). Directly accessing to dma_address in struct scatterlist is
wrong; we need to use sg_dma_address() accesssor instead.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index e47e1a08c3370..1b5bb82f106ab 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -840,7 +840,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		entry->dev            = dev;
 		entry->paddr          = sg_phys(s);
 		entry->size           = s->length;
-		entry->dev_addr       = s->dma_address;
+		entry->dev_addr       = sg_dma_address(s);
 		entry->direction      = direction;
 		entry->sg_call_ents   = nents;
 		entry->sg_mapped_ents = mapped_ents;
@@ -872,7 +872,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			.type           = dma_debug_sg,
 			.dev            = dev,
 			.paddr          = sg_phys(s),
-			.dev_addr       = s->dma_address,
+			.dev_addr       = sg_dma_address(s),
 			.size           = s->length,
 			.direction      = dir,
 			.sg_call_ents   = 0,
@@ -996,8 +996,8 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, s->dma_address, s->dma_length, 0,
-				direction, true);
+		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+			   direction, true);
 	}
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
@@ -1012,8 +1012,8 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, s->dma_address, s->dma_length, 0,
-				direction, false);
+		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+			   direction, false);
 	}
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
-- 
GitLab


From 884d05970bfbc3db368f23460dc4ce63257f240d Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 27 May 2009 09:43:02 +0900
Subject: [PATCH 1431/2198] dma-debug: use sg_dma_len accessor

debug_dma_map_sg() and debug_dma_unmap_sg() use length in struct
scatterlist while debug_dma_sync_sg_for_cpu() and
debug_dma_sync_sg_for_device() use dma_length. This causes bugs
warnings on some IOMMU implementations since these values are not
same; the length doesn't represent the dma length.

We always need to use sg_dma_len() accessor to get the dma length of a
scatterlist entry.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 1b5bb82f106ab..51f95e5b6265f 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -839,7 +839,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 		entry->type           = dma_debug_sg;
 		entry->dev            = dev;
 		entry->paddr          = sg_phys(s);
-		entry->size           = s->length;
+		entry->size           = sg_dma_len(s);
 		entry->dev_addr       = sg_dma_address(s);
 		entry->direction      = direction;
 		entry->sg_call_ents   = nents;
@@ -847,7 +847,7 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 
 		if (!PageHighMem(sg_page(s))) {
 			check_for_stack(dev, sg_virt(s));
-			check_for_illegal_area(dev, sg_virt(s), s->length);
+			check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
 		}
 
 		add_dma_entry(entry);
@@ -873,7 +873,7 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			.dev            = dev,
 			.paddr          = sg_phys(s),
 			.dev_addr       = sg_dma_address(s),
-			.size           = s->length,
+			.size           = sg_dma_len(s),
 			.direction      = dir,
 			.sg_call_ents   = 0,
 		};
@@ -996,7 +996,7 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, true);
 	}
 }
@@ -1012,7 +1012,7 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
-		check_sync(dev, sg_dma_address(s), s->dma_length, 0,
+		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, false);
 	}
 }
-- 
GitLab


From 88f3907f6f447899544beadf491dccb32015dacb Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Wed, 27 May 2009 09:43:03 +0900
Subject: [PATCH 1432/2198] dma-debug: fix debug_dma_sync_sg_for_cpu and
 debug_dma_sync_sg_for_device

DMA-mapping.txt says that debug_dma_sync_sg family must be called with
the _same_ one you passed into the dma_map_sg call, it should _NOT_ be
the 'count' value _returned_ from the dma_map_sg call.

debug_dma_sync_sg_for_cpu and debug_dma_sync_sg_for_device can't
handle this properly; they need to use the sg_mapped_ents in struct
dma_debug_entry as debug_dma_unmap_sg() does.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 51f95e5b6265f..1abed176d35a2 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -855,13 +855,32 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(debug_dma_map_sg);
 
+static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
+{
+	struct dma_debug_entry *entry;
+	struct hash_bucket *bucket;
+	unsigned long flags;
+	int mapped_ents = 0;
+	struct dma_debug_entry ref;
+
+	ref.dev = dev;
+	ref.dev_addr = sg_dma_address(s);
+	ref.size = sg_dma_len(s),
+
+	bucket = get_hash_bucket(&ref, &flags);
+	entry = hash_bucket_find(bucket, &ref);
+	if (entry)
+		mapped_ents = entry->sg_mapped_ents;
+	put_hash_bucket(bucket, &flags);
+
+	return mapped_ents;
+}
+
 void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 			int nelems, int dir)
 {
-	struct dma_debug_entry *entry;
 	struct scatterlist *s;
 	int mapped_ents = 0, i;
-	unsigned long flags;
 
 	if (unlikely(global_disable))
 		return;
@@ -881,14 +900,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
 		if (mapped_ents && i >= mapped_ents)
 			break;
 
-		if (mapped_ents == 0) {
-			struct hash_bucket *bucket;
+		if (!i) {
 			ref.sg_call_ents = nelems;
-			bucket = get_hash_bucket(&ref, &flags);
-			entry = hash_bucket_find(bucket, &ref);
-			if (entry)
-				mapped_ents = entry->sg_mapped_ents;
-			put_hash_bucket(bucket, &flags);
+			mapped_ents = get_nr_mapped_entries(dev, s);
 		}
 
 		check_unmap(&ref);
@@ -990,12 +1004,18 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
 			       int nelems, int direction)
 {
 	struct scatterlist *s;
-	int i;
+	int mapped_ents = 0, i;
 
 	if (unlikely(global_disable))
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
+		if (!i)
+			mapped_ents = get_nr_mapped_entries(dev, s);
+
+		if (i >= mapped_ents)
+			break;
+
 		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, true);
 	}
@@ -1006,12 +1026,18 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 				  int nelems, int direction)
 {
 	struct scatterlist *s;
-	int i;
+	int mapped_ents = 0, i;
 
 	if (unlikely(global_disable))
 		return;
 
 	for_each_sg(sg, s, nelems, i) {
+		if (!i)
+			mapped_ents = get_nr_mapped_entries(dev, s);
+
+		if (i >= mapped_ents)
+			break;
+
 		check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
 			   direction, false);
 	}
-- 
GitLab


From 3f4dee227348daac32f36daad9a91059efd0723e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: [PATCH 1433/2198] perf_counter: Fix cpuctx->task_ctx races

Peter noticed that we are sometimes reading cpuctx->task_ctx with
interrupts enabled.

Noticed-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index db843f812a607..eb346048f00f5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -234,15 +234,18 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
 	 * scheduled out before the smp call arrived.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		local_irq_restore(flags);
 		return;
+	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 	/*
 	 * Protect the list operation against NMI by disabling the
 	 * counters on a global level.
@@ -382,14 +385,17 @@ static void __perf_counter_disable(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	unsigned long flags;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx)
+	if (ctx->task && cpuctx->task_ctx != ctx) {
+		local_irq_restore(flags);
 		return;
+	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 
 	/*
 	 * If the counter is on, turn it off.
@@ -615,6 +621,7 @@ static void __perf_install_in_context(void *info)
 	unsigned long flags;
 	int err;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
@@ -623,12 +630,14 @@ static void __perf_install_in_context(void *info)
 	 * on this cpu because it had no counters.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
+		if (cpuctx->task_ctx || ctx->task != current) {
+			local_irq_restore(flags);
 			return;
+		}
 		cpuctx->task_ctx = ctx;
 	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
@@ -745,17 +754,20 @@ static void __perf_counter_enable(void *info)
 	unsigned long flags;
 	int err;
 
+	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current)
+		if (cpuctx->task_ctx || ctx->task != current) {
+			local_irq_restore(flags);
 			return;
+		}
 		cpuctx->task_ctx = ctx;
 	}
 
-	spin_lock_irqsave(&ctx->lock, flags);
+	spin_lock(&ctx->lock);
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-- 
GitLab


From 012b84dae17126d8b5d159173091eb3db5a2bc43 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 17 May 2009 11:08:41 +0200
Subject: [PATCH 1434/2198] perf_counter: Robustify counter-free logic

This fixes a nasty crash and highlights a bug that we were
freeing failed-fork() counters incorrectly.

(the fix for that will come separately)

[ Impact: fix crashes/lockups with inherited counters ]

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index eb346048f00f5..616c52426b32e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1004,6 +1004,10 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 
 	if (!cpuctx->task_ctx)
 		return;
+
+	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+		return;
+
 	__perf_counter_sched_out(ctx, cpuctx);
 	cpuctx->task_ctx = NULL;
 }
-- 
GitLab


From 294ae4011530d008c59c4fb9847738e39228821e Mon Sep 17 00:00:00 2001
From: GeunSik Lim <leemgs1@gmail.com>
Date: Thu, 28 May 2009 10:36:11 +0900
Subject: [PATCH 1435/2198] ftrace: fix typo about map of kernel priority in
 ftrace.txt file.

Fix typo about chart to map the kernel priority to user land priorities.

   * About sched_setscheduler(2)
      Processes scheduled under SCHED_FIFO or SCHED_RR
      can have a (user-space) static priority in the range 1 to 99.
      (reference: http://www.kernel.org/doc/man-pages/online/pages/
                  man2/sched_setscheduler.2.html)

   * From: Steven Rostedt
      0 to 98 - maps to RT tasks 99 to 1 (SCHED_RR or SCHED_FIFO)

      99 - maps to internal kernel threads that want to be lower than RT tasks
      but higher than SCHED_OTHER tasks. Although I'm not sure if any
      kernel thread actually uses this. I'm not even sure how this can be
      set, because the internal sched_setscheduler function does not allow
      for it.

      100 to 139 - maps nice levels -20 to 19. These are not set via
      sched_setscheduler, but are set via the nice system call.

      140 - reserved for idle tasks.

Signed-off-by: GeunSik Lim <geunsik.lim@samsung.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/ftrace.txt | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e6938135..e362f50c496f3 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
 values starting at 100 (nice -20). Below is a quick chart to map
 the kernel priority to user land priorities.
 
-  Kernel priority: 0 to 99    ==> user RT priority 99 to 0
-  Kernel priority: 100 to 139 ==> user nice -20 to 19
-  Kernel priority: 140        ==> idle task priority
+   Kernel Space                     User Space
+ ===============================================================
+   0(high) to  98(low)     user RT priority 99(high) to 1(low)
+                           with SCHED_RR or SCHED_FIFO
+ ---------------------------------------------------------------
+  99                       sched_priority is not used in scheduling
+                           decisions(it must be specified as 0)
+ ---------------------------------------------------------------
+ 100(high) to 139(low)     user nice -20(high) to 19(low)
+ ---------------------------------------------------------------
+ 140                       idle task priority
+ ---------------------------------------------------------------
 
 The task states are:
 
-- 
GitLab


From f04d82b7e0c63d0251f9952a537a4bc4d73aa1a9 Mon Sep 17 00:00:00 2001
From: GeunSik Lim <leemgs1@gmail.com>
Date: Thu, 28 May 2009 10:36:14 +0900
Subject: [PATCH 1436/2198] sched: fix typo in sched-rt-group.txt file

Fix typo about static priority's range.

    Kernel Space                     User Space
    ===============================================================
      0(high) to  98(low)     user RT priority 99(high) to 1(low)
                              with SCHED_RR or SCHED_FIFO
    ---------------------------------------------------------------
     99                       sched_priority is not used in scheduling
                              decisions(it must be specified as 0)
    ---------------------------------------------------------------
    100(high) to 139(low)     user nice -20(high) to 19(low)
    ---------------------------------------------------------------
    140                       idle task priority
    ---------------------------------------------------------------
    * ref) http://www.kernel.org/doc/man-pages/online/pages/man2/sched_setscheduler.2.html

Signed-off-by: GeunSik Lim <geunsik.lim@samsung.com>
CC: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/scheduler/sched-rt-group.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt
index eb74b014a3f8b..1df7f9cdab057 100644
--- a/Documentation/scheduler/sched-rt-group.txt
+++ b/Documentation/scheduler/sched-rt-group.txt
@@ -187,7 +187,7 @@ get their allocated time.
 
 Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
 the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-139. With deadline scheduling you need to
+the limited static priority levels 0-99. With deadline scheduling you need to
 do deadline inheritance (since priority is inversely proportional to the
 deadline delta (deadline - now).
 
-- 
GitLab


From efb3d17240d80e27508d238809168120fe4b93a4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 May 2009 14:25:58 +0200
Subject: [PATCH 1437/2198] perf_counter: Fix COMM and MMAP events for cpu wide
 counters

Commit a63eaf34ae6 ("perf_counter: Dynamically allocate tasks'
perf_counter_context struct") broke COMM and MMAP notification for
cpu wide counters by dropping out early if there was no task context,
thereby also not iterating the cpu context.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 616c52426b32e..58d6d198faa27 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2443,9 +2443,9 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
+	if (cpuctx->task_ctx)
+		perf_counter_comm_ctx(cpuctx->task_ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
-
-	perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event);
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2454,8 +2454,6 @@ void perf_counter_comm(struct task_struct *task)
 
 	if (!atomic_read(&nr_comm_tracking))
 		return;
-	if (!current->perf_counter_ctxp)
-		return;
 
 	comm_event = (struct perf_comm_event){
 		.task	= task,
@@ -2570,10 +2568,10 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
+	if (cpuctx->task_ctx)
+		perf_counter_mmap_ctx(cpuctx->task_ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
-	perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event);
-
 	kfree(buf);
 }
 
@@ -2584,8 +2582,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 
 	if (!atomic_read(&nr_mmap_tracking))
 		return;
-	if (!current->perf_counter_ctxp)
-		return;
 
 	mmap_event = (struct perf_mmap_event){
 		.file   = file,
-- 
GitLab


From 665c2142a94202881a3c11cbaee6506cb10ada2d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 May 2009 14:51:57 +0200
Subject: [PATCH 1438/2198] perf_counter: Clean up task_ctx vs interrupts

Remove the local_irq_save() etc.. in routines that are smp function
calls, or have IRQs disabled by other means.

Then change the COMM, MMAP, and swcounter context iteration to
current->perf_counter_ctxp and RCU, since it really doesn't matter
which context they iterate, they're all folded.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 82 ++++++++++++++++++++++++++-----------------
 1 file changed, 50 insertions(+), 32 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 58d6d198faa27..0c000d305e0e0 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -232,18 +232,14 @@ static void __perf_counter_remove_from_context(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter *counter = info;
 	struct perf_counter_context *ctx = counter->ctx;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
 	 * scheduled out before the smp call arrived.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		local_irq_restore(flags);
+	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
-	}
 
 	spin_lock(&ctx->lock);
 	/*
@@ -267,7 +263,7 @@ static void __perf_counter_remove_from_context(void *info)
 	}
 
 	perf_enable();
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 
@@ -383,17 +379,13 @@ static void __perf_counter_disable(void *info)
 	struct perf_counter *counter = info;
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
-	unsigned long flags;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
-	if (ctx->task && cpuctx->task_ctx != ctx) {
-		local_irq_restore(flags);
+	if (ctx->task && cpuctx->task_ctx != ctx)
 		return;
-	}
 
 	spin_lock(&ctx->lock);
 
@@ -411,7 +403,7 @@ static void __perf_counter_disable(void *info)
 		counter->state = PERF_COUNTER_STATE_OFF;
 	}
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 /*
@@ -618,10 +610,8 @@ static void __perf_install_in_context(void *info)
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
 	int cpu = smp_processor_id();
-	unsigned long flags;
 	int err;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a task context, we need to check whether it is
 	 * the current task context of this cpu. If not it has been
@@ -630,10 +620,8 @@ static void __perf_install_in_context(void *info)
 	 * on this cpu because it had no counters.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current) {
-			local_irq_restore(flags);
+		if (cpuctx->task_ctx || ctx->task != current)
 			return;
-		}
 		cpuctx->task_ctx = ctx;
 	}
 
@@ -687,7 +675,7 @@ static void __perf_install_in_context(void *info)
  unlock:
 	perf_enable();
 
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 /*
@@ -751,19 +739,15 @@ static void __perf_counter_enable(void *info)
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
 	struct perf_counter_context *ctx = counter->ctx;
 	struct perf_counter *leader = counter->group_leader;
-	unsigned long flags;
 	int err;
 
-	local_irq_save(flags);
 	/*
 	 * If this is a per-task counter, need to check whether this
 	 * counter's task is the current task on this cpu.
 	 */
 	if (ctx->task && cpuctx->task_ctx != ctx) {
-		if (cpuctx->task_ctx || ctx->task != current) {
-			local_irq_restore(flags);
+		if (cpuctx->task_ctx || ctx->task != current)
 			return;
-		}
 		cpuctx->task_ctx = ctx;
 	}
 
@@ -811,7 +795,7 @@ static void __perf_counter_enable(void *info)
 	}
 
  unlock:
-	spin_unlock_irqrestore(&ctx->lock, flags);
+	spin_unlock(&ctx->lock);
 }
 
 /*
@@ -981,6 +965,10 @@ void perf_counter_task_sched_out(struct task_struct *task,
 		spin_lock(&ctx->lock);
 		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
 		if (context_equiv(ctx, next_ctx)) {
+			/*
+			 * XXX do we need a memory barrier of sorts
+			 * wrt to rcu_dereference() of perf_counter_ctxp
+			 */
 			task->perf_counter_ctxp = next_ctx;
 			next->perf_counter_ctxp = ctx;
 			ctx->task = next;
@@ -998,6 +986,9 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	}
 }
 
+/*
+ * Called with IRQs disabled
+ */
 static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 {
 	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
@@ -1012,6 +1003,9 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx)
 	cpuctx->task_ctx = NULL;
 }
 
+/*
+ * Called with IRQs disabled
+ */
 static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
 	__perf_counter_sched_out(&cpuctx->ctx, cpuctx);
@@ -2431,6 +2425,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
 static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 {
 	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
 	unsigned int size;
 	char *comm = comm_event->task->comm;
 
@@ -2443,9 +2438,17 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_comm_ctx(&cpuctx->ctx, comm_event);
-	if (cpuctx->task_ctx)
-		perf_counter_comm_ctx(cpuctx->task_ctx, comm_event);
 	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_comm_ctx(ctx, comm_event);
+	rcu_read_unlock();
 }
 
 void perf_counter_comm(struct task_struct *task)
@@ -2536,6 +2539,7 @@ static void perf_counter_mmap_ctx(struct perf_counter_context *ctx,
 static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 {
 	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
 	struct file *file = mmap_event->file;
 	unsigned int size;
 	char tmp[16];
@@ -2568,10 +2572,18 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 
 	cpuctx = &get_cpu_var(perf_cpu_context);
 	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event);
-	if (cpuctx->task_ctx)
-		perf_counter_mmap_ctx(cpuctx->task_ctx, mmap_event);
 	put_cpu_var(perf_cpu_context);
 
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_mmap_ctx(ctx, mmap_event);
+	rcu_read_unlock();
+
 	kfree(buf);
 }
 
@@ -2882,6 +2894,7 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
+	struct perf_counter_context *ctx;
 
 	if (*recursion)
 		goto out;
@@ -2891,10 +2904,15 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event,
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
 				 nr, nmi, regs, addr);
-	if (cpuctx->task_ctx) {
-		perf_swcounter_ctx_event(cpuctx->task_ctx, type, event,
-					 nr, nmi, regs, addr);
-	}
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr);
+	rcu_read_unlock();
 
 	barrier();
 	(*recursion)--;
-- 
GitLab


From bbbee90829304d156c12b171c0ac7e6e1aba8b90 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 29 May 2009 14:25:58 +0200
Subject: [PATCH 1439/2198] perf_counter: Ammend cleanup in fork() fail

When fork() fails we cannot use perf_counter_exit_task() since that
assumes to operate on current. Write a new helper that cleans up
unused/clean contexts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/fork.c                |  2 +-
 kernel/perf_counter.c        | 43 +++++++++++++++++++++++++++++++++---
 3 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 717bf3b59ba40..519a41bba2495 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -579,6 +579,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task,
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern int perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
+extern void perf_counter_free_task(struct task_struct *task);
 extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void __perf_disable(void);
@@ -644,6 +645,7 @@ static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)		{ }
 static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; }
 static inline void perf_counter_exit_task(struct task_struct *child)	{ }
+static inline void perf_counter_free_task(struct task_struct *task)	{ }
 static inline void perf_counter_do_pending(void)			{ }
 static inline void perf_counter_print_debug(void)			{ }
 static inline void perf_disable(void)					{ }
diff --git a/kernel/fork.c b/kernel/fork.c
index c07c3335ceac6..23bf757ed3211 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1298,7 +1298,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cleanup_audit:
 	audit_free(p);
 bad_fork_cleanup_policy:
-	perf_counter_exit_task(p);
+	perf_counter_free_task(p);
 #ifdef CONFIG_NUMA
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_cgroup:
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0c000d305e0e0..79c3f26541d3e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3538,8 +3538,7 @@ static void sync_child_counter(struct perf_counter *child_counter,
 }
 
 static void
-__perf_counter_exit_task(struct task_struct *child,
-			 struct perf_counter *child_counter,
+__perf_counter_exit_task(struct perf_counter *child_counter,
 			 struct perf_counter_context *child_ctx)
 {
 	struct perf_counter *parent_counter;
@@ -3605,7 +3604,7 @@ void perf_counter_exit_task(struct task_struct *child)
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
 				 list_entry)
-		__perf_counter_exit_task(child, child_counter, child_ctx);
+		__perf_counter_exit_task(child_counter, child_ctx);
 
 	/*
 	 * If the last counter was a group counter, it will have appended all
@@ -3620,6 +3619,44 @@ void perf_counter_exit_task(struct task_struct *child)
 	put_ctx(child_ctx);
 }
 
+/*
+ * free an unexposed, unused context as created by inheritance by
+ * init_task below, used by fork() in case of fail.
+ */
+void perf_counter_free_task(struct task_struct *task)
+{
+	struct perf_counter_context *ctx = task->perf_counter_ctxp;
+	struct perf_counter *counter, *tmp;
+
+	if (!ctx)
+		return;
+
+	mutex_lock(&ctx->mutex);
+again:
+	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) {
+		struct perf_counter *parent = counter->parent;
+
+		if (WARN_ON_ONCE(!parent))
+			continue;
+
+		mutex_lock(&parent->child_mutex);
+		list_del_init(&counter->child_list);
+		mutex_unlock(&parent->child_mutex);
+
+		fput(parent->filp);
+
+		list_del_counter(counter, ctx);
+		free_counter(counter);
+	}
+
+	if (!list_empty(&ctx->counter_list))
+		goto again;
+
+	mutex_unlock(&ctx->mutex);
+
+	put_ctx(ctx);
+}
+
 /*
  * Initialize the perf_counter context in task_struct
  */
-- 
GitLab


From b78c07d45a7e71be7b5c5d7486f922355ccf23a8 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 29 May 2009 13:48:59 -0300
Subject: [PATCH 1440/2198] perf_counter tools: Shorten the DSO names using cwd

[acme@emilia linux-2.6-tip]$ pwd
/home/acme/git/linux-2.6-tip

Before (still available using -P/--full-paths)

[acme@emilia linux-2.6-tip]$ perf report -P | head -10
    11.48%             perf: 7454 [kernel]: clear_page_c
     4.89%             perf: 7454 [kernel]: vsnprintf
     4.61%             perf: 7454 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__find_symbol
     4.09%             perf: 7454 [kernel]: number
     4.06%             perf: 7454 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: dso__fprintf
     4.00%             perf: 7454 /home/acme/git/linux-2.6-tip/Documentation/perf_counter/perf: symbol_filter

New default:

[acme@emilia linux-2.6-tip]$ perf report | head -10
    11.48%             perf: 7454 [kernel]: clear_page_c
     4.89%             perf: 7454 [kernel]: vsnprintf
     4.61%             perf: 7454 ./Documentation/perf_counter/perf: dso__find_symbol
     4.09%             perf: 7454 [kernel]: number
     4.06%             perf: 7454 ./Documentation/perf_counter/perf: dso__fprintf
     4.00%             perf: 7454 ./Documentation/perf_counter/perf: symbol_filter

Suggested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090529164859.GN4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 41 +++++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 412d524dd6581..4705679debae1 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -23,6 +23,7 @@ static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
 static int		verbose;
+static int		full_paths;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -134,6 +135,16 @@ static int load_kernel(void)
 	return err;
 }
 
+static int strcommon(const char *pathname, const char *cwd, int cwdlen)
+{
+	int n = 0;
+
+	while (pathname[n] == cwd[n] && n < cwdlen)
+		++n;
+
+	return n;
+}
+
 struct map {
 	struct list_head node;
 	uint64_t	 start;
@@ -142,16 +153,28 @@ struct map {
 	struct dso	 *dso;
 };
 
-static struct map *map__new(struct mmap_event *event)
+static struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
 {
 	struct map *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
+		const char *filename = event->filename;
+		char newfilename[PATH_MAX];
+
+		if (cwd) {
+			int n = strcommon(filename, cwd, cwdlen);
+			if (n == cwdlen) {
+				snprintf(newfilename, sizeof(newfilename),
+					 ".%s", filename + n);
+				filename = newfilename;
+			}
+		}
+
 		self->start = event->start;
 		self->end   = event->start + event->len;
 		self->pgoff = event->pgoff;
 
-		self->dso = dsos__findnew(event->filename);
+		self->dso = dsos__findnew(filename);
 		if (self->dso == NULL)
 			goto out_delete;
 	}
@@ -598,6 +621,8 @@ static int __cmd_report(void)
 	int ret, rc = EXIT_FAILURE;
 	uint32_t size;
 	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
+	char cwd[PATH_MAX], *cwdp = cwd;
+	int cwdlen;
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
@@ -621,6 +646,14 @@ static int __cmd_report(void)
 		return EXIT_FAILURE;
 	}
 
+	if (!full_paths) {
+		if (getcwd(cwd, sizeof(cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else
+		cwdp = NULL;
 remap:
 	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
 			   MAP_SHARED, input, offset);
@@ -710,7 +743,7 @@ static int __cmd_report(void)
 	} else switch (event->header.type) {
 	case PERF_EVENT_MMAP: {
 		struct thread *thread = threads__findnew(event->mmap.pid);
-		struct map *map = map__new(&event->mmap);
+		struct map *map = map__new(&event->mmap, cwdp, cwdlen);
 
 		if (dump_trace) {
 			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
@@ -809,6 +842,8 @@ static const struct option options[] = {
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+	OPT_BOOLEAN('P', "full-paths", &full_paths,
+		    "Don't shorten the pathnames taking into account the cwd"),
 	OPT_END()
 };
 
-- 
GitLab


From c143dc903d7c0b15f5052e00b2c7de33a8b4299c Mon Sep 17 00:00:00 2001
From: James Bottomley <James.Bottomley@HansenPartnership.com>
Date: Sat, 30 May 2009 06:43:49 +0200
Subject: [PATCH 1441/2198] block: fix an oops on BLKPREP_KILL

Doing a bit of torture testing, I ran across a BUG in the block
subsystem (at blk-core.c:2048): the test for if the request is queued.

It turns out the trigger was a BLKPREP_KILL coming out of the SCSI prep
function.  Currently for BLKPREP_KILL requests, we send them straight
into __blk_end_request_all() with an error, but they've never been
dequeued, so they trip the bug.  Fix this by starting requests before
killing them.

Signed-off-by: James Bottomley <James.Bottomley@HansenPartnership.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 8b3b74e691841..7ae83a1e2acff 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1789,6 +1789,11 @@ struct request *blk_peek_request(struct request_queue *q)
 			break;
 		} else if (ret == BLKPREP_KILL) {
 			rq->cmd_flags |= REQ_QUIET;
+			/*
+			 * Mark this request as started so we don't trigger
+			 * any debug logic in the end I/O path.
+			 */
+			blk_start_request(rq);
 			__blk_end_request_all(rq, -EIO);
 		} else {
 			printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
-- 
GitLab


From c44613a4c1092e85841b78b7ab52a06654fcd321 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 29 May 2009 17:03:07 -0300
Subject: [PATCH 1442/2198] perf_counter tools: Add locking to perf top

perf_counter tools: Add locking to perf top

We need to protect the active_symbols list as two threads change it:
the main thread adding entries to the head and the display thread
decaying entries from any place in the list.

Also related: take a snapshot of syme->count[0] before using it to
calculate the weight and to show the same number used in this calc when
displaying the symbol usage.

Reported-by: Mike Galbraith <efault@gmx.de>
Tested-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090529200307.GR4747@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 47 ++++++++++++++++--------
 1 file changed, 31 insertions(+), 16 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index ebe8bec1a0e8d..24a887907a7ad 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -129,6 +129,8 @@ struct sym_entry {
 	struct rb_node		rb_node;
 	struct list_head	node;
 	unsigned long		count[MAX_COUNTERS];
+	unsigned long		snap_count;
+	double			weight;
 	int			skip;
 };
 
@@ -141,17 +143,16 @@ struct dso *kernel_dso;
  * after decayed.
  */
 static LIST_HEAD(active_symbols);
+static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
 
 /*
  * Ordering weight: count-1 * count-2 * ... / count-n
  */
 static double sym_weight(const struct sym_entry *sym)
 {
-	double weight;
+	double weight = sym->snap_count;
 	int counter;
 
-	weight = sym->count[0];
-
 	for (counter = 1; counter < nr_counters-1; counter++)
 		weight *= sym->count[counter];
 
@@ -164,11 +165,18 @@ static long			events;
 static long			userspace_events;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
-static void list_insert_active_sym(struct sym_entry *syme)
+static void __list_insert_active_sym(struct sym_entry *syme)
 {
 	list_add(&syme->node, &active_symbols);
 }
 
+static void list_remove_active_sym(struct sym_entry *syme)
+{
+	pthread_mutex_lock(&active_symbols_lock);
+	list_del_init(&syme->node);
+	pthread_mutex_unlock(&active_symbols_lock);
+}
+
 static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 {
 	struct rb_node **p = &tree->rb_node;
@@ -179,7 +187,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 		parent = *p;
 		iter = rb_entry(parent, struct sym_entry, rb_node);
 
-		if (sym_weight(se) > sym_weight(iter))
+		if (se->weight > iter->weight)
 			p = &(*p)->rb_left;
 		else
 			p = &(*p)->rb_right;
@@ -203,15 +211,21 @@ static void print_sym_table(void)
 	events = userspace_events = 0;
 
 	/* Sort the active symbols */
-	list_for_each_entry_safe(syme, n, &active_symbols, node) {
-		if (syme->count[0] != 0) {
+	pthread_mutex_lock(&active_symbols_lock);
+	syme = list_entry(active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+		syme->snap_count = syme->count[0];
+		if (syme->snap_count != 0) {
+			syme->weight = sym_weight(syme);
 			rb_insert_active_sym(&tmp, syme);
-			sum_kevents += syme->count[0];
+			sum_kevents += syme->snap_count;
 
 			for (j = 0; j < nr_counters; j++)
 				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
 		} else
-			list_del_init(&syme->node);
+			list_remove_active_sym(syme);
 	}
 
 	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
@@ -264,19 +278,18 @@ static void print_sym_table(void)
 		struct symbol *sym = (struct symbol *)(syme + 1);
 		float pcnt;
 
-		if (++printed > 18 || syme->count[0] < count_filter)
-			break;
+		if (++printed > 18 || syme->snap_count < count_filter)
+			continue;
 
-		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->count[0]) /
+		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
 					 sum_kevents));
 
 		if (nr_counters == 1)
 			printf("%19.2f - %4.1f%% - %016llx : %s\n",
-				sym_weight(syme),
-				pcnt, sym->start, sym->name);
+				syme->weight, pcnt, sym->start, sym->name);
 		else
 			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-				sym_weight(syme), syme->count[0],
+				syme->weight, syme->snap_count,
 				pcnt, sym->start, sym->name);
 	}
 
@@ -395,8 +408,10 @@ static void record_ip(uint64_t ip, int counter)
 
 		if (!syme->skip) {
 			syme->count[counter]++;
+			pthread_mutex_lock(&active_symbols_lock);
 			if (list_empty(&syme->node) || !syme->node.next)
-				list_insert_active_sym(syme);
+				__list_insert_active_sym(syme);
+			pthread_mutex_unlock(&active_symbols_lock);
 			return;
 		}
 	}
-- 
GitLab


From d7c29318c2daa96d64b7312afd8283488c1cb29f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 30 May 2009 12:38:51 +0200
Subject: [PATCH 1443/2198] perf_counter tools: Print 'CPU utilization factor'
 in builtin-stat

Before:

 Performance counter stats for '/home/mingo/hackbench':

    5728.862689  task clock ticks     (msecs)
          34426  context switches     #        0.006 M/sec
           3835  CPU migrations       #        0.001 M/sec
          18158  pagefaults           #        0.003 M/sec
    16218109156  CPU cycles           #     2830.947 M/sec
    13519616840  instructions         #     2359.913 M/sec
       55941661  cache references     #        9.765 M/sec
       23554938  cache misses         #        4.112 M/sec

 Wall-clock time elapsed:   528.886980 msecs

After:

 Performance counter stats for '/home/mingo/hackbench':

    5845.443541  task clock ticks     #      11.886 CPU utilization factor
          38289  context switches     #       0.007 M/sec
           4208  CPU migrations       #       0.001 M/sec
          17755  pagefaults           #       0.003 M/sec
    16664668576  CPU cycles           #    2850.882 M/sec
    13468113991  instructions         #    2304.036 M/sec
       57445468  cache references     #       9.827 M/sec
       26896502  cache misses         #       4.601 M/sec

 Wall-clock time elapsed:   491.802357 msecs

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index ef7e0e1192cbf..588679167c888 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -75,6 +75,7 @@ static __u64			event_res[MAX_COUNTERS][3];
 static __u64			event_scaled[MAX_COUNTERS];
 
 static __u64			runtime_nsecs;
+static __u64			walltime_nsecs;
 
 static void create_perfstat_counter(int counter)
 {
@@ -194,13 +195,19 @@ static void print_counter(int counter)
 	if (nsec_counter(counter)) {
 		double msecs = (double)count[0] / 1000000;
 
-		fprintf(stderr, " %14.6f  %-20s (msecs)",
+		fprintf(stderr, " %14.6f  %-20s",
 			msecs, event_name(counter));
+		if (event_id[counter] ==
+				EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+
+			fprintf(stderr, " # %11.3f CPU utilization factor",
+				(double)count[0] / (double)walltime_nsecs);
+		}
 	} else {
 		fprintf(stderr, " %14Ld  %-20s",
 			count[0], event_name(counter));
 		if (runtime_nsecs)
-			fprintf(stderr, " # %12.3f M/sec",
+			fprintf(stderr, " # %11.3f M/sec",
 				(double)count[0]/runtime_nsecs*1000.0);
 	}
 	if (scaled)
@@ -241,6 +248,8 @@ static int do_perfstat(int argc, const char **argv)
 	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
 	t1 = rdclock();
 
+	walltime_nsecs = t1 - t0;
+
 	fflush(stdout);
 
 	fprintf(stderr, "\n");
-- 
GitLab


From 7fbd55449aafb86d3237b5d1a26fb4dab2aa2c76 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 30 May 2009 12:38:51 +0200
Subject: [PATCH 1444/2198] perf_counter tools: Fix 'make install'

'make install' didnt install perf itself - which needs a special
rule to be copied to bindir.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index bd29a5c001004..8f725840477fd 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -776,6 +776,7 @@ install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
-- 
GitLab


From c1c2365acf8c044f749c0fe1ea236497e8d1718e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 30 May 2009 12:38:51 +0200
Subject: [PATCH 1445/2198] perf_counter tools: Generate per command manpages
 (and pdf/html, etc.)

Import Git's nice .txt => {man/html/pdf} generation machinery.

Fix various errors in the Documentation/perf*.txt description as well.

Also fix a bug in builtin-help: we'd map 'perf help top' to 'perftop'
if only the 'perf' binary is in the default PATH - confusing the manpage
logic. I dont fully understand why Git did it this way - but i suppose
it's a migration artifact from their migration from standalone git-xyz
commands to 'git xyz' commands. The perf tools were always using the
modern form so it's not an issue there.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/Documentation/Makefile       | 300 ++++++++++++++++++
 .../perf_counter/Documentation/asciidoc.conf  |  91 ++++++
 .../Documentation/manpage-1.72.xsl            |  14 +
 .../Documentation/manpage-base.xsl            |  35 ++
 .../Documentation/manpage-bold-literal.xsl    |  17 +
 .../Documentation/manpage-normal.xsl          |  13 +
 .../Documentation/manpage-suppress-sp.xsl     |  21 ++
 .../Documentation/perf-record.txt             |  10 +-
 .../Documentation/perf-report.txt             |   8 +-
 .../perf_counter/Documentation/perf-stat.txt  |   5 +-
 .../perf_counter/Documentation/perf-top.txt   |   8 +-
 .../perf_counter/Documentation/perf.txt       |  23 ++
 Documentation/perf_counter/Makefile           |  61 ++++
 Documentation/perf_counter/builtin-help.c     |   2 +-
 14 files changed, 581 insertions(+), 27 deletions(-)
 create mode 100644 Documentation/perf_counter/Documentation/Makefile
 create mode 100644 Documentation/perf_counter/Documentation/asciidoc.conf
 create mode 100644 Documentation/perf_counter/Documentation/manpage-1.72.xsl
 create mode 100644 Documentation/perf_counter/Documentation/manpage-base.xsl
 create mode 100644 Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
 create mode 100644 Documentation/perf_counter/Documentation/manpage-normal.xsl
 create mode 100644 Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
 create mode 100644 Documentation/perf_counter/Documentation/perf.txt

diff --git a/Documentation/perf_counter/Documentation/Makefile b/Documentation/perf_counter/Documentation/Makefile
new file mode 100644
index 0000000000000..5457192e1b411
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/Makefile
@@ -0,0 +1,300 @@
+MAN1_TXT= \
+	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
+		$(wildcard perf-*.txt)) \
+	perf.txt
+MAN5_TXT=
+MAN7_TXT=
+
+MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
+MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+
+DOC_HTML=$(MAN_HTML)
+
+ARTICLES =
+# with their own formatting rules.
+SP_ARTICLES =
+API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt)))
+SP_ARTICLES += $(API_DOCS)
+SP_ARTICLES += technical/api-index
+
+DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+
+DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+
+prefix?=$(HOME)
+bindir?=$(prefix)/bin
+htmldir?=$(prefix)/share/doc/perf-doc
+pdfdir?=$(prefix)/share/doc/perf-doc
+mandir?=$(prefix)/share/man
+man1dir=$(mandir)/man1
+man5dir=$(mandir)/man5
+man7dir=$(mandir)/man7
+# DESTDIR=
+
+ASCIIDOC=asciidoc
+ASCIIDOC_EXTRA =
+MANPAGE_XSL = manpage-normal.xsl
+XMLTO_EXTRA =
+INSTALL?=install
+RM ?= rm -f
+DOC_REF = origin/man
+HTML_REF = origin/html
+
+infodir?=$(prefix)/share/info
+MAKEINFO=makeinfo
+INSTALL_INFO=install-info
+DOCBOOK2X_TEXI=docbook2x-texi
+DBLATEX=dblatex
+ifndef PERL_PATH
+	PERL_PATH = /usr/bin/perl
+endif
+
+-include ../config.mak.autogen
+-include ../config.mak
+
+#
+# For asciidoc ...
+#	-7.1.2,	no extra settings are needed.
+#	8.0-,	set ASCIIDOC8.
+#
+
+#
+# For docbook-xsl ...
+#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0)
+#	1.69.0,		no extra settings are needed?
+#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP?
+#	1.71.1,		no extra settings are needed?
+#	1.72.0,		set DOCBOOK_XSL_172.
+#	1.73.0-,	set ASCIIDOC_NO_ROFF
+#
+
+#
+# If you had been using DOCBOOK_XSL_172 in an attempt to get rid
+# of 'the ".ft C" problem' in your generated manpages, and you
+# instead ended up with weird characters around callouts, try
+# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8).
+#
+
+ifdef ASCIIDOC8
+ASCIIDOC_EXTRA += -a asciidoc7compatible
+endif
+ifdef DOCBOOK_XSL_172
+ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+MANPAGE_XSL = manpage-1.72.xsl
+else
+	ifdef ASCIIDOC_NO_ROFF
+	# docbook-xsl after 1.72 needs the regular XSL, but will not
+	# pass-thru raw roff codes from asciidoc.conf, so turn them off.
+	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff
+	endif
+endif
+ifdef MAN_BOLD_LITERAL
+XMLTO_EXTRA += -m manpage-bold-literal.xsl
+endif
+ifdef DOCBOOK_SUPPRESS_SP
+XMLTO_EXTRA += -m manpage-suppress-sp.xsl
+endif
+
+SHELL_PATH ?= $(SHELL)
+# Shell quote;
+SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
+
+#
+# Please note that there is a minor bug in asciidoc.
+# The version after 6.0.3 _will_ include the patch found here:
+#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2
+#
+# Until that version is released you may have to apply the patch
+# yourself - yes, all 6 characters of it!
+#
+
+QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir
+QUIET_SUBDIR1  =
+
+ifneq ($(findstring $(MAKEFLAGS),w),w)
+PRINT_DIR = --no-print-directory
+else # "make -w"
+NO_SUBDIR = :
+endif
+
+ifneq ($(findstring $(MAKEFLAGS),s),s)
+ifndef V
+	QUIET_ASCIIDOC	= @echo '   ' ASCIIDOC $@;
+	QUIET_XMLTO	= @echo '   ' XMLTO $@;
+	QUIET_DB2TEXI	= @echo '   ' DB2TEXI $@;
+	QUIET_MAKEINFO	= @echo '   ' MAKEINFO $@;
+	QUIET_DBLATEX	= @echo '   ' DBLATEX $@;
+	QUIET_XSLTPROC	= @echo '   ' XSLTPROC $@;
+	QUIET_GEN	= @echo '   ' GEN $@;
+	QUIET_STDERR	= 2> /dev/null
+	QUIET_SUBDIR0	= +@subdir=
+	QUIET_SUBDIR1	= ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \
+			  $(MAKE) $(PRINT_DIR) -C $$subdir
+	export V
+endif
+endif
+
+all: html man
+
+html: $(DOC_HTML)
+
+$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf
+
+man: man1 man5 man7
+man1: $(DOC_MAN1)
+man5: $(DOC_MAN5)
+man7: $(DOC_MAN7)
+
+info: perf.info perfman.info
+
+pdf: user-manual.pdf
+
+install: install-man
+
+install-man: man
+	$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
+#	$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
+	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
+#	$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
+#	$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-info: info
+	$(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
+	$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
+	if test -r $(DESTDIR)$(infodir)/dir; then \
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+	else \
+	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
+	fi
+
+install-pdf: pdf
+	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
+	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
+
+install-html: html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+
+../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
+	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+
+-include ../PERF-VERSION-FILE
+
+#
+# Determine "include::" file references in asciidoc files.
+#
+doc.dep : $(wildcard *.txt) build-docdep.perl
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
+	mv $@+ $@
+
+-include doc.dep
+
+cmds_txt = cmds-ancillaryinterrogators.txt \
+	cmds-ancillarymanipulators.txt \
+	cmds-mainporcelain.txt \
+	cmds-plumbinginterrogators.txt \
+	cmds-plumbingmanipulators.txt \
+	cmds-synchingrepositories.txt \
+	cmds-synchelpers.txt \
+	cmds-purehelpers.txt \
+	cmds-foreignscminterface.txt
+
+$(cmds_txt): cmd-list.made
+
+cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
+	$(QUIET_GEN)$(RM) $@ && \
+	$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
+	date >$@
+
+clean:
+	$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
+	$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
+	$(RM) howto-index.txt howto/*.html doc.dep
+	$(RM) technical/api-*.html technical/api-index.txt
+	$(RM) $(cmds_txt) *.made
+
+$(MAN_HTML): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+%.1 %.5 %.7 : %.xml
+	$(QUIET_XMLTO)$(RM) $@ && \
+	xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+
+%.xml : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
+		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
+	mv $@+ $@
+
+XSLT = docbook.xsl
+XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
+
+user-manual.html: user-manual.xml
+	$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
+
+perf.info: user-manual.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
+
+user-manual.texi: user-manual.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
+	$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+user-manual.pdf: user-manual.xml
+	$(QUIET_DBLATEX)$(RM) $@+ $@ && \
+	$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
+	mv $@+ $@
+
+perfman.texi: $(MAN_XML) cat-texi.perl
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
+		--to-stdout $(xml) &&) true) > $@++ && \
+	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \
+	rm $@++ && \
+	mv $@+ $@
+
+perfman.info: perfman.texi
+	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
+
+$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
+	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
+	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \
+	mv $@+ $@
+
+howto-index.txt: howto-index.sh $(wildcard howto/*.txt)
+	$(QUIET_GEN)$(RM) $@+ $@ && \
+	'$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \
+	mv $@+ $@
+
+$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt
+	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt
+
+WEBDOC_DEST = /pub/software/tools/perf/docs
+
+$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
+	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
+	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
+	mv $@+ $@
+
+install-webdoc : html
+	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
+
+quick-install: quick-install-man
+
+quick-install-man:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+
+quick-install-html:
+	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+
+.PHONY: .FORCE-PERF-VERSION-FILE
diff --git a/Documentation/perf_counter/Documentation/asciidoc.conf b/Documentation/perf_counter/Documentation/asciidoc.conf
new file mode 100644
index 0000000000000..356b23a403398
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/asciidoc.conf
@@ -0,0 +1,91 @@
+## linkperf: macro
+#
+# Usage: linkperf:command[manpage-section]
+#
+# Note, {0} is the manpage section, while {target} is the command.
+#
+# Show PERF link as: <command>(<section>); if section is defined, else just show
+# the command.
+
+[macros]
+(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]=
+
+[attributes]
+asterisk=&#42;
+plus=&#43;
+caret=&#94;
+startsb=&#91;
+endsb=&#93;
+tilde=&#126;
+
+ifdef::backend-docbook[]
+[linkperf-inlinemacro]
+{0%{target}}
+{0#<citerefentry>}
+{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>}
+{0#</citerefentry>}
+endif::backend-docbook[]
+
+ifdef::backend-docbook[]
+ifndef::perf-asciidoc-no-roff[]
+# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this.
+# v1.72 breaks with this because it replaces dots not in roff requests.
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+ifdef::doctype-manpage[]
+&#10;.ft C&#10;
+endif::doctype-manpage[]
+|
+ifdef::doctype-manpage[]
+&#10;.ft&#10;
+endif::doctype-manpage[]
+</literallayout>
+{title#}</example>
+endif::perf-asciidoc-no-roff[]
+
+ifdef::perf-asciidoc-no-roff[]
+ifdef::doctype-manpage[]
+# The following two small workarounds insert a simple paragraph after screen
+[listingblock]
+<example><title>{title}</title>
+<literallayout>
+|
+</literallayout><simpara></simpara>
+{title#}</example>
+
+[verseblock]
+<formalpara{id? id="{id}"}><title>{title}</title><para>
+{title%}<literallayout{id? id="{id}"}>
+{title#}<literallayout>
+|
+</literallayout>
+{title#}</para></formalpara>
+{title%}<simpara></simpara>
+endif::doctype-manpage[]
+endif::perf-asciidoc-no-roff[]
+endif::backend-docbook[]
+
+ifdef::doctype-manpage[]
+ifdef::backend-docbook[]
+[header]
+template::[header-declarations]
+<refentry>
+<refmeta>
+<refentrytitle>{mantitle}</refentrytitle>
+<manvolnum>{manvolnum}</manvolnum>
+<refmiscinfo class="source">perf</refmiscinfo>
+<refmiscinfo class="version">{perf_version}</refmiscinfo>
+<refmiscinfo class="manual">perf Manual</refmiscinfo>
+</refmeta>
+<refnamediv>
+  <refname>{manname}</refname>
+  <refpurpose>{manpurpose}</refpurpose>
+</refnamediv>
+endif::backend-docbook[]
+endif::doctype-manpage[]
+
+ifdef::backend-xhtml11[]
+[linkperf-inlinemacro]
+<a href="{target}.html">{target}{0?({0})}</a>
+endif::backend-xhtml11[]
diff --git a/Documentation/perf_counter/Documentation/manpage-1.72.xsl b/Documentation/perf_counter/Documentation/manpage-1.72.xsl
new file mode 100644
index 0000000000000..b4d315cb8c479
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-1.72.xsl
@@ -0,0 +1,14 @@
+<!-- manpage-1.72.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles peculiarities in docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the special values for the roff control characters
+     needed for docbook-xsl 1.72.0 -->
+<xsl:param name="git.docbook.backslash">&#x2593;</xsl:param>
+<xsl:param name="git.docbook.dot"      >&#x2302;</xsl:param>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-base.xsl b/Documentation/perf_counter/Documentation/manpage-base.xsl
new file mode 100644
index 0000000000000..a264fa616093c
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-base.xsl
@@ -0,0 +1,35 @@
+<!-- manpage-base.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- these params silence some output from xmlto -->
+<xsl:param name="man.output.quietly" select="1"/>
+<xsl:param name="refentry.meta.get.quietly" select="1"/>
+
+<!-- convert asciidoc callouts to man page format;
+     git.docbook.backslash and git.docbook.dot params
+     must be supplied by another XSL file or other means -->
+<xsl:template match="co">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB(',
+			      substring-after(@id,'-'),')',
+			      $git.docbook.backslash,'fR')"/>
+</xsl:template>
+<xsl:template match="calloutlist">
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>sp&#10;</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:text>&#10;</xsl:text>
+</xsl:template>
+<xsl:template match="callout">
+	<xsl:value-of select="concat(
+			      $git.docbook.backslash,'fB',
+			      substring-after(@arearefs,'-'),
+			      '. ',$git.docbook.backslash,'fR')"/>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.dot"/>
+	<xsl:text>br&#10;</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl b/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
new file mode 100644
index 0000000000000..608eb5df62814
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
@@ -0,0 +1,17 @@
+<!-- manpage-bold-literal.xsl:
+     special formatting for manpages rendered from asciidoc+docbook -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- render literal text as bold (instead of plain or monospace);
+     this makes literal text easier to distinguish in manpages
+     viewed on a tty -->
+<xsl:template match="literal">
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fB</xsl:text>
+	<xsl:apply-templates/>
+	<xsl:value-of select="$git.docbook.backslash"/>
+	<xsl:text>fR</xsl:text>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-normal.xsl b/Documentation/perf_counter/Documentation/manpage-normal.xsl
new file mode 100644
index 0000000000000..a48f5b11f3dcc
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-normal.xsl
@@ -0,0 +1,13 @@
+<!-- manpage-normal.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles anything we want to keep away from docbook-xsl 1.72.0 -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<xsl:import href="manpage-base.xsl"/>
+
+<!-- these are the normal values for the roff control characters -->
+<xsl:param name="git.docbook.backslash">\</xsl:param>
+<xsl:param name="git.docbook.dot"	>.</xsl:param>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl b/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
new file mode 100644
index 0000000000000..a63c7632a87d4
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
@@ -0,0 +1,21 @@
+<!-- manpage-suppress-sp.xsl:
+     special settings for manpages rendered from asciidoc+docbook
+     handles erroneous, inline .sp in manpage output of some
+     versions of docbook-xsl -->
+<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
+		version="1.0">
+
+<!-- attempt to work around spurious .sp at the tail of the line
+     that some versions of docbook stylesheets seem to add -->
+<xsl:template match="simpara">
+  <xsl:variable name="content">
+    <xsl:apply-templates/>
+  </xsl:variable>
+  <xsl:value-of select="normalize-space($content)"/>
+  <xsl:if test="not(ancestor::authorblurb) and
+                not(ancestor::personblurb)">
+    <xsl:text>&#10;&#10;</xsl:text>
+  </xsl:if>
+</xsl:template>
+
+</xsl:stylesheet>
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index a93d2ec6176a3..4d3416fc76441 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -1,5 +1,5 @@
 perf-record(1)
-==========
+==============
 
 NAME
 ----
@@ -53,12 +53,6 @@ OPTIONS
 -l::
         scale counter values
 
-Configuration
--------------
-
-EXAMPLES
---------
-
 SEE ALSO
 --------
-linkperf:git-stat[1]
+linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/Documentation/perf_counter/Documentation/perf-report.txt
index 49efe16c95882..52d3fc6846a97 100644
--- a/Documentation/perf_counter/Documentation/perf-report.txt
+++ b/Documentation/perf_counter/Documentation/perf-report.txt
@@ -1,5 +1,5 @@
 perf-report(1)
-==========
+==============
 
 NAME
 ----
@@ -21,12 +21,6 @@ OPTIONS
 --input=::
         Input file name. (default: perf.data)
 
-Configuration
--------------
-
-EXAMPLES
---------
-
 SEE ALSO
 --------
 linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index 828c59ff5f5d1..a67d0e3b7d0d7 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -51,9 +51,6 @@ OPTIONS
 -l::
         scale counter values
 
-Configuration
--------------
-
 EXAMPLES
 --------
 
@@ -74,4 +71,4 @@ $ perf stat sleep 1
 
 SEE ALSO
 --------
-linkperf:git-tops[1]
+linkperf:perf-tops[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
index 057333b725341..15251e40e4f84 100644
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -50,12 +50,6 @@ OPTIONS
 -l::
         scale counter values
 
-Configuration
--------------
-
-EXAMPLES
---------
-
 SEE ALSO
 --------
-linkperf:git-stat[1]
+linkperf:perf-stat[1]
diff --git a/Documentation/perf_counter/Documentation/perf.txt b/Documentation/perf_counter/Documentation/perf.txt
new file mode 100644
index 0000000000000..e3d8b3832c6f4
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf.txt
@@ -0,0 +1,23 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] COMMAND [ARGS]
+
+DESCRIPTION
+-----------
+Performance counters for Linux are are a new kernel-based subsystem
+that provide a framework for all things performance analysis. It
+covers hardware level (CPU/PMU, Performance Monitoring Unit) features
+and software features (software counters, tracepoints) as well.
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 8f725840477fd..416ab11e9786d 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -693,6 +693,21 @@ builtin-revert.o wt-status.o: wt-status.h
 $(LIB_FILE): $(LIB_OBJS)
 	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
 
+doc:
+	$(MAKE) -C Documentation all
+
+man:
+	$(MAKE) -C Documentation man
+
+html:
+	$(MAKE) -C Documentation html
+
+info:
+	$(MAKE) -C Documentation info
+
+pdf:
+	$(MAKE) -C Documentation pdf
+
 TAGS:
 	$(RM) TAGS
 	$(FIND) . -name '*.[hcS]' -print | xargs etags -a
@@ -781,6 +796,31 @@ ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
 
+install-doc:
+	$(MAKE) -C Documentation install
+
+install-man:
+	$(MAKE) -C Documentation install-man
+
+install-html:
+	$(MAKE) -C Documentation install-html
+
+install-info:
+	$(MAKE) -C Documentation install-info
+
+install-pdf:
+	$(MAKE) -C Documentation install-pdf
+
+quick-install-doc:
+	$(MAKE) -C Documentation quick-install
+
+quick-install-man:
+	$(MAKE) -C Documentation quick-install-man
+
+quick-install-html:
+	$(MAKE) -C Documentation quick-install-html
+
+
 ### Maintainer's dist rules
 
 perf.spec: perf.spec.in
@@ -801,6 +841,26 @@ dist: perf.spec perf-archive$(X) configure
 	@$(RM) -r $(PERF_TARNAME)
 	gzip -f -9 $(PERF_TARNAME).tar
 
+htmldocs = perf-htmldocs-$(PERF_VERSION)
+manpages = perf-manpages-$(PERF_VERSION)
+dist-doc:
+	$(RM) -r .doc-tmp-dir
+	mkdir .doc-tmp-dir
+	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+	gzip -n -9 -f $(htmldocs).tar
+	:
+	$(RM) -r .doc-tmp-dir
+	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+	$(MAKE) -C Documentation DESTDIR=./ \
+		man1dir=../.doc-tmp-dir/man1 \
+		man5dir=../.doc-tmp-dir/man5 \
+		man7dir=../.doc-tmp-dir/man7 \
+		install
+	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+	gzip -n -9 -f $(manpages).tar
+	$(RM) -r .doc-tmp-dir
+
 rpm: dist
 	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
@@ -819,6 +879,7 @@ clean:
 	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir
 	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
 	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz
+	$(MAKE) -C Documentation/ clean
 	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
 
 .PHONY: all install clean strip
diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index d2bd3177b98c7..a3894bfb9b6ec 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -317,7 +317,7 @@ static const char *cmd_to_page(const char *perf_cmd)
 	else if (is_perf_command(perf_cmd))
 		return prepend("perf-", perf_cmd);
 	else
-		return prepend("perf", perf_cmd);
+		return prepend("perf-", perf_cmd);
 }
 
 static void setup_man_path(void)
-- 
GitLab


From 27b9613b7be39412775d0ab80f57229aa73bb07d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 31 May 2009 22:09:49 +0200
Subject: [PATCH 1446/2198] perf_counter tools: Fix unknown command help text

Arjan reported this error when entering an unknown command to perf:

  $ perf start
  fatal: Uh oh. Your system reports no Git commands at all.

The Git code expects there to be perf-* commands - but since Perf
is a 'pure' utility with no dash commands anymore, this old assumption
of Git does not hold anymore. Remove that error check.

Reported-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/help.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
index edde541d238d7..397487fb2bee6 100644
--- a/Documentation/perf_counter/util/help.c
+++ b/Documentation/perf_counter/util/help.c
@@ -323,9 +323,6 @@ const char *help_unknown_cmd(const char *cmd)
 	qsort(main_cmds.names, main_cmds.cnt,
 	      sizeof(*main_cmds.names), levenshtein_compare);
 
-	if (!main_cmds.cnt)
-		die ("Uh oh. Your system reports no Git commands at all.");
-
 	best_similarity = main_cmds.names[0]->len;
 	n = 1;
 	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-- 
GitLab


From eb9b9b56eed280e65a9e194aaeb50a5a75111859 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 11:51:51 +0000
Subject: [PATCH 1447/2198] sh: boot word / mode pin support V2

Add mode pin support for the SuperH architecture V2.

With this patch applied the board code can add their
own function to export the cpu mode pin configuration.
In most cases this will be a constant bitmap, but
boards that allow reading this from a register can
instead read out the pin state from hardware.

The code warns if a pin is tested but no board specific
mode pin function has been provided.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/machvec.h   |  1 +
 arch/sh/include/asm/processor.h |  4 ++++
 arch/sh/kernel/machvec.c        |  1 +
 arch/sh/kernel/setup.c          | 12 ++++++++++++
 4 files changed, 18 insertions(+)

diff --git a/arch/sh/include/asm/machvec.h b/arch/sh/include/asm/machvec.h
index 73d6d16fa06b3..84dd37761f563 100644
--- a/arch/sh/include/asm/machvec.h
+++ b/arch/sh/include/asm/machvec.h
@@ -48,6 +48,7 @@ struct sh_machine_vector {
 	void (*mv_ioport_unmap)(void __iomem *);
 
 	int (*mv_clk_init)(void);
+	int (*mv_mode_pins)(void);
 };
 
 extern struct sh_machine_vector sh_mv;
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index 005c962c8b1c5..fb67482e47eb4 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -94,6 +94,10 @@ extern struct pt_regs fake_swapper_regs;
 const char *get_cpu_subtype(struct sh_cpuinfo *c);
 extern const struct seq_operations cpuinfo_op;
 
+/* processor boot mode configuration */
+int generic_mode_pins(void);
+int test_mode_pin(int pin);
+
 #ifdef CONFIG_VSYSCALL
 int vsyscall_init(void);
 #else
diff --git a/arch/sh/kernel/machvec.c b/arch/sh/kernel/machvec.c
index c1ea41e5812a5..548f6607fd0fb 100644
--- a/arch/sh/kernel/machvec.c
+++ b/arch/sh/kernel/machvec.c
@@ -129,6 +129,7 @@ void __init sh_mv_setup(void)
 	mv_set(ioport_map);
 	mv_set(ioport_unmap);
 	mv_set(irq_demux);
+	mv_set(mode_pins);
 
 	if (!sh_mv.mv_nr_irqs)
 		sh_mv.mv_nr_irqs = NR_IRQS;
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 00086b234278c..050131eec7738 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -420,6 +420,18 @@ void __init setup_arch(char **cmdline_p)
 #endif
 }
 
+/* processor boot mode configuration */
+int generic_mode_pins(void)
+{
+	pr_warning("generic_mode_pins(): missing mode pin configuration\n");
+	return 0;
+}
+
+int test_mode_pin(int pin)
+{
+	return sh_mv.mv_mode_pins() & (1 << pin);
+}
+
 static const char *cpu_name[] = {
 	[CPU_SH7201]	= "SH7201",
 	[CPU_SH7203]	= "SH7203",	[CPU_SH7263]	= "SH7263",
-- 
GitLab


From 4a44b32969bfc29140b9f21a1ec924c796d6ac43 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Mon, 1 Jun 2009 15:56:00 +0900
Subject: [PATCH 1448/2198] sh: sh7785 mode pin definitions

This patch adds sh7785 mode pin definitions. Mode pins and
pin function controller comments are added as well.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7785.h | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7785.h b/arch/sh/include/cpu-sh4/cpu/sh7785.h
index e4006afb735ee..89afaa6dc2d8f 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7785.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7785.h
@@ -1,6 +1,30 @@
 #ifndef __ASM_SH7785_H__
 #define __ASM_SH7785_H__
 
+/* Boot Mode Pins, more information in sh7785 manual Rev.1.00, page 1628 */
+enum {
+	MODE_PIN_MODE0,  /* CPG - Initial Pck/Bck Frequency [FRQMR1] */
+	MODE_PIN_MODE1,  /* CPG - Initial Uck/SHck/DDRck Frequency [FRQMR1] */
+	MODE_PIN_MODE2,  /* CPG - Reserved (L: Normal operation) */
+	MODE_PIN_MODE3,  /* CPG - Reserved (L: Normal operation) */
+	MODE_PIN_MODE4,  /* CPG - Initial PLL setting (72x/36x) */
+	MODE_PIN_MODE5,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.8] */
+	MODE_PIN_MODE6,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.9] */
+	MODE_PIN_MODE7,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.3] */
+	MODE_PIN_MODE8,  /* LBSC - Endian Mode (L: Big, H: Little) [BCR.31] */
+	MODE_PIN_MODE9,  /* LBSC - Master/Slave Mode (L: Slave) [BCR.30] */
+	MODE_PIN_MODE10, /* CPG - Clock Input (L: Ext Clk, H: Crystal) */
+	MODE_PIN_MODE11, /* PCI - Pin Mode (LL: PCI host, LH: PCI slave) */
+	MODE_PIN_MODE12, /* PCI - Pin Mode (HL: Local bus, HH: DU) */
+	MODE_PIN_MODE13, /* Boot Address Mode (L: 29-bit, H: 32-bit) */
+	MODE_PIN_MODE14, /* Reserved (H: Normal operation) */
+	MODE_PIN_MPMD,   /* Emulation Mode (L: Emulation mode, H: LSI mode) */
+};
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PA */
 	GPIO_PA7, GPIO_PA6, GPIO_PA5, GPIO_PA4,
-- 
GitLab


From 63d12e23235d982d8f55696e09b2ff91e3ba0042 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 12:00:25 +0000
Subject: [PATCH 1449/2198] sh: sh7785lcr mode pin configuration

This patch adds mode pin support to the sh7785lcr board.

The harware allows the user to control the mode pins using
dip switches S1 and S2, but from the software the pins are
fixed to the factory default since we have no way to reading
out this configuration from software.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/board-sh7785lcr.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c
index 33b194b0454d5..c2894c5b71ea0 100644
--- a/arch/sh/boards/board-sh7785lcr.c
+++ b/arch/sh/boards/board-sh7785lcr.c
@@ -24,6 +24,7 @@
 #include <mach/sh7785lcr.h>
 #include <asm/heartbeat.h>
 #include <asm/clock.h>
+#include <cpu/sh7785.h>
 
 /*
  * NOTE: This board has 2 physical memory maps.
@@ -320,6 +321,26 @@ static void __init sh7785lcr_setup(char **cmdline_p)
 	writel(0x000307c2, sm501_reg);
 }
 
+/* Return the board specific boot mode pin configuration */
+static int sh7785lcr_mode_pins(void)
+{
+	int value = 0;
+
+	/* These are the factory default settings of S1 and S2.
+	 * If you change these dip switches then you will need to
+	 * adjust the values below as well.
+	 */
+	value |= 1 << MODE_PIN_MODE4; /* Clock Mode 16 */
+	value |= 1 << MODE_PIN_MODE5; /* 32-bit Area0 bus width */
+	value |= 1 << MODE_PIN_MODE6; /* 32-bit Area0 bus width */
+	value |= 1 << MODE_PIN_MODE7; /* Area 0 SRAM interface [fixed] */
+	value |= 1 << MODE_PIN_MODE8; /* Little Endian */
+	value |= 1 << MODE_PIN_MODE9; /* Master Mode */
+	value |= 1 << MODE_PIN_MODE14; /* No PLL step-up */
+
+	return value;
+}
+
 /*
  * The Machine Vector
  */
@@ -328,5 +349,6 @@ static struct sh_machine_vector mv_sh7785lcr __initmv = {
 	.mv_setup		= sh7785lcr_setup,
 	.mv_clk_init		= sh7785lcr_clk_init,
 	.mv_init_irq		= init_sh7785lcr_IRQ,
+	.mv_mode_pins		= sh7785lcr_mode_pins,
 };
 
-- 
GitLab


From 1823f6d5e6b81cca6542ed2e5f30d2556aad0f67 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 12:06:17 +0000
Subject: [PATCH 1450/2198] sh: sh7785 pll configuration from mode pin

This patch modifies the sh7785 clock code to use the MODE4
value to switch between 72x and 36x PLL multiplication.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index a4a9bcbec6647..705b023f82201 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -16,6 +16,7 @@
 #include <linux/cpufreq.h>
 #include <asm/clock.h>
 #include <asm/freq.h>
+#include <cpu/sh7785.h>
 
 static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
 			       24, 32, 36, 48 };
@@ -80,12 +81,11 @@ static struct clk_ops frqmr_clk_ops = {
 
 static unsigned long pll_recalc(struct clk *clk)
 {
-	/*
-	 * XXX: PLL1 multiplier is locked for the default clock mode,
-	 * when mode pin detection and configuration support is added,
-	 * select the multiplier dynamically.
-	 */
-	return clk->parent->rate * 36;
+	int multiplier;
+
+	multiplier = test_mode_pin(MODE_PIN_MODE4) ? 36 : 72;
+
+	return clk->parent->rate * multiplier;
 }
 
 static struct clk_ops pll_clk_ops = {
-- 
GitLab


From 98fbe45bea77c1804eae0e71f27673db1824a2a8 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Fri, 29 May 2009 07:41:23 +0000
Subject: [PATCH 1451/2198] sh: SH7724 has an L2 cache.

Add the CPU_HAS_L2_CACHE flag to SH7724.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/probe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 973ff831c8a8c..7d821ce8434f6 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -160,7 +160,7 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.type = CPU_SH7724;
 		boot_cpu_data.icache.ways = 4;
 		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU;
+		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU | CPU_HAS_L2_CACHE;
 		break;
 	case 0x4000:	/* 1st cut */
 	case 0x4001:	/* 2nd cut */
-- 
GitLab


From 25346b93ca079080c9cb23331db5c4f6404e8530 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:48:12 +1000
Subject: [PATCH 1452/2198] perf_counter: Provide functions for locking and
 pinning the context for a task

This abstracts out the code for locking the context associated
with a task.  Because the context might get transferred from
one task to another concurrently, we have to check after
locking the context that it is still the right context for the
task and retry if not.  This was open-coded in
find_get_context() and perf_counter_init_task().

This adds a further function for pinning the context for a
task, i.e. marking it so it can't be transferred to another
task.  This adds a 'pin_count' field to struct
perf_counter_context to indicate that a context is pinned,
instead of the previous method of setting the parent_gen count
to all 1s.  Pinning the context with a pin_count is easier to
undo and doesn't require saving the parent_gen value.  This
also adds a perf_unpin_context() to undo the effect of
perf_pin_task_context() and changes perf_counter_init_task to
use it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.34748.755674.596386@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |   1 +
 kernel/perf_counter.c        | 128 ++++++++++++++++++++---------------
 2 files changed, 75 insertions(+), 54 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 519a41bba2495..81ec79c9f1934 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -543,6 +543,7 @@ struct perf_counter_context {
 	struct perf_counter_context *parent_ctx;
 	u64			parent_gen;
 	u64			generation;
+	int			pin_count;
 	struct rcu_head		rcu_head;
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 79c3f26541d3e..da8dfef4b472c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -122,6 +122,69 @@ static void put_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+/*
+ * Get the perf_counter_context for a task and lock it.
+ * This has to cope with with the fact that until it is locked,
+ * the context could get moved to another task.
+ */
+static struct perf_counter_context *perf_lock_task_context(
+				struct task_struct *task, unsigned long *flags)
+{
+	struct perf_counter_context *ctx;
+
+	rcu_read_lock();
+ retry:
+	ctx = rcu_dereference(task->perf_counter_ctxp);
+	if (ctx) {
+		/*
+		 * If this context is a clone of another, it might
+		 * get swapped for another underneath us by
+		 * perf_counter_task_sched_out, though the
+		 * rcu_read_lock() protects us from any context
+		 * getting freed.  Lock the context and check if it
+		 * got swapped before we could get the lock, and retry
+		 * if so.  If we locked the right context, then it
+		 * can't get swapped on us any more.
+		 */
+		spin_lock_irqsave(&ctx->lock, *flags);
+		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
+			spin_unlock_irqrestore(&ctx->lock, *flags);
+			goto retry;
+		}
+	}
+	rcu_read_unlock();
+	return ctx;
+}
+
+/*
+ * Get the context for a task and increment its pin_count so it
+ * can't get swapped to another task.  This also increments its
+ * reference count so that the context can't get freed.
+ */
+static struct perf_counter_context *perf_pin_task_context(struct task_struct *task)
+{
+	struct perf_counter_context *ctx;
+	unsigned long flags;
+
+	ctx = perf_lock_task_context(task, &flags);
+	if (ctx) {
+		++ctx->pin_count;
+		get_ctx(ctx);
+		spin_unlock_irqrestore(&ctx->lock, flags);
+	}
+	return ctx;
+}
+
+static void perf_unpin_context(struct perf_counter_context *ctx)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->lock, flags);
+	--ctx->pin_count;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	put_ctx(ctx);
+}
+
 /*
  * Add a counter from the lists for its context.
  * Must be called with ctx->mutex and ctx->lock held.
@@ -916,7 +979,7 @@ static int context_equiv(struct perf_counter_context *ctx1,
 {
 	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
 		&& ctx1->parent_gen == ctx2->parent_gen
-		&& ctx1->parent_gen != ~0ull;
+		&& !ctx1->pin_count && !ctx2->pin_count;
 }
 
 /*
@@ -1271,6 +1334,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	struct perf_counter_context *ctx;
 	struct perf_counter_context *parent_ctx;
 	struct task_struct *task;
+	unsigned long flags;
 	int err;
 
 	/*
@@ -1323,28 +1387,9 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	if (!ptrace_may_access(task, PTRACE_MODE_READ))
 		goto errout;
 
- retry_lock:
-	rcu_read_lock();
  retry:
-	ctx = rcu_dereference(task->perf_counter_ctxp);
+	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
-		/*
-		 * If this context is a clone of another, it might
-		 * get swapped for another underneath us by
-		 * perf_counter_task_sched_out, though the
-		 * rcu_read_lock() protects us from any context
-		 * getting freed.  Lock the context and check if it
-		 * got swapped before we could get the lock, and retry
-		 * if so.  If we locked the right context, then it
-		 * can't get swapped on us any more and we can
-		 * unclone it if necessary.
-		 * Once it's not a clone things will be stable.
-		 */
-		spin_lock_irq(&ctx->lock);
-		if (ctx != rcu_dereference(task->perf_counter_ctxp)) {
-			spin_unlock_irq(&ctx->lock);
-			goto retry;
-		}
 		parent_ctx = ctx->parent_ctx;
 		if (parent_ctx) {
 			put_ctx(parent_ctx);
@@ -1355,9 +1400,8 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 		 * this context won't get freed if the task exits.
 		 */
 		get_ctx(ctx);
-		spin_unlock_irq(&ctx->lock);
+		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
-	rcu_read_unlock();
 
 	if (!ctx) {
 		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL);
@@ -1372,7 +1416,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 			 * the context they set.
 			 */
 			kfree(ctx);
-			goto retry_lock;
+			goto retry;
 		}
 		get_task_struct(task);
 	}
@@ -3667,7 +3711,6 @@ int perf_counter_init_task(struct task_struct *child)
 	struct perf_counter *counter;
 	struct task_struct *parent = current;
 	int inherited_all = 1;
-	u64 cloned_gen;
 	int ret = 0;
 
 	child->perf_counter_ctxp = NULL;
@@ -3693,32 +3736,17 @@ int perf_counter_init_task(struct task_struct *child)
 	get_task_struct(child);
 
 	/*
-	 * If the parent's context is a clone, temporarily set its
-	 * parent_gen to an impossible value (all 1s) so it won't get
-	 * swapped under us.  The rcu_read_lock makes sure that
-	 * parent_ctx continues to exist even if it gets swapped to
-	 * another process and then freed while we are trying to get
-	 * its lock.
+	 * If the parent's context is a clone, pin it so it won't get
+	 * swapped under us.
 	 */
-	rcu_read_lock();
- retry:
-	parent_ctx = rcu_dereference(parent->perf_counter_ctxp);
+	parent_ctx = perf_pin_task_context(parent);
+
 	/*
 	 * No need to check if parent_ctx != NULL here; since we saw
 	 * it non-NULL earlier, the only reason for it to become NULL
 	 * is if we exit, and since we're currently in the middle of
 	 * a fork we can't be exiting at the same time.
 	 */
-	spin_lock_irq(&parent_ctx->lock);
-	if (parent_ctx != rcu_dereference(parent->perf_counter_ctxp)) {
-		spin_unlock_irq(&parent_ctx->lock);
-		goto retry;
-	}
-	cloned_gen = parent_ctx->parent_gen;
-	if (parent_ctx->parent_ctx)
-		parent_ctx->parent_gen = ~0ull;
-	spin_unlock_irq(&parent_ctx->lock);
-	rcu_read_unlock();
 
 	/*
 	 * Lock the parent list. No need to lock the child - not PID
@@ -3759,7 +3787,7 @@ int perf_counter_init_task(struct task_struct *child)
 		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx);
 		if (cloned_ctx) {
 			child_ctx->parent_ctx = cloned_ctx;
-			child_ctx->parent_gen = cloned_gen;
+			child_ctx->parent_gen = parent_ctx->parent_gen;
 		} else {
 			child_ctx->parent_ctx = parent_ctx;
 			child_ctx->parent_gen = parent_ctx->generation;
@@ -3769,15 +3797,7 @@ int perf_counter_init_task(struct task_struct *child)
 
 	mutex_unlock(&parent_ctx->mutex);
 
-	/*
-	 * Restore the clone status of the parent.
-	 */
-	if (parent_ctx->parent_ctx) {
-		spin_lock_irq(&parent_ctx->lock);
-		if (parent_ctx->parent_ctx)
-			parent_ctx->parent_gen = cloned_gen;
-		spin_unlock_irq(&parent_ctx->lock);
-	}
+	perf_unpin_context(parent_ctx);
 
 	return ret;
 }
-- 
GitLab


From 880ca15adf2392770a68047e7a98e076ff4d21da Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:49:14 +1000
Subject: [PATCH 1453/2198] perf_counter: Allow software counters to count
 while task is not running

This changes perf_swcounter_match() so that per-task software
counters can count events that occur while their associated
task is not running.  This will allow us to use the generic
software counter code for counting task migrations, which can
occur while the task is not scheduled in.

To do this, we have to distinguish between the situations where
the counter is inactive because its task has been scheduled
out, and those where the counter is inactive because it is part
of a group that was not able to go on the PMU.  In the former
case we want the counter to count, but not in the latter case.
If the context is active, we have the latter case.  If the
context is inactive then we need to know whether the counter
was counting when the context was last active, which we can
determine by comparing its ->tstamp_stopped timestamp with the
context's timestamp.

This also folds three checks in perf_swcounter_match, checking
perf_event_raw(), perf_event_type() and perf_event_id()
individually, into a single 64-bit comparison on
counter->hw_event.config, as an optimization.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.34810.259718.955621@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 48 +++++++++++++++++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index da8dfef4b472c..ff8b4636f8451 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2867,20 +2867,56 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 
 }
 
+static int perf_swcounter_is_counting(struct perf_counter *counter)
+{
+	struct perf_counter_context *ctx;
+	unsigned long flags;
+	int count;
+
+	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
+		return 1;
+
+	if (counter->state != PERF_COUNTER_STATE_INACTIVE)
+		return 0;
+
+	/*
+	 * If the counter is inactive, it could be just because
+	 * its task is scheduled out, or because it's in a group
+	 * which could not go on the PMU.  We want to count in
+	 * the first case but not the second.  If the context is
+	 * currently active then an inactive software counter must
+	 * be the second case.  If it's not currently active then
+	 * we need to know whether the counter was active when the
+	 * context was last active, which we can determine by
+	 * comparing counter->tstamp_stopped with ctx->time.
+	 *
+	 * We are within an RCU read-side critical section,
+	 * which protects the existence of *ctx.
+	 */
+	ctx = counter->ctx;
+	spin_lock_irqsave(&ctx->lock, flags);
+	count = 1;
+	/* Re-check state now we have the lock */
+	if (counter->state < PERF_COUNTER_STATE_INACTIVE ||
+	    counter->ctx->is_active ||
+	    counter->tstamp_stopped < ctx->time)
+		count = 0;
+	spin_unlock_irqrestore(&ctx->lock, flags);
+	return count;
+}
+
 static int perf_swcounter_match(struct perf_counter *counter,
 				enum perf_event_types type,
 				u32 event, struct pt_regs *regs)
 {
-	if (counter->state != PERF_COUNTER_STATE_ACTIVE)
-		return 0;
+	u64 event_config;
 
-	if (perf_event_raw(&counter->hw_event))
-		return 0;
+	event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;
 
-	if (perf_event_type(&counter->hw_event) != type)
+	if (!perf_swcounter_is_counting(counter))
 		return 0;
 
-	if (perf_event_id(&counter->hw_event) != event)
+	if (counter->hw_event.config != event_config)
 		return 0;
 
 	if (counter->hw_event.exclude_user && user_mode(regs))
-- 
GitLab


From 6881e8bf3d86b23dd124134fae113ebd05fae08a Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 12:52:29 +0000
Subject: [PATCH 1454/2198] sh: shared mstp32 clock code

Add shared 32-bit module stop bit clock support.

Processor specific code can use SH_CLK_MSTP32()
to initialize module stop bit clocks, and then
use sh_clk_mstp32() for registration.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h    | 13 +++++++++++++
 arch/sh/kernel/cpu/clock-cpg.c | 35 ++++++++++++++++++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index aa9480d4aa059..f43d3e72d2669 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -117,4 +117,17 @@ long clk_rate_table_round(struct clk *clk,
 			  struct cpufreq_frequency_table *freq_table,
 			  unsigned long rate);
 
+#define SH_CLK_MSTP32(_name, _id, _parent, _enable_reg,	\
+	    _enable_bit, _flags)			\
+{							\
+	.name		= _name,			\
+	.id		= _id,				\
+	.parent		= _parent,			\
+	.enable_reg	= (void __iomem *)_enable_reg,	\
+	.enable_bit	= _enable_bit,			\
+	.flags		= _flags,			\
+}
+
+int sh_clk_mstp32_register(struct clk *clks, int nr);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index b78c237ab3666..72228d2945ac2 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -1,7 +1,42 @@
 #include <linux/clk.h>
 #include <linux/compiler.h>
+#include <linux/io.h>
 #include <asm/clock.h>
 
+static int sh_clk_mstp32_enable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit),
+		     clk->enable_reg);
+	return 0;
+}
+
+static void sh_clk_mstp32_disable(struct clk *clk)
+{
+	__raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit),
+		     clk->enable_reg);
+}
+
+static struct clk_ops sh_clk_mstp32_clk_ops = {
+	.enable		= sh_clk_mstp32_enable,
+	.disable	= sh_clk_mstp32_disable,
+	.recalc		= followparent_recalc,
+};
+
+int __init sh_clk_mstp32_register(struct clk *clks, int nr)
+{
+	struct clk *clkp;
+	int ret = 0;
+	int k;
+
+	for (k = 0; !ret && (k < nr); k++) {
+		clkp = clks + k;
+		clkp->ops = &sh_clk_mstp32_clk_ops;
+		ret |= clk_register(clkp);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_SH_CLK_CPG_LEGACY
 static struct clk master_clk = {
 	.name		= "master_clk",
-- 
GitLab


From e89d53e60593ee7066e1d36ab5c1ccf2648f5f53 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 13:01:53 +0000
Subject: [PATCH 1455/2198] sh: hook up shared mstp32 clock code to sh7785

Hook up the shared 32-bit module stop bit code to sh7785.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 80 ++++++++------------------
 1 file changed, 25 insertions(+), 55 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 705b023f82201..7d557068f4a3b 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -193,65 +193,34 @@ static struct clk *clks[] = {
 	&umem_clk,
 };
 
-static int mstpcr_clk_enable(struct clk *clk)
-{
-	__raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit),
-		     clk->enable_reg);
-	return 0;
-}
-
-static void mstpcr_clk_disable(struct clk *clk)
-{
-	__raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit),
-		     clk->enable_reg);
-}
-
-static struct clk_ops mstpcr_clk_ops = {
-	.enable		= mstpcr_clk_enable,
-	.disable	= mstpcr_clk_disable,
-	.recalc		= followparent_recalc,
-};
-
 #define MSTPCR0		0xffc80030
 #define MSTPCR1		0xffc80034
 
-#define CLK(_name, _id, _parent, _enable_reg,		\
-	    _enable_bit, _flags)			\
-{							\
-	.name		= _name,			\
-	.id		= _id,				\
-	.parent		= _parent,			\
-	.enable_reg	= (void __iomem *)_enable_reg,	\
-	.enable_bit	= _enable_bit,			\
-	.flags		= _flags,			\
-	.ops		= &mstpcr_clk_ops,		\
-}
-
-static struct clk mstpcr_clks[] = {
+static struct clk mstp_clks[] = {
 	/* MSTPCR0 */
-	CLK("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
-	CLK("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
-	CLK("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
-	CLK("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
-	CLK("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
-	CLK("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
-	CLK("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
-	CLK("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
-	CLK("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
-	CLK("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
-	CLK("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
-	CLK("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
-	CLK("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
-	CLK("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
-	CLK("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
-	CLK("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
+	SH_CLK_MSTP32("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
+	SH_CLK_MSTP32("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
+	SH_CLK_MSTP32("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
+	SH_CLK_MSTP32("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
+	SH_CLK_MSTP32("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
+	SH_CLK_MSTP32("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
+	SH_CLK_MSTP32("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
+	SH_CLK_MSTP32("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
+	SH_CLK_MSTP32("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
+	SH_CLK_MSTP32("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
+	SH_CLK_MSTP32("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
+	SH_CLK_MSTP32("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
+	SH_CLK_MSTP32("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
+	SH_CLK_MSTP32("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
+	SH_CLK_MSTP32("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
+	SH_CLK_MSTP32("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
 
 	/* MSTPCR1 */
-	CLK("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
-	CLK("ubc_fck", -1, NULL, MSTPCR1, 17, 0),
-	CLK("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0),
-	CLK("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0),
-	CLK("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
+	SH_CLK_MSTP32("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
+	SH_CLK_MSTP32("ubc_fck", -1, NULL, MSTPCR1, 17, 0),
+	SH_CLK_MSTP32("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0),
+	SH_CLK_MSTP32("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0),
+	SH_CLK_MSTP32("gdta_fck", -1, NULL, MSTPCR1, 0, 0),
 };
 
 int __init arch_clk_init(void)
@@ -260,8 +229,9 @@ int __init arch_clk_init(void)
 
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
-	for (i = 0; i < ARRAY_SIZE(mstpcr_clks); i++)
-		ret |= clk_register(&mstpcr_clks[i]);
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
 
 	return ret;
 }
-- 
GitLab


From a1153e27eec25e9963f5843ba8932952bd9847ac Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 13:11:31 +0000
Subject: [PATCH 1456/2198] sh: shared div4 clock code

Add shared code for 4-bit divisor clocks.

Processor specific code can use SH_CLK_DIV4()
to initialize div4 clocks, and then use
sh_clk_div4_register() for registration.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h    | 15 ++++++++++
 arch/sh/kernel/cpu/clock-cpg.c | 55 ++++++++++++++++++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index f43d3e72d2669..7435e40022e67 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -3,6 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/seq_file.h>
+#include <linux/cpufreq.h>
 #include <linux/clk.h>
 #include <linux/err.h>
 
@@ -41,6 +42,7 @@ struct clk {
 	unsigned long		arch_flags;
 	void			*priv;
 	struct dentry		*dentry;
+	struct cpufreq_frequency_table *freq_table;
 };
 
 struct clk_lookup {
@@ -130,4 +132,17 @@ long clk_rate_table_round(struct clk *clk,
 
 int sh_clk_mstp32_register(struct clk *clks, int nr);
 
+#define SH_CLK_DIV4(_name, _parent, _reg, _shift, _div_bitmap, _flags)	\
+{									\
+	.name = _name,							\
+	.parent = _parent,						\
+	.enable_reg = (void __iomem *)_reg,				\
+	.enable_bit = _shift,						\
+	.arch_flags = _div_bitmap,					\
+	.flags = _flags,						\
+}
+
+int sh_clk_div4_register(struct clk *clks, int nr,
+			 struct clk_div_mult_table *table);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index 72228d2945ac2..88fc30d2f5fd9 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -1,5 +1,6 @@
 #include <linux/clk.h>
 #include <linux/compiler.h>
+#include <linux/bootmem.h>
 #include <linux/io.h>
 #include <asm/clock.h>
 
@@ -37,6 +38,60 @@ int __init sh_clk_mstp32_register(struct clk *clks, int nr)
 	return ret;
 }
 
+static unsigned long sh_clk_div4_recalc(struct clk *clk)
+{
+	struct clk_div_mult_table *table = clk->priv;
+	unsigned int idx;
+
+	clk_rate_table_build(clk, clk->freq_table, table->nr_divisors,
+			     table, &clk->arch_flags);
+
+	idx = (__raw_readl(clk->enable_reg) >> clk->enable_bit) & 0x000f;
+
+	return clk->freq_table[idx].frequency;
+}
+
+static long sh_clk_div4_round_rate(struct clk *clk, unsigned long rate)
+{
+	return clk_rate_table_round(clk, clk->freq_table, rate);
+}
+
+static struct clk_ops sh_clk_div4_clk_ops = {
+	.recalc		= sh_clk_div4_recalc,
+	.round_rate	= sh_clk_div4_round_rate,
+};
+
+int __init sh_clk_div4_register(struct clk *clks, int nr,
+				struct clk_div_mult_table *table)
+{
+	struct clk *clkp;
+	void *freq_table;
+	int nr_divs = table->nr_divisors;
+	int freq_table_size = sizeof(struct cpufreq_frequency_table);
+	int ret = 0;
+	int k;
+
+	k = nr_divs + 1;
+	freq_table = alloc_bootmem(freq_table_size * nr * (nr_divs + 1));
+	if (!freq_table)
+		return -ENOMEM;
+
+	for (k = 0; !ret && (k < nr); k++) {
+		clkp = clks + k;
+
+		clkp->ops = &sh_clk_div4_clk_ops;
+		clkp->id = -1;
+		clkp->priv = table;
+
+		clkp->freq_table = freq_table + (k * freq_table_size);
+		clkp->freq_table[nr_divs].frequency = CPUFREQ_TABLE_END;
+
+		ret = clk_register(clkp);
+	}
+
+	return ret;
+}
+
 #ifdef CONFIG_SH_CLK_CPG_LEGACY
 static struct clk master_clk = {
 	.name		= "master_clk",
-- 
GitLab


From 43909a938063f9b6f98c05a2e28b072dd972ece7 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 28 May 2009 13:13:56 +0000
Subject: [PATCH 1457/2198] sh: hook up shared div4 clock code to sh7785

Hook up the shared 4-bit divisor clock code to sh7785.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 207 ++++++-------------------
 1 file changed, 48 insertions(+), 159 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index 7d557068f4a3b..dae20aca536d0 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -18,65 +18,14 @@
 #include <asm/freq.h>
 #include <cpu/sh7785.h>
 
-static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
-			       24, 32, 36, 48 };
-
-static struct clk_div_mult_table cpg_div = {
-	.divisors = div2,
-	.nr_divisors = ARRAY_SIZE(div2),
-};
-
-struct clk_priv {
-	unsigned int			shift;
-
-	/* allowable divisor bitmap */
-	unsigned long			div_bitmap;
-
-	/* Supportable frequencies + termination entry */
-	struct cpufreq_frequency_table	freq_table[ARRAY_SIZE(div2)+1];
-};
-
-#define FRQMR_CLK_DATA(_name, _shift, _div_bitmap)	\
-static struct clk_priv _name##_data = {			\
-	.shift		= _shift,			\
-	.div_bitmap	= _div_bitmap,			\
-							\
-	.freq_table[0]	= {				\
-		.index = 0,				\
-		.frequency = CPUFREQ_TABLE_END,		\
-	},						\
-}
-
-FRQMR_CLK_DATA(pfc,  0, 0x0f80);
-FRQMR_CLK_DATA(s3fc, 4, 0x0ff0);
-FRQMR_CLK_DATA(s2fc, 8, 0x0030);
-FRQMR_CLK_DATA(mfc, 12, 0x000c);
-FRQMR_CLK_DATA(bfc, 16, 0x0fe0);
-FRQMR_CLK_DATA(sfc, 20, 0x000c);
-FRQMR_CLK_DATA(ufc, 24, 0x000c);
-FRQMR_CLK_DATA(ifc, 28, 0x000e);
-
-static unsigned long frqmr_recalc(struct clk *clk)
-{
-	struct clk_priv *data = clk->priv;
-	unsigned int idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f;
-
-	clk_rate_table_build(clk, data->freq_table, ARRAY_SIZE(div2),
-			     &cpg_div, &data->div_bitmap);
-	
-	return data->freq_table[idx].frequency;
-}
-
-static long frqmr_round_rate(struct clk *clk, unsigned long rate)
-{
-	struct clk_priv *data = clk->priv;
-
-	return clk_rate_table_round(clk, data->freq_table, rate);
-}
-
-static struct clk_ops frqmr_clk_ops = {
-	.recalc			= frqmr_recalc,
-	.round_rate		= frqmr_round_rate,
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+static struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
 };
 
 static unsigned long pll_recalc(struct clk *clk)
@@ -92,16 +41,6 @@ static struct clk_ops pll_clk_ops = {
 	.recalc		= pll_recalc,
 };
 
-/*
- * Default rate for the root input clock, reset this with clk_set_rate()
- * from the platform code.
- */
-static struct clk extal_clk = {
-	.name		= "extal",
-	.id		= -1,
-	.rate		= 33333333,
-};
-
 static struct clk pll_clk = {
 	.name		= "pll_clk",
 	.id		= -1,
@@ -110,87 +49,34 @@ static struct clk pll_clk = {
 	.flags		= CLK_ENABLE_ON_INIT,
 };
 
-static struct clk cpu_clk = {
-	.name		= "cpu_clk",		/* Ick */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &ifc_data,
-};
-
-static struct clk shyway_clk = {
-	.name		= "shyway_clk",		/* SHck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &sfc_data,
-};
-
-static struct clk peripheral_clk = {
-	.name		= "peripheral_clk",	/* Pck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &pfc_data,
-};
-
-static struct clk ddr_clk = {
-	.name		= "ddr_clk",		/* DDRck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &mfc_data,
+static struct clk *clks[] = {
+	&extal_clk,
+	&pll_clk,
 };
 
-static struct clk bus_clk = {
-	.name		= "bus_clk",		/* Bck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &bfc_data,
-};
+static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18,
+			       24, 32, 36, 48 };
 
-static struct clk ga_clk = {
-	.name		= "ga_clk",		/* GAck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.priv		= &s2fc_data,
+static struct clk_div_mult_table div4_table = {
+	.divisors = div2,
+	.nr_divisors = ARRAY_SIZE(div2),
 };
 
-static struct clk du_clk = {
-	.name		= "du_clk",		/* DUck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.priv		= &s3fc_data,
-};
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_DDR, DIV4_GA,
+	DIV4_DU, DIV4_P, DIV4_NR };
 
-static struct clk umem_clk = {
-	.name		= "umem_clk",		/* uck */
-	.id		= -1,
-	.ops		= &frqmr_clk_ops,
-	.parent		= &pll_clk,
-	.flags		= CLK_ENABLE_ON_INIT,
-	.priv		= &ufc_data,
-};
+#define DIV4(_str, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, FRQMR1, _bit, _mask, _flags)
 
-static struct clk *clks[] = {
-	&extal_clk,
-	&pll_clk,
-	&cpu_clk,
-	&shyway_clk,
-	&peripheral_clk,
-	&ddr_clk,
-	&bus_clk,
-	&ga_clk,
-	&du_clk,
-	&umem_clk,
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_P] = DIV4("peripheral_clk", 0, 0x0f80, 0),
+	[DIV4_DU] = DIV4("du_clk", 4, 0x0ff0, 0),
+	[DIV4_GA] = DIV4("ga_clk", 8, 0x0030, 0),
+	[DIV4_DDR] = DIV4("ddr_clk", 12, 0x000c, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", 16, 0x0fe0, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", 20, 0x000c, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", 24, 0x000c, CLK_ENABLE_ON_INIT),
+	[DIV4_I] = DIV4("cpu_clk", 28, 0x000e, CLK_ENABLE_ON_INIT),
 };
 
 #define MSTPCR0		0xffc80030
@@ -198,22 +84,22 @@ static struct clk *clks[] = {
 
 static struct clk mstp_clks[] = {
 	/* MSTPCR0 */
-	SH_CLK_MSTP32("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0),
-	SH_CLK_MSTP32("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0),
-	SH_CLK_MSTP32("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0),
-	SH_CLK_MSTP32("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0),
-	SH_CLK_MSTP32("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0),
-	SH_CLK_MSTP32("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0),
-	SH_CLK_MSTP32("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0),
-	SH_CLK_MSTP32("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0),
-	SH_CLK_MSTP32("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0),
-	SH_CLK_MSTP32("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0),
-	SH_CLK_MSTP32("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0),
-	SH_CLK_MSTP32("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0),
-	SH_CLK_MSTP32("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0),
-	SH_CLK_MSTP32("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0),
-	SH_CLK_MSTP32("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0),
-	SH_CLK_MSTP32("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0),
+	SH_CLK_MSTP32("scif_fck", 5, &div4_clks[DIV4_P], MSTPCR0, 29, 0),
+	SH_CLK_MSTP32("scif_fck", 4, &div4_clks[DIV4_P], MSTPCR0, 28, 0),
+	SH_CLK_MSTP32("scif_fck", 3, &div4_clks[DIV4_P], MSTPCR0, 27, 0),
+	SH_CLK_MSTP32("scif_fck", 2, &div4_clks[DIV4_P], MSTPCR0, 26, 0),
+	SH_CLK_MSTP32("scif_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 25, 0),
+	SH_CLK_MSTP32("scif_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 24, 0),
+	SH_CLK_MSTP32("ssi_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 21, 0),
+	SH_CLK_MSTP32("ssi_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 20, 0),
+	SH_CLK_MSTP32("hac_fck", 1, &div4_clks[DIV4_P], MSTPCR0, 17, 0),
+	SH_CLK_MSTP32("hac_fck", 0, &div4_clks[DIV4_P], MSTPCR0, 16, 0),
+	SH_CLK_MSTP32("mmcif_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 13, 0),
+	SH_CLK_MSTP32("flctl_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 12, 0),
+	SH_CLK_MSTP32("tmu345_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 9, 0),
+	SH_CLK_MSTP32("tmu012_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 8, 0),
+	SH_CLK_MSTP32("siof_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 3, 0),
+	SH_CLK_MSTP32("hspi_fck", -1, &div4_clks[DIV4_P], MSTPCR0, 2, 0),
 
 	/* MSTPCR1 */
 	SH_CLK_MSTP32("hudi_fck", -1, NULL, MSTPCR1, 19, 0),
@@ -230,6 +116,9 @@ int __init arch_clk_init(void)
 	for (i = 0; i < ARRAY_SIZE(clks); i++)
 		ret |= clk_register(clks[i]);
 
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, ARRAY_SIZE(div4_clks),
+					   &div4_table);
 	if (!ret)
 		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
 
-- 
GitLab


From 7863d3f7aeae05099a38693a0a7eb7bdc7b2ab05 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 1 Jun 2009 19:38:41 +0900
Subject: [PATCH 1458/2198] sh: Tidy up the optional L2 probing, wire it up for
 SH7786.

This tidies up the L2 probing, as it may or may not be implemented on a
CPU, regardless of whether it is supported. This converts the cvr
validity checks from BUG_ON()'s to simply clearing the CPU_HAS_L2_CACHE
flag and moving on with life.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4/probe.c | 61 ++++++++++++++++++----------------
 1 file changed, 33 insertions(+), 28 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 7d821ce8434f6..28a2f0db01db6 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -134,7 +134,7 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.icache.ways = 4;
 		boot_cpu_data.dcache.ways = 4;
 		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-			CPU_HAS_LLSC | CPU_HAS_PTEAEX;
+			CPU_HAS_LLSC | CPU_HAS_PTEAEX | CPU_HAS_L2_CACHE;
 		break;
 	case 0x3008:
 		boot_cpu_data.icache.ways = 4;
@@ -228,43 +228,48 @@ int __init detect_cpu_and_cache_system(void)
 	}
 
 	/*
-	 * Setup the L2 cache desc
-	 *
 	 * SH-4A's have an optional PIPT L2.
 	 */
 	if (boot_cpu_data.flags & CPU_HAS_L2_CACHE) {
-		/* Bug if we can't decode the L2 info */
-		BUG_ON(!(cvr & 0xf));
-
-		/* Silicon and specifications have clearly never met.. */
-		cvr ^= 0xf;
-
 		/*
-		 * Size calculation is much more sensible
-		 * than it is for the L1.
-		 *
-		 * Sizes are 128KB, 258KB, 512KB, and 1MB.
+		 * Verify that it really has something hooked up, this
+		 * is the safety net for CPUs that have optional L2
+		 * support yet do not implement it.
 		 */
-		size = (cvr & 0xf) << 17;
+		if ((cvr & 0xf) == 0)
+			boot_cpu_data.flags &= ~CPU_HAS_L2_CACHE;
+		else {
+			/*
+			 * Silicon and specifications have clearly never
+			 * met..
+			 */
+			cvr ^= 0xf;
 
-		BUG_ON(!size);
+			/*
+			 * Size calculation is much more sensible
+			 * than it is for the L1.
+			 *
+			 * Sizes are 128KB, 258KB, 512KB, and 1MB.
+			 */
+			size = (cvr & 0xf) << 17;
 
-		boot_cpu_data.scache.way_incr		= (1 << 16);
-		boot_cpu_data.scache.entry_shift	= 5;
-		boot_cpu_data.scache.ways		= 4;
-		boot_cpu_data.scache.linesz		= L1_CACHE_BYTES;
+			boot_cpu_data.scache.way_incr		= (1 << 16);
+			boot_cpu_data.scache.entry_shift	= 5;
+			boot_cpu_data.scache.ways		= 4;
+			boot_cpu_data.scache.linesz		= L1_CACHE_BYTES;
 
-		boot_cpu_data.scache.entry_mask	=
-			(boot_cpu_data.scache.way_incr -
-			 boot_cpu_data.scache.linesz);
+			boot_cpu_data.scache.entry_mask	=
+				(boot_cpu_data.scache.way_incr -
+				 boot_cpu_data.scache.linesz);
 
-		boot_cpu_data.scache.sets	= size /
-			(boot_cpu_data.scache.linesz *
-			 boot_cpu_data.scache.ways);
+			boot_cpu_data.scache.sets	= size /
+				(boot_cpu_data.scache.linesz *
+				 boot_cpu_data.scache.ways);
 
-		boot_cpu_data.scache.way_size	=
-			(boot_cpu_data.scache.sets *
-			 boot_cpu_data.scache.linesz);
+			boot_cpu_data.scache.way_size	=
+				(boot_cpu_data.scache.sets *
+				 boot_cpu_data.scache.linesz);
+		}
 	}
 
 	return 0;
-- 
GitLab


From 0bf8513ed0df64b38edce63411d4b7b368464f47 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 1 Jun 2009 19:50:08 +0900
Subject: [PATCH 1459/2198] sh: Tidy up SH-4A boot_cpu_data.flags probing.

This tidies up the boot_cpu_data.flags probing on SH-4A. All of them have
a few things in common, which we can blindly set, rather than having each
subtype have to set the same flags. We can also make assumptions about
cache ways and the validity of PTEA, so this also kills off CPU_HAS_PTEA
as a config option. There was also a bug in the FPU probing, which is now
tidied up.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                |  1 -
 arch/sh/Kconfig.cpu            |  3 --
 arch/sh/kernel/cpu/sh4/probe.c | 61 +++++++++-------------------------
 3 files changed, 15 insertions(+), 50 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index c815975b8d75e..e9392aa8c5614 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -174,7 +174,6 @@ config CPU_SH4
 	bool
 	select CPU_HAS_INTEVT
 	select CPU_HAS_SR_RB
-	select CPU_HAS_PTEA if !CPU_SH4A || CPU_SHX2
 	select CPU_HAS_FPU if !CPU_SH4AL_DSP
 	select SYS_SUPPORTS_TMU
 
diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu
index 9eb1712372d80..cd6e3ea598d5b 100644
--- a/arch/sh/Kconfig.cpu
+++ b/arch/sh/Kconfig.cpu
@@ -96,9 +96,6 @@ config CPU_HAS_SR_RB
 	  See <file:Documentation/sh/register-banks.txt> for further
 	  information on SR.RB and register banking in the kernel in general.
 
-config CPU_HAS_PTEA
-	bool
-
 config CPU_HAS_PTEAEX
 	bool
 
diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c
index 28a2f0db01db6..6c78d0a9c857b 100644
--- a/arch/sh/kernel/cpu/sh4/probe.c
+++ b/arch/sh/kernel/cpu/sh4/probe.c
@@ -60,12 +60,18 @@ int __init detect_cpu_and_cache_system(void)
 		if ((cvr & 0x10000000) == 0)
 			boot_cpu_data.flags |= CPU_HAS_DSP;
 
-		boot_cpu_data.flags |= CPU_HAS_LLSC;
+		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_PERF_COUNTER;
 		boot_cpu_data.cut_major = pvr & 0x7f;
+
+		boot_cpu_data.icache.ways = 4;
+		boot_cpu_data.dcache.ways = 4;
+	} else {
+		/* And some SH-4 defaults.. */
+		boot_cpu_data.flags |= CPU_HAS_PTEA;
 	}
 
 	/* FPU detection works for everyone */
-	if ((cvr & 0x20000000) == 1)
+	if ((cvr & 0x20000000))
 		boot_cpu_data.flags |= CPU_HAS_FPU;
 
 	/* Mask off the upper chip ID */
@@ -78,25 +84,20 @@ int __init detect_cpu_and_cache_system(void)
 	switch (pvr) {
 	case 0x205:
 		boot_cpu_data.type = CPU_SH7750;
-		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-				   CPU_HAS_PERF_COUNTER;
+		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG |
+				       CPU_HAS_PERF_COUNTER;
 		break;
 	case 0x206:
 		boot_cpu_data.type = CPU_SH7750S;
-		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG | CPU_HAS_FPU |
-				   CPU_HAS_PERF_COUNTER;
+		boot_cpu_data.flags |= CPU_HAS_P2_FLUSH_BUG |
+				       CPU_HAS_PERF_COUNTER;
 		break;
 	case 0x1100:
 		boot_cpu_data.type = CPU_SH7751;
-		boot_cpu_data.flags |= CPU_HAS_FPU;
 		break;
 	case 0x2001:
 	case 0x2004:
 		boot_cpu_data.type = CPU_SH7770;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_LLSC;
 		break;
 	case 0x2006:
 	case 0x200A:
@@ -107,45 +108,26 @@ int __init detect_cpu_and_cache_system(void)
 		else
 			boot_cpu_data.type = CPU_SH7780;
 
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-				   CPU_HAS_LLSC;
 		break;
 	case 0x3000:
 	case 0x3003:
 	case 0x3009:
 		boot_cpu_data.type = CPU_SH7343;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC;
 		break;
 	case 0x3004:
 	case 0x3007:
 		boot_cpu_data.type = CPU_SH7785;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-					  CPU_HAS_LLSC;
 		break;
 	case 0x4004:
 		boot_cpu_data.type = CPU_SH7786;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-			CPU_HAS_LLSC | CPU_HAS_PTEAEX | CPU_HAS_L2_CACHE;
+		boot_cpu_data.flags |= CPU_HAS_PTEAEX | CPU_HAS_L2_CACHE;
 		break;
 	case 0x3008:
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC;
-
 		switch (prr) {
 		case 0x50:
 		case 0x51:
 			boot_cpu_data.type = CPU_SH7723;
-			boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_L2_CACHE;
+			boot_cpu_data.flags |= CPU_HAS_L2_CACHE;
 			break;
 		case 0x70:
 			boot_cpu_data.type = CPU_SH7366;
@@ -158,17 +140,11 @@ int __init detect_cpu_and_cache_system(void)
 		break;
 	case 0x300b:
 		boot_cpu_data.type = CPU_SH7724;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU | CPU_HAS_L2_CACHE;
+		boot_cpu_data.flags |= CPU_HAS_L2_CACHE;
 		break;
 	case 0x4000:	/* 1st cut */
 	case 0x4001:	/* 2nd cut */
 		boot_cpu_data.type = CPU_SHX3;
-		boot_cpu_data.icache.ways = 4;
-		boot_cpu_data.dcache.ways = 4;
-		boot_cpu_data.flags |= CPU_HAS_FPU | CPU_HAS_PERF_COUNTER |
-					  CPU_HAS_LLSC;
 		break;
 	case 0x700:
 		boot_cpu_data.type = CPU_SH4_501;
@@ -179,7 +155,6 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.type = CPU_SH4_202;
 		boot_cpu_data.icache.ways = 2;
 		boot_cpu_data.dcache.ways = 2;
-		boot_cpu_data.flags |= CPU_HAS_FPU;
 		break;
 	case 0x500 ... 0x501:
 		switch (prr) {
@@ -197,18 +172,12 @@ int __init detect_cpu_and_cache_system(void)
 		boot_cpu_data.icache.ways = 2;
 		boot_cpu_data.dcache.ways = 2;
 
-		boot_cpu_data.flags |= CPU_HAS_FPU;
-
 		break;
 	default:
 		boot_cpu_data.type = CPU_SH_NONE;
 		break;
 	}
 
-#ifdef CONFIG_CPU_HAS_PTEA
-	boot_cpu_data.flags |= CPU_HAS_PTEA;
-#endif
-
 	/*
 	 * On anything that's not a direct-mapped cache, look to the CVR
 	 * for I/D-cache specifics.
-- 
GitLab


From 22a4f650d686eeaac3629dae1c4294381485efdf Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 1 Jun 2009 10:13:37 +0200
Subject: [PATCH 1460/2198] perf_counter: Tidy up style details

 - whitespace fixlets
 - make local variable definitions more consistent

[ Impact: cleanup ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 +-
 kernel/perf_counter.c        | 39 +++++++++++++++++++-----------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 81ec79c9f1934..0e57d8cc5a3d2 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -562,7 +562,7 @@ struct perf_cpu_context {
 	 *
 	 * task, softirq, irq, nmi context
 	 */
-	int			recursion[4];
+	int				recursion[4];
 };
 
 #ifdef CONFIG_PERF_COUNTERS
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff8b4636f8451..df319c48c52b9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -16,8 +16,9 @@
 #include <linux/file.h>
 #include <linux/poll.h>
 #include <linux/sysfs.h>
-#include <linux/ptrace.h>
+#include <linux/dcache.h>
 #include <linux/percpu.h>
+#include <linux/ptrace.h>
 #include <linux/vmstat.h>
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
@@ -26,7 +27,6 @@
 #include <linux/anon_inodes.h>
 #include <linux/kernel_stat.h>
 #include <linux/perf_counter.h>
-#include <linux/dcache.h>
 
 #include <asm/irq_regs.h>
 
@@ -65,7 +65,9 @@ void __weak hw_perf_disable(void)		{ barrier(); }
 void __weak hw_perf_enable(void)		{ barrier(); }
 
 void __weak hw_perf_counter_setup(int cpu)	{ barrier(); }
-int __weak hw_perf_group_sched_in(struct perf_counter *group_leader,
+
+int __weak
+hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_cpu_context *cpuctx,
 	       struct perf_counter_context *ctx, int cpu)
 {
@@ -127,8 +129,8 @@ static void put_ctx(struct perf_counter_context *ctx)
  * This has to cope with with the fact that until it is locked,
  * the context could get moved to another task.
  */
-static struct perf_counter_context *perf_lock_task_context(
-				struct task_struct *task, unsigned long *flags)
+static struct perf_counter_context *
+perf_lock_task_context(struct task_struct *task, unsigned long *flags)
 {
 	struct perf_counter_context *ctx;
 
@@ -1330,9 +1332,9 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
-	struct perf_cpu_context *cpuctx;
-	struct perf_counter_context *ctx;
 	struct perf_counter_context *parent_ctx;
+	struct perf_counter_context *ctx;
+	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
 	unsigned long flags;
 	int err;
@@ -1664,8 +1666,8 @@ int perf_counter_task_disable(void)
  */
 void perf_counter_update_userpage(struct perf_counter *counter)
 {
-	struct perf_mmap_data *data;
 	struct perf_counter_mmap_page *userpg;
+	struct perf_mmap_data *data;
 
 	rcu_read_lock();
 	data = rcu_dereference(counter->data);
@@ -1769,10 +1771,11 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
 
 static void __perf_mmap_data_free(struct rcu_head *rcu_head)
 {
-	struct perf_mmap_data *data = container_of(rcu_head,
-			struct perf_mmap_data, rcu_head);
+	struct perf_mmap_data *data;
 	int i;
 
+	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+
 	free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
 		free_page((unsigned long)data->data_pages[i]);
@@ -1801,8 +1804,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 	struct perf_counter *counter = vma->vm_file->private_data;
 
 	WARN_ON_ONCE(counter->ctx->parent_ctx);
-	if (atomic_dec_and_mutex_lock(&counter->mmap_count,
-				      &counter->mmap_mutex)) {
+	if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) {
 		struct user_struct *user = current_user();
 
 		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
@@ -1821,11 +1823,11 @@ static struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct perf_counter *counter = file->private_data;
+	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
+	unsigned long locked, lock_limit;
 	unsigned long vma_size;
 	unsigned long nr_pages;
-	unsigned long user_locked, user_lock_limit;
-	unsigned long locked, lock_limit;
 	long user_extra, extra;
 	int ret = 0;
 
@@ -1900,8 +1902,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 
 static int perf_fasync(int fd, struct file *filp, int on)
 {
-	struct perf_counter *counter = filp->private_data;
 	struct inode *inode = filp->f_path.dentry->d_inode;
+	struct perf_counter *counter = filp->private_data;
 	int retval;
 
 	mutex_lock(&inode->i_mutex);
@@ -2412,8 +2414,8 @@ static void perf_counter_output(struct perf_counter *counter,
  */
 
 struct perf_comm_event {
-	struct task_struct 	*task;
-	char 			*comm;
+	struct task_struct	*task;
+	char			*comm;
 	int			comm_size;
 
 	struct {
@@ -2932,6 +2934,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 			       int nmi, struct pt_regs *regs, u64 addr)
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
+
 	if (counter->hw.irq_period && !neg)
 		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
@@ -3526,7 +3529,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * Make the child state follow the state of the parent counter,
 	 * not its hw_event.disabled bit.  We hold the parent's mutex,
-	 * so we won't race with perf_counter_{en,dis}able_family.
+	 * so we won't race with perf_counter_{en, dis}able_family.
 	 */
 	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-- 
GitLab


From 229c4eedcedcdadf70411120ba34bc37554a74bd Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 28 May 2009 16:28:53 +0200
Subject: [PATCH 1461/2198] perf_counter tools: Guard against record damaging
 existing files

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 23d1224ce98c2..96bfb7c5f1e4f 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -340,7 +340,7 @@ static int __cmd_record(int argc, const char **argv)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	output = open(output_name, O_CREAT|O_RDWR, S_IRWXU);
+	output = open(output_name, O_CREAT|O_EXCL|O_RDWR, S_IRWXU);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
-- 
GitLab


From fb39125fd79a25c5002f3b45cf4c80e3fa6b961b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 17 Apr 2009 15:15:51 +0800
Subject: [PATCH 1462/2198] ftrace, workqueuetrace: make workqueue tracepoints
 use TRACE_EVENT macro

v3: zhaolei@cn.fujitsu.com: Change TRACE_EVENT definition to new format
    introduced by Steven Rostedt: consolidate trace and trace_event headers
v2: kosaki@jp.fujitsu.com: print the function names instead of addr, and zap
    the work addr
v1: zhaolei@cn.fujitsu.com: Make workqueue tracepoints use TRACE_EVENT macro

TRACE_EVENT is a more generic way to define tracepoints.
Doing so adds these new capabilities to the tracepoints:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions

Then, this patch converts DEFINE_TRACE to TRACE_EVENT in workqueue related
tracepoints.

[ Impact: expand workqueue tracer to events tracing ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 include/trace/events/workqueue.h | 100 +++++++++++++++++++++++++++++++
 include/trace/workqueue.h        |  25 --------
 kernel/trace/trace_workqueue.c   |   2 +-
 kernel/workqueue.c               |  11 +---
 4 files changed, 103 insertions(+), 35 deletions(-)
 create mode 100644 include/trace/events/workqueue.h
 delete mode 100644 include/trace/workqueue.h

diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
new file mode 100644
index 0000000000000..035f1bff288e6
--- /dev/null
+++ b/include/trace/events/workqueue.h
@@ -0,0 +1,100 @@
+#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WORKQUEUE_H
+
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM workqueue
+
+TRACE_EVENT(workqueue_insertion,
+
+	TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+	TP_ARGS(wq_thread, work),
+
+	TP_STRUCT__entry(
+		__array(char,		thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,		thread_pid)
+		__field(work_func_t,	func)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->func		= work->func;
+	),
+
+	TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+		__entry->thread_pid, __entry->func)
+);
+
+TRACE_EVENT(workqueue_execution,
+
+	TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+	TP_ARGS(wq_thread, work),
+
+	TP_STRUCT__entry(
+		__array(char,		thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,		thread_pid)
+		__field(work_func_t,	func)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->func		= work->func;
+	),
+
+	TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+		__entry->thread_pid, __entry->func)
+);
+
+/* Trace the creation of one workqueue thread on a cpu */
+TRACE_EVENT(workqueue_creation,
+
+	TP_PROTO(struct task_struct *wq_thread, int cpu),
+
+	TP_ARGS(wq_thread, cpu),
+
+	TP_STRUCT__entry(
+		__array(char,	thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,	thread_pid)
+		__field(int,	cpu)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+		__entry->cpu		= cpu;
+	),
+
+	TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm,
+		__entry->thread_pid, __entry->cpu)
+);
+
+TRACE_EVENT(workqueue_destruction,
+
+	TP_PROTO(struct task_struct *wq_thread),
+
+	TP_ARGS(wq_thread),
+
+	TP_STRUCT__entry(
+		__array(char,	thread_comm,	TASK_COMM_LEN)
+		__field(pid_t,	thread_pid)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+		__entry->thread_pid	= wq_thread->pid;
+	),
+
+	TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid)
+);
+
+#endif /* _TRACE_WORKQUEUE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h
deleted file mode 100644
index 7626523deeba5..0000000000000
--- a/include/trace/workqueue.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __TRACE_WORKQUEUE_H
-#define __TRACE_WORKQUEUE_H
-
-#include <linux/tracepoint.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-DECLARE_TRACE(workqueue_insertion,
-	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TP_ARGS(wq_thread, work));
-
-DECLARE_TRACE(workqueue_execution,
-	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TP_ARGS(wq_thread, work));
-
-/* Trace the creation of one workqueue thread on a cpu */
-DECLARE_TRACE(workqueue_creation,
-	   TP_PROTO(struct task_struct *wq_thread, int cpu),
-	   TP_ARGS(wq_thread, cpu));
-
-DECLARE_TRACE(workqueue_destruction,
-	   TP_PROTO(struct task_struct *wq_thread),
-	   TP_ARGS(wq_thread));
-
-#endif /* __TRACE_WORKQUEUE_H */
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 984b9175c13da..cfe56d31d85b1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -6,7 +6,7 @@
  */
 
 
-#include <trace/workqueue.h>
+#include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
 #include "trace_stat.h"
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f71fb2a089503..0668795d88184 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,7 +33,8 @@
 #include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
-#include <trace/workqueue.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
-DEFINE_TRACE(workqueue_insertion);
-
 static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head)
 {
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
-DEFINE_TRACE(workqueue_execution);
-
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
 	return cwq;
 }
 
-DEFINE_TRACE(workqueue_creation);
-
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-DEFINE_TRACE(workqueue_destruction);
-
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
-- 
GitLab


From 1fdfca9c577aac96a559c1ea68f5c9156f17d636 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 14:58:26 +0800
Subject: [PATCH 1463/2198] trace_workqueue: use list_for_each_entry() instead
 of list_for_each_entry_safe()

No need to use list_for_each_entry_safe() in iteration without deleting
any node, we can use list_for_each_entry() instead.

[ Impact: cleanup ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index cfe56d31d85b1..128b64b93f142 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -47,12 +47,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
 	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
+	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
 		if (node->pid == wq_thread->pid) {
 			atomic_inc(&node->inserted);
 			goto found;
@@ -69,12 +68,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 			  struct work_struct *work)
 {
 	int cpu = cpumask_first(&wq_thread->cpus_allowed);
-	struct cpu_workqueue_stats *node, *next;
+	struct cpu_workqueue_stats *node;
 	unsigned long flags;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-							list) {
+	list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
 		if (node->pid == wq_thread->pid) {
 			node->executed++;
 			goto found;
-- 
GitLab


From b8867164f05791a6b5363bd51c1274e03600886e Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 14:59:36 +0800
Subject: [PATCH 1464/2198] trace_workqueue: remove
 cpu_workqueue_stats->first_entry

cpu_workqueue_stats->first_entry is useless because we can retrieve the
header of a cpu workqueue using:
if (&cpu_workqueue_stats->list == workqueue_cpu_stat(cpu)->list.next)

[ Impact: cleanup ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 128b64b93f142..890974aed64da 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -16,8 +16,6 @@
 /* A cpu workqueue thread */
 struct cpu_workqueue_stats {
 	struct list_head            list;
-/* Useful to know if we print the cpu headers */
-	bool		            first_entry;
 	int		            cpu;
 	pid_t			    pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
@@ -103,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 	cws->pid = wq_thread->pid;
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (list_empty(&workqueue_cpu_stat(cpu)->list))
-		cws->first_entry = true;
 	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
-- 
GitLab


From f3c4ae26e93d354152196b62797ba86ad86dd0cc Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Mon, 20 Apr 2009 15:02:17 +0800
Subject: [PATCH 1465/2198] trace_workqueue: remove blank line between each cpu

The blankline between each cpu's workqueue stat is not necessary, because
the cpu number is enough to part them by eye.
Old style also caused a blankline below headline, and made code complex
by using lock, disableirq and get cpu var.

Old style:
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   0   8644       8644       events/0
   0      0          0       cpuset
   ...
   0      1          1       kdmflush

   1  35365      35365       events/1
   ...

New style:
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   0   8644       8644       events/0
   0      0          0       cpuset
   ...
   0      1          1       kdmflush
   1  35365      35365       events/1
   ...

[ Impact: provide more readable code ]

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_workqueue.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 890974aed64da..97fcea4acce15 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -185,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
 static int workqueue_stat_show(struct seq_file *s, void *p)
 {
 	struct cpu_workqueue_stats *cws = p;
-	unsigned long flags;
-	int cpu = cws->cpu;
 	struct pid *pid;
 	struct task_struct *tsk;
 
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
 	pid = find_get_pid(cws->pid);
 	if (pid) {
 		tsk = get_pid_task(pid, PIDTYPE_PID);
-- 
GitLab


From 0d64f8342de26d02451900b1aad94716fe92c4ab Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 16 May 2009 05:58:49 +0200
Subject: [PATCH 1466/2198] tracing/stat: replace trace_stat_session by
 stat_session

The "trace" prefix in struct trace_stat_session type is annoying while
reading the trace_stat.c file. It makes the lines longer, and
is not that much useful to explain the sense of this type.

Just keep "struct stat_session" for this type.

[ Impact: make the code a bit more readable ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index fdde3a4a94cdc..3b6816be825d9 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -22,7 +22,7 @@ struct trace_stat_list {
 };
 
 /* A stat session is the stats output in one file */
-struct tracer_stat_session {
+struct stat_session {
 	struct list_head	session_list;
 	struct tracer_stat	*ts;
 	struct list_head	stat_list;
@@ -38,7 +38,7 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 static struct dentry		*stat_dir;
 
 
-static void reset_stat_session(struct tracer_stat_session *session)
+static void reset_stat_session(struct stat_session *session)
 {
 	struct trace_stat_list *node, *next;
 
@@ -48,7 +48,7 @@ static void reset_stat_session(struct tracer_stat_session *session)
 	INIT_LIST_HEAD(&session->stat_list);
 }
 
-static void destroy_session(struct tracer_stat_session *session)
+static void destroy_session(struct stat_session *session)
 {
 	debugfs_remove(session->file);
 	reset_stat_session(session);
@@ -71,7 +71,7 @@ static int dummy_cmp(void *p1, void *p2)
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
-static int stat_seq_init(struct tracer_stat_session *session)
+static int stat_seq_init(struct stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
@@ -154,7 +154,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&session->stat_mutex);
@@ -168,7 +168,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 
 	if (p == SEQ_START_TOKEN)
 		return seq_list_start(&session->stat_list, *pos);
@@ -178,13 +178,13 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 
 static void stat_seq_stop(struct seq_file *s, void *p)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 	mutex_unlock(&session->stat_mutex);
 }
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct tracer_stat_session *session = s->private;
+	struct stat_session *session = s->private;
 	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
 	if (v == SEQ_START_TOKEN)
@@ -205,7 +205,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
-	struct tracer_stat_session *session = inode->i_private;
+	struct stat_session *session = inode->i_private;
 
 	ret = seq_open(file, &trace_stat_seq_ops);
 	if (!ret) {
@@ -222,7 +222,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-	struct tracer_stat_session *session = i->i_private;
+	struct stat_session *session = i->i_private;
 
 	mutex_lock(&session->stat_mutex);
 	reset_stat_session(session);
@@ -251,7 +251,7 @@ static int tracing_stat_init(void)
 	return 0;
 }
 
-static int init_stat_file(struct tracer_stat_session *session)
+static int init_stat_file(struct stat_session *session)
 {
 	if (!stat_dir && tracing_stat_init())
 		return -ENODEV;
@@ -266,7 +266,7 @@ static int init_stat_file(struct tracer_stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-	struct tracer_stat_session *session, *node, *tmp;
+	struct stat_session *session, *node, *tmp;
 	int ret;
 
 	if (!trace)
@@ -286,7 +286,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 	mutex_unlock(&all_stat_sessions_mutex);
 
 	/* Init the session */
-	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	session = kmalloc(sizeof(struct stat_session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
 
@@ -312,7 +312,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 void unregister_stat_tracer(struct tracer_stat *trace)
 {
-	struct tracer_stat_session *node, *tmp;
+	struct stat_session *node, *tmp;
 
 	mutex_lock(&all_stat_sessions_mutex);
 	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-- 
GitLab


From 8f184f27300f66f6dcc8296c2dae7a1fbe8429c9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 16 May 2009 06:24:36 +0200
Subject: [PATCH 1467/2198] tracing/stat: replace linked list by an rbtree for
 sorting

When the stat tracing framework prepares the entries from a tracer
to output them to the user, it starts by computing a linear sort
through a linked list to give the entries ordered by relevance
to the user.

This is quite ugly and causes a small latency when we begin to
read the file.

This patch changes that by turning the linked list into a red-black
tree. Athough the whole iteration using the start and next tracer
callbacks while opening the file remain the same, it is now much
more fast and scalable.

The rbtree guarantees O(log(n)) insertions whereas a linked
list with linear sorting brought us a O(n) despair. Now the
(visible) latency has disapeared.

[ Impact: kill the latency while starting to read a stat tracer file ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 140 +++++++++++++++++++++++++++-----------
 1 file changed, 100 insertions(+), 40 deletions(-)

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 3b6816be825d9..0bd0fc82da5d2 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -1,7 +1,7 @@
 /*
  * Infrastructure for statistic tracing (histogram output).
  *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Based on the code from trace_branch.c which is
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
@@ -10,14 +10,19 @@
 
 
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/debugfs.h>
 #include "trace_stat.h"
 #include "trace.h"
 
 
-/* List of stat entries from a tracer */
-struct trace_stat_list {
-	struct list_head	list;
+/*
+ * List of stat red-black nodes from a tracer
+ * We use a such tree to sort quickly the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+	struct rb_node		node;
 	void			*stat;
 };
 
@@ -25,7 +30,7 @@ struct trace_stat_list {
 struct stat_session {
 	struct list_head	session_list;
 	struct tracer_stat	*ts;
-	struct list_head	stat_list;
+	struct rb_root		stat_root;
 	struct mutex		stat_mutex;
 	struct dentry		*file;
 };
@@ -37,15 +42,45 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 /* The root directory for all stat files */
 static struct dentry		*stat_dir;
 
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessary release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct rb_node *node)
+{
+	struct stat_node *snode;
+	struct rb_node *parent = rb_parent(node);
+
+	if (node->rb_left)
+		return node->rb_left;
+	else if (node->rb_right)
+		return node->rb_right;
+	else {
+		if (!parent)
+			return NULL;
+		if (parent->rb_left == node)
+			parent->rb_left = NULL;
+		else
+			parent->rb_right = NULL;
+
+		snode = container_of(node, struct stat_node, node);
+		kfree(snode);
+
+		return parent;
+	}
+}
 
 static void reset_stat_session(struct stat_session *session)
 {
-	struct trace_stat_list *node, *next;
+	struct rb_node *node = session->stat_root.rb_node;
 
-	list_for_each_entry_safe(node, next, &session->stat_list, list)
-		kfree(node);
+	while (node)
+		node = release_next(node);
 
-	INIT_LIST_HEAD(&session->stat_list);
+	session->stat_root = RB_ROOT;
 }
 
 static void destroy_session(struct stat_session *session)
@@ -56,6 +91,35 @@ static void destroy_session(struct stat_session *session)
 	kfree(session);
 }
 
+typedef int (*cmp_stat_t)(void *, void *);
+
+static void
+insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	/*
+	 * Figure out where to put new node
+	 * This is a descendent sorting
+	 */
+	while (*new) {
+		struct stat_node *this;
+		int result;
+
+		this = container_of(*new, struct stat_node, node);
+		result = cmp(data->stat, this->stat);
+
+		parent = *new;
+		if (result >= 0)
+			new = &((*new)->rb_left);
+		else
+			new = &((*new)->rb_right);
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -73,8 +137,9 @@ static int dummy_cmp(void *p1, void *p2)
  */
 static int stat_seq_init(struct stat_session *session)
 {
-	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
+	struct stat_node *new_entry;
+	struct rb_root *root;
 	void *stat;
 	int ret = 0;
 	int i;
@@ -93,15 +158,13 @@ static int stat_seq_init(struct stat_session *session)
 	 * The first entry. Actually this is the second, but the first
 	 * one (the stat_list head) is pointless.
 	 */
-	new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 	if (!new_entry) {
 		ret = -ENOMEM;
 		goto exit;
 	}
-
-	INIT_LIST_HEAD(&new_entry->list);
-
-	list_add(&new_entry->list, &session->stat_list);
+	root = &session->stat_root;
+	insert_stat(root, new_entry, dummy_cmp);
 
 	new_entry->stat = stat;
 
@@ -116,31 +179,17 @@ static int stat_seq_init(struct stat_session *session)
 		if (!stat)
 			break;
 
-		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
 		if (!new_entry) {
 			ret = -ENOMEM;
 			goto exit_free_list;
 		}
 
-		INIT_LIST_HEAD(&new_entry->list);
 		new_entry->stat = stat;
 
-		list_for_each_entry_reverse(iter_entry, &session->stat_list,
-				list) {
-
-			/* Insertion with a descendent sorting */
-			if (ts->stat_cmp(iter_entry->stat,
-					new_entry->stat) >= 0) {
-
-				list_add(&new_entry->list, &iter_entry->list);
-				break;
-			}
-		}
-
-		/* The current larger value */
-		if (list_empty(&new_entry->list))
-			list_add(&new_entry->list, &session->stat_list);
+		insert_stat(root, new_entry, ts->stat_cmp);
 	}
+
 exit:
 	mutex_unlock(&session->stat_mutex);
 	return ret;
@@ -155,25 +204,38 @@ static int stat_seq_init(struct stat_session *session)
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
 	struct stat_session *session = s->private;
+	struct rb_node *node;
+	int i;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
-	if (!*pos && session->ts->stat_headers)
+	if (!*pos && session->ts->stat_headers) {
+		(*pos)++;
 		return SEQ_START_TOKEN;
+	}
 
-	return seq_list_start(&session->stat_list, *pos);
+	node = rb_first(&session->stat_root);
+	for (i = 0; node && i < *pos; i++)
+		node = rb_next(node);
+
+	(*pos)++;
+
+	return node;
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
 	struct stat_session *session = s->private;
+	struct rb_node *node = p;
+
+	(*pos)++;
 
 	if (p == SEQ_START_TOKEN)
-		return seq_list_start(&session->stat_list, *pos);
+		return rb_first(&session->stat_root);
 
-	return seq_list_next(p, &session->stat_list, pos);
+	return rb_next(node);
 }
 
 static void stat_seq_stop(struct seq_file *s, void *p)
@@ -185,7 +247,7 @@ static void stat_seq_stop(struct seq_file *s, void *p)
 static int stat_seq_show(struct seq_file *s, void *v)
 {
 	struct stat_session *session = s->private;
-	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+	struct stat_node *l = container_of(v, struct stat_node, node);
 
 	if (v == SEQ_START_TOKEN)
 		return session->ts->stat_headers(s);
@@ -286,15 +348,13 @@ int register_stat_tracer(struct tracer_stat *trace)
 	mutex_unlock(&all_stat_sessions_mutex);
 
 	/* Init the session */
-	session = kmalloc(sizeof(struct stat_session), GFP_KERNEL);
+	session = kzalloc(sizeof(*session), GFP_KERNEL);
 	if (!session)
 		return -ENOMEM;
 
 	session->ts = trace;
 	INIT_LIST_HEAD(&session->session_list);
-	INIT_LIST_HEAD(&session->stat_list);
 	mutex_init(&session->stat_mutex);
-	session->file = NULL;
 
 	ret = init_stat_file(session);
 	if (ret) {
-- 
GitLab


From b3dd7ba7d862707800c7ac45068f14ade2b65155 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:04:26 +0800
Subject: [PATCH 1468/2198] tracing/stat: change dummpy_cmp() to return -1

Currently the output of trace_stat/workqueues is totally reversed:

 # cat /debug/tracing/trace_stat/workqueues
    ...
    1       17       17      210       37   `-blk_unplug_work+0x0/0x57
    1     3779     3779      181       11   |-cfq_kick_queue+0x0/0x2f
    1     3796     3796                     kblockd/1:120
    ...

The correct output should be:

    1     3796     3796                     kblockd/1:120
    1     3779     3779      181       11   |-cfq_kick_queue+0x0/0x2f
    1       17       17      210       37   `-blk_unplug_work+0x0/0x57

It's caused by "tracing/stat: replace linked list by an rbtree for
sorting"
(53059c9b67a62a3dc8c80204d3da42b9267ea5a0).

dummpy_cmp() should return -1, so rb_node will always be inserted as
right-most node in the rbtree, thus we sort the output in ascending
order.

[ Impact: fix the output of trace_stat/workqueues ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 0bd0fc82da5d2..5816d1aebcc9f 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -127,7 +127,7 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
  */
 static int dummy_cmp(void *p1, void *p2)
 {
-	return 1;
+	return -1;
 }
 
 /*
-- 
GitLab


From e16228069083a2f6b94383ac5739aea7a0f38ce4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:04:48 +0800
Subject: [PATCH 1469/2198] tracing/stat: remember to free root node

When closing a trace_stat file, we destroy the rbtree constructed during
file open, but there is memory leak that the root node is not freed.

[ Impact: fix memory leak when closing a trace_stat file ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 5816d1aebcc9f..8030ec98dbad0 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -60,8 +60,8 @@ static struct rb_node *release_next(struct rb_node *node)
 		return node->rb_right;
 	else {
 		if (!parent)
-			return NULL;
-		if (parent->rb_left == node)
+			;
+		else if (parent->rb_left == node)
 			parent->rb_left = NULL;
 		else
 			parent->rb_right = NULL;
-- 
GitLab


From dbd3fbdfeecfad4e71139db05d72560c3583e2a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 27 May 2009 11:42:46 +0800
Subject: [PATCH 1470/2198] tracing/stat: do some cleanups

- remove duplicate code in stat_seq_init()
- update comments to reflect the change from stat list to stat rbtree

[ Impact: clean up ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 54 +++++++++++++++------------------------
 1 file changed, 21 insertions(+), 33 deletions(-)

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 8030ec98dbad0..17f20ebdad2a5 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -93,10 +93,15 @@ static void destroy_session(struct stat_session *session)
 
 typedef int (*cmp_stat_t)(void *, void *);
 
-static void
-insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct stat_node *data;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->stat = stat;
 
 	/*
 	 * Figure out where to put new node
@@ -118,12 +123,13 @@ insert_stat(struct rb_root *root, struct stat_node *data, cmp_stat_t cmp)
 
 	rb_link_node(&data->node, parent, new);
 	rb_insert_color(&data->node, root);
+	return 0;
 }
 
 /*
  * For tracers that don't provide a stat_cmp callback.
- * This one will force an immediate insertion on tail of
- * the list.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
  */
 static int dummy_cmp(void *p1, void *p2)
 {
@@ -131,15 +137,14 @@ static int dummy_cmp(void *p1, void *p2)
 }
 
 /*
- * Initialize the stat list at each trace_stat file opening.
+ * Initialize the stat rbtree at each trace_stat file opening.
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
 static int stat_seq_init(struct stat_session *session)
 {
 	struct tracer_stat *ts = session->ts;
-	struct stat_node *new_entry;
-	struct rb_root *root;
+	struct rb_root *root = &session->stat_root;
 	void *stat;
 	int ret = 0;
 	int i;
@@ -154,23 +159,12 @@ static int stat_seq_init(struct stat_session *session)
 	if (!stat)
 		goto exit;
 
-	/*
-	 * The first entry. Actually this is the second, but the first
-	 * one (the stat_list head) is pointless.
-	 */
-	new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-	if (!new_entry) {
-		ret = -ENOMEM;
+	ret = insert_stat(root, stat, ts->stat_cmp);
+	if (ret)
 		goto exit;
-	}
-	root = &session->stat_root;
-	insert_stat(root, new_entry, dummy_cmp);
-
-	new_entry->stat = stat;
 
 	/*
-	 * Iterate over the tracer stat entries and store them in a sorted
-	 * list.
+	 * Iterate over the tracer stat entries and store them in an rbtree.
 	 */
 	for (i = 1; ; i++) {
 		stat = ts->stat_next(stat, i);
@@ -179,22 +173,16 @@ static int stat_seq_init(struct stat_session *session)
 		if (!stat)
 			break;
 
-		new_entry = kzalloc(sizeof(*new_entry), GFP_KERNEL);
-		if (!new_entry) {
-			ret = -ENOMEM;
-			goto exit_free_list;
-		}
-
-		new_entry->stat = stat;
-
-		insert_stat(root, new_entry, ts->stat_cmp);
+		ret = insert_stat(root, stat, ts->stat_cmp);
+		if (ret)
+			goto exit_free_rbtree;
 	}
 
 exit:
 	mutex_unlock(&session->stat_mutex);
 	return ret;
 
-exit_free_list:
+exit_free_rbtree:
 	reset_stat_session(session);
 	mutex_unlock(&session->stat_mutex);
 	return ret;
@@ -207,7 +195,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 	struct rb_node *node;
 	int i;
 
-	/* Prevent from tracer switch or stat_list modification */
+	/* Prevent from tracer switch or rbtree modification */
 	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
@@ -280,7 +268,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 }
 
 /*
- * Avoid consuming memory with our now useless list.
+ * Avoid consuming memory with our now useless rbtree.
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-- 
GitLab


From 43bd1236234cacbc18d1476a9b57e7a306efddf5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 30 May 2009 04:25:30 +0200
Subject: [PATCH 1471/2198] tracing/stat: remove unappropriate safe walk on
 list

register_stat_tracer() uses list_for_each_entry_safe
to check whether a tracer is already present in the list.
But we don't delete anything from the list here, so
we don't need the safe version

[ Impact: cleanup list use is stat tracing ]

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
---
 kernel/trace/trace_stat.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 17f20ebdad2a5..c00643733f4cc 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -316,7 +316,7 @@ static int init_stat_file(struct stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-	struct stat_session *session, *node, *tmp;
+	struct stat_session *session, *node;
 	int ret;
 
 	if (!trace)
@@ -327,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 	/* Already registered? */
 	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+	list_for_each_entry(node, &all_stat_sessions, session_list) {
 		if (node->ts == trace) {
 			mutex_unlock(&all_stat_sessions_mutex);
 			return -EINVAL;
-- 
GitLab


From 58f892e022e88438183c48661dcdc6a2997dab99 Mon Sep 17 00:00:00 2001
From: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Date: Tue, 26 May 2009 21:48:07 +0000
Subject: [PATCH 1472/2198] x86: Print real IOAPIC version for x86-64

Fix the fact that the IOAPIC version number in the x86_64 code path always
gets assigned to 0, instead of the correct value.

Before the patch: (from "dmesg" output):

 ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0])
 IOAPIC[0]: apic_id 8, version 0, address 0xfec00000, GSI 0-23     <---

 After the patch:
 ACPI: IOAPIC (id[0x08] address[0xfec00000] gsi_base[0])
 IOAPIC[0]: apic_id 8, version 32, address 0xfec00000, GSI 0-23    <---

History:

io_apic_get_version() was compiled out of the x86_64 code path in the commit
f2c2cca3acef8b253a36381d9b469ad4fb08563a:

Author: Andi Kleen <ak@suse.de>
Date:   Tue Sep 26 10:52:37 2006 +0200

    [PATCH] Remove APIC version/cpu capability mpparse checking/printing

    ACPI went to great trouble to get the APIC version and CPU capabilities
    of different CPUs before passing them to the mpparser. But all
    that data was used was to print it out.  Actually it even faked some data
    based on the boot cpu, not on the actual CPU being booted.

    Remove all this code because it's not needed.

    Cc: len.brown@intel.com

At the time, the IOAPIC version number was deliberately not printed
in the x86_64 code path. However, after the x86 and x86_64 files were
merged, the net result is that the IOAPIC version is printed incorrectly
in the x86_64 code path.

The patch below provides a fix. I have tested it with acpi, and with
acpi=off, and did not see any problems.

Signed-off-by: Naga Chumbalkar <nagananda.chumbalkar@hp.com>
Acked-by: Yinghai Lu <yhlu.kernel@gmail.com>
LKML-Reference: <20090416014230.4885.94926.sendpatchset@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
*************************
---
 arch/x86/kernel/acpi/boot.c    | 5 +----
 arch/x86/kernel/apic/io_apic.c | 2 +-
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 844e5e25213ba..631086159c53b 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -985,11 +985,8 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
 	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
 	mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
 	mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
-	mp_ioapics[idx].apicver = 0;
-#endif
+
 	/*
 	 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
 	 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index ac7f3b6ad583e..f712f8ff403cd 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -4012,6 +4012,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
 
 	return apic_id;
 }
+#endif
 
 int __init io_apic_get_version(int ioapic)
 {
@@ -4024,7 +4025,6 @@ int __init io_apic_get_version(int ioapic)
 
 	return reg_01.bits.version;
 }
-#endif
 
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
-- 
GitLab


From 3d58829b0510244596079c1d2f1762c53aef2e97 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 28 May 2009 09:54:47 +0200
Subject: [PATCH 1473/2198] x86, apic: Restore irqs on fail paths

lapic_resume forgets to restore interrupts on fail paths.
Fix that.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <1243497289-18591-1-git-send-email-jirislaby@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Cc: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/kernel/apic/apic.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index b0fd26442c418..e82488d3f0bad 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2027,7 +2027,7 @@ static int lapic_resume(struct sys_device *dev)
 	unsigned int l, h;
 	unsigned long flags;
 	int maxlvt;
-	int ret;
+	int ret = 0;
 	struct IO_APIC_route_entry **ioapic_entries = NULL;
 
 	if (!apic_pm_state.active)
@@ -2038,14 +2038,15 @@ static int lapic_resume(struct sys_device *dev)
 		ioapic_entries = alloc_ioapic_entries();
 		if (!ioapic_entries) {
 			WARN(1, "Alloc ioapic_entries in lapic resume failed.");
-			return -ENOMEM;
+			ret = -ENOMEM;
+			goto restore;
 		}
 
 		ret = save_IO_APIC_setup(ioapic_entries);
 		if (ret) {
 			WARN(1, "Saving IO-APIC state failed: %d\n", ret);
 			free_ioapic_entries(ioapic_entries);
-			return ret;
+			goto restore;
 		}
 
 		mask_IO_APIC_setup(ioapic_entries);
@@ -2097,10 +2098,10 @@ static int lapic_resume(struct sys_device *dev)
 		restore_IO_APIC_setup(ioapic_entries);
 		free_ioapic_entries(ioapic_entries);
 	}
-
+restore:
 	local_irq_restore(flags);
 
-	return 0;
+	return ret;
 }
 
 /*
-- 
GitLab


From ea5cc87c63b49c133d15ec2911bb2e49e8124516 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 1 Jun 2009 22:31:03 -0300
Subject: [PATCH 1474/2198] perf_counter tools: Add string.[ch]

Add hex conversion libraries. We are going to replace sscanf()
uses with them.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/string.c | 34 ++++++++++++++++++++++++
 Documentation/perf_counter/util/string.h |  8 ++++++
 2 files changed, 42 insertions(+)
 create mode 100644 Documentation/perf_counter/util/string.c
 create mode 100644 Documentation/perf_counter/util/string.h

diff --git a/Documentation/perf_counter/util/string.c b/Documentation/perf_counter/util/string.c
new file mode 100644
index 0000000000000..ec33c0c7f4e2d
--- /dev/null
+++ b/Documentation/perf_counter/util/string.c
@@ -0,0 +1,34 @@
+#include "string.h"
+
+static int hex(char ch)
+{
+	if ((ch >= '0') && (ch <= '9'))
+		return ch - '0';
+	if ((ch >= 'a') && (ch <= 'f'))
+		return ch - 'a' + 10;
+	if ((ch >= 'A') && (ch <= 'F'))
+		return ch - 'A' + 10;
+	return -1;
+}
+
+/*
+ * While we find nice hex chars, build a long_val.
+ * Return number of chars processed.
+ */
+int hex2u64(const char *ptr, __u64 *long_val)
+{
+	const char *p = ptr;
+	*long_val = 0;
+
+	while (*p) {
+		const int hex_val = hex(*p);
+
+		if (hex_val < 0)
+			break;
+
+		*long_val = (*long_val << 4) | hex_val;
+		p++;
+	}
+
+	return p - ptr;
+}
diff --git a/Documentation/perf_counter/util/string.h b/Documentation/perf_counter/util/string.h
new file mode 100644
index 0000000000000..72812c1c9a7a2
--- /dev/null
+++ b/Documentation/perf_counter/util/string.h
@@ -0,0 +1,8 @@
+#ifndef _PERF_STRING_H_
+#define _PERF_STRING_H_
+
+#include <linux/types.h>
+
+int hex2u64(const char *ptr, __u64 *val);
+
+#endif
-- 
GitLab


From a0055ae2a4e13db9534c438cf8f3896181da6afc Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 1 Jun 2009 17:50:19 -0300
Subject: [PATCH 1475/2198] perf_counter tools: Use hex2u64 in more places

This has also a nice side effect, tools built on newer systems such as
fedora 10 again work on systems with older versions of glibc:

My workstation:

[acme@doppio ~]$ rpm -q glibc.x86_64
glibc-2.9-3.x86_64

Test machine:

[acme@emilia ~]$ rpm -q glibc.x86_64
glibc-2.5-24

Before:

[acme@emilia ~]$ perf
perf: /lib64/libc.so.6: version `GLIBC_2.7' not found (required by perf)
[acme@emilia ~]$ nm `which perf` | grep GLIBC_2\.7
                 U __isoc99_sscanf@@GLIBC_2.7
[acme@emilia ~]$

After:
[acme@emilia ~]$ perf
usage: perf [--version] [--help] COMMAND [ARGS]

The most commonly used perf commands are:
   record   Run a command and record its profile into perf.data
   report   Read perf.data (created by perf record) and display the
profile
   stat     Run a command and gather performance counter statistics
   top      Run a command and profile it

See 'perf help COMMAND' for more information on a specific command.
[acme@emilia ~]$ nm `which perf` | grep GLIBC_2\.7
[acme@emilia ~]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090601205019.GA7805@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile           |  2 +
 Documentation/perf_counter/builtin-record.c   | 56 +++++++++++--------
 Documentation/perf_counter/builtin-report.c   |  1 +
 .../perf_counter/util/parse-events.c          | 27 +++++----
 Documentation/perf_counter/util/symbol.c      | 38 +------------
 5 files changed, 54 insertions(+), 70 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 416ab11e9786d..3b8275feb8898 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -296,6 +296,7 @@ LIB_H += util/quote.h
 LIB_H += util/util.h
 LIB_H += util/help.h
 LIB_H += util/strbuf.h
+LIB_H += util/string.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
 LIB_H += util/symbol.h
@@ -315,6 +316,7 @@ LIB_OBJS += util/rbtree.o
 LIB_OBJS += util/run-command.o
 LIB_OBJS += util/quote.o
 LIB_OBJS += util/strbuf.o
+LIB_OBJS += util/string.o
 LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 96bfb7c5f1e4f..9c151ded22fab 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -5,6 +5,7 @@
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
+#include "util/string.h"
 
 #include <sched.h>
 
@@ -165,12 +166,10 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 {
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
-	pid_t spid, ppid;
 	char bf[BUFSIZ];
-	int fd, nr, ret;
-	char comm[18];
+	int fd, ret;
 	size_t size;
-	char state;
+	char *field, *sep;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
 
@@ -185,20 +184,22 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	}
 	close(fd);
 
+	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */
 	memset(&comm_ev, 0, sizeof(comm_ev));
-        nr = sscanf(bf, "%d %s %c %d %d ",
-			&spid, comm, &state, &ppid, &comm_ev.pid);
-	if (nr != 5) {
-		fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
-			filename);
-		exit(EXIT_FAILURE);
-	}
+	field = strchr(bf, '(');
+	if (field == NULL)
+		goto out_failure;
+	sep = strchr(++field, ')');
+	if (sep == NULL)
+		goto out_failure;
+	size = sep - field;
+	memcpy(comm_ev.comm, field, size++);
+	field = strchr(sep + 4, ' ');
+	if (field == NULL)
+		goto out_failure;
+	comm_ev.pid = atoi(++field);
 	comm_ev.header.type = PERF_EVENT_COMM;
 	comm_ev.tid = pid;
-	size = strlen(comm);
-	comm[--size] = '\0'; /* Remove the ')' at the end */
-	--size; /* Remove the '(' at the begin */
-	memcpy(comm_ev.comm, comm + 1, size);
 	size = ALIGN(size, sizeof(uint64_t));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
@@ -208,6 +209,11 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 		exit(-1);
 	}
 	return comm_ev.pid;
+out_failure:
+	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
+		filename);
+	exit(EXIT_FAILURE);
+	return -1;
 }
 
 static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
@@ -223,23 +229,25 @@ static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
 		exit(EXIT_FAILURE);
 	}
 	while (1) {
-		char bf[BUFSIZ];
-		unsigned char vm_read, vm_write, vm_exec, vm_mayshare;
+		char bf[BUFSIZ], *pbf = bf;
 		struct mmap_event mmap_ev = {
 			.header.type = PERF_EVENT_MMAP,
 		};
-		unsigned long ino;
-		int major, minor;
+		int n;
 		size_t size;
 		if (fgets(bf, sizeof(bf), fp) == NULL)
 			break;
 
 		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */
-		sscanf(bf, "%llx-%llx %c%c%c%c %llx %x:%x %lu",
-			&mmap_ev.start, &mmap_ev.len,
-                        &vm_read, &vm_write, &vm_exec, &vm_mayshare,
-                        &mmap_ev.pgoff, &major, &minor, &ino);
-		if (vm_exec == 'x') {
+		n = hex2u64(pbf, &mmap_ev.start);
+		if (n < 0)
+			continue;
+		pbf += n + 1;
+		n = hex2u64(pbf, &mmap_ev.len);
+		if (n < 0)
+			continue;
+		pbf += n + 3;
+		if (*pbf == 'x') { /* vm_exec */
 			char *execname = strrchr(bf, ' ');
 
 			if (execname == NULL || execname[1] != '/')
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 4705679debae1..7973092094e13 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -5,6 +5,7 @@
 #include "util/cache.h"
 #include "util/rbtree.h"
 #include "util/symbol.h"
+#include "util/string.h"
 
 #include "perf.h"
 
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 88c903eb260a7..2fdfd1d923f2a 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -4,6 +4,7 @@
 #include "parse-options.h"
 #include "parse-events.h"
 #include "exec_cmd.h"
+#include "string.h"
 
 int nr_counters;
 
@@ -105,22 +106,26 @@ static __u64 match_event_symbols(const char *str)
 	__u64 config, id;
 	int type;
 	unsigned int i;
-	char mask_str[4];
+	const char *sep, *pstr;
 
-	if (sscanf(str, "r%llx", &config) == 1)
+	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0)
 		return config | PERF_COUNTER_RAW_MASK;
 
-	switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) {
-		case 3:
-			if (strchr(mask_str, 'k'))
+	pstr = str;
+	sep = strchr(pstr, ':');
+	if (sep) {
+		type = atoi(pstr);
+		pstr = sep + 1;
+		id = atoi(pstr);
+		sep = strchr(pstr, ':');
+		if (sep) {
+			pstr = sep + 1;
+			if (strchr(pstr, 'k'))
 				event_mask[nr_counters] |= EVENT_MASK_USER;
-			if (strchr(mask_str, 'u'))
+			if (strchr(pstr, 'u'))
 				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
-		case 2:
-			return EID(type, id);
-
-		default:
-			break;
+		}
+		return EID(type, id);
 	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 47281210443de..31e8fae584264 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -1,5 +1,6 @@
 #include "util.h"
 #include "../perf.h"
+#include "string.h"
 #include "symbol.h"
 
 #include <libelf.h>
@@ -122,39 +123,6 @@ size_t dso__fprintf(struct dso *self, FILE *fp)
 	return ret;
 }
 
-static int hex(char ch)
-{
-	if ((ch >= '0') && (ch <= '9'))
-		return ch - '0';
-	if ((ch >= 'a') && (ch <= 'f'))
-		return ch - 'a' + 10;
-	if ((ch >= 'A') && (ch <= 'F'))
-		return ch - 'A' + 10;
-	return -1;
-}
-
-/*
- * While we find nice hex chars, build a long_val.
- * Return number of chars processed.
- */
-static int hex2long(char *ptr, unsigned long *long_val)
-{
-	const char *p = ptr;
-	*long_val = 0;
-
-	while (*p) {
-		const int hex_val = hex(*p);
-
-		if (hex_val < 0)
-			break;
-
-		*long_val = (*long_val << 4) | hex_val;
-		p++;
-	}
-
-	return p - ptr;
-}
-
 static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 {
 	struct rb_node *nd, *prevnd;
@@ -166,7 +134,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 		goto out_failure;
 
 	while (!feof(file)) {
-		unsigned long start;
+		__u64 start;
 		struct symbol *sym;
 		int line_len, len;
 		char symbol_type;
@@ -180,7 +148,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
 
 		line[--line_len] = '\0'; /* \n */
 
-		len = hex2long(line, &start);
+		len = hex2u64(line, &start);
 
 		len++;
 		if (len + 2 >= line_len)
-- 
GitLab


From c8c96525f3c25f43a4fa230e293c4976c0c36cc1 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 1 Jun 2009 17:50:57 -0300
Subject: [PATCH 1476/2198] perf_counter tools: Add missing rb_erase in
 dso__delete_symbols

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090601205057.GB7805@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/symbol.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 31e8fae584264..039931fcb1b54 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -58,6 +58,7 @@ static void dso__delete_symbols(struct dso *self)
 	while (next) {
 		pos = rb_entry(next, struct symbol, rb_node);
 		next = rb_next(&pos->rb_node);
+		rb_erase(&pos->rb_node, &self->syms);
 		symbol__delete(pos, self->sym_priv_size);
 	}
 }
-- 
GitLab


From 4778541470cf7d074acd998fd40c06b94711e4ad Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Fri, 29 May 2009 07:41:26 +0000
Subject: [PATCH 1477/2198] sh: fix CONFIG_SH_PCLK_FREQ bug for sh7724

CONFIG_SH_PCLK_FREQ=33333333 is correct for sh7724.
sh7724 master clock is 33333333, but peripheral is 41666666.
This bug came to light because sh-sci driver had changed clk
from "module_clk" to "peripheral_clk"

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                  | 3 +--
 arch/sh/configs/se7724_defconfig | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e9392aa8c5614..dec8b7b8dc6cf 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -502,8 +502,7 @@ config SH_PCLK_FREQ
 			      CPU_SUBTYPE_SH7760 || CPU_SUBTYPE_SH7705 || \
 			      CPU_SUBTYPE_SH7203 || CPU_SUBTYPE_SH7206 || \
 			      CPU_SUBTYPE_SH7263 || CPU_SUBTYPE_MXG    || \
-			      CPU_SUBTYPE_SH7786
-	default "41666666" if CPU_SUBTYPE_SH7724
+			      CPU_SUBTYPE_SH7786 || CPU_SUBTYPE_SH7724
 	default "60000000" if CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R
 	default "66000000" if CPU_SUBTYPE_SH4_202
 	default "50000000"
diff --git a/arch/sh/configs/se7724_defconfig b/arch/sh/configs/se7724_defconfig
index a029d0fb11635..96d2587467e6e 100644
--- a/arch/sh/configs/se7724_defconfig
+++ b/arch/sh/configs/se7724_defconfig
@@ -243,7 +243,7 @@ CONFIG_SH_7724_SOLUTION_ENGINE=y
 #
 CONFIG_SH_TIMER_TMU=y
 # CONFIG_SH_TIMER_CMT is not set
-CONFIG_SH_PCLK_FREQ=41666666
+CONFIG_SH_PCLK_FREQ=33333333
 # CONFIG_NO_HZ is not set
 # CONFIG_HIGH_RES_TIMERS is not set
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
-- 
GitLab


From dd0a3e77c825c9f5c6d2a97deb047f8d52026581 Mon Sep 17 00:00:00 2001
From: SUGIOKA Toshinobu <sugioka@itonet.co.jp>
Date: Mon, 1 Jun 2009 03:53:41 +0000
Subject: [PATCH 1478/2198] serial: sh-sci: Fix up PORT_SCI console output
 ordering.

Fix SCI transmission sequence in console output function.

This reorders the write sequence to match the SH-3 manual, and corrects
a console corruption bug observed on SH-3 SCI.

Signed-off-by: Toshinobu Sugioka <sugioka@itonet.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 drivers/serial/sh-sci.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c
index fa4d52a6c032a..a4cf1079b3129 100644
--- a/drivers/serial/sh-sci.c
+++ b/drivers/serial/sh-sci.c
@@ -151,9 +151,8 @@ static void sci_poll_put_char(struct uart_port *port, unsigned char c)
 		status = sci_in(port, SCxSR);
 	} while (!(status & SCxSR_TDxE(port)));
 
-	sci_in(port, SCxSR);            /* Dummy read */
-	sci_out(port, SCxSR, SCxSR_TDxE_CLEAR(port) & ~SCxSR_TEND(port));
 	sci_out(port, SCxTDR, c);
+	sci_out(port, SCxSR, SCxSR_TDxE_CLEAR(port) & ~SCxSR_TEND(port));
 }
 #endif /* CONFIG_CONSOLE_POLL || CONFIG_SERIAL_SH_SCI_CONSOLE */
 
-- 
GitLab


From d974ac24b762296eeaebb23e5eff8ed15ce44529 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:13 +0000
Subject: [PATCH 1479/2198] sh: add RAMCR definition for sh4

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/cpu-sh4/cpu/cache.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/sh/include/cpu-sh4/cpu/cache.h b/arch/sh/include/cpu-sh4/cpu/cache.h
index 1c61ebf5c8e3b..7bfb9e8b069c2 100644
--- a/arch/sh/include/cpu-sh4/cpu/cache.h
+++ b/arch/sh/include/cpu-sh4/cpu/cache.h
@@ -38,5 +38,7 @@
 #define CACHE_IC_ADDRESS_ARRAY	0xf0000000
 #define CACHE_OC_ADDRESS_ARRAY	0xf4000000
 
+#define RAMCR			0xFF000074
+
 #endif /* __ASM_CPU_SH4_CACHE_H */
 
-- 
GitLab


From fab88d9fe98e9091aafeb9789fbf2e04fdace8ed Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:20 +0000
Subject: [PATCH 1480/2198] sh: add weak l2_cache_init function.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/init.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/sh/kernel/cpu/init.c b/arch/sh/kernel/cpu/init.c
index d29e69c156f0a..ad85421099cdd 100644
--- a/arch/sh/kernel/cpu/init.c
+++ b/arch/sh/kernel/cpu/init.c
@@ -62,6 +62,11 @@ static void __init speculative_execution_init(void)
 #define speculative_execution_init()	do { } while (0)
 #endif
 
+/* 2nd-level cache init */
+void __uses_jump_to_uncached __attribute__ ((weak)) l2_cache_init(void)
+{
+}
+
 /*
  * Generic first-level cache init
  */
@@ -146,6 +151,8 @@ static void __uses_jump_to_uncached cache_init(void)
 	flags &= ~CCR_CACHE_ENABLE;
 #endif
 
+	l2_cache_init();
+
 	ctrl_outl(flags, CCR);
 	back_to_cached();
 }
-- 
GitLab


From b4bd9eb0d8f5c20f942ff037c8a7939f69cb188a Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:25 +0000
Subject: [PATCH 1481/2198] sh: sh7724: L2 cache initialization.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 191f0e2a7e08b..000f3b82669b6 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -495,6 +495,15 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh7724_early_devices));
 }
 
+#define RAMCR_CACHE_L2FC	0x0002
+#define RAMCR_CACHE_L2E		0x0001
+#define L2_CACHE_ENABLE		(RAMCR_CACHE_L2E|RAMCR_CACHE_L2FC)
+void __uses_jump_to_uncached l2_cache_init(void)
+{
+	/* Enable L2 cache */
+	ctrl_outl(L2_CACHE_ENABLE, RAMCR);
+}
+
 enum {
 	UNUSED = 0,
 
-- 
GitLab


From 138f025267dcc07d5e7d0bb1f20e9a6b5f2fdcf7 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Tue, 2 Jun 2009 02:49:28 +0000
Subject: [PATCH 1482/2198] sh: sh7723: L2 cache initialization.

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
index 04cb4aae7ea75..d8f4a13aeff9d 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c
@@ -14,6 +14,7 @@
 #include <linux/serial_sci.h>
 #include <linux/uio_driver.h>
 #include <linux/sh_timer.h>
+#include <linux/io.h>
 #include <asm/clock.h>
 #include <asm/mmzone.h>
 
@@ -485,6 +486,15 @@ void __init plat_early_device_setup(void)
 				   ARRAY_SIZE(sh7723_early_devices));
 }
 
+#define RAMCR_CACHE_L2FC	0x0002
+#define RAMCR_CACHE_L2E		0x0001
+#define L2_CACHE_ENABLE		(RAMCR_CACHE_L2E|RAMCR_CACHE_L2FC)
+void __uses_jump_to_uncached l2_cache_init(void)
+{
+	/* Enable L2 cache */
+	ctrl_outl(L2_CACHE_ENABLE, RAMCR);
+}
+
 enum {
 	UNUSED=0,
 
-- 
GitLab


From 2af15d6a44b871ad4c2a651302374cde8f335480 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 13:37:24 -0400
Subject: [PATCH 1483/2198] ftrace: add kernel command line function filtering

When using ftrace=function on the command line to trace functions
on boot up, one can not filter out functions that are commonly called.

This patch adds two new ftrace command line commands.

  ftrace_notrace=function-list
  ftrace_filter=function-list

Where function-list is a comma separated list of functions to filter.
The ftrace_notrace will make the functions listed not be included
in the function tracing, and ftrace_filter will only trace the functions
listed.

These two act the same as the debugfs/tracing/set_ftrace_notrace and
debugfs/tracing/set_ftrace_filter respectively.

The simple glob expressions that are allowed by the filter files can also
be used by the command line interface.

	ftrace_notrace=rcu*,*lock,*spin*

Will not trace any function that starts with rcu, ends with lock, or has
the word spin in it.

Note, if the self tests are enabled, they may interfere with the filtering
set by the command lines.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/kernel-parameters.txt | 17 ++++++++++--
 kernel/trace/ftrace.c               | 42 +++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9243dd84f4d6c..fcd3bfbe74e88 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -751,12 +751,25 @@ and is between 256 and 4096 characters. It is defined in the file
 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
 	ftrace=[tracer]
-			[ftrace] will set and start the specified tracer
+			[FTRACE] will set and start the specified tracer
 			as early as possible in order to facilitate early
 			boot debugging.
 
 	ftrace_dump_on_oops
-			[ftrace] will dump the trace buffers on oops.
+			[FTRACE] will dump the trace buffers on oops.
+
+	ftrace_filter=[function-list]
+			[FTRACE] Limit the functions traced by the function
+			tracer at boot up. function-list is a comma separated
+			list of functions. This list can be changed at run
+			time by the set_ftrace_filter file in the debugfs
+			tracing directory. 
+
+	ftrace_notrace=[function-list]
+			[FTRACE] Do not trace the functions specified in
+			function-list. This list can be changed at run time
+			by the set_ftrace_notrace file in the debugfs
+			tracing directory.
 
 	gamecon.map[2|3]=
 			[HW,JOY] Multisystem joystick and NES/SNES/PSX pad
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 140699a9a8a72..2074e5b7766b3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -32,6 +32,7 @@
 #include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
+#include <asm/setup.h>
 
 #include "trace_output.h"
 #include "trace_stat.h"
@@ -2369,6 +2370,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
 	ftrace_set_regex(buf, len, reset, 0);
 }
 
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE		COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+static int __init set_ftrace_notrace(char *str)
+{
+	strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+static int __init set_ftrace_filter(char *str)
+{
+	strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+	return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+	char *func;
+
+	while (buf) {
+		func = strsep(&buf, ",");
+		ftrace_set_regex(func, strlen(func), 0, enable);
+	}
+}
+
+static void __init set_ftrace_early_filters(void)
+{
+	if (ftrace_filter_buf[0])
+		set_ftrace_early_filter(ftrace_filter_buf, 1);
+	if (ftrace_notrace_buf[0])
+		set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
 static int
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
@@ -2829,6 +2869,8 @@ void __init ftrace_init(void)
 	if (ret)
 		pr_warning("Failed to register trace ftrace module notifier\n");
 
+	set_ftrace_early_filters();
+
 	return;
  failed:
 	ftrace_disabled = 1;
-- 
GitLab


From 5e0a093910876882f91f1d4b8a1635a099e6c7ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 15:50:13 -0400
Subject: [PATCH 1484/2198] tracing: fix config options to not show when
 automatically selected

There are two options that are selected by all tracers, but we want
to have those options available when no tracer is selected. These are

 The event tracer and sched switch tracer.

The are enabled by all tracers, but if a tracer is not selected we want
the options to appear. All tracers including them select TRACING.
Thus what we would like to do is:

  config EVENT_TRACER
	bool "prompt"
	depends on TRACING
	select TRACING

But that gives us a bug in the kbuild system since we just created a
circular dependency. We only want the prompt to show when TRACING is off.

This patch adds GENERIC_TRACER that all tracers will select instead of
TRACING. The two options (sched switch and event tracer) will select
TRACING directly and depend on !GENERIC_TRACER. This solves the cicular
dependency.

[ Impact: hide options that are selected by default ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 43 ++++++++++++++++++++++++++++---------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a508b9d2adb8c..6e55cc3ac49dc 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -56,6 +56,13 @@ config CONTEXT_SWITCH_TRACER
 	select MARKERS
 	bool
 
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hidding of the automatic options options.
+
 config TRACING
 	bool
 	select DEBUG_FS
@@ -66,6 +73,10 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 
+config GENERIC_TRACER
+	bool
+	select TRACING
+
 #
 # Minimum requirements an architecture has to meet for us to
 # be able to offer generic tracing facilities:
@@ -95,7 +106,7 @@ config FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
 	select FRAME_POINTER
 	select KALLSYMS
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  Enable the kernel to trace every kernel function. This is done
@@ -126,7 +137,7 @@ config IRQSOFF_TRACER
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
 	select TRACE_IRQFLAGS
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in irqs-off critical
@@ -147,7 +158,7 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
-	select TRACING
+	select GENERIC_TRACER
 	select TRACER_MAX_TRACE
 	help
 	  This option measures the time spent in preemption off critical
@@ -166,7 +177,7 @@ config PREEMPT_TRACER
 config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
@@ -174,7 +185,7 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
 	help
@@ -183,6 +194,7 @@ config SCHED_TRACER
 
 config ENABLE_CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
+	depends on !GENERIC_TRACER
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	help
@@ -191,6 +203,7 @@ config ENABLE_CONTEXT_SWITCH_TRACER
 
 config ENABLE_EVENT_TRACING
 	bool "Trace various events in the kernel"
+	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
@@ -204,14 +217,14 @@ config ENABLE_EVENT_TRACING
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
-	select TRACING
+	select GENERIC_TRACER
 	select KALLSYMS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
 config BOOT_TRACER
 	bool "Trace boot initcalls"
-	select TRACING
+	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer helps developers to optimize boot times: it records
@@ -228,7 +241,7 @@ config BOOT_TRACER
 
 config TRACE_BRANCH_PROFILING
 	bool
-	select TRACING
+	select GENERIC_TRACER
 
 choice
 	prompt "Branch Profiling"
@@ -308,7 +321,7 @@ config BRANCH_TRACER
 config POWER_TRACER
 	bool "Trace power consumption behavior"
 	depends on X86
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer helps developers to analyze and optimize the kernels
 	  power management decisions, specifically the C-state and P-state
@@ -342,14 +355,14 @@ config STACK_TRACER
 config HW_BRANCH_TRACER
 	depends on HAVE_HW_BRANCH_TRACER
 	bool "Trace hw branches"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 
 config KMEMTRACE
 	bool "Trace SLAB allocations"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -369,7 +382,7 @@ config KMEMTRACE
 
 config WORKQUEUE_TRACER
 	bool "Trace workqueues"
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  The workqueue tracer provides some statistical informations
           about each cpu workqueue thread such as the number of the
@@ -385,7 +398,7 @@ config BLK_DEV_IO_TRACE
 	select RELAY
 	select DEBUG_FS
 	select TRACEPOINTS
-	select TRACING
+	select GENERIC_TRACER
 	select STACKTRACE
 	help
 	  Say Y here if you want to be able to trace the block layer actions
@@ -446,7 +459,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING
+	depends on GENERIC_TRACER
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -457,7 +470,7 @@ config FTRACE_STARTUP_TEST
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
 	depends on HAVE_MMIOTRACE_SUPPORT && PCI
-	select TRACING
+	select GENERIC_TRACER
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
 	  debugging and reverse engineering. It is called from the ioremap
-- 
GitLab


From 897f17a65389a26509bd0c79a9812d1c9ea8ea6f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 28 May 2009 16:31:21 -0400
Subject: [PATCH 1485/2198] tracing: combine the default tracers into one
 config

Both event tracer and sched switch plugin are selected by default
by all generic tracers. But if no generic tracer is enabled, their options
appear. But ether one of them will select the other, thus it only
makes sense to have the default tracers be selected by one option.

[ Impact: clean up kconfig menu ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/Kconfig | 19 +++----------------
 1 file changed, 3 insertions(+), 16 deletions(-)

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 6e55cc3ac49dc..4a13e5a01ce31 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -192,27 +192,14 @@ config SCHED_TRACER
 	  This tracer tracks the latency of the highest priority task
 	  to be scheduled in, starting from the point it has woken up.
 
-config ENABLE_CONTEXT_SWITCH_TRACER
-	bool "Trace process context switches"
-	depends on !GENERIC_TRACER
-	select TRACING
-	select CONTEXT_SWITCH_TRACER
-	help
-	  This tracer gets called from the context switch and records
-	  all switching of tasks.
-
-config ENABLE_EVENT_TRACING
-	bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+	bool "Trace process context switches and events"
 	depends on !GENERIC_TRACER
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
 	  allowing the user to pick and choose which trace point they
-	  want to trace.
-
-	  Note, all tracers enable event tracing. This option is
-	  only a convenience to enable event tracing when no other
-	  tracers are selected.
+	  want to trace. It also includes the sched_switch tracer plugin.
 
 config FTRACE_SYSCALLS
 	bool "Trace syscalls"
-- 
GitLab


From 6e25db44a7ad7eb380f4ec774ec00a8fcddea112 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 29 May 2009 11:24:59 +0800
Subject: [PATCH 1486/2198] tracing/events: fix a typo in __string() format
 output

"tsize" should be "\tsize". Also remove the space before "__str_loc".

Before:
 # cat tracing/events/irq/irq_handler_entry/format
        ...
        field:int irq;  offset:12;      size:4;
        field: __str_loc name;  offset:16;tsize:2;
        ...

After:
 # cat tracing/events/irq/irq_handler_entry/format
	...
        field:int irq;  offset:12;      size:4;
        field:__str_loc name;   offset:16;      size:2;
	...

[ Impact: standardize __string field description in events format file ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b4ec83ae711fe..9276ec4f34de0 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -209,8 +209,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 
 #undef __string
 #define __string(item, src)						       \
-	ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t"	       \
-			       "offset:%u;tsize:%u;\n",			       \
+	ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t"	       \
+			       "offset:%u;\tsize:%u;\n",		       \
 			       (unsigned int)offsetof(typeof(field),	       \
 					__str_loc_##item),		       \
 			       (unsigned int)sizeof(field.__str_loc_##item));  \
-- 
GitLab


From a9c1c3abe1160a5632e48c929b02b740556bf423 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 1 Jun 2009 15:35:13 +0800
Subject: [PATCH 1487/2198] tracing/events: put TP_fast_assign into braces

Currently TP_fast_assign has a limitation that we can't define local
variables in it.

Here's one use case when we introduce __dynamic_array():

TP_fast_assign(
	type *p = __get_dynamic_array(item);

	foo(p);
	bar(p);
),

[ Impact: allow defining local variables in TP_fast_assign ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2384B1.90100@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 9276ec4f34de0..ee9268222448f 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -471,7 +471,7 @@ static void ftrace_raw_event_##call(proto)				\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-	assign;								\
+	{ assign; }							\
 									\
 	if (!filter_current_check_discard(event_call, entry, event))	\
 		trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
-- 
GitLab


From 7fcb7c472f455d1711eb5a7633204dba8800a6d6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Mon, 1 Jun 2009 15:35:46 +0800
Subject: [PATCH 1488/2198] tracing/events: introduce __dynamic_array()

__string() is limited:

  - it's a char array, but we may want to define array with other types
  - a source string should be available, but we may just know the string size

We introduce __dynamic_array() to break those limitations, and __string()
becomes a wrapper of it. As a side effect, now __get_str() can be used
in TP_fast_assign but not only TP_print.

Take XFS for example, we have the string length in the dirent, but the
string itself is not NULL-terminated, so __dynamic_array() can be used:

TRACE_EVENT(xfs_dir2,
	TP_PROTO(struct xfs_da_args *args),
	TP_ARGS(args),

	TP_STRUCT__entry(
		__field(int, namelen)
		__dynamic_array(char, name, args->namelen + 1)
		...
	),

	TP_fast_assign(
		char *name = __get_str(name);

		if (args->namelen)
			memcpy(name, args->name, args->namelen);
		name[args->namelen] = '\0';

		__entry->namelen = args->namelen;
	),

	TP_printk("name %.*s namelen %d",
		  __entry->namelen ? __get_str(name) : NULL
		  __entry->namelen)
);

[ Impact: allow defining dynamic size arrays ]

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2384D2.3080403@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h             | 122 +++++++++++++++++++++--------
 kernel/trace/trace_events_filter.c |   6 +-
 2 files changed, 91 insertions(+), 37 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index ee9268222448f..b5478dab579b7 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -18,14 +18,17 @@
 
 #include <linux/ftrace_event.h>
 
+#undef __field
+#define __field(type, item)		type	item;
+
 #undef __array
 #define __array(type, item, len)	type	item[len];
 
-#undef __field
-#define __field(type, item)		type	item;
+#undef __dynamic_array
+#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
 
 #undef __string
-#define __string(item, src)		unsigned short	__str_loc_##item;
+#define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TP_STRUCT__entry
 #define TP_STRUCT__entry(args...) args
@@ -35,7 +38,7 @@
 	struct ftrace_raw_##name {				\
 		struct trace_entry	ent;			\
 		tstruct						\
-		char			__str_data[0];		\
+		char			__data[0];		\
 	};							\
 	static struct ftrace_event_call event_##name
 
@@ -47,30 +50,31 @@
  *
  * Include the following:
  *
- * struct ftrace_str_offsets_<call> {
- *	int				<str1>;
- *	int				<str2>;
+ * struct ftrace_data_offsets_<call> {
+ *	int				<item1>;
+ *	int				<item2>;
  *	[...]
  * };
  *
- * The __string() macro will create each int <str>, this is to
- * keep the offset of each string from the beggining of the event
- * once we perform the strlen() of the src strings.
- *
+ * The __dynamic_array() macro will create each int <item>, this is
+ * to keep the offset of each array from the beginning of the event.
  */
 
+#undef __field
+#define __field(type, item);
+
 #undef __array
 #define __array(type, item, len)
 
-#undef __field
-#define __field(type, item);
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)	int item;
 
 #undef __string
-#define __string(item, src)	int item;
+#define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
-	struct ftrace_str_offsets_##call {				\
+	struct ftrace_data_offsets_##call {				\
 		tstruct;						\
 	};
 
@@ -119,8 +123,12 @@
 #undef TP_printk
 #define TP_printk(fmt, args...) fmt "\n", args
 
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)	\
+		((void *)__entry + __entry->__data_loc_##field)
+
 #undef __get_str
-#define __get_str(field)	((char *)__entry + __entry->__str_loc_##field)
+#define __get_str(field) (char *)__get_dynamic_array(field)
 
 #undef __print_flags
 #define __print_flags(flag, delim, flag_array...)			\
@@ -207,16 +215,19 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	if (!ret)							\
 		return 0;
 
-#undef __string
-#define __string(item, src)						       \
-	ret = trace_seq_printf(s, "\tfield:__str_loc " #item ";\t"	       \
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
 			       "offset:%u;\tsize:%u;\n",		       \
 			       (unsigned int)offsetof(typeof(field),	       \
-					__str_loc_##item),		       \
-			       (unsigned int)sizeof(field.__str_loc_##item));  \
+					__data_loc_##item),		       \
+			       (unsigned int)sizeof(field.__data_loc_##item)); \
 	if (!ret)							       \
 		return 0;
 
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
 #undef __entry
 #define __entry REC
 
@@ -260,11 +271,14 @@ ftrace_format_##call(struct trace_seq *s)				\
 	if (ret)							\
 		return ret;
 
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
+				offsetof(typeof(field), __data_loc_##item),    \
+				 sizeof(field.__data_loc_##item), 0);
+
 #undef __string
-#define __string(item, src)						       \
-	ret = trace_define_field(event_call, "__str_loc", #item,	       \
-				offsetof(typeof(field), __str_loc_##item),     \
-				 sizeof(field.__str_loc_##item), 0);
+#define __string(item, src) __dynamic_array(char, item, -1)
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
@@ -288,6 +302,43 @@ ftrace_define_fields_##call(void)					\
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/*
+ * remember the offset of each array from the beginning of the event.
+ */
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				\
+	__data_offsets->item = __data_size +				\
+			       offsetof(typeof(*entry), __data);	\
+	__data_size += (len) * sizeof(type);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)       \
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+static inline int ftrace_get_offsets_##call(				\
+	struct ftrace_data_offsets_##call *__data_offsets, proto)       \
+{									\
+	int __data_size = 0;						\
+	struct ftrace_raw_##call __maybe_unused *entry;			\
+									\
+	tstruct;							\
+									\
+	return __data_size;						\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
 /*
  * Stage 4 of the trace events.
  *
@@ -432,15 +483,15 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
 #undef __array
 #define __array(type, item, len)
 
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				\
+	__entry->__data_loc_##item = __data_offsets.item;
+
 #undef __string
-#define __string(item, src)						\
-	__str_offsets.item = __str_size +				\
-			     offsetof(typeof(*entry), __str_data);	\
-	__str_size += strlen(src) + 1;
+#define __string(item, src) __dynamic_array(char, item, -1)       	\
 
 #undef __assign_str
 #define __assign_str(dst, src)						\
-	__entry->__str_loc_##dst = __str_offsets.dst;			\
 	strcpy(__get_str(dst), src);
 
 #undef TRACE_EVENT
@@ -451,26 +502,29 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
-	struct ftrace_str_offsets_##call __maybe_unused __str_offsets;	\
+	struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
 	struct ftrace_event_call *event_call = &event_##call;		\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
-	int __str_size = 0;						\
+	int __data_size;						\
 	int pc;								\
 									\
 	local_save_flags(irq_flags);					\
 	pc = preempt_count();						\
 									\
-	tstruct;							\
+	__data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
 									\
 	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				 sizeof(struct ftrace_raw_##call) + __str_size,\
+				 sizeof(*entry) + __data_size,		\
 				 irq_flags, pc);			\
 	if (!event)							\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
+									\
+	tstruct								\
+									\
 	{ assign; }							\
 									\
 	if (!filter_current_check_discard(event_call, entry, event))	\
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index a7430b16d2439..db6e54bdb596d 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -478,12 +478,12 @@ enum {
 
 static int is_string_field(const char *type)
 {
+	if (strstr(type, "__data_loc") && strstr(type, "char"))
+		return FILTER_DYN_STRING;
+
 	if (strchr(type, '[') && strstr(type, "char"))
 		return FILTER_STATIC_STRING;
 
-	if (!strcmp(type, "__str_loc"))
-		return FILTER_DYN_STRING;
-
 	return 0;
 }
 
-- 
GitLab


From ec081ddc3d90aab35bc0de19a358b964978837cf Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Mon, 1 Jun 2009 15:53:35 +0100
Subject: [PATCH 1489/2198] tracing: add exports to use __print_symbolic and
 __print_flags from a module

A patch to allow the use of __print_symbolic and __print_flags
from a module. This allows the current GFS2 tracing patch to
build.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
LKML-Reference: <1243868015.29604.542.camel@localhost.localdomain>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c12d95db2f562..0fe3b223f7ed8 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -17,6 +17,7 @@
 static DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
 
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
@@ -250,6 +251,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	return p->buffer;
 }
+EXPORT_SYMBOL(ftrace_print_flags_seq);
 
 const char *
 ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
@@ -275,6 +277,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 
 	return p->buffer;
 }
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
-- 
GitLab


From 1d080d6c3141623c92caaebe20e847cb99ccbb60 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 12:20:40 -0400
Subject: [PATCH 1490/2198] tracing: remove redundant SOFTIRQ from softirq
 event traces

After converting the softirq tracer to use te flags options, this
caused a regression with the name. Since the flag was used directly
it was printed out (i.e. HRTIMER_SOFTIRQ).

This patch only shows the softirq name without the SOFTIRQ part.

[ Impact: fix regression of output from softirq events ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/irq.h | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
index 683fb36a9943e..b0c7ede55eb15 100644
--- a/include/trace/events/irq.h
+++ b/include/trace/events/irq.h
@@ -7,18 +7,18 @@
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM irq
 
-#define softirq_name(sirq) { sirq, #sirq }
-#define show_softirq_name(val)						\
-	__print_symbolic(val,						\
-			 softirq_name(HI_SOFTIRQ),			\
-			 softirq_name(TIMER_SOFTIRQ),			\
-			 softirq_name(NET_TX_SOFTIRQ),			\
-			 softirq_name(NET_RX_SOFTIRQ),			\
-			 softirq_name(BLOCK_SOFTIRQ),			\
-			 softirq_name(TASKLET_SOFTIRQ),			\
-			 softirq_name(SCHED_SOFTIRQ),			\
-			 softirq_name(HRTIMER_SOFTIRQ),			\
-			 softirq_name(RCU_SOFTIRQ))
+#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
+#define show_softirq_name(val)			\
+	__print_symbolic(val,			\
+			 softirq_name(HI),	\
+			 softirq_name(TIMER),	\
+			 softirq_name(NET_TX),	\
+			 softirq_name(NET_RX),	\
+			 softirq_name(BLOCK),	\
+			 softirq_name(TASKLET),	\
+			 softirq_name(SCHED),	\
+			 softirq_name(HRTIMER),	\
+			 softirq_name(RCU))
 
 /**
  * irq_handler_entry - called immediately before the irq action handler
-- 
GitLab


From 112f38a7e36e9d688b389507136bf3af3e6d159b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 15:16:05 -0400
Subject: [PATCH 1491/2198] tracing: make trace pipe recognize latency format
 flag

The trace_pipe did not recognize the latency format flag and would produce
different output than the trace file. The problem was partly due that
the trace flags in the iterator was not set as well as the trace_pipe
zeros out part of the iterator (including the flags) to be able to use
the same routines as the trace file. trace_flags of the iterator should
not cause any problems when not zeroed out by for trace_pipe.

Reported-by: Johannes Berg <johannes@sipsolutions.net>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace_event.h | 2 +-
 kernel/trace/trace.c         | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index bbf40f624fc8d..5c093ffc655bf 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -51,6 +51,7 @@ struct trace_iterator {
 	int			cpu_file;
 	struct mutex		mutex;
 	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+	unsigned long		iter_flags;
 
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
@@ -58,7 +59,6 @@ struct trace_iterator {
 	int			cpu;
 	u64			ts;
 
-	unsigned long		iter_flags;
 	loff_t			pos;
 	long			idx;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a3a8a87d7e912..cae34c69752fe 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2826,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 
+	if (trace_flags & TRACE_ITER_LATENCY_FMT)
+		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	mutex_init(&iter->mutex);
-- 
GitLab


From 0f6ce3de4ef6ff940308087c49760d068851c1a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Jun 2009 21:51:28 -0400
Subject: [PATCH 1492/2198] ftrace: do not profile functions when disabled

A race was found that if one were to enable and disable the function
profiler repeatedly, then the system can panic. This was because a profiled
function may be preempted just before disabling interrupts. While
the profiler is disabled and then reenabled, the preempted function
could start again, and access the hash as it is being initialized.

This just adds a check in the irq disabled part to check if the profiler
is enabled, and if it is not then it will just exit.

When the system is disabled, the profile_enabled variable is cleared
before calling the unregistering of the function profiler. This
unregistering calls stop machine which also acts as a synchronize schedule.

[ Impact: fix panic in enabling/disabling function profiler ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2074e5b7766b3..d6973dfadb36d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -599,7 +599,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_save(flags);
 
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	rec = ftrace_find_profiled_func(stat, ip);
@@ -630,7 +630,7 @@ static void profile_graph_return(struct ftrace_graph_ret *trace)
 
 	local_irq_save(flags);
 	stat = &__get_cpu_var(ftrace_profile_stats);
-	if (!stat->hash)
+	if (!stat->hash || !ftrace_profile_enabled)
 		goto out;
 
 	calltime = trace->rettime - trace->calltime;
@@ -724,6 +724,10 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
 			ftrace_profile_enabled = 1;
 		} else {
 			ftrace_profile_enabled = 0;
+			/*
+			 * unregister_ftrace_profiler calls stop_machine
+			 * so this acts like an synchronize_sched.
+			 */
 			unregister_ftrace_profiler();
 		}
 	}
-- 
GitLab


From fbeb4a9c20d00e2550156f9e5a34473fbde59de2 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Mon, 1 Jun 2009 22:47:19 -0500
Subject: [PATCH 1493/2198] tomoyo: avoid get+put of task_struct

Use task_cred_xxx(task, security) in tomoyo_real_domain() to
avoid a get+put of the target cred.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/tomoyo.h | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/security/tomoyo/tomoyo.h b/security/tomoyo/tomoyo.h
index 41c6ebafb9c56..f12d5ada7dcbb 100644
--- a/security/tomoyo/tomoyo.h
+++ b/security/tomoyo/tomoyo.h
@@ -90,17 +90,10 @@ static inline struct tomoyo_domain_info *tomoyo_domain(void)
 	return current_cred()->security;
 }
 
-/* Caller holds tasklist_lock spinlock. */
 static inline struct tomoyo_domain_info *tomoyo_real_domain(struct task_struct
 							    *task)
 {
-	/***** CRITICAL SECTION START *****/
-	const struct cred *cred = get_task_cred(task);
-	struct tomoyo_domain_info *domain = cred->security;
-
-	put_cred(cred);
-	return domain;
-	/***** CRITICAL SECTION END *****/
+	return task_cred_xxx(task, security);
 }
 
 #endif /* !defined(_SECURITY_TOMOYO_TOMOYO_H) */
-- 
GitLab


From 53c663ce0f39ba8e8ef652e400b317bc60ac7f19 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Tue, 2 Jun 2009 08:44:01 +0200
Subject: [PATCH 1494/2198] block: fix a possible oops on elv_abort_queue()

I found one more mis-conversion to the 'request is always dequeued
when completing' model in elv_abort_queue() during code inspection.
Although I haven't hit any problem caused by this mis-conversion yet
and just done compile/boot test, please apply if you have no problem.

Request must be dequeued when it completes.
However, elv_abort_queue() completes requests without dequeueing.
This will cause oops in the __blk_end_request_all().
This patch fixes the oops.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/elevator.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/block/elevator.c b/block/elevator.c
index bd78f693fcf28..a029cfed80dae 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -813,6 +813,11 @@ void elv_abort_queue(struct request_queue *q)
 		rq = list_entry_rq(q->queue_head.next);
 		rq->cmd_flags |= REQ_QUIET;
 		trace_block_rq_abort(q, rq);
+		/*
+		 * Mark this request as started so we don't trigger
+		 * any debug logic in the end I/O path.
+		 */
+		blk_start_request(rq);
 		__blk_end_request_all(rq, -EIO);
 	}
 }
-- 
GitLab


From a12af1ebe675e85831fde3c4d0908fc3b0908b7a Mon Sep 17 00:00:00 2001
From: Abhijith Das <adas@redhat.com>
Date: Mon, 1 Jun 2009 12:30:03 -0500
Subject: [PATCH 1495/2198] GFS2: smbd proccess hangs with flock() call.

GFS2 currently does not support mandatory flocks. An flock() call with
LOCK_MAND triggers unexpected behavior because gfs2 is not checking for
this lock type. This patch corrects that.

Signed-off-by: Abhi Das <adas@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 73b6f552f06dd..841ddc9793881 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -698,8 +698,8 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
-	if (__mandatory_lock(&ip->i_inode))
-		return -ENOLCK;
+	if (fl->fl_type & LOCK_MAND)
+		return -EOPNOTSUPP;
 
 	if (fl->fl_type == F_UNLCK) {
 		do_unflock(file, fl);
-- 
GitLab


From c25486c5ea3ea5586f71285a3aa5cfafb1db59f9 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 08:09:48 +0200
Subject: [PATCH 1496/2198] perf_counter tools: Make .gitignore reflect
 perf_counter tools files

Make .gitignore reflect perf_counter tools files so
git status doesn't gripe about untracked files.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/.gitignore | 187 ++------------------------
 1 file changed, 12 insertions(+), 175 deletions(-)

diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore
index 41c0b20a76ce0..d69a759a10466 100644
--- a/Documentation/perf_counter/.gitignore
+++ b/Documentation/perf_counter/.gitignore
@@ -1,179 +1,16 @@
-GIT-BUILD-OPTIONS
-GIT-CFLAGS
-GIT-GUI-VARS
-GIT-VERSION-FILE
-git
-git-add
-git-add--interactive
-git-am
-git-annotate
-git-apply
-git-archimport
-git-archive
-git-bisect
-git-bisect--helper
-git-blame
-git-branch
-git-bundle
-git-cat-file
-git-check-attr
-git-check-ref-format
-git-checkout
-git-checkout-index
-git-cherry
-git-cherry-pick
-git-clean
-git-clone
-git-commit
-git-commit-tree
-git-config
-git-count-objects
-git-cvsexportcommit
-git-cvsimport
-git-cvsserver
-git-daemon
-git-diff
-git-diff-files
-git-diff-index
-git-diff-tree
-git-difftool
-git-difftool--helper
-git-describe
-git-fast-export
-git-fast-import
-git-fetch
-git-fetch--tool
-git-fetch-pack
-git-filter-branch
-git-fmt-merge-msg
-git-for-each-ref
-git-format-patch
-git-fsck
-git-fsck-objects
-git-gc
-git-get-tar-commit-id
-git-grep
-git-hash-object
-git-help
-git-http-fetch
-git-http-push
-git-imap-send
-git-index-pack
-git-init
-git-init-db
-git-instaweb
-git-log
-git-lost-found
-git-ls-files
-git-ls-remote
-git-ls-tree
-git-mailinfo
-git-mailsplit
-git-merge
-git-merge-base
-git-merge-index
-git-merge-file
-git-merge-tree
-git-merge-octopus
-git-merge-one-file
-git-merge-ours
-git-merge-recursive
-git-merge-resolve
-git-merge-subtree
-git-mergetool
-git-mergetool--lib
-git-mktag
-git-mktree
-git-name-rev
-git-mv
-git-pack-redundant
-git-pack-objects
-git-pack-refs
-git-parse-remote
-git-patch-id
-git-peek-remote
-git-prune
-git-prune-packed
-git-pull
-git-push
-git-quiltimport
-git-read-tree
-git-rebase
-git-rebase--interactive
-git-receive-pack
-git-reflog
-git-relink
-git-remote
-git-repack
-git-repo-config
-git-request-pull
-git-rerere
-git-reset
-git-rev-list
-git-rev-parse
-git-revert
-git-rm
-git-send-email
-git-send-pack
-git-sh-setup
-git-shell
-git-shortlog
-git-show
-git-show-branch
-git-show-index
-git-show-ref
-git-stage
-git-stash
-git-status
-git-stripspace
-git-submodule
-git-svn
-git-symbolic-ref
-git-tag
-git-tar-tree
-git-unpack-file
-git-unpack-objects
-git-update-index
-git-update-ref
-git-update-server-info
-git-upload-archive
-git-upload-pack
-git-var
-git-verify-pack
-git-verify-tag
-git-web--browse
-git-whatchanged
-git-write-tree
-git-core-*/?*
-gitk-wish
-gitweb/gitweb.cgi
-test-chmtime
-test-ctype
-test-date
-test-delta
-test-dump-cache-tree
-test-genrandom
-test-match-trees
-test-parse-options
-test-path-utils
-test-sha1
-test-sigchain
+PERF-BUILD-OPTIONS
+PERF-CFLAGS
+PERF-GUI-VARS
+PERF-VERSION-FILE
+perf
+perf-help
+perf-record
+perf-report
+perf-stat
+perf-top
+perf*.1
+perf*.xml
 common-cmds.h
-*.tar.gz
-*.dsc
-*.deb
-git.spec
-*.exe
-*.[aos]
-*.py[co]
-config.mak
-autom4te.cache
-config.cache
-config.log
-config.status
-config.mak.autogen
-config.mak.append
-configure
 tags
 TAGS
 cscope*
-- 
GitLab


From c1079abd1d5e66edabeb04d75e48e604cd8c167f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 10:17:34 +0200
Subject: [PATCH 1497/2198] perf_counter tools: Cleanup Makefile

We currently build perf-stat/record etc, only to do nothing
with them.  We also install the perf binary in two places,
$prefix/bin and $perfexec_instdir, which appears to be for
binaries which perf would exec were a command not linked in.
Correct this, and comment out broken/incomplete targets dist
and coverage.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 141 ++++++++++++++--------------
 1 file changed, 73 insertions(+), 68 deletions(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 3b8275feb8898..eae88561233b4 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -260,8 +260,6 @@ PROGRAMS += perf
 
 # List built-in command $C whose implementation cmd_$C() is not in
 # builtin-$C.o but is linked in as part of some other command.
-BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS))
-
 #
 # None right now:
 #
@@ -791,12 +789,14 @@ export perfexec_instdir
 
 install: all
 	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
-	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-	$(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
+ifdef BUILT_INS
+	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
 ifneq (,$X)
 	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
 endif
+endif
 
 install-doc:
 	$(MAKE) -C Documentation install
@@ -824,52 +824,55 @@ quick-install-html:
 
 
 ### Maintainer's dist rules
-
-perf.spec: perf.spec.in
-	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
-	mv $@+ $@
-
-PERF_TARNAME=perf-$(PERF_VERSION)
-dist: perf.spec perf-archive$(X) configure
-	./perf-archive --format=tar \
-		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
-	@mkdir -p $(PERF_TARNAME)
-	@cp perf.spec configure $(PERF_TARNAME)
-	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-	$(TAR) rf $(PERF_TARNAME).tar \
-		$(PERF_TARNAME)/perf.spec \
-		$(PERF_TARNAME)/configure \
-		$(PERF_TARNAME)/version
-	@$(RM) -r $(PERF_TARNAME)
-	gzip -f -9 $(PERF_TARNAME).tar
-
-htmldocs = perf-htmldocs-$(PERF_VERSION)
-manpages = perf-manpages-$(PERF_VERSION)
-dist-doc:
-	$(RM) -r .doc-tmp-dir
-	mkdir .doc-tmp-dir
-	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-	gzip -n -9 -f $(htmldocs).tar
-	:
-	$(RM) -r .doc-tmp-dir
-	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-	$(MAKE) -C Documentation DESTDIR=./ \
-		man1dir=../.doc-tmp-dir/man1 \
-		man5dir=../.doc-tmp-dir/man5 \
-		man7dir=../.doc-tmp-dir/man7 \
-		install
-	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-	gzip -n -9 -f $(manpages).tar
-	$(RM) -r .doc-tmp-dir
-
-rpm: dist
-	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
+#
+# None right now
+#
+#
+# perf.spec: perf.spec.in
+#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
+#	mv $@+ $@
+#
+# PERF_TARNAME=perf-$(PERF_VERSION)
+# dist: perf.spec perf-archive$(X) configure
+#	./perf-archive --format=tar \
+#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
+#	@mkdir -p $(PERF_TARNAME)
+#	@cp perf.spec configure $(PERF_TARNAME)
+#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version
+#	$(TAR) rf $(PERF_TARNAME).tar \
+#		$(PERF_TARNAME)/perf.spec \
+#		$(PERF_TARNAME)/configure \
+#		$(PERF_TARNAME)/version
+#	@$(RM) -r $(PERF_TARNAME)
+#	gzip -f -9 $(PERF_TARNAME).tar
+#
+# htmldocs = perf-htmldocs-$(PERF_VERSION)
+# manpages = perf-manpages-$(PERF_VERSION)
+# dist-doc:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir .doc-tmp-dir
+#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
+#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
+#	gzip -n -9 -f $(htmldocs).tar
+#	:
+#	$(RM) -r .doc-tmp-dir
+#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
+#	$(MAKE) -C Documentation DESTDIR=./ \
+#		man1dir=../.doc-tmp-dir/man1 \
+#		man5dir=../.doc-tmp-dir/man5 \
+#		man7dir=../.doc-tmp-dir/man7 \
+#		install
+#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
+#	gzip -n -9 -f $(manpages).tar
+#	$(RM) -r .doc-tmp-dir
+#
+# rpm: dist
+#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
 
 ### Cleaning rules
 
 distclean: clean
-	$(RM) configure
+#	$(RM) configure
 
 clean:
 	$(RM) *.o */*.o $(LIB_FILE)
@@ -896,25 +899,27 @@ check-builtins::
 
 ### Test suite coverage testing
 #
-.PHONY: coverage coverage-clean coverage-build coverage-report
-
-coverage:
-	$(MAKE) coverage-build
-	$(MAKE) coverage-report
-
-coverage-clean:
-	rm -f *.gcda *.gcno
-
-COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
-COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
-
-coverage-build: coverage-clean
-	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
-	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
-		-j1 test
-
-coverage-report:
-	gcov -b *.c */*.c
-	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
-		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
-		| tee coverage-untested-functions
+# None right now
+#
+# .PHONY: coverage coverage-clean coverage-build coverage-report
+#
+# coverage:
+#	$(MAKE) coverage-build
+#	$(MAKE) coverage-report
+#
+# coverage-clean:
+#	rm -f *.gcda *.gcno
+#
+# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
+# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov
+#
+# coverage-build: coverage-clean
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
+#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
+#		-j1 test
+#
+# coverage-report:
+#	gcov -b *.c */*.c
+#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
+#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
+#		| tee coverage-untested-functions
-- 
GitLab


From 10a2825514a988225ac2e336c7a9502c4ca57c39 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 11:04:44 +0200
Subject: [PATCH 1498/2198] perf_counter tools: Fix uninitialized variable in
 perf-report.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

# make prefix=/usr/local V=1
gcc -o builtin-report.o -c -O2 -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement   -DSHA1_HEADER='<openssl/sha.h>'  builtin-report.c
cc1: warnings being treated as errors
builtin-report.c: In function ‘__cmd_report’:
builtin-report.c:626: error: ‘cwdlen’ may be used uninitialized in this function

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <new-submission>
---
 Documentation/perf_counter/builtin-report.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 7973092094e13..20a4e519dfd17 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -653,8 +653,10 @@ static int __cmd_report(void)
 			return EXIT_FAILURE;
 		}
 		cwdlen = strlen(cwd);
-	} else
+	} else {
 		cwdp = NULL;
+		cwdlen = 0;
+	}
 remap:
 	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
 			   MAP_SHARED, input, offset);
-- 
GitLab


From f38b082081bf69a06fffb8b32a175999e2320c5b Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 2 Jun 2009 21:05:16 +1000
Subject: [PATCH 1499/2198] perf_counter: Initialize per-cpu context earlier on
 cpu up

This arranges for perf_counter's notifier for cpu hotplug
operations to be called earlier than the migration notifier in
sched.c by increasing its priority to 20, compared to the 10
for the migration notifier.  The reason for doing this is that
a subsequent commit to convert the cpu migration counter to use
the generic swcounter infrastructure will add a call into the
perf_counter subsystem when tasks get migrated.  Therefore the
perf_counter subsystem needs a chance to initialize its per-cpu
data for the new cpu before it can get called from the
migration code.

This also adds a comment to the migration notifier noting that
its priority needs to be lower than that of the perf_counter
notifier.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18981.1900.792795.836858@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 4 ++++
 kernel/sched.c        | 6 ++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index df319c48c52b9..8d2653f137e90 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3902,8 +3902,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
+/*
+ * This has to have a higher priority than migration_notifier in sched.c.
+ */
 static struct notifier_block __cpuinitdata perf_cpu_nb = {
 	.notifier_call		= perf_cpu_notify,
+	.priority		= 20,
 };
 
 void __init perf_counter_init(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index ad079f07c9c8c..3226cc132e9fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7319,8 +7319,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
 	.notifier_call = migration_call,
-- 
GitLab


From 3f731ca60afc29f5bcdb5fd2a04391466313a9ac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:52:30 +1000
Subject: [PATCH 1500/2198] perf_counter: Fix cpu migration counter

This fixes the cpu migration software counter to count
correctly even when contexts get swapped from one task to
another.  Previously the cpu migration counts reported by perf
stat were bogus, ranging from negative to several thousand for
a single "lat_ctx 2 8 32" run.  With this patch the cpu
migration count reported for "lat_ctx 2 8 32" is almost always
between 35 and 44.

This fixes the problem by adding a call into the perf_counter
code from set_task_cpu when tasks are migrated.  This enables
us to use the generic swcounter code (with some modifications)
for the cpu migration counter.

This modifies the swcounter code to allow a NULL regs pointer
to be passed in to perf_swcounter_ctx_event() etc.  The cpu
migration counter does this because there isn't necessarily a
pt_regs struct for the task available.  In this case, the
counter will not have interrupt capability - but the migration
counter didn't have interrupt capability before, so this is no
loss.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.35006.819769.416327@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  4 ++
 kernel/perf_counter.c        | 74 ++++++++++--------------------------
 kernel/sched.c               |  1 +
 3 files changed, 26 insertions(+), 53 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 0e57d8cc5a3d2..deb9acf9ad2a3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -615,6 +615,8 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 
 extern void perf_counter_comm(struct task_struct *tsk);
 
+extern void perf_counter_task_migration(struct task_struct *task, int cpu);
+
 #define MAX_STACK_DEPTH		255
 
 struct perf_callchain_entry {
@@ -668,6 +670,8 @@ perf_counter_munmap(unsigned long addr, unsigned long len,
 
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
+static inline void perf_counter_task_migration(struct task_struct *task,
+					       int cpu)			{ }
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8d2653f137e90..cd94cf3bf9e2a 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2921,11 +2921,13 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	if (counter->hw_event.config != event_config)
 		return 0;
 
-	if (counter->hw_event.exclude_user && user_mode(regs))
-		return 0;
+	if (regs) {
+		if (counter->hw_event.exclude_user && user_mode(regs))
+			return 0;
 
-	if (counter->hw_event.exclude_kernel && !user_mode(regs))
-		return 0;
+		if (counter->hw_event.exclude_kernel && !user_mode(regs))
+			return 0;
+	}
 
 	return 1;
 }
@@ -2935,7 +2937,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 
-	if (counter->hw.irq_period && !neg)
+	if (counter->hw.irq_period && !neg && regs)
 		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
 
@@ -3151,55 +3153,24 @@ static const struct pmu perf_ops_task_clock = {
 /*
  * Software counter: cpu migrations
  */
-
-static inline u64 get_cpu_migrations(struct perf_counter *counter)
-{
-	struct task_struct *curr = counter->ctx->task;
-
-	if (curr)
-		return curr->se.nr_migrations;
-	return cpu_nr_migrations(smp_processor_id());
-}
-
-static void cpu_migrations_perf_counter_update(struct perf_counter *counter)
-{
-	u64 prev, now;
-	s64 delta;
-
-	prev = atomic64_read(&counter->hw.prev_count);
-	now = get_cpu_migrations(counter);
-
-	atomic64_set(&counter->hw.prev_count, now);
-
-	delta = now - prev;
-
-	atomic64_add(delta, &counter->count);
-}
-
-static void cpu_migrations_perf_counter_read(struct perf_counter *counter)
+void perf_counter_task_migration(struct task_struct *task, int cpu)
 {
-	cpu_migrations_perf_counter_update(counter);
-}
+	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+	struct perf_counter_context *ctx;
 
-static int cpu_migrations_perf_counter_enable(struct perf_counter *counter)
-{
-	if (counter->prev_state <= PERF_COUNTER_STATE_OFF)
-		atomic64_set(&counter->hw.prev_count,
-			     get_cpu_migrations(counter));
-	return 0;
-}
+	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
+				 PERF_COUNT_CPU_MIGRATIONS,
+				 1, 1, NULL, 0);
 
-static void cpu_migrations_perf_counter_disable(struct perf_counter *counter)
-{
-	cpu_migrations_perf_counter_update(counter);
+	ctx = perf_pin_task_context(task);
+	if (ctx) {
+		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
+					 PERF_COUNT_CPU_MIGRATIONS,
+					 1, 1, NULL, 0);
+		perf_unpin_context(ctx);
+	}
 }
 
-static const struct pmu perf_ops_cpu_migrations = {
-	.enable		= cpu_migrations_perf_counter_enable,
-	.disable	= cpu_migrations_perf_counter_disable,
-	.read		= cpu_migrations_perf_counter_read,
-};
-
 #ifdef CONFIG_EVENT_PROFILE
 void perf_tpcounter_event(int event_id)
 {
@@ -3272,11 +3243,8 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	case PERF_COUNT_PAGE_FAULTS_MIN:
 	case PERF_COUNT_PAGE_FAULTS_MAJ:
 	case PERF_COUNT_CONTEXT_SWITCHES:
-		pmu = &perf_ops_generic;
-		break;
 	case PERF_COUNT_CPU_MIGRATIONS:
-		if (!counter->hw_event.exclude_kernel)
-			pmu = &perf_ops_cpu_migrations;
+		pmu = &perf_ops_generic;
 		break;
 	}
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3226cc132e9fc..8d43347a0c0dd 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1977,6 +1977,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 		if (task_hot(p, old_rq->clock, NULL))
 			schedstat_inc(p, se.nr_forced2_migrations);
 #endif
+		perf_counter_task_migration(p, new_cpu);
 	}
 	p->se.vruntime -= old_cfsrq->min_vruntime -
 					 new_cfsrq->min_vruntime;
-- 
GitLab


From bf4e0ed3d027ce581be18496036862131b5f32aa Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Mon, 1 Jun 2009 17:53:16 +1000
Subject: [PATCH 1501/2198] perf_counter: Remove unused prev_state field

This removes the prev_state field of struct perf_counter since
it is now unused.  It was only used by the cpu migration
counter, which doesn't use it any more.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <18979.35052.915728.626374@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 1 -
 kernel/perf_counter.c        | 4 ----
 2 files changed, 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index deb9acf9ad2a3..d970fbc16afd5 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -427,7 +427,6 @@ struct perf_counter {
 	const struct pmu		*pmu;
 
 	enum perf_counter_active_state	state;
-	enum perf_counter_active_state	prev_state;
 	atomic64_t			count;
 
 	/*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index cd94cf3bf9e2a..fbed4d28ad7d7 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -572,7 +572,6 @@ group_sched_in(struct perf_counter *group_counter,
 	if (ret)
 		return ret < 0 ? ret : 0;
 
-	group_counter->prev_state = group_counter->state;
 	if (counter_sched_in(group_counter, cpuctx, ctx, cpu))
 		return -EAGAIN;
 
@@ -580,7 +579,6 @@ group_sched_in(struct perf_counter *group_counter,
 	 * Schedule in siblings as one group (if any):
 	 */
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) {
-		counter->prev_state = counter->state;
 		if (counter_sched_in(counter, cpuctx, ctx, cpu)) {
 			partial_group = counter;
 			goto group_error;
@@ -657,7 +655,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
 			       struct perf_counter_context *ctx)
 {
 	list_add_counter(counter, ctx);
-	counter->prev_state = PERF_COUNTER_STATE_OFF;
 	counter->tstamp_enabled = ctx->time;
 	counter->tstamp_running = ctx->time;
 	counter->tstamp_stopped = ctx->time;
@@ -820,7 +817,6 @@ static void __perf_counter_enable(void *info)
 	ctx->is_active = 1;
 	update_context_time(ctx);
 
-	counter->prev_state = counter->state;
 	if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
 		goto unlock;
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-- 
GitLab


From fe67e6f2d6df371b58ba721954d45a196df5e8b8 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 2 Jun 2009 17:00:45 +0900
Subject: [PATCH 1502/2198] TOMOYO: Remove unused mutex.

I forgot to remove on TOMOYO's 15th posting.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/domain.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 2d6748741a26e..ee43631f049be 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -296,7 +296,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
 	struct tomoyo_domain_keeper_entry *ptr;
 	const struct tomoyo_path_info *saved_domainname;
 	const struct tomoyo_path_info *saved_program = NULL;
-	static DEFINE_MUTEX(lock);
 	int error = -ENOMEM;
 	bool is_last_name = false;
 
-- 
GitLab


From 4a4b2d7684c66dbd8ed04eb284bc94a78e061d29 Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cca.cpqcorp.net>
Date: Tue, 2 Jun 2009 14:47:50 +0200
Subject: [PATCH 1503/2198] cciss: factor out core of sendcmd() for a more sane
 interface

Factor out the core of sendcmd() to provide a simpler interface which
exposes all the error information to the caller and make the original
sendcmd use this new function.  Rationale: The SCSI error handling
routines need to send commands with interrupts turned off, but they also
need access to the full error information.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c | 225 ++++++++++++++++++++----------------------
 1 file changed, 109 insertions(+), 116 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 94474f5f8bcee..8d0f8932fee76 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2373,41 +2373,21 @@ static int add_sendcmd_reject(__u8 cmd, int ctlr, unsigned long complete)
 	return 0;
 }
 
-/*
- * Send a command to the controller, and wait for it to complete.
- * Only used at init time.
+/* Send command c to controller h and poll for it to complete.
+ * Turns interrupts off on the board.  Used at driver init time
+ * and during SCSI error recovery.
  */
-static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, unsigned int use_unit_num,	/* 0: address the controller,
-												   1: address logical volume log_unit,
-												   2: periph device address is scsi3addr */
-		   unsigned int log_unit,
-		   __u8 page_code, unsigned char *scsi3addr, int cmd_type)
+static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
 {
-	CommandList_struct *c;
 	int i;
 	unsigned long complete;
-	ctlr_info_t *info_p = hba[ctlr];
+	int status = IO_ERROR;
 	u64bit buff_dma_handle;
-	int status, done = 0;
 
-	if ((c = cmd_alloc(info_p, 1)) == NULL) {
-		printk(KERN_WARNING "cciss: unable to get memory");
-		return IO_ERROR;
-	}
-	status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-			  log_unit, page_code, scsi3addr, cmd_type);
-	if (status != IO_OK) {
-		cmd_free(info_p, c, 1);
-		return status;
-	}
-      resend_cmd1:
-	/*
-	 * Disable interrupt
-	 */
-#ifdef CCISS_DEBUG
-	printk(KERN_DEBUG "cciss: turning intr off\n");
-#endif				/* CCISS_DEBUG */
-	info_p->access.set_intr_mask(info_p, CCISS_INTR_OFF);
+resend_cmd1:
+
+	/* Disable interrupt on the board. */
+	h->access.set_intr_mask(h, CCISS_INTR_OFF);
 
 	/* Make sure there is room in the command FIFO */
 	/* Actually it should be completely empty at this time */
@@ -2415,21 +2395,15 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, unsigned int use
 	/* tape side of the driver. */
 	for (i = 200000; i > 0; i--) {
 		/* if fifo isn't full go */
-		if (!(info_p->access.fifo_full(info_p))) {
-
+		if (!(h->access.fifo_full(h)))
 			break;
-		}
 		udelay(10);
 		printk(KERN_WARNING "cciss cciss%d: SendCmd FIFO full,"
-		       " waiting!\n", ctlr);
+		       " waiting!\n", h->ctlr);
 	}
-	/*
-	 * Send the cmd
-	 */
-	info_p->access.submit_command(info_p, c);
-	done = 0;
+	h->access.submit_command(h, c); /* Send the cmd */
 	do {
-		complete = pollcomplete(ctlr);
+		complete = pollcomplete(h->ctlr);
 
 #ifdef CCISS_DEBUG
 		printk(KERN_DEBUG "cciss: command completed\n");
@@ -2438,97 +2412,116 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size, unsigned int use
 		if (complete == 1) {
 			printk(KERN_WARNING
 			       "cciss cciss%d: SendCmd Timeout out, "
-			       "No command list address returned!\n", ctlr);
+			       "No command list address returned!\n", h->ctlr);
 			status = IO_ERROR;
-			done = 1;
 			break;
 		}
 
-		/* This will need to change for direct lookup completions */
-		if ((complete & CISS_ERROR_BIT)
-		    && (complete & ~CISS_ERROR_BIT) == c->busaddr) {
-			/* if data overrun or underun on Report command
-			   ignore it
-			 */
-			if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
-			     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
-			     (c->Request.CDB[0] == CISS_INQUIRY)) &&
-			    ((c->err_info->CommandStatus ==
-			      CMD_DATA_OVERRUN) ||
-			     (c->err_info->CommandStatus == CMD_DATA_UNDERRUN)
-			    )) {
-				complete = c->busaddr;
-			} else {
-				if (c->err_info->CommandStatus ==
-				    CMD_UNSOLICITED_ABORT) {
-					printk(KERN_WARNING "cciss%d: "
-					       "unsolicited abort %p\n",
-					       ctlr, c);
-					if (c->retry_count < MAX_CMD_RETRIES) {
-						printk(KERN_WARNING
-						       "cciss%d: retrying %p\n",
-						       ctlr, c);
-						c->retry_count++;
-						/* erase the old error */
-						/* information */
-						memset(c->err_info, 0,
-						       sizeof
-						       (ErrorInfo_struct));
-						goto resend_cmd1;
-					} else {
-						printk(KERN_WARNING
-						       "cciss%d: retried %p too "
-						       "many times\n", ctlr, c);
-						status = IO_ERROR;
-						goto cleanup1;
-					}
-				} else if (c->err_info->CommandStatus ==
-					   CMD_UNABORTABLE) {
-					printk(KERN_WARNING
-					       "cciss%d: command could not be aborted.\n",
-					       ctlr);
-					status = IO_ERROR;
-					goto cleanup1;
-				}
-				printk(KERN_WARNING "ciss ciss%d: sendcmd"
-				       " Error %x \n", ctlr,
-				       c->err_info->CommandStatus);
-				printk(KERN_WARNING "ciss ciss%d: sendcmd"
-				       " offensive info\n"
-				       "  size %x\n   num %x   value %x\n",
-				       ctlr,
-				       c->err_info->MoreErrInfo.Invalid_Cmd.
-				       offense_size,
-				       c->err_info->MoreErrInfo.Invalid_Cmd.
-				       offense_num,
-				       c->err_info->MoreErrInfo.Invalid_Cmd.
-				       offense_value);
-				status = IO_ERROR;
-				goto cleanup1;
-			}
+		/* If it's not the cmd we're looking for, save it for later */
+		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
+			if (add_sendcmd_reject(c->Request.CDB[0],
+				h->ctlr, complete) != 0)
+				BUG(); /* we are hosed if we get here. */
+			continue;
+		}
+
+		/* It is our command.  If no error, we're done. */
+		if (!(complete & CISS_ERROR_BIT)) {
+			status = IO_OK;
+			break;
+		}
+
+		/* There is an error... */
+
+		/* if data overrun or underun on Report command ignore it */
+		if (((c->Request.CDB[0] == CISS_REPORT_LOG) ||
+		     (c->Request.CDB[0] == CISS_REPORT_PHYS) ||
+		     (c->Request.CDB[0] == CISS_INQUIRY)) &&
+			((c->err_info->CommandStatus == CMD_DATA_OVERRUN) ||
+			 (c->err_info->CommandStatus == CMD_DATA_UNDERRUN))) {
+			complete = c->busaddr;
+			status = IO_OK;
+			break;
 		}
-		/* This will need changing for direct lookup completions */
-		if (complete != c->busaddr) {
-			if (add_sendcmd_reject(cmd, ctlr, complete) != 0) {
-				BUG();	/* we are pretty much hosed if we get here. */
+
+		if (c->err_info->CommandStatus == CMD_UNSOLICITED_ABORT) {
+			printk(KERN_WARNING "cciss%d: unsolicited abort %p\n",
+				h->ctlr, c);
+			if (c->retry_count < MAX_CMD_RETRIES) {
+				printk(KERN_WARNING "cciss%d: retrying %p\n",
+				   h->ctlr, c);
+				c->retry_count++;
+				/* erase the old error information */
+				memset(c->err_info, 0, sizeof(c->err_info));
+				goto resend_cmd1;
 			}
-			continue;
-		} else
-			done = 1;
-	} while (!done);
+			printk(KERN_WARNING "cciss%d: retried %p too many "
+				"times\n", h->ctlr, c);
+			status = IO_ERROR;
+			goto cleanup1;
+		}
+
+		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
+			printk(KERN_WARNING "cciss%d: command could not be "
+				"aborted.\n", h->ctlr);
+			status = IO_ERROR;
+			goto cleanup1;
+		}
+
+		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
+		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
+			c->Request.CDB[0], c->err_info->CommandStatus);
+		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
+			printk(KERN_WARNING "Target status = 0x%02x\n",
+			c->err_info->ScsiStatus);
+			if (c->err_info->ScsiStatus == 2) /* chk cond */
+				printk(KERN_WARNING "Sense key = 0x%02x\n",
+					0xf & c->err_info->SenseInfo[2]);
+		}
+
+		status = IO_ERROR;
+		goto cleanup1;
+
+	} while (1);
 
-      cleanup1:
+cleanup1:
 	/* unlock the data buffer from DMA */
 	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-	pci_unmap_single(info_p->pdev, (dma_addr_t) buff_dma_handle.val,
+	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
 			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
 #ifdef CONFIG_CISS_SCSI_TAPE
 	/* if we saved some commands for later, process them now. */
-	if (info_p->scsi_rejects.ncompletions > 0)
-		do_cciss_intr(0, info_p);
+	if (h->scsi_rejects.ncompletions > 0)
+		do_cciss_intr(0, h);
 #endif
-	cmd_free(info_p, c, 1);
+	return status;
+}
+
+/*
+ * Send a command to the controller, and wait for it to complete.
+ * Used at init time, and during SCSI error recovery.
+ */
+static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
+	unsigned int use_unit_num,/* 0: address the controller,
+				     1: address logical volume log_unit,
+				     2: periph device address is scsi3addr */
+	unsigned int log_unit,
+	__u8 page_code, unsigned char *scsi3addr, int cmd_type)
+{
+	CommandList_struct *c;
+	int status;
+
+	c = cmd_alloc(hba[ctlr], 1);
+	if (!c) {
+		printk(KERN_WARNING "cciss: unable to get memory");
+		return IO_ERROR;
+	}
+	status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
+			  log_unit, page_code, scsi3addr, cmd_type);
+	if (status == IO_OK)
+		status = sendcmd_core(hba[ctlr], c);
+	cmd_free(hba[ctlr], c, 1);
 	return status;
 }
 
-- 
GitLab


From 88f627ae394eadd75ada669904269f1a4a77b3bd Mon Sep 17 00:00:00 2001
From: "Stephen M. Cameron" <scameron@beardog.cca.cpqcorp.net>
Date: Tue, 2 Jun 2009 14:48:11 +0200
Subject: [PATCH 1504/2198] cciss: fix SCSI device reset handler

Fix the SCSI reset error handler to send a working, properly addressed
reset message to the target device and add code to wait for the target
device to become ready by polling it with Test Unit Ready.

The existing reset code was broken in that it didn't bother to set the
8-byte LUN address to anything besides zero, so the command was addressed
to the controller, which pretended to the driver that the command
succeeded, while doing nothing.  Ages ago I tested this code, but
unbeknownst to me, my test was flawed, and what I thought was a tape drive
getting reset was actually nothing of the sort.  Unfortunately, there is
still lots of Smartarray firmware that doesn't handle doing target resets
right, and this code won't help in those cases, but it also shouldn't make
things worse in those cases than they already are.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c      | 14 +++++--
 drivers/block/cciss_scsi.c | 85 +++++++++++++++++++++++++++++++++++---
 2 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 8d0f8932fee76..cb43fb3af1597 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1974,6 +1974,13 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			c->Request.CDB[0] = BMIC_WRITE;
 			c->Request.CDB[6] = BMIC_CACHE_FLUSH;
 			break;
+		case TEST_UNIT_READY:
+			memcpy(c->Header. LUN.LunAddrBytes, scsi3addr, 8);
+			c->Request.CDBLen = 6;
+			c->Request.Type.Attribute = ATTR_SIMPLE;
+			c->Request.Type.Direction = XFER_NONE;
+			c->Request.Timeout = 0;
+			break;
 		default:
 			printk(KERN_WARNING
 			       "cciss%d:  Unknown Command 0x%c\n", ctlr, cmd);
@@ -1992,13 +1999,14 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			memcpy(&c->Request.CDB[4], buff, 8);
 			break;
 		case 1:	/* RESET message */
-			c->Request.CDBLen = 12;
+			memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
+			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
-			c->Request.Type.Direction = XFER_WRITE;
+			c->Request.Type.Direction = XFER_NONE;
 			c->Request.Timeout = 0;
 			memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB));
 			c->Request.CDB[0] = cmd;	/* reset */
-			c->Request.CDB[1] = 0x04;	/* reset a LUN */
+			c->Request.CDB[1] = 0x03;	/* reset a target */
 			break;
 		case 3:	/* No-Op message */
 			c->Request.CDBLen = 1;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index a3fd87b414444..8575c48c89170 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -58,6 +58,18 @@ static int sendcmd(
 	unsigned char *scsi3addr,
 	int cmd_type);
 
+static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+	size_t size,
+	unsigned int use_unit_num, /* 0: address the controller,
+				      1: address logical volume log_unit,
+				      2: periph device address is scsi3addr */
+	unsigned int log_unit, __u8 page_code, unsigned char *scsi3addr,
+	int cmd_type);
+
+static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
+
+static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool);
+static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool);
 
 static int cciss_scsi_proc_info(
 		struct Scsi_Host *sh,
@@ -1575,6 +1587,68 @@ cciss_seq_tape_report(struct seq_file *seq, int ctlr)
 	CPQ_TAPE_UNLOCK(ctlr, flags);
 }
 
+static int wait_for_device_to_become_ready(ctlr_info_t *h,
+	unsigned char lunaddr[])
+{
+	int rc;
+	int count = 0;
+	int waittime = HZ;
+	CommandList_struct *c;
+
+	c = cmd_alloc(h, 1);
+	if (!c) {
+		printk(KERN_WARNING "cciss%d: out of memory in "
+			"wait_for_device_to_become_ready.\n", h->ctlr);
+		return IO_ERROR;
+	}
+
+	/* Send test unit ready until device ready, or give up. */
+	while (count < 20) {
+
+		/* Wait for a bit.  do this first, because if we send
+		 * the TUR right away, the reset will just abort it.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(waittime);
+		count++;
+
+		/* Increase wait time with each try, up to a point. */
+		if (waittime < (HZ * 30))
+			waittime = waittime * 2;
+
+		/* Send the Test Unit Ready */
+		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0, 0, 0,
+			lunaddr, TYPE_CMD);
+		if (rc == 0) {
+			rc = sendcmd_core(h, c);
+			/* sendcmd turned off interrupts, turn 'em back on. */
+			h->access.set_intr_mask(h, CCISS_INTR_ON);
+		}
+
+		if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
+			break;
+
+		if (rc == 0 &&
+			c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+			c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION &&
+			(c->err_info->SenseInfo[2] == NO_SENSE ||
+			c->err_info->SenseInfo[2] == UNIT_ATTENTION))
+			break;
+
+		printk(KERN_WARNING "cciss%d: Waiting %d secs "
+			"for device to become ready.\n",
+			h->ctlr, waittime / HZ);
+		rc = 1; /* device not ready. */
+	}
+
+	if (rc)
+		printk("cciss%d: giving up on device.\n", h->ctlr);
+	else
+		printk(KERN_WARNING "cciss%d: device is ready.\n", h->ctlr);
+
+	cmd_free(h, c, 1);
+	return rc;
+}
 
 /* Need at least one of these error handlers to keep ../scsi/hosts.c from 
  * complaining.  Doing a host- or bus-reset can't do anything good here. 
@@ -1591,6 +1665,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 {
 	int rc;
 	CommandList_struct *cmd_in_trouble;
+	unsigned char lunaddr[8];
 	ctlr_info_t **c;
 	int ctlr;
 
@@ -1600,19 +1675,17 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 		return FAILED;
 	ctlr = (*c)->ctlr;
 	printk(KERN_WARNING "cciss%d: resetting tape drive or medium changer.\n", ctlr);
-
 	/* find the command that's giving us trouble */
 	cmd_in_trouble = (CommandList_struct *) scsicmd->host_scribble;
-	if (cmd_in_trouble == NULL) { /* paranoia */
+	if (cmd_in_trouble == NULL) /* paranoia */
 		return FAILED;
-	}
+	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, 
-		(unsigned char *) &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 
+	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, lunaddr,
 		TYPE_MSG);
 	/* sendcmd turned off interrupts on the board, turn 'em back on. */
 	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
-	if (rc == 0)
+	if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0)
 		return SUCCESS;
 	printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
 	return FAILED;
-- 
GitLab


From 7fe063268e73681cdca1a6496a25f93d3332f517 Mon Sep 17 00:00:00 2001
From: Andrew Patterson <andrew.patterson@hp.com>
Date: Tue, 2 Jun 2009 14:48:39 +0200
Subject: [PATCH 1505/2198] cciss: add cciss driver sysfs entries

Add sysfs entries to the cciss driver needed for the dm/multipath tools.

A file for vendor, model, rev, and unique_id is added for each logical
drive under directory /sys/bus/pci/devices/<dev>/ccissX/cXdY.  Where X =
the controller (or host) number and Y is the logical drive number.

A link from /sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY to
/sys/block/cciss!cXdY/device is also created.  A bus is created in
/sys/bus/cciss.  A link is created from the pci ccissX entry to
/sys/bus/cciss/devices/ccissX.  Please consider this for inclusion.

Signed-off-by: Mike Miller <mike.miller@hp.com>
Cc: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 .../ABI/testing/sysfs-bus-pci-devices-cciss   |  33 +++
 drivers/block/cciss.c                         | 267 +++++++++++++++++-
 drivers/block/cciss.h                         |  24 +-
 3 files changed, 314 insertions(+), 10 deletions(-)
 create mode 100644 Documentation/ABI/testing/sysfs-bus-pci-devices-cciss

diff --git a/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
new file mode 100644
index 0000000000000..0a92a7c93a620
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-bus-pci-devices-cciss
@@ -0,0 +1,33 @@
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/model
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 0 model for logical drive
+		Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/rev
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 0 revision for logical
+		drive Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/unique_id
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 83 serial number for logical
+		drive Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/vendor
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	Displays the SCSI INQUIRY page 0 vendor for logical drive
+		Y of controller X.
+
+Where:		/sys/bus/pci/devices/<dev>/ccissX/cXdY/block:cciss!cXdY
+Date:		March 2009
+Kernel Version: 2.6.30
+Contact:	iss_storagedev@hp.com
+Description:	A symbolic link to /sys/block/cciss!cXdY
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index cb43fb3af1597..e7d00952dd4fe 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -437,6 +437,194 @@ static void __devinit cciss_procinit(int i)
 }
 #endif				/* CONFIG_PROC_FS */
 
+#define MAX_PRODUCT_NAME_LEN 19
+
+#define to_hba(n) container_of(n, struct ctlr_info, dev)
+#define to_drv(n) container_of(n, drive_info_struct, dev)
+
+static struct device_type cciss_host_type = {
+	.name		= "cciss_host",
+};
+
+static ssize_t dev_show_unique_id(struct device *dev,
+				 struct device_attribute *attr,
+				 char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	__u8 sn[16];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(sn, drv->serial_no, sizeof(sn));
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, 16 * 2 + 2,
+				"%02X%02X%02X%02X%02X%02X%02X%02X"
+				"%02X%02X%02X%02X%02X%02X%02X%02X\n",
+				sn[0], sn[1], sn[2], sn[3],
+				sn[4], sn[5], sn[6], sn[7],
+				sn[8], sn[9], sn[10], sn[11],
+				sn[12], sn[13], sn[14], sn[15]);
+}
+DEVICE_ATTR(unique_id, S_IRUGO, dev_show_unique_id, NULL);
+
+static ssize_t dev_show_vendor(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	char vendor[VENDOR_LEN + 1];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(vendor, drv->vendor, VENDOR_LEN + 1);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, sizeof(vendor) + 1, "%s\n", drv->vendor);
+}
+DEVICE_ATTR(vendor, S_IRUGO, dev_show_vendor, NULL);
+
+static ssize_t dev_show_model(struct device *dev,
+			      struct device_attribute *attr,
+			      char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	char model[MODEL_LEN + 1];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(model, drv->model, MODEL_LEN + 1);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, sizeof(model) + 1, "%s\n", drv->model);
+}
+DEVICE_ATTR(model, S_IRUGO, dev_show_model, NULL);
+
+static ssize_t dev_show_rev(struct device *dev,
+			    struct device_attribute *attr,
+			    char *buf)
+{
+	drive_info_struct *drv = to_drv(dev);
+	struct ctlr_info *h = to_hba(drv->dev.parent);
+	char rev[REV_LEN + 1];
+	unsigned long flags;
+	int ret = 0;
+
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	if (h->busy_configuring)
+		ret = -EBUSY;
+	else
+		memcpy(rev, drv->rev, REV_LEN + 1);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	if (ret)
+		return ret;
+	else
+		return snprintf(buf, sizeof(rev) + 1, "%s\n", drv->rev);
+}
+DEVICE_ATTR(rev, S_IRUGO, dev_show_rev, NULL);
+
+static struct attribute *cciss_dev_attrs[] = {
+	&dev_attr_unique_id.attr,
+	&dev_attr_model.attr,
+	&dev_attr_vendor.attr,
+	&dev_attr_rev.attr,
+	NULL
+};
+
+static struct attribute_group cciss_dev_attr_group = {
+	.attrs = cciss_dev_attrs,
+};
+
+static struct attribute_group *cciss_dev_attr_groups[] = {
+	&cciss_dev_attr_group,
+	NULL
+};
+
+static struct device_type cciss_dev_type = {
+	.name		= "cciss_device",
+	.groups		= cciss_dev_attr_groups,
+};
+
+static struct bus_type cciss_bus_type = {
+	.name		= "cciss",
+};
+
+
+/*
+ * Initialize sysfs entry for each controller.  This sets up and registers
+ * the 'cciss#' directory for each individual controller under
+ * /sys/bus/pci/devices/<dev>/.
+ */
+static int cciss_create_hba_sysfs_entry(struct ctlr_info *h)
+{
+	device_initialize(&h->dev);
+	h->dev.type = &cciss_host_type;
+	h->dev.bus = &cciss_bus_type;
+	dev_set_name(&h->dev, "%s", h->devname);
+	h->dev.parent = &h->pdev->dev;
+
+	return device_add(&h->dev);
+}
+
+/*
+ * Remove sysfs entries for an hba.
+ */
+static void cciss_destroy_hba_sysfs_entry(struct ctlr_info *h)
+{
+	device_del(&h->dev);
+}
+
+/*
+ * Initialize sysfs for each logical drive.  This sets up and registers
+ * the 'c#d#' directory for each individual logical drive under
+ * /sys/bus/pci/devices/<dev/ccis#/. We also create a link from
+ * /sys/block/cciss!c#d# to this entry.
+ */
+static int cciss_create_ld_sysfs_entry(struct ctlr_info *h,
+				       drive_info_struct *drv,
+				       int drv_index)
+{
+	device_initialize(&drv->dev);
+	drv->dev.type = &cciss_dev_type;
+	drv->dev.bus = &cciss_bus_type;
+	dev_set_name(&drv->dev, "c%dd%d", h->ctlr, drv_index);
+	drv->dev.parent = &h->dev;
+	return device_add(&drv->dev);
+}
+
+/*
+ * Remove sysfs entries for a logical drive.
+ */
+static void cciss_destroy_ld_sysfs_entry(drive_info_struct *drv)
+{
+	device_del(&drv->dev);
+}
+
 /*
  * For operations that cannot sleep, a command block is allocated at init,
  * and managed by cmd_alloc() and cmd_free() using a simple bitmap to track
@@ -1332,6 +1520,45 @@ static void cciss_softirq_done(struct request *rq)
 	spin_unlock_irqrestore(&h->lock, flags);
 }
 
+/* This function gets the SCSI vendor, model, and revision of a logical drive
+ * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
+ * they cannot be read.
+ */
+static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
+				   char *vendor, char *model, char *rev)
+{
+	int rc;
+	InquiryData_struct *inq_buf;
+
+	*vendor = '\0';
+	*model = '\0';
+	*rev = '\0';
+
+	inq_buf = kzalloc(sizeof(InquiryData_struct), GFP_KERNEL);
+	if (!inq_buf)
+		return;
+
+	if (withirq)
+		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
+				     sizeof(InquiryData_struct), 1, logvol,
+				     0, TYPE_CMD);
+	else
+		rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
+			     sizeof(InquiryData_struct), 1, logvol, 0, NULL,
+			     TYPE_CMD);
+	if (rc == IO_OK) {
+		memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
+		vendor[VENDOR_LEN] = '\0';
+		memcpy(model, &inq_buf->data_byte[16], MODEL_LEN);
+		model[MODEL_LEN] = '\0';
+		memcpy(rev, &inq_buf->data_byte[32], REV_LEN);
+		rev[REV_LEN] = '\0';
+	}
+
+	kfree(inq_buf);
+	return;
+}
+
 /* This function gets the serial number of a logical drive via
  * inquiry page 0x83.  Serial no. is 16 bytes.  If the serial
  * number cannot be had, for whatever reason, 16 bytes of 0xff
@@ -1372,7 +1599,7 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 	disk->first_minor = drv_index << NWD_SHIFT;
 	disk->fops = &cciss_fops;
 	disk->private_data = &h->drv[drv_index];
-	disk->driverfs_dev = &h->pdev->dev;
+	disk->driverfs_dev = &h->drv[drv_index].dev;
 
 	/* Set up queue information */
 	blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask);
@@ -1463,6 +1690,8 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	drvinfo->block_size = block_size;
 	drvinfo->nr_blocks = total_size + 1;
 
+	cciss_get_device_descr(ctlr, drv_index, 1, drvinfo->vendor,
+				drvinfo->model, drvinfo->rev);
 	cciss_get_serial_no(ctlr, drv_index, 1, drvinfo->serial_no,
 			sizeof(drvinfo->serial_no));
 
@@ -1512,6 +1741,9 @@ static void cciss_update_drive_info(int ctlr, int drv_index, int first_time)
 	h->drv[drv_index].cylinders = drvinfo->cylinders;
 	h->drv[drv_index].raid_level = drvinfo->raid_level;
 	memcpy(h->drv[drv_index].serial_no, drvinfo->serial_no, 16);
+	memcpy(h->drv[drv_index].vendor, drvinfo->vendor, VENDOR_LEN + 1);
+	memcpy(h->drv[drv_index].model, drvinfo->model, MODEL_LEN + 1);
+	memcpy(h->drv[drv_index].rev, drvinfo->rev, REV_LEN + 1);
 
 	++h->num_luns;
 	disk = h->gendisk[drv_index];
@@ -1586,6 +1818,8 @@ static int cciss_add_gendisk(ctlr_info_t *h, __u32 lunid, int controller_node)
 		}
 	}
 	h->drv[drv_index].LunID = lunid;
+	if (cciss_create_ld_sysfs_entry(h, &h->drv[drv_index], drv_index))
+		goto err_free_disk;
 
 	/* Don't need to mark this busy because nobody */
 	/* else knows about this disk yet to contend */
@@ -1593,6 +1827,11 @@ static int cciss_add_gendisk(ctlr_info_t *h, __u32 lunid, int controller_node)
 	h->drv[drv_index].busy_configuring = 0;
 	wmb();
 	return drv_index;
+
+err_free_disk:
+	put_disk(h->gendisk[drv_index]);
+	h->gendisk[drv_index] = NULL;
+	return -1;
 }
 
 /* This is for the special case of a controller which
@@ -1713,6 +1952,7 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 			h->drv[i].busy_configuring = 1;
 			spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
 			return_code = deregister_disk(h, i, 1);
+			cciss_destroy_ld_sysfs_entry(&h->drv[i]);
 			h->drv[i].busy_configuring = 0;
 		}
 	}
@@ -3719,12 +3959,15 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	INIT_HLIST_HEAD(&hba[i]->reqQ);
 
 	if (cciss_pci_init(hba[i], pdev) != 0)
-		goto clean1;
+		goto clean0;
 
 	sprintf(hba[i]->devname, "cciss%d", i);
 	hba[i]->ctlr = i;
 	hba[i]->pdev = pdev;
 
+	if (cciss_create_hba_sysfs_entry(hba[i]))
+		goto clean0;
+
 	/* configure PCI DMA stuff */
 	if (!pci_set_dma_mask(pdev, DMA_BIT_MASK(64)))
 		dac = 1;
@@ -3868,6 +4111,8 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 clean2:
 	unregister_blkdev(hba[i]->major, hba[i]->devname);
 clean1:
+	cciss_destroy_hba_sysfs_entry(hba[i]);
+clean0:
 	hba[i]->busy_initializing = 0;
 	/* cleanup any queues that may have been initialized */
 	for (j=0; j <= hba[i]->highest_lun; j++){
@@ -3978,6 +4223,7 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 	 */
 	pci_release_regions(pdev);
 	pci_set_drvdata(pdev, NULL);
+	cciss_destroy_hba_sysfs_entry(hba[i]);
 	free_hba(i);
 }
 
@@ -3995,6 +4241,8 @@ static struct pci_driver cciss_pci_driver = {
  */
 static int __init cciss_init(void)
 {
+	int err;
+
 	/*
 	 * The hardware requires that commands are aligned on a 64-bit
 	 * boundary. Given that we use pci_alloc_consistent() to allocate an
@@ -4004,8 +4252,20 @@ static int __init cciss_init(void)
 
 	printk(KERN_INFO DRIVER_NAME "\n");
 
+	err = bus_register(&cciss_bus_type);
+	if (err)
+		return err;
+
 	/* Register for our PCI devices */
-	return pci_register_driver(&cciss_pci_driver);
+	err = pci_register_driver(&cciss_pci_driver);
+	if (err)
+		goto err_bus_register;
+
+	return 0;
+
+err_bus_register:
+	bus_unregister(&cciss_bus_type);
+	return err;
 }
 
 static void __exit cciss_cleanup(void)
@@ -4022,6 +4282,7 @@ static void __exit cciss_cleanup(void)
 		}
 	}
 	remove_proc_entry("driver/cciss", NULL);
+	bus_unregister(&cciss_bus_type);
 }
 
 static void fail_all_cmds(unsigned long ctlr)
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 703e08038fb93..dd1926d8cd971 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -12,6 +12,10 @@
 #define IO_OK		0
 #define IO_ERROR	1
 
+#define VENDOR_LEN	8
+#define MODEL_LEN	16
+#define REV_LEN		4
+
 struct ctlr_info;
 typedef struct ctlr_info ctlr_info_t;
 
@@ -34,13 +38,18 @@ typedef struct _drive_info_struct
 	int 	cylinders;
 	int	raid_level; /* set to -1 to indicate that
 			     * the drive is not in use/configured
-			    */
-	int	busy_configuring; /*This is set when the drive is being removed
-				   *to prevent it from being opened or it's queue
-				   *from being started.
-				  */
-	__u8 serial_no[16]; /* from inquiry page 0x83, */
-			    /* not necc. null terminated. */
+			     */
+	int	busy_configuring; /* This is set when a drive is being removed
+				   * to prevent it from being opened or it's
+				   * queue from being started.
+				   */
+	struct	device dev;
+	__u8 serial_no[16]; /* from inquiry page 0x83,
+			     * not necc. null terminated.
+			     */
+	char vendor[VENDOR_LEN + 1]; /* SCSI vendor string */
+	char model[MODEL_LEN + 1];   /* SCSI model string */
+	char rev[REV_LEN + 1];       /* SCSI revision string */
 } drive_info_struct;
 
 #ifdef CONFIG_CISS_SCSI_TAPE
@@ -123,6 +132,7 @@ struct ctlr_info
 	unsigned char alive;
 	struct completion *rescan_wait;
 	struct task_struct *cciss_scan_thread;
+	struct device dev;
 };
 
 /*  Defining the diffent access_menthods */
-- 
GitLab


From 77b0308a0778861111184e097533000f7a458c37 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Tue, 2 Jun 2009 14:51:30 +0200
Subject: [PATCH 1506/2198] cciss: use schedule_timeout_interruptible()

Use schedule_timeout_interruptible() instead of open-coding the set and
schedule parts.

Cc: Mike Miller <mikem@beardog.cca.cpqcorp.net>
Cc: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss_scsi.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 8575c48c89170..2edfc9b644eb6 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -1608,8 +1608,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 		/* Wait for a bit.  do this first, because if we send
 		 * the TUR right away, the reset will just abort it.
 		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-		schedule_timeout(waittime);
+		schedule_timeout_interruptible(waittime);
 		count++;
 
 		/* Increase wait time with each try, up to a point. */
-- 
GitLab


From dbdc9dd342f0a7e32f40f0d4ade662bdfe057484 Mon Sep 17 00:00:00 2001
From: vibi sreenivasan <vibi_sreenivasan@cms.com>
Date: Tue, 2 Jun 2009 14:52:32 +0200
Subject: [PATCH 1507/2198] Removed reference to non-existing file
 Documentation/PCI/PCI-DMA-mapping.txt

File Documentation/PCI/PCI-DMA-mapping.txt does not exist.
 Documentation/DMA-mapping.txt contains DMA Mapping details

Signed-off-by: vibi sreenivasan <vibi_sreenivasan@cms.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/block/biodoc.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt
index 6fab97ea7e6b0..8d2158a1c6aaf 100644
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -186,7 +186,7 @@ a virtual address mapping (unlike the earlier scheme of virtual address
 do not have a corresponding kernel virtual address space mapping) and
 low-memory pages.
 
-Note: Please refer to Documentation/PCI/PCI-DMA-mapping.txt for a discussion
+Note: Please refer to Documentation/DMA-mapping.txt for a discussion
 on PCI high mem DMA aspects and mapping of scatter gather lists, and support
 for 64 bit PCI.
 
-- 
GitLab


From 2e507d849f1834d3fe9aae71b7aabf4c2a3827b8 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 18:24:20 +0200
Subject: [PATCH 1508/2198] dma-debug: add variables and checks for driver
 filter

This patch adds the state variables for the driver filter and a function
to check if the filter is enabled and matches to the current device. The
check is built into the err_printk function.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 48 insertions(+), 1 deletion(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index cdd205d6bf7c6..c01f64780caf7 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -99,6 +99,15 @@ static struct dentry *show_num_errors_dent  __read_mostly;
 static struct dentry *num_free_entries_dent __read_mostly;
 static struct dentry *min_free_entries_dent __read_mostly;
 
+/* per-driver filter related state */
+
+#define NAME_MAX_LEN	64
+
+static char                  current_driver_name[NAME_MAX_LEN] __read_mostly;
+static struct device_driver *current_driver                    __read_mostly;
+
+static DEFINE_RWLOCK(driver_name_lock);
+
 static const char *type2name[4] = { "single", "page",
 				    "scather-gather", "coherent" };
 
@@ -128,9 +137,47 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 #endif
 }
 
+static bool driver_filter(struct device *dev)
+{
+	/* driver filter off */
+	if (likely(!current_driver_name[0]))
+		return true;
+
+	/* driver filter on and initialized */
+	if (current_driver && dev->driver == current_driver)
+		return true;
+
+	/* driver filter on but not yet initialized */
+	if (!current_driver && current_driver_name[0]) {
+		struct device_driver *drv = get_driver(dev->driver);
+		unsigned long flags;
+		bool ret = false;
+
+		if (!drv)
+			return false;
+
+		/* lock to protect against change of current_driver_name */
+		read_lock_irqsave(&driver_name_lock, flags);
+
+		if (drv->name &&
+		    strncmp(current_driver_name, drv->name, 63) == 0) {
+			current_driver = drv;
+			ret = true;
+		}
+
+		read_unlock_irqrestore(&driver_name_lock, flags);
+		put_driver(drv);
+
+		return ret;
+	}
+
+	return false;
+}
+
 #define err_printk(dev, entry, format, arg...) do {		\
 		error_count += 1;				\
-		if (show_all_errors || show_num_errors > 0) {	\
+		if (driver_filter(dev) &&			\
+		    (show_all_errors || show_num_errors > 0)) {	\
 			WARN(1, "%s %s: " format,		\
 			     dev_driver_string(dev),		\
 			     dev_name(dev) , ## arg);		\
-- 
GitLab


From 709e50cf870e61745b39552044aa6c7c38e4f9e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 14:13:15 +0200
Subject: [PATCH 1509/2198] perf_counter: Use PID namespaces properly

Stop using task_struct::pid and start using PID namespaces.

PIDs will be reported in the PID namespace of the monitoring
task at the moment of counter creation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  3 +++
 kernel/perf_counter.c        | 42 +++++++++++++++++++++++++++++-------
 2 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d970fbc16afd5..9ec20fc6bd367 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -317,6 +317,7 @@ enum perf_event_type {
 #include <linux/spinlock.h>
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
+#include <linux/pid_namespace.h>
 #include <asm/atomic.h>
 
 struct task_struct;
@@ -500,6 +501,8 @@ struct perf_counter {
 
 	void (*destroy)(struct perf_counter *);
 	struct rcu_head			rcu_head;
+
+	struct pid_namespace		*ns;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fbed4d28ad7d7..caa012cfe49a9 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1432,6 +1432,8 @@ static void free_counter_rcu(struct rcu_head *head)
 	struct perf_counter *counter;
 
 	counter = container_of(head, struct perf_counter, rcu_head);
+	if (counter->ns)
+		put_pid_ns(counter->ns);
 	kfree(counter);
 }
 
@@ -2267,6 +2269,28 @@ static void perf_output_end(struct perf_output_handle *handle)
 	rcu_read_unlock();
 }
 
+static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p)
+{
+	/*
+	 * only top level counters have the pid namespace they were created in
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	return task_tgid_nr_ns(p, counter->ns);
+}
+
+static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
+{
+	/*
+	 * only top level counters have the pid namespace they were created in
+	 */
+	if (counter->parent)
+		counter = counter->parent;
+
+	return task_pid_nr_ns(p, counter->ns);
+}
+
 static void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs, u64 addr)
 {
@@ -2303,8 +2327,8 @@ static void perf_counter_output(struct perf_counter *counter,
 
 	if (record_type & PERF_RECORD_TID) {
 		/* namespace issues */
-		tid_entry.pid = current->group_leader->pid;
-		tid_entry.tid = current->pid;
+		tid_entry.pid = perf_counter_pid(counter, current);
+		tid_entry.tid = perf_counter_tid(counter, current);
 
 		header.type |= PERF_RECORD_TID;
 		header.size += sizeof(tid_entry);
@@ -2432,6 +2456,9 @@ static void perf_counter_comm_output(struct perf_counter *counter,
 	if (ret)
 		return;
 
+	comm_event->event.pid = perf_counter_pid(counter, comm_event->task);
+	comm_event->event.tid = perf_counter_tid(counter, comm_event->task);
+
 	perf_output_put(&handle, comm_event->event);
 	perf_output_copy(&handle, comm_event->comm,
 				   comm_event->comm_size);
@@ -2504,8 +2531,6 @@ void perf_counter_comm(struct task_struct *task)
 		.task	= task,
 		.event  = {
 			.header = { .type = PERF_EVENT_COMM, },
-			.pid	= task->group_leader->pid,
-			.tid	= task->pid,
 		},
 	};
 
@@ -2542,6 +2567,9 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 	if (ret)
 		return;
 
+	mmap_event->event.pid = perf_counter_pid(counter, current);
+	mmap_event->event.tid = perf_counter_tid(counter, current);
+
 	perf_output_put(&handle, mmap_event->event);
 	perf_output_copy(&handle, mmap_event->file_name,
 				   mmap_event->file_size);
@@ -2641,8 +2669,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MMAP, },
-			.pid	= current->group_leader->pid,
-			.tid	= current->pid,
 			.start  = addr,
 			.len    = len,
 			.pgoff  = pgoff,
@@ -2664,8 +2690,6 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 		.file   = file,
 		.event  = {
 			.header = { .type = PERF_EVENT_MUNMAP, },
-			.pid	= current->group_leader->pid,
-			.tid	= current->pid,
 			.start  = addr,
 			.len    = len,
 			.pgoff  = pgoff,
@@ -3445,6 +3469,8 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
+	counter->ns = get_pid_ns(current->nsproxy->pid_ns);
+
 	fput_light(counter_file, fput_needed2);
 
 out_fput:
-- 
GitLab


From f70e87d7a6d9c5a23d5f43ed0a0c224c157ef597 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 14:13:24 +0200
Subject: [PATCH 1510/2198] perf_counter: tools: Expand the COMM,MMAP event
 synthesizer

Include code to pre-construct mappings based on /proc,
on system wide recording.

Fix the existing code to properly fill out ->pid and ->tid.

The PID should be the Thread Group ID (PIDTYPE_PID of task->group_leader)
The TID should be the Thread ID (PIDTYPE_PID of task)

Furthermore, change the default sorting of report to comm,dso for a
better quick overview.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 84 ++++++++++++++++-----
 Documentation/perf_counter/builtin-report.c |  2 +-
 2 files changed, 65 insertions(+), 21 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 9c151ded22fab..810fc275ca65c 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -162,7 +162,7 @@ struct comm_event {
 	char				comm[16];
 };
 
-static pid_t pid_synthesize_comm_event(pid_t pid)
+static void pid_synthesize_comm_event(pid_t pid, int full)
 {
 	struct comm_event comm_ev;
 	char filename[PATH_MAX];
@@ -170,6 +170,8 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 	int fd, ret;
 	size_t size;
 	char *field, *sep;
+	DIR *tasks;
+	struct dirent dirent, *next;
 
 	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid);
 
@@ -194,29 +196,50 @@ static pid_t pid_synthesize_comm_event(pid_t pid)
 		goto out_failure;
 	size = sep - field;
 	memcpy(comm_ev.comm, field, size++);
-	field = strchr(sep + 4, ' ');
-	if (field == NULL)
-		goto out_failure;
-	comm_ev.pid = atoi(++field);
+
+	comm_ev.pid = pid;
 	comm_ev.header.type = PERF_EVENT_COMM;
-	comm_ev.tid = pid;
 	size = ALIGN(size, sizeof(uint64_t));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
-	ret = write(output, &comm_ev, comm_ev.header.size);
-	if (ret < 0) {
-		perror("failed to write");
-		exit(-1);
+	if (!full) {
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
+		return;
+	}
+
+	snprintf(filename, sizeof(filename), "/proc/%d/task", pid);
+
+	tasks = opendir(filename);
+	while (!readdir_r(tasks, &dirent, &next) && next) {
+		char *end;
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end)
+			continue;
+
+		comm_ev.tid = pid;
+
+		ret = write(output, &comm_ev, comm_ev.header.size);
+		if (ret < 0) {
+			perror("failed to write");
+			exit(-1);
+		}
 	}
-	return comm_ev.pid;
+	closedir(tasks);
+	return;
+
 out_failure:
 	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n",
 		filename);
 	exit(EXIT_FAILURE);
-	return -1;
 }
 
-static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
+static void pid_synthesize_mmap_events(pid_t pid)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
@@ -261,7 +284,7 @@ static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
 			mmap_ev.len -= mmap_ev.start;
 			mmap_ev.header.size = (sizeof(mmap_ev) -
 					       (sizeof(mmap_ev.filename) - size));
-			mmap_ev.pid = pgid;
+			mmap_ev.pid = pid;
 			mmap_ev.tid = pid;
 
 			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) {
@@ -274,6 +297,28 @@ static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid)
 	fclose(fp);
 }
 
+static void synthesize_events(void)
+{
+	DIR *proc;
+	struct dirent dirent, *next;
+
+	proc = opendir("/proc");
+
+	while (!readdir_r(proc, &dirent, &next) && next) {
+		char *end;
+		pid_t pid;
+
+		pid = strtol(dirent.d_name, &end, 10);
+		if (*end) /* only interested in proper numerical dirents */
+			continue;
+
+		pid_synthesize_comm_event(pid, 1);
+		pid_synthesize_mmap_events(pid);
+	}
+
+	closedir(proc);
+}
+
 static void open_counters(int cpu, pid_t pid)
 {
 	struct perf_counter_hw_event hw_event;
@@ -281,8 +326,8 @@ static void open_counters(int cpu, pid_t pid)
 	int track = 1;
 
 	if (pid > 0) {
-		pid_t pgid = pid_synthesize_comm_event(pid);
-		pid_synthesize_mmap_events(pid, pgid);
+		pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_events(pid);
 	}
 
 	group_fd = -1;
@@ -348,7 +393,7 @@ static int __cmd_record(int argc, const char **argv)
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	output = open(output_name, O_CREAT|O_EXCL|O_RDWR, S_IRWXU);
+	output = open(output_name, O_CREAT|O_EXCL|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -385,9 +430,8 @@ static int __cmd_record(int argc, const char **argv)
 		}
 	}
 
-	/*
-	 * TODO: store the current /proc/$/maps information somewhere
-	 */
+	if (system_wide)
+		synthesize_events();
 
 	while (!done) {
 		int hits = events;
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 20a4e519dfd17..0558c1e1aa5b2 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -18,7 +18,7 @@
 
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
-static char		*sort_order = "pid,symbol";
+static char		*sort_order = "comm,dso";
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
-- 
GitLab


From 97124d5e2df5b9eaa5bb684bb1e8ebc7e29d0f5d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:52:24 +0200
Subject: [PATCH 1511/2198] perf_counter: tools: Better handle existing data
 files

Provide an argument (-f) to overwrite existing data files.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 810fc275ca65c..bace7a8edf809 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -7,6 +7,7 @@
 #include "util/parse-events.h"
 #include "util/string.h"
 
+#include <unistd.h>
 #include <sched.h>
 
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
@@ -26,7 +27,7 @@ static unsigned int		realtime_prio			= 0;
 static int			system_wide			= 0;
 static pid_t			target_pid			= -1;
 static int			inherit				= 1;
-static int			nmi				= 1;
+static int			force				= 0;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -337,7 +338,6 @@ static void open_counters(int cpu, pid_t pid)
 		hw_event.config		= event_id[counter];
 		hw_event.irq_period	= event_count[counter];
 		hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-		hw_event.nmi		= nmi;
 		hw_event.mmap		= track;
 		hw_event.comm		= track;
 		hw_event.inherit	= (cpu < 0) && inherit;
@@ -387,13 +387,20 @@ static int __cmd_record(int argc, const char **argv)
 	int i, counter;
 	pid_t pid;
 	int ret;
+	struct stat st;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	output = open(output_name, O_CREAT|O_EXCL|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
+	if (!stat(output_name, &st) && !force) {
+		fprintf(stderr, "Error, output file: %s exists, use -f to overwrite.\n",
+				output_name);
+		exit(-1);
+	}
+
+	output = open(output_name, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -473,6 +480,8 @@ static const struct option options[] = {
 		    "collect data with this RT SCHED_FIFO priority"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('f', "force", &force,
+			"overwrite existing data file"),
 	OPT_END()
 };
 
-- 
GitLab


From 8a6fc708b9bb48a79a385bdc2be0959ee2ab788d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 21:23:13 +0200
Subject: [PATCH 1512/2198] dma-debug: add debugfs file for driver filter

This patch adds the dma-api/driver_filter file to debugfs. The root user
can write a driver name into this file to see only dma-api errors for
that particular driver in the kernel log. Writing an empty string to
that file disables the driver filter.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 102 +++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 101 insertions(+), 1 deletion(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index c01f64780caf7..c6330b1a7be70 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -23,9 +23,11 @@
 #include <linux/dma-debug.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
+#include <linux/uaccess.h>
 #include <linux/device.h>
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 
@@ -98,6 +100,7 @@ static struct dentry *show_all_errors_dent  __read_mostly;
 static struct dentry *show_num_errors_dent  __read_mostly;
 static struct dentry *num_free_entries_dent __read_mostly;
 static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *filter_dent           __read_mostly;
 
 /* per-driver filter related state */
 
@@ -160,7 +163,8 @@ static bool driver_filter(struct device *dev)
 		read_lock_irqsave(&driver_name_lock, flags);
 
 		if (drv->name &&
-		    strncmp(current_driver_name, drv->name, 63) == 0) {
+		    strncmp(current_driver_name, drv->name,
+			    NAME_MAX_LEN-1) == 0) {
 			current_driver = drv;
 			ret = true;
 		}
@@ -454,6 +458,97 @@ static int prealloc_memory(u32 num_entries)
 	return -ENOMEM;
 }
 
+static ssize_t filter_read(struct file *file, char __user *user_buf,
+			   size_t count, loff_t *ppos)
+{
+	unsigned long flags;
+	char buf[NAME_MAX_LEN + 1];
+	int len;
+
+	if (!current_driver_name[0])
+		return 0;
+
+	/*
+	 * We can't copy to userspace directly because current_driver_name can
+	 * only be read under the driver_name_lock with irqs disabled. So
+	 * create a temporary copy first.
+	 */
+	read_lock_irqsave(&driver_name_lock, flags);
+	len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
+	read_unlock_irqrestore(&driver_name_lock, flags);
+
+	return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t filter_write(struct file *file, const char __user *userbuf,
+			    size_t count, loff_t *ppos)
+{
+	unsigned long flags;
+	char buf[NAME_MAX_LEN];
+	size_t len = NAME_MAX_LEN - 1;
+	int i;
+
+	/*
+	 * We can't copy from userspace directly. Access to
+	 * current_driver_name is protected with a write_lock with irqs
+	 * disabled. Since copy_from_user can fault and may sleep we
+	 * need to copy to temporary buffer first
+	 */
+	len = min(count, len);
+	if (copy_from_user(buf, userbuf, len))
+		return -EFAULT;
+
+	buf[len] = 0;
+
+	write_lock_irqsave(&driver_name_lock, flags);
+
+	/* Now handle the string we got from userspace very carefully.
+	 * The rules are:
+	 *         - only use the first token we got
+	 *         - token delimiter is everything looking like a space
+	 *           character (' ', '\n', '\t' ...)
+	 *
+	 */
+	if (!isalnum(buf[0])) {
+		/*
+		   If the first character userspace gave us is not
+		 * alphanumerical then assume the filter should be
+		 * switched off.
+		 */
+		if (current_driver_name[0])
+			printk(KERN_INFO "DMA-API: switching off dma-debug "
+					 "driver filter\n");
+		current_driver_name[0] = 0;
+		current_driver = NULL;
+		goto out_unlock;
+	}
+
+	/*
+	 * Now parse out the first token and use it as the name for the
+	 * driver to filter for.
+	 */
+	for (i = 0; i < NAME_MAX_LEN; ++i) {
+		current_driver_name[i] = buf[i];
+		if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
+			break;
+	}
+	current_driver_name[i] = 0;
+	current_driver = NULL;
+
+	printk(KERN_INFO "DMA-API: enable driver filter for driver [%s]\n",
+	       current_driver_name);
+
+out_unlock:
+	write_unlock_irqrestore(&driver_name_lock, flags);
+
+	return count;
+}
+
+const struct file_operations filter_fops = {
+	.read  = filter_read,
+	.write = filter_write,
+};
+
 static int dma_debug_fs_init(void)
 {
 	dma_debug_dent = debugfs_create_dir("dma-api", NULL);
@@ -497,6 +592,11 @@ static int dma_debug_fs_init(void)
 	if (!min_free_entries_dent)
 		goto out_err;
 
+	filter_dent = debugfs_create_file("driver_filter", 0644,
+					  dma_debug_dent, NULL, &filter_fops);
+	if (!filter_dent)
+		goto out_err;
+
 	return 0;
 
 out_err:
-- 
GitLab


From 1745de5e5639457513fe43440f2800e23c3cbc7d Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 21:49:51 +0200
Subject: [PATCH 1513/2198] dma-debug: add dma_debug_driver kernel command line

This patch add the dma_debug_driver= boot parameter to enable the driver
filter for early boot.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/kernel-parameters.txt |  7 +++++++
 lib/dma-debug.c                     | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e87bdbfbcc75e..b3f1314588c95 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -646,6 +646,13 @@ and is between 256 and 4096 characters. It is defined in the file
 			DMA-API debugging code disables itself because the
 			architectural default is too low.
 
+	dma_debug_driver=<driver_name>
+			With this option the DMA-API debugging driver
+			filter feature can be enabled at boot time. Just
+			pass the driver to filter for as the parameter.
+			The filter can be disabled or changed to another
+			driver later using sysfs.
+
 	dscc4.setup=	[NET]
 
 	dtc3181e=	[HW,SCSI]
diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index c6330b1a7be70..d0618aa13b49c 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -1109,3 +1109,21 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
 
+static int __init dma_debug_driver_setup(char *str)
+{
+	int i;
+
+	for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
+		current_driver_name[i] = *str;
+		if (*str == 0)
+			break;
+	}
+
+	if (current_driver_name[0])
+		printk(KERN_INFO "DMA-API: enable driver filter for "
+				 "driver [%s]\n", current_driver_name);
+
+
+	return 1;
+}
+__setup("dma_debug_driver=", dma_debug_driver_setup);
-- 
GitLab


From 016ea6874a6d58df85b54f56997d26df13c307b2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 22 May 2009 21:57:23 +0200
Subject: [PATCH 1514/2198] dma-debug: add documentation for the driver filter

This patch adds the driver filter feature to the dma-debug
documentation.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 Documentation/DMA-API.txt | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index d9aa43d78bcc9..25fb8bcf32a27 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
 				The current number of free dma_debug_entries
 				in the allocator.
 
+	dma-api/driver-filter
+				You can write a name of a driver into this file
+				to limit the debug output to requests from that
+				particular driver. Write an empty string to
+				that file to disable the filter and see
+				all errors again.
+
 If you have this code compiled into your kernel it will be enabled by default.
 If you want to boot without the bookkeeping anyway you can provide
 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
 Notice that you can not enable it again at runtime. You have to reboot to do
 so.
 
+If you want to see debug messages only for a special device driver you can
+specify the dma_debug_driver=<drivername> parameter. This will enable the
+driver filter at boot time. The debug code will only print errors for that
+driver afterwards. This filter can be disabled or changed later using debugfs.
+
 When the code disables itself at runtime this is most likely because it ran
 out of dma_debug_entries. These entries are preallocated at boot. The number
 of preallocated entries is defined per architecture. If it is too low for you
-- 
GitLab


From 4593bba8679b925a056f84edac061676e7eda71c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 15:34:25 +0200
Subject: [PATCH 1515/2198] perf report: Clean up the default output

 - extra space between columns
 - left-aligned the symbol column
 - moved the no-symbols printout to -v

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 55 +++++++++++----------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 0558c1e1aa5b2..19c1e056bb665 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -84,24 +84,25 @@ static struct dso *dsos__findnew(const char *name)
 	struct dso *dso = dsos__find(name);
 	int nr;
 
-	if (dso == NULL) {
-		dso = dso__new(name, 0);
-		if (!dso)
-			goto out_delete_dso;
-
-		nr = dso__load(dso, NULL);
-		if (nr < 0) {
-			fprintf(stderr, "Failed to open: %s\n", name);
-			goto out_delete_dso;
-		}
-		if (!nr) {
-			fprintf(stderr,
-		"Failed to find debug symbols for: %s, maybe install a debug package?\n",
-					name);
-		}
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
 
-		dsos__add(dso);
+	nr = dso__load(dso, NULL);
+	if (nr < 0) {
+		fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
 	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
 
 	return dso;
 
@@ -302,11 +303,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return fprintf(fp, "  %16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "         Command: Pid ",
+	.header = "          Command: Pid ",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -332,11 +333,11 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %16s", self->thread->comm ?: "<unknown>");
+	return fprintf(fp, "  %16s", self->thread->comm ?: "<unknown>");
 }
 
 static struct sort_entry sort_comm = {
-	.header = "         Command",
+	.header = "          Command",
 	.cmp	= sort__comm_cmp,
 	.print	= sort__comm_print,
 };
@@ -362,11 +363,11 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %64s", self->dso ? self->dso->name : "<unknown>");
+	return fprintf(fp, "  %s", self->dso ? self->dso->name : "<unknown>");
 }
 
 static struct sort_entry sort_dso = {
-	.header = "                                                    Shared Object",
+	.header = " Shared Object",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -391,9 +392,9 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, " %#018llx", (unsigned long long)self->ip);
+		ret += fprintf(fp, "  %#018llx", (unsigned long long)self->ip);
 
-	ret += fprintf(fp, " %s: %s",
+	ret += fprintf(fp, "  %s: %s",
 			self->dso ? self->dso->name : "<unknown>",
 			self->sym ? self->sym->name : "<unknown>");
 
@@ -401,7 +402,7 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_sym = {
-	.header = "Shared Object: Symbol",
+	.header = " Shared Object: Symbol",
 	.cmp	= sort__sym_cmp,
 	.print	= sort__sym_print,
 };
@@ -595,8 +596,8 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		int i;
 
-		fprintf(fp, " ");
-		for (i = 0; i < strlen(se->header); i++)
+		fprintf(fp, "  ");
+		for (i = 0; i < strlen(se->header)-1; i++)
 			fprintf(fp, ".");
 	}
 	fprintf(fp, "\n");
-- 
GitLab


From 50b64e3b77d569c217a48e078cd565dbd6462ad0 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Tue, 2 Jun 2009 06:55:20 -0400
Subject: [PATCH 1516/2198] cifs: fix IPv6 address length check

For IPv6 the userspace mount helper sends an address in the "ip="
option.  This check fails if the length is > 35 characters. I have no
idea where the magic 35 character limit came from, but it's clearly not
enough for IPv6. Fix it by making it use the INET6_ADDRSTRLEN #define.

While we're at it, use the same #define for the address length in SPNEGO
upcalls.

Reported-by: Charles R. Anderson <cra@wpi.edu>
Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/cifs_spnego.c | 6 ++----
 fs/cifs/connect.c     | 4 +++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c
index 67bf93a40d2ee..4a4581cb2b5e3 100644
--- a/fs/cifs/cifs_spnego.c
+++ b/fs/cifs/cifs_spnego.c
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
+#include <linux/inet.h>
 #include "cifsglob.h"
 #include "cifs_spnego.h"
 #include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN	13
 
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
-#define MAX_IPV6_ADDR_LEN	43
-
 /* strlen of "host=" */
 #define HOST_KEY_LEN		5
 
@@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
 	   host=hostname sec=mechanism uid=0xFF user=username */
 	desc_len = MAX_VER_STR_LEN +
 		   HOST_KEY_LEN + strlen(hostname) +
-		   IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
+		   IP_KEY_LEN + INET6_ADDRSTRLEN +
 		   MAX_MECH_STR_LEN +
 		   UID_KEY_LEN + (sizeof(uid_t) * 2) +
 		   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8ae563f028bc0..74b5a87e9195d 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <linux/inet.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -960,7 +961,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 		} else if (strnicmp(data, "ip", 2) == 0) {
 			if (!value || !*value) {
 				vol->UNCip = NULL;
-			} else if (strnlen(value, 35) < 35) {
+			} else if (strnlen(value, INET6_ADDRSTRLEN) <
+							INET6_ADDRSTRLEN) {
 				vol->UNCip = value;
 			} else {
 				printk(KERN_WARNING "CIFS: ip address "
-- 
GitLab


From 179c498ae2998461fe436437a74dc29036fc7dcc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 12:03:19 -0400
Subject: [PATCH 1517/2198] function-graph: only allocate init tasks if it was
 not already done

When the function graph tracer is enabled, it calls the initialization
needed for the init tasks that would be called on all created tasks.

The problem is that this is called every time the function graph tracer
is enabled, and the ret_stack is allocated for the idle tasks each time.
Thus, the old ret_stack is lost and a memory leak is created.

This is also dangerous because if an interrupt happened on another CPU
with the init task and the ret_stack is replaced, we then lose all the
return pointers for the interrupt, and a crash would take place.

[ Impact: fix memory leak and possible crash due to race ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f1ed080406c31..ebff62ef40be7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2643,8 +2643,10 @@ static int start_graph_tracing(void)
 		return -ENOMEM;
 
 	/* The cpu_boot init_task->ret_stack will never be freed */
-	for_each_online_cpu(cpu)
-		ftrace_graph_init_task(idle_task(cpu));
+	for_each_online_cpu(cpu) {
+		if (!idle_task(cpu)->ret_stack)
+			ftrace_graph_init_task(idle_task(cpu));
+	}
 
 	do {
 		ret = alloc_retstack_tasklist(ret_stack_list);
-- 
GitLab


From 82310a3272d5a2a7652f5649ad8a55f58c8f74d9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 12:26:07 -0400
Subject: [PATCH 1518/2198] function-graph: enable the stack after
 initialization of other variables

The function graph tracer checks if the task_struct has ret_stack defined
to know if it is OK or not to use it. The initialization is done for
all tasks by one process, but the idle tasks use the same initialization
used by new tasks.

If an interrupt happens on an idle task that just had the ret_stack
created, but before the rest of the initialization took place, then
we can corrupt the return address of the functions.

This patch moves the setting of the task_struct's ret_stack to after
the other variables have been initialized.

[ Impact: prevent kernel panic on idle task when starting function graph ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c                | 9 +++++++--
 kernel/trace/trace_functions_graph.c | 6 ++++++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ebff62ef40be7..20e066065eb39 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2739,15 +2739,20 @@ void unregister_ftrace_graph(void)
 void ftrace_graph_init_task(struct task_struct *t)
 {
 	if (atomic_read(&ftrace_graph_active)) {
-		t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+		struct ftrace_ret_stack *ret_stack;
+
+		ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
 				* sizeof(struct ftrace_ret_stack),
 				GFP_KERNEL);
-		if (!t->ret_stack)
+		if (!ret_stack)
 			return;
 		t->curr_ret_stack = -1;
 		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
 		t->ftrace_timestamp = 0;
+		/* make curr_ret_stack visable before we add the ret_stack */
+		smp_wmb();
+		t->ret_stack = ret_stack;
 	} else
 		t->ret_stack = NULL;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index d28687e7b3a7b..baeb5fe36108c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 	if (!current->ret_stack)
 		return -EBUSY;
 
+	/*
+	 * We must make sure the ret_stack is tested before we read
+	 * anything else.
+	 */
+	smp_rmb();
+
 	/* The return trace stack is full */
 	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
 		atomic_inc(&current->trace_overrun);
-- 
GitLab


From 26c01624a2a40f8a4ddf6449b65c9b1c418d0e72 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 14:01:19 -0400
Subject: [PATCH 1519/2198] function-graph: add memory barriers for accessing
 task's ret_stack

The code that handles the tasks ret_stack allocation for every task
assumes that only an interrupt can cause issues (even though interrupts
are disabled).

In reality, the code is allocating the ret_stack for tasks that may be
running on other CPUs and there are not efficient memory barriers to
handle this case.

[ Impact: prevent crash due to using of uninitialized ret_stack variables ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 20e066065eb39..1664d3f33d38c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2580,12 +2580,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
 		}
 
 		if (t->ret_stack == NULL) {
-			t->curr_ret_stack = -1;
-			/* Make sure IRQs see the -1 first: */
-			barrier();
-			t->ret_stack = ret_stack_list[start++];
 			atomic_set(&t->tracing_graph_pause, 0);
 			atomic_set(&t->trace_overrun, 0);
+			t->curr_ret_stack = -1;
+			/* Make sure the tasks see the -1 first: */
+			smp_wmb();
+			t->ret_stack = ret_stack_list[start++];
 		}
 	} while_each_thread(g, t);
 
-- 
GitLab


From 29c2810276fbf8419c9b4d942b99c873e9b7640a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:56:26 +0200
Subject: [PATCH 1520/2198] perf_counter tools: Remove the last nmi bits

Everything is nmi these days, remove the userspace bits so that
the kernel can drop the interface.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 1 -
 Documentation/perf_counter/builtin-top.c  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 588679167c888..644f850b7bde0 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -84,7 +84,6 @@ static void create_perfstat_counter(int counter)
 	memset(&hw_event, 0, sizeof(hw_event));
 	hw_event.config		= event_id[counter];
 	hw_event.record_type	= 0;
-	hw_event.nmi		= 1;
 	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
 	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 24a887907a7ad..a2cff7b05276b 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -581,7 +581,6 @@ static int __cmd_top(void)
 			hw_event.config		= event_id[counter];
 			hw_event.irq_period	= event_count[counter];
 			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.nmi		= 1;
 			hw_event.mmap		= use_mmap;
 			hw_event.munmap		= use_munmap;
 			hw_event.freq		= freq;
-- 
GitLab


From 53e111a730ea8b002d57dd226098c12789993329 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 17:01:58 +0200
Subject: [PATCH 1521/2198] x86: Fix atomic_long_xchg() on 64bit

Apparently I'm the first to use it :-)

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/atomic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h
index 3673a13b6703f..81d3be459efb5 100644
--- a/include/asm-generic/atomic.h
+++ b/include/asm-generic/atomic.h
@@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)
 #define atomic_long_cmpxchg(l, old, new) \
 	(atomic64_cmpxchg((atomic64_t *)(l), (old), (new)))
 #define atomic_long_xchg(v, new) \
-	(atomic64_xchg((atomic64_t *)(l), (new)))
+	(atomic64_xchg((atomic64_t *)(v), (new)))
 
 #else  /*  BITS_PER_LONG == 64  */
 
-- 
GitLab


From 8e5799b1ad2a0567fdfaaf0e91b40efee010f2c1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:08:15 +0200
Subject: [PATCH 1522/2198] perf_counter: Add unique counter id

Stephan raised the issue that we currently cannot distinguish between
similar counters within a group (PERF_RECORD_GROUP uses the config
value as identifier).

Therefore, generate a new ID for each counter using a global u64
sequence counter.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 8 +++++---
 kernel/perf_counter.c        | 9 +++++++--
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9ec20fc6bd367..4845a214b9e7a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -114,8 +114,9 @@ enum perf_counter_record_format {
  * in increasing order of bit value, after the counter value.
  */
 enum perf_counter_read_format {
-	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1,
-	PERF_FORMAT_TOTAL_TIME_RUNNING	=  2,
+	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING	=  1U << 1,
+	PERF_FORMAT_ID			=  1U << 2,
 };
 
 /*
@@ -290,7 +291,7 @@ enum perf_event_type {
 	 *	{ u32			cpu, res; } && PERF_RECORD_CPU
 	 *
 	 *	{ u64			nr;
-	 *	  { u64 event, val; }	cnt[nr];  } && PERF_RECORD_GROUP
+	 *	  { u64 id, val; }	cnt[nr];  } && PERF_RECORD_GROUP
 	 *
 	 *	{ u16			nr,
 	 *				hv,
@@ -503,6 +504,7 @@ struct perf_counter {
 	struct rcu_head			rcu_head;
 
 	struct pid_namespace		*ns;
+	u64				id;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index caa012cfe49a9..978ecfcc7aafd 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1510,6 +1510,8 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
+	if (counter->hw_event.read_format & PERF_FORMAT_ID)
+		values[n++] = counter->id;
 	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
@@ -2303,7 +2305,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		u32 pid, tid;
 	} tid_entry;
 	struct {
-		u64 event;
+		u64 id;
 		u64 counter;
 	} group_entry;
 	struct perf_callchain_entry *callchain = NULL;
@@ -2416,7 +2418,7 @@ static void perf_counter_output(struct perf_counter *counter,
 			if (sub != counter)
 				sub->pmu->read(sub);
 
-			group_entry.event = sub->hw_event.config;
+			group_entry.id = sub->id;
 			group_entry.counter = atomic64_read(&sub->count);
 
 			perf_output_put(&handle, group_entry);
@@ -3375,6 +3377,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	return counter;
 }
 
+static atomic64_t perf_counter_id;
+
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -3470,6 +3474,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	mutex_unlock(&current->perf_counter_mutex);
 
 	counter->ns = get_pid_ns(current->nsproxy->pid_ns);
+	counter->id = atomic64_inc_return(&perf_counter_id);
 
 	fput_light(counter_file, fput_needed2);
 
-- 
GitLab


From b23f3325ed465f1bd914384884269af0d106778c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:13:03 +0200
Subject: [PATCH 1523/2198] perf_counter: Rename various fields

A few renames:

  s/irq_period/sample_period/
  s/irq_freq/sample_freq/
  s/PERF_RECORD_/PERF_SAMPLE_/
  s/record_type/sample_type/

And change both the new sample_type and read_format to u64.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  12 ++--
 arch/x86/kernel/cpu/perf_counter.c |   8 +--
 include/linux/perf_counter.h       |  32 ++++-----
 kernel/perf_counter.c              | 104 ++++++++++++++---------------
 4 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index f96d55f55bd60..c9633321e7a58 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -535,7 +535,7 @@ void hw_perf_enable(void)
 			continue;
 		}
 		val = 0;
-		if (counter->hw.irq_period) {
+		if (counter->hw.sample_period) {
 			left = atomic64_read(&counter->hw.period_left);
 			if (left < 0x80000000L)
 				val = 0x80000000L - left;
@@ -749,12 +749,12 @@ static void power_pmu_unthrottle(struct perf_counter *counter)
 	s64 val, left;
 	unsigned long flags;
 
-	if (!counter->hw.idx || !counter->hw.irq_period)
+	if (!counter->hw.idx || !counter->hw.sample_period)
 		return;
 	local_irq_save(flags);
 	perf_disable();
 	power_pmu_read(counter);
-	left = counter->hw.irq_period;
+	left = counter->hw.sample_period;
 	val = 0;
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
@@ -789,7 +789,7 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 	if (counter->hw_event.exclude_user
 	    || counter->hw_event.exclude_kernel
 	    || counter->hw_event.exclude_hv
-	    || counter->hw_event.irq_period)
+	    || counter->hw_event.sample_period)
 		return 0;
 
 	if (ppmu->limited_pmc_event(ev))
@@ -925,7 +925,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw.irq_period);
+	atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -958,7 +958,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 static void record_and_restart(struct perf_counter *counter, long val,
 			       struct pt_regs *regs, int nmi)
 {
-	u64 period = counter->hw.irq_period;
+	u64 period = counter->hw.sample_period;
 	s64 prev, delta, left;
 	int record = 0;
 	u64 addr, mmcra, sdsync;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 316b0c995f387..ec06aa5e9282e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -290,11 +290,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	hwc->nmi	= 1;
 	hw_event->nmi	= 1;
 
-	if (!hwc->irq_period)
-		hwc->irq_period = x86_pmu.max_period;
+	if (!hwc->sample_period)
+		hwc->sample_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left,
-			min(x86_pmu.max_period, hwc->irq_period));
+			min(x86_pmu.max_period, hwc->sample_period));
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -462,7 +462,7 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = min(x86_pmu.max_period, hwc->irq_period);
+	s64 period = min(x86_pmu.max_period, hwc->sample_period);
 	int err;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4845a214b9e7a..1fcd3cc93855a 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -94,18 +94,18 @@ enum sw_event_ids {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
- * Bits that can be set in hw_event.record_type to request information
+ * Bits that can be set in hw_event.sample_type to request information
  * in the overflow packets.
  */
-enum perf_counter_record_format {
-	PERF_RECORD_IP			= 1U << 0,
-	PERF_RECORD_TID			= 1U << 1,
-	PERF_RECORD_TIME		= 1U << 2,
-	PERF_RECORD_ADDR		= 1U << 3,
-	PERF_RECORD_GROUP		= 1U << 4,
-	PERF_RECORD_CALLCHAIN		= 1U << 5,
-	PERF_RECORD_CONFIG		= 1U << 6,
-	PERF_RECORD_CPU			= 1U << 7,
+enum perf_counter_sample_format {
+	PERF_SAMPLE_IP			= 1U << 0,
+	PERF_SAMPLE_TID			= 1U << 1,
+	PERF_SAMPLE_TIME		= 1U << 2,
+	PERF_SAMPLE_ADDR		= 1U << 3,
+	PERF_SAMPLE_GROUP		= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
+	PERF_SAMPLE_CONFIG		= 1U << 6,
+	PERF_SAMPLE_CPU			= 1U << 7,
 };
 
 /*
@@ -132,12 +132,12 @@ struct perf_counter_hw_event {
 	__u64			config;
 
 	union {
-		__u64		irq_period;
-		__u64		irq_freq;
+		__u64		sample_period;
+		__u64		sample_freq;
 	};
 
-	__u32			record_type;
-	__u32			read_format;
+	__u64			sample_type;
+	__u64			read_format;
 
 	__u64			disabled       :  1, /* off by default        */
 				nmi	       :  1, /* NMI sampling          */
@@ -262,7 +262,7 @@ enum perf_event_type {
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
-	 *	u64				irq_period;
+	 *	u64				sample_period;
 	 * };
 	 */
 	PERF_EVENT_PERIOD		= 4,
@@ -363,7 +363,7 @@ struct hw_perf_counter {
 		};
 	};
 	atomic64_t			prev_count;
-	u64				irq_period;
+	u64				sample_period;
 	atomic64_t			period_left;
 	u64				interrupts;
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 978ecfcc7aafd..5ecd9981c0357 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1186,7 +1186,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period);
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
-	u64 interrupts, irq_period;
+	u64 interrupts, sample_period;
 	u64 events, period;
 	s64 delta;
 
@@ -1204,23 +1204,23 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 			interrupts = 2*sysctl_perf_counter_limit/HZ;
 		}
 
-		if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
+		if (!counter->hw_event.freq || !counter->hw_event.sample_freq)
 			continue;
 
-		events = HZ * interrupts * counter->hw.irq_period;
-		period = div64_u64(events, counter->hw_event.irq_freq);
+		events = HZ * interrupts * counter->hw.sample_period;
+		period = div64_u64(events, counter->hw_event.sample_freq);
 
-		delta = (s64)(1 + period - counter->hw.irq_period);
+		delta = (s64)(1 + period - counter->hw.sample_period);
 		delta >>= 1;
 
-		irq_period = counter->hw.irq_period + delta;
+		sample_period = counter->hw.sample_period + delta;
 
-		if (!irq_period)
-			irq_period = 1;
+		if (!sample_period)
+			sample_period = 1;
 
-		perf_log_period(counter, irq_period);
+		perf_log_period(counter, sample_period);
 
-		counter->hw.irq_period = irq_period;
+		counter->hw.sample_period = sample_period;
 	}
 	spin_unlock(&ctx->lock);
 }
@@ -2297,7 +2297,7 @@ static void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs, u64 addr)
 {
 	int ret;
-	u64 record_type = counter->hw_event.record_type;
+	u64 sample_type = counter->hw_event.sample_type;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 	u64 ip;
@@ -2321,61 +2321,61 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.misc = PERF_EVENT_MISC_OVERFLOW;
 	header.misc |= perf_misc_flags(regs);
 
-	if (record_type & PERF_RECORD_IP) {
+	if (sample_type & PERF_SAMPLE_IP) {
 		ip = perf_instruction_pointer(regs);
-		header.type |= PERF_RECORD_IP;
+		header.type |= PERF_SAMPLE_IP;
 		header.size += sizeof(ip);
 	}
 
-	if (record_type & PERF_RECORD_TID) {
+	if (sample_type & PERF_SAMPLE_TID) {
 		/* namespace issues */
 		tid_entry.pid = perf_counter_pid(counter, current);
 		tid_entry.tid = perf_counter_tid(counter, current);
 
-		header.type |= PERF_RECORD_TID;
+		header.type |= PERF_SAMPLE_TID;
 		header.size += sizeof(tid_entry);
 	}
 
-	if (record_type & PERF_RECORD_TIME) {
+	if (sample_type & PERF_SAMPLE_TIME) {
 		/*
 		 * Maybe do better on x86 and provide cpu_clock_nmi()
 		 */
 		time = sched_clock();
 
-		header.type |= PERF_RECORD_TIME;
+		header.type |= PERF_SAMPLE_TIME;
 		header.size += sizeof(u64);
 	}
 
-	if (record_type & PERF_RECORD_ADDR) {
-		header.type |= PERF_RECORD_ADDR;
+	if (sample_type & PERF_SAMPLE_ADDR) {
+		header.type |= PERF_SAMPLE_ADDR;
 		header.size += sizeof(u64);
 	}
 
-	if (record_type & PERF_RECORD_CONFIG) {
-		header.type |= PERF_RECORD_CONFIG;
+	if (sample_type & PERF_SAMPLE_CONFIG) {
+		header.type |= PERF_SAMPLE_CONFIG;
 		header.size += sizeof(u64);
 	}
 
-	if (record_type & PERF_RECORD_CPU) {
-		header.type |= PERF_RECORD_CPU;
+	if (sample_type & PERF_SAMPLE_CPU) {
+		header.type |= PERF_SAMPLE_CPU;
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
 	}
 
-	if (record_type & PERF_RECORD_GROUP) {
-		header.type |= PERF_RECORD_GROUP;
+	if (sample_type & PERF_SAMPLE_GROUP) {
+		header.type |= PERF_SAMPLE_GROUP;
 		header.size += sizeof(u64) +
 			counter->nr_siblings * sizeof(group_entry);
 	}
 
-	if (record_type & PERF_RECORD_CALLCHAIN) {
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
 		callchain = perf_callchain(regs);
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
 
-			header.type |= PERF_RECORD_CALLCHAIN;
+			header.type |= PERF_SAMPLE_CALLCHAIN;
 			header.size += callchain_size;
 		}
 	}
@@ -2386,28 +2386,28 @@ static void perf_counter_output(struct perf_counter *counter,
 
 	perf_output_put(&handle, header);
 
-	if (record_type & PERF_RECORD_IP)
+	if (sample_type & PERF_SAMPLE_IP)
 		perf_output_put(&handle, ip);
 
-	if (record_type & PERF_RECORD_TID)
+	if (sample_type & PERF_SAMPLE_TID)
 		perf_output_put(&handle, tid_entry);
 
-	if (record_type & PERF_RECORD_TIME)
+	if (sample_type & PERF_SAMPLE_TIME)
 		perf_output_put(&handle, time);
 
-	if (record_type & PERF_RECORD_ADDR)
+	if (sample_type & PERF_SAMPLE_ADDR)
 		perf_output_put(&handle, addr);
 
-	if (record_type & PERF_RECORD_CONFIG)
+	if (sample_type & PERF_SAMPLE_CONFIG)
 		perf_output_put(&handle, counter->hw_event.config);
 
-	if (record_type & PERF_RECORD_CPU)
+	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
 
 	/*
-	 * XXX PERF_RECORD_GROUP vs inherited counters seems difficult.
+	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
 	 */
-	if (record_type & PERF_RECORD_GROUP) {
+	if (sample_type & PERF_SAMPLE_GROUP) {
 		struct perf_counter *leader, *sub;
 		u64 nr = counter->nr_siblings;
 
@@ -2702,7 +2702,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 }
 
 /*
- * Log irq_period changes so that analyzing tools can re-normalize the
+ * Log sample_period changes so that analyzing tools can re-normalize the
  * event flow.
  */
 
@@ -2725,7 +2725,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 		.period = period,
 	};
 
-	if (counter->hw.irq_period == period)
+	if (counter->hw.sample_period == period)
 		return;
 
 	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
@@ -2834,7 +2834,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = hwc->irq_period;
+	s64 period = hwc->sample_period;
 
 	if (unlikely(left <= -period)) {
 		left = period;
@@ -2874,7 +2874,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 			ret = HRTIMER_NORESTART;
 	}
 
-	period = max_t(u64, 10000, counter->hw.irq_period);
+	period = max_t(u64, 10000, counter->hw.sample_period);
 	hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
 	return ret;
@@ -2959,7 +2959,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 {
 	int neg = atomic64_add_negative(nr, &counter->hw.count);
 
-	if (counter->hw.irq_period && !neg && regs)
+	if (counter->hw.sample_period && !neg && regs)
 		perf_swcounter_overflow(counter, nmi, regs, addr);
 }
 
@@ -3080,8 +3080,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
-	if (hwc->irq_period) {
-		u64 period = max_t(u64, 10000, hwc->irq_period);
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
@@ -3092,7 +3092,7 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void cpu_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	if (counter->hw.irq_period)
+	if (counter->hw.sample_period)
 		hrtimer_cancel(&counter->hw.hrtimer);
 	cpu_clock_perf_counter_update(counter);
 }
@@ -3132,8 +3132,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 	atomic64_set(&hwc->prev_count, now);
 	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	hwc->hrtimer.function = perf_swcounter_hrtimer;
-	if (hwc->irq_period) {
-		u64 period = max_t(u64, 10000, hwc->irq_period);
+	if (hwc->sample_period) {
+		u64 period = max_t(u64, 10000, hwc->sample_period);
 		__hrtimer_start_range_ns(&hwc->hrtimer,
 				ns_to_ktime(period), 0,
 				HRTIMER_MODE_REL, 0);
@@ -3144,7 +3144,7 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_disable(struct perf_counter *counter)
 {
-	if (counter->hw.irq_period)
+	if (counter->hw.sample_period)
 		hrtimer_cancel(&counter->hw.hrtimer);
 	task_clock_perf_counter_update(counter, counter->ctx->time);
 
@@ -3223,7 +3223,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-	counter->hw.irq_period = counter->hw_event.irq_period;
+	counter->hw.sample_period = counter->hw_event.sample_period;
 
 	return &perf_ops_generic;
 }
@@ -3323,15 +3323,15 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	pmu = NULL;
 
 	hwc = &counter->hw;
-	if (hw_event->freq && hw_event->irq_freq)
-		hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
+	if (hw_event->freq && hw_event->sample_freq)
+		hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq);
 	else
-		hwc->irq_period = hw_event->irq_period;
+		hwc->sample_period = hw_event->sample_period;
 
 	/*
-	 * we currently do not support PERF_RECORD_GROUP on inherited counters
+	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
 	 */
-	if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP))
+	if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
 	if (perf_event_raw(hw_event)) {
-- 
GitLab


From 8a016db386195b193e2a8aeddff9fe937dcb7a40 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 15:27:45 +0200
Subject: [PATCH 1524/2198] perf_counter: Remove the last nmi/irq bits

IRQ (non-NMI) sampling is not used anymore - remove the last few bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 ------
 include/linux/perf_counter.h       | 4 +---
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index ec06aa5e9282e..9e144fbebd209 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -284,12 +284,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!hw_event->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	/*
-	 * Use NMI events all the time:
-	 */
-	hwc->nmi	= 1;
-	hw_event->nmi	= 1;
-
 	if (!hwc->sample_period)
 		hwc->sample_period = x86_pmu.max_period;
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1fcd3cc93855a..cef9931793fd0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -140,7 +140,6 @@ struct perf_counter_hw_event {
 	__u64			read_format;
 
 	__u64			disabled       :  1, /* off by default        */
-				nmi	       :  1, /* NMI sampling          */
 				inherit	       :  1, /* children inherit it   */
 				pinned	       :  1, /* must always be on PMU */
 				exclusive      :  1, /* only group on PMU     */
@@ -153,7 +152,7 @@ struct perf_counter_hw_event {
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 51;
+				__reserved_1   : 52;
 
 	__u32			wakeup_events;	/* wakeup every n events */
 	__u32			__reserved_2;
@@ -354,7 +353,6 @@ struct hw_perf_counter {
 			u64				config;
 			unsigned long			config_base;
 			unsigned long			counter_base;
-			int				nmi;
 			int				idx;
 		};
 		union { /* software */
-- 
GitLab


From e4abb5d4f7ddabc1fc7c392cf0a10d8e5868c9ca Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:08:20 +0200
Subject: [PATCH 1525/2198] perf_counter: x86: Emulate longer sample periods

Do as Power already does, emulate sample periods up to 2^63-1 by
composing them of smaller values limited by hardware capabilities.
Only once we wrap the software period do we generate an overflow
event.

Just 10 lines of new code.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 31 +++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 9e144fbebd209..904571bea7104 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -287,8 +287,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!hwc->sample_period)
 		hwc->sample_period = x86_pmu.max_period;
 
-	atomic64_set(&hwc->period_left,
-			min(x86_pmu.max_period, hwc->sample_period));
+	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * Raw event type provide the config in the event structure
@@ -451,13 +450,13 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]);
  * Set the next IRQ period, based on the hwc->period_left value.
  * To be called with the counter disabled in hw:
  */
-static void
+static int
 x86_perf_counter_set_period(struct perf_counter *counter,
 			     struct hw_perf_counter *hwc, int idx)
 {
 	s64 left = atomic64_read(&hwc->period_left);
-	s64 period = min(x86_pmu.max_period, hwc->sample_period);
-	int err;
+	s64 period = hwc->sample_period;
+	int err, ret = 0;
 
 	/*
 	 * If we are way outside a reasoable range then just skip forward:
@@ -465,11 +464,13 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		ret = 1;
 	}
 	/*
 	 * Quirk: certain CPUs dont like it if just 1 event is left:
@@ -477,6 +478,9 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left < 2))
 		left = 2;
 
+	if (left > x86_pmu.max_period)
+		left = x86_pmu.max_period;
+
 	per_cpu(prev_left[idx], smp_processor_id()) = left;
 
 	/*
@@ -487,6 +491,8 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 
 	err = checking_wrmsrl(hwc->counter_base + idx,
 			     (u64)(-left) & x86_pmu.counter_mask);
+
+	return ret;
 }
 
 static inline void
@@ -706,16 +712,19 @@ static void x86_pmu_disable(struct perf_counter *counter)
  * Save and restart an expired counter. Called by NMI contexts,
  * so it has to be careful about preempting normal counter ops:
  */
-static void intel_pmu_save_and_restart(struct perf_counter *counter)
+static int intel_pmu_save_and_restart(struct perf_counter *counter)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int idx = hwc->idx;
+	int ret;
 
 	x86_perf_counter_update(counter, hwc, idx);
-	x86_perf_counter_set_period(counter, hwc, idx);
+	ret = x86_perf_counter_set_period(counter, hwc, idx);
 
 	if (counter->state == PERF_COUNTER_STATE_ACTIVE)
 		intel_pmu_enable_counter(hwc, idx);
+
+	return ret;
 }
 
 static void intel_pmu_reset(void)
@@ -782,7 +791,9 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		if (!test_bit(bit, cpuc->active_mask))
 			continue;
 
-		intel_pmu_save_and_restart(counter);
+		if (!intel_pmu_save_and_restart(counter))
+			continue;
+
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
@@ -824,9 +835,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 			continue;
 
 		/* counter overflow */
-		x86_perf_counter_set_period(counter, hwc, idx);
 		handled = 1;
 		inc_irq_stat(apic_perf_irqs);
+		if (!x86_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
 		if (perf_counter_overflow(counter, nmi, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
-- 
GitLab


From 8e3747c13c39246c7e46def7cf495d9d21d4c5f9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:16:02 +0200
Subject: [PATCH 1526/2198] perf_counter: Change data head from u32 to u64

Since some people worried that 4G might not be a large enough
as an mmap data window, extend it to 64 bit for capable
platforms.

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  7 ++++---
 kernel/perf_counter.c        | 15 ++++++++-------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index cef9931793fd0..c046f7d97cfaf 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -212,7 +212,7 @@ struct perf_counter_mmap_page {
 	 * User-space reading this value should issue an rmb(), on SMP capable
 	 * platforms, after reading this value -- see perf_counter_wakeup().
 	 */
-	__u32   data_head;		/* head in the data section */
+	__u64   data_head;		/* head in the data section */
 };
 
 #define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
@@ -397,10 +397,11 @@ struct perf_mmap_data {
 	int				nr_locked;	/* nr pages mlocked  */
 
 	atomic_t			poll;		/* POLL_ for wakeups */
-	atomic_t			head;		/* write position    */
 	atomic_t			events;		/* event limit       */
 
-	atomic_t			done_head;	/* completed head    */
+	atomic_long_t			head;		/* write position    */
+	atomic_long_t			done_head;	/* completed head    */
+
 	atomic_t			lock;		/* concurrent writes */
 
 	atomic_t			wakeup;		/* needs a wakeup    */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5ecd9981c0357..3f11a2bc6c799 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2067,8 +2067,8 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 struct perf_output_handle {
 	struct perf_counter	*counter;
 	struct perf_mmap_data	*data;
-	unsigned int		offset;
-	unsigned int		head;
+	unsigned long		head;
+	unsigned long		offset;
 	int			nmi;
 	int			overflow;
 	int			locked;
@@ -2122,7 +2122,8 @@ static void perf_output_lock(struct perf_output_handle *handle)
 static void perf_output_unlock(struct perf_output_handle *handle)
 {
 	struct perf_mmap_data *data = handle->data;
-	int head, cpu;
+	unsigned long head;
+	int cpu;
 
 	data->done_head = data->head;
 
@@ -2135,7 +2136,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	 * before we publish the new head, matched by a rmb() in userspace when
 	 * reading this position.
 	 */
-	while ((head = atomic_xchg(&data->done_head, 0)))
+	while ((head = atomic_long_xchg(&data->done_head, 0)))
 		data->user_page->data_head = head;
 
 	/*
@@ -2148,7 +2149,7 @@ static void perf_output_unlock(struct perf_output_handle *handle)
 	/*
 	 * Therefore we have to validate we did not indeed do so.
 	 */
-	if (unlikely(atomic_read(&data->done_head))) {
+	if (unlikely(atomic_long_read(&data->done_head))) {
 		/*
 		 * Since we had it locked, we can lock it again.
 		 */
@@ -2195,7 +2196,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	do {
 		offset = head = atomic_read(&data->head);
 		head += size;
-	} while (atomic_cmpxchg(&data->head, offset, head) != offset);
+	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
 	handle->head	= head;
@@ -2246,7 +2247,7 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	 * Check we didn't copy past our reservation window, taking the
 	 * possible unsigned int wrap into account.
 	 */
-	WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0);
+	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
 #define perf_output_put(handle, x) \
-- 
GitLab


From 08247e31ca79b8f02cce47b7e8120797a8726606 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 16:46:57 +0200
Subject: [PATCH 1527/2198] perf_counter: Add ioctl for changing the sample
 period/frequency

Reported-by: Stephane Eranian <eranian@googlemail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  9 ++++----
 kernel/perf_counter.c        | 41 ++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 4 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c046f7d97cfaf..45bdd3b95d3e7 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -164,10 +164,11 @@ struct perf_counter_hw_event {
 /*
  * Ioctls that can be done on a perf counter fd:
  */
-#define PERF_COUNTER_IOC_ENABLE		_IOW('$', 0, u32)
-#define PERF_COUNTER_IOC_DISABLE	_IOW('$', 1, u32)
-#define PERF_COUNTER_IOC_REFRESH	_IOW('$', 2, u32)
-#define PERF_COUNTER_IOC_RESET		_IOW('$', 3, u32)
+#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0)
+#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1)
+#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2)
+#define PERF_COUNTER_IOC_RESET		_IO ('$', 3)
+#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64)
 
 enum perf_counter_ioc_flags {
 	PERF_IOC_FLAG_GROUP		= 1U << 0,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3f11a2bc6c799..abe2f3b6c4242 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1604,6 +1604,43 @@ static void perf_counter_for_each(struct perf_counter *counter,
 	mutex_unlock(&counter->child_mutex);
 }
 
+static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
+{
+	struct perf_counter_context *ctx = counter->ctx;
+	unsigned long size;
+	int ret = 0;
+	u64 value;
+
+	if (!counter->hw_event.sample_period)
+		return -EINVAL;
+
+	size = copy_from_user(&value, arg, sizeof(value));
+	if (size != sizeof(value))
+		return -EFAULT;
+
+	if (!value)
+		return -EINVAL;
+
+	spin_lock_irq(&ctx->lock);
+	if (counter->hw_event.freq) {
+		if (value > sysctl_perf_counter_limit) {
+			ret = -EINVAL;
+			goto unlock;
+		}
+
+		counter->hw_event.sample_freq = value;
+	} else {
+		counter->hw_event.sample_period = value;
+		counter->hw.sample_period = value;
+
+		perf_log_period(counter, value);
+	}
+unlock:
+	spin_unlock_irq(&ctx->lock);
+
+	return ret;
+}
+
 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
 	struct perf_counter *counter = file->private_data;
@@ -1623,6 +1660,10 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 
 	case PERF_COUNTER_IOC_REFRESH:
 		return perf_counter_refresh(counter, arg);
+
+	case PERF_COUNTER_IOC_PERIOD:
+		return perf_counter_period(counter, (u64 __user *)arg);
+
 	default:
 		return -ENOTTY;
 	}
-- 
GitLab


From 0d48696f87e3618b0d35bd3e4e9d7c188d51e7de Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 19:22:16 +0200
Subject: [PATCH 1528/2198] perf_counter: Rename perf_counter_hw_event =>
 perf_counter_attr

The structure isn't hw only and when I read event, I think about those
things that fall out the other end. Rename the thing.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  38 +++++-----
 arch/x86/kernel/cpu/perf_counter.c |  16 ++--
 include/linux/perf_counter.h       |  34 ++++-----
 include/linux/syscalls.h           |   4 +-
 kernel/perf_counter.c              | 116 ++++++++++++++---------------
 5 files changed, 104 insertions(+), 104 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index c9633321e7a58..ea54686cb7878 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -262,13 +262,13 @@ static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[],
 		}
 		counter = ctrs[i];
 		if (first) {
-			eu = counter->hw_event.exclude_user;
-			ek = counter->hw_event.exclude_kernel;
-			eh = counter->hw_event.exclude_hv;
+			eu = counter->attr.exclude_user;
+			ek = counter->attr.exclude_kernel;
+			eh = counter->attr.exclude_hv;
 			first = 0;
-		} else if (counter->hw_event.exclude_user != eu ||
-			   counter->hw_event.exclude_kernel != ek ||
-			   counter->hw_event.exclude_hv != eh) {
+		} else if (counter->attr.exclude_user != eu ||
+			   counter->attr.exclude_kernel != ek ||
+			   counter->attr.exclude_hv != eh) {
 			return -EAGAIN;
 		}
 	}
@@ -483,16 +483,16 @@ void hw_perf_enable(void)
 
 	/*
 	 * Add in MMCR0 freeze bits corresponding to the
-	 * hw_event.exclude_* bits for the first counter.
+	 * attr.exclude_* bits for the first counter.
 	 * We have already checked that all counters have the
 	 * same values for these bits as the first counter.
 	 */
 	counter = cpuhw->counter[0];
-	if (counter->hw_event.exclude_user)
+	if (counter->attr.exclude_user)
 		cpuhw->mmcr[0] |= MMCR0_FCP;
-	if (counter->hw_event.exclude_kernel)
+	if (counter->attr.exclude_kernel)
 		cpuhw->mmcr[0] |= freeze_counters_kernel;
-	if (counter->hw_event.exclude_hv)
+	if (counter->attr.exclude_hv)
 		cpuhw->mmcr[0] |= MMCR0_FCHV;
 
 	/*
@@ -786,10 +786,10 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev,
 	int n;
 	u64 alt[MAX_EVENT_ALTERNATIVES];
 
-	if (counter->hw_event.exclude_user
-	    || counter->hw_event.exclude_kernel
-	    || counter->hw_event.exclude_hv
-	    || counter->hw_event.sample_period)
+	if (counter->attr.exclude_user
+	    || counter->attr.exclude_kernel
+	    || counter->attr.exclude_hv
+	    || counter->attr.sample_period)
 		return 0;
 
 	if (ppmu->limited_pmc_event(ev))
@@ -855,13 +855,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (!perf_event_raw(&counter->hw_event)) {
-		ev = perf_event_id(&counter->hw_event);
+	if (!perf_event_raw(&counter->attr)) {
+		ev = perf_event_id(&counter->attr);
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = perf_event_config(&counter->hw_event);
+		ev = perf_event_config(&counter->attr);
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
@@ -872,7 +872,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 	 * the user set it to.
 	 */
 	if (!firmware_has_feature(FW_FEATURE_LPAR))
-		counter->hw_event.exclude_hv = 0;
+		counter->attr.exclude_hv = 0;
 
 	/*
 	 * If this is a per-task counter, then we can use
@@ -990,7 +990,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		addr = 0;
-		if (counter->hw_event.record_type & PERF_RECORD_ADDR) {
+		if (counter->attr.record_type & PERF_RECORD_ADDR) {
 			/*
 			 * The user wants a data address recorded.
 			 * If we're not doing instruction sampling,
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 904571bea7104..e16e8c13132fd 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -247,11 +247,11 @@ static inline int x86_pmu_initialized(void)
 }
 
 /*
- * Setup the hardware configuration for a given hw_event_type
+ * Setup the hardware configuration for a given attr_type
  */
 static int __hw_perf_counter_init(struct perf_counter *counter)
 {
-	struct perf_counter_hw_event *hw_event = &counter->hw_event;
+	struct perf_counter_attr *attr = &counter->attr;
 	struct hw_perf_counter *hwc = &counter->hw;
 	int err;
 
@@ -279,9 +279,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Count user and OS events unless requested not to.
 	 */
-	if (!hw_event->exclude_user)
+	if (!attr->exclude_user)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
-	if (!hw_event->exclude_kernel)
+	if (!attr->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
 	if (!hwc->sample_period)
@@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (perf_event_raw(hw_event)) {
-		hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event));
+	if (perf_event_raw(attr)) {
+		hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
 	} else {
-		if (perf_event_id(hw_event) >= x86_pmu.max_events)
+		if (perf_event_id(attr) >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu.event_map(perf_event_id(hw_event));
+		hwc->config |= x86_pmu.event_map(perf_event_id(attr));
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 45bdd3b95d3e7..37d5541d74cb0 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -22,7 +22,7 @@
  */
 
 /*
- * hw_event.type
+ * attr.type
  */
 enum perf_event_types {
 	PERF_TYPE_HARDWARE		= 0,
@@ -37,10 +37,10 @@ enum perf_event_types {
 };
 
 /*
- * Generalized performance counter event types, used by the hw_event.event_id
+ * Generalized performance counter event types, used by the attr.event_id
  * parameter of the sys_perf_counter_open() syscall:
  */
-enum hw_event_ids {
+enum attr_ids {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
@@ -94,7 +94,7 @@ enum sw_event_ids {
 #define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
 
 /*
- * Bits that can be set in hw_event.sample_type to request information
+ * Bits that can be set in attr.sample_type to request information
  * in the overflow packets.
  */
 enum perf_counter_sample_format {
@@ -109,7 +109,7 @@ enum perf_counter_sample_format {
 };
 
 /*
- * Bits that can be set in hw_event.read_format to request that
+ * Bits that can be set in attr.read_format to request that
  * reads on the counter should return the indicated quantities,
  * in increasing order of bit value, after the counter value.
  */
@@ -122,7 +122,7 @@ enum perf_counter_read_format {
 /*
  * Hardware event to monitor via a performance monitoring counter:
  */
-struct perf_counter_hw_event {
+struct perf_counter_attr {
 	/*
 	 * The MSB of the config word signifies if the rest contains cpu
 	 * specific (raw) counter configuration data, if unset, the next
@@ -323,25 +323,25 @@ enum perf_event_type {
 
 struct task_struct;
 
-static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_raw(struct perf_counter_attr *attr)
 {
-	return hw_event->config & PERF_COUNTER_RAW_MASK;
+	return attr->config & PERF_COUNTER_RAW_MASK;
 }
 
-static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_config(struct perf_counter_attr *attr)
 {
-	return hw_event->config & PERF_COUNTER_CONFIG_MASK;
+	return attr->config & PERF_COUNTER_CONFIG_MASK;
 }
 
-static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_type(struct perf_counter_attr *attr)
 {
-	return (hw_event->config & PERF_COUNTER_TYPE_MASK) >>
+	return (attr->config & PERF_COUNTER_TYPE_MASK) >>
 		PERF_COUNTER_TYPE_SHIFT;
 }
 
-static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event)
+static inline u64 perf_event_id(struct perf_counter_attr *attr)
 {
-	return hw_event->config & PERF_COUNTER_EVENT_MASK;
+	return attr->config & PERF_COUNTER_EVENT_MASK;
 }
 
 /**
@@ -457,7 +457,7 @@ struct perf_counter {
 	u64				tstamp_running;
 	u64				tstamp_stopped;
 
-	struct perf_counter_hw_event	hw_event;
+	struct perf_counter_attr	attr;
 	struct hw_perf_counter		hw;
 
 	struct perf_counter_context	*ctx;
@@ -605,8 +605,8 @@ extern int perf_counter_overflow(struct perf_counter *counter,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !perf_event_raw(&counter->hw_event) &&
-		perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE;
+	return !perf_event_raw(&counter->attr) &&
+		perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE;
 }
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 79faae950e2e8..c6c84ad8bd711 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -55,7 +55,7 @@ struct compat_timeval;
 struct robust_list_head;
 struct getcpu_cache;
 struct old_linux_dirent;
-struct perf_counter_hw_event;
+struct perf_counter_attr;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -758,6 +758,6 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
 
 
 asmlinkage long sys_perf_counter_open(
-		const struct perf_counter_hw_event __user *hw_event_uptr,
+		const struct perf_counter_attr __user *attr_uptr,
 		pid_t pid, int cpu, int group_fd, unsigned long flags);
 #endif
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index abe2f3b6c4242..317cef78a3883 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -260,7 +260,7 @@ counter_sched_out(struct perf_counter *counter,
 	if (!is_software_counter(counter))
 		cpuctx->active_oncpu--;
 	ctx->nr_active--;
-	if (counter->hw_event.exclusive || !cpuctx->active_oncpu)
+	if (counter->attr.exclusive || !cpuctx->active_oncpu)
 		cpuctx->exclusive = 0;
 }
 
@@ -282,7 +282,7 @@ group_sched_out(struct perf_counter *group_counter,
 	list_for_each_entry(counter, &group_counter->sibling_list, list_entry)
 		counter_sched_out(counter, cpuctx, ctx);
 
-	if (group_counter->hw_event.exclusive)
+	if (group_counter->attr.exclusive)
 		cpuctx->exclusive = 0;
 }
 
@@ -550,7 +550,7 @@ counter_sched_in(struct perf_counter *counter,
 		cpuctx->active_oncpu++;
 	ctx->nr_active++;
 
-	if (counter->hw_event.exclusive)
+	if (counter->attr.exclusive)
 		cpuctx->exclusive = 1;
 
 	return 0;
@@ -642,7 +642,7 @@ static int group_can_go_on(struct perf_counter *counter,
 	 * If this group is exclusive and there are already
 	 * counters on the CPU, it can't go on.
 	 */
-	if (counter->hw_event.exclusive && cpuctx->active_oncpu)
+	if (counter->attr.exclusive && cpuctx->active_oncpu)
 		return 0;
 	/*
 	 * Otherwise, try to add it if all previous groups were able
@@ -725,7 +725,7 @@ static void __perf_install_in_context(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned) {
+		if (leader->attr.pinned) {
 			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
 		}
@@ -849,7 +849,7 @@ static void __perf_counter_enable(void *info)
 		 */
 		if (leader != counter)
 			group_sched_out(leader, cpuctx, ctx);
-		if (leader->hw_event.pinned) {
+		if (leader->attr.pinned) {
 			update_group_times(leader);
 			leader->state = PERF_COUNTER_STATE_ERROR;
 		}
@@ -927,7 +927,7 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh)
 	/*
 	 * not supported on inherited counters
 	 */
-	if (counter->hw_event.inherit)
+	if (counter->attr.inherit)
 		return -EINVAL;
 
 	atomic_add(refresh, &counter->event_limit);
@@ -1094,7 +1094,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 	 */
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
 		if (counter->state <= PERF_COUNTER_STATE_OFF ||
-		    !counter->hw_event.pinned)
+		    !counter->attr.pinned)
 			continue;
 		if (counter->cpu != -1 && counter->cpu != cpu)
 			continue;
@@ -1122,7 +1122,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx,
 		 * ignore pinned counters since we did them already.
 		 */
 		if (counter->state <= PERF_COUNTER_STATE_OFF ||
-		    counter->hw_event.pinned)
+		    counter->attr.pinned)
 			continue;
 
 		/*
@@ -1204,11 +1204,11 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 			interrupts = 2*sysctl_perf_counter_limit/HZ;
 		}
 
-		if (!counter->hw_event.freq || !counter->hw_event.sample_freq)
+		if (!counter->attr.freq || !counter->attr.sample_freq)
 			continue;
 
 		events = HZ * interrupts * counter->hw.sample_period;
-		period = div64_u64(events, counter->hw_event.sample_freq);
+		period = div64_u64(events, counter->attr.sample_freq);
 
 		delta = (s64)(1 + period - counter->hw.sample_period);
 		delta >>= 1;
@@ -1444,11 +1444,11 @@ static void free_counter(struct perf_counter *counter)
 	perf_pending_sync(counter);
 
 	atomic_dec(&nr_counters);
-	if (counter->hw_event.mmap)
+	if (counter->attr.mmap)
 		atomic_dec(&nr_mmap_tracking);
-	if (counter->hw_event.munmap)
+	if (counter->attr.munmap)
 		atomic_dec(&nr_munmap_tracking);
-	if (counter->hw_event.comm)
+	if (counter->attr.comm)
 		atomic_dec(&nr_comm_tracking);
 
 	if (counter->destroy)
@@ -1504,13 +1504,13 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 	mutex_lock(&counter->child_mutex);
 	values[0] = perf_counter_read(counter);
 	n = 1;
-	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
 		values[n++] = counter->total_time_enabled +
 			atomic64_read(&counter->child_total_time_enabled);
-	if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
-	if (counter->hw_event.read_format & PERF_FORMAT_ID)
+	if (counter->attr.read_format & PERF_FORMAT_ID)
 		values[n++] = counter->id;
 	mutex_unlock(&counter->child_mutex);
 
@@ -1611,7 +1611,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 	int ret = 0;
 	u64 value;
 
-	if (!counter->hw_event.sample_period)
+	if (!counter->attr.sample_period)
 		return -EINVAL;
 
 	size = copy_from_user(&value, arg, sizeof(value));
@@ -1622,15 +1622,15 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 		return -EINVAL;
 
 	spin_lock_irq(&ctx->lock);
-	if (counter->hw_event.freq) {
+	if (counter->attr.freq) {
 		if (value > sysctl_perf_counter_limit) {
 			ret = -EINVAL;
 			goto unlock;
 		}
 
-		counter->hw_event.sample_freq = value;
+		counter->attr.sample_freq = value;
 	} else {
-		counter->hw_event.sample_period = value;
+		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
 
 		perf_log_period(counter, value);
@@ -2299,7 +2299,7 @@ static void perf_output_end(struct perf_output_handle *handle)
 	struct perf_counter *counter = handle->counter;
 	struct perf_mmap_data *data = handle->data;
 
-	int wakeup_events = counter->hw_event.wakeup_events;
+	int wakeup_events = counter->attr.wakeup_events;
 
 	if (handle->overflow && wakeup_events) {
 		int events = atomic_inc_return(&data->events);
@@ -2339,7 +2339,7 @@ static void perf_counter_output(struct perf_counter *counter,
 				int nmi, struct pt_regs *regs, u64 addr)
 {
 	int ret;
-	u64 sample_type = counter->hw_event.sample_type;
+	u64 sample_type = counter->attr.sample_type;
 	struct perf_output_handle handle;
 	struct perf_event_header header;
 	u64 ip;
@@ -2441,7 +2441,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		perf_output_put(&handle, addr);
 
 	if (sample_type & PERF_SAMPLE_CONFIG)
-		perf_output_put(&handle, counter->hw_event.config);
+		perf_output_put(&handle, counter->attr.config);
 
 	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
@@ -2512,7 +2512,7 @@ static void perf_counter_comm_output(struct perf_counter *counter,
 static int perf_counter_comm_match(struct perf_counter *counter,
 				   struct perf_comm_event *comm_event)
 {
-	if (counter->hw_event.comm &&
+	if (counter->attr.comm &&
 	    comm_event->event.header.type == PERF_EVENT_COMM)
 		return 1;
 
@@ -2623,11 +2623,11 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 static int perf_counter_mmap_match(struct perf_counter *counter,
 				   struct perf_mmap_event *mmap_event)
 {
-	if (counter->hw_event.mmap &&
+	if (counter->attr.mmap &&
 	    mmap_event->event.header.type == PERF_EVENT_MMAP)
 		return 1;
 
-	if (counter->hw_event.munmap &&
+	if (counter->attr.munmap &&
 	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
 		return 1;
 
@@ -2907,8 +2907,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->hw_event.exclude_kernel || !regs) &&
-			!counter->hw_event.exclude_user)
+	if ((counter->attr.exclude_kernel || !regs) &&
+			!counter->attr.exclude_user)
 		regs = task_pt_regs(current);
 
 	if (regs) {
@@ -2982,14 +2982,14 @@ static int perf_swcounter_match(struct perf_counter *counter,
 	if (!perf_swcounter_is_counting(counter))
 		return 0;
 
-	if (counter->hw_event.config != event_config)
+	if (counter->attr.config != event_config)
 		return 0;
 
 	if (regs) {
-		if (counter->hw_event.exclude_user && user_mode(regs))
+		if (counter->attr.exclude_user && user_mode(regs))
 			return 0;
 
-		if (counter->hw_event.exclude_kernel && !user_mode(regs))
+		if (counter->attr.exclude_kernel && !user_mode(regs))
 			return 0;
 	}
 
@@ -3252,12 +3252,12 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	ftrace_profile_disable(perf_event_id(&counter->hw_event));
+	ftrace_profile_disable(perf_event_id(&counter->attr));
 }
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = perf_event_id(&counter->hw_event);
+	int event_id = perf_event_id(&counter->attr);
 	int ret;
 
 	ret = ftrace_profile_enable(event_id);
@@ -3265,7 +3265,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-	counter->hw.sample_period = counter->hw_event.sample_period;
+	counter->hw.sample_period = counter->attr.sample_period;
 
 	return &perf_ops_generic;
 }
@@ -3287,7 +3287,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (perf_event_id(&counter->hw_event)) {
+	switch (perf_event_id(&counter->attr)) {
 	case PERF_COUNT_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
@@ -3319,7 +3319,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
  * Allocate and initialize a counter structure
  */
 static struct perf_counter *
-perf_counter_alloc(struct perf_counter_hw_event *hw_event,
+perf_counter_alloc(struct perf_counter_attr *attr,
 		   int cpu,
 		   struct perf_counter_context *ctx,
 		   struct perf_counter *group_leader,
@@ -3352,36 +3352,36 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	mutex_init(&counter->mmap_mutex);
 
 	counter->cpu			= cpu;
-	counter->hw_event		= *hw_event;
+	counter->attr		= *attr;
 	counter->group_leader		= group_leader;
 	counter->pmu			= NULL;
 	counter->ctx			= ctx;
 	counter->oncpu			= -1;
 
 	counter->state = PERF_COUNTER_STATE_INACTIVE;
-	if (hw_event->disabled)
+	if (attr->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
 	pmu = NULL;
 
 	hwc = &counter->hw;
-	if (hw_event->freq && hw_event->sample_freq)
-		hwc->sample_period = div64_u64(TICK_NSEC, hw_event->sample_freq);
+	if (attr->freq && attr->sample_freq)
+		hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq);
 	else
-		hwc->sample_period = hw_event->sample_period;
+		hwc->sample_period = attr->sample_period;
 
 	/*
 	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
 	 */
-	if (hw_event->inherit && (hw_event->sample_type & PERF_SAMPLE_GROUP))
+	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
-	if (perf_event_raw(hw_event)) {
+	if (perf_event_raw(attr)) {
 		pmu = hw_perf_counter_init(counter);
 		goto done;
 	}
 
-	switch (perf_event_type(hw_event)) {
+	switch (perf_event_type(attr)) {
 	case PERF_TYPE_HARDWARE:
 		pmu = hw_perf_counter_init(counter);
 		break;
@@ -3409,11 +3409,11 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 	counter->pmu = pmu;
 
 	atomic_inc(&nr_counters);
-	if (counter->hw_event.mmap)
+	if (counter->attr.mmap)
 		atomic_inc(&nr_mmap_tracking);
-	if (counter->hw_event.munmap)
+	if (counter->attr.munmap)
 		atomic_inc(&nr_munmap_tracking);
-	if (counter->hw_event.comm)
+	if (counter->attr.comm)
 		atomic_inc(&nr_comm_tracking);
 
 	return counter;
@@ -3424,17 +3424,17 @@ static atomic64_t perf_counter_id;
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
- * @hw_event_uptr:	event type attributes for monitoring/sampling
+ * @attr_uptr:	event type attributes for monitoring/sampling
  * @pid:		target pid
  * @cpu:		target cpu
  * @group_fd:		group leader counter fd
  */
 SYSCALL_DEFINE5(perf_counter_open,
-		const struct perf_counter_hw_event __user *, hw_event_uptr,
+		const struct perf_counter_attr __user *, attr_uptr,
 		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
 {
 	struct perf_counter *counter, *group_leader;
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 	struct perf_counter_context *ctx;
 	struct file *counter_file = NULL;
 	struct file *group_file = NULL;
@@ -3446,7 +3446,7 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (flags)
 		return -EINVAL;
 
-	if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0)
+	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
 		return -EFAULT;
 
 	/*
@@ -3484,11 +3484,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 		/*
 		 * Only a group leader can be exclusive or pinned
 		 */
-		if (hw_event.exclusive || hw_event.pinned)
+		if (attr.exclusive || attr.pinned)
 			goto err_put_context;
 	}
 
-	counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader,
+	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
 				     GFP_KERNEL);
 	ret = PTR_ERR(counter);
 	if (IS_ERR(counter))
@@ -3556,7 +3556,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	if (parent_counter->parent)
 		parent_counter = parent_counter->parent;
 
-	child_counter = perf_counter_alloc(&parent_counter->hw_event,
+	child_counter = perf_counter_alloc(&parent_counter->attr,
 					   parent_counter->cpu, child_ctx,
 					   group_leader, GFP_KERNEL);
 	if (IS_ERR(child_counter))
@@ -3565,7 +3565,7 @@ inherit_counter(struct perf_counter *parent_counter,
 
 	/*
 	 * Make the child state follow the state of the parent counter,
-	 * not its hw_event.disabled bit.  We hold the parent's mutex,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
 	 * so we won't race with perf_counter_{en, dis}able_family.
 	 */
 	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE)
@@ -3582,7 +3582,7 @@ inherit_counter(struct perf_counter *parent_counter,
 	/*
 	 * inherit into child's child as well:
 	 */
-	child_counter->hw_event.inherit = 1;
+	child_counter->attr.inherit = 1;
 
 	/*
 	 * Get a reference to the parent filp - we will fput it
@@ -3838,7 +3838,7 @@ int perf_counter_init_task(struct task_struct *child)
 		if (counter != counter->group_leader)
 			continue;
 
-		if (!counter->hw_event.inherit) {
+		if (!counter->attr.inherit) {
 			inherited_all = 0;
 			continue;
 		}
-- 
GitLab


From c70975bc8d5bac487616785f5d5bc7b090dfa2d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 17:38:21 +0200
Subject: [PATCH 1529/2198] perf_counter tools: Fix up the ABI shakeup

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 18 ++++++++---------
 Documentation/perf_counter/builtin-stat.c   | 22 ++++++++++-----------
 Documentation/perf_counter/builtin-top.c    | 20 +++++++++----------
 Documentation/perf_counter/perf.h           |  4 ++--
 4 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index bace7a8edf809..c2fd042449723 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -322,7 +322,7 @@ static void synthesize_events(void)
 
 static void open_counters(int cpu, pid_t pid)
 {
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 	int counter, group_fd;
 	int track = 1;
 
@@ -334,18 +334,18 @@ static void open_counters(int cpu, pid_t pid)
 	group_fd = -1;
 	for (counter = 0; counter < nr_counters; counter++) {
 
-		memset(&hw_event, 0, sizeof(hw_event));
-		hw_event.config		= event_id[counter];
-		hw_event.irq_period	= event_count[counter];
-		hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-		hw_event.mmap		= track;
-		hw_event.comm		= track;
-		hw_event.inherit	= (cpu < 0) && inherit;
+		memset(&attr, 0, sizeof(attr));
+		attr.config		= event_id[counter];
+		attr.sample_period	= event_count[counter];
+		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+		attr.mmap		= track;
+		attr.comm		= track;
+		attr.inherit	= (cpu < 0) && inherit;
 
 		track = 0; // only the first counter needs these
 
 		fd[nr_cpu][counter] =
-			sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0);
+			sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
 
 		if (fd[nr_cpu][counter] < 0) {
 			int err = errno;
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 644f850b7bde0..27abe6a2db3b6 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -79,22 +79,22 @@ static __u64			walltime_nsecs;
 
 static void create_perfstat_counter(int counter)
 {
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 
-	memset(&hw_event, 0, sizeof(hw_event));
-	hw_event.config		= event_id[counter];
-	hw_event.record_type	= 0;
-	hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
-	hw_event.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
+	memset(&attr, 0, sizeof(attr));
+	attr.config		= event_id[counter];
+	attr.sample_type	= 0;
+	attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
+	attr.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
 
 	if (scale)
-		hw_event.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
+		attr.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
 					  PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide) {
 		int cpu;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0);
+			fd[cpu][counter] = sys_perf_counter_open(&attr, -1, cpu, -1, 0);
 			if (fd[cpu][counter] < 0) {
 				printf("perfstat error: syscall returned with %d (%s)\n",
 						fd[cpu][counter], strerror(errno));
@@ -102,10 +102,10 @@ static void create_perfstat_counter(int counter)
 			}
 		}
 	} else {
-		hw_event.inherit	= inherit;
-		hw_event.disabled	= 1;
+		attr.inherit	= inherit;
+		attr.disabled	= 1;
 
-		fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0);
+		fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0);
 		if (fd[0][counter] < 0) {
 			printf("perfstat error: syscall returned with %d (%s)\n",
 					fd[0][counter], strerror(errno));
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a2cff7b05276b..5029d8e6cd9ca 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -537,7 +537,7 @@ static void mmap_read(struct mmap_data *md)
 		old += size;
 
 		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-			if (event->header.type & PERF_RECORD_IP)
+			if (event->header.type & PERF_SAMPLE_IP)
 				process_event(event->ip.ip, md->counter);
 		} else {
 			switch (event->header.type) {
@@ -563,7 +563,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int __cmd_top(void)
 {
-	struct perf_counter_hw_event hw_event;
+	struct perf_counter_attr attr;
 	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
@@ -577,15 +577,15 @@ static int __cmd_top(void)
 			if (target_pid == -1 && profile_cpu == -1)
 				cpu = i;
 
-			memset(&hw_event, 0, sizeof(hw_event));
-			hw_event.config		= event_id[counter];
-			hw_event.irq_period	= event_count[counter];
-			hw_event.record_type	= PERF_RECORD_IP | PERF_RECORD_TID;
-			hw_event.mmap		= use_mmap;
-			hw_event.munmap		= use_munmap;
-			hw_event.freq		= freq;
+			memset(&attr, 0, sizeof(attr));
+			attr.config		= event_id[counter];
+			attr.sample_period	= event_count[counter];
+			attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+			attr.mmap		= use_mmap;
+			attr.munmap		= use_munmap;
+			attr.freq		= freq;
 
-			fd[i][counter] = sys_perf_counter_open(&hw_event, target_pid, cpu, group_fd, 0);
+			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
 				printf("kerneltop error: syscall returned with %d (%s)\n",
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 5a2520bb7e55a..10622a48b408e 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -53,11 +53,11 @@ static inline unsigned long long rdclock(void)
 	_min1 < _min2 ? _min1 : _min2; })
 
 static inline int
-sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr,
+sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
 		      pid_t pid, int cpu, int group_fd,
 		      unsigned long flags)
 {
-	return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu,
+	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu,
 		       group_fd, flags);
 }
 
-- 
GitLab


From 436224a6d8bb3e29fe0cc18122f8d1f593da67b8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 2 Jun 2009 21:02:36 +0200
Subject: [PATCH 1530/2198] perf report: Separate out idle threads

Introduce the special comm name [idle] for idle theads.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 19c1e056bb665..6d68f3aa86b7e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -612,6 +612,17 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	return ret;
 }
 
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
 
 static int __cmd_report(void)
 {
@@ -626,6 +637,8 @@ static int __cmd_report(void)
 	char cwd[PATH_MAX], *cwdp = cwd;
 	int cwdlen;
 
+	register_idle_thread();
+
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
 		perror("failed to open file");
-- 
GitLab


From cf25c63c609e99bfb9303b68a7a90a56a3a32cea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 22:12:14 +0200
Subject: [PATCH 1531/2198] perf report: Fix column width/alignment of dsos

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6d68f3aa86b7e..b84aaf18bf1f7 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -303,11 +303,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "          Command: Pid ",
+	.header = "         Command: Pid ",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -363,11 +363,11 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %s", self->dso ? self->dso->name : "<unknown>");
+	return fprintf(fp, "  %-25s", self->dso ? self->dso->name : "<unknown>");
 }
 
 static struct sort_entry sort_dso = {
-	.header = " Shared Object",
+	.header = " Shared Object          ",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
-- 
GitLab


From f7e8b616ed1cc6f790b82324bce8a2a60295e5c2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 16:39:48 -0400
Subject: [PATCH 1532/2198] function-graph: move initialization of new tasks up
 in fork

When the function graph tracer is enabled, all new tasks must allocate
a ret_stack to place the return address of functions. This is because
the function graph tracer will replace the real return address with a
call to the tracing of the exit function.

This initialization happens in fork, but it happens too late. If fork
fails, then it will call free_task and that calls the freeing of this
ret_stack. But before initialization happens, the new (failed) task
points to its parents ret_stack. If a fork failure happens during
the function trace, it would be catastrophic for the parent.

Also, there's no need to call ftrace_graph_exit_task from fork, since
it is called by free_task which fork calls on failure.

[ Impact: prevent crash during failed fork running function graph tracer ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/fork.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index b9e2edd007265..c4b1e35c430ba 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -982,6 +982,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (!p)
 		goto fork_out;
 
+	ftrace_graph_init_task(p);
+
 	rt_mutex_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -1131,8 +1133,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		}
 	}
 
-	ftrace_graph_init_task(p);
-
 	p->pid = pid_nr(pid);
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1141,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	if (current->nsproxy != p->nsproxy) {
 		retval = ns_cgroup_clone(p, pid);
 		if (retval)
-			goto bad_fork_free_graph;
+			goto bad_fork_free_pid;
 	}
 
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1233,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		spin_unlock(&current->sighand->siglock);
 		write_unlock_irq(&tasklist_lock);
 		retval = -ERESTARTNOINTR;
-		goto bad_fork_free_graph;
+		goto bad_fork_free_pid;
 	}
 
 	if (clone_flags & CLONE_THREAD) {
@@ -1268,8 +1268,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	cgroup_post_fork(p);
 	return p;
 
-bad_fork_free_graph:
-	ftrace_graph_exit_task(p);
 bad_fork_free_pid:
 	if (pid != &init_struct_pid)
 		free_pid(pid);
-- 
GitLab


From 84047e360af0394ac5861d433f26bbcf30f77dd1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 16:51:55 -0400
Subject: [PATCH 1533/2198] function-graph: always initialize task ret_stack

On creating a new task while running the function graph tracer, if
we fail to allocate the ret_stack, and then fail the fork, the
code will free the parent ret_stack. This is because the child
duplicated the parent and currently points to the parent's ret_stack.

This patch always initializes the task's ret_stack to NULL.

[ Impact: prevent crash of parent on low memory during fork ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1664d3f33d38c..bb081f37cac78 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2738,6 +2738,9 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
+	/* Make sure we do not use the parent ret_stack */
+	t->ret_stack = NULL;
+
 	if (atomic_read(&ftrace_graph_active)) {
 		struct ftrace_ret_stack *ret_stack;
 
@@ -2753,8 +2756,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		/* make curr_ret_stack visable before we add the ret_stack */
 		smp_wmb();
 		t->ret_stack = ret_stack;
-	} else
-		t->ret_stack = NULL;
+	}
 }
 
 void ftrace_graph_exit_task(struct task_struct *t)
-- 
GitLab


From abaff32a03e26e5d6674cb2a26ad882efe7493a3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 22:59:57 +0200
Subject: [PATCH 1534/2198] perf record: Add --append option

Allow incremental profiling via 'perf record -A' - this will append
to an existing perf.data.

Also reorder perf record options by utility / likelyhood of usage.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 40 +++++++++++++--------
 1 file changed, 26 insertions(+), 14 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index c2fd042449723..19cba6b5ee9cd 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,5 +1,7 @@
-
-
+/*
+ * perf record: Record the profile of a workload (or a CPU, or a PID) into
+ * the perf.data output file - for later analysis via perf report.
+ */
 #include "perf.h"
 #include "builtin.h"
 #include "util/util.h"
@@ -28,6 +30,7 @@ static int			system_wide			= 0;
 static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
+static int			append_file			= 0;
 
 const unsigned int default_count[] = {
 	1000000,
@@ -385,22 +388,29 @@ static void open_counters(int cpu, pid_t pid)
 static int __cmd_record(int argc, const char **argv)
 {
 	int i, counter;
+	struct stat st;
 	pid_t pid;
+	int flags;
 	int ret;
-	struct stat st;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
 
-	if (!stat(output_name, &st) && !force) {
-		fprintf(stderr, "Error, output file: %s exists, use -f to overwrite.\n",
+	if (!stat(output_name, &st) && !force && !append_file) {
+		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n",
 				output_name);
 		exit(-1);
 	}
 
-	output = open(output_name, O_CREAT|O_TRUNC|O_RDWR, S_IRUSR|S_IWUSR);
+	flags = O_CREAT|O_RDWR;
+	if (append_file)
+		flags |= O_APPEND;
+	else
+		flags |= O_TRUNC;
+
+	output = open(output_name, flags, S_IRUSR|S_IWUSR);
 	if (output < 0) {
 		perror("failed to create output file");
 		exit(-1);
@@ -466,22 +476,24 @@ static char events_help_msg[EVENTS_HELP_MAX];
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     events_help_msg, parse_events),
-	OPT_INTEGER('c', "count", &default_interval,
-		    "event period to sample"),
-	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
-		    "number of mmap data pages"),
-	OPT_STRING('o', "output", &output_name, "file",
-		    "output file name"),
-	OPT_BOOLEAN('i', "inherit", &inherit,
-		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing pid"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
 		    "collect data with this RT SCHED_FIFO priority"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
+	OPT_BOOLEAN('A', "append", &append_file,
+			    "append to the output file to do incremental profiling"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file"),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
 	OPT_END()
 };
 
-- 
GitLab


From 3cf165fc2e7f221a7a95098b47eb990779e29f5f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:02:59 +0200
Subject: [PATCH 1535/2198] perf record: Increase mmap buffering default

I've run into mmap overruns with the current 16 pages default,
increase it to 128 pages.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 19cba6b5ee9cd..8feb1192e09e6 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -19,9 +19,9 @@ static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
-static int			nr_cpus				=  0;
+static int			nr_cpus				= 0;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			= 16;
+static unsigned int		mmap_pages			= 128;
 static int			output;
 static const char		*output_name			= "perf.data";
 static int			group				= 0;
-- 
GitLab


From 0a520c63e1625b92ef775da40192e1542910e7f6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:24:45 +0200
Subject: [PATCH 1536/2198] perf report: Print more info instead of <unknown>
 entries

Sometimes we still fail to find a DSO or look up a symbol,
print out the raw information in this case (which an help
debug the problem), instead of a not very helpful <unknown>
string.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 28 +++++++++++++++------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index b84aaf18bf1f7..270e986c2d42a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -201,7 +201,9 @@ static struct thread *thread__new(pid_t pid)
 
 	if (self != NULL) {
 		self->pid = pid;
-		self->comm = NULL;
+		self->comm = malloc(30);
+		if (self->comm)
+			sprintf(self->comm, ":%d", pid);
 		INIT_LIST_HEAD(&self->maps);
 	}
 
@@ -333,7 +335,7 @@ sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %16s", self->thread->comm ?: "<unknown>");
+	return fprintf(fp, "  %16s", self->thread->comm);
 }
 
 static struct sort_entry sort_comm = {
@@ -363,7 +365,10 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %-25s", self->dso ? self->dso->name : "<unknown>");
+	if (self->dso)
+		return fprintf(fp, "  %-25s", self->dso->name);
+
+	return fprintf(fp, "  %016llx", (__u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
@@ -392,11 +397,17 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "  %#018llx", (unsigned long long)self->ip);
+		ret += fprintf(fp, "  %#018llx", (__u64)self->ip);
+
+	if (self->dso)
+		ret += fprintf(fp, "  %s: ", self->dso->name);
+	else
+		ret += fprintf(fp, "  %#016llx: ", (__u64)self->ip);
 
-	ret += fprintf(fp, "  %s: %s",
-			self->dso ? self->dso->name : "<unknown>",
-			self->sym ? self->sym->name : "<unknown>");
+	if (self->sym)
+		ret += fprintf(fp, "%s", self->sym->name);
+	else
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
 
 	return ret;
 }
@@ -772,7 +783,8 @@ static int __cmd_report(void)
 				event->mmap.filename);
 		}
 		if (thread == NULL || map == NULL) {
-			fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
+			if (verbose)
+				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
 			goto broken_event;
 		}
 		thread__insert_map(thread, map);
-- 
GitLab


From bf9e187637ca3d85cee7407e3af93995868cc87c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:37:05 +0200
Subject: [PATCH 1537/2198] perf_counter tools: Make source code headers more
 coherent

The perf commands had different ways of describing themselves,
introduce a coherent command-file-header format taken from the
Git project.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 11 ++--
 Documentation/perf_counter/builtin-report.c | 10 +++-
 Documentation/perf_counter/builtin-stat.c   | 29 ++++++----
 Documentation/perf_counter/builtin-top.c    | 60 +++++++--------------
 Documentation/perf_counter/perf.c           |  9 ++++
 5 files changed, 62 insertions(+), 57 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 8feb1192e09e6..2741b3561bb72 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -1,9 +1,14 @@
 /*
- * perf record: Record the profile of a workload (or a CPU, or a PID) into
- * the perf.data output file - for later analysis via perf report.
+ * builtin-record.c
+ *
+ * Builtin record command: Record the profile of a workload
+ * (or a CPU, or a PID) into the perf.data output file - for
+ * later analysis via perf report.
  */
-#include "perf.h"
 #include "builtin.h"
+
+#include "perf.h"
+
 #include "util/util.h"
 #include "util/parse-options.h"
 #include "util/parse-events.h"
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 270e986c2d42a..9da990fba4a5e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1,6 +1,14 @@
-#include "util/util.h"
+/*
+ * builtin-report.c
+ *
+ * Builtin report command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
 #include "builtin.h"
 
+#include "util/util.h"
+
 #include "util/list.h"
 #include "util/cache.h"
 #include "util/rbtree.h"
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 27abe6a2db3b6..2357a663b6755 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -1,20 +1,27 @@
 /*
- * perf stat:  /usr/bin/time -alike performance counter statistics utility
+ * builtin-stat.c
+ *
+ * Builtin stat command: Give a precise performance counters summary
+ * overview about any workload, CPU or specific PID.
+ *
+ * Sample output:
 
-          It summarizes the counter events of all tasks (and child tasks),
-          covering all CPUs that the command (or workload) executes on.
-          It only counts the per-task events of the workload started,
-          independent of how many other tasks run on those CPUs.
+   $ perf stat ~/hackbench 10
+   Time: 0.104
 
-   Sample output:
+    Performance counter stats for '/home/mingo/hackbench':
 
-   $ perf stat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null
+       1255.538611  task clock ticks     #      10.143 CPU utilization factor
+             54011  context switches     #       0.043 M/sec
+               385  CPU migrations       #       0.000 M/sec
+             17755  pagefaults           #       0.014 M/sec
+        3808323185  CPU cycles           #    3033.219 M/sec
+        1575111190  instructions         #    1254.530 M/sec
+          17367895  cache references     #      13.833 M/sec
+           7674421  cache misses         #       6.112 M/sec
 
-   Performance counter stats for 'ls':
+    Wall-clock time elapsed:   123.786620 msecs
 
-           163516953 instructions
-                2295 cache-misses
-             2855182 branch-misses
  *
  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
  *
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 5029d8e6cd9ca..a63935276cacb 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -1,49 +1,25 @@
 /*
- * kerneltop.c: show top kernel functions - performance counters showcase
-
-   Build with:
-
-     make -C Documentation/perf_counter/
-
-   Sample output:
-
-------------------------------------------------------------------------------
- KernelTop:    2669 irqs/sec  [cache-misses/cache-refs],  (all, cpu: 2)
-------------------------------------------------------------------------------
-
-             weight         RIP          kernel function
-             ______   ________________   _______________
-
-              35.20 - ffffffff804ce74b : skb_copy_and_csum_dev
-              33.00 - ffffffff804cb740 : sock_alloc_send_skb
-              31.26 - ffffffff804ce808 : skb_push
-              22.43 - ffffffff80510004 : tcp_established_options
-              19.00 - ffffffff8027d250 : find_get_page
-              15.76 - ffffffff804e4fc9 : eth_type_trans
-              15.20 - ffffffff804d8baa : dst_release
-              14.86 - ffffffff804cf5d8 : skb_release_head_state
-              14.00 - ffffffff802217d5 : read_hpet
-              12.00 - ffffffff804ffb7f : __ip_local_out
-              11.97 - ffffffff804fc0c8 : ip_local_deliver_finish
-               8.54 - ffffffff805001a3 : ip_queue_xmit
+ * builtin-top.c
+ *
+ * Builtin top command: Display a continuously updated profile of
+ * any workload, CPU or specific PID.
+ *
+ * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ *
+ * Improvements and fixes by:
+ *
+ *   Arjan van de Ven <arjan@linux.intel.com>
+ *   Yanmin Zhang <yanmin.zhang@intel.com>
+ *   Wu Fengguang <fengguang.wu@intel.com>
+ *   Mike Galbraith <efault@gmx.de>
+ *   Paul Mackerras <paulus@samba.org>
+ *
+ * Released under the GPL v2. (and only v2, not any later version)
  */
-
- /*
-  * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
-  *
-  * Improvements and fixes by:
-  *
-  *   Arjan van de Ven <arjan@linux.intel.com>
-  *   Yanmin Zhang <yanmin.zhang@intel.com>
-  *   Wu Fengguang <fengguang.wu@intel.com>
-  *   Mike Galbraith <efault@gmx.de>
-  *   Paul Mackerras <paulus@samba.org>
-  *
-  * Released under the GPL v2. (and only v2, not any later version)
-  */
+#include "builtin.h"
 
 #include "perf.h"
-#include "builtin.h"
+
 #include "util/symbol.h"
 #include "util/util.h"
 #include "util/rbtree.h"
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index e8a85842b4939..ec7edb7fbe298 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -1,4 +1,13 @@
+/*
+ * perf.c
+ *
+ * Performance analysis utility.
+ *
+ * This is the main hub from which the sub-commands (perf stat,
+ * perf top, perf record, perf report, etc.) are started.
+ */
 #include "builtin.h"
+
 #include "util/exec_cmd.h"
 #include "util/cache.h"
 #include "util/quote.h"
-- 
GitLab


From addc2785ce92ff05da8edf18317b6b4719e10d9f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 2 Jun 2009 23:43:11 +0200
Subject: [PATCH 1538/2198] perf record: Print out the number of events
 captured

It makes sense to inform the user about how many events
perf record has written - so that the sufficiency of
profiling coverage and intensity can be determined at
a glance.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 2741b3561bb72..ec3b73adbd9d1 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -467,6 +467,9 @@ static int __cmd_record(int argc, const char **argv)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
+
+	fprintf(stderr, "[ perf record: Captured and wrote %ld events. ]\n", events);
+
 	return 0;
 }
 
-- 
GitLab


From 850b0cee165576f969363a8c52021b5cf9ecbe67 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 2 Jun 2009 17:01:16 -0400
Subject: [PATCH 1539/2198] SELinux: define audit permissions for audit tree
 netlink messages

Audit trees defined 2 new netlink messages but the netlink mapping tables for
selinux permissions were not set up.  This patch maps these 2 new operations
to AUDIT_WRITE.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/selinux/nlmsgtab.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/security/selinux/nlmsgtab.c b/security/selinux/nlmsgtab.c
index c6875fd3b9d61..dd7cc6de77f9e 100644
--- a/security/selinux/nlmsgtab.c
+++ b/security/selinux/nlmsgtab.c
@@ -112,6 +112,8 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
 	{ AUDIT_DEL_RULE,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_USER,		NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
 	{ AUDIT_SIGNAL_INFO,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+	{ AUDIT_TRIM,		NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+	{ AUDIT_MAKE_EQUIV,	NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
 	{ AUDIT_TTY_GET,	NETLINK_AUDIT_SOCKET__NLMSG_READ     },
 	{ AUDIT_TTY_SET,	NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT	},
 };
-- 
GitLab


From ab588ccadc80f6ef5495e83e176e88c5c0fc2d0e Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Tue, 2 Jun 2009 14:23:39 +0900
Subject: [PATCH 1540/2198] TOMOYO: Remove redundant markers.

Remove '/***** START/STOP *****/' markers.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/common.c   |  8 --------
 security/tomoyo/domain.c   | 14 --------------
 security/tomoyo/file.c     | 10 ----------
 security/tomoyo/realpath.c |  4 ----
 4 files changed, 36 deletions(-)

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index ddfb9cccf468a..a42fe02c61444 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -866,7 +866,6 @@ static struct tomoyo_profile *tomoyo_find_or_assign_new_profile(const unsigned
 
 	if (profile >= TOMOYO_MAX_PROFILES)
 		return NULL;
-	/***** EXCLUSIVE SECTION START *****/
 	mutex_lock(&lock);
 	ptr = tomoyo_profile_ptr[profile];
 	if (ptr)
@@ -880,7 +879,6 @@ static struct tomoyo_profile *tomoyo_find_or_assign_new_profile(const unsigned
 	tomoyo_profile_ptr[profile] = ptr;
  ok:
 	mutex_unlock(&lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return ptr;
 }
 
@@ -1050,7 +1048,6 @@ static int tomoyo_update_manager_entry(const char *manager,
 	saved_manager = tomoyo_save_name(manager);
 	if (!saved_manager)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_policy_manager_list_lock);
 	list_for_each_entry(ptr, &tomoyo_policy_manager_list, list) {
 		if (ptr->manager != saved_manager)
@@ -1072,7 +1069,6 @@ static int tomoyo_update_manager_entry(const char *manager,
 	error = 0;
  out:
 	up_write(&tomoyo_policy_manager_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -1197,13 +1193,11 @@ static bool tomoyo_is_select_one(struct tomoyo_io_buffer *head,
 
 	if (sscanf(data, "pid=%u", &pid) == 1) {
 		struct task_struct *p;
-		/***** CRITICAL SECTION START *****/
 		read_lock(&tasklist_lock);
 		p = find_task_by_vpid(pid);
 		if (p)
 			domain = tomoyo_real_domain(p);
 		read_unlock(&tasklist_lock);
-		/***** CRITICAL SECTION END *****/
 	} else if (!strncmp(data, "domain=", 7)) {
 		if (tomoyo_is_domain_def(data + 7)) {
 			down_read(&tomoyo_domain_list_lock);
@@ -1594,13 +1588,11 @@ static int tomoyo_read_pid(struct tomoyo_io_buffer *head)
 		const int pid = head->read_step;
 		struct task_struct *p;
 		struct tomoyo_domain_info *domain = NULL;
-		/***** CRITICAL SECTION START *****/
 		read_lock(&tasklist_lock);
 		p = find_task_by_vpid(pid);
 		if (p)
 			domain = tomoyo_real_domain(p);
 		read_unlock(&tasklist_lock);
-		/***** CRITICAL SECTION END *****/
 		if (domain)
 			tomoyo_io_printf(head, "%d %u %s", pid, domain->profile,
 					 domain->domainname->name);
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index ee43631f049be..aa119ca5a7828 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -67,14 +67,12 @@ void tomoyo_set_domain_flag(struct tomoyo_domain_info *domain,
 {
 	/* We need to serialize because this is bitfield operation. */
 	static DEFINE_SPINLOCK(lock);
-	/***** CRITICAL SECTION START *****/
 	spin_lock(&lock);
 	if (!is_delete)
 		domain->flags |= flags;
 	else
 		domain->flags &= ~flags;
 	spin_unlock(&lock);
-	/***** CRITICAL SECTION END *****/
 }
 
 /**
@@ -135,7 +133,6 @@ static int tomoyo_update_domain_initializer_entry(const char *domainname,
 	saved_program = tomoyo_save_name(program);
 	if (!saved_program)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_initializer_list_lock);
 	list_for_each_entry(ptr, &tomoyo_domain_initializer_list, list) {
 		if (ptr->is_not != is_not ||
@@ -161,7 +158,6 @@ static int tomoyo_update_domain_initializer_entry(const char *domainname,
 	error = 0;
  out:
 	up_write(&tomoyo_domain_initializer_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -314,7 +310,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
 	saved_domainname = tomoyo_save_name(domainname);
 	if (!saved_domainname)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_keeper_list_lock);
 	list_for_each_entry(ptr, &tomoyo_domain_keeper_list, list) {
 		if (ptr->is_not != is_not ||
@@ -340,7 +335,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
 	error = 0;
  out:
 	up_write(&tomoyo_domain_keeper_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -475,7 +469,6 @@ static int tomoyo_update_alias_entry(const char *original_name,
 	saved_aliased_name = tomoyo_save_name(aliased_name);
 	if (!saved_original_name || !saved_aliased_name)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_alias_list_lock);
 	list_for_each_entry(ptr, &tomoyo_alias_list, list) {
 		if (ptr->original_name != saved_original_name ||
@@ -498,7 +491,6 @@ static int tomoyo_update_alias_entry(const char *original_name,
 	error = 0;
  out:
 	up_write(&tomoyo_alias_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -566,7 +558,6 @@ int tomoyo_delete_domain(char *domainname)
 
 	name.name = domainname;
 	tomoyo_fill_path_info(&name);
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_list_lock);
 	/* Is there an active domain? */
 	list_for_each_entry(domain, &tomoyo_domain_list, list) {
@@ -580,7 +571,6 @@ int tomoyo_delete_domain(char *domainname)
 		break;
 	}
 	up_write(&tomoyo_domain_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return 0;
 }
 
@@ -599,7 +589,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 	struct tomoyo_domain_info *domain = NULL;
 	const struct tomoyo_path_info *saved_domainname;
 
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_list_lock);
 	domain = tomoyo_find_domain(domainname);
 	if (domain)
@@ -618,7 +607,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 		    domain->domainname != saved_domainname)
 			continue;
 		flag = false;
-		/***** CRITICAL SECTION START *****/
 		read_lock(&tasklist_lock);
 		for_each_process(p) {
 			if (tomoyo_real_domain(p) != domain)
@@ -627,7 +615,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 			break;
 		}
 		read_unlock(&tasklist_lock);
-		/***** CRITICAL SECTION END *****/
 		if (flag)
 			continue;
 		list_for_each_entry(ptr, &domain->acl_info_list, list) {
@@ -650,7 +637,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
 	}
  out:
 	up_write(&tomoyo_domain_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return domain;
 }
 
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index 2316da8ec5bca..adf786d7421db 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -166,7 +166,6 @@ static int tomoyo_update_globally_readable_entry(const char *filename,
 	saved_filename = tomoyo_save_name(filename);
 	if (!saved_filename)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_globally_readable_list_lock);
 	list_for_each_entry(ptr, &tomoyo_globally_readable_list, list) {
 		if (ptr->filename != saved_filename)
@@ -187,7 +186,6 @@ static int tomoyo_update_globally_readable_entry(const char *filename,
 	error = 0;
  out:
 	up_write(&tomoyo_globally_readable_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -284,7 +282,6 @@ static int tomoyo_update_file_pattern_entry(const char *pattern,
 	saved_pattern = tomoyo_save_name(pattern);
 	if (!saved_pattern)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_pattern_list_lock);
 	list_for_each_entry(ptr, &tomoyo_pattern_list, list) {
 		if (saved_pattern != ptr->pattern)
@@ -305,7 +302,6 @@ static int tomoyo_update_file_pattern_entry(const char *pattern,
 	error = 0;
  out:
 	up_write(&tomoyo_pattern_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -407,7 +403,6 @@ static int tomoyo_update_no_rewrite_entry(const char *pattern,
 	saved_pattern = tomoyo_save_name(pattern);
 	if (!saved_pattern)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_no_rewrite_list_lock);
 	list_for_each_entry(ptr, &tomoyo_no_rewrite_list, list) {
 		if (ptr->pattern != saved_pattern)
@@ -428,7 +423,6 @@ static int tomoyo_update_no_rewrite_entry(const char *pattern,
 	error = 0;
  out:
 	up_write(&tomoyo_no_rewrite_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -745,7 +739,6 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
 	saved_filename = tomoyo_save_name(filename);
 	if (!saved_filename)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_acl_info_list_lock);
 	if (is_delete)
 		goto delete;
@@ -800,7 +793,6 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
 	}
  out:
 	up_write(&tomoyo_domain_acl_info_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
@@ -836,7 +828,6 @@ static int tomoyo_update_double_path_acl(const u8 type, const char *filename1,
 	saved_filename2 = tomoyo_save_name(filename2);
 	if (!saved_filename1 || !saved_filename2)
 		return -ENOMEM;
-	/***** EXCLUSIVE SECTION START *****/
 	down_write(&tomoyo_domain_acl_info_list_lock);
 	if (is_delete)
 		goto delete;
@@ -884,7 +875,6 @@ static int tomoyo_update_double_path_acl(const u8 type, const char *filename1,
 	}
  out:
 	up_write(&tomoyo_domain_acl_info_list_lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return error;
 }
 
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 40927a84cb6e6..3948f6b56ae2d 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -220,7 +220,6 @@ void *tomoyo_alloc_element(const unsigned int size)
 		= roundup(size, max(sizeof(void *), sizeof(long)));
 	if (word_aligned_size > PATH_MAX)
 		return NULL;
-	/***** EXCLUSIVE SECTION START *****/
 	mutex_lock(&lock);
 	if (buf_used_len + word_aligned_size > PATH_MAX) {
 		if (!tomoyo_quota_for_elements ||
@@ -251,7 +250,6 @@ void *tomoyo_alloc_element(const unsigned int size)
 		}
 	}
 	mutex_unlock(&lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return ptr;
 }
 
@@ -318,7 +316,6 @@ const struct tomoyo_path_info *tomoyo_save_name(const char *name)
 		return NULL;
 	}
 	hash = full_name_hash((const unsigned char *) name, len - 1);
-	/***** EXCLUSIVE SECTION START *****/
 	mutex_lock(&lock);
 	list_for_each_entry(ptr, &tomoyo_name_list[hash % TOMOYO_MAX_HASH],
 			     list) {
@@ -366,7 +363,6 @@ const struct tomoyo_path_info *tomoyo_save_name(const char *name)
 	}
  out:
 	mutex_unlock(&lock);
-	/***** EXCLUSIVE SECTION END *****/
 	return ptr ? &ptr->entry : NULL;
 }
 
-- 
GitLab


From 7d2948b1248109dbc7f4aaf9867c54b1912d494c Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Date: Tue, 2 Jun 2009 20:42:24 +0900
Subject: [PATCH 1541/2198] TOMOYO: Simplify policy reader.

We can directly assign the result of tomoyo_io_printf() to done flag.

Signed-off-by: Kentaro Takeda <takedakn@nttdata.co.jp>
Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: Toshiharu Harada <haradats@nttdata.co.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/common.c | 41 ++++++++++++++++++----------------------
 security/tomoyo/domain.c | 31 ++++++++++++++----------------
 security/tomoyo/file.c   | 21 +++++++++-----------
 3 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index a42fe02c61444..6d2561276a7b1 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -1113,10 +1113,9 @@ static int tomoyo_read_manager_policy(struct tomoyo_io_buffer *head)
 				 list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, "%s\n", ptr->manager->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, "%s\n", ptr->manager->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_policy_manager_list_lock);
 	head->read_eof = done;
@@ -1441,15 +1440,14 @@ static int tomoyo_read_domain_policy(struct tomoyo_io_buffer *head)
 		    TOMOYO_DOMAIN_FLAGS_IGNORE_GLOBAL_ALLOW_READ)
 			ignore_global_allow_read
 				= TOMOYO_KEYWORD_IGNORE_GLOBAL_ALLOW_READ "\n";
-		if (!tomoyo_io_printf(head,
-				      "%s\n" TOMOYO_KEYWORD_USE_PROFILE "%u\n"
-				      "%s%s%s\n", domain->domainname->name,
-				      domain->profile, quota_exceeded,
-				      transition_failed,
-				      ignore_global_allow_read)) {
-			done = false;
+		done = tomoyo_io_printf(head, "%s\n" TOMOYO_KEYWORD_USE_PROFILE
+					"%u\n%s%s%s\n",
+					domain->domainname->name,
+					domain->profile, quota_exceeded,
+					transition_failed,
+					ignore_global_allow_read);
+		if (!done)
 			break;
-		}
 		head->read_step = 2;
 acl_loop:
 		if (head->read_step == 3)
@@ -1457,24 +1455,22 @@ static int tomoyo_read_domain_policy(struct tomoyo_io_buffer *head)
 		/* Print ACL entries in the domain. */
 		down_read(&tomoyo_domain_acl_info_list_lock);
 		list_for_each_cookie(apos, head->read_var2,
-				      &domain->acl_info_list) {
+				     &domain->acl_info_list) {
 			struct tomoyo_acl_info *ptr
 				= list_entry(apos, struct tomoyo_acl_info,
-					      list);
-			if (!tomoyo_print_entry(head, ptr)) {
-				done = false;
+					     list);
+			done = tomoyo_print_entry(head, ptr);
+			if (!done)
 				break;
-			}
 		}
 		up_read(&tomoyo_domain_acl_info_list_lock);
 		if (!done)
 			break;
 		head->read_step = 3;
 tail_mark:
-		if (!tomoyo_io_printf(head, "\n")) {
-			done = false;
+		done = tomoyo_io_printf(head, "\n");
+		if (!done)
 			break;
-		}
 		head->read_step = 1;
 		if (head->read_single_domain)
 			break;
@@ -1544,11 +1540,10 @@ static int tomoyo_read_domain_profile(struct tomoyo_io_buffer *head)
 		domain = list_entry(pos, struct tomoyo_domain_info, list);
 		if (domain->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, "%u %s\n", domain->profile,
-				      domain->domainname->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, "%u %s\n", domain->profile,
+					domain->domainname->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_domain_list_lock);
 	head->read_eof = done;
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index aa119ca5a7828..34bb641c67438 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -189,13 +189,12 @@ bool tomoyo_read_domain_initializer_policy(struct tomoyo_io_buffer *head)
 			from = " from ";
 			domain = ptr->domainname->name;
 		}
-		if (!tomoyo_io_printf(head,
-				      "%s" TOMOYO_KEYWORD_INITIALIZE_DOMAIN
-				      "%s%s%s\n", no, ptr->program->name, from,
-				      domain)) {
-			done = false;
+		done = tomoyo_io_printf(head,
+					"%s" TOMOYO_KEYWORD_INITIALIZE_DOMAIN
+					"%s%s%s\n", no, ptr->program->name,
+					from, domain);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_domain_initializer_list_lock);
 	return done;
@@ -387,13 +386,12 @@ bool tomoyo_read_domain_keeper_policy(struct tomoyo_io_buffer *head)
 			from = " from ";
 			program = ptr->program->name;
 		}
-		if (!tomoyo_io_printf(head,
-				      "%s" TOMOYO_KEYWORD_KEEP_DOMAIN
-				      "%s%s%s\n", no, program, from,
-				      ptr->domainname->name)) {
-			done = false;
+		done = tomoyo_io_printf(head,
+					"%s" TOMOYO_KEYWORD_KEEP_DOMAIN
+					"%s%s%s\n", no, program, from,
+					ptr->domainname->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_domain_keeper_list_lock);
 	return done;
@@ -513,12 +511,11 @@ bool tomoyo_read_alias_policy(struct tomoyo_io_buffer *head)
 		ptr = list_entry(pos, struct tomoyo_alias_entry, list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_ALIAS "%s %s\n",
-				      ptr->original_name->name,
-				      ptr->aliased_name->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_ALIAS "%s %s\n",
+					ptr->original_name->name,
+					ptr->aliased_name->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_alias_list_lock);
 	return done;
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index adf786d7421db..a67f9e61ee607 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -247,11 +247,10 @@ bool tomoyo_read_globally_readable_policy(struct tomoyo_io_buffer *head)
 				 list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_ALLOW_READ "%s\n",
-				      ptr->filename->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_ALLOW_READ "%s\n",
+					ptr->filename->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_globally_readable_list_lock);
 	return done;
@@ -369,11 +368,10 @@ bool tomoyo_read_file_pattern(struct tomoyo_io_buffer *head)
 		ptr = list_entry(pos, struct tomoyo_pattern_entry, list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_FILE_PATTERN "%s\n",
-				      ptr->pattern->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_FILE_PATTERN
+					"%s\n", ptr->pattern->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_pattern_list_lock);
 	return done;
@@ -483,11 +481,10 @@ bool tomoyo_read_no_rewrite_policy(struct tomoyo_io_buffer *head)
 		ptr = list_entry(pos, struct tomoyo_no_rewrite_entry, list);
 		if (ptr->is_deleted)
 			continue;
-		if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_DENY_REWRITE "%s\n",
-				      ptr->pattern->name)) {
-			done = false;
+		done = tomoyo_io_printf(head, TOMOYO_KEYWORD_DENY_REWRITE
+					"%s\n", ptr->pattern->name);
+		if (!done)
 			break;
-		}
 	}
 	up_read(&tomoyo_no_rewrite_list_lock);
 	return done;
-- 
GitLab


From 6799687a53a28536fd027ccb644833f66a778925 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 2 Jun 2009 08:23:58 +0200
Subject: [PATCH 1542/2198] x86, boot: add new generated files to the
 appropriate .gitignore files

git status complains of untracked (generated) files in arch/x86/boot..

# Untracked files:
#   (use "git add <file>..." to include in what will be committed)
#
#       ../../arch/x86/boot/compressed/mkpiggy
#       ../../arch/x86/boot/compressed/piggy.S
#       ../../arch/x86/boot/compressed/vmlinux.lds
#       ../../arch/x86/boot/voffset.h
#       ../../arch/x86/boot/zoffset.h

..so adjust .gitignore files accordingly.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
---
 arch/x86/boot/.gitignore            | 2 ++
 arch/x86/boot/compressed/.gitignore | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index 172cf8a98bdd2..851fe936d2421 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -3,6 +3,8 @@ bzImage
 cpustr.h
 mkcpustr
 offsets.h
+voffset.h
+zoffset.h
 setup
 setup.bin
 setup.elf
diff --git a/arch/x86/boot/compressed/.gitignore b/arch/x86/boot/compressed/.gitignore
index 63eff3b04d018..4a46fab7162e3 100644
--- a/arch/x86/boot/compressed/.gitignore
+++ b/arch/x86/boot/compressed/.gitignore
@@ -1,3 +1,6 @@
 relocs
 vmlinux.bin.all
 vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
-- 
GitLab


From a05c0205ba031c01bba33a21bf0a35920eb64833 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 3 Jun 2009 09:33:18 +0200
Subject: [PATCH 1543/2198] block: Fix bounce limit setting in DM

blk_queue_bounce_limit() is more than a wrapper about the request queue
limits.bounce_pfn variable.  Introduce blk_queue_bounce_pfn() which can
be called by stacking drivers that wish to set the bounce limit
explicitly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 17 +++++++++++++++++
 drivers/md/dm-table.c  |  2 +-
 include/linux/blkdev.h |  1 +
 3 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d33934928918..9acd0b7e802a7 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -193,6 +193,23 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
+/**
+ * blk_queue_bounce_pfn - set the bounce buffer limit for queue
+ * @q: the request queue for the device
+ * @pfn: max address
+ *
+ * Description:
+ *    This function is similar to blk_queue_bounce_limit except it
+ *    neither changes allocation flags, nor does it set up the ISA DMA
+ *    pool. This function should only be used by stacking drivers.
+ *    Hardware drivers should use blk_queue_bounce_limit instead.
+ */
+void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn)
+{
+	q->limits.bounce_pfn = pfn;
+}
+EXPORT_SYMBOL(blk_queue_bounce_pfn);
+
 /**
  * blk_queue_max_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb242b09..3ca1604ddd5ca 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_segment_size(q, t->limits.max_segment_size);
 	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
 	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
+	blk_queue_bounce_pfn(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5e740a135e733..989aa1790f48d 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,6 +910,7 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
+extern void blk_queue_bounce_pfn(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-- 
GitLab


From a32881066e58346f2901afe0ebdfbf0c562877e5 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 3 Jun 2009 13:12:55 +0800
Subject: [PATCH 1544/2198] perf_counter/x86: Remove the IRQ (non-NMI) handling
 bits

Remove the IRQ (non-NMI) handling bits as NMI will be used always.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <20090603051255.GA2791@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/entry_arch.h  |  1 -
 arch/x86/include/asm/hw_irq.h      |  1 -
 arch/x86/include/asm/irq_vectors.h |  5 -----
 arch/x86/kernel/cpu/perf_counter.c | 21 ++++++---------------
 arch/x86/kernel/entry_64.S         |  2 --
 arch/x86/kernel/irqinit_32.c       |  1 -
 arch/x86/kernel/irqinit_64.c       |  1 -
 7 files changed, 6 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index fe24d28024900..d750a10ccad66 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -49,7 +49,6 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
 BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
 
 #ifdef CONFIG_PERF_COUNTERS
-BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR)
 BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)
 #endif
 
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 7309c0ad69023..4b4921d7a28ed 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,6 @@
 extern void apic_timer_interrupt(void);
 extern void generic_interrupt(void);
 extern void error_interrupt(void);
-extern void perf_counter_interrupt(void);
 extern void perf_pending_interrupt(void);
 
 extern void spurious_interrupt(void);
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 545bb811ccb52..4492e19f83913 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -106,11 +106,6 @@
  */
 #define LOCAL_TIMER_VECTOR		0xef
 
-/*
- * Performance monitoring interrupt vector:
- */
-#define LOCAL_PERF_VECTOR		0xee
-
 /*
  * Generic system vector for platform specific use
  */
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e16e8c13132fd..12cc05ed9f488 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -40,7 +40,7 @@ struct cpu_hw_counters {
 struct x86_pmu {
 	const char	*name;
 	int		version;
-	int		(*handle_irq)(struct pt_regs *, int);
+	int		(*handle_irq)(struct pt_regs *);
 	void		(*disable_all)(void);
 	void		(*enable_all)(void);
 	void		(*enable)(struct hw_perf_counter *, int);
@@ -755,7 +755,7 @@ static void intel_pmu_reset(void)
  * This handler is triggered by the local APIC, so the APIC IRQ handling
  * rules apply:
  */
-static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
 	struct cpu_hw_counters *cpuc;
 	struct cpu_hw_counters;
@@ -794,7 +794,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, nmi, regs, 0))
+		if (perf_counter_overflow(counter, 1, regs, 0))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -812,7 +812,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi)
 	return 1;
 }
 
-static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
+static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
 	int cpu, idx, handled = 0;
 	struct cpu_hw_counters *cpuc;
@@ -840,22 +840,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, nmi, regs, 0))
+		if (perf_counter_overflow(counter, 1, regs, 0))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
 	return handled;
 }
 
-void smp_perf_counter_interrupt(struct pt_regs *regs)
-{
-	irq_enter();
-	apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR);
-	ack_APIC_irq();
-	x86_pmu.handle_irq(regs, 0);
-	irq_exit();
-}
-
 void smp_perf_pending_interrupt(struct pt_regs *regs)
 {
 	irq_enter();
@@ -910,7 +901,7 @@ perf_counter_nmi_handler(struct notifier_block *self,
 	 * If the first NMI handles both, the latter will be empty and daze
 	 * the CPU.
 	 */
-	x86_pmu.handle_irq(regs, 1);
+	x86_pmu.handle_irq(regs);
 
 	return NOTIFY_STOP;
 }
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8910046191425..7985c010f8ac9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1026,8 +1026,6 @@ apicinterrupt SPURIOUS_APIC_VECTOR \
 	spurious_interrupt smp_spurious_interrupt
 
 #ifdef CONFIG_PERF_COUNTERS
-apicinterrupt LOCAL_PERF_VECTOR \
-	perf_counter_interrupt smp_perf_counter_interrupt
 apicinterrupt LOCAL_PENDING_VECTOR \
 	perf_pending_interrupt smp_perf_pending_interrupt
 #endif
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index 3190a6b961e64..205bdd880d31e 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -165,7 +165,6 @@ static void __init apic_intr_init(void)
 	alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
 	alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
 # ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 # endif
 
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 53ceb26f80ff3..fa6ef692000f0 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -155,7 +155,6 @@ static void __init apic_intr_init(void)
 
 	/* Performance monitoring interrupt: */
 #ifdef CONFIG_PERF_COUNTERS
-	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
 	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
 #endif
 }
-- 
GitLab


From 8ce998d6693bd02ab3b74ee1cc303ecb1fa9b514 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 3 Jun 2009 00:54:33 -0300
Subject: [PATCH 1545/2198] perf_counter tools: Cover PLT symbols too

PLT, the Program Linking Table, is used with the dynamic linker to
allow PIC code in executables and shared objects to figure out
where functions are in other shared objects.

It is one of the sources of unknown/unresolved symbols - this patch
does what binutils figures out when you ask it to disassembly.
(objdump -S)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/symbol.c | 143 ++++++++++++++++++++++-
 1 file changed, 138 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 039931fcb1b54..d52a1ae5342a2 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -258,6 +258,117 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	return sec;
 }
 
+#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \
+	for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \
+	     idx < nr_entries; \
+	     ++idx, pos = gelf_getrel(reldata, idx, &pos_mem))
+
+#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \
+	for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \
+	     idx < nr_entries; \
+	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
+
+static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
+				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
+				       GElf_Shdr *shdr_dynsym,
+				       size_t dynsym_idx)
+{
+	uint32_t nr_rel_entries, idx;
+	GElf_Sym sym;
+	__u64 plt_offset;
+	GElf_Shdr shdr_plt;
+	struct symbol *f;
+	GElf_Shdr shdr_rel_plt;
+	Elf_Data *reldata, *syms, *symstrs;
+	Elf_Scn *scn_plt_rel, *scn_symstrs;
+	char sympltname[1024];
+	int nr = 0, symidx;
+
+	scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+					  ".rela.plt", NULL);
+	if (scn_plt_rel == NULL) {
+		scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+						  ".rel.plt", NULL);
+		if (scn_plt_rel == NULL)
+			return 0;
+	}
+
+	if (shdr_rel_plt.sh_link != dynsym_idx)
+		return 0;
+
+	if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
+		return 0;
+
+	/*
+	 * Fetch the relocation section to find the indexes to the GOT
+	 * and the symbols in the .dynsym they refer to.
+	 */
+	reldata = elf_getdata(scn_plt_rel, NULL);
+	if (reldata == NULL)
+		return -1;
+
+	syms = elf_getdata(scn_dynsym, NULL);
+	if (syms == NULL)
+		return -1;
+
+	scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
+	if (scn_symstrs == NULL)
+		return -1;
+
+	symstrs = elf_getdata(scn_symstrs, NULL);
+	if (symstrs == NULL)
+		return -1;
+
+	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
+	plt_offset = shdr_plt.sh_offset;
+
+	if (shdr_rel_plt.sh_type == SHT_RELA) {
+		GElf_Rela pos_mem, *pos;
+
+		elf_section__for_each_rela(reldata, pos, pos_mem, idx,
+					   nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else if (shdr_rel_plt.sh_type == SHT_REL) {
+		GElf_Rel pos_mem, *pos;
+		elf_section__for_each_rel(reldata, pos, pos_mem, idx,
+					  nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else {
+		/*
+		 * TODO: There are still one more shdr_rel_plt.sh_type
+		 * I have to investigate, but probably should be ignored.
+		 */
+	}
+
+	return nr;
+}
+
 static int dso__load_sym(struct dso *self, int fd, const char *name,
 			 symbol_filter_t filter)
 {
@@ -269,8 +380,9 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	GElf_Shdr shdr;
 	Elf_Data *syms;
 	GElf_Sym sym;
-	Elf_Scn *sec;
+	Elf_Scn *sec, *sec_dynsym;
 	Elf *elf;
+	size_t dynsym_idx;
 	int nr = 0;
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
@@ -285,12 +397,33 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 		goto out_elf_end;
 	}
 
+	/*
+	 * We need to check if we have a .dynsym, so that we can handle the
+	 * .plt, synthesizing its symbols, that aren't on the symtabs (be it
+	 * .dynsym or .symtab)
+	 */
+	sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
+					 ".dynsym", &dynsym_idx);
+	if (sec_dynsym != NULL) {
+		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
+						 sec_dynsym, &shdr,
+						 dynsym_idx);
+		if (nr < 0)
+			goto out_elf_end;
+	}
+
+	/*
+	 * But if we have a full .symtab (that is a superset of .dynsym) we
+	 * should add the symbols not in the .dynsyn
+	 */
 	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
-	if (sec == NULL)
-		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+	if (sec == NULL) {
+		if (sec_dynsym == NULL)
+			goto out_elf_end;
 
-	if (sec == NULL)
-		goto out_elf_end;
+		sec = sec_dynsym;
+		gelf_getshdr(sec, &shdr);
+	}
 
 	syms = elf_getdata(sec, NULL);
 	if (syms == NULL)
-- 
GitLab


From 3502973d005ed89cc2b3f39780813a341ddba97f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 09:38:58 +0200
Subject: [PATCH 1546/2198] perf report: Print -D to stdout

-D prints to stderr - which is a bit confusing - print to stdout
instead.

Also clean up the if (dump_trace) patterns via a dprintf helper.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 64 ++++++++++-----------
 1 file changed, 30 insertions(+), 34 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9da990fba4a5e..6207a3147fcb7 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -31,6 +31,8 @@ static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
 static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
 static int		verbose;
 static int		full_paths;
 
@@ -729,14 +731,12 @@ static int __cmd_report(void)
 		uint64_t ip = event->ip.ip;
 		struct map *map = NULL;
 
-		if (dump_trace) {
-			fprintf(stderr, "%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-				(void *)(offset + head),
-				(void *)(long)(event->header.size),
-				event->header.misc,
-				event->ip.pid,
-				(void *)(long)ip);
-		}
+		dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.misc,
+			event->ip.pid,
+			(void *)(long)ip);
 
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, skipping it.\n",
@@ -781,15 +781,14 @@ static int __cmd_report(void)
 		struct thread *thread = threads__findnew(event->mmap.pid);
 		struct map *map = map__new(&event->mmap, cwdp, cwdlen);
 
-		if (dump_trace) {
-			fprintf(stderr, "%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
-				(void *)(offset + head),
-				(void *)(long)(event->header.size),
-				(void *)(long)event->mmap.start,
-				(void *)(long)event->mmap.len,
-				(void *)(long)event->mmap.pgoff,
-				event->mmap.filename);
-		}
+		dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			(void *)(long)event->mmap.start,
+			(void *)(long)event->mmap.len,
+			(void *)(long)event->mmap.pgoff,
+			event->mmap.filename);
+
 		if (thread == NULL || map == NULL) {
 			if (verbose)
 				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
@@ -802,12 +801,11 @@ static int __cmd_report(void)
 	case PERF_EVENT_COMM: {
 		struct thread *thread = threads__findnew(event->comm.pid);
 
-		if (dump_trace) {
-			fprintf(stderr, "%p [%p]: PERF_EVENT_COMM: %s:%d\n",
-				(void *)(offset + head),
-				(void *)(long)(event->header.size),
-				event->comm.comm, event->comm.pid);
-		}
+		dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->comm.comm, event->comm.pid);
+
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
 			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
@@ -818,11 +816,10 @@ static int __cmd_report(void)
 	}
 	default: {
 broken_event:
-		if (dump_trace)
-			fprintf(stderr, "%p [%p]: skipping unknown header type: %d\n",
-					(void *)(offset + head),
-					(void *)(long)(event->header.size),
-					event->header.type);
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
 
 		total_unknown++;
 
@@ -846,14 +843,13 @@ static int __cmd_report(void)
 	rc = EXIT_SUCCESS;
 	close(input);
 
-	if (dump_trace) {
-		fprintf(stderr, "      IP events: %10ld\n", total);
-		fprintf(stderr, "    mmap events: %10ld\n", total_mmap);
-		fprintf(stderr, "    comm events: %10ld\n", total_comm);
-		fprintf(stderr, " unknown events: %10ld\n", total_unknown);
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf(" unknown events: %10ld\n", total_unknown);
 
+	if (dump_trace)
 		return 0;
-	}
 
 	if (verbose >= 2)
 		dsos__fprintf(stdout);
-- 
GitLab


From 5352f35d6ae7b8b981d77137fb268bc54d10624f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 10:07:39 +0200
Subject: [PATCH 1547/2198] perf report: Improve sort key recognition

 - allow case-insensitive tokens - such as --sort Comm,Symbol
 - allow substring shortcuts: --sort sym
 - detect invalid tokens and bail out

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 29 ++++++++++++---------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6207a3147fcb7..a8d390596d8d3 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -453,28 +453,18 @@ static int sort_dimension__add(char *tok)
 		if (sd->taken)
 			continue;
 
-		if (strcmp(tok, sd->name))
+		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
 		sd->taken = 1;
+
 		return 0;
 	}
 
 	return -ESRCH;
 }
 
-static void setup_sorting(void)
-{
-	char *tmp, *tok, *str = strdup(sort_order);
-
-	for (tok = strtok_r(str, ", ", &tmp);
-			tok; tok = strtok_r(NULL, ", ", &tmp))
-		sort_dimension__add(tok);
-
-	free(str);
-}
-
 static int64_t
 hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -880,6 +870,21 @@ static const struct option options[] = {
 	OPT_END()
 };
 
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(report_usage, options);
+		}
+	}
+
+	free(str);
+}
+
 int cmd_report(int argc, const char **argv, const char *prefix)
 {
 	symbol__init();
-- 
GitLab


From 367d04c4ec02dad34d80452e32e3370db7fb6fee Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 28 May 2009 09:54:48 +0200
Subject: [PATCH 1548/2198] amd_iommu: fix lock imbalance

In alloc_coherent there is an omitted unlock on the path where mapping
fails. Add the unlock.

[ Impact: fix lock imbalance in alloc_coherent ]

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index d6898833c363b..9f89bb645b314 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1541,8 +1541,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
 				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
-	if (*dma_addr == bad_dma_address)
+	if (*dma_addr == bad_dma_address) {
+		spin_unlock_irqrestore(&domain->lock, flags);
 		goto out_free;
+	}
 
 	iommu_completion_wait(iommu);
 
-- 
GitLab


From ed966aac335a63083d3125198479447248637d9e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 10:39:26 +0200
Subject: [PATCH 1549/2198] perf report: Handle vDSO symbols properly

We were not looking up vDSO symbols properly, because they
are in the kallsyms but are user-mode entries.

Pass negative addresses to the kernel dso object, this
way we resolve them properly:

     0.05%  [kernel]: vread_tsc

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index a8d390596d8d3..0f88d9ebb34a3 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -728,6 +728,8 @@ static int __cmd_report(void)
 			event->ip.pid,
 			(void *)(long)ip);
 
+		dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, skipping it.\n",
 				event->header.type);
@@ -740,6 +742,8 @@ static int __cmd_report(void)
 
 			dso = kernel_dso;
 
+			dprintf(" ...... dso: %s\n", dso->name);
+
 		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
 
 			show = SHOW_USER;
@@ -749,11 +753,22 @@ static int __cmd_report(void)
 			if (map != NULL) {
 				dso = map->dso;
 				ip -= map->start + map->pgoff;
+			} else {
+				/*
+				 * If this is outside of all known maps,
+				 * and is a negative address, try to look it
+				 * up in the kernel dso, as it might be a
+				 * vsyscall (which executes in user-mode):
+				 */
+				if ((long long)ip < 0)
+					dso = kernel_dso;
 			}
+			dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
 
 		} else {
 			show = SHOW_HV;
 			level = 'H';
+			dprintf(" ...... dso: [hypervisor]\n");
 		}
 
 		if (show & show_mask) {
-- 
GitLab


From e09f9446b94ac64b27d37e98c1110f29d712cdad Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Jun 2009 10:07:44 +0100
Subject: [PATCH 1550/2198] GFS2: Remove unused variable

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 841ddc9793881..73318a3ce6f1b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -694,8 +694,6 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 
 static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 {
-	struct gfs2_inode *ip = GFS2_I(file->f_mapping->host);
-
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
 	if (fl->fl_type & LOCK_MAND)
-- 
GitLab


From 6984efb692e97ce5f75f26e595685c04c2061bac Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 3 Jun 2009 19:38:58 +1000
Subject: [PATCH 1551/2198] perf_counter: powerpc: Fix event alternative code
 generation on POWER5/5+

Commit ef923214 ("perf_counter: powerpc: use u64 for event
codes internally") introduced a bug where the return value from
function find_alternative_bdecode gets put into a u64 variable
and later tested to see if it is < 0.  The effect is that we
get extra, bogus event code alternatives on POWER5 and POWER5+,
leading to error messages such as "oops compute_mmcr failed"
being printed and counters not counting properly.

This fixes it by using s64 for the return type of
find_alternative_bdecode and for the local variable that the
caller puts the value in.  It also makes the event argument a
u64 on POWER5+ for consistency.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <18982.17586.666132.90983@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/power5+-pmu.c | 4 ++--
 arch/powerpc/kernel/power5-pmu.c  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index c6cdfc165d6ef..8471e3c2e4658 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -242,7 +242,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * event code for those that do, or -1 otherwise.  This also handles
  * alternative PCMSEL values for add events.
  */
-static int find_alternative_bdecode(unsigned int event)
+static s64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -277,7 +277,7 @@ static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nalt = 1;
 	int nlim;
-	u64 ae;
+	s64 ae;
 
 	alt[0] = event;
 	nalt = 1;
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d5344968ee9cb..1b44c5fca1898 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = {
  * PMCSEL values on other counters.  This returns the alternative
  * event code for those that do, or -1 otherwise.
  */
-static u64 find_alternative_bdecode(u64 event)
+static s64 find_alternative_bdecode(u64 event)
 {
 	int pmc, altpmc, pp, j;
 
@@ -272,7 +272,7 @@ static u64 find_alternative_bdecode(u64 event)
 static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 {
 	int i, j, nalt = 1;
-	u64 ae;
+	s64 ae;
 
 	alt[0] = event;
 	nalt = 1;
-- 
GitLab


From dcd945e0d8a6d654e3e1de51faea9f98f1504aa5 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 3 Jun 2009 19:40:36 +1000
Subject: [PATCH 1552/2198] perf_counter: powerpc: Fix race causing "oops
 trying to read PMC0" errors

When using interrupting counters and limited (non-interrupting)
counters at the same time, it's possible that we get an
interrupt in write_mmcr0() after writing MMCR0 but before we
have set up the counters using limited PMCs.  What happens then
is that we get into perf_counter_interrupt() with
counter->hw.idx = 0 for the limited counters, leading to the
"oops trying to read PMC0" error message being printed.

This fixes the problem by making perf_counter_interrupt()
robust against counter->hw.idx being zero (the counter is just
ignored in that case) and also by changing write_mmcr0() to
write MMCR0 initially with the counter overflow interrupt
enable bits masked (set to 0).  If the MMCR0 value requested by
the caller has either of those bits set, we write MMCR0 again
with the requested value of those bits after setting up the
limited counters properly.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <18982.17684.138182.954599@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index ea54686cb7878..4cc4ac5c791c3 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -372,16 +372,28 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0)
 
 	/*
 	 * Write MMCR0, then read PMC5 and PMC6 immediately.
+	 * To ensure we don't get a performance monitor interrupt
+	 * between writing MMCR0 and freezing/thawing the limited
+	 * counters, we first write MMCR0 with the counter overflow
+	 * interrupt enable bits turned off.
 	 */
 	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5"
 		     : "=&r" (pmc5), "=&r" (pmc6)
-		     : "r" (mmcr0), "i" (SPRN_MMCR0),
+		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)),
+		       "i" (SPRN_MMCR0),
 		       "i" (SPRN_PMC5), "i" (SPRN_PMC6));
 
 	if (mmcr0 & MMCR0_FC)
 		freeze_limited_counters(cpuhw, pmc5, pmc6);
 	else
 		thaw_limited_counters(cpuhw, pmc5, pmc6);
+
+	/*
+	 * Write the full MMCR0 including the counter overflow interrupt
+	 * enable bits, if necessary.
+	 */
+	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE))
+		mtspr(SPRN_MMCR0, mmcr0);
 }
 
 /*
@@ -1108,7 +1120,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
 
 	for (i = 0; i < cpuhw->n_counters; ++i) {
 		counter = cpuhw->counter[i];
-		if (is_limited_pmc(counter->hw.idx))
+		if (!counter->hw.idx || is_limited_pmc(counter->hw.idx))
 			continue;
 		val = read_pmc(counter->hw.idx);
 		if ((int)val < 0) {
-- 
GitLab


From 0e2595cdfd7df9f1128f7185152601ae5417483b Mon Sep 17 00:00:00 2001
From: Cliff Wickman <cpw@sgi.com>
Date: Wed, 20 May 2009 08:10:57 -0500
Subject: [PATCH 1553/2198] x86: Fix UV BAU activation descriptor init

The UV tlb shootdown code has a serious initialization error.

An array of structures [32*8] is initialized as if it were [32].
The array is indexed by (cpu number on the blade)*8, so the short
initialization works for up to 4 cpus on a blade.
But above that, we provide an invalid opcode to the hub's
broadcast assist unit.

This patch changes the allocation of the array to use its symbolic
dimensions for better clarity. And initializes all 32*8 entries.

Shortened 'UV_ACTIVATION_DESCRIPTOR_SIZE' to 'UV_ADP_SIZE' per Ingo's
recommendation.

Tested on the UV simulator.

Signed-off-by: Cliff Wickman <cpw@sgi.com>
Cc: <stable@kernel.org>
LKML-Reference: <E1M6lZR-0007kV-Aq@eag09.americas.sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_bau.h |  2 +-
 arch/x86/kernel/tlb_uv.c         | 15 +++++++++++++--
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 9b0e61bf7a88d..bddd44f2f0ab5 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -37,7 +37,7 @@
 #define UV_CPUS_PER_ACT_STATUS		32
 #define UV_ACT_STATUS_MASK		0x3
 #define UV_ACT_STATUS_SIZE		2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE	32
+#define UV_ADP_SIZE			32
 #define UV_DISTRIBUTION_SIZE		256
 #define UV_SW_ACK_NPENDING		8
 #define UV_NET_ENDPOINT_INTD		0x38
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index ed0c33761e6d1..16f0fd4f18eca 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
 	struct bau_desc *adp;
 	struct bau_desc *ad2;
 
-	adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+	/*
+	 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+	 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+	 */
+	adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+		UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
 	BUG_ON(!adp);
 
 	pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
 				      (n << UV_DESC_BASE_PNODE_SHIFT | m));
 	}
 
-	for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+	/*
+	 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+	 * cpu even though we only use the first one; one descriptor can
+	 * describe a broadcast to 256 nodes.
+	 */
+	for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+		i++, ad2++) {
 		memset(ad2, 0, sizeof(struct bau_desc));
 		ad2->header.sw_ack_flag = 1;
 		/*
-- 
GitLab


From 226f62fdd53d5b2c74e242aa11f6ad43d0285d3f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 11:23:56 +0200
Subject: [PATCH 1554/2198] perf_counter: Add a comm hook for pure fork()s

I noticed missing COMM events and found that we missed
reporting them for pure forks.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/kernel/fork.c b/kernel/fork.c
index 23bf757ed3211..b7d7a9f0bd7ad 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1412,6 +1412,12 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			p->vfork_done = &vfork;
 			init_completion(&vfork);
+		} else {
+			/*
+			 * vfork will do an exec which will call
+			 * set_task_comm()
+			 */
+			perf_counter_comm(p);
 		}
 
 		audit_finish_fork(p);
-- 
GitLab


From e61078a0c88773d3465b0b9d665c5ed6b952b1cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 11:24:33 +0200
Subject: [PATCH 1555/2198] perf record: Use long arg for counter period

I wrote this to test the extended period emulation, we might as
well merge it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c     |  6 +++---
 Documentation/perf_counter/util/parse-options.c | 16 ++++++++++++++++
 Documentation/perf_counter/util/parse-options.h |  2 ++
 3 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index ec3b73adbd9d1..cea5b8d3c633b 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -20,8 +20,8 @@
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
+static long			default_interval = 100000;
+static long			event_count[MAX_COUNTERS];
 
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 static int			nr_cpus				= 0;
@@ -494,7 +494,7 @@ static const struct option options[] = {
 			    "append to the output file to do incremental profiling"),
 	OPT_BOOLEAN('f', "force", &force,
 			"overwrite existing data file"),
-	OPT_INTEGER('c', "count", &default_interval,
+	OPT_LONG('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_STRING('o', "output", &output_name, "file",
 		    "output file name"),
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index 28b34c1c29cf5..b80abd9a99bcc 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -113,6 +113,22 @@ static int get_value(struct parse_opt_ctx_t *p,
 			return opterror(opt, "expects a numerical value", flags);
 		return 0;
 
+	case OPTION_LONG:
+		if (unset) {
+			*(long *)opt->value = 0;
+			return 0;
+		}
+		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) {
+			*(long *)opt->value = opt->defval;
+			return 0;
+		}
+		if (get_arg(p, opt, flags, &arg))
+			return -1;
+		*(long *)opt->value = strtol(arg, (char **)&s, 10);
+		if (*s)
+			return opterror(opt, "expects a numerical value", flags);
+		return 0;
+
 	default:
 		die("should not happen, someone must be hit on the forehead");
 	}
diff --git a/Documentation/perf_counter/util/parse-options.h b/Documentation/perf_counter/util/parse-options.h
index a81c7faff68e0..a1039a6ce0eba 100644
--- a/Documentation/perf_counter/util/parse-options.h
+++ b/Documentation/perf_counter/util/parse-options.h
@@ -14,6 +14,7 @@ enum parse_opt_type {
 	/* options with arguments (usually) */
 	OPTION_STRING,
 	OPTION_INTEGER,
+	OPTION_LONG,
 	OPTION_CALLBACK,
 };
 
@@ -97,6 +98,7 @@ struct option {
 #define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
 #define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
 #define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_LONG(s, l, v, h)        { OPTION_LONG, (s), (l), (v), NULL, (h) }
 #define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
 #define OPT_DATE(s, l, v, h) \
 	{ OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \
-- 
GitLab


From 8229289b607682f90b946ad2c319526303c17700 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 12:37:36 +0200
Subject: [PATCH 1556/2198] perf report: Fix comm sorting

Since we can (and do) change comm strings during the collection
phase, we cannot actually sort on them to build the histogram.
Therefore add an (optional) third sorting phase to collapse the
histrogram.

Comm sorting now builds the histrogram on threads and then in
the collapse phase collects all threads with the same comm.

This collapsed histogram is then reversed and sorted on events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 118 +++++++++++++++++++-
 1 file changed, 112 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 0f88d9ebb34a3..6d359c9f75ddd 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -211,9 +211,9 @@ static struct thread *thread__new(pid_t pid)
 
 	if (self != NULL) {
 		self->pid = pid;
-		self->comm = malloc(30);
+		self->comm = malloc(32);
 		if (self->comm)
-			sprintf(self->comm, ":%d", pid);
+			snprintf(self->comm, 32, ":%d", self->pid);
 		INIT_LIST_HEAD(&self->maps);
 	}
 
@@ -222,6 +222,8 @@ static struct thread *thread__new(pid_t pid)
 
 static int thread__set_comm(struct thread *self, const char *comm)
 {
+	if (self->comm)
+		free(self->comm);
 	self->comm = strdup(comm);
 	return self->comm ? 0 : -ENOMEM;
 }
@@ -303,9 +305,12 @@ struct sort_entry {
 	char *header;
 
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
 	size_t	(*print)(FILE *fp, struct hist_entry *);
 };
 
+/* --sort pid */
+
 static int64_t
 sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -324,8 +329,16 @@ static struct sort_entry sort_thread = {
 	.print	= sort__thread_print,
 };
 
+/* --sort comm */
+
 static int64_t
 sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 {
 	char *comm_l = left->thread->comm;
 	char *comm_r = right->thread->comm;
@@ -349,11 +362,14 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
-	.header = "          Command",
-	.cmp	= sort__comm_cmp,
-	.print	= sort__comm_print,
+	.header 	= "          Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
 };
 
+/* --sort dso */
+
 static int64_t
 sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -387,6 +403,8 @@ static struct sort_entry sort_dso = {
 	.print	= sort__dso_print,
 };
 
+/* --sort symbol */
+
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -428,6 +446,8 @@ static struct sort_entry sort_sym = {
 	.print	= sort__sym_print,
 };
 
+static int sort__need_collapse = 0;
+
 struct sort_dimension {
 	char *name;
 	struct sort_entry *entry;
@@ -456,6 +476,9 @@ static int sort_dimension__add(char *tok)
 		if (strncasecmp(tok, sd->name, strlen(tok)))
 			continue;
 
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
 		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
 		sd->taken = 1;
 
@@ -480,6 +503,25 @@ hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
 	return cmp;
 }
 
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
 static size_t
 hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 {
@@ -549,6 +591,64 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 	return 0;
 }
 
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
 /*
  * reverse the map, sort on count.
  */
@@ -577,9 +677,14 @@ static void output__insert_entry(struct hist_entry *he)
 
 static void output__resort(void)
 {
-	struct rb_node *next = rb_first(&hist);
+	struct rb_node *next;
 	struct hist_entry *n;
 
+	if (sort__need_collapse)
+		next = rb_first(&collapse_hists);
+	else
+		next = rb_first(&hist);
+
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
@@ -859,6 +964,7 @@ static int __cmd_report(void)
 	if (verbose >= 2)
 		dsos__fprintf(stdout);
 
+	collapse__resort();
 	output__resort();
 	output__fprintf(stdout, total);
 
-- 
GitLab


From a96bbc16418bc691317f265d6bf98ba941ca9c1a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 3 Jun 2009 14:01:36 +0200
Subject: [PATCH 1557/2198] perf_counter: Fix race in counter initialization

We need the PID namespace and counter ID available when the
counter overflows and we need to generate a sample event.

[ Impact: fix kernel crash with high-frequency sampling ]

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <new-submission>
[ fixed a further crash and cleaned up the initialization a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 317cef78a3883..ab4455447f844 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -48,6 +48,8 @@ int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
+static atomic64_t perf_counter_id;
+
 /*
  * Lock for (sysadmin-configurable) counter reservations:
  */
@@ -3351,14 +3353,18 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 
 	mutex_init(&counter->mmap_mutex);
 
-	counter->cpu			= cpu;
+	counter->cpu		= cpu;
 	counter->attr		= *attr;
-	counter->group_leader		= group_leader;
-	counter->pmu			= NULL;
-	counter->ctx			= ctx;
-	counter->oncpu			= -1;
+	counter->group_leader	= group_leader;
+	counter->pmu		= NULL;
+	counter->ctx		= ctx;
+	counter->oncpu		= -1;
+
+	counter->ns		= get_pid_ns(current->nsproxy->pid_ns);
+	counter->id		= atomic64_inc_return(&perf_counter_id);
+
+	counter->state		= PERF_COUNTER_STATE_INACTIVE;
 
-	counter->state = PERF_COUNTER_STATE_INACTIVE;
 	if (attr->disabled)
 		counter->state = PERF_COUNTER_STATE_OFF;
 
@@ -3402,6 +3408,8 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 		err = PTR_ERR(pmu);
 
 	if (err) {
+		if (counter->ns)
+			put_pid_ns(counter->ns);
 		kfree(counter);
 		return ERR_PTR(err);
 	}
@@ -3419,8 +3427,6 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	return counter;
 }
 
-static atomic64_t perf_counter_id;
-
 /**
  * sys_perf_counter_open - open a performance counter, associate it to a task/cpu
  *
@@ -3515,9 +3521,6 @@ SYSCALL_DEFINE5(perf_counter_open,
 	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
 	mutex_unlock(&current->perf_counter_mutex);
 
-	counter->ns = get_pid_ns(current->nsproxy->pid_ns);
-	counter->id = atomic64_inc_return(&perf_counter_id);
-
 	fput_light(counter_file, fput_needed2);
 
 out_fput:
-- 
GitLab


From a2023556409cf7fec5d67a26f7fcfa57c5a4086d Mon Sep 17 00:00:00 2001
From: Tim Bird <tim.bird@am.sony.com>
Date: Tue, 2 Jun 2009 17:06:54 -0700
Subject: [PATCH 1558/2198] ring-buffer: fix bug in ring_buffer_discard_commit

There's a bug in ring_buffer_discard_commit.  The wrong
pointer is being compared in order to check if the event
can be freed from the buffer rather than discarded
(i.e. marked as PAD).

I noticed this when I was working on duration filtering.
The bug is not deadly - it just results in lots of wasted
space in the buffer.  All filtered events are left in
the buffer and marked as discarded, rather than being
removed from the buffer to make space for other events.

Unfortunately, when I fixed this bug, I got errors doing a
filtered function trace.  Multiple TIME_EXTEND
events pile up in the buffer, and trigger the
following loop overage warning in rb_iter_peek():

again:
	...
	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
		return NULL;

I'm not sure what the best way is to fix this. I don't
know if I should extend the loop threshhold, or if I should
make the test more complex (ignore TIME_EXTEND
events), or just get rid of this loop check completely.

Note that if I implement a workaround for this, then I
see another problem from rb_advance_iter().  I haven't
tracked that one down yet.

In general, it seems like the case of removing filtered
events has not been working properly, and so some assumptions
about buffer invariant conditions need to be revisited.

Here's the patch for the simple fix:

Compare correct pointer for checking if an event can be
freed rather than left as discarded in the buffer.

Signed-off-by: Tim Bird <tim.bird@am.sony.com>
LKML-Reference: <4A25BE9E.5090909@am.sony.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 16b24d49604ca..9453023686912 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1708,7 +1708,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 
 	bpage = cpu_buffer->tail_page;
 
-	if (bpage == (void *)addr && rb_page_write(bpage) == old_index) {
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
 		/*
 		 * This is on the tail page. It is possible that
 		 * a write could come in and move the tail page
-- 
GitLab


From edd813bffc62a980bb4fb9b1243f31c1cce78da3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 2 Jun 2009 23:00:53 -0400
Subject: [PATCH 1559/2198] ring-buffer: try to discard unneeded timestamps

There are times that a race may happen that we add a timestamp in a
nested write. This timestamp would just contain a zero delta and serves
no purpose.

Now that we have a way to discard events, this patch will try to discard
the timestamp instead of just wasting the space in the ring buffer.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 67 +++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9453023686912..50926601a28dd 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1335,6 +1335,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 }
 
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+		  struct ring_buffer_event *event)
+{
+	unsigned long new_index, old_index;
+	struct buffer_page *bpage;
+	unsigned long index;
+	unsigned long addr;
+
+	new_index = rb_event_index(event);
+	old_index = new_index + rb_event_length(event);
+	addr = (unsigned long)event;
+	addr &= PAGE_MASK;
+
+	bpage = cpu_buffer->tail_page;
+
+	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+		/*
+		 * This is on the tail page. It is possible that
+		 * a write could come in and move the tail page
+		 * and write to the next page. That is fine
+		 * because we just shorten what is on this page.
+		 */
+		index = local_cmpxchg(&bpage->write, old_index, new_index);
+		if (index == old_index)
+			return 1;
+	}
+
+	/* could not discard */
+	return 0;
+}
+
 static int
 rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		  u64 *ts, u64 *delta)
@@ -1384,10 +1416,13 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 		/* let the caller know this was the commit */
 		ret = 1;
 	} else {
-		/* Darn, this is just wasted space */
-		event->time_delta = 0;
-		event->array[0] = 0;
-		ret = 0;
+		/* Try to discard the event */
+		if (!rb_try_to_discard(cpu_buffer, event)) {
+			/* Darn, this is just wasted space */
+			event->time_delta = 0;
+			event->array[0] = 0;
+			ret = 0;
+		}
 	}
 
 	*delta = 0;
@@ -1682,10 +1717,6 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 				struct ring_buffer_event *event)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long new_index, old_index;
-	struct buffer_page *bpage;
-	unsigned long index;
-	unsigned long addr;
 	int cpu;
 
 	/* The event is discarded regardless */
@@ -1701,24 +1732,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
 	cpu = smp_processor_id();
 	cpu_buffer = buffer->buffers[cpu];
 
-	new_index = rb_event_index(event);
-	old_index = new_index + rb_event_length(event);
-	addr = (unsigned long)event;
-	addr &= PAGE_MASK;
-
-	bpage = cpu_buffer->tail_page;
-
-	if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
-		/*
-		 * This is on the tail page. It is possible that
-		 * a write could come in and move the tail page
-		 * and write to the next page. That is fine
-		 * because we just shorten what is on this page.
-		 */
-		index = local_cmpxchg(&bpage->write, old_index, new_index);
-		if (index == old_index)
-			goto out;
-	}
+	if (!rb_try_to_discard(cpu_buffer, event))
+		goto out;
 
 	/*
 	 * The commit is still visible by the reader, so we
-- 
GitLab


From ea05b57cc19234d8de9887c8a32c2e58e84b56ba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Jun 2009 09:30:10 -0400
Subject: [PATCH 1560/2198] ring-buffer: discard timestamps that are at the
 start of the buffer

Every buffer page in the ring buffer includes its own time stamp.
When an event is recorded to the ring buffer with a delta time greater
than what can be held in the event header, a time stamp event is created.

If the the create timestamp falls over to the next buffer page, it is
redundant because the buffer page holds a full time stamp. This patch
will try to discard the time stamp when it falls to the start of the
next page.

This change also fixes a issues with disarding events. If most events are
discarded, timestamps will start to creep into the ring buffer. If we
do not discard the timestamps then they can fill up the ring buffer over
time and waste space.

This change will keep time stamps from filling up over another page. If
something is recorded in the buffer page, and the rest is filtered, then
the time stamps can only fill up to the end of the page.

[ Impact: prevent time stamps from filling ring buffer ]

Reported-by: Tim Bird <tim.bird@am.sony.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 37 +++++++++++++++++++++----------------
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 50926601a28dd..7102d7a2fadb3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -370,6 +370,9 @@ static inline int test_time_stamp(u64 delta)
 /* Max payload is BUF_PAGE_SIZE - header (8bytes) */
 #define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
 
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE	(BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
 int ring_buffer_print_page_header(struct trace_seq *s)
 {
 	struct buffer_data_page field;
@@ -1409,8 +1412,12 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			event->array[0] = *delta >> TS_SHIFT;
 		} else {
 			cpu_buffer->commit_page->page->time_stamp = *ts;
-			event->time_delta = 0;
-			event->array[0] = 0;
+			/* try to discard, since we do not need this */
+			if (!rb_try_to_discard(cpu_buffer, event)) {
+				/* nope, just zero it */
+				event->time_delta = 0;
+				event->array[0] = 0;
+			}
 		}
 		cpu_buffer->write_stamp = *ts;
 		/* let the caller know this was the commit */
@@ -2268,8 +2275,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
 	 * Check if we are at the end of the buffer.
 	 */
 	if (iter->head >= rb_page_size(iter->head_page)) {
-		if (RB_WARN_ON(buffer,
-			       iter->head_page == cpu_buffer->commit_page))
+		/* discarded commits can make the page empty */
+		if (iter->head_page == cpu_buffer->commit_page)
 			return;
 		rb_inc_iter(iter);
 		return;
@@ -2312,12 +2319,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	/*
 	 * We repeat when a timestamp is encountered. It is possible
 	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have.  Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * as one timestamp is about to be written, or from discarded
+	 * commits. The most that we can have is the number on a single page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	reader = rb_get_reader_page(cpu_buffer);
@@ -2383,14 +2388,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
  again:
 	/*
-	 * We repeat when a timestamp is encountered. It is possible
-	 * to get multiple timestamps from an interrupt entering just
-	 * as one timestamp is about to be written. The max times
-	 * that this can happen is the number of nested interrupts we
-	 * can have. Nesting 10 deep of interrupts is clearly
-	 * an anomaly.
+	 * We repeat when a timestamp is encountered.
+	 * We can get multiple timestamps by nested interrupts or also
+	 * if filtering is on (discarding commits). Since discarding
+	 * commits can be frequent we can get a lot of timestamps.
+	 * But we limit them by not adding timestamps if they begin
+	 * at the start of a page.
 	 */
-	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+	if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
 		return NULL;
 
 	if (rb_per_cpu_empty(cpu_buffer))
-- 
GitLab


From 083a63b48e4dd0a6a2d44216720076dc81ebb255 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:28 +0800
Subject: [PATCH 1561/2198] tracing/trace_stack: fix the number of entries in
 the header

The last entry in the stack_dump_trace is ULONG_MAX, which is not
a valid entry, but max_stack_trace.nr_entries has accounted for it.
So when printing the header, we should decrease it by one.
Before fix, print as following, for example:

	Depth    Size   Location    (53 entries)	<--- should be 52
	-----    ----   --------
  0)     3264     108   update_wall_time+0x4d5/0x9a0
  ...
 51)       80      80   syscall_call+0x7/0xb
 ^^^
   it's correct.

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-1-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_stack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 1796f00524e19..2d7aebd71dbd4 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
 		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
 			   "        -----    ----   --------\n",
-			   max_stack_trace.nr_entries);
+			   max_stack_trace.nr_entries - 1);
 
 		if (!stack_tracer_enabled && !max_stack_size)
 			print_disabled(m);
-- 
GitLab


From f11b3f4e2932bfdcfc458ab8d1ece62724ceabfc Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:29 +0800
Subject: [PATCH 1562/2198] tracing/events: fix output format of kernel stack

According to "events/ftrace/kernel_stack/format", output format of
kernel stack should use "=>" instead of "<=".

The second problem is that we shouldn't skip the first entry in the stack,
although it seems to be duplicated when used in the "function" tracer,
but events also use it. If we skip the first one, we will drop the topmost
entry of the stack.

The last problem is that if the last entry is ULONG_MAX(0xffffffff), we should
drop it, otherwise it will print a NULL name line.

before fix:

      sh-1072  [000]   26.957239: sched_process_fork: parent sh:1072 child sh:1073
      sh-1072  [000]   26.957262:
 <= syscall_call
 <=
      sh-1072  [000]   26.957744: sched_switch: task sh:1072 [120] (R) ==> sh:1073 [120]
      sh-1072  [000]   26.957752:
 <= preempt_schedule
 <= wake_up_new_task
 <= do_fork
 <= sys_clone
 <= syscall_call
 <=

After fix:

      sh-1075  [000]    39.791848: sched_process_fork: parent sh:1075  child sh:1076
      sh-1075  [000]    39.791871:
 => sys_clone
 => syscall_call
      sh-1075  [000]    39.792713: sched_switch: task sh:1075 [120] (R) ==> sh:1076 [120]
      sh-1075  [000]    39.792722:
 => schedule
 => preempt_schedule
 => wake_up_new_task
 => do_fork
 => sys_clone
 => syscall_call

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-2-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 0fe3b223f7ed8..64596a571609a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -975,16 +975,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
+	if (!trace_seq_puts(s, "\n"))
+		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		if (!field->caller[i])
+		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
 			break;
-		if (i) {
-			if (!trace_seq_puts(s, " <= "))
-				goto partial;
+		if (!trace_seq_puts(s, " => "))
+			goto partial;
 
-			if (!seq_print_ip_sym(s, field->caller[i], flags))
-				goto partial;
-		}
+		if (!seq_print_ip_sym(s, field->caller[i], flags))
+			goto partial;
 		if (!trace_seq_puts(s, "\n"))
 			goto partial;
 	}
-- 
GitLab


From 048dc50c5e7eada19ebabbad70b7966d14283d41 Mon Sep 17 00:00:00 2001
From: walimis <walimisdev@gmail.com>
Date: Wed, 3 Jun 2009 16:01:30 +0800
Subject: [PATCH 1563/2198] tracing/events: fix output format of user stack

According to "events/ftrace/user_stack/format", fix the output of
user stack.

before fix:

  sh-1073  [000]    31.137561:  <b7f274fe> <-  <0804e33c> <-  <080835c1>

after fix:

  sh-1072  [000]    37.039329:
 =>  <b7f8a4fe>
 =>  <0804e33c>
 =>  <080835c1>

Signed-off-by: walimis <walimisdev@gmail.com>
LKML-Reference: <1244016090-7814-3-git-send-email-walimisdev@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 64596a571609a..8dadbbbd2d5c3 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -389,17 +389,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 
 		if (ip == ULONG_MAX || !ret)
 			break;
-		if (i && ret)
-			ret = trace_seq_puts(s, " <- ");
+		if (ret)
+			ret = trace_seq_puts(s, " => ");
 		if (!ip) {
 			if (ret)
 				ret = trace_seq_puts(s, "??");
+			if (ret)
+				ret = trace_seq_puts(s, "\n");
 			continue;
 		}
 		if (!ret)
 			break;
 		if (ret)
 			ret = seq_print_user_ip(s, mm, ip, sym_flags);
+		ret = trace_seq_puts(s, "\n");
 	}
 
 	if (mm)
@@ -1012,10 +1015,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!seq_print_userip_objs(field, s, flags))
+	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
 
 	return TRACE_TYPE_HANDLED;
-- 
GitLab


From 56d8bd3f0b98972312cad683947ec90b21011199 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 3 Jun 2009 14:52:03 +0100
Subject: [PATCH 1564/2198] tracing: fix multiple use of __print_flags and
 __print_symbolic

Here is an updated patch to include the extra call to
trace_seq_init() as requested. This is vs. the latest
-tip tree and fixes the use of multiple __print_flags
and __print_symbolic in a single tracer. Also tested
to ensure its working now:

mount.gfs2-2534  [000]   235.850587: gfs2_glock_queue: 8.7 glock 1:2 dequeue PR
mount.gfs2-2534  [000]   235.850591: gfs2_demote_rq: 8.7 glock 1:0 demote EX to NL flags:DI
mount.gfs2-2534  [000]   235.850591: gfs2_glock_queue: 8.7 glock 1:0 dequeue EX
glock_workqueue-2529  [000]   235.850666: gfs2_glock_state_change: 8.7 glock 1:0 state EX => NL tgt:NL dmt:NL flags:lDpI
glock_workqueue-2529  [000]   235.850672: gfs2_glock_put: 8.7 glock 1:0 state NL => IV flags:I

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
LKML-Reference: <1244037123.29604.603.camel@localhost.localdomain>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h      |  2 ++
 kernel/trace/trace_output.c | 10 ++++------
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index b5478dab579b7..40ede4db4d885 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -104,6 +104,7 @@
  *	field = (typeof(field))entry;
  *
  *	p = get_cpu_var(ftrace_event_seq);
+ *	trace_seq_init(p);
  *	ret = trace_seq_printf(s, <TP_printk> "\n");
  *	put_cpu();
  *	if (!ret)
@@ -167,6 +168,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	field = (typeof(field))entry;					\
 									\
 	p = &get_cpu_var(ftrace_event_seq);				\
+	trace_seq_init(p);						\
 	ret = trace_seq_printf(s, #call ": " print);			\
 	put_cpu();							\
 	if (!ret)							\
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8dadbbbd2d5c3..8afeea412e77b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -223,10 +223,9 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 {
 	unsigned long mask;
 	const char *str;
+	const char *ret = p->buffer + p->len;
 	int i;
 
-	trace_seq_init(p);
-
 	for (i = 0;  flag_array[i].name && flags; i++) {
 
 		mask = flag_array[i].mask;
@@ -249,7 +248,7 @@ ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
 
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
 EXPORT_SYMBOL(ftrace_print_flags_seq);
 
@@ -258,8 +257,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 			 const struct trace_print_flags *symbol_array)
 {
 	int i;
-
-	trace_seq_init(p);
+	const char *ret = p->buffer + p->len;
 
 	for (i = 0;  symbol_array[i].name; i++) {
 
@@ -275,7 +273,7 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
 		
 	trace_seq_putc(p, 0);
 
-	return p->buffer;
+	return ret;
 }
 EXPORT_SYMBOL(ftrace_print_symbols_seq);
 
-- 
GitLab


From 563af16c30ede41eda2d614195d88e07f7c7103d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 3 Jun 2009 11:10:44 -0400
Subject: [PATCH 1565/2198] tracing: add annotation to what type of stack trace
 is recorded

The current method of printing out a stack trace is to add a new line
and print out the trace:

    yum-updatesd-3120  [002]   573.691303:
 => do_softirq
 => irq_exit
 => smp_apic_timer_interrupt
 => apic_timer_interrupt

This looks a bit awkward, and if we have both stack and user stack traces
running, it would be nice to have a title to tell them apart, although
it is easy to tell by the output.

This patch adds an annotation to the start of the stack traces:

            init-1     [003]   929.304979: <stack trace>
 => user_path_at
 => vfs_fstatat
 => vfs_stat
 => sys_newstat
 => system_call_fastpath

             cat-3459  [002]  1016.824040: <user stack trace>
 =>  <0000003aae6c0250>
 =>  <00007ffff4b06ae4>
 =>  <69636172742f6775>

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 8afeea412e77b..425725c1622db 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -976,7 +976,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_puts(s, "\n"))
+	if (!trace_seq_puts(s, "<stack trace>\n"))
 		goto partial;
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
@@ -1013,7 +1013,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
 	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_putc(s, '\n'))
+	if (!trace_seq_puts(s, "<user stack trace>\n"))
 		goto partial;
 
 	if (!seq_print_userip_objs(field, s, flags))
-- 
GitLab


From f2521b6e4c365bd0aac61b2c346e6e9f22607e31 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:17:25 +0200
Subject: [PATCH 1566/2198] perf_counter tools: Clean up old kerneltop
 references

kerneltop has been replaced with perf top - so fix up a few
remaining references to it in display text and error messages.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c |  6 +++---
 Documentation/perf_counter/builtin-top.c    | 14 ++++++--------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index cea5b8d3c633b..fa625258315ec 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -357,7 +357,8 @@ static void open_counters(int cpu, pid_t pid)
 
 		if (fd[nr_cpu][counter] < 0) {
 			int err = errno;
-			printf("kerneltop error: syscall returned with %d (%s)\n",
+
+			error("syscall returned with %d (%s)\n",
 					fd[nr_cpu][counter], strerror(err));
 			if (err == EPERM)
 				printf("Are you root?\n");
@@ -382,8 +383,7 @@ static void open_counters(int cpu, pid_t pid)
 		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
 				PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
 		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-			printf("kerneltop error: failed to mmap with %d (%s)\n",
-					errno, strerror(errno));
+			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
 			exit(-1);
 		}
 	}
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index a63935276cacb..16a618446d3f0 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -208,7 +208,7 @@ static void print_sym_table(void)
 
 	printf(
 "------------------------------------------------------------------------------\n");
-	printf( " KernelTop:%8.0f irqs/sec  kernel:%4.1f%% [",
+	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
 		events_per_sec,
 		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
 
@@ -281,7 +281,7 @@ static void print_sym_table(void)
 
 static void *display_thread(void *arg)
 {
-	printf("KernelTop refresh period: %d seconds\n", delay_secs);
+	printf("PerfTop refresh period: %d seconds\n", delay_secs);
 
 	while (!sleep(delay_secs))
 		print_sym_table();
@@ -564,7 +564,8 @@ static int __cmd_top(void)
 			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
-				printf("kerneltop error: syscall returned with %d (%s)\n",
+
+				error("syscall returned with %d (%s)\n",
 					fd[i][counter], strerror(err));
 				if (err == EPERM)
 					printf("Are you root?\n");
@@ -588,11 +589,8 @@ static int __cmd_top(void)
 			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
 			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
 					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED) {
-				printf("kerneltop error: failed to mmap with %d (%s)\n",
-						errno, strerror(errno));
-				exit(-1);
-			}
+			if (mmap_array[i][counter].base == MAP_FAILED)
+				die("failed to mmap with %d (%s)\n", errno, strerror(errno));
 		}
 	}
 
-- 
GitLab


From 021e9f476511ebe23d7f45854a52dfe74c09b6ee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:27:19 +0200
Subject: [PATCH 1567/2198] perf record: Refine capture printout

Print out the number of bytes captured, and the (estimated) number of
events the output file contains.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 34 +++++++++++++++------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index fa625258315ec..efa2eb498e9cd 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -67,6 +67,8 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 static long events;
 static struct timeval last_read, this_read;
 
+static __u64 bytes_written;
+
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
@@ -114,28 +116,34 @@ static void mmap_read(struct mmap_data *md)
 		buf = &data[old & md->mask];
 		size = md->mask + 1 - (old & md->mask);
 		old += size;
+
 		while (size) {
 			int ret = write(output, buf, size);
-			if (ret < 0) {
-				perror("failed to write");
-				exit(-1);
-			}
+
+			if (ret < 0)
+				die("failed to write");
+
 			size -= ret;
 			buf += ret;
+
+			bytes_written += ret;
 		}
 	}
 
 	buf = &data[old & md->mask];
 	size = head - old;
 	old += size;
+
 	while (size) {
 		int ret = write(output, buf, size);
-		if (ret < 0) {
-			perror("failed to write");
-			exit(-1);
-		}
+
+		if (ret < 0)
+			die("failed to write");
+
 		size -= ret;
 		buf += ret;
+
+		bytes_written += ret;
 	}
 
 	md->prev = old;
@@ -467,8 +475,14 @@ static int __cmd_record(int argc, const char **argv)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
-
-	fprintf(stderr, "[ perf record: Captured and wrote %ld events. ]\n", events);
+	/*
+	 * Approximate RIP event size: 24 bytes.
+	 */
+	fprintf(stderr,
+		"[ perf record: Captured and wrote %.3f MB %s (~%lld events) ]\n",
+		(double)bytes_written / 1024.0 / 1024.0,
+		output_name,
+		bytes_written / 24);
 
 	return 0;
 }
-- 
GitLab


From e98e96fe43ae92fad0930f05fb2b298e49b9f3b5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:30:38 +0200
Subject: [PATCH 1568/2198] perf report: Display 100% correctly

Needs to be 6.2 not 5.2, for 100.00% to be aligned properly.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6d359c9f75ddd..e837bb983dcab 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -529,7 +529,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	size_t ret;
 
 	if (total_samples) {
-		ret = fprintf(fp, "    %5.2f%%",
+		ret = fprintf(fp, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
-- 
GitLab


From 44db76c8553c328f4ae02481d77bb3a88ca17645 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:36:07 +0200
Subject: [PATCH 1569/2198] perf stat: Print out all arguments

Before:

 Performance counter stats for '/home/mingo/hackbench':

After:

 Performance counter stats for '/home/mingo/hackbench 10':

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-stat.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 2357a663b6755..4fc0d80440e73 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -228,6 +228,7 @@ static int do_perfstat(int argc, const char **argv)
 	int counter;
 	int status;
 	int pid;
+	int i;
 
 	if (!system_wide)
 		nr_cpus = 1;
@@ -243,14 +244,17 @@ static int do_perfstat(int argc, const char **argv)
 
 	if ((pid = fork()) < 0)
 		perror("failed to fork");
+
 	if (!pid) {
 		if (execvp(argv[0], (char **)argv)) {
 			perror(argv[0]);
 			exit(-1);
 		}
 	}
+
 	while (wait(&status) >= 0)
 		;
+
 	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
 	t1 = rdclock();
 
@@ -259,8 +263,12 @@ static int do_perfstat(int argc, const char **argv)
 	fflush(stdout);
 
 	fprintf(stderr, "\n");
-	fprintf(stderr, " Performance counter stats for \'%s\':\n",
-		argv[0]);
+	fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
+
+	for (i = 1; i < argc; i++)
+		fprintf(stderr, " %s", argv[i]);
+
+	fprintf(stderr, "\':\n");
 	fprintf(stderr, "\n");
 
 	for (counter = 0; counter < nr_counters; counter++)
-- 
GitLab


From eed4dcd443da7a46131ef37c7a389b444905960e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 19:59:24 +0200
Subject: [PATCH 1570/2198] perf report: Add front-entry cache for lookups

Before:

 Performance counter stats for './perf report -i perf.data.big':

     12453988058  instructions

 Performance counter stats for './perf report -i perf.data.big':

     12379566017  instructions

0.60% reduction.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e837bb983dcab..33b3b15fb014a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -229,6 +229,7 @@ static int thread__set_comm(struct thread *self, const char *comm)
 }
 
 static struct rb_root threads;
+static struct thread *last_match;
 
 static struct thread *threads__findnew(pid_t pid)
 {
@@ -236,12 +237,22 @@ static struct thread *threads__findnew(pid_t pid)
 	struct rb_node *parent = NULL;
 	struct thread *th;
 
+	/*
+	 * Font-end cache - PID lookups come in blocks,
+	 * so most of the time we dont have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
 	while (*p != NULL) {
 		parent = *p;
 		th = rb_entry(parent, struct thread, rb_node);
 
-		if (th->pid == pid)
+		if (th->pid == pid) {
+			last_match = th;
 			return th;
+		}
 
 		if (pid < th->pid)
 			p = &(*p)->rb_left;
@@ -253,7 +264,9 @@ static struct thread *threads__findnew(pid_t pid)
 	if (th != NULL) {
 		rb_link_node(&th->rb_node, parent, p);
 		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
 	}
+
 	return th;
 }
 
-- 
GitLab


From 051cdc3c2d0e24443ac03aff03ee89807ec8c589 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 20:09:11 +0200
Subject: [PATCH 1571/2198] perf help: Fix bug when there's no perf-* command
 around

main_cmds can be empty - fix util/help.c to handle this case
without segfaulting.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/help.c | 28 +++++++++++++++-----------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/Documentation/perf_counter/util/help.c b/Documentation/perf_counter/util/help.c
index 397487fb2bee6..6653f7dd1d784 100644
--- a/Documentation/perf_counter/util/help.c
+++ b/Documentation/perf_counter/util/help.c
@@ -298,7 +298,7 @@ static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
 
 const char *help_unknown_cmd(const char *cmd)
 {
-	int i, n, best_similarity = 0;
+	int i, n = 0, best_similarity = 0;
 	struct cmdnames main_cmds, other_cmds;
 
 	memset(&main_cmds, 0, sizeof(main_cmds));
@@ -315,20 +315,24 @@ const char *help_unknown_cmd(const char *cmd)
 	      sizeof(main_cmds.names), cmdname_compare);
 	uniq(&main_cmds);
 
-	/* This reuses cmdname->len for similarity index */
-	for (i = 0; i < main_cmds.cnt; ++i)
-		main_cmds.names[i]->len =
-			levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+	if (main_cmds.cnt) {
+		/* This reuses cmdname->len for similarity index */
+		for (i = 0; i < main_cmds.cnt; ++i)
+			main_cmds.names[i]->len =
+				levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
 
-	qsort(main_cmds.names, main_cmds.cnt,
-	      sizeof(*main_cmds.names), levenshtein_compare);
+		qsort(main_cmds.names, main_cmds.cnt,
+		      sizeof(*main_cmds.names), levenshtein_compare);
+
+		best_similarity = main_cmds.names[0]->len;
+		n = 1;
+		while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+			++n;
+	}
 
-	best_similarity = main_cmds.names[0]->len;
-	n = 1;
-	while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
-		++n;
 	if (autocorrect && n == 1) {
 		const char *assumed = main_cmds.names[0]->name;
+
 		main_cmds.names[0] = NULL;
 		clean_cmdnames(&main_cmds);
 		fprintf(stderr, "WARNING: You called a Git program named '%s', "
@@ -345,7 +349,7 @@ const char *help_unknown_cmd(const char *cmd)
 
 	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
 
-	if (best_similarity < 6) {
+	if (main_cmds.cnt && best_similarity < 6) {
 		fprintf(stderr, "\nDid you mean %s?\n",
 			n < 2 ? "this": "one of these");
 
-- 
GitLab


From 095b3a6a030f7d4f24825ae93fc384b3d4b4fafa Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 20:13:51 +0200
Subject: [PATCH 1572/2198] perf_counter tools: Optimize harder

Use -O6 to build the tools.

Before:

    12387507370  instructions         #    3121.653 M/sec

After:

     6244894971  instructions         #    3458.437 M/sec

Almost twice as fast!

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index eae88561233b4..005709b7b19ac 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
-- 
GitLab


From 18374ab76e3ec1cf1b0ca5a8d08e35cfc5d01669 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 3 Jun 2009 14:49:21 -0300
Subject: [PATCH 1573/2198] perf_counter tools: Fix off-by-one bug in
 symbol__new

The end is really (start + len - 1). Noticed when synthesizing
the PLT symbols, that are small (16 bytes), and hot on the
start RIP.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <20090603174921.GG7805@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/symbol.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index d52a1ae5342a2..35ee6de1e513c 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -19,7 +19,7 @@ static struct symbol *symbol__new(uint64_t start, uint64_t len,
 			self = ((void *)self) + priv_size;
 		}
 		self->start = start;
-		self->end   = start + len;
+		self->end   = start + len - 1;
 		memcpy(self->name, name, namelen);
 	}
 
-- 
GitLab


From 233f0b95ca3a0d1dcbd70bc7e519069a8e10d23e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 21:48:40 +0200
Subject: [PATCH 1574/2198] perf_counter tools: Work around warnings in older
 GCCs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GCC 4.1.2 produces:

 util/parse-options.c: In function ‘get_value’:
 util/parse-options.c:36: warning: ‘arg’ may be used uninitialized in this function

 builtin-top.c: In function ‘display_thread’:
 builtin-top.c:178: warning: ‘printed’ may be used uninitialized in this function

Annotate them away by initializing these variables to 0.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c        | 2 +-
 Documentation/perf_counter/util/parse-options.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 16a618446d3f0..7c907e25d82b9 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -175,7 +175,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
 
 static void print_sym_table(void)
 {
-	int printed, j;
+	int printed = 0, j;
 	int counter;
 	float events_per_sec = events/delay_secs;
 	float kevents_per_sec = (events-userspace_events)/delay_secs;
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index b80abd9a99bcc..551b6bc34e794 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -33,7 +33,7 @@ static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt,
 static int get_value(struct parse_opt_ctx_t *p,
 		     const struct option *opt, int flags)
 {
-	const char *s, *arg;
+	const char *s, *arg = NULL;
 	const int unset = flags & OPT_UNSET;
 
 	if (unset && p->opt)
-- 
GitLab


From 128f048f0f0d2a477ad2555e7acd2ad15a1b6061 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 22:19:36 +0200
Subject: [PATCH 1575/2198] perf_counter: Fix throttling lock-up

Throttling logic is broken and we can lock up with too small
hw sampling intervals.

Make the throttling code more robust: disable counters even
if we already disabled them.

( Also clean up whitespace damage i noticed while reading
  various pieces of code related to throttling. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  2 +-
 kernel/perf_counter.c              | 19 ++++++++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 12cc05ed9f488..8f53f3a7da29e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -91,7 +91,7 @@ static u64 intel_pmu_raw_event(u64 event)
 #define CORE_EVNTSEL_INV_MASK		0x00800000ULL
 #define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL
 
-#define CORE_EVNTSEL_MASK 		\
+#define CORE_EVNTSEL_MASK		\
 	(CORE_EVNTSEL_EVENT_MASK |	\
 	 CORE_EVNTSEL_UNIT_MASK  |	\
 	 CORE_EVNTSEL_EDGE_MASK  |	\
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ab4455447f844..0bb03f15a5b67 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2822,11 +2822,20 @@ int perf_counter_overflow(struct perf_counter *counter,
 
 	if (!throttle) {
 		counter->hw.interrupts++;
-	} else if (counter->hw.interrupts != MAX_INTERRUPTS) {
-		counter->hw.interrupts++;
-		if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
-			counter->hw.interrupts = MAX_INTERRUPTS;
-			perf_log_throttle(counter, 0);
+	} else {
+		if (counter->hw.interrupts != MAX_INTERRUPTS) {
+			counter->hw.interrupts++;
+			if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
+				counter->hw.interrupts = MAX_INTERRUPTS;
+				perf_log_throttle(counter, 0);
+				ret = 1;
+			}
+		} else {
+			/*
+			 * Keep re-disabling counters even though on the previous
+			 * pass we disabled it - just in case we raced with a
+			 * sched-in and the counter got enabled again:
+			 */
 			ret = 1;
 		}
 	}
-- 
GitLab


From d80d338d2fb611b65830db7ea56680624776030f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 23:14:49 +0200
Subject: [PATCH 1576/2198] perf report: Clean up event processing

- Split out event processig into process_events() helper.

- Untangle the cwd parameters - it's constant so can be a static.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 186 +++++++++++---------
 1 file changed, 98 insertions(+), 88 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 33b3b15fb014a..333d31269e3fc 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -147,7 +147,11 @@ static int load_kernel(void)
 	return err;
 }
 
-static int strcommon(const char *pathname, const char *cwd, int cwdlen)
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
 {
 	int n = 0;
 
@@ -165,7 +169,7 @@ struct map {
 	struct dso	 *dso;
 };
 
-static struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
+static struct map *map__new(struct mmap_event *event)
 {
 	struct map *self = malloc(sizeof(*self));
 
@@ -174,7 +178,8 @@ static struct map *map__new(struct mmap_event *event, char *cwd, int cwdlen)
 		char newfilename[PATH_MAX];
 
 		if (cwd) {
-			int n = strcommon(filename, cwd, cwdlen);
+			int n = strcommon(filename);
+
 			if (n == cwdlen) {
 				snprintf(newfilename, sizeof(newfilename),
 					 ".%s", filename + n);
@@ -752,85 +757,11 @@ static void register_idle_thread(void)
 	}
 }
 
+static unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
-static int __cmd_report(void)
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	unsigned long offset = 0;
-	unsigned long head = 0;
-	struct stat stat;
-	char *buf;
-	event_t *event;
-	int ret, rc = EXIT_FAILURE;
-	uint32_t size;
-	unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
-	char cwd[PATH_MAX], *cwdp = cwd;
-	int cwdlen;
-
-	register_idle_thread();
-
-	input = open(input_name, O_RDONLY);
-	if (input < 0) {
-		perror("failed to open file");
-		exit(-1);
-	}
-
-	ret = fstat(input, &stat);
-	if (ret < 0) {
-		perror("failed to stat file");
-		exit(-1);
-	}
-
-	if (!stat.st_size) {
-		fprintf(stderr, "zero-sized file, nothing to do!\n");
-		exit(0);
-	}
-
-	if (load_kernel() < 0) {
-		perror("failed to load kernel symbols");
-		return EXIT_FAILURE;
-	}
-
-	if (!full_paths) {
-		if (getcwd(cwd, sizeof(cwd)) == NULL) {
-			perror("failed to get the current directory");
-			return EXIT_FAILURE;
-		}
-		cwdlen = strlen(cwd);
-	} else {
-		cwdp = NULL;
-		cwdlen = 0;
-	}
-remap:
-	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
-			   MAP_SHARED, input, offset);
-	if (buf == MAP_FAILED) {
-		perror("failed to mmap file");
-		exit(-1);
-	}
-
-more:
-	event = (event_t *)(buf + head);
-
-	size = event->header.size;
-	if (!size)
-		size = 8;
-
-	if (head + event->header.size >= page_size * mmap_window) {
-		unsigned long shift = page_size * (head / page_size);
-		int ret;
-
-		ret = munmap(buf, page_size * mmap_window);
-		assert(ret == 0);
-
-		offset += shift;
-		head -= shift;
-		goto remap;
-	}
-
-	size = event->header.size;
-	if (!size)
-		goto broken_event;
-
 	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 		char level;
 		int show = 0;
@@ -851,7 +782,7 @@ static int __cmd_report(void)
 		if (thread == NULL) {
 			fprintf(stderr, "problem processing %d event, skipping it.\n",
 				event->header.type);
-			goto broken_event;
+			return -1;
 		}
 
 		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
@@ -895,14 +826,14 @@ static int __cmd_report(void)
 			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
 				fprintf(stderr,
 		"problem incrementing symbol count, skipping event\n");
-				goto broken_event;
+				return -1;
 			}
 		}
 		total++;
 	} else switch (event->header.type) {
 	case PERF_EVENT_MMAP: {
 		struct thread *thread = threads__findnew(event->mmap.pid);
-		struct map *map = map__new(&event->mmap, cwdp, cwdlen);
+		struct map *map = map__new(&event->mmap);
 
 		dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
 			(void *)(offset + head),
@@ -915,7 +846,7 @@ static int __cmd_report(void)
 		if (thread == NULL || map == NULL) {
 			if (verbose)
 				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
-			goto broken_event;
+			return -1;
 		}
 		thread__insert_map(thread, map);
 		total_mmap++;
@@ -932,13 +863,93 @@ static int __cmd_report(void)
 		if (thread == NULL ||
 		    thread__set_comm(thread, event->comm.comm)) {
 			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
-			goto broken_event;
+			return -1;
 		}
 		total_comm++;
 		break;
 	}
-	default: {
-broken_event:
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_report(void)
+{
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	char *buf;
+	event_t *event;
+	int ret, rc = EXIT_FAILURE;
+	uint32_t size;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+	if (!full_paths) {
+		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else {
+		cwd = NULL;
+		cwdlen = 0;
+	}
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	if (!size || process_event(event, offset, head) < 0) {
+
 		dprintf("%p [%p]: skipping unknown header type: %d\n",
 			(void *)(offset + head),
 			(void *)(long)(event->header.size),
@@ -956,7 +967,6 @@ static int __cmd_report(void)
 
 		size = 8;
 	}
-	}
 
 	head += size;
 
-- 
GitLab


From 75051724f78677532618dd164a515baf106990e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 23:14:49 +0200
Subject: [PATCH 1577/2198] perf report: Split out event processing helpers

- Introduce per event helper functions

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 211 +++++++++++---------
 1 file changed, 118 insertions(+), 93 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 333d31269e3fc..82b62529e659a 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -50,6 +50,7 @@ struct ip_event {
 	__u64 ip;
 	__u32 pid, tid;
 };
+
 struct mmap_event {
 	struct perf_event_header header;
 	__u32 pid, tid;
@@ -58,9 +59,10 @@ struct mmap_event {
 	__u64 pgoff;
 	char filename[PATH_MAX];
 };
+
 struct comm_event {
 	struct perf_event_header header;
-	__u32 pid,tid;
+	__u32 pid, tid;
 	char comm[16];
 };
 
@@ -760,114 +762,137 @@ static void register_idle_thread(void)
 static unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
 
 static int
-process_event(event_t *event, unsigned long offset, unsigned long head)
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 {
-	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
-		char level;
-		int show = 0;
-		struct dso *dso = NULL;
-		struct thread *thread = threads__findnew(event->ip.pid);
-		uint64_t ip = event->ip.ip;
-		struct map *map = NULL;
-
-		dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->header.misc,
-			event->ip.pid,
-			(void *)(long)ip);
-
-		dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
-		if (thread == NULL) {
-			fprintf(stderr, "problem processing %d event, skipping it.\n",
-				event->header.type);
-			return -1;
-		}
-
-		if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
-			show = SHOW_KERNEL;
-			level = 'k';
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	uint64_t ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
 
-			dso = kernel_dso;
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
 
-			dprintf(" ...... dso: %s\n", dso->name);
+		dso = kernel_dso;
 
-		} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+		dprintf(" ...... dso: %s\n", dso->name);
 
-			show = SHOW_USER;
-			level = '.';
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
 
-			map = thread__find_map(thread, ip);
-			if (map != NULL) {
-				dso = map->dso;
-				ip -= map->start + map->pgoff;
-			} else {
-				/*
-				 * If this is outside of all known maps,
-				 * and is a negative address, try to look it
-				 * up in the kernel dso, as it might be a
-				 * vsyscall (which executes in user-mode):
-				 */
-				if ((long long)ip < 0)
-					dso = kernel_dso;
-			}
-			dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+		show = SHOW_USER;
+		level = '.';
 
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			dso = map->dso;
+			ip -= map->start + map->pgoff;
 		} else {
-			show = SHOW_HV;
-			level = 'H';
-			dprintf(" ...... dso: [hypervisor]\n");
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
 		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
 
-		if (show & show_mask) {
-			struct symbol *sym = dso__find_symbol(dso, ip);
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
 
-			if (hist_entry__add(thread, map, dso, sym, ip, level)) {
-				fprintf(stderr,
-		"problem incrementing symbol count, skipping event\n");
-				return -1;
-			}
-		}
-		total++;
-	} else switch (event->header.type) {
-	case PERF_EVENT_MMAP: {
-		struct thread *thread = threads__findnew(event->mmap.pid);
-		struct map *map = map__new(&event->mmap);
+	if (show & show_mask) {
+		struct symbol *sym = dso__find_symbol(dso, ip);
 
-		dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			(void *)(long)event->mmap.start,
-			(void *)(long)event->mmap.len,
-			(void *)(long)event->mmap.pgoff,
-			event->mmap.filename);
-
-		if (thread == NULL || map == NULL) {
-			if (verbose)
-				fprintf(stderr, "problem processing PERF_EVENT_MMAP, skipping event.\n");
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
 			return -1;
 		}
-		thread__insert_map(thread, map);
-		total_mmap++;
-		break;
 	}
-	case PERF_EVENT_COMM: {
-		struct thread *thread = threads__findnew(event->comm.pid);
+	total++;
 
-		dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
-			(void *)(offset + head),
-			(void *)(long)(event->header.size),
-			event->comm.comm, event->comm.pid);
+	return 0;
+}
 
-		if (thread == NULL ||
-		    thread__set_comm(thread, event->comm.comm)) {
-			fprintf(stderr, "problem processing PERF_EVENT_COMM, skipping event.\n");
-			return -1;
-		}
-		total_comm++;
-		break;
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return -1;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
 	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
 	default:
 		return -1;
 	}
@@ -877,13 +902,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 
 static int __cmd_report(void)
 {
+	int ret, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
 	unsigned long head = 0;
 	struct stat stat;
-	char *buf;
 	event_t *event;
-	int ret, rc = EXIT_FAILURE;
 	uint32_t size;
+	char *buf;
 
 	register_idle_thread();
 
-- 
GitLab


From d11444dfa78cdd887d8dfd2fab3883132aff2c2d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 3 Jun 2009 23:29:14 +0200
Subject: [PATCH 1578/2198] perf report: Handle all known event types

We have munmap, throttle/unthrottle and period events as well,
process them - otherwise they are considered broke events and
we mis-parse the next few events.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 82b62529e659a..6003cc3b188dc 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -893,6 +893,15 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_COMM:
 		return process_comm_event(event, offset, head);
 
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+	case PERF_EVENT_MUNMAP:
+	case PERF_EVENT_PERIOD:
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
 	default:
 		return -1;
 	}
-- 
GitLab


From bfcd3555af478dbf04c87adc9bb1a739d0a6ccff Mon Sep 17 00:00:00 2001
From: Alberto Bertogli <albertito@blitiri.com.ar>
Date: Tue, 9 Jun 2009 00:06:20 -0400
Subject: [PATCH 1579/2198] jbd2: Fix minor typos in comments in
 fs/jbd2/journal.c

Signed-off-by: Alberto Bertogli <albertito@blitiri.com.ar>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
---
 fs/jbd2/journal.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 58144102bf253..62be7d294ec26 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
  *
- * Two internal function, which provide abort to te jbd layer
+ * Two internal functions, which provide abort to the jbd layer
  * itself are here.
  */
 
@@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
  * int jbd2_journal_errno () - returns the journal's error state.
  * @journal: journal to examine.
  *
- * This is the errno numbet set with jbd2_journal_abort(), the last
+ * This is the errno number set with jbd2_journal_abort(), the last
  * time the journal was mounted - if the journal was stopped
  * without calling abort this will be 0.
  *
@@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal)
  * int jbd2_journal_clear_err () - clears the journal's error state
  * @journal: journal to act on.
  *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
  * mode.
  */
 int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal)
  * void jbd2_journal_ack_err() - Ack journal err.
  * @journal: journal to act on.
  *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
  * mode.
  */
 void jbd2_journal_ack_err(journal_t *journal)
-- 
GitLab


From 0b8e58a140cae2ba1c4a21ccae7c6c3c939c51f9 Mon Sep 17 00:00:00 2001
From: Andreas Dilger <adilger@sun.com>
Date: Wed, 3 Jun 2009 17:59:28 -0400
Subject: [PATCH 1580/2198] ext4: super.c whitespace cleanup

Cleanup of whitespace and formatting.  Initially driven by confusing indents
for the ext4_{block,inode}_bitmap() et. al. helper routines, but figured I'd
cleanup some other 80-column wrapping and other indenting problems at the
same time.

Signed-off-by: Andreas Dilger <adilger@sun.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/super.c | 117 +++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 56 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 91b98b58ccb9e..0a97b1ad3e19b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -79,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_block_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -87,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_bitmap_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -95,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 {
 	return le32_to_cpu(bg->bg_inode_table_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
 __u32 ext4_free_blks_count(struct super_block *sb,
@@ -103,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_blocks_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 }
 
 __u32 ext4_free_inodes_count(struct super_block *sb,
@@ -111,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_free_inodes_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 }
 
 __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -119,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_used_dirs_count_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 }
 
 __u32 ext4_itable_unused_count(struct super_block *sb,
@@ -127,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
 {
 	return le16_to_cpu(bg->bg_itable_unused_lo) |
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-		(__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+		 (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 }
 
 void ext4_block_bitmap_set(struct super_block *sb,
@@ -207,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	journal = EXT4_SB(sb)->s_journal;
 	if (journal) {
 		if (is_journal_aborted(journal)) {
-			ext4_abort(sb, __func__,
-				   "Detected aborted journal");
+			ext4_abort(sb, __func__, "Detected aborted journal");
 			return ERR_PTR(-EROFS);
 		}
 		return jbd2_journal_start(journal, nblocks);
@@ -436,7 +435,7 @@ void ext4_warning(struct super_block *sb, const char *function,
 }
 
 void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-				const char *function, const char *fmt, ...)
+			   const char *function, const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
@@ -472,7 +471,6 @@ __acquires(bitlock)
 	return;
 }
 
-
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
 	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -638,7 +636,6 @@ static void ext4_put_super(struct super_block *sb)
 	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
 }
 
 static struct kmem_cache *ext4_inode_cachep;
@@ -653,6 +650,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	ei->i_acl = EXT4_ACL_NOT_CACHED;
 	ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -673,6 +671,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 	ei->i_allocated_meta_blocks = 0;
 	ei->i_delalloc_reserved_flag = 0;
 	spin_lock_init(&(ei->i_block_reservation_lock));
+
 	return &ei->vfs_inode;
 }
 
@@ -879,12 +878,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noauto_da_alloc");
 
 	ext4_show_quota_options(seq, sb);
+
 	return 0;
 }
 
-
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
-		u64 ino, u32 generation)
+					u64 ino, u32 generation)
 {
 	struct inode *inode;
 
@@ -913,14 +912,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 }
 
 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
 }
 
 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
-		int fh_len, int fh_type)
+					int fh_len, int fh_type)
 {
 	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
 				    ext4_nfs_get_inode);
@@ -932,7 +931,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
  * which would prevent try_to_free_buffers() from freeing them, we must use
  * jbd2 layer's try_to_free_buffers() function to release them.
  */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+				 gfp_t wait)
 {
 	journal_t *journal = EXT4_SB(sb)->s_journal;
 
@@ -1133,8 +1133,9 @@ static ext4_fsblk_t get_sb_block(void **data)
 
 	if (!options || strncmp(options, "sb=", 3) != 0)
 		return 1;	/* Default location */
+
 	options += 3;
-	/*todo: use simple_strtoll with >32bit ext4 */
+	/* TODO: use simple_strtoll with >32bit ext4 */
 	sb_block = simple_strtoul(options, &options, 0);
 	if (*options && *options != ',') {
 		printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1144,6 +1145,7 @@ static ext4_fsblk_t get_sb_block(void **data)
 	if (*options == ',')
 		options++;
 	*data = (void *) options;
+
 	return sb_block;
 }
 
@@ -1626,7 +1628,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 		printk(KERN_WARNING
 		       "EXT4-fs warning: checktime reached, "
 		       "running e2fsck is recommended\n");
-	if (!sbi->s_journal) 
+	if (!sbi->s_journal)
 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
 		es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1810,7 +1812,7 @@ static int ext4_check_descriptors(struct super_block *sb)
 	}
 
 	ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-	sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+	sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
 	return 1;
 }
 
@@ -1926,6 +1928,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+
 /*
  * Maximal extent format file size.
  * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1976,19 +1979,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 	loff_t res = EXT4_NDIR_BLOCKS;
 	int meta_blocks;
 	loff_t upper_limit;
-	/* This is calculated to be the largest file size for a
-	 * dense, bitmapped file such that the total number of
-	 * sectors in the file, including data and all indirect blocks,
-	 * does not exceed 2^48 -1
-	 * __u32 i_blocks_lo and _u16 i_blocks_high representing the
-	 * total number of  512 bytes blocks of the file
+	/* This is calculated to be the largest file size for a dense, block
+	 * mapped file such that the file's total number of 512-byte sectors,
+	 * including data and all indirect blocks, does not exceed (2^48 - 1).
+	 *
+	 * __u32 i_blocks_lo and _u16 i_blocks_high represent the total
+	 * number of 512-byte sectors of the file.
 	 */
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LBD is not enabled
-		 * implies the inode i_block represent total blocks in
-		 * 512 bytes 32 == size of vfs inode i_blocks * 8
+		 * !has_huge_files or CONFIG_LBD not enabled implies that
+		 * the inode i_block field represents total file blocks in
+		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
 		 */
 		upper_limit = (1LL << 32) - 1;
 
@@ -2030,7 +2033,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 }
 
 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-				ext4_fsblk_t logical_sb_block, int nr)
+				   ext4_fsblk_t logical_sb_block, int nr)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	ext4_group_t bg, first_meta_bg;
@@ -2044,6 +2047,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
 	bg = sbi->s_desc_per_block * nr;
 	if (ext4_bg_has_super(sb, bg))
 		has_super = 1;
+
 	return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
@@ -2148,7 +2152,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 }
 
 static ssize_t sbi_ui_show(struct ext4_attr *a,
-				struct ext4_sb_info *sbi, char *buf)
+			   struct ext4_sb_info *sbi, char *buf)
 {
 	unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
 
@@ -2253,7 +2257,6 @@ static struct kobj_type ext4_ktype = {
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 				__releases(kernel_lock)
 				__acquires(kernel_lock)
-
 {
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
@@ -2379,7 +2382,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	set_opt(sbi->s_mount_opt, DELALLOC);
 
-
 	if (!parse_options((char *) data, sb, &journal_devnum,
 			   &journal_ioprio, NULL, 0))
 		goto failed_mount;
@@ -2442,7 +2444,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (sb->s_blocksize != blocksize) {
-
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
 			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
@@ -2489,6 +2490,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
 			sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
 	}
+
 	sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
@@ -2501,10 +2503,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	} else
 		sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
 	sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
 	sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
 	if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
+
 	sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
 	if (sbi->s_inodes_per_block == 0)
 		goto cantfind_ext4;
@@ -2515,6 +2519,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_mount_state = le16_to_cpu(es->s_state);
 	sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
 	sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
 	for (i = 0; i < 4; i++)
 		sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
 	sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2566,12 +2571,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-        /*
-         * It makes no sense for the first data block to be beyond the end
-         * of the filesystem.
-         */
-        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
+	/*
+	 * It makes no sense for the first data block to be beyond the end
+	 * of the filesystem.
+	 */
+	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+		printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
 		       "block %u is beyond end of filesystem (%llu)\n",
 		       le32_to_cpu(es->s_first_data_block),
 		       ext4_blocks_count(es));
@@ -3082,6 +3087,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	EXT4_SB(sb)->journal_bdev = bdev;
 	ext4_init_journal_params(sb, journal);
 	return journal;
+
 out_journal:
 	jbd2_journal_destroy(journal);
 out_bdev:
@@ -3116,7 +3122,6 @@ static int ext4_load_journal(struct super_block *sb,
 	 * crash?  For recovery, we need to check in advance whether we
 	 * can get read-write access to the device.
 	 */
-
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_INFO "EXT4-fs: INFO: recovery "
@@ -3234,7 +3239,6 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	return error;
 }
 
-
 /*
  * Have we just finished recovery?  If so, and if we are mounting (or
  * remounting) the filesystem readonly, then we will end up with a
@@ -3485,8 +3489,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
 			/*
 			 * Make sure the group descriptor checksums
-			 * are sane.  If they aren't, refuse to
-			 * remount r/w.
+			 * are sane.  If they aren't, refuse to remount r/w.
 			 */
 			for (g = 0; g < sbi->s_groups_count; g++) {
 				struct ext4_group_desc *gdp =
@@ -3545,6 +3548,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	return 0;
+
 restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3628,11 +3632,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	       le64_to_cpup((void *)es->s_uuid + sizeof(u64));
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
 	return 0;
 }
 
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise the are possible deadlocks:
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
  *   jbd2_journal_start()                  write_dquot()
@@ -3656,7 +3661,7 @@ static int ext4_write_dquot(struct dquot *dquot)
 
 	inode = dquot_to_inode(dquot);
 	handle = ext4_journal_start(inode,
-					EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+				    EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_commit(dquot);
@@ -3672,7 +3677,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
 	handle_t *handle;
 
 	handle = ext4_journal_start(dquot_to_inode(dquot),
-					EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+				    EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle))
 		return PTR_ERR(handle);
 	ret = dquot_acquire(dquot);
@@ -3688,7 +3693,7 @@ static int ext4_release_dquot(struct dquot *dquot)
 	handle_t *handle;
 
 	handle = ext4_journal_start(dquot_to_inode(dquot),
-					EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+				    EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
 	if (IS_ERR(handle)) {
 		/* Release dquot anyway to avoid endless cycle in dqput() */
 		dquot_release(dquot);
@@ -3736,7 +3741,7 @@ static int ext4_write_info(struct super_block *sb, int type)
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
 	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
-			EXT4_SB(sb)->s_jquota_fmt, type);
+				  EXT4_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
@@ -3907,10 +3912,10 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 
 #endif
 
-static int ext4_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+		       const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
 static struct file_system_type ext4_fs_type = {
@@ -3922,14 +3927,14 @@ static struct file_system_type ext4_fs_type = {
 };
 
 #ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type,
-	int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
+			  const char *dev_name, void *data,struct vfsmount *mnt)
 {
 	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
 	       "to mount using ext4\n");
 	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
 	       "will go away by 2.6.31\n");
-	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
 static struct file_system_type ext4dev_fs_type = {
-- 
GitLab


From e0a94c2a63f2644826069044649669b5e7ca75d3 Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux-foundation.org>
Date: Wed, 3 Jun 2009 16:04:31 -0400
Subject: [PATCH 1581/2198] security: use mmap_min_addr indepedently of
 security models

This patch removes the dependency of mmap_min_addr on CONFIG_SECURITY.
It also sets a default mmap_min_addr of 4096.

mmapping of addresses below 4096 will only be possible for processes
with CAP_SYS_RAWIO.

Signed-off-by: Christoph Lameter <cl@linux-foundation.org>
Acked-by: Eric Paris <eparis@redhat.com>
Looks-ok-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: James Morris <jmorris@namei.org>
---
 include/linux/mm.h       |  2 --
 include/linux/security.h |  2 ++
 kernel/sysctl.c          |  2 --
 mm/Kconfig               | 19 +++++++++++++++++++
 mm/mmap.c                |  3 +++
 security/Kconfig         | 22 +---------------------
 security/security.c      |  3 ---
 7 files changed, 25 insertions(+), 28 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d475c75..0c21af6abffbc 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -580,12 +580,10 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
  */
 static inline unsigned long round_hint_to_min(unsigned long hint)
 {
-#ifdef CONFIG_SECURITY
 	hint &= PAGE_MASK;
 	if (((void *)hint != NULL) &&
 	    (hint < mmap_min_addr))
 		return PAGE_ALIGN(mmap_min_addr);
-#endif
 	return hint;
 }
 
diff --git a/include/linux/security.h b/include/linux/security.h
index d5fd6163606fa..5eff459b38338 100644
--- a/include/linux/security.h
+++ b/include/linux/security.h
@@ -2197,6 +2197,8 @@ static inline int security_file_mmap(struct file *file, unsigned long reqprot,
 				     unsigned long addr,
 				     unsigned long addr_only)
 {
+	if ((addr < mmap_min_addr) && !capable(CAP_SYS_RAWIO))
+		return -EACCES;
 	return 0;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 149581fb48ab0..45bd711a242ef 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1237,7 +1237,6 @@ static struct ctl_table vm_table[] = {
 		.strategy	= &sysctl_jiffies,
 	},
 #endif
-#ifdef CONFIG_SECURITY
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "mmap_min_addr",
@@ -1246,7 +1245,6 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &proc_doulongvec_minmax,
 	},
-#endif
 #ifdef CONFIG_NUMA
 	{
 		.ctl_name	= CTL_UNNUMBERED,
diff --git a/mm/Kconfig b/mm/Kconfig
index c2b57d81e1530..71830ba7b986a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -226,6 +226,25 @@ config HAVE_MLOCKED_PAGE_BIT
 config MMU_NOTIFIER
 	bool
 
+config DEFAULT_MMAP_MIN_ADDR
+        int "Low address space to protect from user allocation"
+        default 4096
+        help
+	  This is the portion of low virtual memory which should be protected
+	  from userspace allocation.  Keeping a user from writing to low pages
+	  can help reduce the impact of kernel NULL pointer bugs.
+
+	  For most ia64, ppc64 and x86 users with lots of address space
+	  a value of 65536 is reasonable and should cause no problems.
+	  On arm and other archs it should not be higher than 32768.
+	  Programs which use vm86 functionality would either need additional
+	  permissions from either the LSM or the capabilities module or have
+	  this protection disabled.
+
+	  This value can be changed after boot using the
+	  /proc/sys/vm/mmap_min_addr tunable.
+
+
 config NOMMU_INITIAL_TRIM_EXCESS
 	int "Turn on mmap() excess space trimming before booting"
 	depends on !MMU
diff --git a/mm/mmap.c b/mm/mmap.c
index 6b7b1a95944bf..2b43fa1aa3c83 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -87,6 +87,9 @@ int sysctl_overcommit_ratio = 50;	/* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
diff --git a/security/Kconfig b/security/Kconfig
index bb244774e9d76..d23c839038f00 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -110,28 +110,8 @@ config SECURITY_ROOTPLUG
 
 	  See <http://www.linuxjournal.com/article.php?sid=6279> for
 	  more information about this module.
-	  
-	  If you are unsure how to answer this question, answer N.
-
-config SECURITY_DEFAULT_MMAP_MIN_ADDR
-        int "Low address space to protect from user allocation"
-        depends on SECURITY
-        default 0
-        help
-	  This is the portion of low virtual memory which should be protected
-	  from userspace allocation.  Keeping a user from writing to low pages
-	  can help reduce the impact of kernel NULL pointer bugs.
-
-	  For most ia64, ppc64 and x86 users with lots of address space
-	  a value of 65536 is reasonable and should cause no problems.
-	  On arm and other archs it should not be higher than 32768.
-	  Programs which use vm86 functionality would either need additional
-	  permissions from either the LSM or the capabilities module or have
-	  this protection disabled.
-
-	  This value can be changed after boot using the
-	  /proc/sys/vm/mmap_min_addr tunable.
 
+	  If you are unsure how to answer this question, answer N.
 
 source security/selinux/Kconfig
 source security/smack/Kconfig
diff --git a/security/security.c b/security/security.c
index 5284255c5cdff..dc7674fbfc7a4 100644
--- a/security/security.c
+++ b/security/security.c
@@ -26,9 +26,6 @@ extern void security_fixup_ops(struct security_operations *ops);
 
 struct security_operations *security_ops;	/* Initialized to NULL */
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR;
-
 static inline int verify(struct security_operations *ops)
 {
 	/* verify the security_operations structure exists */
-- 
GitLab


From a4c43beaff0fe6c83aa2505dce8ffe65db8e0a33 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 3 Jun 2009 23:02:33 -0300
Subject: [PATCH 1582/2198] perf report: Fix rbtree bug

Ingo Molnar reported:

> FYI, i just got this crash (segfault) in perf report after
> collecting a long profile from Xorg:
>
> Starting program: /home/mingo/tip/Documentation/perf_counter/perf report
> [Thread debugging using libthread_db enabled]
> Detaching after fork from child process 20008.
> [New Thread 0x7f92fd62a6f0 (LWP 20005)]
>
> Program received signal SIGSEGV, Segmentation fault.
> 0x000000000041031a in __rb_erase_color (node=0x142c090, parent=0x0,
> root=0x881918)
>     at util/rbtree.c:143
> 143			if (parent->rb_left == node)

It was a problem introduced in this cset:

 perf report: Fix comm sorting - 8229289b607682f90b946ad2c319526303c17700

This patch should fix it.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 6003cc3b188dc..86f23f0991f14 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -699,17 +699,18 @@ static void output__resort(void)
 {
 	struct rb_node *next;
 	struct hist_entry *n;
+	struct rb_root *tree = &hist;
 
 	if (sort__need_collapse)
-		next = rb_first(&collapse_hists);
-	else
-		next = rb_first(&hist);
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
 
 	while (next) {
 		n = rb_entry(next, struct hist_entry, rb_node);
 		next = rb_next(&n->rb_node);
 
-		rb_erase(&n->rb_node, &hist);
+		rb_erase(&n->rb_node, tree);
 		output__insert_entry(n);
 	}
 }
-- 
GitLab


From 6e53cdf11dfc8d302ebb67e7112d1baf8d7c66d4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 08:53:05 +0200
Subject: [PATCH 1583/2198] perf top: Reduce default filter threshold

On idle systems 'perf top' comes up empty by default, because the event
count filter is set to 100.

Reduce it to 5 instead.

Also add an option to limit the number of functions displayed.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 7c907e25d82b9..3f7778ba00bc8 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -64,9 +64,10 @@ static int			default_interval = 100000;
 static int			event_count[MAX_COUNTERS];
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static __u64			count_filter		       = 100;
+static __u64			count_filter			=  5;
+static int			print_entries			= 15;
 
-static int			target_pid				= -1;
+static int			target_pid			= -1;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static unsigned int		realtime_prio			=  0;
@@ -254,7 +255,7 @@ static void print_sym_table(void)
 		struct symbol *sym = (struct symbol *)(syme + 1);
 		float pcnt;
 
-		if (++printed > 18 || syme->snap_count < count_filter)
+		if (++printed > print_entries || syme->snap_count < count_filter)
 			continue;
 
 		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
@@ -650,7 +651,7 @@ static const struct option options[] = {
 		    "number of seconds to delay between refreshes"),
 	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
 			    "dump the symbol table used for profiling"),
-	OPT_INTEGER('f', "--count-filter", &count_filter,
+	OPT_INTEGER('f', "count-filter", &count_filter,
 		    "only display functions with more events than this"),
 	OPT_BOOLEAN('g', "group", &group,
 			    "put the counters into a counter group"),
@@ -662,8 +663,10 @@ static const struct option options[] = {
 		    "track mmap events"),
 	OPT_BOOLEAN('U', "use-munmap", &use_munmap,
 		    "track munmap events"),
-	OPT_INTEGER('F', "--freq", &freq,
+	OPT_INTEGER('F', "freq", &freq,
 		    "profile at this frequency"),
+	OPT_INTEGER('E', "entries", &print_entries,
+		    "display this many functions"),
 	OPT_END()
 };
 
-- 
GitLab


From bcb86975dbcc24f820f1a37918d53914af29ace7 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Thu, 4 Jun 2009 15:14:34 +0900
Subject: [PATCH 1584/2198] TOMOYO: Remove unused parameter.

TOMOYO 2.2.0 does not check argv[] and envp[] upon execve().
We don't need to pass "struct tomoyo_page_buffer".

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/domain.c | 2 +-
 security/tomoyo/file.c   | 4 +---
 security/tomoyo/tomoyo.h | 4 +---
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index 34bb641c67438..eb75401fd6b02 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -721,7 +721,7 @@ int tomoyo_find_next_domain(struct linux_binprm *bprm,
 	}
 
 	/* Check execute permission. */
-	retval = tomoyo_check_exec_perm(old_domain, &r, tmp);
+	retval = tomoyo_check_exec_perm(old_domain, &r);
 	if (retval < 0)
 		goto out;
 
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index a67f9e61ee607..ab0cd35385107 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -1012,13 +1012,11 @@ int tomoyo_check_file_perm(struct tomoyo_domain_info *domain,
  *
  * @domain:   Pointer to "struct tomoyo_domain_info".
  * @filename: Check permission for "execute".
- * @tmp:      Buffer for temporary use.
  *
  * Returns 0 on success, negativevalue otherwise.
  */
 int tomoyo_check_exec_perm(struct tomoyo_domain_info *domain,
-			   const struct tomoyo_path_info *filename,
-			   struct tomoyo_page_buffer *tmp)
+			   const struct tomoyo_path_info *filename)
 {
 	const u8 mode = tomoyo_check_flags(domain, TOMOYO_MAC_FOR_FILE);
 
diff --git a/security/tomoyo/tomoyo.h b/security/tomoyo/tomoyo.h
index f12d5ada7dcbb..0fd588a629cf9 100644
--- a/security/tomoyo/tomoyo.h
+++ b/security/tomoyo/tomoyo.h
@@ -17,13 +17,11 @@ struct path;
 struct inode;
 struct linux_binprm;
 struct pt_regs;
-struct tomoyo_page_buffer;
 
 int tomoyo_check_file_perm(struct tomoyo_domain_info *domain,
 			   const char *filename, const u8 perm);
 int tomoyo_check_exec_perm(struct tomoyo_domain_info *domain,
-			   const struct tomoyo_path_info *filename,
-			   struct tomoyo_page_buffer *buf);
+			   const struct tomoyo_path_info *filename);
 int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
 				 struct path *path, const int flag);
 int tomoyo_check_1path_perm(struct tomoyo_domain_info *domain,
-- 
GitLab


From 1b58c2515be48d5df79d20210ac5a86e30094de2 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 4 Jun 2009 09:49:59 +1000
Subject: [PATCH 1585/2198] perf_counter: powerpc: Use new identifier names in
 powerpc-specific code

Commit b23f3325 ("perf_counter: Rename various fields") fixed up
most of the uses of the renamed fields, but missed one instance
of "record_type" in powerpc-specific code which needs to be changed
to "sample_type", and a "PERF_RECORD_ADDR" in the same statement that
needs to be changed to "PERF_SAMPLE_ADDR", causing compilation
errors on powerpc.  This fixes it.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18983.3111.770392.800486@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4cc4ac5c791c3..232b00a36f793 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1002,7 +1002,7 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		addr = 0;
-		if (counter->attr.record_type & PERF_RECORD_ADDR) {
+		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
 			/*
 			 * The user wants a data address recorded.
 			 * If we're not doing instruction sampling,
-- 
GitLab


From 3aff27ca84fa94311ae99189e54fed8d83b69fc1 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 3 Jun 2009 16:42:25 +0800
Subject: [PATCH 1586/2198] perf_counter: Documentation update

The 'nmi' bit is no longer there.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20090603084225.GA6553@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/design.txt | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt
index 9930c4bddc6fa..d3250763dc92b 100644
--- a/Documentation/perf_counter/design.txt
+++ b/Documentation/perf_counter/design.txt
@@ -48,7 +48,6 @@ struct perf_counter_hw_event {
         __u32                   read_format;
 
         __u64                   disabled       :  1, /* off by default        */
-                                nmi            :  1, /* NMI sampling          */
                                 inherit        :  1, /* children inherit it   */
                                 pinned         :  1, /* must always be on PMU */
                                 exclusive      :  1, /* only group on PMU     */
@@ -195,12 +194,6 @@ The 'disabled' bit specifies whether the counter starts out disabled
 or enabled.  If it is initially disabled, it can be enabled by ioctl
 or prctl (see below).
 
-The 'nmi' bit specifies, for hardware events, whether the counter
-should be set up to request non-maskable interrupts (NMIs) or normal
-interrupts.  This bit is ignored if the user doesn't have
-CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't
-generate NMIs from hardware counters.
-
 The 'inherit' bit, if set, specifies that this counter should count
 events on descendant tasks as well as the task specified.  This only
 applies to new descendents, not to any existing descendents at the
-- 
GitLab


From 48c72fccbfb1db01b5d0b98baff4442fea50d7a4 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 20:20:24 +0900
Subject: [PATCH 1587/2198] sh: 16-bit get_unaligned() sh4a fix

This patch fixes the 16-bit case of the sh4a specific
unaligned access implementation. Without this patch
the 16-bit version of sh4a get_unaligned() results in
a 32-bit read which may read more data than intended
and/or cross page boundaries.

Unbreaks mtd NOR write handling on Migo-R.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/unaligned-sh4a.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/sh/include/asm/unaligned-sh4a.h b/arch/sh/include/asm/unaligned-sh4a.h
index d8f89770275bc..9f4dd252c9813 100644
--- a/arch/sh/include/asm/unaligned-sh4a.h
+++ b/arch/sh/include/asm/unaligned-sh4a.h
@@ -3,9 +3,9 @@
 
 /*
  * SH-4A has support for unaligned 32-bit loads, and 32-bit loads only.
- * Support for 16 and 64-bit accesses are done through shifting and
- * masking relative to the endianness. Unaligned stores are not supported
- * by the instruction encoding, so these continue to use the packed
+ * Support for 64-bit accesses are done through shifting and masking
+ * relative to the endianness. Unaligned stores are not supported by the
+ * instruction encoding, so these continue to use the packed
  * struct.
  *
  * The same note as with the movli.l/movco.l pair applies here, as long
@@ -41,9 +41,9 @@ struct __una_u64 { u64 x __attribute__((packed)); };
 static inline u16 __get_unaligned_cpu16(const u8 *p)
 {
 #ifdef __LITTLE_ENDIAN
-	return __get_unaligned_cpu32(p) & 0xffff;
+	return p[0] | p[1] << 8;
 #else
-	return __get_unaligned_cpu32(p) >> 16;
+	return p[0] << 8 | p[1];
 #endif
 }
 
-- 
GitLab


From df97992c6e25ffc66c549c8bc59262dc627c6d17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 13:41:22 +0200
Subject: [PATCH 1588/2198] perf record/report: Fix PID/COMM handling

Fix two bugs causing lost comm mappings:

 - initial PID is not 0 but getpid()

 - when we are unable to handle an mmap event, dont assume the event
   itself is broken - try to parse the stream. This way we wont lose
   comm events.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 2 +-
 Documentation/perf_counter/builtin-report.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index efa2eb498e9cd..bf59df5bddf36 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -430,7 +430,7 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (!system_wide) {
-		open_counters(-1, target_pid != -1 ? target_pid : 0);
+		open_counters(-1, target_pid != -1 ? target_pid : getpid());
 	} else for (i = 0; i < nr_cpus; i++)
 		open_counters(i, target_pid);
 
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 86f23f0991f14..ff6f657476a93 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -852,7 +852,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 
 	if (thread == NULL || map == NULL) {
 		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
-		return -1;
+		return 0;
 	}
 
 	thread__insert_map(thread, map);
-- 
GitLab


From af794b94ae8a16fb4a9da6ce640c122efb44e2a0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 13:58:13 +0200
Subject: [PATCH 1589/2198] perf_counter tools: Build with native optimization

Build the tools with -march=native by default.

No measurable difference in speed though, compared to the
default, on a Nehalem testbox.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 005709b7b19ac..414399cbc51af 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6 -march=native
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
-- 
GitLab


From 95ed6fd06e52bf850cd17524f0b36ed14300c10d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 15:00:45 +0200
Subject: [PATCH 1590/2198] perf report: Simplify symbol output

The DSO can be printed already - no need to repeat it in the
symbol field.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ff6f657476a93..56c664d1b6281 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -382,7 +382,7 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
-	.header 	= "          Command",
+	.header		= "          Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
@@ -414,11 +414,11 @@ sort__dso_print(FILE *fp, struct hist_entry *self)
 	if (self->dso)
 		return fprintf(fp, "  %-25s", self->dso->name);
 
-	return fprintf(fp, "  %016llx", (__u64)self->ip);
+	return fprintf(fp, "  %016llx         ", (__u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
-	.header = " Shared Object          ",
+	.header = " Shared Object            ",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -447,21 +447,16 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	if (verbose)
 		ret += fprintf(fp, "  %#018llx", (__u64)self->ip);
 
-	if (self->dso)
-		ret += fprintf(fp, "  %s: ", self->dso->name);
-	else
-		ret += fprintf(fp, "  %#016llx: ", (__u64)self->ip);
-
 	if (self->sym)
-		ret += fprintf(fp, "%s", self->sym->name);
+		ret += fprintf(fp, "  %s", self->sym->name);
 	else
-		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+		ret += fprintf(fp, "  %#016llx", (__u64)self->ip);
 
 	return ret;
 }
 
 static struct sort_entry sort_sym = {
-	.header = " Shared Object: Symbol",
+	.header = " Symbol",
 	.cmp	= sort__sym_cmp,
 	.print	= sort__sym_print,
 };
-- 
GitLab


From bd74137ec9aaca3df3ff22b92455fddf7afaced1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 14:13:04 +0200
Subject: [PATCH 1591/2198] perf_counter tools: Print out symbol parsing errors
 only if --verbose

Also, add a suggestion to 'perf report', if the default sort order is
used.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 18 +++++++++++---
 Documentation/perf_counter/builtin-top.c    |  2 +-
 Documentation/perf_counter/util/symbol.c    | 27 ++++++++++++---------
 Documentation/perf_counter/util/symbol.h    |  4 +--
 4 files changed, 32 insertions(+), 19 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 56c664d1b6281..15fe9dae792b0 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -26,7 +26,10 @@
 
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
-static char		*sort_order = "comm,dso";
+
+static char		default_sort_order[] = "comm,dso";
+static char		*sort_order = default_sort_order;
+
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
 
@@ -103,9 +106,10 @@ static struct dso *dsos__findnew(const char *name)
 	if (!dso)
 		goto out_delete_dso;
 
-	nr = dso__load(dso, NULL);
+	nr = dso__load(dso, NULL, verbose);
 	if (nr < 0) {
-		fprintf(stderr, "Failed to open: %s\n", name);
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
 		goto out_delete_dso;
 	}
 	if (!nr && verbose) {
@@ -139,7 +143,7 @@ static int load_kernel(void)
 	if (!kernel_dso)
 		return -1;
 
-	err = dso__load_kernel(kernel_dso, vmlinux, NULL);
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
 	if (err) {
 		dso__delete(kernel_dso);
 		kernel_dso = NULL;
@@ -741,6 +745,12 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 		ret += hist_entry__fprintf(fp, pos, total_samples);
 	}
 
+	if (!strcmp(sort_order, default_sort_order)) {
+		fprintf(fp, "#\n");
+		fprintf(fp, "# ( For more details, try: perf report --sort comm,dso,symbol )\n");
+		fprintf(fp, "#\n");
+	}
+
 	return ret;
 }
 
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 3f7778ba00bc8..548a8da4b15bd 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -349,7 +349,7 @@ static int parse_symbols(void)
 	if (kernel_dso == NULL)
 		return -1;
 
-	if (dso__load_kernel(kernel_dso, NULL, symbol_filter) != 0)
+	if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0)
 		goto out_delete_dso;
 
 	node = rb_first(&kernel_dso->syms);
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 35ee6de1e513c..15d5cf9abfac6 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -124,7 +124,7 @@ size_t dso__fprintf(struct dso *self, FILE *fp)
 	return ret;
 }
 
-static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter)
+static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose)
 {
 	struct rb_node *nd, *prevnd;
 	char *line = NULL;
@@ -370,7 +370,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 }
 
 static int dso__load_sym(struct dso *self, int fd, const char *name,
-			 symbol_filter_t filter)
+			 symbol_filter_t filter, int verbose)
 {
 	Elf_Data *symstrs;
 	uint32_t nr_syms;
@@ -387,13 +387,15 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
-		fprintf(stderr, "%s: cannot read %s ELF file.\n",
-			__func__, name);
+		if (verbose)
+			fprintf(stderr, "%s: cannot read %s ELF file.\n",
+				__func__, name);
 		goto out_close;
 	}
 
 	if (gelf_getehdr(elf, &ehdr) == NULL) {
-		fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		if (verbose)
+			fprintf(stderr, "%s: cannot get elf header.\n", __func__);
 		goto out_elf_end;
 	}
 
@@ -473,7 +475,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	return err;
 }
 
-int dso__load(struct dso *self, symbol_filter_t filter)
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
 {
 	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug");
 	char *name = malloc(size);
@@ -505,7 +507,7 @@ int dso__load(struct dso *self, symbol_filter_t filter)
 		fd = open(name, O_RDONLY);
 	} while (fd < 0);
 
-	ret = dso__load_sym(self, fd, name, filter);
+	ret = dso__load_sym(self, fd, name, filter, verbose);
 	close(fd);
 
 	/*
@@ -520,28 +522,29 @@ int dso__load(struct dso *self, symbol_filter_t filter)
 }
 
 static int dso__load_vmlinux(struct dso *self, const char *vmlinux,
-			     symbol_filter_t filter)
+			     symbol_filter_t filter, int verbose)
 {
 	int err, fd = open(vmlinux, O_RDONLY);
 
 	if (fd < 0)
 		return -1;
 
-	err = dso__load_sym(self, fd, vmlinux, filter);
+	err = dso__load_sym(self, fd, vmlinux, filter, verbose);
 	close(fd);
 
 	return err;
 }
 
-int dso__load_kernel(struct dso *self, const char *vmlinux, symbol_filter_t filter)
+int dso__load_kernel(struct dso *self, const char *vmlinux,
+		     symbol_filter_t filter, int verbose)
 {
 	int err = -1;
 
 	if (vmlinux)
-		err = dso__load_vmlinux(self, vmlinux, filter);
+		err = dso__load_vmlinux(self, vmlinux, filter, verbose);
 
 	if (err)
-		err = dso__load_kallsyms(self, filter);
+		err = dso__load_kallsyms(self, filter, verbose);
 
 	return err;
 }
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index b0299bc0cf50e..8dd8522a0a0c9 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -32,8 +32,8 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
 
 int dso__load_kernel(struct dso *self, const char *vmlinux,
-		     symbol_filter_t filter);
-int dso__load(struct dso *self, symbol_filter_t filter);
+		     symbol_filter_t filter, int verbose);
+int dso__load(struct dso *self, symbol_filter_t filter, int verbose);
 
 size_t dso__fprintf(struct dso *self, FILE *fp);
 
-- 
GitLab


From 05ca061eb9704ad9b0739f88046276792b75f2c1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 14:21:16 +0200
Subject: [PATCH 1592/2198] perf report: Print out the total number of events

So that the statistical quality of the profile can be estimated at a glance.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 15fe9dae792b0..be392e0f77c6e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -721,6 +721,8 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	fprintf(fp, "#\n");
+	fprintf(fp, "# (%Ld profiler events)\n", (__u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
-- 
GitLab


From 71dd8945d8d827ab101cd287f9480ef22fc7c1b6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 15:16:56 +0200
Subject: [PATCH 1593/2198] perf report: Add consistent spacing rules

Make the sort header and the print function have the same column width.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 34 ++++++++++++---------
 1 file changed, 19 insertions(+), 15 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index be392e0f77c6e..e930b4e02335d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -344,11 +344,11 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__thread_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, " %16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "         Command: Pid ",
+	.header = "         Command:  Pid",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
 };
@@ -382,11 +382,11 @@ sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 static size_t
 sort__comm_print(FILE *fp, struct hist_entry *self)
 {
-	return fprintf(fp, "  %16s", self->thread->comm);
+	return fprintf(fp, "%16s", self->thread->comm);
 }
 
 static struct sort_entry sort_comm = {
-	.header		= "          Command",
+	.header 	= "         Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
@@ -416,13 +416,13 @@ static size_t
 sort__dso_print(FILE *fp, struct hist_entry *self)
 {
 	if (self->dso)
-		return fprintf(fp, "  %-25s", self->dso->name);
+		return fprintf(fp, "%-25s", self->dso->name);
 
-	return fprintf(fp, "  %016llx         ", (__u64)self->ip);
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
-	.header = " Shared Object            ",
+	.header = "Shared Object            ",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
 };
@@ -449,18 +449,18 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "  %#018llx", (__u64)self->ip);
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
 
 	if (self->sym)
-		ret += fprintf(fp, "  %s", self->sym->name);
+		ret += fprintf(fp, "%s", self->sym->name);
 	else
-		ret += fprintf(fp, "  %#016llx", (__u64)self->ip);
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
 
 	return ret;
 }
 
 static struct sort_entry sort_sym = {
-	.header = " Symbol",
+	.header = "Symbol",
 	.cmp	= sort__sym_cmp,
 	.print	= sort__sym_print,
 };
@@ -553,8 +553,10 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
 
-	list_for_each_entry(se, &hist_entry__sort_list, list)
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		fprintf(fp, "  ");
 		ret += se->print(fp, self);
+	}
 
 	ret += fprintf(fp, "\n");
 
@@ -721,13 +723,14 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 	struct rb_node *nd;
 	size_t ret = 0;
 
+	fprintf(fp, "\n");
 	fprintf(fp, "#\n");
 	fprintf(fp, "# (%Ld profiler events)\n", (__u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
 	list_for_each_entry(se, &hist_entry__sort_list, list)
-		fprintf(fp, " %s", se->header);
+		fprintf(fp, "  %s", se->header);
 	fprintf(fp, "\n");
 
 	fprintf(fp, "# ........");
@@ -735,7 +738,7 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 		int i;
 
 		fprintf(fp, "  ");
-		for (i = 0; i < strlen(se->header)-1; i++)
+		for (i = 0; i < strlen(se->header); i++)
 			fprintf(fp, ".");
 	}
 	fprintf(fp, "\n");
@@ -749,9 +752,10 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 
 	if (!strcmp(sort_order, default_sort_order)) {
 		fprintf(fp, "#\n");
-		fprintf(fp, "# ( For more details, try: perf report --sort comm,dso,symbol )\n");
+		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n");
 		fprintf(fp, "#\n");
 	}
+	fprintf(fp, "\n");
 
 	return ret;
 }
-- 
GitLab


From 8fc0321f1ad0ffef969056dda91b453bbd7a494d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 15:19:47 +0200
Subject: [PATCH 1594/2198] perf_counter tools: Add color terminal output
 support

Add Git's color printing library to util/color.[ch].

Add it to perf report, with a trivial example to print high-overhead
entries in red, low-overhead entries in green.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile           |   2 +
 Documentation/perf_counter/builtin-report.c   |  15 +-
 Documentation/perf_counter/builtin-top.c      |  23 +-
 Documentation/perf_counter/util/color.c       | 230 ++++++++++++++++++
 Documentation/perf_counter/util/color.h       |  36 +++
 Documentation/perf_counter/util/environment.c |   1 +
 6 files changed, 300 insertions(+), 7 deletions(-)
 create mode 100644 Documentation/perf_counter/util/color.c
 create mode 100644 Documentation/perf_counter/util/color.h

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 414399cbc51af..c9ec4585f4d62 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -298,6 +298,7 @@ LIB_H += util/string.h
 LIB_H += util/run-command.h
 LIB_H += util/sigchain.h
 LIB_H += util/symbol.h
+LIB_H += util/color.h
 
 LIB_OBJS += util/abspath.o
 LIB_OBJS += util/alias.o
@@ -319,6 +320,7 @@ LIB_OBJS += util/usage.o
 LIB_OBJS += util/wrapper.o
 LIB_OBJS += util/sigchain.o
 LIB_OBJS += util/symbol.o
+LIB_OBJS += util/color.o
 LIB_OBJS += util/pager.o
 
 BUILTIN_OBJS += builtin-help.o
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index e930b4e02335d..7beedc6effa4d 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -9,6 +9,7 @@
 
 #include "util/util.h"
 
+#include "util/color.h"
 #include "util/list.h"
 #include "util/cache.h"
 #include "util/rbtree.h"
@@ -548,7 +549,19 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 	size_t ret;
 
 	if (total_samples) {
-		ret = fprintf(fp, "   %6.2f%%",
+		double percent = self->count * 100.0 / total_samples;
+		char *color = PERF_COLOR_NORMAL;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		if (percent < 0.5)
+			color = PERF_COLOR_GREEN;
+
+		ret = color_fprintf(fp, color, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
 		ret = fprintf(fp, "%12d ", self->count);
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 548a8da4b15bd..20e5b12009594 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -21,6 +21,7 @@
 #include "perf.h"
 
 #include "util/symbol.h"
+#include "util/color.h"
 #include "util/util.h"
 #include "util/rbtree.h"
 #include "util/parse-options.h"
@@ -253,7 +254,8 @@ static void print_sym_table(void)
 	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
 		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node);
 		struct symbol *sym = (struct symbol *)(syme + 1);
-		float pcnt;
+		char *color = PERF_COLOR_NORMAL;
+		double pcnt;
 
 		if (++printed > print_entries || syme->snap_count < count_filter)
 			continue;
@@ -261,13 +263,22 @@ static void print_sym_table(void)
 		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
 					 sum_kevents));
 
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (pcnt >= 5.0)
+			color = PERF_COLOR_RED;
+		if (pcnt < 0.5)
+			color = PERF_COLOR_GREEN;
+
 		if (nr_counters == 1)
-			printf("%19.2f - %4.1f%% - %016llx : %s\n",
-				syme->weight, pcnt, sym->start, sym->name);
+			printf("%19.2f - ", syme->weight);
 		else
-			printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n",
-				syme->weight, syme->snap_count,
-				pcnt, sym->start, sym->name);
+			printf("%8.1f %10ld - ", syme->weight, syme->snap_count);
+
+		color_fprintf(stdout, color, "%4.1f%%", pcnt);
+		printf(" - %016llx : %s\n", sym->start, sym->name);
 	}
 
 	{
diff --git a/Documentation/perf_counter/util/color.c b/Documentation/perf_counter/util/color.c
new file mode 100644
index 0000000000000..a77975d66774d
--- /dev/null
+++ b/Documentation/perf_counter/util/color.c
@@ -0,0 +1,230 @@
+#include "cache.h"
+#include "color.h"
+
+int perf_use_color_default = 0;
+
+static int parse_color(const char *name, int len)
+{
+	static const char * const color_names[] = {
+		"normal", "black", "red", "green", "yellow",
+		"blue", "magenta", "cyan", "white"
+	};
+	char *end;
+	int i;
+	for (i = 0; i < ARRAY_SIZE(color_names); i++) {
+		const char *str = color_names[i];
+		if (!strncasecmp(name, str, len) && !str[len])
+			return i - 1;
+	}
+	i = strtol(name, &end, 10);
+	if (end - name == len && i >= -1 && i <= 255)
+		return i;
+	return -2;
+}
+
+static int parse_attr(const char *name, int len)
+{
+	static const int attr_values[] = { 1, 2, 4, 5, 7 };
+	static const char * const attr_names[] = {
+		"bold", "dim", "ul", "blink", "reverse"
+	};
+	int i;
+	for (i = 0; i < ARRAY_SIZE(attr_names); i++) {
+		const char *str = attr_names[i];
+		if (!strncasecmp(name, str, len) && !str[len])
+			return attr_values[i];
+	}
+	return -1;
+}
+
+void color_parse(const char *value, const char *var, char *dst)
+{
+	color_parse_mem(value, strlen(value), var, dst);
+}
+
+void color_parse_mem(const char *value, int value_len, const char *var,
+		char *dst)
+{
+	const char *ptr = value;
+	int len = value_len;
+	int attr = -1;
+	int fg = -2;
+	int bg = -2;
+
+	if (!strncasecmp(value, "reset", len)) {
+		strcpy(dst, PERF_COLOR_RESET);
+		return;
+	}
+
+	/* [fg [bg]] [attr] */
+	while (len > 0) {
+		const char *word = ptr;
+		int val, wordlen = 0;
+
+		while (len > 0 && !isspace(word[wordlen])) {
+			wordlen++;
+			len--;
+		}
+
+		ptr = word + wordlen;
+		while (len > 0 && isspace(*ptr)) {
+			ptr++;
+			len--;
+		}
+
+		val = parse_color(word, wordlen);
+		if (val >= -1) {
+			if (fg == -2) {
+				fg = val;
+				continue;
+			}
+			if (bg == -2) {
+				bg = val;
+				continue;
+			}
+			goto bad;
+		}
+		val = parse_attr(word, wordlen);
+		if (val < 0 || attr != -1)
+			goto bad;
+		attr = val;
+	}
+
+	if (attr >= 0 || fg >= 0 || bg >= 0) {
+		int sep = 0;
+
+		*dst++ = '\033';
+		*dst++ = '[';
+		if (attr >= 0) {
+			*dst++ = '0' + attr;
+			sep++;
+		}
+		if (fg >= 0) {
+			if (sep++)
+				*dst++ = ';';
+			if (fg < 8) {
+				*dst++ = '3';
+				*dst++ = '0' + fg;
+			} else {
+				dst += sprintf(dst, "38;5;%d", fg);
+			}
+		}
+		if (bg >= 0) {
+			if (sep++)
+				*dst++ = ';';
+			if (bg < 8) {
+				*dst++ = '4';
+				*dst++ = '0' + bg;
+			} else {
+				dst += sprintf(dst, "48;5;%d", bg);
+			}
+		}
+		*dst++ = 'm';
+	}
+	*dst = 0;
+	return;
+bad:
+	die("bad color value '%.*s' for variable '%s'", value_len, value, var);
+}
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty)
+{
+	if (value) {
+		if (!strcasecmp(value, "never"))
+			return 0;
+		if (!strcasecmp(value, "always"))
+			return 1;
+		if (!strcasecmp(value, "auto"))
+			goto auto_color;
+	}
+
+	/* Missing or explicit false to turn off colorization */
+	if (!perf_config_bool(var, value))
+		return 0;
+
+	/* any normal truth value defaults to 'auto' */
+ auto_color:
+	if (stdout_is_tty < 0)
+		stdout_is_tty = isatty(1);
+	if (stdout_is_tty || (pager_in_use() && pager_use_color)) {
+		char *term = getenv("TERM");
+		if (term && strcmp(term, "dumb"))
+			return 1;
+	}
+	return 0;
+}
+
+int perf_color_default_config(const char *var, const char *value, void *cb)
+{
+	if (!strcmp(var, "color.ui")) {
+		perf_use_color_default = perf_config_colorbool(var, value, -1);
+		return 0;
+	}
+
+	return perf_default_config(var, value, cb);
+}
+
+static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
+		va_list args, const char *trail)
+{
+	int r = 0;
+
+	if (*color)
+		r += fprintf(fp, "%s", color);
+	r += vfprintf(fp, fmt, args);
+	if (*color)
+		r += fprintf(fp, "%s", PERF_COLOR_RESET);
+	if (trail)
+		r += fprintf(fp, "%s", trail);
+	return r;
+}
+
+
+
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+	va_start(args, fmt);
+	r = color_vfprintf(fp, color, fmt, args, NULL);
+	va_end(args);
+	return r;
+}
+
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+	va_start(args, fmt);
+	r = color_vfprintf(fp, color, fmt, args, "\n");
+	va_end(args);
+	return r;
+}
+
+/*
+ * This function splits the buffer by newlines and colors the lines individually.
+ *
+ * Returns 0 on success.
+ */
+int color_fwrite_lines(FILE *fp, const char *color,
+		size_t count, const char *buf)
+{
+	if (!*color)
+		return fwrite(buf, count, 1, fp) != 1;
+	while (count) {
+		char *p = memchr(buf, '\n', count);
+		if (p != buf && (fputs(color, fp) < 0 ||
+				fwrite(buf, p ? p - buf : count, 1, fp) != 1 ||
+				fputs(PERF_COLOR_RESET, fp) < 0))
+			return -1;
+		if (!p)
+			return 0;
+		if (fputc('\n', fp) < 0)
+			return -1;
+		count -= p + 1 - buf;
+		buf = p + 1;
+	}
+	return 0;
+}
+
+
diff --git a/Documentation/perf_counter/util/color.h b/Documentation/perf_counter/util/color.h
new file mode 100644
index 0000000000000..5abfd379582b4
--- /dev/null
+++ b/Documentation/perf_counter/util/color.h
@@ -0,0 +1,36 @@
+#ifndef COLOR_H
+#define COLOR_H
+
+/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */
+#define COLOR_MAXLEN 24
+
+#define PERF_COLOR_NORMAL	""
+#define PERF_COLOR_RESET	"\033[m"
+#define PERF_COLOR_BOLD		"\033[1m"
+#define PERF_COLOR_RED		"\033[31m"
+#define PERF_COLOR_GREEN	"\033[32m"
+#define PERF_COLOR_YELLOW	"\033[33m"
+#define PERF_COLOR_BLUE		"\033[34m"
+#define PERF_COLOR_MAGENTA	"\033[35m"
+#define PERF_COLOR_CYAN		"\033[36m"
+#define PERF_COLOR_BG_RED	"\033[41m"
+
+/*
+ * This variable stores the value of color.ui
+ */
+extern int perf_use_color_default;
+
+
+/*
+ * Use this instead of perf_default_config if you need the value of color.ui.
+ */
+int perf_color_default_config(const char *var, const char *value, void *cb);
+
+int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty);
+void color_parse(const char *value, const char *var, char *dst);
+void color_parse_mem(const char *value, int len, const char *var, char *dst);
+int color_fprintf(FILE *fp, const char *color, const char *fmt, ...);
+int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...);
+int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf);
+
+#endif /* COLOR_H */
diff --git a/Documentation/perf_counter/util/environment.c b/Documentation/perf_counter/util/environment.c
index 9b1c8199e7293..275b0ee345f5e 100644
--- a/Documentation/perf_counter/util/environment.c
+++ b/Documentation/perf_counter/util/environment.c
@@ -6,3 +6,4 @@
 #include "cache.h"
 
 const char *pager_program;
+int pager_use_color = 1;
-- 
GitLab


From 13d0ab5ec29852a6925f612830fa9e822669ece6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 15:40:25 +0200
Subject: [PATCH 1595/2198] perf_counter tools: Dont output in color on !tty

Dont emit ASCII color characters if the terminal is not a tty,
such as when perf report gets redirected into a file.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/color.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/Documentation/perf_counter/util/color.c b/Documentation/perf_counter/util/color.c
index a77975d66774d..9a8c20ccc53e2 100644
--- a/Documentation/perf_counter/util/color.c
+++ b/Documentation/perf_counter/util/color.c
@@ -1,7 +1,7 @@
 #include "cache.h"
 #include "color.h"
 
-int perf_use_color_default = 0;
+int perf_use_color_default = -1;
 
 static int parse_color(const char *name, int len)
 {
@@ -169,10 +169,20 @@ static int color_vfprintf(FILE *fp, const char *color, const char *fmt,
 {
 	int r = 0;
 
-	if (*color)
+	/*
+	 * Auto-detect:
+	 */
+	if (perf_use_color_default < 0) {
+		if (isatty(1) || pager_in_use())
+			perf_use_color_default = 1;
+		else
+			perf_use_color_default = 0;
+	}
+
+	if (perf_use_color_default && *color)
 		r += fprintf(fp, "%s", color);
 	r += vfprintf(fp, fmt, args);
-	if (*color)
+	if (perf_use_color_default && *color)
 		r += fprintf(fp, "%s", PERF_COLOR_RESET);
 	if (trail)
 		r += fprintf(fp, "%s", trail);
@@ -185,6 +195,7 @@ int color_fprintf(FILE *fp, const char *color, const char *fmt, ...)
 {
 	va_list args;
 	int r;
+
 	va_start(args, fmt);
 	r = color_vfprintf(fp, color, fmt, args, NULL);
 	va_end(args);
-- 
GitLab


From edc52deac624e4641211a325c23da2a73b01a85d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 16:24:37 +0200
Subject: [PATCH 1596/2198] perf report: Bail out if there are unrecognized
 options/arguments

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 7beedc6effa4d..389ae2569f499 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -1094,10 +1094,16 @@ int cmd_report(int argc, const char **argv, const char *prefix)
 
 	page_size = getpagesize();
 
-	parse_options(argc, argv, options, report_usage, 0);
+	argc = parse_options(argc, argv, options, report_usage, 0);
 
 	setup_sorting();
 
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(report_usage, options);
+
 	setup_pager();
 
 	return __cmd_report();
-- 
GitLab


From 20c84e959ec11b1803d2b2832eef703d5fbe7f7b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 4 Jun 2009 16:33:00 +0200
Subject: [PATCH 1597/2198] perf stat: Update help text

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/Documentation/perf-stat.txt  | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index a67d0e3b7d0d7..a340e7be83ddd 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -22,6 +22,7 @@ OPTIONS
 <command>...::
 	Any command you can specify in a shell.
 
+
 -e::
 --event=::
                              0:0: cpu-cycles          
@@ -45,6 +46,13 @@ OPTIONS
                              1:4: migrations          
                            rNNN: raw PMU events (eventsel+umask)
 
+-i::
+--inherit::
+        child tasks inherit counters
+-p::
+--pid=<pid>::
+        stat events on existing pid
+
 -a::
         system-wide collection
 
@@ -54,20 +62,20 @@ OPTIONS
 EXAMPLES
 --------
 
-$ perf stat sleep 1
+$ perf stat -- make -j
 
- Performance counter stats for 'sleep':
+ Performance counter stats for 'make -j':
 
-       0.678356  task clock ticks     (msecs)
-              7  context switches     (events)
-              4  CPU migrations       (events)
-            232  pagefaults           (events)
-        1810403  CPU cycles           (events)
-         946759  instructions         (events)
-          18952  cache references     (events)
-           4885  cache misses         (events)
+    8117.370256  task clock ticks     #      11.281 CPU utilization factor
+            678  context switches     #       0.000 M/sec
+            133  CPU migrations       #       0.000 M/sec
+         235724  pagefaults           #       0.029 M/sec
+    24821162526  CPU cycles           #    3057.784 M/sec
+    18687303457  instructions         #    2302.138 M/sec
+      172158895  cache references     #      21.209 M/sec
+       27075259  cache misses         #       3.335 M/sec
 
- Wall-clock time elapsed:  1001.252894 msecs
+ Wall-clock time elapsed:   719.554352 msecs
 
 SEE ALSO
 --------
-- 
GitLab


From 60313ebed739b331e8e61079da27a11ee3b73a30 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 16:53:44 +0200
Subject: [PATCH 1598/2198] perf_counter: Add fork event

Create a fork event so that we can easily clone the comm and
dso maps without having to generate all those events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  10 +++
 kernel/fork.c                |   4 +-
 kernel/perf_counter.c        | 131 ++++++++++++++++++++++++++++++-----
 3 files changed, 126 insertions(+), 19 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 37d5541d74cb0..380247bdb9180 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -276,6 +276,14 @@ enum perf_event_type {
 	PERF_EVENT_THROTTLE		= 5,
 	PERF_EVENT_UNTHROTTLE		= 6,
 
+	/*
+	 * struct {
+	 * 	struct perf_event_header	header;
+	 * 	u32				pid, ppid;
+	 * };
+	 */
+	PERF_EVENT_FORK			= 7,
+
 	/*
 	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
 	 * will be PERF_RECORD_*
@@ -618,6 +626,7 @@ extern void perf_counter_munmap(unsigned long addr, unsigned long len,
 				unsigned long pgoff, struct file *file);
 
 extern void perf_counter_comm(struct task_struct *tsk);
+extern void perf_counter_fork(struct task_struct *tsk);
 
 extern void perf_counter_task_migration(struct task_struct *task, int cpu);
 
@@ -673,6 +682,7 @@ perf_counter_munmap(unsigned long addr, unsigned long len,
 		    unsigned long pgoff, struct file *file)		{ }
 
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
+static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
 static inline void perf_counter_task_migration(struct task_struct *task,
 					       int cpu)			{ }
diff --git a/kernel/fork.c b/kernel/fork.c
index b7d7a9f0bd7ad..f4466ca37ece6 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1412,12 +1412,12 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			p->vfork_done = &vfork;
 			init_completion(&vfork);
-		} else {
+		} else if (!(clone_flags & CLONE_VM)) {
 			/*
 			 * vfork will do an exec which will call
 			 * set_task_comm()
 			 */
-			perf_counter_comm(p);
+			perf_counter_fork(p);
 		}
 
 		audit_finish_fork(p);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0bb03f15a5b67..78c58623a0dd2 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -40,9 +40,9 @@ static int perf_reserved_percpu __read_mostly;
 static int perf_overcommit __read_mostly = 1;
 
 static atomic_t nr_counters __read_mostly;
-static atomic_t nr_mmap_tracking __read_mostly;
-static atomic_t nr_munmap_tracking __read_mostly;
-static atomic_t nr_comm_tracking __read_mostly;
+static atomic_t nr_mmap_counters __read_mostly;
+static atomic_t nr_munmap_counters __read_mostly;
+static atomic_t nr_comm_counters __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
@@ -1447,11 +1447,11 @@ static void free_counter(struct perf_counter *counter)
 
 	atomic_dec(&nr_counters);
 	if (counter->attr.mmap)
-		atomic_dec(&nr_mmap_tracking);
+		atomic_dec(&nr_mmap_counters);
 	if (counter->attr.munmap)
-		atomic_dec(&nr_munmap_tracking);
+		atomic_dec(&nr_munmap_counters);
 	if (counter->attr.comm)
-		atomic_dec(&nr_comm_tracking);
+		atomic_dec(&nr_comm_counters);
 
 	if (counter->destroy)
 		counter->destroy(counter);
@@ -2475,6 +2475,105 @@ static void perf_counter_output(struct perf_counter *counter,
 	perf_output_end(&handle);
 }
 
+/*
+ * fork tracking
+ */
+
+struct perf_fork_event {
+	struct task_struct	*task;
+
+	struct {
+		struct perf_event_header	header;
+
+		u32				pid;
+		u32				ppid;
+	} event;
+};
+
+static void perf_counter_fork_output(struct perf_counter *counter,
+				     struct perf_fork_event *fork_event)
+{
+	struct perf_output_handle handle;
+	int size = fork_event->event.header.size;
+	struct task_struct *task = fork_event->task;
+	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+
+	if (ret)
+		return;
+
+	fork_event->event.pid = perf_counter_pid(counter, task);
+	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+
+	perf_output_put(&handle, fork_event->event);
+	perf_output_end(&handle);
+}
+
+static int perf_counter_fork_match(struct perf_counter *counter)
+{
+	if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap)
+		return 1;
+
+	return 0;
+}
+
+static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
+				  struct perf_fork_event *fork_event)
+{
+	struct perf_counter *counter;
+
+	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list))
+		return;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
+		if (perf_counter_fork_match(counter))
+			perf_counter_fork_output(counter, fork_event);
+	}
+	rcu_read_unlock();
+}
+
+static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+{
+	struct perf_cpu_context *cpuctx;
+	struct perf_counter_context *ctx;
+
+	cpuctx = &get_cpu_var(perf_cpu_context);
+	perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+	put_cpu_var(perf_cpu_context);
+
+	rcu_read_lock();
+	/*
+	 * doesn't really matter which of the child contexts the
+	 * events ends up in.
+	 */
+	ctx = rcu_dereference(current->perf_counter_ctxp);
+	if (ctx)
+		perf_counter_fork_ctx(ctx, fork_event);
+	rcu_read_unlock();
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+	struct perf_fork_event fork_event;
+
+	if (!atomic_read(&nr_comm_counters) &&
+	    !atomic_read(&nr_mmap_counters) &&
+	    !atomic_read(&nr_munmap_counters))
+		return;
+
+	fork_event = (struct perf_fork_event){
+		.task	= task,
+		.event  = {
+			.header = {
+				.type = PERF_EVENT_FORK,
+				.size = sizeof(fork_event.event),
+			},
+		},
+	};
+
+	perf_counter_fork_event(&fork_event);
+}
+
 /*
  * comm tracking
  */
@@ -2511,11 +2610,9 @@ static void perf_counter_comm_output(struct perf_counter *counter,
 	perf_output_end(&handle);
 }
 
-static int perf_counter_comm_match(struct perf_counter *counter,
-				   struct perf_comm_event *comm_event)
+static int perf_counter_comm_match(struct perf_counter *counter)
 {
-	if (counter->attr.comm &&
-	    comm_event->event.header.type == PERF_EVENT_COMM)
+	if (counter->attr.comm)
 		return 1;
 
 	return 0;
@@ -2531,7 +2628,7 @@ static void perf_counter_comm_ctx(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_counter_comm_match(counter, comm_event))
+		if (perf_counter_comm_match(counter))
 			perf_counter_comm_output(counter, comm_event);
 	}
 	rcu_read_unlock();
@@ -2570,7 +2667,7 @@ void perf_counter_comm(struct task_struct *task)
 {
 	struct perf_comm_event comm_event;
 
-	if (!atomic_read(&nr_comm_tracking))
+	if (!atomic_read(&nr_comm_counters))
 		return;
 
 	comm_event = (struct perf_comm_event){
@@ -2708,7 +2805,7 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 {
 	struct perf_mmap_event mmap_event;
 
-	if (!atomic_read(&nr_mmap_tracking))
+	if (!atomic_read(&nr_mmap_counters))
 		return;
 
 	mmap_event = (struct perf_mmap_event){
@@ -2729,7 +2826,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len,
 {
 	struct perf_mmap_event mmap_event;
 
-	if (!atomic_read(&nr_munmap_tracking))
+	if (!atomic_read(&nr_munmap_counters))
 		return;
 
 	mmap_event = (struct perf_mmap_event){
@@ -3427,11 +3524,11 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 
 	atomic_inc(&nr_counters);
 	if (counter->attr.mmap)
-		atomic_inc(&nr_mmap_tracking);
+		atomic_inc(&nr_mmap_counters);
 	if (counter->attr.munmap)
-		atomic_inc(&nr_munmap_tracking);
+		atomic_inc(&nr_munmap_counters);
 	if (counter->attr.comm)
-		atomic_inc(&nr_comm_tracking);
+		atomic_inc(&nr_comm_counters);
 
 	return counter;
 }
-- 
GitLab


From d99e9446200c1ffab28cb0e39b76c34a2bfafd06 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 17:08:58 +0200
Subject: [PATCH 1599/2198] perf_counter: Remove munmap stuff

In name of keeping it simple, only track mmap events. Userspace
will have to remove old overlapping maps when it encounters them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 11 +----------
 kernel/perf_counter.c        | 38 +++---------------------------------
 mm/mmap.c                    |  6 ------
 3 files changed, 4 insertions(+), 51 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 380247bdb9180..6ca403acd4196 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -148,11 +148,10 @@ struct perf_counter_attr {
 				exclude_hv     :  1, /* ditto hypervisor      */
 				exclude_idle   :  1, /* don't count when idle */
 				mmap           :  1, /* include mmap data     */
-				munmap         :  1, /* include munmap data   */
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 52;
+				__reserved_1   : 53;
 
 	__u32			wakeup_events;	/* wakeup every n events */
 	__u32			__reserved_2;
@@ -246,7 +245,6 @@ enum perf_event_type {
 	 * };
 	 */
 	PERF_EVENT_MMAP			= 1,
-	PERF_EVENT_MUNMAP		= 2,
 
 	/*
 	 * struct {
@@ -622,9 +620,6 @@ extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
 extern void perf_counter_mmap(unsigned long addr, unsigned long len,
 			      unsigned long pgoff, struct file *file);
 
-extern void perf_counter_munmap(unsigned long addr, unsigned long len,
-				unsigned long pgoff, struct file *file);
-
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
 
@@ -677,10 +672,6 @@ static inline void
 perf_counter_mmap(unsigned long addr, unsigned long len,
 		  unsigned long pgoff, struct file *file)		{ }
 
-static inline void
-perf_counter_munmap(unsigned long addr, unsigned long len,
-		    unsigned long pgoff, struct file *file)		{ }
-
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 78c58623a0dd2..195712e20d075 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -41,7 +41,6 @@ static int perf_overcommit __read_mostly = 1;
 
 static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
-static atomic_t nr_munmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
@@ -1448,8 +1447,6 @@ static void free_counter(struct perf_counter *counter)
 	atomic_dec(&nr_counters);
 	if (counter->attr.mmap)
 		atomic_dec(&nr_mmap_counters);
-	if (counter->attr.munmap)
-		atomic_dec(&nr_munmap_counters);
 	if (counter->attr.comm)
 		atomic_dec(&nr_comm_counters);
 
@@ -2510,7 +2507,7 @@ static void perf_counter_fork_output(struct perf_counter *counter,
 
 static int perf_counter_fork_match(struct perf_counter *counter)
 {
-	if (counter->attr.comm || counter->attr.mmap || counter->attr.munmap)
+	if (counter->attr.comm || counter->attr.mmap)
 		return 1;
 
 	return 0;
@@ -2557,8 +2554,7 @@ void perf_counter_fork(struct task_struct *task)
 	struct perf_fork_event fork_event;
 
 	if (!atomic_read(&nr_comm_counters) &&
-	    !atomic_read(&nr_mmap_counters) &&
-	    !atomic_read(&nr_munmap_counters))
+	    !atomic_read(&nr_mmap_counters))
 		return;
 
 	fork_event = (struct perf_fork_event){
@@ -2722,12 +2718,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter,
 static int perf_counter_mmap_match(struct perf_counter *counter,
 				   struct perf_mmap_event *mmap_event)
 {
-	if (counter->attr.mmap &&
-	    mmap_event->event.header.type == PERF_EVENT_MMAP)
-		return 1;
-
-	if (counter->attr.munmap &&
-	    mmap_event->event.header.type == PERF_EVENT_MUNMAP)
+	if (counter->attr.mmap)
 		return 1;
 
 	return 0;
@@ -2821,27 +2812,6 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 	perf_counter_mmap_event(&mmap_event);
 }
 
-void perf_counter_munmap(unsigned long addr, unsigned long len,
-			 unsigned long pgoff, struct file *file)
-{
-	struct perf_mmap_event mmap_event;
-
-	if (!atomic_read(&nr_munmap_counters))
-		return;
-
-	mmap_event = (struct perf_mmap_event){
-		.file   = file,
-		.event  = {
-			.header = { .type = PERF_EVENT_MUNMAP, },
-			.start  = addr,
-			.len    = len,
-			.pgoff  = pgoff,
-		},
-	};
-
-	perf_counter_mmap_event(&mmap_event);
-}
-
 /*
  * Log sample_period changes so that analyzing tools can re-normalize the
  * event flow.
@@ -3525,8 +3495,6 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	atomic_inc(&nr_counters);
 	if (counter->attr.mmap)
 		atomic_inc(&nr_mmap_counters);
-	if (counter->attr.munmap)
-		atomic_inc(&nr_munmap_counters);
 	if (counter->attr.comm)
 		atomic_inc(&nr_comm_counters);
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 2c1c2cb0e2e18..6451ce2854b9f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1756,12 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
 	do {
 		long nrpages = vma_pages(vma);
 
-		if (vma->vm_flags & VM_EXEC) {
-			perf_counter_munmap(vma->vm_start,
-					nrpages << PAGE_SHIFT,
-					vma->vm_pgoff, vma->vm_file);
-		}
-
 		mm->total_vm -= nrpages;
 		vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
 		vma = remove_vma(vma);
-- 
GitLab


From 62fc44536c14b5787531bac7417580fca54c88b4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 4 Jun 2009 16:53:49 +0200
Subject: [PATCH 1600/2198] perf_counter tools: Use fork and remove munmap
 events

Use fork events to clone comm and map data and remove everything
munmap related

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 103 ++++++++++++++++++--
 Documentation/perf_counter/builtin-top.c    |  21 ----
 2 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 389ae2569f499..5d191216c80a2 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -43,12 +43,6 @@ static int		full_paths;
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
 
-const char *perf_event_names[] = {
-	[PERF_EVENT_MMAP]   = " PERF_EVENT_MMAP",
-	[PERF_EVENT_MUNMAP] = " PERF_EVENT_MUNMAP",
-	[PERF_EVENT_COMM]   = " PERF_EVENT_COMM",
-};
-
 struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
@@ -70,11 +64,17 @@ struct comm_event {
 	char comm[16];
 };
 
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
 typedef union event_union {
 	struct perf_event_header header;
 	struct ip_event ip;
 	struct mmap_event mmap;
 	struct comm_event comm;
+	struct fork_event fork;
 } event_t;
 
 static LIST_HEAD(dsos);
@@ -208,7 +208,31 @@ static struct map *map__new(struct mmap_event *event)
 	return NULL;
 }
 
-struct thread;
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
 
 struct thread {
 	struct rb_node	 rb_node;
@@ -284,9 +308,39 @@ static struct thread *threads__findnew(pid_t pid)
 
 static void thread__insert_map(struct thread *self, struct map *map)
 {
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
 	list_add_tail(&map->node, &self->maps);
 }
 
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
 static struct map *thread__find_map(struct thread *self, uint64_t ip)
 {
 	struct map *pos;
@@ -784,7 +838,11 @@ static void register_idle_thread(void)
 	}
 }
 
-static unsigned long total = 0, total_mmap = 0, total_comm = 0, total_unknown = 0;
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
 
 static int
 process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
@@ -866,9 +924,10 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
 	struct thread *thread = threads__findnew(event->mmap.pid);
 	struct map *map = map__new(&event->mmap);
 
-	dprintf("%p [%p]: PERF_EVENT_MMAP: [%p(%p) @ %p]: %s\n",
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
+		event->mmap.pid,
 		(void *)(long)event->mmap.start,
 		(void *)(long)event->mmap.len,
 		(void *)(long)event->mmap.pgoff,
@@ -905,6 +964,26 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
 static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -918,10 +997,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_COMM:
 		return process_comm_event(event, offset, head);
 
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
 	/*
 	 * We dont process them right now but they are fine:
 	 */
-	case PERF_EVENT_MUNMAP:
+
 	case PERF_EVENT_PERIOD:
 	case PERF_EVENT_THROTTLE:
 	case PERF_EVENT_UNTHROTTLE:
@@ -1038,6 +1120,7 @@ static int __cmd_report(void)
 	dprintf("      IP events: %10ld\n", total);
 	dprintf("    mmap events: %10ld\n", total_mmap);
 	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
 	dprintf(" unknown events: %10ld\n", total_unknown);
 
 	if (dump_trace)
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 20e5b12009594..31c00ba99b141 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -75,8 +75,6 @@ static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			=  16;
-static int			use_mmap			= 0;
-static int			use_munmap			= 0;
 static int			freq				= 0;
 
 static char			*sym_filter;
@@ -527,19 +525,6 @@ static void mmap_read(struct mmap_data *md)
 		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) {
 			if (event->header.type & PERF_SAMPLE_IP)
 				process_event(event->ip.ip, md->counter);
-		} else {
-			switch (event->header.type) {
-				case PERF_EVENT_MMAP:
-				case PERF_EVENT_MUNMAP:
-					printf("%s: %Lu %Lu %Lu %s\n",
-							event->header.type == PERF_EVENT_MMAP
-							? "mmap" : "munmap",
-							event->mmap.start,
-							event->mmap.len,
-							event->mmap.pgoff,
-							event->mmap.filename);
-					break;
-			}
 		}
 	}
 
@@ -569,8 +554,6 @@ static int __cmd_top(void)
 			attr.config		= event_id[counter];
 			attr.sample_period	= event_count[counter];
 			attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr.mmap		= use_mmap;
-			attr.munmap		= use_munmap;
 			attr.freq		= freq;
 
 			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
@@ -670,10 +653,6 @@ static const struct option options[] = {
 		    "only display symbols matchig this pattern"),
 	OPT_BOOLEAN('z', "zero", &group,
 		    "zero history across updates"),
-	OPT_BOOLEAN('M', "use-mmap", &use_mmap,
-		    "track mmap events"),
-	OPT_BOOLEAN('U', "use-munmap", &use_munmap,
-		    "track munmap events"),
 	OPT_INTEGER('F', "freq", &freq,
 		    "profile at this frequency"),
 	OPT_INTEGER('E', "entries", &print_entries,
-- 
GitLab


From 0f5486b5c71a831a713ce356d8d06822e3c7c379 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Jun 2009 20:48:04 +0200
Subject: [PATCH 1601/2198] perf_counter: Sleep before refresh using poll in
 perf top

perf top is refreshed every delay_secs the thread runs in such
loop:

while (sleep(delay_secs)) {
	print_sym_table();
}

At the end of print_sym_table(), poll is used without sleep delay
to check if we have something from stdin.

It means that this check is done only every delay_secs, which can
be higher that 2 secs if the user defined a custom refresh rate.

We can drop sleep() here and directly use poll to wait between
refresh periods, so that the reaction after the user stops perf top
after typing "Enter" is immediate and doesn't suffer from the
delay_secs latency.

Nb: poll doesn't add any overhead that can parasite perf top measures
since it sleeps the entire timeout here.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1244141284-7507-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 31c00ba99b141..28cbde4b6e83d 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -204,7 +204,7 @@ static void print_sym_table(void)
 			list_remove_active_sym(syme);
 	}
 
-	write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR));
+	puts(CONSOLE_CLEAR);
 
 	printf(
 "------------------------------------------------------------------------------\n");
@@ -278,23 +278,21 @@ static void print_sym_table(void)
 		color_fprintf(stdout, color, "%4.1f%%", pcnt);
 		printf(" - %016llx : %s\n", sym->start, sym->name);
 	}
-
-	{
-		struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
-
-		if (poll(&stdin_poll, 1, 0) == 1) {
-			printf("key pressed - exiting.\n");
-			exit(0);
-		}
-	}
 }
 
 static void *display_thread(void *arg)
 {
+	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
+	int delay_msecs = delay_secs * 1000;
+
 	printf("PerfTop refresh period: %d seconds\n", delay_secs);
 
-	while (!sleep(delay_secs))
+	do {
 		print_sym_table();
+	} while (!poll(&stdin_poll, 1, delay_msecs) == 1);
+
+	printf("key pressed - exiting.\n");
+	exit(0);
 
 	return NULL;
 }
-- 
GitLab


From 9ac995457b2a148ed9bb8860e8b7cb869327b102 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 4 Jun 2009 13:54:00 -0300
Subject: [PATCH 1602/2198] perf report: Add -vvv to print the list of threads
 and its mmaps

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 36 +++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 5d191216c80a2..1a1028d3bc347 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -234,6 +234,13 @@ static int map__overlap(struct map *l, struct map *r)
 	return 0;
 }
 
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %lx-%lx %lx %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
 struct thread {
 	struct rb_node	 rb_node;
 	struct list_head maps;
@@ -264,6 +271,18 @@ static int thread__set_comm(struct thread *self, const char *comm)
 	return self->comm ? 0 : -ENOMEM;
 }
 
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
 static struct rb_root threads;
 static struct thread *last_match;
 
@@ -355,6 +374,20 @@ static struct map *thread__find_map(struct thread *self, uint64_t ip)
 	return NULL;
 }
 
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
 /*
  * histogram, sorted on item, collects counts
  */
@@ -1126,6 +1159,9 @@ static int __cmd_report(void)
 	if (dump_trace)
 		return 0;
 
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
 	if (verbose >= 2)
 		dsos__fprintf(stdout);
 
-- 
GitLab


From 172124e220f1854acc99ee394671781b8b5e2120 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 4 Jun 2009 22:34:44 +0200
Subject: [PATCH 1603/2198] Revert "block: implement blkdev_readpages"

This reverts commit db2dbb12dc47a50c7a4c5678f526014063e486f6.

It apparently causes problems with partition table read-ahead
on archs with large page sizes. Until that problem is diagnosed
further, just drop the readpages support on block devices.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/block_dev.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index a29b4dcc1bca2..2dfc6cdcebbe0 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -331,12 +331,6 @@ static int blkdev_readpage(struct file * file, struct page * page)
 	return block_read_full_page(page, blkdev_get_block);
 }
 
-static int blkdev_readpages(struct file *file, struct address_space *mapping,
-			struct list_head *pages, unsigned nr_pages)
-{
-	return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block);
-}
-
 static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
@@ -1405,7 +1399,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
-	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.sync_page	= block_sync_page,
 	.write_begin	= blkdev_write_begin,
-- 
GitLab


From 76a0f40fd6eff1bce3b91925cea7587b3399fe80 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 4 Jun 2009 22:15:58 +0200
Subject: [PATCH 1604/2198] perf_counter tools: Fix warn_unused_result warnings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fix warnings for return values that we don't care about:

 util/quote.c:222: attention : ignoring return value of ‘fwrite’, declared with attribute warn_unused_result
 util/quote.c:235: attention : ignoring return value of ‘fwrite’, declared with attribute warn_unused_result
 util/quote.c: In function ‘write_name_quotedpfx’:
 util/quote.c:290: attention : ignoring return value of ‘fwrite’, declared with attribute warn_unused_result

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1244146558-8635-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/util/quote.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/util/quote.c b/Documentation/perf_counter/util/quote.c
index 7a49fcf696716..f18c5212bc92b 100644
--- a/Documentation/perf_counter/util/quote.c
+++ b/Documentation/perf_counter/util/quote.c
@@ -201,8 +201,9 @@ static size_t quote_c_style_counted(const char *name, ssize_t maxlen,
 	} while (0)
 #define EMITBUF(s, l)                           \
 	do {                                        \
+		int __ret;				\
 		if (sb) strbuf_add(sb, (s), (l));       \
-		if (fp) fwrite((s), (l), 1, fp);        \
+		if (fp) __ret = fwrite((s), (l), 1, fp);        \
 		count += (l);                           \
 	} while (0)
 
@@ -287,7 +288,9 @@ extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
 		quote_c_style(name, NULL, fp, 1);
 		fputc('"', fp);
 	} else {
-		fwrite(pfx, pfxlen, 1, fp);
+		int ret;
+
+		ret = fwrite(pfx, pfxlen, 1, fp);
 		fputs(name, fp);
 	}
 	fputc(terminator, fp);
-- 
GitLab


From 03f5d8bcf094a5e3b501bd2ae1553656efa8d1be Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Tue, 9 Jun 2009 00:17:05 -0400
Subject: [PATCH 1605/2198] ext4: Get rid of EXTEND_DISKSIZE flag of
 ext4_get_blocks_handle()

Get rid of EXTEND_DISKSIZE flag of ext4_get_blocks_handle(). This
seems to be a relict from some old days and setting disksize in this
function does not make much sense.  Currently it was set only by
ext4_getblk().  Since the parameter has some effect only if create ==
1, it is easy to check by grepping through the sources that the three
callers which end up calling ext4_getblk() with create == 1
(ext4_append, ext4_quota_write, ext4_mkdir) do the right thing and set
disksize themselves.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h    |  6 ++----
 fs/ext4/extents.c |  9 ---------
 fs/ext4/inode.c   | 23 +++--------------------
 3 files changed, 5 insertions(+), 33 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 4311cc85b5340..59657ff7b8f42 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -323,15 +323,13 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_UNINIT_EXT		0x0002
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT	(EXT4_GET_BLOCKS_UNINIT_EXT|\
 						 EXT4_GET_BLOCKS_CREATE)
-	/* Update the ext4_inode_info i_disksize field */
-#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE		0x0004
 	/* Caller is from the delayed allocation writeout path,
 	   so set the magic i_delalloc_reserve_flag after taking the 
 	   inode allocation semaphore for */
-#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0008
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE	0x0004
 	/* Call ext4_da_update_reserve_space() after successfully 
 	   allocating the blocks */
-#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0010
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE	0x0008
 
 
 /*
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index d4e99e96fddb2..9c35a7b1f0ae7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2779,7 +2779,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	int err = 0, depth, ret, cache_type;
 	unsigned int allocated = 0;
 	struct ext4_allocation_request ar;
-	loff_t disksize;
 
 	__clear_bit(BH_New, &bh_result->b_state);
 	ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2969,14 +2968,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
 	newblock = ext_pblock(&newex);
 	allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-	if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) {
-		disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > EXT4_I(inode)->i_disksize)
-			EXT4_I(inode)->i_disksize = disksize;
-	}
-
 	set_buffer_new(bh_result);
 
 	/* Cache only when it is _not_ an uninitialized extent */
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 14c00fff37137..17ed0d244dbbc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -933,11 +933,8 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	int indirect_blks;
 	int blocks_to_boundary = 0;
 	int depth;
-	struct ext4_inode_info *ei = EXT4_I(inode);
 	int count = 0;
 	ext4_fsblk_t first_block = 0;
-	loff_t disksize;
-
 
 	J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
 	J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
@@ -1003,19 +1000,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
 	if (!err)
 		err = ext4_splice_branch(handle, inode, iblock,
 					partial, indirect_blks, count);
-	/*
-	 * i_disksize growing is protected by i_data_sem.  Don't forget to
-	 * protect it if you're about to implement concurrent
-	 * ext4_get_block() -bzzz
-	*/
-	if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) {
-		disksize = ((loff_t) iblock + count) << inode->i_blkbits;
-		if (disksize > i_size_read(inode))
-			disksize = i_size_read(inode);
-		if (disksize > ei->i_disksize)
-			ei->i_disksize = disksize;
-	}
-	if (err)
+	else 
 		goto cleanup;
 
 	set_buffer_new(bh_result);
@@ -1321,7 +1306,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 {
 	struct buffer_head dummy;
 	int fatal = 0, err;
-	int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE;
+	int flags = 0;
 
 	J_ASSERT(handle != NULL || create == 0);
 
@@ -2153,9 +2138,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 	}
 
 	/*
-	 * Update on-disk size along with block allocation we don't
-	 * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change
-	 * within already allocated block -bzzz
+	 * Update on-disk size along with block allocation.
 	 */
 	disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
 	if (disksize > i_size_read(mpd->inode))
-- 
GitLab


From b31e15527a9bb71b6a11a425d17ce139a62f5af5 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@redhat.com>
Date: Thu, 4 Jun 2009 17:36:36 -0400
Subject: [PATCH 1606/2198] ext4: Change all super.c messages to print the
 device

This patch changes ext4 super.c to include the device name with all
warning/error messages, by using a new utility function ext4_msg.
It's a rather large patch, but very mechanic. I left debug printks
alone.

This is a straightforward port of a patch which Andi Kleen did for
ext3.

Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/ext4.h  |   2 +
 fs/ext4/super.c | 462 ++++++++++++++++++++++++------------------------
 2 files changed, 235 insertions(+), 229 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 59657ff7b8f42..cc7d5edc38c90 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1401,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
 	__attribute__ ((format (printf, 3, 4)));
+extern void ext4_msg(struct super_block *, const char *, const char *, ...)
+	__attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
 				const char *, const char *, ...)
 	__attribute__ ((format (printf, 4, 5)));
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0a97b1ad3e19b..c191d0f65fedc 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -306,7 +306,7 @@ static void ext4_handle_error(struct super_block *sb)
 			jbd2_journal_abort(journal, -EIO);
 	}
 	if (test_opt(sb, ERRORS_RO)) {
-		printk(KERN_CRIT "Remounting filesystem read-only\n");
+		ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 		sb->s_flags |= MS_RDONLY;
 	}
 	ext4_commit_super(sb, 1);
@@ -399,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function,
 {
 	va_list args;
 
-	printk(KERN_CRIT "ext4_abort called.\n");
-
 	va_start(args, fmt);
 	printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
 	vprintk(fmt, args);
@@ -413,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function,
 	if (sb->s_flags & MS_RDONLY)
 		return;
 
-	printk(KERN_CRIT "Remounting filesystem read-only\n");
+	ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
 	EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 	sb->s_flags |= MS_RDONLY;
 	EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
@@ -421,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function,
 		jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
+void ext4_msg (struct super_block * sb, const char *prefix,
+		   const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+	printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+
 void ext4_warning(struct super_block *sb, const char *function,
 		  const char *fmt, ...)
 {
@@ -499,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 /*
  * Open the external journal device
  */
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 {
 	struct block_device *bdev;
 	char b[BDEVNAME_SIZE];
@@ -510,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
 	return bdev;
 
 fail:
-	printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+	ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
 			__bdevname(dev, b), PTR_ERR(bdev));
 	return NULL;
 }
@@ -546,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 {
 	struct list_head *l;
 
-	printk(KERN_ERR "sb orphan head is %d\n",
-	       le32_to_cpu(sbi->s_es->s_last_orphan));
+	ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+		 le32_to_cpu(sbi->s_es->s_last_orphan));
 
 	printk(KERN_ERR "sb_info orphan list:\n");
 	list_for_each(l, &sbi->s_orphan) {
@@ -678,8 +688,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
 static void ext4_destroy_inode(struct inode *inode)
 {
 	if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
-		printk("EXT4 Inode %p: orphan list check failed!\n",
-			EXT4_I(inode));
+		ext4_msg(inode->i_sb, KERN_ERR,
+			 "Inode %lu (%p): orphan list check failed!",
+			 inode->i_ino, EXT4_I(inode));
 		print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
 				EXT4_I(inode), sizeof(struct ext4_inode_info),
 				true);
@@ -1239,8 +1250,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_user_xattr:
 		case Opt_nouser_xattr:
-			printk(KERN_ERR "EXT4 (no)user_xattr options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
 			break;
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1253,8 +1263,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
 		case Opt_acl:
 		case Opt_noacl:
-			printk(KERN_ERR "EXT4 (no)acl options "
-			       "not supported\n");
+			ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
 			break;
 #endif
 		case Opt_journal_update:
@@ -1264,16 +1273,16 @@ static int parse_options(char *options, struct super_block *sb,
 			   user to specify an existing inode to be the
 			   journal file. */
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
+					 "Cannot specify journal on remount");
 				return 0;
 			}
 			set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
 			break;
 		case Opt_journal_dev:
 			if (is_remount) {
-				printk(KERN_ERR "EXT4-fs: cannot specify "
-				       "journal on remount\n");
+				ext4_msg(sb, KERN_ERR,
+					"Cannot specify journal on remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option))
@@ -1327,9 +1336,8 @@ static int parse_options(char *options, struct super_block *sb,
 			if (is_remount) {
 				if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
 						!= data_opt) {
-					printk(KERN_ERR
-						"EXT4-fs: cannot change data "
-						"mode on remount\n");
+					ext4_msg(sb, KERN_ERR,
+						"Cannot change data mode on remount");
 					return 0;
 				}
 			} else {
@@ -1359,31 +1367,31 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    !sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR
-				       "EXT4-fs: Cannot change journaled "
-				       "quota options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR,
+				       "Cannot change journaled "
+				       "quota options when quota turned on");
 				return 0;
 			}
 			qname = match_strdup(&args[0]);
 			if (!qname) {
-				printk(KERN_ERR
-					"EXT4-fs: not enough memory for "
-					"storing quotafile name.\n");
+				ext4_msg(sb, KERN_ERR,
+					"Not enough memory for "
+					"storing quotafile name");
 				return 0;
 			}
 			if (sbi->s_qf_names[qtype] &&
 			    strcmp(sbi->s_qf_names[qtype], qname)) {
-				printk(KERN_ERR
-					"EXT4-fs: %s quota file already "
-					"specified.\n", QTYPE2NAME(qtype));
+				ext4_msg(sb, KERN_ERR,
+					"%s quota file already "
+					"specified", QTYPE2NAME(qtype));
 				kfree(qname);
 				return 0;
 			}
 			sbi->s_qf_names[qtype] = qname;
 			if (strchr(sbi->s_qf_names[qtype], '/')) {
-				printk(KERN_ERR
-					"EXT4-fs: quotafile must be on "
-					"filesystem root.\n");
+				ext4_msg(sb, KERN_ERR,
+					"quotafile must be on "
+					"filesystem root");
 				kfree(sbi->s_qf_names[qtype]);
 				sbi->s_qf_names[qtype] = NULL;
 				return 0;
@@ -1398,9 +1406,9 @@ static int parse_options(char *options, struct super_block *sb,
 clear_qf_name:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_qf_names[qtype]) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			/*
@@ -1417,9 +1425,9 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_format:
 			if (sb_any_quota_loaded(sb) &&
 			    sbi->s_jquota_fmt != qfmt) {
-				printk(KERN_ERR "EXT4-fs: Cannot change "
+				ext4_msg(sb, KERN_ERR, "Cannot change "
 					"journaled quota options when "
-					"quota turned on.\n");
+					"quota turned on");
 				return 0;
 			}
 			sbi->s_jquota_fmt = qfmt;
@@ -1435,8 +1443,8 @@ static int parse_options(char *options, struct super_block *sb,
 			break;
 		case Opt_noquota:
 			if (sb_any_quota_loaded(sb)) {
-				printk(KERN_ERR "EXT4-fs: Cannot change quota "
-					"options when quota turned on.\n");
+				ext4_msg(sb, KERN_ERR, "Cannot change quota "
+					"options when quota turned on");
 				return 0;
 			}
 			clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1447,8 +1455,8 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_quota:
 		case Opt_usrquota:
 		case Opt_grpquota:
-			printk(KERN_ERR
-				"EXT4-fs: quota options not supported.\n");
+			ext4_msg(sb, KERN_ERR,
+				"quota options not supported");
 			break;
 		case Opt_usrjquota:
 		case Opt_grpjquota:
@@ -1456,9 +1464,8 @@ static int parse_options(char *options, struct super_block *sb,
 		case Opt_offgrpjquota:
 		case Opt_jqfmt_vfsold:
 		case Opt_jqfmt_vfsv0:
-			printk(KERN_ERR
-				"EXT4-fs: journaled quota options not "
-				"supported.\n");
+			ext4_msg(sb, KERN_ERR,
+				"journaled quota options not supported");
 			break;
 		case Opt_noquota:
 			break;
@@ -1483,8 +1490,9 @@ static int parse_options(char *options, struct super_block *sb,
 			break;
 		case Opt_resize:
 			if (!is_remount) {
-				printk("EXT4-fs: resize option only available "
-					"for remount\n");
+				ext4_msg(sb, KERN_ERR,
+					"resize option only available "
+					"for remount");
 				return 0;
 			}
 			if (match_int(&args[0], &option) != 0)
@@ -1526,8 +1534,9 @@ static int parse_options(char *options, struct super_block *sb,
 			if (option < 0 || option > (1 << 30))
 				return 0;
 			if (!is_power_of_2(option)) {
-				printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
-				       " must be a power of 2\n");
+				ext4_msg(sb, KERN_ERR,
+					 "EXT4-fs: inode_readahead_blks"
+					 " must be a power of 2");
 				return 0;
 			}
 			sbi->s_inode_readahead_blks = option;
@@ -1554,9 +1563,9 @@ static int parse_options(char *options, struct super_block *sb,
 				set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
 			break;
 		default:
-			printk(KERN_ERR
-			       "EXT4-fs: Unrecognized mount option \"%s\" "
-			       "or missing value\n", p);
+			ext4_msg(sb, KERN_ERR,
+			       "Unrecognized mount option \"%s\" "
+			       "or missing value", p);
 			return 0;
 		}
 	}
@@ -1574,21 +1583,21 @@ static int parse_options(char *options, struct super_block *sb,
 				(sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
 		    (sbi->s_qf_names[GRPQUOTA] &&
 				(sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
-			printk(KERN_ERR "EXT4-fs: old and new quota "
-					"format mixing.\n");
+			ext4_msg(sb, KERN_ERR, "old and new quota "
+					"format mixing");
 			return 0;
 		}
 
 		if (!sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
-					"not specified.\n");
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
+					"not specified");
 			return 0;
 		}
 	} else {
 		if (sbi->s_jquota_fmt) {
-			printk(KERN_ERR "EXT4-fs: journaled quota format "
+			ext4_msg(sb, KERN_ERR, "journaled quota format "
 					"specified with no journaling "
-					"enabled.\n");
+					"enabled");
 			return 0;
 		}
 	}
@@ -1603,31 +1612,31 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 	int res = 0;
 
 	if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-		printk(KERN_ERR "EXT4-fs warning: revision level too high, "
-		       "forcing read-only mode\n");
+		ext4_msg(sb, KERN_ERR, "revision level too high, "
+			 "forcing read-only mode");
 		res = MS_RDONLY;
 	}
 	if (read_only)
 		return res;
 	if (!(sbi->s_mount_state & EXT4_VALID_FS))
-		printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+			 "running e2fsck is recommended");
 	else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: mounting fs with errors, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: mounting fs with errors, "
+			 "running e2fsck is recommended");
 	else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
 		 le16_to_cpu(es->s_mnt_count) >=
 		 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: maximal mount count reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: maximal mount count reached, "
+			 "running e2fsck is recommended");
 	else if (le32_to_cpu(es->s_checkinterval) &&
 		(le32_to_cpu(es->s_lastcheck) +
 			le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: checktime reached, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+			 "warning: checktime reached, "
+			 "running e2fsck is recommended");
 	if (!sbi->s_journal)
 		es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
 	if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
@@ -1649,11 +1658,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			sbi->s_mount_opt);
 
 	if (EXT4_SB(sb)->s_journal) {
-		printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-		       sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+		ext4_msg(sb, KERN_INFO, "%s journal on %s",
+		       EXT4_SB(sb)->s_journal->j_inode ? "internal" :
 		       "external", EXT4_SB(sb)->s_journal->j_devname);
 	} else {
-		printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+		ext4_msg(sb, KERN_INFO, "no journal");
 	}
 	return res;
 }
@@ -1688,8 +1697,8 @@ static int ext4_fill_flex_info(struct super_block *sb)
 			memset(sbi->s_flex_groups, 0, size);
 	}
 	if (sbi->s_flex_groups == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory for "
-				"%u flex groups\n", flex_group_count);
+		ext4_msg(sb, KERN_ERR, "not enough memory for "
+				"%u flex groups", flex_group_count);
 		goto failed;
 	}
 
@@ -1775,32 +1784,32 @@ static int ext4_check_descriptors(struct super_block *sb)
 
 		block_bitmap = ext4_block_bitmap(sb, gdp);
 		if (block_bitmap < first_block || block_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Block bitmap for group %u not in group "
-			       "(block %llu)!\n", i, block_bitmap);
+			       "(block %llu)!", i, block_bitmap);
 			return 0;
 		}
 		inode_bitmap = ext4_inode_bitmap(sb, gdp);
 		if (inode_bitmap < first_block || inode_bitmap > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode bitmap for group %u not in group "
-			       "(block %llu)!\n", i, inode_bitmap);
+			       "(block %llu)!", i, inode_bitmap);
 			return 0;
 		}
 		inode_table = ext4_inode_table(sb, gdp);
 		if (inode_table < first_block ||
 		    inode_table + sbi->s_itb_per_group - 1 > last_block) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
 			       "Inode table for group %u not in group "
-			       "(block %llu)!\n", i, inode_table);
+			       "(block %llu)!", i, inode_table);
 			return 0;
 		}
 		ext4_lock_group(sb, i);
 		if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
-			printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-			       "Checksum for group %u failed (%u!=%u)\n",
-			       i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
-			       gdp)), le16_to_cpu(gdp->bg_checksum));
+			ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+				 "Checksum for group %u failed (%u!=%u)",
+				 i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+				     gdp)), le16_to_cpu(gdp->bg_checksum));
 			if (!(sb->s_flags & MS_RDONLY)) {
 				ext4_unlock_group(sb, i);
 				return 0;
@@ -1847,8 +1856,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (bdev_read_only(sb->s_bdev)) {
-		printk(KERN_ERR "EXT4-fs: write access "
-			"unavailable, skipping orphan cleanup.\n");
+		ext4_msg(sb, KERN_ERR, "write access "
+			"unavailable, skipping orphan cleanup");
 		return;
 	}
 
@@ -1862,8 +1871,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	}
 
 	if (s_flags & MS_RDONLY) {
-		printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
-		       sb->s_id);
+		ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
 		sb->s_flags &= ~MS_RDONLY;
 	}
 #ifdef CONFIG_QUOTA
@@ -1874,9 +1882,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		if (EXT4_SB(sb)->s_qf_names[i]) {
 			int ret = ext4_quota_on_mount(sb, i);
 			if (ret < 0)
-				printk(KERN_ERR
-					"EXT4-fs: Cannot turn on journaled "
-					"quota: error %d\n", ret);
+				ext4_msg(sb, KERN_ERR,
+					"Cannot turn on journaled "
+					"quota: error %d", ret);
 		}
 	}
 #endif
@@ -1893,16 +1901,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 		list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
 		vfs_dq_init(inode);
 		if (inode->i_nlink) {
-			printk(KERN_DEBUG
-				"%s: truncating inode %lu to %lld bytes\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: truncating inode %lu to %lld bytes",
 				__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
 			ext4_truncate(inode);
 			nr_truncates++;
 		} else {
-			printk(KERN_DEBUG
-				"%s: deleting unreferenced inode %lu\n",
+			ext4_msg(sb, KERN_DEBUG,
+				"%s: deleting unreferenced inode %lu",
 				__func__, inode->i_ino);
 			jbd_debug(2, "deleting unreferenced inode %lu\n",
 				  inode->i_ino);
@@ -1914,11 +1922,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
 
 	if (nr_orphans)
-		printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
-		       sb->s_id, PLURAL(nr_orphans));
+		ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+		       PLURAL(nr_orphans));
 	if (nr_truncates)
-		printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
-		       sb->s_id, PLURAL(nr_truncates));
+		ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+		       PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
@@ -2307,7 +2315,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
 	if (!blocksize) {
-		printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+		ext4_msg(sb, KERN_ERR, "unable to set blocksize");
 		goto out_fail;
 	}
 
@@ -2323,7 +2331,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (!(bh = sb_bread(sb, logical_sb_block))) {
-		printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+		ext4_msg(sb, KERN_ERR, "unable to read superblock");
 		goto out_fail;
 	}
 	/*
@@ -2393,9 +2401,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	    (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
 	     EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-		printk(KERN_WARNING
-		       "EXT4-fs warning: feature flags set on rev 0 fs, "
-		       "running e2fsck is recommended\n");
+		ext4_msg(sb, KERN_WARNING,
+		       "feature flags set on rev 0 fs, "
+		       "running e2fsck is recommended");
 
 	/*
 	 * Check feature flags regardless of the revision level, since we
@@ -2404,16 +2412,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 */
 	features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
 	if (features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount because of "
+			"unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
 			~EXT4_FEATURE_INCOMPAT_SUPP));
 		goto failed_mount;
 	}
 	features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
 	if (!(sb->s_flags & MS_RDONLY) && features) {
-		printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-		       "unsupported optional features (%x).\n", sb->s_id,
+		ext4_msg(sb, KERN_ERR,
+			"Couldn't mount RDWR because of "
+			"unsupported optional features (%x)",
 			(le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
 			~EXT4_FEATURE_RO_COMPAT_SUPP));
 		goto failed_mount;
@@ -2427,9 +2437,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
-			printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LBD.\n", sb->s_id);
+					"without CONFIG_LBD");
 			goto failed_mount;
 		}
 	}
@@ -2437,16 +2447,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (blocksize < EXT4_MIN_BLOCK_SIZE ||
 	    blocksize > EXT4_MAX_BLOCK_SIZE) {
-		printk(KERN_ERR
-		       "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
-		       blocksize, sb->s_id);
+		ext4_msg(sb, KERN_ERR,
+		       "Unsupported filesystem blocksize %d", blocksize);
 		goto failed_mount;
 	}
 
 	if (sb->s_blocksize != blocksize) {
 		/* Validate the filesystem blocksize */
 		if (!sb_set_blocksize(sb, blocksize)) {
-			printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+			ext4_msg(sb, KERN_ERR, "bad block size %d",
 					blocksize);
 			goto failed_mount;
 		}
@@ -2456,15 +2465,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		offset = do_div(logical_sb_block, blocksize);
 		bh = sb_bread(sb, logical_sb_block);
 		if (!bh) {
-			printk(KERN_ERR
-			       "EXT4-fs: Can't read superblock on 2nd try.\n");
+			ext4_msg(sb, KERN_ERR,
+			       "Can't read superblock on 2nd try");
 			goto failed_mount;
 		}
 		es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
 		sbi->s_es = es;
 		if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-			printk(KERN_ERR
-			       "EXT4-fs: Magic mismatch, very weird !\n");
+			ext4_msg(sb, KERN_ERR,
+			       "Magic mismatch, very weird!");
 			goto failed_mount;
 		}
 	}
@@ -2482,8 +2491,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
 		    (!is_power_of_2(sbi->s_inode_size)) ||
 		    (sbi->s_inode_size > blocksize)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unsupported inode size: %d\n",
+			ext4_msg(sb, KERN_ERR,
+			       "unsupported inode size: %d",
 			       sbi->s_inode_size);
 			goto failed_mount;
 		}
@@ -2496,8 +2505,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
 		    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
 		    !is_power_of_2(sbi->s_desc_size)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unsupported descriptor size %lu\n",
+			ext4_msg(sb, KERN_ERR,
+			       "unsupported descriptor size %lu",
 			       sbi->s_desc_size);
 			goto failed_mount;
 		}
@@ -2537,25 +2546,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
-		printk(KERN_ERR
-		       "EXT4-fs: #blocks per group too big: %lu\n",
+		ext4_msg(sb, KERN_ERR,
+		       "#blocks per group too big: %lu",
 		       sbi->s_blocks_per_group);
 		goto failed_mount;
 	}
 	if (sbi->s_inodes_per_group > blocksize * 8) {
-		printk(KERN_ERR
-		       "EXT4-fs: #inodes per group too big: %lu\n",
+		ext4_msg(sb, KERN_ERR,
+		       "#inodes per group too big: %lu",
 		       sbi->s_inodes_per_group);
 		goto failed_mount;
 	}
 
 	if (ext4_blocks_count(es) >
 		    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-		printk(KERN_ERR "EXT4-fs: filesystem on %s:"
-			" too large to mount safely\n", sb->s_id);
+		ext4_msg(sb, KERN_ERR, "filesystem"
+			" too large to mount safely");
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
-					"enabled\n");
+			ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
 		goto failed_mount;
 	}
 
@@ -2565,8 +2573,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	/* check blocks count against device size */
 	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
 	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
-		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
-		       "exceeds size of device (%llu blocks)\n",
+		ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+		       "exceeds size of device (%llu blocks)",
 		       ext4_blocks_count(es), blocks_count);
 		goto failed_mount;
 	}
@@ -2576,10 +2584,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	 * of the filesystem.
 	 */
 	if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-		printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
-		       "block %u is beyond end of filesystem (%llu)\n",
-		       le32_to_cpu(es->s_first_data_block),
-		       ext4_blocks_count(es));
+                ext4_msg(sb, KERN_WARNING, "bad geometry: first data"
+			 "block %u is beyond end of filesystem (%llu)",
+			 le32_to_cpu(es->s_first_data_block),
+			 ext4_blocks_count(es));
 		goto failed_mount;
 	}
 	blocks_count = (ext4_blocks_count(es) -
@@ -2587,9 +2595,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			EXT4_BLOCKS_PER_GROUP(sb) - 1);
 	do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
 	if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
-		printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+		ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
 		       "(block count %llu, first data block %u, "
-		       "blocks per group %lu)\n", sbi->s_groups_count,
+		       "blocks per group %lu)", sbi->s_groups_count,
 		       ext4_blocks_count(es),
 		       le32_to_cpu(es->s_first_data_block),
 		       EXT4_BLOCKS_PER_GROUP(sb));
@@ -2601,7 +2609,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
 				    GFP_KERNEL);
 	if (sbi->s_group_desc == NULL) {
-		printk(KERN_ERR "EXT4-fs: not enough memory\n");
+		ext4_msg(sb, KERN_ERR, "not enough memory");
 		goto failed_mount;
 	}
 
@@ -2616,21 +2624,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		block = descriptor_loc(sb, logical_sb_block, i);
 		sbi->s_group_desc[i] = sb_bread(sb, block);
 		if (!sbi->s_group_desc[i]) {
-			printk(KERN_ERR "EXT4-fs: "
-			       "can't read group descriptor %d\n", i);
+			ext4_msg(sb, KERN_ERR,
+			       "can't read group descriptor %d", i);
 			db_count = i;
 			goto failed_mount2;
 		}
 	}
 	if (!ext4_check_descriptors(sb)) {
-		printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+		ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
 		goto failed_mount2;
 	}
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
 		if (!ext4_fill_flex_info(sb)) {
-			printk(KERN_ERR
-			       "EXT4-fs: unable to initialize "
-			       "flex_bg meta info!\n");
+			ext4_msg(sb, KERN_ERR,
+			       "unable to initialize "
+			       "flex_bg meta info!");
 			goto failed_mount2;
 		}
 
@@ -2652,7 +2660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
 	}
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount3;
 	}
 
@@ -2692,13 +2700,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 			goto failed_mount3;
 		if (!(sb->s_flags & MS_RDONLY) &&
 		    EXT4_SB(sb)->s_journal->j_failed_commit) {
-			printk(KERN_CRIT "EXT4-fs error (device %s): "
+			ext4_msg(sb, KERN_CRIT, "error: "
 			       "ext4_fill_super: Journal transaction "
-			       "%u is corrupt\n", sb->s_id,
+			       "%u is corrupt",
 			       EXT4_SB(sb)->s_journal->j_failed_commit);
 			if (test_opt(sb, ERRORS_RO)) {
-				printk(KERN_CRIT
-				       "Mounting filesystem read-only\n");
+				ext4_msg(sb, KERN_CRIT,
+				       "Mounting filesystem read-only");
 				sb->s_flags |= MS_RDONLY;
 				EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
 				es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2712,8 +2720,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		}
 	} else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
 	      EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
-		printk(KERN_ERR "EXT4-fs: required journal recovery "
-		       "suppressed and not mounted read-only\n");
+		ext4_msg(sb, KERN_ERR, "required journal recovery "
+		       "suppressed and not mounted read-only");
 		goto failed_mount4;
 	} else {
 		clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2726,7 +2734,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (ext4_blocks_count(es) > 0xffffffffULL &&
 	    !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
 				       JBD2_FEATURE_INCOMPAT_64BIT)) {
-		printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
+		ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
 		goto failed_mount4;
 	}
 
@@ -2764,8 +2772,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	case EXT4_MOUNT_WRITEBACK_DATA:
 		if (!jbd2_journal_check_available_features
 		    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
-			printk(KERN_ERR "EXT4-fs: Journal does not support "
-			       "requested data journaling mode\n");
+			ext4_msg(sb, KERN_ERR, "Journal does not support "
+			       "requested data journaling mode");
 			goto failed_mount4;
 		}
 	default:
@@ -2777,8 +2785,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-			printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
-				"its supported only with writeback mode\n");
+			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
+				"its supported only with writeback mode");
 			clear_opt(sbi->s_mount_opt, NOBH);
 		}
 	}
@@ -2789,18 +2797,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
 	root = ext4_iget(sb, EXT4_ROOT_INO);
 	if (IS_ERR(root)) {
-		printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+		ext4_msg(sb, KERN_ERR, "get root inode failed");
 		ret = PTR_ERR(root);
 		goto failed_mount4;
 	}
 	if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
 		iput(root);
-		printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+		ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
 		goto failed_mount4;
 	}
 	sb->s_root = d_alloc_root(root);
 	if (!sb->s_root) {
-		printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
+		ext4_msg(sb, KERN_ERR, "get root dentry failed");
 		iput(root);
 		ret = -ENOMEM;
 		goto failed_mount4;
@@ -2829,29 +2837,29 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 							sbi->s_inode_size) {
 		sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
 						       EXT4_GOOD_OLD_INODE_SIZE;
-		printk(KERN_INFO "EXT4-fs: required extra inode space not"
-			"available.\n");
+		ext4_msg(sb, KERN_INFO, "required extra inode space not"
+			 "available");
 	}
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-		printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-				"requested data journaling mode\n");
+		ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
+			 "requested data journaling mode");
 		clear_opt(sbi->s_mount_opt, DELALLOC);
 	} else if (test_opt(sb, DELALLOC))
-		printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+		ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
 
 	err = ext4_setup_system_zone(sb);
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: failed to initialize system "
-		       "zone (%d)\n", err);
+		ext4_msg(sb, KERN_ERR, "failed to initialize system "
+			 "zone (%d)\n", err);
 		goto failed_mount4;
 	}
 
 	ext4_ext_init(sb);
 	err = ext4_mb_init(sb, needs_recovery);
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
-		       err);
+		ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+			 err);
 		goto failed_mount4;
 	}
 
@@ -2869,7 +2877,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_orphan_cleanup(sb, es);
 	EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
 	if (needs_recovery) {
-		printk(KERN_INFO "EXT4-fs: recovery complete.\n");
+		ext4_msg(sb, KERN_INFO, "recovery complete");
 		ext4_mark_recovery_complete(sb, es);
 	}
 	if (EXT4_SB(sb)->s_journal) {
@@ -2882,20 +2890,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	} else
 		descr = "out journal";
 
-	printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
-	       sb->s_id, descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
 
 	lock_kernel();
 	return 0;
 
 cantfind_ext4:
 	if (!silent)
-		printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
-		       sb->s_id);
+		ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
 	goto failed_mount;
 
 failed_mount4:
-	printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+	ext4_msg(sb, KERN_ERR, "mount failed");
 	ext4_release_system_zone(sb);
 	if (sbi->s_journal) {
 		jbd2_journal_destroy(sbi->s_journal);
@@ -2973,27 +2979,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 
 	journal_inode = ext4_iget(sb, journal_inum);
 	if (IS_ERR(journal_inode)) {
-		printk(KERN_ERR "EXT4-fs: no journal found.\n");
+		ext4_msg(sb, KERN_ERR, "no journal found");
 		return NULL;
 	}
 	if (!journal_inode->i_nlink) {
 		make_bad_inode(journal_inode);
 		iput(journal_inode);
-		printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+		ext4_msg(sb, KERN_ERR, "journal inode is deleted");
 		return NULL;
 	}
 
 	jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
 		  journal_inode, journal_inode->i_size);
 	if (!S_ISREG(journal_inode->i_mode)) {
-		printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+		ext4_msg(sb, KERN_ERR, "invalid journal inode");
 		iput(journal_inode);
 		return NULL;
 	}
 
 	journal = jbd2_journal_init_inode(journal_inode);
 	if (!journal) {
-		printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+		ext4_msg(sb, KERN_ERR, "Could not load journal inode");
 		iput(journal_inode);
 		return NULL;
 	}
@@ -3017,13 +3023,13 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 
 	BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 
-	bdev = ext4_blkdev_get(j_dev);
+	bdev = ext4_blkdev_get(j_dev, sb);
 	if (bdev == NULL)
 		return NULL;
 
 	if (bd_claim(bdev, sb)) {
-		printk(KERN_ERR
-			"EXT4-fs: failed to claim external journal device.\n");
+		ext4_msg(sb, KERN_ERR,
+			"failed to claim external journal device");
 		blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
 		return NULL;
 	}
@@ -3031,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	blocksize = sb->s_blocksize;
 	hblock = bdev_hardsect_size(bdev);
 	if (blocksize < hblock) {
-		printk(KERN_ERR
-			"EXT4-fs: blocksize too small for journal device.\n");
+		ext4_msg(sb, KERN_ERR,
+			"blocksize too small for journal device");
 		goto out_bdev;
 	}
 
@@ -3040,8 +3046,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	offset = EXT4_MIN_BLOCK_SIZE % blocksize;
 	set_blocksize(bdev, blocksize);
 	if (!(bh = __bread(bdev, sb_block, blocksize))) {
-		printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
-		       "external journal\n");
+		ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+		       "external journal");
 		goto out_bdev;
 	}
 
@@ -3049,14 +3055,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
 	    !(le32_to_cpu(es->s_feature_incompat) &
 	      EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-		printk(KERN_ERR "EXT4-fs: external journal has "
-					"bad superblock\n");
+		ext4_msg(sb, KERN_ERR, "external journal has "
+					"bad superblock");
 		brelse(bh);
 		goto out_bdev;
 	}
 
 	if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-		printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+		ext4_msg(sb, KERN_ERR, "journal UUID does not match");
 		brelse(bh);
 		goto out_bdev;
 	}
@@ -3068,19 +3074,19 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
 					start, len, blocksize);
 	if (!journal) {
-		printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+		ext4_msg(sb, KERN_ERR, "failed to create device journal");
 		goto out_bdev;
 	}
 	journal->j_private = sb;
 	ll_rw_block(READ, 1, &journal->j_sb_buffer);
 	wait_on_buffer(journal->j_sb_buffer);
 	if (!buffer_uptodate(journal->j_sb_buffer)) {
-		printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+		ext4_msg(sb, KERN_ERR, "I/O error on journal device");
 		goto out_journal;
 	}
 	if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-		printk(KERN_ERR "EXT4-fs: External journal has more than one "
-					"user (unsupported) - %d\n",
+		ext4_msg(sb, KERN_ERR, "External journal has more than one "
+					"user (unsupported) - %d",
 			be32_to_cpu(journal->j_superblock->s_nr_users));
 		goto out_journal;
 	}
@@ -3109,8 +3115,8 @@ static int ext4_load_journal(struct super_block *sb,
 
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-		printk(KERN_INFO "EXT4-fs: external journal device major/minor "
-			"numbers have changed\n");
+		ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+			"numbers have changed");
 		journal_dev = new_decode_dev(journal_devnum);
 	} else
 		journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3124,21 +3130,21 @@ static int ext4_load_journal(struct super_block *sb,
 	 */
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
 		if (sb->s_flags & MS_RDONLY) {
-			printk(KERN_INFO "EXT4-fs: INFO: recovery "
-					"required on readonly filesystem.\n");
+			ext4_msg(sb, KERN_INFO, "INFO: recovery "
+					"required on readonly filesystem");
 			if (really_read_only) {
-				printk(KERN_ERR "EXT4-fs: write access "
-					"unavailable, cannot proceed.\n");
+				ext4_msg(sb, KERN_ERR, "write access "
+					"unavailable, cannot proceed");
 				return -EROFS;
 			}
-			printk(KERN_INFO "EXT4-fs: write access will "
-			       "be enabled during recovery.\n");
+			ext4_msg(sb, KERN_INFO, "write access will "
+			       "be enabled during recovery");
 		}
 	}
 
 	if (journal_inum && journal_dev) {
-		printk(KERN_ERR "EXT4-fs: filesystem has both journal "
-		       "and inode journals!\n");
+		ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+		       "and inode journals!");
 		return -EINVAL;
 	}
 
@@ -3151,14 +3157,14 @@ static int ext4_load_journal(struct super_block *sb,
 	}
 
 	if (journal->j_flags & JBD2_BARRIER)
-		printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+		ext4_msg(sb, KERN_INFO, "barriers enabled");
 	else
-		printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+		ext4_msg(sb, KERN_INFO, "barriers disabled");
 
 	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
 		err = jbd2_journal_update_format(journal);
 		if (err)  {
-			printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+			ext4_msg(sb, KERN_ERR, "error updating journal");
 			jbd2_journal_destroy(journal);
 			return err;
 		}
@@ -3170,7 +3176,7 @@ static int ext4_load_journal(struct super_block *sb,
 		err = jbd2_journal_load(journal);
 
 	if (err) {
-		printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+		ext4_msg(sb, KERN_ERR, "error loading journal");
 		jbd2_journal_destroy(journal);
 		return err;
 	}
@@ -3206,8 +3212,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 		 * be remapped.  Nothing we can do but to retry the
 		 * write and hope for the best.
 		 */
-		printk(KERN_ERR "EXT4-fs: previous I/O error to "
-		       "superblock detected for %s.\n", sb->s_id);
+		ext4_msg(sb, KERN_ERR, "previous I/O error to "
+		       "superblock detected");
 		clear_buffer_write_io_error(sbh);
 		set_buffer_uptodate(sbh);
 	}
@@ -3230,8 +3236,8 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 
 		error = buffer_write_io_error(sbh);
 		if (error) {
-			printk(KERN_ERR "EXT4-fs: I/O error while writing "
-			       "superblock for %s.\n", sb->s_id);
+			ext4_msg(sb, KERN_ERR, "I/O error while writing "
+			       "superblock");
 			clear_buffer_write_io_error(sbh);
 			set_buffer_uptodate(sbh);
 		}
@@ -3478,9 +3484,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			int ret;
 			if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
 					~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+				ext4_msg(sb, KERN_WARNING, "couldn't "
 				       "remount RDWR because of unsupported "
-				       "optional features (%x).\n", sb->s_id,
+				       "optional features (%x)",
 				(le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
 					~EXT4_FEATURE_RO_COMPAT_SUPP));
 				err = -EROFS;
@@ -3496,9 +3502,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 					ext4_get_group_desc(sb, g, NULL);
 
 				if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
-					printk(KERN_ERR
-	       "EXT4-fs: ext4_remount: "
-		"Checksum for group %u failed (%u!=%u)\n",
+					ext4_msg(sb, KERN_ERR,
+	       "ext4_remount: Checksum for group %u failed (%u!=%u)",
 		g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
 					       le16_to_cpu(gdp->bg_checksum));
 					err = -EINVAL;
@@ -3512,11 +3517,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			 * require a full umount/remount for now.
 			 */
 			if (es->s_last_orphan) {
-				printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+				ext4_msg(sb, KERN_WARNING, "Couldn't "
 				       "remount RDWR because of unprocessed "
 				       "orphan inode list.  Please "
-				       "umount/remount instead.\n",
-				       sb->s_id);
+				       "umount/remount instead");
 				err = -EINVAL;
 				goto restore_opts;
 			}
@@ -3772,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 	if (EXT4_SB(sb)->s_qf_names[type]) {
 		/* Quotafile not in fs root? */
 		if (path.dentry->d_parent != sb->s_root)
-			printk(KERN_WARNING
-				"EXT4-fs: Quota file not on filesystem root. "
-				"Journaled quota will not work.\n");
+			ext4_msg(sb, KERN_WARNING,
+				"Quota file not on filesystem root. "
+				"Journaled quota will not work");
 	}
 
 	/*
@@ -3857,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
 	handle_t *handle = journal_current_handle();
 
 	if (EXT4_SB(sb)->s_journal && !handle) {
-		printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
-			" cancelled because transaction is not started.\n",
+		ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+			" cancelled because transaction is not started",
 			(unsigned long long)off, (unsigned long long)len);
 		return -EIO;
 	}
@@ -3930,10 +3934,10 @@ static struct file_system_type ext4_fs_type = {
 static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
 			  const char *dev_name, void *data,struct vfsmount *mnt)
 {
-	printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
-	       "to mount using ext4\n");
-	printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
-	       "will go away by 2.6.31\n");
+	printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
+	       "to mount using ext4\n", dev_name);
+	printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
+	       "will go away by 2.6.31\n", dev_name);
 	return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
-- 
GitLab


From 04288f42033607099cebf5ca15ce8dcec3a9688b Mon Sep 17 00:00:00 2001
From: Mimi Zohar <zohar@linux.vnet.ibm.com>
Date: Thu, 4 Jun 2009 13:53:10 -0400
Subject: [PATCH 1607/2198] integrity: ima audit dentry_open failure

Until we start appraising measurements, the ima_path_check()
return code should always be 0.

- Update the ima_path_check() return code comment
- Instead of the pr_info, audit the dentry_open failure

Signed-off-by: Mimi Zohar <zohar@us.ibm.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/integrity/ima/ima_main.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c
index a2eb23310eaf1..6f611874d10e9 100644
--- a/security/integrity/ima/ima_main.c
+++ b/security/integrity/ima/ima_main.c
@@ -149,8 +149,8 @@ static void ima_update_counts(struct ima_iint_cache *iint, int mask)
  *	- Opening a file for read when already open for write,
  * 	  could result in a file measurement error.
  *
- * Return 0 on success, an error code on failure.
- * (Based on the results of appraise_measurement().)
+ * Always return 0 and audit dentry_open failures.
+ * (Return code will be based upon measurement appraisal.)
  */
 int ima_path_check(struct path *path, int mask, int update_counts)
 {
@@ -189,8 +189,13 @@ int ima_path_check(struct path *path, int mask, int update_counts)
 		file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE,
 				   current_cred());
 		if (IS_ERR(file)) {
-			pr_info("%s dentry_open failed\n", dentry->d_name.name);
-			rc = PTR_ERR(file);
+			int audit_info = 0;
+
+			integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
+					    dentry->d_name.name,
+					    "add_measurement",
+					    "dentry_open failed",
+					    1, audit_info);
 			file = NULL;
 			goto out;
 		}
-- 
GitLab


From 1938a150c25bf7c2c47182e753a1038945b70b0e Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 5 Jun 2009 01:00:26 -0400
Subject: [PATCH 1608/2198] ext4: Avoid leaking blocks after a block allocation
 failure

We should add inode to the orphan list in the same transaction
as block allocation.  This ensures that if we crash after a failed
block allocation and before we do a vmtruncate we don't leak block
(ie block marked as used in bitmap but not claimed by the inode).

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
CC:  Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 24 ++++++++++++++++++++++--
 1 file changed, 22 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 17ed0d244dbbc..8d215881172f1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1459,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 				struct page **pagep, void **fsdata)
 {
 	struct inode *inode = mapping->host;
-	int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+	int ret, needed_blocks;
 	handle_t *handle;
 	int retries = 0;
 	struct page *page;
@@ -1470,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 		   "dev %s ino %lu pos %llu len %u flags %u",
 		   inode->i_sb->s_id, inode->i_ino,
 		   (unsigned long long) pos, len, flags);
+	/*
+	 * Reserve one block more for addition to orphan list in case
+	 * we allocate blocks but write fails for some reason
+	 */
+	needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
  	index = pos >> PAGE_CACHE_SHIFT;
 	from = pos & (PAGE_CACHE_SIZE - 1);
 	to = from + len;
@@ -1503,15 +1508,30 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
 
 	if (ret) {
 		unlock_page(page);
-		ext4_journal_stop(handle);
 		page_cache_release(page);
 		/*
 		 * block_write_begin may have instantiated a few blocks
 		 * outside i_size.  Trim these off again. Don't need
 		 * i_size_read because we hold i_mutex.
+		 *
+		 * Add inode to orphan list in case we crash before
+		 * truncate finishes
 		 */
 		if (pos + len > inode->i_size)
+			ext4_orphan_add(handle, inode);
+
+		ext4_journal_stop(handle);
+		if (pos + len > inode->i_size) {
 			vmtruncate(inode, inode->i_size);
+			/* 
+			 * If vmtruncate failed early the inode might
+			 * still be on the orphan list; we need to
+			 * make sure the inode is removed from the
+			 * orphan list in that case.
+			 */
+			if (inode->i_nlink)
+				ext4_orphan_del(NULL, inode);
+		}
 	}
 
 	if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
-- 
GitLab


From f8514083cd61daef12fba5ef883ad9352c450428 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Fri, 5 Jun 2009 00:56:49 -0400
Subject: [PATCH 1609/2198] ext4: truncate the file properly if we fail to copy
 data from userspace

In generic_perform_write if we fail to copy the user data we don't
update the inode->i_size.  We should truncate the file in the above
case so that we don't have blocks allocated outside inode->i_size.  Add
the inode to orphan list in the same transaction as block allocation
This ensures that if we crash in between the recovery would do the
truncate.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
CC:  Jan Kara <jack@suse.cz>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 128 ++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 102 insertions(+), 26 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 8d215881172f1..2c10d346f7a31 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1549,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
 	return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 
+static int ext4_generic_write_end(struct file *file,
+				struct address_space *mapping,
+				loff_t pos, unsigned len, unsigned copied,
+				struct page *page, void *fsdata)
+{
+	int i_size_changed = 0;
+	struct inode *inode = mapping->host;
+	handle_t *handle = ext4_journal_current_handle();
+
+	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 *
+	 * But it's important to update i_size while still holding page lock:
+	 * page writeout could otherwise come in and zero beyond i_size.
+	 */
+	if (pos + copied > inode->i_size) {
+		i_size_write(inode, pos + copied);
+		i_size_changed = 1;
+	}
+
+	if (pos + copied >  EXT4_I(inode)->i_disksize) {
+		/* We need to mark inode dirty even if
+		 * new_i_size is less that inode->i_size
+		 * bu greater than i_disksize.(hint delalloc)
+		 */
+		ext4_update_i_disksize(inode, (pos + copied));
+		i_size_changed = 1;
+	}
+	unlock_page(page);
+	page_cache_release(page);
+
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		ext4_mark_inode_dirty(handle, inode);
+
+	return copied;
+}
+
 /*
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
@@ -1572,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file,
 	ret = ext4_jbd2_file_inode(handle, inode);
 
 	if (ret == 0) {
-		loff_t new_i_size;
-
-		new_i_size = pos + copied;
-		if (new_i_size > EXT4_I(inode)->i_disksize) {
-			ext4_update_i_disksize(inode, new_i_size);
-			/* We need to mark inode dirty even if
-			 * new_i_size is less that inode->i_size
-			 * bu greater than i_disksize.(hint delalloc)
-			 */
-			ext4_mark_inode_dirty(handle, inode);
-		}
-
-		ret2 = generic_write_end(file, mapping, pos, len, copied,
+		ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 		copied = ret2;
+		if (pos + len > inode->i_size)
+			/* if we have allocated more blocks and copied
+			 * less. We will have blocks allocated outside
+			 * inode->i_size. So truncate them
+			 */
+			ext4_orphan_add(handle, inode);
 		if (ret2 < 0)
 			ret = ret2;
 	}
@@ -1594,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 
+	if (pos + len > inode->i_size) {
+		vmtruncate(inode, inode->i_size);
+		/* 
+		 * If vmtruncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
+
 	return ret ? ret : copied;
 }
 
@@ -1605,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file,
 	handle_t *handle = ext4_journal_current_handle();
 	struct inode *inode = mapping->host;
 	int ret = 0, ret2;
-	loff_t new_i_size;
 
 	trace_mark(ext4_writeback_write_end,
 		   "dev %s ino %lu pos %llu len %u copied %u",
 		   inode->i_sb->s_id, inode->i_ino,
 		   (unsigned long long) pos, len, copied);
-	new_i_size = pos + copied;
-	if (new_i_size > EXT4_I(inode)->i_disksize) {
-		ext4_update_i_disksize(inode, new_i_size);
-		/* We need to mark inode dirty even if
-		 * new_i_size is less that inode->i_size
-		 * bu greater than i_disksize.(hint delalloc)
-		 */
-		ext4_mark_inode_dirty(handle, inode);
-	}
-
-	ret2 = generic_write_end(file, mapping, pos, len, copied,
+	ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 	copied = ret2;
+	if (pos + len > inode->i_size)
+		/* if we have allocated more blocks and copied
+		 * less. We will have blocks allocated outside
+		 * inode->i_size. So truncate them
+		 */
+		ext4_orphan_add(handle, inode);
+
 	if (ret2 < 0)
 		ret = ret2;
 
@@ -1631,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file,
 	if (!ret)
 		ret = ret2;
 
+	if (pos + len > inode->i_size) {
+		vmtruncate(inode, inode->i_size);
+		/* 
+		 * If vmtruncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
+
 	return ret ? ret : copied;
 }
 
@@ -1675,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file,
 	}
 
 	unlock_page(page);
+	page_cache_release(page);
+	if (pos + len > inode->i_size)
+		/* if we have allocated more blocks and copied
+		 * less. We will have blocks allocated outside
+		 * inode->i_size. So truncate them
+		 */
+		ext4_orphan_add(handle, inode);
+
 	ret2 = ext4_journal_stop(handle);
 	if (!ret)
 		ret = ret2;
-	page_cache_release(page);
+	if (pos + len > inode->i_size) {
+		vmtruncate(inode, inode->i_size);
+		/* 
+		 * If vmtruncate failed early the inode might still be
+		 * on the orphan list; we need to make sure the inode
+		 * is removed from the orphan list in that case.
+		 */
+		if (inode->i_nlink)
+			ext4_orphan_del(NULL, inode);
+	}
 
 	return ret ? ret : copied;
 }
-- 
GitLab


From 6dc5f2a41759987e35e757ef00192e7b424563bb Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Fri, 5 Jun 2009 12:36:28 +1000
Subject: [PATCH 1610/2198] perf_counter: Fix lockup with interrupting counters

Commit 8e3747c1 ("perf_counter: Change data head from u32 to u64")
changed the type of 'head' in struct perf_mmap_data from atomic_t
to atomic_long_t, but missed converting one use of atomic_read on
it to atomic_long_read.  The effect of using atomic_read rather than
atomic_long_read on powerpc (and other big-endian architectures) is
that we get the high half of the 64-bit quantity, resulting in the
cmpxchg retry loop in perf_output_begin spinning forever as soon as
data->head becomes non-zero.  On little-endian architectures such as
x86 we would get the low half, resulting in a lockup once data->head
becomes greater than 4G.

This fixes it by using atomic_long_read rather than atomic_read.

[ Impact: fix perfcounter lockup on PowerPC / big-endian systems ]

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18984.33964.21541.743096@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 195712e20d075..a5d3e2aedd2f1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2234,7 +2234,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_lock(handle);
 
 	do {
-		offset = head = atomic_read(&data->head);
+		offset = head = atomic_long_read(&data->head);
 		head += size;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
-- 
GitLab


From ee7b31fe5c5da8a038b96e54ae9fbd5dcab3b1da Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Fri, 5 Jun 2009 11:37:35 +0800
Subject: [PATCH 1611/2198] perf_counter tools: Fix incorrect printf formats

Otherwise the code does not compile on 32-bit boxes.

builtin-report.c: In function 'map__fprintf':
builtin-report.c:240: error: format '%lx' expects type 'long unsigned int', but argument 3 has type 'uint64_t'
builtin-report.c:240: error: format '%lx' expects type 'long unsigned int', but argument 4 has type 'uint64_t'
builtin-report.c:240: error: format '%lx' expects type 'long unsigned int', but argument 5 has type 'uint64_t'

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090605033735.GA20451@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 1a1028d3bc347..eb5424fcbd627 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -236,7 +236,7 @@ static int map__overlap(struct map *l, struct map *r)
 
 static size_t map__fprintf(struct map *self, FILE *fp)
 {
-	return fprintf(fp, " %lx-%lx %lx %s\n",
+	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
-- 
GitLab


From f6d03139d745198b434f65a28aabed524f415a4c Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Fri, 5 Jun 2009 07:18:57 +0100
Subject: [PATCH 1612/2198] GFS2: Fix locking issue mounting gfs2meta fs

This patch uses sget() to get a reference to the
existing gfs2 sb when mouting the gfs2meta filesystem
(in fact thats just another mount of the gfs2
filesystem with a different root and this interface
is for backward compatibility).

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Reported-by: Benjamin Marzinski <bmarzins@redhat.com>
Tested-by: Benjamin Marzinski <bmarzins@redhat.com>
Cc: Christoph Hellwig <hch@infradead.org>
---
 fs/gfs2/ops_fstype.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 2cd1164c88d7f..9da161cbb30f9 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1273,9 +1273,20 @@ static int gfs2_get_sb(struct file_system_type *fs_type, int flags,
 	return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt);
 }
 
+static int test_meta_super(struct super_block *s, void *ptr)
+{
+	struct block_device *bdev = ptr;
+	return (bdev == s->s_bdev);
+}
+
+static int set_meta_super(struct super_block *s, void *ptr)
+{
+	return -EINVAL;
+}
+
 static struct super_block *get_gfs2_sb(const char *dev_name)
 {
-	struct super_block *sb;
+	struct super_block *s;
 	struct path path;
 	int error;
 
@@ -1283,30 +1294,27 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
-		return NULL;
+		return ERR_PTR(-ENOENT);
 	}
-	sb = path.dentry->d_inode->i_sb;
-	if (sb && (sb->s_type == &gfs2_fs_type))
-		atomic_inc(&sb->s_active);
-	else
-		sb = NULL;
+	s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
+		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
-	return sb;
+	return s;
 }
 
 static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
 			    const char *dev_name, void *data, struct vfsmount *mnt)
 {
-	struct super_block *sb = NULL;
+	struct super_block *s;
 	struct gfs2_sbd *sdp;
 
-	sb = get_gfs2_sb(dev_name);
-	if (!sb) {
+	s = get_gfs2_sb(dev_name);
+	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
-		return -ENOENT;
+		return PTR_ERR(s);
 	}
-	sdp = sb->s_fs_info;
-	mnt->mnt_sb = sb;
+	sdp = s->s_fs_info;
+	mnt->mnt_sb = s;
 	mnt->mnt_root = dget(sdp->sd_master_dir);
 	return 0;
 }
-- 
GitLab


From f250c030a87273f8838a2302bee7c2b4d03e9151 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 13:18:41 +0200
Subject: [PATCH 1613/2198] perf record: Split out counter creation into a
 helper function

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 100 +++++++++++---------
 1 file changed, 53 insertions(+), 47 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index bf59df5bddf36..7f2d7ce94075e 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -336,65 +336,71 @@ static void synthesize_events(void)
 	closedir(proc);
 }
 
-static void open_counters(int cpu, pid_t pid)
+static int group_fd;
+
+static void create_counter(int counter, int cpu, pid_t pid)
 {
 	struct perf_counter_attr attr;
-	int counter, group_fd;
 	int track = 1;
 
-	if (pid > 0) {
-		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_events(pid);
-	}
+	memset(&attr, 0, sizeof(attr));
+	attr.config		= event_id[counter];
+	attr.sample_period	= event_count[counter];
+	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.mmap		= track;
+	attr.comm		= track;
+	attr.inherit	= (cpu < 0) && inherit;
 
-	group_fd = -1;
-	for (counter = 0; counter < nr_counters; counter++) {
+	track = 0; /* only the first counter needs these */
 
-		memset(&attr, 0, sizeof(attr));
-		attr.config		= event_id[counter];
-		attr.sample_period	= event_count[counter];
-		attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-		attr.mmap		= track;
-		attr.comm		= track;
-		attr.inherit	= (cpu < 0) && inherit;
+	fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
 
-		track = 0; // only the first counter needs these
+	if (fd[nr_cpu][counter] < 0) {
+		int err = errno;
 
-		fd[nr_cpu][counter] =
-			sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
+		error("syscall returned with %d (%s)\n",
+				fd[nr_cpu][counter], strerror(err));
+		if (err == EPERM)
+			printf("Are you root?\n");
+		exit(-1);
+	}
+	assert(fd[nr_cpu][counter] >= 0);
+	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
 
-		if (fd[nr_cpu][counter] < 0) {
-			int err = errno;
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[nr_cpu][counter];
+
+	event_array[nr_poll].fd = fd[nr_cpu][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[nr_cpu][counter].counter = counter;
+	mmap_array[nr_cpu][counter].prev = 0;
+	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
+	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+		exit(-1);
+	}
+}
 
-			error("syscall returned with %d (%s)\n",
-					fd[nr_cpu][counter], strerror(err));
-			if (err == EPERM)
-				printf("Are you root?\n");
-			exit(-1);
-		}
-		assert(fd[nr_cpu][counter] >= 0);
-		fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+static void open_counters(int cpu, pid_t pid)
+{
+	int counter;
 
-		/*
-		 * First counter acts as the group leader:
-		 */
-		if (group && group_fd == -1)
-			group_fd = fd[nr_cpu][counter];
-
-		event_array[nr_poll].fd = fd[nr_cpu][counter];
-		event_array[nr_poll].events = POLLIN;
-		nr_poll++;
-
-		mmap_array[nr_cpu][counter].counter = counter;
-		mmap_array[nr_cpu][counter].prev = 0;
-		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-				PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0);
-		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-			exit(-1);
-		}
+	if (pid > 0) {
+		pid_synthesize_comm_event(pid, 0);
+		pid_synthesize_mmap_events(pid);
 	}
+
+	group_fd = -1;
+	for (counter = 0; counter < nr_counters; counter++)
+		create_counter(counter, cpu, pid);
+
 	nr_cpu++;
 }
 
-- 
GitLab


From cf1f45744c6fa3501e0a6f0ddc418f0ef27e725b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 13:27:02 +0200
Subject: [PATCH 1614/2198] perf record, top: Implement --freq

Support frequency-based profiling and make it the default.

(Also add a Hz printout in perf top.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 10 +++++++++-
 Documentation/perf_counter/builtin-top.c    | 13 +++++++++----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 7f2d7ce94075e..e2301f39e551b 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -27,6 +27,7 @@ static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 static int			nr_cpus				= 0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 128;
+static int			freq				= 0;
 static int			output;
 static const char		*output_name			= "perf.data";
 static int			group				= 0;
@@ -347,9 +348,10 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr.config		= event_id[counter];
 	attr.sample_period	= event_count[counter];
 	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.freq		= freq;
 	attr.mmap		= track;
 	attr.comm		= track;
-	attr.inherit	= (cpu < 0) && inherit;
+	attr.inherit		= (cpu < 0) && inherit;
 
 	track = 0; /* only the first counter needs these */
 
@@ -520,6 +522,8 @@ static const struct option options[] = {
 		    "output file name"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
 	OPT_END()
@@ -540,6 +544,10 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 		event_id[0] = 0;
 	}
 
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
 			continue;
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 28cbde4b6e83d..2fee5951b2315 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -74,8 +74,8 @@ static int			nr_cpus				=  0;
 static unsigned int		realtime_prio			=  0;
 static int			group				=  0;
 static unsigned int		page_size;
-static unsigned int		mmap_pages			=  16;
-static int			freq				= 0;
+static unsigned int		mmap_pages			= 16;
+static int			freq				=  0;
 
 static char			*sym_filter;
 static unsigned long		filter_start;
@@ -212,8 +212,13 @@ static void print_sym_table(void)
 		events_per_sec,
 		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
 
-	if (nr_counters == 1)
-		printf("%d ", event_count[0]);
+	if (nr_counters == 1) {
+		printf("%d", event_count[0]);
+		if (freq)
+			printf("Hz ");
+		else
+			printf(" ");
+	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (counter)
-- 
GitLab


From f7b6eb3fa07269da20dbbde8ba37a0273fdbd9c9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:51 +0200
Subject: [PATCH 1615/2198] x86: Set context.vdso before installing the mapping

In order to make arch_vma_name() work from inside
install_special_mapping() we need to set the context.vdso
before calling it.

( This is needed for performance counters to be able to track
  this special executable area. )

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/vdso/vdso32-setup.c | 6 +++++-
 arch/x86/vdso/vma.c          | 7 +++++--
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f118ab561..58bc00f68b12e 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		}
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	if (compat_uses_vma || !compat) {
 		/*
 		 * MAYWRITE to allow gdb to COW and set breakpoints
@@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 			goto up_fail;
 	}
 
-	current->mm->context.vdso = (void *)addr;
 	current_thread_info()->sysenter_return =
 		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
 
   up_fail:
+	if (ret)
+		current->mm->context.vdso = NULL;
+
 	up_write(&mm->mmap_sem);
 
 	return ret;
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 7133cdf9098b7..93b7a2938b2fb 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -115,15 +115,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		goto up_fail;
 	}
 
+	current->mm->context.vdso = (void *)addr;
+
 	ret = install_special_mapping(mm, addr, vdso_size,
 				      VM_READ|VM_EXEC|
 				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
 				      VM_ALWAYSDUMP,
 				      vdso_pages);
-	if (ret)
+	if (ret) {
+		current->mm->context.vdso = NULL;
 		goto up_fail;
+	}
 
-	current->mm->context.vdso = (void *)addr;
 up_fail:
 	up_write(&mm->mmap_sem);
 	return ret;
-- 
GitLab


From 089dd79db9264dc0da602bad45d42f1b3e7d1e07 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:55 +0200
Subject: [PATCH 1616/2198] perf_counter: Generate mmap events for
 install_special_mapping()

In order to track the vdso also generate mmap events for
install_special_mapping().

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 14 ++++++++------
 kernel/perf_counter.c        | 34 ++++++++++++++++++++++------------
 mm/mmap.c                    |  5 +++--
 3 files changed, 33 insertions(+), 20 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6ca403acd4196..40dc0e273d9c4 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -617,8 +617,13 @@ static inline int is_software_counter(struct perf_counter *counter)
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
 
-extern void perf_counter_mmap(unsigned long addr, unsigned long len,
-			      unsigned long pgoff, struct file *file);
+extern void __perf_counter_mmap(struct vm_area_struct *vma);
+
+static inline void perf_counter_mmap(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_EXEC)
+		__perf_counter_mmap(vma);
+}
 
 extern void perf_counter_comm(struct task_struct *tsk);
 extern void perf_counter_fork(struct task_struct *tsk);
@@ -668,10 +673,7 @@ static inline void
 perf_swcounter_event(u32 event, u64 nr, int nmi,
 		     struct pt_regs *regs, u64 addr)			{ }
 
-static inline void
-perf_counter_mmap(unsigned long addr, unsigned long len,
-		  unsigned long pgoff, struct file *file)		{ }
-
+static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index a5d3e2aedd2f1..37a5a241ca7ec 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2255,7 +2255,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 }
 
 static void perf_output_copy(struct perf_output_handle *handle,
-			     void *buf, unsigned int len)
+			     const void *buf, unsigned int len)
 {
 	unsigned int pages_mask;
 	unsigned int offset;
@@ -2681,9 +2681,10 @@ void perf_counter_comm(struct task_struct *task)
  */
 
 struct perf_mmap_event {
-	struct file	*file;
-	char		*file_name;
-	int		file_size;
+	struct vm_area_struct	*vma;
+
+	const char		*file_name;
+	int			file_size;
 
 	struct {
 		struct perf_event_header	header;
@@ -2744,11 +2745,12 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 {
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
-	struct file *file = mmap_event->file;
+	struct vm_area_struct *vma = mmap_event->vma;
+	struct file *file = vma->vm_file;
 	unsigned int size;
 	char tmp[16];
 	char *buf = NULL;
-	char *name;
+	const char *name;
 
 	if (file) {
 		buf = kzalloc(PATH_MAX, GFP_KERNEL);
@@ -2762,6 +2764,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 			goto got_name;
 		}
 	} else {
+		name = arch_vma_name(mmap_event->vma);
+		if (name)
+			goto got_name;
+
+		if (!vma->vm_mm) {
+			name = strncpy(tmp, "[vdso]", sizeof(tmp));
+			goto got_name;
+		}
+
 		name = strncpy(tmp, "//anon", sizeof(tmp));
 		goto got_name;
 	}
@@ -2791,8 +2802,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 	kfree(buf);
 }
 
-void perf_counter_mmap(unsigned long addr, unsigned long len,
-		       unsigned long pgoff, struct file *file)
+void __perf_counter_mmap(struct vm_area_struct *vma)
 {
 	struct perf_mmap_event mmap_event;
 
@@ -2800,12 +2810,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len,
 		return;
 
 	mmap_event = (struct perf_mmap_event){
-		.file   = file,
+		.vma	= vma,
 		.event  = {
 			.header = { .type = PERF_EVENT_MMAP, },
-			.start  = addr,
-			.len    = len,
-			.pgoff  = pgoff,
+			.start  = vma->vm_start,
+			.len    = vma->vm_end - vma->vm_start,
+			.pgoff  = vma->vm_pgoff,
 		},
 	};
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 6451ce2854b9f..8101de490c739 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1220,8 +1220,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	if (correct_wcount)
 		atomic_inc(&inode->i_writecount);
 out:
-	if (vm_flags & VM_EXEC)
-		perf_counter_mmap(addr, len, pgoff, file);
+	perf_counter_mmap(vma);
 
 	mm->total_vm += len >> PAGE_SHIFT;
 	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
@@ -2309,6 +2308,8 @@ int install_special_mapping(struct mm_struct *mm,
 
 	mm->total_vm += len >> PAGE_SHIFT;
 
+	perf_counter_mmap(vma);
+
 	return 0;
 }
 
-- 
GitLab


From fc54db5105d01ad691a7d747064c7890e17f936c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:04:59 +0200
Subject: [PATCH 1617/2198] perf report: Deal with maps

In order to deal with [vdso] maps generalize the ip->symbol path
a bit and allow to override some bits with custom functions.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 37 +++++++++++++++++++--
 Documentation/perf_counter/util/symbol.c    |  1 +
 Documentation/perf_counter/util/symbol.h    |  1 +
 3 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index eb5424fcbd627..9783d1e493c44 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -79,6 +79,7 @@ typedef union event_union {
 
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
+static struct dso *vdso;
 
 static void dsos__add(struct dso *dso)
 {
@@ -136,6 +137,11 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
+static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
 static int load_kernel(void)
 {
 	int err;
@@ -151,6 +157,14 @@ static int load_kernel(void)
 	} else
 		dsos__add(kernel_dso);
 
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
 	return err;
 }
 
@@ -173,9 +187,20 @@ struct map {
 	uint64_t	 start;
 	uint64_t	 end;
 	uint64_t	 pgoff;
+	uint64_t	 (*map_ip)(struct map *, uint64_t);
 	struct dso	 *dso;
 };
 
+static uint64_t map__map_ip(struct map *map, uint64_t ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+{
+	return ip;
+}
+
 static struct map *map__new(struct mmap_event *event)
 {
 	struct map *self = malloc(sizeof(*self));
@@ -201,6 +226,11 @@ static struct map *map__new(struct mmap_event *event)
 		self->dso = dsos__findnew(filename);
 		if (self->dso == NULL)
 			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
 	}
 	return self;
 out_delete:
@@ -917,8 +947,8 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 
 		map = thread__find_map(thread, ip);
 		if (map != NULL) {
+			ip = map->map_ip(map, ip);
 			dso = map->dso;
-			ip -= map->start + map->pgoff;
 		} else {
 			/*
 			 * If this is outside of all known maps,
@@ -938,7 +968,10 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	}
 
 	if (show & show_mask) {
-		struct symbol *sym = dso__find_symbol(dso, ip);
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
 
 		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
 			fprintf(stderr,
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index 15d5cf9abfac6..a06bbfba8350a 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -45,6 +45,7 @@ struct dso *dso__new(const char *name, unsigned int sym_priv_size)
 		strcpy(self->name, name);
 		self->syms = RB_ROOT;
 		self->sym_priv_size = sym_priv_size;
+		self->find_symbol = dso__find_symbol;
 	}
 
 	return self;
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index 8dd8522a0a0c9..e23cc31266843 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -16,6 +16,7 @@ struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
 	unsigned int	 sym_priv_size;
+	struct symbol    *(*find_symbol)(struct dso *, uint64_t ip);
 	char		 name[0];
 };
 
-- 
GitLab


From 8edd4286f99f78fe07fe9196e69d5643da86cada Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 14:13:18 +0200
Subject: [PATCH 1618/2198] perf report: Display user/kernel differentiator

Before:

    25.96%  copy_user_generic_string
    15.23%  two_op
    15.19%  one_op
     6.92%  enough_duration
     1.23%  alloc_pages_current
     1.14%  acpi_os_read_port
     1.08%  _spin_lock

After:

    25.96%  [k] copy_user_generic_string
    15.23%  [.] two_op
    15.19%  [.] one_op
     6.92%  [.] enough_duration
     1.23%  [k] alloc_pages_current
     1.14%  [k] acpi_os_read_port
     1.08%  [k] _spin_lock

The '[k]' differentiator is a quick clue that it's a kernel symbol,
without having to bring in the full dso column.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-report.c | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 9783d1e493c44..ca303fd74a718 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -504,7 +504,7 @@ sort__comm_print(FILE *fp, struct hist_entry *self)
 }
 
 static struct sort_entry sort_comm = {
-	.header 	= "         Command",
+	.header		= "         Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
@@ -569,10 +569,12 @@ sort__sym_print(FILE *fp, struct hist_entry *self)
 	if (verbose)
 		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
 
-	if (self->sym)
-		ret += fprintf(fp, "%s", self->sym->name);
-	else
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
 		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
 
 	return ret;
 }
@@ -586,9 +588,9 @@ static struct sort_entry sort_sym = {
 static int sort__need_collapse = 0;
 
 struct sort_dimension {
-	char *name;
-	struct sort_entry *entry;
-	int taken;
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
 };
 
 static struct sort_dimension sort_dimensions[] = {
-- 
GitLab


From 2debbc836696f2a815d02630230584a1754a5022 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 14:29:10 +0200
Subject: [PATCH 1619/2198] perf_counter tools: Clarify events/samples naming

A number of places said 'events' while they should say 'samples'.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 22 +++++-----
 Documentation/perf_counter/builtin-report.c |  2 +-
 Documentation/perf_counter/builtin-top.c    | 46 ++++++++++-----------
 3 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index e2301f39e551b..d4ad3057a711d 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -65,7 +65,7 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 	return head;
 }
 
-static long events;
+static long samples;
 static struct timeval last_read, this_read;
 
 static __u64 bytes_written;
@@ -83,7 +83,7 @@ static void mmap_read(struct mmap_data *md)
 
 	/*
 	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
+	 * the writer will bite our tail and mess up the samples under us.
 	 *
 	 * If we somehow ended up ahead of the head, we got messed up.
 	 *
@@ -109,7 +109,7 @@ static void mmap_read(struct mmap_data *md)
 	last_read = this_read;
 
 	if (old != head)
-		events++;
+		samples++;
 
 	size = head - old;
 
@@ -257,7 +257,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 	exit(EXIT_FAILURE);
 }
 
-static void pid_synthesize_mmap_events(pid_t pid)
+static void pid_synthesize_mmap_samples(pid_t pid)
 {
 	char filename[PATH_MAX];
 	FILE *fp;
@@ -315,7 +315,7 @@ static void pid_synthesize_mmap_events(pid_t pid)
 	fclose(fp);
 }
 
-static void synthesize_events(void)
+static void synthesize_samples(void)
 {
 	DIR *proc;
 	struct dirent dirent, *next;
@@ -331,7 +331,7 @@ static void synthesize_events(void)
 			continue;
 
 		pid_synthesize_comm_event(pid, 1);
-		pid_synthesize_mmap_events(pid);
+		pid_synthesize_mmap_samples(pid);
 	}
 
 	closedir(proc);
@@ -396,7 +396,7 @@ static void open_counters(int cpu, pid_t pid)
 
 	if (pid > 0) {
 		pid_synthesize_comm_event(pid, 0);
-		pid_synthesize_mmap_events(pid);
+		pid_synthesize_mmap_samples(pid);
 	}
 
 	group_fd = -1;
@@ -469,17 +469,17 @@ static int __cmd_record(int argc, const char **argv)
 	}
 
 	if (system_wide)
-		synthesize_events();
+		synthesize_samples();
 
 	while (!done) {
-		int hits = events;
+		int hits = samples;
 
 		for (i = 0; i < nr_cpu; i++) {
 			for (counter = 0; counter < nr_counters; counter++)
 				mmap_read(&mmap_array[i][counter]);
 		}
 
-		if (hits == events)
+		if (hits == samples)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
@@ -487,7 +487,7 @@ static int __cmd_record(int argc, const char **argv)
 	 * Approximate RIP event size: 24 bytes.
 	 */
 	fprintf(stderr,
-		"[ perf record: Captured and wrote %.3f MB %s (~%lld events) ]\n",
+		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
 		(double)bytes_written / 1024.0 / 1024.0,
 		output_name,
 		bytes_written / 24);
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index ca303fd74a718..5af105c280b5e 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -857,7 +857,7 @@ static size_t output__fprintf(FILE *fp, uint64_t total_samples)
 
 	fprintf(fp, "\n");
 	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld profiler events)\n", (__u64)total_samples);
+	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 2fee5951b2315..ff7e13c4647c7 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -137,8 +137,8 @@ static double sym_weight(const struct sym_entry *sym)
 	return weight;
 }
 
-static long			events;
-static long			userspace_events;
+static long			samples;
+static long			userspace_samples;
 static const char		CONSOLE_CLEAR[] = "[H[2J";
 
 static void __list_insert_active_sym(struct sym_entry *syme)
@@ -177,14 +177,14 @@ static void print_sym_table(void)
 {
 	int printed = 0, j;
 	int counter;
-	float events_per_sec = events/delay_secs;
-	float kevents_per_sec = (events-userspace_events)/delay_secs;
-	float sum_kevents = 0.0;
+	float samples_per_sec = samples/delay_secs;
+	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float sum_ksamples = 0.0;
 	struct sym_entry *syme, *n;
 	struct rb_root tmp = RB_ROOT;
 	struct rb_node *nd;
 
-	events = userspace_events = 0;
+	samples = userspace_samples = 0;
 
 	/* Sort the active symbols */
 	pthread_mutex_lock(&active_symbols_lock);
@@ -196,7 +196,7 @@ static void print_sym_table(void)
 		if (syme->snap_count != 0) {
 			syme->weight = sym_weight(syme);
 			rb_insert_active_sym(&tmp, syme);
-			sum_kevents += syme->snap_count;
+			sum_ksamples += syme->snap_count;
 
 			for (j = 0; j < nr_counters; j++)
 				syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
@@ -209,8 +209,8 @@ static void print_sym_table(void)
 	printf(
 "------------------------------------------------------------------------------\n");
 	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [",
-		events_per_sec,
-		100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)));
+		samples_per_sec,
+		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
 
 	if (nr_counters == 1) {
 		printf("%d", event_count[0]);
@@ -246,12 +246,12 @@ static void print_sym_table(void)
 	printf("------------------------------------------------------------------------------\n\n");
 
 	if (nr_counters == 1)
-		printf("             events    pcnt");
+		printf("             samples    pcnt");
 	else
-		printf("  weight     events    pcnt");
+		printf("  weight     samples    pcnt");
 
 	printf("         RIP          kernel function\n"
-	       	       "  ______     ______   _____   ________________   _______________\n\n"
+	       	       "  ______     _______   _____   ________________   _______________\n\n"
 	);
 
 	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
@@ -263,8 +263,8 @@ static void print_sym_table(void)
 		if (++printed > print_entries || syme->snap_count < count_filter)
 			continue;
 
-		pcnt = 100.0 - (100.0 * ((sum_kevents - syme->snap_count) /
-					 sum_kevents));
+		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
+					 sum_ksamples));
 
 		/*
 		 * We color high-overhead entries in red, low-overhead
@@ -276,9 +276,9 @@ static void print_sym_table(void)
 			color = PERF_COLOR_GREEN;
 
 		if (nr_counters == 1)
-			printf("%19.2f - ", syme->weight);
+			printf("%20.2f - ", syme->weight);
 		else
-			printf("%8.1f %10ld - ", syme->weight, syme->snap_count);
+			printf("%9.1f %10ld - ", syme->weight, syme->snap_count);
 
 		color_fprintf(stdout, color, "%4.1f%%", pcnt);
 		printf(" - %016llx : %s\n", sym->start, sym->name);
@@ -318,7 +318,7 @@ static int symbol_filter(struct dso *self, struct symbol *sym)
 		return 1;
 
 	syme = dso__sym_priv(self, sym);
-	/* Tag events to be skipped. */
+	/* Tag samples to be skipped. */
 	if (!strcmp("default_idle", name) ||
 	    !strcmp("cpu_idle", name) ||
 	    !strcmp("enter_idle", name) ||
@@ -405,15 +405,15 @@ static void record_ip(uint64_t ip, int counter)
 		}
 	}
 
-	events--;
+	samples--;
 }
 
 static void process_event(uint64_t ip, int counter)
 {
-	events++;
+	samples++;
 
 	if (ip < min_ip || ip > max_ip) {
-		userspace_events++;
+		userspace_samples++;
 		return;
 	}
 
@@ -451,7 +451,7 @@ static void mmap_read(struct mmap_data *md)
 
 	/*
 	 * If we're further behind than half the buffer, there's a chance
-	 * the writer will bite our tail and screw up the events under us.
+	 * the writer will bite our tail and mess up the samples under us.
 	 *
 	 * If we somehow ended up ahead of the head, we got messed up.
 	 *
@@ -608,14 +608,14 @@ static int __cmd_top(void)
 	}
 
 	while (1) {
-		int hits = events;
+		int hits = samples;
 
 		for (i = 0; i < nr_cpus; i++) {
 			for (counter = 0; counter < nr_counters; counter++)
 				mmap_read(&mmap_array[i][counter]);
 		}
 
-		if (hits == events)
+		if (hits == samples)
 			ret = poll(event_array, nr_poll, 100);
 	}
 
-- 
GitLab


From 136107a76fe5f62906162f730834477b71cf131e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 17:56:21 +0200
Subject: [PATCH 1620/2198] perf_counter tools: Remove -march=native

Turns out that neither PowerPC nor older x86 compilers know this switch
...

and since it does not make a measurable difference, just omit it.

Reported-by: Paul Mackerras <paulus@samba.org>
Reported-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index c9ec4585f4d62..5b99f04df812c 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -159,7 +159,7 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
 
 # CFLAGS and LDFLAGS are for the users to override from the command line.
 
-CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6 -march=native
+CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6
 LDFLAGS = -lpthread -lrt -lelf
 ALL_CFLAGS = $(CFLAGS)
 ALL_LDFLAGS = $(LDFLAGS)
-- 
GitLab


From ac4bcf889469ffbca88f234d3184452886a47905 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 14:44:52 +0200
Subject: [PATCH 1621/2198] perf_counter: Change PERF_SAMPLE_CONFIG into
 PERF_SAMPLE_ID

The purpose of PERF_SAMPLE_CONFIG was to identify the counters,
since then we've added counter ids, use those instead.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 2 +-
 kernel/perf_counter.c        | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 40dc0e273d9c4..9cea32a0655d8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -104,7 +104,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_ADDR		= 1U << 3,
 	PERF_SAMPLE_GROUP		= 1U << 4,
 	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
-	PERF_SAMPLE_CONFIG		= 1U << 6,
+	PERF_SAMPLE_ID			= 1U << 6,
 	PERF_SAMPLE_CPU			= 1U << 7,
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 37a5a241ca7ec..e75b91a76a58f 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2392,8 +2392,8 @@ static void perf_counter_output(struct perf_counter *counter,
 		header.size += sizeof(u64);
 	}
 
-	if (sample_type & PERF_SAMPLE_CONFIG) {
-		header.type |= PERF_SAMPLE_CONFIG;
+	if (sample_type & PERF_SAMPLE_ID) {
+		header.type |= PERF_SAMPLE_ID;
 		header.size += sizeof(u64);
 	}
 
@@ -2439,8 +2439,8 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (sample_type & PERF_SAMPLE_ADDR)
 		perf_output_put(&handle, addr);
 
-	if (sample_type & PERF_SAMPLE_CONFIG)
-		perf_output_put(&handle, counter->attr.config);
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(&handle, counter->id);
 
 	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
-- 
GitLab


From 689802b2d0536e72281dc959ab9cb34fb3c304cf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 15:05:43 +0200
Subject: [PATCH 1622/2198] perf_counter: Add PERF_SAMPLE_PERIOD

In order to allow easy tracking of the period, also provide means of
adding it to the sample data.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 10 ++++++++++
 2 files changed, 12 insertions(+)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 9cea32a0655d8..6bc2500397380 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -106,6 +106,7 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
 	PERF_SAMPLE_ID			= 1U << 6,
 	PERF_SAMPLE_CPU			= 1U << 7,
+	PERF_SAMPLE_PERIOD		= 1U << 8,
 };
 
 /*
@@ -260,6 +261,7 @@ enum perf_event_type {
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
+	 *	u64				id;
 	 *	u64				sample_period;
 	 * };
 	 */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e75b91a76a58f..f8390668c3912 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2404,6 +2404,11 @@ static void perf_counter_output(struct perf_counter *counter,
 		cpu_entry.cpu = raw_smp_processor_id();
 	}
 
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		header.type |= PERF_SAMPLE_PERIOD;
+		header.size += sizeof(u64);
+	}
+
 	if (sample_type & PERF_SAMPLE_GROUP) {
 		header.type |= PERF_SAMPLE_GROUP;
 		header.size += sizeof(u64) +
@@ -2445,6 +2450,9 @@ static void perf_counter_output(struct perf_counter *counter,
 	if (sample_type & PERF_SAMPLE_CPU)
 		perf_output_put(&handle, cpu_entry);
 
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		perf_output_put(&handle, counter->hw.sample_period);
+
 	/*
 	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
 	 */
@@ -2835,6 +2843,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 	struct {
 		struct perf_event_header	header;
 		u64				time;
+		u64				id;
 		u64				period;
 	} freq_event = {
 		.header = {
@@ -2843,6 +2852,7 @@ static void perf_log_period(struct perf_counter *counter, u64 period)
 			.size = sizeof(freq_event),
 		},
 		.time = sched_clock(),
+		.id = counter->id,
 		.period = period,
 	};
 
-- 
GitLab


From 6a24ed6c6082ec65d19331a4bfa30c0512a1a822 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 5 Jun 2009 18:01:29 +0200
Subject: [PATCH 1623/2198] perf_counter: Fix frequency adjustment for < HZ

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  3 +++
 kernel/perf_counter.c        | 32 +++++++++++++++++++++++++-------
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bc2500397380..4f9d39ecdc05e 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -373,6 +373,9 @@ struct hw_perf_counter {
 	u64				sample_period;
 	atomic64_t			period_left;
 	u64				interrupts;
+
+	u64				freq_count;
+	u64				freq_interrupts;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index f8390668c3912..47c92fb927f29 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1187,8 +1187,9 @@ static void perf_log_period(struct perf_counter *counter, u64 period);
 static void perf_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
 	u64 interrupts, sample_period;
-	u64 events, period;
+	u64 events, period, freq;
 	s64 delta;
 
 	spin_lock(&ctx->lock);
@@ -1196,8 +1197,10 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (counter->state != PERF_COUNTER_STATE_ACTIVE)
 			continue;
 
-		interrupts = counter->hw.interrupts;
-		counter->hw.interrupts = 0;
+		hwc = &counter->hw;
+
+		interrupts = hwc->interrupts;
+		hwc->interrupts = 0;
 
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(counter, 1);
@@ -1208,20 +1211,35 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (!counter->attr.freq || !counter->attr.sample_freq)
 			continue;
 
-		events = HZ * interrupts * counter->hw.sample_period;
+		if (counter->attr.sample_freq < HZ) {
+			freq = counter->attr.sample_freq;
+
+			hwc->freq_count += freq;
+			hwc->freq_interrupts += interrupts;
+
+			if (hwc->freq_count < HZ)
+				continue;
+
+			interrupts = hwc->freq_interrupts;
+			hwc->freq_interrupts = 0;
+			hwc->freq_count -= HZ;
+		} else
+			freq = HZ;
+
+		events = freq * interrupts * hwc->sample_period;
 		period = div64_u64(events, counter->attr.sample_freq);
 
-		delta = (s64)(1 + period - counter->hw.sample_period);
+		delta = (s64)(1 + period - hwc->sample_period);
 		delta >>= 1;
 
-		sample_period = counter->hw.sample_period + delta;
+		sample_period = hwc->sample_period + delta;
 
 		if (!sample_period)
 			sample_period = 1;
 
 		perf_log_period(counter, sample_period);
 
-		counter->hw.sample_period = sample_period;
+		hwc->sample_period = sample_period;
 	}
 	spin_unlock(&ctx->lock);
 }
-- 
GitLab


From b2fef0762fdb65cf8702eea93f4e58abeb0ecefc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 18:07:51 +0200
Subject: [PATCH 1624/2198] perf_counter tools: Sample and display frequency
 adjustment changes

To allow the debugging of frequency-adjusting counters, sample
those adjustments and display them in perf report -D.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c |  2 +-
 Documentation/perf_counter/builtin-report.c | 39 +++++++++++++++++----
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index d4ad3057a711d..43ddab31ac399 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -347,7 +347,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	memset(&attr, 0, sizeof(attr));
 	attr.config		= event_id[counter];
 	attr.sample_period	= event_count[counter];
-	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
 	attr.freq		= freq;
 	attr.mmap		= track;
 	attr.comm		= track;
diff --git a/Documentation/perf_counter/builtin-report.c b/Documentation/perf_counter/builtin-report.c
index 5af105c280b5e..242e09ff36589 100644
--- a/Documentation/perf_counter/builtin-report.c
+++ b/Documentation/perf_counter/builtin-report.c
@@ -69,12 +69,20 @@ struct fork_event {
 	__u32 pid, ppid;
 };
 
-typedef union event_union {
+struct period_event {
 	struct perf_event_header header;
-	struct ip_event ip;
-	struct mmap_event mmap;
-	struct comm_event comm;
-	struct fork_event fork;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
 } event_t;
 
 static LIST_HEAD(dsos);
@@ -1052,6 +1060,19 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
 static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1068,11 +1089,12 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_FORK:
 		return process_fork_event(event, offset, head);
 
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
 	/*
 	 * We dont process them right now but they are fine:
 	 */
 
-	case PERF_EVENT_PERIOD:
 	case PERF_EVENT_THROTTLE:
 	case PERF_EVENT_UNTHROTTLE:
 		return 0;
@@ -1157,6 +1179,11 @@ static int __cmd_report(void)
 
 	size = event->header.size;
 
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
 	if (!size || process_event(event, offset, head) < 0) {
 
 		dprintf("%p [%p]: skipping unknown header type: %d\n",
-- 
GitLab


From 1dba15e74aba5a90c1f2557f37e5d09f8a2df643 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 18:37:22 +0200
Subject: [PATCH 1625/2198] perf record: Set frequency correctly

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 43ddab31ac399..c22ea0c7472a3 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -348,7 +348,10 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr.config		= event_id[counter];
 	attr.sample_period	= event_count[counter];
 	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
-	attr.freq		= freq;
+	if (freq) {
+		attr.freq		= 1;
+		attr.sample_freq	= freq;
+	}
 	attr.mmap		= track;
 	attr.comm		= track;
 	attr.inherit		= (cpu < 0) && inherit;
@@ -544,10 +547,6 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 		event_id[0] = 0;
 	}
 
-	if (freq) {
-		default_interval = freq;
-		freq = 1;
-	}
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
 			continue;
-- 
GitLab


From 2f335a02b3c816e77e7df1d15b12e3bbb8f4c8f0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 5 Jun 2009 19:31:01 +0200
Subject: [PATCH 1626/2198] perf top: Fix zero or negative refresh delay

If perf top is executed with a zero value for the refresh rate,
we get a division by zero exception while computing samples_per_sec.

Also a zero refresh rate is not possible, neither do we want to
accept negative values.

[ Impact: fix division by zero in perf top ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1244223061-5399-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-top.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index ff7e13c4647c7..b2f480b5a1348 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -693,6 +693,9 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 		event_id[0] = 0;
 	}
 
+	if (delay_secs < 1)
+		delay_secs = 1;
+
 	for (counter = 0; counter < nr_counters; counter++) {
 		if (event_count[counter])
 			continue;
-- 
GitLab


From fe2245c905631a3a353504fc04388ce3dfaf9d9e Mon Sep 17 00:00:00 2001
From: Mark Langsdorf <mark.langsdorf@amd.com>
Date: Sun, 5 Jul 2009 15:50:52 -0500
Subject: [PATCH 1627/2198] x86: enable GART-IOMMU only after setting up
 protection methods

The current code to set up the GART as an IOMMU enables GART
translations before it removes the aperture from the kernel memory
map, sets the GART PTEs to UC, sets up the guard and scratch
pages, or does a wbinvd().  This leaves the possibility of cache
aliasing open and can cause system crashes.

Re-order the code so as to enable the GART translations only
after all safeguards are in place and the tlb has been flushed.

AMD has tested this patch on both Istanbul systems and 1st
generation Opteron systems with APG enabled and seen no adverse
effects.  Istanbul systems with HT Assist enabled sometimes
see MCE errors due to cache artifacts with the unmodified
code.

Signed-off-by: Mark Langsdorf <mark.langsdorf@amd.com>
Cc: <stable@kernel.org>
Cc: Joerg Roedel <joerg.roedel@amd.com>
Cc: akpm@linux-foundation.org
Cc: jbarnes@virtuousgeek.org
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/pci-gart_64.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 1e8920d98f7ca..cfd9f90638967 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -658,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
 	agp_gatt_table = gatt;
 
-	enable_gart_translations();
-
 	error = sysdev_class_register(&gart_sysdev_class);
 	if (!error)
 		error = sysdev_register(&device_gart);
@@ -816,6 +814,14 @@ void __init gart_iommu_init(void)
 	 * the pages as Not-Present:
 	 */
 	wbinvd();
+	
+	/*
+	 * Now all caches are flushed and we can safely enable
+	 * GART hardware.  Doing it early leaves the possibility
+	 * of stale cache entries that can lead to GART PTE
+	 * errors.
+	 */
+	enable_gart_translations();
 
 	/*
 	 * Try to workaround a bug (thanks to BenH):
-- 
GitLab


From a21ca2cac582886a3e95c8bb84ff7c52d4d15e54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 09:58:57 +0200
Subject: [PATCH 1628/2198] perf_counter: Separate out attr->type from
 attr->config

Counter type is a frequently used value and we do a lot of
bit juggling by encoding and decoding it from attr->config.

Clean this up by creating a separate attr->type field.

Also clean up the various similarly complex user-space bits
all around counter attribute management.

The net improvement is significant, and it will be easier
to add a new major type (which is what triggered this cleanup).

(This changes the ABI, all tools are adapted.)
(PowerPC build-tested.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-record.c   | 105 +++++++--------
 Documentation/perf_counter/builtin-stat.c     |  76 +++++------
 Documentation/perf_counter/builtin-top.c      |  67 ++++------
 Documentation/perf_counter/perf.h             |   2 -
 .../perf_counter/util/parse-events.c          | 120 ++++++++++--------
 .../perf_counter/util/parse-events.h          |   7 +-
 arch/powerpc/kernel/perf_counter.c            |   6 +-
 arch/x86/kernel/cpu/perf_counter.c            |   8 +-
 include/linux/perf_counter.h                  |  65 ++--------
 kernel/perf_counter.c                         |  14 +-
 10 files changed, 196 insertions(+), 274 deletions(-)

diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index c22ea0c7472a3..130fd88266bb3 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -20,10 +20,10 @@
 #define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1)
 #define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask))
 
-static long			default_interval = 100000;
-static long			event_count[MAX_COUNTERS];
-
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+
+static long			default_interval		= 100000;
+
 static int			nr_cpus				= 0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 128;
@@ -38,22 +38,44 @@ static int			inherit				= 1;
 static int			force				= 0;
 static int			append_file			= 0;
 
-const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
+static long			samples;
+static struct timeval		last_read;
+static struct timeval		this_read;
+
+static __u64			bytes_written;
+
+static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS];
+
+static int			nr_poll;
+static int			nr_cpu;
+
+struct mmap_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	__u64				start;
+	__u64				len;
+	__u64				pgoff;
+	char				filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header	header;
+	__u32				pid;
+	__u32				tid;
+	char				comm[16];
 };
 
+
 struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
 };
 
+static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
+
 static unsigned int mmap_read_head(struct mmap_data *md)
 {
 	struct perf_counter_mmap_page *pc = md->base;
@@ -65,11 +87,6 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 	return head;
 }
 
-static long samples;
-static struct timeval last_read, this_read;
-
-static __u64 bytes_written;
-
 static void mmap_read(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
@@ -157,29 +174,6 @@ static void sig_handler(int sig)
 	done = 1;
 }
 
-static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-
-static int nr_poll;
-static int nr_cpu;
-
-struct mmap_event {
-	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	__u64				start;
-	__u64				len;
-	__u64				pgoff;
-	char				filename[PATH_MAX];
-};
-
-struct comm_event {
-	struct perf_event_header	header;
-	__u32				pid;
-	__u32				tid;
-	char				comm[16];
-};
-
 static void pid_synthesize_comm_event(pid_t pid, int full)
 {
 	struct comm_event comm_ev;
@@ -341,24 +335,21 @@ static int group_fd;
 
 static void create_counter(int counter, int cpu, pid_t pid)
 {
-	struct perf_counter_attr attr;
+	struct perf_counter_attr *attr = attrs + counter;
 	int track = 1;
 
-	memset(&attr, 0, sizeof(attr));
-	attr.config		= event_id[counter];
-	attr.sample_period	= event_count[counter];
-	attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
 	if (freq) {
-		attr.freq		= 1;
-		attr.sample_freq	= freq;
+		attr->freq		= 1;
+		attr->sample_freq	= freq;
 	}
-	attr.mmap		= track;
-	attr.comm		= track;
-	attr.inherit		= (cpu < 0) && inherit;
+	attr->mmap		= track;
+	attr->comm		= track;
+	attr->inherit		= (cpu < 0) && inherit;
 
 	track = 0; /* only the first counter needs these */
 
-	fd[nr_cpu][counter] = sys_perf_counter_open(&attr, pid, cpu, group_fd, 0);
+	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
 
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
@@ -542,16 +533,14 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
 
-	if (!nr_counters) {
+	if (!nr_counters)
 		nr_counters = 1;
-		event_id[0] = 0;
-	}
 
 	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
+		if (attrs[counter].sample_period)
 			continue;
 
-		event_count[counter] = default_interval;
+		attrs[counter].sample_period = default_interval;
 	}
 
 	return __cmd_record(argc, argv);
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 4fc0d80440e73..9711e5524233b 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -44,23 +44,22 @@
 
 #include <sys/prctl.h>
 
-static int			system_wide			=  0;
-static int			inherit				=  1;
+static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
-static __u64			default_event_id[MAX_COUNTERS]	= {
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK		},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS	},
 
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES		},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES	},
 };
 
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
+static int			system_wide			=  0;
+static int			inherit				=  1;
+
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int			target_pid			= -1;
@@ -86,22 +85,16 @@ static __u64			walltime_nsecs;
 
 static void create_perfstat_counter(int counter)
 {
-	struct perf_counter_attr attr;
-
-	memset(&attr, 0, sizeof(attr));
-	attr.config		= event_id[counter];
-	attr.sample_type	= 0;
-	attr.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL;
-	attr.exclude_user   = event_mask[counter] & EVENT_MASK_USER;
+	struct perf_counter_attr *attr = attrs + counter;
 
 	if (scale)
-		attr.read_format	= PERF_FORMAT_TOTAL_TIME_ENABLED |
-					  PERF_FORMAT_TOTAL_TIME_RUNNING;
+		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
+				    PERF_FORMAT_TOTAL_TIME_RUNNING;
 
 	if (system_wide) {
 		int cpu;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
-			fd[cpu][counter] = sys_perf_counter_open(&attr, -1, cpu, -1, 0);
+			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
 			if (fd[cpu][counter] < 0) {
 				printf("perfstat error: syscall returned with %d (%s)\n",
 						fd[cpu][counter], strerror(errno));
@@ -109,10 +102,10 @@ static void create_perfstat_counter(int counter)
 			}
 		}
 	} else {
-		attr.inherit	= inherit;
-		attr.disabled	= 1;
+		attr->inherit	= inherit;
+		attr->disabled	= 1;
 
-		fd[0][counter] = sys_perf_counter_open(&attr, 0, -1, -1, 0);
+		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
 		if (fd[0][counter] < 0) {
 			printf("perfstat error: syscall returned with %d (%s)\n",
 					fd[0][counter], strerror(errno));
@@ -126,9 +119,13 @@ static void create_perfstat_counter(int counter)
  */
 static inline int nsec_counter(int counter)
 {
-	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK))
+	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
+		return 0;
+
+	if (attrs[counter].config == PERF_COUNT_CPU_CLOCK)
 		return 1;
-	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+
+	if (attrs[counter].config == PERF_COUNT_TASK_CLOCK)
 		return 1;
 
 	return 0;
@@ -177,7 +174,8 @@ static void read_counter(int counter)
 	/*
 	 * Save the full runtime - to allow normalization during printout:
 	 */
-	if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK))
+	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
 		runtime_nsecs = count[0];
 }
 
@@ -203,8 +201,8 @@ static void print_counter(int counter)
 
 		fprintf(stderr, " %14.6f  %-20s",
 			msecs, event_name(counter));
-		if (event_id[counter] ==
-				EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) {
+		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
+			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
 
 			fprintf(stderr, " # %11.3f CPU utilization factor",
 				(double)count[0] / (double)walltime_nsecs);
@@ -300,8 +298,6 @@ static char events_help_msg[EVENTS_HELP_MAX];
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
 		     events_help_msg, parse_events),
-	OPT_INTEGER('c', "count", &default_interval,
-		    "event period to sample"),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
@@ -315,27 +311,19 @@ static const struct option options[] = {
 
 int cmd_stat(int argc, const char **argv, const char *prefix)
 {
-	int counter;
-
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	create_events_help(events_help_msg);
-	memcpy(event_id, default_event_id, sizeof(default_event_id));
+
+	memcpy(attrs, default_attrs, sizeof(attrs));
 
 	argc = parse_options(argc, argv, options, stat_usage, 0);
 	if (!argc)
 		usage_with_options(stat_usage, options);
 
-	if (!nr_counters) {
+	if (!nr_counters)
 		nr_counters = 8;
-	}
-
-	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
-			continue;
 
-		event_count[counter] = default_interval;
-	}
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
 	assert(nr_cpus <= MAX_NR_CPUS);
 	assert(nr_cpus >= 0);
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index b2f480b5a1348..98a6d53e17b3c 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -48,22 +48,11 @@
 #include <linux/unistd.h>
 #include <linux/types.h>
 
-static int			system_wide			=  0;
+static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
-static __u64			default_event_id[MAX_COUNTERS]		= {
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),
-	EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),
+static int			system_wide			=  0;
 
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),
-	EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),
-};
-static int			default_interval = 100000;
-static int			event_count[MAX_COUNTERS];
-static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
+static int			default_interval		= 100000;
 
 static __u64			count_filter			=  5;
 static int			print_entries			= 15;
@@ -85,15 +74,6 @@ static int			delay_secs			=  2;
 static int			zero;
 static int			dump_symtab;
 
-static const unsigned int default_count[] = {
-	1000000,
-	1000000,
-	  10000,
-	  10000,
-	1000000,
-	  10000,
-};
-
 /*
  * Symbols
  */
@@ -112,7 +92,7 @@ struct sym_entry {
 
 struct sym_entry		*sym_filter_entry;
 
-struct dso *kernel_dso;
+struct dso			*kernel_dso;
 
 /*
  * Symbols will be added here in record_ip and will get out
@@ -213,7 +193,7 @@ static void print_sym_table(void)
 		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
 
 	if (nr_counters == 1) {
-		printf("%d", event_count[0]);
+		printf("%Ld", attrs[0].sample_period);
 		if (freq)
 			printf("Hz ");
 		else
@@ -421,10 +401,10 @@ static void process_event(uint64_t ip, int counter)
 }
 
 struct mmap_data {
-	int counter;
-	void *base;
-	unsigned int mask;
-	unsigned int prev;
+	int			counter;
+	void			*base;
+	unsigned int		mask;
+	unsigned int		prev;
 };
 
 static unsigned int mmap_read_head(struct mmap_data *md)
@@ -539,7 +519,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
 static int __cmd_top(void)
 {
-	struct perf_counter_attr attr;
+	struct perf_counter_attr *attr;
 	pthread_t thread;
 	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
@@ -553,13 +533,12 @@ static int __cmd_top(void)
 			if (target_pid == -1 && profile_cpu == -1)
 				cpu = i;
 
-			memset(&attr, 0, sizeof(attr));
-			attr.config		= event_id[counter];
-			attr.sample_period	= event_count[counter];
-			attr.sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr.freq		= freq;
+			attr = attrs + counter;
 
-			fd[i][counter] = sys_perf_counter_open(&attr, target_pid, cpu, group_fd, 0);
+			attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+			attr->freq		= freq;
+
+			fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
 			if (fd[i][counter] < 0) {
 				int err = errno;
 
@@ -670,7 +649,6 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 	page_size = sysconf(_SC_PAGE_SIZE);
 
 	create_events_help(events_help_msg);
-	memcpy(event_id, default_event_id, sizeof(default_event_id));
 
 	argc = parse_options(argc, argv, options, top_usage, 0);
 	if (argc)
@@ -688,19 +666,22 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 		profile_cpu = -1;
 	}
 
-	if (!nr_counters) {
+	if (!nr_counters)
 		nr_counters = 1;
-		event_id[0] = 0;
-	}
 
 	if (delay_secs < 1)
 		delay_secs = 1;
 
+	parse_symbols();
+
+	/*
+	 * Fill in the ones not specifically initialized via -c:
+	 */
 	for (counter = 0; counter < nr_counters; counter++) {
-		if (event_count[counter])
+		if (attrs[counter].sample_period)
 			continue;
 
-		event_count[counter] = default_interval;
+		attrs[counter].sample_period = default_interval;
 	}
 
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -710,7 +691,5 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 	if (target_pid != -1 || profile_cpu != -1)
 		nr_cpus = 1;
 
-	parse_symbols();
-
 	return __cmd_top();
 }
diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h
index 10622a48b408e..af0a5046d743a 100644
--- a/Documentation/perf_counter/perf.h
+++ b/Documentation/perf_counter/perf.h
@@ -64,6 +64,4 @@ sys_perf_counter_open(struct perf_counter_attr *attr_uptr,
 #define MAX_COUNTERS			256
 #define MAX_NR_CPUS			256
 
-#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id))
-
 #endif
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 2fdfd1d923f2a..eb56bd996573d 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -6,37 +6,39 @@
 #include "exec_cmd.h"
 #include "string.h"
 
-int nr_counters;
+int					nr_counters;
 
-__u64			event_id[MAX_COUNTERS]		= { };
-int			event_mask[MAX_COUNTERS];
+struct perf_counter_attr		attrs[MAX_COUNTERS];
 
 struct event_symbol {
-	__u64 event;
-	char *symbol;
+	__u8	type;
+	__u64	config;
+	char	*symbol;
 };
 
+#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
+
 static struct event_symbol event_symbols[] = {
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cpu-cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES),		"cycles",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS),		"instructions",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES),		"cache-references",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES),		"cache-misses",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS),	"branches",		},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES),		"branch-misses",	},
-	{EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES),		"bus-cycles",		},
-
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK),			"cpu-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK),		"task-clock",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"page-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS),		"faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN),		"minor-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ),		"major-faults",		},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"context-switches",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES),		"cs",			},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"cpu-migrations",	},
-	{EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS),		"migrations",		},
+  { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
+  { C(HARDWARE, CPU_CYCLES),		"cycles",		},
+  { C(HARDWARE, INSTRUCTIONS),		"instructions",		},
+  { C(HARDWARE, CACHE_REFERENCES),	"cache-references",	},
+  { C(HARDWARE, CACHE_MISSES),		"cache-misses",		},
+  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branch-instructions",	},
+  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branches",		},
+  { C(HARDWARE, BRANCH_MISSES),		"branch-misses",	},
+  { C(HARDWARE, BUS_CYCLES),		"bus-cycles",		},
+
+  { C(SOFTWARE, CPU_CLOCK),		"cpu-clock",		},
+  { C(SOFTWARE, TASK_CLOCK),		"task-clock",		},
+  { C(SOFTWARE, PAGE_FAULTS),		"page-faults",		},
+  { C(SOFTWARE, PAGE_FAULTS),		"faults",		},
+  { C(SOFTWARE, PAGE_FAULTS_MIN),	"minor-faults",		},
+  { C(SOFTWARE, PAGE_FAULTS_MAJ),	"major-faults",		},
+  { C(SOFTWARE, CONTEXT_SWITCHES),	"context-switches",	},
+  { C(SOFTWARE, CONTEXT_SWITCHES),	"cs",			},
+  { C(SOFTWARE, CPU_MIGRATIONS),	"cpu-migrations",	},
+  { C(SOFTWARE, CPU_MIGRATIONS),	"migrations",		},
 };
 
 #define __PERF_COUNTER_FIELD(config, name) \
@@ -67,27 +69,26 @@ static char *sw_event_names[] = {
 	"major faults",
 };
 
-char *event_name(int ctr)
+char *event_name(int counter)
 {
-	__u64 config = event_id[ctr];
-	int type = PERF_COUNTER_TYPE(config);
-	int id = PERF_COUNTER_ID(config);
+	__u64 config = attrs[counter].config;
+	int type = attrs[counter].type;
 	static char buf[32];
 
-	if (PERF_COUNTER_RAW(config)) {
-		sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config));
+	if (attrs[counter].type == PERF_TYPE_RAW) {
+		sprintf(buf, "raw 0x%llx", config);
 		return buf;
 	}
 
 	switch (type) {
 	case PERF_TYPE_HARDWARE:
-		if (id < PERF_HW_EVENTS_MAX)
-			return hw_event_names[id];
+		if (config < PERF_HW_EVENTS_MAX)
+			return hw_event_names[config];
 		return "unknown-hardware";
 
 	case PERF_TYPE_SOFTWARE:
-		if (id < PERF_SW_EVENTS_MAX)
-			return sw_event_names[id];
+		if (config < PERF_SW_EVENTS_MAX)
+			return sw_event_names[config];
 		return "unknown-software";
 
 	default:
@@ -101,15 +102,19 @@ char *event_name(int ctr)
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static __u64 match_event_symbols(const char *str)
+static int match_event_symbols(const char *str, struct perf_counter_attr *attr)
 {
 	__u64 config, id;
 	int type;
 	unsigned int i;
 	const char *sep, *pstr;
 
-	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0)
-		return config | PERF_COUNTER_RAW_MASK;
+	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) {
+		attr->type = PERF_TYPE_RAW;
+		attr->config = config;
+
+		return 0;
+	}
 
 	pstr = str;
 	sep = strchr(pstr, ':');
@@ -121,35 +126,45 @@ static __u64 match_event_symbols(const char *str)
 		if (sep) {
 			pstr = sep + 1;
 			if (strchr(pstr, 'k'))
-				event_mask[nr_counters] |= EVENT_MASK_USER;
+				attr->exclude_user = 1;
 			if (strchr(pstr, 'u'))
-				event_mask[nr_counters] |= EVENT_MASK_KERNEL;
+				attr->exclude_kernel = 1;
 		}
-		return EID(type, id);
+		attr->type = type;
+		attr->config = id;
+
+		return 0;
 	}
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		if (!strncmp(str, event_symbols[i].symbol,
-			     strlen(event_symbols[i].symbol)))
-			return event_symbols[i].event;
+			     strlen(event_symbols[i].symbol))) {
+
+			attr->type = event_symbols[i].type;
+			attr->config = event_symbols[i].config;
+
+			return 0;
+		}
 	}
 
-	return ~0ULL;
+	return -EINVAL;
 }
 
 int parse_events(const struct option *opt, const char *str, int unset)
 {
-	__u64 config;
+	struct perf_counter_attr attr;
+	int ret;
 
+	memset(&attr, 0, sizeof(attr));
 again:
 	if (nr_counters == MAX_COUNTERS)
 		return -1;
 
-	config = match_event_symbols(str);
-	if (config == ~0ULL)
-		return -1;
+	ret = match_event_symbols(str, &attr);
+	if (ret < 0)
+		return ret;
 
-	event_id[nr_counters] = config;
+	attrs[nr_counters] = attr;
 	nr_counters++;
 
 	str = strstr(str, ",");
@@ -168,7 +183,6 @@ void create_events_help(char *events_help_msg)
 {
 	unsigned int i;
 	char *str;
-	__u64 e;
 
 	str = events_help_msg;
 
@@ -178,9 +192,8 @@ void create_events_help(char *events_help_msg)
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
 		int type, id;
 
-		e = event_symbols[i].event;
-		type = PERF_COUNTER_TYPE(e);
-		id = PERF_COUNTER_ID(e);
+		type = event_symbols[i].type;
+		id = event_symbols[i].config;
 
 		if (i)
 			str += sprintf(str, "|");
@@ -191,4 +204,3 @@ void create_events_help(char *events_help_msg)
 
 	str += sprintf(str, "|rNNN]");
 }
-
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
index 0da306bb90281..542971c495bd8 100644
--- a/Documentation/perf_counter/util/parse-events.h
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -3,12 +3,9 @@
  * Parse symbolic events/counts passed in as options:
  */
 
-extern int nr_counters;
-extern __u64			event_id[MAX_COUNTERS];
-extern int			event_mask[MAX_COUNTERS];
+extern int			nr_counters;
 
-#define EVENT_MASK_KERNEL	1
-#define EVENT_MASK_USER		2
+extern struct perf_counter_attr attrs[MAX_COUNTERS];
 
 extern char *event_name(int ctr);
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 232b00a36f793..4786ad9a28877 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -867,13 +867,13 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (!perf_event_raw(&counter->attr)) {
-		ev = perf_event_id(&counter->attr);
+	if (counter->attr.type != PERF_TYPE_RAW) {
+		ev = counter->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
 	} else {
-		ev = perf_event_config(&counter->attr);
+		ev = counter->attr.config;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 8f53f3a7da29e..430e048f2854b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -292,15 +292,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
-	if (perf_event_raw(attr)) {
-		hwc->config |= x86_pmu.raw_event(perf_event_config(attr));
+	if (attr->type == PERF_TYPE_RAW) {
+		hwc->config |= x86_pmu.raw_event(attr->config);
 	} else {
-		if (perf_event_id(attr) >= x86_pmu.max_events)
+		if (attr->config >= x86_pmu.max_events)
 			return -EINVAL;
 		/*
 		 * The generic map:
 		 */
-		hwc->config |= x86_pmu.event_map(perf_event_id(attr));
+		hwc->config |= x86_pmu.event_map(attr->config);
 	}
 
 	counter->destroy = hw_perf_counter_destroy;
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 4f9d39ecdc05e..f794c69b34c99 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -73,26 +73,6 @@ enum sw_event_ids {
 	PERF_SW_EVENTS_MAX		= 7,
 };
 
-#define __PERF_COUNTER_MASK(name)			\
-	(((1ULL << PERF_COUNTER_##name##_BITS) - 1) <<	\
-	 PERF_COUNTER_##name##_SHIFT)
-
-#define PERF_COUNTER_RAW_BITS		1
-#define PERF_COUNTER_RAW_SHIFT		63
-#define PERF_COUNTER_RAW_MASK		__PERF_COUNTER_MASK(RAW)
-
-#define PERF_COUNTER_CONFIG_BITS	63
-#define PERF_COUNTER_CONFIG_SHIFT	0
-#define PERF_COUNTER_CONFIG_MASK	__PERF_COUNTER_MASK(CONFIG)
-
-#define PERF_COUNTER_TYPE_BITS		7
-#define PERF_COUNTER_TYPE_SHIFT		56
-#define PERF_COUNTER_TYPE_MASK		__PERF_COUNTER_MASK(TYPE)
-
-#define PERF_COUNTER_EVENT_BITS		56
-#define PERF_COUNTER_EVENT_SHIFT	0
-#define PERF_COUNTER_EVENT_MASK		__PERF_COUNTER_MASK(EVENT)
-
 /*
  * Bits that can be set in attr.sample_type to request information
  * in the overflow packets.
@@ -125,10 +105,13 @@ enum perf_counter_read_format {
  */
 struct perf_counter_attr {
 	/*
-	 * The MSB of the config word signifies if the rest contains cpu
-	 * specific (raw) counter configuration data, if unset, the next
-	 * 7 bits are an event type and the rest of the bits are the event
-	 * identifier.
+	 * Major type: hardware/software/tracepoint/etc.
+	 */
+	__u32			type;
+	__u32			__reserved_1;
+
+	/*
+	 * Type specific configuration information.
 	 */
 	__u64			config;
 
@@ -152,12 +135,11 @@ struct perf_counter_attr {
 				comm	       :  1, /* include comm data     */
 				freq           :  1, /* use freq, not period  */
 
-				__reserved_1   : 53;
+				__reserved_2   : 53;
 
 	__u32			wakeup_events;	/* wakeup every n events */
-	__u32			__reserved_2;
+	__u32			__reserved_3;
 
-	__u64			__reserved_3;
 	__u64			__reserved_4;
 };
 
@@ -278,8 +260,8 @@ enum perf_event_type {
 
 	/*
 	 * struct {
-	 * 	struct perf_event_header	header;
-	 * 	u32				pid, ppid;
+	 *	struct perf_event_header	header;
+	 *	u32				pid, ppid;
 	 * };
 	 */
 	PERF_EVENT_FORK			= 7,
@@ -331,27 +313,6 @@ enum perf_event_type {
 
 struct task_struct;
 
-static inline u64 perf_event_raw(struct perf_counter_attr *attr)
-{
-	return attr->config & PERF_COUNTER_RAW_MASK;
-}
-
-static inline u64 perf_event_config(struct perf_counter_attr *attr)
-{
-	return attr->config & PERF_COUNTER_CONFIG_MASK;
-}
-
-static inline u64 perf_event_type(struct perf_counter_attr *attr)
-{
-	return (attr->config & PERF_COUNTER_TYPE_MASK) >>
-		PERF_COUNTER_TYPE_SHIFT;
-}
-
-static inline u64 perf_event_id(struct perf_counter_attr *attr)
-{
-	return attr->config & PERF_COUNTER_EVENT_MASK;
-}
-
 /**
  * struct hw_perf_counter - performance counter hardware details:
  */
@@ -616,8 +577,8 @@ extern int perf_counter_overflow(struct perf_counter *counter,
  */
 static inline int is_software_counter(struct perf_counter *counter)
 {
-	return !perf_event_raw(&counter->attr) &&
-		perf_event_type(&counter->attr) != PERF_TYPE_HARDWARE;
+	return (counter->attr.type != PERF_TYPE_RAW) &&
+		(counter->attr.type != PERF_TYPE_HARDWARE);
 }
 
 extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64);
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 47c92fb927f29..75ae76796df1e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3091,14 +3091,12 @@ static int perf_swcounter_match(struct perf_counter *counter,
 				enum perf_event_types type,
 				u32 event, struct pt_regs *regs)
 {
-	u64 event_config;
-
-	event_config = ((u64) type << PERF_COUNTER_TYPE_SHIFT) | event;
-
 	if (!perf_swcounter_is_counting(counter))
 		return 0;
 
-	if (counter->attr.config != event_config)
+	if (counter->attr.type != type)
+		return 0;
+	if (counter->attr.config != event)
 		return 0;
 
 	if (regs) {
@@ -3403,7 +3401,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * to be kernel events, and page faults are never hypervisor
 	 * events.
 	 */
-	switch (perf_event_id(&counter->attr)) {
+	switch (counter->attr.config) {
 	case PERF_COUNT_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
@@ -3496,12 +3494,12 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP))
 		goto done;
 
-	if (perf_event_raw(attr)) {
+	if (attr->type == PERF_TYPE_RAW) {
 		pmu = hw_perf_counter_init(counter);
 		goto done;
 	}
 
-	switch (perf_event_type(attr)) {
+	switch (attr->type) {
 	case PERF_TYPE_HARDWARE:
 		pmu = hw_perf_counter_init(counter);
 		break;
-- 
GitLab


From 8326f44da090d6d304d29b9fdc7fb3e20889e329 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 5 Jun 2009 20:22:46 +0200
Subject: [PATCH 1629/2198] perf_counter: Implement generalized cache event
 types

Extend generic event enumeration with the PERF_TYPE_HW_CACHE
method.

This is a 3-dimensional space:

       { L1-D, L1-I, L2, ITLB, DTLB, BPU } x
       { load, store, prefetch } x
       { accesses, misses }

User-space passes in the 3 coordinates and the kernel provides
a counter. (if the hardware supports that type and if the
combination makes sense.)

Combinations that make no sense produce a -EINVAL.
Combinations that are not supported by the hardware produce -ENOTSUP.

Extend the tools to deal with this, and rewrite the event symbol
parsing code with various popular aliases for the units and
access methods above. So 'l1-cache-miss' and 'l1d-read-ops' are
both valid aliases.

( x86 is supported for now, with the Nehalem event table filled in,
  and with Core2 and Atom having placeholder tables. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/util/parse-events.c          | 104 ++++++++-
 arch/x86/kernel/cpu/perf_counter.c            | 201 +++++++++++++++++-
 include/linux/perf_counter.h                  |  34 +++
 kernel/perf_counter.c                         |   1 +
 4 files changed, 329 insertions(+), 11 deletions(-)

diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index eb56bd996573d..de9a77c471517 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -6,6 +6,8 @@
 #include "exec_cmd.h"
 #include "string.h"
 
+extern char *strcasestr(const char *haystack, const char *needle);
+
 int					nr_counters;
 
 struct perf_counter_attr		attrs[MAX_COUNTERS];
@@ -17,6 +19,7 @@ struct event_symbol {
 };
 
 #define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y
+#define CR(x, y) .type = PERF_TYPE_##x, .config = y
 
 static struct event_symbol event_symbols[] = {
   { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
@@ -69,6 +72,28 @@ static char *sw_event_names[] = {
 	"major faults",
 };
 
+#define MAX_ALIASES 8
+
+static char *hw_cache [][MAX_ALIASES] = {
+	{ "l1-d" ,	"l1d" ,	"l1", "l1-data-cache"			},
+	{ "l1-i" ,	"l1i" ,	"l1-instruction-cache"		},
+	{ "l2"  , },
+	{ "dtlb", },
+	{ "itlb", },
+	{ "bpu" , "btb", "branch-cache", NULL },
+};
+
+static char *hw_cache_op [][MAX_ALIASES] = {
+	{ "read"	, "load" },
+	{ "write"	, "store" },
+	{ "prefetch"	, "speculative-read", "speculative-load" },
+};
+
+static char *hw_cache_result [][MAX_ALIASES] = {
+	{ "access", "ops" },
+	{ "miss", },
+};
+
 char *event_name(int counter)
 {
 	__u64 config = attrs[counter].config;
@@ -86,6 +111,30 @@ char *event_name(int counter)
 			return hw_event_names[config];
 		return "unknown-hardware";
 
+	case PERF_TYPE_HW_CACHE: {
+		__u8 cache_type, cache_op, cache_result;
+		static char name[100];
+
+		cache_type   = (config >>  0) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_MAX)
+			return "unknown-ext-hardware-cache-type";
+
+		cache_op     = (config >>  8) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX)
+			return "unknown-ext-hardware-cache-op-type";
+
+		cache_result = (config >> 16) & 0xff;
+		if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX)
+			return "unknown-ext-hardware-cache-result-type";
+
+		sprintf(name, "%s:%s:%s",
+			hw_cache[cache_type][0],
+			hw_cache_op[cache_op][0],
+			hw_cache_result[cache_result][0]);
+
+		return name;
+	}
+
 	case PERF_TYPE_SOFTWARE:
 		if (config < PERF_SW_EVENTS_MAX)
 			return sw_event_names[config];
@@ -98,11 +147,60 @@ char *event_name(int counter)
 	return "unknown";
 }
 
+static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
+{
+	int i, j;
+
+	for (i = 0; i < size; i++) {
+		for (j = 0; j < MAX_ALIASES; j++) {
+			if (!names[i][j])
+				break;
+			if (strcasestr(str, names[i][j]))
+				return i;
+		}
+	}
+
+	return 0;
+}
+
+static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
+{
+	__u8 cache_type = -1, cache_op = 0, cache_result = 0;
+
+	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
+	/*
+	 * No fallback - if we cannot get a clear cache type
+	 * then bail out:
+	 */
+	if (cache_type == -1)
+		return -EINVAL;
+
+	cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX);
+	/*
+	 * Fall back to reads:
+	 */
+	if (cache_type == -1)
+		cache_type = PERF_COUNT_HW_CACHE_OP_READ;
+
+	cache_result = parse_aliases(str, hw_cache_result,
+					PERF_COUNT_HW_CACHE_RESULT_MAX);
+	/*
+	 * Fall back to accesses:
+	 */
+	if (cache_result == -1)
+		cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS;
+
+	attr->config = cache_type | (cache_op << 8) | (cache_result << 16);
+	attr->type = PERF_TYPE_HW_CACHE;
+
+	return 0;
+}
+
 /*
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static int match_event_symbols(const char *str, struct perf_counter_attr *attr)
+static int parse_event_symbols(const char *str, struct perf_counter_attr *attr)
 {
 	__u64 config, id;
 	int type;
@@ -147,7 +245,7 @@ static int match_event_symbols(const char *str, struct perf_counter_attr *attr)
 		}
 	}
 
-	return -EINVAL;
+	return parse_generic_hw_symbols(str, attr);
 }
 
 int parse_events(const struct option *opt, const char *str, int unset)
@@ -160,7 +258,7 @@ int parse_events(const struct option *opt, const char *str, int unset)
 	if (nr_counters == MAX_COUNTERS)
 		return -1;
 
-	ret = match_event_symbols(str, &attr);
+	ret = parse_event_symbols(str, &attr);
 	if (ret < 0)
 		return ret;
 
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 430e048f2854b..e86679fa52159 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -83,6 +83,128 @@ static u64 intel_pmu_event_map(int event)
 	return intel_perfmon_event_map[event];
 }
 
+/*
+ * Generalized hw caching related event table, filled
+ * in on a per model basis. A value of 0 means
+ * 'not supported', -1 means 'event makes no sense on
+ * this CPU', any other value means the raw event
+ * ID.
+ */
+
+#define C(x) PERF_COUNT_HW_CACHE_##x
+
+static u64 __read_mostly hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX];
+
+static const u64 nehalem_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */
+		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
+		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */
+		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
+		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x0108, /* DTLB_LOAD_MISSES.ANY         */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */
+		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
+		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+static const u64 core2_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	/* To be filled in */
+};
+
+static const u64 atom_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+	/* To be filled in */
+};
+
 static u64 intel_pmu_raw_event(u64 event)
 {
 #define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
@@ -246,6 +368,39 @@ static inline int x86_pmu_initialized(void)
 	return x86_pmu.handle_irq != NULL;
 }
 
+static inline int
+set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr)
+{
+	unsigned int cache_type, cache_op, cache_result;
+	u64 config, val;
+
+	config = attr->config;
+
+	cache_type = (config >>  0) & 0xff;
+	if (cache_type >= PERF_COUNT_HW_CACHE_MAX)
+		return -EINVAL;
+
+	cache_op = (config >>  8) & 0xff;
+	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX)
+		return -EINVAL;
+
+	cache_result = (config >> 16) & 0xff;
+	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	val = hw_cache_event_ids[cache_type][cache_op][cache_result];
+
+	if (val == 0)
+		return -ENOENT;
+
+	if (val == -1)
+		return -EINVAL;
+
+	hwc->config |= val;
+
+	return 0;
+}
+
 /*
  * Setup the hardware configuration for a given attr_type
  */
@@ -288,22 +443,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 		hwc->sample_period = x86_pmu.max_period;
 
 	atomic64_set(&hwc->period_left, hwc->sample_period);
+	counter->destroy = hw_perf_counter_destroy;
 
 	/*
 	 * Raw event type provide the config in the event structure
 	 */
 	if (attr->type == PERF_TYPE_RAW) {
 		hwc->config |= x86_pmu.raw_event(attr->config);
-	} else {
-		if (attr->config >= x86_pmu.max_events)
-			return -EINVAL;
-		/*
-		 * The generic map:
-		 */
-		hwc->config |= x86_pmu.event_map(attr->config);
+		return 0;
 	}
 
-	counter->destroy = hw_perf_counter_destroy;
+	if (attr->type == PERF_TYPE_HW_CACHE)
+		return set_ext_hw_attr(hwc, attr);
+
+	if (attr->config >= x86_pmu.max_events)
+		return -EINVAL;
+	/*
+	 * The generic map:
+	 */
+	hwc->config |= x86_pmu.event_map(attr->config);
 
 	return 0;
 }
@@ -989,6 +1147,33 @@ static int intel_pmu_init(void)
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
+	/*
+	 * Nehalem:
+	 */
+	switch (boot_cpu_data.x86_model) {
+	case 17:
+		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Core2 event tables\n");
+		break;
+	default:
+	case 26:
+		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Nehalem/Corei7 event tables\n");
+		break;
+	case 28:
+		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
+		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
+			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+
+		pr_info("... installed Atom event tables\n");
+		break;
+	}
 	return 0;
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index f794c69b34c99..3586df840f693 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -28,6 +28,7 @@ enum perf_event_types {
 	PERF_TYPE_HARDWARE		= 0,
 	PERF_TYPE_SOFTWARE		= 1,
 	PERF_TYPE_TRACEPOINT		= 2,
+	PERF_TYPE_HW_CACHE		= 3,
 
 	/*
 	 * available TYPE space, raw is the max value.
@@ -55,6 +56,39 @@ enum attr_ids {
 	PERF_HW_EVENTS_MAX		= 7,
 };
 
+/*
+ * Generalized hardware cache counters:
+ *
+ *       { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x
+ *       { read, write, prefetch } x
+ *       { accesses, misses }
+ */
+enum hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D,
+	PERF_COUNT_HW_CACHE_L1I,
+	PERF_COUNT_HW_CACHE_L2,
+	PERF_COUNT_HW_CACHE_DTLB,
+	PERF_COUNT_HW_CACHE_ITLB,
+	PERF_COUNT_HW_CACHE_BPU,
+
+	PERF_COUNT_HW_CACHE_MAX,
+};
+
+enum hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ,
+	PERF_COUNT_HW_CACHE_OP_WRITE,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH,
+
+	PERF_COUNT_HW_CACHE_OP_MAX,
+};
+
+enum hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS,
+	PERF_COUNT_HW_CACHE_RESULT_MISS,
+
+	PERF_COUNT_HW_CACHE_RESULT_MAX,
+};
+
 /*
  * Special "software" counters provided by the kernel, even if the hardware
  * does not support performance counters. These counters measure various
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 75ae76796df1e..5eacaaf3f9cdc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3501,6 +3501,7 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 
 	switch (attr->type) {
 	case PERF_TYPE_HARDWARE:
+	case PERF_TYPE_HW_CACHE:
 		pmu = hw_perf_counter_init(counter);
 		break;
 
-- 
GitLab


From 86847b62f0781ccc97a79936c9ed9dc818cff67b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Jun 2009 12:24:17 +0200
Subject: [PATCH 1630/2198] perf_counter tools: Add 'perf list' to list
 available events

perf list: List all the available event types which can be used in
-e (--event) options.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Makefile           |  1 +
 Documentation/perf_counter/builtin-list.c     | 20 ++++++++
 Documentation/perf_counter/builtin-record.c   |  7 +--
 Documentation/perf_counter/builtin-stat.c     |  9 ++--
 Documentation/perf_counter/builtin-top.c      |  7 +--
 Documentation/perf_counter/builtin.h          |  2 +
 Documentation/perf_counter/perf.c             |  1 +
 .../perf_counter/util/parse-events.c          | 46 ++++++++++++-------
 .../perf_counter/util/parse-events.h          |  2 +-
 9 files changed, 61 insertions(+), 34 deletions(-)
 create mode 100644 Documentation/perf_counter/builtin-list.c

diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 5b99f04df812c..32c0bb21a3284 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -328,6 +328,7 @@ BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
+BUILTIN_OBJS += builtin-list.o
 
 PERFLIBS = $(LIB_FILE)
 EXTLIBS =
diff --git a/Documentation/perf_counter/builtin-list.c b/Documentation/perf_counter/builtin-list.c
new file mode 100644
index 0000000000000..fe60e37c96efc
--- /dev/null
+++ b/Documentation/perf_counter/builtin-list.c
@@ -0,0 +1,20 @@
+/*
+ * builtin-list.c
+ *
+ * Builtin list command: list all event types
+ *
+ * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
+ * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ */
+#include "builtin.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+int cmd_list(int argc, const char **argv, const char *prefix)
+{
+	print_events();
+	return 0;
+}
diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c
index 130fd88266bb3..aeab9c4b15e48 100644
--- a/Documentation/perf_counter/builtin-record.c
+++ b/Documentation/perf_counter/builtin-record.c
@@ -495,11 +495,10 @@ static const char * const record_usage[] = {
 	NULL
 };
 
-static char events_help_msg[EVENTS_HELP_MAX];
-
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help_msg, parse_events),
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "record events on existing pid"),
 	OPT_INTEGER('r', "realtime", &realtime_prio,
@@ -527,8 +526,6 @@ int cmd_record(int argc, const char **argv, const char *prefix)
 {
 	int counter;
 
-	create_events_help(events_help_msg);
-
 	argc = parse_options(argc, argv, options, record_usage, 0);
 	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c
index 9711e5524233b..2cbf5a189589d 100644
--- a/Documentation/perf_counter/builtin-stat.c
+++ b/Documentation/perf_counter/builtin-stat.c
@@ -293,18 +293,17 @@ static const char * const stat_usage[] = {
 	NULL
 };
 
-static char events_help_msg[EVENTS_HELP_MAX];
-
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help_msg, parse_events),
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
 	OPT_BOOLEAN('i', "inherit", &inherit,
 		    "child tasks inherit counters"),
 	OPT_INTEGER('p', "pid", &target_pid,
 		    "stat events on existing pid"),
 	OPT_BOOLEAN('a', "all-cpus", &system_wide,
 			    "system-wide collection from all CPUs"),
-	OPT_BOOLEAN('l', "scale", &scale,
+	OPT_BOOLEAN('S', "scale", &scale,
 			    "scale/normalize counters"),
 	OPT_END()
 };
@@ -313,8 +312,6 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 {
 	page_size = sysconf(_SC_PAGE_SIZE);
 
-	create_events_help(events_help_msg);
-
 	memcpy(attrs, default_attrs, sizeof(attrs));
 
 	argc = parse_options(argc, argv, options, stat_usage, 0);
diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c
index 98a6d53e17b3c..f2e7312f85c94 100644
--- a/Documentation/perf_counter/builtin-top.c
+++ b/Documentation/perf_counter/builtin-top.c
@@ -606,11 +606,10 @@ static const char * const top_usage[] = {
 	NULL
 };
 
-static char events_help_msg[EVENTS_HELP_MAX];
-
 static const struct option options[] = {
 	OPT_CALLBACK('e', "event", NULL, "event",
-		     events_help_msg, parse_events),
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
 	OPT_INTEGER('c', "count", &default_interval,
 		    "event period to sample"),
 	OPT_INTEGER('p', "pid", &target_pid,
@@ -648,8 +647,6 @@ int cmd_top(int argc, const char **argv, const char *prefix)
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 
-	create_events_help(events_help_msg);
-
 	argc = parse_options(argc, argv, options, top_usage, 0);
 	if (argc)
 		usage_with_options(top_usage, options);
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index 5bfea57d33fe1..e7de47da85815 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -20,4 +20,6 @@ extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
+
 #endif
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index ec7edb7fbe298..9ac75657a180f 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -258,6 +258,7 @@ static void handle_internal_command(int argc, const char **argv)
 	const char *cmd = argv[0];
 	static struct cmd_struct commands[] = {
 		{ "help", cmd_help, 0 },
+		{ "list", cmd_list, 0 },
 		{ "record", cmd_record, 0 },
 		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index de9a77c471517..150fbd2627147 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -274,31 +274,43 @@ int parse_events(const struct option *opt, const char *str, int unset)
 	return 0;
 }
 
+static const char * const event_type_descriptors[] = {
+	"",
+	"Hardware event",
+	"Software event",
+	"Tracepoint event",
+	"Hardware cache event",
+};
+
 /*
- * Create the help text for the event symbols:
+ * Print the help text for the event symbols:
  */
-void create_events_help(char *events_help_msg)
+void print_events(void)
 {
-	unsigned int i;
-	char *str;
+	struct event_symbol *syms = event_symbols;
+	unsigned int i, type, prev_type = -1;
 
-	str = events_help_msg;
+	fprintf(stderr, "\n");
+	fprintf(stderr, "List of pre-defined events (to be used in -e):\n");
 
-	str += sprintf(str,
-	"event name: [");
+	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
+		type = syms->type + 1;
+		if (type > ARRAY_SIZE(event_type_descriptors))
+			type = 0;
 
-	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) {
-		int type, id;
-
-		type = event_symbols[i].type;
-		id = event_symbols[i].config;
+		if (type != prev_type)
+			fprintf(stderr, "\n");
 
-		if (i)
-			str += sprintf(str, "|");
+		fprintf(stderr, "  %-30s [%s]\n", syms->symbol,
+			event_type_descriptors[type]);
 
-		str += sprintf(str, "%s",
-				event_symbols[i].symbol);
+		prev_type = type;
 	}
 
-	str += sprintf(str, "|rNNN]");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "  %-30s [raw hardware event descriptor]\n",
+		"rNNN");
+	fprintf(stderr, "\n");
+
+	exit(129);
 }
diff --git a/Documentation/perf_counter/util/parse-events.h b/Documentation/perf_counter/util/parse-events.h
index 542971c495bd8..e3d552908e600 100644
--- a/Documentation/perf_counter/util/parse-events.h
+++ b/Documentation/perf_counter/util/parse-events.h
@@ -13,5 +13,5 @@ extern int parse_events(const struct option *opt, const char *str, int unset);
 
 #define EVENTS_HELP_MAX (128*1024)
 
-extern void create_events_help(char *help_msg);
+extern void print_events(void);
 
-- 
GitLab


From 8faf3b547593bf6ea10df631e73204975273c4e0 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 13:58:12 +0200
Subject: [PATCH 1631/2198] perf_counter tools: Fix cache-event printout

Also standardize the cache printout (so that it can be pasted back
into the command) and sort out the aliases.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/util/parse-events.c          | 56 +++++++++----------
 1 file changed, 28 insertions(+), 28 deletions(-)

diff --git a/Documentation/perf_counter/util/parse-events.c b/Documentation/perf_counter/util/parse-events.c
index 150fbd2627147..e0820b4388aef 100644
--- a/Documentation/perf_counter/util/parse-events.c
+++ b/Documentation/perf_counter/util/parse-events.c
@@ -53,45 +53,45 @@ static struct event_symbol event_symbols[] = {
 #define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT)
 
 static char *hw_event_names[] = {
-	"CPU cycles",
+	"cycles",
 	"instructions",
-	"cache references",
-	"cache misses",
+	"cache-references",
+	"cache-misses",
 	"branches",
-	"branch misses",
-	"bus cycles",
+	"branch-misses",
+	"bus-cycles",
 };
 
 static char *sw_event_names[] = {
-	"cpu clock ticks",
-	"task clock ticks",
-	"pagefaults",
-	"context switches",
-	"CPU migrations",
-	"minor faults",
-	"major faults",
+	"cpu-clock-ticks",
+	"task-clock-ticks",
+	"page-faults",
+	"context-switches",
+	"CPU-migrations",
+	"minor-faults",
+	"major-faults",
 };
 
 #define MAX_ALIASES 8
 
 static char *hw_cache [][MAX_ALIASES] = {
-	{ "l1-d" ,	"l1d" ,	"l1", "l1-data-cache"			},
-	{ "l1-i" ,	"l1i" ,	"l1-instruction-cache"		},
-	{ "l2"  , },
-	{ "dtlb", },
-	{ "itlb", },
-	{ "bpu" , "btb", "branch-cache", NULL },
+	{ "L1-data"		, "l1-d", "l1d", "l1"				},
+	{ "L1-instruction"	, "l1-i", "l1i"					},
+	{ "L2"			, "l2"						},
+	{ "Data-TLB"		, "dtlb", "d-tlb"				},
+	{ "Instruction-TLB"	, "itlb", "i-tlb"				},
+	{ "Branch"		, "bpu" , "btb", "bpc"				},
 };
 
 static char *hw_cache_op [][MAX_ALIASES] = {
-	{ "read"	, "load" },
-	{ "write"	, "store" },
-	{ "prefetch"	, "speculative-read", "speculative-load" },
+	{ "Load"		, "read"					},
+	{ "Store"		, "write"					},
+	{ "Prefetch"		, "speculative-read", "speculative-load"	},
 };
 
 static char *hw_cache_result [][MAX_ALIASES] = {
-	{ "access", "ops" },
-	{ "miss", },
+	{ "Reference"		, "ops", "access"				},
+	{ "Miss"								},
 };
 
 char *event_name(int counter)
@@ -120,14 +120,14 @@ char *event_name(int counter)
 			return "unknown-ext-hardware-cache-type";
 
 		cache_op     = (config >>  8) & 0xff;
-		if (cache_type > PERF_COUNT_HW_CACHE_OP_MAX)
-			return "unknown-ext-hardware-cache-op-type";
+		if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX)
+			return "unknown-ext-hardware-cache-op";
 
 		cache_result = (config >> 16) & 0xff;
-		if (cache_type > PERF_COUNT_HW_CACHE_RESULT_MAX)
-			return "unknown-ext-hardware-cache-result-type";
+		if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX)
+			return "unknown-ext-hardware-cache-result";
 
-		sprintf(name, "%s:%s:%s",
+		sprintf(name, "%s-Cache-%s-%ses",
 			hw_cache[cache_type][0],
 			hw_cache_op[cache_op][0],
 			hw_cache_result[cache_result][0]);
-- 
GitLab


From 386b05e3a2f3c5b0a9c5575060421cca0911648a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sat, 6 Jun 2009 14:56:33 +0200
Subject: [PATCH 1632/2198] perf_counter tools: Add help for perf list

Also update other areas of the help texts.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../perf_counter/Documentation/perf-list.txt  | 25 ++++++++++++++++++
 .../Documentation/perf-record.txt             | 26 ++++---------------
 .../perf_counter/Documentation/perf-stat.txt  | 26 ++++---------------
 .../perf_counter/Documentation/perf-top.txt   | 26 ++++---------------
 .../perf_counter/Documentation/perf.txt       |  3 ++-
 5 files changed, 42 insertions(+), 64 deletions(-)
 create mode 100644 Documentation/perf_counter/Documentation/perf-list.txt

diff --git a/Documentation/perf_counter/Documentation/perf-list.txt b/Documentation/perf_counter/Documentation/perf-list.txt
new file mode 100644
index 0000000000000..aa55a71184fcf
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-list.txt
@@ -0,0 +1,25 @@
+perf-list(1)
+==============
+
+NAME
+----
+perf-list - List all symbolic event types
+
+SYNOPSIS
+--------
+[verse]
+'perf list
+
+DESCRIPTION
+-----------
+This command displays the symbolic event types which can be selected in the
+various perf commands with the -e option.
+
+OPTIONS
+-------
+None
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1]
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt
index 4d3416fc76441..1dbc1eeb4c012 100644
--- a/Documentation/perf_counter/Documentation/perf-record.txt
+++ b/Documentation/perf_counter/Documentation/perf-record.txt
@@ -26,26 +26,10 @@ OPTIONS
 
 -e::
 --event=::
-                             0:0: cpu-cycles          
-                             0:0: cycles              
-                             0:1: instructions        
-                             0:2: cache-references    
-                             0:3: cache-misses        
-                             0:4: branch-instructions 
-                             0:4: branches            
-                             0:5: branch-misses       
-                             0:6: bus-cycles          
-                             1:0: cpu-clock           
-                             1:1: task-clock          
-                             1:2: page-faults         
-                             1:2: faults              
-                             1:5: minor-faults        
-                             1:6: major-faults        
-                             1:3: context-switches    
-                             1:3: cs                  
-                             1:4: cpu-migrations      
-                             1:4: migrations          
-                           rNNN: raw PMU events (eventsel+umask)
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
 
 -a::
         system-wide collection
@@ -55,4 +39,4 @@ OPTIONS
 
 SEE ALSO
 --------
-linkperf:perf-stat[1]
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index a340e7be83ddd..5d95784cce4de 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -25,26 +25,10 @@ OPTIONS
 
 -e::
 --event=::
-                             0:0: cpu-cycles          
-                             0:0: cycles              
-                             0:1: instructions        
-                             0:2: cache-references    
-                             0:3: cache-misses        
-                             0:4: branch-instructions 
-                             0:4: branches            
-                             0:5: branch-misses       
-                             0:6: bus-cycles          
-                             1:0: cpu-clock           
-                             1:1: task-clock          
-                             1:2: page-faults         
-                             1:2: faults              
-                             1:5: minor-faults        
-                             1:6: major-faults        
-                             1:3: context-switches    
-                             1:3: cs                  
-                             1:4: cpu-migrations      
-                             1:4: migrations          
-                           rNNN: raw PMU events (eventsel+umask)
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
 
 -i::
 --inherit::
@@ -79,4 +63,4 @@ $ perf stat -- make -j
 
 SEE ALSO
 --------
-linkperf:perf-tops[1]
+linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
index 15251e40e4f84..c8eb7cfffcd51 100644
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -23,26 +23,10 @@ OPTIONS
 
 -e::
 --event=::
-                             0:0: cpu-cycles          
-                             0:0: cycles              
-                             0:1: instructions        
-                             0:2: cache-references    
-                             0:3: cache-misses        
-                             0:4: branch-instructions 
-                             0:4: branches            
-                             0:5: branch-misses       
-                             0:6: bus-cycles          
-                             1:0: cpu-clock           
-                             1:1: task-clock          
-                             1:2: page-faults         
-                             1:2: faults              
-                             1:5: minor-faults        
-                             1:6: major-faults        
-                             1:3: context-switches    
-                             1:3: cs                  
-                             1:4: cpu-migrations      
-                             1:4: migrations          
-                           rNNN: raw PMU events (eventsel+umask)
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	 hexadecimal event descriptor.
 
 -a::
         system-wide collection
@@ -52,4 +36,4 @@ OPTIONS
 
 SEE ALSO
 --------
-linkperf:perf-stat[1]
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/Documentation/perf_counter/Documentation/perf.txt b/Documentation/perf_counter/Documentation/perf.txt
index e3d8b3832c6f4..69c832557199d 100644
--- a/Documentation/perf_counter/Documentation/perf.txt
+++ b/Documentation/perf_counter/Documentation/perf.txt
@@ -20,4 +20,5 @@ and software features (software counters, tracepoints) as well.
 SEE ALSO
 --------
 linkperf:perf-stat[1], linkperf:perf-top[1],
-linkperf:perf-record[1], linkperf:perf-report[1]
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]
-- 
GitLab


From 502fc5c72a886ff9d4d7a596e65ecc4dd5e4d458 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 03:20:49 +0100
Subject: [PATCH 1633/2198] perf_counter tools: Uniform help printouts

Also add perf list to command-list.txt.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/builtin-help.c       | 10 +++++-----
 Documentation/perf_counter/command-list.txt     | 14 ++++++++------
 Documentation/perf_counter/perf.c               |  4 ++--
 Documentation/perf_counter/util/parse-options.c |  2 +-
 Documentation/perf_counter/util/usage.c         |  2 +-
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c
index a3894bfb9b6ec..0f32dc3f3c4c7 100644
--- a/Documentation/perf_counter/builtin-help.c
+++ b/Documentation/perf_counter/builtin-help.c
@@ -284,7 +284,7 @@ void list_common_cmds_help(void)
 			longest = strlen(common_cmds[i].name);
 	}
 
-	puts("The most commonly used perf commands are:");
+	puts(" The most commonly used perf commands are:");
 	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) {
 		printf("   %s   ", common_cmds[i].name);
 		mput_char(' ', longest - strlen(common_cmds[i].name));
@@ -426,16 +426,16 @@ int cmd_help(int argc, const char **argv, const char *prefix)
 			builtin_help_usage, 0);
 
 	if (show_all) {
-		printf("usage: %s\n\n", perf_usage_string);
+		printf("\n usage: %s\n\n", perf_usage_string);
 		list_commands("perf commands", &main_cmds, &other_cmds);
-		printf("%s\n", perf_more_info_string);
+		printf(" %s\n\n", perf_more_info_string);
 		return 0;
 	}
 
 	if (!argv[0]) {
-		printf("usage: %s\n\n", perf_usage_string);
+		printf("\n usage: %s\n\n", perf_usage_string);
 		list_common_cmds_help();
-		printf("\n%s\n", perf_more_info_string);
+		printf("\n %s\n\n", perf_more_info_string);
 		return 0;
 	}
 
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index 439029207770f..f0b922c902345 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -1,7 +1,9 @@
+#
 # List of known perf commands.
-# command name				category [deprecated] [common]
-perf-record                             mainporcelain common
-perf-report                             mainporcelain common
-perf-stat                               mainporcelain common
-perf-top                                mainporcelain common
-
+# command name			category [deprecated] [common]
+#
+perf-record			mainporcelain common
+perf-report			mainporcelain common
+perf-stat			mainporcelain common
+perf-top			mainporcelain common
+perf-list			mainporcelain common
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 9ac75657a180f..161824f1241fa 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -384,9 +384,9 @@ int main(int argc, const char **argv)
 			argv[0] += 2;
 	} else {
 		/* The user didn't specify a command; give them help */
-		printf("usage: %s\n\n", perf_usage_string);
+		printf("\n usage: %s\n\n", perf_usage_string);
 		list_common_cmds_help();
-		printf("\n%s\n", perf_more_info_string);
+		printf("\n %s\n\n", perf_more_info_string);
 		exit(1);
 	}
 	cmd = argv[0];
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index 551b6bc34e794..e4d353395a60f 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -385,7 +385,7 @@ int usage_with_options_internal(const char * const *usagestr,
 	if (!usagestr)
 		return PARSE_OPT_HELP;
 
-	fprintf(stderr, "usage: %s\n", *usagestr++);
+	fprintf(stderr, "\n usage: %s\n", *usagestr++);
 	while (*usagestr && **usagestr)
 		fprintf(stderr, "   or: %s\n", *usagestr++);
 	while (*usagestr) {
diff --git a/Documentation/perf_counter/util/usage.c b/Documentation/perf_counter/util/usage.c
index 7a10421fe6b42..2cad286e43716 100644
--- a/Documentation/perf_counter/util/usage.c
+++ b/Documentation/perf_counter/util/usage.c
@@ -14,7 +14,7 @@ static void report(const char *prefix, const char *err, va_list params)
 
 static NORETURN void usage_builtin(const char *err)
 {
-	fprintf(stderr, "usage: %s\n", err);
+	fprintf(stderr, "\n usage: %s\n", err);
 	exit(129);
 }
 
-- 
GitLab


From 6e6b754ffdb6415723686c733f13275397e44422 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 15 Apr 2008 22:39:31 +0200
Subject: [PATCH 1634/2198] perf_counter tools: Tidy up manpage details

Also fix a misalignment in usage string printing.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/perf_counter/Documentation/perf-help.txt | 2 +-
 Documentation/perf_counter/Documentation/perf-list.txt | 4 ++--
 Documentation/perf_counter/Documentation/perf-stat.txt | 2 +-
 Documentation/perf_counter/Documentation/perf-top.txt  | 2 +-
 Documentation/perf_counter/util/parse-options.c        | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt
index f85fed5a7edba..514391818d1f1 100644
--- a/Documentation/perf_counter/Documentation/perf-help.txt
+++ b/Documentation/perf_counter/Documentation/perf-help.txt
@@ -1,5 +1,5 @@
 perf-help(1)
-===========
+============
 
 NAME
 ----
diff --git a/Documentation/perf_counter/Documentation/perf-list.txt b/Documentation/perf_counter/Documentation/perf-list.txt
index aa55a71184fcf..8290b94226687 100644
--- a/Documentation/perf_counter/Documentation/perf-list.txt
+++ b/Documentation/perf_counter/Documentation/perf-list.txt
@@ -1,5 +1,5 @@
 perf-list(1)
-==============
+============
 
 NAME
 ----
@@ -8,7 +8,7 @@ perf-list - List all symbolic event types
 SYNOPSIS
 --------
 [verse]
-'perf list
+'perf list'
 
 DESCRIPTION
 -----------
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt
index 5d95784cce4de..c368a72721d78 100644
--- a/Documentation/perf_counter/Documentation/perf-stat.txt
+++ b/Documentation/perf_counter/Documentation/perf-stat.txt
@@ -1,5 +1,5 @@
 perf-stat(1)
-==========
+============
 
 NAME
 ----
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt
index c8eb7cfffcd51..539d012897250 100644
--- a/Documentation/perf_counter/Documentation/perf-top.txt
+++ b/Documentation/perf_counter/Documentation/perf-top.txt
@@ -1,5 +1,5 @@
 perf-top(1)
-==========
+===========
 
 NAME
 ----
diff --git a/Documentation/perf_counter/util/parse-options.c b/Documentation/perf_counter/util/parse-options.c
index e4d353395a60f..b3affb1658d2a 100644
--- a/Documentation/perf_counter/util/parse-options.c
+++ b/Documentation/perf_counter/util/parse-options.c
@@ -387,7 +387,7 @@ int usage_with_options_internal(const char * const *usagestr,
 
 	fprintf(stderr, "\n usage: %s\n", *usagestr++);
 	while (*usagestr && **usagestr)
-		fprintf(stderr, "   or: %s\n", *usagestr++);
+		fprintf(stderr, "    or: %s\n", *usagestr++);
 	while (*usagestr) {
 		fprintf(stderr, "%s%s\n",
 				**usagestr ? "    " : "",
-- 
GitLab


From 8035e4288078cb806e7dd6bafe4d3e54d44cab3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 15:19:13 +0200
Subject: [PATCH 1635/2198] perf_counter tools: Prepare for 'perf annotate'

Prepare for the 'perf annotate' implementation by splitting off
builtin-annotate.c from builtin-report.c.

( We keep this commit separate to ease the later librarization
  of the facilities that perf-report and perf-annotate shares. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../Documentation/perf-annotate.txt           |   26 +
 Documentation/perf_counter/Makefile           |    3 +-
 Documentation/perf_counter/builtin-annotate.c | 1291 +++++++++++++++++
 Documentation/perf_counter/builtin.h          |    1 +
 Documentation/perf_counter/command-list.txt   |    3 +-
 Documentation/perf_counter/perf.c             |    3 +
 6 files changed, 1325 insertions(+), 2 deletions(-)
 create mode 100644 Documentation/perf_counter/Documentation/perf-annotate.txt
 create mode 100644 Documentation/perf_counter/builtin-annotate.c

diff --git a/Documentation/perf_counter/Documentation/perf-annotate.txt b/Documentation/perf_counter/Documentation/perf-annotate.txt
new file mode 100644
index 0000000000000..a9d6d5ee27012
--- /dev/null
+++ b/Documentation/perf_counter/Documentation/perf-annotate.txt
@@ -0,0 +1,26 @@
+perf-annotate(1)
+==============
+
+NAME
+----
+perf-annotate - Read perf.data (created by perf record) and annotate functions
+
+SYNOPSIS
+--------
+[verse]
+'perf annotate' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf record.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile
index 32c0bb21a3284..0cbd5d6874ec3 100644
--- a/Documentation/perf_counter/Makefile
+++ b/Documentation/perf_counter/Makefile
@@ -323,12 +323,13 @@ LIB_OBJS += util/symbol.o
 LIB_OBJS += util/color.o
 LIB_OBJS += util/pager.o
 
+BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
-BUILTIN_OBJS += builtin-list.o
 
 PERFLIBS = $(LIB_FILE)
 EXTLIBS =
diff --git a/Documentation/perf_counter/builtin-annotate.c b/Documentation/perf_counter/builtin-annotate.c
new file mode 100644
index 0000000000000..d656484ec9838
--- /dev/null
+++ b/Documentation/perf_counter/builtin-annotate.c
@@ -0,0 +1,1291 @@
+/*
+ * builtin-annotate.c
+ *
+ * Builtin annotate command: Analyze the perf.data input file,
+ * look up and read DSOs and symbol information and display
+ * a histogram of results, along various sorting keys.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include "util/list.h"
+#include "util/cache.h"
+#include "util/rbtree.h"
+#include "util/symbol.h"
+#include "util/string.h"
+
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+
+#define SHOW_KERNEL	1
+#define SHOW_USER	2
+#define SHOW_HV		4
+
+static char		const *input_name = "perf.data";
+static char		*vmlinux = NULL;
+
+static char		default_sort_order[] = "comm,dso";
+static char		*sort_order = default_sort_order;
+
+static int		input;
+static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
+
+static int		dump_trace = 0;
+#define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
+
+static int		verbose;
+static int		full_paths;
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+struct ip_event {
+	struct perf_event_header header;
+	__u64 ip;
+	__u32 pid, tid;
+};
+
+struct mmap_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	__u64 start;
+	__u64 len;
+	__u64 pgoff;
+	char filename[PATH_MAX];
+};
+
+struct comm_event {
+	struct perf_event_header header;
+	__u32 pid, tid;
+	char comm[16];
+};
+
+struct fork_event {
+	struct perf_event_header header;
+	__u32 pid, ppid;
+};
+
+struct period_event {
+	struct perf_event_header header;
+	__u64 time;
+	__u64 id;
+	__u64 sample_period;
+};
+
+typedef union event_union {
+	struct perf_event_header	header;
+	struct ip_event			ip;
+	struct mmap_event		mmap;
+	struct comm_event		comm;
+	struct fork_event		fork;
+	struct period_event		period;
+} event_t;
+
+static LIST_HEAD(dsos);
+static struct dso *kernel_dso;
+static struct dso *vdso;
+
+static void dsos__add(struct dso *dso)
+{
+	list_add_tail(&dso->node, &dsos);
+}
+
+static struct dso *dsos__find(const char *name)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		if (strcmp(pos->name, name) == 0)
+			return pos;
+	return NULL;
+}
+
+static struct dso *dsos__findnew(const char *name)
+{
+	struct dso *dso = dsos__find(name);
+	int nr;
+
+	if (dso)
+		return dso;
+
+	dso = dso__new(name, 0);
+	if (!dso)
+		goto out_delete_dso;
+
+	nr = dso__load(dso, NULL, verbose);
+	if (nr < 0) {
+		if (verbose)
+			fprintf(stderr, "Failed to open: %s\n", name);
+		goto out_delete_dso;
+	}
+	if (!nr && verbose) {
+		fprintf(stderr,
+		"No symbols found in: %s, maybe install a debug package?\n",
+				name);
+	}
+
+	dsos__add(dso);
+
+	return dso;
+
+out_delete_dso:
+	dso__delete(dso);
+	return NULL;
+}
+
+static void dsos__fprintf(FILE *fp)
+{
+	struct dso *pos;
+
+	list_for_each_entry(pos, &dsos, node)
+		dso__fprintf(pos, fp);
+}
+
+static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+{
+	return dso__find_symbol(kernel_dso, ip);
+}
+
+static int load_kernel(void)
+{
+	int err;
+
+	kernel_dso = dso__new("[kernel]", 0);
+	if (!kernel_dso)
+		return -1;
+
+	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose);
+	if (err) {
+		dso__delete(kernel_dso);
+		kernel_dso = NULL;
+	} else
+		dsos__add(kernel_dso);
+
+	vdso = dso__new("[vdso]", 0);
+	if (!vdso)
+		return -1;
+
+	vdso->find_symbol = vdso__find_symbol;
+
+	dsos__add(vdso);
+
+	return err;
+}
+
+static char __cwd[PATH_MAX];
+static char *cwd = __cwd;
+static int cwdlen;
+
+static int strcommon(const char *pathname)
+{
+	int n = 0;
+
+	while (pathname[n] == cwd[n] && n < cwdlen)
+		++n;
+
+	return n;
+}
+
+struct map {
+	struct list_head node;
+	uint64_t	 start;
+	uint64_t	 end;
+	uint64_t	 pgoff;
+	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	struct dso	 *dso;
+};
+
+static uint64_t map__map_ip(struct map *map, uint64_t ip)
+{
+	return ip - map->start + map->pgoff;
+}
+
+static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+{
+	return ip;
+}
+
+static struct map *map__new(struct mmap_event *event)
+{
+	struct map *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		const char *filename = event->filename;
+		char newfilename[PATH_MAX];
+
+		if (cwd) {
+			int n = strcommon(filename);
+
+			if (n == cwdlen) {
+				snprintf(newfilename, sizeof(newfilename),
+					 ".%s", filename + n);
+				filename = newfilename;
+			}
+		}
+
+		self->start = event->start;
+		self->end   = event->start + event->len;
+		self->pgoff = event->pgoff;
+
+		self->dso = dsos__findnew(filename);
+		if (self->dso == NULL)
+			goto out_delete;
+
+		if (self->dso == vdso)
+			self->map_ip = vdso__map_ip;
+		else
+			self->map_ip = map__map_ip;
+	}
+	return self;
+out_delete:
+	free(self);
+	return NULL;
+}
+
+static struct map *map__clone(struct map *self)
+{
+	struct map *map = malloc(sizeof(*self));
+
+	if (!map)
+		return NULL;
+
+	memcpy(map, self, sizeof(*self));
+
+	return map;
+}
+
+static int map__overlap(struct map *l, struct map *r)
+{
+	if (l->start > r->start) {
+		struct map *t = l;
+		l = r;
+		r = t;
+	}
+
+	if (l->end > r->start)
+		return 1;
+
+	return 0;
+}
+
+static size_t map__fprintf(struct map *self, FILE *fp)
+{
+	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+		       self->start, self->end, self->pgoff, self->dso->name);
+}
+
+
+struct thread {
+	struct rb_node	 rb_node;
+	struct list_head maps;
+	pid_t		 pid;
+	char		 *comm;
+};
+
+static struct thread *thread__new(pid_t pid)
+{
+	struct thread *self = malloc(sizeof(*self));
+
+	if (self != NULL) {
+		self->pid = pid;
+		self->comm = malloc(32);
+		if (self->comm)
+			snprintf(self->comm, 32, ":%d", self->pid);
+		INIT_LIST_HEAD(&self->maps);
+	}
+
+	return self;
+}
+
+static int thread__set_comm(struct thread *self, const char *comm)
+{
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(comm);
+	return self->comm ? 0 : -ENOMEM;
+}
+
+static size_t thread__fprintf(struct thread *self, FILE *fp)
+{
+	struct map *pos;
+	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm);
+
+	list_for_each_entry(pos, &self->maps, node)
+		ret += map__fprintf(pos, fp);
+
+	return ret;
+}
+
+
+static struct rb_root threads;
+static struct thread *last_match;
+
+static struct thread *threads__findnew(pid_t pid)
+{
+	struct rb_node **p = &threads.rb_node;
+	struct rb_node *parent = NULL;
+	struct thread *th;
+
+	/*
+	 * Font-end cache - PID lookups come in blocks,
+	 * so most of the time we dont have to look up
+	 * the full rbtree:
+	 */
+	if (last_match && last_match->pid == pid)
+		return last_match;
+
+	while (*p != NULL) {
+		parent = *p;
+		th = rb_entry(parent, struct thread, rb_node);
+
+		if (th->pid == pid) {
+			last_match = th;
+			return th;
+		}
+
+		if (pid < th->pid)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	th = thread__new(pid);
+	if (th != NULL) {
+		rb_link_node(&th->rb_node, parent, p);
+		rb_insert_color(&th->rb_node, &threads);
+		last_match = th;
+	}
+
+	return th;
+}
+
+static void thread__insert_map(struct thread *self, struct map *map)
+{
+	struct map *pos, *tmp;
+
+	list_for_each_entry_safe(pos, tmp, &self->maps, node) {
+		if (map__overlap(pos, map)) {
+			list_del_init(&pos->node);
+			/* XXX leaks dsos */
+			free(pos);
+		}
+	}
+
+	list_add_tail(&map->node, &self->maps);
+}
+
+static int thread__fork(struct thread *self, struct thread *parent)
+{
+	struct map *map;
+
+	if (self->comm)
+		free(self->comm);
+	self->comm = strdup(parent->comm);
+	if (!self->comm)
+		return -ENOMEM;
+
+	list_for_each_entry(map, &parent->maps, node) {
+		struct map *new = map__clone(map);
+		if (!new)
+			return -ENOMEM;
+		thread__insert_map(self, new);
+	}
+
+	return 0;
+}
+
+static struct map *thread__find_map(struct thread *self, uint64_t ip)
+{
+	struct map *pos;
+
+	if (self == NULL)
+		return NULL;
+
+	list_for_each_entry(pos, &self->maps, node)
+		if (ip >= pos->start && ip <= pos->end)
+			return pos;
+
+	return NULL;
+}
+
+static size_t threads__fprintf(FILE *fp)
+{
+	size_t ret = 0;
+	struct rb_node *nd;
+
+	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) {
+		struct thread *pos = rb_entry(nd, struct thread, rb_node);
+
+		ret += thread__fprintf(pos, fp);
+	}
+
+	return ret;
+}
+
+/*
+ * histogram, sorted on item, collects counts
+ */
+
+static struct rb_root hist;
+
+struct hist_entry {
+	struct rb_node	 rb_node;
+
+	struct thread	 *thread;
+	struct map	 *map;
+	struct dso	 *dso;
+	struct symbol	 *sym;
+	uint64_t	 ip;
+	char		 level;
+
+	uint32_t	 count;
+};
+
+/*
+ * configurable sorting bits
+ */
+
+struct sort_entry {
+	struct list_head list;
+
+	char *header;
+
+	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
+	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *);
+};
+
+/* --sort pid */
+
+static int64_t
+sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static size_t
+sort__thread_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+}
+
+static struct sort_entry sort_thread = {
+	.header = "         Command:  Pid",
+	.cmp	= sort__thread_cmp,
+	.print	= sort__thread_print,
+};
+
+/* --sort comm */
+
+static int64_t
+sort__comm_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	return right->thread->pid - left->thread->pid;
+}
+
+static int64_t
+sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	char *comm_l = left->thread->comm;
+	char *comm_r = right->thread->comm;
+
+	if (!comm_l || !comm_r) {
+		if (!comm_l && !comm_r)
+			return 0;
+		else if (!comm_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(comm_l, comm_r);
+}
+
+static size_t
+sort__comm_print(FILE *fp, struct hist_entry *self)
+{
+	return fprintf(fp, "%16s", self->thread->comm);
+}
+
+static struct sort_entry sort_comm = {
+	.header		= "         Command",
+	.cmp		= sort__comm_cmp,
+	.collapse	= sort__comm_collapse,
+	.print		= sort__comm_print,
+};
+
+/* --sort dso */
+
+static int64_t
+sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct dso *dso_l = left->dso;
+	struct dso *dso_r = right->dso;
+
+	if (!dso_l || !dso_r) {
+		if (!dso_l && !dso_r)
+			return 0;
+		else if (!dso_l)
+			return -1;
+		else
+			return 1;
+	}
+
+	return strcmp(dso_l->name, dso_r->name);
+}
+
+static size_t
+sort__dso_print(FILE *fp, struct hist_entry *self)
+{
+	if (self->dso)
+		return fprintf(fp, "%-25s", self->dso->name);
+
+	return fprintf(fp, "%016llx         ", (__u64)self->ip);
+}
+
+static struct sort_entry sort_dso = {
+	.header = "Shared Object            ",
+	.cmp	= sort__dso_cmp,
+	.print	= sort__dso_print,
+};
+
+/* --sort symbol */
+
+static int64_t
+sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	uint64_t ip_l, ip_r;
+
+	if (left->sym == right->sym)
+		return 0;
+
+	ip_l = left->sym ? left->sym->start : left->ip;
+	ip_r = right->sym ? right->sym->start : right->ip;
+
+	return (int64_t)(ip_r - ip_l);
+}
+
+static size_t
+sort__sym_print(FILE *fp, struct hist_entry *self)
+{
+	size_t ret = 0;
+
+	if (verbose)
+		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip);
+
+	if (self->sym) {
+		ret += fprintf(fp, "[%c] %s",
+			self->dso == kernel_dso ? 'k' : '.', self->sym->name);
+	} else {
+		ret += fprintf(fp, "%#016llx", (__u64)self->ip);
+	}
+
+	return ret;
+}
+
+static struct sort_entry sort_sym = {
+	.header = "Symbol",
+	.cmp	= sort__sym_cmp,
+	.print	= sort__sym_print,
+};
+
+static int sort__need_collapse = 0;
+
+struct sort_dimension {
+	char			*name;
+	struct sort_entry	*entry;
+	int			taken;
+};
+
+static struct sort_dimension sort_dimensions[] = {
+	{ .name = "pid",	.entry = &sort_thread,	},
+	{ .name = "comm",	.entry = &sort_comm,	},
+	{ .name = "dso",	.entry = &sort_dso,	},
+	{ .name = "symbol",	.entry = &sort_sym,	},
+};
+
+static LIST_HEAD(hist_entry__sort_list);
+
+static int sort_dimension__add(char *tok)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) {
+		struct sort_dimension *sd = &sort_dimensions[i];
+
+		if (sd->taken)
+			continue;
+
+		if (strncasecmp(tok, sd->name, strlen(tok)))
+			continue;
+
+		if (sd->entry->collapse)
+			sort__need_collapse = 1;
+
+		list_add_tail(&sd->entry->list, &hist_entry__sort_list);
+		sd->taken = 1;
+
+		return 0;
+	}
+
+	return -ESRCH;
+}
+
+static int64_t
+hist_entry__cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		cmp = se->cmp(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static int64_t
+hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
+{
+	struct sort_entry *se;
+	int64_t cmp = 0;
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int64_t (*f)(struct hist_entry *, struct hist_entry *);
+
+		f = se->collapse ?: se->cmp;
+
+		cmp = f(left, right);
+		if (cmp)
+			break;
+	}
+
+	return cmp;
+}
+
+static size_t
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+{
+	struct sort_entry *se;
+	size_t ret;
+
+	if (total_samples) {
+		double percent = self->count * 100.0 / total_samples;
+		char *color = PERF_COLOR_NORMAL;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		if (percent < 0.5)
+			color = PERF_COLOR_GREEN;
+
+		ret = color_fprintf(fp, color, "   %6.2f%%",
+				(self->count * 100.0) / total_samples);
+	} else
+		ret = fprintf(fp, "%12d ", self->count);
+
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		fprintf(fp, "  ");
+		ret += se->print(fp, self);
+	}
+
+	ret += fprintf(fp, "\n");
+
+	return ret;
+}
+
+/*
+ * collect histogram counts
+ */
+
+static int
+hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
+		struct symbol *sym, uint64_t ip, char level)
+{
+	struct rb_node **p = &hist.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *he;
+	struct hist_entry entry = {
+		.thread	= thread,
+		.map	= map,
+		.dso	= dso,
+		.sym	= sym,
+		.ip	= ip,
+		.level	= level,
+		.count	= 1,
+	};
+	int cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		he = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__cmp(&entry, he);
+
+		if (!cmp) {
+			he->count++;
+			return 0;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	he = malloc(sizeof(*he));
+	if (!he)
+		return -ENOMEM;
+	*he = entry;
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &hist);
+
+	return 0;
+}
+
+static void hist_entry__free(struct hist_entry *he)
+{
+	free(he);
+}
+
+/*
+ * collapse the histogram
+ */
+
+static struct rb_root collapse_hists;
+
+static void collapse__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &collapse_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+	int64_t cmp;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		cmp = hist_entry__collapse(iter, he);
+
+		if (!cmp) {
+			iter->count += he->count;
+			hist_entry__free(he);
+			return;
+		}
+
+		if (cmp < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &collapse_hists);
+}
+
+static void collapse__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+
+	if (!sort__need_collapse)
+		return;
+
+	next = rb_first(&hist);
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, &hist);
+		collapse__insert_entry(n);
+	}
+}
+
+/*
+ * reverse the map, sort on count.
+ */
+
+static struct rb_root output_hists;
+
+static void output__insert_entry(struct hist_entry *he)
+{
+	struct rb_node **p = &output_hists.rb_node;
+	struct rb_node *parent = NULL;
+	struct hist_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct hist_entry, rb_node);
+
+		if (he->count > iter->count)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&he->rb_node, parent, p);
+	rb_insert_color(&he->rb_node, &output_hists);
+}
+
+static void output__resort(void)
+{
+	struct rb_node *next;
+	struct hist_entry *n;
+	struct rb_root *tree = &hist;
+
+	if (sort__need_collapse)
+		tree = &collapse_hists;
+
+	next = rb_first(tree);
+
+	while (next) {
+		n = rb_entry(next, struct hist_entry, rb_node);
+		next = rb_next(&n->rb_node);
+
+		rb_erase(&n->rb_node, tree);
+		output__insert_entry(n);
+	}
+}
+
+static size_t output__fprintf(FILE *fp, uint64_t total_samples)
+{
+	struct hist_entry *pos;
+	struct sort_entry *se;
+	struct rb_node *nd;
+	size_t ret = 0;
+
+	fprintf(fp, "\n");
+	fprintf(fp, "#\n");
+	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
+	fprintf(fp, "#\n");
+
+	fprintf(fp, "# Overhead");
+	list_for_each_entry(se, &hist_entry__sort_list, list)
+		fprintf(fp, "  %s", se->header);
+	fprintf(fp, "\n");
+
+	fprintf(fp, "# ........");
+	list_for_each_entry(se, &hist_entry__sort_list, list) {
+		int i;
+
+		fprintf(fp, "  ");
+		for (i = 0; i < strlen(se->header); i++)
+			fprintf(fp, ".");
+	}
+	fprintf(fp, "\n");
+
+	fprintf(fp, "#\n");
+
+	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
+		pos = rb_entry(nd, struct hist_entry, rb_node);
+		ret += hist_entry__fprintf(fp, pos, total_samples);
+	}
+
+	if (!strcmp(sort_order, default_sort_order)) {
+		fprintf(fp, "#\n");
+		fprintf(fp, "# (For more details, try: perf annotate --sort comm,dso,symbol)\n");
+		fprintf(fp, "#\n");
+	}
+	fprintf(fp, "\n");
+
+	return ret;
+}
+
+static void register_idle_thread(void)
+{
+	struct thread *thread = threads__findnew(0);
+
+	if (thread == NULL ||
+			thread__set_comm(thread, "[idle]")) {
+		fprintf(stderr, "problem inserting idle task.\n");
+		exit(-1);
+	}
+}
+
+static unsigned long total = 0,
+		     total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	uint64_t ip = event->ip.ip;
+	struct map *map = NULL;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip);
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total++;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid, event->fork.ppid);
+
+	if (!thread || !parent || thread__fork(thread, parent)) {
+		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n");
+		return -1;
+	}
+	total_fork++;
+
+	return 0;
+}
+
+static int
+process_period_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->period.time,
+		event->period.id,
+		event->period.sample_period);
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW)
+		return process_overflow_event(event, offset, head);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP:
+		return process_mmap_event(event, offset, head);
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_FORK:
+		return process_fork_event(event, offset, head);
+
+	case PERF_EVENT_PERIOD:
+		return process_period_event(event, offset, head);
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_annotate(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	register_idle_thread();
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+	if (!full_paths) {
+		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
+			perror("failed to get the current directory");
+			return EXIT_FAILURE;
+		}
+		cwdlen = strlen(cwd);
+	} else {
+		cwd = NULL;
+		cwdlen = 0;
+	}
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int ret;
+
+		ret = munmap(buf, page_size * mmap_window);
+		assert(ret == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	dprintf("%p [%p]: event: %d\n",
+			(void *)(offset + head),
+			(void *)(long)event->header.size,
+			event->header.type);
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		dprintf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		total_unknown++;
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	dprintf("      IP events: %10ld\n", total);
+	dprintf("    mmap events: %10ld\n", total_mmap);
+	dprintf("    comm events: %10ld\n", total_comm);
+	dprintf("    fork events: %10ld\n", total_fork);
+	dprintf(" unknown events: %10ld\n", total_unknown);
+
+	if (dump_trace)
+		return 0;
+
+	if (verbose >= 3)
+		threads__fprintf(stdout);
+
+	if (verbose >= 2)
+		dsos__fprintf(stdout);
+
+	collapse__resort();
+	output__resort();
+	output__fprintf(stdout, total);
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf annotate [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
+	OPT_BOOLEAN('P', "full-paths", &full_paths,
+		    "Don't shorten the pathnames taking into account the cwd"),
+	OPT_END()
+};
+
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(annotate_usage, options);
+		}
+	}
+
+	free(str);
+}
+
+int cmd_annotate(int argc, const char **argv, const char *prefix)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+
+	setup_sorting();
+
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(annotate_usage, options);
+
+	setup_pager();
+
+	return __cmd_annotate();
+}
diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h
index e7de47da85815..51d168230ee74 100644
--- a/Documentation/perf_counter/builtin.h
+++ b/Documentation/perf_counter/builtin.h
@@ -14,6 +14,7 @@ extern void prune_packed_objects(int);
 extern int read_line_with_nul(char *buf, int size, FILE *file);
 extern int check_pager_config(const char *cmd);
 
+extern int cmd_annotate(int argc, const char **argv, const char *prefix);
 extern int cmd_help(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_report(int argc, const char **argv, const char *prefix);
diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt
index f0b922c902345..eebce30afbc09 100644
--- a/Documentation/perf_counter/command-list.txt
+++ b/Documentation/perf_counter/command-list.txt
@@ -2,8 +2,9 @@
 # List of known perf commands.
 # command name			category [deprecated] [common]
 #
+perf-annotate			mainporcelain common
+perf-list			mainporcelain common
 perf-record			mainporcelain common
 perf-report			mainporcelain common
 perf-stat			mainporcelain common
 perf-top			mainporcelain common
-perf-list			mainporcelain common
diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c
index 161824f1241fa..4eb725933703b 100644
--- a/Documentation/perf_counter/perf.c
+++ b/Documentation/perf_counter/perf.c
@@ -263,6 +263,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
 		{ "top", cmd_top, 0 },
+		{ "annotate", cmd_annotate, 0 },
 		{ "version", cmd_version, 0 },
 	};
 	int i;
@@ -402,9 +403,11 @@ int main(int argc, const char **argv)
 	while (1) {
 		static int done_help = 0;
 		static int was_alias = 0;
+
 		was_alias = run_argv(&argc, &argv);
 		if (errno != ENOENT)
 			break;
+
 		if (was_alias) {
 			fprintf(stderr, "Expansion of alias '%s' failed; "
 				"'%s' is not a perf-command\n",
-- 
GitLab


From 0b73da3f40128eab6ca2a07508f424029a1edaeb Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 15:48:52 +0200
Subject: [PATCH 1636/2198] perf_counter tools: Add 'perf annotate' feature

Add new perf sub-command to display annotated source code:

 $ perf annotate decode_tree_entry

------------------------------------------------
 Percent |	Source code & Disassembly of /home/mingo/git/git
------------------------------------------------
         :
         :	/home/mingo/git/git:     file format elf64-x86-64
         :
         :
         :	Disassembly of section .text:
         :
         :	00000000004a0da0 <decode_tree_entry>:
         :		*modep = mode;
         :		return str;
         :	}
         :
         :	static void decode_tree_entry(struct tree_desc *desc, const char *buf, unsigned long size)
         :	{
    3.82 :	  4a0da0:	41 54                	push   %r12
         :		const char *path;
         :		unsigned int mode, len;
         :
         :		if (size < 24 || buf[size - 21])
    0.17 :	  4a0da2:	48 83 fa 17          	cmp    $0x17,%rdx
         :		*modep = mode;
         :		return str;
         :	}
         :
         :	static void decode_tree_entry(struct tree_desc *desc, const char *buf, unsigned long size)
         :	{
    0.00 :	  4a0da6:	49 89 fc             	mov    %rdi,%r12
    0.00 :	  4a0da9:	55                   	push   %rbp
    3.37 :	  4a0daa:	53                   	push   %rbx
         :		const char *path;
         :		unsigned int mode, len;
         :
         :		if (size < 24 || buf[size - 21])
    0.08 :	  4a0dab:	76 73                	jbe    4a0e20 <decode_tree_entry+0x80>
    0.00 :	  4a0dad:	80 7c 16 eb 00       	cmpb   $0x0,-0x15(%rsi,%rdx,1)
    3.48 :	  4a0db2:	75 6c                	jne    4a0e20 <decode_tree_entry+0x80>
         :	static const char *get_mode(const char *str, unsigned int *modep)
         :	{
         :		unsigned char c;
         :		unsigned int mode = 0;
         :
         :		if (*str == ' ')
    1.94 :	  4a0db4:	0f b6 06             	movzbl (%rsi),%eax
    0.39 :	  4a0db7:	3c 20                	cmp    $0x20,%al
    0.00 :	  4a0db9:	74 65                	je     4a0e20 <decode_tree_entry+0x80>
         :			return NULL;
         :
         :		while ((c = *str++) != ' ') {
    0.06 :	  4a0dbb:	89 c2                	mov    %eax,%edx
         :			if (c < '0' || c > '7')
    1.99 :	  4a0dbd:	31 ed                	xor    %ebp,%ebp
         :		unsigned int mode = 0;
         :
         :		if (*str == ' ')
         :			return NULL;
         :
         :		while ((c = *str++) != ' ') {
    1.74 :	  4a0dbf:	48 8d 5e 01          	lea    0x1(%rsi),%rbx
         :			if (c < '0' || c > '7')
    0.00 :	  4a0dc3:	8d 42 d0             	lea    -0x30(%rdx),%eax
    0.17 :	  4a0dc6:	3c 07                	cmp    $0x7,%al
    0.00 :	  4a0dc8:	76 0d                	jbe    4a0dd7 <decode_tree_entry+0x37>
    0.00 :	  4a0dca:	eb 54                	jmp    4a0e20 <decode_tree_entry+0x80>
    0.00 :	  4a0dcc:	0f 1f 40 00          	nopl   0x0(%rax)
   16.57 :	  4a0dd0:	8d 42 d0             	lea    -0x30(%rdx),%eax
    0.14 :	  4a0dd3:	3c 07                	cmp    $0x7,%al
    0.00 :	  4a0dd5:	77 49                	ja     4a0e20 <decode_tree_entry+0x80>
         :				return NULL;
         :			mode = (mode << 3) + (c - '0');
    3.12 :	  4a0dd7:	0f b6 c2             	movzbl %dl,%eax
         :		unsigned int mode = 0;
         :
         :		if (*str == ' ')
         :			return NULL;
         :
         :		while ((c = *str++) != ' ') {
    0.00 :	  4a0dda:	0f b6 13             	movzbl (%rbx),%edx
   16.74 :	  4a0ddd:	48 83 c3 01          	add    $0x1,%rbx
         :			if (c < '0' || c > '7')
         :				return NULL;
         :			mode = (mode << 3) + (c - '0');

The first column is the percentage of samples that arrived on that
particular line - relative to the total cost of the function.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 .../Documentation/perf-annotate.txt           |   9 +-
 Documentation/perf_counter/builtin-annotate.c | 304 +++++++++++-------
 Documentation/perf_counter/util/symbol.c      |  50 ++-
 Documentation/perf_counter/util/symbol.h      |   5 +
 4 files changed, 229 insertions(+), 139 deletions(-)

diff --git a/Documentation/perf_counter/Documentation/perf-annotate.txt b/Documentation/perf_counter/Documentation/perf-annotate.txt
index a9d6d5ee27012..c9dcade068312 100644
--- a/Documentation/perf_counter/Documentation/perf-annotate.txt
+++ b/Documentation/perf_counter/Documentation/perf-annotate.txt
@@ -3,7 +3,7 @@ perf-annotate(1)
 
 NAME
 ----
-perf-annotate - Read perf.data (created by perf record) and annotate functions
+perf-annotate - Read perf.data (created by perf record) and display annotated code
 
 SYNOPSIS
 --------
@@ -12,8 +12,11 @@ SYNOPSIS
 
 DESCRIPTION
 -----------
-This command displays the performance counter profile information recorded
-via perf record.
+This command reads the input file and displays an annotated version of the
+code. If the object file has debug symbols then the source code will be
+displayed alongside assembly code.
+
+If there is no debug info in the object, then annotated assembly is displayed.
 
 OPTIONS
 -------
diff --git a/Documentation/perf_counter/builtin-annotate.c b/Documentation/perf_counter/builtin-annotate.c
index d656484ec9838..116a3978b44c7 100644
--- a/Documentation/perf_counter/builtin-annotate.c
+++ b/Documentation/perf_counter/builtin-annotate.c
@@ -28,7 +28,7 @@
 static char		const *input_name = "perf.data";
 static char		*vmlinux = NULL;
 
-static char		default_sort_order[] = "comm,dso";
+static char		default_sort_order[] = "comm,symbol";
 static char		*sort_order = default_sort_order;
 
 static int		input;
@@ -38,7 +38,6 @@ static int		dump_trace = 0;
 #define dprintf(x...)	do { if (dump_trace) printf(x); } while (0)
 
 static int		verbose;
-static int		full_paths;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -89,6 +88,7 @@ static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 static struct dso *vdso;
 
+
 static void dsos__add(struct dso *dso)
 {
 	list_add_tail(&dso->node, &dsos);
@@ -176,20 +176,6 @@ static int load_kernel(void)
 	return err;
 }
 
-static char __cwd[PATH_MAX];
-static char *cwd = __cwd;
-static int cwdlen;
-
-static int strcommon(const char *pathname)
-{
-	int n = 0;
-
-	while (pathname[n] == cwd[n] && n < cwdlen)
-		++n;
-
-	return n;
-}
-
 struct map {
 	struct list_head node;
 	uint64_t	 start;
@@ -215,17 +201,6 @@ static struct map *map__new(struct mmap_event *event)
 
 	if (self != NULL) {
 		const char *filename = event->filename;
-		char newfilename[PATH_MAX];
-
-		if (cwd) {
-			int n = strcommon(filename);
-
-			if (n == cwdlen) {
-				snprintf(newfilename, sizeof(newfilename),
-					 ".%s", filename + n);
-				filename = newfilename;
-			}
-		}
 
 		self->start = event->start;
 		self->end   = event->start + event->len;
@@ -669,44 +644,36 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 	return cmp;
 }
 
-static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+/*
+ * collect histogram counts
+ */
+static void hist_hit(struct hist_entry *he, uint64_t ip)
 {
-	struct sort_entry *se;
-	size_t ret;
+	unsigned int sym_size, offset;
+	struct symbol *sym = he->sym;
 
-	if (total_samples) {
-		double percent = self->count * 100.0 / total_samples;
-		char *color = PERF_COLOR_NORMAL;
+	he->count++;
 
-		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
-		 */
-		if (percent >= 5.0)
-			color = PERF_COLOR_RED;
-		if (percent < 0.5)
-			color = PERF_COLOR_GREEN;
+	if (!sym || !sym->hist)
+		return;
 
-		ret = color_fprintf(fp, color, "   %6.2f%%",
-				(self->count * 100.0) / total_samples);
-	} else
-		ret = fprintf(fp, "%12d ", self->count);
+	sym_size = sym->end - sym->start;
+	offset = ip - sym->start;
 
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		fprintf(fp, "  ");
-		ret += se->print(fp, self);
-	}
+	if (offset >= sym_size)
+		return;
 
-	ret += fprintf(fp, "\n");
+	sym->hist_sum++;
+	sym->hist[offset]++;
 
-	return ret;
+	if (verbose >= 3)
+		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
+			(void *)he->sym->start,
+			he->sym->name,
+			(void *)ip, ip - he->sym->start,
+			sym->hist[offset]);
 }
 
-/*
- * collect histogram counts
- */
-
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		struct symbol *sym, uint64_t ip, char level)
@@ -732,7 +699,8 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		cmp = hist_entry__cmp(&entry, he);
 
 		if (!cmp) {
-			he->count++;
+			hist_hit(he, ip);
+
 			return 0;
 		}
 
@@ -856,50 +824,6 @@ static void output__resort(void)
 	}
 }
 
-static size_t output__fprintf(FILE *fp, uint64_t total_samples)
-{
-	struct hist_entry *pos;
-	struct sort_entry *se;
-	struct rb_node *nd;
-	size_t ret = 0;
-
-	fprintf(fp, "\n");
-	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples);
-	fprintf(fp, "#\n");
-
-	fprintf(fp, "# Overhead");
-	list_for_each_entry(se, &hist_entry__sort_list, list)
-		fprintf(fp, "  %s", se->header);
-	fprintf(fp, "\n");
-
-	fprintf(fp, "# ........");
-	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		int i;
-
-		fprintf(fp, "  ");
-		for (i = 0; i < strlen(se->header); i++)
-			fprintf(fp, ".");
-	}
-	fprintf(fp, "\n");
-
-	fprintf(fp, "#\n");
-
-	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
-		pos = rb_entry(nd, struct hist_entry, rb_node);
-		ret += hist_entry__fprintf(fp, pos, total_samples);
-	}
-
-	if (!strcmp(sort_order, default_sort_order)) {
-		fprintf(fp, "#\n");
-		fprintf(fp, "# (For more details, try: perf annotate --sort comm,dso,symbol)\n");
-		fprintf(fp, "#\n");
-	}
-	fprintf(fp, "\n");
-
-	return ret;
-}
-
 static void register_idle_thread(void)
 {
 	struct thread *thread = threads__findnew(0);
@@ -1106,6 +1030,149 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static int
+parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
+{
+	char *line = NULL, *tmp, *tmp2;
+	unsigned int offset;
+	size_t line_len;
+	__u64 line_ip;
+	int ret;
+	char *c;
+
+	if (getline(&line, &line_len, file) < 0)
+		return -1;
+	if (!line)
+		return -1;
+
+	c = strchr(line, '\n');
+	if (c)
+		*c = 0;
+
+	line_ip = -1;
+	offset = 0;
+	ret = -2;
+
+	/*
+	 * Strip leading spaces:
+	 */
+	tmp = line;
+	while (*tmp) {
+		if (*tmp != ' ')
+			break;
+		tmp++;
+	}
+
+	if (*tmp) {
+		/*
+		 * Parse hexa addresses followed by ':'
+		 */
+		line_ip = strtoull(tmp, &tmp2, 16);
+		if (*tmp2 != ':')
+			line_ip = -1;
+	}
+
+	if (line_ip != -1) {
+		unsigned int hits = 0;
+		double percent = 0.0;
+		char *color = PERF_COLOR_NORMAL;
+
+		offset = line_ip - start;
+		if (offset < len)
+			hits = sym->hist[offset];
+
+		if (sym->hist_sum)
+			percent = 100.0 * hits / sym->hist_sum;
+
+		/*
+		 * We color high-overhead entries in red, low-overhead
+		 * entries in green - and keep the middle ground normal:
+		 */
+		if (percent >= 5.0)
+			color = PERF_COLOR_RED;
+		else {
+			if (percent > 0.5)
+				color = PERF_COLOR_GREEN;
+		}
+
+		color_fprintf(stdout, color, " %7.2f", percent);
+		printf(" :	");
+		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line);
+	} else {
+		if (!*line)
+			printf("         :\n");
+		else
+			printf("         :	%s\n", line);
+	}
+
+	return 0;
+}
+
+static void annotate_sym(struct dso *dso, struct symbol *sym)
+{
+	char *filename = dso->name;
+	uint64_t start, end, len;
+	char command[PATH_MAX*2];
+	FILE *file;
+
+	if (!filename)
+		return;
+	if (dso == kernel_dso)
+		filename = vmlinux;
+
+	printf("\n------------------------------------------------\n");
+	printf(" Percent |	Source code & Disassembly of %s\n", filename);
+	printf("------------------------------------------------\n");
+
+	if (verbose >= 2)
+		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name);
+
+	start = sym->obj_start;
+	if (!start)
+		start = sym->start;
+
+	end = start + sym->end - sym->start + 1;
+	len = sym->end - sym->start;
+
+	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename);
+
+	if (verbose >= 3)
+		printf("doing: %s\n", command);
+
+	file = popen(command, "r");
+	if (!file)
+		return;
+
+	while (!feof(file)) {
+		if (parse_line(file, sym, start, len) < 0)
+			break;
+	}
+
+	pclose(file);
+}
+
+static void find_annotations(void)
+{
+	struct rb_node *nd;
+	struct dso *dso;
+	int count = 0;
+
+	list_for_each_entry(dso, &dsos, node) {
+
+		for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) {
+			struct symbol *sym = rb_entry(nd, struct symbol, rb_node);
+
+			if (sym->hist) {
+				annotate_sym(dso, sym);
+				count++;
+			}
+		}
+	}
+
+	if (!count)
+		printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter);
+}
+
 static int __cmd_annotate(void)
 {
 	int ret, rc = EXIT_FAILURE;
@@ -1140,16 +1207,6 @@ static int __cmd_annotate(void)
 		return EXIT_FAILURE;
 	}
 
-	if (!full_paths) {
-		if (getcwd(__cwd, sizeof(__cwd)) == NULL) {
-			perror("failed to get the current directory");
-			return EXIT_FAILURE;
-		}
-		cwdlen = strlen(cwd);
-	} else {
-		cwd = NULL;
-		cwdlen = 0;
-	}
 remap:
 	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
 			   MAP_SHARED, input, offset);
@@ -1229,7 +1286,8 @@ static int __cmd_annotate(void)
 
 	collapse__resort();
 	output__resort();
-	output__fprintf(stdout, total);
+
+	find_annotations();
 
 	return rc;
 }
@@ -1242,15 +1300,13 @@ static const char * const annotate_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
+	OPT_STRING('s', "symbol", &sym_hist_filter, "file",
+		    "symbol to annotate"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
-	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
-		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"),
-	OPT_BOOLEAN('P', "full-paths", &full_paths,
-		    "Don't shorten the pathnames taking into account the cwd"),
 	OPT_END()
 };
 
@@ -1279,10 +1335,18 @@ int cmd_annotate(int argc, const char **argv, const char *prefix)
 
 	setup_sorting();
 
-	/*
-	 * Any (unrecognized) arguments left?
-	 */
-	if (argc)
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume tha
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+
+		sym_hist_filter = argv[0];
+	}
+
+	if (!sym_hist_filter)
 		usage_with_options(annotate_usage, options);
 
 	setup_pager();
diff --git a/Documentation/perf_counter/util/symbol.c b/Documentation/perf_counter/util/symbol.c
index a06bbfba8350a..23f4f7b3b83d1 100644
--- a/Documentation/perf_counter/util/symbol.c
+++ b/Documentation/perf_counter/util/symbol.c
@@ -7,21 +7,36 @@
 #include <gelf.h>
 #include <elf.h>
 
+const char *sym_hist_filter;
+
 static struct symbol *symbol__new(uint64_t start, uint64_t len,
-				  const char *name, unsigned int priv_size)
+				  const char *name, unsigned int priv_size,
+				  uint64_t obj_start, int verbose)
 {
 	size_t namelen = strlen(name) + 1;
-	struct symbol *self = malloc(priv_size + sizeof(*self) + namelen);
+	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
 
-	if (self != NULL) {
-		if (priv_size) {
-			memset(self, 0, priv_size);
-			self = ((void *)self) + priv_size;
-		}
-		self->start = start;
-		self->end   = start + len - 1;
-		memcpy(self->name, name, namelen);
+	if (!self)
+		return NULL;
+
+	if (verbose >= 2)
+		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
+			(__u64)start, len, name, self->hist, (void *)obj_start);
+
+	self->obj_start= obj_start;
+	self->hist = NULL;
+	self->hist_sum = 0;
+
+	if (sym_hist_filter && !strcmp(name, sym_hist_filter))
+		self->hist = calloc(sizeof(__u64), len);
+
+	if (priv_size) {
+		memset(self, 0, priv_size);
+		self = ((void *)self) + priv_size;
 	}
+	self->start = start;
+	self->end   = start + len - 1;
+	memcpy(self->name, name, namelen);
 
 	return self;
 }
@@ -166,7 +181,7 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verb
 		 * Well fix up the end later, when we have all sorted.
 		 */
 		sym = symbol__new(start, 0xdead, line + len + 2,
-				  self->sym_priv_size);
+				  self->sym_priv_size, 0, verbose);
 
 		if (sym == NULL)
 			goto out_delete_line;
@@ -272,7 +287,7 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
 				       GElf_Shdr *shdr_dynsym,
-				       size_t dynsym_idx)
+				       size_t dynsym_idx, int verbose)
 {
 	uint32_t nr_rel_entries, idx;
 	GElf_Sym sym;
@@ -335,7 +350,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 				 "%s@plt", elf_sym__name(&sym, symstrs));
 
 			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-					sympltname, self->sym_priv_size);
+					sympltname, self->sym_priv_size, 0, verbose);
 			if (!f)
 				return -1;
 
@@ -353,7 +368,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 				 "%s@plt", elf_sym__name(&sym, symstrs));
 
 			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
-					sympltname, self->sym_priv_size);
+					sympltname, self->sym_priv_size, 0, verbose);
 			if (!f)
 				return -1;
 
@@ -410,7 +425,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	if (sec_dynsym != NULL) {
 		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
 						 sec_dynsym, &shdr,
-						 dynsym_idx);
+						 dynsym_idx, verbose);
 		if (nr < 0)
 			goto out_elf_end;
 	}
@@ -444,6 +459,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
+		uint64_t obj_start;
 
 		if (!elf_sym__is_function(&sym))
 			continue;
@@ -453,11 +469,13 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 			goto out_elf_end;
 
 		gelf_getshdr(sec, &shdr);
+		obj_start = sym.st_value;
+
 		sym.st_value -= shdr.sh_addr - shdr.sh_offset;
 
 		f = symbol__new(sym.st_value, sym.st_size,
 				elf_sym__name(&sym, symstrs),
-				self->sym_priv_size);
+				self->sym_priv_size, obj_start, verbose);
 		if (!f)
 			goto out_elf_end;
 
diff --git a/Documentation/perf_counter/util/symbol.h b/Documentation/perf_counter/util/symbol.h
index e23cc31266843..4839d68f14f08 100644
--- a/Documentation/perf_counter/util/symbol.h
+++ b/Documentation/perf_counter/util/symbol.h
@@ -9,6 +9,9 @@ struct symbol {
 	struct rb_node	rb_node;
 	__u64		start;
 	__u64		end;
+	__u64		obj_start;
+	__u64		hist_sum;
+	__u64		*hist;
 	char		name[0];
 };
 
@@ -20,6 +23,8 @@ struct dso {
 	char		 name[0];
 };
 
+const char *sym_hist_filter;
+
 typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym);
 
 struct dso *dso__new(const char *name, unsigned int sym_priv_size);
-- 
GitLab


From 864709302a80f26fa9da3be5b47304f0b8bae192 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 20:33:43 +0200
Subject: [PATCH 1637/2198] perf_counter tools: Move from
 Documentation/perf_counter/ to tools/perf/

Several people have suggested that 'perf' has become a full-fledged
tool that should be moved out of Documentation/. Move it to the
(new) tools/ directory.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 {Documentation/perf_counter => tools/perf}/.gitignore             | 0
 {Documentation/perf_counter => tools/perf}/Documentation/Makefile | 0
 .../perf_counter => tools/perf}/Documentation/asciidoc.conf       | 0
 .../perf_counter => tools/perf}/Documentation/manpage-1.72.xsl    | 0
 .../perf_counter => tools/perf}/Documentation/manpage-base.xsl    | 0
 .../perf}/Documentation/manpage-bold-literal.xsl                  | 0
 .../perf_counter => tools/perf}/Documentation/manpage-normal.xsl  | 0
 .../perf}/Documentation/manpage-suppress-sp.xsl                   | 0
 .../perf_counter => tools/perf}/Documentation/perf-annotate.txt   | 0
 .../perf_counter => tools/perf}/Documentation/perf-help.txt       | 0
 .../perf_counter => tools/perf}/Documentation/perf-list.txt       | 0
 .../perf_counter => tools/perf}/Documentation/perf-record.txt     | 0
 .../perf_counter => tools/perf}/Documentation/perf-report.txt     | 0
 .../perf_counter => tools/perf}/Documentation/perf-stat.txt       | 0
 .../perf_counter => tools/perf}/Documentation/perf-top.txt        | 0
 {Documentation/perf_counter => tools/perf}/Documentation/perf.txt | 0
 {Documentation/perf_counter => tools/perf}/Makefile               | 0
 {Documentation/perf_counter => tools/perf}/builtin-annotate.c     | 0
 {Documentation/perf_counter => tools/perf}/builtin-help.c         | 0
 {Documentation/perf_counter => tools/perf}/builtin-list.c         | 0
 {Documentation/perf_counter => tools/perf}/builtin-record.c       | 0
 {Documentation/perf_counter => tools/perf}/builtin-report.c       | 0
 {Documentation/perf_counter => tools/perf}/builtin-stat.c         | 0
 {Documentation/perf_counter => tools/perf}/builtin-top.c          | 0
 {Documentation/perf_counter => tools/perf}/builtin.h              | 0
 {Documentation/perf_counter => tools/perf}/command-list.txt       | 0
 {Documentation/perf_counter => tools/perf}/design.txt             | 0
 {Documentation/perf_counter => tools/perf}/perf.c                 | 0
 {Documentation/perf_counter => tools/perf}/perf.h                 | 0
 {Documentation/perf_counter => tools/perf}/util/PERF-VERSION-GEN  | 0
 {Documentation/perf_counter => tools/perf}/util/abspath.c         | 0
 {Documentation/perf_counter => tools/perf}/util/alias.c           | 0
 {Documentation/perf_counter => tools/perf}/util/cache.h           | 0
 {Documentation/perf_counter => tools/perf}/util/color.c           | 0
 {Documentation/perf_counter => tools/perf}/util/color.h           | 0
 {Documentation/perf_counter => tools/perf}/util/config.c          | 0
 {Documentation/perf_counter => tools/perf}/util/ctype.c           | 0
 {Documentation/perf_counter => tools/perf}/util/environment.c     | 0
 {Documentation/perf_counter => tools/perf}/util/exec_cmd.c        | 0
 {Documentation/perf_counter => tools/perf}/util/exec_cmd.h        | 0
 .../perf_counter => tools/perf}/util/generate-cmdlist.sh          | 0
 {Documentation/perf_counter => tools/perf}/util/help.c            | 0
 {Documentation/perf_counter => tools/perf}/util/help.h            | 0
 {Documentation/perf_counter => tools/perf}/util/levenshtein.c     | 0
 {Documentation/perf_counter => tools/perf}/util/levenshtein.h     | 0
 {Documentation/perf_counter => tools/perf}/util/list.h            | 0
 {Documentation/perf_counter => tools/perf}/util/pager.c           | 0
 {Documentation/perf_counter => tools/perf}/util/parse-events.c    | 0
 {Documentation/perf_counter => tools/perf}/util/parse-events.h    | 0
 {Documentation/perf_counter => tools/perf}/util/parse-options.c   | 0
 {Documentation/perf_counter => tools/perf}/util/parse-options.h   | 0
 {Documentation/perf_counter => tools/perf}/util/path.c            | 0
 {Documentation/perf_counter => tools/perf}/util/quote.c           | 0
 {Documentation/perf_counter => tools/perf}/util/quote.h           | 0
 {Documentation/perf_counter => tools/perf}/util/rbtree.c          | 0
 {Documentation/perf_counter => tools/perf}/util/rbtree.h          | 0
 {Documentation/perf_counter => tools/perf}/util/run-command.c     | 0
 {Documentation/perf_counter => tools/perf}/util/run-command.h     | 0
 {Documentation/perf_counter => tools/perf}/util/sigchain.c        | 0
 {Documentation/perf_counter => tools/perf}/util/sigchain.h        | 0
 {Documentation/perf_counter => tools/perf}/util/strbuf.c          | 0
 {Documentation/perf_counter => tools/perf}/util/strbuf.h          | 0
 {Documentation/perf_counter => tools/perf}/util/string.c          | 0
 {Documentation/perf_counter => tools/perf}/util/string.h          | 0
 {Documentation/perf_counter => tools/perf}/util/symbol.c          | 0
 {Documentation/perf_counter => tools/perf}/util/symbol.h          | 0
 {Documentation/perf_counter => tools/perf}/util/usage.c           | 0
 {Documentation/perf_counter => tools/perf}/util/util.h            | 0
 {Documentation/perf_counter => tools/perf}/util/wrapper.c         | 0
 69 files changed, 0 insertions(+), 0 deletions(-)
 rename {Documentation/perf_counter => tools/perf}/.gitignore (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/Makefile (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/asciidoc.conf (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/manpage-1.72.xsl (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/manpage-base.xsl (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/manpage-bold-literal.xsl (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/manpage-normal.xsl (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/manpage-suppress-sp.xsl (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-annotate.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-help.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-list.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-record.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-report.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-stat.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf-top.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Documentation/perf.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/Makefile (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-annotate.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-help.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-list.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-record.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-report.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-stat.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin-top.c (100%)
 rename {Documentation/perf_counter => tools/perf}/builtin.h (100%)
 rename {Documentation/perf_counter => tools/perf}/command-list.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/design.txt (100%)
 rename {Documentation/perf_counter => tools/perf}/perf.c (100%)
 rename {Documentation/perf_counter => tools/perf}/perf.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/PERF-VERSION-GEN (100%)
 rename {Documentation/perf_counter => tools/perf}/util/abspath.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/alias.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/cache.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/color.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/color.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/config.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/ctype.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/environment.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/exec_cmd.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/exec_cmd.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/generate-cmdlist.sh (100%)
 rename {Documentation/perf_counter => tools/perf}/util/help.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/help.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/levenshtein.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/levenshtein.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/list.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/pager.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/parse-events.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/parse-events.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/parse-options.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/parse-options.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/path.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/quote.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/quote.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/rbtree.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/rbtree.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/run-command.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/run-command.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/sigchain.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/sigchain.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/strbuf.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/strbuf.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/string.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/string.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/symbol.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/symbol.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/usage.c (100%)
 rename {Documentation/perf_counter => tools/perf}/util/util.h (100%)
 rename {Documentation/perf_counter => tools/perf}/util/wrapper.c (100%)

diff --git a/Documentation/perf_counter/.gitignore b/tools/perf/.gitignore
similarity index 100%
rename from Documentation/perf_counter/.gitignore
rename to tools/perf/.gitignore
diff --git a/Documentation/perf_counter/Documentation/Makefile b/tools/perf/Documentation/Makefile
similarity index 100%
rename from Documentation/perf_counter/Documentation/Makefile
rename to tools/perf/Documentation/Makefile
diff --git a/Documentation/perf_counter/Documentation/asciidoc.conf b/tools/perf/Documentation/asciidoc.conf
similarity index 100%
rename from Documentation/perf_counter/Documentation/asciidoc.conf
rename to tools/perf/Documentation/asciidoc.conf
diff --git a/Documentation/perf_counter/Documentation/manpage-1.72.xsl b/tools/perf/Documentation/manpage-1.72.xsl
similarity index 100%
rename from Documentation/perf_counter/Documentation/manpage-1.72.xsl
rename to tools/perf/Documentation/manpage-1.72.xsl
diff --git a/Documentation/perf_counter/Documentation/manpage-base.xsl b/tools/perf/Documentation/manpage-base.xsl
similarity index 100%
rename from Documentation/perf_counter/Documentation/manpage-base.xsl
rename to tools/perf/Documentation/manpage-base.xsl
diff --git a/Documentation/perf_counter/Documentation/manpage-bold-literal.xsl b/tools/perf/Documentation/manpage-bold-literal.xsl
similarity index 100%
rename from Documentation/perf_counter/Documentation/manpage-bold-literal.xsl
rename to tools/perf/Documentation/manpage-bold-literal.xsl
diff --git a/Documentation/perf_counter/Documentation/manpage-normal.xsl b/tools/perf/Documentation/manpage-normal.xsl
similarity index 100%
rename from Documentation/perf_counter/Documentation/manpage-normal.xsl
rename to tools/perf/Documentation/manpage-normal.xsl
diff --git a/Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl b/tools/perf/Documentation/manpage-suppress-sp.xsl
similarity index 100%
rename from Documentation/perf_counter/Documentation/manpage-suppress-sp.xsl
rename to tools/perf/Documentation/manpage-suppress-sp.xsl
diff --git a/Documentation/perf_counter/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-annotate.txt
rename to tools/perf/Documentation/perf-annotate.txt
diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/tools/perf/Documentation/perf-help.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-help.txt
rename to tools/perf/Documentation/perf-help.txt
diff --git a/Documentation/perf_counter/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-list.txt
rename to tools/perf/Documentation/perf-list.txt
diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-record.txt
rename to tools/perf/Documentation/perf-record.txt
diff --git a/Documentation/perf_counter/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-report.txt
rename to tools/perf/Documentation/perf-report.txt
diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-stat.txt
rename to tools/perf/Documentation/perf-stat.txt
diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf-top.txt
rename to tools/perf/Documentation/perf-top.txt
diff --git a/Documentation/perf_counter/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
similarity index 100%
rename from Documentation/perf_counter/Documentation/perf.txt
rename to tools/perf/Documentation/perf.txt
diff --git a/Documentation/perf_counter/Makefile b/tools/perf/Makefile
similarity index 100%
rename from Documentation/perf_counter/Makefile
rename to tools/perf/Makefile
diff --git a/Documentation/perf_counter/builtin-annotate.c b/tools/perf/builtin-annotate.c
similarity index 100%
rename from Documentation/perf_counter/builtin-annotate.c
rename to tools/perf/builtin-annotate.c
diff --git a/Documentation/perf_counter/builtin-help.c b/tools/perf/builtin-help.c
similarity index 100%
rename from Documentation/perf_counter/builtin-help.c
rename to tools/perf/builtin-help.c
diff --git a/Documentation/perf_counter/builtin-list.c b/tools/perf/builtin-list.c
similarity index 100%
rename from Documentation/perf_counter/builtin-list.c
rename to tools/perf/builtin-list.c
diff --git a/Documentation/perf_counter/builtin-record.c b/tools/perf/builtin-record.c
similarity index 100%
rename from Documentation/perf_counter/builtin-record.c
rename to tools/perf/builtin-record.c
diff --git a/Documentation/perf_counter/builtin-report.c b/tools/perf/builtin-report.c
similarity index 100%
rename from Documentation/perf_counter/builtin-report.c
rename to tools/perf/builtin-report.c
diff --git a/Documentation/perf_counter/builtin-stat.c b/tools/perf/builtin-stat.c
similarity index 100%
rename from Documentation/perf_counter/builtin-stat.c
rename to tools/perf/builtin-stat.c
diff --git a/Documentation/perf_counter/builtin-top.c b/tools/perf/builtin-top.c
similarity index 100%
rename from Documentation/perf_counter/builtin-top.c
rename to tools/perf/builtin-top.c
diff --git a/Documentation/perf_counter/builtin.h b/tools/perf/builtin.h
similarity index 100%
rename from Documentation/perf_counter/builtin.h
rename to tools/perf/builtin.h
diff --git a/Documentation/perf_counter/command-list.txt b/tools/perf/command-list.txt
similarity index 100%
rename from Documentation/perf_counter/command-list.txt
rename to tools/perf/command-list.txt
diff --git a/Documentation/perf_counter/design.txt b/tools/perf/design.txt
similarity index 100%
rename from Documentation/perf_counter/design.txt
rename to tools/perf/design.txt
diff --git a/Documentation/perf_counter/perf.c b/tools/perf/perf.c
similarity index 100%
rename from Documentation/perf_counter/perf.c
rename to tools/perf/perf.c
diff --git a/Documentation/perf_counter/perf.h b/tools/perf/perf.h
similarity index 100%
rename from Documentation/perf_counter/perf.h
rename to tools/perf/perf.h
diff --git a/Documentation/perf_counter/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN
similarity index 100%
rename from Documentation/perf_counter/util/PERF-VERSION-GEN
rename to tools/perf/util/PERF-VERSION-GEN
diff --git a/Documentation/perf_counter/util/abspath.c b/tools/perf/util/abspath.c
similarity index 100%
rename from Documentation/perf_counter/util/abspath.c
rename to tools/perf/util/abspath.c
diff --git a/Documentation/perf_counter/util/alias.c b/tools/perf/util/alias.c
similarity index 100%
rename from Documentation/perf_counter/util/alias.c
rename to tools/perf/util/alias.c
diff --git a/Documentation/perf_counter/util/cache.h b/tools/perf/util/cache.h
similarity index 100%
rename from Documentation/perf_counter/util/cache.h
rename to tools/perf/util/cache.h
diff --git a/Documentation/perf_counter/util/color.c b/tools/perf/util/color.c
similarity index 100%
rename from Documentation/perf_counter/util/color.c
rename to tools/perf/util/color.c
diff --git a/Documentation/perf_counter/util/color.h b/tools/perf/util/color.h
similarity index 100%
rename from Documentation/perf_counter/util/color.h
rename to tools/perf/util/color.h
diff --git a/Documentation/perf_counter/util/config.c b/tools/perf/util/config.c
similarity index 100%
rename from Documentation/perf_counter/util/config.c
rename to tools/perf/util/config.c
diff --git a/Documentation/perf_counter/util/ctype.c b/tools/perf/util/ctype.c
similarity index 100%
rename from Documentation/perf_counter/util/ctype.c
rename to tools/perf/util/ctype.c
diff --git a/Documentation/perf_counter/util/environment.c b/tools/perf/util/environment.c
similarity index 100%
rename from Documentation/perf_counter/util/environment.c
rename to tools/perf/util/environment.c
diff --git a/Documentation/perf_counter/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
similarity index 100%
rename from Documentation/perf_counter/util/exec_cmd.c
rename to tools/perf/util/exec_cmd.c
diff --git a/Documentation/perf_counter/util/exec_cmd.h b/tools/perf/util/exec_cmd.h
similarity index 100%
rename from Documentation/perf_counter/util/exec_cmd.h
rename to tools/perf/util/exec_cmd.h
diff --git a/Documentation/perf_counter/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh
similarity index 100%
rename from Documentation/perf_counter/util/generate-cmdlist.sh
rename to tools/perf/util/generate-cmdlist.sh
diff --git a/Documentation/perf_counter/util/help.c b/tools/perf/util/help.c
similarity index 100%
rename from Documentation/perf_counter/util/help.c
rename to tools/perf/util/help.c
diff --git a/Documentation/perf_counter/util/help.h b/tools/perf/util/help.h
similarity index 100%
rename from Documentation/perf_counter/util/help.h
rename to tools/perf/util/help.h
diff --git a/Documentation/perf_counter/util/levenshtein.c b/tools/perf/util/levenshtein.c
similarity index 100%
rename from Documentation/perf_counter/util/levenshtein.c
rename to tools/perf/util/levenshtein.c
diff --git a/Documentation/perf_counter/util/levenshtein.h b/tools/perf/util/levenshtein.h
similarity index 100%
rename from Documentation/perf_counter/util/levenshtein.h
rename to tools/perf/util/levenshtein.h
diff --git a/Documentation/perf_counter/util/list.h b/tools/perf/util/list.h
similarity index 100%
rename from Documentation/perf_counter/util/list.h
rename to tools/perf/util/list.h
diff --git a/Documentation/perf_counter/util/pager.c b/tools/perf/util/pager.c
similarity index 100%
rename from Documentation/perf_counter/util/pager.c
rename to tools/perf/util/pager.c
diff --git a/Documentation/perf_counter/util/parse-events.c b/tools/perf/util/parse-events.c
similarity index 100%
rename from Documentation/perf_counter/util/parse-events.c
rename to tools/perf/util/parse-events.c
diff --git a/Documentation/perf_counter/util/parse-events.h b/tools/perf/util/parse-events.h
similarity index 100%
rename from Documentation/perf_counter/util/parse-events.h
rename to tools/perf/util/parse-events.h
diff --git a/Documentation/perf_counter/util/parse-options.c b/tools/perf/util/parse-options.c
similarity index 100%
rename from Documentation/perf_counter/util/parse-options.c
rename to tools/perf/util/parse-options.c
diff --git a/Documentation/perf_counter/util/parse-options.h b/tools/perf/util/parse-options.h
similarity index 100%
rename from Documentation/perf_counter/util/parse-options.h
rename to tools/perf/util/parse-options.h
diff --git a/Documentation/perf_counter/util/path.c b/tools/perf/util/path.c
similarity index 100%
rename from Documentation/perf_counter/util/path.c
rename to tools/perf/util/path.c
diff --git a/Documentation/perf_counter/util/quote.c b/tools/perf/util/quote.c
similarity index 100%
rename from Documentation/perf_counter/util/quote.c
rename to tools/perf/util/quote.c
diff --git a/Documentation/perf_counter/util/quote.h b/tools/perf/util/quote.h
similarity index 100%
rename from Documentation/perf_counter/util/quote.h
rename to tools/perf/util/quote.h
diff --git a/Documentation/perf_counter/util/rbtree.c b/tools/perf/util/rbtree.c
similarity index 100%
rename from Documentation/perf_counter/util/rbtree.c
rename to tools/perf/util/rbtree.c
diff --git a/Documentation/perf_counter/util/rbtree.h b/tools/perf/util/rbtree.h
similarity index 100%
rename from Documentation/perf_counter/util/rbtree.h
rename to tools/perf/util/rbtree.h
diff --git a/Documentation/perf_counter/util/run-command.c b/tools/perf/util/run-command.c
similarity index 100%
rename from Documentation/perf_counter/util/run-command.c
rename to tools/perf/util/run-command.c
diff --git a/Documentation/perf_counter/util/run-command.h b/tools/perf/util/run-command.h
similarity index 100%
rename from Documentation/perf_counter/util/run-command.h
rename to tools/perf/util/run-command.h
diff --git a/Documentation/perf_counter/util/sigchain.c b/tools/perf/util/sigchain.c
similarity index 100%
rename from Documentation/perf_counter/util/sigchain.c
rename to tools/perf/util/sigchain.c
diff --git a/Documentation/perf_counter/util/sigchain.h b/tools/perf/util/sigchain.h
similarity index 100%
rename from Documentation/perf_counter/util/sigchain.h
rename to tools/perf/util/sigchain.h
diff --git a/Documentation/perf_counter/util/strbuf.c b/tools/perf/util/strbuf.c
similarity index 100%
rename from Documentation/perf_counter/util/strbuf.c
rename to tools/perf/util/strbuf.c
diff --git a/Documentation/perf_counter/util/strbuf.h b/tools/perf/util/strbuf.h
similarity index 100%
rename from Documentation/perf_counter/util/strbuf.h
rename to tools/perf/util/strbuf.h
diff --git a/Documentation/perf_counter/util/string.c b/tools/perf/util/string.c
similarity index 100%
rename from Documentation/perf_counter/util/string.c
rename to tools/perf/util/string.c
diff --git a/Documentation/perf_counter/util/string.h b/tools/perf/util/string.h
similarity index 100%
rename from Documentation/perf_counter/util/string.h
rename to tools/perf/util/string.h
diff --git a/Documentation/perf_counter/util/symbol.c b/tools/perf/util/symbol.c
similarity index 100%
rename from Documentation/perf_counter/util/symbol.c
rename to tools/perf/util/symbol.c
diff --git a/Documentation/perf_counter/util/symbol.h b/tools/perf/util/symbol.h
similarity index 100%
rename from Documentation/perf_counter/util/symbol.h
rename to tools/perf/util/symbol.h
diff --git a/Documentation/perf_counter/util/usage.c b/tools/perf/util/usage.c
similarity index 100%
rename from Documentation/perf_counter/util/usage.c
rename to tools/perf/util/usage.c
diff --git a/Documentation/perf_counter/util/util.h b/tools/perf/util/util.h
similarity index 100%
rename from Documentation/perf_counter/util/util.h
rename to tools/perf/util/util.h
diff --git a/Documentation/perf_counter/util/wrapper.c b/tools/perf/util/wrapper.c
similarity index 100%
rename from Documentation/perf_counter/util/wrapper.c
rename to tools/perf/util/wrapper.c
-- 
GitLab


From 7d37a0cbd68c875fa984fa97bcf5c7f4b7950b6d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Jun 2009 20:36:38 +0200
Subject: [PATCH 1638/2198] perf_counter tools: Warning fixes on 32-bit

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 4 ++--
 tools/perf/util/symbol.c      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 116a3978b44c7..4a3c279160c5c 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -668,9 +668,9 @@ static void hist_hit(struct hist_entry *he, uint64_t ip)
 
 	if (verbose >= 3)
 		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n",
-			(void *)he->sym->start,
+			(void *)(unsigned long)he->sym->start,
 			he->sym->name,
-			(void *)ip, ip - he->sym->start,
+			(void *)(unsigned long)ip, ip - he->sym->start,
 			sym->hist[offset]);
 }
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 23f4f7b3b83d1..253821d1efdb8 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -21,7 +21,7 @@ static struct symbol *symbol__new(uint64_t start, uint64_t len,
 
 	if (verbose >= 2)
 		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n",
-			(__u64)start, len, name, self->hist, (void *)obj_start);
+			(__u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start);
 
 	self->obj_start= obj_start;
 	self->hist = NULL;
-- 
GitLab


From 8953645fec933f992223286ad407dc371ac2caa5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 21:04:17 +0200
Subject: [PATCH 1639/2198] perf_counter tools: Fix error condition in
 parse_aliases()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

gcc warned about this bug:

util/parse-events.c: In function ‘parse_generic_hw_symbols’:
util/parse-events.c:175: warning: comparison is always false due to limited range of data type
util/parse-events.c:182: warning: comparison is always false due to limited range of data type
util/parse-events.c:190: warning: comparison is always false due to limited range of data type

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index e0820b4388aef..f18a9a006e1bd 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -160,12 +160,12 @@ static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size)
 		}
 	}
 
-	return 0;
+	return -1;
 }
 
 static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr)
 {
-	__u8 cache_type = -1, cache_op = 0, cache_result = 0;
+	int cache_type = -1, cache_op = 0, cache_result = 0;
 
 	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX);
 	/*
@@ -179,8 +179,8 @@ static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *a
 	/*
 	 * Fall back to reads:
 	 */
-	if (cache_type == -1)
-		cache_type = PERF_COUNT_HW_CACHE_OP_READ;
+	if (cache_op == -1)
+		cache_op = PERF_COUNT_HW_CACHE_OP_READ;
 
 	cache_result = parse_aliases(str, hw_cache_result,
 					PERF_COUNT_HW_CACHE_RESULT_MAX);
-- 
GitLab


From 39273ee9756917129de3190d469b0b120f87e763 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 21:17:03 +0200
Subject: [PATCH 1640/2198] perf annotate: Automatically pick up vmlinux in the
 local directory

Right now kernel debug info does not get resolved by default, because
we dont know where to look for the vmlinux.

The -k option can be used for that - but if no option is given, pick
up vmlinux files in the current directory - in case a kernel hacker
runs profiling from the source directory that the kernel was built in.

The real solution would be to embedd the location (and perhaps the
date/timestamp) of the vmlinux file in /proc/kallsyms, so that
tools can pick it up automatically.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 4a3c279160c5c..80c5aa0bb42ec 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -26,7 +26,7 @@
 #define SHOW_HV		4
 
 static char		const *input_name = "perf.data";
-static char		*vmlinux = NULL;
+static char		*vmlinux = "vmlinux";
 
 static char		default_sort_order[] = "comm,symbol";
 static char		*sort_order = default_sort_order;
-- 
GitLab


From e9fbc9dc9214d6a9de7d62627be5414804fd7b9f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 6 Jun 2009 21:22:33 +0200
Subject: [PATCH 1641/2198] perf_counter tools: Initialize a stack variable
 before use

the "perf report" utility crashed in some circumstances
because the "sym" stack variable was not initialized before used
(as also proven by valgrind).

With this fix both the crash goes away and valgrind no longer complains.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/symbol.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 253821d1efdb8..158588c7f6b1d 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -457,6 +457,8 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	nr_syms = shdr.sh_size / shdr.sh_entsize;
 
+	memset(&sym, 0, sizeof(sym));
+
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
 		uint64_t obj_start;
-- 
GitLab


From 23b87116c7c4f73597965218b66041acbdb4e79f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 6 Jun 2009 21:25:29 +0200
Subject: [PATCH 1642/2198] perf annotate: Fix command line help text

Arjan noticed this bug in the perf annotate help output:

    -s, --symbol <file>   symbol to annotate

that should be <symbol> instead.

Reported-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 80c5aa0bb42ec..0e23fe98ec4ec 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -1300,7 +1300,7 @@ static const char * const annotate_usage[] = {
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
-	OPT_STRING('s', "symbol", &sym_hist_filter, "file",
+	OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
 		    "symbol to annotate"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
-- 
GitLab


From 4ae1507f6d266d0cc3dd36e474d83aad70fec9e4 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Sun, 24 May 2009 18:45:15 -0400
Subject: [PATCH 1643/2198] cifs: make overriding of ownership conditional on
 new mount options

We have a bit of a problem with the uid= option. The basic issue is that
it means too many things and has too many side-effects.

It's possible to allow an unprivileged user to mount a filesystem if the
user owns the mountpoint, /bin/mount is setuid root, and the mount is
set up in /etc/fstab with the "user" option.

When doing this though, /bin/mount automatically adds the "uid=" and
"gid=" options to the share. This is fortunate since the correct uid=
option is needed in order to tell the upcall what user's credcache to
use when generating the SPNEGO blob.

On a mount without unix extensions this is fine -- you generally will
want the files to be owned by the "owner" of the mount. The problem
comes in on a mount with unix extensions. With those enabled, the
uid/gid options cause the ownership of files to be overriden even though
the server is sending along the ownership info.

This means that it's not possible to have a mount by an unprivileged
user that shows the server's file ownership info. The result is also
inode permissions that have no reflection at all on the server. You
simply cannot separate ownership from the mode in this fashion.

This behavior also makes MultiuserMount option less usable. Once you
pass in the uid= option for a mount, then you can't use unix ownership
info and allow someone to share the mount.

While I'm not thrilled with it, the only solution I can see is to stop
making uid=/gid= force the overriding of ownership on mounts, and to add
new mount options that turn this behavior on.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 74b5a87e9195d..10151f8d84955 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1096,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname,
 				return 1;
 			}
 		} else if (strnicmp(data, "uid", 3) == 0) {
-			if (value && *value) {
+			if (value && *value)
 				vol->linux_uid =
 					simple_strtoul(value, &value, 0);
+		} else if (strnicmp(data, "forceuid", 8) == 0) {
 				vol->override_uid = 1;
-			}
 		} else if (strnicmp(data, "gid", 3) == 0) {
-			if (value && *value) {
+			if (value && *value)
 				vol->linux_gid =
 					simple_strtoul(value, &value, 0);
+		} else if (strnicmp(data, "forcegid", 8) == 0) {
 				vol->override_gid = 1;
-			}
 		} else if (strnicmp(data, "file_mode", 4) == 0) {
 			if (value && *value) {
 				vol->file_mode =
-- 
GitLab


From f0472d0ec89bef2ea4432828c3daa1b26ef569aa Mon Sep 17 00:00:00 2001
From: Steve French <sfrench@us.ibm.com>
Date: Sat, 6 Jun 2009 21:09:39 +0000
Subject: [PATCH 1644/2198] [CIFS] Add mention of new mount parm (forceuid) to
 cifs readme

Also update fs/cifs/CHANGES

Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/CHANGES | 5 ++++-
 fs/cifs/README  | 9 ++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
index 227c681b816de..b486898394288 100644
--- a/fs/cifs/CHANGES
+++ b/fs/cifs/CHANGES
@@ -2,7 +2,10 @@ Version 1.59
 ------------
 Client uses server inode numbers (which are persistent) rather than
 client generated ones by default (mount option "serverino" turned
-on by default if server supports it).
+on by default if server supports it).  Add forceuid and forcegid
+mount options (so that when negotiating unix extensions specifying
+which uid mounted does not immediately force the server's reported
+uids to be overridden).
 
 Version 1.58
 ------------
diff --git a/fs/cifs/README b/fs/cifs/README
index 6d1608fabde9d..ad92921dbde41 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -262,7 +262,8 @@ A partial list of the supported mount options follows:
 		mount.	
   domain	Set the SMB/CIFS workgroup name prepended to the
 		username during CIFS session establishment
-  uid		Set the default uid for inodes. For mounts to servers
+  forceuid	Set the default uid for inodes based on the uid
+		passed in. For mounts to servers
 		which do support the CIFS Unix extensions, such as a
 		properly configured Samba server, the server provides
 		the uid, gid and mode so this parameter should  not be
@@ -292,6 +293,12 @@ A partial list of the supported mount options follows:
 		the client.  Note that the mount.cifs helper must be
 		at version 1.10 or higher to support specifying the uid
 		(or gid) in non-numeric form.
+  forcegid	(similar to above but for the groupid instead of uid)
+  uid		Set the default uid for inodes, and indicate to the
+		cifs kernel driver which local user mounted . If the server
+		supports the unix extensions the default uid is
+		not used to fill in the owner fields of inodes (files)
+		unless the "forceuid" parameter is specified.
   gid		Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
 		this overrides the default mode for file inodes.
-- 
GitLab


From 2f01190aa62fe9dd0a98205927b9f09fd191c017 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 6 Jun 2009 23:10:43 +0200
Subject: [PATCH 1645/2198] perf top: Wait for a minimal set of events before
 reading first snapshot

The first snapshot reading often occur before any events have
been read in the mapped perfcounter files.

Just wait until we have at least one event before starting the
snapshot, or the delay before the first set of entries to be
displayed may be long in case of low refresh rate.

Note: we could also use a semaphore to wait before
"print_entries" number of eveents is reached, but again this
value is tunable and we can't ensure we will even reach it.
Also we could base on a default mimimum set of entries for the
first refresh, say 15, but again, the minimal sample is
tunable, and we could end up displaying nothing until we have a
minimal default set of events, which can take some time in case
of high samples filters.

Hence this simple solution which partially covers the default
case.

[ Impact: fix display artifacts in perf top ]

Signed-off-by: Frederic Weisbecker <fweisbeec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1244322643-6447-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-top.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index f2e7312f85c94..fdc1d5863b012 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -420,7 +420,7 @@ static unsigned int mmap_read_head(struct mmap_data *md)
 
 struct timeval last_read, this_read;
 
-static void mmap_read(struct mmap_data *md)
+static void mmap_read_counter(struct mmap_data *md)
 {
 	unsigned int head = mmap_read_head(md);
 	unsigned int old = md->prev;
@@ -517,6 +517,16 @@ static void mmap_read(struct mmap_data *md)
 static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
 static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
 
+static void mmap_read(void)
+{
+	int i, counter;
+
+	for (i = 0; i < nr_cpus; i++) {
+		for (counter = 0; counter < nr_counters; counter++)
+			mmap_read_counter(&mmap_array[i][counter]);
+	}
+}
+
 static int __cmd_top(void)
 {
 	struct perf_counter_attr *attr;
@@ -571,6 +581,11 @@ static int __cmd_top(void)
 		}
 	}
 
+	/* Wait for a minimal set of events before starting the snapshot */
+	poll(event_array, nr_poll, 100);
+
+	mmap_read();
+
 	if (pthread_create(&thread, NULL, display_thread, NULL)) {
 		printf("Could not create display thread.\n");
 		exit(-1);
@@ -589,10 +604,7 @@ static int __cmd_top(void)
 	while (1) {
 		int hits = samples;
 
-		for (i = 0; i < nr_cpus; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
-		}
+		mmap_read();
 
 		if (hits == samples)
 			ret = poll(event_array, nr_poll, 100);
-- 
GitLab


From 7caf6a49bb17d0377210693af5737563b31aa5ee Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Fri, 5 Jun 2009 12:01:35 +0200
Subject: [PATCH 1646/2198] dma-debug: change hash_bucket_find from first-fit
 to best-fit

Some device drivers map the same physical address multiple times to a
dma address. Without an IOMMU this results in the same dma address being
put into the dma-debug hash multiple times. With a first-fit match in
hash_bucket_find() this function may return the wrong dma_debug_entry.

This can result in false positive warnings. This patch fixes it by
changing the first-fit behavior of hash_bucket_find() into a best-fit
algorithm.

Reported-by: Torsten Kaiser <just.for.lkml@googlemail.com>
Reported-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Cc: lethal@linux-sh.org
Cc: just.for.lkml@googlemail.com
Cc: hancockrwd@gmail.com
Cc: jens.axboe@oracle.com
Cc: bharrosh@panasas.com
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <20090605104132.GE24836@amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 lib/dma-debug.c | 43 +++++++++++++++++++++++++++++++++++++++----
 1 file changed, 39 insertions(+), 4 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index cdd205d6bf7c6..8fcc09c91e1b1 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -186,15 +186,50 @@ static void put_hash_bucket(struct hash_bucket *bucket,
 static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket,
 						struct dma_debug_entry *ref)
 {
-	struct dma_debug_entry *entry;
+	struct dma_debug_entry *entry, *ret = NULL;
+	int matches = 0, match_lvl, last_lvl = 0;
 
 	list_for_each_entry(entry, &bucket->list, list) {
-		if ((entry->dev_addr == ref->dev_addr) &&
-		    (entry->dev == ref->dev))
+		if ((entry->dev_addr != ref->dev_addr) ||
+		    (entry->dev != ref->dev))
+			continue;
+
+		/*
+		 * Some drivers map the same physical address multiple
+		 * times. Without a hardware IOMMU this results in the
+		 * same device addresses being put into the dma-debug
+		 * hash multiple times too. This can result in false
+		 * positives being reported. Therfore we implement a
+		 * best-fit algorithm here which returns the entry from
+		 * the hash which fits best to the reference value
+		 * instead of the first-fit.
+		 */
+		matches += 1;
+		match_lvl = 0;
+		entry->size      == ref->size      ? ++match_lvl : match_lvl;
+		entry->type      == ref->type      ? ++match_lvl : match_lvl;
+		entry->direction == ref->direction ? ++match_lvl : match_lvl;
+
+		if (match_lvl == 3) {
+			/* perfect-fit - return the result */
 			return entry;
+		} else if (match_lvl > last_lvl) {
+			/*
+			 * We found an entry that fits better then the
+			 * previous one
+			 */
+			last_lvl = match_lvl;
+			ret      = entry;
+		}
 	}
 
-	return NULL;
+	/*
+	 * If we have multiple matches but no perfect-fit, just return
+	 * NULL.
+	 */
+	ret = (matches == 1) ? ret : NULL;
+
+	return ret;
 }
 
 /*
-- 
GitLab


From 5095f59bda6793a7b8f0856096d6893fe98e0e51 Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Fri, 5 Jun 2009 23:27:17 +0530
Subject: [PATCH 1647/2198] x86: cpu_debug: Remove model information to reduce
 encoding-decoding

Remove model information, encoding/decoding and reduce bookkeeping.

This, besides removing a lot of code and cleaning up the code, also
enables these features on many more CPUs that were enumerated before.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
LKML-Reference: <1244224637.8212.6.camel@ht.satnam>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/cpu_debug.h | 101 +-------
 arch/x86/kernel/cpu/cpu_debug.c  | 417 +++++++------------------------
 2 files changed, 97 insertions(+), 421 deletions(-)

diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
index 222802029fa6c..d96c1ee3a95cc 100644
--- a/arch/x86/include/asm/cpu_debug.h
+++ b/arch/x86/include/asm/cpu_debug.h
@@ -86,105 +86,7 @@ enum cpu_file_bit {
 	CPU_VALUE_BIT,				/* value		*/
 };
 
-#define	CPU_FILE_VALUE			(1 << CPU_VALUE_BIT)
-
-/*
- * DisplayFamily_DisplayModel	Processor Families/Processor Number Series
- * --------------------------	------------------------------------------
- * 05_01, 05_02, 05_04		Pentium, Pentium with MMX
- *
- * 06_01			Pentium Pro
- * 06_03, 06_05			Pentium II Xeon, Pentium II
- * 06_07, 06_08, 06_0A, 06_0B	Pentium III Xeon, Pentum III
- *
- * 06_09, 060D			Pentium M
- *
- * 06_0E			Core Duo, Core Solo
- *
- * 06_0F			Xeon 3000, 3200, 5100, 5300, 7300 series,
- *				Core 2 Quad, Core 2 Extreme, Core 2 Duo,
- *				Pentium dual-core
- * 06_17			Xeon 5200, 5400 series, Core 2 Quad Q9650
- *
- * 06_1C			Atom
- *
- * 0F_00, 0F_01, 0F_02		Xeon, Xeon MP, Pentium 4
- * 0F_03, 0F_04			Xeon, Xeon MP, Pentium 4, Pentium D
- *
- * 0F_06			Xeon 7100, 5000 Series, Xeon MP,
- *				Pentium 4, Pentium D
- */
-
-/* Register processors bits */
-enum cpu_processor_bit {
-	CPU_NONE,
-/* Intel */
-	CPU_INTEL_PENTIUM_BIT,
-	CPU_INTEL_P6_BIT,
-	CPU_INTEL_PENTIUM_M_BIT,
-	CPU_INTEL_CORE_BIT,
-	CPU_INTEL_CORE2_BIT,
-	CPU_INTEL_ATOM_BIT,
-	CPU_INTEL_XEON_P4_BIT,
-	CPU_INTEL_XEON_MP_BIT,
-/* AMD */
-	CPU_AMD_K6_BIT,
-	CPU_AMD_K7_BIT,
-	CPU_AMD_K8_BIT,
-	CPU_AMD_0F_BIT,
-	CPU_AMD_10_BIT,
-	CPU_AMD_11_BIT,
-};
-
-#define	CPU_INTEL_PENTIUM	(1 << CPU_INTEL_PENTIUM_BIT)
-#define	CPU_INTEL_P6		(1 << CPU_INTEL_P6_BIT)
-#define	CPU_INTEL_PENTIUM_M	(1 << CPU_INTEL_PENTIUM_M_BIT)
-#define	CPU_INTEL_CORE		(1 << CPU_INTEL_CORE_BIT)
-#define	CPU_INTEL_CORE2		(1 << CPU_INTEL_CORE2_BIT)
-#define	CPU_INTEL_ATOM		(1 << CPU_INTEL_ATOM_BIT)
-#define	CPU_INTEL_XEON_P4	(1 << CPU_INTEL_XEON_P4_BIT)
-#define	CPU_INTEL_XEON_MP	(1 << CPU_INTEL_XEON_MP_BIT)
-
-#define	CPU_INTEL_PX		(CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
-#define	CPU_INTEL_COREX		(CPU_INTEL_CORE | CPU_INTEL_CORE2)
-#define	CPU_INTEL_XEON		(CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
-#define	CPU_CO_AT		(CPU_INTEL_CORE | CPU_INTEL_ATOM)
-#define	CPU_C2_AT		(CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
-#define	CPU_CX_AT		(CPU_INTEL_COREX | CPU_INTEL_ATOM)
-#define	CPU_CX_XE		(CPU_INTEL_COREX | CPU_INTEL_XEON)
-#define	CPU_P6_XE		(CPU_INTEL_P6 | CPU_INTEL_XEON)
-#define	CPU_PM_CO_AT		(CPU_INTEL_PENTIUM_M | CPU_CO_AT)
-#define	CPU_C2_AT_XE		(CPU_C2_AT | CPU_INTEL_XEON)
-#define	CPU_CX_AT_XE		(CPU_CX_AT | CPU_INTEL_XEON)
-#define	CPU_P6_CX_AT		(CPU_INTEL_P6 | CPU_CX_AT)
-#define	CPU_P6_CX_XE		(CPU_P6_XE | CPU_INTEL_COREX)
-#define	CPU_P6_CX_AT_XE		(CPU_INTEL_P6 | CPU_CX_AT_XE)
-#define	CPU_PM_CX_AT_XE		(CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
-#define	CPU_PM_CX_AT		(CPU_INTEL_PENTIUM_M | CPU_CX_AT)
-#define	CPU_PM_CX_XE		(CPU_INTEL_PENTIUM_M | CPU_CX_XE)
-#define	CPU_PX_CX_AT		(CPU_INTEL_PX | CPU_CX_AT)
-#define	CPU_PX_CX_AT_XE		(CPU_INTEL_PX | CPU_CX_AT_XE)
-
-/* Select all supported Intel CPUs */
-#define	CPU_INTEL_ALL		(CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
-
-#define	CPU_AMD_K6		(1 << CPU_AMD_K6_BIT)
-#define	CPU_AMD_K7		(1 << CPU_AMD_K7_BIT)
-#define	CPU_AMD_K8		(1 << CPU_AMD_K8_BIT)
-#define	CPU_AMD_0F		(1 << CPU_AMD_0F_BIT)
-#define	CPU_AMD_10		(1 << CPU_AMD_10_BIT)
-#define	CPU_AMD_11		(1 << CPU_AMD_11_BIT)
-
-#define	CPU_K10_PLUS		(CPU_AMD_10 | CPU_AMD_11)
-#define	CPU_K0F_PLUS		(CPU_AMD_0F | CPU_K10_PLUS)
-#define	CPU_K8_PLUS		(CPU_AMD_K8 | CPU_K0F_PLUS)
-#define	CPU_K7_PLUS		(CPU_AMD_K7 | CPU_K8_PLUS)
-
-/* Select all supported AMD CPUs */
-#define	CPU_AMD_ALL		(CPU_AMD_K6 | CPU_K7_PLUS)
-
-/* Select all supported CPUs */
-#define	CPU_ALL			(CPU_INTEL_ALL | CPU_AMD_ALL)
+#define	CPU_FILE_VALUE		(1 << CPU_VALUE_BIT)
 
 #define MAX_CPU_FILES		512
 
@@ -220,7 +122,6 @@ struct cpu_debug_range {
 	unsigned		min;		/* Register range min	*/
 	unsigned		max;		/* Register range max	*/
 	unsigned		flag;		/* Supported flags	*/
-	unsigned		model;		/* Supported models	*/
 };
 
 #endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
index 46e29ab96c6ac..86afe13fc3110 100644
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ b/arch/x86/kernel/cpu/cpu_debug.c
@@ -32,9 +32,7 @@
 
 static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
 static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
 static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
 
 static DEFINE_MUTEX(cpu_debug_lock);
 
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
 	{ "value",	CPU_REG_ALL,	1	},
 };
 
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
-	{ 0x00000000, 0x00000001, CPU_MC,	CPU_INTEL_ALL		},
-	{ 0x00000006, 0x00000007, CPU_MONITOR,	CPU_CX_AT_XE		},
-	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_INTEL_ALL		},
-	{ 0x00000011, 0x00000013, CPU_PMC,	CPU_INTEL_PENTIUM	},
-	{ 0x00000017, 0x00000017, CPU_PLATFORM,	CPU_PX_CX_AT_XE		},
-	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_P6_CX_AT_XE		},
-
-	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_PX_CX_AT_XE		},
-	{ 0x0000002B, 0x0000002B, CPU_POWERON,	CPU_INTEL_XEON		},
-	{ 0x0000002C, 0x0000002C, CPU_FREQ,	CPU_INTEL_XEON		},
-	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	CPU_CX_AT_XE		},
-
-	{ 0x00000040, 0x00000043, CPU_LBRANCH,	CPU_PM_CX_AT_XE		},
-	{ 0x00000044, 0x00000047, CPU_LBRANCH,	CPU_PM_CO_AT		},
-	{ 0x00000060, 0x00000063, CPU_LBRANCH,	CPU_C2_AT		},
-	{ 0x00000064, 0x00000067, CPU_LBRANCH,	CPU_INTEL_ATOM		},
-
-	{ 0x00000079, 0x00000079, CPU_BIOS,	CPU_P6_CX_AT_XE		},
-	{ 0x00000088, 0x0000008A, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x0000008B, 0x0000008B, CPU_BIOS,	CPU_P6_CX_AT_XE		},
-	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	CPU_INTEL_XEON		},
-
-	{ 0x000000C1, 0x000000C2, CPU_PMC,	CPU_P6_CX_AT		},
-	{ 0x000000CD, 0x000000CD, CPU_FREQ,	CPU_CX_AT		},
-	{ 0x000000E7, 0x000000E8, CPU_PERF,	CPU_CX_AT		},
-	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_P6_CX_XE		},
-
-	{ 0x00000116, 0x00000116, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x00000118, 0x00000118, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x00000119, 0x00000119, CPU_CACHE,	CPU_INTEL_PX		},
-	{ 0x0000011A, 0x0000011B, CPU_CACHE,	CPU_INTEL_P6		},
-	{ 0x0000011E, 0x0000011E, CPU_CACHE,	CPU_PX_CX_AT		},
-
-	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_P6_CX_AT_XE		},
-	{ 0x00000179, 0x0000017A, CPU_MC,	CPU_PX_CX_AT_XE		},
-	{ 0x0000017B, 0x0000017B, CPU_MC,	CPU_P6_XE		},
-	{ 0x00000186, 0x00000187, CPU_PMC,	CPU_P6_CX_AT		},
-	{ 0x00000198, 0x00000199, CPU_PERF,	CPU_PM_CX_AT_XE		},
-	{ 0x0000019A, 0x0000019A, CPU_TIME,	CPU_PM_CX_AT_XE		},
-	{ 0x0000019B, 0x0000019D, CPU_THERM,	CPU_PM_CX_AT_XE		},
-	{ 0x000001A0, 0x000001A0, CPU_MISC,	CPU_PM_CX_AT_XE		},
-
-	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	CPU_PM_CX_AT		},
-	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	CPU_INTEL_XEON		},
-	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_CX_AT_XE		},
-	{ 0x000001DA, 0x000001DA, CPU_LBRANCH,	CPU_INTEL_XEON		},
-	{ 0x000001DB, 0x000001DB, CPU_LBRANCH,	CPU_P6_XE		},
-	{ 0x000001DC, 0x000001DC, CPU_LBRANCH,	CPU_INTEL_P6		},
-	{ 0x000001DD, 0x000001DE, CPU_LBRANCH,	CPU_PX_CX_AT_XE		},
-	{ 0x000001E0, 0x000001E0, CPU_LBRANCH,	CPU_INTEL_P6		},
-
-	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_P6_CX_XE		},
-	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_C2_AT_XE		},
-	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_P6_CX_XE		},
-
-	{ 0x00000300, 0x00000308, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x00000309, 0x0000030B, CPU_PMC,	CPU_C2_AT_XE		},
-	{ 0x0000030C, 0x00000311, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x00000345, 0x00000345, CPU_PMC,	CPU_C2_AT		},
-	{ 0x00000360, 0x00000371, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x0000038D, 0x00000390, CPU_PMC,	CPU_C2_AT		},
-	{ 0x000003A0, 0x000003BE, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003C0, 0x000003CD, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003E0, 0x000003E1, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003F0, 0x000003F0, CPU_PMC,	CPU_INTEL_XEON		},
-	{ 0x000003F1, 0x000003F1, CPU_PMC,	CPU_C2_AT_XE		},
-	{ 0x000003F2, 0x000003F2, CPU_PMC,	CPU_INTEL_XEON		},
-
-	{ 0x00000400, 0x00000402, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x00000403, 0x00000403, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x00000404, 0x00000406, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x00000407, 0x00000407, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x00000408, 0x0000040A, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x0000040B, 0x0000040B, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x0000040C, 0x0000040E, CPU_MC,	CPU_PM_CX_XE		},
-	{ 0x0000040F, 0x0000040F, CPU_MC,	CPU_INTEL_XEON		},
-	{ 0x00000410, 0x00000412, CPU_MC,	CPU_PM_CX_AT_XE		},
-	{ 0x00000413, 0x00000417, CPU_MC,	CPU_CX_AT_XE		},
-	{ 0x00000480, 0x0000048B, CPU_VMX,	CPU_CX_AT_XE		},
-
-	{ 0x00000600, 0x00000600, CPU_DEBUG,	CPU_PM_CX_AT_XE		},
-	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	CPU_INTEL_XEON		},
-	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	CPU_INTEL_XEON		},
-
-	{ 0x000107CC, 0x000107D3, CPU_PMC,	CPU_INTEL_XEON_MP	},
-
-	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_INTEL_XEON		},
-	{ 0xC0000081, 0xC0000082, CPU_CALL,	CPU_INTEL_XEON		},
-	{ 0xC0000084, 0xC0000084, CPU_CALL,	CPU_INTEL_XEON		},
-	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_INTEL_XEON		},
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+	{ 0x00000000, 0x00000001, CPU_MC,	},
+	{ 0x00000006, 0x00000007, CPU_MONITOR,	},
+	{ 0x00000010, 0x00000010, CPU_TIME,	},
+	{ 0x00000011, 0x00000013, CPU_PMC,	},
+	{ 0x00000017, 0x00000017, CPU_PLATFORM,	},
+	{ 0x0000001B, 0x0000001B, CPU_APIC,	},
+	{ 0x0000002A, 0x0000002B, CPU_POWERON,	},
+	{ 0x0000002C, 0x0000002C, CPU_FREQ,	},
+	{ 0x0000003A, 0x0000003A, CPU_CONTROL,	},
+	{ 0x00000040, 0x00000047, CPU_LBRANCH,	},
+	{ 0x00000060, 0x00000067, CPU_LBRANCH,	},
+	{ 0x00000079, 0x00000079, CPU_BIOS,	},
+	{ 0x00000088, 0x0000008A, CPU_CACHE,	},
+	{ 0x0000008B, 0x0000008B, CPU_BIOS,	},
+	{ 0x0000009B, 0x0000009B, CPU_MONITOR,	},
+	{ 0x000000C1, 0x000000C4, CPU_PMC,	},
+	{ 0x000000CD, 0x000000CD, CPU_FREQ,	},
+	{ 0x000000E7, 0x000000E8, CPU_PERF,	},
+	{ 0x000000FE, 0x000000FE, CPU_MTRR,	},
+
+	{ 0x00000116, 0x0000011E, CPU_CACHE,	},
+	{ 0x00000174, 0x00000176, CPU_SYSENTER,	},
+	{ 0x00000179, 0x0000017B, CPU_MC,	},
+	{ 0x00000186, 0x00000189, CPU_PMC,	},
+	{ 0x00000198, 0x00000199, CPU_PERF,	},
+	{ 0x0000019A, 0x0000019A, CPU_TIME,	},
+	{ 0x0000019B, 0x0000019D, CPU_THERM,	},
+	{ 0x000001A0, 0x000001A0, CPU_MISC,	},
+	{ 0x000001C9, 0x000001C9, CPU_LBRANCH,	},
+	{ 0x000001D7, 0x000001D8, CPU_LBRANCH,	},
+	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	},
+	{ 0x000001DA, 0x000001E0, CPU_LBRANCH,	},
+
+	{ 0x00000200, 0x0000020F, CPU_MTRR,	},
+	{ 0x00000250, 0x00000250, CPU_MTRR,	},
+	{ 0x00000258, 0x00000259, CPU_MTRR,	},
+	{ 0x00000268, 0x0000026F, CPU_MTRR,	},
+	{ 0x00000277, 0x00000277, CPU_PAT,	},
+	{ 0x000002FF, 0x000002FF, CPU_MTRR,	},
+
+	{ 0x00000300, 0x00000311, CPU_PMC,	},
+	{ 0x00000345, 0x00000345, CPU_PMC,	},
+	{ 0x00000360, 0x00000371, CPU_PMC,	},
+	{ 0x0000038D, 0x00000390, CPU_PMC,	},
+	{ 0x000003A0, 0x000003BE, CPU_PMC,	},
+	{ 0x000003C0, 0x000003CD, CPU_PMC,	},
+	{ 0x000003E0, 0x000003E1, CPU_PMC,	},
+	{ 0x000003F0, 0x000003F2, CPU_PMC,	},
+
+	{ 0x00000400, 0x00000417, CPU_MC,	},
+	{ 0x00000480, 0x0000048B, CPU_VMX,	},
+
+	{ 0x00000600, 0x00000600, CPU_DEBUG,	},
+	{ 0x00000680, 0x0000068F, CPU_LBRANCH,	},
+	{ 0x000006C0, 0x000006CF, CPU_LBRANCH,	},
+
+	{ 0x000107CC, 0x000107D3, CPU_PMC,	},
+
+	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	},
+	{ 0xC0000081, 0xC0000084, CPU_CALL,	},
+	{ 0xC0000100, 0xC0000102, CPU_BASE,	},
+	{ 0xC0000103, 0xC0000103, CPU_TIME,	},
+
+	{ 0xC0010000, 0xC0010007, CPU_PMC,	},
+	{ 0xC0010010, 0xC0010010, CPU_CONF,	},
+	{ 0xC0010015, 0xC0010015, CPU_CONF,	},
+	{ 0xC0010016, 0xC001001A, CPU_MTRR,	},
+	{ 0xC001001D, 0xC001001D, CPU_MTRR,	},
+	{ 0xC001001F, 0xC001001F, CPU_CONF,	},
+	{ 0xC0010030, 0xC0010035, CPU_BIOS,	},
+	{ 0xC0010044, 0xC0010048, CPU_MC,	},
+	{ 0xC0010050, 0xC0010056, CPU_SMM,	},
+	{ 0xC0010058, 0xC0010058, CPU_CONF,	},
+	{ 0xC0010060, 0xC0010060, CPU_CACHE,	},
+	{ 0xC0010061, 0xC0010068, CPU_SMM,	},
+	{ 0xC0010069, 0xC001006B, CPU_SMM,	},
+	{ 0xC0010070, 0xC0010071, CPU_SMM,	},
+	{ 0xC0010111, 0xC0010113, CPU_SMM,	},
+	{ 0xC0010114, 0xC0010118, CPU_SVM,	},
+	{ 0xC0010140, 0xC0010141, CPU_OSVM,	},
+	{ 0xC0011022, 0xC0011023, CPU_CONF,	},
 };
 
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
-	{ 0x00000000, 0x00000001, CPU_MC,	CPU_K10_PLUS,		},
-	{ 0x00000010, 0x00000010, CPU_TIME,	CPU_K8_PLUS,		},
-	{ 0x0000001B, 0x0000001B, CPU_APIC,	CPU_K8_PLUS,		},
-	{ 0x0000002A, 0x0000002A, CPU_POWERON,	CPU_K7_PLUS		},
-	{ 0x0000008B, 0x0000008B, CPU_VER,	CPU_K8_PLUS		},
-	{ 0x000000FE, 0x000000FE, CPU_MTRR,	CPU_K8_PLUS,		},
-
-	{ 0x00000174, 0x00000176, CPU_SYSENTER,	CPU_K8_PLUS,		},
-	{ 0x00000179, 0x0000017B, CPU_MC,	CPU_K8_PLUS,		},
-	{ 0x000001D9, 0x000001D9, CPU_DEBUG,	CPU_K8_PLUS,		},
-	{ 0x000001DB, 0x000001DE, CPU_LBRANCH,	CPU_K8_PLUS,		},
-
-	{ 0x00000200, 0x0000020F, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000250, 0x00000250, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000258, 0x00000259, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000268, 0x0000026F, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0x00000277, 0x00000277, CPU_PAT,	CPU_K8_PLUS,		},
-	{ 0x000002FF, 0x000002FF, CPU_MTRR,	CPU_K8_PLUS,		},
-
-	{ 0x00000400, 0x00000413, CPU_MC,	CPU_K8_PLUS,		},
-
-	{ 0xC0000080, 0xC0000080, CPU_FEATURES,	CPU_AMD_ALL,		},
-	{ 0xC0000081, 0xC0000084, CPU_CALL,	CPU_K8_PLUS,		},
-	{ 0xC0000100, 0xC0000102, CPU_BASE,	CPU_K8_PLUS,		},
-	{ 0xC0000103, 0xC0000103, CPU_TIME,	CPU_K10_PLUS,		},
-
-	{ 0xC0010000, 0xC0010007, CPU_PMC,	CPU_K8_PLUS,		},
-	{ 0xC0010010, 0xC0010010, CPU_CONF,	CPU_K7_PLUS,		},
-	{ 0xC0010015, 0xC0010015, CPU_CONF,	CPU_K7_PLUS,		},
-	{ 0xC0010016, 0xC001001A, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0xC001001D, 0xC001001D, CPU_MTRR,	CPU_K8_PLUS,		},
-	{ 0xC001001F, 0xC001001F, CPU_CONF,	CPU_K8_PLUS,		},
-	{ 0xC0010030, 0xC0010035, CPU_BIOS,	CPU_K8_PLUS,		},
-	{ 0xC0010044, 0xC0010048, CPU_MC,	CPU_K8_PLUS,		},
-	{ 0xC0010050, 0xC0010056, CPU_SMM,	CPU_K0F_PLUS,		},
-	{ 0xC0010058, 0xC0010058, CPU_CONF,	CPU_K10_PLUS,		},
-	{ 0xC0010060, 0xC0010060, CPU_CACHE,	CPU_AMD_11,		},
-	{ 0xC0010061, 0xC0010068, CPU_SMM,	CPU_K10_PLUS,		},
-	{ 0xC0010069, 0xC001006B, CPU_SMM,	CPU_AMD_11,		},
-	{ 0xC0010070, 0xC0010071, CPU_SMM,	CPU_K10_PLUS,		},
-	{ 0xC0010111, 0xC0010113, CPU_SMM,	CPU_K8_PLUS,		},
-	{ 0xC0010114, 0xC0010118, CPU_SVM,	CPU_K10_PLUS,		},
-	{ 0xC0010140, 0xC0010141, CPU_OSVM,	CPU_K10_PLUS,		},
-	{ 0xC0011022, 0xC0011023, CPU_CONF,	CPU_K10_PLUS,		},
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
-	int flag;
-
-	switch (model) {
-	case 0x0501:
-	case 0x0502:
-	case 0x0504:
-		flag = CPU_INTEL_PENTIUM;
-		break;
-	case 0x0601:
-	case 0x0603:
-	case 0x0605:
-	case 0x0607:
-	case 0x0608:
-	case 0x060A:
-	case 0x060B:
-		flag = CPU_INTEL_P6;
-		break;
-	case 0x0609:
-	case 0x060D:
-		flag = CPU_INTEL_PENTIUM_M;
-		break;
-	case 0x060E:
-		flag = CPU_INTEL_CORE;
-		break;
-	case 0x060F:
-	case 0x0617:
-		flag = CPU_INTEL_CORE2;
-		break;
-	case 0x061C:
-		flag = CPU_INTEL_ATOM;
-		break;
-	case 0x0F00:
-	case 0x0F01:
-	case 0x0F02:
-	case 0x0F03:
-	case 0x0F04:
-		flag = CPU_INTEL_XEON_P4;
-		break;
-	case 0x0F06:
-		flag = CPU_INTEL_XEON_MP;
-		break;
-	default:
-		flag = CPU_NONE;
-		break;
-	}
-
-	return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
-	int flag;
-
-	switch (model >> 8) {
-	case 0x6:
-		flag = CPU_AMD_K6;
-		break;
-	case 0x7:
-		flag = CPU_AMD_K7;
-		break;
-	case 0x8:
-		flag = CPU_AMD_K8;
-		break;
-	case 0xf:
-		flag = CPU_AMD_0F;
-		break;
-	case 0x10:
-		flag = CPU_AMD_10;
-		break;
-	case 0x11:
-		flag = CPU_AMD_11;
-		break;
-	default:
-		flag = CPU_NONE;
-		break;
-	}
-
-	return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
-	int flag;
-
-	flag = per_cpu(cpu_model, cpu);
-
-	switch (flag >> 16) {
-	case X86_VENDOR_INTEL:
-		flag = get_intel_modelflag(flag);
-		break;
-	case X86_VENDOR_AMD:
-		flag = get_amd_modelflag(flag & 0xffff);
-		break;
-	default:
-		flag = CPU_NONE;
-		break;
-	}
-
-	return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
-	int index;
-
-	switch (per_cpu(cpu_model, cpu) >> 16) {
-	case X86_VENDOR_INTEL:
-		index = ARRAY_SIZE(cpu_intel_range);
-		break;
-	case X86_VENDOR_AMD:
-		index = ARRAY_SIZE(cpu_amd_range);
-		break;
-	default:
-		index = 0;
-		break;
-	}
-
-	return index;
-}
-
 static int is_typeflag_valid(unsigned cpu, unsigned flag)
 {
-	unsigned vendor, modelflag;
-	int i, index;
+	int i;
 
 	/* Standard Registers should be always valid */
 	if (flag >= CPU_TSS)
 		return 1;
 
-	modelflag = per_cpu(cpu_modelflag, cpu);
-	vendor = per_cpu(cpu_model, cpu) >> 16;
-	index = get_cpu_range_count(cpu);
-
-	for (i = 0; i < index; i++) {
-		switch (vendor) {
-		case X86_VENDOR_INTEL:
-			if ((cpu_intel_range[i].model & modelflag) &&
-			    (cpu_intel_range[i].flag & flag))
-				return 1;
-			break;
-		case X86_VENDOR_AMD:
-			if ((cpu_amd_range[i].model & modelflag) &&
-			    (cpu_amd_range[i].flag & flag))
-				return 1;
-			break;
-		}
+	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+		if (cpu_reg_range[i].flag == flag)
+			return 1;
 	}
 
 	/* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
 static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
 			      int index, unsigned flag)
 {
-	unsigned modelflag;
-
-	modelflag = per_cpu(cpu_modelflag, cpu);
-	*max = 0;
-	switch (per_cpu(cpu_model, cpu) >> 16) {
-	case X86_VENDOR_INTEL:
-		if ((cpu_intel_range[index].model & modelflag) &&
-		    (cpu_intel_range[index].flag & flag)) {
-			*min = cpu_intel_range[index].min;
-			*max = cpu_intel_range[index].max;
-		}
-		break;
-	case X86_VENDOR_AMD:
-		if ((cpu_amd_range[index].model & modelflag) &&
-		    (cpu_amd_range[index].flag & flag)) {
-			*min = cpu_amd_range[index].min;
-			*max = cpu_amd_range[index].max;
-		}
-		break;
-	}
+	if (cpu_reg_range[index].flag == flag) {
+		*min = cpu_reg_range[index].min;
+		*max = cpu_reg_range[index].max;
+	} else
+		*max = 0;
 
 	return *max;
 }
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
 	unsigned msr, msr_min, msr_max;
 	struct cpu_private *priv;
 	u32 low, high;
-	int i, range;
+	int i;
 
 	if (seq) {
 		priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
 		}
 	}
 
-	range = get_cpu_range_count(cpu);
-
-	for (i = 0; i < range; i++) {
+	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
 		if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
 			continue;
 
@@ -788,13 +569,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
 {
 	struct dentry *cpu_dentry = NULL;
 	unsigned reg, reg_min, reg_max;
-	int i, range, err = 0;
+	int i, err = 0;
 	char reg_dir[12];
 	u32 low, high;
 
-	range = get_cpu_range_count(cpu);
-
-	for (i = 0; i < range; i++) {
+	for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
 		if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
 				   cpu_base[type].flag))
 			continue;
@@ -850,10 +629,6 @@ static int cpu_init_cpu(void)
 		cpui = &cpu_data(cpu);
 		if (!cpu_has(cpui, X86_FEATURE_MSR))
 			continue;
-		per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
-					   (cpui->x86 << 8) |
-					   (cpui->x86_model));
-		per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
 
 		sprintf(cpu_dir, "cpu%d", cpu);
 		cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
-- 
GitLab


From 4a4aca641bc4598e77b866804f47c651ec4a764d Mon Sep 17 00:00:00 2001
From: Jean Delvare <jdelvare@suse.de>
Date: Fri, 5 Jun 2009 12:02:38 +0200
Subject: [PATCH 1648/2198] x86: Add quirk for reboot stalls on a Dell Optiplex
 360

The Dell Optiplex 360 hangs on reboot, just like the Optiplex 330, so
the same quirk is needed.

Signed-off-by: Jean Delvare <jdelvare@suse.de>
Cc: Steve Conklin <steve.conklin@canonical.com>
Cc: Leann Ogasawara <leann.ogasawara@canonical.com>
Cc: <stable@kernel.org>
LKML-Reference: <200906051202.38311.jdelvare@suse.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/reboot.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 667188e0b5a0b..d2d1ce8170f06 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
 		},
 	},
+	{   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+		.callback = set_bios_reboot,
+		.ident = "Dell OptiPlex 360",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+			DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+		},
+	},
 	{	/* Handle problems with rebooting on Dell 2400's */
 		.callback = set_bios_reboot,
 		.ident = "Dell PowerEdge 2400",
-- 
GitLab


From 103428e57be323c3c5545db8ad12667099bc6005 Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 7 Jun 2009 16:48:40 +0400
Subject: [PATCH 1649/2198] x86, apic: Fix dummy apic read operation together
 with broken MP handling

Ingo Molnar reported that read_apic is buggy novadays:

[    0.000000] Using APIC driver default
[    0.000000] SMP: Allowing 1 CPUs, 0 hotplug CPUs
[    0.000000] Local APIC disabled by BIOS -- you can enable it with "lapic"
[    0.000000] APIC: disable apic facility
[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at arch/x86/kernel/apic/apic.c:254 native_apic_read_dummy+0x2d/0x3b()
[    0.000000] Hardware name: HP OmniBook PC

Indeed we still rely on apic->read operation for SMP compiled
kernel. And instead of disfigure the SMP code with #ifdef we
allow to call apic->read. To capture any unexpected results
we check for apic->read being called for sane reason via
WARN_ON_ONCE but(!) instead of OR we should use AND logical
operation (thanks Yinghai for spotting the root of the problem).

Along with that we could be have bad MP table and we are
to fix it that way no SMP started and no complains about
BIOS bug if apic was just disabled via command line.

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Yinghai Lu <yinghai@kernel.org>
LKML-Reference: <20090607124840.GD4547@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apic/apic.c | 9 ++++++++-
 arch/x86/kernel/smpboot.c   | 8 +++++---
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e82488d3f0bad..a4c9cf0bf70bf 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -249,7 +249,7 @@ static void native_apic_write_dummy(u32 reg, u32 v)
 
 static u32 native_apic_read_dummy(u32 reg)
 {
-	WARN_ON_ONCE((cpu_has_apic || !disable_apic));
+	WARN_ON_ONCE((cpu_has_apic && !disable_apic));
 	return 0;
 }
 
@@ -1609,6 +1609,13 @@ void __init init_apic_mappings(void)
 	new_apicid = read_apic_id();
 	if (boot_cpu_physical_apicid != new_apicid) {
 		boot_cpu_physical_apicid = new_apicid;
+		/*
+		 * yeah -- we lie about apic_version
+		 * in case if apic was disabled via boot option
+		 * but it's not a problem for SMP compiled kernel
+		 * since smp_sanity_check is prepared for such a case
+		 * and disable smp mode
+		 */
 		apic_version[new_apicid] =
 			 GET_APIC_VERSION(apic_read(APIC_LVR));
 	}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index d2e8de9581562..7c80007ea5f7f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -992,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
 	 */
 	if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
 	    !cpu_has_apic) {
-		printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-			boot_cpu_physical_apicid);
-		printk(KERN_ERR "... forcing use of dummy APIC emulation."
+		if (!disable_apic) {
+			pr_err("BIOS bug, local APIC #%d not detected!...\n",
+				boot_cpu_physical_apicid);
+			pr_err("... forcing use of dummy APIC emulation."
 				"(tell your hw vendor)\n");
+		}
 		smpboot_clear_io_apic();
 		arch_disable_smp_support();
 		return -1;
-- 
GitLab


From a4046f8d299e00e9855ae292527c2d66a42670eb Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Sun, 7 Jun 2009 12:19:37 +0400
Subject: [PATCH 1650/2198] x86, nmi: Use predefined numbers instead of
 hardcoded one

[ Impact: cleanup ]

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
LKML-Reference: <20090607081937.GC4547@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/nmi.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index c45a0a568dff0..c97264409934b 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
 	 * but since they are power of two we could use a
 	 * cheaper way --cvg
 	 */
-	return nmi_watchdog & 0x3;
+	return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
 }
 #endif
 
-- 
GitLab


From 3aa6b186f86c5d06d6d92d14311ffed51f091f40 Mon Sep 17 00:00:00 2001
From: Lubomir Rintel <lkundrak@v3.sk>
Date: Sun, 7 Jun 2009 16:23:48 +0200
Subject: [PATCH 1651/2198] x86: Fix non-lazy GS handling in sys_vm86()

This fixes a stack corruption panic or null dereference oops
due to a bad GS in resume_userspace() when returning from
sys_vm86() and calling lockdep_sys_exit().

Only a problem when CONFIG_LOCKDEP and CONFIG_CC_STACKPROTECTOR
enabled.

Signed-off-by: Lubomir Rintel <lkundrak@v3.sk>
Cc: H. Peter Anvin <hpa@zytor.com>
LKML-Reference: <1244384628.2323.4.camel@bimbo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/vm86_32.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index d7ac84e7fc1c7..6a177694058fe 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	info->regs.pt.ds = 0;
 	info->regs.pt.es = 0;
 	info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+	info->regs.pt.gs = 0;
+#endif
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	__asm__ __volatile__(
 		"movl %0,%%esp\n\t"
 		"movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
 		"mov  %2, %%gs\n\t"
+#endif
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-- 
GitLab


From aeef50bc0483fa70ce0bddb686ec84a274b7f3d4 Mon Sep 17 00:00:00 2001
From: "Figo.zhang" <figo1802@gmail.com>
Date: Sun, 7 Jun 2009 22:30:36 +0800
Subject: [PATCH 1652/2198] x86, microcode: Simplify vfree() use

vfree() does its own 'NULL' check, so no need for check before
calling it.

In v2, remove the stray newline.

[ Impact: cleanup ]

Signed-off-by: Figo.zhang <figo1802@gmail.com>
Cc: Dmitry Adamushko <dmitry.adamushko@gmail.com>
LKML-Reference: <1244385036.3402.11.camel@myhost>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/microcode_amd.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index c8be20f16447d..366baa179913d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -241,10 +241,8 @@ static int install_equiv_cpu_table(const u8 *buf)
 
 static void free_equiv_cpu_table(void)
 {
-	if (equiv_cpu_table) {
-		vfree(equiv_cpu_table);
-		equiv_cpu_table = NULL;
-	}
+	vfree(equiv_cpu_table);
+	equiv_cpu_table = NULL;
 }
 
 static enum ucode_state
@@ -279,8 +277,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 		mc_header = (struct microcode_header_amd *)mc;
 		if (get_matching_microcode(cpu, mc, new_rev)) {
-			if (new_mc)
-				vfree(new_mc);
+			vfree(new_mc);
 			new_rev = mc_header->patch_id;
 			new_mc  = mc;
 		} else
@@ -292,8 +289,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
 
 	if (new_mc) {
 		if (!leftover) {
-			if (uci->mc)
-				vfree(uci->mc);
+			vfree(uci->mc);
 			uci->mc = new_mc;
 			pr_debug("microcode: CPU%d found a matching microcode "
 				 "update with version 0x%x (current=0x%x)\n",
-- 
GitLab


From 743ee1f80434138495bbb95ffb897acf46b51d54 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:06:46 +0200
Subject: [PATCH 1653/2198] perf stat: Continue even on counter creation error

Before:

 $ perf stat ~/hackbench 5

 error: syscall returned with -1 (No such device)

After:

 $ perf stat ~/hackbench 5
 Time: 1.640

 Performance counter stats for '/home/mingo/hackbench 5':

    6524.570382  task-clock-ticks     #       3.838 CPU utilization factor
          35704  context-switches     #       0.005 M/sec
            191  CPU-migrations       #       0.000 M/sec
           8958  page-faults          #       0.001 M/sec
  <not counted>  cycles
  <not counted>  instructions
  <not counted>  cache-references
  <not counted>  cache-misses

 Wall-clock time elapsed:  1699.999995 msecs

Also add -v (--verbose) option to allow the printing of failed
counter opens.

Plus dont print 'inf' if wall-time is zero (due to jiffies granularity),
instead skip the printing of the CPU utilization factor.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-stat.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 2cbf5a189589d..184ff95ef4f51 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -59,6 +59,7 @@ static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
 static int			system_wide			=  0;
 static int			inherit				=  1;
+static int			verbose				=  0;
 
 static int			fd[MAX_NR_CPUS][MAX_COUNTERS];
 
@@ -83,7 +84,7 @@ static __u64			event_scaled[MAX_COUNTERS];
 static __u64			runtime_nsecs;
 static __u64			walltime_nsecs;
 
-static void create_perfstat_counter(int counter)
+static void create_perf_stat_counter(int counter)
 {
 	struct perf_counter_attr *attr = attrs + counter;
 
@@ -95,10 +96,8 @@ static void create_perfstat_counter(int counter)
 		int cpu;
 		for (cpu = 0; cpu < nr_cpus; cpu ++) {
 			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
-			if (fd[cpu][counter] < 0) {
-				printf("perfstat error: syscall returned with %d (%s)\n",
-						fd[cpu][counter], strerror(errno));
-				exit(-1);
+			if (fd[cpu][counter] < 0 && verbose) {
+				printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno));
 			}
 		}
 	} else {
@@ -106,10 +105,8 @@ static void create_perfstat_counter(int counter)
 		attr->disabled	= 1;
 
 		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0);
-		if (fd[0][counter] < 0) {
-			printf("perfstat error: syscall returned with %d (%s)\n",
-					fd[0][counter], strerror(errno));
-			exit(-1);
+		if (fd[0][counter] < 0 && verbose) {
+			printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[0][counter], strerror(errno));
 		}
 	}
 }
@@ -147,6 +144,9 @@ static void read_counter(int counter)
 
 	nv = scale ? 3 : 1;
 	for (cpu = 0; cpu < nr_cpus; cpu ++) {
+		if (fd[cpu][counter] < 0)
+			continue;
+
 		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64));
 		assert(res == nv * sizeof(__u64));
 
@@ -204,8 +204,9 @@ static void print_counter(int counter)
 		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
 			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
 
-			fprintf(stderr, " # %11.3f CPU utilization factor",
-				(double)count[0] / (double)walltime_nsecs);
+			if (walltime_nsecs)
+				fprintf(stderr, " # %11.3f CPU utilization factor",
+					(double)count[0] / (double)walltime_nsecs);
 		}
 	} else {
 		fprintf(stderr, " %14Ld  %-20s",
@@ -220,7 +221,7 @@ static void print_counter(int counter)
 	fprintf(stderr, "\n");
 }
 
-static int do_perfstat(int argc, const char **argv)
+static int do_perf_stat(int argc, const char **argv)
 {
 	unsigned long long t0, t1;
 	int counter;
@@ -232,7 +233,7 @@ static int do_perfstat(int argc, const char **argv)
 		nr_cpus = 1;
 
 	for (counter = 0; counter < nr_counters; counter++)
-		create_perfstat_counter(counter);
+		create_perf_stat_counter(counter);
 
 	/*
 	 * Enable counters and exec the command:
@@ -305,6 +306,8 @@ static const struct option options[] = {
 			    "system-wide collection from all CPUs"),
 	OPT_BOOLEAN('S', "scale", &scale,
 			    "scale/normalize counters"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
 
@@ -335,5 +338,5 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	signal(SIGALRM, skip_signal);
 	signal(SIGABRT, skip_signal);
 
-	return do_perfstat(argc, argv);
+	return do_perf_stat(argc, argv);
 }
-- 
GitLab


From 716c69fecacd42f2a304a97158e04af2786a3f65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:31:52 +0200
Subject: [PATCH 1654/2198] perf top: Fall back to cpu-clock-tick hrtimer
 sampling if no cycle counter available

On architectures/CPUs without PMU support but with perfcounters
enabled 'perf top' currently fails because it cannot create a
cycle based hw-perfcounter.

Fall back to the cpu-clock-tick sw-perfcounter in this case, which
is hrtimer based and will always work (as long as perfcounters
is enabled).

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-top.c | 113 +++++++++++++++++++++++----------------
 tools/perf/util/usage.c  |  10 ++--
 2 files changed, 73 insertions(+), 50 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index fdc1d5863b012..6da30a140e863 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -527,58 +527,81 @@ static void mmap_read(void)
 	}
 }
 
-static int __cmd_top(void)
+int nr_poll;
+int group_fd;
+
+static void start_counter(int i, int counter)
 {
 	struct perf_counter_attr *attr;
-	pthread_t thread;
-	int i, counter, group_fd, nr_poll = 0;
 	unsigned int cpu;
+
+	cpu = profile_cpu;
+	if (target_pid == -1 && profile_cpu == -1)
+		cpu = i;
+
+	attr = attrs + counter;
+
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+	attr->freq		= freq;
+
+try_again:
+	fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
+
+	if (fd[i][counter] < 0) {
+		int err = errno;
+
+		error("sys_perf_counter_open() syscall returned with %d (%s)\n",
+			fd[i][counter], strerror(err));
+
+		if (err == EPERM)
+			die(" No permission - are you root?\n");
+		/*
+		 * If it's cycles then fall back to hrtimer
+		 * based cpu-clock-tick sw counter, which
+		 * is always available even if no PMU support:
+		 */
+		if (attr->type == PERF_TYPE_HARDWARE
+			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+
+			warning(" ... trying to fall back to cpu-clock-ticks\n");
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_CPU_CLOCK;
+			goto try_again;
+		}
+		exit(-1);
+	}
+	assert(fd[i][counter] >= 0);
+	fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[i][counter];
+
+	event_array[nr_poll].fd = fd[i][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[i][counter].counter = counter;
+	mmap_array[i][counter].prev = 0;
+	mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[i][counter], 0);
+	if (mmap_array[i][counter].base == MAP_FAILED)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+}
+
+static int __cmd_top(void)
+{
+	pthread_t thread;
+	int i, counter;
 	int ret;
 
 	for (i = 0; i < nr_cpus; i++) {
 		group_fd = -1;
-		for (counter = 0; counter < nr_counters; counter++) {
-
-			cpu	= profile_cpu;
-			if (target_pid == -1 && profile_cpu == -1)
-				cpu = i;
-
-			attr = attrs + counter;
-
-			attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-			attr->freq		= freq;
-
-			fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
-			if (fd[i][counter] < 0) {
-				int err = errno;
-
-				error("syscall returned with %d (%s)\n",
-					fd[i][counter], strerror(err));
-				if (err == EPERM)
-					printf("Are you root?\n");
-				exit(-1);
-			}
-			assert(fd[i][counter] >= 0);
-			fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
-
-			/*
-			 * First counter acts as the group leader:
-			 */
-			if (group && group_fd == -1)
-				group_fd = fd[i][counter];
-
-			event_array[nr_poll].fd = fd[i][counter];
-			event_array[nr_poll].events = POLLIN;
-			nr_poll++;
-
-			mmap_array[i][counter].counter = counter;
-			mmap_array[i][counter].prev = 0;
-			mmap_array[i][counter].mask = mmap_pages*page_size - 1;
-			mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-					PROT_READ, MAP_SHARED, fd[i][counter], 0);
-			if (mmap_array[i][counter].base == MAP_FAILED)
-				die("failed to mmap with %d (%s)\n", errno, strerror(errno));
-		}
+		for (counter = 0; counter < nr_counters; counter++)
+			start_counter(i, counter);
 	}
 
 	/* Wait for a minimal set of events before starting the snapshot */
diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c
index 2cad286e43716..e16bf9a707e89 100644
--- a/tools/perf/util/usage.c
+++ b/tools/perf/util/usage.c
@@ -9,29 +9,29 @@ static void report(const char *prefix, const char *err, va_list params)
 {
 	char msg[1024];
 	vsnprintf(msg, sizeof(msg), err, params);
-	fprintf(stderr, "%s%s\n", prefix, msg);
+	fprintf(stderr, " %s%s\n", prefix, msg);
 }
 
 static NORETURN void usage_builtin(const char *err)
 {
-	fprintf(stderr, "\n usage: %s\n", err);
+	fprintf(stderr, "\n Usage: %s\n", err);
 	exit(129);
 }
 
 static NORETURN void die_builtin(const char *err, va_list params)
 {
-	report("fatal: ", err, params);
+	report(" Fatal: ", err, params);
 	exit(128);
 }
 
 static void error_builtin(const char *err, va_list params)
 {
-	report("error: ", err, params);
+	report(" Error: ", err, params);
 }
 
 static void warn_builtin(const char *warn, va_list params)
 {
-	report("warning: ", warn, params);
+	report(" Warning: ", warn, params);
 }
 
 /* If we are in a dlopen()ed .so write to a global variable would segfault
-- 
GitLab


From 3da297a60f7e8840f79f7d0b343af078890939ea Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:39:02 +0200
Subject: [PATCH 1655/2198] perf record: Fall back to cpu-clock-ticks if no PMU

On architectures/CPUs without PMU support but with perfcounters
enabled 'perf record' currently fails because it cannot create a
cycle based hw-perfcounter.

Fall back to the cpu-clock-tick sw-perfcounter in this case, which
is hrtimer based and will always work (as long as perfcounters
are enabled).

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 25 +++++++++++++++++++++++--
 tools/perf/builtin-top.c    | 14 ++++++++++----
 2 files changed, 33 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index aeab9c4b15e48..87866294a0e68 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -37,6 +37,7 @@ static pid_t			target_pid			= -1;
 static int			inherit				= 1;
 static int			force				= 0;
 static int			append_file			= 0;
+static int			verbose				= 0;
 
 static long			samples;
 static struct timeval		last_read;
@@ -349,17 +350,35 @@ static void create_counter(int counter, int cpu, pid_t pid)
 
 	track = 0; /* only the first counter needs these */
 
+try_again:
 	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0);
 
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
 
-		error("syscall returned with %d (%s)\n",
+		if (verbose)
+			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
 				fd[nr_cpu][counter], strerror(err));
 		if (err == EPERM)
-			printf("Are you root?\n");
+			die("Permission error - are you root?\n");
+
+		/*
+		 * If it's cycles then fall back to hrtimer
+		 * based cpu-clock-tick sw counter, which
+		 * is always available even if no PMU support:
+		 */
+		if (attr->type == PERF_TYPE_HARDWARE
+			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+
+			if (verbose)
+				warning(" ... trying to fall back to cpu-clock-ticks\n");
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_CPU_CLOCK;
+			goto try_again;
+		}
 		exit(-1);
 	}
+
 	assert(fd[nr_cpu][counter] >= 0);
 	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
 
@@ -519,6 +538,8 @@ static const struct option options[] = {
 		    "profile at this frequency"),
 	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
 		    "number of mmap data pages"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 6da30a140e863..1f8c97d5c32e4 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -65,6 +65,7 @@ static int			group				=  0;
 static unsigned int		page_size;
 static unsigned int		mmap_pages			= 16;
 static int			freq				=  0;
+static int			verbose				=  0;
 
 static char			*sym_filter;
 static unsigned long		filter_start;
@@ -550,11 +551,12 @@ static void start_counter(int i, int counter)
 	if (fd[i][counter] < 0) {
 		int err = errno;
 
-		error("sys_perf_counter_open() syscall returned with %d (%s)\n",
-			fd[i][counter], strerror(err));
+		if (verbose)
+			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
+				fd[i][counter], strerror(err));
 
 		if (err == EPERM)
-			die(" No permission - are you root?\n");
+			die("No permission - are you root?\n");
 		/*
 		 * If it's cycles then fall back to hrtimer
 		 * based cpu-clock-tick sw counter, which
@@ -563,7 +565,9 @@ static void start_counter(int i, int counter)
 		if (attr->type == PERF_TYPE_HARDWARE
 			&& attr->config == PERF_COUNT_CPU_CYCLES) {
 
-			warning(" ... trying to fall back to cpu-clock-ticks\n");
+			if (verbose)
+				warning(" ... trying to fall back to cpu-clock-ticks\n");
+
 			attr->type = PERF_TYPE_SOFTWARE;
 			attr->config = PERF_COUNT_CPU_CLOCK;
 			goto try_again;
@@ -673,6 +677,8 @@ static const struct option options[] = {
 		    "profile at this frequency"),
 	OPT_INTEGER('E', "entries", &print_entries,
 		    "display this many functions"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
 	OPT_END()
 };
 
-- 
GitLab


From 30c806a094493beb7691bc7957dfa02dee96230a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:46:24 +0200
Subject: [PATCH 1656/2198] perf_counter tools: Handle kernels with
 !CONFIG_PERF_COUNTER

If perf is run on a !CONFIG_PERF_COUNTER kernel right now it
bails out with no messages or with confusing messages.

Standardize this case some more and explain the situation.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 7 ++++---
 tools/perf/builtin-top.c    | 8 ++++----
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 87866294a0e68..deaee42d5eb04 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -356,9 +356,6 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	if (fd[nr_cpu][counter] < 0) {
 		int err = errno;
 
-		if (verbose)
-			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
-				fd[nr_cpu][counter], strerror(err));
 		if (err == EPERM)
 			die("Permission error - are you root?\n");
 
@@ -376,6 +373,10 @@ static void create_counter(int counter, int cpu, pid_t pid)
 			attr->config = PERF_COUNT_CPU_CLOCK;
 			goto try_again;
 		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[nr_cpu][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
 		exit(-1);
 	}
 
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1f8c97d5c32e4..be1698f1189b7 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -551,10 +551,6 @@ static void start_counter(int i, int counter)
 	if (fd[i][counter] < 0) {
 		int err = errno;
 
-		if (verbose)
-			error("sys_perf_counter_open() syscall returned with %d (%s)\n",
-				fd[i][counter], strerror(err));
-
 		if (err == EPERM)
 			die("No permission - are you root?\n");
 		/*
@@ -572,6 +568,10 @@ static void start_counter(int i, int counter)
 			attr->config = PERF_COUNT_CPU_CLOCK;
 			goto try_again;
 		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[i][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
 		exit(-1);
 	}
 	assert(fd[i][counter] >= 0);
-- 
GitLab


From a14832ff977e78d1982cdf78cdabb1f2320d9ac8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 17:58:23 +0200
Subject: [PATCH 1657/2198] perf report: Print more expressive message in case
 of file open error

Before:

 $ perf report
 failed to open file: No such file or directory

After:

 $ perf report
  failed to open file: perf.data  (try 'perf record' first)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 242e09ff36589..f053a7463dcf0 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1120,7 +1120,10 @@ static int __cmd_report(void)
 
 	input = open(input_name, O_RDONLY);
 	if (input < 0) {
-		perror("failed to open file");
+		fprintf(stderr, " failed to open file: %s", input_name);
+		if (!strcmp(input_name, "perf.data"))
+			fprintf(stderr, "  (try 'perf record' first)");
+		fprintf(stderr, "\n");
 		exit(-1);
 	}
 
-- 
GitLab


From e779898aa74cd2e97216368b3f3689ceffe8aeed Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 7 Jun 2009 18:14:46 +0200
Subject: [PATCH 1658/2198] perf stat: Print out instructins/cycle metric

Before:

     7549326754  cycles               #    3201.811 M/sec
    10007594937  instructions         #    4244.408 M/sec

After:

     7542051194  cycles               #    3201.996 M/sec
    10007743852  instructions         #    4248.811 M/sec # 1.327 per cycle

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-stat.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 184ff95ef4f51..80855090fd9fc 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -83,6 +83,7 @@ static __u64			event_scaled[MAX_COUNTERS];
 
 static __u64			runtime_nsecs;
 static __u64			walltime_nsecs;
+static __u64			runtime_cycles;
 
 static void create_perf_stat_counter(int counter)
 {
@@ -177,6 +178,9 @@ static void read_counter(int counter)
 	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
 		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
 		runtime_nsecs = count[0];
+	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
+		attrs[counter].config == PERF_COUNT_CPU_CYCLES)
+		runtime_cycles = count[0];
 }
 
 /*
@@ -214,6 +218,13 @@ static void print_counter(int counter)
 		if (runtime_nsecs)
 			fprintf(stderr, " # %11.3f M/sec",
 				(double)count[0]/runtime_nsecs*1000.0);
+		if (runtime_cycles &&
+			attrs[counter].type == PERF_TYPE_HARDWARE &&
+				attrs[counter].config == PERF_COUNT_INSTRUCTIONS) {
+
+			fprintf(stderr, " # %1.3f per cycle",
+				(double)count[0] / (double)runtime_cycles);
+		}
 	}
 	if (scaled)
 		fprintf(stderr, "  (scaled from %.2f%%)",
-- 
GitLab


From 0312af84164215a452f2a94957ebd9bce86e0204 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 07:42:04 +0200
Subject: [PATCH 1659/2198] perf_counter, x86: Implement generalized cache
 event types, add Core2 support

Fill in core2_hw_cache_event_id[] with the Core2 model specific events.

The events can be used in all the tools via the -e (--event) parameter,
for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses".

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index e86679fa52159..b1f71ff502567 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -194,7 +194,90 @@ static const u64 core2_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */
+		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static const u64 atom_hw_cache_event_ids
-- 
GitLab


From ad689220614b6c7c0b13b70d742f358e9310e71e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 09:30:41 +0200
Subject: [PATCH 1660/2198] perf_counter, x86: Implement generalized cache
 event types, add Atom support

Fill in core2_hw_cache_event_id[] with the Atom model specific events.

The events can be used in all the tools via the -e (--event) parameter,
for example "-e l1-misses" or -"-e l2-accesses" or "-e l2-write-misses".

( Note: these are straight from the Intel manuals - not tested yet.)

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 85 +++++++++++++++++++++++++++++-
 1 file changed, 84 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b1f71ff502567..71590e09d16e5 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -285,7 +285,90 @@ static const u64 atom_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_OP_MAX]
 				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
 {
-	/* To be filled in */
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST               */
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */
+		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */
+		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */
+		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
 };
 
 static u64 intel_pmu_raw_event(u64 event)
-- 
GitLab


From 1123e3ad73697d64ad99f0104bbe49f8b52d7d65 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 29 May 2009 11:25:09 +0200
Subject: [PATCH 1661/2198] perf_counter: Clean up x86 boot messages

Standardize and tidy up all the messages we print during
perfcounter initialization.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 46 ++++++++++++++++--------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 71590e09d16e5..0339d195a3f00 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1298,23 +1298,22 @@ static int intel_pmu_init(void)
 	if (version < 2)
 		return -ENODEV;
 
-	x86_pmu = intel_pmu;
-	x86_pmu.version = version;
-	x86_pmu.num_counters = eax.split.num_counters;
+	x86_pmu				= intel_pmu;
+	x86_pmu.version			= version;
+	x86_pmu.num_counters		= eax.split.num_counters;
+	x86_pmu.counter_bits		= eax.split.bit_width;
+	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1;
 
 	/*
 	 * Quirk: v2 perfmon does not report fixed-purpose counters, so
 	 * assume at least 3 counters:
 	 */
-	x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
-
-	x86_pmu.counter_bits = eax.split.bit_width;
-	x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1;
+	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3);
 
 	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
 
 	/*
-	 * Nehalem:
+	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
 	case 17:
@@ -1322,7 +1321,7 @@ static int intel_pmu_init(void)
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Core2 event tables\n");
+		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
@@ -1330,14 +1329,14 @@ static int intel_pmu_init(void)
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Nehalem/Corei7 event tables\n");
+		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
 		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
 			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
 
-		pr_info("... installed Atom event tables\n");
+		pr_cont("Atom events, ");
 		break;
 	}
 	return 0;
@@ -1353,6 +1352,8 @@ void __init init_hw_perf_counters(void)
 {
 	int err;
 
+	pr_info("Performance Counters: ");
+
 	switch (boot_cpu_data.x86_vendor) {
 	case X86_VENDOR_INTEL:
 		err = intel_pmu_init();
@@ -1363,14 +1364,13 @@ void __init init_hw_perf_counters(void)
 	default:
 		return;
 	}
-	if (err != 0)
+	if (err != 0) {
+		pr_cont("no PMU driver, software counters only.\n");
 		return;
+	}
 
-	pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name);
-	pr_info("... version:         %d\n", x86_pmu.version);
-	pr_info("... bit width:       %d\n", x86_pmu.counter_bits);
+	pr_cont("%s PMU driver.\n", x86_pmu.name);
 
-	pr_info("... num counters:    %d\n", x86_pmu.num_counters);
 	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
 		x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
 		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!",
@@ -1379,23 +1379,25 @@ void __init init_hw_perf_counters(void)
 	perf_counter_mask = (1 << x86_pmu.num_counters) - 1;
 	perf_max_counters = x86_pmu.num_counters;
 
-	pr_info("... value mask:      %016Lx\n", x86_pmu.counter_mask);
-	pr_info("... max period:      %016Lx\n", x86_pmu.max_period);
-
 	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
 		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
 		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!",
 		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
 	}
-	pr_info("... fixed counters:  %d\n", x86_pmu.num_counters_fixed);
 
 	perf_counter_mask |=
 		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
 
-	pr_info("... counter mask:    %016Lx\n", perf_counter_mask);
-
 	perf_counters_lapic_init();
 	register_die_notifier(&perf_counter_nmi_notifier);
+
+	pr_info("... version:                 %d\n",     x86_pmu.version);
+	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits);
+	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters);
+	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask);
+	pr_info("... max period:              %016Lx\n", x86_pmu.max_period);
+	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed);
+	pr_info("... counter mask:            %016Lx\n", perf_counter_mask);
 }
 
 static inline void x86_pmu_read(struct perf_counter *counter)
-- 
GitLab


From 312325094785566a0e42a88c1bf6e7eb54c5d70e Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:07:08 +0200
Subject: [PATCH 1662/2198] dma-debug: comment style fixes

Last patch series introduced some new comment which does not fit the
Kernel comment style guidelines. Fix it with this patch.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 77053d9ef513c..b8a61ff085441 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -542,7 +542,8 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 
 	write_lock_irqsave(&driver_name_lock, flags);
 
-	/* Now handle the string we got from userspace very carefully.
+	/*
+	 * Now handle the string we got from userspace very carefully.
 	 * The rules are:
 	 *         - only use the first token we got
 	 *         - token delimiter is everything looking like a space
@@ -551,7 +552,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	 */
 	if (!isalnum(buf[0])) {
 		/*
-		   If the first character userspace gave us is not
+		 * If the first character userspace gave us is not
 		 * alphanumerical then assume the filter should be
 		 * switched off.
 		 */
-- 
GitLab


From c17e2cf7376a2010b8b114fdeebd4e340a5e9cb2 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:19:29 +0200
Subject: [PATCH 1663/2198] dma-debug: code style fixes

This patch changes the recent updates to dma-debug to conform with
coding style guidelines of Linux and the -tip tree.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index b8a61ff085441..9561825c14a42 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -501,8 +501,8 @@ static int prealloc_memory(u32 num_entries)
 static ssize_t filter_read(struct file *file, char __user *user_buf,
 			   size_t count, loff_t *ppos)
 {
-	unsigned long flags;
 	char buf[NAME_MAX_LEN + 1];
+	unsigned long flags;
 	int len;
 
 	if (!current_driver_name[0])
@@ -523,9 +523,9 @@ static ssize_t filter_read(struct file *file, char __user *user_buf,
 static ssize_t filter_write(struct file *file, const char __user *userbuf,
 			    size_t count, loff_t *ppos)
 {
-	unsigned long flags;
 	char buf[NAME_MAX_LEN];
-	size_t len = NAME_MAX_LEN - 1;
+	unsigned long flags;
+	size_t len;
 	int i;
 
 	/*
@@ -534,7 +534,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	 * disabled. Since copy_from_user can fault and may sleep we
 	 * need to copy to temporary buffer first
 	 */
-	len = min(count, len);
+	len = min(count, NAME_MAX_LEN - 1);
 	if (copy_from_user(buf, userbuf, len))
 		return -EFAULT;
 
@@ -1040,18 +1040,19 @@ EXPORT_SYMBOL(debug_dma_map_sg);
 
 static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
 {
-	struct dma_debug_entry *entry;
+	struct dma_debug_entry *entry, ref;
 	struct hash_bucket *bucket;
 	unsigned long flags;
-	int mapped_ents = 0;
-	struct dma_debug_entry ref;
+	int mapped_ents;
 
-	ref.dev = dev;
+	ref.dev      = dev;
 	ref.dev_addr = sg_dma_address(s);
-	ref.size = sg_dma_len(s),
+	ref.size     = sg_dma_len(s),
+
+	bucket       = get_hash_bucket(&ref, &flags);
+	entry        = hash_bucket_find(bucket, &ref);
+	mapped_ents  = 0;
 
-	bucket = get_hash_bucket(&ref, &flags);
-	entry = hash_bucket_find(bucket, &ref);
 	if (entry)
 		mapped_ents = entry->sg_mapped_ents;
 	put_hash_bucket(bucket, &flags);
-- 
GitLab


From e7ed70eedccc78e79ce6da2155e9caf90aff4003 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:39:24 +0200
Subject: [PATCH 1664/2198] dma-debug: use pr_* instead of printk(KERN_* ...)

The pr_* macros are shorter than the old printk(KERN_ ...) variant.
Change the dma-debug code to use the new macros and save a few
unnecessary line breaks. If lines don't break the source code can also
be grepped more easily.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 36 +++++++++++++++---------------------
 1 file changed, 15 insertions(+), 21 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 9561825c14a42..24c4a2c5d61c0 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -139,7 +139,7 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 {
 #ifdef CONFIG_STACKTRACE
 	if (entry) {
-		printk(KERN_WARNING "Mapped at:\n");
+		pr_warning("Mapped at:\n");
 		print_stack_trace(&entry->stacktrace, 0);
 	}
 #endif
@@ -377,8 +377,7 @@ static struct dma_debug_entry *dma_entry_alloc(void)
 	spin_lock_irqsave(&free_entries_lock, flags);
 
 	if (list_empty(&free_entries)) {
-		printk(KERN_ERR "DMA-API: debugging out of memory "
-				"- disabling\n");
+		pr_err("DMA-API: debugging out of memory - disabling\n");
 		global_disable = true;
 		goto out;
 	}
@@ -483,8 +482,7 @@ static int prealloc_memory(u32 num_entries)
 	num_free_entries = num_entries;
 	min_free_entries = num_entries;
 
-	printk(KERN_INFO "DMA-API: preallocated %d debug entries\n",
-			num_entries);
+	pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
 
 	return 0;
 
@@ -534,7 +532,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	 * disabled. Since copy_from_user can fault and may sleep we
 	 * need to copy to temporary buffer first
 	 */
-	len = min(count, NAME_MAX_LEN - 1);
+	len = min(count, (size_t)(NAME_MAX_LEN - 1));
 	if (copy_from_user(buf, userbuf, len))
 		return -EFAULT;
 
@@ -557,8 +555,7 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 		 * switched off.
 		 */
 		if (current_driver_name[0])
-			printk(KERN_INFO "DMA-API: switching off dma-debug "
-					 "driver filter\n");
+			pr_info("DMA-API: switching off dma-debug driver filter\n");
 		current_driver_name[0] = 0;
 		current_driver = NULL;
 		goto out_unlock;
@@ -576,8 +573,8 @@ static ssize_t filter_write(struct file *file, const char __user *userbuf,
 	current_driver_name[i] = 0;
 	current_driver = NULL;
 
-	printk(KERN_INFO "DMA-API: enable driver filter for driver [%s]\n",
-	       current_driver_name);
+	pr_info("DMA-API: enable driver filter for driver [%s]\n",
+		current_driver_name);
 
 out_unlock:
 	write_unlock_irqrestore(&driver_name_lock, flags);
@@ -594,7 +591,7 @@ static int dma_debug_fs_init(void)
 {
 	dma_debug_dent = debugfs_create_dir("dma-api", NULL);
 	if (!dma_debug_dent) {
-		printk(KERN_ERR "DMA-API: can not create debugfs directory\n");
+		pr_err("DMA-API: can not create debugfs directory\n");
 		return -ENOMEM;
 	}
 
@@ -693,7 +690,7 @@ void dma_debug_add_bus(struct bus_type *bus)
 
 	nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
 	if (nb == NULL) {
-		printk(KERN_ERR "dma_debug_add_bus: out of memory\n");
+		pr_err("dma_debug_add_bus: out of memory\n");
 		return;
 	}
 
@@ -718,8 +715,7 @@ void dma_debug_init(u32 num_entries)
 	}
 
 	if (dma_debug_fs_init() != 0) {
-		printk(KERN_ERR "DMA-API: error creating debugfs entries "
-				"- disabling\n");
+		pr_err("DMA-API: error creating debugfs entries - disabling\n");
 		global_disable = true;
 
 		return;
@@ -729,8 +725,7 @@ void dma_debug_init(u32 num_entries)
 		num_entries = req_entries;
 
 	if (prealloc_memory(num_entries) != 0) {
-		printk(KERN_ERR "DMA-API: debugging out of memory error "
-				"- disabled\n");
+		pr_err("DMA-API: debugging out of memory error - disabled\n");
 		global_disable = true;
 
 		return;
@@ -738,7 +733,7 @@ void dma_debug_init(u32 num_entries)
 
 	nr_total_entries = num_free_entries;
 
-	printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n");
+	pr_info("DMA-API: debugging enabled by kernel config\n");
 }
 
 static __init int dma_debug_cmdline(char *str)
@@ -747,8 +742,7 @@ static __init int dma_debug_cmdline(char *str)
 		return -EINVAL;
 
 	if (strncmp(str, "off", 3) == 0) {
-		printk(KERN_INFO "DMA-API: debugging disabled on kernel "
-				 "command line\n");
+		pr_info("DMA-API: debugging disabled on kernel command line\n");
 		global_disable = true;
 	}
 
@@ -1239,8 +1233,8 @@ static int __init dma_debug_driver_setup(char *str)
 	}
 
 	if (current_driver_name[0])
-		printk(KERN_INFO "DMA-API: enable driver filter for "
-				 "driver [%s]\n", current_driver_name);
+		pr_info("DMA-API: enable driver filter for driver [%s]\n",
+			current_driver_name);
 
 
 	return 1;
-- 
GitLab


From be81c6ea23b8b471141734ef4bc005f5127aaf43 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:46:19 +0200
Subject: [PATCH 1665/2198] dma-debug: disable/enable irqs only once in
 device_dma_allocations

There is no need to disable/enable irqs on each loop iteration. Just
disable irqs for the whole time the loop runs.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 24c4a2c5d61c0..27b369da52c06 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -649,15 +649,19 @@ static int device_dma_allocations(struct device *dev)
 	unsigned long flags;
 	int count = 0, i;
 
+	local_irq_save(flags);
+
 	for (i = 0; i < HASH_SIZE; ++i) {
-		spin_lock_irqsave(&dma_entry_hash[i].lock, flags);
+		spin_lock(&dma_entry_hash[i].lock);
 		list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
 			if (entry->dev == dev)
 				count += 1;
 		}
-		spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags);
+		spin_unlock(&dma_entry_hash[i].lock);
 	}
 
+	local_irq_restore(flags);
+
 	return count;
 }
 
-- 
GitLab


From 0bf841281e58d0b3cc9fe9dc4383df7694bde6bd Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 15:53:46 +0200
Subject: [PATCH 1666/2198] dma-debug: simplify logic in driver_filter()

This patch makes the driver_filter function more readable by
reorganizing the code. The removal of a code code block to an upper
indentation level makes hard-to-read line-wraps unnecessary.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 lib/dma-debug.c | 42 +++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/lib/dma-debug.c b/lib/dma-debug.c
index 27b369da52c06..ad65fc0317d93 100644
--- a/lib/dma-debug.c
+++ b/lib/dma-debug.c
@@ -147,6 +147,10 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 
 static bool driver_filter(struct device *dev)
 {
+	struct device_driver *drv;
+	unsigned long flags;
+	bool ret;
+
 	/* driver filter off */
 	if (likely(!current_driver_name[0]))
 		return true;
@@ -155,32 +159,28 @@ static bool driver_filter(struct device *dev)
 	if (current_driver && dev->driver == current_driver)
 		return true;
 
-	/* driver filter on but not yet initialized */
-	if (!current_driver && current_driver_name[0]) {
-		struct device_driver *drv = get_driver(dev->driver);
-		unsigned long flags;
-		bool ret = false;
-
-		if (!drv)
-			return false;
-
-		/* lock to protect against change of current_driver_name */
-		read_lock_irqsave(&driver_name_lock, flags);
+	if (current_driver || !current_driver_name[0])
+		return false;
 
-		if (drv->name &&
-		    strncmp(current_driver_name, drv->name,
-			    NAME_MAX_LEN-1) == 0) {
-			current_driver = drv;
-			ret = true;
-		}
+	/* driver filter on but not yet initialized */
+	drv = get_driver(dev->driver);
+	if (!drv)
+		return false;
 
-		read_unlock_irqrestore(&driver_name_lock, flags);
-		put_driver(drv);
+	/* lock to protect against change of current_driver_name */
+	read_lock_irqsave(&driver_name_lock, flags);
 
-		return ret;
+	ret = false;
+	if (drv->name &&
+	    strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
+		current_driver = drv;
+		ret = true;
 	}
 
-	return false;
+	read_unlock_irqrestore(&driver_name_lock, flags);
+	put_driver(drv);
+
+	return ret;
 }
 
 #define err_printk(dev, entry, format, arg...) do {		\
-- 
GitLab


From 9aee2286071c23c535fe9928eec1a26e0bcf256d Mon Sep 17 00:00:00 2001
From: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Date: Mon, 8 Jun 2009 12:41:35 -0400
Subject: [PATCH 1667/2198] ext4: fix dx_map_entry to support 256k directory
 blocks

The dx_map_entry structure doesn't support over 64KB block size by
current usage of its member("offs"). Because "offs" treats an offset
of copies of the ext4_dir_entry_2 structure as is. This member size is
16 bits. But real offset for over 64KB(256KB) block size needs 18
bits. However, real offset keeps 4 byte boundary, so lower 2 bits is
not used.

Therefore, we do the following to fix this limitation:
For "store":
	we divide the real offset by 4 and then store this result to "offs"
	member.
For "use":
	we multiply "offs" member by 4 and then use this result
	as real offset.

Signed-off-by: Toshiyuki Okajima <toshi.okajima@jp.fujitsu.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/namei.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f2bc160463b72..07eb6649e4fa1 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -749,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
 			ext4fs_dirhash(de->name, de->name_len, &h);
 			map_tail--;
 			map_tail->hash = h.hash;
-			map_tail->offs = (u16) ((char *) de - base);
+			map_tail->offs = ((char *) de - base)>>2;
 			map_tail->size = le16_to_cpu(de->rec_len);
 			count++;
 			cond_resched();
@@ -1147,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;
 
 	while (count--) {
-		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
 		((struct ext4_dir_entry_2 *) to)->rec_len =
-- 
GitLab


From c4ed3f04ba9defe22aa729d1646f970f791c03d7 Mon Sep 17 00:00:00 2001
From: Jack Steiner <steiner@sgi.com>
Date: Mon, 8 Jun 2009 10:44:05 -0500
Subject: [PATCH 1668/2198] x86, UV: Fix macros for multiple coherency domains

Fix bug in the SGI UV macros that support systems with multiple
coherency domains.  The macros used for referencing global MMR
(chipset registers) are failing to correctly "or" the NASID
(node identifier) bits that reside above M+N. These high bits
are supplied automatically by the chipset for memory accesses
coming from the processor socket.

However, the bits must be present for references to the special
global MMR space used to map chipset registers. (See uv_hub.h
for more details ...)

The bug results in references to invalid/incorrect nodes.

Signed-off-by: Jack Steiner <steiner@sgi.com>
Cc: <stable@kernel.org>
LKML-Reference: <20090608154405.GA16395@sgi.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/uv/uv_hub.h   |  6 ++++--
 arch/x86/kernel/apic/x2apic_uv_x.c | 15 +++++++++------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d3a98ea1062ef..341070f7ad5cb 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -133,6 +133,7 @@ struct uv_scir_s {
 struct uv_hub_info_s {
 	unsigned long		global_mmr_base;
 	unsigned long		gpa_mask;
+	unsigned int		gnode_extra;
 	unsigned long		gnode_upper;
 	unsigned long		lowmem_remap_top;
 	unsigned long		lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
  * 		p -  PNODE (local part of nsids, right shifted 1)
  */
 #define UV_NASID_TO_PNODE(n)		(((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p)		(((p) << 1) | uv_hub_info->gnode_upper)
+#define UV_PNODE_TO_GNODE(p)		((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p)		(UV_PNODE_TO_GNODE(p) << 1)
 
 #define UV_LOCAL_MMR_BASE		0xf4000000UL
 #define UV_GLOBAL_MMR32_BASE		0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_GLOBAL_MMR32_PNODE_BITS(p)	((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
 
 #define UV_GLOBAL_MMR64_PNODE_BITS(p)					\
-	((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+	((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
 
 #define UV_APIC_PNODE_SHIFT	6
 
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2bda693529762..39f2af4b54675 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
 	union uvh_node_id_u node_id;
 	unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
 	int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
-	int max_pnode = 0;
+	int gnode_extra, max_pnode = 0;
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask;
 
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
 	mmr_base =
 	    uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
 	    ~UV_MMR_ENABLE;
+	pnode_mask = (1 << n_val) - 1;
+	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+	gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+	gnode_upper = ((unsigned long)gnode_extra  << m_val);
+	printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+			n_val, m_val, gnode_upper, gnode_extra);
+
 	printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
 	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -607,11 +614,6 @@ void __init uv_system_init(void)
 		}
 	}
 
-	pnode_mask = (1 << n_val) - 1;
-	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-	gnode_upper = (((unsigned long)node_id.s.node_id) &
-		       ~((1 << n_val) - 1)) << m_val;
-
 	uv_bios_init();
 	uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
 			    &sn_coherency_id, &sn_region_size);
@@ -634,6 +636,7 @@ void __init uv_system_init(void)
 		uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
 		uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
 		uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+		uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
 		uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
 		uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
 		uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
-- 
GitLab


From c9690998ef48ffefeccb91c70a7739eebdea57f9 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 8 Jun 2009 19:09:39 +0200
Subject: [PATCH 1669/2198] x86: memtest: remove 64-bit division

Using gcc 3.3.5 a "make allmodconfig" + "CONFIG_KVM=n"
triggers a build error:

 arch/x86/mm/built-in.o(.init.text+0x43f7): In function `__change_page_attr':
 arch/x86/mm/pageattr.c:114: undefined reference to `__udivdi3'
 make: *** [.tmp_vmlinux1] Error 1

The culprit turned out to be a division in arch/x86/mm/memtest.c
For more info see this thread:

  http://marc.info/?l=linux-kernel&m=124416232620683

The patch entirely removes the division that caused the build
error.

[ Impact: build fix with certain GCC versions ]

Reported-by: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: Yinghai Lu <yinghai@kernel.org>
Cc: xiyou.wangcong@gmail.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@kernel.org>
LKML-Reference: <20090608170939.GB12431@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/mm/memtest.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 605c8be06217b..c0bedcd10f973 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
 
 static void __init memtest(u64 pattern, u64 start_phys, u64 size)
 {
-	u64 i, count;
-	u64 *start;
+	u64 *p;
+	void *start, *end;
 	u64 start_bad, last_bad;
 	u64 start_phys_aligned;
 	size_t incr;
 
 	incr = sizeof(pattern);
 	start_phys_aligned = ALIGN(start_phys, incr);
-	count = (size - (start_phys_aligned - start_phys))/incr;
 	start = __va(start_phys_aligned);
+	end = start + size - (start_phys_aligned - start_phys);
 	start_bad = 0;
 	last_bad = 0;
 
-	for (i = 0; i < count; i++)
-		start[i] = pattern;
-	for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-		if (*start == pattern)
+	for (p = start; p < end; p++)
+		*p = pattern;
+	for (p = start; p < end; p++, start_phys_aligned += incr) {
+		if (*p == pattern)
 			continue;
 		if (start_phys_aligned == last_bad + incr) {
 			last_bad += incr;
-- 
GitLab


From f86748e91a14bd6cc49477560f33ed5d59896e89 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 22:33:10 +0200
Subject: [PATCH 1670/2198] perf_counter, x86: Implement generalized cache
 event types, add AMD support

Fill in amd_hw_cache_event_id[] with the AMD CPU specific events,
for family 0x0f, 0x10 and 0x11.

There's apparently no distinction between load and store events, so
we only fill in the load events.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 102 +++++++++++++++++++++++++++++
 1 file changed, 102 insertions(+)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 0339d195a3f00..93af821ebe518 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -389,6 +389,97 @@ static u64 intel_pmu_raw_event(u64 event)
 	return event & CORE_EVNTSEL_MASK;
 }
 
+static const u64 amd_0f_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */
+		[ C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(L2  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes        */
+		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
 /*
  * AMD Performance Monitor K7 and later.
  */
@@ -1345,6 +1436,17 @@ static int intel_pmu_init(void)
 static int amd_pmu_init(void)
 {
 	x86_pmu = amd_pmu;
+
+	switch (boot_cpu_data.x86) {
+	case 0x0f:
+	case 0x10:
+	case 0x11:
+		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids,
+		       sizeof(hw_cache_event_ids));
+
+		pr_cont("AMD Family 0f/10/11 events, ");
+		break;
+	}
 	return 0;
 }
 
-- 
GitLab


From 820a644211bc1ac7715333abdb0f0b9ea4fbb549 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 8 Jun 2009 19:10:25 +0200
Subject: [PATCH 1671/2198] perf_counter, x86: Clean up hw_cache_event ids
 copies

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 93af821ebe518..56001feeffcd2 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1409,23 +1409,20 @@ static int intel_pmu_init(void)
 	switch (boot_cpu_data.x86_model) {
 	case 17:
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Core2 events, ");
 		break;
 	default:
 	case 26:
 		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Nehalem/Corei7 events, ");
 		break;
 	case 28:
 		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
-		sizeof(u64)*PERF_COUNT_HW_CACHE_MAX*
-			PERF_COUNT_HW_CACHE_OP_MAX*PERF_COUNT_HW_CACHE_RESULT_MAX);
+		       sizeof(hw_cache_event_ids));
 
 		pr_cont("Atom events, ");
 		break;
-- 
GitLab


From dab5855b12411334355ba21349a06700e4ae7a3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 8 Jun 2009 21:11:57 +0300
Subject: [PATCH 1672/2198] perf_counter: Add mmap event hooks to mprotect()

Some JIT compilers allocate memory for generated code with
posix_memalign() + mprotect() so we need to hook into mprotect()
to make sure 'perf' is aware that we're executing code in
anonymous memory.

[ penberg@cs.helsinki.fi: move the hook to sys_mprotect() ]
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <Pine.LNX.4.64.0906082111030.12407@melkki.cs.Helsinki.FI>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 mm/mprotect.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b76fb41..d80311baeb2d2 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -23,6 +23,7 @@
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/migrate.h>
+#include <linux/perf_counter.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
@@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
 		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
 		if (error)
 			goto out;
+		perf_counter_mmap(vma);
 		nstart = tmp;
 
 		if (nstart < prev->vm_end)
-- 
GitLab


From 80d496be89ed7dede5abee5c057634e80a31c82d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 8 Jun 2009 21:12:48 +0300
Subject: [PATCH 1673/2198] perf report: Add support for profiling JIT
 generated code

This patch adds support for profiling JIT generated code to 'perf
report'. A JIT compiler is required to generate a "/tmp/perf-$PID.map"
symbols map that is parsed when looking and displaying symbols.

Thanks to Peter Zijlstra for his help with this patch!

Example "perf report" output with the Jato JIT:

 #
 # (40311 samples)
 #
 # Overhead           Command  Shared Object              Symbol
 # ........  ................  .........................  ......
 #
     97.80%              jato  /tmp/perf-11915.map        [.] Fibonacci.fib(I)I
      0.56%              jato  00000000b7fa023b           0x000000b7fa023b
      0.45%              jato  /tmp/perf-11915.map        [.] Fibonacci.main([Ljava/lang/String;)V
      0.38%              jato  [kernel]                   [k] get_page_from_freelist
      0.06%              jato  [kernel]                   [k] kunmap_atomic
      0.05%              jato  ./jato                     [.] utf8Hash
      0.04%              jato  ./jato                     [.] executeJava
      0.04%              jato  ./jato                     [.] defineClass

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: a.p.zijlstra@chello.nl
Cc: acme@redhat.com
LKML-Reference: <Pine.LNX.4.64.0906082111590.12407@melkki.cs.Helsinki.FI>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 15 ++++++++-
 tools/perf/util/symbol.c    | 65 +++++++++++++++++++++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f053a7463dcf0..61d871849b446 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -209,6 +209,11 @@ static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
 	return ip;
 }
 
+static inline int is_anon_memory(const char *filename)
+{
+     return strcmp(filename, "//anon") == 0;
+}
+
 static struct map *map__new(struct mmap_event *event)
 {
 	struct map *self = malloc(sizeof(*self));
@@ -216,6 +221,7 @@ static struct map *map__new(struct mmap_event *event)
 	if (self != NULL) {
 		const char *filename = event->filename;
 		char newfilename[PATH_MAX];
+		int anon;
 
 		if (cwd) {
 			int n = strcommon(filename);
@@ -227,6 +233,13 @@ static struct map *map__new(struct mmap_event *event)
 			}
 		}
 
+		anon = is_anon_memory(filename);
+
+		if (anon) {
+			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid);
+			filename = newfilename;
+		}
+
 		self->start = event->start;
 		self->end   = event->start + event->len;
 		self->pgoff = event->pgoff;
@@ -235,7 +248,7 @@ static struct map *map__new(struct mmap_event *event)
 		if (self->dso == NULL)
 			goto out_delete;
 
-		if (self->dso == vdso)
+		if (self->dso == vdso || anon)
 			self->map_ip = vdso__map_ip;
 		else
 			self->map_ip = map__map_ip;
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 158588c7f6b1d..32dd47d60d9ca 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -220,6 +220,68 @@ static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verb
 	return -1;
 }
 
+static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verbose)
+{
+	char *line = NULL;
+	size_t n;
+	FILE *file;
+	int nr_syms = 0;
+
+	file = fopen(self->name, "r");
+	if (file == NULL)
+		goto out_failure;
+
+	while (!feof(file)) {
+		__u64 start, size;
+		struct symbol *sym;
+		int line_len, len;
+
+		line_len = getline(&line, &n, file);
+		if (line_len < 0)
+			break;
+
+		if (!line)
+			goto out_failure;
+
+		line[--line_len] = '\0'; /* \n */
+
+		len = hex2u64(line, &start);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		len += hex2u64(line + len, &size);
+
+		len++;
+		if (len + 2 >= line_len)
+			continue;
+
+		sym = symbol__new(start, size, line + len,
+				  self->sym_priv_size, start, verbose);
+
+		if (sym == NULL)
+			goto out_delete_line;
+
+		if (filter && filter(self, sym))
+			symbol__delete(sym, self->sym_priv_size);
+		else {
+			dso__insert_symbol(self, sym);
+			nr_syms++;
+		}
+	}
+
+	free(line);
+	fclose(file);
+
+	return nr_syms;
+
+out_delete_line:
+	free(line);
+out_failure:
+	return -1;
+}
+
 /**
  * elf_symtab__for_each_symbol - iterate thru all the symbols
  *
@@ -507,6 +569,9 @@ int dso__load(struct dso *self, symbol_filter_t filter, int verbose)
 	if (!name)
 		return -1;
 
+	if (strncmp(self->name, "/tmp/perf-", 10) == 0)
+		return dso__load_perf_map(self, filter, verbose);
+
 more:
 	do {
 		switch (variant) {
-- 
GitLab


From aefcf37b82886260d8540c9fb815e613c8977e06 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 8 Jun 2009 23:15:28 +0200
Subject: [PATCH 1674/2198] perf_counter tools: Standardize color printing

The rule is:

 - high overhead: red
 -  mid overhead: green
 -  low overhead: normal (white/black)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c |  5 +++--
 tools/perf/builtin-report.c   | 13 ++++++++-----
 tools/perf/builtin-top.c      | 13 ++++++++-----
 3 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 0e23fe98ec4ec..3334a8bb1d513 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -1085,8 +1085,9 @@ parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
 			percent = 100.0 * hits / sym->hist_sum;
 
 		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
 		 */
 		if (percent >= 5.0)
 			color = PERF_COLOR_RED;
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 61d871849b446..0b18cb99a858e 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -693,13 +693,16 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 		char *color = PERF_COLOR_NORMAL;
 
 		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
 		 */
-		if (percent >= 5.0)
+		if (percent >= 5.0) {
 			color = PERF_COLOR_RED;
-		if (percent < 0.5)
-			color = PERF_COLOR_GREEN;
+		} else {
+			if (percent >= 0.5)
+				color = PERF_COLOR_GREEN;
+		}
 
 		ret = color_fprintf(fp, color, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index be1698f1189b7..8ba24808a3921 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -248,13 +248,16 @@ static void print_sym_table(void)
 					 sum_ksamples));
 
 		/*
-		 * We color high-overhead entries in red, low-overhead
-		 * entries in green - and keep the middle ground normal:
+		 * We color high-overhead entries in red, mid-overhead
+		 * entries in green - and keep the low overhead places
+		 * normal:
 		 */
-		if (pcnt >= 5.0)
+		if (pcnt >= 5.0) {
 			color = PERF_COLOR_RED;
-		if (pcnt < 0.5)
-			color = PERF_COLOR_GREEN;
+		} else {
+			if (pcnt >= 0.5)
+				color = PERF_COLOR_GREEN;
+		}
 
 		if (nr_counters == 1)
 			printf("%20.2f - ", syme->weight);
-- 
GitLab


From 1f8a6a10fb9437eac3f516ea4324a19087872f30 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 8 Jun 2009 18:18:39 +0200
Subject: [PATCH 1675/2198] ring-buffer: pass in lockdep class key for
 reader_lock

On Sun, 7 Jun 2009, Ingo Molnar wrote:
> Testing tracer sched_switch: <6>Starting ring buffer hammer
> PASSED
> Testing tracer sysprof: PASSED
> Testing tracer function: PASSED
> Testing tracer irqsoff:
> =============================================
> PASSED
> Testing tracer preemptoff: PASSED
> Testing tracer preemptirqsoff: [ INFO: possible recursive locking detected ]
> PASSED
> Testing tracer branch: 2.6.30-rc8-tip-01972-ge5b9078-dirty #5760
> ---------------------------------------------
> rb_consumer/431 is trying to acquire lock:
>  (&cpu_buffer->reader_lock){......}, at: [<c109eef7>] ring_buffer_reset_cpu+0x37/0x70
>
> but task is already holding lock:
>  (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0
>
> other info that might help us debug this:
> 1 lock held by rb_consumer/431:
>  #0:  (&cpu_buffer->reader_lock){......}, at: [<c10a019e>] ring_buffer_consume+0x7e/0xc0

The ring buffer is a generic structure, and can be used outside of
ftrace. If ftrace traces within the use of the ring buffer, it can produce
false positives with lockdep.

This patch passes in a static lock key into the allocation of the ring
buffer, so that different ring buffers will have their own lock class.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1244477919.13761.9042.camel@twins>

[ store key in ring buffer descriptor ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ring_buffer.h | 14 +++++++++++++-
 kernel/trace/ring_buffer.c  |  9 +++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f1345828c7c58..8670f1575fe19 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -105,7 +105,19 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
  * size is in bytes for each per CPU buffer.
  */
 struct ring_buffer *
-ring_buffer_alloc(unsigned long size, unsigned flags);
+__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc(size, flags)			\
+({							\
+	static struct lock_class_key __key;		\
+	__ring_buffer_alloc((size), (flags), &__key);	\
+})
+
 void ring_buffer_free(struct ring_buffer *buffer);
 
 int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 7102d7a2fadb3..22878b0d370cf 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -426,6 +426,8 @@ struct ring_buffer {
 	atomic_t			record_disabled;
 	cpumask_var_t			cpumask;
 
+	struct lock_class_key		*reader_lock_key;
+
 	struct mutex			mutex;
 
 	struct ring_buffer_per_cpu	**buffers;
@@ -565,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
 	cpu_buffer->cpu = cpu;
 	cpu_buffer->buffer = buffer;
 	spin_lock_init(&cpu_buffer->reader_lock);
+	lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
 	cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -635,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,
  * when the buffer wraps. If this flag is not set, the buffer will
  * drop data when the tail hits the head.
  */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+					struct lock_class_key *key)
 {
 	struct ring_buffer *buffer;
 	int bsize;
@@ -658,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
 	buffer->clock = trace_clock_local;
+	buffer->reader_lock_key = key;
 
 	/* need at least two pages */
 	if (buffer->pages == 1)
@@ -715,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	kfree(buffer);
 	return NULL;
 }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
-- 
GitLab


From 5bf1692f65c12a8aa359dc883468284ffc3c4587 Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Fri, 5 Jun 2009 14:44:58 +0900
Subject: [PATCH 1676/2198] TOMOYO: Remove unused field.

TOMOYO 2.2.0 is not using total_len field of "struct tomoyo_path_info".

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/common.c | 1 -
 security/tomoyo/common.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index 6d2561276a7b1..a44f655b3913c 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -428,7 +428,6 @@ void tomoyo_fill_path_info(struct tomoyo_path_info *ptr)
 	const char *name = ptr->name;
 	const int len = strlen(name);
 
-	ptr->total_len = len;
 	ptr->const_len = tomoyo_const_part_length(name);
 	ptr->is_dir = len && (name[len - 1] == '/');
 	ptr->is_patterned = (ptr->const_len < len);
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index 678f4ff16aa44..d8b95047cb9d1 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -35,7 +35,6 @@ struct tomoyo_page_buffer {
 struct tomoyo_path_info {
 	const char *name;
 	u32 hash;          /* = full_name_hash(name, strlen(name)) */
-	u16 total_len;     /* = strlen(name)                       */
 	u16 const_len;     /* = tomoyo_const_part_length(name)     */
 	bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
 	bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
-- 
GitLab


From c3fa109a5894077d1eaf8731ea741a15dd117b3c Mon Sep 17 00:00:00 2001
From: Tetsuo Handa <penguin-kernel@i-love.sakura.ne.jp>
Date: Mon, 8 Jun 2009 12:37:39 +0900
Subject: [PATCH 1677/2198] TOMOYO: Add description of lists and structures.

This patch adds some descriptions of lists and structures.
This patch contains no code changes.

Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Signed-off-by: James Morris <jmorris@namei.org>
---
 security/tomoyo/common.c   |  69 ++++++++++++-
 security/tomoyo/common.h   | 133 ++++++++++++++++++++++---
 security/tomoyo/domain.c   | 192 +++++++++++++++++++++++++++++++++++--
 security/tomoyo/file.c     | 121 +++++++++++++++++++++--
 security/tomoyo/realpath.c |  19 +++-
 security/tomoyo/tomoyo.c   |   4 +
 6 files changed, 504 insertions(+), 34 deletions(-)

diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c
index a44f655b3913c..fdd1f4b8c448e 100644
--- a/security/tomoyo/common.c
+++ b/security/tomoyo/common.c
@@ -28,7 +28,13 @@ static const char *tomoyo_mode_2[4] = {
 	"disabled", "enabled", "enabled", "enabled"
 };
 
-/* Table for profile. */
+/*
+ * tomoyo_control_array is a static data which contains
+ *
+ *  (1) functionality name used by /sys/kernel/security/tomoyo/profile .
+ *  (2) initial values for "struct tomoyo_profile".
+ *  (3) max values for "struct tomoyo_profile".
+ */
 static struct {
 	const char *keyword;
 	unsigned int current_value;
@@ -39,7 +45,13 @@ static struct {
 	[TOMOYO_VERBOSE]          = { "TOMOYO_VERBOSE",      1,       1 },
 };
 
-/* Profile table. Memory is allocated as needed. */
+/*
+ * tomoyo_profile is a structure which is used for holding the mode of access
+ * controls. TOMOYO has 4 modes: disabled, learning, permissive, enforcing.
+ * An administrator can define up to 256 profiles.
+ * The ->profile of "struct tomoyo_domain_info" is used for remembering
+ * the profile's number (0 - 255) assigned to that domain.
+ */
 static struct tomoyo_profile {
 	unsigned int value[TOMOYO_MAX_CONTROL_INDEX];
 	const struct tomoyo_path_info *comment;
@@ -1006,7 +1018,19 @@ static int tomoyo_read_profile(struct tomoyo_io_buffer *head)
 	return 0;
 }
 
-/* Structure for policy manager. */
+/*
+ * tomoyo_policy_manager_entry is a structure which is used for holding list of
+ * domainnames or programs which are permitted to modify configuration via
+ * /sys/kernel/security/tomoyo/ interface.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_policy_manager_list .
+ *  (2) "manager" is a domainname or a program's pathname.
+ *  (3) "is_domain" is a bool which is true if "manager" is a domainname, false
+ *      otherwise.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_policy_manager_entry {
 	struct list_head list;
 	/* A path to program or a domainname. */
@@ -1015,7 +1039,36 @@ struct tomoyo_policy_manager_entry {
 	bool is_deleted; /* True if this entry is deleted. */
 };
 
-/* The list for "struct tomoyo_policy_manager_entry". */
+/*
+ * tomoyo_policy_manager_list is used for holding list of domainnames or
+ * programs which are permitted to modify configuration via
+ * /sys/kernel/security/tomoyo/ interface.
+ *
+ * An entry is added by
+ *
+ * # echo '<kernel> /sbin/mingetty /bin/login /bin/bash' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *  (if you want to specify by a domainname)
+ *
+ *  or
+ *
+ * # echo '/usr/lib/ccs/editpolicy' > /sys/kernel/security/tomoyo/manager
+ *  (if you want to specify by a program's location)
+ *
+ * and is deleted by
+ *
+ * # echo 'delete <kernel> /sbin/mingetty /bin/login /bin/bash' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *
+ *  or
+ *
+ * # echo 'delete /usr/lib/ccs/editpolicy' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *
+ * and all entries are retrieved by
+ *
+ * # cat /sys/kernel/security/tomoyo/manager
+ */
 static LIST_HEAD(tomoyo_policy_manager_list);
 static DECLARE_RWSEM(tomoyo_policy_manager_list_lock);
 
@@ -2124,7 +2177,13 @@ static ssize_t tomoyo_write(struct file *file, const char __user *buf,
 	return tomoyo_write_control(file, buf, count);
 }
 
-/* Operations for /sys/kernel/security/tomoyo/ interface. */
+/*
+ * tomoyo_operations is a "struct file_operations" which is used for handling
+ * /sys/kernel/security/tomoyo/ interface.
+ *
+ * Some files under /sys/kernel/security/tomoyo/ directory accept open(O_RDWR).
+ * See tomoyo_io_buffer for internals.
+ */
 static const struct file_operations tomoyo_operations = {
 	.open    = tomoyo_open,
 	.release = tomoyo_release,
diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h
index d8b95047cb9d1..6d6ba09af4576 100644
--- a/security/tomoyo/common.h
+++ b/security/tomoyo/common.h
@@ -26,12 +26,40 @@
 struct dentry;
 struct vfsmount;
 
-/* Temporary buffer for holding pathnames. */
+/*
+ * tomoyo_page_buffer is a structure which is used for holding a pathname
+ * obtained from "struct dentry" and "struct vfsmount" pair.
+ * As of now, it is 4096 bytes. If users complain that 4096 bytes is too small
+ * (because TOMOYO escapes non ASCII printable characters using \ooo format),
+ * we will make the buffer larger.
+ */
 struct tomoyo_page_buffer {
 	char buffer[4096];
 };
 
-/* Structure for holding a token. */
+/*
+ * tomoyo_path_info is a structure which is used for holding a string data
+ * used by TOMOYO.
+ * This structure has several fields for supporting pattern matching.
+ *
+ * (1) "name" is the '\0' terminated string data.
+ * (2) "hash" is full_name_hash(name, strlen(name)).
+ *     This allows tomoyo_pathcmp() to compare by hash before actually compare
+ *     using strcmp().
+ * (3) "const_len" is the length of the initial segment of "name" which
+ *     consists entirely of non wildcard characters. In other words, the length
+ *     which we can compare two strings using strncmp().
+ * (4) "is_dir" is a bool which is true if "name" ends with "/",
+ *     false otherwise.
+ *     TOMOYO distinguishes directory and non-directory. A directory ends with
+ *     "/" and non-directory does not end with "/".
+ * (5) "is_patterned" is a bool which is true if "name" contains wildcard
+ *     characters, false otherwise. This allows TOMOYO to use "hash" and
+ *     strcmp() for string comparison if "is_patterned" is false.
+ * (6) "depth" is calculated using the number of "/" characters in "name".
+ *     This allows TOMOYO to avoid comparing two pathnames which never match
+ *     (e.g. whether "/var/www/html/index.html" matches "/tmp/sh-thd-\$").
+ */
 struct tomoyo_path_info {
 	const char *name;
 	u32 hash;          /* = full_name_hash(name, strlen(name)) */
@@ -50,7 +78,20 @@ struct tomoyo_path_info {
  */
 #define TOMOYO_MAX_PATHNAME_LEN 4000
 
-/* Structure for holding requested pathname. */
+/*
+ * tomoyo_path_info_with_data is a structure which is used for holding a
+ * pathname obtained from "struct dentry" and "struct vfsmount" pair.
+ *
+ * "struct tomoyo_path_info_with_data" consists of "struct tomoyo_path_info"
+ * and buffer for the pathname, while "struct tomoyo_page_buffer" consists of
+ * buffer for the pathname only.
+ *
+ * "struct tomoyo_path_info_with_data" is intended to allow TOMOYO to release
+ * both "struct tomoyo_path_info" and buffer for the pathname by single kfree()
+ * so that we don't need to return two pointers to the caller. If the caller
+ * puts "struct tomoyo_path_info" on stack memory, we will be able to remove
+ * "struct tomoyo_path_info_with_data".
+ */
 struct tomoyo_path_info_with_data {
 	/* Keep "head" first, for this pointer is passed to tomoyo_free(). */
 	struct tomoyo_path_info head;
@@ -60,7 +101,15 @@ struct tomoyo_path_info_with_data {
 };
 
 /*
- * Common header for holding ACL entries.
+ * tomoyo_acl_info is a structure which is used for holding
+ *
+ *  (1) "list" which is linked to the ->acl_info_list of
+ *      "struct tomoyo_domain_info"
+ *  (2) "type" which tells
+ *      (a) type & 0x7F : type of the entry (either
+ *          "struct tomoyo_single_path_acl_record" or
+ *          "struct tomoyo_double_path_acl_record")
+ *      (b) type & 0x80 : whether the entry is marked as "deleted".
  *
  * Packing "struct tomoyo_acl_info" allows
  * "struct tomoyo_single_path_acl_record" to embed "u16" and
@@ -80,7 +129,28 @@ struct tomoyo_acl_info {
 /* This ACL entry is deleted.           */
 #define TOMOYO_ACL_DELETED        0x80
 
-/* Structure for domain information. */
+/*
+ * tomoyo_domain_info is a structure which is used for holding permissions
+ * (e.g. "allow_read /lib/libc-2.5.so") given to each domain.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_list .
+ *  (2) "acl_info_list" which is linked to "struct tomoyo_acl_info".
+ *  (3) "domainname" which holds the name of the domain.
+ *  (4) "profile" which remembers profile number assigned to this domain.
+ *  (5) "is_deleted" is a bool which is true if this domain is marked as
+ *      "deleted", false otherwise.
+ *  (6) "quota_warned" is a bool which is used for suppressing warning message
+ *      when learning mode learned too much entries.
+ *  (7) "flags" which remembers this domain's attributes.
+ *
+ * A domain's lifecycle is an analogy of files on / directory.
+ * Multiple domains with the same domainname cannot be created (as with
+ * creating files with the same filename fails with -EEXIST).
+ * If a process reached a domain, that process can reside in that domain after
+ * that domain is marked as "deleted" (as with a process can access an already
+ * open()ed file after that file was unlink()ed).
+ */
 struct tomoyo_domain_info {
 	struct list_head list;
 	struct list_head acl_info_list;
@@ -107,10 +177,18 @@ struct tomoyo_domain_info {
 #define TOMOYO_DOMAIN_FLAGS_TRANSITION_FAILED        2
 
 /*
- * Structure for "allow_read/write", "allow_execute", "allow_read",
- * "allow_write", "allow_create", "allow_unlink", "allow_mkdir", "allow_rmdir",
- * "allow_mkfifo", "allow_mksock", "allow_mkblock", "allow_mkchar",
- * "allow_truncate", "allow_symlink" and "allow_rewrite" directive.
+ * tomoyo_single_path_acl_record is a structure which is used for holding an
+ * entry with one pathname operation (e.g. open(), mkdir()).
+ * It has following fields.
+ *
+ *  (1) "head" which is a "struct tomoyo_acl_info".
+ *  (2) "perm" which is a bitmask of permitted operations.
+ *  (3) "filename" is the pathname.
+ *
+ * Directives held by this structure are "allow_read/write", "allow_execute",
+ * "allow_read", "allow_write", "allow_create", "allow_unlink", "allow_mkdir",
+ * "allow_rmdir", "allow_mkfifo", "allow_mksock", "allow_mkblock",
+ * "allow_mkchar", "allow_truncate", "allow_symlink" and "allow_rewrite".
  */
 struct tomoyo_single_path_acl_record {
 	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_SINGLE_PATH_ACL */
@@ -119,7 +197,18 @@ struct tomoyo_single_path_acl_record {
 	const struct tomoyo_path_info *filename;
 };
 
-/* Structure for "allow_rename" and "allow_link" directive. */
+/*
+ * tomoyo_double_path_acl_record is a structure which is used for holding an
+ * entry with two pathnames operation (i.e. link() and rename()).
+ * It has following fields.
+ *
+ *  (1) "head" which is a "struct tomoyo_acl_info".
+ *  (2) "perm" which is a bitmask of permitted operations.
+ *  (3) "filename1" is the source/old pathname.
+ *  (4) "filename2" is the destination/new pathname.
+ *
+ * Directives held by this structure are "allow_rename" and "allow_link".
+ */
 struct tomoyo_double_path_acl_record {
 	struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_DOUBLE_PATH_ACL */
 	u8 perm;
@@ -152,7 +241,29 @@ struct tomoyo_double_path_acl_record {
 #define TOMOYO_VERBOSE                       2
 #define TOMOYO_MAX_CONTROL_INDEX             3
 
-/* Structure for reading/writing policy via securityfs interfaces. */
+/*
+ * tomoyo_io_buffer is a structure which is used for reading and modifying
+ * configuration via /sys/kernel/security/tomoyo/ interface.
+ * It has many fields. ->read_var1 , ->read_var2 , ->write_var1 are used as
+ * cursors.
+ *
+ * Since the content of /sys/kernel/security/tomoyo/domain_policy is a list of
+ * "struct tomoyo_domain_info" entries and each "struct tomoyo_domain_info"
+ * entry has a list of "struct tomoyo_acl_info", we need two cursors when
+ * reading (one is for traversing tomoyo_domain_list and the other is for
+ * traversing "struct tomoyo_acl_info"->acl_info_list ).
+ *
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * "select ", TOMOYO seeks the cursor ->read_var1 and ->write_var1 to the
+ * domain with the domainname specified by the rest of that line (NULL is set
+ * if seek failed).
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * "delete ", TOMOYO deletes an entry or a domain specified by the rest of that
+ * line (->write_var1 is set to NULL if a domain was deleted).
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * neither "select " nor "delete ", an entry or a domain specified by that line
+ * is appended.
+ */
 struct tomoyo_io_buffer {
 	int (*read) (struct tomoyo_io_buffer *);
 	int (*write) (struct tomoyo_io_buffer *);
diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c
index eb75401fd6b02..1d8b169605768 100644
--- a/security/tomoyo/domain.c
+++ b/security/tomoyo/domain.c
@@ -19,11 +19,63 @@
 /* The initial domain. */
 struct tomoyo_domain_info tomoyo_kernel_domain;
 
-/* The list for "struct tomoyo_domain_info". */
+/*
+ * tomoyo_domain_list is used for holding list of domains.
+ * The ->acl_info_list of "struct tomoyo_domain_info" is used for holding
+ * permissions (e.g. "allow_read /lib/libc-2.5.so") given to each domain.
+ *
+ * An entry is added by
+ *
+ * # ( echo "<kernel>"; echo "allow_execute /sbin/init" ) > \
+ *                                  /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and is deleted by
+ *
+ * # ( echo "<kernel>"; echo "delete allow_execute /sbin/init" ) > \
+ *                                  /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # cat /sys/kernel/security/tomoyo/domain_policy
+ *
+ * A domain is added by
+ *
+ * # echo "<kernel>" > /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and is deleted by
+ *
+ * # echo "delete <kernel>" > /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and all domains are retrieved by
+ *
+ * # grep '^<kernel>' /sys/kernel/security/tomoyo/domain_policy
+ *
+ * Normally, a domainname is monotonically getting longer because a domainname
+ * which the process will belong to if an execve() operation succeeds is
+ * defined as a concatenation of "current domainname" + "pathname passed to
+ * execve()".
+ * See tomoyo_domain_initializer_list and tomoyo_domain_keeper_list for
+ * exceptions.
+ */
 LIST_HEAD(tomoyo_domain_list);
 DECLARE_RWSEM(tomoyo_domain_list_lock);
 
-/* Structure for "initialize_domain" and "no_initialize_domain" keyword. */
+/*
+ * tomoyo_domain_initializer_entry is a structure which is used for holding
+ * "initialize_domain" and "no_initialize_domain" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_initializer_list .
+ *  (2) "domainname" which is "a domainname" or "the last component of a
+ *      domainname". This field is NULL if "from" clause is not specified.
+ *  (3) "program" which is a program's pathname.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ *  (5) "is_not" is a bool which is true if "no_initialize_domain", false
+ *      otherwise.
+ *  (6) "is_last_name" is a bool which is true if "domainname" is "the last
+ *      component of a domainname", false otherwise.
+ */
 struct tomoyo_domain_initializer_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *domainname;    /* This may be NULL */
@@ -34,7 +86,23 @@ struct tomoyo_domain_initializer_entry {
 	bool is_last_name;
 };
 
-/* Structure for "keep_domain" and "no_keep_domain" keyword. */
+/*
+ * tomoyo_domain_keeper_entry is a structure which is used for holding
+ * "keep_domain" and "no_keep_domain" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_keeper_list .
+ *  (2) "domainname" which is "a domainname" or "the last component of a
+ *      domainname".
+ *  (3) "program" which is a program's pathname.
+ *      This field is NULL if "from" clause is not specified.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ *  (5) "is_not" is a bool which is true if "no_initialize_domain", false
+ *      otherwise.
+ *  (6) "is_last_name" is a bool which is true if "domainname" is "the last
+ *      component of a domainname", false otherwise.
+ */
 struct tomoyo_domain_keeper_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *domainname;
@@ -45,7 +113,16 @@ struct tomoyo_domain_keeper_entry {
 	bool is_last_name;
 };
 
-/* Structure for "alias" keyword. */
+/*
+ * tomoyo_alias_entry is a structure which is used for holding "alias" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_alias_list .
+ *  (2) "original_name" which is a dereferenced pathname.
+ *  (3) "aliased_name" which is a symlink's pathname.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_alias_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *original_name;
@@ -92,7 +169,42 @@ const char *tomoyo_get_last_name(const struct tomoyo_domain_info *domain)
 	return cp0;
 }
 
-/* The list for "struct tomoyo_domain_initializer_entry". */
+/*
+ * tomoyo_domain_initializer_list is used for holding list of programs which
+ * triggers reinitialization of domainname. Normally, a domainname is
+ * monotonically getting longer. But sometimes, we restart daemon programs.
+ * It would be convenient for us that "a daemon started upon system boot" and
+ * "the daemon restarted from console" belong to the same domain. Thus, TOMOYO
+ * provides a way to shorten domainnames.
+ *
+ * An entry is added by
+ *
+ * # echo 'initialize_domain /usr/sbin/httpd' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete initialize_domain /usr/sbin/httpd' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^initialize_domain /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, /usr/sbin/httpd will belong to
+ * "<kernel> /usr/sbin/httpd" domain.
+ *
+ * You may specify a domainname using "from" keyword.
+ * "initialize_domain /usr/sbin/httpd from <kernel> /etc/rc.d/init.d/httpd"
+ * will cause "/usr/sbin/httpd" executed from "<kernel> /etc/rc.d/init.d/httpd"
+ * domain to belong to "<kernel> /usr/sbin/httpd" domain.
+ *
+ * You may add "no_" prefix to "initialize_domain".
+ * "initialize_domain /usr/sbin/httpd" and
+ * "no_initialize_domain /usr/sbin/httpd from <kernel> /etc/rc.d/init.d/httpd"
+ * will cause "/usr/sbin/httpd" to belong to "<kernel> /usr/sbin/httpd" domain
+ * unless executed from "<kernel> /etc/rc.d/init.d/httpd" domain.
+ */
 static LIST_HEAD(tomoyo_domain_initializer_list);
 static DECLARE_RWSEM(tomoyo_domain_initializer_list_lock);
 
@@ -268,7 +380,44 @@ static bool tomoyo_is_domain_initializer(const struct tomoyo_path_info *
 	return flag;
 }
 
-/* The list for "struct tomoyo_domain_keeper_entry". */
+/*
+ * tomoyo_domain_keeper_list is used for holding list of domainnames which
+ * suppresses domain transition. Normally, a domainname is monotonically
+ * getting longer. But sometimes, we want to suppress domain transition.
+ * It would be convenient for us that programs executed from a login session
+ * belong to the same domain. Thus, TOMOYO provides a way to suppress domain
+ * transition.
+ *
+ * An entry is added by
+ *
+ * # echo 'keep_domain <kernel> /usr/sbin/sshd /bin/bash' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete keep_domain <kernel> /usr/sbin/sshd /bin/bash' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^keep_domain /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, any process which belongs to
+ * "<kernel> /usr/sbin/sshd /bin/bash" domain will remain in that domain,
+ * unless explicitly specified by "initialize_domain" or "no_keep_domain".
+ *
+ * You may specify a program using "from" keyword.
+ * "keep_domain /bin/pwd from <kernel> /usr/sbin/sshd /bin/bash"
+ * will cause "/bin/pwd" executed from "<kernel> /usr/sbin/sshd /bin/bash"
+ * domain to remain in "<kernel> /usr/sbin/sshd /bin/bash" domain.
+ *
+ * You may add "no_" prefix to "keep_domain".
+ * "keep_domain <kernel> /usr/sbin/sshd /bin/bash" and
+ * "no_keep_domain /usr/bin/passwd from <kernel> /usr/sbin/sshd /bin/bash" will
+ * cause "/usr/bin/passwd" to belong to
+ * "<kernel> /usr/sbin/sshd /bin/bash /usr/bin/passwd" domain, unless
+ * explicitly specified by "initialize_domain".
+ */
 static LIST_HEAD(tomoyo_domain_keeper_list);
 static DECLARE_RWSEM(tomoyo_domain_keeper_list_lock);
 
@@ -437,7 +586,36 @@ static bool tomoyo_is_domain_keeper(const struct tomoyo_path_info *domainname,
 	return flag;
 }
 
-/* The list for "struct tomoyo_alias_entry". */
+/*
+ * tomoyo_alias_list is used for holding list of symlink's pathnames which are
+ * allowed to be passed to an execve() request. Normally, the domainname which
+ * the current process will belong to after execve() succeeds is calculated
+ * using dereferenced pathnames. But some programs behave differently depending
+ * on the name passed to argv[0]. For busybox, calculating domainname using
+ * dereferenced pathnames will cause all programs in the busybox to belong to
+ * the same domain. Thus, TOMOYO provides a way to allow use of symlink's
+ * pathname for checking execve()'s permission and calculating domainname which
+ * the current process will belong to after execve() succeeds.
+ *
+ * An entry is added by
+ *
+ * # echo 'alias /bin/busybox /bin/cat' > \
+ *                            /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete alias /bin/busybox /bin/cat' > \
+ *                            /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^alias /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if /bin/cat is a symlink to /bin/busybox and execution
+ * of /bin/cat is requested, permission is checked for /bin/cat rather than
+ * /bin/busybox and domainname which the current process will belong to after
+ * execve() succeeds is calculated using /bin/cat rather than /bin/busybox .
+ */
 static LIST_HEAD(tomoyo_alias_list);
 static DECLARE_RWSEM(tomoyo_alias_list_lock);
 
diff --git a/security/tomoyo/file.c b/security/tomoyo/file.c
index ab0cd35385107..5ae3a571559f5 100644
--- a/security/tomoyo/file.c
+++ b/security/tomoyo/file.c
@@ -14,21 +14,50 @@
 #include "realpath.h"
 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
 
-/* Structure for "allow_read" keyword. */
+/*
+ * tomoyo_globally_readable_file_entry is a structure which is used for holding
+ * "allow_read" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_globally_readable_list .
+ *  (2) "filename" is a pathname which is allowed to open(O_RDONLY).
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_globally_readable_file_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *filename;
 	bool is_deleted;
 };
 
-/* Structure for "file_pattern" keyword. */
+/*
+ * tomoyo_pattern_entry is a structure which is used for holding
+ * "tomoyo_pattern_list" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_pattern_list .
+ *  (2) "pattern" is a pathname pattern which is used for converting pathnames
+ *      to pathname patterns during learning mode.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_pattern_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *pattern;
 	bool is_deleted;
 };
 
-/* Structure for "deny_rewrite" keyword. */
+/*
+ * tomoyo_no_rewrite_entry is a structure which is used for holding
+ * "deny_rewrite" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_no_rewrite_list .
+ *  (2) "pattern" is a pathname which is by default not permitted to modify
+ *      already existing content.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_no_rewrite_entry {
 	struct list_head list;
 	const struct tomoyo_path_info *pattern;
@@ -141,7 +170,31 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
 					 struct tomoyo_domain_info *
 					 const domain, const bool is_delete);
 
-/* The list for "struct tomoyo_globally_readable_file_entry". */
+/*
+ * tomoyo_globally_readable_list is used for holding list of pathnames which
+ * are by default allowed to be open()ed for reading by any process.
+ *
+ * An entry is added by
+ *
+ * # echo 'allow_read /lib/libc-2.5.so' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete allow_read /lib/libc-2.5.so' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^allow_read /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, any process is allowed to
+ * open("/lib/libc-2.5.so", O_RDONLY).
+ * One exception is, if the domain which current process belongs to is marked
+ * as "ignore_global_allow_read", current process can't do so unless explicitly
+ * given "allow_read /lib/libc-2.5.so" to the domain which current process
+ * belongs to.
+ */
 static LIST_HEAD(tomoyo_globally_readable_list);
 static DECLARE_RWSEM(tomoyo_globally_readable_list_lock);
 
@@ -256,7 +309,35 @@ bool tomoyo_read_globally_readable_policy(struct tomoyo_io_buffer *head)
 	return done;
 }
 
-/* The list for "struct tomoyo_pattern_entry". */
+/* tomoyo_pattern_list is used for holding list of pathnames which are used for
+ * converting pathnames to pathname patterns during learning mode.
+ *
+ * An entry is added by
+ *
+ * # echo 'file_pattern /proc/\$/mounts' > \
+ *                             /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete file_pattern /proc/\$/mounts' > \
+ *                             /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^file_pattern /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if a process which belongs to a domain which is in
+ * learning mode requested open("/proc/1/mounts", O_RDONLY),
+ * "allow_read /proc/\$/mounts" is automatically added to the domain which that
+ * process belongs to.
+ *
+ * It is not a desirable behavior that we have to use /proc/\$/ instead of
+ * /proc/self/ when current process needs to access only current process's
+ * information. As of now, LSM version of TOMOYO is using __d_path() for
+ * calculating pathname. Non LSM version of TOMOYO is using its own function
+ * which pretends as if /proc/self/ is not a symlink; so that we can forbid
+ * current process from accessing other process's information.
+ */
 static LIST_HEAD(tomoyo_pattern_list);
 static DECLARE_RWSEM(tomoyo_pattern_list_lock);
 
@@ -377,7 +458,35 @@ bool tomoyo_read_file_pattern(struct tomoyo_io_buffer *head)
 	return done;
 }
 
-/* The list for "struct tomoyo_no_rewrite_entry". */
+/*
+ * tomoyo_no_rewrite_list is used for holding list of pathnames which are by
+ * default forbidden to modify already written content of a file.
+ *
+ * An entry is added by
+ *
+ * # echo 'deny_rewrite /var/log/messages' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete deny_rewrite /var/log/messages' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^deny_rewrite /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if a process requested to rewrite /var/log/messages ,
+ * the process can't rewrite unless the domain which that process belongs to
+ * has "allow_rewrite /var/log/messages" entry.
+ *
+ * It is not a desirable behavior that we have to add "\040(deleted)" suffix
+ * when we want to allow rewriting already unlink()ed file. As of now,
+ * LSM version of TOMOYO is using __d_path() for calculating pathname.
+ * Non LSM version of TOMOYO is using its own function which doesn't append
+ * " (deleted)" suffix if the file is already unlink()ed; so that we don't
+ * need to worry whether the file is already unlink()ed or not.
+ */
 static LIST_HEAD(tomoyo_no_rewrite_list);
 static DECLARE_RWSEM(tomoyo_no_rewrite_list_lock);
 
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index 3948f6b56ae2d..5f2e332633711 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -265,7 +265,16 @@ static unsigned int tomoyo_quota_for_savename;
  */
 #define TOMOYO_MAX_HASH 256
 
-/* Structure for string data. */
+/*
+ * tomoyo_name_entry is a structure which is used for linking
+ * "struct tomoyo_path_info" into tomoyo_name_list .
+ *
+ * Since tomoyo_name_list manages a list of strings which are shared by
+ * multiple processes (whereas "struct tomoyo_path_info" inside
+ * "struct tomoyo_path_info_with_data" is not shared), a reference counter will
+ * be added to "struct tomoyo_name_entry" rather than "struct tomoyo_path_info"
+ * when TOMOYO starts supporting garbage collector.
+ */
 struct tomoyo_name_entry {
 	struct list_head list;
 	struct tomoyo_path_info entry;
@@ -279,10 +288,10 @@ struct tomoyo_free_memory_block_list {
 };
 
 /*
- * The list for "struct tomoyo_name_entry".
- *
- * This list is updated only inside tomoyo_save_name(), thus
- * no global mutex exists.
+ * tomoyo_name_list is used for holding string data used by TOMOYO.
+ * Since same string data is likely used for multiple times (e.g.
+ * "/lib/libc-2.5.so"), TOMOYO shares string data in the form of
+ * "const struct tomoyo_path_info *".
  */
 static struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
 
diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c
index e42be5c4f055b..3194d09fe0f4c 100644
--- a/security/tomoyo/tomoyo.c
+++ b/security/tomoyo/tomoyo.c
@@ -262,6 +262,10 @@ static int tomoyo_dentry_open(struct file *f, const struct cred *cred)
 	return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, flags);
 }
 
+/*
+ * tomoyo_security_ops is a "struct security_operations" which is used for
+ * registering TOMOYO.
+ */
 static struct security_operations tomoyo_security_ops = {
 	.name                = "tomoyo",
 	.cred_prepare        = tomoyo_cred_prepare,
-- 
GitLab


From dbb66c4be020b01dc2f3d7c609ddb0e09d2c0af7 Mon Sep 17 00:00:00 2001
From: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Date: Tue, 9 Jun 2009 05:47:10 +0200
Subject: [PATCH 1678/2198] block: needs to set the residual length of a bidi
 request

Tejun's "block: set rq->resid_len to blk_rq_bytes() on issue" patch
seems to be incomplete; It doesn't set rq->resid_len to blk_rq_bytes()
for a bidi request (req->next_rq). As a result, all bidi users are
broken.

Signed-off-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Acked-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 7ae83a1e2acff..03c5a64b6ccbd 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1846,6 +1846,9 @@ void blk_start_request(struct request *req)
 	 * resid_len to full count and add the timeout handler.
 	 */
 	req->resid_len = blk_rq_bytes(req);
+	if (unlikely(blk_bidi_rq(req)))
+		req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
+
 	blk_add_timer(req);
 }
 EXPORT_SYMBOL(blk_start_request);
-- 
GitLab


From 40df6ae4277a67e97aa0a8bd8e293fdbb00e5623 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 15:59:38 -0500
Subject: [PATCH 1679/2198] cciss: Use schedule_timeout_uninterruptible in SCSI
 error handling code

Use schedule_timeout_uninterruptible instead of schedule_timeout in the
scsi error handling code when waiting between TUR polls since we are not
interested in nor want to be interrupted by signals.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss_scsi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 2edfc9b644eb6..134fdf25d7f42 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -1608,7 +1608,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 		/* Wait for a bit.  do this first, because if we send
 		 * the TUR right away, the reset will just abort it.
 		 */
-		schedule_timeout_interruptible(waittime);
+		schedule_timeout_uninterruptible(waittime);
 		count++;
 
 		/* Increase wait time with each try, up to a point. */
-- 
GitLab


From 5390cfc3fea49d015ae1eed8551c0bf00489b50e Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:01:11 -0500
Subject: [PATCH 1680/2198] cciss: factor out core of sendcmd_withirq() for use
 by SCSI error handling code

Factor the core of sendcmd_withirq out to provide a simpler interface
which provides access to full error information.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c | 183 ++++++++++++++++++++++--------------------
 1 file changed, 96 insertions(+), 87 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e7d00952dd4fe..74fc85aded921 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2278,114 +2278,123 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 	return status;
 }
 
-static int sendcmd_withirq(__u8 cmd,
-			   int ctlr,
-			   void *buff,
-			   size_t size,
-			   unsigned int use_unit_num,
-			   unsigned int log_unit, __u8 page_code, int cmd_type)
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 {
-	ctlr_info_t *h = hba[ctlr];
-	CommandList_struct *c;
+	DECLARE_COMPLETION_ONSTACK(wait);
 	u64bit buff_dma_handle;
 	unsigned long flags;
-	int return_status;
-	DECLARE_COMPLETION_ONSTACK(wait);
+	int return_status = IO_OK;
 
-	if ((c = cmd_alloc(h, 0)) == NULL)
-		return -ENOMEM;
-	return_status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-				 log_unit, page_code, NULL, cmd_type);
-	if (return_status != IO_OK) {
-		cmd_free(h, c, 0);
-		return return_status;
-	}
-      resend_cmd2:
+resend_cmd2:
 	c->waiting = &wait;
-
 	/* Put the request on the tail of the queue and send it */
-	spin_lock_irqsave(CCISS_LOCK(ctlr), flags);
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
 	addQ(&h->reqQ, c);
 	h->Qdepth++;
 	start_io(h);
-	spin_unlock_irqrestore(CCISS_LOCK(ctlr), flags);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
 
 	wait_for_completion(&wait);
 
-	if (c->err_info->CommandStatus != 0) {	/* an error has occurred */
-		switch (c->err_info->CommandStatus) {
-		case CMD_TARGET_STATUS:
-			printk(KERN_WARNING "cciss: cmd %p has "
-			       " completed with errors\n", c);
-			if (c->err_info->ScsiStatus) {
-				printk(KERN_WARNING "cciss: cmd %p "
-				       "has SCSI Status = %x\n",
-				       c, c->err_info->ScsiStatus);
-			}
+	if (c->err_info->CommandStatus == 0)
+		goto command_done;
 
-			break;
-		case CMD_DATA_UNDERRUN:
-		case CMD_DATA_OVERRUN:
-			/* expected for inquire and report lun commands */
-			break;
-		case CMD_INVALID:
-			printk(KERN_WARNING "cciss: Cmd %p is "
-			       "reported invalid\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_PROTOCOL_ERR:
-			printk(KERN_WARNING "cciss: cmd %p has "
-			       "protocol error \n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_HARDWARE_ERR:
-			printk(KERN_WARNING "cciss: cmd %p had "
-			       " hardware error\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_CONNECTION_LOST:
-			printk(KERN_WARNING "cciss: cmd %p had "
-			       "connection lost\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_ABORTED:
-			printk(KERN_WARNING "cciss: cmd %p was "
-			       "aborted\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_ABORT_FAILED:
-			printk(KERN_WARNING "cciss: cmd %p reports "
-			       "abort failed\n", c);
-			return_status = IO_ERROR;
-			break;
-		case CMD_UNSOLICITED_ABORT:
+	switch (c->err_info->CommandStatus) {
+	case CMD_TARGET_STATUS:
+		printk(KERN_WARNING "cciss: cmd 0x%02x "
+		"has completed with errors\n", c->Request.CDB[0]);
+		if (c->err_info->ScsiStatus) {
+			printk(KERN_WARNING "cciss: cmd 0x%02x "
+			       "has SCSI Status = %x\n",
+			       c->Request.CDB[0], c->err_info->ScsiStatus);
+		}
+		break;
+	case CMD_DATA_UNDERRUN:
+	case CMD_DATA_OVERRUN:
+		/* expected for inquiry and report lun commands */
+		break;
+	case CMD_INVALID:
+		printk(KERN_WARNING "cciss: Cmd 0x%02x is "
+		       "reported invalid\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_PROTOCOL_ERR:
+		printk(KERN_WARNING "cciss: cmd 0x%02x has "
+		       "protocol error \n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_HARDWARE_ERR:
+		printk(KERN_WARNING "cciss: cmd 0x%02x had "
+		       " hardware error\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_CONNECTION_LOST:
+		printk(KERN_WARNING "cciss: cmd 0x%02x had "
+		       "connection lost\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_ABORTED:
+		printk(KERN_WARNING "cciss: cmd 0x%02x was "
+		       "aborted\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_ABORT_FAILED:
+		printk(KERN_WARNING "cciss: cmd 0x%02x reports "
+		       "abort failed\n", c->Request.CDB[0]);
+		return_status = IO_ERROR;
+		break;
+	case CMD_UNSOLICITED_ABORT:
+		printk(KERN_WARNING
+		       "cciss%d: unsolicited abort 0x%02x\n", h->ctlr,
+			c->Request.CDB[0]);
+		if (c->retry_count < MAX_CMD_RETRIES) {
 			printk(KERN_WARNING
-			       "cciss%d: unsolicited abort %p\n", ctlr, c);
-			if (c->retry_count < MAX_CMD_RETRIES) {
-				printk(KERN_WARNING
-				       "cciss%d: retrying %p\n", ctlr, c);
-				c->retry_count++;
-				/* erase the old error information */
-				memset(c->err_info, 0,
-				       sizeof(ErrorInfo_struct));
-				return_status = IO_OK;
-				INIT_COMPLETION(wait);
-				goto resend_cmd2;
-			}
-			return_status = IO_ERROR;
-			break;
-		default:
-			printk(KERN_WARNING "cciss: cmd %p returned "
-			       "unknown status %x\n", c,
-			       c->err_info->CommandStatus);
-			return_status = IO_ERROR;
+			       "cciss%d: retrying 0x%02x\n", h->ctlr,
+				c->Request.CDB[0]);
+			c->retry_count++;
+			/* erase the old error information */
+			memset(c->err_info, 0,
+			       sizeof(ErrorInfo_struct));
+			return_status = IO_OK;
+			INIT_COMPLETION(wait);
+			goto resend_cmd2;
 		}
+		return_status = IO_ERROR;
+		break;
+	default:
+		printk(KERN_WARNING "cciss: cmd 0x%02x returned "
+		       "unknown status %x\n", c->Request.CDB[0],
+		       c->err_info->CommandStatus);
+		return_status = IO_ERROR;
 	}
+
+command_done:
 	/* unlock the buffers from DMA */
 	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
 	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
 			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
+	return return_status;
+}
+
+static int sendcmd_withirq(__u8 cmd,
+			   int ctlr,
+			   void *buff,
+			   size_t size,
+			   unsigned int use_unit_num,
+			   unsigned int log_unit, __u8 page_code, int cmd_type)
+{
+	ctlr_info_t *h = hba[ctlr];
+	CommandList_struct *c;
+	int return_status;
+
+	c = cmd_alloc(h, 0);
+	if (!c)
+		return -ENOMEM;
+	return_status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
+				 log_unit, page_code, NULL, cmd_type);
+	if (return_status == IO_OK)
+		return_status = sendcmd_withirq_core(h, c);
 	cmd_free(h, c, 0);
 	return return_status;
 }
-- 
GitLab


From b57695fe131b13d3f2460cfeb9175cff673ed337 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:02:17 -0500
Subject: [PATCH 1681/2198] cciss: simplify interface of sendcmd() and
 sendcmd_withirq()

Simplify interfaces of sendcmd() and sendcmd_withirq() so that they
provide only one way to address commands instead of three ways.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c      | 115 +++++++++++++++++--------------------
 drivers/block/cciss_cmd.h  |   2 +
 drivers/block/cciss_scsi.c |  26 ++-------
 3 files changed, 59 insertions(+), 84 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 74fc85aded921..885ea1e38e429 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -180,11 +180,10 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *, struct pci_dev *,
 					   __u32);
 static void start_io(ctlr_info_t *h);
 static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-		   unsigned int use_unit_num, unsigned int log_unit,
 		   __u8 page_code, unsigned char *scsi3addr, int cmd_type);
 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
-			   unsigned int use_unit_num, unsigned int log_unit,
-			   __u8 page_code, int cmd_type);
+			__u8 page_code, unsigned char scsi3addr[],
+			int cmd_type);
 
 static void fail_all_cmds(unsigned long ctlr);
 static int scan_thread(void *data);
@@ -1520,6 +1519,15 @@ static void cciss_softirq_done(struct request *rq)
 	spin_unlock_irqrestore(&h->lock, flags);
 }
 
+static void log_unit_to_scsi3addr(ctlr_info_t *h, unsigned char scsi3addr[],
+	uint32_t log_unit)
+{
+	log_unit = h->drv[log_unit].LunID & 0x03fff;
+	memset(&scsi3addr[4], 0, 4);
+	memcpy(&scsi3addr[0], &log_unit, 4);
+	scsi3addr[3] |= 0x40;
+}
+
 /* This function gets the SCSI vendor, model, and revision of a logical drive
  * via the inquiry page 0.  Model, vendor, and rev are set to empty strings if
  * they cannot be read.
@@ -1529,6 +1537,7 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
 {
 	int rc;
 	InquiryData_struct *inq_buf;
+	unsigned char scsi3addr[8];
 
 	*vendor = '\0';
 	*model = '\0';
@@ -1538,14 +1547,15 @@ static void cciss_get_device_descr(int ctlr, int logvol, int withirq,
 	if (!inq_buf)
 		return;
 
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, inq_buf,
-				     sizeof(InquiryData_struct), 1, logvol,
-				     0, TYPE_CMD);
+			     sizeof(InquiryData_struct), 0,
+				scsi3addr, TYPE_CMD);
 	else
 		rc = sendcmd(CISS_INQUIRY, ctlr, inq_buf,
-			     sizeof(InquiryData_struct), 1, logvol, 0, NULL,
-			     TYPE_CMD);
+			     sizeof(InquiryData_struct), 0,
+				scsi3addr, TYPE_CMD);
 	if (rc == IO_OK) {
 		memcpy(vendor, &inq_buf->data_byte[8], VENDOR_LEN);
 		vendor[VENDOR_LEN] = '\0';
@@ -1570,6 +1580,7 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
 #define PAGE_83_INQ_BYTES 64
 	int rc;
 	unsigned char *buf;
+	unsigned char scsi3addr[8];
 
 	if (buflen > 16)
 		buflen = 16;
@@ -1578,12 +1589,13 @@ static void cciss_get_serial_no(int ctlr, int logvol, int withirq,
 	if (!buf)
 		return;
 	memset(serial_no, 0, buflen);
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		rc = sendcmd_withirq(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 1, logvol, 0x83, TYPE_CMD);
+			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
 	else
 		rc = sendcmd(CISS_INQUIRY, ctlr, buf,
-			PAGE_83_INQ_BYTES, 1, logvol, 0x83, NULL, TYPE_CMD);
+			PAGE_83_INQ_BYTES, 0x83, scsi3addr, TYPE_CMD);
 	if (rc == IO_OK)
 		memcpy(serial_no, &buf[8], buflen);
 	kfree(buf);
@@ -1902,8 +1914,8 @@ static int rebuild_lun_table(ctlr_info_t *h, int first_time)
 		goto mem_msg;
 
 	return_code = sendcmd_withirq(CISS_REPORT_LOG, ctlr, ld_buff,
-				      sizeof(ReportLunData_struct), 0,
-				      0, 0, TYPE_CMD);
+				      sizeof(ReportLunData_struct),
+				      0, CTLR_LUNID, TYPE_CMD);
 
 	if (return_code == IO_OK)
 		listlength = be32_to_cpu(*(__be32 *) ld_buff->LUNListLength);
@@ -2112,11 +2124,9 @@ static int deregister_disk(ctlr_info_t *h, int drv_index,
 	return 0;
 }
 
-static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_t size, unsigned int use_unit_num,	/* 0: address the controller,
-															   1: address logical volume log_unit,
-															   2: periph device address is scsi3addr */
-		    unsigned int log_unit, __u8 page_code,
-		    unsigned char *scsi3addr, int cmd_type)
+static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
+		size_t size, __u8 page_code, unsigned char *scsi3addr,
+		int cmd_type)
 {
 	ctlr_info_t *h = hba[ctlr];
 	u64bit buff_dma_handle;
@@ -2132,27 +2142,12 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 		c->Header.SGTotal = 0;
 	}
 	c->Header.Tag.lower = c->busaddr;
+	memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
 
 	c->Request.Type.Type = cmd_type;
 	if (cmd_type == TYPE_CMD) {
 		switch (cmd) {
 		case CISS_INQUIRY:
-			/* If the logical unit number is 0 then, this is going
-			   to controller so It's a physical command
-			   mode = 0 target = 0.  So we have nothing to write.
-			   otherwise, if use_unit_num == 1,
-			   mode = 1(volume set addressing) target = LUNID
-			   otherwise, if use_unit_num == 2,
-			   mode = 0(periph dev addr) target = scsi3addr */
-			if (use_unit_num == 1) {
-				c->Header.LUN.LogDev.VolId =
-				    h->drv[log_unit].LunID;
-				c->Header.LUN.LogDev.Mode = 1;
-			} else if (use_unit_num == 2) {
-				memcpy(c->Header.LUN.LunAddrBytes, scsi3addr,
-				       8);
-				c->Header.LUN.LogDev.Mode = 0;
-			}
 			/* are we trying to read a vital product page */
 			if (page_code != 0) {
 				c->Request.CDB[1] = 0x01;
@@ -2182,8 +2177,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			break;
 
 		case CCISS_READ_CAPACITY:
-			c->Header.LUN.LogDev.VolId = h->drv[log_unit].LunID;
-			c->Header.LUN.LogDev.Mode = 1;
 			c->Request.CDBLen = 10;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_READ;
@@ -2191,8 +2184,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			c->Request.CDB[0] = cmd;
 			break;
 		case CCISS_READ_CAPACITY_16:
-			c->Header.LUN.LogDev.VolId = h->drv[log_unit].LunID;
-			c->Header.LUN.LogDev.Mode = 1;
 			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_READ;
@@ -2215,7 +2206,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			c->Request.CDB[6] = BMIC_CACHE_FLUSH;
 			break;
 		case TEST_UNIT_READY:
-			memcpy(c->Header. LUN.LunAddrBytes, scsi3addr, 8);
 			c->Request.CDBLen = 6;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_NONE;
@@ -2239,7 +2229,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff, size_
 			memcpy(&c->Request.CDB[4], buff, 8);
 			break;
 		case 1:	/* RESET message */
-			memcpy(c->Header.LUN.LunAddrBytes, scsi3addr, 8);
 			c->Request.CDBLen = 16;
 			c->Request.Type.Attribute = ATTR_SIMPLE;
 			c->Request.Type.Direction = XFER_NONE;
@@ -2307,6 +2296,9 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 			printk(KERN_WARNING "cciss: cmd 0x%02x "
 			       "has SCSI Status = %x\n",
 			       c->Request.CDB[0], c->err_info->ScsiStatus);
+			if (c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION)
+				printk(KERN_WARNING "sense key = 0x%02x\n",
+					0xf & c->err_info->SenseInfo[2]);
 		}
 		break;
 	case CMD_DATA_UNDERRUN:
@@ -2377,12 +2369,9 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 	return return_status;
 }
 
-static int sendcmd_withirq(__u8 cmd,
-			   int ctlr,
-			   void *buff,
-			   size_t size,
-			   unsigned int use_unit_num,
-			   unsigned int log_unit, __u8 page_code, int cmd_type)
+static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
+			   __u8 page_code, unsigned char scsi3addr[],
+			int cmd_type)
 {
 	ctlr_info_t *h = hba[ctlr];
 	CommandList_struct *c;
@@ -2391,8 +2380,8 @@ static int sendcmd_withirq(__u8 cmd,
 	c = cmd_alloc(h, 0);
 	if (!c)
 		return -ENOMEM;
-	return_status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-				 log_unit, page_code, NULL, cmd_type);
+	return_status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
+		scsi3addr, cmd_type);
 	if (return_status == IO_OK)
 		return_status = sendcmd_withirq_core(h, c);
 	cmd_free(h, c, 0);
@@ -2407,15 +2396,17 @@ static void cciss_geometry_inquiry(int ctlr, int logvol,
 {
 	int return_code;
 	unsigned long t;
+	unsigned char scsi3addr[8];
 
 	memset(inq_buff, 0, sizeof(InquiryData_struct));
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		return_code = sendcmd_withirq(CISS_INQUIRY, ctlr,
-					      inq_buff, sizeof(*inq_buff), 1,
-					      logvol, 0xC1, TYPE_CMD);
+					      inq_buff, sizeof(*inq_buff),
+					      0xC1, scsi3addr, TYPE_CMD);
 	else
 		return_code = sendcmd(CISS_INQUIRY, ctlr, inq_buff,
-				      sizeof(*inq_buff), 1, logvol, 0xC1, NULL,
+				      sizeof(*inq_buff), 0xC1, scsi3addr,
 				      TYPE_CMD);
 	if (return_code == IO_OK) {
 		if (inq_buff->data_byte[8] == 0xFF) {
@@ -2456,6 +2447,7 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 {
 	ReadCapdata_struct *buf;
 	int return_code;
+	unsigned char scsi3addr[8];
 
 	buf = kzalloc(sizeof(ReadCapdata_struct), GFP_KERNEL);
 	if (!buf) {
@@ -2463,14 +2455,15 @@ cciss_read_capacity(int ctlr, int logvol, int withirq, sector_t *total_size,
 		return;
 	}
 
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq)
 		return_code = sendcmd_withirq(CCISS_READ_CAPACITY,
 				ctlr, buf, sizeof(ReadCapdata_struct),
-					1, logvol, 0, TYPE_CMD);
+					0, scsi3addr, TYPE_CMD);
 	else
 		return_code = sendcmd(CCISS_READ_CAPACITY,
 				ctlr, buf, sizeof(ReadCapdata_struct),
-					1, logvol, 0, NULL, TYPE_CMD);
+					0, scsi3addr, TYPE_CMD);
 	if (return_code == IO_OK) {
 		*total_size = be32_to_cpu(*(__be32 *) buf->total_size);
 		*block_size = be32_to_cpu(*(__be32 *) buf->block_size);
@@ -2490,6 +2483,7 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
 {
 	ReadCapdata_struct_16 *buf;
 	int return_code;
+	unsigned char scsi3addr[8];
 
 	buf = kzalloc(sizeof(ReadCapdata_struct_16), GFP_KERNEL);
 	if (!buf) {
@@ -2497,15 +2491,16 @@ cciss_read_capacity_16(int ctlr, int logvol, int withirq, sector_t *total_size,
 		return;
 	}
 
+	log_unit_to_scsi3addr(hba[ctlr], scsi3addr, logvol);
 	if (withirq) {
 		return_code = sendcmd_withirq(CCISS_READ_CAPACITY_16,
 			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				1, logvol, 0, TYPE_CMD);
+				0, scsi3addr, TYPE_CMD);
 	}
 	else {
 		return_code = sendcmd(CCISS_READ_CAPACITY_16,
 			ctlr, buf, sizeof(ReadCapdata_struct_16),
-				1, logvol, 0, NULL, TYPE_CMD);
+				0, scsi3addr, TYPE_CMD);
 	}
 	if (return_code == IO_OK) {
 		*total_size = be64_to_cpu(*(__be64 *) buf->total_size);
@@ -2760,10 +2755,6 @@ static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
  * Used at init time, and during SCSI error recovery.
  */
 static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
-	unsigned int use_unit_num,/* 0: address the controller,
-				     1: address logical volume log_unit,
-				     2: periph device address is scsi3addr */
-	unsigned int log_unit,
 	__u8 page_code, unsigned char *scsi3addr, int cmd_type)
 {
 	CommandList_struct *c;
@@ -2774,8 +2765,8 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
 		printk(KERN_WARNING "cciss: unable to get memory");
 		return IO_ERROR;
 	}
-	status = fill_cmd(c, cmd, ctlr, buff, size, use_unit_num,
-			  log_unit, page_code, scsi3addr, cmd_type);
+	status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
+		scsi3addr, cmd_type);
 	if (status == IO_OK)
 		status = sendcmd_core(hba[ctlr], c);
 	cmd_free(hba[ctlr], c, 1);
@@ -4076,7 +4067,7 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 	}
 
 	return_code = sendcmd_withirq(CISS_INQUIRY, i, inq_buff,
-		sizeof(InquiryData_struct), 0, 0 , 0, TYPE_CMD);
+		sizeof(InquiryData_struct), 0, CTLR_LUNID, TYPE_CMD);
 	if (return_code == IO_OK) {
 		hba[i]->firm_ver[0] = inq_buff->data_byte[32];
 		hba[i]->firm_ver[1] = inq_buff->data_byte[33];
@@ -4157,8 +4148,8 @@ static void cciss_shutdown(struct pci_dev *pdev)
 	/* sendcmd will turn off interrupt, and send the flush...
 	 * To write all data in the battery backed cache to disks */
 	memset(flush_buf, 0, 4);
-	return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0, 0, 0, NULL,
-			      TYPE_CMD);
+	return_code = sendcmd(CCISS_CACHE_FLUSH, i, flush_buf, 4, 0,
+		CTLR_LUNID, TYPE_CMD);
 	if (return_code == IO_OK) {
 		printk(KERN_INFO "Completed flushing cache on controller %d\n", i);
 	} else {
diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h
index 40b1b92dae7fc..cd665b00c7c5b 100644
--- a/drivers/block/cciss_cmd.h
+++ b/drivers/block/cciss_cmd.h
@@ -217,6 +217,8 @@ typedef union _LUNAddr_struct {
   LogDevAddr_struct  LogDev;
 } LUNAddr_struct;
 
+#define CTLR_LUNID "\0\0\0\0\0\0\0\0"
+
 typedef struct _CommandListHeader_struct {
   BYTE              ReplyQueue;
   BYTE              SGList;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 134fdf25d7f42..007ab8aea85bf 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -44,26 +44,9 @@
 #define CCISS_ABORT_MSG 0x00
 #define CCISS_RESET_MSG 0x01
 
-/* some prototypes... */ 
-static int sendcmd(
-	__u8	cmd,
-	int	ctlr,
-	void	*buff,
-	size_t	size,
-	unsigned int use_unit_num, /* 0: address the controller,
-				      1: address logical volume log_unit, 
-				      2: address is in scsi3addr */
-	unsigned int log_unit,
-	__u8	page_code,
-	unsigned char *scsi3addr,
-	int cmd_type);
-
 static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	size_t size,
-	unsigned int use_unit_num, /* 0: address the controller,
-				      1: address logical volume log_unit,
-				      2: periph device address is scsi3addr */
-	unsigned int log_unit, __u8 page_code, unsigned char *scsi3addr,
+	__u8 page_code, unsigned char *scsi3addr,
 	int cmd_type);
 
 static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
@@ -1616,7 +1599,7 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 			waittime = waittime * 2;
 
 		/* Send the Test Unit Ready */
-		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0, 0, 0,
+		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0,
 			lunaddr, TYPE_CMD);
 		if (rc == 0) {
 			rc = sendcmd_core(h, c);
@@ -1680,7 +1663,7 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 		return FAILED;
 	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 2, 0, 0, lunaddr,
+	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
 		TYPE_MSG);
 	/* sendcmd turned off interrupts on the board, turn 'em back on. */
 	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
@@ -1708,8 +1691,7 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 	cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
 	if (cmd_to_abort == NULL) /* paranoia */
 		return FAILED;
-	rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag, 
-		0, 2, 0, 0, 
+	rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag, 0, 0,
 		(unsigned char *) &cmd_to_abort->Header.LUN.LunAddrBytes[0], 
 		TYPE_MSG);
 	/* sendcmd turned off interrupts on the board, turn 'em back on. */
-- 
GitLab


From 3c2ab40296894d1f7ad9714550fdf9b96d4e9ee6 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:04:35 -0500
Subject: [PATCH 1682/2198] cciss: factor out fix target status processing code
 from sendcmd functions

Factor out code to process target status of completed commands in sendcmd()
and sendcmd_withirq_core(), and fix problem that bad target status was ignored in
sendcmd_withirq_core.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c | 56 ++++++++++++++++++++++++++-----------------
 1 file changed, 34 insertions(+), 22 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 885ea1e38e429..2d128831d3cae 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2267,6 +2267,31 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	return status;
 }
 
+static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
+{
+	switch (c->err_info->ScsiStatus) {
+	case SAM_STAT_GOOD:
+		return IO_OK;
+	case SAM_STAT_CHECK_CONDITION:
+		switch (0xf & c->err_info->SenseInfo[2]) {
+		case 0: return IO_OK; /* no sense */
+		case 1: return IO_OK; /* recovered error */
+		default:
+			printk(KERN_WARNING "cciss%d: cmd 0x%02x "
+				"check condition, sense key = 0x%02x\n",
+				h->ctlr, c->Request.CDB[0],
+				c->err_info->SenseInfo[2]);
+		}
+		break;
+	default:
+		printk(KERN_WARNING "cciss%d: cmd 0x%02x"
+			"scsi status = 0x%02x\n", h->ctlr,
+			c->Request.CDB[0], c->err_info->ScsiStatus);
+		break;
+	}
+	return IO_ERROR;
+}
+
 static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 {
 	DECLARE_COMPLETION_ONSTACK(wait);
@@ -2290,16 +2315,7 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 
 	switch (c->err_info->CommandStatus) {
 	case CMD_TARGET_STATUS:
-		printk(KERN_WARNING "cciss: cmd 0x%02x "
-		"has completed with errors\n", c->Request.CDB[0]);
-		if (c->err_info->ScsiStatus) {
-			printk(KERN_WARNING "cciss: cmd 0x%02x "
-			       "has SCSI Status = %x\n",
-			       c->Request.CDB[0], c->err_info->ScsiStatus);
-			if (c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION)
-				printk(KERN_WARNING "sense key = 0x%02x\n",
-					0xf & c->err_info->SenseInfo[2]);
-		}
+		return_status = check_target_status(h, c);
 		break;
 	case CMD_DATA_UNDERRUN:
 	case CMD_DATA_OVERRUN:
@@ -2710,33 +2726,29 @@ static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
 			printk(KERN_WARNING "cciss%d: retried %p too many "
 				"times\n", h->ctlr, c);
 			status = IO_ERROR;
-			goto cleanup1;
+			break;
 		}
 
 		if (c->err_info->CommandStatus == CMD_UNABORTABLE) {
 			printk(KERN_WARNING "cciss%d: command could not be "
 				"aborted.\n", h->ctlr);
 			status = IO_ERROR;
-			goto cleanup1;
+			break;
 		}
 
-		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
-		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
-			c->Request.CDB[0], c->err_info->CommandStatus);
 		if (c->err_info->CommandStatus == CMD_TARGET_STATUS) {
-			printk(KERN_WARNING "Target status = 0x%02x\n",
-			c->err_info->ScsiStatus);
-			if (c->err_info->ScsiStatus == 2) /* chk cond */
-				printk(KERN_WARNING "Sense key = 0x%02x\n",
-					0xf & c->err_info->SenseInfo[2]);
+			status = check_target_status(h, c);
+			break;
 		}
 
+		printk(KERN_WARNING "cciss%d: sendcmd error\n", h->ctlr);
+		printk(KERN_WARNING "cmd = 0x%02x, CommandStatus = 0x%02x\n",
+			c->Request.CDB[0], c->err_info->CommandStatus);
 		status = IO_ERROR;
-		goto cleanup1;
+		break;
 
 	} while (1);
 
-cleanup1:
 	/* unlock the data buffer from DMA */
 	buff_dma_handle.val32.lower = c->SG[0].Addr.lower;
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
-- 
GitLab


From 789a424ad1352b335960e7c56494d0410577fa61 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:05:56 -0500
Subject: [PATCH 1683/2198] cciss: separate error processing and command
 retrying code in sendcmd_withirq_core()

Separate the error processing from sendcmd_withirq_core from the code
which retries commands.  The rationale for this is that the SCSI error
handling code can then be made to use sendcmd_withirq_core, but avoid
retrying commands.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c | 77 +++++++++++++++++++++++++------------------
 drivers/block/cciss.h |  1 +
 2 files changed, 46 insertions(+), 32 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 2d128831d3cae..700170711061d 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2292,26 +2292,12 @@ static int check_target_status(ctlr_info_t *h, CommandList_struct *c)
 	return IO_ERROR;
 }
 
-static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
+static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c)
 {
-	DECLARE_COMPLETION_ONSTACK(wait);
-	u64bit buff_dma_handle;
-	unsigned long flags;
 	int return_status = IO_OK;
 
-resend_cmd2:
-	c->waiting = &wait;
-	/* Put the request on the tail of the queue and send it */
-	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
-	addQ(&h->reqQ, c);
-	h->Qdepth++;
-	start_io(h);
-	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
-
-	wait_for_completion(&wait);
-
-	if (c->err_info->CommandStatus == 0)
-		goto command_done;
+	if (c->err_info->CommandStatus == CMD_SUCCESS)
+		return IO_OK;
 
 	switch (c->err_info->CommandStatus) {
 	case CMD_TARGET_STATUS:
@@ -2322,7 +2308,7 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 		/* expected for inquiry and report lun commands */
 		break;
 	case CMD_INVALID:
-		printk(KERN_WARNING "cciss: Cmd 0x%02x is "
+		printk(KERN_WARNING "cciss: cmd 0x%02x is "
 		       "reported invalid\n", c->Request.CDB[0]);
 		return_status = IO_ERROR;
 		break;
@@ -2355,19 +2341,7 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 		printk(KERN_WARNING
 		       "cciss%d: unsolicited abort 0x%02x\n", h->ctlr,
 			c->Request.CDB[0]);
-		if (c->retry_count < MAX_CMD_RETRIES) {
-			printk(KERN_WARNING
-			       "cciss%d: retrying 0x%02x\n", h->ctlr,
-				c->Request.CDB[0]);
-			c->retry_count++;
-			/* erase the old error information */
-			memset(c->err_info, 0,
-			       sizeof(ErrorInfo_struct));
-			return_status = IO_OK;
-			INIT_COMPLETION(wait);
-			goto resend_cmd2;
-		}
-		return_status = IO_ERROR;
+		return_status = IO_NEEDS_RETRY;
 		break;
 	default:
 		printk(KERN_WARNING "cciss: cmd 0x%02x returned "
@@ -2375,6 +2349,44 @@ static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c)
 		       c->err_info->CommandStatus);
 		return_status = IO_ERROR;
 	}
+	return return_status;
+}
+
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
+	int attempt_retry)
+{
+	DECLARE_COMPLETION_ONSTACK(wait);
+	u64bit buff_dma_handle;
+	unsigned long flags;
+	int return_status = IO_OK;
+
+resend_cmd2:
+	c->waiting = &wait;
+	/* Put the request on the tail of the queue and send it */
+	spin_lock_irqsave(CCISS_LOCK(h->ctlr), flags);
+	addQ(&h->reqQ, c);
+	h->Qdepth++;
+	start_io(h);
+	spin_unlock_irqrestore(CCISS_LOCK(h->ctlr), flags);
+
+	wait_for_completion(&wait);
+
+	if (c->err_info->CommandStatus == 0 || !attempt_retry)
+		goto command_done;
+
+	return_status = process_sendcmd_error(h, c);
+
+	if (return_status == IO_NEEDS_RETRY &&
+		c->retry_count < MAX_CMD_RETRIES) {
+		printk(KERN_WARNING "cciss%d: retrying 0x%02x\n", h->ctlr,
+			c->Request.CDB[0]);
+		c->retry_count++;
+		/* erase the old error information */
+		memset(c->err_info, 0, sizeof(ErrorInfo_struct));
+		return_status = IO_OK;
+		INIT_COMPLETION(wait);
+		goto resend_cmd2;
+	}
 
 command_done:
 	/* unlock the buffers from DMA */
@@ -2399,7 +2411,8 @@ static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 	return_status = fill_cmd(c, cmd, ctlr, buff, size, page_code,
 		scsi3addr, cmd_type);
 	if (return_status == IO_OK)
-		return_status = sendcmd_withirq_core(h, c);
+		return_status = sendcmd_withirq_core(h, c, 1);
+
 	cmd_free(h, c, 0);
 	return return_status;
 }
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index dd1926d8cd971..25cd58e250219 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -11,6 +11,7 @@
 
 #define IO_OK		0
 #define IO_ERROR	1
+#define IO_NEEDS_RETRY  3
 
 #define VENDOR_LEN	8
 #define MODEL_LEN	16
-- 
GitLab


From 85cc61ae41084cb6d8ecc6c9e01ac4563005c8ac Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:07:45 -0500
Subject: [PATCH 1684/2198] cciss: change SCSI error handling routines to work
 with interrupts enabled.

Change cciss scsi error handling routines to work with interrupts enabled.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c      |  3 +++
 drivers/block/cciss_scsi.c | 24 +++++++++---------------
 2 files changed, 12 insertions(+), 15 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 700170711061d..9a9db6deefcd2 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -184,6 +184,9 @@ static int sendcmd(__u8 cmd, int ctlr, void *buff, size_t size,
 static int sendcmd_withirq(__u8 cmd, int ctlr, void *buff, size_t size,
 			__u8 page_code, unsigned char scsi3addr[],
 			int cmd_type);
+static int sendcmd_withirq_core(ctlr_info_t *h, CommandList_struct *c,
+	int attempt_retry);
+static int process_sendcmd_error(ctlr_info_t *h, CommandList_struct *c);
 
 static void fail_all_cmds(unsigned long ctlr);
 static int scan_thread(void *data);
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index 007ab8aea85bf..d2eb489774100 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -49,8 +49,6 @@ static int fill_cmd(CommandList_struct *c, __u8 cmd, int ctlr, void *buff,
 	__u8 page_code, unsigned char *scsi3addr,
 	int cmd_type);
 
-static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c);
-
 static CommandList_struct *cmd_alloc(ctlr_info_t *h, int get_from_pool);
 static void cmd_free(ctlr_info_t *h, CommandList_struct *c, int got_from_pool);
 
@@ -1601,11 +1599,10 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 		/* Send the Test Unit Ready */
 		rc = fill_cmd(c, TEST_UNIT_READY, h->ctlr, NULL, 0, 0,
 			lunaddr, TYPE_CMD);
-		if (rc == 0) {
-			rc = sendcmd_core(h, c);
-			/* sendcmd turned off interrupts, turn 'em back on. */
-			h->access.set_intr_mask(h, CCISS_INTR_ON);
-		}
+		if (rc == 0)
+			rc = sendcmd_withirq_core(h, c, 0);
+
+		(void) process_sendcmd_error(h, c);
 
 		if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
 			break;
@@ -1663,10 +1660,8 @@ static int cciss_eh_device_reset_handler(struct scsi_cmnd *scsicmd)
 		return FAILED;
 	memcpy(lunaddr, &cmd_in_trouble->Header.LUN.LunAddrBytes[0], 8);
 	/* send a reset to the SCSI LUN which the command was sent to */
-	rc = sendcmd(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
+	rc = sendcmd_withirq(CCISS_RESET_MSG, ctlr, NULL, 0, 0, lunaddr,
 		TYPE_MSG);
-	/* sendcmd turned off interrupts on the board, turn 'em back on. */
-	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
 	if (rc == 0 && wait_for_device_to_become_ready(*c, lunaddr) == 0)
 		return SUCCESS;
 	printk(KERN_WARNING "cciss%d: resetting device failed.\n", ctlr);
@@ -1677,6 +1672,7 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 {
 	int rc;
 	CommandList_struct *cmd_to_abort;
+	unsigned char lunaddr[8];
 	ctlr_info_t **c;
 	int ctlr;
 
@@ -1691,11 +1687,9 @@ static int  cciss_eh_abort_handler(struct scsi_cmnd *scsicmd)
 	cmd_to_abort = (CommandList_struct *) scsicmd->host_scribble;
 	if (cmd_to_abort == NULL) /* paranoia */
 		return FAILED;
-	rc = sendcmd(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag, 0, 0,
-		(unsigned char *) &cmd_to_abort->Header.LUN.LunAddrBytes[0], 
-		TYPE_MSG);
-	/* sendcmd turned off interrupts on the board, turn 'em back on. */
-	(*c)->access.set_intr_mask(*c, CCISS_INTR_ON);
+	memcpy(lunaddr, &cmd_to_abort->Header.LUN.LunAddrBytes[0], 8);
+	rc = sendcmd_withirq(CCISS_ABORT_MSG, ctlr, &cmd_to_abort->Header.Tag,
+		0, 0, lunaddr, TYPE_MSG);
 	if (rc == 0)
 		return SUCCESS;
 	return FAILED;
-- 
GitLab


From 72f9f1324fc4cd450c92e4600a710231b0445c75 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:09:32 -0500
Subject: [PATCH 1685/2198] cciss: Remove no longer needed sendcmd reject
 processing code

Now that the cciss SCSI error handling routines operate with interrupts
enabled, we no longer need to maintain the list of command completions that
sendcmd() might inadvertantly scoop up, since now it only runs at driver init
time, and there won't be any other commands for it to scoop up.  So we
can remove that list and the code that adds to it and processes it.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss.c | 98 ++-----------------------------------------
 drivers/block/cciss.h |  9 ----
 2 files changed, 3 insertions(+), 104 deletions(-)

diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9a9db6deefcd2..b22cec97ea194 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -2612,51 +2612,6 @@ static unsigned long pollcomplete(int ctlr)
 	return 1;
 }
 
-static int add_sendcmd_reject(__u8 cmd, int ctlr, unsigned long complete)
-{
-	/* We get in here if sendcmd() is polling for completions
-	   and gets some command back that it wasn't expecting --
-	   something other than that which it just sent down.
-	   Ordinarily, that shouldn't happen, but it can happen when
-	   the scsi tape stuff gets into error handling mode, and
-	   starts using sendcmd() to try to abort commands and
-	   reset tape drives.  In that case, sendcmd may pick up
-	   completions of commands that were sent to logical drives
-	   through the block i/o system, or cciss ioctls completing, etc.
-	   In that case, we need to save those completions for later
-	   processing by the interrupt handler.
-	 */
-
-#ifdef CONFIG_CISS_SCSI_TAPE
-	struct sendcmd_reject_list *srl = &hba[ctlr]->scsi_rejects;
-
-	/* If it's not the scsi tape stuff doing error handling, (abort */
-	/* or reset) then we don't expect anything weird. */
-	if (cmd != CCISS_RESET_MSG && cmd != CCISS_ABORT_MSG) {
-#endif
-		printk(KERN_WARNING "cciss cciss%d: SendCmd "
-		       "Invalid command list address returned! (%lx)\n",
-		       ctlr, complete);
-		/* not much we can do. */
-#ifdef CONFIG_CISS_SCSI_TAPE
-		return 1;
-	}
-
-	/* We've sent down an abort or reset, but something else
-	   has completed */
-	if (srl->ncompletions >= (hba[ctlr]->nr_cmds + 2)) {
-		/* Uh oh.  No room to save it for later... */
-		printk(KERN_WARNING "cciss%d: Sendcmd: Invalid command addr, "
-		       "reject list overflow, command lost!\n", ctlr);
-		return 1;
-	}
-	/* Save it for later */
-	srl->complete[srl->ncompletions] = complete;
-	srl->ncompletions++;
-#endif
-	return 0;
-}
-
 /* Send command c to controller h and poll for it to complete.
  * Turns interrupts off on the board.  Used at driver init time
  * and during SCSI error recovery.
@@ -2701,11 +2656,10 @@ static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
 			break;
 		}
 
-		/* If it's not the cmd we're looking for, save it for later */
+		/* Make sure it's the command we're expecting. */
 		if ((complete & ~CISS_ERROR_BIT) != c->busaddr) {
-			if (add_sendcmd_reject(c->Request.CDB[0],
-				h->ctlr, complete) != 0)
-				BUG(); /* we are hosed if we get here. */
+			printk(KERN_WARNING "cciss%d: Unexpected command "
+				"completion.\n", h->ctlr);
 			continue;
 		}
 
@@ -2770,11 +2724,6 @@ static int sendcmd_core(ctlr_info_t *h, CommandList_struct *c)
 	buff_dma_handle.val32.upper = c->SG[0].Addr.upper;
 	pci_unmap_single(h->pdev, (dma_addr_t) buff_dma_handle.val,
 			 c->SG[0].Len, PCI_DMA_BIDIRECTIONAL);
-#ifdef CONFIG_CISS_SCSI_TAPE
-	/* if we saved some commands for later, process them now. */
-	if (h->scsi_rejects.ncompletions > 0)
-		do_cciss_intr(0, h);
-#endif
 	return status;
 }
 
@@ -3195,44 +3144,18 @@ static void do_cciss_request(struct request_queue *q)
 
 static inline unsigned long get_next_completion(ctlr_info_t *h)
 {
-#ifdef CONFIG_CISS_SCSI_TAPE
-	/* Any rejects from sendcmd() lying around? Process them first */
-	if (h->scsi_rejects.ncompletions == 0)
-		return h->access.command_completed(h);
-	else {
-		struct sendcmd_reject_list *srl;
-		int n;
-		srl = &h->scsi_rejects;
-		n = --srl->ncompletions;
-		/* printk("cciss%d: processing saved reject\n", h->ctlr); */
-		printk("p");
-		return srl->complete[n];
-	}
-#else
 	return h->access.command_completed(h);
-#endif
 }
 
 static inline int interrupt_pending(ctlr_info_t *h)
 {
-#ifdef CONFIG_CISS_SCSI_TAPE
-	return (h->access.intr_pending(h)
-		|| (h->scsi_rejects.ncompletions > 0));
-#else
 	return h->access.intr_pending(h);
-#endif
 }
 
 static inline long interrupt_not_for_us(ctlr_info_t *h)
 {
-#ifdef CONFIG_CISS_SCSI_TAPE
-	return (((h->access.intr_pending(h) == 0) ||
-		 (h->interrupts_enabled == 0))
-		&& (h->scsi_rejects.ncompletions == 0));
-#else
 	return (((h->access.intr_pending(h) == 0) ||
 		 (h->interrupts_enabled == 0)));
-#endif
 }
 
 static irqreturn_t do_cciss_intr(int irq, void *dev_id)
@@ -4054,15 +3977,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 		printk(KERN_ERR "cciss: out of memory");
 		goto clean4;
 	}
-#ifdef CONFIG_CISS_SCSI_TAPE
-	hba[i]->scsi_rejects.complete =
-	    kmalloc(sizeof(hba[i]->scsi_rejects.complete[0]) *
-		    (hba[i]->nr_cmds + 5), GFP_KERNEL);
-	if (hba[i]->scsi_rejects.complete == NULL) {
-		printk(KERN_ERR "cciss: out of memory");
-		goto clean4;
-	}
-#endif
 	spin_lock_init(&hba[i]->lock);
 
 	/* Initialize the pdev driver private data.
@@ -4122,9 +4036,6 @@ static int __devinit cciss_init_one(struct pci_dev *pdev,
 
 clean4:
 	kfree(inq_buff);
-#ifdef CONFIG_CISS_SCSI_TAPE
-	kfree(hba[i]->scsi_rejects.complete);
-#endif
 	kfree(hba[i]->cmd_pool_bits);
 	if (hba[i]->cmd_pool)
 		pci_free_consistent(hba[i]->pdev,
@@ -4242,9 +4153,6 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev)
 	pci_free_consistent(hba[i]->pdev, hba[i]->nr_cmds * sizeof(ErrorInfo_struct),
 			    hba[i]->errinfo_pool, hba[i]->errinfo_pool_dhandle);
 	kfree(hba[i]->cmd_pool_bits);
-#ifdef CONFIG_CISS_SCSI_TAPE
-	kfree(hba[i]->scsi_rejects.complete);
-#endif
 	/*
 	 * Deliberately omit pci_disable_device(): it does something nasty to
 	 * Smart Array controllers that pci_enable_device does not undo
diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h
index 25cd58e250219..06a5db25b298e 100644
--- a/drivers/block/cciss.h
+++ b/drivers/block/cciss.h
@@ -53,14 +53,6 @@ typedef struct _drive_info_struct
 	char rev[REV_LEN + 1];       /* SCSI revision string */
 } drive_info_struct;
 
-#ifdef CONFIG_CISS_SCSI_TAPE
-
-struct sendcmd_reject_list {
-	int ncompletions;
-	unsigned long *complete; /* array of NR_CMDS tags */
-};
-
-#endif
 struct ctlr_info 
 {
 	int	ctlr;
@@ -128,7 +120,6 @@ struct ctlr_info
 	void *scsi_ctlr; /* ptr to structure containing scsi related stuff */
 	/* list of block side commands the scsi error handling sucked up */
 	/* and saved for later processing */
-	struct sendcmd_reject_list scsi_rejects;
 #endif
 	unsigned char alive;
 	struct completion *rescan_wait;
-- 
GitLab


From 3969251b80a7b143d01576780073fe0cc9ef6253 Mon Sep 17 00:00:00 2001
From: "scameron@beardog.cca.cpqcorp.net" <scameron@beardog.cca.cpqcorp.net>
Date: Mon, 8 Jun 2009 16:10:57 -0500
Subject: [PATCH 1686/2198] cciss: decode unit attention in SCSI error handling
 code

Make SCSI reset error handler decode unit attention ASC
and after a target reset wait for a unit attention that indicates
a reset occurred rather than just for any old unit attention.

Signed-off-by: Stephen M. Cameron <scameron@beardog.cca.cpqcorp.net>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/cciss_scsi.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index d2eb489774100..3315268b4ec7d 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -1604,16 +1604,25 @@ static int wait_for_device_to_become_ready(ctlr_info_t *h,
 
 		(void) process_sendcmd_error(h, c);
 
-		if (rc == 0 && c->err_info->CommandStatus == CMD_SUCCESS)
-			break;
+		if (rc != 0)
+			goto retry_tur;
 
-		if (rc == 0 &&
-			c->err_info->CommandStatus == CMD_TARGET_STATUS &&
-			c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION &&
-			(c->err_info->SenseInfo[2] == NO_SENSE ||
-			c->err_info->SenseInfo[2] == UNIT_ATTENTION))
+		if (c->err_info->CommandStatus == CMD_SUCCESS)
 			break;
 
+		if (c->err_info->CommandStatus == CMD_TARGET_STATUS &&
+			c->err_info->ScsiStatus == SAM_STAT_CHECK_CONDITION) {
+			if (c->err_info->SenseInfo[2] == NO_SENSE)
+				break;
+			if (c->err_info->SenseInfo[2] == UNIT_ATTENTION) {
+				unsigned char asc;
+				asc = c->err_info->SenseInfo[12];
+				check_for_unit_attention(h, c);
+				if (asc == POWER_OR_RESET)
+					break;
+			}
+		}
+retry_tur:
 		printk(KERN_WARNING "cciss%d: Waiting %d secs "
 			"for device to become ready.\n",
 			h->ctlr, waittime / HZ);
-- 
GitLab


From 9df1bb9b516daeece159ab7fb262d01a0359247c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 9 Jun 2009 06:22:57 +0200
Subject: [PATCH 1687/2198] Revert "block: Fix bounce limit setting in DM"

This reverts commit a05c0205ba031c01bba33a21bf0a35920eb64833.

DM doesn't need to access the bounce_pfn directly.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 17 -----------------
 drivers/md/dm-table.c  |  2 +-
 include/linux/blkdev.h |  1 -
 3 files changed, 1 insertion(+), 19 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9acd0b7e802a7..8d33934928918 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -193,23 +193,6 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
-/**
- * blk_queue_bounce_pfn - set the bounce buffer limit for queue
- * @q: the request queue for the device
- * @pfn: max address
- *
- * Description:
- *    This function is similar to blk_queue_bounce_limit except it
- *    neither changes allocation flags, nor does it set up the ISA DMA
- *    pool. This function should only be used by stacking drivers.
- *    Hardware drivers should use blk_queue_bounce_limit instead.
- */
-void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn)
-{
-	q->limits.bounce_pfn = pfn;
-}
-EXPORT_SYMBOL(blk_queue_bounce_pfn);
-
 /**
  * blk_queue_max_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ca1604ddd5ca..e9a73bb242b09 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_segment_size(q, t->limits.max_segment_size);
 	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
 	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_pfn(q, t->limits.bounce_pfn);
+	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 989aa1790f48d..5e740a135e733 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,7 +910,6 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
-extern void blk_queue_bounce_pfn(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-- 
GitLab


From 77634f33d4078542cf1087995cced0ffdae25aa2 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Tue, 9 Jun 2009 06:23:22 +0200
Subject: [PATCH 1688/2198] block: Add missing bounce_pfn stacking and fix
 comments

DM no longer needs to set limits explicitly when calling blk_stack_limits.
Let the latter automatically deal with bounce_pfn scaling.

Fix kerneldoc variable names.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d33934928918..1c4df9bf68136 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -445,7 +445,7 @@ EXPORT_SYMBOL(blk_queue_stack_limits);
 /**
  * blk_stack_limits - adjust queue_limits for stacked devices
  * @t:	the stacking driver limits (top)
- * @bdev:  the underlying queue limits (bottom)
+ * @b:  the underlying queue limits (bottom)
  * @offset:  offset to beginning of data within component device
  *
  * Description:
@@ -458,6 +458,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 {
 	t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
 	t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
+	t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
 
 	t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
 					    b->seg_boundary_mask);
@@ -504,7 +505,7 @@ EXPORT_SYMBOL(blk_stack_limits);
 
 /**
  * disk_stack_limits - adjust queue limits for stacked drivers
- * @t:	MD/DM gendisk (top)
+ * @disk:  MD/DM gendisk (top)
  * @bdev:  the underlying block device (bottom)
  * @offset:  offset to beginning of data within component device
  *
-- 
GitLab


From 29150078d7a1758df8c7a6cd2ec066ac65e1fab9 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 9 Jun 2009 10:54:18 +0200
Subject: [PATCH 1689/2198] amd-iommu: remove BUS_NOTIFY_BOUND_DRIVER handling

Handling this event causes device assignment in KVM to fail because the
device gets re-attached as soon as the pci-stub registers as the driver
for the device.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 8510e90ebfecf..81872604eb762 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1145,17 +1145,6 @@ static int device_change_notifier(struct notifier_block *nb,
 			  "to a non-dma-ops domain\n", dev_name(dev));
 
 	switch (action) {
-	case BUS_NOTIFY_BOUND_DRIVER:
-		if (domain)
-			goto out;
-		dma_domain = find_protection_domain(devid);
-		if (!dma_domain)
-			dma_domain = iommu->default_dom;
-		attach_device(iommu, &dma_domain->domain, devid);
-		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
-			    "%d for device %s\n",
-			    dma_domain->domain.id, dev_name(dev));
-		break;
 	case BUS_NOTIFY_UNBOUND_DRIVER:
 		if (!domain)
 			goto out;
-- 
GitLab


From 71ff3bca2f70264effe8cbbdd5bc10cf6be5f2f0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 8 Jun 2009 13:47:33 -0700
Subject: [PATCH 1690/2198] amd-iommu: detach device explicitly before
 attaching it to a new domain

This fixes a bug with a device that could not be assigned to a KVM guest
because it is still assigned to a dma_ops protection domain.

[chrisw: simply remove WARN_ON(), will always fire since dev->driver
will be pci-sub]

Signed-off-by: Chris Wright <chrisw@sous-sol.org>
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 81872604eb762..772e91088e405 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -2073,7 +2073,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 
 	old_domain = domain_for_device(devid);
 	if (old_domain)
-		return -EBUSY;
+		detach_device(old_domain, devid);
 
 	attach_device(iommu, domain, devid);
 
-- 
GitLab


From e9a22a13c71986851e931bdfa054f68839ff8576 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Tue, 9 Jun 2009 12:00:37 +0200
Subject: [PATCH 1691/2198] amd-iommu: remove unnecessary "AMD IOMMU: " prefix

That prefix is already included in the DUMP_printk macro. So there is no
need to repeat it in the format string.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 772e91088e405..1c60554537c35 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1266,9 +1266,8 @@ static int get_device_resources(struct device *dev,
 			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
 		attach_device(*iommu, *domain, *bdf);
-		DUMP_printk(KERN_INFO "AMD IOMMU: Using protection domain "
-				"%d for device %s\n",
-				(*domain)->id, dev_name(dev));
+		DUMP_printk("Using protection domain %d for device %s\n",
+			    (*domain)->id, dev_name(dev));
 	}
 
 	if (domain_for_device(_bdf) == NULL)
-- 
GitLab


From 1d589bb16b825b3a7b4edd34d997f1f1f953033d Mon Sep 17 00:00:00 2001
From: john cooper <john.cooper@redhat.com>
Date: Tue, 9 Jun 2009 14:41:40 +0200
Subject: [PATCH 1692/2198] Add serial number support for virtio_blk, V4a

This patch extracts the opaque data from pci i/o
region 0 via the added VIRTIO_BLK_F_IDENTIFY
field.  By convention this data takes the form of
that returned by an ATA IDENTIFY DEVICE command,
however the driver (except for structure size)
makes no interpretation of the data.  The structure
data is copied wholesale to userspace via a
HDIO_GET_IDENTITY ioctl command (eg: hdparm -i <dev>).

Signed-off-by: john cooper <john.cooper@redhat.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/virtio_blk.c | 37 ++++++++++++++++++++++++++++++++++---
 include/linux/virtio_blk.h |  4 ++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c4845b1694640..c0facaa55cf46 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -171,11 +171,43 @@ static void do_virtblk_request(struct request_queue *q)
 		vblk->vq->vq_ops->kick(vblk->vq);
 }
 
+/* return ATA identify data
+ */
+static int virtblk_identify(struct gendisk *disk, void *argp)
+{
+	struct virtio_blk *vblk = disk->private_data;
+	void *opaque;
+	int err = -ENOMEM;
+
+	opaque = kmalloc(VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+	if (!opaque)
+		goto out;
+
+	err = virtio_config_buf(vblk->vdev, VIRTIO_BLK_F_IDENTIFY,
+		offsetof(struct virtio_blk_config, identify), opaque,
+		VIRTIO_BLK_ID_BYTES);
+
+	if (err)
+		goto out_kfree;
+
+	if (copy_to_user(argp, opaque, VIRTIO_BLK_ID_BYTES))
+		err = -EFAULT;
+
+out_kfree:
+	kfree(opaque);
+out:
+	return err;
+}
+
 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 			 unsigned cmd, unsigned long data)
 {
 	struct gendisk *disk = bdev->bd_disk;
 	struct virtio_blk *vblk = disk->private_data;
+	void __user *argp = (void __user *)data;
+
+	if (cmd == HDIO_GET_IDENTITY)
+		return virtblk_identify(disk, argp);
 
 	/*
 	 * Only allow the generic SCSI ioctls if the host can support it.
@@ -183,8 +215,7 @@ static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
 	if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
 		return -ENOIOCTLCMD;
 
-	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
-			      (void __user *)data);
+	return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, argp);
 }
 
 /* We provide getgeo only to please some old bootloader/partitioning tools */
@@ -390,7 +421,7 @@ static struct virtio_device_id id_table[] = {
 static unsigned int features[] = {
 	VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
 	VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
-	VIRTIO_BLK_F_SCSI,
+	VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_IDENTIFY
 };
 
 static struct virtio_driver virtio_blk = {
diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h
index 4dbcbc1c3481c..be7d255fc7cfb 100644
--- a/include/linux/virtio_blk.h
+++ b/include/linux/virtio_blk.h
@@ -16,6 +16,9 @@
 #define VIRTIO_BLK_F_RO		5	/* Disk is read-only */
 #define VIRTIO_BLK_F_BLK_SIZE	6	/* Block size of disk is available*/
 #define VIRTIO_BLK_F_SCSI	7	/* Supports scsi command passthru */
+#define VIRTIO_BLK_F_IDENTIFY	8	/* ATA IDENTIFY supported */
+
+#define VIRTIO_BLK_ID_BYTES	(sizeof(__u16[256]))	/* IDENTIFY DATA */
 
 struct virtio_blk_config
 {
@@ -33,6 +36,7 @@ struct virtio_blk_config
 	} geometry;
 	/* block size of device (if VIRTIO_BLK_F_BLK_SIZE) */
 	__u32 blk_size;
+	__u8 identify[VIRTIO_BLK_ID_BYTES];
 } __attribute__((packed));
 
 /* These two define direction. */
-- 
GitLab


From 42937e81a82b6bbc51a309c83da140b3a7ca5945 Mon Sep 17 00:00:00 2001
From: Andreas Herrmann <andreas.herrmann3@amd.com>
Date: Mon, 8 Jun 2009 15:55:09 +0200
Subject: [PATCH 1693/2198] x86: Detect use of extended APIC ID for AMD CPUs

Booting a 32-bit kernel on Magny-Cours results in the following panic:

  ...
  Using APIC driver default
  ...
  Overriding APIC driver with bigsmp
  ...
  Getting VERSION: 80050010
  Getting VERSION: 80050010
  Getting ID: 10000000
  Getting ID: ef000000
  Getting LVT0: 700
  Getting LVT1: 10000
  Kernel panic - not syncing: Boot APIC ID in local APIC unexpected (16 vs 0)
  Pid: 1, comm: swapper Not tainted 2.6.30-rcX #2
  Call Trace:
   [<c05194da>] ? panic+0x38/0xd3
   [<c0743102>] ? native_smp_prepare_cpus+0x259/0x31f
   [<c073b19d>] ? kernel_init+0x3e/0x141
   [<c073b15f>] ? kernel_init+0x0/0x141
   [<c020325f>] ? kernel_thread_helper+0x7/0x10

The reason is that default_get_apic_id handled extension of local APIC
ID field just in case of XAPIC.

Thus for this AMD CPU, default_get_apic_id() returns 0 and
bigsmp_get_apic_id() returns 16 which leads to the respective kernel
panic.

This patch introduces a Linux specific feature flag to indicate
support for extended APIC id (8 bits instead of 4 bits width) and sets
the flag on AMD CPUs if applicable.

Signed-off-by: Andreas Herrmann <andreas.herrmann3@amd.com>
Cc: <stable@kernel.org>
LKML-Reference: <20090608135509.GA12431@alberich.amd.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apic.h       |  2 +-
 arch/x86/include/asm/cpufeature.h |  1 +
 arch/x86/kernel/cpu/amd.c         | 10 ++++++++++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 42f2f83774224..9b2c04910e0de 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -410,7 +410,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
 {
 	unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
 
-	if (APIC_XAPIC(ver))
+	if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
 		return (x >> 24) & 0xFF;
 	else
 		return (x >> 24) & 0x0F;
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397aad..78dee4f0f7aaa 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -94,6 +94,7 @@
 #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC	(3*32+24) /* TSC does not stop in C states */
 #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID	(3*32+26) /* has extended APICID (8 bits) */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 7e4a459daa644..0802e151c2c9e 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -6,6 +6,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
+#include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
 		    (c->x86_model == 8 && c->x86_mask >= 8))
 			set_cpu_cap(c, X86_FEATURE_K6_MTRR);
 #endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+	/* check CPU config space for extended APIC ID */
+	if (c->x86 >= 0xf) {
+		unsigned int val;
+		val = read_pci_config(0, 24, 0, 0x68);
+		if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+			set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+	}
+#endif
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
-- 
GitLab


From 0eab928221bac8895a0b494a16a8810002bd8645 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Tue, 9 Jun 2009 09:54:40 -0400
Subject: [PATCH 1694/2198] ext4: Don't treat a truncation of a zero-length
 file as replace-via-truncate

If a non-existent file is opened via O_WRONLY|O_CREAT|O_TRUNC, there's
no need to treat this as a true file truncation, so we shouldn't
activate the replace-via-truncate hueristic.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c10d346f7a31..875db944b22f1 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4127,7 +4127,8 @@ void ext4_truncate(struct inode *inode)
 	if (!ext4_can_truncate(inode))
 		return;
 
-	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+	if (ei->i_disksize && inode->i_size == 0 &&
+	    !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
 	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
-- 
GitLab


From fecc8ac8496fce96069724f54daba8e7078b0082 Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Tue, 9 Jun 2009 21:15:53 +0800
Subject: [PATCH 1695/2198] perf_counter, x86: Correct some event and umask
 values for Intel processors

Correct some event and UMASK values according to Intel SDM,
in the Nehalem and Atom tables.

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090609131553.GA12489@ywang-moblin2.bj.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 56001feeffcd2..40978aac6e0f6 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -119,7 +119,7 @@ static const u64 nehalem_hw_cache_event_ids
  },
  [ C(L1I ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0480, /* L1I.READS                    */
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */
 		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */
 	},
 	[ C(OP_WRITE) ] = {
@@ -162,7 +162,7 @@ static const u64 nehalem_hw_cache_event_ids
  [ C(ITLB) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */
-		[ C(RESULT_MISS)   ] = 0x0185, /* ITLB_MISS_RETIRED            */
+		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -291,7 +291,7 @@ static const u64 atom_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x2241, /* L1D_CACHE.ST               */
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */
 		[ C(RESULT_MISS)   ] = 0,
 	},
 	[ C(OP_PREFETCH) ] = {
@@ -301,8 +301,8 @@ static const u64 atom_hw_cache_event_ids
  },
  [ C(L1I ) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */
-		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */
+		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */
+		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */
 	},
 	[ C(OP_WRITE) ] = {
 		[ C(RESULT_ACCESS) ] = -1,
@@ -329,11 +329,11 @@ static const u64 atom_hw_cache_event_ids
  },
  [ C(DTLB) ] = {
 	[ C(OP_READ) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */
+		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */
 		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */
 	},
 	[ C(OP_WRITE) ] = {
-		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */
+		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */
 		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */
 	},
 	[ C(OP_PREFETCH) ] = {
-- 
GitLab


From 0b8c3d5ab000c22889af7f9409799a6cdc31a2b2 Mon Sep 17 00:00:00 2001
From: Chuck Ebbert <cebbert@redhat.com>
Date: Tue, 9 Jun 2009 10:40:50 -0400
Subject: [PATCH 1696/2198] x86: Clear TS in irq_ts_save() when in an atomic
 section

The dynamic FPU context allocation changes caused the padlock driver
to generate the below warning. Fix it by masking TS when doing padlock
encryption operations in an atomic section.

This solves:

BUG: sleeping function called from invalid context at mm/slub.c:1602
in_atomic(): 1, irqs_disabled(): 0, pid: 82, name: cryptomgr_test
Pid: 82, comm: cryptomgr_test Not tainted 2.6.29.4-168.test7.fc11.x86_64 #1
Call Trace:
[<ffffffff8103ff16>] __might_sleep+0x10b/0x110
[<ffffffff810cd3b2>] kmem_cache_alloc+0x37/0xf1
[<ffffffff81018505>] init_fpu+0x49/0x8a
[<ffffffff81012a83>] math_state_restore+0x3e/0xbc
[<ffffffff813ac6d0>] do_device_not_available+0x9/0xb
[<ffffffff810123ab>] device_not_available+0x1b/0x20
[<ffffffffa001c066>] ? aes_crypt+0x66/0x74 [padlock_aes]
[<ffffffff8119a51a>] ? blkcipher_walk_next+0x257/0x2e0
[<ffffffff8119a731>] ? blkcipher_walk_first+0x18e/0x19d
[<ffffffffa001c1fe>] aes_encrypt+0x9d/0xe5 [padlock_aes]
[<ffffffffa0027253>] crypt+0x6b/0x114 [xts]
[<ffffffffa001c161>] ? aes_encrypt+0x0/0xe5 [padlock_aes]
[<ffffffffa001c161>] ? aes_encrypt+0x0/0xe5 [padlock_aes]
[<ffffffffa0027390>] encrypt+0x49/0x4b [xts]
[<ffffffff81199acc>] async_encrypt+0x3c/0x3e
[<ffffffff8119dafc>] test_skcipher+0x1da/0x658
[<ffffffff811979c3>] ? crypto_spawn_tfm+0x8e/0xb1
[<ffffffff8119672d>] ? __crypto_alloc_tfm+0x11b/0x15f
[<ffffffff811979c3>] ? crypto_spawn_tfm+0x8e/0xb1
[<ffffffff81199dbe>] ? skcipher_geniv_init+0x2b/0x47
[<ffffffff8119a905>] ? async_chainiv_init+0x5c/0x61
[<ffffffff8119dfdd>] alg_test_skcipher+0x63/0x9b
[<ffffffff8119e1bc>] alg_test+0x12d/0x175
[<ffffffff8119c488>] cryptomgr_test+0x38/0x54
[<ffffffff8119c450>] ? cryptomgr_test+0x0/0x54
[<ffffffff8105c6c9>] kthread+0x4d/0x78
[<ffffffff8101264a>] child_rip+0xa/0x20
[<ffffffff81011f67>] ? restore_args+0x0/0x30
[<ffffffff8105c67c>] ? kthread+0x0/0x78
[<ffffffff81012640>] ? child_rip+0x0/0x20

Signed-off-by: Chuck Ebbert <cebbert@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
LKML-Reference: <20090609104050.50158cfe@dhcp-100-2-144.bos.redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/i387.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 71c9e51839827..4aab52f8e41a3 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -305,18 +305,18 @@ static inline void kernel_fpu_end(void)
 /*
  * Some instructions like VIA's padlock instructions generate a spurious
  * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context aswell. To prevent these kernel instructions
- * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
  * should use them only in the context of irq_ts_save/restore()
  */
 static inline int irq_ts_save(void)
 {
 	/*
-	 * If we are in process context, we are ok to take a spurious DNA fault.
-	 * Otherwise, doing clts() in process context require pre-emption to
-	 * be disabled or some heavy lifting like kernel_fpu_begin()
+	 * If in process context and not atomic, we can take a spurious DNA fault.
+	 * Otherwise, doing clts() in process context requires disabling preemption
+	 * or some heavy lifting like kernel_fpu_begin()
 	 */
-	if (!in_interrupt())
+	if (!in_atomic())
 		return 0;
 
 	if (read_cr0() & X86_CR0_TS) {
-- 
GitLab


From f57a8a1911342265e7acdc190333c4e9235a6632 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 5 Jun 2009 14:11:30 -0400
Subject: [PATCH 1697/2198] ring-buffer: fix ret in rb_add_time_stamp

The update of ret got mistakenly added to the if statement of
rb_try_to_discard. The variable ret should be 1 on commit and zero
otherwise.

[ Impact: fix compiler warning and real bug ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 22878b0d370cf..2e642b2b7253d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1433,8 +1433,8 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 			/* Darn, this is just wasted space */
 			event->time_delta = 0;
 			event->array[0] = 0;
-			ret = 0;
 		}
+		ret = 0;
 	}
 
 	*delta = 0;
-- 
GitLab


From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Jun 2009 13:43:05 +0800
Subject: [PATCH 1698/2198] tracing/events: convert block trace points to
 TRACE_EVENT()

TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions
  ...

Cons:

  - no dev_t info for the output of plug, unplug_timer and unplug_io events.
    no dev_t info for getrq and sleeprq events if bio == NULL.
    no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.

    This is mainly because we can't get the deivce from a request queue.
    But this may change in the future.

  - A packet command is converted to a string in TP_assign, not TP_print.
    While blktrace do the convertion just before output.

    Since pc requests should be rather rare, this is not a big issue.

  - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT
    has a unique format, which means we have some unused data in a trace entry.

    The overhead is minimized by using __dynamic_array() instead of __array().

I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing:

      dd                   dd + ioctl blktrace       dd + TRACE_EVENT (splice)
1     7.36s, 42.7 MB/s     7.50s, 42.0 MB/s          7.41s, 42.5 MB/s
2     7.43s, 42.3 MB/s     7.48s, 42.1 MB/s          7.43s, 42.4 MB/s
3     7.38s, 42.6 MB/s     7.45s, 42.2 MB/s          7.41s, 42.5 MB/s

So the overhead of tracing is very small, and no regression when using
those trace events vs blktrace.

And the binary output of TRACE_EVENT is much smaller than blktrace:

 # ls -l -h
 -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0
 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1
 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out

Following are some comparisons between TRACE_EVENT and blktrace:

plug:
  kjournald-480   [000]   303.084981: block_plug: [kjournald]
  kjournald-480   [000]   303.084981:   8,0    P   N [kjournald]

unplug_io:
  kblockd/0-118   [000]   300.052973: block_unplug_io: [kblockd/0] 1
  kblockd/0-118   [000]   300.052974:   8,0    U   N [kblockd/0] 1

remap:
  kjournald-480   [000]   303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384
  kjournald-480   [000]   303.085043:   8,0    A   W 102736992 + 8 <- (8,8) 33384

bio_backmerge:
  kjournald-480   [000]   303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
  kjournald-480   [000]   303.085086:   8,0    M   W 102737032 + 8 [kjournald]

getrq:
  kjournald-480   [000]   303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084975:   8,0    G   W 102736984 + 8 [kjournald]

  bash-2066  [001]  1072.953770:   8,0    G   N [bash]
  bash-2066  [001]  1072.953773: block_getrq: 0,0 N 0 + 0 [bash]

rq_complete:
  konsole-2065  [001]   300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
  konsole-2065  [001]   300.053191:   8,0    C   W 103669040 + 16 [0]

  ksoftirqd/1-7   [001]  1072.953811:   8,0    C   N (5a 00 08 00 00 00 00 00 24 00) [0]
  ksoftirqd/1-7   [001]  1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]

rq_insert:
  kjournald-480   [000]   303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084986:   8,0    I   W 102736984 + 8 [kjournald]

Changelog from v2 -> v3:

- use the newly introduced __dynamic_array().

Changelog from v1 -> v2:

- use __string() instead of __array() to minimize the memory required
  to store hex dump of rq->cmd().

- support large pc requests.

- add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT.

- some cleanups.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 block/blk-core.c             |  16 +-
 block/elevator.c             |   8 +-
 drivers/md/dm.c              |   5 +-
 fs/bio.c                     |   3 +-
 include/linux/blktrace_api.h |  13 +
 include/trace/block.h        |  76 ------
 include/trace/events/block.h | 483 +++++++++++++++++++++++++++++++++++
 kernel/trace/Makefile        |   5 +-
 kernel/trace/blktrace.c      |  78 +++++-
 mm/bounce.c                  |   5 +-
 10 files changed, 588 insertions(+), 104 deletions(-)
 delete mode 100644 include/trace/block.h
 create mode 100644 include/trace/events/block.h

diff --git a/block/blk-core.c b/block/blk-core.c
index 1306de9cce046..9475bf99b891c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,22 +28,14 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
-#include <trace/block.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/block.h>
 
 #include "blk.h"
 
-DEFINE_TRACE(block_plug);
-DEFINE_TRACE(block_unplug_io);
-DEFINE_TRACE(block_unplug_timer);
-DEFINE_TRACE(block_getrq);
-DEFINE_TRACE(block_sleeprq);
-DEFINE_TRACE(block_rq_requeue);
-DEFINE_TRACE(block_bio_backmerge);
-DEFINE_TRACE(block_bio_frontmerge);
-DEFINE_TRACE(block_bio_queue);
-DEFINE_TRACE(block_rq_complete);
-DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
 
diff --git a/block/elevator.c b/block/elevator.c
index 7073a9072577c..e220f0c543e3c 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,17 +33,16 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
+#include <trace/events/block.h>
+
 #include "blk.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
-DEFINE_TRACE(block_rq_abort);
-
 /*
  * Merge hash stuff.
  */
@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e2ee4a79ea2ca..3fd8b1e65483d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,7 +20,8 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
+
+#include <trace/events/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -53,8 +54,6 @@ struct dm_target_io {
 	union map_info info;
 };
 
-DEFINE_TRACE(block_bio_complete);
-
 /*
  * For request-based dm.
  * One of these is allocated per request.
diff --git a/fs/bio.c b/fs/bio.c
index 98711647ece49..740699c4f90c5 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 82b4636030e9e..c7ec31dd04c9c 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
+#ifdef CONFIG_EVENT_TRACING
+
+static inline int blk_cmd_buf_len(struct request *rq)
+{
+	return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+}
+
+extern void blk_dump_cmd(char *buf, struct request *rq);
+extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif /* CONFIG_EVENT_TRACING */
+
 #endif /* __KERNEL__ */
 #endif
diff --git a/include/trace/block.h b/include/trace/block.h
deleted file mode 100644
index 5b12efa096b67..0000000000000
--- a/include/trace/block.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include <linux/blkdev.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(block_rq_abort,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
-	TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-	      TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
-	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from),
-	      TP_ARGS(q, bio, dev, from));
-
-#endif
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
new file mode 100644
index 0000000000000..a99d1e565bb0a
--- /dev/null
+++ b/include/trace/events/block.h
@@ -0,0 +1,483 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,         comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,		comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors	   = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned,	nr_sector	)
+		__field( int,		error		)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+        ),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			      bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+        ),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			    bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio,
+		 unsigned int new_sector),
+
+	TP_ARGS(q, bio, new_sector),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev				)
+		__field( sector_t,	sector				)
+		__field( sector_t,	new_sector			)
+		__array( char,		rwbs,		6		)
+		__array( char,		comm,		TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->new_sector	= new_sector;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu / %llu [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->new_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		 sector_t from),
+
+	TP_ARGS(q, bio, dev, from),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__field( dev_t,		old_dev		)
+		__field( sector_t,	old_sector	)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		__entry->old_dev	= dev;
+		__entry->old_sector	= from;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector,
+		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+		  __entry->old_sector)
+);
+
+#endif /* _TRACE_BLOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 06b85850fab49..844164dca90ae 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e3abf55bc8e57..7bd6a9893c247 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
@@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev)
 	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
 }
 
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+	int i, end;
+	int len = rq->cmd_len;
+	unsigned char *cmd = rq->cmd;
+
+	if (!blk_pc_request(rq)) {
+		buf[0] = '\0';
+		return;
+	}
+
+	for (end = len - 1; end >= 0; end--)
+		if (cmd[end])
+			break;
+	end++;
+
+	for (i = 0; i < len; i++) {
+		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+		if (i == end && end != len - 1) {
+			sprintf(buf, " ..");
+			break;
+		}
+	}
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+	int i = 0;
+
+	if (rw & WRITE)
+		rwbs[i++] = 'W';
+	else if (rw & 1 << BIO_RW_DISCARD)
+		rwbs[i++] = 'D';
+	else if (bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (rw & 1 << BIO_RW_AHEAD)
+		rwbs[i++] = 'A';
+	if (rw & 1 << BIO_RW_BARRIER)
+		rwbs[i++] = 'B';
+	if (rw & 1 << BIO_RW_SYNCIO)
+		rwbs[i++] = 'S';
+	if (rw & 1 << BIO_RW_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+	int rw = rq->cmd_flags & 0x03;
+	int bytes;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq))
+		bytes = rq->data_len;
+	else
+		bytes = rq->hard_nr_sectors << 9;
+
+	blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8f..65f5e17e411aa 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,16 +14,15 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/block.h>
+
 #define POOL_SIZE	64
 #define ISA_POOL_SIZE	16
 
 static mempool_t *page_pool, *isa_page_pool;
 
-DEFINE_TRACE(block_bio_bounce);
-
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-- 
GitLab


From 6556d1df88fe68f9836beeb43342a336691cb67c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 9 Jun 2009 14:04:26 -0400
Subject: [PATCH 1699/2198] tracing: fix the block trace points print size

The sector field is either u64 or unsigned long depending on
the arch. This patch casts the sector to unsigned long long to
prevent the printf warnings.

[ Impact: remove compile warnings ]

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/events/block.h | 45 ++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index a99d1e565bb0a..53effd496a504 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -37,7 +37,8 @@ TRACE_EVENT(block_rq_abort,
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->errors)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
 );
 
 TRACE_EVENT(block_rq_insert,
@@ -71,7 +72,8 @@ TRACE_EVENT(block_rq_insert,
 	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __entry->bytes, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_rq_issue,
@@ -105,7 +107,8 @@ TRACE_EVENT(block_rq_issue,
 	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __entry->bytes, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_rq_requeue,
@@ -137,7 +140,8 @@ TRACE_EVENT(block_rq_requeue,
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->errors)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
 );
 
 TRACE_EVENT(block_rq_complete,
@@ -169,7 +173,8 @@ TRACE_EVENT(block_rq_complete,
 	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  __entry->rwbs, __get_str(cmd),
-		  __entry->sector, __entry->nr_sector, __entry->errors)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->errors)
 );
 TRACE_EVENT(block_bio_bounce,
 
@@ -195,7 +200,8 @@ TRACE_EVENT(block_bio_bounce,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_bio_complete,
@@ -221,7 +227,8 @@ TRACE_EVENT(block_bio_complete,
 
 	TP_printk("%d,%d %s %llu + %u [%d]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->error)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->error)
 );
 
 TRACE_EVENT(block_bio_backmerge,
@@ -248,7 +255,8 @@ TRACE_EVENT(block_bio_backmerge,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_bio_frontmerge,
@@ -275,7 +283,8 @@ TRACE_EVENT(block_bio_frontmerge,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_bio_queue,
@@ -302,7 +311,8 @@ TRACE_EVENT(block_bio_queue,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_getrq,
@@ -330,7 +340,8 @@ TRACE_EVENT(block_getrq,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_sleeprq,
@@ -358,7 +369,8 @@ TRACE_EVENT(block_sleeprq,
 
 	TP_printk("%d,%d %s %llu + %u [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector, __entry->comm)
 );
 
 TRACE_EVENT(block_plug,
@@ -441,7 +453,9 @@ TRACE_EVENT(block_split,
 
 	TP_printk("%d,%d %s %llu / %llu [%s]",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->new_sector, __entry->comm)
+		  (unsigned long long)__entry->sector,
+		  (unsigned long long)__entry->new_sector,
+		  __entry->comm)
 );
 
 TRACE_EVENT(block_remap,
@@ -471,9 +485,10 @@ TRACE_EVENT(block_remap,
 
 	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
-		  __entry->sector, __entry->nr_sector,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector,
 		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
-		  __entry->old_sector)
+		  (unsigned long long)__entry->old_sector)
 );
 
 #endif /* _TRACE_BLOCK_H */
-- 
GitLab


From 725c624a58a10ef90a2ff889e122158fabf36147 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 8 Jun 2009 19:09:45 -0400
Subject: [PATCH 1700/2198] tracing: add trace_seq_vprint interface

The code to update the print formats for events requires a vprintf
format in the trace_seq. This patch adds that interface.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/trace_seq.h   |  2 ++
 kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index ba9627f00d3f0..c68bccba20743 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -27,6 +27,8 @@ trace_seq_init(struct trace_seq *s)
 #ifdef CONFIG_TRACING
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+	__attribute__ ((format (printf, 2, 0)));
 extern int
 trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
 extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 425725c1622db..c05aff465dc91 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -100,6 +100,38 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);
 
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
+
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
-- 
GitLab


From 110bf2b764eb6026b868d84499263cb24b1bcc8d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 9 Jun 2009 17:29:07 -0400
Subject: [PATCH 1701/2198] tracing: add protection around module events unload

When reading the trace buffer, there is a race that when a module
is unloaded it removes events that is stilled referenced in the buffers.
This patch adds the protection around the unloading of the events
from modules and the reading of the trace buffers.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_events.c |  4 +++-
 kernel/trace/trace_output.c | 15 ++++++++++++---
 kernel/trace/trace_output.h |  4 ++++
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 6c81f9c21426f..aa08be69a1b6c 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1050,12 +1050,13 @@ static void trace_module_remove_events(struct module *mod)
 	struct ftrace_event_call *call, *p;
 	bool found = false;
 
+	down_write(&trace_event_mutex);
 	list_for_each_entry_safe(call, p, &ftrace_events, list) {
 		if (call->mod == mod) {
 			found = true;
 			ftrace_event_enable_disable(call, 0);
 			if (call->event)
-				unregister_ftrace_event(call->event);
+				__unregister_ftrace_event(call->event);
 			debugfs_remove_recursive(call->dir);
 			list_del(&call->list);
 			trace_destroy_fields(call);
@@ -1079,6 +1080,7 @@ static void trace_module_remove_events(struct module *mod)
 	 */
 	if (found)
 		tracing_reset_current_online_cpus();
+	up_write(&trace_event_mutex);
 }
 
 static int trace_module_notify(struct notifier_block *self,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c05aff465dc91..7938f3ae93e3d 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
 /* must be a power of 2 */
 #define EVENT_HASHSIZE	128
 
-static DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
 
 DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
 EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
@@ -702,6 +702,16 @@ int register_ftrace_event(struct trace_event *event)
 }
 EXPORT_SYMBOL_GPL(register_ftrace_event);
 
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+	hlist_del(&event->node);
+	list_del(&event->list);
+	return 0;
+}
+
 /**
  * unregister_ftrace_event - remove a no longer used event
  * @event: the event to remove
@@ -709,8 +719,7 @@ EXPORT_SYMBOL_GPL(register_ftrace_event);
 int unregister_ftrace_event(struct trace_event *event)
 {
 	down_write(&trace_event_mutex);
-	hlist_del(&event->node);
-	list_del(&event->list);
+	__unregister_ftrace_event(event);
 	up_write(&trace_event_mutex);
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ac240e76eb012..d38bec4a9c308 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -27,6 +27,10 @@ extern struct trace_event *ftrace_find_event(int type);
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
 
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
 
-- 
GitLab


From 35f2c2f6f6ae13ef23c4f68e6d3073753077ca43 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Tue, 9 Jun 2009 17:48:56 +0900
Subject: [PATCH 1702/2198] nommu: Provide mmap_min_addr definition.

With the "security: use mmap_min_addr indepedently of security models"
change, mmap_min_addr is used in common areas, which susbsequently blows
up the nommu build. This stubs in the definition in the nommu case as
well.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>

--

 mm/nommu.c |    3 +++
 1 file changed, 3 insertions(+)
Signed-off-by: James Morris <jmorris@namei.org>
---
 mm/nommu.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/nommu.c b/mm/nommu.c
index b571ef707428c..2fd2ad5da98e5 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
 atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
-- 
GitLab


From 2b83868723d090078ac0e2120e06a1cc94dbaef0 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Tue, 9 Jun 2009 20:40:25 -0700
Subject: [PATCH 1703/2198] Make /dev/zero reads interruptible by signals

This helps with bad latencies for large reads from /dev/zero, but might
conceivably break some application that "knows" that a read of /dev/zero
cannot return early.  So do this early in the merge window to give us
maximal test coverage, even if the patch is totally trivial.

Obviously, no well-behaved application should ever depend on the read
being uninterruptible, but hey, bugs happen.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/mem.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 65e12bca657cf..f96d0bef855e3 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf,
 		written += chunk - unwritten;
 		if (unwritten)
 			break;
-		/* Consider changing this to just 'signal_pending()' with lots of testing */
-		if (fatal_signal_pending(current))
-			return written ? written : -EINTR;
+		if (signal_pending(current))
+			return written ? written : -ERESTARTSYS;
 		buf += chunk;
 		count -= chunk;
 		cond_resched();
-- 
GitLab


From c1d0d32a603ed06377f404adf2c538de33bb3634 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Wed, 10 Jun 2009 09:48:33 +0300
Subject: [PATCH 1704/2198] sh: plug vsyscall dir in to archclean.

The vsyscall targets are presently not cleaned up, so just handle it in
the archclean rule.

Reported-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Makefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/sh/Makefile b/arch/sh/Makefile
index 0fe35cff90f6b..75d049b03f7e0 100644
--- a/arch/sh/Makefile
+++ b/arch/sh/Makefile
@@ -205,6 +205,7 @@ archprepare: maketools
 
 archclean:
 	$(Q)$(MAKE) $(clean)=$(boot)
+	$(Q)$(MAKE) $(clean)=arch/sh/kernel/vsyscall
 
 define archhelp
 	@echo '* zImage 	           - Compressed kernel image'
-- 
GitLab


From 40bc9a27e00d6c8c7e4dc2865c02d7402a950472 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Jun 2009 09:09:40 +0100
Subject: [PATCH 1705/2198] GFS2: Fix cache coherency between truncate and
 O_DIRECT read

If a page was partially zeroed as the result of a truncate, then it was
not being correctly marked dirty. This resulted in the deleted data
reappearing if the file was read back via direct I/O.

Reported-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 1153a078920c2..329763530dc06 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1012,7 +1012,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping)
 		gfs2_trans_add_bh(ip->i_gl, bh, 0);
 
 	zero_user(page, offset, length);
-
+	mark_buffer_dirty(bh);
 unlock:
 	unlock_page(page);
 	page_cache_release(page);
-- 
GitLab


From 3e7c73e9b15eab73e9cf72daf3931925da8afcff Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 24 Feb 2009 21:46:19 +0200
Subject: [PATCH 1706/2198] KVM: VMX: Don't use highmem pages for the msr and
 pio bitmaps

Highmem pages are a pain, and saving three lowmem pages on i386 isn't worth
the extra code.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 59 ++++++++++++++++++++--------------------------
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bb481330716f5..b20c9e47e9257 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -111,9 +111,9 @@ static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 
-static struct page *vmx_io_bitmap_a;
-static struct page *vmx_io_bitmap_b;
-static struct page *vmx_msr_bitmap;
+static unsigned long *vmx_io_bitmap_a;
+static unsigned long *vmx_io_bitmap_b;
+static unsigned long *vmx_msr_bitmap;
 
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -2082,9 +2082,9 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
+static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
-	void *va;
+	int f = sizeof(unsigned long);
 
 	if (!cpu_has_vmx_msr_bitmap())
 		return;
@@ -2094,16 +2094,14 @@ static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
 	 * have the write-low and read-high bitmap offsets the wrong way round.
 	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
 	 */
-	va = kmap(msr_bitmap);
 	if (msr <= 0x1fff) {
-		__clear_bit(msr, va + 0x000); /* read-low */
-		__clear_bit(msr, va + 0x800); /* write-low */
+		__clear_bit(msr, msr_bitmap + 0x000 / f); /* read-low */
+		__clear_bit(msr, msr_bitmap + 0x800 / f); /* write-low */
 	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
 		msr &= 0x1fff;
-		__clear_bit(msr, va + 0x400); /* read-high */
-		__clear_bit(msr, va + 0xc00); /* write-high */
+		__clear_bit(msr, msr_bitmap + 0x400 / f); /* read-high */
+		__clear_bit(msr, msr_bitmap + 0xc00 / f); /* write-high */
 	}
-	kunmap(msr_bitmap);
 }
 
 /*
@@ -2121,11 +2119,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	u32 exec_control;
 
 	/* I/O */
-	vmcs_write64(IO_BITMAP_A, page_to_phys(vmx_io_bitmap_a));
-	vmcs_write64(IO_BITMAP_B, page_to_phys(vmx_io_bitmap_b));
+	vmcs_write64(IO_BITMAP_A, __pa(vmx_io_bitmap_a));
+	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, page_to_phys(vmx_msr_bitmap));
+		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap));
 
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
@@ -3695,20 +3693,19 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 static int __init vmx_init(void)
 {
-	void *va;
 	int r;
 
-	vmx_io_bitmap_a = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_io_bitmap_a)
 		return -ENOMEM;
 
-	vmx_io_bitmap_b = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_io_bitmap_b) {
 		r = -ENOMEM;
 		goto out;
 	}
 
-	vmx_msr_bitmap = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+	vmx_msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
 	if (!vmx_msr_bitmap) {
 		r = -ENOMEM;
 		goto out1;
@@ -3718,18 +3715,12 @@ static int __init vmx_init(void)
 	 * Allow direct access to the PC debug port (it is often used for I/O
 	 * delays, but the vmexits simply slow things down).
 	 */
-	va = kmap(vmx_io_bitmap_a);
-	memset(va, 0xff, PAGE_SIZE);
-	clear_bit(0x80, va);
-	kunmap(vmx_io_bitmap_a);
+	memset(vmx_io_bitmap_a, 0xff, PAGE_SIZE);
+	clear_bit(0x80, vmx_io_bitmap_a);
 
-	va = kmap(vmx_io_bitmap_b);
-	memset(va, 0xff, PAGE_SIZE);
-	kunmap(vmx_io_bitmap_b);
+	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
-	va = kmap(vmx_msr_bitmap);
-	memset(va, 0xff, PAGE_SIZE);
-	kunmap(vmx_msr_bitmap);
+	memset(vmx_msr_bitmap, 0xff, PAGE_SIZE);
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
@@ -3762,19 +3753,19 @@ static int __init vmx_init(void)
 	return 0;
 
 out2:
-	__free_page(vmx_msr_bitmap);
+	free_page((unsigned long)vmx_msr_bitmap);
 out1:
-	__free_page(vmx_io_bitmap_b);
+	free_page((unsigned long)vmx_io_bitmap_b);
 out:
-	__free_page(vmx_io_bitmap_a);
+	free_page((unsigned long)vmx_io_bitmap_a);
 	return r;
 }
 
 static void __exit vmx_exit(void)
 {
-	__free_page(vmx_msr_bitmap);
-	__free_page(vmx_io_bitmap_b);
-	__free_page(vmx_io_bitmap_a);
+	free_page((unsigned long)vmx_msr_bitmap);
+	free_page((unsigned long)vmx_io_bitmap_b);
+	free_page((unsigned long)vmx_io_bitmap_a);
 
 	kvm_exit();
 }
-- 
GitLab


From 5897297bc228fc3c85fdc421fd5c487f9a99821a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 24 Feb 2009 22:26:47 +0200
Subject: [PATCH 1707/2198] KVM: VMX: Don't intercept MSR_KERNEL_GS_BASE

Windows 2008 accesses this MSR often on context switch intensive workloads;
since we run in guest context with the guest MSR value loaded (so swapgs can
work correctly), we can simply disable interception of rdmsr/wrmsr for this
MSR.

A complication occurs since in legacy mode, we run with the host MSR value
loaded. In this case we enable interception.  This means we need two MSR
bitmaps, one for legacy mode and one for long mode.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 57 ++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b20c9e47e9257..b5eae7a00aad4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -113,7 +113,8 @@ static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
-static unsigned long *vmx_msr_bitmap;
+static unsigned long *vmx_msr_bitmap_legacy;
+static unsigned long *vmx_msr_bitmap_longmode;
 
 static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
 static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -812,6 +813,7 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
 static void setup_msrs(struct vcpu_vmx *vmx)
 {
 	int save_nmsrs;
+	unsigned long *msr_bitmap;
 
 	vmx_load_host_state(vmx);
 	save_nmsrs = 0;
@@ -847,6 +849,15 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		__find_msr_index(vmx, MSR_KERNEL_GS_BASE);
 #endif
 	vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
+
+	if (cpu_has_vmx_msr_bitmap()) {
+		if (is_long_mode(&vmx->vcpu))
+			msr_bitmap = vmx_msr_bitmap_longmode;
+		else
+			msr_bitmap = vmx_msr_bitmap_legacy;
+
+		vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
+	}
 }
 
 /*
@@ -2082,7 +2093,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	spin_unlock(&vmx_vpid_lock);
 }
 
-static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
+static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 {
 	int f = sizeof(unsigned long);
 
@@ -2104,6 +2115,13 @@ static void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, u32 msr)
 	}
 }
 
+static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+{
+	if (!longmode_only)
+		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy, msr);
+	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode, msr);
+}
+
 /*
  * Sets up the vmcs for emulated real mode.
  */
@@ -2123,7 +2141,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write64(IO_BITMAP_B, __pa(vmx_io_bitmap_b));
 
 	if (cpu_has_vmx_msr_bitmap())
-		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap));
+		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
 
 	vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
 
@@ -3705,12 +3723,18 @@ static int __init vmx_init(void)
 		goto out;
 	}
 
-	vmx_msr_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
-	if (!vmx_msr_bitmap) {
+	vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_legacy) {
 		r = -ENOMEM;
 		goto out1;
 	}
 
+	vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL);
+	if (!vmx_msr_bitmap_longmode) {
+		r = -ENOMEM;
+		goto out2;
+	}
+
 	/*
 	 * Allow direct access to the PC debug port (it is often used for I/O
 	 * delays, but the vmexits simply slow things down).
@@ -3720,19 +3744,21 @@ static int __init vmx_init(void)
 
 	memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
 
-	memset(vmx_msr_bitmap, 0xff, PAGE_SIZE);
+	memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
+	memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
 	r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx), THIS_MODULE);
 	if (r)
-		goto out2;
+		goto out3;
 
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_FS_BASE);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_GS_BASE);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_CS);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
-	vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
+	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
+	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
+	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
+	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
 	if (vm_need_ept()) {
 		bypass_guest_pf = 0;
@@ -3752,8 +3778,10 @@ static int __init vmx_init(void)
 
 	return 0;
 
+out3:
+	free_page((unsigned long)vmx_msr_bitmap_longmode);
 out2:
-	free_page((unsigned long)vmx_msr_bitmap);
+	free_page((unsigned long)vmx_msr_bitmap_legacy);
 out1:
 	free_page((unsigned long)vmx_io_bitmap_b);
 out:
@@ -3763,7 +3791,8 @@ static int __init vmx_init(void)
 
 static void __exit vmx_exit(void)
 {
-	free_page((unsigned long)vmx_msr_bitmap);
+	free_page((unsigned long)vmx_msr_bitmap_legacy);
+	free_page((unsigned long)vmx_msr_bitmap_longmode);
 	free_page((unsigned long)vmx_io_bitmap_b);
 	free_page((unsigned long)vmx_io_bitmap_a);
 
-- 
GitLab


From cf9e4e15e8f6306b2559979269ead7c02e6b2b95 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:36 +0800
Subject: [PATCH 1708/2198] KVM: Split IOAPIC structure

Prepared for reuse ioapic_redir_entry for MSI.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_types.h | 17 +++++++++++++++++
 virt/kvm/ioapic.c         |  6 +++---
 virt/kvm/ioapic.h         | 17 +----------------
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index 2b8318c83e531..b84aca3c4ad1a 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -40,4 +40,21 @@ typedef unsigned long  hfn_t;
 
 typedef hfn_t pfn_t;
 
+union kvm_ioapic_redirect_entry {
+	u64 bits;
+	struct {
+		u8 vector;
+		u8 delivery_mode:3;
+		u8 dest_mode:1;
+		u8 delivery_status:1;
+		u8 polarity:1;
+		u8 remote_irr:1;
+		u8 trig_mode:1;
+		u8 mask:1;
+		u8 reserve:7;
+		u8 reserved[4];
+		u8 dest_id;
+	} fields;
+};
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index c3b99def9cbc3..812801317e362 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -85,7 +85,7 @@ static unsigned long ioapic_read_indirect(struct kvm_ioapic *ioapic,
 
 static int ioapic_service(struct kvm_ioapic *ioapic, unsigned int idx)
 {
-	union ioapic_redir_entry *pent;
+	union kvm_ioapic_redirect_entry *pent;
 	int injected = -1;
 
 	pent = &ioapic->redirtbl[idx];
@@ -284,7 +284,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 {
 	u32 old_irr = ioapic->irr;
 	u32 mask = 1 << irq;
-	union ioapic_redir_entry entry;
+	union kvm_ioapic_redirect_entry entry;
 	int ret = 1;
 
 	if (irq >= 0 && irq < IOAPIC_NUM_PINS) {
@@ -305,7 +305,7 @@ int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 static void __kvm_ioapic_update_eoi(struct kvm_ioapic *ioapic, int pin,
 				    int trigger_mode)
 {
-	union ioapic_redir_entry *ent;
+	union kvm_ioapic_redirect_entry *ent;
 
 	ent = &ioapic->redirtbl[pin];
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index a34bd5e6436bd..008ec873d0186 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -40,22 +40,7 @@ struct kvm_ioapic {
 	u32 id;
 	u32 irr;
 	u32 pad;
-	union ioapic_redir_entry {
-		u64 bits;
-		struct {
-			u8 vector;
-			u8 delivery_mode:3;
-			u8 dest_mode:1;
-			u8 delivery_status:1;
-			u8 polarity:1;
-			u8 remote_irr:1;
-			u8 trig_mode:1;
-			u8 mask:1;
-			u8 reserve:7;
-			u8 reserved[4];
-			u8 dest_id;
-		} fields;
-	} redirtbl[IOAPIC_NUM_PINS];
+	union kvm_ioapic_redirect_entry redirtbl[IOAPIC_NUM_PINS];
 	struct kvm_io_device dev;
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
-- 
GitLab


From 116191b69b608d0f1513e3abe71d6a46800f2bd6 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:37 +0800
Subject: [PATCH 1709/2198] KVM: Unify the delivery of IOAPIC and MSI
 interrupts

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  3 ++
 virt/kvm/ioapic.c        | 91 ++++++++++++++------------------------
 virt/kvm/irq_comm.c      | 95 ++++++++++++++++++++++++----------------
 3 files changed, 95 insertions(+), 94 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 894a56e365e85..1a2f98fbecea9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -352,6 +352,9 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask);
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 812801317e362..883fd0dc9b786 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -203,79 +203,56 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
-	u8 dest = ioapic->redirtbl[irq].fields.dest_id;
-	u8 dest_mode = ioapic->redirtbl[irq].fields.dest_mode;
-	u8 delivery_mode = ioapic->redirtbl[irq].fields.delivery_mode;
-	u8 vector = ioapic->redirtbl[irq].fields.vector;
-	u8 trig_mode = ioapic->redirtbl[irq].fields.trig_mode;
-	u32 deliver_bitmask;
+	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
+	unsigned long deliver_bitmask;
 	struct kvm_vcpu *vcpu;
 	int vcpu_id, r = -1;
 
 	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
 		     "vector=%x trig_mode=%x\n",
-		     dest, dest_mode, delivery_mode, vector, trig_mode);
+		     entry.fields.dest, entry.fields.dest_mode,
+		     entry.fields.delivery_mode, entry.fields.vector,
+		     entry.fields.trig_mode);
 
-	deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic, dest,
-							  dest_mode);
+	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
 	if (!deliver_bitmask) {
 		ioapic_debug("no target on destination\n");
 		return 0;
 	}
 
-	switch (delivery_mode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
-				deliver_bitmask);
+	/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
-		if (irq == 0)
-			vcpu = ioapic->kvm->vcpus[0];
+	if (irq == 0)
+		deliver_bitmask = 1;
 #endif
-		if (vcpu != NULL)
-			r = ioapic_inj_irq(ioapic, vcpu, vector,
-				       trig_mode, delivery_mode);
-		else
-			ioapic_debug("null lowest prio vcpu: "
-				     "mask=%x vector=%x delivery_mode=%x\n",
-				     deliver_bitmask, vector, IOAPIC_LOWEST_PRIORITY);
-		break;
-	case IOAPIC_FIXED:
-#ifdef CONFIG_X86
-		if (irq == 0)
-			deliver_bitmask = 1;
-#endif
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
+
+	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+		if (!(deliver_bitmask & (1 << vcpu_id)))
+			continue;
+		deliver_bitmask &= ~(1 << vcpu_id);
+		vcpu = ioapic->kvm->vcpus[vcpu_id];
+		if (vcpu) {
+			if (entry.fields.delivery_mode ==
+					IOAPIC_LOWEST_PRIORITY ||
+			    entry.fields.delivery_mode == IOAPIC_FIXED) {
 				if (r < 0)
 					r = 0;
-				r += ioapic_inj_irq(ioapic, vcpu, vector,
-					       trig_mode, delivery_mode);
-			}
-		}
-		break;
-	case IOAPIC_NMI:
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
-				ioapic_inj_nmi(vcpu);
+				r += ioapic_inj_irq(ioapic, vcpu,
+						    entry.fields.vector,
+						    entry.fields.trig_mode,
+						    entry.fields.delivery_mode);
+			} else if (entry.fields.delivery_mode == IOAPIC_NMI) {
 				r = 1;
-			}
-			else
-				ioapic_debug("NMI to vcpu %d failed\n",
-						vcpu->vcpu_id);
-		}
-		break;
-	default:
-		printk(KERN_WARNING "Unsupported delivery mode %d\n",
-		       delivery_mode);
-		break;
+				ioapic_inj_nmi(vcpu);
+			} else
+				ioapic_debug("unsupported delivery mode %x!\n",
+					     entry.fields.delivery_mode);
+		} else
+			ioapic_debug("null destination vcpu: "
+				     "mask=%x vector=%x delivery_mode=%x\n",
+				     entry.fields.deliver_bitmask,
+				     entry.fields.vector,
+				     entry.fields.delivery_mode);
 	}
 	return r;
 }
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 864ac5483baad..aec7a0d93a3fc 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -43,53 +43,74 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask)
+{
+	struct kvm_vcpu *vcpu;
+
+	*deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
+				entry->fields.dest_id, entry->fields.dest_mode);
+	switch (entry->fields.delivery_mode) {
+	case IOAPIC_LOWEST_PRIORITY:
+		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
+				entry->fields.vector, *deliver_bitmask);
+		*deliver_bitmask = 1 << vcpu->vcpu_id;
+		break;
+	case IOAPIC_FIXED:
+	case IOAPIC_NMI:
+		break;
+	default:
+		if (printk_ratelimit())
+			printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
+				entry->fields.delivery_mode);
+		*deliver_bitmask = 0;
+	}
+}
+
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		       struct kvm *kvm, int level)
 {
 	int vcpu_id, r = -1;
 	struct kvm_vcpu *vcpu;
 	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
-	int dest_id = (e->msi.address_lo & MSI_ADDR_DEST_ID_MASK)
-			>> MSI_ADDR_DEST_ID_SHIFT;
-	int vector = (e->msi.data & MSI_DATA_VECTOR_MASK)
-			>> MSI_DATA_VECTOR_SHIFT;
-	int dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
-				(unsigned long *)&e->msi.address_lo);
-	int trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
-				(unsigned long *)&e->msi.data);
-	int delivery_mode = test_bit(MSI_DATA_DELIVERY_MODE_SHIFT,
-				(unsigned long *)&e->msi.data);
-	u32 deliver_bitmask;
+	union kvm_ioapic_redirect_entry entry;
+	unsigned long deliver_bitmask;
 
 	BUG_ON(!ioapic);
 
-	deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
-				dest_id, dest_mode);
-	/* IOAPIC delivery mode value is the same as MSI here */
-	switch (delivery_mode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm, vector,
-				deliver_bitmask);
-		if (vcpu != NULL)
-			r = kvm_apic_set_irq(vcpu, vector, trig_mode);
-		else
-			printk(KERN_INFO "kvm: null lowest priority vcpu!\n");
-		break;
-	case IOAPIC_FIXED:
-		for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-			if (!(deliver_bitmask & (1 << vcpu_id)))
-				continue;
-			deliver_bitmask &= ~(1 << vcpu_id);
-			vcpu = ioapic->kvm->vcpus[vcpu_id];
-			if (vcpu) {
-				if (r < 0)
-					r = 0;
-				r += kvm_apic_set_irq(vcpu, vector, trig_mode);
-			}
+	entry.bits = 0;
+	entry.fields.dest_id = (e->msi.address_lo &
+			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+	entry.fields.vector = (e->msi.data &
+			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
+	entry.fields.dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
+			(unsigned long *)&e->msi.address_lo);
+	entry.fields.trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
+			(unsigned long *)&e->msi.data);
+	entry.fields.delivery_mode = test_bit(
+			MSI_DATA_DELIVERY_MODE_SHIFT,
+			(unsigned long *)&e->msi.data);
+
+	/* TODO Deal with RH bit of MSI message address */
+
+	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
+
+	if (!deliver_bitmask) {
+		printk(KERN_WARNING "kvm: no destination for MSI delivery!");
+		return -1;
+	}
+	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
+		if (!(deliver_bitmask & (1 << vcpu_id)))
+			continue;
+		deliver_bitmask &= ~(1 << vcpu_id);
+		vcpu = ioapic->kvm->vcpus[vcpu_id];
+		if (vcpu) {
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu, entry.fields.vector,
+					      entry.fields.trig_mode);
 		}
-		break;
-	default:
-		break;
 	}
 	return r;
 }
-- 
GitLab


From e5871be0f5d6847bc9585c997acb1b917c168f03 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:38 +0800
Subject: [PATCH 1710/2198] KVM: Change API of kvm_ioapic_get_delivery_bitmask

In order to use with bit ops.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/ioapic.c   | 17 ++++++++---------
 virt/kvm/ioapic.h   |  4 ++--
 virt/kvm/irq_comm.c |  5 +++--
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 883fd0dc9b786..3b5371299dd10 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -161,22 +161,22 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 	kvm_vcpu_kick(vcpu);
 }
 
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				    u8 dest_mode)
+void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				     u8 dest_mode, unsigned long *mask)
 {
-	u32 mask = 0;
 	int i;
 	struct kvm *kvm = ioapic->kvm;
 	struct kvm_vcpu *vcpu;
 
 	ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
 
+	*mask = 0;
 	if (dest_mode == 0) {	/* Physical mode. */
 		if (dest == 0xFF) {	/* Broadcast. */
 			for (i = 0; i < KVM_MAX_VCPUS; ++i)
 				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-					mask |= 1 << i;
-			return mask;
+					*mask |= 1 << i;
+			return;
 		}
 		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 			vcpu = kvm->vcpus[i];
@@ -184,7 +184,7 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 				continue;
 			if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
 				if (vcpu->arch.apic)
-					mask = 1 << i;
+					*mask = 1 << i;
 				break;
 			}
 		}
@@ -195,10 +195,9 @@ u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 				continue;
 			if (vcpu->arch.apic &&
 			    kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
-				mask |= 1 << vcpu->vcpu_id;
+				*mask |= 1 << vcpu->vcpu_id;
 		}
-	ioapic_debug("mask %x\n", mask);
-	return mask;
+	ioapic_debug("mask %x\n", *mask);
 }
 
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 008ec873d0186..f395798bc1d12 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,7 +70,7 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-u32 kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				u8 dest_mode);
+void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
+				     u8 dest_mode, unsigned long *mask);
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index aec7a0d93a3fc..e8ff89c3cca73 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -49,8 +49,9 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 {
 	struct kvm_vcpu *vcpu;
 
-	*deliver_bitmask = kvm_ioapic_get_delivery_bitmask(ioapic,
-				entry->fields.dest_id, entry->fields.dest_mode);
+	kvm_ioapic_get_delivery_bitmask(ioapic, entry->fields.dest_id,
+					entry->fields.dest_mode,
+					deliver_bitmask);
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
-- 
GitLab


From 110c2faeba1f1994bcb1de55b9c31f4147dbfdb6 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:39 +0800
Subject: [PATCH 1711/2198] KVM: Update intr delivery func to accept unsigned
 long* bitmap

Would be used with bit ops, and would be easily extended if KVM_MAX_VCPUS is
increased.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 8 ++++----
 virt/kvm/ioapic.h    | 2 +-
 virt/kvm/irq_comm.c  | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index f0b67f2cdd695..6aa8d20f9eeb0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -409,7 +409,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 }
 
 static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap)
+				       unsigned long *bitmap)
 {
 	int last;
 	int next;
@@ -421,7 +421,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 	do {
 		if (++next == KVM_MAX_VCPUS)
 			next = 0;
-		if (kvm->vcpus[next] == NULL || !test_bit(next, &bitmap))
+		if (kvm->vcpus[next] == NULL || !test_bit(next, bitmap))
 			continue;
 		apic = kvm->vcpus[next]->arch.apic;
 		if (apic && apic_enabled(apic))
@@ -437,7 +437,7 @@ static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-		unsigned long bitmap)
+		unsigned long *bitmap)
 {
 	struct kvm_lapic *apic;
 
@@ -508,7 +508,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	}
 
 	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
+		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, &lpr_map);
 		if (target != NULL)
 			__apic_accept_irq(target->arch.apic, delivery_mode,
 					  vector, level, trig_mode);
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index f395798bc1d12..7275f87a11cd9 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -65,7 +65,7 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap);
+				       unsigned long *bitmap);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e8ff89c3cca73..d4421cd6d6636 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -55,7 +55,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
-				entry->fields.vector, *deliver_bitmask);
+				entry->fields.vector, deliver_bitmask);
 		*deliver_bitmask = 1 << vcpu->vcpu_id;
 		break;
 	case IOAPIC_FIXED:
-- 
GitLab


From bfd349d073b2838a6a031f057d25e266619b7093 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 11 Feb 2009 16:03:40 +0800
Subject: [PATCH 1712/2198] KVM: bit ops for deliver_bitmap

It's also convenient when we extend KVM supported vcpu number in the future.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c |  7 ++++---
 virt/kvm/ioapic.c    | 24 +++++++++++++-----------
 virt/kvm/irq_comm.c  | 17 +++++++++--------
 3 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 6aa8d20f9eeb0..afc59b2e7e028 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -483,9 +483,10 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 
 	struct kvm_vcpu *target;
 	struct kvm_vcpu *vcpu;
-	unsigned long lpr_map = 0;
+	DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS);
 	int i;
 
+	bitmap_zero(lpr_map, KVM_MAX_VCPUS);
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
@@ -500,7 +501,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 		if (vcpu->arch.apic &&
 		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
 			if (delivery_mode == APIC_DM_LOWEST)
-				set_bit(vcpu->vcpu_id, &lpr_map);
+				__set_bit(vcpu->vcpu_id, lpr_map);
 			else
 				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
 						  vector, level, trig_mode);
@@ -508,7 +509,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	}
 
 	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, &lpr_map);
+		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
 		if (target != NULL)
 			__apic_accept_irq(target->arch.apic, delivery_mode,
 					  vector, level, trig_mode);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 3b5371299dd10..7c2cb2bd1199e 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -203,7 +203,7 @@ void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
 	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
-	unsigned long deliver_bitmask;
+	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 	struct kvm_vcpu *vcpu;
 	int vcpu_id, r = -1;
 
@@ -213,22 +213,24 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		     entry.fields.delivery_mode, entry.fields.vector,
 		     entry.fields.trig_mode);
 
-	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
-	if (!deliver_bitmask) {
-		ioapic_debug("no target on destination\n");
-		return 0;
-	}
+	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 
 	/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
 	if (irq == 0)
-		deliver_bitmask = 1;
+		__set_bit(0, deliver_bitmask);
+	else
 #endif
+		kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
+
+	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
+		ioapic_debug("no target on destination\n");
+		return 0;
+	}
 
-	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-		if (!(deliver_bitmask & (1 << vcpu_id)))
-			continue;
-		deliver_bitmask &= ~(1 << vcpu_id);
+	while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+			< KVM_MAX_VCPUS) {
+		__clear_bit(vcpu_id, deliver_bitmask);
 		vcpu = ioapic->kvm->vcpus[vcpu_id];
 		if (vcpu) {
 			if (entry.fields.delivery_mode ==
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index d4421cd6d6636..d165e056f79bb 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -56,7 +56,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 	case IOAPIC_LOWEST_PRIORITY:
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
 				entry->fields.vector, deliver_bitmask);
-		*deliver_bitmask = 1 << vcpu->vcpu_id;
+		__set_bit(vcpu->vcpu_id, deliver_bitmask);
 		break;
 	case IOAPIC_FIXED:
 	case IOAPIC_NMI:
@@ -76,10 +76,12 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 	struct kvm_vcpu *vcpu;
 	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
 	union kvm_ioapic_redirect_entry entry;
-	unsigned long deliver_bitmask;
+	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 
 	BUG_ON(!ioapic);
 
+	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+
 	entry.bits = 0;
 	entry.fields.dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
@@ -95,16 +97,15 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 
 	/* TODO Deal with RH bit of MSI message address */
 
-	kvm_get_intr_delivery_bitmask(ioapic, &entry, &deliver_bitmask);
+	kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
 
-	if (!deliver_bitmask) {
+	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
 		printk(KERN_WARNING "kvm: no destination for MSI delivery!");
 		return -1;
 	}
-	for (vcpu_id = 0; deliver_bitmask != 0; vcpu_id++) {
-		if (!(deliver_bitmask & (1 << vcpu_id)))
-			continue;
-		deliver_bitmask &= ~(1 << vcpu_id);
+	while ((vcpu_id = find_first_bit(deliver_bitmask,
+					KVM_MAX_VCPUS)) < KVM_MAX_VCPUS) {
+		__clear_bit(vcpu_id, deliver_bitmask);
 		vcpu = ioapic->kvm->vcpus[vcpu_id];
 		if (vcpu) {
 			if (r < 0)
-- 
GitLab


From c1e01514296e8a4a43ff0c88dcff635cb90feb5f Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Feb 2009 17:22:26 +0800
Subject: [PATCH 1713/2198] KVM: Ioctls for init MSI-X entry

Introduce KVM_SET_MSIX_NR and KVM_SET_MSIX_ENTRY two ioctls.

This two ioctls are used by userspace to specific guest device MSI-X entry
number and correlate MSI-X entry with GSI during the initialization stage.

MSI-X should be well initialzed before enabling.

Don't support change MSI-X entry number for now.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h      |  18 +++++++
 include/linux/kvm_host.h |  10 ++++
 virt/kvm/kvm_main.c      | 104 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 132 insertions(+)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 8cc137911b340..78cdee8c63556 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -487,6 +487,10 @@ struct kvm_irq_routing {
 #define KVM_REINJECT_CONTROL      _IO(KVMIO, 0x71)
 #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
 				     struct kvm_assigned_pci_dev)
+#define KVM_ASSIGN_SET_MSIX_NR \
+			_IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
+#define KVM_ASSIGN_SET_MSIX_ENTRY \
+			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
 
 /*
  * ioctls for vcpu fds
@@ -607,4 +611,18 @@ struct kvm_assigned_irq {
 #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
 #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
 
+struct kvm_assigned_msix_nr {
+	__u32 assigned_dev_id;
+	__u16 entry_nr;
+	__u16 padding;
+};
+
+#define KVM_MAX_MSIX_PER_DEV		512
+struct kvm_assigned_msix_entry {
+	__u32 assigned_dev_id;
+	__u32 gsi;
+	__u16 entry; /* The index of entry in the MSI-X table */
+	__u16 padding[3];
+};
+
 #endif
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1a2f98fbecea9..432edc27e82b9 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -319,6 +319,12 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+struct kvm_guest_msix_entry {
+	u32 vector;
+	u16 entry;
+	u16 flags;
+};
+
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
 	struct work_struct interrupt_work;
@@ -326,13 +332,17 @@ struct kvm_assigned_dev_kernel {
 	int assigned_dev_id;
 	int host_busnr;
 	int host_devfn;
+	unsigned int entries_nr;
 	int host_irq;
 	bool host_irq_disabled;
+	struct msix_entry *host_msix_entries;
 	int guest_irq;
+	struct kvm_guest_msix_entry *guest_msix_entries;
 #define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
 #define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
 #define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
 #define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
+#define KVM_ASSIGNED_DEV_MSIX		((1 << 2) | (1 << 10))
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	int flags;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 4d0dd390aa501..1ceb96901f328 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1593,6 +1593,88 @@ static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu *vcpu, sigset_t *sigset)
 	return 0;
 }
 
+#ifdef __KVM_HAVE_MSIX
+static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
+				    struct kvm_assigned_msix_nr *entry_nr)
+{
+	int r = 0;
+	struct kvm_assigned_dev_kernel *adev;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry_nr->assigned_dev_id);
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_nr_out;
+	}
+
+	if (adev->entries_nr == 0) {
+		adev->entries_nr = entry_nr->entry_nr;
+		if (adev->entries_nr == 0 ||
+		    adev->entries_nr >= KVM_MAX_MSIX_PER_DEV) {
+			r = -EINVAL;
+			goto msix_nr_out;
+		}
+
+		adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) *
+						entry_nr->entry_nr,
+						GFP_KERNEL);
+		if (!adev->host_msix_entries) {
+			r = -ENOMEM;
+			goto msix_nr_out;
+		}
+		adev->guest_msix_entries = kzalloc(
+				sizeof(struct kvm_guest_msix_entry) *
+				entry_nr->entry_nr, GFP_KERNEL);
+		if (!adev->guest_msix_entries) {
+			kfree(adev->host_msix_entries);
+			r = -ENOMEM;
+			goto msix_nr_out;
+		}
+	} else /* Not allowed set MSI-X number twice */
+		r = -EINVAL;
+msix_nr_out:
+	mutex_unlock(&kvm->lock);
+	return r;
+}
+
+static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm,
+				       struct kvm_assigned_msix_entry *entry)
+{
+	int r = 0, i;
+	struct kvm_assigned_dev_kernel *adev;
+
+	mutex_lock(&kvm->lock);
+
+	adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      entry->assigned_dev_id);
+
+	if (!adev) {
+		r = -EINVAL;
+		goto msix_entry_out;
+	}
+
+	for (i = 0; i < adev->entries_nr; i++)
+		if (adev->guest_msix_entries[i].vector == 0 ||
+		    adev->guest_msix_entries[i].entry == entry->entry) {
+			adev->guest_msix_entries[i].entry = entry->entry;
+			adev->guest_msix_entries[i].vector = entry->gsi;
+			adev->host_msix_entries[i].entry = entry->entry;
+			break;
+		}
+	if (i == adev->entries_nr) {
+		r = -ENOSPC;
+		goto msix_entry_out;
+	}
+
+msix_entry_out:
+	mutex_unlock(&kvm->lock);
+
+	return r;
+}
+#endif
+
 static long kvm_vcpu_ioctl(struct file *filp,
 			   unsigned int ioctl, unsigned long arg)
 {
@@ -1917,7 +1999,29 @@ static long kvm_vm_ioctl(struct file *filp,
 		vfree(entries);
 		break;
 	}
+#ifdef __KVM_HAVE_MSIX
+	case KVM_ASSIGN_SET_MSIX_NR: {
+		struct kvm_assigned_msix_nr entry_nr;
+		r = -EFAULT;
+		if (copy_from_user(&entry_nr, argp, sizeof entry_nr))
+			goto out;
+		r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr);
+		if (r)
+			goto out;
+		break;
+	}
+	case KVM_ASSIGN_SET_MSIX_ENTRY: {
+		struct kvm_assigned_msix_entry entry;
+		r = -EFAULT;
+		if (copy_from_user(&entry, argp, sizeof entry))
+			goto out;
+		r = kvm_vm_ioctl_set_msix_entry(kvm, &entry);
+		if (r)
+			goto out;
+		break;
+	}
 #endif
+#endif /* KVM_CAP_IRQ_ROUTING */
 	default:
 		r = kvm_arch_vm_ioctl(filp, ioctl, arg);
 	}
-- 
GitLab


From 2350bd1f62c8706c22b8e58c3bfff10806c0a31b Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Feb 2009 17:22:27 +0800
Subject: [PATCH 1714/2198] KVM: Add MSI-X interrupt injection logic

We have to handle more than one interrupt with one handler for MSI-X. Avi
suggested to use a flag to indicate the pending. So here is it.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 66 +++++++++++++++++++++++++++++++++++-----
 2 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 432edc27e82b9..3832243625d4b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -319,6 +319,7 @@ struct kvm_irq_ack_notifier {
 	void (*irq_acked)(struct kvm_irq_ack_notifier *kian);
 };
 
+#define KVM_ASSIGNED_MSIX_PENDING		0x1
 struct kvm_guest_msix_entry {
 	u32 vector;
 	u16 entry;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 1ceb96901f328..8bd44d6985c7a 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -95,25 +95,69 @@ static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *h
 	return NULL;
 }
 
+static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
+				    *assigned_dev, int irq)
+{
+	int i, index;
+	struct msix_entry *host_msix_entries;
+
+	host_msix_entries = assigned_dev->host_msix_entries;
+
+	index = -1;
+	for (i = 0; i < assigned_dev->entries_nr; i++)
+		if (irq == host_msix_entries[i].vector) {
+			index = i;
+			break;
+		}
+	if (index < 0) {
+		printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n");
+		return 0;
+	}
+
+	return index;
+}
+
 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev;
+	struct kvm *kvm;
+	int irq, i;
 
 	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
 				    interrupt_work);
+	kvm = assigned_dev->kvm;
 
 	/* This is taken to safely inject irq inside the guest. When
 	 * the interrupt injection (or the ioapic code) uses a
 	 * finer-grained lock, update this
 	 */
-	mutex_lock(&assigned_dev->kvm->lock);
-	kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
-		    assigned_dev->guest_irq, 1);
-
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI) {
-		enable_irq(assigned_dev->host_irq);
-		assigned_dev->host_irq_disabled = false;
+	mutex_lock(&kvm->lock);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+		struct kvm_guest_msix_entry *guest_entries =
+			assigned_dev->guest_msix_entries;
+		for (i = 0; i < assigned_dev->entries_nr; i++) {
+			if (!(guest_entries[i].flags &
+					KVM_ASSIGNED_MSIX_PENDING))
+				continue;
+			guest_entries[i].flags &= ~KVM_ASSIGNED_MSIX_PENDING;
+			kvm_set_irq(assigned_dev->kvm,
+				    assigned_dev->irq_source_id,
+				    guest_entries[i].vector, 1);
+			irq = assigned_dev->host_msix_entries[i].vector;
+			if (irq != 0)
+				enable_irq(irq);
+			assigned_dev->host_irq_disabled = false;
+		}
+	} else {
+		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
+			    assigned_dev->guest_irq, 1);
+		if (assigned_dev->irq_requested_type &
+				KVM_ASSIGNED_DEV_GUEST_MSI) {
+			enable_irq(assigned_dev->host_irq);
+			assigned_dev->host_irq_disabled = false;
+		}
 	}
+
 	mutex_unlock(&assigned_dev->kvm->lock);
 }
 
@@ -122,6 +166,14 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 	struct kvm_assigned_dev_kernel *assigned_dev =
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
+	if (assigned_dev->irq_requested_type == KVM_ASSIGNED_DEV_MSIX) {
+		int index = find_index_from_host_irq(assigned_dev, irq);
+		if (index < 0)
+			return IRQ_HANDLED;
+		assigned_dev->guest_msix_entries[index].flags |=
+			KVM_ASSIGNED_MSIX_PENDING;
+	}
+
 	schedule_work(&assigned_dev->interrupt_work);
 
 	disable_irq_nosync(irq);
-- 
GitLab


From d510d6cc653bc4b3094ea73afe12600d0ab445b3 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Feb 2009 17:22:28 +0800
Subject: [PATCH 1715/2198] KVM: Enable MSI-X for KVM assigned device

This patch finally enable MSI-X.

What we need for MSI-X:
1. Intercept one page in MMIO region of device. So that we can get guest desired
MSI-X table and set up the real one. Now this have been done by guest, and
transfer to kernel using ioctl KVM_SET_MSIX_NR and KVM_SET_MSIX_ENTRY.

2. Information for incoming interrupt. Now one device can have more than one
interrupt, and they are all handled by one workqueue structure. So we need to
identify them. The previous patch enable gsi_msg_pending_bitmap get this done.

3. Mapping from host IRQ to guest gsi as well as guest gsi to real MSI/MSI-X
message address/data. We used same entry number for the host and guest here, so
that it's easy to find the correlated guest gsi.

What we lack for now:
1. The PCI spec said nothing can existed with MSI-X table in the same page of
MMIO region, except pending bits. The patch ignore pending bits as the first
step (so they are always 0 - no pending).

2. The PCI spec allowed to change MSI-X table dynamically. That means, the OS
can enable MSI-X, then mask one MSI-X entry, modify it, and unmask it. The patch
didn't support this, and Linux also don't work in this way.

3. The patch didn't implement MSI-X mask all and mask single entry. I would
implement the former in driver/pci/msi.c later. And for single entry, userspace
should have reposibility to handle it.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm.h |  1 +
 include/linux/kvm.h        |  8 ++++
 virt/kvm/kvm_main.c        | 98 +++++++++++++++++++++++++++++++++++---
 3 files changed, 101 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index dc3f6cf117045..125be8b19568e 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -16,6 +16,7 @@
 #define __KVM_HAVE_MSI
 #define __KVM_HAVE_USER_NMI
 #define __KVM_HAVE_GUEST_DEBUG
+#define __KVM_HAVE_MSIX
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 78cdee8c63556..640835ed2708d 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -409,6 +409,9 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_DEVICE_ASSIGNMENT
 #define KVM_CAP_DEVICE_DEASSIGNMENT 27
 #endif
+#ifdef __KVM_HAVE_MSIX
+#define KVM_CAP_DEVICE_MSIX 28
+#endif
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
 
@@ -611,6 +614,11 @@ struct kvm_assigned_irq {
 #define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
 #define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
 
+#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION  (KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX |\
+					KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
+#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX  (1 << 1)
+#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX    (1 << 2)
+
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
 	__u16 entry_nr;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8bd44d6985c7a..3bed82754a5de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -236,13 +236,33 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	 * now, the kvm state is still legal for probably we also have to wait
 	 * interrupt_work done.
 	 */
-	disable_irq_nosync(assigned_dev->host_irq);
-	cancel_work_sync(&assigned_dev->interrupt_work);
+	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+		int i;
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			disable_irq_nosync(assigned_dev->
+					   host_msix_entries[i].vector);
+
+		cancel_work_sync(&assigned_dev->interrupt_work);
 
-	free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+		for (i = 0; i < assigned_dev->entries_nr; i++)
+			free_irq(assigned_dev->host_msix_entries[i].vector,
+				 (void *)assigned_dev);
 
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
-		pci_disable_msi(assigned_dev->dev);
+		assigned_dev->entries_nr = 0;
+		kfree(assigned_dev->host_msix_entries);
+		kfree(assigned_dev->guest_msix_entries);
+		pci_disable_msix(assigned_dev->dev);
+	} else {
+		/* Deal with MSI and INTx */
+		disable_irq_nosync(assigned_dev->host_irq);
+		cancel_work_sync(&assigned_dev->interrupt_work);
+
+		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
+
+		if (assigned_dev->irq_requested_type &
+				KVM_ASSIGNED_DEV_HOST_MSI)
+			pci_disable_msi(assigned_dev->dev);
+	}
 
 	assigned_dev->irq_requested_type = 0;
 }
@@ -373,6 +393,60 @@ static int assigned_device_update_msi(struct kvm *kvm,
 }
 #endif
 
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_update_msix(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *adev,
+			struct kvm_assigned_irq *airq)
+{
+	/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
+	int i, r;
+
+	adev->ack_notifier.gsi = -1;
+
+	if (irqchip_in_kernel(kvm)) {
+		if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
+			return -ENOTTY;
+
+		if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) {
+			/* Guest disable MSI-X */
+			kvm_free_assigned_irq(kvm, adev);
+			if (msi2intx) {
+				pci_enable_msi(adev->dev);
+				if (adev->dev->msi_enabled)
+					return assigned_device_update_msi(kvm,
+							adev, airq);
+			}
+			return assigned_device_update_intx(kvm, adev, airq);
+		}
+
+		/* host_msix_entries and guest_msix_entries should have been
+		 * initialized */
+		if (adev->entries_nr == 0)
+			return -EINVAL;
+
+		kvm_free_assigned_irq(kvm, adev);
+
+		r = pci_enable_msix(adev->dev, adev->host_msix_entries,
+				    adev->entries_nr);
+		if (r)
+			return r;
+
+		for (i = 0; i < adev->entries_nr; i++) {
+			r = request_irq((adev->host_msix_entries + i)->vector,
+					kvm_assigned_dev_intr, 0,
+					"kvm_assigned_msix_device",
+					(void *)adev);
+			if (r)
+				return r;
+		}
+	}
+
+	adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX;
+
+	return 0;
+}
+#endif
+
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 				   struct kvm_assigned_irq
 				   *assigned_irq)
@@ -417,12 +491,24 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 		}
 	}
 
-	if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
+	if (match->irq_requested_type & KVM_ASSIGNED_DEV_MSIX)
+		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX;
+	else if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
 		 (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI))
 		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI;
 
 	changed_flags = assigned_irq->flags ^ current_flags;
 
+#ifdef __KVM_HAVE_MSIX
+	if (changed_flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) {
+		r = assigned_device_update_msix(kvm, match, assigned_irq);
+		if (r) {
+			printk(KERN_WARNING "kvm: failed to execute "
+					"MSI-X action!\n");
+			goto out_release;
+		}
+	} else
+#endif
 	if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
 	    (msi2intx && match->dev->msi_enabled)) {
 #ifdef CONFIG_X86
-- 
GitLab


From 2dea4c84bc936731668b5a7a9fba5b436a422668 Mon Sep 17 00:00:00 2001
From: "Matt T. Yourst" <yourst@users.sourceforge.net>
Date: Tue, 24 Feb 2009 15:28:00 -0300
Subject: [PATCH 1716/2198] KVM: x86: silence preempt warning on
 kvm_write_guest_time

This issue just appeared in kvm-84 when running on 2.6.28.7 (x86-64)
with PREEMPT enabled.

We're getting syslog warnings like this many (but not all) times qemu
tells KVM to run the VCPU:

BUG: using smp_processor_id() in preemptible [00000000] code:
qemu-system-x86/28938
caller is kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
Pid: 28938, comm: qemu-system-x86 2.6.28.7-mtyrel-64bit
Call Trace:
debug_smp_processor_id+0xf7/0x100
kvm_arch_vcpu_ioctl_run+0x5d1/0xc70 [kvm]
? __wake_up+0x4e/0x70
? wake_futex+0x27/0x40
kvm_vcpu_ioctl+0x2e9/0x5a0 [kvm]
enqueue_hrtimer+0x8a/0x110
_spin_unlock_irqrestore+0x27/0x50
vfs_ioctl+0x31/0xa0
do_vfs_ioctl+0x74/0x480
sys_futex+0xb4/0x140
sys_ioctl+0x99/0xa0
system_call_fastpath+0x16/0x1b

As it turns out, the call trace is messed up due to gcc's inlining, but
I isolated the problem anyway: kvm_write_guest_time() is being used in a
non-thread-safe manner on preemptable kernels.

Basically kvm_write_guest_time()'s body needs to be surrounded by
preempt_disable() and preempt_enable(), since the kernel won't let us
query any per-CPU data (indirectly using smp_processor_id()) without
preemption disabled. The attached patch fixes this issue by disabling
preemption inside kvm_write_guest_time().

[marcelo: surround only __get_cpu_var calls since the warning
is harmless]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3944e917e794f..43e049a2ccf4e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -634,10 +634,12 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	if ((!vcpu->time_page))
 		return;
 
+	preempt_disable();
 	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
 		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
 		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
 	}
+	preempt_enable();
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-- 
GitLab


From a90ede7b17d122acd58e6e1ff911be9dcf5263cc Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 11 Feb 2009 22:45:42 -0200
Subject: [PATCH 1717/2198] KVM: x86: paravirt skip pit-through-ioapic boot
 check

Skip the test which checks if the PIT is properly routed when
using the IOAPIC, aimed at buggy hardware.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kernel/kvm.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 33019ddb56b45..057173db6adcc 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,6 +27,7 @@
 #include <linux/mm.h>
 #include <linux/highmem.h>
 #include <linux/hardirq.h>
+#include <asm/timer.h>
 
 #define MMU_QUEUE_SIZE 1024
 
@@ -230,6 +231,9 @@ static void paravirt_ops_setup(void)
 		pv_mmu_ops.lazy_mode.enter = kvm_enter_lazy_mmu;
 		pv_mmu_ops.lazy_mode.leave = kvm_leave_lazy_mmu;
 	}
+#ifdef CONFIG_X86_IO_APIC
+	no_timer_check = 1;
+#endif
 }
 
 void __init kvm_guest_init(void)
-- 
GitLab


From b95b51d580bff9376850eef29d34c3aa08c26db7 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Thu, 26 Feb 2009 13:55:33 +0100
Subject: [PATCH 1718/2198] KVM: declare ioapic functions only on affected
 hardware

Since "KVM: Unify the delivery of IOAPIC and MSI interrupts"
I get the following warnings:

  CC [M]  arch/s390/kvm/kvm-s390.o
In file included from arch/s390/kvm/kvm-s390.c:22:
include/linux/kvm_host.h:357: warning: 'struct kvm_ioapic' declared inside parameter list
include/linux/kvm_host.h:357: warning: its scope is only this definition or declaration, which is probably not what you want

This patch limits IOAPIC functions for architectures that have one.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3832243625d4b..3b91ec9982c25 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -363,9 +363,11 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
+#ifdef __KVM_HAVE_IOAPIC
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask);
+#endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
-- 
GitLab


From 5a05d54554f19a128306eca7f7f5ed31f7d7eeb9 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 23 Feb 2009 10:57:39 -0300
Subject: [PATCH 1719/2198] KVM: PIT: remove unused scheduled variable

Unused.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 1 -
 arch/x86/kvm/i8254.h | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c13bb92d31577..3eddae692c052 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -208,7 +208,6 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 		wake_up_interruptible(&vcpu0->wq);
 
 	hrtimer_add_expires_ns(&pt->timer, pt->period);
-	pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
 	if (pt->period)
 		ps->channels[0].count_load_time = ktime_get();
 
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 6acbe4b505d5f..521accf3bae73 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -7,7 +7,6 @@ struct kvm_kpit_timer {
 	struct hrtimer timer;
 	int irq;
 	s64 period; /* unit: ns */
-	s64 scheduled;
 	atomic_t pending;
 	bool reinject;
 };
-- 
GitLab


From fd66842370e32f3bbe429677280a326c07e508c1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 23 Feb 2009 10:57:40 -0300
Subject: [PATCH 1720/2198] KVM: PIT: remove usage of count_load_time for
 channel 0

We can infer elapsed time from hrtimer_expires_remaining.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 37 +++++++++++++++++++++++++++++++------
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3eddae692c052..09ae841ff462d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -98,6 +98,32 @@ static int pit_get_gate(struct kvm *kvm, int channel)
 	return kvm->arch.vpit->pit_state.channels[channel].gate;
 }
 
+static s64 __kpit_elapsed(struct kvm *kvm)
+{
+	s64 elapsed;
+	ktime_t remaining;
+	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
+
+	remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
+	if (ktime_to_ns(remaining) < 0)
+		remaining = ktime_set(0, 0);
+
+	elapsed = ps->pit_timer.period;
+	if (ktime_to_ns(remaining) <= ps->pit_timer.period)
+		elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+
+	return elapsed;
+}
+
+static s64 kpit_elapsed(struct kvm *kvm, struct kvm_kpit_channel_state *c,
+			int channel)
+{
+	if (channel == 0)
+		return __kpit_elapsed(kvm);
+
+	return ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+}
+
 static int pit_get_count(struct kvm *kvm, int channel)
 {
 	struct kvm_kpit_channel_state *c =
@@ -107,7 +133,7 @@ static int pit_get_count(struct kvm *kvm, int channel)
 
 	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
 
-	t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+	t = kpit_elapsed(kvm, c, channel);
 	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
 	switch (c->mode) {
@@ -137,7 +163,7 @@ static int pit_get_out(struct kvm *kvm, int channel)
 
 	WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
 
-	t = ktime_to_ns(ktime_sub(ktime_get(), c->count_load_time));
+	t = kpit_elapsed(kvm, c, channel);
 	d = muldiv64(t, KVM_PIT_FREQ, NSEC_PER_SEC);
 
 	switch (c->mode) {
@@ -208,8 +234,6 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
 		wake_up_interruptible(&vcpu0->wq);
 
 	hrtimer_add_expires_ns(&pt->timer, pt->period);
-	if (pt->period)
-		ps->channels[0].count_load_time = ktime_get();
 
 	return (pt->period == 0 ? 0 : 1);
 }
@@ -305,11 +329,12 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	if (val == 0)
 		val = 0x10000;
 
-	ps->channels[channel].count_load_time = ktime_get();
 	ps->channels[channel].count = val;
 
-	if (channel != 0)
+	if (channel != 0) {
+		ps->channels[channel].count_load_time = ktime_get();
 		return;
+	}
 
 	/* Two types of timer
 	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
-- 
GitLab


From d3c7b77d1a6e7a0a27035a7ba723a3455317883e Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 23 Feb 2009 10:57:41 -0300
Subject: [PATCH 1721/2198] KVM: unify part of generic timer handling

Hide the internals of vcpu awakening / injection from the in-kernel
emulated timers. This makes future changes in this logic easier and
decreases the distance to more generic timer handling.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/Makefile    |  2 +-
 arch/x86/kvm/i8254.c     | 57 +++++++++---------------
 arch/x86/kvm/i8254.h     | 11 +----
 arch/x86/kvm/kvm_timer.h | 18 ++++++++
 arch/x86/kvm/lapic.c     | 94 +++++++++++++++++-----------------------
 arch/x86/kvm/lapic.h     |  9 ++--
 arch/x86/kvm/timer.c     | 46 ++++++++++++++++++++
 7 files changed, 129 insertions(+), 108 deletions(-)
 create mode 100644 arch/x86/kvm/kvm_timer.h
 create mode 100644 arch/x86/kvm/timer.c

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d3ec292f00f24..b43c4efafe801 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -14,7 +14,7 @@ endif
 EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
 
 kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \
-	i8254.o
+	i8254.o timer.o
 obj-$(CONFIG_KVM) += kvm.o
 kvm-intel-objs = vmx.o
 obj-$(CONFIG_KVM_INTEL) += kvm-intel.o
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 09ae841ff462d..4e2e3f26dbf83 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -219,25 +219,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
 	}
 }
 
-static int __pit_timer_fn(struct kvm_kpit_state *ps)
-{
-	struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
-	struct kvm_kpit_timer *pt = &ps->pit_timer;
-
-	if (!atomic_inc_and_test(&pt->pending))
-		set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
-
-	if (!pt->reinject)
-		atomic_set(&pt->pending, 1);
-
-	if (vcpu0 && waitqueue_active(&vcpu0->wq))
-		wake_up_interruptible(&vcpu0->wq);
-
-	hrtimer_add_expires_ns(&pt->timer, pt->period);
-
-	return (pt->period == 0 ? 0 : 1);
-}
-
 int pit_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -258,21 +239,6 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 	spin_unlock(&ps->inject_lock);
 }
 
-static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
-{
-	struct kvm_kpit_state *ps;
-	int restart_timer = 0;
-
-	ps = container_of(data, struct kvm_kpit_state, pit_timer.timer);
-
-	restart_timer = __pit_timer_fn(ps);
-
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
-
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
@@ -286,15 +252,26 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
-static void destroy_pit_timer(struct kvm_kpit_timer *pt)
+static void destroy_pit_timer(struct kvm_timer *pt)
 {
 	pr_debug("pit: execute del timer!\n");
 	hrtimer_cancel(&pt->timer);
 }
 
+static bool kpit_is_periodic(struct kvm_timer *ktimer)
+{
+	struct kvm_kpit_state *ps = container_of(ktimer, struct kvm_kpit_state,
+						 pit_timer);
+	return ps->is_periodic;
+}
+
+struct kvm_timer_ops kpit_ops = {
+	.is_periodic = kpit_is_periodic,
+};
+
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
-	struct kvm_kpit_timer *pt = &ps->pit_timer;
+	struct kvm_timer *pt = &ps->pit_timer;
 	s64 interval;
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -304,7 +281,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
 	pt->period = (is_period == 0) ? 0 : interval;
-	pt->timer.function = pit_timer_fn;
+	ps->is_periodic = is_period;
+
+	pt->timer.function = kvm_timer_fn;
+	pt->t_ops = &kpit_ops;
+	pt->kvm = ps->pit->kvm;
+	pt->vcpu_id = 0;
+
 	atomic_set(&pt->pending, 0);
 	ps->irq_ack = 1;
 
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 521accf3bae73..bbd863ff60b70 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,14 +3,6 @@
 
 #include "iodev.h"
 
-struct kvm_kpit_timer {
-	struct hrtimer timer;
-	int irq;
-	s64 period; /* unit: ns */
-	atomic_t pending;
-	bool reinject;
-};
-
 struct kvm_kpit_channel_state {
 	u32 count; /* can be 65536 */
 	u16 latched_count;
@@ -29,7 +21,8 @@ struct kvm_kpit_channel_state {
 
 struct kvm_kpit_state {
 	struct kvm_kpit_channel_state channels[3];
-	struct kvm_kpit_timer pit_timer;
+	struct kvm_timer pit_timer;
+	bool is_periodic;
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h
new file mode 100644
index 0000000000000..26bd6ba74e1cf
--- /dev/null
+++ b/arch/x86/kvm/kvm_timer.h
@@ -0,0 +1,18 @@
+
+struct kvm_timer {
+	struct hrtimer timer;
+	s64 period; 				/* unit: ns */
+	atomic_t pending;			/* accumulated triggered timers */
+	bool reinject;
+	struct kvm_timer_ops *t_ops;
+	struct kvm *kvm;
+	int vcpu_id;
+};
+
+struct kvm_timer_ops {
+        bool (*is_periodic)(struct kvm_timer *);
+};
+
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data);
+
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index afc59b2e7e028..27ca43e4e4405 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -528,12 +528,13 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic)
 	if (apic_get_reg(apic, APIC_TMICT) == 0)
 		return 0;
 
-	remaining = hrtimer_expires_remaining(&apic->timer.dev);
+	remaining = hrtimer_expires_remaining(&apic->lapic_timer.timer);
 	if (ktime_to_ns(remaining) < 0)
 		remaining = ktime_set(0, 0);
 
-	ns = mod_64(ktime_to_ns(remaining), apic->timer.period);
-	tmcct = div64_u64(ns, (APIC_BUS_CYCLE_NS * apic->timer.divide_count));
+	ns = mod_64(ktime_to_ns(remaining), apic->lapic_timer.period);
+	tmcct = div64_u64(ns,
+			 (APIC_BUS_CYCLE_NS * apic->divide_count));
 
 	return tmcct;
 }
@@ -620,25 +621,25 @@ static void update_divide_count(struct kvm_lapic *apic)
 	tdcr = apic_get_reg(apic, APIC_TDCR);
 	tmp1 = tdcr & 0xf;
 	tmp2 = ((tmp1 & 0x3) | ((tmp1 & 0x8) >> 1)) + 1;
-	apic->timer.divide_count = 0x1 << (tmp2 & 0x7);
+	apic->divide_count = 0x1 << (tmp2 & 0x7);
 
 	apic_debug("timer divide count is 0x%x\n",
-				   apic->timer.divide_count);
+				   apic->lapic_timer.divide_count);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
 {
-	ktime_t now = apic->timer.dev.base->get_time();
+	ktime_t now = apic->lapic_timer.timer.base->get_time();
 
-	apic->timer.period = apic_get_reg(apic, APIC_TMICT) *
-		    APIC_BUS_CYCLE_NS * apic->timer.divide_count;
-	atomic_set(&apic->timer.pending, 0);
+	apic->lapic_timer.period = apic_get_reg(apic, APIC_TMICT) *
+		    APIC_BUS_CYCLE_NS * apic->divide_count;
+	atomic_set(&apic->lapic_timer.pending, 0);
 
-	if (!apic->timer.period)
+	if (!apic->lapic_timer.period)
 		return;
 
-	hrtimer_start(&apic->timer.dev,
-		      ktime_add_ns(now, apic->timer.period),
+	hrtimer_start(&apic->lapic_timer.timer,
+		      ktime_add_ns(now, apic->lapic_timer.period),
 		      HRTIMER_MODE_ABS);
 
 	apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016"
@@ -647,9 +648,9 @@ static void start_apic_timer(struct kvm_lapic *apic)
 			   "expire @ 0x%016" PRIx64 ".\n", __func__,
 			   APIC_BUS_CYCLE_NS, ktime_to_ns(now),
 			   apic_get_reg(apic, APIC_TMICT),
-			   apic->timer.period,
+			   apic->lapic_timer.period,
 			   ktime_to_ns(ktime_add_ns(now,
-					apic->timer.period)));
+					apic->lapic_timer.period)));
 }
 
 static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val)
@@ -731,7 +732,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 				apic_set_reg(apic, APIC_LVTT + 0x10 * i,
 					     lvt_val | APIC_LVT_MASKED);
 			}
-			atomic_set(&apic->timer.pending, 0);
+			atomic_set(&apic->lapic_timer.pending, 0);
 
 		}
 		break;
@@ -763,7 +764,7 @@ static void apic_mmio_write(struct kvm_io_device *this,
 		break;
 
 	case APIC_TMICT:
-		hrtimer_cancel(&apic->timer.dev);
+		hrtimer_cancel(&apic->lapic_timer.timer);
 		apic_set_reg(apic, APIC_TMICT, val);
 		start_apic_timer(apic);
 		return;
@@ -803,7 +804,7 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
 	if (!vcpu->arch.apic)
 		return;
 
-	hrtimer_cancel(&vcpu->arch.apic->timer.dev);
+	hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
 
 	if (vcpu->arch.apic->regs_page)
 		__free_page(vcpu->arch.apic->regs_page);
@@ -881,7 +882,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 	ASSERT(apic != NULL);
 
 	/* Stop the timer in case it's a reset to an active apic */
-	hrtimer_cancel(&apic->timer.dev);
+	hrtimer_cancel(&apic->lapic_timer.timer);
 
 	apic_set_reg(apic, APIC_ID, vcpu->vcpu_id << 24);
 	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
@@ -906,7 +907,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
 	update_divide_count(apic);
-	atomic_set(&apic->timer.pending, 0);
+	atomic_set(&apic->lapic_timer.pending, 0);
 	if (vcpu->vcpu_id == 0)
 		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
@@ -937,22 +938,11 @@ EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
  *----------------------------------------------------------------------
  */
 
-/* TODO: make sure __apic_timer_fn runs in current pCPU */
-static int __apic_timer_fn(struct kvm_lapic *apic)
+static bool lapic_is_periodic(struct kvm_timer *ktimer)
 {
-	int result = 0;
-	wait_queue_head_t *q = &apic->vcpu->wq;
-
-	if(!atomic_inc_and_test(&apic->timer.pending))
-		set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
-	if (waitqueue_active(q))
-		wake_up_interruptible(q);
-
-	if (apic_lvtt_period(apic)) {
-		result = 1;
-		hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
-	}
-	return result;
+	struct kvm_lapic *apic = container_of(ktimer, struct kvm_lapic,
+					      lapic_timer);
+	return apic_lvtt_period(apic);
 }
 
 int apic_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -960,7 +950,7 @@ int apic_has_pending_timer(struct kvm_vcpu *vcpu)
 	struct kvm_lapic *lapic = vcpu->arch.apic;
 
 	if (lapic && apic_enabled(lapic) && apic_lvt_enabled(lapic, APIC_LVTT))
-		return atomic_read(&lapic->timer.pending);
+		return atomic_read(&lapic->lapic_timer.pending);
 
 	return 0;
 }
@@ -987,20 +977,9 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
-static enum hrtimer_restart apic_timer_fn(struct hrtimer *data)
-{
-	struct kvm_lapic *apic;
-	int restart_timer = 0;
-
-	apic = container_of(data, struct kvm_lapic, timer.dev);
-
-	restart_timer = __apic_timer_fn(apic);
-
-	if (restart_timer)
-		return HRTIMER_RESTART;
-	else
-		return HRTIMER_NORESTART;
-}
+struct kvm_timer_ops lapic_timer_ops = {
+	.is_periodic = lapic_is_periodic,
+};
 
 int kvm_create_lapic(struct kvm_vcpu *vcpu)
 {
@@ -1025,8 +1004,13 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
 	memset(apic->regs, 0, PAGE_SIZE);
 	apic->vcpu = vcpu;
 
-	hrtimer_init(&apic->timer.dev, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
-	apic->timer.dev.function = apic_timer_fn;
+	hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
+		     HRTIMER_MODE_ABS);
+	apic->lapic_timer.timer.function = kvm_timer_fn;
+	apic->lapic_timer.t_ops = &lapic_timer_ops;
+	apic->lapic_timer.kvm = vcpu->kvm;
+	apic->lapic_timer.vcpu_id = vcpu->vcpu_id;
+
 	apic->base_address = APIC_DEFAULT_PHYS_BASE;
 	vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE;
 
@@ -1079,9 +1063,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
-	if (apic && atomic_read(&apic->timer.pending) > 0) {
+	if (apic && atomic_read(&apic->lapic_timer.pending) > 0) {
 		if (kvm_apic_local_deliver(apic, APIC_LVTT))
-			atomic_dec(&apic->timer.pending);
+			atomic_dec(&apic->lapic_timer.pending);
 	}
 }
 
@@ -1107,7 +1091,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
 			     MSR_IA32_APICBASE_BASE;
 	apic_set_reg(apic, APIC_LVR, APIC_VERSION);
 	apic_update_ppr(apic);
-	hrtimer_cancel(&apic->timer.dev);
+	hrtimer_cancel(&apic->lapic_timer.timer);
 	update_divide_count(apic);
 	start_apic_timer(apic);
 }
@@ -1120,7 +1104,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 	if (!apic)
 		return;
 
-	timer = &apic->timer.dev;
+	timer = &apic->lapic_timer.timer;
 	if (hrtimer_cancel(timer))
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 45ab6ee71209f..2fc0d3c1b1940 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -2,18 +2,15 @@
 #define __KVM_X86_LAPIC_H
 
 #include "iodev.h"
+#include "kvm_timer.h"
 
 #include <linux/kvm_host.h>
 
 struct kvm_lapic {
 	unsigned long base_address;
 	struct kvm_io_device dev;
-	struct {
-		atomic_t pending;
-		s64 period;	/* unit: ns */
-		u32 divide_count;
-		struct hrtimer dev;
-	} timer;
+	struct kvm_timer lapic_timer;
+	u32 divide_count;
 	struct kvm_vcpu *vcpu;
 	struct page *regs_page;
 	void *regs;
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
new file mode 100644
index 0000000000000..86dbac072d0c8
--- /dev/null
+++ b/arch/x86/kvm/timer.c
@@ -0,0 +1,46 @@
+#include <linux/kvm_host.h>
+#include <linux/kvm.h>
+#include <linux/hrtimer.h>
+#include <asm/atomic.h>
+#include "kvm_timer.h"
+
+static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
+{
+	int restart_timer = 0;
+	wait_queue_head_t *q = &vcpu->wq;
+
+	/* FIXME: this code should not know anything about vcpus */
+	if (!atomic_inc_and_test(&ktimer->pending))
+		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+
+	if (!ktimer->reinject)
+		atomic_set(&ktimer->pending, 1);
+
+	if (waitqueue_active(q))
+		wake_up_interruptible(q);
+
+	if (ktimer->t_ops->is_periodic(ktimer)) {
+		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+		restart_timer = 1;
+	}
+
+	return restart_timer;
+}
+
+enum hrtimer_restart kvm_timer_fn(struct hrtimer *data)
+{
+	int restart_timer;
+	struct kvm_vcpu *vcpu;
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+
+	vcpu = ktimer->kvm->vcpus[ktimer->vcpu_id];
+	if (!vcpu)
+		return HRTIMER_NORESTART;
+
+	restart_timer = __kvm_timer_fn(vcpu, ktimer);
+	if (restart_timer)
+		return HRTIMER_RESTART;
+	else
+		return HRTIMER_NORESTART;
+}
+
-- 
GitLab


From 3f5e06f8799adca3e7e1bbafe1cd780a3e69f604 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.zhang@intel.com>
Date: Mon, 2 Mar 2009 22:06:41 -0500
Subject: [PATCH 1722/2198] KVM: ia64: fix compilation error in
 kvm_get_lowest_prio_vcpu

Modify the arg of kvm_get_lowest_prio_vcpu().
Make it consistent with its declaration.

Signed-off-by: Yang Zhang <yang.zhang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d20a5db4c4dde..774f0d78a5815 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1837,7 +1837,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long bitmap)
+				       unsigned long *bitmap)
 {
 	struct kvm_vcpu *lvcpu = kvm->vcpus[0];
 	int i;
-- 
GitLab


From 74a3a8f152053394a016518cc2f2fee216897fa4 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 4 Mar 2009 13:33:02 +0800
Subject: [PATCH 1723/2198] KVM: Merge kvm_ioapic_get_delivery_bitmask into
 kvm_get_intr_delivery_bitmask

Gleb fixed bitmap ops usage in kvm_ioapic_get_delivery_bitmask.

Sheng merged two functions, as well as fixed several issues in
kvm_get_intr_delivery_bitmask
1. deliver_bitmask is a bitmap rather than a unsigned long intereger.
2. Lowest priority target bitmap wrong calculated by mistake.
3. Prevent potential NULL reference.
4. Declaration in include/kvm_host.h caused powerpc compilation warning.
5. Add warning for guest broadcast interrupt with lowest priority delivery mode.
6. Removed duplicate bitmap clean up in caller of kvm_get_intr_delivery_bitmask.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 virt/kvm/ioapic.c   | 46 +++---------------------------------------
 virt/kvm/ioapic.h   |  5 +++--
 virt/kvm/irq_comm.c | 49 +++++++++++++++++++++++++++++++++++++++------
 3 files changed, 49 insertions(+), 51 deletions(-)

diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 7c2cb2bd1199e..ea268a8c37dac 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -161,45 +161,6 @@ static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
 	kvm_vcpu_kick(vcpu);
 }
 
-void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				     u8 dest_mode, unsigned long *mask)
-{
-	int i;
-	struct kvm *kvm = ioapic->kvm;
-	struct kvm_vcpu *vcpu;
-
-	ioapic_debug("dest %d dest_mode %d\n", dest, dest_mode);
-
-	*mask = 0;
-	if (dest_mode == 0) {	/* Physical mode. */
-		if (dest == 0xFF) {	/* Broadcast. */
-			for (i = 0; i < KVM_MAX_VCPUS; ++i)
-				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-					*mask |= 1 << i;
-			return;
-		}
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (kvm_apic_match_physical_addr(vcpu->arch.apic, dest)) {
-				if (vcpu->arch.apic)
-					*mask = 1 << i;
-				break;
-			}
-		}
-	} else if (dest != 0)	/* Logical mode, MDA non-zero. */
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (vcpu->arch.apic &&
-			    kvm_apic_match_logical_addr(vcpu->arch.apic, dest))
-				*mask |= 1 << vcpu->vcpu_id;
-		}
-	ioapic_debug("mask %x\n", *mask);
-}
-
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
 	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
@@ -213,13 +174,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		     entry.fields.delivery_mode, entry.fields.vector,
 		     entry.fields.trig_mode);
 
-	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-
 	/* Always delivery PIT interrupt to vcpu 0 */
 #ifdef CONFIG_X86
-	if (irq == 0)
+	if (irq == 0) {
+                bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 		__set_bit(0, deliver_bitmask);
-	else
+        } else
 #endif
 		kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
 
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index 7275f87a11cd9..c8032ab2a4e22 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,7 +70,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_ioapic_get_delivery_bitmask(struct kvm_ioapic *ioapic, u8 dest,
-				     u8 dest_mode, unsigned long *mask);
+void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+				   union kvm_ioapic_redirect_entry *entry,
+				   unsigned long *deliver_bitmask);
 
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index d165e056f79bb..1c6ff6d1b8422 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -47,15 +47,54 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask)
 {
+	int i;
+	struct kvm *kvm = ioapic->kvm;
 	struct kvm_vcpu *vcpu;
 
-	kvm_ioapic_get_delivery_bitmask(ioapic, entry->fields.dest_id,
-					entry->fields.dest_mode,
-					deliver_bitmask);
+	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+
+	if (entry->fields.dest_mode == 0) {	/* Physical mode. */
+		if (entry->fields.dest_id == 0xFF) {	/* Broadcast. */
+			for (i = 0; i < KVM_MAX_VCPUS; ++i)
+				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
+					__set_bit(i, deliver_bitmask);
+			/* Lowest priority shouldn't combine with broadcast */
+			if (entry->fields.delivery_mode ==
+			    IOAPIC_LOWEST_PRIORITY && printk_ratelimit())
+				printk(KERN_INFO "kvm: apic: phys broadcast "
+						  "and lowest prio\n");
+			return;
+		}
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (kvm_apic_match_physical_addr(vcpu->arch.apic,
+					entry->fields.dest_id)) {
+				if (vcpu->arch.apic)
+					__set_bit(i, deliver_bitmask);
+				break;
+			}
+		}
+	} else if (entry->fields.dest_id != 0) /* Logical mode, MDA non-zero. */
+		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+			vcpu = kvm->vcpus[i];
+			if (!vcpu)
+				continue;
+			if (vcpu->arch.apic &&
+			    kvm_apic_match_logical_addr(vcpu->arch.apic,
+					entry->fields.dest_id))
+				__set_bit(i, deliver_bitmask);
+		}
+
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
+		/* Select one in deliver_bitmask */
 		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
 				entry->fields.vector, deliver_bitmask);
+		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+		if (!vcpu)
+			return;
 		__set_bit(vcpu->vcpu_id, deliver_bitmask);
 		break;
 	case IOAPIC_FIXED:
@@ -65,7 +104,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 		if (printk_ratelimit())
 			printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
 				entry->fields.delivery_mode);
-		*deliver_bitmask = 0;
+		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 	}
 }
 
@@ -80,8 +119,6 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 
 	BUG_ON(!ioapic);
 
-	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-
 	entry.bits = 0;
 	entry.fields.dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
-- 
GitLab


From f5a1e9f89504f57b2b45645a7239dc8a8ddb0f4c Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Thu, 5 Mar 2009 13:12:29 +0100
Subject: [PATCH 1724/2198] KVM: MMU: remove call to kvm_mmu_pte_write from
 walk_addr

There is no reason to update the shadow pte here because the guest pte
is only changed to dirty state.

Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6bd70206c5613..855eb71110490 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -209,7 +209,6 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 		if (ret)
 			goto walk;
 		pte |= PT_DIRTY_MASK;
-		kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte), 0);
 		walker->ptes[walker->level - 1] = pte;
 	}
 
-- 
GitLab


From 6da7e3f643cf7099965d75fda8606b9d3a8650b9 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:44 +0200
Subject: [PATCH 1725/2198] KVM: APIC: kvm_apic_set_irq deliver all kinds of
 interrupts

Get rid of ioapic_inj_irq() and ioapic_inj_nmi() functions.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  1 -
 arch/ia64/kvm/kvm-ia64.c         |  8 +++---
 arch/ia64/kvm/lapic.h            |  2 +-
 arch/x86/kvm/lapic.c             | 47 ++++++++++++++++++++------------
 arch/x86/kvm/lapic.h             |  2 +-
 virt/kvm/ioapic.c                | 40 ++++-----------------------
 virt/kvm/irq_comm.c              |  1 +
 7 files changed, 42 insertions(+), 59 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 4542651e6acb1..5608488dc2dae 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -585,7 +585,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_pal_emul(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run);
 void kvm_sal_emul(struct kvm_vcpu *vcpu);
 
-static inline void kvm_inject_nmi(struct kvm_vcpu *vcpu) {}
 #endif /* __ASSEMBLY__*/
 
 #endif
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 774f0d78a5815..99d6d174d9323 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -292,13 +292,13 @@ static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm,
 {
 	switch (dm) {
 	case SAPIC_FIXED:
-		kvm_apic_set_irq(vcpu, vector, 0);
+		kvm_apic_set_irq(vcpu, vector, dm, 0);
 		break;
 	case SAPIC_NMI:
-		kvm_apic_set_irq(vcpu, 2, 0);
+		kvm_apic_set_irq(vcpu, 2, dm, 0);
 		break;
 	case SAPIC_EXTINT:
-		kvm_apic_set_irq(vcpu, 0, 0);
+		kvm_apic_set_irq(vcpu, 0, dm, 0);
 		break;
 	case SAPIC_INIT:
 	case SAPIC_PMI:
@@ -1813,7 +1813,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	put_cpu();
 }
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
 {
 
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index 6d6cbcb14893c..cbcfaa6195c7a 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -20,6 +20,6 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 27ca43e4e4405..a42f968a23e1c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -196,20 +196,30 @@ int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig)
+static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
+			     int vector, int level, int trig_mode);
+
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
+	int lapic_dmode;
 
-	if (!apic_test_and_set_irr(vec, apic)) {
-		/* a new pending irq is set in IRR */
-		if (trig)
-			apic_set_vector(vec, apic->regs + APIC_TMR);
-		else
-			apic_clear_vector(vec, apic->regs + APIC_TMR);
-		kvm_vcpu_kick(apic->vcpu);
-		return 1;
+	switch (dmode) {
+	case IOAPIC_LOWEST_PRIORITY:
+		lapic_dmode = APIC_DM_LOWEST;
+		break;
+	case IOAPIC_FIXED:
+		lapic_dmode = APIC_DM_FIXED;
+		break;
+	case IOAPIC_NMI:
+		lapic_dmode = APIC_DM_NMI;
+		break;
+	default:
+		printk(KERN_DEBUG"Ignoring delivery mode %d\n", dmode);
+		return 0;
+		break;
 	}
-	return 0;
+	return __apic_accept_irq(apic, lapic_dmode, vec, 1, trig);
 }
 
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -327,7 +337,7 @@ static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode)
 {
-	int orig_irr, result = 0;
+	int result = 0;
 	struct kvm_vcpu *vcpu = apic->vcpu;
 
 	switch (delivery_mode) {
@@ -337,10 +347,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (unlikely(!apic_enabled(apic)))
 			break;
 
-		orig_irr = apic_test_and_set_irr(vector, apic);
-		if (orig_irr && trig_mode) {
-			apic_debug("level trig mode repeatedly for vector %d",
-				   vector);
+		result = !apic_test_and_set_irr(vector, apic);
+		if (!result) {
+			if (trig_mode)
+				apic_debug("level trig mode repeatedly for "
+						"vector %d", vector);
 			break;
 		}
 
@@ -349,10 +360,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			apic_set_vector(vector, apic->regs + APIC_TMR);
 		} else
 			apic_clear_vector(vector, apic->regs + APIC_TMR);
-
 		kvm_vcpu_kick(vcpu);
-
-		result = (orig_irr == 0);
 		break;
 
 	case APIC_DM_REMRD:
@@ -364,12 +372,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		break;
 
 	case APIC_DM_NMI:
+		result = 1;
 		kvm_inject_nmi(vcpu);
 		kvm_vcpu_kick(vcpu);
 		break;
 
 	case APIC_DM_INIT:
 		if (level) {
+			result = 1;
 			if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
 				printk(KERN_DEBUG
 				       "INIT on a runnable vcpu %d\n",
@@ -386,6 +396,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		apic_debug("SIPI to vcpu %d vector 0x%02x\n",
 			   vcpu->vcpu_id, vector);
 		if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
+			result = 1;
 			vcpu->arch.sipi_vector = vector;
 			vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
 			kvm_vcpu_kick(vcpu);
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 2fc0d3c1b1940..1b0e3c03cb34d 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,7 +31,7 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index ea268a8c37dac..d4a7948b010cb 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -142,25 +142,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	}
 }
 
-static int ioapic_inj_irq(struct kvm_ioapic *ioapic,
-			   struct kvm_vcpu *vcpu,
-			   u8 vector, u8 trig_mode, u8 delivery_mode)
-{
-	ioapic_debug("irq %d trig %d deliv %d\n", vector, trig_mode,
-		     delivery_mode);
-
-	ASSERT((delivery_mode == IOAPIC_FIXED) ||
-	       (delivery_mode == IOAPIC_LOWEST_PRIORITY));
-
-	return kvm_apic_set_irq(vcpu, vector, trig_mode);
-}
-
-static void ioapic_inj_nmi(struct kvm_vcpu *vcpu)
-{
-	kvm_inject_nmi(vcpu);
-	kvm_vcpu_kick(vcpu);
-}
-
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
 	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
@@ -193,21 +174,12 @@ static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 		__clear_bit(vcpu_id, deliver_bitmask);
 		vcpu = ioapic->kvm->vcpus[vcpu_id];
 		if (vcpu) {
-			if (entry.fields.delivery_mode ==
-					IOAPIC_LOWEST_PRIORITY ||
-			    entry.fields.delivery_mode == IOAPIC_FIXED) {
-				if (r < 0)
-					r = 0;
-				r += ioapic_inj_irq(ioapic, vcpu,
-						    entry.fields.vector,
-						    entry.fields.trig_mode,
-						    entry.fields.delivery_mode);
-			} else if (entry.fields.delivery_mode == IOAPIC_NMI) {
-				r = 1;
-				ioapic_inj_nmi(vcpu);
-			} else
-				ioapic_debug("unsupported delivery mode %x!\n",
-					     entry.fields.delivery_mode);
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu,
+					entry.fields.vector,
+					entry.fields.trig_mode,
+					entry.fields.delivery_mode);
 		} else
 			ioapic_debug("null destination vcpu: "
 				     "mask=%x vector=%x delivery_mode=%x\n",
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 1c6ff6d1b8422..325c6685f2065 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -148,6 +148,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 			if (r < 0)
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, entry.fields.vector,
+					      entry.fields.dest_mode,
 					      entry.fields.trig_mode);
 		}
 	}
-- 
GitLab


From a53c17d21c46a752f5ac6695376481bc27865b04 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:49 +0200
Subject: [PATCH 1726/2198] KVM: ioapic/msi interrupt delivery consolidation

ioapic_deliver() and kvm_set_msi() have code duplication. Move
the code into ioapic_deliver_entry() function and call it from
both places.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 include/linux/kvm_host.h |  2 +-
 virt/kvm/ioapic.c        | 61 +++++++++++++++++++++-------------------
 virt/kvm/ioapic.h        |  4 +--
 virt/kvm/irq_comm.c      | 32 ++-------------------
 4 files changed, 38 insertions(+), 61 deletions(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 3b91ec9982c25..ec9d078b1e8ec 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -364,7 +364,7 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
 #ifdef __KVM_HAVE_IOAPIC
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask);
 #endif
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index d4a7948b010cb..b71c0442cecf6 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -142,54 +142,57 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	}
 }
 
-static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e)
 {
-	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
 	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-	struct kvm_vcpu *vcpu;
-	int vcpu_id, r = -1;
+	int i, r = -1;
 
-	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
-		     "vector=%x trig_mode=%x\n",
-		     entry.fields.dest, entry.fields.dest_mode,
-		     entry.fields.delivery_mode, entry.fields.vector,
-		     entry.fields.trig_mode);
-
-	/* Always delivery PIT interrupt to vcpu 0 */
-#ifdef CONFIG_X86
-	if (irq == 0) {
-                bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-		__set_bit(0, deliver_bitmask);
-        } else
-#endif
-		kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
+	kvm_get_intr_delivery_bitmask(kvm, e, deliver_bitmask);
 
 	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
 		ioapic_debug("no target on destination\n");
-		return 0;
+		return r;
 	}
 
-	while ((vcpu_id = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
 			< KVM_MAX_VCPUS) {
-		__clear_bit(vcpu_id, deliver_bitmask);
-		vcpu = ioapic->kvm->vcpus[vcpu_id];
+		struct kvm_vcpu *vcpu = kvm->vcpus[i];
+		__clear_bit(i, deliver_bitmask);
 		if (vcpu) {
 			if (r < 0)
 				r = 0;
-			r += kvm_apic_set_irq(vcpu,
-					entry.fields.vector,
-					entry.fields.trig_mode,
-					entry.fields.delivery_mode);
+			r += kvm_apic_set_irq(vcpu, e->fields.vector,
+					e->fields.delivery_mode,
+					e->fields.trig_mode);
 		} else
 			ioapic_debug("null destination vcpu: "
 				     "mask=%x vector=%x delivery_mode=%x\n",
-				     entry.fields.deliver_bitmask,
-				     entry.fields.vector,
-				     entry.fields.delivery_mode);
+				     e->fields.deliver_bitmask,
+				     e->fields.vector, e->fields.delivery_mode);
 	}
 	return r;
 }
 
+static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
+{
+	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
+
+	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
+		     "vector=%x trig_mode=%x\n",
+		     entry.fields.dest, entry.fields.dest_mode,
+		     entry.fields.delivery_mode, entry.fields.vector,
+		     entry.fields.trig_mode);
+
+#ifdef CONFIG_X86
+	/* Always delivery PIT interrupt to vcpu 0 */
+	if (irq == 0) {
+		entry.fields.dest_mode = 0; /* Physical mode. */
+		entry.fields.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
+	}
+#endif
+	return ioapic_deliver_entry(ioapic->kvm, &entry);
+}
+
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
 {
 	u32 old_irr = ioapic->irr;
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index c8032ab2a4e22..bedeea59cc1c3 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -70,8 +70,8 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask);
-
+int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e);
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 325c6685f2065..35397a569b249 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -43,12 +43,11 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
-void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
 				   union kvm_ioapic_redirect_entry *entry,
 				   unsigned long *deliver_bitmask)
 {
 	int i;
-	struct kvm *kvm = ioapic->kvm;
 	struct kvm_vcpu *vcpu;
 
 	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
@@ -90,7 +89,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 	switch (entry->fields.delivery_mode) {
 	case IOAPIC_LOWEST_PRIORITY:
 		/* Select one in deliver_bitmask */
-		vcpu = kvm_get_lowest_prio_vcpu(ioapic->kvm,
+		vcpu = kvm_get_lowest_prio_vcpu(kvm,
 				entry->fields.vector, deliver_bitmask);
 		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 		if (!vcpu)
@@ -111,13 +110,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		       struct kvm *kvm, int level)
 {
-	int vcpu_id, r = -1;
-	struct kvm_vcpu *vcpu;
-	struct kvm_ioapic *ioapic = ioapic_irqchip(kvm);
 	union kvm_ioapic_redirect_entry entry;
-	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-
-	BUG_ON(!ioapic);
 
 	entry.bits = 0;
 	entry.fields.dest_id = (e->msi.address_lo &
@@ -133,26 +126,7 @@ static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 			(unsigned long *)&e->msi.data);
 
 	/* TODO Deal with RH bit of MSI message address */
-
-	kvm_get_intr_delivery_bitmask(ioapic, &entry, deliver_bitmask);
-
-	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
-		printk(KERN_WARNING "kvm: no destination for MSI delivery!");
-		return -1;
-	}
-	while ((vcpu_id = find_first_bit(deliver_bitmask,
-					KVM_MAX_VCPUS)) < KVM_MAX_VCPUS) {
-		__clear_bit(vcpu_id, deliver_bitmask);
-		vcpu = ioapic->kvm->vcpus[vcpu_id];
-		if (vcpu) {
-			if (r < 0)
-				r = 0;
-			r += kvm_apic_set_irq(vcpu, entry.fields.vector,
-					      entry.fields.dest_mode,
-					      entry.fields.trig_mode);
-		}
-	}
-	return r;
+	return ioapic_deliver_entry(kvm, &entry);
 }
 
 /* This should be called with the kvm->lock mutex held
-- 
GitLab


From 343f94fe4d16ec898da77720c03da9e09f8523d2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:54 +0200
Subject: [PATCH 1727/2198] KVM: consolidate ioapic/ipi interrupt delivery
 logic

Use kvm_apic_match_dest() in kvm_get_intr_delivery_bitmask() instead
of duplicating the same code. Use kvm_get_intr_delivery_bitmask() in
apic_send_ipi() to figure out ipi destination instead of reimplementing
the logic.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c |  8 +++++
 arch/ia64/kvm/lapic.h    |  3 ++
 arch/x86/kvm/lapic.c     | 69 ++++++++++++++-----------------------
 arch/x86/kvm/lapic.h     |  2 ++
 include/linux/kvm_host.h |  5 ---
 virt/kvm/ioapic.c        |  5 ++-
 virt/kvm/ioapic.h        | 10 +++---
 virt/kvm/irq_comm.c      | 74 ++++++++++++----------------------------
 8 files changed, 70 insertions(+), 106 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 99d6d174d9323..8eea9cba7b7c4 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1852,6 +1852,14 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
 	return lvcpu;
 }
 
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode)
+{
+	return (dest_mode == 0) ?
+		kvm_apic_match_physical_addr(target, dest) :
+		kvm_apic_match_logical_addr(target, dest);
+}
+
 static int find_highest_bits(int *dat)
 {
 	u32  bits, bitnum;
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index cbcfaa6195c7a..31602e7338d7b 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -20,6 +20,9 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index a42f968a23e1c..998862a3c267e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -260,7 +260,7 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
 {
-	return kvm_apic_id(apic) == dest;
+	return dest == 0xff || kvm_apic_id(apic) == dest;
 }
 
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
@@ -289,37 +289,34 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 	return result;
 }
 
-static int apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, int dest, int dest_mode)
 {
 	int result = 0;
 	struct kvm_lapic *target = vcpu->arch.apic;
 
 	apic_debug("target %p, source %p, dest 0x%x, "
-		   "dest_mode 0x%x, short_hand 0x%x",
+		   "dest_mode 0x%x, short_hand 0x%x\n",
 		   target, source, dest, dest_mode, short_hand);
 
 	ASSERT(!target);
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
-		if (dest_mode == 0) {
+		if (dest_mode == 0)
 			/* Physical mode. */
-			if ((dest == 0xFF) || (dest == kvm_apic_id(target)))
-				result = 1;
-		} else
+			result = kvm_apic_match_physical_addr(target, dest);
+		else
 			/* Logical mode. */
 			result = kvm_apic_match_logical_addr(target, dest);
 		break;
 	case APIC_DEST_SELF:
-		if (target == source)
-			result = 1;
+		result = (target == source);
 		break;
 	case APIC_DEST_ALLINC:
 		result = 1;
 		break;
 	case APIC_DEST_ALLBUT:
-		if (target != source)
-			result = 1;
+		result = (target != source);
 		break;
 	default:
 		printk(KERN_WARNING "Bad dest shorthand value %x\n",
@@ -492,38 +489,26 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
 	unsigned int vector = icr_low & APIC_VECTOR_MASK;
 
-	struct kvm_vcpu *target;
-	struct kvm_vcpu *vcpu;
-	DECLARE_BITMAP(lpr_map, KVM_MAX_VCPUS);
+	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 	int i;
 
-	bitmap_zero(lpr_map, KVM_MAX_VCPUS);
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
 		   icr_high, icr_low, short_hand, dest,
 		   trig_mode, level, dest_mode, delivery_mode, vector);
 
-	for (i = 0; i < KVM_MAX_VCPUS; i++) {
-		vcpu = apic->vcpu->kvm->vcpus[i];
-		if (!vcpu)
-			continue;
-
-		if (vcpu->arch.apic &&
-		    apic_match_dest(vcpu, apic, short_hand, dest, dest_mode)) {
-			if (delivery_mode == APIC_DM_LOWEST)
-				__set_bit(vcpu->vcpu_id, lpr_map);
-			else
-				__apic_accept_irq(vcpu->arch.apic, delivery_mode,
-						  vector, level, trig_mode);
-		}
-	}
-
-	if (delivery_mode == APIC_DM_LOWEST) {
-		target = kvm_get_lowest_prio_vcpu(vcpu->kvm, vector, lpr_map);
-		if (target != NULL)
-			__apic_accept_irq(target->arch.apic, delivery_mode,
-					  vector, level, trig_mode);
+	kvm_get_intr_delivery_bitmask(apic->vcpu->kvm, apic, dest, dest_mode,
+			delivery_mode == APIC_DM_LOWEST, short_hand,
+			deliver_bitmask);
+
+	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
+			< KVM_MAX_VCPUS) {
+		struct kvm_vcpu *vcpu = apic->vcpu->kvm->vcpus[i];
+		__clear_bit(i, deliver_bitmask);
+		if (vcpu)
+			__apic_accept_irq(vcpu->arch.apic, delivery_mode,
+					vector, level, trig_mode);
 	}
 }
 
@@ -930,16 +915,14 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_reset);
 
-int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+bool kvm_apic_present(struct kvm_vcpu *vcpu)
 {
-	struct kvm_lapic *apic = vcpu->arch.apic;
-	int ret = 0;
-
-	if (!apic)
-		return 0;
-	ret = apic_enabled(apic);
+	return vcpu->arch.apic && apic_hw_enabled(vcpu->arch.apic);
+}
 
-	return ret;
+int kvm_lapic_enabled(struct kvm_vcpu *vcpu)
+{
+	return kvm_apic_present(vcpu) && apic_sw_enabled(vcpu->arch.apic);
 }
 EXPORT_SYMBOL_GPL(kvm_lapic_enabled);
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 1b0e3c03cb34d..b66dc14a96982 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -37,6 +37,8 @@ u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
+bool kvm_apic_present(struct kvm_vcpu *vcpu);
+bool kvm_lapic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ec9d078b1e8ec..fb60f31c4fb3e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -363,11 +363,6 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
 void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
 
-#ifdef __KVM_HAVE_IOAPIC
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
-				   union kvm_ioapic_redirect_entry *entry,
-				   unsigned long *deliver_bitmask);
-#endif
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, int irq, int level);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index b71c0442cecf6..43969bbf127f9 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -147,7 +147,10 @@ int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e)
 	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
 	int i, r = -1;
 
-	kvm_get_intr_delivery_bitmask(kvm, e, deliver_bitmask);
+	kvm_get_intr_delivery_bitmask(kvm, NULL, e->fields.dest_id,
+			e->fields.dest_mode,
+			e->fields.delivery_mode == IOAPIC_LOWEST_PRIORITY,
+			0, deliver_bitmask);
 
 	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
 		ioapic_debug("no target on destination\n");
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index bedeea59cc1c3..d996c7abc4663 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -65,13 +65,15 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 }
 
 struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long *bitmap);
+		unsigned long *bitmap);
+int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
+		int short_hand, int dest, int dest_mode);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
-				   union kvm_ioapic_redirect_entry *entry,
-				   unsigned long *deliver_bitmask);
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
+		int dest_id, int dest_mode, bool low_prio, int short_hand,
+		unsigned long *deliver_bitmask);
 int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e);
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 35397a569b249..e43701c0a5c43 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -43,67 +43,35 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm,
-				   union kvm_ioapic_redirect_entry *entry,
-				   unsigned long *deliver_bitmask)
+void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
+		int dest_id, int dest_mode, bool low_prio, int short_hand,
+		unsigned long *deliver_bitmask)
 {
 	int i;
 	struct kvm_vcpu *vcpu;
 
+	if (dest_mode == 0 && dest_id == 0xff && low_prio)
+		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
+
 	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+	for (i = 0; i < KVM_MAX_VCPUS; i++) {
+		vcpu = kvm->vcpus[i];
 
-	if (entry->fields.dest_mode == 0) {	/* Physical mode. */
-		if (entry->fields.dest_id == 0xFF) {	/* Broadcast. */
-			for (i = 0; i < KVM_MAX_VCPUS; ++i)
-				if (kvm->vcpus[i] && kvm->vcpus[i]->arch.apic)
-					__set_bit(i, deliver_bitmask);
-			/* Lowest priority shouldn't combine with broadcast */
-			if (entry->fields.delivery_mode ==
-			    IOAPIC_LOWEST_PRIORITY && printk_ratelimit())
-				printk(KERN_INFO "kvm: apic: phys broadcast "
-						  "and lowest prio\n");
-			return;
-		}
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (kvm_apic_match_physical_addr(vcpu->arch.apic,
-					entry->fields.dest_id)) {
-				if (vcpu->arch.apic)
-					__set_bit(i, deliver_bitmask);
-				break;
-			}
-		}
-	} else if (entry->fields.dest_id != 0) /* Logical mode, MDA non-zero. */
-		for (i = 0; i < KVM_MAX_VCPUS; ++i) {
-			vcpu = kvm->vcpus[i];
-			if (!vcpu)
-				continue;
-			if (vcpu->arch.apic &&
-			    kvm_apic_match_logical_addr(vcpu->arch.apic,
-					entry->fields.dest_id))
-				__set_bit(i, deliver_bitmask);
-		}
+		if (!vcpu || !kvm_apic_present(vcpu))
+			continue;
 
-	switch (entry->fields.delivery_mode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		/* Select one in deliver_bitmask */
-		vcpu = kvm_get_lowest_prio_vcpu(kvm,
-				entry->fields.vector, deliver_bitmask);
-		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-		if (!vcpu)
-			return;
-		__set_bit(vcpu->vcpu_id, deliver_bitmask);
-		break;
-	case IOAPIC_FIXED:
-	case IOAPIC_NMI:
-		break;
-	default:
-		if (printk_ratelimit())
-			printk(KERN_INFO "kvm: unsupported delivery mode %d\n",
-				entry->fields.delivery_mode);
+		if (!kvm_apic_match_dest(vcpu, src, short_hand, dest_id,
+					dest_mode))
+			continue;
+
+		__set_bit(i, deliver_bitmask);
+	}
+
+	if (low_prio) {
+		vcpu = kvm_get_lowest_prio_vcpu(kvm, 0, deliver_bitmask);
 		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
+		if (vcpu)
+			__set_bit(vcpu->vcpu_id, deliver_bitmask);
 	}
 }
 
-- 
GitLab


From e1035715ef8d3171e29f9c6aee6f40d57b3fead5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:34:59 +0200
Subject: [PATCH 1728/2198] KVM: change the way how lowest priority vcpu is
 calculated

The new way does not require additional loop over vcpus to calculate
the one with lowest priority as one is chosen during delivery bitmap
construction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c        | 15 ++----------
 arch/ia64/kvm/lapic.h           |  1 +
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/lapic.c            | 43 +++++----------------------------
 virt/kvm/ioapic.h               |  3 +--
 virt/kvm/irq_comm.c             | 19 +++++++++------
 6 files changed, 22 insertions(+), 61 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 8eea9cba7b7c4..1887a93a2bd5f 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1836,20 +1836,9 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda)
 	return 0;
 }
 
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-				       unsigned long *bitmap)
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 {
-	struct kvm_vcpu *lvcpu = kvm->vcpus[0];
-	int i;
-
-	for (i = 1; i < kvm->arch.online_vcpus; i++) {
-		if (!kvm->vcpus[i])
-			continue;
-		if (lvcpu->arch.xtp > kvm->vcpus[i]->arch.xtp)
-			lvcpu = kvm->vcpus[i];
-	}
-
-	return lvcpu;
+	return vcpu1->arch.xtp - vcpu2->arch.xtp;
 }
 
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index 31602e7338d7b..e42109e6ca470 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -22,6 +22,7 @@ int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
 
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f0faf58044ff6..46276273a1a16 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -286,6 +286,7 @@ struct kvm_vcpu_arch {
 	u64 shadow_efer;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	int32_t apic_arb_prio;
 	int mp_state;
 	int sipi_vector;
 	u64 ia32_misc_enable_msr;
@@ -400,7 +401,6 @@ struct kvm_arch{
 	struct hlist_head irq_ack_notifier_list;
 	int vapics_in_nmi_mode;
 
-	int round_robin_prev_vcpu;
 	unsigned int tss_addr;
 	struct page *apic_access_page;
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 998862a3c267e..814466f455d9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -338,8 +338,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	struct kvm_vcpu *vcpu = apic->vcpu;
 
 	switch (delivery_mode) {
-	case APIC_DM_FIXED:
 	case APIC_DM_LOWEST:
+		vcpu->arch.apic_arb_prio++;
+	case APIC_DM_FIXED:
 		/* FIXME add logic for vcpu on reset */
 		if (unlikely(!apic_enabled(apic)))
 			break;
@@ -416,43 +417,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	return result;
 }
 
-static struct kvm_lapic *kvm_apic_round_robin(struct kvm *kvm, u8 vector,
-				       unsigned long *bitmap)
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 {
-	int last;
-	int next;
-	struct kvm_lapic *apic = NULL;
-
-	last = kvm->arch.round_robin_prev_vcpu;
-	next = last;
-
-	do {
-		if (++next == KVM_MAX_VCPUS)
-			next = 0;
-		if (kvm->vcpus[next] == NULL || !test_bit(next, bitmap))
-			continue;
-		apic = kvm->vcpus[next]->arch.apic;
-		if (apic && apic_enabled(apic))
-			break;
-		apic = NULL;
-	} while (next != last);
-	kvm->arch.round_robin_prev_vcpu = next;
-
-	if (!apic)
-		printk(KERN_DEBUG "vcpu not ready for apic_round_robin\n");
-
-	return apic;
-}
-
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-		unsigned long *bitmap)
-{
-	struct kvm_lapic *apic;
-
-	apic = kvm_apic_round_robin(kvm, vector, bitmap);
-	if (apic)
-		return apic->vcpu;
-	return NULL;
+	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 
 static void apic_set_eoi(struct kvm_lapic *apic)
@@ -908,6 +875,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	apic_update_ppr(apic);
 
+	vcpu->arch.apic_arb_prio = 0;
+
 	apic_debug(KERN_INFO "%s: vcpu=%p, id=%d, base_msr="
 		   "0x%016" PRIx64 ", base_address=0x%0lx.\n", __func__,
 		   vcpu, kvm_apic_id(apic),
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index d996c7abc4663..e7bc92d895ff8 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -64,10 +64,9 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 }
 
-struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
-		unsigned long *bitmap);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
+int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index e43701c0a5c43..f5e059b67cd4e 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -47,7 +47,7 @@ void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
 		int dest_id, int dest_mode, bool low_prio, int short_hand,
 		unsigned long *deliver_bitmask)
 {
-	int i;
+	int i, lowest = -1;
 	struct kvm_vcpu *vcpu;
 
 	if (dest_mode == 0 && dest_id == 0xff && low_prio)
@@ -64,15 +64,18 @@ void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
 					dest_mode))
 			continue;
 
-		__set_bit(i, deliver_bitmask);
+		if (!low_prio) {
+			__set_bit(i, deliver_bitmask);
+		} else {
+			if (lowest < 0)
+				lowest = i;
+			if (kvm_apic_compare_prio(vcpu, kvm->vcpus[lowest]) < 0)
+				lowest = i;
+		}
 	}
 
-	if (low_prio) {
-		vcpu = kvm_get_lowest_prio_vcpu(kvm, 0, deliver_bitmask);
-		bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
-		if (vcpu)
-			__set_bit(vcpu->vcpu_id, deliver_bitmask);
-	}
+	if (lowest != -1)
+		__set_bit(lowest, deliver_bitmask);
 }
 
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
-- 
GitLab


From 58c2dde17d6eb6c8c0566e52d184aa16755d890f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 5 Mar 2009 16:35:04 +0200
Subject: [PATCH 1729/2198] KVM: APIC: get rid of deliver_bitmask

Deliver interrupt during destination matching loop.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c  | 33 ++++++++++--------
 arch/ia64/kvm/lapic.h     |  4 +--
 arch/x86/kvm/lapic.c      | 59 +++++++++-----------------------
 arch/x86/kvm/lapic.h      |  3 +-
 include/linux/kvm_types.h | 10 ++++++
 virt/kvm/ioapic.c         | 57 +++++++++----------------------
 virt/kvm/ioapic.h         |  6 ++--
 virt/kvm/irq_comm.c       | 71 +++++++++++++++++++++++----------------
 8 files changed, 108 insertions(+), 135 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 1887a93a2bd5f..acf43ec42704b 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -283,6 +283,18 @@ static int handle_sal_call(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 }
 
+static int __apic_accept_irq(struct kvm_vcpu *vcpu, uint64_t vector)
+{
+	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
+
+	if (!test_and_set_bit(vector, &vpd->irr[0])) {
+		vcpu->arch.irq_new_pending = 1;
+		kvm_vcpu_kick(vcpu);
+		return 1;
+	}
+	return 0;
+}
+
 /*
  *  offset: address offset to IPI space.
  *  value:  deliver value.
@@ -292,20 +304,20 @@ static void vcpu_deliver_ipi(struct kvm_vcpu *vcpu, uint64_t dm,
 {
 	switch (dm) {
 	case SAPIC_FIXED:
-		kvm_apic_set_irq(vcpu, vector, dm, 0);
 		break;
 	case SAPIC_NMI:
-		kvm_apic_set_irq(vcpu, 2, dm, 0);
+		vector = 2;
 		break;
 	case SAPIC_EXTINT:
-		kvm_apic_set_irq(vcpu, 0, dm, 0);
+		vector = 0;
 		break;
 	case SAPIC_INIT:
 	case SAPIC_PMI:
 	default:
 		printk(KERN_ERR"kvm: Unimplemented Deliver reserved IPI!\n");
-		break;
+		return;
 	}
+	__apic_accept_irq(vcpu, vector);
 }
 
 static struct kvm_vcpu *lid_to_vcpu(struct kvm *kvm, unsigned long id,
@@ -1813,17 +1825,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	put_cpu();
 }
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 {
-
-	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
-
-	if (!test_and_set_bit(vec, &vpd->irr[0])) {
-		vcpu->arch.irq_new_pending = 1;
-		kvm_vcpu_kick(vcpu);
-		return 1;
-	}
-	return 0;
+	return __apic_accept_irq(vcpu, irq->vector);
 }
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest)
@@ -1844,6 +1848,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode)
 {
+	struct kvm_lapic *target = vcpu->arch.apic;
 	return (dest_mode == 0) ?
 		kvm_apic_match_physical_addr(target, dest) :
 		kvm_apic_match_logical_addr(target, dest);
diff --git a/arch/ia64/kvm/lapic.h b/arch/ia64/kvm/lapic.h
index e42109e6ca470..ee541cebcd78a 100644
--- a/arch/ia64/kvm/lapic.h
+++ b/arch/ia64/kvm/lapic.h
@@ -23,7 +23,7 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
 int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
-bool kvm_apic_present(struct kvm_vcpu *vcpu);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
+#define kvm_apic_present(x) (true)
 
 #endif
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 814466f455d9c..dd934d27040f6 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -199,27 +199,12 @@ EXPORT_SYMBOL_GPL(kvm_lapic_find_highest_irr);
 static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 			     int vector, int level, int trig_mode);
 
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig)
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
-	int lapic_dmode;
 
-	switch (dmode) {
-	case IOAPIC_LOWEST_PRIORITY:
-		lapic_dmode = APIC_DM_LOWEST;
-		break;
-	case IOAPIC_FIXED:
-		lapic_dmode = APIC_DM_FIXED;
-		break;
-	case IOAPIC_NMI:
-		lapic_dmode = APIC_DM_NMI;
-		break;
-	default:
-		printk(KERN_DEBUG"Ignoring delivery mode %d\n", dmode);
-		return 0;
-		break;
-	}
-	return __apic_accept_irq(apic, lapic_dmode, vec, 1, trig);
+	return __apic_accept_irq(apic, irq->delivery_mode, irq->vector,
+			irq->level, irq->trig_mode);
 }
 
 static inline int apic_find_highest_isr(struct kvm_lapic *apic)
@@ -447,36 +432,24 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 {
 	u32 icr_low = apic_get_reg(apic, APIC_ICR);
 	u32 icr_high = apic_get_reg(apic, APIC_ICR2);
+	struct kvm_lapic_irq irq;
 
-	unsigned int dest = GET_APIC_DEST_FIELD(icr_high);
-	unsigned int short_hand = icr_low & APIC_SHORT_MASK;
-	unsigned int trig_mode = icr_low & APIC_INT_LEVELTRIG;
-	unsigned int level = icr_low & APIC_INT_ASSERT;
-	unsigned int dest_mode = icr_low & APIC_DEST_MASK;
-	unsigned int delivery_mode = icr_low & APIC_MODE_MASK;
-	unsigned int vector = icr_low & APIC_VECTOR_MASK;
-
-	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-	int i;
+	irq.vector = icr_low & APIC_VECTOR_MASK;
+	irq.delivery_mode = icr_low & APIC_MODE_MASK;
+	irq.dest_mode = icr_low & APIC_DEST_MASK;
+	irq.level = icr_low & APIC_INT_ASSERT;
+	irq.trig_mode = icr_low & APIC_INT_LEVELTRIG;
+	irq.shorthand = icr_low & APIC_SHORT_MASK;
+	irq.dest_id = GET_APIC_DEST_FIELD(icr_high);
 
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
-		   icr_high, icr_low, short_hand, dest,
-		   trig_mode, level, dest_mode, delivery_mode, vector);
-
-	kvm_get_intr_delivery_bitmask(apic->vcpu->kvm, apic, dest, dest_mode,
-			delivery_mode == APIC_DM_LOWEST, short_hand,
-			deliver_bitmask);
-
-	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
-			< KVM_MAX_VCPUS) {
-		struct kvm_vcpu *vcpu = apic->vcpu->kvm->vcpus[i];
-		__clear_bit(i, deliver_bitmask);
-		if (vcpu)
-			__apic_accept_irq(vcpu->arch.apic, delivery_mode,
-					vector, level, trig_mode);
-	}
+		   icr_high, icr_low, irq.shorthand, irq.dest,
+		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
+		   irq.vector);
+
+	kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
 }
 
 static u32 apic_get_tmcct(struct kvm_lapic *apic)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index b66dc14a96982..a587f8349c460 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,14 +31,13 @@ u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 
 int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
 int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
-int kvm_apic_set_irq(struct kvm_vcpu *vcpu, u8 vec, u8 dmode, u8 trig);
+int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq);
 
 u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
 void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data);
 void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu);
 int kvm_lapic_enabled(struct kvm_vcpu *vcpu);
 bool kvm_apic_present(struct kvm_vcpu *vcpu);
-bool kvm_lapic_present(struct kvm_vcpu *vcpu);
 int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
 
 void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index b84aca3c4ad1a..fb46efbeabec6 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -57,4 +57,14 @@ union kvm_ioapic_redirect_entry {
 	} fields;
 };
 
+struct kvm_lapic_irq {
+	u32 vector;
+	u32 delivery_mode;
+	u32 dest_mode;
+	u32 level;
+	u32 trig_mode;
+	u32 shorthand;
+	u32 dest_id;
+};
+
 #endif /* __KVM_TYPES_H__ */
diff --git a/virt/kvm/ioapic.c b/virt/kvm/ioapic.c
index 43969bbf127f9..1eddae94bab33 100644
--- a/virt/kvm/ioapic.c
+++ b/virt/kvm/ioapic.c
@@ -142,58 +142,33 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 	}
 }
 
-int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e)
-{
-	DECLARE_BITMAP(deliver_bitmask, KVM_MAX_VCPUS);
-	int i, r = -1;
-
-	kvm_get_intr_delivery_bitmask(kvm, NULL, e->fields.dest_id,
-			e->fields.dest_mode,
-			e->fields.delivery_mode == IOAPIC_LOWEST_PRIORITY,
-			0, deliver_bitmask);
-
-	if (find_first_bit(deliver_bitmask, KVM_MAX_VCPUS) >= KVM_MAX_VCPUS) {
-		ioapic_debug("no target on destination\n");
-		return r;
-	}
-
-	while ((i = find_first_bit(deliver_bitmask, KVM_MAX_VCPUS))
-			< KVM_MAX_VCPUS) {
-		struct kvm_vcpu *vcpu = kvm->vcpus[i];
-		__clear_bit(i, deliver_bitmask);
-		if (vcpu) {
-			if (r < 0)
-				r = 0;
-			r += kvm_apic_set_irq(vcpu, e->fields.vector,
-					e->fields.delivery_mode,
-					e->fields.trig_mode);
-		} else
-			ioapic_debug("null destination vcpu: "
-				     "mask=%x vector=%x delivery_mode=%x\n",
-				     e->fields.deliver_bitmask,
-				     e->fields.vector, e->fields.delivery_mode);
-	}
-	return r;
-}
-
 static int ioapic_deliver(struct kvm_ioapic *ioapic, int irq)
 {
-	union kvm_ioapic_redirect_entry entry = ioapic->redirtbl[irq];
+	union kvm_ioapic_redirect_entry *entry = &ioapic->redirtbl[irq];
+	struct kvm_lapic_irq irqe;
 
 	ioapic_debug("dest=%x dest_mode=%x delivery_mode=%x "
 		     "vector=%x trig_mode=%x\n",
-		     entry.fields.dest, entry.fields.dest_mode,
-		     entry.fields.delivery_mode, entry.fields.vector,
-		     entry.fields.trig_mode);
+		     entry->fields.dest, entry->fields.dest_mode,
+		     entry->fields.delivery_mode, entry->fields.vector,
+		     entry->fields.trig_mode);
+
+	irqe.dest_id = entry->fields.dest_id;
+	irqe.vector = entry->fields.vector;
+	irqe.dest_mode = entry->fields.dest_mode;
+	irqe.trig_mode = entry->fields.trig_mode;
+	irqe.delivery_mode = entry->fields.delivery_mode << 8;
+	irqe.level = 1;
+	irqe.shorthand = 0;
 
 #ifdef CONFIG_X86
 	/* Always delivery PIT interrupt to vcpu 0 */
 	if (irq == 0) {
-		entry.fields.dest_mode = 0; /* Physical mode. */
-		entry.fields.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
+		irqe.dest_mode = 0; /* Physical mode. */
+		irqe.dest_id = ioapic->kvm->vcpus[0]->vcpu_id;
 	}
 #endif
-	return ioapic_deliver_entry(ioapic->kvm, &entry);
+	return kvm_irq_delivery_to_apic(ioapic->kvm, NULL, &irqe);
 }
 
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level)
diff --git a/virt/kvm/ioapic.h b/virt/kvm/ioapic.h
index e7bc92d895ff8..7080b713c1603 100644
--- a/virt/kvm/ioapic.h
+++ b/virt/kvm/ioapic.h
@@ -71,8 +71,6 @@ void kvm_ioapic_update_eoi(struct kvm *kvm, int vector, int trigger_mode);
 int kvm_ioapic_init(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int level);
 void kvm_ioapic_reset(struct kvm_ioapic *ioapic);
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
-		int dest_id, int dest_mode, bool low_prio, int short_hand,
-		unsigned long *deliver_bitmask);
-int ioapic_deliver_entry(struct kvm *kvm, union kvm_ioapic_redirect_entry *e);
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq);
 #endif
diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index f5e059b67cd4e..4fa1f604b4250 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -22,6 +22,9 @@
 #include <linux/kvm_host.h>
 
 #include <asm/msidef.h>
+#ifdef CONFIG_IA64
+#include <asm/iosapic.h>
+#endif
 
 #include "irq.h"
 
@@ -43,61 +46,71 @@ static int kvm_set_ioapic_irq(struct kvm_kernel_irq_routing_entry *e,
 	return kvm_ioapic_set_irq(kvm->arch.vioapic, e->irqchip.pin, level);
 }
 
-void kvm_get_intr_delivery_bitmask(struct kvm *kvm, struct kvm_lapic *src,
-		int dest_id, int dest_mode, bool low_prio, int short_hand,
-		unsigned long *deliver_bitmask)
+inline static bool kvm_is_dm_lowest_prio(struct kvm_lapic_irq *irq)
 {
-	int i, lowest = -1;
-	struct kvm_vcpu *vcpu;
+#ifdef CONFIG_IA64
+	return irq->delivery_mode ==
+		(IOSAPIC_LOWEST_PRIORITY << IOSAPIC_DELIVERY_SHIFT);
+#else
+	return irq->delivery_mode == APIC_DM_LOWEST;
+#endif
+}
 
-	if (dest_mode == 0 && dest_id == 0xff && low_prio)
+int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
+		struct kvm_lapic_irq *irq)
+{
+	int i, r = -1;
+	struct kvm_vcpu *vcpu, *lowest = NULL;
+
+	if (irq->dest_mode == 0 && irq->dest_id == 0xff &&
+			kvm_is_dm_lowest_prio(irq))
 		printk(KERN_INFO "kvm: apic: phys broadcast and lowest prio\n");
 
-	bitmap_zero(deliver_bitmask, KVM_MAX_VCPUS);
 	for (i = 0; i < KVM_MAX_VCPUS; i++) {
 		vcpu = kvm->vcpus[i];
 
 		if (!vcpu || !kvm_apic_present(vcpu))
 			continue;
 
-		if (!kvm_apic_match_dest(vcpu, src, short_hand, dest_id,
-					dest_mode))
+		if (!kvm_apic_match_dest(vcpu, src, irq->shorthand,
+					irq->dest_id, irq->dest_mode))
 			continue;
 
-		if (!low_prio) {
-			__set_bit(i, deliver_bitmask);
+		if (!kvm_is_dm_lowest_prio(irq)) {
+			if (r < 0)
+				r = 0;
+			r += kvm_apic_set_irq(vcpu, irq);
 		} else {
-			if (lowest < 0)
-				lowest = i;
-			if (kvm_apic_compare_prio(vcpu, kvm->vcpus[lowest]) < 0)
-				lowest = i;
+			if (!lowest)
+				lowest = vcpu;
+			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
+				lowest = vcpu;
 		}
 	}
 
-	if (lowest != -1)
-		__set_bit(lowest, deliver_bitmask);
+	if (lowest)
+		r = kvm_apic_set_irq(lowest, irq);
+
+	return r;
 }
 
 static int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		       struct kvm *kvm, int level)
 {
-	union kvm_ioapic_redirect_entry entry;
+	struct kvm_lapic_irq irq;
 
-	entry.bits = 0;
-	entry.fields.dest_id = (e->msi.address_lo &
+	irq.dest_id = (e->msi.address_lo &
 			MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
-	entry.fields.vector = (e->msi.data &
+	irq.vector = (e->msi.data &
 			MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
-	entry.fields.dest_mode = test_bit(MSI_ADDR_DEST_MODE_SHIFT,
-			(unsigned long *)&e->msi.address_lo);
-	entry.fields.trig_mode = test_bit(MSI_DATA_TRIGGER_SHIFT,
-			(unsigned long *)&e->msi.data);
-	entry.fields.delivery_mode = test_bit(
-			MSI_DATA_DELIVERY_MODE_SHIFT,
-			(unsigned long *)&e->msi.data);
+	irq.dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
+	irq.trig_mode = (1 << MSI_DATA_TRIGGER_SHIFT) & e->msi.data;
+	irq.delivery_mode = e->msi.data & 0x700;
+	irq.level = 1;
+	irq.shorthand = 0;
 
 	/* TODO Deal with RH bit of MSI message address */
-	return ioapic_deliver_entry(kvm, &entry);
+	return kvm_irq_delivery_to_apic(kvm, NULL, &irq);
 }
 
 /* This should be called with the kvm->lock mutex held
-- 
GitLab


From 0c72ea7fb8a39b4bba071b19f5f835af5b5e538a Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:52 -0600
Subject: [PATCH 1730/2198] KVM: ia64: Map in SN2 RTC registers to the VMM
 module

On SN2, map in the SN2 RTC registers to the VMM module, needed for ITC
emulation.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  2 ++
 arch/ia64/include/asm/pgtable.h  |  2 ++
 arch/ia64/kvm/kvm-ia64.c         | 43 ++++++++++++++++++++++++++++----
 3 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 5608488dc2dae..12439956551bc 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -371,6 +371,7 @@ struct kvm_vcpu_arch {
 	int last_run_cpu;
 	int vmm_tr_slot;
 	int vm_tr_slot;
+	int sn_rtc_tr_slot;
 
 #define KVM_MP_STATE_RUNNABLE          0
 #define KVM_MP_STATE_UNINITIALIZED     1
@@ -465,6 +466,7 @@ struct kvm_arch {
 	unsigned long	vmm_init_rr;
 
 	int		online_vcpus;
+	int		is_sn2;
 
 	struct kvm_ioapic *vioapic;
 	struct kvm_vm_stat stat;
diff --git a/arch/ia64/include/asm/pgtable.h b/arch/ia64/include/asm/pgtable.h
index 7a9bff47564f1..0a9cc73d35c78 100644
--- a/arch/ia64/include/asm/pgtable.h
+++ b/arch/ia64/include/asm/pgtable.h
@@ -146,6 +146,8 @@
 #define PAGE_GATE	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_X_RX)
 #define PAGE_KERNEL	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX)
 #define PAGE_KERNELRX	__pgprot(__ACCESS_BITS | _PAGE_PL_0 | _PAGE_AR_RX)
+#define PAGE_KERNEL_UC	__pgprot(__DIRTY_BITS  | _PAGE_PL_0 | _PAGE_AR_RWX | \
+				 _PAGE_MA_UC)
 
 # ifndef __ASSEMBLY__
 
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index acf43ec42704b..14a3fabc7d626 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -41,6 +41,9 @@
 #include <asm/div64.h>
 #include <asm/tlb.h>
 #include <asm/elf.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/shub_mmr.h>
 
 #include "misc.h"
 #include "vti.h"
@@ -119,8 +122,7 @@ void kvm_arch_hardware_enable(void *garbage)
 	unsigned long saved_psr;
 	int slot;
 
-	pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base),
-				PAGE_KERNEL));
+	pte = pte_val(mk_pte_phys(__pa(kvm_vmm_base), PAGE_KERNEL));
 	local_irq_save(saved_psr);
 	slot = ia64_itr_entry(0x3, KVM_VMM_BASE, pte, KVM_VMM_SHIFT);
 	local_irq_restore(saved_psr);
@@ -425,6 +427,23 @@ static int handle_switch_rr6(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int kvm_sn2_setup_mappings(struct kvm_vcpu *vcpu)
+{
+	unsigned long pte, rtc_phys_addr, map_addr;
+	int slot;
+
+	map_addr = KVM_VMM_BASE + (1UL << KVM_VMM_SHIFT);
+	rtc_phys_addr = LOCAL_MMR_OFFSET | SH_RTC;
+	pte = pte_val(mk_pte_phys(rtc_phys_addr, PAGE_KERNEL_UC));
+	slot = ia64_itr_entry(0x3, map_addr, pte, PAGE_SHIFT);
+	vcpu->arch.sn_rtc_tr_slot = slot;
+	if (slot < 0) {
+		printk(KERN_ERR "Mayday mayday! RTC mapping failed!\n");
+		slot = 0;
+	}
+	return slot;
+}
+
 int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 {
 
@@ -563,18 +582,29 @@ static int kvm_insert_vmm_mapping(struct kvm_vcpu *vcpu)
 	if (r < 0)
 		goto out;
 	vcpu->arch.vm_tr_slot = r;
+
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	if (kvm->arch.is_sn2) {
+		r = kvm_sn2_setup_mappings(vcpu);
+		if (r < 0)
+			goto out;
+	}
+#endif
+
 	r = 0;
 out:
 	return r;
-
 }
 
 static void kvm_purge_vmm_mapping(struct kvm_vcpu *vcpu)
 {
-
+	struct kvm *kvm = vcpu->kvm;
 	ia64_ptr_entry(0x3, vcpu->arch.vmm_tr_slot);
 	ia64_ptr_entry(0x3, vcpu->arch.vm_tr_slot);
-
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	if (kvm->arch.is_sn2)
+		ia64_ptr_entry(0x3, vcpu->arch.sn_rtc_tr_slot);
+#endif
 }
 
 static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
@@ -800,6 +830,9 @@ struct  kvm *kvm_arch_create_vm(void)
 
 	if (IS_ERR(kvm))
 		return ERR_PTR(-ENOMEM);
+
+	kvm->arch.is_sn2 = ia64_platform_is("sn2");
+
 	kvm_init_vm(kvm);
 
 	kvm->arch.online_vcpus = 0;
-- 
GitLab


From c6c9fcdf0fff7da77c088be11e3c4d0181b36941 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:53 -0600
Subject: [PATCH 1731/2198] KVM: ia64: Create inline function kvm_get_itc() to
 centralize ITC reading.

Move all reading of special register 'AR_ITC' into two functions, one
in the kernel and one in the VMM module. When running on SN2, base the
result on the RTC rather the system ITC, as the ITC isn't
synchronized.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 18 ++++++++++++++----
 arch/ia64/kvm/vcpu.c     | 20 ++++++++++++++++++--
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 14a3fabc7d626..ebbd995194fb1 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -68,6 +68,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
 
+static unsigned long kvm_get_itc(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	if (vcpu->kvm->arch.is_sn2)
+		return rtc_time();
+	else
+#endif
+		return ia64_getreg(_IA64_REG_AR_ITC);
+}
+
 static void kvm_flush_icache(unsigned long start, unsigned long len)
 {
 	int l;
@@ -457,7 +467,7 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 
 	if (irqchip_in_kernel(vcpu->kvm)) {
 
-		vcpu_now_itc = ia64_getreg(_IA64_REG_AR_ITC) + vcpu->arch.itc_offset;
+		vcpu_now_itc = kvm_get_itc(vcpu) + vcpu->arch.itc_offset;
 
 		if (time_after(vcpu_now_itc, vpd->itm)) {
 			vcpu->arch.timer_check = 1;
@@ -929,7 +939,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	RESTORE_REGS(saved_gp);
 
 	vcpu->arch.irq_new_pending = 1;
-	vcpu->arch.itc_offset = regs->saved_itc - ia64_getreg(_IA64_REG_AR_ITC);
+	vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
 	set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
 	vcpu_put(vcpu);
@@ -1210,7 +1220,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		regs->cr_iip = PALE_RESET_ENTRY;
 
 		/*Initialize itc offset for vcpus*/
-		itc_offset = 0UL - ia64_getreg(_IA64_REG_AR_ITC);
+		itc_offset = 0UL - kvm_get_itc(vcpu);
 		for (i = 0; i < kvm->arch.online_vcpus; i++) {
 			v = (struct kvm_vcpu *)((char *)vcpu +
 					sizeof(struct kvm_vcpu_data) * i);
@@ -1472,7 +1482,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	}
 	for (i = 0; i < 4; i++)
 		regs->insvc[i] = vcpu->arch.insvc[i];
-	regs->saved_itc = vcpu->arch.itc_offset + ia64_getreg(_IA64_REG_AR_ITC);
+	regs->saved_itc = vcpu->arch.itc_offset + kvm_get_itc(vcpu);
 	SAVE_REGS(xtp);
 	SAVE_REGS(metaphysical_rr0);
 	SAVE_REGS(metaphysical_rr4);
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index a18ee17b9192e..a2c6c15e47618 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -788,13 +788,29 @@ void vcpu_set_fpreg(struct kvm_vcpu *vcpu, unsigned long reg,
 		setfpreg(reg, val, regs);   /* FIXME: handle NATs later*/
 }
 
+/*
+ * The Altix RTC is mapped specially here for the vmm module
+ */
+#define SN_RTC_BASE	(u64 *)(KVM_VMM_BASE+(1UL<<KVM_VMM_SHIFT))
+static long kvm_get_itc(struct kvm_vcpu *vcpu)
+{
+#if defined(CONFIG_IA64_SGI_SN2) || defined(CONFIG_IA64_GENERIC)
+	struct kvm *kvm = (struct kvm *)KVM_VM_BASE;
+
+	if (kvm->arch.is_sn2)
+		return (*SN_RTC_BASE);
+	else
+#endif
+		return ia64_getreg(_IA64_REG_AR_ITC);
+}
+
 /************************************************************************
  * lsapic timer
  ***********************************************************************/
 u64 vcpu_get_itc(struct kvm_vcpu *vcpu)
 {
 	unsigned long guest_itc;
-	guest_itc = VMX(vcpu, itc_offset) + ia64_getreg(_IA64_REG_AR_ITC);
+	guest_itc = VMX(vcpu, itc_offset) + kvm_get_itc(vcpu);
 
 	if (guest_itc >= VMX(vcpu, last_itc)) {
 		VMX(vcpu, last_itc) = guest_itc;
@@ -809,7 +825,7 @@ static void vcpu_set_itc(struct kvm_vcpu *vcpu, u64 val)
 	struct kvm_vcpu *v;
 	struct kvm *kvm;
 	int i;
-	long itc_offset = val - ia64_getreg(_IA64_REG_AR_ITC);
+	long itc_offset = val - kvm_get_itc(vcpu);
 	unsigned long vitv = VCPU(vcpu, itv);
 
 	kvm = (struct kvm *)KVM_VM_BASE;
-- 
GitLab


From ce17c643738bebcacf8d19d8cab7dd3eb96f9f32 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:54 -0600
Subject: [PATCH 1732/2198] KVM: ia64: SN2 adjust emulated ITC frequency to
 match RTC frequency

On SN2 do not pass down the real ITC frequency, but rather patch the
values to match the SN2 RTC frequency.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm_fw.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/kvm/kvm_fw.c b/arch/ia64/kvm/kvm_fw.c
index a8ae52ed56358..e4b82319881db 100644
--- a/arch/ia64/kvm/kvm_fw.c
+++ b/arch/ia64/kvm/kvm_fw.c
@@ -21,6 +21,9 @@
 
 #include <linux/kvm_host.h>
 #include <linux/smp.h>
+#include <asm/sn/addrs.h>
+#include <asm/sn/clksupport.h>
+#include <asm/sn/shub_mmr.h>
 
 #include "vti.h"
 #include "misc.h"
@@ -188,12 +191,35 @@ static struct ia64_pal_retval pal_freq_base(struct kvm_vcpu *vcpu)
 	return result;
 }
 
-static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+/*
+ * On the SGI SN2, the ITC isn't stable. Emulation backed by the SN2
+ * RTC is used instead. This function patches the ratios from SAL
+ * to match the RTC before providing them to the guest.
+ */
+static void sn2_patch_itc_freq_ratios(struct ia64_pal_retval *result)
 {
+	struct pal_freq_ratio *ratio;
+	unsigned long sal_freq, sal_drift, factor;
+
+	result->status = ia64_sal_freq_base(SAL_FREQ_BASE_PLATFORM,
+					    &sal_freq, &sal_drift);
+	ratio = (struct pal_freq_ratio *)&result->v2;
+	factor = ((sal_freq * 3) + (sn_rtc_cycles_per_second / 2)) /
+		sn_rtc_cycles_per_second;
+
+	ratio->num = 3;
+	ratio->den = factor;
+}
 
+static struct ia64_pal_retval pal_freq_ratios(struct kvm_vcpu *vcpu)
+{
 	struct ia64_pal_retval result;
 
 	PAL_CALL(result, PAL_FREQ_RATIOS, 0, 0, 0);
+
+	if (vcpu->kvm->arch.is_sn2)
+		sn2_patch_itc_freq_ratios(&result);
+
 	return result;
 }
 
-- 
GitLab


From 0b5d7a2ccb98f0403b1969295429724af9dc9d8e Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Wed, 25 Feb 2009 10:38:55 -0600
Subject: [PATCH 1733/2198] KVM: ia64: Drop in SN2 replacement of fast path ITC
 emulation fault handler

Copy in SN2 RTC based ITC emulation for fast exit. The two versions
have the same size, so a dropin is simpler than patching the branch
instruction to hit the SN2 version.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  2 ++
 arch/ia64/kvm/kvm-ia64.c         | 32 +++++++++++++++++++++++++++++++-
 arch/ia64/kvm/optvfault.S        | 30 ++++++++++++++++++++++++++++++
 arch/ia64/kvm/vmm.c              | 12 ++++++++----
 4 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 12439956551bc..589536fa799d7 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -580,6 +580,8 @@ struct kvm_vmm_info{
 	kvm_vmm_entry 	*vmm_entry;
 	kvm_tramp_entry *tramp_entry;
 	unsigned long 	vmm_ivt;
+	unsigned long	patch_mov_ar;
+	unsigned long	patch_mov_ar_sn2;
 };
 
 int kvm_highest_pending_irq(struct kvm_vcpu *vcpu);
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index ebbd995194fb1..4623a90e515ab 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1671,8 +1671,37 @@ static int vti_cpu_has_kvm_support(void)
 	return 0;
 }
 
+
+/*
+ * On SN2, the ITC isn't stable, so copy in fast path code to use the
+ * SN2 RTC, replacing the ITC based default verion.
+ */
+static void kvm_patch_vmm(struct kvm_vmm_info *vmm_info,
+			  struct module *module)
+{
+	unsigned long new_ar, new_ar_sn2;
+	unsigned long module_base;
+
+	if (!ia64_platform_is("sn2"))
+		return;
+
+	module_base = (unsigned long)module->module_core;
+
+	new_ar = kvm_vmm_base + vmm_info->patch_mov_ar - module_base;
+	new_ar_sn2 = kvm_vmm_base + vmm_info->patch_mov_ar_sn2 - module_base;
+
+	printk(KERN_INFO "kvm: Patching ITC emulation to use SGI SN2 RTC "
+	       "as source\n");
+
+	/*
+	 * Copy the SN2 version of mov_ar into place. They are both
+	 * the same size, so 6 bundles is sufficient (6 * 0x10).
+	 */
+	memcpy((void *)new_ar, (void *)new_ar_sn2, 0x60);
+}
+
 static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info,
-						struct module *module)
+			    struct module *module)
 {
 	unsigned long module_base;
 	unsigned long vmm_size;
@@ -1694,6 +1723,7 @@ static int kvm_relocate_vmm(struct kvm_vmm_info *vmm_info,
 		return -EFAULT;
 
 	memcpy((void *)kvm_vmm_base, (void *)module_base, vmm_size);
+	kvm_patch_vmm(vmm_info, module);
 	kvm_flush_icache(kvm_vmm_base, vmm_size);
 
 	/*Recalculate kvm_vmm_info based on new VMM*/
diff --git a/arch/ia64/kvm/optvfault.S b/arch/ia64/kvm/optvfault.S
index 32254ce9a1bda..f793be3effff5 100644
--- a/arch/ia64/kvm/optvfault.S
+++ b/arch/ia64/kvm/optvfault.S
@@ -11,6 +11,7 @@
 
 #include <asm/asmmacro.h>
 #include <asm/processor.h>
+#include <asm/kvm_host.h>
 
 #include "vti.h"
 #include "asm-offsets.h"
@@ -140,6 +141,35 @@ GLOBAL_ENTRY(kvm_asm_mov_from_ar)
 	;;
 END(kvm_asm_mov_from_ar)
 
+/*
+ * Special SGI SN2 optimized version of mov_from_ar using the SN2 RTC
+ * clock as it's source for emulating the ITC. This version will be
+ * copied on top of the original version if the host is determined to
+ * be an SN2.
+ */
+GLOBAL_ENTRY(kvm_asm_mov_from_ar_sn2)
+	add r18=VMM_VCPU_ITC_OFS_OFFSET, r21
+	movl r19 = (KVM_VMM_BASE+(1<<KVM_VMM_SHIFT))
+
+	add r16=VMM_VCPU_LAST_ITC_OFFSET,r21
+	extr.u r17=r25,6,7
+	mov r24=b0
+	;;
+	ld8 r18=[r18]
+	ld8 r19=[r19]
+	addl r20=@gprel(asm_mov_to_reg),gp
+	;;
+	add r19=r19,r18
+	shladd r17=r17,4,r20
+	;;
+	adds r30=kvm_resume_to_guest-asm_mov_to_reg,r20
+	st8 [r16] = r19
+	mov b0=r17
+	br.sptk.few b0
+	;;
+END(kvm_asm_mov_from_ar_sn2)
+
+
 
 // mov r1=rr[r3]
 GLOBAL_ENTRY(kvm_asm_mov_from_rr)
diff --git a/arch/ia64/kvm/vmm.c b/arch/ia64/kvm/vmm.c
index 9eee5c04baccc..f4b4c899bb6ce 100644
--- a/arch/ia64/kvm/vmm.c
+++ b/arch/ia64/kvm/vmm.c
@@ -30,15 +30,19 @@ MODULE_AUTHOR("Intel");
 MODULE_LICENSE("GPL");
 
 extern char kvm_ia64_ivt;
+extern char kvm_asm_mov_from_ar;
+extern char kvm_asm_mov_from_ar_sn2;
 extern fpswa_interface_t *vmm_fpswa_interface;
 
 long vmm_sanity = 1;
 
 struct kvm_vmm_info vmm_info = {
-	.module	     = THIS_MODULE,
-	.vmm_entry   = vmm_entry,
-	.tramp_entry = vmm_trampoline,
-	.vmm_ivt     = (unsigned long)&kvm_ia64_ivt,
+	.module			= THIS_MODULE,
+	.vmm_entry		= vmm_entry,
+	.tramp_entry		= vmm_trampoline,
+	.vmm_ivt		= (unsigned long)&kvm_ia64_ivt,
+	.patch_mov_ar		= (unsigned long)&kvm_asm_mov_from_ar,
+	.patch_mov_ar_sn2	= (unsigned long)&kvm_asm_mov_from_ar_sn2,
 };
 
 static int __init  kvm_vmm_init(void)
-- 
GitLab


From 386eb6e8b3caface8a0514da70a47c05cabb5b96 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Mar 2009 22:51:09 +0100
Subject: [PATCH 1734/2198] KVM: make 'lapic_timer_ops' and 'kpit_ops' static

Fix this sparse warnings:
  arch/x86/kvm/lapic.c:916:22: warning: symbol 'lapic_timer_ops' was not declared. Should it be static?
  arch/x86/kvm/i8254.c:268:22: warning: symbol 'kpit_ops' was not declared. Should it be static?

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 2 +-
 arch/x86/kvm/lapic.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 4e2e3f26dbf83..cf09bb64f1411 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -265,7 +265,7 @@ static bool kpit_is_periodic(struct kvm_timer *ktimer)
 	return ps->is_periodic;
 }
 
-struct kvm_timer_ops kpit_ops = {
+static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 };
 
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index dd934d27040f6..4d76bb6c5e516 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -913,7 +913,7 @@ void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu)
 		kvm_apic_local_deliver(apic, APIC_LVT0);
 }
 
-struct kvm_timer_ops lapic_timer_ops = {
+static struct kvm_timer_ops lapic_timer_ops = {
 	.is_periodic = lapic_is_periodic,
 };
 
-- 
GitLab


From e56d532f20c890a06bbe7cd479f4201e3a03cd73 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 12 Mar 2009 21:45:39 +0800
Subject: [PATCH 1735/2198] KVM: Device assignment framework rework

After discussion with Marcelo, we decided to rework device assignment framework
together. The old problems are kernel logic is unnecessary complex. So Marcelo
suggest to split it into a more elegant way:

1. Split host IRQ assign and guest IRQ assign. And userspace determine the
combination. Also discard msi2intx parameter, userspace can specific
KVM_DEV_IRQ_HOST_MSI | KVM_DEV_IRQ_GUEST_INTX in assigned_irq->flags to
enable MSI to INTx convertion.

2. Split assign IRQ and deassign IRQ. Import two new ioctls:
KVM_ASSIGN_DEV_IRQ and KVM_DEASSIGN_DEV_IRQ.

This patch also fixed the reversed _IOR vs _IOW in definition(by deprecated the
old interface).

[avi: replace homemade bitcount() by hweight_long()]

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c       |   1 +
 include/linux/kvm.h      |  26 ++-
 include/linux/kvm_host.h |   5 -
 virt/kvm/kvm_main.c      | 486 +++++++++++++++++++++------------------
 4 files changed, 276 insertions(+), 242 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43e049a2ccf4e..41123fc8613e7 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1026,6 +1026,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_SYNC_MMU:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
+	case KVM_CAP_ASSIGN_DEV_IRQ:
 		r = 1;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 640835ed2708d..644e3a9f47dba 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -412,6 +412,7 @@ struct kvm_trace_rec {
 #ifdef __KVM_HAVE_MSIX
 #define KVM_CAP_DEVICE_MSIX 28
 #endif
+#define KVM_CAP_ASSIGN_DEV_IRQ 29
 /* Another bug in KVM_SET_USER_MEMORY_REGION fixed: */
 #define KVM_CAP_JOIN_MEMORY_REGIONS_WORKS 30
 
@@ -485,8 +486,10 @@ struct kvm_irq_routing {
 #define KVM_ASSIGN_PCI_DEVICE _IOR(KVMIO, 0x69, \
 				   struct kvm_assigned_pci_dev)
 #define KVM_SET_GSI_ROUTING       _IOW(KVMIO, 0x6a, struct kvm_irq_routing)
+/* deprecated, replaced by KVM_ASSIGN_DEV_IRQ */
 #define KVM_ASSIGN_IRQ _IOR(KVMIO, 0x70, \
 			    struct kvm_assigned_irq)
+#define KVM_ASSIGN_DEV_IRQ        _IOW(KVMIO, 0x70, struct kvm_assigned_irq)
 #define KVM_REINJECT_CONTROL      _IO(KVMIO, 0x71)
 #define KVM_DEASSIGN_PCI_DEVICE _IOW(KVMIO, 0x72, \
 				     struct kvm_assigned_pci_dev)
@@ -494,6 +497,7 @@ struct kvm_irq_routing {
 			_IOW(KVMIO, 0x73, struct kvm_assigned_msix_nr)
 #define KVM_ASSIGN_SET_MSIX_ENTRY \
 			_IOW(KVMIO, 0x74, struct kvm_assigned_msix_entry)
+#define KVM_DEASSIGN_DEV_IRQ       _IOW(KVMIO, 0x75, struct kvm_assigned_irq)
 
 /*
  * ioctls for vcpu fds
@@ -584,6 +588,8 @@ struct kvm_debug_guest {
 #define KVM_TRC_STLB_INVAL       (KVM_TRC_HANDLER + 0x18)
 #define KVM_TRC_PPC_INSTR        (KVM_TRC_HANDLER + 0x19)
 
+#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
+
 struct kvm_assigned_pci_dev {
 	__u32 assigned_dev_id;
 	__u32 busnr;
@@ -594,6 +600,17 @@ struct kvm_assigned_pci_dev {
 	};
 };
 
+#define KVM_DEV_IRQ_HOST_INTX    (1 << 0)
+#define KVM_DEV_IRQ_HOST_MSI     (1 << 1)
+#define KVM_DEV_IRQ_HOST_MSIX    (1 << 2)
+
+#define KVM_DEV_IRQ_GUEST_INTX   (1 << 8)
+#define KVM_DEV_IRQ_GUEST_MSI    (1 << 9)
+#define KVM_DEV_IRQ_GUEST_MSIX   (1 << 10)
+
+#define KVM_DEV_IRQ_HOST_MASK	 0x00ff
+#define KVM_DEV_IRQ_GUEST_MASK   0xff00
+
 struct kvm_assigned_irq {
 	__u32 assigned_dev_id;
 	__u32 host_irq;
@@ -609,15 +626,6 @@ struct kvm_assigned_irq {
 	};
 };
 
-#define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
-
-#define KVM_DEV_IRQ_ASSIGN_MSI_ACTION	KVM_DEV_IRQ_ASSIGN_ENABLE_MSI
-#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSI	(1 << 0)
-
-#define KVM_DEV_IRQ_ASSIGN_MSIX_ACTION  (KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX |\
-					KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
-#define KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX  (1 << 1)
-#define KVM_DEV_IRQ_ASSIGN_MASK_MSIX    (1 << 2)
 
 struct kvm_assigned_msix_nr {
 	__u32 assigned_dev_id;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index fb60f31c4fb3e..40e49ede8f914 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -339,11 +339,6 @@ struct kvm_assigned_dev_kernel {
 	struct msix_entry *host_msix_entries;
 	int guest_irq;
 	struct kvm_guest_msix_entry *guest_msix_entries;
-#define KVM_ASSIGNED_DEV_GUEST_INTX	(1 << 0)
-#define KVM_ASSIGNED_DEV_GUEST_MSI	(1 << 1)
-#define KVM_ASSIGNED_DEV_HOST_INTX	(1 << 8)
-#define KVM_ASSIGNED_DEV_HOST_MSI	(1 << 9)
-#define KVM_ASSIGNED_DEV_MSIX		((1 << 2) | (1 << 10))
 	unsigned long irq_requested_type;
 	int irq_source_id;
 	int flags;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 3bed82754a5de..792fb7fae0a3f 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -41,6 +41,7 @@
 #include <linux/pagemap.h>
 #include <linux/mman.h>
 #include <linux/swap.h>
+#include <linux/bitops.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -60,9 +61,6 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int msi2intx = 1;
-module_param(msi2intx, bool, 0);
-
 DEFINE_SPINLOCK(kvm_lock);
 LIST_HEAD(vm_list);
 
@@ -132,7 +130,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 * finer-grained lock, update this
 	 */
 	mutex_lock(&kvm->lock);
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		struct kvm_guest_msix_entry *guest_entries =
 			assigned_dev->guest_msix_entries;
 		for (i = 0; i < assigned_dev->entries_nr; i++) {
@@ -152,7 +150,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 		kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id,
 			    assigned_dev->guest_irq, 1);
 		if (assigned_dev->irq_requested_type &
-				KVM_ASSIGNED_DEV_GUEST_MSI) {
+				KVM_DEV_IRQ_GUEST_MSI) {
 			enable_irq(assigned_dev->host_irq);
 			assigned_dev->host_irq_disabled = false;
 		}
@@ -166,7 +164,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 	struct kvm_assigned_dev_kernel *assigned_dev =
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
-	if (assigned_dev->irq_requested_type == KVM_ASSIGNED_DEV_MSIX) {
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int index = find_index_from_host_irq(assigned_dev, irq);
 		if (index < 0)
 			return IRQ_HANDLED;
@@ -204,22 +202,22 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	}
 }
 
-/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
-static void kvm_free_assigned_irq(struct kvm *kvm,
-				  struct kvm_assigned_dev_kernel *assigned_dev)
+static void deassign_guest_irq(struct kvm *kvm,
+			       struct kvm_assigned_dev_kernel *assigned_dev)
 {
-	if (!irqchip_in_kernel(kvm))
-		return;
-
 	kvm_unregister_irq_ack_notifier(&assigned_dev->ack_notifier);
+	assigned_dev->ack_notifier.gsi = -1;
 
 	if (assigned_dev->irq_source_id != -1)
 		kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id);
 	assigned_dev->irq_source_id = -1;
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK);
+}
 
-	if (!assigned_dev->irq_requested_type)
-		return;
-
+/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */
+static void deassign_host_irq(struct kvm *kvm,
+			      struct kvm_assigned_dev_kernel *assigned_dev)
+{
 	/*
 	 * In kvm_free_device_irq, cancel_work_sync return true if:
 	 * 1. work is scheduled, and then cancelled.
@@ -236,7 +234,7 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 	 * now, the kvm state is still legal for probably we also have to wait
 	 * interrupt_work done.
 	 */
-	if (assigned_dev->irq_requested_type & KVM_ASSIGNED_DEV_MSIX) {
+	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int i;
 		for (i = 0; i < assigned_dev->entries_nr; i++)
 			disable_irq_nosync(assigned_dev->
@@ -259,14 +257,41 @@ static void kvm_free_assigned_irq(struct kvm *kvm,
 
 		free_irq(assigned_dev->host_irq, (void *)assigned_dev);
 
-		if (assigned_dev->irq_requested_type &
-				KVM_ASSIGNED_DEV_HOST_MSI)
+		if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI)
 			pci_disable_msi(assigned_dev->dev);
 	}
 
-	assigned_dev->irq_requested_type = 0;
+	assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK);
+}
+
+static int kvm_deassign_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *assigned_dev,
+			    unsigned long irq_requested_type)
+{
+	unsigned long guest_irq_type, host_irq_type;
+
+	if (!irqchip_in_kernel(kvm))
+		return -EINVAL;
+	/* no irq assignment to deassign */
+	if (!assigned_dev->irq_requested_type)
+		return -ENXIO;
+
+	host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK;
+	guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK;
+
+	if (host_irq_type)
+		deassign_host_irq(kvm, assigned_dev);
+	if (guest_irq_type)
+		deassign_guest_irq(kvm, assigned_dev);
+
+	return 0;
 }
 
+static void kvm_free_assigned_irq(struct kvm *kvm,
+				  struct kvm_assigned_dev_kernel *assigned_dev)
+{
+	kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type);
+}
 
 static void kvm_free_assigned_device(struct kvm *kvm,
 				     struct kvm_assigned_dev_kernel
@@ -298,256 +323,244 @@ void kvm_free_all_assigned_devices(struct kvm *kvm)
 	}
 }
 
-static int assigned_device_update_intx(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *adev,
-			struct kvm_assigned_irq *airq)
+static int assigned_device_enable_host_intx(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
 {
-	adev->guest_irq = airq->guest_irq;
-	adev->ack_notifier.gsi = airq->guest_irq;
-
-	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_INTX)
-		return 0;
-
-	if (irqchip_in_kernel(kvm)) {
-		if (!msi2intx &&
-		    (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)) {
-			free_irq(adev->host_irq, (void *)adev);
-			pci_disable_msi(adev->dev);
-		}
+	dev->host_irq = dev->dev->irq;
+	/* Even though this is PCI, we don't want to use shared
+	 * interrupts. Sharing host devices with guest-assigned devices
+	 * on the same interrupt line is not a happy situation: there
+	 * are going to be long delays in accepting, acking, etc.
+	 */
+	if (request_irq(dev->host_irq, kvm_assigned_dev_intr,
+			0, "kvm_assigned_intx_device", (void *)dev))
+		return -EIO;
+	return 0;
+}
 
-		if (!capable(CAP_SYS_RAWIO))
-			return -EPERM;
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_host_msi(struct kvm *kvm,
+					   struct kvm_assigned_dev_kernel *dev)
+{
+	int r;
 
-		if (airq->host_irq)
-			adev->host_irq = airq->host_irq;
-		else
-			adev->host_irq = adev->dev->irq;
+	if (!dev->dev->msi_enabled) {
+		r = pci_enable_msi(dev->dev);
+		if (r)
+			return r;
+	}
 
-		/* Even though this is PCI, we don't want to use shared
-		 * interrupts. Sharing host devices with guest-assigned devices
-		 * on the same interrupt line is not a happy situation: there
-		 * are going to be long delays in accepting, acking, etc.
-		 */
-		if (request_irq(adev->host_irq, kvm_assigned_dev_intr,
-				0, "kvm_assigned_intx_device", (void *)adev))
-			return -EIO;
+	dev->host_irq = dev->dev->irq;
+	if (request_irq(dev->host_irq, kvm_assigned_dev_intr, 0,
+			"kvm_assigned_msi_device", (void *)dev)) {
+		pci_disable_msi(dev->dev);
+		return -EIO;
 	}
 
-	adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_INTX |
-				   KVM_ASSIGNED_DEV_HOST_INTX;
 	return 0;
 }
+#endif
 
-#ifdef CONFIG_X86
-static int assigned_device_update_msi(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *adev,
-			struct kvm_assigned_irq *airq)
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_host_msix(struct kvm *kvm,
+					    struct kvm_assigned_dev_kernel *dev)
 {
-	int r;
+	int i, r = -EINVAL;
 
-	adev->guest_irq = airq->guest_irq;
-	if (airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSI) {
-		/* x86 don't care upper address of guest msi message addr */
-		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_MSI;
-		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_INTX;
-		adev->ack_notifier.gsi = -1;
-	} else if (msi2intx) {
-		adev->irq_requested_type |= KVM_ASSIGNED_DEV_GUEST_INTX;
-		adev->irq_requested_type &= ~KVM_ASSIGNED_DEV_GUEST_MSI;
-		adev->ack_notifier.gsi = airq->guest_irq;
-	} else {
-		/*
-		 * Guest require to disable device MSI, we disable MSI and
-		 * re-enable INTx by default again. Notice it's only for
-		 * non-msi2intx.
-		 */
-		assigned_device_update_intx(kvm, adev, airq);
-		return 0;
-	}
+	/* host_msix_entries and guest_msix_entries should have been
+	 * initialized */
+	if (dev->entries_nr == 0)
+		return r;
 
-	if (adev->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI)
-		return 0;
+	r = pci_enable_msix(dev->dev, dev->host_msix_entries, dev->entries_nr);
+	if (r)
+		return r;
 
-	if (irqchip_in_kernel(kvm)) {
-		if (!msi2intx) {
-			if (adev->irq_requested_type &
-					KVM_ASSIGNED_DEV_HOST_INTX)
-				free_irq(adev->host_irq, (void *)adev);
+	for (i = 0; i < dev->entries_nr; i++) {
+		r = request_irq(dev->host_msix_entries[i].vector,
+				kvm_assigned_dev_intr, 0,
+				"kvm_assigned_msix_device",
+				(void *)dev);
+		/* FIXME: free requested_irq's on failure */
+		if (r)
+			return r;
+	}
 
-			r = pci_enable_msi(adev->dev);
-			if (r)
-				return r;
-		}
+	return 0;
+}
 
-		adev->host_irq = adev->dev->irq;
-		if (request_irq(adev->host_irq, kvm_assigned_dev_intr, 0,
-				"kvm_assigned_msi_device", (void *)adev))
-			return -EIO;
-	}
+#endif
 
-	if (!msi2intx)
-		adev->irq_requested_type = KVM_ASSIGNED_DEV_GUEST_MSI;
+static int assigned_device_enable_guest_intx(struct kvm *kvm,
+				struct kvm_assigned_dev_kernel *dev,
+				struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = irq->guest_irq;
+	return 0;
+}
 
-	adev->irq_requested_type |= KVM_ASSIGNED_DEV_HOST_MSI;
+#ifdef __KVM_HAVE_MSI
+static int assigned_device_enable_guest_msi(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
 	return 0;
 }
 #endif
+#ifdef __KVM_HAVE_MSIX
+static int assigned_device_enable_guest_msix(struct kvm *kvm,
+			struct kvm_assigned_dev_kernel *dev,
+			struct kvm_assigned_irq *irq)
+{
+	dev->guest_irq = irq->guest_irq;
+	dev->ack_notifier.gsi = -1;
+	return 0;
+}
+#endif
+
+static int assign_host_irq(struct kvm *kvm,
+			   struct kvm_assigned_dev_kernel *dev,
+			   __u32 host_irq_type)
+{
+	int r = -EEXIST;
+
+	if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK)
+		return r;
 
+	switch (host_irq_type) {
+	case KVM_DEV_IRQ_HOST_INTX:
+		r = assigned_device_enable_host_intx(kvm, dev);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_HOST_MSI:
+		r = assigned_device_enable_host_msi(kvm, dev);
+		break;
+#endif
 #ifdef __KVM_HAVE_MSIX
-static int assigned_device_update_msix(struct kvm *kvm,
-			struct kvm_assigned_dev_kernel *adev,
-			struct kvm_assigned_irq *airq)
-{
-	/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
-	int i, r;
-
-	adev->ack_notifier.gsi = -1;
-
-	if (irqchip_in_kernel(kvm)) {
-		if (airq->flags & KVM_DEV_IRQ_ASSIGN_MASK_MSIX)
-			return -ENOTTY;
-
-		if (!(airq->flags & KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX)) {
-			/* Guest disable MSI-X */
-			kvm_free_assigned_irq(kvm, adev);
-			if (msi2intx) {
-				pci_enable_msi(adev->dev);
-				if (adev->dev->msi_enabled)
-					return assigned_device_update_msi(kvm,
-							adev, airq);
-			}
-			return assigned_device_update_intx(kvm, adev, airq);
-		}
+	case KVM_DEV_IRQ_HOST_MSIX:
+		r = assigned_device_enable_host_msix(kvm, dev);
+		break;
+#endif
+	default:
+		r = -EINVAL;
+	}
 
-		/* host_msix_entries and guest_msix_entries should have been
-		 * initialized */
-		if (adev->entries_nr == 0)
-			return -EINVAL;
+	if (!r)
+		dev->irq_requested_type |= host_irq_type;
 
-		kvm_free_assigned_irq(kvm, adev);
+	return r;
+}
 
-		r = pci_enable_msix(adev->dev, adev->host_msix_entries,
-				    adev->entries_nr);
-		if (r)
-			return r;
+static int assign_guest_irq(struct kvm *kvm,
+			    struct kvm_assigned_dev_kernel *dev,
+			    struct kvm_assigned_irq *irq,
+			    unsigned long guest_irq_type)
+{
+	int id;
+	int r = -EEXIST;
 
-		for (i = 0; i < adev->entries_nr; i++) {
-			r = request_irq((adev->host_msix_entries + i)->vector,
-					kvm_assigned_dev_intr, 0,
-					"kvm_assigned_msix_device",
-					(void *)adev);
-			if (r)
-				return r;
-		}
+	if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK)
+		return r;
+
+	id = kvm_request_irq_source_id(kvm);
+	if (id < 0)
+		return id;
+
+	dev->irq_source_id = id;
+
+	switch (guest_irq_type) {
+	case KVM_DEV_IRQ_GUEST_INTX:
+		r = assigned_device_enable_guest_intx(kvm, dev, irq);
+		break;
+#ifdef __KVM_HAVE_MSI
+	case KVM_DEV_IRQ_GUEST_MSI:
+		r = assigned_device_enable_guest_msi(kvm, dev, irq);
+		break;
+#endif
+#ifdef __KVM_HAVE_MSIX
+	case KVM_DEV_IRQ_GUEST_MSIX:
+		r = assigned_device_enable_guest_msix(kvm, dev, irq);
+		break;
+#endif
+	default:
+		r = -EINVAL;
 	}
 
-	adev->irq_requested_type |= KVM_ASSIGNED_DEV_MSIX;
+	if (!r) {
+		dev->irq_requested_type |= guest_irq_type;
+		kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier);
+	} else
+		kvm_free_irq_source_id(kvm, dev->irq_source_id);
 
-	return 0;
+	return r;
 }
-#endif
 
+/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */
 static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
-				   struct kvm_assigned_irq
-				   *assigned_irq)
+				   struct kvm_assigned_irq *assigned_irq)
 {
-	int r = 0;
+	int r = -EINVAL;
 	struct kvm_assigned_dev_kernel *match;
-	u32 current_flags = 0, changed_flags;
+	unsigned long host_irq_type, guest_irq_type;
 
-	mutex_lock(&kvm->lock);
+	if (!capable(CAP_SYS_RAWIO))
+		return -EPERM;
 
+	if (!irqchip_in_kernel(kvm))
+		return r;
+
+	mutex_lock(&kvm->lock);
+	r = -ENODEV;
 	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
 				      assigned_irq->assigned_dev_id);
-	if (!match) {
-		mutex_unlock(&kvm->lock);
-		return -EINVAL;
-	}
-
-	if (!match->irq_requested_type) {
-		INIT_WORK(&match->interrupt_work,
-				kvm_assigned_dev_interrupt_work_handler);
-		if (irqchip_in_kernel(kvm)) {
-			/* Register ack nofitier */
-			match->ack_notifier.gsi = -1;
-			match->ack_notifier.irq_acked =
-					kvm_assigned_dev_ack_irq;
-			kvm_register_irq_ack_notifier(kvm,
-					&match->ack_notifier);
-
-			/* Request IRQ source ID */
-			r = kvm_request_irq_source_id(kvm);
-			if (r < 0)
-				goto out_release;
-			else
-				match->irq_source_id = r;
-
-#ifdef CONFIG_X86
-			/* Determine host device irq type, we can know the
-			 * result from dev->msi_enabled */
-			if (msi2intx)
-				pci_enable_msi(match->dev);
-#endif
-		}
-	}
+	if (!match)
+		goto out;
 
-	if (match->irq_requested_type & KVM_ASSIGNED_DEV_MSIX)
-		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSIX;
-	else if ((match->irq_requested_type & KVM_ASSIGNED_DEV_HOST_MSI) &&
-		 (match->irq_requested_type & KVM_ASSIGNED_DEV_GUEST_MSI))
-		current_flags |= KVM_DEV_IRQ_ASSIGN_ENABLE_MSI;
+	host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK);
+	guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK);
 
-	changed_flags = assigned_irq->flags ^ current_flags;
+	r = -EINVAL;
+	/* can only assign one type at a time */
+	if (hweight_long(host_irq_type) > 1)
+		goto out;
+	if (hweight_long(guest_irq_type) > 1)
+		goto out;
+	if (host_irq_type == 0 && guest_irq_type == 0)
+		goto out;
 
-#ifdef __KVM_HAVE_MSIX
-	if (changed_flags & KVM_DEV_IRQ_ASSIGN_MSIX_ACTION) {
-		r = assigned_device_update_msix(kvm, match, assigned_irq);
-		if (r) {
-			printk(KERN_WARNING "kvm: failed to execute "
-					"MSI-X action!\n");
-			goto out_release;
-		}
-	} else
-#endif
-	if ((changed_flags & KVM_DEV_IRQ_ASSIGN_MSI_ACTION) ||
-	    (msi2intx && match->dev->msi_enabled)) {
-#ifdef CONFIG_X86
-		r = assigned_device_update_msi(kvm, match, assigned_irq);
-		if (r) {
-			printk(KERN_WARNING "kvm: failed to enable "
-					"MSI device!\n");
-			goto out_release;
-		}
-#else
-		r = -ENOTTY;
-#endif
-	} else if (assigned_irq->host_irq == 0 && match->dev->irq == 0) {
-		/* Host device IRQ 0 means don't support INTx */
-		if (!msi2intx) {
-			printk(KERN_WARNING
-			       "kvm: wait device to enable MSI!\n");
-			r = 0;
-		} else {
-			printk(KERN_WARNING
-			       "kvm: failed to enable MSI device!\n");
-			r = -ENOTTY;
-			goto out_release;
-		}
-	} else {
-		/* Non-sharing INTx mode */
-		r = assigned_device_update_intx(kvm, match, assigned_irq);
-		if (r) {
-			printk(KERN_WARNING "kvm: failed to enable "
-					"INTx device!\n");
-			goto out_release;
-		}
-	}
+	r = 0;
+	if (host_irq_type)
+		r = assign_host_irq(kvm, match, host_irq_type);
+	if (r)
+		goto out;
 
+	if (guest_irq_type)
+		r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type);
+out:
 	mutex_unlock(&kvm->lock);
 	return r;
-out_release:
+}
+
+static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm,
+					 struct kvm_assigned_irq
+					 *assigned_irq)
+{
+	int r = -ENODEV;
+	struct kvm_assigned_dev_kernel *match;
+
+	mutex_lock(&kvm->lock);
+
+	match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head,
+				      assigned_irq->assigned_dev_id);
+	if (!match)
+		goto out;
+
+	r = kvm_deassign_irq(kvm, match, assigned_irq->flags);
+out:
 	mutex_unlock(&kvm->lock);
-	kvm_free_assigned_device(kvm, match);
 	return r;
 }
 
@@ -565,7 +578,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 				      assigned_dev->assigned_dev_id);
 	if (match) {
 		/* device already assigned */
-		r = -EINVAL;
+		r = -EEXIST;
 		goto out;
 	}
 
@@ -604,6 +617,9 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->dev = dev;
 	match->irq_source_id = -1;
 	match->kvm = kvm;
+	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
+	INIT_WORK(&match->interrupt_work,
+		  kvm_assigned_dev_interrupt_work_handler);
 
 	list_add(&match->list, &kvm->arch.assigned_dev_head);
 
@@ -2084,6 +2100,11 @@ static long kvm_vm_ioctl(struct file *filp,
 		break;
 	}
 	case KVM_ASSIGN_IRQ: {
+		r = -EOPNOTSUPP;
+		break;
+	}
+#ifdef KVM_CAP_ASSIGN_DEV_IRQ
+	case KVM_ASSIGN_DEV_IRQ: {
 		struct kvm_assigned_irq assigned_irq;
 
 		r = -EFAULT;
@@ -2094,6 +2115,18 @@ static long kvm_vm_ioctl(struct file *filp,
 			goto out;
 		break;
 	}
+	case KVM_DEASSIGN_DEV_IRQ: {
+		struct kvm_assigned_irq assigned_irq;
+
+		r = -EFAULT;
+		if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq))
+			goto out;
+		r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq);
+		if (r)
+			goto out;
+		break;
+	}
+#endif
 #endif
 #ifdef KVM_CAP_DEVICE_DEASSIGNMENT
 	case KVM_DEASSIGN_PCI_DEVICE: {
@@ -2596,9 +2629,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 
 	kvm_preempt_ops.sched_in = kvm_sched_in;
 	kvm_preempt_ops.sched_out = kvm_sched_out;
-#ifndef CONFIG_X86
-	msi2intx = 0;
-#endif
 
 	return 0;
 
-- 
GitLab


From f00be0cae4e6ad0a8c7be381c6d9be3586800b3e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 19 Mar 2009 12:20:36 +0200
Subject: [PATCH 1736/2198] KVM: MMU: do not free active mmu pages in
 free_mmu_pages()

free_mmu_pages() should only undo what alloc_mmu_pages() does.
Free mmu pages from the generic VM destruction function, kvm_destroy_vm().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c  | 8 --------
 virt/kvm/kvm_main.c | 2 ++
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 32cf11e5728a2..8aac67cbd92f0 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2638,14 +2638,6 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
 
 static void free_mmu_pages(struct kvm_vcpu *vcpu)
 {
-	struct kvm_mmu_page *sp;
-
-	while (!list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
-		sp = container_of(vcpu->kvm->arch.active_mmu_pages.next,
-				  struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, sp);
-		cond_resched();
-	}
 	free_page((unsigned long)vcpu->arch.mmu.pae_root);
 }
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 792fb7fae0a3f..934dd1c9487e3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1032,6 +1032,8 @@ static void kvm_destroy_vm(struct kvm *kvm)
 #endif
 #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
 	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
+#else
+	kvm_arch_flush_shadow(kvm);
 #endif
 	kvm_arch_destroy_vm(kvm);
 	mmdrop(mm);
-- 
GitLab


From 7fe29e0faacb650d31b9e9f538203a157bec821d Mon Sep 17 00:00:00 2001
From: Amit Shah <amit.shah@redhat.com>
Date: Fri, 20 Mar 2009 12:39:00 +0530
Subject: [PATCH 1737/2198] KVM: x86: Ignore reads to EVNTSEL MSRs

We ignore writes to the performance counters and performance event
selector registers already. Kaspersky antivirus reads the eventsel
MSR causing it to crash with the current behaviour.

Return 0 as data when the eventsel registers are read to stop the
crash.

Signed-off-by: Amit Shah <amit.shah@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 41123fc8613e7..c0ae5e6cba9b6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -895,6 +895,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case MSR_IA32_LASTINTFROMIP:
 	case MSR_IA32_LASTINTTOIP:
 	case MSR_VM_HSAVE_PA:
+	case MSR_P6_EVNTSEL0:
+	case MSR_P6_EVNTSEL1:
 		data = 0;
 		break;
 	case MSR_MTRRcap:
-- 
GitLab


From 61c50edfcd40be9126579f9cec68c789b6089998 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 22 Mar 2009 12:37:05 +0200
Subject: [PATCH 1738/2198] KVM: SVM: Remove duplicate code in
 svm_do_inject_vector()

svm_do_inject_vector() reimplements pop_irq().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1f8510c51d6e8..5b35ebd4ec88b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2346,15 +2346,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
-	struct kvm_vcpu *vcpu = &svm->vcpu;
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	svm_inject_irq(svm, irq);
+	svm_inject_irq(svm, pop_irq(&svm->vcpu));
 }
 
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-- 
GitLab


From fe4c7b1914ac46af751d256f5a20c2e12dcbaaae Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 23 Mar 2009 11:23:18 +0200
Subject: [PATCH 1739/2198] KVM: reuse (pop|push)_irq from svm.c in vmx.c

The prioritized bit vector manipulation functions are useful in both vmx and
svm.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 25 ++++---------------------
 arch/x86/kvm/vmx.c | 17 ++---------------
 arch/x86/kvm/x86.h | 18 ++++++++++++++++++
 3 files changed, 24 insertions(+), 36 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5b35ebd4ec88b..aa528dbad070f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 #include "kvm_cache_regs.h"
+#include "x86.h"
 
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -132,24 +133,6 @@ static inline u32 svm_has(u32 feat)
 	return svm_features & feat;
 }
 
-static inline u8 pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	return irq;
-}
-
-static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-	set_bit(irq, vcpu->arch.irq_pending);
-	set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
 static inline void clgi(void)
 {
 	asm volatile (__ex(SVM_CLGI));
@@ -1116,7 +1099,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	if (!irqchip_in_kernel(kvm) &&
 	    is_external_interrupt(exit_int_info)) {
 		event_injection = true;
-		push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
+		kvm_push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
 	}
 
 	fault_address  = svm->vmcb->control.exit_info_2;
@@ -2336,7 +2319,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 	if ((control->int_ctl & V_IRQ_MASK)
 	    && !irqchip_in_kernel(svm->vcpu.kvm)) {
 		control->int_ctl &= ~V_IRQ_MASK;
-		push_irq(&svm->vcpu, control->int_vector);
+		kvm_push_irq(&svm->vcpu, control->int_vector);
 	}
 
 	svm->vcpu.arch.interrupt_window_open =
@@ -2346,7 +2329,7 @@ static void kvm_reput_irq(struct vcpu_svm *svm)
 
 static void svm_do_inject_vector(struct vcpu_svm *svm)
 {
-	svm_inject_irq(svm, pop_irq(&svm->vcpu));
+	svm_inject_irq(svm, kvm_pop_irq(&svm->vcpu));
 }
 
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b5eae7a00aad4..2c0a2ed708d2a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2489,18 +2489,6 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu)
 				 GUEST_INTR_STATE_MOV_SS)));
 }
 
-static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	kvm_queue_interrupt(vcpu, irq);
-}
-
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
@@ -2534,7 +2522,7 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 
 	if (vcpu->arch.interrupt_window_open) {
 		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-			kvm_do_inject_irq(vcpu);
+			kvm_queue_interrupt(vcpu, kvm_pop_irq(vcpu));
 
 		if (vcpu->arch.interrupt.pending)
 			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
@@ -2619,8 +2607,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
 		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		set_bit(irq, vcpu->arch.irq_pending);
-		set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+		kvm_push_irq(vcpu, irq);
 	}
 
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 6a4be78a73843..2ab679102dcd0 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -19,4 +19,22 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.interrupt.pending = false;
 }
 
+static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
+{
+	int word_index = __ffs(vcpu->arch.irq_summary);
+	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
+	int irq = word_index * BITS_PER_LONG + bit_index;
+
+	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
+	if (!vcpu->arch.irq_pending[word_index])
+		clear_bit(word_index, &vcpu->arch.irq_summary);
+	return irq;
+}
+
+static inline void kvm_push_irq(struct kvm_vcpu *vcpu, u8 irq)
+{
+        set_bit(irq, vcpu->arch.irq_pending);
+        set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+}
+
 #endif
-- 
GitLab


From c1f8bc04c6f8576553dc87abe7562e868433a19f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 15:41:17 +0200
Subject: [PATCH 1740/2198] KVM: VMX: Make module parameters readable

Useful to see how the module was loaded.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2c0a2ed708d2a..469787ce93f7b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -39,19 +39,19 @@ MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
 static int bypass_guest_pf = 1;
-module_param(bypass_guest_pf, bool, 0);
+module_param(bypass_guest_pf, bool, S_IRUGO);
 
 static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0);
+module_param(enable_vpid, bool, 0444);
 
 static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, 0);
+module_param(flexpriority_enabled, bool, S_IRUGO);
 
 static int enable_ept = 1;
-module_param(enable_ept, bool, 0);
+module_param(enable_ept, bool, S_IRUGO);
 
 static int emulate_invalid_guest_state = 0;
-module_param(emulate_invalid_guest_state, bool, 0);
+module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 struct vmcs {
 	u32 revision_id;
-- 
GitLab


From 6062d012ed23c29672bb0f93ebcfb8e556def726 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 17:35:17 +0200
Subject: [PATCH 1741/2198] KVM: VMX: Rename kvm_handle_exit() to
 vmx_handle_exit()

It is a static vmx-specific function.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 469787ce93f7b..85f4fd5417564 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3162,7 +3162,7 @@ static const int kvm_vmx_max_exit_handlers =
  * The guest has exited.  See if we can fix it or if we need userspace
  * assistance.
  */
-static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3681,7 +3681,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.tlb_flush = vmx_flush_tlb,
 
 	.run = vmx_vcpu_run,
-	.handle_exit = kvm_handle_exit,
+	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
 	.patch_hypercall = vmx_patch_hypercall,
 	.get_irq = vmx_get_irq,
-- 
GitLab


From 736caefe1511d9d1116ed4ffb0ea95b7368beb1f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 17:39:48 +0200
Subject: [PATCH 1742/2198] KVM: VMX: Simplify module parameter names

Instead of 'enable_vpid=1', use a simple 'vpid=1'.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 85f4fd5417564..a69ba6ba42117 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -42,13 +42,13 @@ static int bypass_guest_pf = 1;
 module_param(bypass_guest_pf, bool, S_IRUGO);
 
 static int enable_vpid = 1;
-module_param(enable_vpid, bool, 0444);
+module_param_named(vpid, enable_vpid, bool, 0444);
 
 static int flexpriority_enabled = 1;
-module_param(flexpriority_enabled, bool, S_IRUGO);
+module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
 static int enable_ept = 1;
-module_param(enable_ept, bool, S_IRUGO);
+module_param_named(ept, enable_ept, bool, S_IRUGO);
 
 static int emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
-- 
GitLab


From 4462d21a615dfd0c0f672c10832a011d6f280d5a Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 17:53:37 +0200
Subject: [PATCH 1743/2198] KVM: VMX: Annotate module parameters as
 __read_mostly

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index a69ba6ba42117..f4b6c4bcee298 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -38,19 +38,19 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static int bypass_guest_pf = 1;
+static int __read_mostly bypass_guest_pf = 1;
 module_param(bypass_guest_pf, bool, S_IRUGO);
 
-static int enable_vpid = 1;
+static int __read_mostly enable_vpid = 1;
 module_param_named(vpid, enable_vpid, bool, 0444);
 
-static int flexpriority_enabled = 1;
+static int __read_mostly flexpriority_enabled = 1;
 module_param_named(flexpriority, flexpriority_enabled, bool, S_IRUGO);
 
-static int enable_ept = 1;
+static int __read_mostly enable_ept = 1;
 module_param_named(ept, enable_ept, bool, S_IRUGO);
 
-static int emulate_invalid_guest_state = 0;
+static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 struct vmcs {
-- 
GitLab


From 919818abc2ca0721f1cd296fbc24601d9044f993 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 18:01:29 +0200
Subject: [PATCH 1744/2198] KVM: VMX: Zero the vpid module parameter if vpid is
 not supported

This allows reading back how the hardware is configured.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f4b6c4bcee298..9b97c8e3cfd1f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1202,6 +1202,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		      vmx_capability.ept, vmx_capability.vpid);
 	}
 
+	if (!cpu_has_vmx_vpid())
+		enable_vpid = 0;
+
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
@@ -2082,7 +2085,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
 	int vpid;
 
 	vmx->vpid = 0;
-	if (!enable_vpid || !cpu_has_vmx_vpid())
+	if (!enable_vpid)
 		return;
 	spin_lock(&vmx_vpid_lock);
 	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-- 
GitLab


From 575ff2dcb25608d53737d1126ee0e7e4d6f11752 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 18:25:15 +0200
Subject: [PATCH 1745/2198] KVM: VMX: Zero ept module parameter if ept is not
 present

Allows reading back hardware capability.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9b97c8e3cfd1f..2f65120cf283f 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -265,7 +265,7 @@ static inline int cpu_has_vmx_ept(void)
 
 static inline int vm_need_ept(void)
 {
-	return (cpu_has_vmx_ept() && enable_ept);
+	return enable_ept;
 }
 
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
@@ -1205,6 +1205,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
 
+	if (!cpu_has_vmx_ept())
+		enable_ept = 0;
+
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-- 
GitLab


From 089d034e0c4538d2436512fa64782b91008d4a7c Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 23 Mar 2009 18:26:32 +0200
Subject: [PATCH 1746/2198] KVM: VMX: Fold vm_need_ept() into callers

Trivial.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 2f65120cf283f..da6461d5dc84a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -263,11 +263,6 @@ static inline int cpu_has_vmx_ept(void)
 		SECONDARY_EXEC_ENABLE_EPT);
 }
 
-static inline int vm_need_ept(void)
-{
-	return enable_ept;
-}
-
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
 	return ((cpu_has_vmx_virtualize_apic_accesses()) &&
@@ -382,7 +377,7 @@ static inline void ept_sync_global(void)
 
 static inline void ept_sync_context(u64 eptp)
 {
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		if (cpu_has_vmx_invept_context())
 			__invept(VMX_EPT_EXTENT_CONTEXT, eptp, 0);
 		else
@@ -392,7 +387,7 @@ static inline void ept_sync_context(u64 eptp)
 
 static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
 {
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		if (cpu_has_vmx_invept_individual_addr())
 			__invept(VMX_EPT_EXTENT_INDIVIDUAL_ADDR,
 					eptp, gpa);
@@ -491,7 +486,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	}
 	if (vcpu->arch.rmode.active)
 		eb = ~0;
-	if (vm_need_ept())
+	if (enable_ept)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -1502,7 +1497,7 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 	vpid_sync_vcpu_all(to_vmx(vcpu));
-	if (vm_need_ept())
+	if (enable_ept)
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
 }
 
@@ -1587,7 +1582,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	}
 #endif
 
-	if (vm_need_ept())
+	if (enable_ept)
 		ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
 
 	vmcs_writel(CR0_READ_SHADOW, cr0);
@@ -1616,7 +1611,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	u64 eptp;
 
 	guest_cr3 = cr3;
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		eptp = construct_eptp(cr3);
 		vmcs_write64(EPT_POINTER, eptp);
 		ept_sync_context(eptp);
@@ -1637,7 +1632,7 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
 	vcpu->arch.cr4 = cr4;
-	if (vm_need_ept())
+	if (enable_ept)
 		ept_update_paging_mode_cr4(&hw_cr4, vcpu);
 
 	vmcs_writel(CR4_READ_SHADOW, cr4);
@@ -1999,7 +1994,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
 	pfn_t identity_map_pfn;
 	u32 tmp;
 
-	if (!vm_need_ept())
+	if (!enable_ept)
 		return 1;
 	if (unlikely(!kvm->arch.ept_identity_pagetable)) {
 		printk(KERN_ERR "EPT: identity-mapping pagetable "
@@ -2163,7 +2158,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				CPU_BASED_CR8_LOAD_EXITING;
 #endif
 	}
-	if (!vm_need_ept())
+	if (!enable_ept)
 		exec_control |= CPU_BASED_CR3_STORE_EXITING |
 				CPU_BASED_CR3_LOAD_EXITING  |
 				CPU_BASED_INVLPG_EXITING;
@@ -2176,7 +2171,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 				~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 		if (vmx->vpid == 0)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
-		if (!vm_need_ept())
+		if (!enable_ept)
 			exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
 	}
@@ -2637,7 +2632,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
 	if (is_page_fault(intr_info)) {
 		/* EPT won't cause page fault directly */
-		if (vm_need_ept())
+		if (enable_ept)
 			BUG();
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
@@ -3187,7 +3182,7 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	/* Access CR3 don't cause VMExit in paging mode, so we need
 	 * to sync with guest real CR3. */
-	if (vm_need_ept() && is_paging(vcpu)) {
+	if (enable_ept && is_paging(vcpu)) {
 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 		ept_load_pdptrs(vcpu);
 	}
@@ -3602,7 +3597,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 		if (alloc_apic_access_page(kvm) != 0)
 			goto free_vmcs;
 
-	if (vm_need_ept())
+	if (enable_ept)
 		if (alloc_identity_pagetable(kvm) != 0)
 			goto free_vmcs;
 
@@ -3753,7 +3748,7 @@ static int __init vmx_init(void)
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
 	vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
 
-	if (vm_need_ept()) {
+	if (enable_ept) {
 		bypass_guest_pf = 0;
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
 			VMX_EPT_WRITABLE_MASK);
-- 
GitLab


From 09cec754885f900f6aab23801878c0cd217ee1d6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 23 Mar 2009 15:11:44 +0200
Subject: [PATCH 1747/2198] KVM: Timer event should not unconditionally unhalt
 vcpu.

Currently timer events are processed before entering guest mode. Move it
to main vcpu event loop since timer events should be processed even while
vcpu is halted.  Timer may cause interrupt/nmi to be injected and only then
vcpu will be unhalted.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c |  6 ++---
 arch/x86/kvm/x86.c       | 57 ++++++++++++++++++++++++----------------
 virt/kvm/kvm_main.c      |  5 ++--
 3 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 4623a90e515ab..d2a90fd505b00 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -488,10 +488,10 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 		hrtimer_cancel(p_ht);
 		vcpu->arch.ht_active = 0;
 
-		if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+		if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests) ||
+				kvm_cpu_has_pending_timer(vcpu))
 			if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
-				vcpu->arch.mp_state =
-					KVM_MP_STATE_RUNNABLE;
+				vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 		if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
 			return -EINTR;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c0ae5e6cba9b6..8fca7a4e95a3e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3133,9 +3133,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 	}
 
-	clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
-	kvm_inject_pending_timer_irqs(vcpu);
-
 	preempt_disable();
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -3235,6 +3232,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	return r;
 }
 
+
 static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
@@ -3261,29 +3259,42 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			kvm_vcpu_block(vcpu);
 			down_read(&vcpu->kvm->slots_lock);
 			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
-				if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
+			{
+				switch(vcpu->arch.mp_state) {
+				case KVM_MP_STATE_HALTED:
 					vcpu->arch.mp_state =
-							KVM_MP_STATE_RUNNABLE;
-			if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
-				r = -EINTR;
+						KVM_MP_STATE_RUNNABLE;
+				case KVM_MP_STATE_RUNNABLE:
+					break;
+				case KVM_MP_STATE_SIPI_RECEIVED:
+				default:
+					r = -EINTR;
+					break;
+				}
+			}
 		}
 
-		if (r > 0) {
-			if (dm_request_for_irq_injection(vcpu, kvm_run)) {
-				r = -EINTR;
-				kvm_run->exit_reason = KVM_EXIT_INTR;
-				++vcpu->stat.request_irq_exits;
-			}
-			if (signal_pending(current)) {
-				r = -EINTR;
-				kvm_run->exit_reason = KVM_EXIT_INTR;
-				++vcpu->stat.signal_exits;
-			}
-			if (need_resched()) {
-				up_read(&vcpu->kvm->slots_lock);
-				kvm_resched(vcpu);
-				down_read(&vcpu->kvm->slots_lock);
-			}
+		if (r <= 0)
+			break;
+
+		clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		if (kvm_cpu_has_pending_timer(vcpu))
+			kvm_inject_pending_timer_irqs(vcpu);
+
+		if (dm_request_for_irq_injection(vcpu, kvm_run)) {
+			r = -EINTR;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.request_irq_exits;
+		}
+		if (signal_pending(current)) {
+			r = -EINTR;
+			kvm_run->exit_reason = KVM_EXIT_INTR;
+			++vcpu->stat.signal_exits;
+		}
+		if (need_resched()) {
+			up_read(&vcpu->kvm->slots_lock);
+			kvm_resched(vcpu);
+			down_read(&vcpu->kvm->slots_lock);
 		}
 	}
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 934dd1c9487e3..a1a4272fa57cb 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1611,11 +1611,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 		if (kvm_cpu_has_interrupt(vcpu) ||
-		    kvm_cpu_has_pending_timer(vcpu) ||
-		    kvm_arch_vcpu_runnable(vcpu)) {
+				kvm_arch_vcpu_runnable(vcpu)) {
 			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
 		}
+		if (kvm_cpu_has_pending_timer(vcpu))
+			break;
 		if (signal_pending(current))
 			break;
 
-- 
GitLab


From 78646121e9a2fcf7977cc15966420e572a450bc3 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 23 Mar 2009 12:12:11 +0200
Subject: [PATCH 1748/2198] KVM: Fix interrupt unhalting a vcpu when it
 shouldn't

kvm_vcpu_block() unhalts vpu on an interrupt/timer without checking
if interrupt window is actually opened.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c        |  6 ++++++
 arch/powerpc/kvm/powerpc.c      |  6 ++++++
 arch/s390/kvm/interrupt.c       |  6 ++++++
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              | 10 ++++++++++
 arch/x86/kvm/vmx.c              |  8 +++++++-
 arch/x86/kvm/x86.c              |  5 +++++
 include/linux/kvm_host.h        |  1 +
 virt/kvm/kvm_main.c             |  3 ++-
 9 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index d2a90fd505b00..3bf0a345224af 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1963,6 +1963,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
+
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.timer_fired;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 9057335fdc616..2cf915e51e7e9 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -41,6 +41,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	return !!(v->arch.pending_exceptions);
 }
 
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
+
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.msr & MSR_WE);
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 0189356fe2098..4ed4c3a11485f 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -318,6 +318,12 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	return rc;
 }
 
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	/* do real check here */
+	return 1;
+}
+
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
 	return 0;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 46276273a1a16..8351c4d00ac0f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -521,7 +521,7 @@ struct kvm_x86_ops {
 	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
 	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
 				       struct kvm_run *run);
-
+	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	int (*get_mt_mask_shift)(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index aa528dbad070f..de741043c5b15 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2270,6 +2270,15 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 		vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
 }
 
+static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+	return (vmcb->save.rflags & X86_EFLAGS_IF) &&
+		!(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		(svm->vcpu.arch.hflags & HF_GIF_MASK);
+}
+
 static void svm_intr_assist(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2649,6 +2658,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
 	.inject_pending_vectors = do_interrupt_requests,
+	.interrupt_allowed = svm_interrupt_allowed,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index da6461d5dc84a..b9e06b07aca12 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2490,6 +2490,12 @@ static void vmx_update_window_states(struct kvm_vcpu *vcpu)
 				 GUEST_INTR_STATE_MOV_SS)));
 }
 
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	vmx_update_window_states(vcpu);
+	return vcpu->arch.interrupt_window_open;
+}
+
 static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 				       struct kvm_run *kvm_run)
 {
@@ -3691,7 +3697,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.exception_injected = vmx_exception_injected,
 	.inject_pending_irq = vmx_intr_assist,
 	.inject_pending_vectors = do_interrupt_requests,
-
+	.interrupt_allowed = vmx_interrupt_allowed,
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
 	.get_mt_mask_shift = vmx_get_mt_mask_shift,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8fca7a4e95a3e..5bbcad345376e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4475,3 +4475,8 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
 	put_cpu();
 }
+
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
+{
+	return kvm_x86_ops->interrupt_allowed(vcpu);
+}
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 40e49ede8f914..72d56844f3880 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -298,6 +298,7 @@ int kvm_arch_hardware_setup(void);
 void kvm_arch_hardware_unsetup(void);
 void kvm_arch_check_processor_compat(void *rtn);
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
+int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
 
 void kvm_free_physmem(struct kvm *kvm);
 
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a1a4272fa57cb..63d5fa2bc84ac 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1610,7 +1610,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
-		if (kvm_cpu_has_interrupt(vcpu) ||
+		if ((kvm_arch_interrupt_allowed(vcpu) &&
+					kvm_cpu_has_interrupt(vcpu)) ||
 				kvm_arch_vcpu_runnable(vcpu)) {
 			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
 			break;
-- 
GitLab


From 7d433b9f942606f66da8ef68b8baecd2915c5627 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 24 Mar 2009 14:27:47 +0200
Subject: [PATCH 1749/2198] KVM: VMX: Make flexpriority module parameter
 reflect hardware capability

If the hardware does not support flexpriority, zero the module parameter.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b9e06b07aca12..37ae13d7b8143 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -237,9 +237,7 @@ static inline int cpu_has_secondary_exec_ctrls(void)
 
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
-	return flexpriority_enabled
-		&& (vmcs_config.cpu_based_2nd_exec_ctrl &
-		    SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES);
+	return flexpriority_enabled;
 }
 
 static inline int cpu_has_vmx_invept_individual_addr(void)
@@ -1203,6 +1201,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!cpu_has_vmx_ept())
 		enable_ept = 0;
 
+	if (!(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+		flexpriority_enabled = 0;
+
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
-- 
GitLab


From f9c617f61127615dd054f3f159213bdd12451cab Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 25 Mar 2009 10:08:52 +0800
Subject: [PATCH 1750/2198] KVM: VMX: Correct wrong vmcs field sizes

EXIT_QUALIFICATION and GUEST_LINEAR_ADDRESS are natural width, not 64-bit.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 37ae13d7b8143..aba41ae2c81fb 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2991,7 +2991,7 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	kvm_mmu_invlpg(vcpu, exit_qualification);
 	skip_emulated_instruction(vcpu);
@@ -3007,11 +3007,11 @@ static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	u64 exit_qualification;
+	unsigned long exit_qualification;
 	enum emulation_result er;
 	unsigned long offset;
 
-	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	offset = exit_qualification & 0xffful;
 
 	er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
@@ -3062,11 +3062,11 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
-	u64 exit_qualification;
+	unsigned long exit_qualification;
 	gpa_t gpa;
 	int gla_validity;
 
-	exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
+	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	if (exit_qualification & (1 << 6)) {
 		printk(KERN_ERR "EPT: GPA exceeds GAW!\n");
@@ -3078,7 +3078,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		printk(KERN_ERR "EPT: Handling EPT violation failed!\n");
 		printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n",
 			(long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS),
-			(long unsigned int)vmcs_read64(GUEST_LINEAR_ADDRESS));
+			vmcs_readl(GUEST_LINEAR_ADDRESS));
 		printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
 			(long unsigned int)exit_qualification);
 		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
-- 
GitLab


From a8b876b1a469cb364fee16ba3aef01613a1231cc Mon Sep 17 00:00:00 2001
From: Eddie Dong <eddie.dong@intel.com>
Date: Thu, 26 Mar 2009 15:28:40 +0800
Subject: [PATCH 1751/2198] KVM: MMU: Fix comment in page_fault()

The original one is for the code before refactoring.

Signed-off-by: Yaozu (Eddie) Dong <eddie.dong@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 855eb71110490..eae949973d09c 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -379,7 +379,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 		return r;
 
 	/*
-	 * Look up the shadow pte for the faulting address.
+	 * Look up the guest pte for the faulting address.
 	 */
 	r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault,
 			     fetch_fault);
-- 
GitLab


From 362c1055e58ecd25a9393c520ab263c80b147497 Mon Sep 17 00:00:00 2001
From: Yang Zhang <yang.zhang@intel.com>
Date: Mon, 23 Mar 2009 03:31:04 -0400
Subject: [PATCH 1752/2198] KVM: ia64: enable external interrupt in vmm

Currently, the interrupt enable bit is cleared when in
the vmm.  This patch sets the bit and the external interrupts can
be dealt with when in the vmm.  This improves the I/O performance.

Signed-off-by: Yang Zhang <yang.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/process.c |  5 +++++
 arch/ia64/kvm/vmm_ivt.S | 18 +++++++++---------
 arch/ia64/kvm/vtlb.c    |  3 ++-
 3 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/arch/ia64/kvm/process.c b/arch/ia64/kvm/process.c
index b1dc80952d91f..a8f84da04b49b 100644
--- a/arch/ia64/kvm/process.c
+++ b/arch/ia64/kvm/process.c
@@ -652,20 +652,25 @@ void  kvm_ia64_handle_break(unsigned long ifa, struct kvm_pt_regs *regs,
 		unsigned long isr, unsigned long iim)
 {
 	struct kvm_vcpu *v = current_vcpu;
+	long psr;
 
 	if (ia64_psr(regs)->cpl == 0) {
 		/* Allow hypercalls only when cpl = 0.  */
 		if (iim == DOMN_PAL_REQUEST) {
+			local_irq_save(psr);
 			set_pal_call_data(v);
 			vmm_transition(v);
 			get_pal_call_result(v);
 			vcpu_increment_iip(v);
+			local_irq_restore(psr);
 			return;
 		} else if (iim == DOMN_SAL_REQUEST) {
+			local_irq_save(psr);
 			set_sal_call_data(v);
 			vmm_transition(v);
 			get_sal_call_result(v);
 			vcpu_increment_iip(v);
+			local_irq_restore(psr);
 			return;
 		}
 	}
diff --git a/arch/ia64/kvm/vmm_ivt.S b/arch/ia64/kvm/vmm_ivt.S
index 3ef1a017a3189..40920c630649c 100644
--- a/arch/ia64/kvm/vmm_ivt.S
+++ b/arch/ia64/kvm/vmm_ivt.S
@@ -95,7 +95,7 @@ GLOBAL_ENTRY(kvm_vmm_panic)
 	;;
 	srlz.i    // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -249,7 +249,7 @@ ENTRY(kvm_break_fault)
 	;;
 	srlz.i         // guarantee that interruption collection is on
 	;;
-	//(p15)ssm psr.i               // restore psr.i
+	(p15)ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -439,7 +439,7 @@ kvm_dispatch_vexirq:
 	;;
 	srlz.i // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	adds r3=8,r2                // set up second base pointer
 	;;
 	KVM_SAVE_REST
@@ -819,7 +819,7 @@ ENTRY(kvm_dtlb_miss_dispatch)
 	;;
 	srlz.i     // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
 	;;
 	KVM_SAVE_REST
@@ -842,7 +842,7 @@ ENTRY(kvm_itlb_miss_dispatch)
 	;;
 	srlz.i   // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -871,7 +871,7 @@ ENTRY(kvm_dispatch_reflection)
 	;;
 	srlz.i   // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -898,7 +898,7 @@ ENTRY(kvm_dispatch_virtualization_fault)
 	;;
 	srlz.i    // guarantee that interruption collection is on
 	;;
-	//(p15) ssm psr.i               // restore psr.i
+	(p15) ssm psr.i               // restore psr.i
 	addl r14=@gprel(ia64_leave_hypervisor_prepare),gp
 	;;
 	KVM_SAVE_REST
@@ -920,7 +920,7 @@ ENTRY(kvm_dispatch_interrupt)
 	;;
 	srlz.i
 	;;
-	//(p15) ssm psr.i
+	(p15) ssm psr.i
 	addl r14=@gprel(ia64_leave_hypervisor),gp
 	;;
 	KVM_SAVE_REST
@@ -1333,7 +1333,7 @@ hostret =   r24
 	;;
 (p7)    srlz.i
 	;;
-//(p6)    ssm psr.i
+(p6)    ssm psr.i
 	;;
 	mov rp=rpsave
 	mov ar.pfs=pfssave
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index 2c2501f131597..4290a429bf7c9 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -254,7 +254,8 @@ u64 guest_vhpt_lookup(u64 iha, u64 *pte)
 			"(p7) st8 [%2]=r9;;"
 			"ssm psr.ic;;"
 			"srlz.d;;"
-			/* "ssm psr.i;;" Once interrupts in vmm open, need fix*/
+			"ssm psr.i;;"
+			"srlz.d;;"
 			: "=r"(ret) : "r"(iha), "r"(pte):"memory");
 
 	return ret;
-- 
GitLab


From 82725b20e22fb85377f61a16f6d0d5cfc28b45d3 Mon Sep 17 00:00:00 2001
From: "Dong, Eddie" <eddie.dong@intel.com>
Date: Mon, 30 Mar 2009 16:21:08 +0800
Subject: [PATCH 1753/2198] KVM: MMU: Emulate #PF error code of reserved bits
 violation

Detect, indicate, and propagate page faults where reserved bits are set.
Take care to handle the different paging modes, each of which has different
sets of reserved bits.

[avi: fix pte reserved bits for efer.nxe=0]

Signed-off-by: Eddie Dong <eddie.dong@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +
 arch/x86/kvm/mmu.c              | 69 +++++++++++++++++++++++++++++++++
 arch/x86/kvm/paging_tmpl.h      |  7 ++++
 arch/x86/kvm/x86.c              | 10 +++++
 4 files changed, 88 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8351c4d00ac0f..548b97d284d38 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -261,6 +261,7 @@ struct kvm_mmu {
 	union kvm_mmu_page_role base_role;
 
 	u64 *pae_root;
+	u64 rsvd_bits_mask[2][4];
 };
 
 struct kvm_vcpu_arch {
@@ -791,5 +792,6 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
 #define KVM_ARCH_WANT_MMU_NOTIFIER
 int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
 int kvm_age_hva(struct kvm *kvm, unsigned long hva);
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 
 #endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 8aac67cbd92f0..b2c8e28021c95 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -126,6 +126,7 @@ module_param(oos_shadow, bool, 0644);
 #define PFERR_PRESENT_MASK (1U << 0)
 #define PFERR_WRITE_MASK (1U << 1)
 #define PFERR_USER_MASK (1U << 2)
+#define PFERR_RSVD_MASK (1U << 3)
 #define PFERR_FETCH_MASK (1U << 4)
 
 #define PT_DIRECTORY_LEVEL 2
@@ -179,6 +180,11 @@ static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
 static u64 __read_mostly shadow_mt_mask;
 
+static inline u64 rsvd_bits(int s, int e)
+{
+	return ((1ULL << (e - s + 1)) - 1) << s;
+}
+
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
 {
 	shadow_trap_nonpresent_pte = trap_pte;
@@ -2151,6 +2157,14 @@ static void paging_free(struct kvm_vcpu *vcpu)
 	nonpaging_free(vcpu);
 }
 
+static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
+{
+	int bit7;
+
+	bit7 = (gpte >> 7) & 1;
+	return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0;
+}
+
 #define PTTYPE 64
 #include "paging_tmpl.h"
 #undef PTTYPE
@@ -2159,6 +2173,55 @@ static void paging_free(struct kvm_vcpu *vcpu)
 #include "paging_tmpl.h"
 #undef PTTYPE
 
+static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
+{
+	struct kvm_mmu *context = &vcpu->arch.mmu;
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
+	u64 exb_bit_rsvd = 0;
+
+	if (!is_nx(vcpu))
+		exb_bit_rsvd = rsvd_bits(63, 63);
+	switch (level) {
+	case PT32_ROOT_LEVEL:
+		/* no rsvd bits for 2 level 4K page table entries */
+		context->rsvd_bits_mask[0][1] = 0;
+		context->rsvd_bits_mask[0][0] = 0;
+		if (is_cpuid_PSE36())
+			/* 36bits PSE 4MB page */
+			context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+		else
+			/* 32 bits PSE 4MB page */
+			context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+		context->rsvd_bits_mask[1][0] = ~0ull;
+		break;
+	case PT32E_ROOT_LEVEL:
+		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 62);		/* PDE */
+		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 62); 	/* PTE */
+		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 62) |
+			rsvd_bits(13, 20);		/* large page */
+		context->rsvd_bits_mask[1][0] = ~0ull;
+		break;
+	case PT64_ROOT_LEVEL:
+		context->rsvd_bits_mask[0][3] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51);
+		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
+		context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
+		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
+			rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
+		context->rsvd_bits_mask[1][0] = ~0ull;
+		break;
+	}
+}
+
 static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
@@ -2179,6 +2242,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
 
 static int paging64_init_context(struct kvm_vcpu *vcpu)
 {
+	reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
 	return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
 }
 
@@ -2186,6 +2250,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
+	reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 	context->new_cr3 = paging_new_cr3;
 	context->page_fault = paging32_page_fault;
 	context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2201,6 +2266,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
 
 static int paging32E_init_context(struct kvm_vcpu *vcpu)
 {
+	reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
 	return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
 }
 
@@ -2221,12 +2287,15 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 		context->gva_to_gpa = nonpaging_gva_to_gpa;
 		context->root_level = 0;
 	} else if (is_long_mode(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT64_ROOT_LEVEL;
 	} else if (is_pae(vcpu)) {
+		reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL);
 		context->gva_to_gpa = paging64_gva_to_gpa;
 		context->root_level = PT32E_ROOT_LEVEL;
 	} else {
+		reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
 		context->gva_to_gpa = paging32_gva_to_gpa;
 		context->root_level = PT32_ROOT_LEVEL;
 	}
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index eae949973d09c..09782a982785d 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -123,6 +123,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 	gfn_t table_gfn;
 	unsigned index, pt_access, pte_access;
 	gpa_t pte_gpa;
+	int rsvd_fault = 0;
 
 	pgprintk("%s: addr %lx\n", __func__, addr);
 walk:
@@ -157,6 +158,10 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 		if (!is_present_pte(pte))
 			goto not_present;
 
+		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
+		if (rsvd_fault)
+			goto access_error;
+
 		if (write_fault && !is_writeble_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
 				goto access_error;
@@ -232,6 +237,8 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 		walker->error_code |= PFERR_USER_MASK;
 	if (fetch_fault)
 		walker->error_code |= PFERR_FETCH_MASK;
+	if (rsvd_fault)
+		walker->error_code |= PFERR_RSVD_MASK;
 	return 0;
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5bbcad345376e..df866684bad1f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3017,6 +3017,16 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 	return best;
 }
 
+int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+	if (best)
+		return best->eax & 0xff;
+	return 36;
+}
+
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
 {
 	u32 function, index;
-- 
GitLab


From 9645bb56b31a1b70ab9e470387b5264cafc04aa9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Tue, 31 Mar 2009 11:31:54 +0300
Subject: [PATCH 1754/2198] KVM: MMU: Use different shadows when EFER.NXE
 changes

A pte that is shadowed when the guest EFER.NXE=1 is not valid when
EFER.NXE=0; if bit 63 is set, the pte should cause a fault, and since the
shadow EFER always has NX enabled, this won't happen.

Fix by using a different shadow page table for different EFER.NXE bits.  This
allows vcpus to run correctly with different values of EFER.NXE, and for
transitions on this bit to be handled correctly without requiring a full
flush.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/x86.c              | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 548b97d284d38..3fc46238476c1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -185,6 +185,7 @@ union kvm_mmu_page_role {
 		unsigned access:3;
 		unsigned invalid:1;
 		unsigned cr4_pge:1;
+		unsigned nxe:1;
 	};
 };
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index df866684bad1f..007fadd62529e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -523,6 +523,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
 	efer |= vcpu->arch.shadow_efer & EFER_LMA;
 
 	vcpu->arch.shadow_efer = efer;
+
+	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
+	kvm_mmu_reset_context(vcpu);
 }
 
 void kvm_enable_efer_bits(u64 mask)
-- 
GitLab


From 7a6ce84c74a3064f6b6dda9e2d55c509cd4e78e4 Mon Sep 17 00:00:00 2001
From: Wei Yongjun <yjwei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 16:47:44 +0800
Subject: [PATCH 1755/2198] KVM: remove pointless conditional before kfree() in
 lapic initialization

Remove pointless conditional before kfree().

Signed-off-by: Wei Yongjun <yjwei@cn.fujitsu.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 007fadd62529e..7fe83fe145f97 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1592,8 +1592,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 	}
 out:
-	if (lapic)
-		kfree(lapic);
+	kfree(lapic);
 	return r;
 }
 
-- 
GitLab


From 045471563df4b8723202a66ae54d58788b0f8e88 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 1 Apr 2009 15:52:31 +0800
Subject: [PATCH 1756/2198] KVM: VMX: Clean up Flex Priority related

And clean paranthes on returns.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 47 +++++++++++++++++++++++++++++-----------------
 1 file changed, 30 insertions(+), 17 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index aba41ae2c81fb..1caa1fc6d5ece 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -216,61 +216,69 @@ static inline int is_external_interrupt(u32 intr_info)
 
 static inline int cpu_has_vmx_msr_bitmap(void)
 {
-	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS);
+	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
 }
 
 static inline int cpu_has_vmx_tpr_shadow(void)
 {
-	return (vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW);
+	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 
 static inline int vm_need_tpr_shadow(struct kvm *kvm)
 {
-	return ((cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm)));
+	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
 }
 
 static inline int cpu_has_secondary_exec_ctrls(void)
 {
-	return (vmcs_config.cpu_based_exec_ctrl &
-		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+	return vmcs_config.cpu_based_exec_ctrl &
+		CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
 }
 
 static inline bool cpu_has_vmx_virtualize_apic_accesses(void)
 {
-	return flexpriority_enabled;
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+}
+
+static inline bool cpu_has_vmx_flexpriority(void)
+{
+	return cpu_has_vmx_tpr_shadow() &&
+		cpu_has_vmx_virtualize_apic_accesses();
 }
 
 static inline int cpu_has_vmx_invept_individual_addr(void)
 {
-	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT));
+	return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
 }
 
 static inline int cpu_has_vmx_invept_context(void)
 {
-	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT));
+	return !!(vmx_capability.ept & VMX_EPT_EXTENT_CONTEXT_BIT);
 }
 
 static inline int cpu_has_vmx_invept_global(void)
 {
-	return (!!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT));
+	return !!(vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT);
 }
 
 static inline int cpu_has_vmx_ept(void)
 {
-	return (vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_ENABLE_EPT);
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_EPT;
 }
 
 static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
 {
-	return ((cpu_has_vmx_virtualize_apic_accesses()) &&
-		(irqchip_in_kernel(kvm)));
+	return flexpriority_enabled &&
+		(cpu_has_vmx_virtualize_apic_accesses()) &&
+		(irqchip_in_kernel(kvm));
 }
 
 static inline int cpu_has_vmx_vpid(void)
 {
-	return (vmcs_config.cpu_based_2nd_exec_ctrl &
-		SECONDARY_EXEC_ENABLE_VPID);
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_ENABLE_VPID;
 }
 
 static inline int cpu_has_virtual_nmis(void)
@@ -278,6 +286,11 @@ static inline int cpu_has_virtual_nmis(void)
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
 
+static inline bool report_flexpriority(void)
+{
+	return flexpriority_enabled;
+}
+
 static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
 {
 	int i;
@@ -1201,7 +1214,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	if (!cpu_has_vmx_ept())
 		enable_ept = 0;
 
-	if (!(vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
 
 	min = 0;
@@ -3655,7 +3668,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.check_processor_compatibility = vmx_check_processor_compat,
 	.hardware_enable = hardware_enable,
 	.hardware_disable = hardware_disable,
-	.cpu_has_accelerated_tpr = cpu_has_vmx_virtualize_apic_accesses,
+	.cpu_has_accelerated_tpr = report_flexpriority,
 
 	.vcpu_create = vmx_create_vcpu,
 	.vcpu_free = vmx_free_vcpu,
-- 
GitLab


From 93ba03c2e2aba23b042cc15eef83b7a66d3ac17a Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 1 Apr 2009 15:52:32 +0800
Subject: [PATCH 1757/2198] KVM: VMX: Fix feature testing

The testing of feature is too early now, before vmcs_config complete initialization.

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 1caa1fc6d5ece..7d7b0d6e3f587 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1208,15 +1208,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 		      vmx_capability.ept, vmx_capability.vpid);
 	}
 
-	if (!cpu_has_vmx_vpid())
-		enable_vpid = 0;
-
-	if (!cpu_has_vmx_ept())
-		enable_ept = 0;
-
-	if (!cpu_has_vmx_flexpriority())
-		flexpriority_enabled = 0;
-
 	min = 0;
 #ifdef CONFIG_X86_64
 	min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
@@ -1320,6 +1311,15 @@ static __init int hardware_setup(void)
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 
+	if (!cpu_has_vmx_vpid())
+		enable_vpid = 0;
+
+	if (!cpu_has_vmx_ept())
+		enable_ept = 0;
+
+	if (!cpu_has_vmx_flexpriority())
+		flexpriority_enabled = 0;
+
 	return alloc_kvm_area();
 }
 
-- 
GitLab


From 20c466b56168ddccf034c136510d73e4a0e18605 Mon Sep 17 00:00:00 2001
From: "Dong, Eddie" <eddie.dong@intel.com>
Date: Tue, 31 Mar 2009 23:03:45 +0800
Subject: [PATCH 1758/2198] KVM: Use rsvd_bits_mask in load_pdptrs()

Also remove bit 5-6 from rsvd_bits_mask per latest SDM.

Signed-off-by: Eddie Dong <Eddie.Dong@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 +++-----
 arch/x86/kvm/mmu.h | 5 +++++
 arch/x86/kvm/x86.c | 3 ++-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b2c8e28021c95..da3ad3cf3142f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -225,11 +225,6 @@ static int is_nx(struct kvm_vcpu *vcpu)
 	return vcpu->arch.shadow_efer & EFER_NX;
 }
 
-static int is_present_pte(unsigned long pte)
-{
-	return pte & PT_PRESENT_MASK;
-}
-
 static int is_shadow_present_pte(u64 pte)
 {
 	return pte != shadow_trap_nonpresent_pte
@@ -2195,6 +2190,9 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[1][0] = ~0ull;
 		break;
 	case PT32E_ROOT_LEVEL:
+		context->rsvd_bits_mask[0][2] =
+			rsvd_bits(maxphyaddr, 63) |
+			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62);		/* PDE */
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index eaab2145f62b2..3494a2fb136ee 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -75,4 +75,9 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr0 & X86_CR0_PG;
 }
 
+static inline int is_present_pte(unsigned long pte)
+{
+	return pte & PT_PRESENT_MASK;
+}
+
 #endif
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7fe83fe145f97..70ee81e50d99e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -234,7 +234,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 		goto out;
 	}
 	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
-		if ((pdpte[i] & 1) && (pdpte[i] & 0xfffffff0000001e6ull)) {
+		if (is_present_pte(pdpte[i]) &&
+		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
 			ret = 0;
 			goto out;
 		}
-- 
GitLab


From 7b4a25cb296e2a73d2e87a4af65361d45d450a27 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:08 +0300
Subject: [PATCH 1759/2198] KVM: VMX: Fix handling of a fault during NMI
 unblocked due to IRET

Bit 12 is undefined in any of the following cases:
 If the VM exit sets the valid bit in the IDT-vectoring information field.
 If the VM exit is due to a double fault.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7d7b0d6e3f587..631f9b720971d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3272,36 +3272,41 @@ static void update_tpr_threshold(struct kvm_vcpu *vcpu)
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 {
 	u32 exit_intr_info;
-	u32 idt_vectoring_info;
+	u32 idt_vectoring_info = vmx->idt_vectoring_info;
 	bool unblock_nmi;
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
 	u32 error;
 
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 	if (cpu_has_virtual_nmis()) {
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
 		/*
-		 * SDM 3: 25.7.1.2
+		 * SDM 3: 27.7.1.2 (September 2008)
 		 * Re-set bit "block by NMI" before VM entry if vmexit caused by
 		 * a guest IRET fault.
+		 * SDM 3: 23.2.2 (September 2008)
+		 * Bit 12 is undefined in any of the following cases:
+		 *  If the VM exit sets the valid bit in the IDT-vectoring
+		 *   information field.
+		 *  If the VM exit is due to a double fault.
 		 */
-		if (unblock_nmi && vector != DF_VECTOR)
+		if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
+		    vector != DF_VECTOR && !idtv_info_valid)
 			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
 				      GUEST_INTR_STATE_NMI);
 	} else if (unlikely(vmx->soft_vnmi_blocked))
 		vmx->vnmi_blocked_time +=
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
-	idt_vectoring_info = vmx->idt_vectoring_info;
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 	if (vmx->vcpu.arch.nmi_injected) {
 		/*
-		 * SDM 3: 25.7.1.2
+		 * SDM 3: 27.7.1.2 (September 2008)
 		 * Clear bit "block by NMI" before VM entry if a NMI delivery
 		 * faulted.
 		 */
-- 
GitLab


From 37b96e988053c4dd21811b0408a12f8f60b4d0c8 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:13 +0300
Subject: [PATCH 1760/2198] KVM: VMX: Rewrite vmx_complete_interrupt()'s
 twisted maze of if() statements

...with a more straightforward switch().

Also fix a bug when NMI could be dropped on exit. Although this should
never happen in practice, since NMIs can only be injected, never triggered
internally by the guest like exceptions.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 43 +++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 631f9b720971d..577aa9551a979 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3277,7 +3277,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	u8 vector;
 	int type;
 	bool idtv_info_valid;
-	u32 error;
 
 	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
@@ -3302,34 +3301,42 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		vmx->vnmi_blocked_time +=
 			ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
 
+	vmx->vcpu.arch.nmi_injected = false;
+	kvm_clear_exception_queue(&vmx->vcpu);
+	kvm_clear_interrupt_queue(&vmx->vcpu);
+
+	if (!idtv_info_valid)
+		return;
+
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
-	if (vmx->vcpu.arch.nmi_injected) {
+
+	switch(type) {
+	case INTR_TYPE_NMI_INTR:
+		vmx->vcpu.arch.nmi_injected = true;
 		/*
 		 * SDM 3: 27.7.1.2 (September 2008)
-		 * Clear bit "block by NMI" before VM entry if a NMI delivery
-		 * faulted.
+		 * Clear bit "block by NMI" before VM entry if a NMI
+		 * delivery faulted.
 		 */
-		if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
-			vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-					GUEST_INTR_STATE_NMI);
-		else
-			vmx->vcpu.arch.nmi_injected = false;
-	}
-	kvm_clear_exception_queue(&vmx->vcpu);
-	if (idtv_info_valid && (type == INTR_TYPE_HARD_EXCEPTION ||
-				type == INTR_TYPE_SOFT_EXCEPTION)) {
+		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+				GUEST_INTR_STATE_NMI);
+		break;
+	case INTR_TYPE_HARD_EXCEPTION:
+	case INTR_TYPE_SOFT_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
-			error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
-			kvm_queue_exception_e(&vmx->vcpu, vector, error);
+			u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
+			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
 		vmx->idt_vectoring_info = 0;
-	}
-	kvm_clear_interrupt_queue(&vmx->vcpu);
-	if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
+		break;
+	case INTR_TYPE_EXT_INTR:
 		kvm_queue_interrupt(&vmx->vcpu, vector);
 		vmx->idt_vectoring_info = 0;
+		break;
+	default:
+		break;
 	}
 }
 
-- 
GitLab


From 8843419048e500f8f38df555bca1bf7948804b7f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:18 +0300
Subject: [PATCH 1761/2198] KVM: VMX: Do not zero idt_vectoring_info in
 vmx_complete_interrupts().

We will need it later in task_switch().
Code in handle_exception() is dead. is_external_interrupt(vect_info)
will always be false since idt_vectoring_info is zeroed in
vmx_complete_interrupts().

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 577aa9551a979..e4ad9d3c0636e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2626,11 +2626,6 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
 		       "intr info 0x%x\n", __func__, vect_info, intr_info);
 
-	if (!irqchip_in_kernel(vcpu->kvm) && is_external_interrupt(vect_info)) {
-		int irq = vect_info & VECTORING_INFO_VECTOR_MASK;
-		kvm_push_irq(vcpu, irq);
-	}
-
 	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
@@ -3329,11 +3324,9 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
-		vmx->idt_vectoring_info = 0;
 		break;
 	case INTR_TYPE_EXT_INTR:
 		kvm_queue_interrupt(&vmx->vcpu, vector);
-		vmx->idt_vectoring_info = 0;
 		break;
 	default:
 		break;
-- 
GitLab


From b237ac37a149e8b56436fabf093532483bff13b0 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:24 +0300
Subject: [PATCH 1762/2198] KVM: Fix task switch back link handling.

Back link is written to a wrong TSS now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 40 ++++++++++++++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 70ee81e50d99e..adcf73871a9dd 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3717,7 +3717,6 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
 	tss->fs = get_segment_selector(vcpu, VCPU_SREG_FS);
 	tss->gs = get_segment_selector(vcpu, VCPU_SREG_GS);
 	tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
-	tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
 }
 
 static int load_state_from_tss32(struct kvm_vcpu *vcpu,
@@ -3814,8 +3813,8 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
 }
 
 static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u32 old_tss_base,
-		       struct desc_struct *nseg_desc)
+			      u16 old_tss_sel, u32 old_tss_base,
+			      struct desc_struct *nseg_desc)
 {
 	struct tss_segment_16 tss_segment_16;
 	int ret = 0;
@@ -3834,6 +3833,16 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
 			   &tss_segment_16, sizeof tss_segment_16))
 		goto out;
 
+	if (old_tss_sel != 0xffff) {
+		tss_segment_16.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm,
+				    get_tss_base_addr(vcpu, nseg_desc),
+				    &tss_segment_16.prev_task_link,
+				    sizeof tss_segment_16.prev_task_link))
+			goto out;
+	}
+
 	if (load_state_from_tss16(vcpu, &tss_segment_16))
 		goto out;
 
@@ -3843,7 +3852,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
 }
 
 static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
-		       u32 old_tss_base,
+		       u16 old_tss_sel, u32 old_tss_base,
 		       struct desc_struct *nseg_desc)
 {
 	struct tss_segment_32 tss_segment_32;
@@ -3863,6 +3872,16 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
 			   &tss_segment_32, sizeof tss_segment_32))
 		goto out;
 
+	if (old_tss_sel != 0xffff) {
+		tss_segment_32.prev_task_link = old_tss_sel;
+
+		if (kvm_write_guest(vcpu->kvm,
+				    get_tss_base_addr(vcpu, nseg_desc),
+				    &tss_segment_32.prev_task_link,
+				    sizeof tss_segment_32.prev_task_link))
+			goto out;
+	}
+
 	if (load_state_from_tss32(vcpu, &tss_segment_32))
 		goto out;
 
@@ -3918,12 +3937,17 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 
 	kvm_x86_ops->skip_emulated_instruction(vcpu);
 
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
+
 	if (nseg_desc.type & 8)
-		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
-					 &nseg_desc);
+		ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
+					 old_tss_base, &nseg_desc);
 	else
-		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
-					 &nseg_desc);
+		ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_sel,
+					 old_tss_base, &nseg_desc);
 
 	if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
 		u32 eflags = kvm_x86_ops->get_rflags(vcpu);
-- 
GitLab


From 64a7ec066813443440bfc9f60a9e76a47cfa6b2b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 30 Mar 2009 16:03:29 +0300
Subject: [PATCH 1763/2198] KVM: Fix unneeded instruction skipping during task
 switching.

There is no need to skip instruction if the reason for a task switch
is a task gate in IDT and access to it is caused by an external even.
The problem  is currently solved only for VMX since there is no reliable
way to skip an instruction in SVM. We should emulate it instead.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/svm.h |  1 +
 arch/x86/kvm/svm.c         | 25 ++++++++++++++++++-------
 arch/x86/kvm/vmx.c         | 38 ++++++++++++++++++++++++++++----------
 arch/x86/kvm/x86.c         |  5 ++++-
 4 files changed, 51 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 82ada75f3ebf1..85574b7c1bc13 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -225,6 +225,7 @@ struct __attribute__ ((__packed__)) vmcb {
 #define SVM_EVTINJ_VALID_ERR (1 << 11)
 
 #define SVM_EXITINTINFO_VEC_MASK SVM_EVTINJ_VEC_MASK
+#define SVM_EXITINTINFO_TYPE_MASK SVM_EVTINJ_TYPE_MASK
 
 #define	SVM_EXITINTINFO_TYPE_INTR SVM_EVTINJ_TYPE_INTR
 #define	SVM_EXITINTINFO_TYPE_NMI SVM_EVTINJ_TYPE_NMI
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index de741043c5b15..bba67b70c4b01 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1825,17 +1825,28 @@ static int task_switch_interception(struct vcpu_svm *svm,
 				    struct kvm_run *kvm_run)
 {
 	u16 tss_selector;
+	int reason;
+	int int_type = svm->vmcb->control.exit_int_info &
+		SVM_EXITINTINFO_TYPE_MASK;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
+
 	if (svm->vmcb->control.exit_info_2 &
 	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET))
-		return kvm_task_switch(&svm->vcpu, tss_selector,
-				       TASK_SWITCH_IRET);
-	if (svm->vmcb->control.exit_info_2 &
-	    (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
-		return kvm_task_switch(&svm->vcpu, tss_selector,
-				       TASK_SWITCH_JMP);
-	return kvm_task_switch(&svm->vcpu, tss_selector, TASK_SWITCH_CALL);
+		reason = TASK_SWITCH_IRET;
+	else if (svm->vmcb->control.exit_info_2 &
+		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
+		reason = TASK_SWITCH_JMP;
+	else if (svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID)
+		reason = TASK_SWITCH_GATE;
+	else
+		reason = TASK_SWITCH_CALL;
+
+
+	if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT)
+		skip_emulated_instruction(&svm->vcpu);
+
+	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
 
 static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e4ad9d3c0636e..c6997c0e8ca64 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3038,22 +3038,40 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	unsigned long exit_qualification;
 	u16 tss_selector;
-	int reason;
+	int reason, type, idt_v;
+
+	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
+	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 
 	reason = (u32)exit_qualification >> 30;
-	if (reason == TASK_SWITCH_GATE && vmx->vcpu.arch.nmi_injected &&
-	    (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
-	    (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK)
-	    == INTR_TYPE_NMI_INTR) {
-		vcpu->arch.nmi_injected = false;
-		if (cpu_has_virtual_nmis())
-			vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
-				      GUEST_INTR_STATE_NMI);
+	if (reason == TASK_SWITCH_GATE && idt_v) {
+		switch (type) {
+		case INTR_TYPE_NMI_INTR:
+			vcpu->arch.nmi_injected = false;
+			if (cpu_has_virtual_nmis())
+				vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
+					      GUEST_INTR_STATE_NMI);
+			break;
+		case INTR_TYPE_EXT_INTR:
+			kvm_clear_interrupt_queue(vcpu);
+			break;
+		case INTR_TYPE_HARD_EXCEPTION:
+		case INTR_TYPE_SOFT_EXCEPTION:
+			kvm_clear_exception_queue(vcpu);
+			break;
+		default:
+			break;
+		}
 	}
 	tss_selector = exit_qualification;
 
+	if (!idt_v || (type != INTR_TYPE_HARD_EXCEPTION &&
+		       type != INTR_TYPE_EXT_INTR &&
+		       type != INTR_TYPE_NMI_INTR))
+		skip_emulated_instruction(vcpu);
+
 	if (!kvm_task_switch(vcpu, tss_selector, reason))
 		return 0;
 
@@ -3306,7 +3324,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
 	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
 
-	switch(type) {
+	switch (type) {
 	case INTR_TYPE_NMI_INTR:
 		vmx->vcpu.arch.nmi_injected = true;
 		/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index adcf73871a9dd..bb04f11bf70f5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3935,7 +3935,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
 		kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
 	}
 
-	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	/* set back link to prev task only if NT bit is set in eflags
+	   note that old_tss_sel is not used afetr this point */
+	if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
+		old_tss_sel = 0xffff;
 
 	/* set back link to prev task only if NT bit is set in eflags
 	   note that old_tss_sel is not used afetr this point */
-- 
GitLab


From 4c26b4cd6ff6c3f7534f2aea9615a561c372ed05 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Thu, 2 Apr 2009 10:28:37 +0800
Subject: [PATCH 1764/2198] KVM: MMU: Discard reserved bits checking on PDE bit
 7-8

1. It's related to a Linux kernel bug which fixed by Ingo on
07a66d7c53a538e1a9759954a82bb6c07365eff9. The original code exists for quite a
long time, and it would convert a PDE for large page into a normal PDE. But it
fail to fit normal PDE well.  With the code before Ingo's fix, the kernel would
fall reserved bit checking with bit 8 - the remaining global bit of PTE. So the
kernel would receive a double-fault.

2. After discussion, we decide to discard PDE bit 7-8 reserved checking for now.
For this marked as reserved in SDM, but didn't checked by the processor in
fact...

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index da3ad3cf3142f..b582adde68309 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2194,7 +2194,7 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 			rsvd_bits(maxphyaddr, 63) |
 			rsvd_bits(7, 8) | rsvd_bits(1, 2);	/* PDPTE */
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 62);		/* PDE */
+			rsvd_bits(maxphyaddr, 62);	/* PDE */
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 62); 	/* PTE */
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
@@ -2208,13 +2208,14 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
 		context->rsvd_bits_mask[0][2] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
 		context->rsvd_bits_mask[0][1] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 51) | rsvd_bits(7, 8);
+			rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[0][0] = exb_bit_rsvd |
 			rsvd_bits(maxphyaddr, 51);
 		context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3];
 		context->rsvd_bits_mask[1][2] = context->rsvd_bits_mask[0][2];
 		context->rsvd_bits_mask[1][1] = exb_bit_rsvd |
-			rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20);
+			rsvd_bits(maxphyaddr, 51) |
+			rsvd_bits(13, 20);		/* large page */
 		context->rsvd_bits_mask[1][0] = ~0ull;
 		break;
 	}
-- 
GitLab


From 2d033196541959d91802d5a62e63771448101557 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 2 Apr 2009 15:51:46 +0300
Subject: [PATCH 1765/2198] KVM: x86 emulator: fix call near emulation

The length of pushed on to the stack return address depends on operand
size not address size.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index ca91749d2083f..d7c9f6fd0d34e 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1792,7 +1792,6 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		}
 		c->src.val = (unsigned long) c->eip;
 		jmp_rel(c, rel);
-		c->op_bytes = c->ad_bytes;
 		emulate_push(ctxt);
 		break;
 	}
-- 
GitLab


From 2906e79f21a559e49611e1e188c4993cd45d9ce1 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiantao" <xiantao.zhang@intel.com>
Date: Thu, 9 Apr 2009 21:37:28 +0800
Subject: [PATCH 1766/2198] KVM: ia64: make kvm depend on CONFIG_MODULES.

Since kvm-intel modue can't be built-in, make kvm depend on
CONFIG_MODULES.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/kvm/Kconfig b/arch/ia64/kvm/Kconfig
index 0a2d6b86075a6..64d5209378749 100644
--- a/arch/ia64/kvm/Kconfig
+++ b/arch/ia64/kvm/Kconfig
@@ -23,7 +23,7 @@ if VIRTUALIZATION
 
 config KVM
 	tristate "Kernel-based Virtual Machine (KVM) support"
-	depends on HAVE_KVM && EXPERIMENTAL
+	depends on HAVE_KVM && MODULES && EXPERIMENTAL
 	# for device assignment:
 	depends on PCI
 	select PREEMPT_NOTIFIERS
-- 
GitLab


From ede2ccc51742059d356d419260460cbbf3e36273 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Wed, 8 Apr 2009 13:14:19 -0300
Subject: [PATCH 1767/2198] KVM: PIT: fix count read and mode 0 handling

Commit 46ee278652f4cbd51013471b64c7897ba9bcd1b1 causes Solaris 10
to hang on boot.

Assuming that PIT counter reads should return 0 for an expired timer
is wrong: when it is active, the counter never stops (see comment on
__kpit_elapsed).

Also arm a one shot timer for mode 0.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/i8254.c | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index cf09bb64f1411..4d6f0d293ee29 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -104,13 +104,18 @@ static s64 __kpit_elapsed(struct kvm *kvm)
 	ktime_t remaining;
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 
+	/*
+	 * The Counter does not stop when it reaches zero. In
+	 * Modes 0, 1, 4, and 5 the Counter ``wraps around'' to
+	 * the highest count, either FFFF hex for binary counting
+	 * or 9999 for BCD counting, and continues counting.
+	 * Modes 2 and 3 are periodic; the Counter reloads
+	 * itself with the initial count and continues counting
+	 * from there.
+	 */
 	remaining = hrtimer_expires_remaining(&ps->pit_timer.timer);
-	if (ktime_to_ns(remaining) < 0)
-		remaining = ktime_set(0, 0);
-
-	elapsed = ps->pit_timer.period;
-	if (ktime_to_ns(remaining) <= ps->pit_timer.period)
-		elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+	elapsed = ps->pit_timer.period - ktime_to_ns(remaining);
+	elapsed = mod_64(elapsed, ps->pit_timer.period);
 
 	return elapsed;
 }
@@ -280,7 +285,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
-	pt->period = (is_period == 0) ? 0 : interval;
+	pt->period = interval;
 	ps->is_periodic = is_period;
 
 	pt->timer.function = kvm_timer_fn;
@@ -304,10 +309,8 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	pr_debug("pit: load_count val is %d, channel is %d\n", val, channel);
 
 	/*
-	 * Though spec said the state of 8254 is undefined after power-up,
-	 * seems some tricky OS like Windows XP depends on IRQ0 interrupt
-	 * when booting up.
-	 * So here setting initialize rate for it, and not a specific number
+	 * The largest possible initial count is 0; this is equivalent
+	 * to 216 for binary counting and 104 for BCD counting.
 	 */
 	if (val == 0)
 		val = 0x10000;
@@ -322,6 +325,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 	/* Two types of timer
 	 * mode 1 is one shot, mode 2 is period, otherwise del timer */
 	switch (ps->channels[0].mode) {
+	case 0:
 	case 1:
         /* FIXME: enhance mode 4 precision */
 	case 4:
-- 
GitLab


From 2f8b9ee14eb439008e0c5131116ea6baa40dba50 Mon Sep 17 00:00:00 2001
From: nathan binkert <nate@binkert.org>
Date: Fri, 27 Mar 2009 21:53:05 -0700
Subject: [PATCH 1768/2198] KVM: Make kvm header C++ friendly

Two things needed fixing: 1) g++ does not allow a named structure type
within an anonymous union and 2) Avoid name clash between two padding
fields within the same struct by giving them different names as is
done elsewhere in the header.

Signed-off-by: Nathan Binkert <nate@binkert.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/kvm.h b/include/linux/kvm.h
index 644e3a9f47dba..3db5d8d374852 100644
--- a/include/linux/kvm.h
+++ b/include/linux/kvm.h
@@ -119,7 +119,7 @@ struct kvm_run {
 			__u32 error_code;
 		} ex;
 		/* KVM_EXIT_IO */
-		struct kvm_io {
+		struct {
 #define KVM_EXIT_IO_IN  0
 #define KVM_EXIT_IO_OUT 1
 			__u8 direction;
@@ -224,10 +224,10 @@ struct kvm_interrupt {
 /* for KVM_GET_DIRTY_LOG */
 struct kvm_dirty_log {
 	__u32 slot;
-	__u32 padding;
+	__u32 padding1;
 	union {
 		void __user *dirty_bitmap; /* one bit per page */
-		__u64 padding;
+		__u64 padding2;
 	};
 };
 
-- 
GitLab


From c2d0ee46e6e633a3c23ecbcb9b03ad731906cd79 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Sun, 5 Apr 2009 14:54:47 -0300
Subject: [PATCH 1769/2198] KVM: MMU: remove global page optimization logic

Complexity to fix it not worthwhile the gains, as discussed
in http://article.gmane.org/gmane.comp.emulators.kvm.devel/28649.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 ---
 arch/x86/kvm/mmu.c              | 50 ++++-----------------------------
 arch/x86/kvm/paging_tmpl.h      |  6 ++--
 arch/x86/kvm/x86.c              |  4 ---
 4 files changed, 8 insertions(+), 56 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3fc46238476c1..0e3a7c6e522c2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -213,7 +213,6 @@ struct kvm_mmu_page {
 	int multimapped;         /* More than one parent_pte? */
 	int root_count;          /* Currently serving as active root */
 	bool unsync;
-	bool global;
 	unsigned int unsync_children;
 	union {
 		u64 *parent_pte;               /* !multimapped */
@@ -395,7 +394,6 @@ struct kvm_arch{
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
-	struct list_head oos_global_pages;
 	struct iommu_domain *iommu_domain;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
@@ -425,7 +423,6 @@ struct kvm_vm_stat {
 	u32 mmu_recycled;
 	u32 mmu_cache_miss;
 	u32 mmu_unsync;
-	u32 mmu_unsync_global;
 	u32 remote_tlb_flush;
 	u32 lpages;
 };
@@ -640,7 +637,6 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu);
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b582adde68309..b39ec626040e2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1075,18 +1075,10 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
 	return NULL;
 }
 
-static void kvm_unlink_unsync_global(struct kvm *kvm, struct kvm_mmu_page *sp)
-{
-	list_del(&sp->oos_link);
-	--kvm->stat.mmu_unsync_global;
-}
-
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	WARN_ON(!sp->unsync);
 	sp->unsync = 0;
-	if (sp->global)
-		kvm_unlink_unsync_global(kvm, sp);
 	--kvm->stat.mmu_unsync;
 }
 
@@ -1249,7 +1241,6 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	pgprintk("%s: adding gfn %lx role %x\n", __func__, gfn, role.word);
 	sp->gfn = gfn;
 	sp->role = role;
-	sp->global = 0;
 	hlist_add_head(&sp->hash_link, bucket);
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
@@ -1647,11 +1638,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 
-	if (sp->global) {
-		list_add(&sp->oos_link, &vcpu->kvm->arch.oos_global_pages);
-		++vcpu->kvm->stat.mmu_unsync_global;
-	} else
-		kvm_mmu_mark_parents_unsync(vcpu, sp);
+	kvm_mmu_mark_parents_unsync(vcpu, sp);
 
 	mmu_convert_notrap(sp);
 	return 0;
@@ -1678,21 +1665,12 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		    unsigned pte_access, int user_fault,
 		    int write_fault, int dirty, int largepage,
-		    int global, gfn_t gfn, pfn_t pfn, bool speculative,
+		    gfn_t gfn, pfn_t pfn, bool speculative,
 		    bool can_unsync)
 {
 	u64 spte;
 	int ret = 0;
 	u64 mt_mask = shadow_mt_mask;
-	struct kvm_mmu_page *sp = page_header(__pa(shadow_pte));
-
-	if (!global && sp->global) {
-		sp->global = 0;
-		if (sp->unsync) {
-			kvm_unlink_unsync_global(vcpu->kvm, sp);
-			kvm_mmu_mark_parents_unsync(vcpu, sp);
-		}
-	}
 
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
@@ -1766,8 +1744,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			 unsigned pt_access, unsigned pte_access,
 			 int user_fault, int write_fault, int dirty,
-			 int *ptwrite, int largepage, int global,
-			 gfn_t gfn, pfn_t pfn, bool speculative)
+			 int *ptwrite, int largepage, gfn_t gfn,
+			 pfn_t pfn, bool speculative)
 {
 	int was_rmapped = 0;
 	int was_writeble = is_writeble_pte(*shadow_pte);
@@ -1796,7 +1774,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 			was_rmapped = 1;
 	}
 	if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
-		      dirty, largepage, global, gfn, pfn, speculative, true)) {
+		      dirty, largepage, gfn, pfn, speculative, true)) {
 		if (write_fault)
 			*ptwrite = 1;
 		kvm_x86_ops->tlb_flush(vcpu);
@@ -1844,7 +1822,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		    || (largepage && iterator.level == PT_DIRECTORY_LEVEL)) {
 			mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL,
 				     0, write, 1, &pt_write,
-				     largepage, 0, gfn, pfn, false);
+				     largepage, gfn, pfn, false);
 			++vcpu->stat.pf_fixed;
 			break;
 		}
@@ -2015,15 +1993,6 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void mmu_sync_global(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_mmu_page *sp, *n;
-
-	list_for_each_entry_safe(sp, n, &kvm->arch.oos_global_pages, oos_link)
-		kvm_sync_page(vcpu, sp);
-}
-
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	spin_lock(&vcpu->kvm->mmu_lock);
@@ -2031,13 +2000,6 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 
-void kvm_mmu_sync_global(struct kvm_vcpu *vcpu)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_sync_global(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
-}
-
 static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
 {
 	return vaddr;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 09782a982785d..258e4591e1ca0 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -268,8 +268,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	kvm_get_pfn(pfn);
 	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
 		     gpte & PT_DIRTY_MASK, NULL, largepage,
-		     gpte & PT_GLOBAL_MASK, gpte_to_gfn(gpte),
-		     pfn, true);
+		     gpte_to_gfn(gpte), pfn, true);
 }
 
 /*
@@ -303,7 +302,6 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 				     user_fault, write_fault,
 				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
 				     ptwrite, largepage,
-				     gw->ptes[gw->level-1] & PT_GLOBAL_MASK,
 				     gw->gfn, pfn, false);
 			break;
 		}
@@ -592,7 +590,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		nr_present++;
 		pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 		set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
-			 is_dirty_pte(gpte), 0, gpte & PT_GLOBAL_MASK, gfn,
+			 is_dirty_pte(gpte), 0, gfn,
 			 spte_to_pfn(sp->spt[i]), true, false);
 	}
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bb04f11bf70f5..b5ac1b7224546 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -108,7 +108,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "mmu_recycled", VM_STAT(mmu_recycled) },
 	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
 	{ "mmu_unsync", VM_STAT(mmu_unsync) },
-	{ "mmu_unsync_global", VM_STAT(mmu_unsync_global) },
 	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
 	{ "largepages", VM_STAT(lpages) },
 	{ NULL }
@@ -322,7 +321,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	vcpu->arch.cr0 = cr0;
 
-	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 	return;
 }
@@ -371,7 +369,6 @@ void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	vcpu->arch.cr4 = cr4;
 	vcpu->arch.mmu.base_role.cr4_pge = (cr4 & X86_CR4_PGE) && !tdp_enabled;
-	kvm_mmu_sync_global(vcpu);
 	kvm_mmu_reset_context(vcpu);
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
@@ -4364,7 +4361,6 @@ struct  kvm *kvm_arch_create_vm(void)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
-	INIT_LIST_HEAD(&kvm->arch.oos_global_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
 	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
-- 
GitLab


From a5f868bd45a64ce1f502d228723a5b6c357790cd Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:14 +0300
Subject: [PATCH 1770/2198] KVM: x86 emulator: Add decoding of 16bit second
 immediate argument

Such as segment number in lcall/ljmp

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d7c9f6fd0d34e..c0150636d4530 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -76,6 +76,7 @@
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
+#define Src2Imm16   (4<<29)
 #define Src2Mask    (7<<29)
 
 enum {
@@ -1072,6 +1073,12 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		c->src2.bytes = 1;
 		c->src2.val = insn_fetch(u8, 1, c->eip);
 		break;
+	case Src2Imm16:
+		c->src2.type = OP_IMM;
+		c->src2.ptr = (unsigned long *)c->eip;
+		c->src2.bytes = 2;
+		c->src2.val = insn_fetch(u16, 2, c->eip);
+		break;
 	case Src2One:
 		c->src2.bytes = 1;
 		c->src2.val = 1;
-- 
GitLab


From 0654169e7309f2f68ec4bea9257b14b05ea94d7d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:20 +0300
Subject: [PATCH 1771/2198] KVM: x86 emulator: Add lcall decoding

No emulation yet.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c0150636d4530..71b4bee2e221f 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -154,7 +154,8 @@ static u32 opcode_table[256] = {
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	/* 0x98 - 0x9F */
-	0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
+	0, 0, SrcImm | Src2Imm16, 0,
+	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	/* 0xA0 - 0xA7 */
 	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
 	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
-- 
GitLab


From 782b877c8073a9ef307ad6638ee472b8336b2b85 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:25 +0300
Subject: [PATCH 1772/2198] KVM: x86 emulator: Complete ljmp decoding at decode
 stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 25 +++++--------------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 71b4bee2e221f..8779cf233a830 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -193,7 +193,7 @@ static u32 opcode_table[256] = {
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xE8 - 0xEF */
 	ImplicitOps | Stack, SrcImm | ImplicitOps,
-	ImplicitOps, SrcImmByte | ImplicitOps,
+	SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xF0 - 0xF7 */
@@ -1805,30 +1805,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 	case 0xe9: /* jmp rel */
 		goto jmp;
-	case 0xea: /* jmp far */ {
-		uint32_t eip;
-		uint16_t sel;
-
-		switch (c->op_bytes) {
-		case 2:
-			eip = insn_fetch(u16, 2, c->eip);
-			break;
-		case 4:
-			eip = insn_fetch(u32, 4, c->eip);
-			break;
-		default:
-			DPRINTF("jmp far: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
-		sel = insn_fetch(u16, 2, c->eip);
-		if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
+	case 0xea: /* jmp far */
+		if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9,
+					VCPU_SREG_CS) < 0) {
 			DPRINTF("jmp far: Failed to load CS descriptor\n");
 			goto cannot_emulate;
 		}
 
-		c->eip = eip;
+		c->eip = c->src.val;
 		break;
-	}
 	case 0xeb:
 	      jmp:		/* jmp rel short */
 		jmp_rel(c, c->src.val);
-- 
GitLab


From b2833e3cdebfe3ea4d0d1d3ce4d2ff1c42a4f8f4 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:30 +0300
Subject: [PATCH 1773/2198] KVM: x86 emulator: Complete short/near jcc decoding
 in decode stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 42 +++++++++-----------------------------
 1 file changed, 10 insertions(+), 32 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 8779cf233a830..14b8ee2c09e96 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -136,11 +136,11 @@ static u32 opcode_table[256] = {
 	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* insb, insw/insd */
 	SrcNone  | ByteOp  | ImplicitOps, SrcNone  | ImplicitOps, /* outsb, outsw/outsd */
 	/* 0x70 - 0x77 */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	/* 0x78 - 0x7F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
+	SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
 	/* 0x80 - 0x87 */
 	Group | Group1_80, Group | Group1_81,
 	Group | Group1_82, Group | Group1_83,
@@ -232,10 +232,8 @@ static u32 twobyte_table[256] = {
 	/* 0x70 - 0x7F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0x80 - 0x8F */
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
-	ImplicitOps, ImplicitOps, ImplicitOps, ImplicitOps,
+	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
+	SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
 	/* 0x90 - 0x9F */
 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xA0 - 0xA7 */
@@ -1539,13 +1537,10 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 			return -1;
 		}
 		return 0;
-	case 0x70 ... 0x7f: /* jcc (short) */ {
-		int rel = insn_fetch(s8, 1, c->eip);
-
+	case 0x70 ... 0x7f: /* jcc (short) */
 		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, rel);
+			jmp_rel(c, c->src.val);
 		break;
-	}
 	case 0x80 ... 0x83:	/* Grp1 */
 		switch (c->modrm_reg) {
 		case 0:
@@ -2031,28 +2026,11 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		if (!test_cc(c->b, ctxt->eflags))
 			c->dst.type = OP_NONE; /* no writeback */
 		break;
-	case 0x80 ... 0x8f: /* jnz rel, etc*/ {
-		long int rel;
-
-		switch (c->op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, c->eip);
-			break;
-		case 8:
-			rel = insn_fetch(s64, 8, c->eip);
-			break;
-		default:
-			DPRINTF("jnz: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
+	case 0x80 ... 0x8f: /* jnz rel, etc*/
 		if (test_cc(c->b, ctxt->eflags))
-			jmp_rel(c, rel);
+			jmp_rel(c, c->src.val);
 		c->dst.type = OP_NONE;
 		break;
-	}
 	case 0xa3:
 	      bt:		/* bt */
 		c->dst.type = OP_NONE;
-- 
GitLab


From d53c4777b3a3e5031710d0664851d1309325884b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:36 +0300
Subject: [PATCH 1774/2198] KVM: x86 emulator: Complete decoding of call near
 in decode stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 14b8ee2c09e96..4a9cd4c2b9831 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -192,7 +192,7 @@ static u32 opcode_table[256] = {
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	/* 0xE8 - 0xEF */
-	ImplicitOps | Stack, SrcImm | ImplicitOps,
+	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
@@ -1781,18 +1781,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		io_dir_in = 0;
 		goto do_io;
 	case 0xe8: /* call (near) */ {
-		long int rel;
-		switch (c->op_bytes) {
-		case 2:
-			rel = insn_fetch(s16, 2, c->eip);
-			break;
-		case 4:
-			rel = insn_fetch(s32, 4, c->eip);
-			break;
-		default:
-			DPRINTF("Call: Invalid op_bytes\n");
-			goto cannot_emulate;
-		}
+		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
 		jmp_rel(c, rel);
 		emulate_push(ctxt);
-- 
GitLab


From 341de7e3728ade102eaadf56af404f4ce865a73d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:41 +0300
Subject: [PATCH 1775/2198] KVM: x86 emulator: Add unsigned byte immediate
 decode

Extend "Source operand type" opcode description field to 4 bites
to accommodate new option.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 4a9cd4c2b9831..0988a13063dc9 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -59,13 +59,14 @@
 #define SrcImm      (5<<4)	/* Immediate operand. */
 #define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
 #define SrcOne      (7<<4)	/* Implied '1' */
-#define SrcMask     (7<<4)
+#define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
+#define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
-#define ModRM       (1<<7)
+#define ModRM       (1<<8)
 /* Destination is only written; never read. */
-#define Mov         (1<<8)
-#define BitOp       (1<<9)
-#define MemAbs      (1<<10)      /* Memory operand is absolute displacement */
+#define Mov         (1<<9)
+#define BitOp       (1<<10)
+#define MemAbs      (1<<11)      /* Memory operand is absolute displacement */
 #define String      (1<<12)     /* String instruction (rep capable) */
 #define Stack       (1<<13)     /* Stack instruction (push/pop) */
 #define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
@@ -1044,10 +1045,14 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		}
 		break;
 	case SrcImmByte:
+	case SrcImmUByte:
 		c->src.type = OP_IMM;
 		c->src.ptr = (unsigned long *)c->eip;
 		c->src.bytes = 1;
-		c->src.val = insn_fetch(s8, 1, c->eip);
+		if ((c->d & SrcMask) == SrcImmByte)
+			c->src.val = insn_fetch(s8, 1, c->eip);
+		else
+			c->src.val = insn_fetch(u8, 1, c->eip);
 		break;
 	case SrcOne:
 		c->src.bytes = 1;
-- 
GitLab


From 84ce66a6865192567172f08ab5269aa380fa6ead Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:46 +0300
Subject: [PATCH 1776/2198] KVM: x86 emulator: Completely decode in/out at
 decoding stage

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 0988a13063dc9..c2f55ca84fdf1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -190,8 +190,8 @@ static u32 opcode_table[256] = {
 	0, 0, 0, 0, 0, 0, 0, 0,
 	/* 0xE0 - 0xE7 */
 	0, 0, 0, 0,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
-	SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+	ByteOp | SrcImmUByte, SrcImmUByte,
+	ByteOp | SrcImmUByte, SrcImmUByte,
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImm | Src2Imm16, SrcImmByte | ImplicitOps,
@@ -1777,12 +1777,12 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		break;
 	case 0xe4: 	/* inb */
 	case 0xe5: 	/* in */
-		port = insn_fetch(u8, 1, c->eip);
+		port = c->src.val;
 		io_dir_in = 1;
 		goto do_io;
 	case 0xe6: /* outb */
 	case 0xe7: /* out */
-		port = insn_fetch(u8, 1, c->eip);
+		port = c->src.val;
 		io_dir_in = 0;
 		goto do_io;
 	case 0xe8: /* call (near) */ {
-- 
GitLab


From e637b8238ad1783ebdd113bd34cd6982dec1006d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:52 +0300
Subject: [PATCH 1777/2198] KVM: x86 emulator: Decode soft interrupt
 instructions

Do not emulate them yet.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86_emulate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index c2f55ca84fdf1..d2664fcba7faf 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -181,7 +181,8 @@ static u32 opcode_table[256] = {
 	0, ImplicitOps | Stack, 0, 0,
 	ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
 	/* 0xC8 - 0xCF */
-	0, 0, 0, ImplicitOps | Stack, 0, 0, 0, 0,
+	0, 0, 0, ImplicitOps | Stack,
+	ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps,
 	/* 0xD0 - 0xD7 */
 	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
 	ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
-- 
GitLab


From ba8afb6b0a2c7e06da760ffe5d078245058619b5 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:36:57 +0300
Subject: [PATCH 1778/2198] KVM: x86 emulator: Add new mode of instruction
 emulation: skip

In the new mode instruction is decoded, but not executed. The EIP
is moved to point after the instruction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/x86.c              | 5 +++++
 2 files changed, 6 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0e3a7c6e522c2..cb306cff2b49a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -562,6 +562,7 @@ enum emulation_result {
 
 #define EMULTYPE_NO_DECODE	    (1 << 0)
 #define EMULTYPE_TRAP_UD	    (1 << 1)
+#define EMULTYPE_SKIP		    (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
 			unsigned long cr2, u16 error_code, int emulation_type);
 void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b5ac1b7224546..8beccaa17690a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2413,6 +2413,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		}
 	}
 
+	if (emulation_type & EMULTYPE_SKIP) {
+		kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.decode.eip);
+		return EMULATE_DONE;
+	}
+
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 
 	if (vcpu->arch.pio.string)
-- 
GitLab


From 8317c298eab14cc20e2de5289743ff0d4c5a6b30 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Sun, 12 Apr 2009 13:37:02 +0300
Subject: [PATCH 1779/2198] KVM: SVM: Skip instruction on a task switch only
 when appropriate

If a task switch was initiated because off a task gate in IDT and IDT
was accessed because of an external even the instruction should not
be skipped.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index bba67b70c4b01..8fc6eea148e7f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1828,6 +1828,7 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	int reason;
 	int int_type = svm->vmcb->control.exit_int_info &
 		SVM_EXITINTINFO_TYPE_MASK;
+	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -1843,8 +1844,14 @@ static int task_switch_interception(struct vcpu_svm *svm,
 		reason = TASK_SWITCH_CALL;
 
 
-	if (reason != TASK_SWITCH_GATE || int_type == SVM_EXITINTINFO_TYPE_SOFT)
-		skip_emulated_instruction(&svm->vcpu);
+	if (reason != TASK_SWITCH_GATE ||
+	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
+	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
+	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0,
+					EMULTYPE_SKIP) != EMULATE_DONE)
+			return 0;
+	}
 
 	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
-- 
GitLab


From 463656c0007ddccee78db383eeb9e6eac75ccb7f Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 12 Apr 2009 15:49:07 +0300
Subject: [PATCH 1780/2198] KVM: Replace kvmclock open-coded get_cpu_var() with
 the real thing

Suggested by Ingo Molnar.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 8beccaa17690a..ffbb2c818d78c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -631,16 +631,17 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
 	unsigned long flags;
 	struct kvm_vcpu_arch *vcpu = &v->arch;
 	void *shared_kaddr;
+	unsigned long this_tsc_khz;
 
 	if ((!vcpu->time_page))
 		return;
 
-	preempt_disable();
-	if (unlikely(vcpu->hv_clock_tsc_khz != __get_cpu_var(cpu_tsc_khz))) {
-		kvm_set_time_scale(__get_cpu_var(cpu_tsc_khz), &vcpu->hv_clock);
-		vcpu->hv_clock_tsc_khz = __get_cpu_var(cpu_tsc_khz);
+	this_tsc_khz = get_cpu_var(cpu_tsc_khz);
+	if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) {
+		kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock);
+		vcpu->hv_clock_tsc_khz = this_tsc_khz;
 	}
-	preempt_enable();
+	put_cpu_var(cpu_tsc_khz);
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-- 
GitLab


From c6b60c6921381130e5288b19f5fdf81152230b37 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 10:43:48 +0200
Subject: [PATCH 1781/2198] KVM: ia64: Don't hold slots_lock in guest mode

Reorder locking to avoid holding the slots_lock when entering
the guest.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang<xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 64 +++++++++++++++++++++-------------------
 1 file changed, 33 insertions(+), 31 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 3bf0a345224af..f127fb723f2fa 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -632,34 +632,22 @@ static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
 	vti_set_rr6(vcpu->arch.vmm_rr);
 	return kvm_insert_vmm_mapping(vcpu);
 }
+
 static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
 {
 	kvm_purge_vmm_mapping(vcpu);
 	vti_set_rr6(vcpu->arch.host_rr6);
 }
 
-static int  vti_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	union context *host_ctx, *guest_ctx;
 	int r;
 
-	/*Get host and guest context with guest address space.*/
-	host_ctx = kvm_get_host_context(vcpu);
-	guest_ctx = kvm_get_guest_context(vcpu);
-
-	r = kvm_vcpu_pre_transition(vcpu);
-	if (r < 0)
-		goto out;
-	kvm_vmm_info->tramp_entry(host_ctx, guest_ctx);
-	kvm_vcpu_post_transition(vcpu);
-	r = 0;
-out:
-	return r;
-}
-
-static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	int r;
+	/*
+	 * down_read() may sleep and return with interrupts enabled
+	 */
+	down_read(&vcpu->kvm->slots_lock);
 
 again:
 	if (signal_pending(current)) {
@@ -668,23 +656,28 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}
 
-	/*
-	 * down_read() may sleep and return with interrupts enabled
-	 */
-	down_read(&vcpu->kvm->slots_lock);
-
 	preempt_disable();
 	local_irq_disable();
 
+	/*Get host and guest context with guest address space.*/
+	host_ctx = kvm_get_host_context(vcpu);
+	guest_ctx = kvm_get_guest_context(vcpu);
+
 	vcpu->guest_mode = 1;
+
+	r = kvm_vcpu_pre_transition(vcpu);
+	if (r < 0)
+		goto vcpu_run_fail;
+
+	up_read(&vcpu->kvm->slots_lock);
 	kvm_guest_enter();
-	r = vti_vcpu_run(vcpu, kvm_run);
-	if (r < 0) {
-		local_irq_enable();
-		preempt_enable();
-		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-		goto out;
-	}
+
+	/*
+	 * Transition to the guest
+	 */
+	kvm_vmm_info->tramp_entry(host_ctx, guest_ctx);
+
+	kvm_vcpu_post_transition(vcpu);
 
 	vcpu->arch.launched = 1;
 	vcpu->guest_mode = 0;
@@ -698,9 +691,10 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	 */
 	barrier();
 	kvm_guest_exit();
-	up_read(&vcpu->kvm->slots_lock);
 	preempt_enable();
 
+	down_read(&vcpu->kvm->slots_lock);
+
 	r = kvm_handle_exit(kvm_run, vcpu);
 
 	if (r > 0) {
@@ -709,12 +703,20 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 out:
+	up_read(&vcpu->kvm->slots_lock);
 	if (r > 0) {
 		kvm_resched(vcpu);
+		down_read(&vcpu->kvm->slots_lock);
 		goto again;
 	}
 
 	return r;
+
+vcpu_run_fail:
+	local_irq_enable();
+	preempt_enable();
+	kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+	goto out;
 }
 
 static void kvm_set_mmio_data(struct kvm_vcpu *vcpu)
-- 
GitLab


From 59839dfff5eabca01cc4e20b45797a60a80af8cb Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 16 Apr 2009 08:30:44 -0300
Subject: [PATCH 1782/2198] KVM: x86: check for cr3 validity in ioctl_set_sregs

Matt T. Yourst notes that kvm_arch_vcpu_ioctl_set_sregs lacks validity
checking for the new cr3 value:

"Userspace callers of KVM_SET_SREGS can pass a bogus value of cr3 to
the kernel. This will trigger a NULL pointer access in gfn_to_rmap()
when userspace next tries to call KVM_RUN on the affected VCPU and kvm
attempts to activate the new non-existent page table root.

This happens since kvm only validates that cr3 points to a valid guest
physical memory page when code *inside* the guest sets cr3. However, kvm
currently trusts the userspace caller (e.g. QEMU) on the host machine to
always supply a valid page table root, rather than properly validating
it along with the rest of the reloaded guest state."

http://sourceforge.net/tracker/?func=detail&atid=893831&aid=2687641&group_id=180599

Check for a valid cr3 address in kvm_arch_vcpu_ioctl_set_sregs, triple
fault in case of failure.

Cc: stable@kernel.org
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ffbb2c818d78c..2bad49b535c88 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3993,7 +3993,13 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 	vcpu->arch.cr2 = sregs->cr2;
 	mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3;
-	vcpu->arch.cr3 = sregs->cr3;
+
+	down_read(&vcpu->kvm->slots_lock);
+	if (gfn_to_memslot(vcpu->kvm, sregs->cr3 >> PAGE_SHIFT))
+		vcpu->arch.cr3 = sregs->cr3;
+	else
+		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+	up_read(&vcpu->kvm->slots_lock);
 
 	kvm_set_cr8(vcpu, sregs->cr8);
 
-- 
GitLab


From 64f6afbd4c3317eb6845d770aa345afd3f120471 Mon Sep 17 00:00:00 2001
From: Xiantao Zhang <xiantao.zhang@intel.com>
Date: Thu, 16 Apr 2009 17:59:16 +0800
Subject: [PATCH 1783/2198] KVM: ia64: Flush all TLBs once guest's memory
 mapping changes.

Flush all vcpu's TLB entries once changes guest's memory mapping.

Signed-off-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index f127fb723f2fa..9addca622d2d4 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1631,6 +1631,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
 {
+	kvm_flush_remote_tlbs(kvm);
 }
 
 long kvm_arch_dev_ioctl(struct file *filp,
-- 
GitLab


From f9b647adda0e821538b6d80e52873d861ffb1799 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 11:24:58 +0200
Subject: [PATCH 1784/2198] KVM: ia64: remove empty function vti_vcpu_load()

vti_vcpu_load() doesn't do anything, so lets get rid of it.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang<xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 9addca622d2d4..726317144a6f8 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1100,10 +1100,6 @@ static void kvm_free_vmm_area(void)
 	}
 }
 
-static void vti_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
-}
-
 static int vti_init_vpd(struct kvm_vcpu *vcpu)
 {
 	int i;
@@ -1348,7 +1344,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	vcpu->kvm = kvm;
 
 	cpu = get_cpu();
-	vti_vcpu_load(vcpu, cpu);
 	r = vti_vcpu_setup(vcpu, id);
 	put_cpu();
 
-- 
GitLab


From 457459c3c738dfb37226ba116ba301140da0d1fb Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 16:08:29 +0200
Subject: [PATCH 1785/2198] KVM: ia64: restore irq state before calling
 kvm_vcpu_init

Make sure to restore the psr after calling kvm_insert_vmm_mapping()
which calls ia64_itr_entry() as it disables local interrupts and
kvm_vcpu_init() may sleep.

Avoids a warning from the lock debugging code.

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by : Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 726317144a6f8..5b868db1e9f21 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -1290,6 +1290,7 @@ static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id)
 
 	local_irq_save(psr);
 	r = kvm_insert_vmm_mapping(vcpu);
+	local_irq_restore(psr);
 	if (r)
 		goto fail;
 	r = kvm_vcpu_init(vcpu, vcpu->kvm, id);
@@ -1307,13 +1308,11 @@ static int vti_vcpu_setup(struct kvm_vcpu *vcpu, int id)
 		goto uninit;
 
 	kvm_purge_vmm_mapping(vcpu);
-	local_irq_restore(psr);
 
 	return 0;
 uninit:
 	kvm_vcpu_uninit(vcpu);
 fail:
-	local_irq_restore(psr);
 	return r;
 }
 
-- 
GitLab


From 4d13c3b04f14a9a72ffcdd082acc243e7e56b4e0 Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Thu, 16 Apr 2009 16:53:13 +0200
Subject: [PATCH 1786/2198] KVM: ia64: preserve int status through call to
 kvm_insert_vmm_mapping

Preserve interrupt status around call to kvm_insert_vmm_mappin()
in kvm_vcpu_pre_transition().

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 5b868db1e9f21..cf5a19390f4e5 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -619,6 +619,8 @@ static void kvm_purge_vmm_mapping(struct kvm_vcpu *vcpu)
 
 static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
 {
+	unsigned long psr;
+	int r;
 	int cpu = smp_processor_id();
 
 	if (vcpu->arch.last_run_cpu != cpu ||
@@ -630,7 +632,10 @@ static int kvm_vcpu_pre_transition(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.host_rr6 = ia64_get_rr(RR6);
 	vti_set_rr6(vcpu->arch.vmm_rr);
-	return kvm_insert_vmm_mapping(vcpu);
+	local_irq_save(psr);
+	r = kvm_insert_vmm_mapping(vcpu);
+	local_irq_restore(psr);
+	return r;
 }
 
 static void kvm_vcpu_post_transition(struct kvm_vcpu *vcpu)
-- 
GitLab


From 43890ae8bcc5fa88ef3061613efe041b866fbc5a Mon Sep 17 00:00:00 2001
From: Jes Sorensen <jes@sgi.com>
Date: Fri, 17 Apr 2009 16:43:27 +0200
Subject: [PATCH 1787/2198] KVM: ia64: ia64 vcpu_reset() do not call kmalloc()
 with irqs disabled

Restore local irq enabled state before calling kvm_arch_vcpu_init(),
which calls kmalloc(GFP_KERNEL).

Signed-off-by: Jes Sorensen <jes@sgi.com>
Acked-by: Xiantao Zhang <xiantao.zhang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kvm/kvm-ia64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index cf5a19390f4e5..be4413e1f43ff 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -2001,6 +2001,7 @@ static int vcpu_reset(struct kvm_vcpu *vcpu)
 	long psr;
 	local_irq_save(psr);
 	r = kvm_insert_vmm_mapping(vcpu);
+	local_irq_restore(psr);
 	if (r)
 		goto fail;
 
@@ -2013,7 +2014,6 @@ static int vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_purge_vmm_mapping(vcpu);
 	r = 0;
 fail:
-	local_irq_restore(psr);
 	return r;
 }
 
-- 
GitLab


From 3438253926822a6bf8487b4f7d82f26a2c0b2388 Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@web.de>
Date: Sat, 25 Apr 2009 12:43:21 +0200
Subject: [PATCH 1788/2198] KVM: MMU: Fix auditing code

Fix build breakage of hpa lookup in audit_mappings_page. Moreover, make
this function robust against shadow_notrap_nonpresent_pte entries.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index b39ec626040e2..3592aea59ef7f 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3029,11 +3029,13 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
 				       " in nonleaf level: levels %d gva %lx"
 				       " level %d pte %llx\n", audit_msg,
 				       vcpu->arch.mmu.root_level, va, level, ent);
-
-			audit_mappings_page(vcpu, ent, va, level - 1);
+			else
+				audit_mappings_page(vcpu, ent, va, level - 1);
 		} else {
 			gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va);
-			hpa_t hpa = (hpa_t)gpa_to_pfn(vcpu, gpa) << PAGE_SHIFT;
+			gfn_t gfn = gpa >> PAGE_SHIFT;
+			pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
+			hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
 
 			if (is_shadow_present_pte(ent)
 			    && (ent & PT64_BASE_ADDR_MASK) != hpa)
-- 
GitLab


From 8061823a25218174f30c3dd943989e1d72f7d06e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:56 +0300
Subject: [PATCH 1789/2198] KVM: Make kvm_cpu_(has|get)_interrupt() work for
 userspace irqchip too

At the vector level, kernel and userspace irqchip are fairly similar.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/irq.c |  7 +++++++
 arch/x86/kvm/svm.c | 11 +++++++----
 arch/x86/kvm/vmx.c | 18 +++++++++---------
 arch/x86/kvm/x86.c |  4 ++--
 4 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index cf17ed52f6fbb..11c2757b808f9 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -24,6 +24,7 @@
 
 #include "irq.h"
 #include "i8254.h"
+#include "x86.h"
 
 /*
  * check if there are pending timer events
@@ -48,6 +49,9 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
 	struct kvm_pic *s;
 
+	if (!irqchip_in_kernel(v->kvm))
+		return v->arch.irq_summary;
+
 	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
 		if (kvm_apic_accept_pic_intr(v)) {
 			s = pic_irqchip(v->kvm);	/* PIC */
@@ -67,6 +71,9 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	struct kvm_pic *s;
 	int vector;
 
+	if (!irqchip_in_kernel(v->kvm))
+		return kvm_pop_irq(v);
+
 	vector = kvm_get_apic_interrupt(v);	/* APIC */
 	if (vector == -1) {
 		if (kvm_apic_accept_pic_intr(v)) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8fc6eea148e7f..6eef6d22e87ee 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2091,8 +2091,9 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (kvm_run->request_interrupt_window &&
-	    !svm->vcpu.arch.irq_summary) {
+	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
+	    kvm_run->request_interrupt_window &&
+	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
 		++svm->vcpu.stat.irq_window_exits;
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
@@ -2373,7 +2374,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
 		 (svm->vcpu.arch.hflags & HF_GIF_MASK));
 
-	if (svm->vcpu.arch.interrupt_window_open && svm->vcpu.arch.irq_summary)
+	if (svm->vcpu.arch.interrupt_window_open &&
+	    kvm_cpu_has_interrupt(&svm->vcpu))
 		/*
 		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
 		 */
@@ -2383,7 +2385,8 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 	 * Interrupts blocked.  Wait for unblock.
 	 */
 	if (!svm->vcpu.arch.interrupt_window_open &&
-	    (svm->vcpu.arch.irq_summary || kvm_run->request_interrupt_window))
+	    (kvm_cpu_has_interrupt(&svm->vcpu) ||
+	     kvm_run->request_interrupt_window))
 		svm_set_vintr(svm);
 	else
 		svm_clear_vintr(svm);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c6997c0e8ca64..b3292c1ea2f20 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2535,21 +2535,20 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
 		vmx_inject_nmi(vcpu);
 		if (vcpu->arch.nmi_pending)
 			enable_nmi_window(vcpu);
-		else if (vcpu->arch.irq_summary
-			 || kvm_run->request_interrupt_window)
+		else if (kvm_cpu_has_interrupt(vcpu) ||
+			 kvm_run->request_interrupt_window)
 			enable_irq_window(vcpu);
 		return;
 	}
 
 	if (vcpu->arch.interrupt_window_open) {
-		if (vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
-			kvm_queue_interrupt(vcpu, kvm_pop_irq(vcpu));
+		if (kvm_cpu_has_interrupt(vcpu) && !vcpu->arch.interrupt.pending)
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 
 		if (vcpu->arch.interrupt.pending)
 			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-	}
-	if (!vcpu->arch.interrupt_window_open &&
-	    (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
+	} else if(kvm_cpu_has_interrupt(vcpu) ||
+		  kvm_run->request_interrupt_window)
 		enable_irq_window(vcpu);
 }
 
@@ -2976,8 +2975,9 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
 	 * If the user space waits to inject interrupts, exit as soon as
 	 * possible
 	 */
-	if (kvm_run->request_interrupt_window &&
-	    !vcpu->arch.irq_summary) {
+	if (!irqchip_in_kernel(vcpu->kvm) &&
+	    kvm_run->request_interrupt_window &&
+	    !kvm_cpu_has_interrupt(vcpu)) {
 		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 		return 0;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2bad49b535c88..4c2eb7c0e1fb1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3069,7 +3069,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 					  struct kvm_run *kvm_run)
 {
-	return (!vcpu->arch.irq_summary &&
+	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
 		kvm_run->request_interrupt_window &&
 		vcpu->arch.interrupt_window_open &&
 		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
@@ -3086,7 +3086,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 	else
 		kvm_run->ready_for_interrupt_injection =
 					(vcpu->arch.interrupt_window_open &&
-					 vcpu->arch.irq_summary == 0);
+					 !kvm_cpu_has_interrupt(vcpu));
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
-- 
GitLab


From 863e8e658ee9ac6e5931b295eb7428456e450a0f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:57 +0300
Subject: [PATCH 1790/2198] KVM: VMX: Consolidate userspace and kernel
 interrupt injection for VMX

Use the same callback to inject irq/nmi events no matter what irqchip is
in use. Only from VMX for now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/svm.c              |  2 +-
 arch/x86/kvm/vmx.c              | 71 ++++++++-------------------------
 arch/x86/kvm/x86.c              |  2 +-
 4 files changed, 19 insertions(+), 58 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index cb306cff2b49a..5edae351b5dc1 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -517,7 +517,7 @@ struct kvm_x86_ops {
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
 	bool (*exception_injected)(struct kvm_vcpu *vcpu);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
+	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
 				       struct kvm_run *run);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6eef6d22e87ee..f2933abc96916 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2298,7 +2298,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 		(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
-static void svm_intr_assist(struct kvm_vcpu *vcpu)
+static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 	struct vmcb *vmcb = svm->vmcb;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b3292c1ea2f20..06252f7465d6e 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2510,48 +2510,6 @@ static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 	return vcpu->arch.interrupt_window_open;
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
-{
-	vmx_update_window_states(vcpu);
-
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-				GUEST_INTR_STATE_STI |
-				GUEST_INTR_STATE_MOV_SS);
-
-	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-		if (vcpu->arch.interrupt.pending) {
-			enable_nmi_window(vcpu);
-		} else if (vcpu->arch.nmi_window_open) {
-			vcpu->arch.nmi_pending = false;
-			vcpu->arch.nmi_injected = true;
-		} else {
-			enable_nmi_window(vcpu);
-			return;
-		}
-	}
-	if (vcpu->arch.nmi_injected) {
-		vmx_inject_nmi(vcpu);
-		if (vcpu->arch.nmi_pending)
-			enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu) ||
-			 kvm_run->request_interrupt_window)
-			enable_irq_window(vcpu);
-		return;
-	}
-
-	if (vcpu->arch.interrupt_window_open) {
-		if (kvm_cpu_has_interrupt(vcpu) && !vcpu->arch.interrupt.pending)
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-
-		if (vcpu->arch.interrupt.pending)
-			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-	} else if(kvm_cpu_has_interrupt(vcpu) ||
-		  kvm_run->request_interrupt_window)
-		enable_irq_window(vcpu);
-}
-
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
 {
 	int ret;
@@ -3351,8 +3309,11 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
-static void vmx_intr_assist(struct kvm_vcpu *vcpu)
+static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
+
 	update_tpr_threshold(vcpu);
 
 	vmx_update_window_states(vcpu);
@@ -3373,25 +3334,25 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
 			return;
 		}
 	}
+
 	if (vcpu->arch.nmi_injected) {
 		vmx_inject_nmi(vcpu);
-		if (vcpu->arch.nmi_pending)
-			enable_nmi_window(vcpu);
-		else if (kvm_cpu_has_interrupt(vcpu))
-			enable_irq_window(vcpu);
-		return;
+		goto out;
 	}
+
 	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
 		if (vcpu->arch.interrupt_window_open)
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-		else
-			enable_irq_window(vcpu);
 	}
-	if (vcpu->arch.interrupt.pending) {
+
+	if (vcpu->arch.interrupt.pending)
 		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-		if (kvm_cpu_has_interrupt(vcpu))
-			enable_irq_window(vcpu);
-	}
+
+out:
+	if (vcpu->arch.nmi_pending)
+		enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		enable_irq_window(vcpu);
 }
 
 /*
@@ -3733,7 +3694,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.queue_exception = vmx_queue_exception,
 	.exception_injected = vmx_exception_injected,
 	.inject_pending_irq = vmx_intr_assist,
-	.inject_pending_vectors = do_interrupt_requests,
+	.inject_pending_vectors = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 4c2eb7c0e1fb1..a84c96a7ea5e6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3173,7 +3173,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else if (irqchip_in_kernel(vcpu->kvm))
-		kvm_x86_ops->inject_pending_irq(vcpu);
+		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
 	else
 		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
 
-- 
GitLab


From 1f21e79aaced0a041e9399346960ce26ae0f5a4e Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:58 +0300
Subject: [PATCH 1791/2198] KVM: VMX: Cleanup vmx_intr_assist()

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 55 +++++++++++++++++++++++++---------------------
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 06252f7465d6e..9eb518fb907b0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3309,6 +3309,34 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
+static void vmx_intr_inject(struct kvm_vcpu *vcpu)
+{
+	/* try to reinject previous events if any */
+	if (vcpu->arch.nmi_injected) {
+		vmx_inject_nmi(vcpu);
+		return;
+	}
+
+	if (vcpu->arch.interrupt.pending) {
+		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		return;
+	}
+
+	/* try to inject new event if pending */
+	if (vcpu->arch.nmi_pending) {
+		if (vcpu->arch.nmi_window_open) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+			vmx_inject_nmi(vcpu);
+		}
+	} else if (kvm_cpu_has_interrupt(vcpu)) {
+		if (vcpu->arch.interrupt_window_open) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+		}
+	}
+}
+
 static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
@@ -3323,32 +3351,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 				GUEST_INTR_STATE_STI |
 				GUEST_INTR_STATE_MOV_SS);
 
-	if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
-		if (vcpu->arch.interrupt.pending) {
-			enable_nmi_window(vcpu);
-		} else if (vcpu->arch.nmi_window_open) {
-			vcpu->arch.nmi_pending = false;
-			vcpu->arch.nmi_injected = true;
-		} else {
-			enable_nmi_window(vcpu);
-			return;
-		}
-	}
-
-	if (vcpu->arch.nmi_injected) {
-		vmx_inject_nmi(vcpu);
-		goto out;
-	}
-
-	if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
-		if (vcpu->arch.interrupt_window_open)
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-	}
-
-	if (vcpu->arch.interrupt.pending)
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
+	vmx_intr_inject(vcpu);
 
-out:
+	/* enable NMI/IRQ window open exits if needed */
 	if (vcpu->arch.nmi_pending)
 		enable_nmi_window(vcpu);
 	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-- 
GitLab


From 5df56646472c42495dd2412c8d8aa72e59efe79a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:44:59 +0300
Subject: [PATCH 1792/2198] KVM: Use kvm_arch_interrupt_allowed() instead of
 checking interrupt_window_open directly

kvm_arch_interrupt_allowed() also checks IF so drop the check.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a84c96a7ea5e6..ae6250b197267 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3071,8 +3071,7 @@ static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu,
 {
 	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
 		kvm_run->request_interrupt_window &&
-		vcpu->arch.interrupt_window_open &&
-		(kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF));
+		kvm_arch_interrupt_allowed(vcpu));
 }
 
 static void post_kvm_run_save(struct kvm_vcpu *vcpu,
@@ -3085,7 +3084,7 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 		kvm_run->ready_for_interrupt_injection = 1;
 	else
 		kvm_run->ready_for_interrupt_injection =
-					(vcpu->arch.interrupt_window_open &&
+					(kvm_arch_interrupt_allowed(vcpu) &&
 					 !kvm_cpu_has_interrupt(vcpu));
 }
 
-- 
GitLab


From 9222be18f76b4410b4da0d06e1cc21079e64b3ec Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 23 Apr 2009 17:14:37 +0300
Subject: [PATCH 1793/2198] KVM: SVM: Coalesce userspace/kernel irqchip
 interrupt injection logic

Start to use interrupt/exception queues like VMX does.
This also fix the bug that if exit was caused by a guest
internal exception access to IDT the exception was not
reinjected.

Use EVENTINJ to inject interrupts.  Use VINT only for detecting when IRQ
windows is open again.  EVENTINJ ensures
the interrupt is injected immediately and not delayed.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 187 +++++++++++++++++++++------------------------
 1 file changed, 85 insertions(+), 102 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f2933abc96916..a80ffaa16a940 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -70,7 +70,6 @@ module_param(npt, int, S_IRUGO);
 static int nested = 0;
 module_param(nested, int, S_IRUGO);
 
-static void kvm_reput_irq(struct vcpu_svm *svm);
 static void svm_flush_tlb(struct kvm_vcpu *vcpu);
 
 static int nested_svm_exit_handled(struct vcpu_svm *svm, bool kvm_override);
@@ -199,9 +198,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 
 static bool svm_exception_injected(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	return !(svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID);
+	return false;
 }
 
 static int is_external_interrupt(u32 info)
@@ -978,12 +975,9 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 
 static int svm_get_irq(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-	u32 exit_int_info = svm->vmcb->control.exit_int_info;
-
-	if (is_external_interrupt(exit_int_info))
-		return exit_int_info & SVM_EVTINJ_VEC_MASK;
-	return -1;
+	if (!vcpu->arch.interrupt.pending)
+		return -1;
+	return vcpu->arch.interrupt.nr;
 }
 
 static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1090,17 +1084,8 @@ static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
 
 static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
-	u32 exit_int_info = svm->vmcb->control.exit_int_info;
-	struct kvm *kvm = svm->vcpu.kvm;
 	u64 fault_address;
 	u32 error_code;
-	bool event_injection = false;
-
-	if (!irqchip_in_kernel(kvm) &&
-	    is_external_interrupt(exit_int_info)) {
-		event_injection = true;
-		kvm_push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
-	}
 
 	fault_address  = svm->vmcb->control.exit_info_2;
 	error_code = svm->vmcb->control.exit_info_1;
@@ -1120,9 +1105,11 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	 */
 	if (npt_enabled)
 		svm_flush_tlb(&svm->vcpu);
-
-	if (!npt_enabled && event_injection)
-		kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+	else {
+		if (svm->vcpu.arch.interrupt.pending ||
+				svm->vcpu.arch.exception.pending)
+			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
+	}
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
 }
 
@@ -2196,7 +2183,6 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		}
 	}
 
-	kvm_reput_irq(svm);
 
 	if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2259,13 +2245,19 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 }
 
+static void svm_queue_irq(struct vcpu_svm *svm, unsigned nr)
+{
+	svm->vmcb->control.event_inj = nr |
+		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
+}
+
 static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	nested_svm_intr(svm);
 
-	svm_inject_irq(svm, irq);
+	svm_queue_irq(svm, irq);
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -2298,98 +2290,47 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
 		(svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
-static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
-	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb *vmcb = svm->vmcb;
-	int intr_vector = -1;
-
-	if ((vmcb->control.exit_int_info & SVM_EVTINJ_VALID) &&
-	    ((vmcb->control.exit_int_info & SVM_EVTINJ_TYPE_MASK) == 0)) {
-		intr_vector = vmcb->control.exit_int_info &
-			      SVM_EVTINJ_VEC_MASK;
-		vmcb->control.exit_int_info = 0;
-		svm_inject_irq(svm, intr_vector);
-		goto out;
-	}
-
-	if (vmcb->control.int_ctl & V_IRQ_MASK)
-		goto out;
-
-	if (!kvm_cpu_has_interrupt(vcpu))
-		goto out;
-
-	if (nested_svm_intr(svm))
-		goto out;
-
-	if (!(svm->vcpu.arch.hflags & HF_GIF_MASK))
-		goto out;
-
-	if (!(vmcb->save.rflags & X86_EFLAGS_IF) ||
-	    (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) ||
-	    (vmcb->control.event_inj & SVM_EVTINJ_VALID)) {
-		/* unable to deliver irq, set pending irq */
-		svm_set_vintr(svm);
-		svm_inject_irq(svm, 0x0);
-		goto out;
-	}
-	/* Okay, we can deliver the interrupt: grab it and update PIC state. */
-	intr_vector = kvm_cpu_get_interrupt(vcpu);
-	svm_inject_irq(svm, intr_vector);
-out:
-	update_cr8_intercept(vcpu);
+	svm_set_vintr(to_svm(vcpu));
+	svm_inject_irq(to_svm(vcpu), 0x0);
 }
 
-static void kvm_reput_irq(struct vcpu_svm *svm)
+static void svm_intr_inject(struct kvm_vcpu *vcpu)
 {
-	struct vmcb_control_area *control = &svm->vmcb->control;
-
-	if ((control->int_ctl & V_IRQ_MASK)
-	    && !irqchip_in_kernel(svm->vcpu.kvm)) {
-		control->int_ctl &= ~V_IRQ_MASK;
-		kvm_push_irq(&svm->vcpu, control->int_vector);
+	/* try to reinject previous events if any */
+	if (vcpu->arch.interrupt.pending) {
+		svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
+		return;
 	}
 
-	svm->vcpu.arch.interrupt_window_open =
-		!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		 (svm->vcpu.arch.hflags & HF_GIF_MASK);
-}
-
-static void svm_do_inject_vector(struct vcpu_svm *svm)
-{
-	svm_inject_irq(svm, kvm_pop_irq(&svm->vcpu));
+	/* try to inject new event if pending */
+	if (kvm_cpu_has_interrupt(vcpu)) {
+		if (vcpu->arch.interrupt_window_open) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+			svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
+		}
+	}
 }
 
-static void do_interrupt_requests(struct kvm_vcpu *vcpu,
-				       struct kvm_run *kvm_run)
+static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb_control_area *control = &svm->vmcb->control;
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
 
 	if (nested_svm_intr(svm))
-		return;
+		goto out;
 
-	svm->vcpu.arch.interrupt_window_open =
-		(!(control->int_state & SVM_INTERRUPT_SHADOW_MASK) &&
-		 (svm->vmcb->save.rflags & X86_EFLAGS_IF) &&
-		 (svm->vcpu.arch.hflags & HF_GIF_MASK));
+	svm->vcpu.arch.interrupt_window_open = svm_interrupt_allowed(vcpu);
 
-	if (svm->vcpu.arch.interrupt_window_open &&
-	    kvm_cpu_has_interrupt(&svm->vcpu))
-		/*
-		 * If interrupts enabled, and not blocked by sti or mov ss. Good.
-		 */
-		svm_do_inject_vector(svm);
+	svm_intr_inject(vcpu);
 
-	/*
-	 * Interrupts blocked.  Wait for unblock.
-	 */
-	if (!svm->vcpu.arch.interrupt_window_open &&
-	    (kvm_cpu_has_interrupt(&svm->vcpu) ||
-	     kvm_run->request_interrupt_window))
-		svm_set_vintr(svm);
-	else
-		svm_clear_vintr(svm);
+	if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		enable_irq_window(vcpu);
+
+out:
+	update_cr8_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2429,6 +2370,46 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
 }
 
+static void svm_complete_interrupts(struct vcpu_svm *svm)
+{
+	u8 vector;
+	int type;
+	u32 exitintinfo = svm->vmcb->control.exit_int_info;
+
+	svm->vcpu.arch.nmi_injected = false;
+	kvm_clear_exception_queue(&svm->vcpu);
+	kvm_clear_interrupt_queue(&svm->vcpu);
+
+	if (!(exitintinfo & SVM_EXITINTINFO_VALID))
+		return;
+
+	vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
+	type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
+
+	switch (type) {
+	case SVM_EXITINTINFO_TYPE_NMI:
+		svm->vcpu.arch.nmi_injected = true;
+		break;
+	case SVM_EXITINTINFO_TYPE_EXEPT:
+		/* In case of software exception do not reinject an exception
+		   vector, but re-execute and instruction instead */
+		if (vector == BP_VECTOR || vector == OF_VECTOR)
+			break;
+		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
+			u32 err = svm->vmcb->control.exit_int_info_err;
+			kvm_queue_exception_e(&svm->vcpu, vector, err);
+
+		} else
+			kvm_queue_exception(&svm->vcpu, vector);
+		break;
+	case SVM_EXITINTINFO_TYPE_INTR:
+		kvm_queue_interrupt(&svm->vcpu, vector);
+		break;
+	default:
+		break;
+	}
+}
+
 #ifdef CONFIG_X86_64
 #define R "r"
 #else
@@ -2557,6 +2538,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	sync_cr8_to_lapic(vcpu);
 
 	svm->next_rip = 0;
+
+	svm_complete_interrupts(svm);
 }
 
 #undef R
@@ -2678,7 +2661,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.queue_exception = svm_queue_exception,
 	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
-	.inject_pending_vectors = do_interrupt_requests,
+	.inject_pending_vectors = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
 
 	.set_tss_addr = svm_set_tss_addr,
-- 
GitLab


From 1cb948ae86f3d95cce58fac51d00766825f5f783 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:02 +0300
Subject: [PATCH 1794/2198] KVM: Remove exception_injected() callback.

It always return false for VMX/SVM now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/svm.c              | 6 ------
 arch/x86/kvm/vmx.c              | 6 ------
 arch/x86/kvm/x86.c              | 2 --
 4 files changed, 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5edae351b5dc1..ea3741edbec3e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -516,7 +516,6 @@ struct kvm_x86_ops {
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
-	bool (*exception_injected)(struct kvm_vcpu *vcpu);
 	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
 				       struct kvm_run *run);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index a80ffaa16a940..8fa5a0ead6809 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -196,11 +196,6 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	svm->vmcb->control.event_inj_err = error_code;
 }
 
-static bool svm_exception_injected(struct kvm_vcpu *vcpu)
-{
-	return false;
-}
-
 static int is_external_interrupt(u32 info)
 {
 	info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
@@ -2659,7 +2654,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
 	.queue_exception = svm_queue_exception,
-	.exception_injected = svm_exception_injected,
 	.inject_pending_irq = svm_intr_assist,
 	.inject_pending_vectors = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9eb518fb907b0..3186fcfcffb5c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -789,11 +789,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
 }
 
-static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
-{
-	return false;
-}
-
 /*
  * Swap MSR entry in host/guest MSR entry array.
  */
@@ -3697,7 +3692,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
 	.queue_exception = vmx_queue_exception,
-	.exception_injected = vmx_exception_injected,
 	.inject_pending_irq = vmx_intr_assist,
 	.inject_pending_vectors = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae6250b197267..b81970b053b98 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3237,8 +3237,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		profile_hit(KVM_PROFILING, (void *)rip);
 	}
 
-	if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
-		vcpu->arch.exception.pending = false;
 
 	kvm_lapic_sync_from_vapic(vcpu);
 
-- 
GitLab


From 1d6ed0cb95a2f0839e1a31f1971dc37cd60c258a Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:03 +0300
Subject: [PATCH 1795/2198] KVM: Remove inject_pending_vectors() callback

It is the same as inject_pending_irq() for VMX/SVM now.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 --
 arch/x86/kvm/svm.c              | 1 -
 arch/x86/kvm/vmx.c              | 1 -
 arch/x86/kvm/x86.c              | 4 +---
 4 files changed, 1 insertion(+), 7 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ea3741edbec3e..aa5a54eb4da4b 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -517,8 +517,6 @@ struct kvm_x86_ops {
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
 	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
-	void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
-				       struct kvm_run *run);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8fa5a0ead6809..290547537b4dd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2655,7 +2655,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_irq = svm_set_irq,
 	.queue_exception = svm_queue_exception,
 	.inject_pending_irq = svm_intr_assist,
-	.inject_pending_vectors = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
 
 	.set_tss_addr = svm_set_tss_addr,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 3186fcfcffb5c..9162b4cd34109 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3693,7 +3693,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.set_irq = vmx_inject_irq,
 	.queue_exception = vmx_queue_exception,
 	.inject_pending_irq = vmx_intr_assist,
-	.inject_pending_vectors = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index b81970b053b98..0890df9e88ff5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3171,10 +3171,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
-	else if (irqchip_in_kernel(vcpu->kvm))
-		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
 	else
-		kvm_x86_ops->inject_pending_vectors(vcpu, kvm_run);
+		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
 
 	kvm_lapic_sync_to_vapic(vcpu);
 
-- 
GitLab


From 115666dfc733a6749d99bcf3b2fe3fa253218b36 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:04 +0300
Subject: [PATCH 1796/2198] KVM: Remove kvm_push_irq()

No longer used.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.h | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 2ab679102dcd0..39350b2527258 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -30,11 +30,4 @@ static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
 		clear_bit(word_index, &vcpu->arch.irq_summary);
 	return irq;
 }
-
-static inline void kvm_push_irq(struct kvm_vcpu *vcpu, u8 irq)
-{
-        set_bit(irq, vcpu->arch.irq_pending);
-        set_bit(irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
-}
-
 #endif
-- 
GitLab


From 615d5193055880d44db92b72403b7549251ac2a6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:05 +0300
Subject: [PATCH 1797/2198] KVM: sync_lapic_to_cr8() should always sync cr8 to
 V_TPR

Even if IRQ chip is in userspace.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 290547537b4dd..143818eff52e0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2348,7 +2348,7 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
 
 	if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) {
 		int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
-		kvm_lapic_set_tpr(vcpu, cr8);
+		kvm_set_cr8(vcpu, cr8);
 	}
 }
 
@@ -2357,9 +2357,6 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 	u64 cr8;
 
-	if (!irqchip_in_kernel(vcpu->kvm))
-		return;
-
 	cr8 = kvm_get_cr8(vcpu);
 	svm->vmcb->control.int_ctl &= ~V_TPR_MASK;
 	svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
-- 
GitLab


From 0a5fff192388d2a74aa9ab5e0d394b745df9f225 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:06 +0300
Subject: [PATCH 1798/2198] KVM: Do not report TPR write to userspace if new
 value bigger or equal to a previous one.

Saves many exits to userspace in a case of IRQ chip in userspace.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c |  4 ++++
 arch/x86/kvm/vmx.c | 19 ++++++++++++-------
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 143818eff52e0..e283a63b2bca2 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1862,9 +1862,13 @@ static int emulate_on_interception(struct vcpu_svm *svm,
 
 static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
+	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
+	/* instruction emulation calls kvm_set_cr8() */
 	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
 	if (irqchip_in_kernel(svm->vcpu.kvm))
 		return 1;
+	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
+		return 1;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
 	return 0;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9162b4cd34109..51f804c8fe79d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2724,13 +2724,18 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
 			skip_emulated_instruction(vcpu);
 			return 1;
-		case 8:
-			kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
-			skip_emulated_instruction(vcpu);
-			if (irqchip_in_kernel(vcpu->kvm))
-				return 1;
-			kvm_run->exit_reason = KVM_EXIT_SET_TPR;
-			return 0;
+		case 8: {
+				u8 cr8_prev = kvm_get_cr8(vcpu);
+				u8 cr8 = kvm_register_read(vcpu, reg);
+				kvm_set_cr8(vcpu, cr8);
+				skip_emulated_instruction(vcpu);
+				if (irqchip_in_kernel(vcpu->kvm))
+					return 1;
+				if (cr8_prev <= cr8)
+					return 1;
+				kvm_run->exit_reason = KVM_EXIT_SET_TPR;
+				return 0;
+			}
 		};
 		break;
 	case 2: /* clts */
-- 
GitLab


From c4282df98ae0993983924c00ed76428a6609d68b Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:07 +0300
Subject: [PATCH 1799/2198] KVM: Get rid of arch.interrupt_window_open &
 arch.nmi_window_open

They are recalculated before each use anyway.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 --
 arch/x86/kvm/svm.c              |  6 +-----
 arch/x86/kvm/vmx.c              | 35 +++++++++++----------------------
 3 files changed, 12 insertions(+), 31 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index aa5a54eb4da4b..53533ea17555c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,7 +266,6 @@ struct kvm_mmu {
 
 struct kvm_vcpu_arch {
 	u64 host_tsc;
-	int interrupt_window_open;
 	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
 	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	/*
@@ -360,7 +359,6 @@ struct kvm_vcpu_arch {
 
 	bool nmi_pending;
 	bool nmi_injected;
-	bool nmi_window_open;
 
 	struct mtrr_state_type mtrr_state;
 	u32 pat;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e283a63b2bca2..0f53439296b93 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -216,8 +216,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 
 	kvm_rip_write(vcpu, svm->next_rip);
 	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-
-	vcpu->arch.interrupt_window_open = (svm->vcpu.arch.hflags & HF_GIF_MASK);
 }
 
 static int has_svm(void)
@@ -2305,7 +2303,7 @@ static void svm_intr_inject(struct kvm_vcpu *vcpu)
 
 	/* try to inject new event if pending */
 	if (kvm_cpu_has_interrupt(vcpu)) {
-		if (vcpu->arch.interrupt_window_open) {
+		if (svm_interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 			svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
 		}
@@ -2321,8 +2319,6 @@ static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (nested_svm_intr(svm))
 		goto out;
 
-	svm->vcpu.arch.interrupt_window_open = svm_interrupt_allowed(vcpu);
-
 	svm_intr_inject(vcpu);
 
 	if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 51f804c8fe79d..116eac01a9f0a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -753,7 +753,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	if (interruptibility & 3)
 		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
 			     interruptibility & ~3);
-	vcpu->arch.interrupt_window_open = 1;
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -2482,27 +2481,21 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
 
-static void vmx_update_window_states(struct kvm_vcpu *vcpu)
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
 {
-	u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-
-	vcpu->arch.nmi_window_open =
-		!(guest_intr & (GUEST_INTR_STATE_STI |
-				GUEST_INTR_STATE_MOV_SS |
-				GUEST_INTR_STATE_NMI));
 	if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked)
-		vcpu->arch.nmi_window_open = 0;
+		return 0;
 
-	vcpu->arch.interrupt_window_open =
-		((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
-		 !(guest_intr & (GUEST_INTR_STATE_STI |
-				 GUEST_INTR_STATE_MOV_SS)));
+	return	!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS |
+				GUEST_INTR_STATE_NMI));
 }
 
 static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
 {
-	vmx_update_window_states(vcpu);
-	return vcpu->arch.interrupt_window_open;
+	return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+		!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+			(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
 }
 
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -3194,9 +3187,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		       __func__, vectoring_info, exit_reason);
 
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) {
-		if (vcpu->arch.interrupt_window_open) {
+		if (vmx_interrupt_allowed(vcpu)) {
 			vmx->soft_vnmi_blocked = 0;
-			vcpu->arch.nmi_window_open = 1;
 		} else if (vmx->vnmi_blocked_time > 1000000000LL &&
 			   vcpu->arch.nmi_pending) {
 			/*
@@ -3209,7 +3201,6 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			       "state on VCPU %d after 1 s timeout\n",
 			       __func__, vcpu->vcpu_id);
 			vmx->soft_vnmi_blocked = 0;
-			vmx->vcpu.arch.nmi_window_open = 1;
 		}
 	}
 
@@ -3324,13 +3315,13 @@ static void vmx_intr_inject(struct kvm_vcpu *vcpu)
 
 	/* try to inject new event if pending */
 	if (vcpu->arch.nmi_pending) {
-		if (vcpu->arch.nmi_window_open) {
+		if (vmx_nmi_allowed(vcpu)) {
 			vcpu->arch.nmi_pending = false;
 			vcpu->arch.nmi_injected = true;
 			vmx_inject_nmi(vcpu);
 		}
 	} else if (kvm_cpu_has_interrupt(vcpu)) {
-		if (vcpu->arch.interrupt_window_open) {
+		if (vmx_interrupt_allowed(vcpu)) {
 			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
 			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
 		}
@@ -3344,8 +3335,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	update_tpr_threshold(vcpu);
 
-	vmx_update_window_states(vcpu);
-
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 				GUEST_INTR_STATE_STI |
@@ -3518,8 +3507,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vmx->rmode.irq.pending)
 		fixup_rmode_irq(vmx);
 
-	vmx_update_window_states(vcpu);
-
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
-- 
GitLab


From 95ba82731374eb1c2af4dd442526c4b314f0e8b6 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:08 +0300
Subject: [PATCH 1800/2198] KVM: SVM: Add NMI injection support

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  8 ++-
 arch/x86/kvm/svm.c              | 96 +++++++++++++++++----------------
 arch/x86/kvm/vmx.c              | 79 +++++++--------------------
 arch/x86/kvm/x86.c              | 71 +++++++++++++++++++++++-
 4 files changed, 145 insertions(+), 109 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 53533ea17555c..dd9ecd3de90d6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -512,10 +512,15 @@ struct kvm_x86_ops {
 				unsigned char *hypercall_addr);
 	int (*get_irq)(struct kvm_vcpu *vcpu);
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
-	void (*inject_pending_irq)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
+	int (*nmi_allowed)(struct kvm_vcpu *vcpu);
+	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
+	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
+	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
+	void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	int (*get_mt_mask_shift)(void);
@@ -763,6 +768,7 @@ enum {
 #define HF_GIF_MASK		(1 << 0)
 #define HF_HIF_MASK		(1 << 1)
 #define HF_VINTR_MASK		(1 << 2)
+#define HF_NMI_MASK		(1 << 3)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0f53439296b93..18072888efc51 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1843,6 +1843,14 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	return 1;
 }
 
+static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
+{
+	++svm->vcpu.stat.nmi_window_exits;
+	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
+	svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+	return 1;
+}
+
 static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
@@ -1863,8 +1871,10 @@ static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
 	emulate_instruction(&svm->vcpu, NULL, 0, 0, 0);
-	if (irqchip_in_kernel(svm->vcpu.kvm))
+	if (irqchip_in_kernel(svm->vcpu.kvm)) {
+		svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
 		return 1;
+	}
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 		return 1;
 	kvm_run->exit_reason = KVM_EXIT_SET_TPR;
@@ -2120,6 +2130,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
 	[SVM_EXIT_VINTR]			= interrupt_window_interception,
 	/* [SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception, */
 	[SVM_EXIT_CPUID]			= cpuid_interception,
+	[SVM_EXIT_IRET]                         = iret_interception,
 	[SVM_EXIT_INVD]                         = emulate_on_interception,
 	[SVM_EXIT_HLT]				= halt_interception,
 	[SVM_EXIT_INVLPG]			= invlpg_interception,
@@ -2227,6 +2238,21 @@ static void pre_svm_run(struct vcpu_svm *svm)
 		new_asid(svm, svm_data);
 }
 
+static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+}
+
+static void svm_inject_nmi(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
+	vcpu->arch.hflags |= HF_NMI_MASK;
+	svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET);
+	++vcpu->stat.nmi_injections;
+}
 
 static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 {
@@ -2242,8 +2268,10 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 		((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
 }
 
-static void svm_queue_irq(struct vcpu_svm *svm, unsigned nr)
+static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+
 	svm->vmcb->control.event_inj = nr |
 		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
@@ -2254,28 +2282,26 @@ static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
 
 	nested_svm_intr(svm);
 
-	svm_queue_irq(svm, irq);
+	svm_queue_irq(vcpu, irq);
 }
 
-static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	struct vmcb *vmcb = svm->vmcb;
-	int max_irr, tpr;
 
-	if (!irqchip_in_kernel(vcpu->kvm) || vcpu->arch.apic->vapic_addr)
+	if (irr == -1)
 		return;
 
-	vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
-
-	max_irr = kvm_lapic_find_highest_irr(vcpu);
-	if (max_irr == -1)
-		return;
-
-	tpr = kvm_lapic_get_cr8(vcpu) << 4;
+	if (tpr >= irr)
+		svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+}
 
-	if (tpr >= (max_irr & 0xf0))
-		vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK;
+static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb *vmcb = svm->vmcb;
+	return !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) &&
+		!(svm->vcpu.arch.hflags & HF_NMI_MASK);
 }
 
 static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
@@ -2293,39 +2319,12 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
 	svm_inject_irq(to_svm(vcpu), 0x0);
 }
 
-static void svm_intr_inject(struct kvm_vcpu *vcpu)
-{
-	/* try to reinject previous events if any */
-	if (vcpu->arch.interrupt.pending) {
-		svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
-		return;
-	}
-
-	/* try to inject new event if pending */
-	if (kvm_cpu_has_interrupt(vcpu)) {
-		if (svm_interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-			svm_queue_irq(to_svm(vcpu), vcpu->arch.interrupt.nr);
-		}
-	}
-}
-
-static void svm_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-		kvm_run->request_interrupt_window;
 
-	if (nested_svm_intr(svm))
-		goto out;
-
-	svm_intr_inject(vcpu);
-
-	if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
 		enable_irq_window(vcpu);
-
-out:
-	update_cr8_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2650,9 +2649,14 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.patch_hypercall = svm_patch_hypercall,
 	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
+	.set_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
-	.inject_pending_irq = svm_intr_assist,
 	.interrupt_allowed = svm_interrupt_allowed,
+	.nmi_allowed = svm_nmi_allowed,
+	.enable_nmi_window = enable_nmi_window,
+	.enable_irq_window = enable_irq_window,
+	.update_cr8_intercept = update_cr8_intercept,
+	.drop_interrupt_shadow = svm_drop_interrupt_shadow,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 116eac01a9f0a..bad2413fbd51c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1314,6 +1314,9 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_flexpriority())
 		flexpriority_enabled = 0;
 
+	if (!cpu_has_vmx_tpr_shadow())
+		kvm_x86_ops->update_cr8_intercept = NULL;
+
 	return alloc_kvm_area();
 }
 
@@ -2404,6 +2407,12 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
+void vmx_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
+{
+	vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
+			GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+}
+
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -3214,21 +3223,14 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static void update_tpr_threshold(struct kvm_vcpu *vcpu)
+static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
 {
-	int max_irr, tpr;
-
-	if (!vm_need_tpr_shadow(vcpu->kvm))
-		return;
-
-	if (!kvm_lapic_enabled(vcpu) ||
-	    ((max_irr = kvm_lapic_find_highest_irr(vcpu)) == -1)) {
+	if (irr == -1 || tpr < irr) {
 		vmcs_write32(TPR_THRESHOLD, 0);
 		return;
 	}
 
-	tpr = (kvm_lapic_get_cr8(vcpu) & 0x0f) << 4;
-	vmcs_write32(TPR_THRESHOLD, (max_irr > tpr) ? tpr >> 4 : max_irr >> 4);
+	vmcs_write32(TPR_THRESHOLD, irr);
 }
 
 static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
@@ -3300,55 +3302,6 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	}
 }
 
-static void vmx_intr_inject(struct kvm_vcpu *vcpu)
-{
-	/* try to reinject previous events if any */
-	if (vcpu->arch.nmi_injected) {
-		vmx_inject_nmi(vcpu);
-		return;
-	}
-
-	if (vcpu->arch.interrupt.pending) {
-		vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-		return;
-	}
-
-	/* try to inject new event if pending */
-	if (vcpu->arch.nmi_pending) {
-		if (vmx_nmi_allowed(vcpu)) {
-			vcpu->arch.nmi_pending = false;
-			vcpu->arch.nmi_injected = true;
-			vmx_inject_nmi(vcpu);
-		}
-	} else if (kvm_cpu_has_interrupt(vcpu)) {
-		if (vmx_interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-			vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
-		}
-	}
-}
-
-static void vmx_intr_assist(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-		kvm_run->request_interrupt_window;
-
-	update_tpr_threshold(vcpu);
-
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-				GUEST_INTR_STATE_STI |
-				GUEST_INTR_STATE_MOV_SS);
-
-	vmx_intr_inject(vcpu);
-
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		enable_irq_window(vcpu);
-}
-
 /*
  * Failure to inject an interrupt should give us the information
  * in IDT_VECTORING_INFO_FIELD.  However, if the failure occurs
@@ -3683,9 +3636,15 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.patch_hypercall = vmx_patch_hypercall,
 	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
+	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
-	.inject_pending_irq = vmx_intr_assist,
 	.interrupt_allowed = vmx_interrupt_allowed,
+	.nmi_allowed = vmx_nmi_allowed,
+	.enable_nmi_window = enable_nmi_window,
+	.enable_irq_window = enable_irq_window,
+	.update_cr8_intercept = update_cr8_intercept,
+	.drop_interrupt_shadow = vmx_drop_interrupt_shadow,
+
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
 	.get_mt_mask_shift = vmx_get_mt_mask_shift,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0890df9e88ff5..96e995c1dd76d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3114,6 +3114,68 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
 	up_read(&vcpu->kvm->slots_lock);
 }
 
+static void update_cr8_intercept(struct kvm_vcpu *vcpu)
+{
+	int max_irr, tpr;
+
+	if (!kvm_x86_ops->update_cr8_intercept)
+		return;
+
+	max_irr = kvm_lapic_find_highest_irr(vcpu);
+
+	if (max_irr != -1)
+		max_irr >>= 4;
+
+	tpr = kvm_lapic_get_cr8(vcpu);
+
+	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+}
+
+static void inject_irq(struct kvm_vcpu *vcpu)
+{
+	/* try to reinject previous events if any */
+	if (vcpu->arch.nmi_injected) {
+		kvm_x86_ops->set_nmi(vcpu);
+		return;
+	}
+
+	if (vcpu->arch.interrupt.pending) {
+		kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+		return;
+	}
+
+	/* try to inject new event if pending */
+	if (vcpu->arch.nmi_pending) {
+		if (kvm_x86_ops->nmi_allowed(vcpu)) {
+			vcpu->arch.nmi_pending = false;
+			vcpu->arch.nmi_injected = true;
+			kvm_x86_ops->set_nmi(vcpu);
+		}
+	} else if (kvm_cpu_has_interrupt(vcpu)) {
+		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
+			kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+		}
+	}
+}
+
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
+
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		kvm_x86_ops->drop_interrupt_shadow(vcpu);
+
+	inject_irq(vcpu);
+
+	/* enable NMI/IRQ window open exits if needed */
+	if (vcpu->arch.nmi_pending)
+		kvm_x86_ops->enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		kvm_x86_ops->enable_irq_window(vcpu);
+}
+
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
@@ -3172,9 +3234,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else
-		kvm_x86_ops->inject_pending_irq(vcpu, kvm_run);
+		inject_pending_irq(vcpu, kvm_run);
 
-	kvm_lapic_sync_to_vapic(vcpu);
+	if (kvm_lapic_enabled(vcpu)) {
+		if (!vcpu->arch.apic->vapic_addr)
+			update_cr8_intercept(vcpu);
+		else
+			kvm_lapic_sync_to_vapic(vcpu);
+	}
 
 	up_read(&vcpu->kvm->slots_lock);
 
-- 
GitLab


From 16d7a191170f0ca48c2c3277017b3e6d275e0711 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:10 +0300
Subject: [PATCH 1801/2198] KVM: Fix userspace IRQ chip migration

Re-put pending IRQ vector into interrupt_bitmap before migration.
Otherwise it will be lost if migration happens in the wrong time.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96e995c1dd76d..63917216a051b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3566,17 +3566,17 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->efer = vcpu->arch.shadow_efer;
 	sregs->apic_base = kvm_get_apic_base(vcpu);
 
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (irqchip_in_kernel(vcpu->kvm))
 		memset(sregs->interrupt_bitmap, 0,
 		       sizeof sregs->interrupt_bitmap);
-		pending_vec = kvm_x86_ops->get_irq(vcpu);
-		if (pending_vec >= 0)
-			set_bit(pending_vec,
-				(unsigned long *)sregs->interrupt_bitmap);
-	} else
+	else
 		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
 		       sizeof sregs->interrupt_bitmap);
 
+	pending_vec = kvm_x86_ops->get_irq(vcpu);
+	if (pending_vec >= 0)
+		set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
+
 	vcpu_put(vcpu);
 
 	return 0;
-- 
GitLab


From 14d0bc1f7c8226d5088e7182c3b53e0c7e91d1af Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Tue, 21 Apr 2009 17:45:11 +0300
Subject: [PATCH 1802/2198] KVM: Get rid of get_irq() callback

It just returns pending IRQ vector from the queue for VMX/SVM.
Get IRQ directly from the queue before migration and put it back
after.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 -
 arch/x86/kvm/svm.c              |  8 --------
 arch/x86/kvm/vmx.c              |  8 --------
 arch/x86/kvm/x86.c              | 12 +++++-------
 4 files changed, 5 insertions(+), 24 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index dd9ecd3de90d6..3e94d0513208d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -510,7 +510,6 @@ struct kvm_x86_ops {
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
-	int (*get_irq)(struct kvm_vcpu *vcpu);
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 18072888efc51..d96a6d3edec7e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -966,13 +966,6 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 	return 0;
 }
 
-static int svm_get_irq(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->arch.interrupt.pending)
-		return -1;
-	return vcpu->arch.interrupt.nr;
-}
-
 static void load_host_msrs(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_X86_64
@@ -2647,7 +2640,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.handle_exit = handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
 	.patch_hypercall = svm_patch_hypercall,
-	.get_irq = svm_get_irq,
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
 	.queue_exception = svm_queue_exception,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index bad2413fbd51c..25be53aa5eefc 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -1045,13 +1045,6 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 	return 0;
 }
 
-static int vmx_get_irq(struct kvm_vcpu *vcpu)
-{
-	if (!vcpu->arch.interrupt.pending)
-		return -1;
-	return vcpu->arch.interrupt.nr;
-}
-
 static __init int cpu_has_kvm_support(void)
 {
 	return cpu_has_vmx();
@@ -3634,7 +3627,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
 	.patch_hypercall = vmx_patch_hypercall,
-	.get_irq = vmx_get_irq,
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
 	.queue_exception = vmx_queue_exception,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 63917216a051b..0f3e04b74a66a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3536,7 +3536,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
 	struct descriptor_table dt;
-	int pending_vec;
 
 	vcpu_load(vcpu);
 
@@ -3573,9 +3572,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
 		       sizeof sregs->interrupt_bitmap);
 
-	pending_vec = kvm_x86_ops->get_irq(vcpu);
-	if (pending_vec >= 0)
-		set_bit(pending_vec, (unsigned long *)sregs->interrupt_bitmap);
+	if (vcpu->arch.interrupt.pending)
+		set_bit(vcpu->arch.interrupt.nr,
+			(unsigned long *)sregs->interrupt_bitmap);
 
 	vcpu_put(vcpu);
 
@@ -4097,9 +4096,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 			max_bits);
 		/* Only pending external irq is handled here */
 		if (pending_vec < max_bits) {
-			kvm_x86_ops->set_irq(vcpu, pending_vec);
-			pr_debug("Set back pending irq %d\n",
-				 pending_vec);
+			kvm_queue_interrupt(vcpu, pending_vec);
+			pr_debug("Set back pending irq %d\n", pending_vec);
 		}
 		kvm_pic_clear_isr_ack(vcpu->kvm);
 	}
-- 
GitLab


From 9b62e5b10ff0f98346bcbe4a4fe3a0ca8fa7be30 Mon Sep 17 00:00:00 2001
From: Jan Blunck <jblunck@suse.de>
Date: Tue, 7 Apr 2009 23:58:56 +0000
Subject: [PATCH 1803/2198] KVM: Wake up waitqueue before calling get_cpu()

This moves the get_cpu() call down to be called after we wake up the
waiters. Therefore the waitqueue locks can safely be rt mutex.

Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Sven-Thorsten Dietrich <sven@thebigcorporation.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0f3e04b74a66a..e2713716e7320 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4569,7 +4569,7 @@ static void vcpu_kick_intr(void *info)
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
 	int ipi_pcpu = vcpu->cpu;
-	int cpu = get_cpu();
+	int cpu;
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
@@ -4579,6 +4579,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 	 * We may be called synchronously with irqs disabled in guest mode,
 	 * So need not to call smp_call_function_single() in that case.
 	 */
+	cpu = get_cpu();
 	if (vcpu->guest_mode && vcpu->cpu != cpu)
 		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
 	put_cpu();
-- 
GitLab


From 4b12f0de33a64dfc624b2480f55b674f7fa23ef2 Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 27 Apr 2009 20:35:42 +0800
Subject: [PATCH 1804/2198] KVM: Replace get_mt_mask_shift with get_mt_mask

Shadow_mt_mask is out of date, now it have only been used as a flag to indicate
if TDP enabled. Get rid of it and use tdp_enabled instead.

Also put memory type logical in kvm_x86_ops->get_mt_mask().

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 +++--
 arch/x86/kvm/mmu.c              | 21 ++++++---------------
 arch/x86/kvm/svm.c              |  4 ++--
 arch/x86/kvm/vmx.c              | 17 ++++++++++++-----
 arch/x86/kvm/x86.c              |  2 +-
 5 files changed, 24 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 3e94d0513208d..8a6f6b643dfe6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -522,7 +522,7 @@ struct kvm_x86_ops {
 	void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
-	int (*get_mt_mask_shift)(void);
+	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
 };
 
 extern struct kvm_x86_ops *kvm_x86_ops;
@@ -536,7 +536,7 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu);
 void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
 void kvm_mmu_set_base_ptes(u64 base_pte);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask);
+		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
@@ -550,6 +550,7 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 			  const void *val, int bytes);
 int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
 		  gpa_t addr, unsigned long *ret);
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 extern bool tdp_enabled;
 
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3592aea59ef7f..bc614f91f5ba8 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -178,7 +178,6 @@ static u64 __read_mostly shadow_x_mask;	/* mutual exclusive with nx_mask */
 static u64 __read_mostly shadow_user_mask;
 static u64 __read_mostly shadow_accessed_mask;
 static u64 __read_mostly shadow_dirty_mask;
-static u64 __read_mostly shadow_mt_mask;
 
 static inline u64 rsvd_bits(int s, int e)
 {
@@ -199,14 +198,13 @@ void kvm_mmu_set_base_ptes(u64 base_pte)
 EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
 
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
-		u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 mt_mask)
+		u64 dirty_mask, u64 nx_mask, u64 x_mask)
 {
 	shadow_user_mask = user_mask;
 	shadow_accessed_mask = accessed_mask;
 	shadow_dirty_mask = dirty_mask;
 	shadow_nx_mask = nx_mask;
 	shadow_x_mask = x_mask;
-	shadow_mt_mask = mt_mask;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
@@ -1608,7 +1606,7 @@ static int get_mtrr_type(struct mtrr_state_type *mtrr_state,
 	return mtrr_state->def_type;
 }
 
-static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
+u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 {
 	u8 mtrr;
 
@@ -1618,6 +1616,7 @@ static u8 get_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 		mtrr = MTRR_TYPE_WRBACK;
 	return mtrr;
 }
+EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 
 static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
@@ -1670,7 +1669,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 {
 	u64 spte;
 	int ret = 0;
-	u64 mt_mask = shadow_mt_mask;
 
 	/*
 	 * We don't set the accessed bit, since we sometimes want to see
@@ -1690,16 +1688,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
 		spte |= shadow_user_mask;
 	if (largepage)
 		spte |= PT_PAGE_SIZE_MASK;
-	if (mt_mask) {
-		if (!kvm_is_mmio_pfn(pfn)) {
-			mt_mask = get_memory_type(vcpu, gfn) <<
-				kvm_x86_ops->get_mt_mask_shift();
-			mt_mask |= VMX_EPT_IGMT_BIT;
-		} else
-			mt_mask = MTRR_TYPE_UNCACHABLE <<
-				kvm_x86_ops->get_mt_mask_shift();
-		spte |= mt_mask;
-	}
+	if (tdp_enabled)
+		spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
+			kvm_is_mmio_pfn(pfn));
 
 	spte |= (u64)pfn << PAGE_SHIFT;
 
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d96a6d3edec7e..63503782935d6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2589,7 +2589,7 @@ static int get_npt_level(void)
 #endif
 }
 
-static int svm_get_mt_mask_shift(void)
+static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	return 0;
 }
@@ -2652,7 +2652,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
-	.get_mt_mask_shift = svm_get_mt_mask_shift,
+	.get_mt_mask = svm_get_mt_mask,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 25be53aa5eefc..59b080c262e80 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3577,9 +3577,17 @@ static int get_ept_level(void)
 	return VMX_EPT_DEFAULT_GAW + 1;
 }
 
-static int vmx_get_mt_mask_shift(void)
+static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
-	return VMX_EPT_MT_EPTE_SHIFT;
+	u64 ret;
+
+	if (is_mmio)
+		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+	else
+		ret = (kvm_get_guest_memory_type(vcpu, gfn) <<
+			VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IGMT_BIT;
+
+	return ret;
 }
 
 static struct kvm_x86_ops vmx_x86_ops = {
@@ -3639,7 +3647,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
-	.get_mt_mask_shift = vmx_get_mt_mask_shift,
+	.get_mt_mask = vmx_get_mt_mask,
 };
 
 static int __init vmx_init(void)
@@ -3698,8 +3706,7 @@ static int __init vmx_init(void)
 		kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
 			VMX_EPT_WRITABLE_MASK);
 		kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
-				VMX_EPT_EXECUTABLE_MASK,
-				VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
+				VMX_EPT_EXECUTABLE_MASK);
 		kvm_enable_tdp();
 	} else
 		kvm_disable_tdp();
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e2713716e7320..dd056826f6754 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2772,7 +2772,7 @@ int kvm_arch_init(void *opaque)
 	kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
 	kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
-			PT_DIRTY_MASK, PT64_NX_MASK, 0, 0);
+			PT_DIRTY_MASK, PT64_NX_MASK, 0);
 
 	for_each_possible_cpu(cpu)
 		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
-- 
GitLab


From 522c68c4416de3cd3e11a9ff10d58e776a69ae1e Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Mon, 27 Apr 2009 20:35:43 +0800
Subject: [PATCH 1805/2198] KVM: Enable snooping control for supported hardware

Memory aliases with different memory type is a problem for guest. For the guest
without assigned device, the memory type of guest memory would always been the
same as host(WB); but for the assigned device, some part of memory may be used
as DMA and then set to uncacheable memory type(UC/WC), which would be a conflict of
host memory type then be a potential issue.

Snooping control can guarantee the cache correctness of memory go through the
DMA engine of VT-d.

[avi: fix build on ia64]

Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/include/asm/kvm_host.h |  1 +
 arch/x86/include/asm/kvm_host.h  |  1 +
 arch/x86/kvm/vmx.c               | 19 +++++++++++++++++--
 include/linux/kvm_host.h         |  3 +++
 virt/kvm/iommu.c                 | 27 ++++++++++++++++++++++++---
 5 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/arch/ia64/include/asm/kvm_host.h b/arch/ia64/include/asm/kvm_host.h
index 589536fa799d7..5f43697aed301 100644
--- a/arch/ia64/include/asm/kvm_host.h
+++ b/arch/ia64/include/asm/kvm_host.h
@@ -474,6 +474,7 @@ struct kvm_arch {
 
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct hlist_head irq_ack_notifier_list;
 
 	unsigned long irq_sources_bitmap;
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a6f6b643dfe6..253d8f669cf68 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -393,6 +393,7 @@ struct kvm_arch{
 	struct list_head active_mmu_pages;
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
+	int iommu_flags;
 	struct kvm_pic *vpic;
 	struct kvm_ioapic *vioapic;
 	struct kvm_pit *vpit;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 59b080c262e80..e8a5649f9c159 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3581,11 +3581,26 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 {
 	u64 ret;
 
+	/* For VT-d and EPT combination
+	 * 1. MMIO: always map as UC
+	 * 2. EPT with VT-d:
+	 *   a. VT-d without snooping control feature: can't guarantee the
+	 *	result, try to trust guest.
+	 *   b. VT-d with snooping control feature: snooping control feature of
+	 *	VT-d engine can guarantee the cache correctness. Just set it
+	 *	to WB to keep consistent with host. So the same as item 3.
+	 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep
+	 *    consistent with host MTRR
+	 */
 	if (is_mmio)
 		ret = MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
+	else if (vcpu->kvm->arch.iommu_domain &&
+		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY))
+		ret = kvm_get_guest_memory_type(vcpu, gfn) <<
+		      VMX_EPT_MT_EPTE_SHIFT;
 	else
-		ret = (kvm_get_guest_memory_type(vcpu, gfn) <<
-			VMX_EPT_MT_EPTE_SHIFT) | VMX_EPT_IGMT_BIT;
+		ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
+			| VMX_EPT_IGMT_BIT;
 
 	return ret;
 }
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 72d56844f3880..bdce8e1303c92 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -367,6 +367,9 @@ void kvm_unregister_irq_ack_notifier(struct kvm_irq_ack_notifier *kian);
 int kvm_request_irq_source_id(struct kvm *kvm);
 void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id);
 
+/* For vcpu->arch.iommu_flags */
+#define KVM_IOMMU_CACHE_COHERENCY	0x1
+
 #ifdef CONFIG_IOMMU_API
 int kvm_iommu_map_pages(struct kvm *kvm, gfn_t base_gfn,
 			unsigned long npages);
diff --git a/virt/kvm/iommu.c b/virt/kvm/iommu.c
index 4c40375036008..15147583abd16 100644
--- a/virt/kvm/iommu.c
+++ b/virt/kvm/iommu.c
@@ -39,11 +39,16 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 	pfn_t pfn;
 	int i, r = 0;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
+	int flags;
 
 	/* check if iommu exists and in use */
 	if (!domain)
 		return 0;
 
+	flags = IOMMU_READ | IOMMU_WRITE;
+	if (kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY)
+		flags |= IOMMU_CACHE;
+
 	for (i = 0; i < npages; i++) {
 		/* check if already mapped */
 		if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn)))
@@ -53,8 +58,7 @@ int kvm_iommu_map_pages(struct kvm *kvm,
 		r = iommu_map_range(domain,
 				    gfn_to_gpa(gfn),
 				    pfn_to_hpa(pfn),
-				    PAGE_SIZE,
-				    IOMMU_READ | IOMMU_WRITE);
+				    PAGE_SIZE, flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
 			       "iommu failed to map pfn=%lx\n", pfn);
@@ -88,7 +92,7 @@ int kvm_assign_device(struct kvm *kvm,
 {
 	struct pci_dev *pdev = NULL;
 	struct iommu_domain *domain = kvm->arch.iommu_domain;
-	int r;
+	int r, last_flags;
 
 	/* check if iommu exists and in use */
 	if (!domain)
@@ -107,12 +111,29 @@ int kvm_assign_device(struct kvm *kvm,
 		return r;
 	}
 
+	last_flags = kvm->arch.iommu_flags;
+	if (iommu_domain_has_cap(kvm->arch.iommu_domain,
+				 IOMMU_CAP_CACHE_COHERENCY))
+		kvm->arch.iommu_flags |= KVM_IOMMU_CACHE_COHERENCY;
+
+	/* Check if need to update IOMMU page table for guest memory */
+	if ((last_flags ^ kvm->arch.iommu_flags) ==
+			KVM_IOMMU_CACHE_COHERENCY) {
+		kvm_iommu_unmap_memslots(kvm);
+		r = kvm_iommu_map_memslots(kvm);
+		if (r)
+			goto out_unmap;
+	}
+
 	printk(KERN_DEBUG "assign device: host bdf = %x:%x:%x\n",
 		assigned_dev->host_busnr,
 		PCI_SLOT(assigned_dev->host_devfn),
 		PCI_FUNC(assigned_dev->host_devfn));
 
 	return 0;
+out_unmap:
+	kvm_iommu_unmap_memslots(kvm);
+	return r;
 }
 
 int kvm_deassign_device(struct kvm *kvm,
-- 
GitLab


From 8e1c18157d8772e65247ac55525a05a26b83fffe Mon Sep 17 00:00:00 2001
From: Sheng Yang <sheng@linux.intel.com>
Date: Wed, 29 Apr 2009 11:09:04 +0800
Subject: [PATCH 1806/2198] KVM: VMX: Disable VMX when system shutdown

Intel TXT(Trusted Execution Technology) required VMX off for all cpu to work
when system shutdown.

CC: Joseph Cihula <joseph.cihula@intel.com>
Signed-off-by: Sheng Yang <sheng@linux.intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 63d5fa2bc84ac..29c0afb064da1 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2391,15 +2391,15 @@ EXPORT_SYMBOL_GPL(kvm_handle_fault_on_reboot);
 static int kvm_reboot(struct notifier_block *notifier, unsigned long val,
 		      void *v)
 {
-	if (val == SYS_RESTART) {
-		/*
-		 * Some (well, at least mine) BIOSes hang on reboot if
-		 * in vmx root mode.
-		 */
-		printk(KERN_INFO "kvm: exiting hardware virtualization\n");
-		kvm_rebooting = true;
-		on_each_cpu(hardware_disable, NULL, 1);
-	}
+	/*
+	 * Some (well, at least mine) BIOSes hang on reboot if
+	 * in vmx root mode.
+	 *
+	 * And Intel TXT required VMX off for all cpu when system shutdown.
+	 */
+	printk(KERN_INFO "kvm: exiting hardware virtualization\n");
+	kvm_rebooting = true;
+	on_each_cpu(hardware_disable, NULL, 1);
 	return NOTIFY_OK;
 }
 
-- 
GitLab


From efbc100c20f9dc4a219a2a92763d68e4064109f2 Mon Sep 17 00:00:00 2001
From: Chris Wright <chrisw@sous-sol.org>
Date: Fri, 1 May 2009 14:15:43 -0700
Subject: [PATCH 1807/2198] KVM: Trivial format fix in setup_routing_entry()

Remove extra tab.

Signed-off-by: Chris Wright <chrisw@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/irq_comm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c
index 4fa1f604b4250..a8bd466d00ccf 100644
--- a/virt/kvm/irq_comm.c
+++ b/virt/kvm/irq_comm.c
@@ -271,7 +271,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e,
 			delta = 8;
 			break;
 		case KVM_IRQCHIP_IOAPIC:
-				e->set = kvm_set_ioapic_irq;
+			e->set = kvm_set_ioapic_irq;
 			break;
 		default:
 			goto out;
-- 
GitLab


From 9b5843ddd20557b77c73ddbcc814a8c816bf3d3a Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Wed, 29 Apr 2009 17:29:09 -0400
Subject: [PATCH 1808/2198] KVM: fix apic_debug instances

Apparently nobody turned this on in a while...
setting apic_debug to something compilable, generates
some errors. This patch fixes it.

Signed-off-by: Glauber Costa <glommer@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/lapic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d76bb6c5e516..ae99d83f81a34 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -445,7 +445,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
 	apic_debug("icr_high 0x%x, icr_low 0x%x, "
 		   "short_hand 0x%x, dest 0x%x, trig_mode 0x%x, level 0x%x, "
 		   "dest_mode 0x%x, delivery_mode 0x%x, vector 0x%x\n",
-		   icr_high, icr_low, irq.shorthand, irq.dest,
+		   icr_high, icr_low, irq.shorthand, irq.dest_id,
 		   irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
 		   irq.vector);
 
@@ -560,7 +560,7 @@ static void update_divide_count(struct kvm_lapic *apic)
 	apic->divide_count = 0x1 << (tmp2 & 0x7);
 
 	apic_debug("timer divide count is 0x%x\n",
-				   apic->lapic_timer.divide_count);
+				   apic->divide_count);
 }
 
 static void start_apic_timer(struct kvm_lapic *apic)
-- 
GitLab


From b586eb0253083795e58dcbe76665410d4676dc08 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Tue, 28 Apr 2009 12:45:43 +0200
Subject: [PATCH 1809/2198] KVM: SVM: Fix cross vendor migration issue in
 segment segment descriptor

On AMD CPUs sometimes the DB bit in the stack segment
descriptor is left as 1, although the whole segment has
been made unusable. Clear it here to pass an Intel VMX
entry check when cross vendor migrating.

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 63503782935d6..e3ea98290f9bc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -803,6 +803,15 @@ static void svm_get_segment(struct kvm_vcpu *vcpu,
 		if (!var->unusable)
 			var->type |= 0x1;
 		break;
+	case VCPU_SREG_SS:
+		/* On AMD CPUs sometimes the DB bit in the segment
+		 * descriptor is left as 1, although the whole segment has
+		 * been made unusable. Clear it here to pass an Intel VMX
+		 * entry check when cross vendor migrating.
+		 */
+		if (var->unusable)
+			var->db = 0;
+		break;
 	}
 }
 
-- 
GitLab


From fe8e7f83de3f0979f13f57992cb33e4a8041272d Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Thu, 23 Apr 2009 17:03:48 +0300
Subject: [PATCH 1810/2198] KVM: SVM: Don't reinject event that caused a task
 switch

If a task switch caused by an event remove it from the event queue.
VMX already does that.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index e3ea98290f9bc..f994c6df78be5 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1811,6 +1811,10 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	int int_type = svm->vmcb->control.exit_int_info &
 		SVM_EXITINTINFO_TYPE_MASK;
 	int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK;
+	uint32_t type =
+		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK;
+	uint32_t idt_v =
+		svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID;
 
 	tss_selector = (u16)svm->vmcb->control.exit_info_1;
 
@@ -1820,11 +1824,26 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	else if (svm->vmcb->control.exit_info_2 &
 		 (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP))
 		reason = TASK_SWITCH_JMP;
-	else if (svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID)
+	else if (idt_v)
 		reason = TASK_SWITCH_GATE;
 	else
 		reason = TASK_SWITCH_CALL;
 
+	if (reason == TASK_SWITCH_GATE) {
+		switch (type) {
+		case SVM_EXITINTINFO_TYPE_NMI:
+			svm->vcpu.arch.nmi_injected = false;
+			break;
+		case SVM_EXITINTINFO_TYPE_EXEPT:
+			kvm_clear_exception_queue(&svm->vcpu);
+			break;
+		case SVM_EXITINTINFO_TYPE_INTR:
+			kvm_clear_interrupt_queue(&svm->vcpu);
+			break;
+		default:
+			break;
+		}
+	}
 
 	if (reason != TASK_SWITCH_GATE ||
 	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
@@ -2203,7 +2222,7 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 	if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
 	    exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
-	    exit_code != SVM_EXIT_NPF)
+	    exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH)
 		printk(KERN_ERR "%s: unexpected exit_ini_info 0x%x "
 		       "exit_code 0x%x\n",
 		       __func__, svm->vmcb->control.exit_int_info,
-- 
GitLab


From d6a8c875f35a6e1b3fb3f21e93eabb183b1f39ee Mon Sep 17 00:00:00 2001
From: Jan Kiszka <jan.kiszka@siemens.com>
Date: Mon, 20 Apr 2009 18:10:07 +0200
Subject: [PATCH 1811/2198] KVM: Drop request_nmi from stats

The stats entry request_nmi is no longer used as the related user space
interface was dropped. So clean it up.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 1 -
 arch/x86/kvm/x86.c              | 1 -
 2 files changed, 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 253d8f669cf68..ab7de4a119554 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -441,7 +441,6 @@ struct kvm_vcpu_stat {
 	u32 halt_exits;
 	u32 halt_wakeup;
 	u32 request_irq_exits;
-	u32 request_nmi_exits;
 	u32 irq_exits;
 	u32 host_state_reload;
 	u32 efer_reload;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dd056826f6754..a0faa4882f846 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -91,7 +91,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
 	{ "hypercalls", VCPU_STAT(hypercalls) },
 	{ "request_irq", VCPU_STAT(request_irq_exits) },
-	{ "request_nmi", VCPU_STAT(request_nmi_exits) },
 	{ "irq_exits", VCPU_STAT(irq_exits) },
 	{ "host_state_reload", VCPU_STAT(host_state_reload) },
 	{ "efer_reload", VCPU_STAT(efer_reload) },
-- 
GitLab


From 8d753f369bd28fff1706ffe9fb9fea4fd88cf85b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 11:41:39 +0300
Subject: [PATCH 1812/2198] KVM: Fix cpuid feature misreporting

MTRR, PAT, MCE, and MCA are all supported (to some extent) but not reported.
Vista requires these features, so if userspace relies on kernel cpuid
reporting, it loses support for Vista.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a0faa4882f846..da3e65bfc6ee9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1254,9 +1254,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
 		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
 		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
+		bit(X86_FEATURE_MCE) |
 		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_PGE) |
-		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
+		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_MTRR) |
+		bit(X86_FEATURE_PGE) | bit(X86_FEATURE_MCA) |
+		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PAT) |
+		bit(X86_FEATURE_PSE36) |
 		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
 		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
 		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
-- 
GitLab


From 7faa4ee1c777a5f8e8373430cfd9cb6172aa3503 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 13:55:35 +0300
Subject: [PATCH 1813/2198] KVM: Add AMD cpuid bit: cr8_legacy, abm, misaligned
 sse, sse4, 3dnow prefetch

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 71 +++++++++++++++++++++++++---------------------
 1 file changed, 38 insertions(+), 33 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index da3e65bfc6ee9..95de3d34b115a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1247,44 +1247,47 @@ static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	entry->flags = 0;
 }
 
+#define F(x) bit(X86_FEATURE_##x)
+
 static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			 u32 index, int *nent, int maxnent)
 {
-	const u32 kvm_supported_word0_x86_features = bit(X86_FEATURE_FPU) |
-		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-		bit(X86_FEATURE_MCE) |
-		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-		bit(X86_FEATURE_SEP) | bit(X86_FEATURE_MTRR) |
-		bit(X86_FEATURE_PGE) | bit(X86_FEATURE_MCA) |
-		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PAT) |
-		bit(X86_FEATURE_PSE36) |
-		bit(X86_FEATURE_CLFLSH) | bit(X86_FEATURE_MMX) |
-		bit(X86_FEATURE_FXSR) | bit(X86_FEATURE_XMM) |
-		bit(X86_FEATURE_XMM2) | bit(X86_FEATURE_SELFSNOOP);
-	const u32 kvm_supported_word1_x86_features = bit(X86_FEATURE_FPU) |
-		bit(X86_FEATURE_VME) | bit(X86_FEATURE_DE) |
-		bit(X86_FEATURE_PSE) | bit(X86_FEATURE_TSC) |
-		bit(X86_FEATURE_MSR) | bit(X86_FEATURE_PAE) |
-		bit(X86_FEATURE_CX8) | bit(X86_FEATURE_APIC) |
-		bit(X86_FEATURE_PGE) |
-		bit(X86_FEATURE_CMOV) | bit(X86_FEATURE_PSE36) |
-		bit(X86_FEATURE_MMX) | bit(X86_FEATURE_FXSR) |
-		bit(X86_FEATURE_SYSCALL) |
-		(is_efer_nx() ? bit(X86_FEATURE_NX) : 0) |
+	unsigned f_nx = is_efer_nx() ? F(NX) : 0;
 #ifdef CONFIG_X86_64
-		bit(X86_FEATURE_LM) |
+	unsigned f_lm = F(LM);
+#else
+	unsigned f_lm = 0;
 #endif
-		bit(X86_FEATURE_FXSR_OPT) |
-		bit(X86_FEATURE_MMXEXT) |
-		bit(X86_FEATURE_3DNOWEXT) |
-		bit(X86_FEATURE_3DNOW);
-	const u32 kvm_supported_word3_x86_features =
-		bit(X86_FEATURE_XMM3) | bit(X86_FEATURE_CX16);
+
+	/* cpuid 1.edx */
+	const u32 kvm_supported_word0_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SEP) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* PSN */ | F(CLFLSH) |
+		0 /* Reserved, DS, ACPI */ | F(MMX) |
+		F(FXSR) | F(XMM) | F(XMM2) | F(SELFSNOOP) |
+		0 /* HTT, TM, Reserved, PBE */;
+	/* cpuid 0x80000001.edx */
+	const u32 kvm_supported_word1_x86_features =
+		F(FPU) | F(VME) | F(DE) | F(PSE) |
+		F(TSC) | F(MSR) | F(PAE) | F(MCE) |
+		F(CX8) | F(APIC) | 0 /* Reserved */ | F(SYSCALL) |
+		F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
+		F(PAT) | F(PSE36) | 0 /* Reserved */ |
+		f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
+		F(FXSR) | F(FXSR_OPT) | 0 /* GBPAGES */ | 0 /* RDTSCP */ |
+		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
+	/* cpuid 1.ecx */
+	const u32 kvm_supported_word4_x86_features =
+		F(XMM3) | F(CX16);
+	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
-		bit(X86_FEATURE_LAHF_LM) | bit(X86_FEATURE_CMP_LEGACY) |
-		bit(X86_FEATURE_SVM);
+		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
+		F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
+		F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) |
+		0 /* SKINIT */ | 0 /* WDT */;
 
 	/* all calls to cpuid_count() should be made on the same cpu */
 	get_cpu();
@@ -1297,7 +1300,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		break;
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
-		entry->ecx &= kvm_supported_word3_x86_features;
+		entry->ecx &= kvm_supported_word4_x86_features;
 		break;
 	/* function 2 entries are STATEFUL. That is, repeated cpuid commands
 	 * may return different values. This forces us to get_cpu() before
@@ -1359,6 +1362,8 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 	put_cpu();
 }
 
+#undef F
+
 static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
 				     struct kvm_cpuid_entry2 __user *entries)
 {
-- 
GitLab


From 069ebaa4644521e8e80b6849ace4dee53f93f55e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 14:37:56 +0300
Subject: [PATCH 1814/2198] x86: Add cpu features MOVBE and POPCNT

Add cpu feature bit support for the MOVBE and POPCNT instructions.

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/cpufeature.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index bb83b1c397aad..9c63bf37ad533 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -115,6 +115,8 @@
 #define X86_FEATURE_XMM4_1	(4*32+19) /* "sse4_1" SSE-4.1 */
 #define X86_FEATURE_XMM4_2	(4*32+20) /* "sse4_2" SSE-4.2 */
 #define X86_FEATURE_X2APIC	(4*32+21) /* x2APIC */
+#define X86_FEATURE_MOVBE	(4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT      (4*32+23) /* POPCNT instruction */
 #define X86_FEATURE_AES		(4*32+25) /* AES instructions */
 #define X86_FEATURE_XSAVE	(4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
 #define X86_FEATURE_OSXSAVE	(4*32+27) /* "" XSAVE enabled in the OS */
-- 
GitLab


From d149c731e4f71982247a14409951259f36271dd7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 10 May 2009 14:41:56 +0300
Subject: [PATCH 1815/2198] KVM: Update cpuid 1.ecx reporting

Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 95de3d34b115a..75927700a26d5 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1281,7 +1281,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | F(CX16);
+		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+		0 /* DS-CPL, VMX, SMX, EST */ |
+		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
+		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
+		0 /* Reserved, DCA */ | F(XMM4_1) |
+		F(XMM4_2) | 0 /* x2APIC */ | F(MOVBE) | F(POPCNT) |
+		0 /* Reserved, XSAVE, OSXSAVE */;
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
-- 
GitLab


From 32f8840064d88cc3f6e85203aec7b6b57bebcb97 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 7 May 2009 17:55:12 -0300
Subject: [PATCH 1816/2198] KVM: use smp_send_reschedule in kvm_vcpu_kick

KVM uses a function call IPI to cause the exit of a guest running on a
physical cpu. For virtual interrupt notification there is no need to
wait on IPI receival, or to execute any function.

This is exactly what the reschedule IPI does, without the overhead
of function IPI. So use it instead of smp_call_function_single in
kvm_vcpu_kick.

Also change the "guest_mode" variable to a bit in vcpu->requests, and
use that to collapse multiple IPI's that would be issued between the
first one and zeroing of guest mode.

This allows kvm_vcpu_kick to called with interrupts disabled.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/ia64/kernel/irq_ia64.c |  3 +++
 arch/ia64/kvm/kvm-ia64.c    | 22 ++++++++--------------
 arch/x86/kernel/smp.c       |  3 +++
 arch/x86/kvm/x86.c          | 36 +++++++++++-------------------------
 include/linux/kvm_host.h    |  2 +-
 5 files changed, 26 insertions(+), 40 deletions(-)

diff --git a/arch/ia64/kernel/irq_ia64.c b/arch/ia64/kernel/irq_ia64.c
index acc4d19ae62ad..b448197728be5 100644
--- a/arch/ia64/kernel/irq_ia64.c
+++ b/arch/ia64/kernel/irq_ia64.c
@@ -610,6 +610,9 @@ static struct irqaction ipi_irqaction = {
 	.name =		"IPI"
 };
 
+/*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
 static struct irqaction resched_irqaction = {
 	.handler =	dummy_handler,
 	.flags =	IRQF_DISABLED,
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index be4413e1f43ff..80c57b0a21c47 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -668,7 +668,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	host_ctx = kvm_get_host_context(vcpu);
 	guest_ctx = kvm_get_guest_context(vcpu);
 
-	vcpu->guest_mode = 1;
+	clear_bit(KVM_REQ_KICK, &vcpu->requests);
 
 	r = kvm_vcpu_pre_transition(vcpu);
 	if (r < 0)
@@ -685,7 +685,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_vcpu_post_transition(vcpu);
 
 	vcpu->arch.launched = 1;
-	vcpu->guest_mode = 0;
+	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
 
 	/*
@@ -1879,24 +1879,18 @@ void kvm_arch_hardware_unsetup(void)
 {
 }
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG"vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-	int ipi_pcpu = vcpu->cpu;
-	int cpu = get_cpu();
+	int me;
+	int cpu = vcpu->cpu;
 
 	if (waitqueue_active(&vcpu->wq))
 		wake_up_interruptible(&vcpu->wq);
 
-	if (vcpu->guest_mode && cpu != ipi_pcpu)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+	me = get_cpu();
+	if (cpu != me && (unsigned) cpu < nr_cpu_ids && cpu_online(cpu))
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+			smp_send_reschedule(cpu);
 	put_cpu();
 }
 
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 13f33ea8ccaaf..3b2e55e8ad2b8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -172,6 +172,9 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
 {
 	ack_APIC_irq();
 	inc_irq_stat(irq_resched_count);
+	/*
+	 * KVM uses this interrupt to force a cpu out of guest mode
+	 */
 }
 
 void smp_call_function_interrupt(struct pt_regs *regs)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 75927700a26d5..3c4c327490af3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3230,6 +3230,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	local_irq_disable();
 
+	clear_bit(KVM_REQ_KICK, &vcpu->requests);
+	smp_mb__after_clear_bit();
+
 	if (vcpu->requests || need_resched() || signal_pending(current)) {
 		local_irq_enable();
 		preempt_enable();
@@ -3237,13 +3240,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		goto out;
 	}
 
-	vcpu->guest_mode = 1;
-	/*
-	 * Make sure that guest_mode assignment won't happen after
-	 * testing the pending IRQ vector bitmap.
-	 */
-	smp_wmb();
-
 	if (vcpu->arch.exception.pending)
 		__queue_exception(vcpu);
 	else
@@ -3288,7 +3284,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	set_debugreg(vcpu->arch.host_dr6, 6);
 	set_debugreg(vcpu->arch.host_dr7, 7);
 
-	vcpu->guest_mode = 0;
+	set_bit(KVM_REQ_KICK, &vcpu->requests);
 	local_irq_enable();
 
 	++vcpu->stat.exits;
@@ -4571,30 +4567,20 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 	       || vcpu->arch.nmi_pending;
 }
 
-static void vcpu_kick_intr(void *info)
-{
-#ifdef DEBUG
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)info;
-	printk(KERN_DEBUG "vcpu_kick_intr %p \n", vcpu);
-#endif
-}
-
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 {
-	int ipi_pcpu = vcpu->cpu;
-	int cpu;
+	int me;
+	int cpu = vcpu->cpu;
 
 	if (waitqueue_active(&vcpu->wq)) {
 		wake_up_interruptible(&vcpu->wq);
 		++vcpu->stat.halt_wakeup;
 	}
-	/*
-	 * We may be called synchronously with irqs disabled in guest mode,
-	 * So need not to call smp_call_function_single() in that case.
-	 */
-	cpu = get_cpu();
-	if (vcpu->guest_mode && vcpu->cpu != cpu)
-		smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0);
+
+	me = get_cpu();
+	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
+		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+			smp_send_reschedule(cpu);
 	put_cpu();
 }
 
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index bdce8e1303c92..161816284192e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -38,6 +38,7 @@
 #define KVM_REQ_UNHALT             6
 #define KVM_REQ_MMU_SYNC           7
 #define KVM_REQ_KVMCLOCK_UPDATE    8
+#define KVM_REQ_KICK               9
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID	0
 
@@ -72,7 +73,6 @@ struct kvm_vcpu {
 	struct mutex mutex;
 	int   cpu;
 	struct kvm_run *run;
-	int guest_mode;
 	unsigned long requests;
 	unsigned long guest_debug;
 	int fpu_active;
-- 
GitLab


From 547de29e5b1662deb05b5f90917902dc0e9ac182 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Thu, 7 May 2009 17:55:13 -0300
Subject: [PATCH 1817/2198] KVM: protect assigned dev workqueue, int handler
 and irq acker

kvm_assigned_dev_ack_irq is vulnerable to a race condition with the
interrupt handler function. It does:

        if (dev->host_irq_disabled) {
                enable_irq(dev->host_irq);
                dev->host_irq_disabled = false;
        }

If an interrupt triggers before the host->dev_irq_disabled assignment,
it will disable the interrupt and set dev->host_irq_disabled to true.

On return to kvm_assigned_dev_ack_irq, dev->host_irq_disabled is set to
false, and the next kvm_assigned_dev_ack_irq call will fail to reenable
it.

Other than that, having the interrupt handler and work handlers run in
parallel sounds like asking for trouble (could not spot any obvious
problem, but better not have to, its fragile).

CC: sheng.yang@intel.com
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 include/linux/kvm_host.h |  1 +
 virt/kvm/kvm_main.c      | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 161816284192e..aacc5449f5863 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -345,6 +345,7 @@ struct kvm_assigned_dev_kernel {
 	int flags;
 	struct pci_dev *dev;
 	struct kvm *kvm;
+	spinlock_t assigned_dev_lock;
 };
 
 struct kvm_irq_mask_notifier {
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 29c0afb064da1..687d113a3e5e8 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -42,6 +42,7 @@
 #include <linux/mman.h>
 #include <linux/swap.h>
 #include <linux/bitops.h>
+#include <linux/spinlock.h>
 
 #include <asm/processor.h>
 #include <asm/io.h>
@@ -130,6 +131,7 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 	 * finer-grained lock, update this
 	 */
 	mutex_lock(&kvm->lock);
+	spin_lock_irq(&assigned_dev->assigned_dev_lock);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		struct kvm_guest_msix_entry *guest_entries =
 			assigned_dev->guest_msix_entries;
@@ -156,18 +158,21 @@ static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 		}
 	}
 
+	spin_unlock_irq(&assigned_dev->assigned_dev_lock);
 	mutex_unlock(&assigned_dev->kvm->lock);
 }
 
 static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 {
+	unsigned long flags;
 	struct kvm_assigned_dev_kernel *assigned_dev =
 		(struct kvm_assigned_dev_kernel *) dev_id;
 
+	spin_lock_irqsave(&assigned_dev->assigned_dev_lock, flags);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
 		int index = find_index_from_host_irq(assigned_dev, irq);
 		if (index < 0)
-			return IRQ_HANDLED;
+			goto out;
 		assigned_dev->guest_msix_entries[index].flags |=
 			KVM_ASSIGNED_MSIX_PENDING;
 	}
@@ -177,6 +182,8 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 	disable_irq_nosync(irq);
 	assigned_dev->host_irq_disabled = true;
 
+out:
+	spin_unlock_irqrestore(&assigned_dev->assigned_dev_lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -184,6 +191,7 @@ static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id)
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_assigned_dev_kernel *dev;
+	unsigned long flags;
 
 	if (kian->gsi == -1)
 		return;
@@ -196,10 +204,12 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 	/* The guest irq may be shared so this ack may be
 	 * from another device.
 	 */
+	spin_lock_irqsave(&dev->assigned_dev_lock, flags);
 	if (dev->host_irq_disabled) {
 		enable_irq(dev->host_irq);
 		dev->host_irq_disabled = false;
 	}
+	spin_unlock_irqrestore(&dev->assigned_dev_lock, flags);
 }
 
 static void deassign_guest_irq(struct kvm *kvm,
@@ -615,6 +625,7 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm,
 	match->host_devfn = assigned_dev->devfn;
 	match->flags = assigned_dev->flags;
 	match->dev = dev;
+	spin_lock_init(&match->assigned_dev_lock);
 	match->irq_source_id = -1;
 	match->kvm = kvm;
 	match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq;
-- 
GitLab


From 2809f5d2c4cfad171167b131bb2a21ab65eba40f Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 12 May 2009 16:21:05 -0400
Subject: [PATCH 1818/2198] KVM: Replace ->drop_interrupt_shadow() by
 ->set_interrupt_shadow()

This patch replaces drop_interrupt_shadow with the more
general set_interrupt_shadow, that can either drop or raise
it, depending on its parameter.  It also adds ->get_interrupt_shadow()
for future use.

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h        |  3 +-
 arch/x86/include/asm/kvm_x86_emulate.h |  3 ++
 arch/x86/kvm/svm.c                     | 32 ++++++++++++-----
 arch/x86/kvm/vmx.c                     | 49 +++++++++++++++++---------
 arch/x86/kvm/x86.c                     |  2 +-
 5 files changed, 63 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index ab7de4a119554..16d1481aa231c 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -508,6 +508,8 @@ struct kvm_x86_ops {
 	void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
 	int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
 	void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
+	void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
+	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
 	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
@@ -519,7 +521,6 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-	void (*drop_interrupt_shadow)(struct kvm_vcpu *vcpu);
 	int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
 	int (*get_tdp_level)(void);
 	u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index 6a159732881a6..be40d6e2b6bb2 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -143,6 +143,9 @@ struct decode_cache {
 	struct fetch_cache fetch;
 };
 
+#define X86_SHADOW_INT_MOV_SS  1
+#define X86_SHADOW_INT_STI     2
+
 struct x86_emulate_ctxt {
 	/* Register state before/after emulation. */
 	struct kvm_vcpu *vcpu;
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f994c6df78be5..8b5ffbd55c11e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -202,6 +202,27 @@ static int is_external_interrupt(u32 info)
 	return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
 }
 
+static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	u32 ret = 0;
+
+	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
+		ret |= X86_SHADOW_INT_STI | X86_SHADOW_INT_MOV_SS;
+	return ret & mask;
+}
+
+static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	if (mask == 0)
+		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+	else
+		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
+
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -215,7 +236,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 		       __func__, kvm_rip_read(vcpu), svm->next_rip);
 
 	kvm_rip_write(vcpu, svm->next_rip);
-	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
+	svm_set_interrupt_shadow(vcpu, 0);
 }
 
 static int has_svm(void)
@@ -2259,12 +2280,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
 		new_asid(svm, svm_data);
 }
 
-static void svm_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-	svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
-}
-
 static void svm_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2667,6 +2682,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.set_interrupt_shadow = svm_set_interrupt_shadow,
+	.get_interrupt_shadow = svm_get_interrupt_shadow,
 	.patch_hypercall = svm_patch_hypercall,
 	.set_irq = svm_set_irq,
 	.set_nmi = svm_inject_nmi,
@@ -2676,7 +2693,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
-	.drop_interrupt_shadow = svm_drop_interrupt_shadow,
 
 	.set_tss_addr = svm_set_tss_addr,
 	.get_tdp_level = get_npt_level,
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e8a5649f9c159..f3ab27b5a6b2c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -736,23 +736,45 @@ static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
 
+static u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	u32 interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	int ret = 0;
+
+	if (interruptibility & GUEST_INTR_STATE_STI)
+		ret |= X86_SHADOW_INT_STI;
+	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
+		ret |= X86_SHADOW_INT_MOV_SS;
+
+	return ret & mask;
+}
+
+static void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
+{
+	u32 interruptibility_old = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
+	u32 interruptibility = interruptibility_old;
+
+	interruptibility &= ~(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
+
+	if (mask & X86_SHADOW_INT_MOV_SS)
+		interruptibility |= GUEST_INTR_STATE_MOV_SS;
+	if (mask & X86_SHADOW_INT_STI)
+		interruptibility |= GUEST_INTR_STATE_STI;
+
+	if ((interruptibility != interruptibility_old))
+		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, interruptibility);
+}
+
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
 	unsigned long rip;
-	u32 interruptibility;
 
 	rip = kvm_rip_read(vcpu);
 	rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
 	kvm_rip_write(vcpu, rip);
 
-	/*
-	 * We emulated an instruction, so temporary interrupt blocking
-	 * should be removed, if set.
-	 */
-	interruptibility = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
-	if (interruptibility & 3)
-		vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
-			     interruptibility & ~3);
+	/* skipping an emulated instruction also counts */
+	vmx_set_interrupt_shadow(vcpu, 0);
 }
 
 static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -2400,12 +2422,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	return ret;
 }
 
-void vmx_drop_interrupt_shadow(struct kvm_vcpu *vcpu)
-{
-	vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
-			GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS);
-}
-
 static void enable_irq_window(struct kvm_vcpu *vcpu)
 {
 	u32 cpu_based_vm_exec_control;
@@ -3649,6 +3665,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
 	.skip_emulated_instruction = skip_emulated_instruction,
+	.set_interrupt_shadow = vmx_set_interrupt_shadow,
+	.get_interrupt_shadow = vmx_get_interrupt_shadow,
 	.patch_hypercall = vmx_patch_hypercall,
 	.set_irq = vmx_inject_irq,
 	.set_nmi = vmx_inject_nmi,
@@ -3658,7 +3676,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.enable_nmi_window = enable_nmi_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
-	.drop_interrupt_shadow = vmx_drop_interrupt_shadow,
 
 	.set_tss_addr = vmx_set_tss_addr,
 	.get_tdp_level = get_ept_level,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4c327490af3..7475b029b2ad8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3178,7 +3178,7 @@ static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		kvm_run->request_interrupt_window;
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		kvm_x86_ops->drop_interrupt_shadow(vcpu);
+		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
 
 	inject_irq(vcpu);
 
-- 
GitLab


From 310b5d306c1aee7ebe32f702c0e33e7988d50646 Mon Sep 17 00:00:00 2001
From: Glauber Costa <glommer@redhat.com>
Date: Tue, 12 May 2009 16:21:06 -0400
Subject: [PATCH 1819/2198] KVM: Deal with interrupt shadow state for emulated
 instructions

We currently unblock shadow interrupt state when we skip an instruction,
but failing to do so when we actually emulate one. This blocks interrupts
in key instruction blocks, in particular sti; hlt; sequences

If the instruction emulated is an sti, we have to block shadow interrupts.
The same goes for mov ss. pop ss also needs it, but we don't currently
emulate it.

Without this patch, I cannot boot gpxe option roms at vmx machines.
This is described at https://bugzilla.redhat.com/show_bug.cgi?id=494469

Signed-off-by: Glauber Costa <glommer@redhat.com>
CC: H. Peter Anvin <hpa@zytor.com>
CC: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_x86_emulate.h |  3 +++
 arch/x86/kvm/x86.c                     |  6 +++++-
 arch/x86/kvm/x86_emulate.c             | 20 ++++++++++++++++++++
 3 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
index be40d6e2b6bb2..b7ed2c423116b 100644
--- a/arch/x86/include/asm/kvm_x86_emulate.h
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -155,6 +155,9 @@ struct x86_emulate_ctxt {
 	int mode;
 	u32 cs_base;
 
+	/* interruptibility state, as a result of execution of STI or MOV SS */
+	int interruptibility;
+
 	/* decode cache */
 	struct decode_cache decode;
 };
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7475b029b2ad8..48f744ff0bc1a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2379,7 +2379,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			u16 error_code,
 			int emulation_type)
 {
-	int r;
+	int r, shadow_mask;
 	struct decode_cache *c;
 
 	kvm_clear_exception_queue(vcpu);
@@ -2433,6 +2433,10 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	}
 
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
+	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
+
+	if (r == 0)
+		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
 
 	if (vcpu->arch.pio.string)
 		return EMULATE_DO_MMIO;
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index d2664fcba7faf..c1b6c232e02b1 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -1361,6 +1361,20 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
 	return 0;
 }
 
+void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
+{
+	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
+	/*
+	 * an sti; sti; sequence only disable interrupts for the first
+	 * instruction. So, if the last instruction, be it emulated or
+	 * not, left the system with the INT_STI flag enabled, it
+	 * means that the last instruction is an sti. We should not
+	 * leave the flag on in this case. The same goes for mov ss
+	 */
+	if (!(int_shadow & mask))
+		ctxt->interruptibility = mask;
+}
+
 int
 x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
@@ -1372,6 +1386,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int io_dir_in;
 	int rc = 0;
 
+	ctxt->interruptibility = 0;
+
 	/* Shadow copy of register state. Committed on successful emulation.
 	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
 	 * modify them.
@@ -1618,6 +1634,9 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		int err;
 
 		sel = c->src.val;
+		if (c->modrm_reg == VCPU_SREG_SS)
+			toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
+
 		if (c->modrm_reg <= 5) {
 			type_bits = (c->modrm_reg == 1) ? 9 : 1;
 			err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
@@ -1847,6 +1866,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 	case 0xfb: /* sti */
+		toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
 		ctxt->eflags |= X86_EFLAGS_IF;
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
-- 
GitLab


From 7c8a83b75a38a807d37f5a4398eca2a42c8cf513 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 May 2009 18:55:43 -0300
Subject: [PATCH 1820/2198] KVM: MMU: protect kvm_mmu_change_mmu_pages with
 mmu_lock

kvm_handle_hva, called by MMU notifiers, manipulates mmu data only with
the protection of mmu_lock.

Update kvm_mmu_change_mmu_pages callers to take mmu_lock, thus protecting
against kvm_handle_hva.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 --
 arch/x86/kvm/x86.c | 6 ++++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bc614f91f5ba8..3ce60ad1fe379 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -2723,7 +2723,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 {
 	struct kvm_mmu_page *sp;
 
-	spin_lock(&kvm->mmu_lock);
 	list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
 		int i;
 		u64 *pt;
@@ -2738,7 +2737,6 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
 	kvm_flush_remote_tlbs(kvm);
-	spin_unlock(&kvm->mmu_lock);
 }
 
 void kvm_mmu_zap_all(struct kvm *kvm)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 48f744ff0bc1a..d2a4eca261811 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1625,10 +1625,12 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
 		return -EINVAL;
 
 	down_write(&kvm->slots_lock);
+	spin_lock(&kvm->mmu_lock);
 
 	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
 	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
 
+	spin_unlock(&kvm->mmu_lock);
 	up_write(&kvm->slots_lock);
 	return 0;
 }
@@ -1804,7 +1806,9 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
+		spin_lock(&kvm->mmu_lock);
 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
+		spin_unlock(&kvm->mmu_lock);
 		kvm_flush_remote_tlbs(kvm);
 		memslot = &kvm->memslots[log->slot];
 		n = ALIGN(memslot->npages, BITS_PER_LONG) / 8;
@@ -4548,12 +4552,14 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 		}
 	}
 
+	spin_lock(&kvm->mmu_lock);
 	if (!kvm->arch.n_requested_mmu_pages) {
 		unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
 		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
 	}
 
 	kvm_mmu_slot_remove_write_access(kvm, mem->slot);
+	spin_unlock(&kvm->mmu_lock);
 	kvm_flush_remote_tlbs(kvm);
 
 	return 0;
-- 
GitLab


From b43b1901ad282aeb74161837fb403927102687a1 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 May 2009 18:55:44 -0300
Subject: [PATCH 1821/2198] KVM: take mmu_lock when updating a deleted slot

kvm_handle_hva relies on mmu_lock protection to safely access
the memslot structures.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 687d113a3e5e8..5fed9bfc3cf5b 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1210,8 +1210,10 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	kvm_free_physmem_slot(&old, npages ? &new : NULL);
 	/* Slot deletion case: we have to update the current slot */
+	spin_lock(&kvm->mmu_lock);
 	if (!npages)
 		*memslot = old;
+	spin_unlock(&kvm->mmu_lock);
 #ifdef CONFIG_DMAR
 	/* map the pages in iommu page table */
 	r = kvm_iommu_map_pages(kvm, base_gfn, npages);
-- 
GitLab


From 8986ecc0ef58c96eec48d8502c048f3ab67fd8e2 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Tue, 12 May 2009 18:55:45 -0300
Subject: [PATCH 1822/2198] KVM: x86: check for cr3 validity in mmu_alloc_roots

Verify the cr3 address stored in vcpu->arch.cr3 points to an existant
memslot. If not, inject a triple fault.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 27 +++++++++++++++++++++++----
 arch/x86/kvm/x86.c |  1 +
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3ce60ad1fe379..5c3d6e81a7dc6 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1912,7 +1912,19 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 
-static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
+static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
+{
+	int ret = 0;
+
+	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
+		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		ret = 1;
+	}
+
+	return ret;
+}
+
+static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	gfn_t root_gfn;
@@ -1927,13 +1939,15 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		ASSERT(!VALID_PAGE(root));
 		if (tdp_enabled)
 			direct = 1;
+		if (mmu_check_root(vcpu, root_gfn))
+			return 1;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 		root = __pa(sp->spt);
 		++sp->root_count;
 		vcpu->arch.mmu.root_hpa = root;
-		return;
+		return 0;
 	}
 	direct = !is_paging(vcpu);
 	if (tdp_enabled)
@@ -1950,6 +1964,8 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = vcpu->arch.pdptrs[i] >> PAGE_SHIFT;
 		} else if (vcpu->arch.mmu.root_level == 0)
 			root_gfn = 0;
+		if (mmu_check_root(vcpu, root_gfn))
+			return 1;
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
@@ -1958,6 +1974,7 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
 	}
 	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
+	return 0;
 }
 
 static void mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -1976,7 +1993,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
-		if (root) {
+		if (root && VALID_PAGE(root)) {
 			root &= PT64_BASE_ADDR_MASK;
 			sp = page_header(root);
 			mmu_sync_children(vcpu, sp);
@@ -2311,9 +2328,11 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 		goto out;
 	spin_lock(&vcpu->kvm->mmu_lock);
 	kvm_mmu_free_some_pages(vcpu);
-	mmu_alloc_roots(vcpu);
+	r = mmu_alloc_roots(vcpu);
 	mmu_sync_roots(vcpu);
 	spin_unlock(&vcpu->kvm->mmu_lock);
+	if (r)
+		goto out;
 	kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
 	kvm_mmu_flush_tlb(vcpu);
 out:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d2a4eca261811..3244437e67b35 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4568,6 +4568,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 void kvm_arch_flush_shadow(struct kvm *kvm)
 {
 	kvm_mmu_zap_all(kvm);
+	kvm_reload_remote_mmus(kvm);
 }
 
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
-- 
GitLab


From 58f8ac279a8c46eb0a3193edd521ed2e41c4f914 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Tue, 12 May 2009 13:44:06 -0700
Subject: [PATCH 1823/2198] KVM: Expand on "help" info to specify kvm intel and
 amd module names

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Cc: Avi Kivity <avi@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/Kconfig | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index a58504ea78ccb..8600a09e0c6c5 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -50,6 +50,9 @@ config KVM_INTEL
 	  Provides support for KVM on Intel processors equipped with the VT
 	  extensions.
 
+	  To compile this as a module, choose M here: the module
+	  will be called kvm-intel.
+
 config KVM_AMD
 	tristate "KVM for AMD processors support"
 	depends on KVM
@@ -57,6 +60,9 @@ config KVM_AMD
 	  Provides support for KVM on AMD processors equipped with the AMD-V
 	  (SVM) extensions.
 
+	  To compile this as a module, choose M here: the module
+	  will be called kvm-amd.
+
 config KVM_TRACE
 	bool "KVM trace support"
 	depends on KVM && SYSFS
-- 
GitLab


From 2668dab794272f0898491acaf1e77e9a005abc0f Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:48 +0200
Subject: [PATCH 1824/2198] KVM: s390: Fix memory slot versus run - v3

This patch fixes an incorrectness in the kvm backend for s390.
In case virtual cpus are being created before the corresponding
memory slot is being registered, we need to update the sie
control blocks for the virtual cpus.

*updates in v3*
In consideration of the s390 memslot constraints locking was changed
to trylock. These locks should never be held, as vcpu's can't run without
the single memslot we just assign when running this code. To ensure this
never deadlocks in case other code changes the code uses trylocks and bail
out if it can't get all locks.

Additionally most of the discussed special conditions for s390 like
only one memslot and no user_alloc are now checked for validity in
kvm_arch_set_memory_region.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 36 +++++++++++++++++++++++++++++++-----
 1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f4d56e9939c90..86567e174fd79 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -657,6 +657,8 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 				struct kvm_memory_slot old,
 				int user_alloc)
 {
+	int i;
+
 	/* A few sanity checks. We can have exactly one memory slot which has
 	   to start at guest virtual zero and which has to be located at a
 	   page boundary in userland and which has to end at a page boundary.
@@ -664,7 +666,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	   vmas. It is okay to mmap() and munmap() stuff in this slot after
 	   doing this call at any time */
 
-	if (mem->slot)
+	if (mem->slot || kvm->arch.guest_memsize)
 		return -EINVAL;
 
 	if (mem->guest_phys_addr)
@@ -676,15 +678,39 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
 	if (mem->memory_size & (PAGE_SIZE - 1))
 		return -EINVAL;
 
+	if (!user_alloc)
+		return -EINVAL;
+
+	/* lock all vcpus */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (!kvm->vcpus[i])
+			continue;
+		if (!mutex_trylock(&kvm->vcpus[i]->mutex))
+			goto fail_out;
+	}
+
 	kvm->arch.guest_origin = mem->userspace_addr;
 	kvm->arch.guest_memsize = mem->memory_size;
 
-	/* FIXME: we do want to interrupt running CPUs and update their memory
-	   configuration now to avoid race conditions. But hey, changing the
-	   memory layout while virtual CPUs are running is usually bad
-	   programming practice. */
+	/* update sie control blocks, and unlock all vcpus */
+	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
+		if (kvm->vcpus[i]) {
+			kvm->vcpus[i]->arch.sie_block->gmsor =
+				kvm->arch.guest_origin;
+			kvm->vcpus[i]->arch.sie_block->gmslm =
+				kvm->arch.guest_memsize +
+				kvm->arch.guest_origin +
+				VIRTIODESCSPACE - 1ul;
+			mutex_unlock(&kvm->vcpus[i]->mutex);
+		}
+	}
 
 	return 0;
+
+fail_out:
+	for (; i >= 0; i--)
+		mutex_unlock(&kvm->vcpus[i]->mutex);
+	return -EINVAL;
 }
 
 void kvm_arch_flush_shadow(struct kvm *kvm)
-- 
GitLab


From ca8723023f25c9a70d76cbd6101f8fb4ffec2fa0 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 12 May 2009 17:21:49 +0200
Subject: [PATCH 1825/2198] KVM: s390: use hrtimer for clock wakeup from idle -
 v2

This patch reworks the s390 clock comparator wakeup to hrtimer. The clock
comparator is a per-cpu value that is compared against the TOD clock. If
ckc <= TOD an external interrupt 1004 is triggered. Since the clock comparator
and the TOD clock have a much higher resolution than jiffies we should use
hrtimers to trigger the wakeup. This speeds up guest nanosleep for small
values.

Since hrtimers callbacks run in hard-irq context, I added a tasklet to do
the actual work with enabled interrupts.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/include/asm/kvm_host.h |  5 ++++-
 arch/s390/kvm/interrupt.c        | 33 ++++++++++++++++++++++----------
 arch/s390/kvm/kvm-s390.c         |  7 +++++--
 arch/s390/kvm/kvm-s390.h         |  4 +++-
 4 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 54ea39f96ecdf..a27d0d5a6f868 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -13,6 +13,8 @@
 
 #ifndef ASM_KVM_HOST_H
 #define ASM_KVM_HOST_H
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <asm/debug.h>
 #include <asm/cpuid.h>
@@ -210,7 +212,8 @@ struct kvm_vcpu_arch {
 	s390_fp_regs      guest_fpregs;
 	unsigned int      guest_acrs[NUM_ACRS];
 	struct kvm_s390_local_interrupt local_int;
-	struct timer_list ckc_timer;
+	struct hrtimer    ckc_timer;
+	struct tasklet_struct tasklet;
 	union  {
 		cpuid_t	  cpu_id;
 		u64	  stidp_data;
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 4ed4c3a11485f..a48830fa9c591 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -12,6 +12,8 @@
 
 #include <asm/lowcore.h>
 #include <asm/uaccess.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
 #include <linux/kvm_host.h>
 #include <linux/signal.h>
 #include "kvm-s390.h"
@@ -361,12 +363,10 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 
-	sltime = (vcpu->arch.sie_block->ckc - now) / (0xf4240000ul / HZ) + 1;
+	sltime = ((vcpu->arch.sie_block->ckc - now)*125)>>9;
 
-	vcpu->arch.ckc_timer.expires = jiffies + sltime;
-
-	add_timer(&vcpu->arch.ckc_timer);
-	VCPU_EVENT(vcpu, 5, "enabled wait timer:%llx jiffies", sltime);
+	hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
+	VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
 no_timer:
 	spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
 	spin_lock_bh(&vcpu->arch.local_int.lock);
@@ -389,21 +389,34 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 	remove_wait_queue(&vcpu->wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
 	spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
-	del_timer(&vcpu->arch.ckc_timer);
+	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
 	return 0;
 }
 
-void kvm_s390_idle_wakeup(unsigned long data)
+void kvm_s390_tasklet(unsigned long parm)
 {
-	struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
+	struct kvm_vcpu *vcpu = (struct kvm_vcpu *) parm;
 
-	spin_lock_bh(&vcpu->arch.local_int.lock);
+	spin_lock(&vcpu->arch.local_int.lock);
 	vcpu->arch.local_int.timer_due = 1;
 	if (waitqueue_active(&vcpu->arch.local_int.wq))
 		wake_up_interruptible(&vcpu->arch.local_int.wq);
-	spin_unlock_bh(&vcpu->arch.local_int.lock);
+	spin_unlock(&vcpu->arch.local_int.lock);
 }
 
+/*
+ * low level hrtimer wake routine. Because this runs in hardirq context
+ * we schedule a tasklet to do the real work.
+ */
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
+{
+	struct kvm_vcpu *vcpu;
+
+	vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
+	tasklet_schedule(&vcpu->arch.tasklet);
+
+	return HRTIMER_NORESTART;
+}
 
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 86567e174fd79..dc3d06811fd83 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/err.h>
 #include <linux/fs.h>
+#include <linux/hrtimer.h>
 #include <linux/init.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
@@ -283,8 +284,10 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->gmsor = vcpu->kvm->arch.guest_origin;
 	vcpu->arch.sie_block->ecb   = 2;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
-	setup_timer(&vcpu->arch.ckc_timer, kvm_s390_idle_wakeup,
-		 (unsigned long) vcpu);
+	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+	tasklet_init(&vcpu->arch.tasklet, kvm_s390_tasklet,
+		     (unsigned long) vcpu);
+	vcpu->arch.ckc_timer.function = kvm_s390_idle_wakeup;
 	get_cpu_id(&vcpu->arch.cpu_id);
 	vcpu->arch.cpu_id.version = 0xff;
 	return 0;
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 00bbe69b78da9..748fee872323f 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -14,6 +14,7 @@
 #ifndef ARCH_S390_KVM_S390_H
 #define ARCH_S390_KVM_S390_H
 
+#include <linux/hrtimer.h>
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 
@@ -41,7 +42,8 @@ static inline int __cpu_is_stopped(struct kvm_vcpu *vcpu)
 }
 
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
-void kvm_s390_idle_wakeup(unsigned long data);
+enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
+void kvm_s390_tasklet(unsigned long parm);
 void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu);
 int kvm_s390_inject_vm(struct kvm *kvm,
 		struct kvm_s390_interrupt *s390int);
-- 
GitLab


From b037a4f34ec51b6c8ccb352a04056c04a4bfc269 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Tue, 12 May 2009 17:21:50 +0200
Subject: [PATCH 1826/2198] KVM: s390: optimize float int lock: spin_lock_bh
 --> spin_lock

The floating interrupt lock is only taken in process context. We can
replace all spin_lock_bh with standard spin_lock calls.

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/interrupt.c | 20 ++++++++++----------
 arch/s390/kvm/kvm-s390.c  |  4 ++--
 arch/s390/kvm/priv.c      |  4 ++--
 arch/s390/kvm/sigp.c      | 16 ++++++++--------
 4 files changed, 22 insertions(+), 22 deletions(-)

diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index a48830fa9c591..f04f5301b1b42 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -301,13 +301,13 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu)
 	}
 
 	if ((!rc) && atomic_read(&fi->active)) {
-		spin_lock_bh(&fi->lock);
+		spin_lock(&fi->lock);
 		list_for_each_entry(inti, &fi->list, list)
 			if (__interrupt_is_deliverable(vcpu, inti)) {
 				rc = 1;
 				break;
 			}
-		spin_unlock_bh(&fi->lock);
+		spin_unlock(&fi->lock);
 	}
 
 	if ((!rc) && (vcpu->arch.sie_block->ckc <
@@ -368,7 +368,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 	hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
 	VCPU_EVENT(vcpu, 5, "enabled wait via clock comparator: %llx ns", sltime);
 no_timer:
-	spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+	spin_lock(&vcpu->arch.local_int.float_int->lock);
 	spin_lock_bh(&vcpu->arch.local_int.lock);
 	add_wait_queue(&vcpu->arch.local_int.wq, &wait);
 	while (list_empty(&vcpu->arch.local_int.list) &&
@@ -377,18 +377,18 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 		!signal_pending(current)) {
 		set_current_state(TASK_INTERRUPTIBLE);
 		spin_unlock_bh(&vcpu->arch.local_int.lock);
-		spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+		spin_unlock(&vcpu->arch.local_int.float_int->lock);
 		vcpu_put(vcpu);
 		schedule();
 		vcpu_load(vcpu);
-		spin_lock_bh(&vcpu->arch.local_int.float_int->lock);
+		spin_lock(&vcpu->arch.local_int.float_int->lock);
 		spin_lock_bh(&vcpu->arch.local_int.lock);
 	}
 	__unset_cpu_idle(vcpu);
 	__set_current_state(TASK_RUNNING);
 	remove_wait_queue(&vcpu->wq, &wait);
 	spin_unlock_bh(&vcpu->arch.local_int.lock);
-	spin_unlock_bh(&vcpu->arch.local_int.float_int->lock);
+	spin_unlock(&vcpu->arch.local_int.float_int->lock);
 	hrtimer_try_to_cancel(&vcpu->arch.ckc_timer);
 	return 0;
 }
@@ -455,7 +455,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 	if (atomic_read(&fi->active)) {
 		do {
 			deliver = 0;
-			spin_lock_bh(&fi->lock);
+			spin_lock(&fi->lock);
 			list_for_each_entry_safe(inti, n, &fi->list, list) {
 				if (__interrupt_is_deliverable(vcpu, inti)) {
 					list_del(&inti->list);
@@ -466,7 +466,7 @@ void kvm_s390_deliver_pending_interrupts(struct kvm_vcpu *vcpu)
 			}
 			if (list_empty(&fi->list))
 				atomic_set(&fi->active, 0);
-			spin_unlock_bh(&fi->lock);
+			spin_unlock(&fi->lock);
 			if (deliver) {
 				__do_deliver_interrupt(vcpu, inti);
 				kfree(inti);
@@ -531,7 +531,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 
 	mutex_lock(&kvm->lock);
 	fi = &kvm->arch.float_int;
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	list_add_tail(&inti->list, &fi->list);
 	atomic_set(&fi->active, 1);
 	sigcpu = find_first_bit(fi->idle_mask, KVM_MAX_VCPUS);
@@ -548,7 +548,7 @@ int kvm_s390_inject_vm(struct kvm *kvm,
 	if (waitqueue_active(&li->wq))
 		wake_up_interruptible(&li->wq);
 	spin_unlock_bh(&li->lock);
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	mutex_unlock(&kvm->lock);
 	return 0;
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index dc3d06811fd83..36c654d2d64a2 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -318,11 +318,11 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
 	vcpu->arch.local_int.float_int = &kvm->arch.float_int;
-	spin_lock_bh(&kvm->arch.float_int.lock);
+	spin_lock(&kvm->arch.float_int.lock);
 	kvm->arch.float_int.local_int[id] = &vcpu->arch.local_int;
 	init_waitqueue_head(&vcpu->arch.local_int.wq);
 	vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
-	spin_unlock_bh(&kvm->arch.float_int.lock);
+	spin_unlock(&kvm->arch.float_int.lock);
 
 	rc = kvm_vcpu_init(vcpu, kvm, id);
 	if (rc)
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 4b88834b8dd8b..93ecd06e1a749 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -204,11 +204,11 @@ static void handle_stsi_3_2_2(struct kvm_vcpu *vcpu, struct sysinfo_3_2_2 *mem)
 	int cpus = 0;
 	int n;
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	for (n = 0; n < KVM_MAX_VCPUS; n++)
 		if (fi->local_int[n])
 			cpus++;
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 
 	/* deal with other level 3 hypervisors */
 	if (stsi(mem, 3, 2, 2) == -ENOSYS)
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index f27dbedf08660..36678835034d9 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -52,7 +52,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 	if (cpu_addr >= KVM_MAX_VCPUS)
 		return 3; /* not operational */
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	if (fi->local_int[cpu_addr] == NULL)
 		rc = 3; /* not operational */
 	else if (atomic_read(fi->local_int[cpu_addr]->cpuflags)
@@ -64,7 +64,7 @@ static int __sigp_sense(struct kvm_vcpu *vcpu, u16 cpu_addr,
 		*reg |= SIGP_STAT_STOPPED;
 		rc = 1; /* status stored */
 	}
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 
 	VCPU_EVENT(vcpu, 4, "sensed status of cpu %x rc %x", cpu_addr, rc);
 	return rc;
@@ -86,7 +86,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 
 	inti->type = KVM_S390_INT_EMERGENCY;
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if (li == NULL) {
 		rc = 3; /* not operational */
@@ -102,7 +102,7 @@ static int __sigp_emergency(struct kvm_vcpu *vcpu, u16 cpu_addr)
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
 unlock:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sent sigp emerg to cpu %x", cpu_addr);
 	return rc;
 }
@@ -123,7 +123,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
 
 	inti->type = KVM_S390_SIGP_STOP;
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 	if (li == NULL) {
 		rc = 3; /* not operational */
@@ -142,7 +142,7 @@ static int __sigp_stop(struct kvm_vcpu *vcpu, u16 cpu_addr, int store)
 	spin_unlock_bh(&li->lock);
 	rc = 0; /* order accepted */
 unlock:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	VCPU_EVENT(vcpu, 4, "sent sigp stop to cpu %x", cpu_addr);
 	return rc;
 }
@@ -188,7 +188,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 	if (!inti)
 		return 2; /* busy */
 
-	spin_lock_bh(&fi->lock);
+	spin_lock(&fi->lock);
 	li = fi->local_int[cpu_addr];
 
 	if ((cpu_addr >= KVM_MAX_VCPUS) || (li == NULL)) {
@@ -220,7 +220,7 @@ static int __sigp_set_prefix(struct kvm_vcpu *vcpu, u16 cpu_addr, u32 address,
 out_li:
 	spin_unlock_bh(&li->lock);
 out_fi:
-	spin_unlock_bh(&fi->lock);
+	spin_unlock(&fi->lock);
 	return rc;
 }
 
-- 
GitLab


From abf4a71ed95ff29d696bf04633958b2068ed2e0b Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:51 +0200
Subject: [PATCH 1827/2198] KVM: s390: Unlink vcpu on destroy - v2

This patch makes sure we do unlink a vcpu's sie control block
from the system control area in kvm_arch_vcpu_destroy. This
prevents illegal accesses to the sie control block from other
virtual cpus after free.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 36c654d2d64a2..628494a434251 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -196,6 +196,10 @@ struct kvm *kvm_arch_create_vm(void)
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
+		(__u64) vcpu->arch.sie_block)
+		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
+	smp_mb();
 	free_page((unsigned long)(vcpu->arch.sie_block));
 	kvm_vcpu_uninit(vcpu);
 	kfree(vcpu);
@@ -310,8 +314,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 
 	vcpu->arch.sie_block->icpua = id;
 	BUG_ON(!kvm->arch.sca);
-	BUG_ON(kvm->arch.sca->cpu[id].sda);
-	kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+	if (!kvm->arch.sca->cpu[id].sda)
+		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
+	else
+		BUG_ON(!kvm->vcpus[id]); /* vcpu does already exist */
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
 
-- 
GitLab


From 3edbcff9bfe2ed632e518b3cfe807d062cee8269 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:52 +0200
Subject: [PATCH 1828/2198] KVM: s390: Sanity check on validity intercept

This patch adds a sanity check for the content of the guest
prefix register content before faulting in the cpu lowcore
that it refers to. The guest might end up in an endless loop
where SIE complains about missing lowcore with incorrect
content of the prefix register without this fix.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/intercept.c | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 9d19803111bab..98997ccba5015 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -154,17 +154,25 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 static int handle_validity(struct kvm_vcpu *vcpu)
 {
 	int viwhy = vcpu->arch.sie_block->ipb >> 16;
+	int rc;
+
 	vcpu->stat.exit_validity++;
-	if (viwhy == 0x37) {
-		fault_in_pages_writeable((char __user *)
-					 vcpu->kvm->arch.guest_origin +
-					 vcpu->arch.sie_block->prefix,
-					 PAGE_SIZE);
-		return 0;
-	}
-	VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
-		   viwhy);
-	return -ENOTSUPP;
+	if ((viwhy == 0x37) && (vcpu->arch.sie_block->prefix
+		<= vcpu->kvm->arch.guest_memsize - 2*PAGE_SIZE)){
+		rc = fault_in_pages_writeable((char __user *)
+			 vcpu->kvm->arch.guest_origin +
+			 vcpu->arch.sie_block->prefix,
+			 2*PAGE_SIZE);
+		if (rc)
+			/* user will receive sigsegv, exit to user */
+			rc = -ENOTSUPP;
+	} else
+		rc = -ENOTSUPP;
+
+	if (rc)
+		VCPU_EVENT(vcpu, 2, "unhandled validity intercept code %d",
+			   viwhy);
+	return rc;
 }
 
 static int handle_instruction(struct kvm_vcpu *vcpu)
-- 
GitLab


From 51e4d5ab28c75d819b3840da11559ad5c1429135 Mon Sep 17 00:00:00 2001
From: Carsten Otte <cotte@de.ibm.com>
Date: Tue, 12 May 2009 17:21:53 +0200
Subject: [PATCH 1829/2198] KVM: s390: Verify memory in kvm run

This check verifies that the guest we're trying to run in KVM_RUN
has some memory assigned to it. It enters an endless exception
loop if this is not the case.

Reported-by: Mijo Safradin <mijo@linux.vnet.ibm.com>
Signed-off-by: Carsten Otte <cotte@de.ibm.com>
Signed-off-by: Christian Ehrhardt <ehrhardt@de.ibm.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 628494a434251..10bccd1f8aee5 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -487,6 +487,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 
 	vcpu_load(vcpu);
 
+	/* verify, that memory has been registered */
+	if (!vcpu->kvm->arch.guest_memsize) {
+		vcpu_put(vcpu);
+		return -EINVAL;
+	}
+
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
-- 
GitLab


From 3298b75c880d6f0fd70750233c0f3e71a72a5bfb Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:46 +0300
Subject: [PATCH 1830/2198] KVM: Unprotect a page if #PF happens during NMI
 injection.

It is done for exception and interrupt already.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 3 +--
 arch/x86/kvm/vmx.c | 2 +-
 arch/x86/kvm/x86.h | 6 ++++++
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8b5ffbd55c11e..ac3d5ba48d612 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1122,8 +1122,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 	if (npt_enabled)
 		svm_flush_tlb(&svm->vcpu);
 	else {
-		if (svm->vcpu.arch.interrupt.pending ||
-				svm->vcpu.arch.exception.pending)
+		if (kvm_event_needs_reinjection(&svm->vcpu))
 			kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
 	}
 	return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f3ab27b5a6b2c..8981654ad061b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2615,7 +2615,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		cr2 = vmcs_readl(EXIT_QUALIFICATION);
 		KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
 			    (u32)((u64)cr2 >> 32), handler);
-		if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
+		if (kvm_event_needs_reinjection(vcpu))
 			kvm_mmu_unprotect_page_virt(vcpu, cr2);
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 39350b2527258..21203d4212763 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -30,4 +30,10 @@ static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
 		clear_bit(word_index, &vcpu->arch.irq_summary);
 	return irq;
 }
+
+static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
+		vcpu->arch.nmi_injected;
+}
 #endif
-- 
GitLab


From fa9726b0733461781933ab7180aca45e46d0a891 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:47 +0300
Subject: [PATCH 1831/2198] KVM: Do not allow interrupt injection from
 userspace if there is a pending event.

The exception will immediately close the interrupt window.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3244437e67b35..96413f4e33bab 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3105,8 +3105,9 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu,
 		kvm_run->ready_for_interrupt_injection = 1;
 	else
 		kvm_run->ready_for_interrupt_injection =
-					(kvm_arch_interrupt_allowed(vcpu) &&
-					 !kvm_cpu_has_interrupt(vcpu));
+			kvm_arch_interrupt_allowed(vcpu) &&
+			!kvm_cpu_has_interrupt(vcpu) &&
+			!kvm_event_needs_reinjection(vcpu);
 }
 
 static void vapic_enter(struct kvm_vcpu *vcpu)
-- 
GitLab


From 923c61bbc6413e87e5f6b0bae663d202a8de0537 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:48 +0300
Subject: [PATCH 1832/2198] KVM: Remove irq_pending bitmap

Only one interrupt vector can be injected from userspace irqchip at
any given time so no need to store it in a bitmap. Put it into interrupt
queue directly.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 --
 arch/x86/kvm/irq.c              |  4 ++--
 arch/x86/kvm/x86.c              | 38 ++++++++++-----------------------
 arch/x86/kvm/x86.h              | 12 -----------
 4 files changed, 13 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 16d1481aa231c..977a785a9d75d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -266,8 +266,6 @@ struct kvm_mmu {
 
 struct kvm_vcpu_arch {
 	u64 host_tsc;
-	unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
-	DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
 	/*
 	 * rip and regs accesses must go through
 	 * kvm_{register,rip}_{read,write} functions.
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 11c2757b808f9..96dfbb6ad2a9d 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -50,7 +50,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	struct kvm_pic *s;
 
 	if (!irqchip_in_kernel(v->kvm))
-		return v->arch.irq_summary;
+		return v->arch.interrupt.pending;
 
 	if (kvm_apic_has_interrupt(v) == -1) {	/* LAPIC */
 		if (kvm_apic_accept_pic_intr(v)) {
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	int vector;
 
 	if (!irqchip_in_kernel(v->kvm))
-		return kvm_pop_irq(v);
+		return v->arch.interrupt.nr;
 
 	vector = kvm_get_apic_interrupt(v);	/* APIC */
 	if (vector == -1) {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 96413f4e33bab..54eec3565485b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1441,8 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 	vcpu_load(vcpu);
 
-	set_bit(irq->irq, vcpu->arch.irq_pending);
-	set_bit(irq->irq / BITS_PER_LONG, &vcpu->arch.irq_summary);
+	kvm_queue_interrupt(vcpu, irq->irq);
 
 	vcpu_put(vcpu);
 
@@ -3583,12 +3582,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	sregs->efer = vcpu->arch.shadow_efer;
 	sregs->apic_base = kvm_get_apic_base(vcpu);
 
-	if (irqchip_in_kernel(vcpu->kvm))
-		memset(sregs->interrupt_bitmap, 0,
-		       sizeof sregs->interrupt_bitmap);
-	else
-		memcpy(sregs->interrupt_bitmap, vcpu->arch.irq_pending,
-		       sizeof sregs->interrupt_bitmap);
+	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
 
 	if (vcpu->arch.interrupt.pending)
 		set_bit(vcpu->arch.interrupt.nr,
@@ -4058,7 +4052,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 {
 	int mmu_reset_needed = 0;
-	int i, pending_vec, max_bits;
+	int pending_vec, max_bits;
 	struct descriptor_table dt;
 
 	vcpu_load(vcpu);
@@ -4100,24 +4094,14 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	if (mmu_reset_needed)
 		kvm_mmu_reset_context(vcpu);
 
-	if (!irqchip_in_kernel(vcpu->kvm)) {
-		memcpy(vcpu->arch.irq_pending, sregs->interrupt_bitmap,
-		       sizeof vcpu->arch.irq_pending);
-		vcpu->arch.irq_summary = 0;
-		for (i = 0; i < ARRAY_SIZE(vcpu->arch.irq_pending); ++i)
-			if (vcpu->arch.irq_pending[i])
-				__set_bit(i, &vcpu->arch.irq_summary);
-	} else {
-		max_bits = (sizeof sregs->interrupt_bitmap) << 3;
-		pending_vec = find_first_bit(
-			(const unsigned long *)sregs->interrupt_bitmap,
-			max_bits);
-		/* Only pending external irq is handled here */
-		if (pending_vec < max_bits) {
-			kvm_queue_interrupt(vcpu, pending_vec);
-			pr_debug("Set back pending irq %d\n", pending_vec);
-		}
-		kvm_pic_clear_isr_ack(vcpu->kvm);
+	max_bits = (sizeof sregs->interrupt_bitmap) << 3;
+	pending_vec = find_first_bit(
+		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
+	if (pending_vec < max_bits) {
+		kvm_queue_interrupt(vcpu, pending_vec);
+		pr_debug("Set back pending irq %d\n", pending_vec);
+		if (irqchip_in_kernel(vcpu->kvm))
+			kvm_pic_clear_isr_ack(vcpu->kvm);
 	}
 
 	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 21203d4212763..c1f1a8ceba640 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -19,18 +19,6 @@ static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.interrupt.pending = false;
 }
 
-static inline u8 kvm_pop_irq(struct kvm_vcpu *vcpu)
-{
-	int word_index = __ffs(vcpu->arch.irq_summary);
-	int bit_index = __ffs(vcpu->arch.irq_pending[word_index]);
-	int irq = word_index * BITS_PER_LONG + bit_index;
-
-	clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
-	if (!vcpu->arch.irq_pending[word_index])
-		clear_bit(word_index, &vcpu->arch.irq_summary);
-	return irq;
-}
-
 static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
-- 
GitLab


From f629cf8485c9e1063fd8b915fa3bde80917400a1 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:49 +0300
Subject: [PATCH 1833/2198] KVM: skip_emulated_instruction() decode instruction
 if size is not known

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/svm.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index ac3d5ba48d612..1315ce025e571 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -228,7 +228,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	if (!svm->next_rip) {
-		printk(KERN_DEBUG "%s: NOP\n", __func__);
+		if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) !=
+				EMULATE_DONE)
+			printk(KERN_DEBUG "%s: NOP\n", __func__);
 		return;
 	}
 	if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
@@ -1868,11 +1870,8 @@ static int task_switch_interception(struct vcpu_svm *svm,
 	if (reason != TASK_SWITCH_GATE ||
 	    int_type == SVM_EXITINTINFO_TYPE_SOFT ||
 	    (int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
-	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
-		if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0,
-					EMULTYPE_SKIP) != EMULATE_DONE)
-			return 0;
-	}
+	     (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
+		skip_emulated_instruction(&svm->vcpu);
 
 	return kvm_task_switch(&svm->vcpu, tss_selector, reason);
 }
-- 
GitLab


From 66fd3f7f901f29a557a473af595bf11b270b9ac2 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:50 +0300
Subject: [PATCH 1834/2198] KVM: Do not re-execute INTn instruction.

Re-inject event instead. This is what Intel suggest. Also use correct
instruction length when re-injecting soft fault/interrupt.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++++-
 arch/x86/kvm/svm.c              |  8 ++++----
 arch/x86/kvm/vmx.c              | 32 +++++++++++++++++++++++++-------
 arch/x86/kvm/x86.c              | 11 ++++++-----
 arch/x86/kvm/x86.h              |  9 ++++++++-
 5 files changed, 47 insertions(+), 18 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 977a785a9d75d..1d6c3f757cb6e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -319,6 +319,8 @@ struct kvm_vcpu_arch {
 	struct kvm_pio_request pio;
 	void *pio_data;
 
+	u8 event_exit_inst_len;
+
 	struct kvm_queued_exception {
 		bool pending;
 		bool has_error_code;
@@ -328,6 +330,7 @@ struct kvm_vcpu_arch {
 
 	struct kvm_queued_interrupt {
 		bool pending;
+		bool soft;
 		u8 nr;
 	} interrupt;
 
@@ -510,7 +513,7 @@ struct kvm_x86_ops {
 	u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
 	void (*patch_hypercall)(struct kvm_vcpu *vcpu,
 				unsigned char *hypercall_addr);
-	void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
+	void (*set_irq)(struct kvm_vcpu *vcpu);
 	void (*set_nmi)(struct kvm_vcpu *vcpu);
 	void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
 				bool has_error_code, u32 error_code);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 1315ce025e571..377c4f17d1707 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2310,13 +2310,13 @@ static void svm_queue_irq(struct kvm_vcpu *vcpu, unsigned nr)
 		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
 
-static void svm_set_irq(struct kvm_vcpu *vcpu, int irq)
+static void svm_set_irq(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
 	nested_svm_intr(svm);
 
-	svm_queue_irq(vcpu, irq);
+	svm_queue_irq(vcpu, vcpu->arch.interrupt.nr);
 }
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
@@ -2418,7 +2418,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	case SVM_EXITINTINFO_TYPE_EXEPT:
 		/* In case of software exception do not reinject an exception
 		   vector, but re-execute and instruction instead */
-		if (vector == BP_VECTOR || vector == OF_VECTOR)
+		if (kvm_exception_is_soft(vector))
 			break;
 		if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) {
 			u32 err = svm->vmcb->control.exit_int_info_err;
@@ -2428,7 +2428,7 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 			kvm_queue_exception(&svm->vcpu, vector);
 		break;
 	case SVM_EXITINTINFO_TYPE_INTR:
-		kvm_queue_interrupt(&svm->vcpu, vector);
+		kvm_queue_interrupt(&svm->vcpu, vector, false);
 		break;
 	default:
 		break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 8981654ad061b..29b49f09a019a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -801,8 +801,9 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		return;
 	}
 
-	if (nr == BP_VECTOR || nr == OF_VECTOR) {
-		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
+	if (kvm_exception_is_soft(nr)) {
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmx->vcpu.arch.event_exit_inst_len);
 		intr_info |= INTR_TYPE_SOFT_EXCEPTION;
 	} else
 		intr_info |= INTR_TYPE_HARD_EXCEPTION;
@@ -2445,9 +2446,11 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
 }
 
-static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
+static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	uint32_t intr;
+	int irq = vcpu->arch.interrupt.nr;
 
 	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
@@ -2462,8 +2465,14 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
 		kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
 		return;
 	}
-	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
-			irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
+	intr = irq | INTR_INFO_VALID_MASK;
+	if (vcpu->arch.interrupt.soft) {
+		intr |= INTR_TYPE_SOFT_INTR;
+		vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
+			     vmx->vcpu.arch.event_exit_inst_len);
+	} else
+		intr |= INTR_TYPE_EXT_INTR;
+	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
 }
 
 static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -3024,6 +3033,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 					      GUEST_INTR_STATE_NMI);
 			break;
 		case INTR_TYPE_EXT_INTR:
+		case INTR_TYPE_SOFT_INTR:
 			kvm_clear_interrupt_queue(vcpu);
 			break;
 		case INTR_TYPE_HARD_EXCEPTION:
@@ -3295,16 +3305,24 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 		vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
 				GUEST_INTR_STATE_NMI);
 		break;
-	case INTR_TYPE_HARD_EXCEPTION:
 	case INTR_TYPE_SOFT_EXCEPTION:
+		vmx->vcpu.arch.event_exit_inst_len =
+			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		/* fall through */
+	case INTR_TYPE_HARD_EXCEPTION:
 		if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
 			u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE);
 			kvm_queue_exception_e(&vmx->vcpu, vector, err);
 		} else
 			kvm_queue_exception(&vmx->vcpu, vector);
 		break;
+	case INTR_TYPE_SOFT_INTR:
+		vmx->vcpu.arch.event_exit_inst_len =
+			vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+		/* fall through */
 	case INTR_TYPE_EXT_INTR:
-		kvm_queue_interrupt(&vmx->vcpu, vector);
+		kvm_queue_interrupt(&vmx->vcpu, vector,
+			type == INTR_TYPE_SOFT_INTR);
 		break;
 	default:
 		break;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 54eec3565485b..73cfe87fba105 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1441,7 +1441,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -ENXIO;
 	vcpu_load(vcpu);
 
-	kvm_queue_interrupt(vcpu, irq->irq);
+	kvm_queue_interrupt(vcpu, irq->irq, false);
 
 	vcpu_put(vcpu);
 
@@ -3161,7 +3161,7 @@ static void inject_irq(struct kvm_vcpu *vcpu)
 	}
 
 	if (vcpu->arch.interrupt.pending) {
-		kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+		kvm_x86_ops->set_irq(vcpu);
 		return;
 	}
 
@@ -3174,8 +3174,9 @@ static void inject_irq(struct kvm_vcpu *vcpu)
 		}
 	} else if (kvm_cpu_has_interrupt(vcpu)) {
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
-			kvm_x86_ops->set_irq(vcpu, vcpu->arch.interrupt.nr);
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+					    false);
+			kvm_x86_ops->set_irq(vcpu);
 		}
 	}
 }
@@ -4098,7 +4099,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	pending_vec = find_first_bit(
 		(const unsigned long *)sregs->interrupt_bitmap, max_bits);
 	if (pending_vec < max_bits) {
-		kvm_queue_interrupt(vcpu, pending_vec);
+		kvm_queue_interrupt(vcpu, pending_vec, false);
 		pr_debug("Set back pending irq %d\n", pending_vec);
 		if (irqchip_in_kernel(vcpu->kvm))
 			kvm_pic_clear_isr_ack(vcpu->kvm);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c1f1a8ceba640..4c8e10af78e88 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -8,9 +8,11 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
 	vcpu->arch.exception.pending = false;
 }
 
-static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
+static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector,
+	bool soft)
 {
 	vcpu->arch.interrupt.pending = true;
+	vcpu->arch.interrupt.soft = soft;
 	vcpu->arch.interrupt.nr = vector;
 }
 
@@ -24,4 +26,9 @@ static inline bool kvm_event_needs_reinjection(struct kvm_vcpu *vcpu)
 	return vcpu->arch.exception.pending || vcpu->arch.interrupt.pending ||
 		vcpu->arch.nmi_injected;
 }
+
+static inline bool kvm_exception_is_soft(unsigned int nr)
+{
+	return (nr == BP_VECTOR) || (nr == OF_VECTOR);
+}
 #endif
-- 
GitLab


From 6a8b1d13121f8226783987dc7ddd861ee2245410 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:51 +0300
Subject: [PATCH 1835/2198] KVM: Always request IRQ/NMI window if an interrupt
 is pending

Currently they are not requested if there is pending exception.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 30 ++++++++++++------------------
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 73cfe87fba105..199426cc1d0e9 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3152,8 +3152,11 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
 }
 
-static void inject_irq(struct kvm_vcpu *vcpu)
+static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
+	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
+		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
+
 	/* try to reinject previous events if any */
 	if (vcpu->arch.nmi_injected) {
 		kvm_x86_ops->set_nmi(vcpu);
@@ -3181,26 +3184,11 @@ static void inject_irq(struct kvm_vcpu *vcpu)
 	}
 }
 
-static void inject_pending_irq(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
-{
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
-		kvm_run->request_interrupt_window;
-
-	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
-		kvm_x86_ops->set_interrupt_shadow(vcpu, 0);
-
-	inject_irq(vcpu);
-
-	/* enable NMI/IRQ window open exits if needed */
-	if (vcpu->arch.nmi_pending)
-		kvm_x86_ops->enable_nmi_window(vcpu);
-	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
-		kvm_x86_ops->enable_irq_window(vcpu);
-}
-
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	int r;
+	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+		kvm_run->request_interrupt_window;
 
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3254,6 +3242,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	else
 		inject_pending_irq(vcpu, kvm_run);
 
+	/* enable NMI/IRQ window open exits if needed */
+	if (vcpu->arch.nmi_pending)
+		kvm_x86_ops->enable_nmi_window(vcpu);
+	else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
+		kvm_x86_ops->enable_irq_window(vcpu);
+
 	if (kvm_lapic_enabled(vcpu)) {
 		if (!vcpu->arch.apic->vapic_addr)
 			update_cr8_intercept(vcpu);
-- 
GitLab


From 44c11430b52cbad0a467bc023a802d122dfd285c Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:52 +0300
Subject: [PATCH 1836/2198] KVM: inject NMI after IRET from a previous NMI, not
 before.

If NMI is received during handling of another NMI it should be injected
immediately after IRET from previous NMI handler, but SVM intercept IRET
before instruction execution so we can't inject pending NMI at this
point and there is not way to request exit when NMI window opens. This
patch fix SVM code to open NMI window after IRET by single stepping over
IRET instruction.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/svm.c              | 62 ++++++++++++++++++++++++++-------
 2 files changed, 52 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1d6c3f757cb6e..82129437e873f 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -358,6 +358,7 @@ struct kvm_vcpu_arch {
 	unsigned int time_offset;
 	struct page *time_page;
 
+	bool singlestep; /* guest is single stepped by KVM */
 	bool nmi_pending;
 	bool nmi_injected;
 
@@ -771,6 +772,7 @@ enum {
 #define HF_HIF_MASK		(1 << 1)
 #define HF_VINTR_MASK		(1 << 2)
 #define HF_NMI_MASK		(1 << 3)
+#define HF_IRET_MASK		(1 << 4)
 
 /*
  * Hardware virtualization extension instructions may fault if a
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 377c4f17d1707..71510e07e69e0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -965,15 +965,16 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
 
 }
 
-static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+static void update_db_intercept(struct kvm_vcpu *vcpu)
 {
-	int old_debug = vcpu->guest_debug;
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	vcpu->guest_debug = dbg->control;
-
 	svm->vmcb->control.intercept_exceptions &=
 		~((1 << DB_VECTOR) | (1 << BP_VECTOR));
+
+	if (vcpu->arch.singlestep)
+		svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
+
 	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
 		if (vcpu->guest_debug &
 		    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
@@ -984,6 +985,16 @@ static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
 				1 << BP_VECTOR;
 	} else
 		vcpu->guest_debug = 0;
+}
+
+static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
+{
+	int old_debug = vcpu->guest_debug;
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	vcpu->guest_debug = dbg->control;
+
+	update_db_intercept(vcpu);
 
 	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
 		svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
@@ -1133,14 +1144,30 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	if (!(svm->vcpu.guest_debug &
-	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
+	      (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
+		!svm->vcpu.arch.singlestep) {
 		kvm_queue_exception(&svm->vcpu, DB_VECTOR);
 		return 1;
 	}
-	kvm_run->exit_reason = KVM_EXIT_DEBUG;
-	kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
-	kvm_run->debug.arch.exception = DB_VECTOR;
-	return 0;
+
+	if (svm->vcpu.arch.singlestep) {
+		svm->vcpu.arch.singlestep = false;
+		if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
+			svm->vmcb->save.rflags &=
+				~(X86_EFLAGS_TF | X86_EFLAGS_RF);
+		update_db_intercept(&svm->vcpu);
+	}
+
+	if (svm->vcpu.guest_debug &
+	    (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)){
+		kvm_run->exit_reason = KVM_EXIT_DEBUG;
+		kvm_run->debug.arch.pc =
+			svm->vmcb->save.cs.base + svm->vmcb->save.rip;
+		kvm_run->debug.arch.exception = DB_VECTOR;
+		return 0;
+	}
+
+	return 1;
 }
 
 static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
@@ -1887,7 +1914,7 @@ static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
 {
 	++svm->vcpu.stat.nmi_window_exits;
 	svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET);
-	svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
+	svm->vcpu.arch.hflags |= HF_IRET_MASK;
 	return 1;
 }
 
@@ -2357,8 +2384,16 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
 
-	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
-		enable_irq_window(vcpu);
+	if ((svm->vcpu.arch.hflags & (HF_NMI_MASK | HF_IRET_MASK))
+	    == HF_NMI_MASK)
+		return; /* IRET will cause a vm exit */
+
+	/* Something prevents NMI from been injected. Single step over
+	   possible problem (IRET or exception injection or interrupt
+	   shadow) */
+	vcpu->arch.singlestep = true;
+	svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
+	update_db_intercept(vcpu);
 }
 
 static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -2401,6 +2436,9 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
 	int type;
 	u32 exitintinfo = svm->vmcb->control.exit_int_info;
 
+	if (svm->vcpu.arch.hflags & HF_IRET_MASK)
+		svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
+
 	svm->vcpu.arch.nmi_injected = false;
 	kvm_clear_exception_queue(&svm->vcpu);
 	kvm_clear_interrupt_queue(&svm->vcpu);
-- 
GitLab


From 36752c9b91f75aa3ff0f214a89f13d806cb2f61f Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:53 +0300
Subject: [PATCH 1837/2198] KVM: Do not migrate pending software interrupts.

INTn will be re-executed after migration. If we wanted to migrate
pending software interrupt we would need to migrate interrupt type
and instruction length too, but we do not have all required info on
SVM, so SVM->VMX migration would need to re-execute INTn anyway. To
make it simple never migrate pending soft interrupt.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 199426cc1d0e9..beb806b03a2ee 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3579,7 +3579,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 
 	memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
 
-	if (vcpu->arch.interrupt.pending)
+	if (vcpu->arch.interrupt.pending && !vcpu->arch.interrupt.soft)
 		set_bit(vcpu->arch.interrupt.nr,
 			(unsigned long *)sregs->interrupt_bitmap);
 
-- 
GitLab


From 8db3baa2db34035b2ddb7d0e8b186eb92a056532 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:54 +0300
Subject: [PATCH 1838/2198] KVM: Disable CR8 intercept if tpr patching is
 active

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/x86.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index beb806b03a2ee..249540f985132 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3142,7 +3142,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	if (!kvm_x86_ops->update_cr8_intercept)
 		return;
 
-	max_irr = kvm_lapic_find_highest_irr(vcpu);
+	if (!vcpu->arch.apic->vapic_addr)
+		max_irr = kvm_lapic_find_highest_irr(vcpu);
+	else
+		max_irr = -1;
 
 	if (max_irr != -1)
 		max_irr >>= 4;
@@ -3249,10 +3252,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		kvm_x86_ops->enable_irq_window(vcpu);
 
 	if (kvm_lapic_enabled(vcpu)) {
-		if (!vcpu->arch.apic->vapic_addr)
-			update_cr8_intercept(vcpu);
-		else
-			kvm_lapic_sync_to_vapic(vcpu);
+		update_cr8_intercept(vcpu);
+		kvm_lapic_sync_to_vapic(vcpu);
 	}
 
 	up_read(&vcpu->kvm->slots_lock);
-- 
GitLab


From 20f65983e30f222e5383f77206e3f571d1d64610 Mon Sep 17 00:00:00 2001
From: Gleb Natapov <gleb@redhat.com>
Date: Mon, 11 May 2009 13:35:55 +0300
Subject: [PATCH 1839/2198] KVM: Move "exit due to NMI" handling into
 vmx_complete_interrupts()

To save us one reading of VM_EXIT_INTR_INFO.

Signed-off-by: Gleb Natapov <gleb@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 29b49f09a019a..fe2ce2b405049 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3261,8 +3261,17 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 	int type;
 	bool idtv_info_valid;
 
-	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+	/* We need to handle NMIs before interrupts are enabled */
+	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
+	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
+		KVMTRACE_0D(NMI, &vmx->vcpu, handler);
+		asm("int $2");
+	}
+
+	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
+
 	if (cpu_has_virtual_nmis()) {
 		unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
 		vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
@@ -3363,7 +3372,6 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
 static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 intr_info;
 
 	/* Record the guest's net vcpu time for enforced NMI injections. */
 	if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
@@ -3490,15 +3498,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
 	vmx->launched = 1;
 
-	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
-	/* We need to handle NMIs before interrupts are enabled */
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-	    (intr_info & INTR_INFO_VALID_MASK)) {
-		KVMTRACE_0D(NMI, vcpu, handler);
-		asm("int $2");
-	}
-
 	vmx_complete_interrupts(vmx);
 }
 
-- 
GitLab


From 56b237e31abf4d6dbc6e2a0214049b9a23be4883 Mon Sep 17 00:00:00 2001
From: Nitin A Kamble <nitin.a.kamble@intel.com>
Date: Thu, 4 Jun 2009 15:04:08 -0700
Subject: [PATCH 1840/2198] KVM: VMX: Rename rmode.active to rmode.vm86_active

That way the interpretation of rmode.active becomes more clear with
unrestricted guest code.

Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/vmx.c              | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 82129437e873f..eabdc1cfab5c6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -335,7 +335,7 @@ struct kvm_vcpu_arch {
 	} interrupt;
 
 	struct {
-		int active;
+		int vm86_active;
 		u8 save_iopl;
 		struct kvm_save_segment {
 			u16 selector;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fe2ce2b405049..c379a3472fa9b 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -495,7 +495,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
 			eb |= 1u << BP_VECTOR;
 	}
-	if (vcpu->arch.rmode.active)
+	if (vcpu->arch.rmode.vm86_active)
 		eb = ~0;
 	if (enable_ept)
 		eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
@@ -731,7 +731,7 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
 
 static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
 {
-	if (vcpu->arch.rmode.active)
+	if (vcpu->arch.rmode.vm86_active)
 		rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
 	vmcs_writel(GUEST_RFLAGS, rflags);
 }
@@ -788,7 +788,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
 		intr_info |= INTR_INFO_DELIVER_CODE_MASK;
 	}
 
-	if (vcpu->arch.rmode.active) {
+	if (vcpu->arch.rmode.vm86_active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = nr;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -1363,7 +1363,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	vmx->emulation_required = 1;
-	vcpu->arch.rmode.active = 0;
+	vcpu->arch.rmode.vm86_active = 0;
 
 	vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
 	vmcs_write32(GUEST_TR_LIMIT, vcpu->arch.rmode.tr.limit);
@@ -1425,7 +1425,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 	vmx->emulation_required = 1;
-	vcpu->arch.rmode.active = 1;
+	vcpu->arch.rmode.vm86_active = 1;
 
 	vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
 	vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1594,10 +1594,10 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 
 	vmx_fpu_deactivate(vcpu);
 
-	if (vcpu->arch.rmode.active && (cr0 & X86_CR0_PE))
+	if (vcpu->arch.rmode.vm86_active && (cr0 & X86_CR0_PE))
 		enter_pmode(vcpu);
 
-	if (!vcpu->arch.rmode.active && !(cr0 & X86_CR0_PE))
+	if (!vcpu->arch.rmode.vm86_active && !(cr0 & X86_CR0_PE))
 		enter_rmode(vcpu);
 
 #ifdef CONFIG_X86_64
@@ -1655,7 +1655,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.active ?
+	unsigned long hw_cr4 = cr4 | (vcpu->arch.rmode.vm86_active ?
 		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
 	vcpu->arch.cr4 = cr4;
@@ -1738,7 +1738,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
 	u32 ar;
 
-	if (vcpu->arch.rmode.active && seg == VCPU_SREG_TR) {
+	if (vcpu->arch.rmode.vm86_active && seg == VCPU_SREG_TR) {
 		vcpu->arch.rmode.tr.selector = var->selector;
 		vcpu->arch.rmode.tr.base = var->base;
 		vcpu->arch.rmode.tr.limit = var->limit;
@@ -1748,7 +1748,7 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
 	vmcs_writel(sf->base, var->base);
 	vmcs_write32(sf->limit, var->limit);
 	vmcs_write16(sf->selector, var->selector);
-	if (vcpu->arch.rmode.active && var->s) {
+	if (vcpu->arch.rmode.vm86_active && var->s) {
 		/*
 		 * Hack real-mode segments into vm86 compatibility.
 		 */
@@ -2317,7 +2317,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		goto out;
 	}
 
-	vmx->vcpu.arch.rmode.active = 0;
+	vmx->vcpu.arch.rmode.vm86_active = 0;
 
 	vmx->soft_vnmi_blocked = 0;
 
@@ -2455,7 +2455,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
 	KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
 
 	++vcpu->stat.irq_injections;
-	if (vcpu->arch.rmode.active) {
+	if (vcpu->arch.rmode.vm86_active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = irq;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2493,7 +2493,7 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 	}
 
 	++vcpu->stat.nmi_injections;
-	if (vcpu->arch.rmode.active) {
+	if (vcpu->arch.rmode.vm86_active) {
 		vmx->rmode.irq.pending = true;
 		vmx->rmode.irq.vector = NMI_VECTOR;
 		vmx->rmode.irq.rip = kvm_rip_read(vcpu);
@@ -2629,7 +2629,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		return kvm_mmu_page_fault(vcpu, cr2, error_code);
 	}
 
-	if (vcpu->arch.rmode.active &&
+	if (vcpu->arch.rmode.vm86_active &&
 	    handle_rmode_exception(vcpu, intr_info & INTR_INFO_VECTOR_MASK,
 								error_code)) {
 		if (vcpu->arch.halt_request) {
-- 
GitLab


From a0861c02a981c943573478ea13b29b1fb958ee5b Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Mon, 8 Jun 2009 17:37:09 +0800
Subject: [PATCH 1841/2198] KVM: Add VT-x machine check support

VT-x needs an explicit MC vector intercept to handle machine checks in the
hyper visor.

It also has a special option to catch machine checks that happen
during VT entry.

Do these interceptions and forward them to the Linux machine check
handler. Make it always look like user space is interrupted because
the machine check handler treats kernel/user space differently.

Thanks to Jiang Yunhong for help and testing.

Cc: stable@kernel.org
Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Huang Ying <ying.huang@intel.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/vmx.h          |  1 +
 arch/x86/kernel/cpu/mcheck/mce_64.c |  1 +
 arch/x86/kvm/vmx.c                  | 50 +++++++++++++++++++++++++++--
 3 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 498f944010b9a..11be5ad2e0e97 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -247,6 +247,7 @@ enum vmcs_field {
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
+#define EXIT_REASON_MCE_DURING_VMENTRY	 41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
 #define EXIT_REASON_EPT_VIOLATION       48
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 09dd1d414fc36..289cc48150288 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -420,6 +420,7 @@ void do_machine_check(struct pt_regs * regs, long error_code)
  out2:
 	atomic_dec(&mce_entry);
 }
+EXPORT_SYMBOL_GPL(do_machine_check);
 
 #ifdef CONFIG_X86_MCE_INTEL
 /***
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index c379a3472fa9b..32d6ae8fb60e3 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -32,6 +32,7 @@
 #include <asm/desc.h>
 #include <asm/vmx.h>
 #include <asm/virtext.h>
+#include <asm/mce.h>
 
 #define __ex(x) __kvm_handle_fault_on_reboot(x)
 
@@ -97,6 +98,7 @@ struct vcpu_vmx {
 	int soft_vnmi_blocked;
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
+	u32 exit_reason;
 };
 
 static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -214,6 +216,13 @@ static inline int is_external_interrupt(u32 intr_info)
 		== (INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
 }
 
+static inline int is_machine_check(u32 intr_info)
+{
+	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
+			     INTR_INFO_VALID_MASK)) ==
+		(INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+}
+
 static inline int cpu_has_vmx_msr_bitmap(void)
 {
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_USE_MSR_BITMAPS;
@@ -485,7 +494,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
 	u32 eb;
 
-	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR);
+	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR);
 	if (!vcpu->fpu_active)
 		eb |= 1u << NM_VECTOR;
 	if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -2582,6 +2591,31 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+/*
+ * Trigger machine check on the host. We assume all the MSRs are already set up
+ * by the CPU and that we still run on the same CPU as the MCE occurred on.
+ * We pass a fake environment to the machine check handler because we want
+ * the guest to be always treated like user space, no matter what context
+ * it used internally.
+ */
+static void kvm_machine_check(void)
+{
+#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+	struct pt_regs regs = {
+		.cs = 3, /* Fake ring 3 no matter what the guest ran on */
+		.flags = X86_EFLAGS_IF,
+	};
+
+	do_machine_check(&regs, 0);
+#endif
+}
+
+static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+{
+	/* already handled by vcpu_run */
+	return 1;
+}
+
 static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2593,6 +2627,9 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	vect_info = vmx->idt_vectoring_info;
 	intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
+	if (is_machine_check(intr_info))
+		return handle_machine_check(vcpu, kvm_run);
+
 	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
 						!is_page_fault(intr_info))
 		printk(KERN_ERR "%s: unexpected, vectoring info 0x%x "
@@ -3166,6 +3203,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
+	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 };
 
 static const int kvm_vmx_max_exit_handlers =
@@ -3177,8 +3215,8 @@ static const int kvm_vmx_max_exit_handlers =
  */
 static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
-	u32 exit_reason = vmcs_read32(VM_EXIT_REASON);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 exit_reason = vmx->exit_reason;
 	u32 vectoring_info = vmx->idt_vectoring_info;
 
 	KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
@@ -3263,6 +3301,14 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
 
 	exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
 
+	vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
+
+	/* Handle machine checks before interrupts are enabled */
+	if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+	    || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
+		&& is_machine_check(exit_intr_info)))
+		kvm_machine_check();
+
 	/* We need to handle NMIs before interrupts are enabled */
 	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
 	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
-- 
GitLab


From 003dec8913d6bebb4ecc989ec04a235cf38f5ea9 Mon Sep 17 00:00:00 2001
From: Steven Whitehouse <swhiteho@redhat.com>
Date: Wed, 10 Jun 2009 10:31:45 +0100
Subject: [PATCH 1842/2198] GFS2: Merge gfs2_get_sb into gfs2_get_sb_meta

These don't need to be separate functions.

Reported-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/ops_fstype.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 9da161cbb30f9..f234aba36fb84 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1284,9 +1284,11 @@ static int set_meta_super(struct super_block *s, void *ptr)
 	return -EINVAL;
 }
 
-static struct super_block *get_gfs2_sb(const char *dev_name)
+static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
+			    const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	struct super_block *s;
+	struct gfs2_sbd *sdp;
 	struct path path;
 	int error;
 
@@ -1294,21 +1296,11 @@ static struct super_block *get_gfs2_sb(const char *dev_name)
 	if (error) {
 		printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n",
 		       dev_name, error);
-		return ERR_PTR(-ENOENT);
+		return error;
 	}
 	s = sget(&gfs2_fs_type, test_meta_super, set_meta_super,
 		 path.dentry->d_inode->i_sb->s_bdev);
 	path_put(&path);
-	return s;
-}
-
-static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
-			    const char *dev_name, void *data, struct vfsmount *mnt)
-{
-	struct super_block *s;
-	struct gfs2_sbd *sdp;
-
-	s = get_gfs2_sb(dev_name);
 	if (IS_ERR(s)) {
 		printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n");
 		return PTR_ERR(s);
-- 
GitLab


From 04dce7d9d429ea5ea04e9432d1726c930f4d67da Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 10 Jun 2009 16:59:46 +1000
Subject: [PATCH 1843/2198] spinlock: Add missing __raw_spin_lock_flags() stub
 for UP

This was only defined with CONFIG_DEBUG_SPINLOCK set, but some
obscure arch/powerpc code wants it always.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/spinlock_up.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/spinlock_up.h b/include/linux/spinlock_up.h
index 938234c4a996b..d4841ed8215b7 100644
--- a/include/linux/spinlock_up.h
+++ b/include/linux/spinlock_up.h
@@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 #define __raw_spin_is_locked(lock)	((void)(lock), 0)
 /* for sched.c and kernel_lock.c: */
 # define __raw_spin_lock(lock)		do { (void)(lock); } while (0)
+# define __raw_spin_lock_flags(lock, flags)	do { (void)(lock); } while (0)
 # define __raw_spin_unlock(lock)	do { (void)(lock); } while (0)
 # define __raw_spin_trylock(lock)	({ (void)(lock); 1; })
 #endif /* DEBUG_SPINLOCK */
-- 
GitLab


From f1db457ce6e2f63cb01022f58c0c023838958bd1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 10 Jun 2009 10:06:24 +0800
Subject: [PATCH 1844/2198] tracing/events: convert block trace points to
 TRACE_EVENT(), fix !CONFIG_BLOCK

Fix building failures when CONFIG_BLOCK == n.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2F1520.8020003@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/blktrace_api.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index c7ec31dd04c9c..7e4350ece0f8d 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,7 +218,7 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
-#ifdef CONFIG_EVENT_TRACING
+#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
 
 static inline int blk_cmd_buf_len(struct request *rq)
 {
@@ -229,7 +229,7 @@ extern void blk_dump_cmd(char *buf, struct request *rq);
 extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
 extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
 
-#endif /* CONFIG_EVENT_TRACING */
+#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
 
 #endif /* __KERNEL__ */
 #endif
-- 
GitLab


From 6bc1096d7ab3621b3ffcf06616d1f4e0325d903d Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 22 May 2009 12:12:01 +0200
Subject: [PATCH 1845/2198] x86: MSR: add a struct representation of an MSR

Add a struct representing a 64bit MSR pair consisting of a low and high
register part and convert msr_info to use it. Also, rename msr-on-cpu.c
to msr.c.

Side note: Put the cpumask.h include in __KERNEL__ space thus fixing an
allmodconfig build failure in the headers_check target.

CC: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 arch/x86/include/asm/msr.h           | 11 +++++++++++
 arch/x86/lib/Makefile                |  2 +-
 arch/x86/lib/{msr-on-cpu.c => msr.c} | 26 +++++++++++++-------------
 3 files changed, 25 insertions(+), 14 deletions(-)
 rename arch/x86/lib/{msr-on-cpu.c => msr.c} (81%)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 638bf6241807a..5e1213216e2b6 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -12,6 +12,17 @@
 
 #include <asm/asm.h>
 #include <asm/errno.h>
+#include <asm/cpumask.h>
+
+struct msr {
+	union {
+		struct {
+			u32 l;
+			u32 h;
+		};
+		u64 q;
+	};
+};
 
 static inline unsigned long long native_read_tscp(unsigned int *aux)
 {
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 55e11aa6d66cc..f9d35632666b0 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,7 +2,7 @@
 # Makefile for x86 specific library files.
 #
 
-obj-$(CONFIG_SMP) := msr-on-cpu.o
+obj-$(CONFIG_SMP) := msr.o
 
 lib-y := delay.o
 lib-y += thunk_$(BITS).o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr.c
similarity index 81%
rename from arch/x86/lib/msr-on-cpu.c
rename to arch/x86/lib/msr.c
index 321cf720dbb63..cade714e57f9f 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr.c
@@ -5,7 +5,7 @@
 
 struct msr_info {
 	u32 msr_no;
-	u32 l, h;
+	struct msr reg;
 	int err;
 };
 
@@ -13,14 +13,14 @@ static void __rdmsr_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
 
-	rdmsr(rv->msr_no, rv->l, rv->h);
+	rdmsr(rv->msr_no, rv->reg.l, rv->reg.h);
 }
 
 static void __wrmsr_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
 
-	wrmsr(rv->msr_no, rv->l, rv->h);
+	wrmsr(rv->msr_no, rv->reg.l, rv->reg.h);
 }
 
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
@@ -30,8 +30,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 
 	rv.msr_no = msr_no;
 	err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
-	*l = rv.l;
-	*h = rv.h;
+	*l = rv.reg.l;
+	*h = rv.reg.h;
 
 	return err;
 }
@@ -42,8 +42,8 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 	struct msr_info rv;
 
 	rv.msr_no = msr_no;
-	rv.l = l;
-	rv.h = h;
+	rv.reg.l = l;
+	rv.reg.h = h;
 	err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
 
 	return err;
@@ -55,14 +55,14 @@ static void __rdmsr_safe_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
 
-	rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+	rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
 }
 
 static void __wrmsr_safe_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
 
-	rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
+	rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
 }
 
 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
@@ -72,8 +72,8 @@ int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 
 	rv.msr_no = msr_no;
 	err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
-	*l = rv.l;
-	*h = rv.h;
+	*l = rv.reg.l;
+	*h = rv.reg.h;
 
 	return err ? err : rv.err;
 }
@@ -84,8 +84,8 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 	struct msr_info rv;
 
 	rv.msr_no = msr_no;
-	rv.l = l;
-	rv.h = h;
+	rv.reg.l = l;
+	rv.reg.h = h;
 	err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
 
 	return err ? err : rv.err;
-- 
GitLab


From b034c19f9f61c8b6f2435aa2e77f52348ebde767 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Fri, 22 May 2009 13:52:19 +0200
Subject: [PATCH 1846/2198] x86: MSR: add methods for writing of an MSR on
 several CPUs

Provide for concurrent MSR writes on all the CPUs in the cpumask. Also,
add a temporary workaround for smp_call_function_many which skips the
CPU we're executing on.

Bart: zero out rv struct which is allocated on stack.

CC: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
---
 arch/x86/include/asm/msr.h | 12 +++++
 arch/x86/lib/msr.c         | 98 +++++++++++++++++++++++++++++++++++---
 2 files changed, 104 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 5e1213216e2b6..22603764e7db3 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -227,6 +227,8 @@ do {                                                            \
 #ifdef CONFIG_SMP
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs);
 int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
 int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
 #else  /*  CONFIG_SMP  */
@@ -240,6 +242,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 	wrmsr(msr_no, l, h);
 	return 0;
 }
+static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+				struct msr *msrs)
+{
+       rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
+}
+static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no,
+				struct msr *msrs)
+{
+       wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
+}
 static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
 				    u32 *l, u32 *h)
 {
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index cade714e57f9f..1440b9c0547e9 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -6,21 +6,37 @@
 struct msr_info {
 	u32 msr_no;
 	struct msr reg;
+	struct msr *msrs;
+	int off;
 	int err;
 };
 
 static void __rdmsr_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
+	struct msr *reg;
+	int this_cpu = raw_smp_processor_id();
 
-	rdmsr(rv->msr_no, rv->reg.l, rv->reg.h);
+	if (rv->msrs)
+		reg = &rv->msrs[this_cpu - rv->off];
+	else
+		reg = &rv->reg;
+
+	rdmsr(rv->msr_no, reg->l, reg->h);
 }
 
 static void __wrmsr_on_cpu(void *info)
 {
 	struct msr_info *rv = info;
+	struct msr *reg;
+	int this_cpu = raw_smp_processor_id();
+
+	if (rv->msrs)
+		reg = &rv->msrs[this_cpu - rv->off];
+	else
+		reg = &rv->reg;
 
-	wrmsr(rv->msr_no, rv->reg.l, rv->reg.h);
+	wrmsr(rv->msr_no, reg->l, reg->h);
 }
 
 int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
@@ -28,6 +44,8 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
 	*l = rv.reg.l;
@@ -35,12 +53,15 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 
 	return err;
 }
+EXPORT_SYMBOL(rdmsr_on_cpu);
 
 int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	rv.reg.l = l;
 	rv.reg.h = h;
@@ -48,6 +69,70 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 
 	return err;
 }
+EXPORT_SYMBOL(wrmsr_on_cpu);
+
+/* rdmsr on a bunch of CPUs
+ *
+ * @mask:       which CPUs
+ * @msr_no:     which MSR
+ * @msrs:       array of MSR values
+ *
+ */
+void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+	struct msr_info rv;
+	int this_cpu;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.off    = cpumask_first(mask);
+	rv.msrs	  = msrs;
+	rv.msr_no = msr_no;
+
+	preempt_disable();
+	/*
+	 * FIXME: handle the CPU we're executing on separately for now until
+	 * smp_call_function_many has been fixed to not skip it.
+	 */
+	this_cpu = raw_smp_processor_id();
+	smp_call_function_single(this_cpu, __rdmsr_on_cpu, &rv, 1);
+
+	smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
+	preempt_enable();
+}
+EXPORT_SYMBOL(rdmsr_on_cpus);
+
+/*
+ * wrmsr on a bunch of CPUs
+ *
+ * @mask:       which CPUs
+ * @msr_no:     which MSR
+ * @msrs:       array of MSR values
+ *
+ */
+void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
+{
+	struct msr_info rv;
+	int this_cpu;
+
+	memset(&rv, 0, sizeof(rv));
+
+	rv.off    = cpumask_first(mask);
+	rv.msrs   = msrs;
+	rv.msr_no = msr_no;
+
+	preempt_disable();
+	/*
+	 * FIXME: handle the CPU we're executing on separately for now until
+	 * smp_call_function_many has been fixed to not skip it.
+	 */
+	this_cpu = raw_smp_processor_id();
+	smp_call_function_single(this_cpu, __wrmsr_on_cpu, &rv, 1);
+
+	smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
+	preempt_enable();
+}
+EXPORT_SYMBOL(wrmsr_on_cpus);
 
 /* These "safe" variants are slower and should be used when the target MSR
    may not actually exist. */
@@ -70,6 +155,8 @@ int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
 	*l = rv.reg.l;
@@ -77,12 +164,15 @@ int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 
 	return err ? err : rv.err;
 }
+EXPORT_SYMBOL(rdmsr_safe_on_cpu);
 
 int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
 	int err;
 	struct msr_info rv;
 
+	memset(&rv, 0, sizeof(rv));
+
 	rv.msr_no = msr_no;
 	rv.reg.l = l;
 	rv.reg.h = h;
@@ -90,8 +180,4 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 
 	return err ? err : rv.err;
 }
-
-EXPORT_SYMBOL(rdmsr_on_cpu);
-EXPORT_SYMBOL(wrmsr_on_cpu);
-EXPORT_SYMBOL(rdmsr_safe_on_cpu);
 EXPORT_SYMBOL(wrmsr_safe_on_cpu);
-- 
GitLab


From d357cbb445208ea0c33b268e08a65e53fdbb5e86 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Thu, 14 May 2009 17:49:28 +0200
Subject: [PATCH 1847/2198] edac: fold __func__ into edac_debug_printk

This shortens debugfX() calls a bit.

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
CC: Doug Thompson <norsk5@yahoo.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/edac_core.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/edac/edac_core.h b/drivers/edac/edac_core.h
index 6ad95c8d63631..48d3b1409834e 100644
--- a/drivers/edac/edac_core.h
+++ b/drivers/edac/edac_core.h
@@ -76,10 +76,11 @@
 extern int edac_debug_level;
 
 #ifndef CONFIG_EDAC_DEBUG_VERBOSE
-#define edac_debug_printk(level, fmt, arg...)                            \
-	do {                                                             \
-		if (level <= edac_debug_level)                           \
-			edac_printk(KERN_DEBUG, EDAC_DEBUG, fmt, ##arg); \
+#define edac_debug_printk(level, fmt, arg...)                           \
+	do {                                                            \
+		if (level <= edac_debug_level)                          \
+			edac_printk(KERN_DEBUG, EDAC_DEBUG,		\
+				    "%s: " fmt, __func__, ##arg);	\
 	} while (0)
 #else  /* CONFIG_EDAC_DEBUG_VERBOSE */
 #define edac_debug_printk(level, fmt, arg...)                            \
-- 
GitLab


From cfe40fdb4a46a68e45fa9a5ecbe588e94b89b4f3 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 4 May 2009 19:25:34 +0200
Subject: [PATCH 1848/2198] amd64_edac: add driver header

Borislav:
- remove register bit descriptions (complete text in BKDG)
- cleanup and remove excessive/superfluous comments

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.h | 628 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 628 insertions(+)
 create mode 100644 drivers/edac/amd64_edac.h

diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
new file mode 100644
index 0000000000000..6f5d5d62cefc8
--- /dev/null
+++ b/drivers/edac/amd64_edac.h
@@ -0,0 +1,628 @@
+/*
+ * AMD64 class Memory Controller kernel module
+ *
+ * Copyright (c) 2009 SoftwareBitMaker.
+ * Copyright (c) 2009 Advanced Micro Devices, Inc.
+ *
+ * This file may be distributed under the terms of the
+ * GNU General Public License.
+ *
+ *	Originally Written by Thayne Harbaugh
+ *
+ *      Changes by Douglas "norsk" Thompson  <dougthompson@xmission.com>:
+ *		- K8 CPU Revision D and greater support
+ *
+ *      Changes by Dave Peterson <dsp@llnl.gov> <dave_peterson@pobox.com>:
+ *		- Module largely rewritten, with new (and hopefully correct)
+ *		code for dealing with node and chip select interleaving,
+ *		various code cleanup, and bug fixes
+ *		- Added support for memory hoisting using DRAM hole address
+ *		register
+ *
+ *	Changes by Douglas "norsk" Thompson <dougthompson@xmission.com>:
+ *		-K8 Rev (1207) revision support added, required Revision
+ *		specific mini-driver code to support Rev F as well as
+ *		prior revisions
+ *
+ *	Changes by Douglas "norsk" Thompson <dougthompson@xmission.com>:
+ *		-Family 10h revision support added. New PCI Device IDs,
+ *		indicating new changes. Actual registers modified
+ *		were slight, less than the Rev E to Rev F transition
+ *		but changing the PCI Device ID was the proper thing to
+ *		do, as it provides for almost automactic family
+ *		detection. The mods to Rev F required more family
+ *		information detection.
+ *
+ *	Changes/Fixes by Borislav Petkov <borislav.petkov@amd.com>:
+ *		- misc fixes and code cleanups
+ *
+ * This module is based on the following documents
+ * (available from http://www.amd.com/):
+ *
+ *	Title:	BIOS and Kernel Developer's Guide for AMD Athlon 64 and AMD
+ *		Opteron Processors
+ *	AMD publication #: 26094
+ *`	Revision: 3.26
+ *
+ *	Title:	BIOS and Kernel Developer's Guide for AMD NPT Family 0Fh
+ *		Processors
+ *	AMD publication #: 32559
+ *	Revision: 3.00
+ *	Issue Date: May 2006
+ *
+ *	Title:	BIOS and Kernel Developer's Guide (BKDG) For AMD Family 10h
+ *		Processors
+ *	AMD publication #: 31116
+ *	Revision: 3.00
+ *	Issue Date: September 07, 2007
+ *
+ * Sections in the first 2 documents are no longer in sync with each other.
+ * The Family 10h BKDG was totally re-written from scratch with a new
+ * presentation model.
+ * Therefore, comments that refer to a Document section might be off.
+ */
+
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/pci_ids.h>
+#include <linux/slab.h>
+#include <linux/mmzone.h>
+#include <linux/edac.h>
+#include "edac_core.h"
+
+#define amd64_printk(level, fmt, arg...) \
+	edac_printk(level, "amd64", fmt, ##arg)
+
+#define amd64_mc_printk(mci, level, fmt, arg...) \
+	edac_mc_chipset_printk(mci, level, "amd64", fmt, ##arg)
+
+/*
+ * Throughout the comments in this code, the following terms are used:
+ *
+ *	SysAddr, DramAddr, and InputAddr
+ *
+ *  These terms come directly from the amd64 documentation
+ * (AMD publication #26094).  They are defined as follows:
+ *
+ *     SysAddr:
+ *         This is a physical address generated by a CPU core or a device
+ *         doing DMA.  If generated by a CPU core, a SysAddr is the result of
+ *         a virtual to physical address translation by the CPU core's address
+ *         translation mechanism (MMU).
+ *
+ *     DramAddr:
+ *         A DramAddr is derived from a SysAddr by subtracting an offset that
+ *         depends on which node the SysAddr maps to and whether the SysAddr
+ *         is within a range affected by memory hoisting.  The DRAM Base
+ *         (section 3.4.4.1) and DRAM Limit (section 3.4.4.2) registers
+ *         determine which node a SysAddr maps to.
+ *
+ *         If the DRAM Hole Address Register (DHAR) is enabled and the SysAddr
+ *         is within the range of addresses specified by this register, then
+ *         a value x from the DHAR is subtracted from the SysAddr to produce a
+ *         DramAddr.  Here, x represents the base address for the node that
+ *         the SysAddr maps to plus an offset due to memory hoisting.  See
+ *         section 3.4.8 and the comments in amd64_get_dram_hole_info() and
+ *         sys_addr_to_dram_addr() below for more information.
+ *
+ *         If the SysAddr is not affected by the DHAR then a value y is
+ *         subtracted from the SysAddr to produce a DramAddr.  Here, y is the
+ *         base address for the node that the SysAddr maps to.  See section
+ *         3.4.4 and the comments in sys_addr_to_dram_addr() below for more
+ *         information.
+ *
+ *     InputAddr:
+ *         A DramAddr is translated to an InputAddr before being passed to the
+ *         memory controller for the node that the DramAddr is associated
+ *         with.  The memory controller then maps the InputAddr to a csrow.
+ *         If node interleaving is not in use, then the InputAddr has the same
+ *         value as the DramAddr.  Otherwise, the InputAddr is produced by
+ *         discarding the bits used for node interleaving from the DramAddr.
+ *         See section 3.4.4 for more information.
+ *
+ *         The memory controller for a given node uses its DRAM CS Base and
+ *         DRAM CS Mask registers to map an InputAddr to a csrow.  See
+ *         sections 3.5.4 and 3.5.5 for more information.
+ */
+
+#define EDAC_AMD64_VERSION		" Ver: 3.2.0 " __DATE__
+#define EDAC_MOD_STR			"amd64_edac"
+
+/* Extended Model from CPUID, for CPU Revision numbers */
+#define OPTERON_CPU_LE_REV_C		0
+#define OPTERON_CPU_REV_D		1
+#define OPTERON_CPU_REV_E		2
+
+/* NPT processors have the following Extended Models */
+#define OPTERON_CPU_REV_F		4
+#define OPTERON_CPU_REV_FA		5
+
+/* Hardware limit on ChipSelect rows per MC and processors per system */
+#define CHIPSELECT_COUNT		8
+#define DRAM_REG_COUNT			8
+
+
+/*
+ * PCI-defined configuration space registers
+ */
+
+
+/*
+ * Function 1 - Address Map
+ */
+#define K8_DRAM_BASE_LOW		0x40
+#define K8_DRAM_LIMIT_LOW		0x44
+#define K8_DHAR				0xf0
+
+#define DHAR_VALID			BIT(0)
+#define F10_DRAM_MEM_HOIST_VALID	BIT(1)
+
+#define DHAR_BASE_MASK			0xff000000
+#define dhar_base(dhar)			(dhar & DHAR_BASE_MASK)
+
+#define K8_DHAR_OFFSET_MASK		0x0000ff00
+#define k8_dhar_offset(dhar)		((dhar & K8_DHAR_OFFSET_MASK) << 16)
+
+#define F10_DHAR_OFFSET_MASK		0x0000ff80
+					/* NOTE: Extra mask bit vs K8 */
+#define f10_dhar_offset(dhar)		((dhar & F10_DHAR_OFFSET_MASK) << 16)
+
+
+/* F10 High BASE/LIMIT registers */
+#define F10_DRAM_BASE_HIGH		0x140
+#define F10_DRAM_LIMIT_HIGH		0x144
+
+
+/*
+ * Function 2 - DRAM controller
+ */
+#define K8_DCSB0			0x40
+#define F10_DCSB1			0x140
+
+#define K8_DCSB_CS_ENABLE		BIT(0)
+#define K8_DCSB_NPT_SPARE		BIT(1)
+#define K8_DCSB_NPT_TESTFAIL		BIT(2)
+
+/*
+ * REV E: select [31:21] and [15:9] from DCSB and the shift amount to form
+ * the address
+ */
+#define REV_E_DCSB_BASE_BITS		(0xFFE0FE00ULL)
+#define REV_E_DCS_SHIFT			4
+#define REV_E_DCSM_COUNT		8
+
+#define REV_F_F1Xh_DCSB_BASE_BITS	(0x1FF83FE0ULL)
+#define REV_F_F1Xh_DCS_SHIFT		8
+
+/*
+ * REV F and later: selects [28:19] and [13:5] from DCSB and the shift amount
+ * to form the address
+ */
+#define REV_F_DCSB_BASE_BITS		(0x1FF83FE0ULL)
+#define REV_F_DCS_SHIFT			8
+#define REV_F_DCSM_COUNT		4
+#define F10_DCSM_COUNT			4
+#define F11_DCSM_COUNT			2
+
+/* DRAM CS Mask Registers */
+#define K8_DCSM0			0x60
+#define F10_DCSM1			0x160
+
+/* REV E: select [29:21] and [15:9] from DCSM */
+#define REV_E_DCSM_MASK_BITS		0x3FE0FE00
+
+/* unused bits [24:20] and [12:0] */
+#define REV_E_DCS_NOTUSED_BITS		0x01F01FFF
+
+/* REV F and later: select [28:19] and [13:5] from DCSM */
+#define REV_F_F1Xh_DCSM_MASK_BITS	0x1FF83FE0
+
+/* unused bits [26:22] and [12:0] */
+#define REV_F_F1Xh_DCS_NOTUSED_BITS	0x07C01FFF
+
+#define DBAM0				0x80
+#define DBAM1				0x180
+
+/* Extract the DIMM 'type' on the i'th DIMM from the DBAM reg value passed */
+#define DBAM_DIMM(i, reg)		((((reg) >> (4*i))) & 0xF)
+
+#define DBAM_MAX_VALUE			11
+
+
+#define F10_DCLR_0			0x90
+#define F10_DCLR_1			0x190
+#define REVE_WIDTH_128			BIT(16)
+#define F10_WIDTH_128			BIT(11)
+
+
+#define F10_DCHR_0			0x94
+#define F10_DCHR_1			0x194
+
+#define F10_DCHR_FOUR_RANK_DIMM		BIT(18)
+#define F10_DCHR_Ddr3Mode		BIT(8)
+#define F10_DCHR_MblMode		BIT(6)
+
+
+#define F10_DCTL_SEL_LOW		0x110
+
+#define dct_sel_baseaddr(pvt)    \
+	((pvt->dram_ctl_select_low) & 0xFFFFF800)
+
+#define dct_sel_interleave_addr(pvt)    \
+	(((pvt->dram_ctl_select_low) >> 6) & 0x3)
+
+enum {
+	F10_DCTL_SEL_LOW_DctSelHiRngEn	= BIT(0),
+	F10_DCTL_SEL_LOW_DctSelIntLvEn	= BIT(2),
+	F10_DCTL_SEL_LOW_DctGangEn	= BIT(4),
+	F10_DCTL_SEL_LOW_DctDatIntLv	= BIT(5),
+	F10_DCTL_SEL_LOW_DramEnable	= BIT(8),
+	F10_DCTL_SEL_LOW_MemCleared	= BIT(10),
+};
+
+#define    dct_high_range_enabled(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctSelHiRngEn)
+
+#define dct_interleave_enabled(pvt)	   \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctSelIntLvEn)
+
+#define dct_ganging_enabled(pvt)        \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctGangEn)
+
+#define dct_data_intlv_enabled(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DctDatIntLv)
+
+#define dct_dram_enabled(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_DramEnable)
+
+#define dct_memory_cleared(pvt)    \
+	(pvt->dram_ctl_select_low & F10_DCTL_SEL_LOW_MemCleared)
+
+
+#define F10_DCTL_SEL_HIGH		0x114
+
+
+/*
+ * Function 3 - Misc Control
+ */
+#define K8_NBCTL			0x40
+
+/* Correctable ECC error reporting enable */
+#define K8_NBCTL_CECCEn			BIT(0)
+
+/* UnCorrectable ECC error reporting enable */
+#define K8_NBCTL_UECCEn			BIT(1)
+
+#define K8_NBCFG			0x44
+#define K8_NBCFG_CHIPKILL		BIT(23)
+#define K8_NBCFG_ECC_ENABLE		BIT(22)
+
+#define K8_NBSL				0x48
+
+
+#define EXTRACT_HIGH_SYNDROME(x)	(((x) >> 24) & 0xff)
+#define EXTRACT_EXT_ERROR_CODE(x)	(((x) >> 16) & 0x1f)
+
+/* Family F10h: Normalized Extended Error Codes */
+#define F10_NBSL_EXT_ERR_RES		0x0
+#define F10_NBSL_EXT_ERR_CRC		0x1
+#define F10_NBSL_EXT_ERR_SYNC		0x2
+#define F10_NBSL_EXT_ERR_MST		0x3
+#define F10_NBSL_EXT_ERR_TGT		0x4
+#define F10_NBSL_EXT_ERR_GART		0x5
+#define F10_NBSL_EXT_ERR_RMW		0x6
+#define F10_NBSL_EXT_ERR_WDT		0x7
+#define F10_NBSL_EXT_ERR_ECC		0x8
+#define F10_NBSL_EXT_ERR_DEV		0x9
+#define F10_NBSL_EXT_ERR_LINK_DATA	0xA
+
+/* Next two are overloaded values */
+#define F10_NBSL_EXT_ERR_LINK_PROTO	0xB
+#define F10_NBSL_EXT_ERR_L3_PROTO	0xB
+
+#define F10_NBSL_EXT_ERR_NB_ARRAY	0xC
+#define F10_NBSL_EXT_ERR_DRAM_PARITY	0xD
+#define F10_NBSL_EXT_ERR_LINK_RETRY	0xE
+
+/* Next two are overloaded values */
+#define F10_NBSL_EXT_ERR_GART_WALK	0xF
+#define F10_NBSL_EXT_ERR_DEV_WALK	0xF
+
+/* 0x10 to 0x1B: Reserved */
+#define F10_NBSL_EXT_ERR_L3_DATA	0x1C
+#define F10_NBSL_EXT_ERR_L3_TAG		0x1D
+#define F10_NBSL_EXT_ERR_L3_LRU		0x1E
+
+/* K8: Normalized Extended Error Codes */
+#define K8_NBSL_EXT_ERR_ECC		0x0
+#define K8_NBSL_EXT_ERR_CRC		0x1
+#define K8_NBSL_EXT_ERR_SYNC		0x2
+#define K8_NBSL_EXT_ERR_MST		0x3
+#define K8_NBSL_EXT_ERR_TGT		0x4
+#define K8_NBSL_EXT_ERR_GART		0x5
+#define K8_NBSL_EXT_ERR_RMW		0x6
+#define K8_NBSL_EXT_ERR_WDT		0x7
+#define K8_NBSL_EXT_ERR_CHIPKILL_ECC	0x8
+#define K8_NBSL_EXT_ERR_DRAM_PARITY	0xD
+
+#define EXTRACT_ERROR_CODE(x)		((x) & 0xffff)
+#define	TEST_TLB_ERROR(x)		(((x) & 0xFFF0) == 0x0010)
+#define	TEST_MEM_ERROR(x)		(((x) & 0xFF00) == 0x0100)
+#define	TEST_BUS_ERROR(x)		(((x) & 0xF800) == 0x0800)
+#define	EXTRACT_TT_CODE(x)		(((x) >> 2) & 0x3)
+#define	EXTRACT_II_CODE(x)		(((x) >> 2) & 0x3)
+#define	EXTRACT_LL_CODE(x)		(((x) >> 0) & 0x3)
+#define	EXTRACT_RRRR_CODE(x)		(((x) >> 4) & 0xf)
+#define	EXTRACT_TO_CODE(x)		(((x) >> 8) & 0x1)
+#define	EXTRACT_PP_CODE(x)		(((x) >> 9) & 0x3)
+
+/*
+ * The following are for BUS type errors AFTER values have been normalized by
+ * shifting right
+ */
+#define K8_NBSL_PP_SRC			0x0
+#define K8_NBSL_PP_RES			0x1
+#define K8_NBSL_PP_OBS			0x2
+#define K8_NBSL_PP_GENERIC		0x3
+
+
+#define K8_NBSH				0x4C
+
+#define K8_NBSH_VALID_BIT		BIT(31)
+#define K8_NBSH_OVERFLOW		BIT(30)
+#define K8_NBSH_UNCORRECTED_ERR		BIT(29)
+#define K8_NBSH_ERR_ENABLE		BIT(28)
+#define K8_NBSH_MISC_ERR_VALID		BIT(27)
+#define K8_NBSH_VALID_ERROR_ADDR	BIT(26)
+#define K8_NBSH_PCC			BIT(25)
+#define K8_NBSH_CECC			BIT(14)
+#define K8_NBSH_UECC			BIT(13)
+#define K8_NBSH_ERR_SCRUBER		BIT(8)
+#define K8_NBSH_CORE3			BIT(3)
+#define K8_NBSH_CORE2			BIT(2)
+#define K8_NBSH_CORE1			BIT(1)
+#define K8_NBSH_CORE0			BIT(0)
+
+#define EXTRACT_LDT_LINK(x)		(((x) >> 4) & 0x7)
+#define EXTRACT_ERR_CPU_MAP(x)		((x) & 0xF)
+#define EXTRACT_LOW_SYNDROME(x)		(((x) >> 15) & 0xff)
+
+
+#define K8_NBEAL			0x50
+#define K8_NBEAH			0x54
+#define K8_SCRCTRL			0x58
+
+#define F10_NB_CFG_LOW			0x88
+#define	F10_NB_CFG_LOW_ENABLE_EXT_CFG	BIT(14)
+
+#define F10_NB_CFG_HIGH			0x8C
+
+#define F10_ONLINE_SPARE		0xB0
+#define F10_ONLINE_SPARE_SWAPDONE0(x)	((x) & BIT(1))
+#define F10_ONLINE_SPARE_SWAPDONE1(x)	((x) & BIT(3))
+#define F10_ONLINE_SPARE_BADDRAM_CS0(x) (((x) >> 4) & 0x00000007)
+#define F10_ONLINE_SPARE_BADDRAM_CS1(x) (((x) >> 8) & 0x00000007)
+
+#define F10_NB_ARRAY_ADDR		0xB8
+
+#define F10_NB_ARRAY_DRAM_ECC		0x80000000
+
+/* Bits [2:1] are used to select 16-byte section within a 64-byte cacheline  */
+#define SET_NB_ARRAY_ADDRESS(section)	(((section) & 0x3) << 1)
+
+#define F10_NB_ARRAY_DATA		0xBC
+
+#define SET_NB_DRAM_INJECTION_WRITE(word, bits)  \
+					(BIT(((word) & 0xF) + 20) | \
+					BIT(17) |  \
+					((bits) & 0xF))
+
+#define SET_NB_DRAM_INJECTION_READ(word, bits)  \
+					(BIT(((word) & 0xF) + 20) | \
+					BIT(16) |  \
+					((bits) & 0xF))
+
+#define K8_NBCAP			0xE8
+#define K8_NBCAP_CORES			(BIT(12)|BIT(13))
+#define K8_NBCAP_CHIPKILL		BIT(4)
+#define K8_NBCAP_SECDED			BIT(3)
+#define K8_NBCAP_8_NODE			BIT(2)
+#define K8_NBCAP_DUAL_NODE		BIT(1)
+#define K8_NBCAP_DCT_DUAL		BIT(0)
+
+/*
+ * MSR Regs
+ */
+#define K8_MSR_MCGCTL			0x017b
+#define K8_MSR_MCGCTL_NBE		BIT(4)
+
+#define K8_MSR_MC4CTL			0x0410
+#define K8_MSR_MC4STAT			0x0411
+#define K8_MSR_MC4ADDR			0x0412
+
+/* AMD sets the first MC device at device ID 0x18. */
+static inline int get_mc_node_id_from_pdev(struct pci_dev *pdev)
+{
+	return PCI_SLOT(pdev->devfn) - 0x18;
+}
+
+enum amd64_chipset_families {
+	K8_CPUS = 0,
+	F10_CPUS,
+	F11_CPUS,
+};
+
+/*
+ * Structure to hold:
+ *
+ * 1) dynamically read status and error address HW registers
+ * 2) sysfs entered values
+ * 3) MCE values
+ *
+ * Depends on entry into the modules
+ */
+struct amd64_error_info_regs {
+	u32 nbcfg;
+	u32 nbsh;
+	u32 nbsl;
+	u32 nbeah;
+	u32 nbeal;
+};
+
+/* Error injection control structure */
+struct error_injection {
+	u32	section;
+	u32	word;
+	u32	bit_map;
+};
+
+struct amd64_pvt {
+	/* pci_device handles which we utilize */
+	struct pci_dev *addr_f1_ctl;
+	struct pci_dev *dram_f2_ctl;
+	struct pci_dev *misc_f3_ctl;
+
+	int mc_node_id;		/* MC index of this MC node */
+	int ext_model;		/* extended model value of this node */
+
+	struct low_ops *ops;	/* pointer to per PCI Device ID func table */
+
+	int channel_count;
+
+	/* Raw registers */
+	u32 dclr0;		/* DRAM Configuration Low DCT0 reg */
+	u32 dclr1;		/* DRAM Configuration Low DCT1 reg */
+	u32 dchr0;		/* DRAM Configuration High DCT0 reg */
+	u32 dchr1;		/* DRAM Configuration High DCT1 reg */
+	u32 nbcap;		/* North Bridge Capabilities */
+	u32 nbcfg;		/* F10 North Bridge Configuration */
+	u32 ext_nbcfg;		/* Extended F10 North Bridge Configuration */
+	u32 dhar;		/* DRAM Hoist reg */
+	u32 dbam0;		/* DRAM Base Address Mapping reg for DCT0 */
+	u32 dbam1;		/* DRAM Base Address Mapping reg for DCT1 */
+
+	/* DRAM CS Base Address Registers F2x[1,0][5C:40] */
+	u32 dcsb0[CHIPSELECT_COUNT];
+	u32 dcsb1[CHIPSELECT_COUNT];
+
+	/* DRAM CS Mask Registers F2x[1,0][6C:60] */
+	u32 dcsm0[CHIPSELECT_COUNT];
+	u32 dcsm1[CHIPSELECT_COUNT];
+
+	/*
+	 * Decoded parts of DRAM BASE and LIMIT Registers
+	 * F1x[78,70,68,60,58,50,48,40]
+	 */
+	u64 dram_base[DRAM_REG_COUNT];
+	u64 dram_limit[DRAM_REG_COUNT];
+	u8  dram_IntlvSel[DRAM_REG_COUNT];
+	u8  dram_IntlvEn[DRAM_REG_COUNT];
+	u8  dram_DstNode[DRAM_REG_COUNT];
+	u8  dram_rw_en[DRAM_REG_COUNT];
+
+	/*
+	 * The following fields are set at (load) run time, after CPU revision
+	 * has been determined, since the dct_base and dct_mask registers vary
+	 * based on revision
+	 */
+	u32 dcsb_base;		/* DCSB base bits */
+	u32 dcsm_mask;		/* DCSM mask bits */
+	u32 num_dcsm;		/* Number of DCSM registers */
+	u32 dcs_mask_notused;	/* DCSM notused mask bits */
+	u32 dcs_shift;		/* DCSB and DCSM shift value */
+
+	u64 top_mem;		/* top of memory below 4GB */
+	u64 top_mem2;		/* top of memory above 4GB */
+
+	u32 dram_ctl_select_low;	/* DRAM Controller Select Low Reg */
+	u32 dram_ctl_select_high;	/* DRAM Controller Select High Reg */
+	u32 online_spare;               /* On-Line spare Reg */
+
+	/* temp storage for when input is received from sysfs */
+	struct amd64_error_info_regs ctl_error_info;
+
+	/* place to store error injection parameters prior to issue */
+	struct error_injection injection;
+
+	/* Save old hw registers' values before we modified them */
+	u32 nbctl_mcgctl_saved;		/* When true, following 2 are valid */
+	u32 old_nbctl;
+	u32 *old_mcgctl;		/* per core on this node */
+
+	/* MC Type Index value: socket F vs Family 10h */
+	u32 mc_type_index;
+
+	/* misc settings */
+	struct flags {
+		unsigned long cf8_extcfg:1;
+	} flags;
+};
+
+struct scrubrate {
+       u32 scrubval;           /* bit pattern for scrub rate */
+       u32 bandwidth;          /* bandwidth consumed (bytes/sec) */
+};
+
+extern struct scrubrate scrubrates[23];
+extern u32 revf_quad_ddr2_shift[16];
+extern const char *tt_msgs[4];
+extern const char *ll_msgs[4];
+extern const char *rrrr_msgs[16];
+extern const char *to_msgs[2];
+extern const char *pp_msgs[4];
+extern const char *ii_msgs[4];
+extern const char *ext_msgs[32];
+extern const char *htlink_msgs[8];
+
+/*
+ * Each of the PCI Device IDs types have their own set of hardware accessor
+ * functions and per device encoding/decoding logic.
+ */
+struct low_ops {
+	int (*probe_valid_hardware)(struct amd64_pvt *pvt);
+	int (*early_channel_count)(struct amd64_pvt *pvt);
+
+	u64 (*get_error_address)(struct mem_ctl_info *mci,
+			struct amd64_error_info_regs *info);
+	void (*read_dram_base_limit)(struct amd64_pvt *pvt, int dram);
+	void (*read_dram_ctl_register)(struct amd64_pvt *pvt);
+	void (*map_sysaddr_to_csrow)(struct mem_ctl_info *mci,
+					struct amd64_error_info_regs *info,
+					u64 SystemAddr);
+	int (*dbam_map_to_pages)(struct amd64_pvt *pvt, int dram_map);
+};
+
+struct amd64_family_type {
+	const char *ctl_name;
+	u16 addr_f1_ctl;
+	u16 misc_f3_ctl;
+	struct low_ops ops;
+};
+
+static struct amd64_family_type amd64_family_types[];
+
+static inline const char *get_amd_family_name(int index)
+{
+	return amd64_family_types[index].ctl_name;
+}
+
+static inline struct low_ops *family_ops(int index)
+{
+	return &amd64_family_types[index].ops;
+}
+
+/*
+ * For future CPU versions, verify the following as new 'slow' rates appear and
+ * modify the necessary skip values for the supported CPU.
+ */
+#define K8_MIN_SCRUB_RATE_BITS	0x0
+#define F10_MIN_SCRUB_RATE_BITS	0x5
+#define F11_MIN_SCRUB_RATE_BITS	0x6
+
+int amd64_process_error_info(struct mem_ctl_info *mci,
+			     struct amd64_error_info_regs *info,
+			     int handle_errors);
+int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
+			     u64 *hole_offset, u64 *hole_size);
-- 
GitLab


From fd3d6780f7358225a6ea1ffb1bb8fd8b2d6df446 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 19:50:23 +0200
Subject: [PATCH 1849/2198] amd64_edac: add debugging/testing code

This is for dumping different registers and testing the address mapping
logic using the ECC syndromes.

Borislav:

- split sysfs attrs per file
- use more conform names for the sysfs attrs
- fix function return value patterns
- cleanup/fix comments
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac_dbg.c | 255 ++++++++++++++++++++++++++++++++++
 1 file changed, 255 insertions(+)
 create mode 100644 drivers/edac/amd64_edac_dbg.c

diff --git a/drivers/edac/amd64_edac_dbg.c b/drivers/edac/amd64_edac_dbg.c
new file mode 100644
index 0000000000000..0a41b248a4adc
--- /dev/null
+++ b/drivers/edac/amd64_edac_dbg.c
@@ -0,0 +1,255 @@
+#include "amd64_edac.h"
+
+/*
+ * accept a hex value and store it into the virtual error register file, field:
+ * nbeal and nbeah. Assume virtual error values have already been set for: NBSL,
+ * NBSH and NBCFG. Then proceed to map the error values to a MC, CSROW and
+ * CHANNEL
+ */
+static ssize_t amd64_nbea_store(struct mem_ctl_info *mci, const char *data,
+				size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long long value;
+	int ret = 0;
+
+	ret = strict_strtoull(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBEA= 0x%llx\n", value);
+
+		/* place the value into the virtual error packet */
+		pvt->ctl_error_info.nbeal = (u32) value;
+		value >>= 32;
+		pvt->ctl_error_info.nbeah = (u32) value;
+
+		/* Process the Mapping request */
+		/* TODO: Add race prevention */
+		amd64_process_error_info(mci, &pvt->ctl_error_info, 1);
+
+		return count;
+	}
+	return ret;
+}
+
+/* display back what the last NBEA (MCA NB Address (MC4_ADDR)) was written */
+static ssize_t amd64_nbea_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 value;
+
+	value = pvt->ctl_error_info.nbeah;
+	value <<= 32;
+	value |= pvt->ctl_error_info.nbeal;
+
+	return sprintf(data, "%llx\n", value);
+}
+
+/* store the NBSL (MCA NB Status Low (MC4_STATUS)) value user desires */
+static ssize_t amd64_nbsl_store(struct mem_ctl_info *mci, const char *data,
+				size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBSL= 0x%lx\n", value);
+
+		pvt->ctl_error_info.nbsl = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/* display back what the last NBSL value written */
+static ssize_t amd64_nbsl_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 value;
+
+	value = pvt->ctl_error_info.nbsl;
+
+	return sprintf(data, "%x\n", value);
+}
+
+/* store the NBSH (MCA NB Status High) value user desires */
+static ssize_t amd64_nbsh_store(struct mem_ctl_info *mci, const char *data,
+				size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBSH= 0x%lx\n", value);
+
+		pvt->ctl_error_info.nbsh = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/* display back what the last NBSH value written */
+static ssize_t amd64_nbsh_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 value;
+
+	value = pvt->ctl_error_info.nbsh;
+
+	return sprintf(data, "%x\n", value);
+}
+
+/* accept and store the NBCFG (MCA NB Configuration) value user desires */
+static ssize_t amd64_nbcfg_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+		debugf0("received NBCFG= 0x%lx\n", value);
+
+		pvt->ctl_error_info.nbcfg = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/* various show routines for the controls of a MCI */
+static ssize_t amd64_nbcfg_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%x\n", pvt->ctl_error_info.nbcfg);
+}
+
+
+static ssize_t amd64_dhar_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%x\n", pvt->dhar);
+}
+
+
+static ssize_t amd64_dbam_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%x\n", pvt->dbam0);
+}
+
+
+static ssize_t amd64_topmem_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%llx\n", pvt->top_mem);
+}
+
+
+static ssize_t amd64_topmem2_show(struct mem_ctl_info *mci, char *data)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return sprintf(data, "%llx\n", pvt->top_mem2);
+}
+
+static ssize_t amd64_hole_show(struct mem_ctl_info *mci, char *data)
+{
+	u64 hole_base = 0;
+	u64 hole_offset = 0;
+	u64 hole_size = 0;
+
+	amd64_get_dram_hole_info(mci, &hole_base, &hole_offset, &hole_size);
+
+	return sprintf(data, "%llx %llx %llx\n", hole_base, hole_offset,
+						 hole_size);
+}
+
+/*
+ * update NUM_DBG_ATTRS in case you add new members
+ */
+struct mcidev_sysfs_attribute amd64_dbg_attrs[] = {
+
+	{
+		.attr = {
+			.name = "nbea_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbea_show,
+		.store = amd64_nbea_store,
+	},
+	{
+		.attr = {
+			.name = "nbsl_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbsl_show,
+		.store = amd64_nbsl_store,
+	},
+	{
+		.attr = {
+			.name = "nbsh_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbsh_show,
+		.store = amd64_nbsh_store,
+	},
+	{
+		.attr = {
+			.name = "nbcfg_ctl",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = amd64_nbcfg_show,
+		.store = amd64_nbcfg_store,
+	},
+	{
+		.attr = {
+			.name = "dhar",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_dhar_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "dbam",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_dbam_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "topmem",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_topmem_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "topmem2",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_topmem2_show,
+		.store = NULL,
+	},
+	{
+		.attr = {
+			.name = "dram_hole",
+			.mode = (S_IRUGO)
+		},
+		.show = amd64_hole_show,
+		.store = NULL,
+	},
+};
-- 
GitLab


From eb919690be994386eac326f8c53c4540602de563 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Tue, 5 May 2009 20:07:11 +0200
Subject: [PATCH 1850/2198] amd64_edac: add DRAM error injection logic using
 sysfs

Borislav:
- rename sysfs attrs to more conform names
- cleanup/fix comments according to BKDG text
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac_inj.c | 185 ++++++++++++++++++++++++++++++++++
 1 file changed, 185 insertions(+)
 create mode 100644 drivers/edac/amd64_edac_inj.c

diff --git a/drivers/edac/amd64_edac_inj.c b/drivers/edac/amd64_edac_inj.c
new file mode 100644
index 0000000000000..d3675b76b3a7d
--- /dev/null
+++ b/drivers/edac/amd64_edac_inj.c
@@ -0,0 +1,185 @@
+#include "amd64_edac.h"
+
+/*
+ * store error injection section value which refers to one of 4 16-byte sections
+ * within a 64-byte cacheline
+ *
+ * range: 0..3
+ */
+static ssize_t amd64_inject_section_store(struct mem_ctl_info *mci,
+					  const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+		pvt->injection.section = (u32) value;
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * store error injection word value which refers to one of 9 16-bit word of the
+ * 16-byte (128-bit + ECC bits) section
+ *
+ * range: 0..8
+ */
+static ssize_t amd64_inject_word_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+
+		value = (value <= 8) ? value : 0;
+		pvt->injection.word = (u32) value;
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * store 16 bit error injection vector which enables injecting errors to the
+ * corresponding bit within the error injection word above. When used during a
+ * DRAM ECC read, it holds the contents of the of the DRAM ECC bits.
+ */
+static ssize_t amd64_inject_ecc_vector_store(struct mem_ctl_info *mci,
+					     const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 16, &value);
+	if (ret != -EINVAL) {
+
+		pvt->injection.bit_map = (u32) value & 0xFFFF;
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * Do a DRAM ECC read. Assemble staged values in the pvt area, format into
+ * fields needed by the injection registers and read the NB Array Data Port.
+ */
+static ssize_t amd64_inject_read_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	u32 section, word_bits;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+
+		/* Form value to choose 16-byte section of cacheline */
+		section = F10_NB_ARRAY_DRAM_ECC |
+				SET_NB_ARRAY_ADDRESS(pvt->injection.section);
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_ADDR, section);
+
+		word_bits = SET_NB_DRAM_INJECTION_READ(pvt->injection.word,
+						pvt->injection.bit_map);
+
+		/* Issue 'word' and 'bit' along with the READ request */
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_DATA, word_bits);
+
+		debugf0("section=0x%x word_bits=0x%x\n", section, word_bits);
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * Do a DRAM ECC write. Assemble staged values in the pvt area and format into
+ * fields needed by the injection registers.
+ */
+static ssize_t amd64_inject_write_store(struct mem_ctl_info *mci,
+					const char *data, size_t count)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	unsigned long value;
+	u32 section, word_bits;
+	int ret = 0;
+
+	ret = strict_strtoul(data, 10, &value);
+	if (ret != -EINVAL) {
+
+		/* Form value to choose 16-byte section of cacheline */
+		section = F10_NB_ARRAY_DRAM_ECC |
+				SET_NB_ARRAY_ADDRESS(pvt->injection.section);
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_ADDR, section);
+
+		word_bits = SET_NB_DRAM_INJECTION_WRITE(pvt->injection.word,
+						pvt->injection.bit_map);
+
+		/* Issue 'word' and 'bit' along with the READ request */
+		pci_write_config_dword(pvt->misc_f3_ctl,
+					F10_NB_ARRAY_DATA, word_bits);
+
+		debugf0("section=0x%x word_bits=0x%x\n", section, word_bits);
+
+		return count;
+	}
+	return ret;
+}
+
+/*
+ * update NUM_INJ_ATTRS in case you add new members
+ */
+struct mcidev_sysfs_attribute amd64_inj_attrs[] = {
+
+	{
+		.attr = {
+			.name = "inject_section",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_section_store,
+	},
+	{
+		.attr = {
+			.name = "inject_word",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_word_store,
+	},
+	{
+		.attr = {
+			.name = "inject_ecc_vector",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_ecc_vector_store,
+	},
+	{
+		.attr = {
+			.name = "inject_write",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_write_store,
+	},
+	{
+		.attr = {
+			.name = "inject_read",
+			.mode = (S_IRUGO | S_IWUSR)
+		},
+		.show = NULL,
+		.store = amd64_inject_read_store,
+	},
+};
-- 
GitLab


From b52401cecedf669211f0b7a5095abcb6d3fc82c2 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Wed, 6 May 2009 17:57:20 +0200
Subject: [PATCH 1851/2198] amd64_edac: add MCA error types

Borislav:
- cleanup comments

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac_err_types.c | 161 ++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)
 create mode 100644 drivers/edac/amd64_edac_err_types.c

diff --git a/drivers/edac/amd64_edac_err_types.c b/drivers/edac/amd64_edac_err_types.c
new file mode 100644
index 0000000000000..f212ff12a9d80
--- /dev/null
+++ b/drivers/edac/amd64_edac_err_types.c
@@ -0,0 +1,161 @@
+#include "amd64_edac.h"
+
+/*
+ * See F2x80 for K8 and F2x[1,0]80 for Fam10 and later. The table below is only
+ * for DDR2 DRAM mapping.
+ */
+u32 revf_quad_ddr2_shift[] = {
+	0,	/* 0000b NULL DIMM (128mb) */
+	28,	/* 0001b 256mb */
+	29,	/* 0010b 512mb */
+	29,	/* 0011b 512mb */
+	29,	/* 0100b 512mb */
+	30,	/* 0101b 1gb */
+	30,	/* 0110b 1gb */
+	31,	/* 0111b 2gb */
+	31,	/* 1000b 2gb */
+	32,	/* 1001b 4gb */
+	32,	/* 1010b 4gb */
+	33,	/* 1011b 8gb */
+	0,	/* 1100b future */
+	0,	/* 1101b future */
+	0,	/* 1110b future */
+	0	/* 1111b future */
+};
+
+/*
+ * Valid scrub rates for the K8 hardware memory scrubber. We map the scrubbing
+ * bandwidth to a valid bit pattern. The 'set' operation finds the 'matching-
+ * or higher value'.
+ *
+ *FIXME: Produce a better mapping/linearisation.
+ */
+
+struct scrubrate scrubrates[] = {
+	{ 0x01, 1600000000UL},
+	{ 0x02, 800000000UL},
+	{ 0x03, 400000000UL},
+	{ 0x04, 200000000UL},
+	{ 0x05, 100000000UL},
+	{ 0x06, 50000000UL},
+	{ 0x07, 25000000UL},
+	{ 0x08, 12284069UL},
+	{ 0x09, 6274509UL},
+	{ 0x0A, 3121951UL},
+	{ 0x0B, 1560975UL},
+	{ 0x0C, 781440UL},
+	{ 0x0D, 390720UL},
+	{ 0x0E, 195300UL},
+	{ 0x0F, 97650UL},
+	{ 0x10, 48854UL},
+	{ 0x11, 24427UL},
+	{ 0x12, 12213UL},
+	{ 0x13, 6101UL},
+	{ 0x14, 3051UL},
+	{ 0x15, 1523UL},
+	{ 0x16, 761UL},
+	{ 0x00, 0UL},        /* scrubbing off */
+};
+
+/*
+ * string representation for the different MCA reported error types, see F3x48
+ * or MSR0000_0411.
+ */
+const char *tt_msgs[] = {        /* transaction type */
+	"instruction",
+	"data",
+	"generic",
+	"reserved"
+};
+
+const char *ll_msgs[] = {	/* cache level */
+	"L0",
+	"L1",
+	"L2",
+	"L3/generic"
+};
+
+const char *rrrr_msgs[] = {
+	"generic",
+	"generic read",
+	"generic write",
+	"data read",
+	"data write",
+	"inst fetch",
+	"prefetch",
+	"evict",
+	"snoop",
+	"reserved RRRR= 9",
+	"reserved RRRR= 10",
+	"reserved RRRR= 11",
+	"reserved RRRR= 12",
+	"reserved RRRR= 13",
+	"reserved RRRR= 14",
+	"reserved RRRR= 15"
+};
+
+const char *pp_msgs[] = {	/* participating processor */
+	"local node originated (SRC)",
+	"local node responded to request (RES)",
+	"local node observed as 3rd party (OBS)",
+	"generic"
+};
+
+const char *to_msgs[] = {
+	"no timeout",
+	"timed out"
+};
+
+const char *ii_msgs[] = {	/* memory or i/o */
+	"mem access",
+	"reserved",
+	"i/o access",
+	"generic"
+};
+
+/* Map the 5 bits of Extended Error code to the string table. */
+const char *ext_msgs[] = {	/* extended error */
+	"K8 ECC error/F10 reserved",	/* 0_0000b */
+	"CRC error",			/* 0_0001b */
+	"sync error",			/* 0_0010b */
+	"mst abort",			/* 0_0011b */
+	"tgt abort",			/* 0_0100b */
+	"GART error",			/* 0_0101b */
+	"RMW error",			/* 0_0110b */
+	"Wdog timer error",		/* 0_0111b */
+	"F10-ECC/K8-Chipkill error",	/* 0_1000b */
+	"DEV Error",			/* 0_1001b */
+	"Link Data error",		/* 0_1010b */
+	"Link or L3 Protocol error",	/* 0_1011b */
+	"NB Array error",		/* 0_1100b */
+	"DRAM Parity error",		/* 0_1101b */
+	"Link Retry/GART Table Walk/DEV Table Walk error", /* 0_1110b */
+	"Res 0x0ff error",		/* 0_1111b */
+	"Res 0x100 error",		/* 1_0000b */
+	"Res 0x101 error",		/* 1_0001b */
+	"Res 0x102 error",		/* 1_0010b */
+	"Res 0x103 error",		/* 1_0011b */
+	"Res 0x104 error",		/* 1_0100b */
+	"Res 0x105 error",		/* 1_0101b */
+	"Res 0x106 error",		/* 1_0110b */
+	"Res 0x107 error",		/* 1_0111b */
+	"Res 0x108 error",		/* 1_1000b */
+	"Res 0x109 error",		/* 1_1001b */
+	"Res 0x10A error",		/* 1_1010b */
+	"Res 0x10B error",		/* 1_1011b */
+	"L3 Cache Data error",		/* 1_1100b */
+	"L3 CacheTag error",		/* 1_1101b */
+	"L3 Cache LRU error",		/* 1_1110b */
+	"Res 0x1FF error"		/* 1_1111b */
+};
+
+const char *htlink_msgs[] = {
+	"none",
+	"1",
+	"2",
+	"1 2",
+	"3",
+	"1 3",
+	"2 3",
+	"1 2 3"
+};
-- 
GitLab


From 2bc65418724a0ddbb20c9eeb9de8285af7748c16 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 4 May 2009 20:11:14 +0200
Subject: [PATCH 1852/2198] amd64_edac: add memory scrubber interface

Borislav:
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 130 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 130 insertions(+)
 create mode 100644 drivers/edac/amd64_edac.c

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
new file mode 100644
index 0000000000000..6876d4359794e
--- /dev/null
+++ b/drivers/edac/amd64_edac.c
@@ -0,0 +1,130 @@
+#include "amd64_edac.h"
+
+static struct edac_pci_ctl_info *amd64_ctl_pci;
+
+static int report_gart_errors;
+module_param(report_gart_errors, int, 0644);
+
+/*
+ * Set by command line parameter. If BIOS has enabled the ECC, this override is
+ * cleared to prevent re-enabling the hardware by this driver.
+ */
+static int ecc_enable_override;
+module_param(ecc_enable_override, int, 0644);
+
+/* Lookup table for all possible MC control instances */
+struct amd64_pvt;
+static struct mem_ctl_info *mci_lookup[MAX_NUMNODES];
+static struct amd64_pvt *pvt_lookup[MAX_NUMNODES];
+
+/*
+ * Memory scrubber control interface. For K8, memory scrubbing is handled by
+ * hardware and can involve L2 cache, dcache as well as the main memory. With
+ * F10, this is extended to L3 cache scrubbing on CPU models sporting that
+ * functionality.
+ *
+ * This causes the "units" for the scrubbing speed to vary from 64 byte blocks
+ * (dram) over to cache lines. This is nasty, so we will use bandwidth in
+ * bytes/sec for the setting.
+ *
+ * Currently, we only do dram scrubbing. If the scrubbing is done in software on
+ * other archs, we might not have access to the caches directly.
+ */
+
+/*
+ * scan the scrub rate mapping table for a close or matching bandwidth value to
+ * issue. If requested is too big, then use last maximum value found.
+ */
+static int amd64_search_set_scrub_rate(struct pci_dev *ctl, u32 new_bw,
+				       u32 min_scrubrate)
+{
+	u32 scrubval;
+	int i;
+
+	/*
+	 * map the configured rate (new_bw) to a value specific to the AMD64
+	 * memory controller and apply to register. Search for the first
+	 * bandwidth entry that is greater or equal than the setting requested
+	 * and program that. If at last entry, turn off DRAM scrubbing.
+	 */
+	for (i = 0; i < ARRAY_SIZE(scrubrates); i++) {
+		/*
+		 * skip scrub rates which aren't recommended
+		 * (see F10 BKDG, F3x58)
+		 */
+		if (scrubrates[i].scrubval < min_scrubrate)
+			continue;
+
+		if (scrubrates[i].bandwidth <= new_bw)
+			break;
+
+		/*
+		 * if no suitable bandwidth found, turn off DRAM scrubbing
+		 * entirely by falling back to the last element in the
+		 * scrubrates array.
+		 */
+	}
+
+	scrubval = scrubrates[i].scrubval;
+	if (scrubval)
+		edac_printk(KERN_DEBUG, EDAC_MC,
+			    "Setting scrub rate bandwidth: %u\n",
+			    scrubrates[i].bandwidth);
+	else
+		edac_printk(KERN_DEBUG, EDAC_MC, "Turning scrubbing off.\n");
+
+	pci_write_bits32(ctl, K8_SCRCTRL, scrubval, 0x001F);
+
+	return 0;
+}
+
+static int amd64_set_scrub_rate(struct mem_ctl_info *mci, u32 *bandwidth)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 min_scrubrate = 0x0;
+
+	switch (boot_cpu_data.x86) {
+	case 0xf:
+		min_scrubrate = K8_MIN_SCRUB_RATE_BITS;
+		break;
+	case 0x10:
+		min_scrubrate = F10_MIN_SCRUB_RATE_BITS;
+		break;
+	case 0x11:
+		min_scrubrate = F11_MIN_SCRUB_RATE_BITS;
+		break;
+
+	default:
+		amd64_printk(KERN_ERR, "Unsupported family!\n");
+		break;
+	}
+	return amd64_search_set_scrub_rate(pvt->misc_f3_ctl, *bandwidth,
+			min_scrubrate);
+}
+
+static int amd64_get_scrub_rate(struct mem_ctl_info *mci, u32 *bw)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 scrubval = 0;
+	int status = -1, i, ret = 0;
+
+	ret = pci_read_config_dword(pvt->misc_f3_ctl, K8_SCRCTRL, &scrubval);
+	if (ret)
+		debugf0("Reading K8_SCRCTRL failed\n");
+
+	scrubval = scrubval & 0x001F;
+
+	edac_printk(KERN_DEBUG, EDAC_MC,
+		    "pci-read, sdram scrub control value: %d \n", scrubval);
+
+	for (i = 0; ARRAY_SIZE(scrubrates); i++) {
+		if (scrubrates[i].scrubval == scrubval) {
+			*bw = scrubrates[i].bandwidth;
+			status = 0;
+			break;
+		}
+	}
+
+	return status;
+}
+
-- 
GitLab


From 6775763a2377e1ea865299b6daadc875d622de3f Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 15:53:22 +0200
Subject: [PATCH 1853/2198] amd64_edac: add sys addr to memory controller
 mapping helpers

Borislav:

- cleanup comments
- cleanup debug calls
- simplify find_mc_by_sys_addr's exit path

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 141 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 141 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 6876d4359794e..a774f34d508bd 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -128,3 +128,144 @@ static int amd64_get_scrub_rate(struct mem_ctl_info *mci, u32 *bw)
 	return status;
 }
 
+/* Map from a CSROW entry to the mask entry that operates on it */
+static inline u32 amd64_map_to_dcs_mask(struct amd64_pvt *pvt, int csrow)
+{
+	return csrow >> (pvt->num_dcsm >> 3);
+}
+
+/* return the 'base' address the i'th CS entry of the 'dct' DRAM controller */
+static u32 amd64_get_dct_base(struct amd64_pvt *pvt, int dct, int csrow)
+{
+	if (dct == 0)
+		return pvt->dcsb0[csrow];
+	else
+		return pvt->dcsb1[csrow];
+}
+
+/*
+ * Return the 'mask' address the i'th CS entry. This function is needed because
+ * there number of DCSM registers on Rev E and prior vs Rev F and later is
+ * different.
+ */
+static u32 amd64_get_dct_mask(struct amd64_pvt *pvt, int dct, int csrow)
+{
+	if (dct == 0)
+		return pvt->dcsm0[amd64_map_to_dcs_mask(pvt, csrow)];
+	else
+		return pvt->dcsm1[amd64_map_to_dcs_mask(pvt, csrow)];
+}
+
+
+/*
+ * In *base and *limit, pass back the full 40-bit base and limit physical
+ * addresses for the node given by node_id.  This information is obtained from
+ * DRAM Base (section 3.4.4.1) and DRAM Limit (section 3.4.4.2) registers. The
+ * base and limit addresses are of type SysAddr, as defined at the start of
+ * section 3.4.4 (p. 70).  They are the lowest and highest physical addresses
+ * in the address range they represent.
+ */
+static void amd64_get_base_and_limit(struct amd64_pvt *pvt, int node_id,
+			       u64 *base, u64 *limit)
+{
+	*base = pvt->dram_base[node_id];
+	*limit = pvt->dram_limit[node_id];
+}
+
+/*
+ * Return 1 if the SysAddr given by sys_addr matches the base/limit associated
+ * with node_id
+ */
+static int amd64_base_limit_match(struct amd64_pvt *pvt,
+					u64 sys_addr, int node_id)
+{
+	u64 base, limit, addr;
+
+	amd64_get_base_and_limit(pvt, node_id, &base, &limit);
+
+	/* The K8 treats this as a 40-bit value.  However, bits 63-40 will be
+	 * all ones if the most significant implemented address bit is 1.
+	 * Here we discard bits 63-40.  See section 3.4.2 of AMD publication
+	 * 24592: AMD x86-64 Architecture Programmer's Manual Volume 1
+	 * Application Programming.
+	 */
+	addr = sys_addr & 0x000000ffffffffffull;
+
+	return (addr >= base) && (addr <= limit);
+}
+
+/*
+ * Attempt to map a SysAddr to a node. On success, return a pointer to the
+ * mem_ctl_info structure for the node that the SysAddr maps to.
+ *
+ * On failure, return NULL.
+ */
+static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
+						u64 sys_addr)
+{
+	struct amd64_pvt *pvt;
+	int node_id;
+	u32 intlv_en, bits;
+
+	/*
+	 * Here we use the DRAM Base (section 3.4.4.1) and DRAM Limit (section
+	 * 3.4.4.2) registers to map the SysAddr to a node ID.
+	 */
+	pvt = mci->pvt_info;
+
+	/*
+	 * The value of this field should be the same for all DRAM Base
+	 * registers.  Therefore we arbitrarily choose to read it from the
+	 * register for node 0.
+	 */
+	intlv_en = pvt->dram_IntlvEn[0];
+
+	if (intlv_en == 0) {
+		for (node_id = 0; ; ) {
+			if (amd64_base_limit_match(pvt, sys_addr, node_id))
+				break;
+
+			if (++node_id >= DRAM_REG_COUNT)
+				goto err_no_match;
+		}
+		goto found;
+	}
+
+	if (unlikely((intlv_en != (0x01 << 8)) &&
+		     (intlv_en != (0x03 << 8)) &&
+		     (intlv_en != (0x07 << 8)))) {
+		amd64_printk(KERN_WARNING, "junk value of 0x%x extracted from "
+			     "IntlvEn field of DRAM Base Register for node 0: "
+			     "This probably indicates a BIOS bug.\n", intlv_en);
+		return NULL;
+	}
+
+	bits = (((u32) sys_addr) >> 12) & intlv_en;
+
+	for (node_id = 0; ; ) {
+		if ((pvt->dram_limit[node_id] & intlv_en) == bits)
+			break;	/* intlv_sel field matches */
+
+		if (++node_id >= DRAM_REG_COUNT)
+			goto err_no_match;
+	}
+
+	/* sanity test for sys_addr */
+	if (unlikely(!amd64_base_limit_match(pvt, sys_addr, node_id))) {
+		amd64_printk(KERN_WARNING,
+			  "%s(): sys_addr 0x%lx falls outside base/limit "
+			  "address range for node %d with node interleaving "
+			  "enabled.\n", __func__, (unsigned long)sys_addr,
+			  node_id);
+		return NULL;
+	}
+
+found:
+	return edac_mc_find(node_id);
+
+err_no_match:
+	debugf2("sys_addr 0x%lx doesn't match any node\n",
+		(unsigned long)sys_addr);
+
+	return NULL;
+}
-- 
GitLab


From e2ce7255e84db656853e91536e6023f92ff89f97 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 15:57:12 +0200
Subject: [PATCH 1854/2198] amd64_edac: add functionality to compute the DRAM
 hole

Borislav:

- cleanup/fix comments, add BKDG refs
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 166 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 166 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index a774f34d508bd..4716fb561e6e0 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -269,3 +269,169 @@ static struct mem_ctl_info *find_mc_by_sys_addr(struct mem_ctl_info *mci,
 
 	return NULL;
 }
+
+/*
+ * Extract the DRAM CS base address from selected csrow register.
+ */
+static u64 base_from_dct_base(struct amd64_pvt *pvt, int csrow)
+{
+	return ((u64) (amd64_get_dct_base(pvt, 0, csrow) & pvt->dcsb_base)) <<
+				pvt->dcs_shift;
+}
+
+/*
+ * Extract the mask from the dcsb0[csrow] entry in a CPU revision-specific way.
+ */
+static u64 mask_from_dct_mask(struct amd64_pvt *pvt, int csrow)
+{
+	u64 dcsm_bits, other_bits;
+	u64 mask;
+
+	/* Extract bits from DRAM CS Mask. */
+	dcsm_bits = amd64_get_dct_mask(pvt, 0, csrow) & pvt->dcsm_mask;
+
+	other_bits = pvt->dcsm_mask;
+	other_bits = ~(other_bits << pvt->dcs_shift);
+
+	/*
+	 * The extracted bits from DCSM belong in the spaces represented by
+	 * the cleared bits in other_bits.
+	 */
+	mask = (dcsm_bits << pvt->dcs_shift) | other_bits;
+
+	return mask;
+}
+
+/*
+ * @input_addr is an InputAddr associated with the node given by mci. Return the
+ * csrow that input_addr maps to, or -1 on failure (no csrow claims input_addr).
+ */
+static int input_addr_to_csrow(struct mem_ctl_info *mci, u64 input_addr)
+{
+	struct amd64_pvt *pvt;
+	int csrow;
+	u64 base, mask;
+
+	pvt = mci->pvt_info;
+
+	/*
+	 * Here we use the DRAM CS Base and DRAM CS Mask registers. For each CS
+	 * base/mask register pair, test the condition shown near the start of
+	 * section 3.5.4 (p. 84, BKDG #26094, K8, revA-E).
+	 */
+	for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+
+		/* This DRAM chip select is disabled on this node */
+		if ((pvt->dcsb0[csrow] & K8_DCSB_CS_ENABLE) == 0)
+			continue;
+
+		base = base_from_dct_base(pvt, csrow);
+		mask = ~mask_from_dct_mask(pvt, csrow);
+
+		if ((input_addr & mask) == (base & mask)) {
+			debugf2("InputAddr 0x%lx matches csrow %d (node %d)\n",
+				(unsigned long)input_addr, csrow,
+				pvt->mc_node_id);
+
+			return csrow;
+		}
+	}
+
+	debugf2("no matching csrow for InputAddr 0x%lx (MC node %d)\n",
+		(unsigned long)input_addr, pvt->mc_node_id);
+
+	return -1;
+}
+
+/*
+ * Return the base value defined by the DRAM Base register for the node
+ * represented by mci.  This function returns the full 40-bit value despite the
+ * fact that the register only stores bits 39-24 of the value. See section
+ * 3.4.4.1 (BKDG #26094, K8, revA-E)
+ */
+static inline u64 get_dram_base(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return pvt->dram_base[pvt->mc_node_id];
+}
+
+/*
+ * Obtain info from the DRAM Hole Address Register (section 3.4.8, pub #26094)
+ * for the node represented by mci. Info is passed back in *hole_base,
+ * *hole_offset, and *hole_size.  Function returns 0 if info is valid or 1 if
+ * info is invalid. Info may be invalid for either of the following reasons:
+ *
+ * - The revision of the node is not E or greater.  In this case, the DRAM Hole
+ *   Address Register does not exist.
+ *
+ * - The DramHoleValid bit is cleared in the DRAM Hole Address Register,
+ *   indicating that its contents are not valid.
+ *
+ * The values passed back in *hole_base, *hole_offset, and *hole_size are
+ * complete 32-bit values despite the fact that the bitfields in the DHAR
+ * only represent bits 31-24 of the base and offset values.
+ */
+int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
+			     u64 *hole_offset, u64 *hole_size)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 base;
+
+	/* only revE and later have the DRAM Hole Address Register */
+	if (boot_cpu_data.x86 == 0xf && pvt->ext_model < OPTERON_CPU_REV_E) {
+		debugf1("  revision %d for node %d does not support DHAR\n",
+			pvt->ext_model, pvt->mc_node_id);
+		return 1;
+	}
+
+	/* only valid for Fam10h */
+	if (boot_cpu_data.x86 == 0x10 &&
+	    (pvt->dhar & F10_DRAM_MEM_HOIST_VALID) == 0) {
+		debugf1("  Dram Memory Hoisting is DISABLED on this system\n");
+		return 1;
+	}
+
+	if ((pvt->dhar & DHAR_VALID) == 0) {
+		debugf1("  Dram Memory Hoisting is DISABLED on this node %d\n",
+			pvt->mc_node_id);
+		return 1;
+	}
+
+	/* This node has Memory Hoisting */
+
+	/* +------------------+--------------------+--------------------+-----
+	 * | memory           | DRAM hole          | relocated          |
+	 * | [0, (x - 1)]     | [x, 0xffffffff]    | addresses from     |
+	 * |                  |                    | DRAM hole          |
+	 * |                  |                    | [0x100000000,      |
+	 * |                  |                    |  (0x100000000+     |
+	 * |                  |                    |   (0xffffffff-x))] |
+	 * +------------------+--------------------+--------------------+-----
+	 *
+	 * Above is a diagram of physical memory showing the DRAM hole and the
+	 * relocated addresses from the DRAM hole.  As shown, the DRAM hole
+	 * starts at address x (the base address) and extends through address
+	 * 0xffffffff.  The DRAM Hole Address Register (DHAR) relocates the
+	 * addresses in the hole so that they start at 0x100000000.
+	 */
+
+	base = dhar_base(pvt->dhar);
+
+	*hole_base = base;
+	*hole_size = (0x1ull << 32) - base;
+
+	if (boot_cpu_data.x86 > 0xf)
+		*hole_offset = f10_dhar_offset(pvt->dhar);
+	else
+		*hole_offset = k8_dhar_offset(pvt->dhar);
+
+	debugf1("  DHAR info for node %d base 0x%lx offset 0x%lx size 0x%lx\n",
+		pvt->mc_node_id, (unsigned long)*hole_base,
+		(unsigned long)*hole_offset, (unsigned long)*hole_size);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(amd64_get_dram_hole_info);
+
+
-- 
GitLab


From 93c2df58b5b1a434cca8f60067e0e12d1942b7f1 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 4 May 2009 20:46:50 +0200
Subject: [PATCH 1855/2198] amd64_edac: add DRAM address type conversion
 facilities

Borislav:

- cleanup/fix comments, add BKDG refs
- fix function return value patterns
- cleanup dbg calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 294 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 294 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 4716fb561e6e0..28f85c9e3afc4 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -434,4 +434,298 @@ int amd64_get_dram_hole_info(struct mem_ctl_info *mci, u64 *hole_base,
 }
 EXPORT_SYMBOL_GPL(amd64_get_dram_hole_info);
 
+/*
+ * Return the DramAddr that the SysAddr given by @sys_addr maps to.  It is
+ * assumed that sys_addr maps to the node given by mci.
+ *
+ * The first part of section 3.4.4 (p. 70) shows how the DRAM Base (section
+ * 3.4.4.1) and DRAM Limit (section 3.4.4.2) registers are used to translate a
+ * SysAddr to a DramAddr. If the DRAM Hole Address Register (DHAR) is enabled,
+ * then it is also involved in translating a SysAddr to a DramAddr. Sections
+ * 3.4.8 and 3.5.8.2 describe the DHAR and how it is used for memory hoisting.
+ * These parts of the documentation are unclear. I interpret them as follows:
+ *
+ * When node n receives a SysAddr, it processes the SysAddr as follows:
+ *
+ * 1. It extracts the DRAMBase and DRAMLimit values from the DRAM Base and DRAM
+ *    Limit registers for node n. If the SysAddr is not within the range
+ *    specified by the base and limit values, then node n ignores the Sysaddr
+ *    (since it does not map to node n). Otherwise continue to step 2 below.
+ *
+ * 2. If the DramHoleValid bit of the DHAR for node n is clear, the DHAR is
+ *    disabled so skip to step 3 below. Otherwise see if the SysAddr is within
+ *    the range of relocated addresses (starting at 0x100000000) from the DRAM
+ *    hole. If not, skip to step 3 below. Else get the value of the
+ *    DramHoleOffset field from the DHAR. To obtain the DramAddr, subtract the
+ *    offset defined by this value from the SysAddr.
+ *
+ * 3. Obtain the base address for node n from the DRAMBase field of the DRAM
+ *    Base register for node n. To obtain the DramAddr, subtract the base
+ *    address from the SysAddr, as shown near the start of section 3.4.4 (p.70).
+ */
+static u64 sys_addr_to_dram_addr(struct mem_ctl_info *mci, u64 sys_addr)
+{
+	u64 dram_base, hole_base, hole_offset, hole_size, dram_addr;
+	int ret = 0;
+
+	dram_base = get_dram_base(mci);
+
+	ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset,
+				      &hole_size);
+	if (!ret) {
+		if ((sys_addr >= (1ull << 32)) &&
+		    (sys_addr < ((1ull << 32) + hole_size))) {
+			/* use DHAR to translate SysAddr to DramAddr */
+			dram_addr = sys_addr - hole_offset;
+
+			debugf2("using DHAR to translate SysAddr 0x%lx to "
+				"DramAddr 0x%lx\n",
+				(unsigned long)sys_addr,
+				(unsigned long)dram_addr);
+
+			return dram_addr;
+		}
+	}
+
+	/*
+	 * Translate the SysAddr to a DramAddr as shown near the start of
+	 * section 3.4.4 (p. 70).  Although sys_addr is a 64-bit value, the k8
+	 * only deals with 40-bit values.  Therefore we discard bits 63-40 of
+	 * sys_addr below.  If bit 39 of sys_addr is 1 then the bits we
+	 * discard are all 1s.  Otherwise the bits we discard are all 0s.  See
+	 * section 3.4.2 of AMD publication 24592: AMD x86-64 Architecture
+	 * Programmer's Manual Volume 1 Application Programming.
+	 */
+	dram_addr = (sys_addr & 0xffffffffffull) - dram_base;
+
+	debugf2("using DRAM Base register to translate SysAddr 0x%lx to "
+		"DramAddr 0x%lx\n", (unsigned long)sys_addr,
+		(unsigned long)dram_addr);
+	return dram_addr;
+}
+
+/*
+ * @intlv_en is the value of the IntlvEn field from a DRAM Base register
+ * (section 3.4.4.1).  Return the number of bits from a SysAddr that are used
+ * for node interleaving.
+ */
+static int num_node_interleave_bits(unsigned intlv_en)
+{
+	static const int intlv_shift_table[] = { 0, 1, 0, 2, 0, 0, 0, 3 };
+	int n;
+
+	BUG_ON(intlv_en > 7);
+	n = intlv_shift_table[intlv_en];
+	return n;
+}
+
+/* Translate the DramAddr given by @dram_addr to an InputAddr. */
+static u64 dram_addr_to_input_addr(struct mem_ctl_info *mci, u64 dram_addr)
+{
+	struct amd64_pvt *pvt;
+	int intlv_shift;
+	u64 input_addr;
+
+	pvt = mci->pvt_info;
+
+	/*
+	 * See the start of section 3.4.4 (p. 70, BKDG #26094, K8, revA-E)
+	 * concerning translating a DramAddr to an InputAddr.
+	 */
+	intlv_shift = num_node_interleave_bits(pvt->dram_IntlvEn[0]);
+	input_addr = ((dram_addr >> intlv_shift) & 0xffffff000ull) +
+	    (dram_addr & 0xfff);
+
+	debugf2("  Intlv Shift=%d DramAddr=0x%lx maps to InputAddr=0x%lx\n",
+		intlv_shift, (unsigned long)dram_addr,
+		(unsigned long)input_addr);
+
+	return input_addr;
+}
+
+/*
+ * Translate the SysAddr represented by @sys_addr to an InputAddr.  It is
+ * assumed that @sys_addr maps to the node given by mci.
+ */
+static u64 sys_addr_to_input_addr(struct mem_ctl_info *mci, u64 sys_addr)
+{
+	u64 input_addr;
+
+	input_addr =
+	    dram_addr_to_input_addr(mci, sys_addr_to_dram_addr(mci, sys_addr));
+
+	debugf2("SysAdddr 0x%lx translates to InputAddr 0x%lx\n",
+		(unsigned long)sys_addr, (unsigned long)input_addr);
+
+	return input_addr;
+}
+
+
+/*
+ * @input_addr is an InputAddr associated with the node represented by mci.
+ * Translate @input_addr to a DramAddr and return the result.
+ */
+static u64 input_addr_to_dram_addr(struct mem_ctl_info *mci, u64 input_addr)
+{
+	struct amd64_pvt *pvt;
+	int node_id, intlv_shift;
+	u64 bits, dram_addr;
+	u32 intlv_sel;
+
+	/*
+	 * Near the start of section 3.4.4 (p. 70, BKDG #26094, K8, revA-E)
+	 * shows how to translate a DramAddr to an InputAddr. Here we reverse
+	 * this procedure. When translating from a DramAddr to an InputAddr, the
+	 * bits used for node interleaving are discarded.  Here we recover these
+	 * bits from the IntlvSel field of the DRAM Limit register (section
+	 * 3.4.4.2) for the node that input_addr is associated with.
+	 */
+	pvt = mci->pvt_info;
+	node_id = pvt->mc_node_id;
+	BUG_ON((node_id < 0) || (node_id > 7));
+
+	intlv_shift = num_node_interleave_bits(pvt->dram_IntlvEn[0]);
+
+	if (intlv_shift == 0) {
+		debugf1("    InputAddr 0x%lx translates to DramAddr of "
+			"same value\n",	(unsigned long)input_addr);
+
+		return input_addr;
+	}
+
+	bits = ((input_addr & 0xffffff000ull) << intlv_shift) +
+	    (input_addr & 0xfff);
+
+	intlv_sel = pvt->dram_IntlvSel[node_id] & ((1 << intlv_shift) - 1);
+	dram_addr = bits + (intlv_sel << 12);
+
+	debugf1("InputAddr 0x%lx translates to DramAddr 0x%lx "
+		"(%d node interleave bits)\n", (unsigned long)input_addr,
+		(unsigned long)dram_addr, intlv_shift);
+
+	return dram_addr;
+}
+
+/*
+ * @dram_addr is a DramAddr that maps to the node represented by mci. Convert
+ * @dram_addr to a SysAddr.
+ */
+static u64 dram_addr_to_sys_addr(struct mem_ctl_info *mci, u64 dram_addr)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 hole_base, hole_offset, hole_size, base, limit, sys_addr;
+	int ret = 0;
+
+	ret = amd64_get_dram_hole_info(mci, &hole_base, &hole_offset,
+				      &hole_size);
+	if (!ret) {
+		if ((dram_addr >= hole_base) &&
+		    (dram_addr < (hole_base + hole_size))) {
+			sys_addr = dram_addr + hole_offset;
+
+			debugf1("using DHAR to translate DramAddr 0x%lx to "
+				"SysAddr 0x%lx\n", (unsigned long)dram_addr,
+				(unsigned long)sys_addr);
+
+			return sys_addr;
+		}
+	}
+
+	amd64_get_base_and_limit(pvt, pvt->mc_node_id, &base, &limit);
+	sys_addr = dram_addr + base;
+
+	/*
+	 * The sys_addr we have computed up to this point is a 40-bit value
+	 * because the k8 deals with 40-bit values.  However, the value we are
+	 * supposed to return is a full 64-bit physical address.  The AMD
+	 * x86-64 architecture specifies that the most significant implemented
+	 * address bit through bit 63 of a physical address must be either all
+	 * 0s or all 1s.  Therefore we sign-extend the 40-bit sys_addr to a
+	 * 64-bit value below.  See section 3.4.2 of AMD publication 24592:
+	 * AMD x86-64 Architecture Programmer's Manual Volume 1 Application
+	 * Programming.
+	 */
+	sys_addr |= ~((sys_addr & (1ull << 39)) - 1);
+
+	debugf1("    Node %d, DramAddr 0x%lx to SysAddr 0x%lx\n",
+		pvt->mc_node_id, (unsigned long)dram_addr,
+		(unsigned long)sys_addr);
+
+	return sys_addr;
+}
+
+/*
+ * @input_addr is an InputAddr associated with the node given by mci. Translate
+ * @input_addr to a SysAddr.
+ */
+static inline u64 input_addr_to_sys_addr(struct mem_ctl_info *mci,
+					 u64 input_addr)
+{
+	return dram_addr_to_sys_addr(mci,
+				     input_addr_to_dram_addr(mci, input_addr));
+}
+
+/*
+ * Find the minimum and maximum InputAddr values that map to the given @csrow.
+ * Pass back these values in *input_addr_min and *input_addr_max.
+ */
+static void find_csrow_limits(struct mem_ctl_info *mci, int csrow,
+			      u64 *input_addr_min, u64 *input_addr_max)
+{
+	struct amd64_pvt *pvt;
+	u64 base, mask;
+
+	pvt = mci->pvt_info;
+	BUG_ON((csrow < 0) || (csrow >= CHIPSELECT_COUNT));
+
+	base = base_from_dct_base(pvt, csrow);
+	mask = mask_from_dct_mask(pvt, csrow);
+
+	*input_addr_min = base & ~mask;
+	*input_addr_max = base | mask | pvt->dcs_mask_notused;
+}
+
+/*
+ * Extract error address from MCA NB Address Low (section 3.6.4.5) and MCA NB
+ * Address High (section 3.6.4.6) register values and return the result. Address
+ * is located in the info structure (nbeah and nbeal), the encoding is device
+ * specific.
+ */
+static u64 extract_error_address(struct mem_ctl_info *mci,
+				 struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	return pvt->ops->get_error_address(mci, info);
+}
+
+
+/* Map the Error address to a PAGE and PAGE OFFSET. */
+static inline void error_address_to_page_and_offset(u64 error_address,
+						    u32 *page, u32 *offset)
+{
+	*page = (u32) (error_address >> PAGE_SHIFT);
+	*offset = ((u32) error_address) & ~PAGE_MASK;
+}
+
+/*
+ * @sys_addr is an error address (a SysAddr) extracted from the MCA NB Address
+ * Low (section 3.6.4.5) and MCA NB Address High (section 3.6.4.6) registers
+ * of a node that detected an ECC memory error.  mci represents the node that
+ * the error address maps to (possibly different from the node that detected
+ * the error).  Return the number of the csrow that sys_addr maps to, or -1 on
+ * error.
+ */
+static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
+{
+	int csrow;
+
+	csrow = input_addr_to_csrow(mci, sys_addr_to_input_addr(mci, sys_addr));
+
+	if (csrow == -1)
+		amd64_mc_printk(mci, KERN_ERR,
+			     "Failed to translate InputAddr to csrow for "
+			     "address 0x%lx\n", (unsigned long)sys_addr);
+	return csrow;
+}
 
-- 
GitLab


From 2da11654ea4d32c5700693675dfbd55c70619519 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:09:09 +0200
Subject: [PATCH 1856/2198] amd64_edac: add helper to dump relevant registers

Borislav:

- cleanup/fix comments
- fix function return value patterns
- cleanup dbg calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 142 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 28f85c9e3afc4..3d6d0007ed5d2 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -729,3 +729,145 @@ static int sys_addr_to_csrow(struct mem_ctl_info *mci, u64 sys_addr)
 	return csrow;
 }
 
+static int get_channel_from_ecc_syndrome(unsigned short syndrome);
+
+static void amd64_cpu_display_info(struct amd64_pvt *pvt)
+{
+	if (boot_cpu_data.x86 == 0x11)
+		edac_printk(KERN_DEBUG, EDAC_MC, "F11h CPU detected\n");
+	else if (boot_cpu_data.x86 == 0x10)
+		edac_printk(KERN_DEBUG, EDAC_MC, "F10h CPU detected\n");
+	else if (boot_cpu_data.x86 == 0xf)
+		edac_printk(KERN_DEBUG, EDAC_MC, "%s detected\n",
+			(pvt->ext_model >= OPTERON_CPU_REV_F) ?
+			"Rev F or later" : "Rev E or earlier");
+	else
+		/* we'll hardly ever ever get here */
+		edac_printk(KERN_ERR, EDAC_MC, "Unknown cpu!\n");
+}
+
+/*
+ * Determine if the DIMMs have ECC enabled. ECC is enabled ONLY if all the DIMMs
+ * are ECC capable.
+ */
+static enum edac_type amd64_determine_edac_cap(struct amd64_pvt *pvt)
+{
+	int bit;
+	enum dev_type edac_cap = EDAC_NONE;
+
+	bit = (boot_cpu_data.x86 > 0xf || pvt->ext_model >= OPTERON_CPU_REV_F)
+		? 19
+		: 17;
+
+	if (pvt->dclr0 >> BIT(bit))
+		edac_cap = EDAC_FLAG_SECDED;
+
+	return edac_cap;
+}
+
+
+static void f10_debug_display_dimm_sizes(int ctrl, struct amd64_pvt *pvt,
+					 int ganged);
+
+/* Display and decode various NB registers for debug purposes. */
+static void amd64_dump_misc_regs(struct amd64_pvt *pvt)
+{
+	int ganged;
+
+	debugf1("  nbcap:0x%8.08x DctDualCap=%s DualNode=%s 8-Node=%s\n",
+		pvt->nbcap,
+		(pvt->nbcap & K8_NBCAP_DCT_DUAL) ? "True" : "False",
+		(pvt->nbcap & K8_NBCAP_DUAL_NODE) ? "True" : "False",
+		(pvt->nbcap & K8_NBCAP_8_NODE) ? "True" : "False");
+	debugf1("    ECC Capable=%s   ChipKill Capable=%s\n",
+		(pvt->nbcap & K8_NBCAP_SECDED) ? "True" : "False",
+		(pvt->nbcap & K8_NBCAP_CHIPKILL) ? "True" : "False");
+	debugf1("  DramCfg0-low=0x%08x DIMM-ECC=%s Parity=%s Width=%s\n",
+		pvt->dclr0,
+		(pvt->dclr0 & BIT(19)) ?  "Enabled" : "Disabled",
+		(pvt->dclr0 & BIT(8)) ?  "Enabled" : "Disabled",
+		(pvt->dclr0 & BIT(11)) ?  "128b" : "64b");
+	debugf1("    DIMM x4 Present: L0=%s L1=%s L2=%s L3=%s  DIMM Type=%s\n",
+		(pvt->dclr0 & BIT(12)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(13)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(14)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(15)) ?  "Y" : "N",
+		(pvt->dclr0 & BIT(16)) ?  "UN-Buffered" : "Buffered");
+
+
+	debugf1("  online-spare: 0x%8.08x\n", pvt->online_spare);
+
+	if (boot_cpu_data.x86 == 0xf) {
+		debugf1("  dhar: 0x%8.08x Base=0x%08x Offset=0x%08x\n",
+			pvt->dhar, dhar_base(pvt->dhar),
+			k8_dhar_offset(pvt->dhar));
+		debugf1("      DramHoleValid=%s\n",
+			(pvt->dhar & DHAR_VALID) ?  "True" : "False");
+
+		debugf1("  dbam-dkt: 0x%8.08x\n", pvt->dbam0);
+
+		/* everything below this point is Fam10h and above */
+		return;
+
+	} else {
+		debugf1("  dhar: 0x%8.08x Base=0x%08x Offset=0x%08x\n",
+			pvt->dhar, dhar_base(pvt->dhar),
+			f10_dhar_offset(pvt->dhar));
+		debugf1("    DramMemHoistValid=%s DramHoleValid=%s\n",
+			(pvt->dhar & F10_DRAM_MEM_HOIST_VALID) ?
+			"True" : "False",
+			(pvt->dhar & DHAR_VALID) ?
+			"True" : "False");
+	}
+
+	/* Only if NOT ganged does dcl1 have valid info */
+	if (!dct_ganging_enabled(pvt)) {
+		debugf1("  DramCfg1-low=0x%08x DIMM-ECC=%s Parity=%s "
+			"Width=%s\n", pvt->dclr1,
+			(pvt->dclr1 & BIT(19)) ?  "Enabled" : "Disabled",
+			(pvt->dclr1 & BIT(8)) ?  "Enabled" : "Disabled",
+			(pvt->dclr1 & BIT(11)) ?  "128b" : "64b");
+		debugf1("    DIMM x4 Present: L0=%s L1=%s L2=%s L3=%s  "
+			"DIMM Type=%s\n",
+			(pvt->dclr1 & BIT(12)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(13)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(14)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(15)) ?  "Y" : "N",
+			(pvt->dclr1 & BIT(16)) ?  "UN-Buffered" : "Buffered");
+	}
+
+	/*
+	 * Determine if ganged and then dump memory sizes for first controller,
+	 * and if NOT ganged dump info for 2nd controller.
+	 */
+	ganged = dct_ganging_enabled(pvt);
+
+	f10_debug_display_dimm_sizes(0, pvt, ganged);
+
+	if (!ganged)
+		f10_debug_display_dimm_sizes(1, pvt, ganged);
+}
+
+/* Read in both of DBAM registers */
+static void amd64_read_dbam_reg(struct amd64_pvt *pvt)
+{
+	int err = 0;
+	unsigned int reg;
+
+	reg = DBAM0;
+	err = pci_read_config_dword(pvt->dram_f2_ctl, reg, &pvt->dbam0);
+	if (err)
+		goto err_reg;
+
+	if (boot_cpu_data.x86 >= 0x10) {
+		reg = DBAM1;
+		err = pci_read_config_dword(pvt->dram_f2_ctl, reg, &pvt->dbam1);
+
+		if (err)
+			goto err_reg;
+	}
+
+err_reg:
+	debugf0("Error reading F2x%03x.\n", reg);
+}
+
-- 
GitLab


From 94be4bff21990674ac418c970c708a1a01cf709f Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:12:00 +0200
Subject: [PATCH 1857/2198] amd64_edac: assign DRAM chip select base and mask
 in a family-specific way

Borislav:

- cleanup/fix comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 142 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 142 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 3d6d0007ed5d2..5ec44a440d65b 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -871,3 +871,145 @@ static void amd64_read_dbam_reg(struct amd64_pvt *pvt)
 	debugf0("Error reading F2x%03x.\n", reg);
 }
 
+/*
+ * NOTE: CPU Revision Dependent code: Rev E and Rev F
+ *
+ * Set the DCSB and DCSM mask values depending on the CPU revision value. Also
+ * set the shift factor for the DCSB and DCSM values.
+ *
+ * ->dcs_mask_notused, RevE:
+ *
+ * To find the max InputAddr for the csrow, start with the base address and set
+ * all bits that are "don't care" bits in the test at the start of section
+ * 3.5.4 (p. 84).
+ *
+ * The "don't care" bits are all set bits in the mask and all bits in the gaps
+ * between bit ranges [35:25] and [19:13]. The value REV_E_DCS_NOTUSED_BITS
+ * represents bits [24:20] and [12:0], which are all bits in the above-mentioned
+ * gaps.
+ *
+ * ->dcs_mask_notused, RevF and later:
+ *
+ * To find the max InputAddr for the csrow, start with the base address and set
+ * all bits that are "don't care" bits in the test at the start of NPT section
+ * 4.5.4 (p. 87).
+ *
+ * The "don't care" bits are all set bits in the mask and all bits in the gaps
+ * between bit ranges [36:27] and [21:13].
+ *
+ * The value REV_F_F1Xh_DCS_NOTUSED_BITS represents bits [26:22] and [12:0],
+ * which are all bits in the above-mentioned gaps.
+ */
+static void amd64_set_dct_base_and_mask(struct amd64_pvt *pvt)
+{
+	if (pvt->ext_model >= OPTERON_CPU_REV_F) {
+		pvt->dcsb_base		= REV_F_F1Xh_DCSB_BASE_BITS;
+		pvt->dcsm_mask		= REV_F_F1Xh_DCSM_MASK_BITS;
+		pvt->dcs_mask_notused	= REV_F_F1Xh_DCS_NOTUSED_BITS;
+		pvt->dcs_shift		= REV_F_F1Xh_DCS_SHIFT;
+
+		switch (boot_cpu_data.x86) {
+		case 0xf:
+			pvt->num_dcsm = REV_F_DCSM_COUNT;
+			break;
+
+		case 0x10:
+			pvt->num_dcsm = F10_DCSM_COUNT;
+			break;
+
+		case 0x11:
+			pvt->num_dcsm = F11_DCSM_COUNT;
+			break;
+
+		default:
+			amd64_printk(KERN_ERR, "Unsupported family!\n");
+			break;
+		}
+	} else {
+		pvt->dcsb_base		= REV_E_DCSB_BASE_BITS;
+		pvt->dcsm_mask		= REV_E_DCSM_MASK_BITS;
+		pvt->dcs_mask_notused	= REV_E_DCS_NOTUSED_BITS;
+		pvt->dcs_shift		= REV_E_DCS_SHIFT;
+		pvt->num_dcsm		= REV_E_DCSM_COUNT;
+	}
+}
+
+/*
+ * Function 2 Offset F10_DCSB0; read in the DCS Base and DCS Mask hw registers
+ */
+static void amd64_read_dct_base_mask(struct amd64_pvt *pvt)
+{
+	int cs, reg, err = 0;
+
+	amd64_set_dct_base_and_mask(pvt);
+
+	for (cs = 0; cs < CHIPSELECT_COUNT; cs++) {
+		reg = K8_DCSB0 + (cs * 4);
+		err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+						&pvt->dcsb0[cs]);
+		if (unlikely(err))
+			debugf0("Reading K8_DCSB0[%d] failed\n", cs);
+		else
+			debugf0("  DCSB0[%d]=0x%08x reg: F2x%x\n",
+				cs, pvt->dcsb0[cs], reg);
+
+		/* If DCT are NOT ganged, then read in DCT1's base */
+		if (boot_cpu_data.x86 >= 0x10 && !dct_ganging_enabled(pvt)) {
+			reg = F10_DCSB1 + (cs * 4);
+			err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+							&pvt->dcsb1[cs]);
+			if (unlikely(err))
+				debugf0("Reading F10_DCSB1[%d] failed\n", cs);
+			else
+				debugf0("  DCSB1[%d]=0x%08x reg: F2x%x\n",
+					cs, pvt->dcsb1[cs], reg);
+		} else {
+			pvt->dcsb1[cs] = 0;
+		}
+	}
+
+	for (cs = 0; cs < pvt->num_dcsm; cs++) {
+		reg = K8_DCSB0 + (cs * 4);
+		err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+					&pvt->dcsm0[cs]);
+		if (unlikely(err))
+			debugf0("Reading K8_DCSM0 failed\n");
+		else
+			debugf0("    DCSM0[%d]=0x%08x reg: F2x%x\n",
+				cs, pvt->dcsm0[cs], reg);
+
+		/* If DCT are NOT ganged, then read in DCT1's mask */
+		if (boot_cpu_data.x86 >= 0x10 && !dct_ganging_enabled(pvt)) {
+			reg = F10_DCSM1 + (cs * 4);
+			err = pci_read_config_dword(pvt->dram_f2_ctl, reg,
+					&pvt->dcsm1[cs]);
+			if (unlikely(err))
+				debugf0("Reading F10_DCSM1[%d] failed\n", cs);
+			else
+				debugf0("    DCSM1[%d]=0x%08x reg: F2x%x\n",
+					cs, pvt->dcsm1[cs], reg);
+		} else
+			pvt->dcsm1[cs] = 0;
+	}
+}
+
+static enum mem_type amd64_determine_memory_type(struct amd64_pvt *pvt)
+{
+	enum mem_type type;
+
+	if (boot_cpu_data.x86 >= 0x10 || pvt->ext_model >= OPTERON_CPU_REV_F) {
+		/* Rev F and later */
+		type = (pvt->dclr0 & BIT(16)) ? MEM_DDR2 : MEM_RDDR2;
+	} else {
+		/* Rev E and earlier */
+		type = (pvt->dclr0 & BIT(18)) ? MEM_DDR : MEM_RDDR;
+	}
+
+	debugf1("  Memory type is: %s\n",
+		(type == MEM_DDR2) ? "MEM_DDR2" :
+		(type == MEM_RDDR2) ? "MEM_RDDR2" :
+		(type == MEM_DDR) ? "MEM_DDR" : "MEM_RDDR");
+
+	return type;
+}
+
-- 
GitLab


From ddff876d2022c5e06509f6535bc4fd61c4d6ffd6 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:14:52 +0200
Subject: [PATCH 1858/2198] amd64_edac: add k8-specific methods

Borislav:

- fix/cleanup/move comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 173 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 173 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5ec44a440d65b..24c031f797567 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1013,3 +1013,176 @@ static enum mem_type amd64_determine_memory_type(struct amd64_pvt *pvt)
 	return type;
 }
 
+/*
+ * Read the DRAM Configuration Low register. It differs between CG, D & E revs
+ * and the later RevF memory controllers (DDR vs DDR2)
+ *
+ * Return:
+ *      number of memory channels in operation
+ * Pass back:
+ *      contents of the DCL0_LOW register
+ */
+static int k8_early_channel_count(struct amd64_pvt *pvt)
+{
+	int flag, err = 0;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
+	if (err)
+		return err;
+
+	if ((boot_cpu_data.x86_model >> 4) >= OPTERON_CPU_REV_F) {
+		/* RevF (NPT) and later */
+		flag = pvt->dclr0 & F10_WIDTH_128;
+	} else {
+		/* RevE and earlier */
+		flag = pvt->dclr0 & REVE_WIDTH_128;
+	}
+
+	/* not used */
+	pvt->dclr1 = 0;
+
+	return (flag) ? 2 : 1;
+}
+
+/* extract the ERROR ADDRESS for the K8 CPUs */
+static u64 k8_get_error_address(struct mem_ctl_info *mci,
+				struct amd64_error_info_regs *info)
+{
+	return (((u64) (info->nbeah & 0xff)) << 32) +
+			(info->nbeal & ~0x03);
+}
+
+/*
+ * Read the Base and Limit registers for K8 based Memory controllers; extract
+ * fields from the 'raw' reg into separate data fields
+ *
+ * Isolates: BASE, LIMIT, IntlvEn, IntlvSel, RW_EN
+ */
+static void k8_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
+{
+	u32 low;
+	u32 off = dram << 3;	/* 8 bytes between DRAM entries */
+	int err;
+
+	err = pci_read_config_dword(pvt->addr_f1_ctl,
+				    K8_DRAM_BASE_LOW + off, &low);
+	if (err)
+		debugf0("Reading K8_DRAM_BASE_LOW failed\n");
+
+	/* Extract parts into separate data entries */
+	pvt->dram_base[dram] = ((u64) low & 0xFFFF0000) << 8;
+	pvt->dram_IntlvEn[dram] = (low >> 8) & 0x7;
+	pvt->dram_rw_en[dram] = (low & 0x3);
+
+	err = pci_read_config_dword(pvt->addr_f1_ctl,
+				    K8_DRAM_LIMIT_LOW + off, &low);
+	if (err)
+		debugf0("Reading K8_DRAM_LIMIT_LOW failed\n");
+
+	/*
+	 * Extract parts into separate data entries. Limit is the HIGHEST memory
+	 * location of the region, so lower 24 bits need to be all ones
+	 */
+	pvt->dram_limit[dram] = (((u64) low & 0xFFFF0000) << 8) | 0x00FFFFFF;
+	pvt->dram_IntlvSel[dram] = (low >> 8) & 0x7;
+	pvt->dram_DstNode[dram] = (low & 0x7);
+}
+
+static void k8_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
+					struct amd64_error_info_regs *info,
+					u64 SystemAddress)
+{
+	struct mem_ctl_info *src_mci;
+	unsigned short syndrome;
+	int channel, csrow;
+	u32 page, offset;
+
+	/* Extract the syndrome parts and form a 16-bit syndrome */
+	syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
+	syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+
+	/* CHIPKILL enabled */
+	if (info->nbcfg & K8_NBCFG_CHIPKILL) {
+		channel = get_channel_from_ecc_syndrome(syndrome);
+		if (channel < 0) {
+			/*
+			 * Syndrome didn't map, so we don't know which of the
+			 * 2 DIMMs is in error. So we need to ID 'both' of them
+			 * as suspect.
+			 */
+			amd64_mc_printk(mci, KERN_WARNING,
+				       "unknown syndrome 0x%x - possible error "
+				       "reporting race\n", syndrome);
+			edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+			return;
+		}
+	} else {
+		/*
+		 * non-chipkill ecc mode
+		 *
+		 * The k8 documentation is unclear about how to determine the
+		 * channel number when using non-chipkill memory.  This method
+		 * was obtained from email communication with someone at AMD.
+		 * (Wish the email was placed in this comment - norsk)
+		 */
+		channel = ((SystemAddress & BIT(3)) != 0);
+	}
+
+	/*
+	 * Find out which node the error address belongs to. This may be
+	 * different from the node that detected the error.
+	 */
+	src_mci = find_mc_by_sys_addr(mci, SystemAddress);
+	if (src_mci) {
+		amd64_mc_printk(mci, KERN_ERR,
+			     "failed to map error address 0x%lx to a node\n",
+			     (unsigned long)SystemAddress);
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		return;
+	}
+
+	/* Now map the SystemAddress to a CSROW */
+	csrow = sys_addr_to_csrow(src_mci, SystemAddress);
+	if (csrow < 0) {
+		edac_mc_handle_ce_no_info(src_mci, EDAC_MOD_STR);
+	} else {
+		error_address_to_page_and_offset(SystemAddress, &page, &offset);
+
+		edac_mc_handle_ce(src_mci, page, offset, syndrome, csrow,
+				  channel, EDAC_MOD_STR);
+	}
+}
+
+/*
+ * determrine the number of PAGES in for this DIMM's size based on its DRAM
+ * Address Mapping.
+ *
+ * First step is to calc the number of bits to shift a value of 1 left to
+ * indicate show many pages. Start with the DBAM value as the starting bits,
+ * then proceed to adjust those shift bits, based on CPU rev and the table.
+ * See BKDG on the DBAM
+ */
+static int k8_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map)
+{
+	int nr_pages;
+
+	if (pvt->ext_model >= OPTERON_CPU_REV_F) {
+		nr_pages = 1 << (revf_quad_ddr2_shift[dram_map] - PAGE_SHIFT);
+	} else {
+		/*
+		 * RevE and less section; this line is tricky. It collapses the
+		 * table used by RevD and later to one that matches revisions CG
+		 * and earlier.
+		 */
+		dram_map -= (pvt->ext_model >= OPTERON_CPU_REV_D) ?
+				(dram_map > 8 ? 4 : (dram_map > 5 ?
+				3 : (dram_map > 2 ? 1 : 0))) : 0;
+
+		/* 25 shift is 32MiB minimum DIMM size in RevE and prior */
+		nr_pages = 1 << (dram_map + 25 - PAGE_SHIFT);
+	}
+
+	return nr_pages;
+}
+
+
-- 
GitLab


From 1afd3c98b5e8df68e1840b56c0ced15f314ce30d Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:16:50 +0200
Subject: [PATCH 1859/2198] amd64_edac: add F10h-and-later methods-p1

Borislav:

Fail f10_early_channel_count() if error encountered while reading a NB
register since those cached register contents are accessed afterwards.

- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 181 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 181 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 24c031f797567..b4ef2c70500e5 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1185,4 +1185,185 @@ static int k8_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map)
 	return nr_pages;
 }
 
+/*
+ * Get the number of DCT channels in use.
+ *
+ * Return:
+ *	number of Memory Channels in operation
+ * Pass back:
+ *	contents of the DCL0_LOW register
+ */
+static int f10_early_channel_count(struct amd64_pvt *pvt)
+{
+	int err = 0, channels = 0;
+	u32 dbam;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_1, &pvt->dclr1);
+	if (err)
+		goto err_reg;
+
+	/* If we are in 128 bit mode, then we are using 2 channels */
+	if (pvt->dclr0 & F10_WIDTH_128) {
+		debugf0("Data WIDTH is 128 bits - 2 channels\n");
+		channels = 2;
+		return channels;
+	}
+
+	/*
+	 * Need to check if in UN-ganged mode: In such, there are 2 channels,
+	 * but they are NOT in 128 bit mode and thus the above 'dcl0' status bit
+	 * will be OFF.
+	 *
+	 * Need to check DCT0[0] and DCT1[0] to see if only one of them has
+	 * their CSEnable bit on. If so, then SINGLE DIMM case.
+	 */
+	debugf0("Data WIDTH is NOT 128 bits - need more decoding\n");
 
+	/*
+	 * Check DRAM Bank Address Mapping values for each DIMM to see if there
+	 * is more than just one DIMM present in unganged mode. Need to check
+	 * both controllers since DIMMs can be placed in either one.
+	 */
+	channels = 0;
+	err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM0, &dbam);
+	if (err)
+		goto err_reg;
+
+	if (DBAM_DIMM(0, dbam) > 0)
+		channels++;
+	if (DBAM_DIMM(1, dbam) > 0)
+		channels++;
+	if (DBAM_DIMM(2, dbam) > 0)
+		channels++;
+	if (DBAM_DIMM(3, dbam) > 0)
+		channels++;
+
+	/* If more than 2 DIMMs are present, then we have 2 channels */
+	if (channels > 2)
+		channels = 2;
+	else if (channels == 0) {
+		/* No DIMMs on DCT0, so look at DCT1 */
+		err = pci_read_config_dword(pvt->dram_f2_ctl, DBAM1, &dbam);
+		if (err)
+			goto err_reg;
+
+		if (DBAM_DIMM(0, dbam) > 0)
+			channels++;
+		if (DBAM_DIMM(1, dbam) > 0)
+			channels++;
+		if (DBAM_DIMM(2, dbam) > 0)
+			channels++;
+		if (DBAM_DIMM(3, dbam) > 0)
+			channels++;
+
+		if (channels > 2)
+			channels = 2;
+	}
+
+	/* If we found ALL 0 values, then assume just ONE DIMM-ONE Channel */
+	if (channels == 0)
+		channels = 1;
+
+	debugf0("DIMM count= %d\n", channels);
+
+	return channels;
+
+err_reg:
+	return -1;
+
+}
+
+static int f10_dbam_map_to_pages(struct amd64_pvt *pvt, int dram_map)
+{
+	return 1 << (revf_quad_ddr2_shift[dram_map] - PAGE_SHIFT);
+}
+
+/* Enable extended configuration access via 0xCF8 feature */
+static void amd64_setup(struct amd64_pvt *pvt)
+{
+	u32 reg;
+
+	pci_read_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, &reg);
+
+	pvt->flags.cf8_extcfg = !!(reg & F10_NB_CFG_LOW_ENABLE_EXT_CFG);
+	reg |= F10_NB_CFG_LOW_ENABLE_EXT_CFG;
+	pci_write_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, reg);
+}
+
+/* Restore the extended configuration access via 0xCF8 feature */
+static void amd64_teardown(struct amd64_pvt *pvt)
+{
+	u32 reg;
+
+	pci_read_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, &reg);
+
+	reg &= ~F10_NB_CFG_LOW_ENABLE_EXT_CFG;
+	if (pvt->flags.cf8_extcfg)
+		reg |= F10_NB_CFG_LOW_ENABLE_EXT_CFG;
+	pci_write_config_dword(pvt->misc_f3_ctl, F10_NB_CFG_HIGH, reg);
+}
+
+static u64 f10_get_error_address(struct mem_ctl_info *mci,
+			struct amd64_error_info_regs *info)
+{
+	return (((u64) (info->nbeah & 0xffff)) << 32) +
+			(info->nbeal & ~0x01);
+}
+
+/*
+ * Read the Base and Limit registers for F10 based Memory controllers. Extract
+ * fields from the 'raw' reg into separate data fields.
+ *
+ * Isolates: BASE, LIMIT, IntlvEn, IntlvSel, RW_EN.
+ */
+static void f10_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
+{
+	u32 high_offset, low_offset, high_base, low_base, high_limit, low_limit;
+
+	low_offset = K8_DRAM_BASE_LOW + (dram << 3);
+	high_offset = F10_DRAM_BASE_HIGH + (dram << 3);
+
+	/* read the 'raw' DRAM BASE Address register */
+	pci_read_config_dword(pvt->addr_f1_ctl, low_offset, &low_base);
+
+	/* Read from the ECS data register */
+	pci_read_config_dword(pvt->addr_f1_ctl, high_offset, &high_base);
+
+	/* Extract parts into separate data entries */
+	pvt->dram_rw_en[dram] = (low_base & 0x3);
+
+	if (pvt->dram_rw_en[dram] == 0)
+		return;
+
+	pvt->dram_IntlvEn[dram] = (low_base >> 8) & 0x7;
+
+	pvt->dram_base[dram] = (((((u64) high_base & 0x000000FF) << 32) |
+				((u64) low_base & 0xFFFF0000))) << 8;
+
+	low_offset = K8_DRAM_LIMIT_LOW + (dram << 3);
+	high_offset = F10_DRAM_LIMIT_HIGH + (dram << 3);
+
+	/* read the 'raw' LIMIT registers */
+	pci_read_config_dword(pvt->addr_f1_ctl, low_offset, &low_limit);
+
+	/* Read from the ECS data register for the HIGH portion */
+	pci_read_config_dword(pvt->addr_f1_ctl, high_offset, &high_limit);
+
+	debugf0("  HW Regs: BASE=0x%08x-%08x      LIMIT=  0x%08x-%08x\n",
+		high_base, low_base, high_limit, low_limit);
+
+	pvt->dram_DstNode[dram] = (low_limit & 0x7);
+	pvt->dram_IntlvSel[dram] = (low_limit >> 8) & 0x7;
+
+	/*
+	 * Extract address values and form a LIMIT address. Limit is the HIGHEST
+	 * memory location of the region, so low 24 bits need to be all ones.
+	 */
+	low_limit |= 0x0000FFFF;
+	pvt->dram_limit[dram] =
+		((((u64) high_limit << 32) + (u64) low_limit) << 8) | (0xFF);
+}
-- 
GitLab


From 6163b5d4fb45d20e3eb92d627943f26572726889 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:20:17 +0200
Subject: [PATCH 1860/2198] amd64_edac: add F10h-and-later methods-p2

Borislav:

- fix a wrong negation in f10_determine_base_addr_offset()
- fix a wrong mask in f10_determine_base_addr_offset() which should
select DctSelBaseAddr[31:11] and not [31:16] as it was before
- remove StinkyIdentifiers, trivially simplify code.
- fix/cleanup comments
- fix function return value patterns

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 196 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 196 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index b4ef2c70500e5..744a49ac9f5cf 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1367,3 +1367,199 @@ static void f10_read_dram_base_limit(struct amd64_pvt *pvt, int dram)
 	pvt->dram_limit[dram] =
 		((((u64) high_limit << 32) + (u64) low_limit) << 8) | (0xFF);
 }
+
+static void f10_read_dram_ctl_register(struct amd64_pvt *pvt)
+{
+	int err = 0;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCTL_SEL_LOW,
+				    &pvt->dram_ctl_select_low);
+	if (err) {
+		debugf0("Reading F10_DCTL_SEL_LOW failed\n");
+	} else {
+		debugf0("DRAM_DCTL_SEL_LOW=0x%x  DctSelBaseAddr=0x%x\n",
+			pvt->dram_ctl_select_low, dct_sel_baseaddr(pvt));
+
+		debugf0("  DRAM DCTs are=%s DRAM Is=%s DRAM-Ctl-"
+				"sel-hi-range=%s\n",
+			(dct_ganging_enabled(pvt) ? "GANGED" : "NOT GANGED"),
+			(dct_dram_enabled(pvt) ? "Enabled"   : "Disabled"),
+			(dct_high_range_enabled(pvt) ? "Enabled" : "Disabled"));
+
+		debugf0("  DctDatIntLv=%s MemCleared=%s DctSelIntLvAddr=0x%x\n",
+			(dct_data_intlv_enabled(pvt) ? "Enabled" : "Disabled"),
+			(dct_memory_cleared(pvt) ? "True " : "False "),
+			dct_sel_interleave_addr(pvt));
+	}
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCTL_SEL_HIGH,
+				    &pvt->dram_ctl_select_high);
+	if (err)
+		debugf0("Reading F10_DCTL_SEL_HIGH failed\n");
+}
+
+static u32 f10_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
+				int hi_range_sel, u32 intlv_en)
+{
+	u32 cs, temp, dct_sel_high = (pvt->dram_ctl_select_low >> 1) & 1;
+
+	if (dct_ganging_enabled(pvt))
+		cs = 0;
+	else if (hi_range_sel)
+		cs = dct_sel_high;
+	else if (dct_interleave_enabled(pvt)) {
+		if (dct_sel_interleave_addr(pvt) == 0)
+			cs = sys_addr >> 6 & 1;
+		else if ((dct_sel_interleave_addr(pvt) >> 1) & 1) {
+			temp = hweight_long((u32) ((sys_addr >> 16) & 0x1F)) % 2;
+
+			if (dct_sel_interleave_addr(pvt) & 1)
+				cs = (sys_addr >> 9 & 1) ^ temp;
+			else
+				cs = (sys_addr >> 6 & 1) ^ temp;
+		} else if (intlv_en & 4)
+			cs = sys_addr >> 15 & 1;
+		else if (intlv_en & 2)
+			cs = sys_addr >> 14 & 1;
+		else if (intlv_en & 1)
+			cs = sys_addr >> 13 & 1;
+		else
+			cs = sys_addr >> 12 & 1;
+	} else if (dct_high_range_enabled(pvt) && !dct_ganging_enabled(pvt))
+		cs = ~dct_sel_high & 1;
+	else
+		cs = 0;
+
+	return cs;
+}
+
+static inline u32 f10_map_intlv_en_to_shift(u32 intlv_en)
+{
+	if (intlv_en == 1)
+		return 1;
+	else if (intlv_en == 3)
+		return 2;
+	else if (intlv_en == 7)
+		return 3;
+
+	return 0;
+}
+
+static inline u64 f10_determine_base_addr_offset(u64 sys_addr, int hi_range_sel,
+						 u32 dct_sel_base_addr,
+						 u64 dct_sel_base_off,
+						 u32 hole_en, u32 hole_off,
+						 u64 dram_base)
+{
+	u64 chan_off;
+
+	if (hi_range_sel) {
+		if (!(dct_sel_base_addr & 0xFFFFF800) &&
+		   (hole_en & 1) && (sys_addr >= 0x100000000ULL))
+			chan_off = hole_off << 16;
+		else
+			chan_off = dct_sel_base_off;
+	} else {
+		if ((hole_en & 1) && (sys_addr >= 0x100000000ULL))
+			chan_off = hole_off << 16;
+		else
+			chan_off = dram_base & 0xFFFFF8000000ULL;
+	}
+
+	return (sys_addr & 0x0000FFFFFFFFFFC0ULL) -
+			(chan_off & 0x0000FFFFFF800000ULL);
+}
+
+/* Hack for the time being - Can we get this from BIOS?? */
+#define	CH0SPARE_RANK	0
+#define	CH1SPARE_RANK	1
+
+/*
+ * checks if the csrow passed in is marked as SPARED, if so returns the new
+ * spare row
+ */
+static inline int f10_process_possible_spare(int csrow,
+				u32 cs, struct amd64_pvt *pvt)
+{
+	u32 swap_done;
+	u32 bad_dram_cs;
+
+	/* Depending on channel, isolate respective SPARING info */
+	if (cs) {
+		swap_done = F10_ONLINE_SPARE_SWAPDONE1(pvt->online_spare);
+		bad_dram_cs = F10_ONLINE_SPARE_BADDRAM_CS1(pvt->online_spare);
+		if (swap_done && (csrow == bad_dram_cs))
+			csrow = CH1SPARE_RANK;
+	} else {
+		swap_done = F10_ONLINE_SPARE_SWAPDONE0(pvt->online_spare);
+		bad_dram_cs = F10_ONLINE_SPARE_BADDRAM_CS0(pvt->online_spare);
+		if (swap_done && (csrow == bad_dram_cs))
+			csrow = CH0SPARE_RANK;
+	}
+	return csrow;
+}
+
+/*
+ * Iterate over the DRAM DCT "base" and "mask" registers looking for a
+ * SystemAddr match on the specified 'ChannelSelect' and 'NodeID'
+ *
+ * Return:
+ *	-EINVAL:  NOT FOUND
+ *	0..csrow = Chip-Select Row
+ */
+static int f10_lookup_addr_in_dct(u32 in_addr, u32 nid, u32 cs)
+{
+	struct mem_ctl_info *mci;
+	struct amd64_pvt *pvt;
+	u32 cs_base, cs_mask;
+	int cs_found = -EINVAL;
+	int csrow;
+
+	mci = mci_lookup[nid];
+	if (!mci)
+		return cs_found;
+
+	pvt = mci->pvt_info;
+
+	debugf1("InputAddr=0x%x  channelselect=%d\n", in_addr, cs);
+
+	for (csrow = 0; csrow < CHIPSELECT_COUNT; csrow++) {
+
+		cs_base = amd64_get_dct_base(pvt, cs, csrow);
+		if (!(cs_base & K8_DCSB_CS_ENABLE))
+			continue;
+
+		/*
+		 * We have an ENABLED CSROW, Isolate just the MASK bits of the
+		 * target: [28:19] and [13:5], which map to [36:27] and [21:13]
+		 * of the actual address.
+		 */
+		cs_base &= REV_F_F1Xh_DCSB_BASE_BITS;
+
+		/*
+		 * Get the DCT Mask, and ENABLE the reserved bits: [18:16] and
+		 * [4:0] to become ON. Then mask off bits [28:0] ([36:8])
+		 */
+		cs_mask = amd64_get_dct_mask(pvt, cs, csrow);
+
+		debugf1("    CSROW=%d CSBase=0x%x RAW CSMask=0x%x\n",
+				csrow, cs_base, cs_mask);
+
+		cs_mask = (cs_mask | 0x0007C01F) & 0x1FFFFFFF;
+
+		debugf1("              Final CSMask=0x%x\n", cs_mask);
+		debugf1("    (InputAddr & ~CSMask)=0x%x "
+				"(CSBase & ~CSMask)=0x%x\n",
+				(in_addr & ~cs_mask), (cs_base & ~cs_mask));
+
+		if ((in_addr & ~cs_mask) == (cs_base & ~cs_mask)) {
+			cs_found = f10_process_possible_spare(csrow, cs, pvt);
+
+			debugf1(" MATCH csrow=%d\n", cs_found);
+			break;
+		}
+	}
+	return cs_found;
+}
+
+
-- 
GitLab


From f71d0a05001afd10e2be491ca002c55c7df42ed8 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:22:43 +0200
Subject: [PATCH 1861/2198] amd64_edac: add F10h-and-later methods-p3

Borislav:

- compute dct_sel_base_off in f10_match_to_this_node() correctly since
it cannot be assumed that the Reserved bits are zero and they have to be
masked out instead.

- cleanup, remove StinkyIdentifiers, simplify logic
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 269 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 265 insertions(+), 4 deletions(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 744a49ac9f5cf..c2e2c3c37f5cf 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1398,6 +1398,10 @@ static void f10_read_dram_ctl_register(struct amd64_pvt *pvt)
 		debugf0("Reading F10_DCTL_SEL_HIGH failed\n");
 }
 
+/*
+ * determine channel based on the interleaving mode: F10h BKDG, 2.8.9 Memory
+ * Interleaving Modes.
+ */
 static u32 f10_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
 				int hi_range_sel, u32 intlv_en)
 {
@@ -1408,6 +1412,9 @@ static u32 f10_determine_channel(struct amd64_pvt *pvt, u64 sys_addr,
 	else if (hi_range_sel)
 		cs = dct_sel_high;
 	else if (dct_interleave_enabled(pvt)) {
+		/*
+		 * see F2x110[DctSelIntLvAddr] - channel interleave mode
+		 */
 		if (dct_sel_interleave_addr(pvt) == 0)
 			cs = sys_addr >> 6 & 1;
 		else if ((dct_sel_interleave_addr(pvt) >> 1) & 1) {
@@ -1445,22 +1452,23 @@ static inline u32 f10_map_intlv_en_to_shift(u32 intlv_en)
 	return 0;
 }
 
-static inline u64 f10_determine_base_addr_offset(u64 sys_addr, int hi_range_sel,
+/* See F10h BKDG, 2.8.10.2 DctSelBaseOffset Programming */
+static inline u64 f10_get_base_addr_offset(u64 sys_addr, int hi_range_sel,
 						 u32 dct_sel_base_addr,
 						 u64 dct_sel_base_off,
-						 u32 hole_en, u32 hole_off,
+						 u32 hole_valid, u32 hole_off,
 						 u64 dram_base)
 {
 	u64 chan_off;
 
 	if (hi_range_sel) {
 		if (!(dct_sel_base_addr & 0xFFFFF800) &&
-		   (hole_en & 1) && (sys_addr >= 0x100000000ULL))
+		   hole_valid && (sys_addr >= 0x100000000ULL))
 			chan_off = hole_off << 16;
 		else
 			chan_off = dct_sel_base_off;
 	} else {
-		if ((hole_en & 1) && (sys_addr >= 0x100000000ULL))
+		if (hole_valid && (sys_addr >= 0x100000000ULL))
 			chan_off = hole_off << 16;
 		else
 			chan_off = dram_base & 0xFFFFF8000000ULL;
@@ -1562,4 +1570,257 @@ static int f10_lookup_addr_in_dct(u32 in_addr, u32 nid, u32 cs)
 	return cs_found;
 }
 
+/* For a given @dram_range, check if @sys_addr falls within it. */
+static int f10_match_to_this_node(struct amd64_pvt *pvt, int dram_range,
+				  u64 sys_addr, int *nid, int *chan_sel)
+{
+	int node_id, cs_found = -EINVAL, high_range = 0;
+	u32 intlv_en, intlv_sel, intlv_shift, hole_off;
+	u32 hole_valid, tmp, dct_sel_base, channel;
+	u64 dram_base, chan_addr, dct_sel_base_off;
+
+	dram_base = pvt->dram_base[dram_range];
+	intlv_en = pvt->dram_IntlvEn[dram_range];
+
+	node_id = pvt->dram_DstNode[dram_range];
+	intlv_sel = pvt->dram_IntlvSel[dram_range];
+
+	debugf1("(dram=%d) Base=0x%llx SystemAddr= 0x%llx Limit=0x%llx\n",
+		dram_range, dram_base, sys_addr, pvt->dram_limit[dram_range]);
+
+	/*
+	 * This assumes that one node's DHAR is the same as all the other
+	 * nodes' DHAR.
+	 */
+	hole_off = (pvt->dhar & 0x0000FF80);
+	hole_valid = (pvt->dhar & 0x1);
+	dct_sel_base_off = (pvt->dram_ctl_select_high & 0xFFFFFC00) << 16;
+
+	debugf1("   HoleOffset=0x%x  HoleValid=0x%x IntlvSel=0x%x\n",
+			hole_off, hole_valid, intlv_sel);
+
+	if (intlv_en ||
+	    (intlv_sel != ((sys_addr >> 12) & intlv_en)))
+		return -EINVAL;
+
+	dct_sel_base = dct_sel_baseaddr(pvt);
+
+	/*
+	 * check whether addresses >= DctSelBaseAddr[47:27] are to be used to
+	 * select between DCT0 and DCT1.
+	 */
+	if (dct_high_range_enabled(pvt) &&
+	   !dct_ganging_enabled(pvt) &&
+	   ((sys_addr >> 27) >= (dct_sel_base >> 11)))
+		high_range = 1;
+
+	channel = f10_determine_channel(pvt, sys_addr, high_range, intlv_en);
+
+	chan_addr = f10_get_base_addr_offset(sys_addr, high_range, dct_sel_base,
+					     dct_sel_base_off, hole_valid,
+					     hole_off, dram_base);
+
+	intlv_shift = f10_map_intlv_en_to_shift(intlv_en);
+
+	/* remove Node ID (in case of memory interleaving) */
+	tmp = chan_addr & 0xFC0;
+
+	chan_addr = ((chan_addr >> intlv_shift) & 0xFFFFFFFFF000ULL) | tmp;
+
+	/* remove channel interleave and hash */
+	if (dct_interleave_enabled(pvt) &&
+	   !dct_high_range_enabled(pvt) &&
+	   !dct_ganging_enabled(pvt)) {
+		if (dct_sel_interleave_addr(pvt) != 1)
+			chan_addr = (chan_addr >> 1) & 0xFFFFFFFFFFFFFFC0ULL;
+		else {
+			tmp = chan_addr & 0xFC0;
+			chan_addr = ((chan_addr & 0xFFFFFFFFFFFFC000ULL) >> 1)
+					| tmp;
+		}
+	}
+
+	debugf1("   (ChannelAddrLong=0x%llx) >> 8 becomes InputAddr=0x%x\n",
+		chan_addr, (u32)(chan_addr >> 8));
+
+	cs_found = f10_lookup_addr_in_dct(chan_addr >> 8, node_id, channel);
+
+	if (cs_found >= 0) {
+		*nid = node_id;
+		*chan_sel = channel;
+	}
+	return cs_found;
+}
+
+static int f10_translate_sysaddr_to_cs(struct amd64_pvt *pvt, u64 sys_addr,
+				       int *node, int *chan_sel)
+{
+	int dram_range, cs_found = -EINVAL;
+	u64 dram_base, dram_limit;
+
+	for (dram_range = 0; dram_range < DRAM_REG_COUNT; dram_range++) {
+
+		if (!pvt->dram_rw_en[dram_range])
+			continue;
+
+		dram_base = pvt->dram_base[dram_range];
+		dram_limit = pvt->dram_limit[dram_range];
+
+		if ((dram_base <= sys_addr) && (sys_addr <= dram_limit)) {
+
+			cs_found = f10_match_to_this_node(pvt, dram_range,
+							  sys_addr, node,
+							  chan_sel);
+			if (cs_found >= 0)
+				break;
+		}
+	}
+	return cs_found;
+}
+
+/*
+ * This the F10h reference code from AMD to map a @sys_addr to NodeID,
+ * CSROW, Channel.
+ *
+ * The @sys_addr is usually an error address received from the hardware.
+ */
+static void f10_map_sysaddr_to_csrow(struct mem_ctl_info *mci,
+				     struct amd64_error_info_regs *info,
+				     u64 sys_addr)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u32 page, offset;
+	unsigned short syndrome;
+	int nid, csrow, chan = 0;
+
+	csrow = f10_translate_sysaddr_to_cs(pvt, sys_addr, &nid, &chan);
+
+	if (csrow >= 0) {
+		error_address_to_page_and_offset(sys_addr, &page, &offset);
+
+		syndrome = EXTRACT_HIGH_SYNDROME(info->nbsl) << 8;
+		syndrome |= EXTRACT_LOW_SYNDROME(info->nbsh);
+
+		/*
+		 * Is CHIPKILL on? If so, then we can attempt to use the
+		 * syndrome to isolate which channel the error was on.
+		 */
+		if (pvt->nbcfg & K8_NBCFG_CHIPKILL)
+			chan = get_channel_from_ecc_syndrome(syndrome);
+
+		if (chan >= 0) {
+			edac_mc_handle_ce(mci, page, offset, syndrome,
+					csrow, chan, EDAC_MOD_STR);
+		} else {
+			/*
+			 * Channel unknown, report all channels on this
+			 * CSROW as failed.
+			 */
+			for (chan = 0; chan < mci->csrows[csrow].nr_channels;
+								chan++) {
+					edac_mc_handle_ce(mci, page, offset,
+							syndrome,
+							csrow, chan,
+							EDAC_MOD_STR);
+			}
+		}
+
+	} else {
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+	}
+}
+
+/*
+ * Input (@index) is the DBAM DIMM value (1 of 4) used as an index into a shift
+ * table (revf_quad_ddr2_shift) which starts at 128MB DIMM size. Index of 0
+ * indicates an empty DIMM slot, as reported by Hardware on empty slots.
+ *
+ * Normalize to 128MB by subracting 27 bit shift.
+ */
+static int map_dbam_to_csrow_size(int index)
+{
+	int mega_bytes = 0;
+
+	if (index > 0 && index <= DBAM_MAX_VALUE)
+		mega_bytes = ((128 << (revf_quad_ddr2_shift[index]-27)));
+
+	return mega_bytes;
+}
+
+/*
+ * debug routine to display the memory sizes of a DIMM (ganged or not) and it
+ * CSROWs as well
+ */
+static void f10_debug_display_dimm_sizes(int ctrl, struct amd64_pvt *pvt,
+					 int ganged)
+{
+	int dimm, size0, size1;
+	u32 dbam;
+	u32 *dcsb;
+
+	debugf1("  dbam%d: 0x%8.08x  CSROW is %s\n", ctrl,
+			ctrl ? pvt->dbam1 : pvt->dbam0,
+			ganged ? "GANGED - dbam1 not used" : "NON-GANGED");
+
+	dbam = ctrl ? pvt->dbam1 : pvt->dbam0;
+	dcsb = ctrl ? pvt->dcsb1 : pvt->dcsb0;
+
+	/* Dump memory sizes for DIMM and its CSROWs */
+	for (dimm = 0; dimm < 4; dimm++) {
+
+		size0 = 0;
+		if (dcsb[dimm*2] & K8_DCSB_CS_ENABLE)
+			size0 = map_dbam_to_csrow_size(DBAM_DIMM(dimm, dbam));
+
+		size1 = 0;
+		if (dcsb[dimm*2 + 1] & K8_DCSB_CS_ENABLE)
+			size1 = map_dbam_to_csrow_size(DBAM_DIMM(dimm, dbam));
+
+		debugf1("     CTRL-%d DIMM-%d=%5dMB   CSROW-%d=%5dMB "
+				"CSROW-%d=%5dMB\n",
+				ctrl,
+				dimm,
+				size0 + size1,
+				dimm * 2,
+				size0,
+				dimm * 2 + 1,
+				size1);
+	}
+}
+
+/*
+ * Very early hardware probe on pci_probe thread to determine if this module
+ * supports the hardware.
+ *
+ * Return:
+ *      0 for OK
+ *      1 for error
+ */
+static int f10_probe_valid_hardware(struct amd64_pvt *pvt)
+{
+	int ret = 0;
+
+	/*
+	 * If we are on a DDR3 machine, we don't know yet if
+	 * we support that properly at this time
+	 */
+	if ((pvt->dchr0 & F10_DCHR_Ddr3Mode) ||
+	    (pvt->dchr1 & F10_DCHR_Ddr3Mode)) {
+
+		amd64_printk(KERN_WARNING,
+			"%s() This machine is running with DDR3 memory. "
+			"This is not currently supported. "
+			"DCHR0=0x%x DCHR1=0x%x\n",
+			__func__, pvt->dchr0, pvt->dchr1);
+
+		amd64_printk(KERN_WARNING,
+			"   Contact '%s' module MAINTAINER to help add"
+			" support.\n",
+			EDAC_MOD_STR);
+
+		ret = 1;
+
+	}
+	return ret;
+}
 
-- 
GitLab


From 4d37607adbff69596a3170cf84badaf26efc59ac Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:25:05 +0200
Subject: [PATCH 1862/2198] amd64_edac: add per-family descriptors

Borislav:

- fix comments
- fix function return value patterns

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 72 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 72 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index c2e2c3c37f5cf..e5f4a92faa3b9 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1824,3 +1824,75 @@ static int f10_probe_valid_hardware(struct amd64_pvt *pvt)
 	return ret;
 }
 
+/*
+ * There currently are 3 types type of MC devices for AMD Athlon/Opterons
+ * (as per PCI DEVICE_IDs):
+ *
+ * Family K8: That is the Athlon64 and Opteron CPUs. They all have the same PCI
+ * DEVICE ID, even though there is differences between the different Revisions
+ * (CG,D,E,F).
+ *
+ * Family F10h and F11h.
+ *
+ */
+static struct amd64_family_type amd64_family_types[] = {
+	[K8_CPUS] = {
+		.ctl_name = "RevF",
+		.addr_f1_ctl = PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+		.misc_f3_ctl = PCI_DEVICE_ID_AMD_K8_NB_MISC,
+		.ops = {
+			.early_channel_count = k8_early_channel_count,
+			.get_error_address = k8_get_error_address,
+			.read_dram_base_limit = k8_read_dram_base_limit,
+			.map_sysaddr_to_csrow = k8_map_sysaddr_to_csrow,
+			.dbam_map_to_pages = k8_dbam_map_to_pages,
+		}
+	},
+	[F10_CPUS] = {
+		.ctl_name = "Family 10h",
+		.addr_f1_ctl = PCI_DEVICE_ID_AMD_10H_NB_MAP,
+		.misc_f3_ctl = PCI_DEVICE_ID_AMD_10H_NB_MISC,
+		.ops = {
+			.probe_valid_hardware = f10_probe_valid_hardware,
+			.early_channel_count = f10_early_channel_count,
+			.get_error_address = f10_get_error_address,
+			.read_dram_base_limit = f10_read_dram_base_limit,
+			.read_dram_ctl_register = f10_read_dram_ctl_register,
+			.map_sysaddr_to_csrow = f10_map_sysaddr_to_csrow,
+			.dbam_map_to_pages = f10_dbam_map_to_pages,
+		}
+	},
+	[F11_CPUS] = {
+		.ctl_name = "Family 11h",
+		.addr_f1_ctl = PCI_DEVICE_ID_AMD_11H_NB_MAP,
+		.misc_f3_ctl = PCI_DEVICE_ID_AMD_11H_NB_MISC,
+		.ops = {
+			.probe_valid_hardware = f10_probe_valid_hardware,
+			.early_channel_count = f10_early_channel_count,
+			.get_error_address = f10_get_error_address,
+			.read_dram_base_limit = f10_read_dram_base_limit,
+			.read_dram_ctl_register = f10_read_dram_ctl_register,
+			.map_sysaddr_to_csrow = f10_map_sysaddr_to_csrow,
+			.dbam_map_to_pages = f10_dbam_map_to_pages,
+		}
+	},
+};
+
+static struct pci_dev *pci_get_related_function(unsigned int vendor,
+						unsigned int device,
+						struct pci_dev *related)
+{
+	struct pci_dev *dev = NULL;
+
+	dev = pci_get_device(vendor, device, dev);
+	while (dev) {
+		if ((dev->bus->number == related->bus->number) &&
+		    (PCI_SLOT(dev->devfn) == PCI_SLOT(related->devfn)))
+			break;
+		dev = pci_get_device(vendor, device, dev);
+	}
+
+	return dev;
+}
+
+
-- 
GitLab


From b1289d6f9d23abab396077abb65d5a23a775cdb0 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 16:37:05 +0200
Subject: [PATCH 1863/2198] amd64_edac: add ECC chipkill syndrome mapping table

Borislav:

- fix comments
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 135 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index e5f4a92faa3b9..feb4986ea76d2 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1895,4 +1895,139 @@ static struct pci_dev *pci_get_related_function(unsigned int vendor,
 	return dev;
 }
 
+/*
+ * syndrome mapping table for ECC ChipKill devices
+ *
+ * The comment in each row is the token (nibble) number that is in error.
+ * The least significant nibble of the syndrome is the mask for the bits
+ * that are in error (need to be toggled) for the particular nibble.
+ *
+ * Each row contains 16 entries.
+ * The first entry (0th) is the channel number for that row of syndromes.
+ * The remaining 15 entries are the syndromes for the respective Error
+ * bit mask index.
+ *
+ * 1st index entry is 0x0001 mask, indicating that the rightmost bit is the
+ * bit in error.
+ * The 2nd index entry is 0x0010 that the second bit is damaged.
+ * The 3rd index entry is 0x0011 indicating that the rightmost 2 bits
+ * are damaged.
+ * Thus so on until index 15, 0x1111, whose entry has the syndrome
+ * indicating that all 4 bits are damaged.
+ *
+ * A search is performed on this table looking for a given syndrome.
+ *
+ * See the AMD documentation for ECC syndromes. This ECC table is valid
+ * across all the versions of the AMD64 processors.
+ *
+ * A fast lookup is to use the LAST four bits of the 16-bit syndrome as a
+ * COLUMN index, then search all ROWS of that column, looking for a match
+ * with the input syndrome. The ROW value will be the token number.
+ *
+ * The 0'th entry on that row, can be returned as the CHANNEL (0 or 1) of this
+ * error.
+ */
+#define NUMBER_ECC_ROWS  36
+static const unsigned short ecc_chipkill_syndromes[NUMBER_ECC_ROWS][16] = {
+	/* Channel 0 syndromes */
+	{/*0*/  0, 0xe821, 0x7c32, 0x9413, 0xbb44, 0x5365, 0xc776, 0x2f57,
+	   0xdd88, 0x35a9, 0xa1ba, 0x499b, 0x66cc, 0x8eed, 0x1afe, 0xf2df },
+	{/*1*/  0, 0x5d31, 0xa612, 0xfb23, 0x9584, 0xc8b5, 0x3396, 0x6ea7,
+	   0xeac8, 0xb7f9, 0x4cda, 0x11eb, 0x7f4c, 0x227d, 0xd95e, 0x846f },
+	{/*2*/  0, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007,
+	   0x0008, 0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x000e, 0x000f },
+	{/*3*/  0, 0x2021, 0x3032, 0x1013, 0x4044, 0x6065, 0x7076, 0x5057,
+	   0x8088, 0xa0a9, 0xb0ba, 0x909b, 0xc0cc, 0xe0ed, 0xf0fe, 0xd0df },
+	{/*4*/  0, 0x5041, 0xa082, 0xf0c3, 0x9054, 0xc015, 0x30d6, 0x6097,
+	   0xe0a8, 0xb0e9, 0x402a, 0x106b, 0x70fc, 0x20bd, 0xd07e, 0x803f },
+	{/*5*/  0, 0xbe21, 0xd732, 0x6913, 0x2144, 0x9f65, 0xf676, 0x4857,
+	   0x3288, 0x8ca9, 0xe5ba, 0x5b9b, 0x13cc, 0xaded, 0xc4fe, 0x7adf },
+	{/*6*/  0, 0x4951, 0x8ea2, 0xc7f3, 0x5394, 0x1ac5, 0xdd36, 0x9467,
+	   0xa1e8, 0xe8b9, 0x2f4a, 0x661b, 0xf27c, 0xbb2d, 0x7cde, 0x358f },
+	{/*7*/  0, 0x74e1, 0x9872, 0xec93, 0xd6b4, 0xa255, 0x4ec6, 0x3a27,
+	   0x6bd8, 0x1f39, 0xf3aa, 0x874b, 0xbd6c, 0xc98d, 0x251e, 0x51ff },
+	{/*8*/  0, 0x15c1, 0x2a42, 0x3f83, 0xcef4, 0xdb35, 0xe4b6, 0xf177,
+	   0x4758, 0x5299, 0x6d1a, 0x78db, 0x89ac, 0x9c6d, 0xa3ee, 0xb62f },
+	{/*9*/  0, 0x3d01, 0x1602, 0x2b03, 0x8504, 0xb805, 0x9306, 0xae07,
+	   0xca08, 0xf709, 0xdc0a, 0xe10b, 0x4f0c, 0x720d, 0x590e, 0x640f },
+	{/*a*/  0, 0x9801, 0xec02, 0x7403, 0x6b04, 0xf305, 0x8706, 0x1f07,
+	   0xbd08, 0x2509, 0x510a, 0xc90b, 0xd60c, 0x4e0d, 0x3a0e, 0xa20f },
+	{/*b*/  0, 0xd131, 0x6212, 0xb323, 0x3884, 0xe9b5, 0x5a96, 0x8ba7,
+	   0x1cc8, 0xcdf9, 0x7eda, 0xafeb, 0x244c, 0xf57d, 0x465e, 0x976f },
+	{/*c*/  0, 0xe1d1, 0x7262, 0x93b3, 0xb834, 0x59e5, 0xca56, 0x2b87,
+	   0xdc18, 0x3dc9, 0xae7a, 0x4fab, 0x542c, 0x85fd, 0x164e, 0xf79f },
+	{/*d*/  0, 0x6051, 0xb0a2, 0xd0f3, 0x1094, 0x70c5, 0xa036, 0xc067,
+	   0x20e8, 0x40b9, 0x904a, 0x601b, 0x307c, 0x502d, 0x80de, 0xe08f },
+	{/*e*/  0, 0xa4c1, 0xf842, 0x5c83, 0xe6f4, 0x4235, 0x1eb6, 0xba77,
+	   0x7b58, 0xdf99, 0x831a, 0x27db, 0x9dac, 0x396d, 0x65ee, 0xc12f },
+	{/*f*/  0, 0x11c1, 0x2242, 0x3383, 0xc8f4, 0xd935, 0xeab6, 0xfb77,
+	   0x4c58, 0x5d99, 0x6e1a, 0x7fdb, 0x84ac, 0x956d, 0xa6ee, 0xb72f },
+
+	/* Channel 1 syndromes */
+	{/*10*/ 1, 0x45d1, 0x8a62, 0xcfb3, 0x5e34, 0x1be5, 0xd456, 0x9187,
+	   0xa718, 0xe2c9, 0x2d7a, 0x68ab, 0xf92c, 0xbcfd, 0x734e, 0x369f },
+	{/*11*/ 1, 0x63e1, 0xb172, 0xd293, 0x14b4, 0x7755, 0xa5c6, 0xc627,
+	   0x28d8, 0x4b39, 0x99aa, 0xfa4b, 0x3c6c, 0x5f8d, 0x8d1e, 0xeeff },
+	{/*12*/ 1, 0xb741, 0xd982, 0x6ec3, 0x2254, 0x9515, 0xfbd6, 0x4c97,
+	   0x33a8, 0x84e9, 0xea2a, 0x5d6b, 0x11fc, 0xa6bd, 0xc87e, 0x7f3f },
+	{/*13*/ 1, 0xdd41, 0x6682, 0xbbc3, 0x3554, 0xe815, 0x53d6, 0xce97,
+	   0x1aa8, 0xc7e9, 0x7c2a, 0xa1fb, 0x2ffc, 0xf2bd, 0x497e, 0x943f },
+	{/*14*/ 1, 0x2bd1, 0x3d62, 0x16b3, 0x4f34, 0x64e5, 0x7256, 0x5987,
+	   0x8518, 0xaec9, 0xb87a, 0x93ab, 0xca2c, 0xe1fd, 0xf74e, 0xdc9f },
+	{/*15*/ 1, 0x83c1, 0xc142, 0x4283, 0xa4f4, 0x2735, 0x65b6, 0xe677,
+	   0xf858, 0x7b99, 0x391a, 0xbadb, 0x5cac, 0xdf6d, 0x9dee, 0x1e2f },
+	{/*16*/ 1, 0x8fd1, 0xc562, 0x4ab3, 0xa934, 0x26e5, 0x6c56, 0xe387,
+	   0xfe18, 0x71c9, 0x3b7a, 0xb4ab, 0x572c, 0xd8fd, 0x924e, 0x1d9f },
+	{/*17*/ 1, 0x4791, 0x89e2, 0xce73, 0x5264, 0x15f5, 0xdb86, 0x9c17,
+	   0xa3b8, 0xe429, 0x2a5a, 0x6dcb, 0xf1dc, 0xb64d, 0x783e, 0x3faf },
+	{/*18*/ 1, 0x5781, 0xa9c2, 0xfe43, 0x92a4, 0xc525, 0x3b66, 0x6ce7,
+	   0xe3f8, 0xb479, 0x4a3a, 0x1dbb, 0x715c, 0x26dd, 0xd89e, 0x8f1f },
+	{/*19*/ 1, 0xbf41, 0xd582, 0x6ac3, 0x2954, 0x9615, 0xfcd6, 0x4397,
+	   0x3ea8, 0x81e9, 0xeb2a, 0x546b, 0x17fc, 0xa8bd, 0xc27e, 0x7d3f },
+	{/*1a*/ 1, 0x9891, 0xe1e2, 0x7273, 0x6464, 0xf7f5, 0x8586, 0x1617,
+	   0xb8b8, 0x2b29, 0x595a, 0xcacb, 0xdcdc, 0x4f4d, 0x3d3e, 0xaeaf },
+	{/*1b*/ 1, 0xcce1, 0x4472, 0x8893, 0xfdb4, 0x3f55, 0xb9c6, 0x7527,
+	   0x56d8, 0x9a39, 0x12aa, 0xde4b, 0xab6c, 0x678d, 0xef1e, 0x23ff },
+	{/*1c*/ 1, 0xa761, 0xf9b2, 0x5ed3, 0xe214, 0x4575, 0x1ba6, 0xbcc7,
+	   0x7328, 0xd449, 0x8a9a, 0x2dfb, 0x913c, 0x365d, 0x688e, 0xcfef },
+	{/*1d*/ 1, 0xff61, 0x55b2, 0xaad3, 0x7914, 0x8675, 0x2ca6, 0xd3c7,
+	   0x9e28, 0x6149, 0xcb9a, 0x34fb, 0xe73c, 0x185d, 0xb28e, 0x4def },
+	{/*1e*/ 1, 0x5451, 0xa8a2, 0xfcf3, 0x9694, 0xc2c5, 0x3e36, 0x6a67,
+	   0xebe8, 0xbfb9, 0x434a, 0x171b, 0x7d7c, 0x292d, 0xd5de, 0x818f },
+	{/*1f*/ 1, 0x6fc1, 0xb542, 0xda83, 0x19f4, 0x7635, 0xacb6, 0xc377,
+	   0x2e58, 0x4199, 0x9b1a, 0xf4db, 0x37ac, 0x586d, 0x82ee, 0xed2f },
+
+	/* ECC bits are also in the set of tokens and they too can go bad
+	 * first 2 cover channel 0, while the second 2 cover channel 1
+	 */
+	{/*20*/ 0, 0xbe01, 0xd702, 0x6903, 0x2104, 0x9f05, 0xf606, 0x4807,
+	   0x3208, 0x8c09, 0xe50a, 0x5b0b, 0x130c, 0xad0d, 0xc40e, 0x7a0f },
+	{/*21*/ 0, 0x4101, 0x8202, 0xc303, 0x5804, 0x1905, 0xda06, 0x9b07,
+	   0xac08, 0xed09, 0x2e0a, 0x6f0b, 0x640c, 0xb50d, 0x760e, 0x370f },
+	{/*22*/ 1, 0xc441, 0x4882, 0x8cc3, 0xf654, 0x3215, 0xbed6, 0x7a97,
+	   0x5ba8, 0x9fe9, 0x132a, 0xd76b, 0xadfc, 0x69bd, 0xe57e, 0x213f },
+	{/*23*/ 1, 0x7621, 0x9b32, 0xed13, 0xda44, 0xac65, 0x4176, 0x3757,
+	   0x6f88, 0x19a9, 0xf4ba, 0x829b, 0xb5cc, 0xc3ed, 0x2efe, 0x58df }
+};
+
+/*
+ * Given the syndrome argument, scan each of the channel tables for a syndrome
+ * match. Depending on which table it is found, return the channel number.
+ */
+static int get_channel_from_ecc_syndrome(unsigned short syndrome)
+{
+	int row;
+	int column;
 
+	/* Determine column to scan */
+	column = syndrome & 0xF;
+
+	/* Scan all rows, looking for syndrome, or end of table */
+	for (row = 0; row < NUMBER_ECC_ROWS; row++) {
+		if (ecc_chipkill_syndromes[row][column] == syndrome)
+			return ecc_chipkill_syndromes[row][0];
+	}
+
+	debugf0("syndrome(%x) not found\n", syndrome);
+	return -1;
+}
-- 
GitLab


From d27bf6fa369ca0272df10558d2f290d6fc72e675 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Wed, 6 May 2009 17:55:27 +0200
Subject: [PATCH 1864/2198] amd64_edac: add error decoding logic

Borislav:

- fold amd64_error_info_valid() into its only user
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 425 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 425 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index feb4986ea76d2..09991c8a6ee32 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2031,3 +2031,428 @@ static int get_channel_from_ecc_syndrome(unsigned short syndrome)
 	debugf0("syndrome(%x) not found\n", syndrome);
 	return -1;
 }
+
+/*
+ * Check for valid error in the NB Status High register. If so, proceed to read
+ * NB Status Low, NB Address Low and NB Address High registers and store data
+ * into error structure.
+ *
+ * Returns:
+ *	- 1: if hardware regs contains valid error info
+ *	- 0: if no valid error is indicated
+ */
+static int amd64_get_error_info_regs(struct mem_ctl_info *mci,
+				     struct amd64_error_info_regs *regs)
+{
+	struct amd64_pvt *pvt;
+	struct pci_dev *misc_f3_ctl;
+	int err = 0;
+
+	pvt = mci->pvt_info;
+	misc_f3_ctl = pvt->misc_f3_ctl;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBSH, &regs->nbsh);
+	if (err)
+		goto err_reg;
+
+	if (!(regs->nbsh & K8_NBSH_VALID_BIT))
+		return 0;
+
+	/* valid error, read remaining error information registers */
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBSL, &regs->nbsl);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAL, &regs->nbeal);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBEAH, &regs->nbeah);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(misc_f3_ctl, K8_NBCFG, &regs->nbcfg);
+	if (err)
+		goto err_reg;
+
+	return 1;
+
+err_reg:
+	debugf0("Reading error info register failed\n");
+	return 0;
+}
+
+/*
+ * This function is called to retrieve the error data from hardware and store it
+ * in the info structure.
+ *
+ * Returns:
+ *	- 1: if a valid error is found
+ *	- 0: if no error is found
+ */
+static int amd64_get_error_info(struct mem_ctl_info *mci,
+				struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt;
+	struct amd64_error_info_regs regs;
+
+	pvt = mci->pvt_info;
+
+	if (!amd64_get_error_info_regs(mci, info))
+		return 0;
+
+	/*
+	 * Here's the problem with the K8's EDAC reporting: There are four
+	 * registers which report pieces of error information. They are shared
+	 * between CEs and UEs. Furthermore, contrary to what is stated in the
+	 * BKDG, the overflow bit is never used! Every error always updates the
+	 * reporting registers.
+	 *
+	 * Can you see the race condition? All four error reporting registers
+	 * must be read before a new error updates them! There is no way to read
+	 * all four registers atomically. The best than can be done is to detect
+	 * that a race has occured and then report the error without any kind of
+	 * precision.
+	 *
+	 * What is still positive is that errors are still reported and thus
+	 * problems can still be detected - just not localized because the
+	 * syndrome and address are spread out across registers.
+	 *
+	 * Grrrrr!!!!!  Here's hoping that AMD fixes this in some future K8 rev.
+	 * UEs and CEs should have separate register sets with proper overflow
+	 * bits that are used! At very least the problem can be fixed by
+	 * honoring the ErrValid bit in 'nbsh' and not updating registers - just
+	 * set the overflow bit - unless the current error is CE and the new
+	 * error is UE which would be the only situation for overwriting the
+	 * current values.
+	 */
+
+	regs = *info;
+
+	/* Use info from the second read - most current */
+	if (unlikely(!amd64_get_error_info_regs(mci, info)))
+		return 0;
+
+	/* clear the error bits in hardware */
+	pci_write_bits32(pvt->misc_f3_ctl, K8_NBSH, 0, K8_NBSH_VALID_BIT);
+
+	/* Check for the possible race condition */
+	if ((regs.nbsh != info->nbsh) ||
+	     (regs.nbsl != info->nbsl) ||
+	     (regs.nbeah != info->nbeah) ||
+	     (regs.nbeal != info->nbeal)) {
+		amd64_mc_printk(mci, KERN_WARNING,
+				"hardware STATUS read access race condition "
+				"detected!\n");
+		return 0;
+	}
+	return 1;
+}
+
+static inline void amd64_decode_gart_tlb_error(struct mem_ctl_info *mci,
+					 struct amd64_error_info_regs *info)
+{
+	u32 err_code;
+	u32 ec_tt;		/* error code transaction type (2b) */
+	u32 ec_ll;		/* error code cache level (2b) */
+
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_tt = EXTRACT_TT_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		     "GART TLB event: transaction type(%s), "
+		     "cache level(%s)\n", tt_msgs[ec_tt], ll_msgs[ec_ll]);
+}
+
+static inline void amd64_decode_mem_cache_error(struct mem_ctl_info *mci,
+				      struct amd64_error_info_regs *info)
+{
+	u32 err_code;
+	u32 ec_rrrr;		/* error code memory transaction (4b) */
+	u32 ec_tt;		/* error code transaction type (2b) */
+	u32 ec_ll;		/* error code cache level (2b) */
+
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_tt = EXTRACT_TT_CODE(err_code);
+	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		     "cache hierarchy error: memory transaction type(%s), "
+		     "transaction type(%s), cache level(%s)\n",
+		     rrrr_msgs[ec_rrrr], tt_msgs[ec_tt], ll_msgs[ec_ll]);
+}
+
+
+/*
+ * Handle any Correctable Errors (CEs) that have occurred. Check for valid ERROR
+ * ADDRESS and process.
+ */
+static void amd64_handle_ce(struct mem_ctl_info *mci,
+			    struct amd64_error_info_regs *info)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	u64 SystemAddress;
+
+	/* Ensure that the Error Address is VALID */
+	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
+		amd64_mc_printk(mci, KERN_ERR,
+			"HW has no ERROR_ADDRESS available\n");
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR);
+		return;
+	}
+
+	SystemAddress = extract_error_address(mci, info);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		"CE ERROR_ADDRESS= 0x%llx\n", SystemAddress);
+
+	pvt->ops->map_sysaddr_to_csrow(mci, info, SystemAddress);
+}
+
+/* Handle any Un-correctable Errors (UEs) */
+static void amd64_handle_ue(struct mem_ctl_info *mci,
+			    struct amd64_error_info_regs *info)
+{
+	int csrow;
+	u64 SystemAddress;
+	u32 page, offset;
+	struct mem_ctl_info *log_mci, *src_mci = NULL;
+
+	log_mci = mci;
+
+	if ((info->nbsh & K8_NBSH_VALID_ERROR_ADDR) == 0) {
+		amd64_mc_printk(mci, KERN_CRIT,
+			"HW has no ERROR_ADDRESS available\n");
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		return;
+	}
+
+	SystemAddress = extract_error_address(mci, info);
+
+	/*
+	 * Find out which node the error address belongs to. This may be
+	 * different from the node that detected the error.
+	 */
+	src_mci = find_mc_by_sys_addr(mci, SystemAddress);
+	if (!src_mci) {
+		amd64_mc_printk(mci, KERN_CRIT,
+			"ERROR ADDRESS (0x%lx) value NOT mapped to a MC\n",
+			(unsigned long)SystemAddress);
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+		return;
+	}
+
+	log_mci = src_mci;
+
+	csrow = sys_addr_to_csrow(log_mci, SystemAddress);
+	if (csrow < 0) {
+		amd64_mc_printk(mci, KERN_CRIT,
+			"ERROR_ADDRESS (0x%lx) value NOT mapped to 'csrow'\n",
+			(unsigned long)SystemAddress);
+		edac_mc_handle_ue_no_info(log_mci, EDAC_MOD_STR);
+	} else {
+		error_address_to_page_and_offset(SystemAddress, &page, &offset);
+		edac_mc_handle_ue(log_mci, page, offset, csrow, EDAC_MOD_STR);
+	}
+}
+
+static void amd64_decode_bus_error(struct mem_ctl_info *mci,
+				   struct amd64_error_info_regs *info)
+{
+	u32 err_code, ext_ec;
+	u32 ec_pp;		/* error code participating processor (2p) */
+	u32 ec_to;		/* error code timed out (1b) */
+	u32 ec_rrrr;		/* error code memory transaction (4b) */
+	u32 ec_ii;		/* error code memory or I/O (2b) */
+	u32 ec_ll;		/* error code cache level (2b) */
+
+	ext_ec = EXTRACT_EXT_ERROR_CODE(info->nbsl);
+	err_code = EXTRACT_ERROR_CODE(info->nbsl);
+
+	ec_ll = EXTRACT_LL_CODE(err_code);
+	ec_ii = EXTRACT_II_CODE(err_code);
+	ec_rrrr = EXTRACT_RRRR_CODE(err_code);
+	ec_to = EXTRACT_TO_CODE(err_code);
+	ec_pp = EXTRACT_PP_CODE(err_code);
+
+	amd64_mc_printk(mci, KERN_ERR,
+		"BUS ERROR:\n"
+		"  time-out(%s) mem or i/o(%s)\n"
+		"  participating processor(%s)\n"
+		"  memory transaction type(%s)\n"
+		"  cache level(%s) Error Found by: %s\n",
+		to_msgs[ec_to],
+		ii_msgs[ec_ii],
+		pp_msgs[ec_pp],
+		rrrr_msgs[ec_rrrr],
+		ll_msgs[ec_ll],
+		(info->nbsh & K8_NBSH_ERR_SCRUBER) ?
+			"Scrubber" : "Normal Operation");
+
+	/* If this was an 'observed' error, early out */
+	if (ec_pp == K8_NBSL_PP_OBS)
+		return;		/* We aren't the node involved */
+
+	/* Parse out the extended error code for ECC events */
+	switch (ext_ec) {
+	/* F10 changed to one Extended ECC error code */
+	case F10_NBSL_EXT_ERR_RES:		/* Reserved field */
+	case F10_NBSL_EXT_ERR_ECC:		/* F10 ECC ext err code */
+		break;
+
+	default:
+		amd64_mc_printk(mci, KERN_ERR, "NOT ECC: no special error "
+					       "handling for this error\n");
+		return;
+	}
+
+	if (info->nbsh & K8_NBSH_CECC)
+		amd64_handle_ce(mci, info);
+	else if (info->nbsh & K8_NBSH_UECC)
+		amd64_handle_ue(mci, info);
+
+	/*
+	 * If main error is CE then overflow must be CE.  If main error is UE
+	 * then overflow is unknown.  We'll call the overflow a CE - if
+	 * panic_on_ue is set then we're already panic'ed and won't arrive
+	 * here. Else, then apparently someone doesn't think that UE's are
+	 * catastrophic.
+	 */
+	if (info->nbsh & K8_NBSH_OVERFLOW)
+		edac_mc_handle_ce_no_info(mci, EDAC_MOD_STR
+					  "Error Overflow set");
+}
+
+int amd64_process_error_info(struct mem_ctl_info *mci,
+			     struct amd64_error_info_regs *info,
+			     int handle_errors)
+{
+	struct amd64_pvt *pvt;
+	struct amd64_error_info_regs *regs;
+	u32 err_code, ext_ec;
+	int gart_tlb_error = 0;
+
+	pvt = mci->pvt_info;
+
+	/* If caller doesn't want us to process the error, return */
+	if (!handle_errors)
+		return 1;
+
+	regs = info;
+
+	debugf1("NorthBridge ERROR: mci(0x%p)\n", mci);
+	debugf1("  MC node(%d) Error-Address(0x%.8x-%.8x)\n",
+		pvt->mc_node_id, regs->nbeah, regs->nbeal);
+	debugf1("  nbsh(0x%.8x) nbsl(0x%.8x)\n",
+		regs->nbsh, regs->nbsl);
+	debugf1("  Valid Error=%s Overflow=%s\n",
+		(regs->nbsh & K8_NBSH_VALID_BIT) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_OVERFLOW) ? "True" : "False");
+	debugf1("  Err Uncorrected=%s MCA Error Reporting=%s\n",
+		(regs->nbsh & K8_NBSH_UNCORRECTED_ERR) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_ERR_ENABLE) ?
+			"True" : "False");
+	debugf1("  MiscErr Valid=%s ErrAddr Valid=%s PCC=%s\n",
+		(regs->nbsh & K8_NBSH_MISC_ERR_VALID) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_VALID_ERROR_ADDR) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_PCC) ?
+			"True" : "False");
+	debugf1("  CECC=%s UECC=%s Found by Scruber=%s\n",
+		(regs->nbsh & K8_NBSH_CECC) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_UECC) ?
+			"True" : "False",
+		(regs->nbsh & K8_NBSH_ERR_SCRUBER) ?
+			"True" : "False");
+	debugf1("  CORE0=%s CORE1=%s CORE2=%s CORE3=%s\n",
+		(regs->nbsh & K8_NBSH_CORE0) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE1) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE2) ? "True" : "False",
+		(regs->nbsh & K8_NBSH_CORE3) ? "True" : "False");
+
+
+	err_code = EXTRACT_ERROR_CODE(regs->nbsl);
+
+	/* Determine which error type:
+	 *	1) GART errors - non-fatal, developmental events
+	 *	2) MEMORY errors
+	 *	3) BUS errors
+	 *	4) Unknown error
+	 */
+	if (TEST_TLB_ERROR(err_code)) {
+		/*
+		 * GART errors are intended to help graphics driver developers
+		 * to detect bad GART PTEs. It is recommended by AMD to disable
+		 * GART table walk error reporting by default[1] (currently
+		 * being disabled in mce_cpu_quirks()) and according to the
+		 * comment in mce_cpu_quirks(), such GART errors can be
+		 * incorrectly triggered. We may see these errors anyway and
+		 * unless requested by the user, they won't be reported.
+		 *
+		 * [1] section 13.10.1 on BIOS and Kernel Developers Guide for
+		 *     AMD NPT family 0Fh processors
+		 */
+		if (report_gart_errors == 0)
+			return 1;
+
+		/*
+		 * Only if GART error reporting is requested should we generate
+		 * any logs.
+		 */
+		gart_tlb_error = 1;
+
+		debugf1("GART TLB error\n");
+		amd64_decode_gart_tlb_error(mci, info);
+	} else if (TEST_MEM_ERROR(err_code)) {
+		debugf1("Memory/Cache error\n");
+		amd64_decode_mem_cache_error(mci, info);
+	} else if (TEST_BUS_ERROR(err_code)) {
+		debugf1("Bus (Link/DRAM) error\n");
+		amd64_decode_bus_error(mci, info);
+	} else {
+		/* shouldn't reach here! */
+		amd64_mc_printk(mci, KERN_WARNING,
+			     "%s(): unknown MCE error 0x%x\n", __func__,
+			     err_code);
+	}
+
+	ext_ec = EXTRACT_EXT_ERROR_CODE(regs->nbsl);
+	amd64_mc_printk(mci, KERN_ERR,
+		"ExtErr=(0x%x) %s\n", ext_ec, ext_msgs[ext_ec]);
+
+	if (((ext_ec >= F10_NBSL_EXT_ERR_CRC &&
+			ext_ec <= F10_NBSL_EXT_ERR_TGT) ||
+			(ext_ec == F10_NBSL_EXT_ERR_RMW)) &&
+			EXTRACT_LDT_LINK(info->nbsh)) {
+
+		amd64_mc_printk(mci, KERN_ERR,
+			"Error on hypertransport link: %s\n",
+			htlink_msgs[
+			EXTRACT_LDT_LINK(info->nbsh)]);
+	}
+
+	/*
+	 * Check the UE bit of the NB status high register, if set generate some
+	 * logs. If NOT a GART error, then process the event as a NO-INFO event.
+	 * If it was a GART error, skip that process.
+	 */
+	if (regs->nbsh & K8_NBSH_UNCORRECTED_ERR) {
+		amd64_mc_printk(mci, KERN_CRIT, "uncorrected error\n");
+		if (!gart_tlb_error)
+			edac_mc_handle_ue_no_info(mci, "UE bit is set\n");
+	}
+
+	if (regs->nbsh & K8_NBSH_PCC)
+		amd64_mc_printk(mci, KERN_CRIT,
+			"PCC (processor context corrupt) set\n");
+
+	return 1;
+}
+EXPORT_SYMBOL_GPL(amd64_process_error_info);
+
+
-- 
GitLab


From 0ec449ee95b20245fef4aa9fa2486456f1540514 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 19:41:25 +0200
Subject: [PATCH 1865/2198] amd64_edac: add EDAC core-related initializers

Borislav:

- add a amd64_free_mc_sibling_devices() helper instead of opencoding the
  release-path.
- fix/cleanup comments
- fix function return value patterns
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 315 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 315 insertions(+)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 09991c8a6ee32..5a6e714b115ed 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2455,4 +2455,319 @@ int amd64_process_error_info(struct mem_ctl_info *mci,
 }
 EXPORT_SYMBOL_GPL(amd64_process_error_info);
 
+/*
+ * The main polling 'check' function, called FROM the edac core to perform the
+ * error checking and if an error is encountered, error processing.
+ */
+static void amd64_check(struct mem_ctl_info *mci)
+{
+	struct amd64_error_info_regs info;
+
+	if (amd64_get_error_info(mci, &info))
+		amd64_process_error_info(mci, &info, 1);
+}
+
+/*
+ * Input:
+ *	1) struct amd64_pvt which contains pvt->dram_f2_ctl pointer
+ *	2) AMD Family index value
+ *
+ * Ouput:
+ *	Upon return of 0, the following filled in:
+ *
+ *		struct pvt->addr_f1_ctl
+ *		struct pvt->misc_f3_ctl
+ *
+ *	Filled in with related device funcitions of 'dram_f2_ctl'
+ *	These devices are "reserved" via the pci_get_device()
+ *
+ *	Upon return of 1 (error status):
+ *
+ *		Nothing reserved
+ */
+static int amd64_reserve_mc_sibling_devices(struct amd64_pvt *pvt, int mc_idx)
+{
+	const struct amd64_family_type *amd64_dev = &amd64_family_types[mc_idx];
+
+	/* Reserve the ADDRESS MAP Device */
+	pvt->addr_f1_ctl = pci_get_related_function(pvt->dram_f2_ctl->vendor,
+						    amd64_dev->addr_f1_ctl,
+						    pvt->dram_f2_ctl);
+
+	if (!pvt->addr_f1_ctl) {
+		amd64_printk(KERN_ERR, "error address map device not found: "
+			     "vendor %x device 0x%x (broken BIOS?)\n",
+			     PCI_VENDOR_ID_AMD, amd64_dev->addr_f1_ctl);
+		return 1;
+	}
+
+	/* Reserve the MISC Device */
+	pvt->misc_f3_ctl = pci_get_related_function(pvt->dram_f2_ctl->vendor,
+						    amd64_dev->misc_f3_ctl,
+						    pvt->dram_f2_ctl);
+
+	if (!pvt->misc_f3_ctl) {
+		pci_dev_put(pvt->addr_f1_ctl);
+		pvt->addr_f1_ctl = NULL;
+
+		amd64_printk(KERN_ERR, "error miscellaneous device not found: "
+			     "vendor %x device 0x%x (broken BIOS?)\n",
+			     PCI_VENDOR_ID_AMD, amd64_dev->misc_f3_ctl);
+		return 1;
+	}
+
+	debugf1("    Addr Map device PCI Bus ID:\t%s\n",
+		pci_name(pvt->addr_f1_ctl));
+	debugf1("    DRAM MEM-CTL PCI Bus ID:\t%s\n",
+		pci_name(pvt->dram_f2_ctl));
+	debugf1("    Misc device PCI Bus ID:\t%s\n",
+		pci_name(pvt->misc_f3_ctl));
+
+	return 0;
+}
+
+static void amd64_free_mc_sibling_devices(struct amd64_pvt *pvt)
+{
+	pci_dev_put(pvt->addr_f1_ctl);
+	pci_dev_put(pvt->misc_f3_ctl);
+}
+
+/*
+ * Retrieve the hardware registers of the memory controller (this includes the
+ * 'Address Map' and 'Misc' device regs)
+ */
+static void amd64_read_mc_registers(struct amd64_pvt *pvt)
+{
+	u64 msr_val;
+	int dram, err = 0;
+
+	/*
+	 * Retrieve TOP_MEM and TOP_MEM2; no masking off of reserved bits since
+	 * those are Read-As-Zero
+	 */
+	rdmsrl(MSR_K8_TOP_MEM1, msr_val);
+	pvt->top_mem = msr_val >> 23;
+	debugf0("  TOP_MEM=0x%08llx\n", pvt->top_mem);
+
+	/* check first whether TOP_MEM2 is enabled */
+	rdmsrl(MSR_K8_SYSCFG, msr_val);
+	if (msr_val & (1U << 21)) {
+		rdmsrl(MSR_K8_TOP_MEM2, msr_val);
+		pvt->top_mem2 = msr_val >> 23;
+		debugf0("  TOP_MEM2=0x%08llx\n", pvt->top_mem2);
+	} else
+		debugf0("  TOP_MEM2 disabled.\n");
+
+	amd64_cpu_display_info(pvt);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCAP, &pvt->nbcap);
+	if (err)
+		goto err_reg;
+
+	if (pvt->ops->read_dram_ctl_register)
+		pvt->ops->read_dram_ctl_register(pvt);
+
+	for (dram = 0; dram < DRAM_REG_COUNT; dram++) {
+		/*
+		 * Call CPU specific READ function to get the DRAM Base and
+		 * Limit values from the DCT.
+		 */
+		pvt->ops->read_dram_base_limit(pvt, dram);
+
+		/*
+		 * Only print out debug info on rows with both R and W Enabled.
+		 * Normal processing, compiler should optimize this whole 'if'
+		 * debug output block away.
+		 */
+		if (pvt->dram_rw_en[dram] != 0) {
+			debugf1("  DRAM_BASE[%d]: 0x%8.08x-%8.08x "
+				"DRAM_LIMIT:  0x%8.08x-%8.08x\n",
+				dram,
+				(u32)(pvt->dram_base[dram] >> 32),
+				(u32)(pvt->dram_base[dram] & 0xFFFFFFFF),
+				(u32)(pvt->dram_limit[dram] >> 32),
+				(u32)(pvt->dram_limit[dram] & 0xFFFFFFFF));
+			debugf1("        IntlvEn=%s %s %s "
+				"IntlvSel=%d DstNode=%d\n",
+				pvt->dram_IntlvEn[dram] ?
+					"Enabled" : "Disabled",
+				(pvt->dram_rw_en[dram] & 0x2) ? "W" : "!W",
+				(pvt->dram_rw_en[dram] & 0x1) ? "R" : "!R",
+				pvt->dram_IntlvSel[dram],
+				pvt->dram_DstNode[dram]);
+		}
+	}
+
+	amd64_read_dct_base_mask(pvt);
+
+	err = pci_read_config_dword(pvt->addr_f1_ctl, K8_DHAR, &pvt->dhar);
+	if (err)
+		goto err_reg;
+
+	amd64_read_dbam_reg(pvt);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl,
+				F10_ONLINE_SPARE, &pvt->online_spare);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_0, &pvt->dclr0);
+	if (err)
+		goto err_reg;
+
+	err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCHR_0, &pvt->dchr0);
+	if (err)
+		goto err_reg;
+
+	if (!dct_ganging_enabled(pvt)) {
+		err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCLR_1,
+						&pvt->dclr1);
+		if (err)
+			goto err_reg;
+
+		err = pci_read_config_dword(pvt->dram_f2_ctl, F10_DCHR_1,
+						&pvt->dchr1);
+		if (err)
+			goto err_reg;
+	}
+
+	amd64_dump_misc_regs(pvt);
+
+err_reg:
+	debugf0("Reading an MC register failed\n");
+
+}
+
+/*
+ * NOTE: CPU Revision Dependent code
+ *
+ * Input:
+ *	@csrow_nr ChipSelect Row Number (0..CHIPSELECT_COUNT-1)
+ *	k8 private pointer to -->
+ *			DRAM Bank Address mapping register
+ *			node_id
+ *			DCL register where dual_channel_active is
+ *
+ * The DBAM register consists of 4 sets of 4 bits each definitions:
+ *
+ * Bits:	CSROWs
+ * 0-3		CSROWs 0 and 1
+ * 4-7		CSROWs 2 and 3
+ * 8-11		CSROWs 4 and 5
+ * 12-15	CSROWs 6 and 7
+ *
+ * Values range from: 0 to 15
+ * The meaning of the values depends on CPU revision and dual-channel state,
+ * see relevant BKDG more info.
+ *
+ * The memory controller provides for total of only 8 CSROWs in its current
+ * architecture. Each "pair" of CSROWs normally represents just one DIMM in
+ * single channel or two (2) DIMMs in dual channel mode.
+ *
+ * The following code logic collapses the various tables for CSROW based on CPU
+ * revision.
+ *
+ * Returns:
+ *	The number of PAGE_SIZE pages on the specified CSROW number it
+ *	encompasses
+ *
+ */
+static u32 amd64_csrow_nr_pages(int csrow_nr, struct amd64_pvt *pvt)
+{
+	u32 dram_map, nr_pages;
+
+	/*
+	 * The math on this doesn't look right on the surface because x/2*4 can
+	 * be simplified to x*2 but this expression makes use of the fact that
+	 * it is integral math where 1/2=0. This intermediate value becomes the
+	 * number of bits to shift the DBAM register to extract the proper CSROW
+	 * field.
+	 */
+	dram_map = (pvt->dbam0 >> ((csrow_nr / 2) * 4)) & 0xF;
+
+	nr_pages = pvt->ops->dbam_map_to_pages(pvt, dram_map);
+
+	/*
+	 * If dual channel then double the memory size of single channel.
+	 * Channel count is 1 or 2
+	 */
+	nr_pages <<= (pvt->channel_count - 1);
+
+	debugf0("  (csrow=%d) DBAM map index= %d\n", csrow_nr, dram_map);
+	debugf0("    nr_pages= %u  channel-count = %d\n",
+		nr_pages, pvt->channel_count);
+
+	return nr_pages;
+}
+
+/*
+ * Initialize the array of csrow attribute instances, based on the values
+ * from pci config hardware registers.
+ */
+static int amd64_init_csrows(struct mem_ctl_info *mci)
+{
+	struct csrow_info *csrow;
+	struct amd64_pvt *pvt;
+	u64 input_addr_min, input_addr_max, sys_addr;
+	int i, err = 0, empty = 1;
+
+	pvt = mci->pvt_info;
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &pvt->nbcfg);
+	if (err)
+		debugf0("Reading K8_NBCFG failed\n");
+
+	debugf0("NBCFG= 0x%x  CHIPKILL= %s DRAM ECC= %s\n", pvt->nbcfg,
+		(pvt->nbcfg & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(pvt->nbcfg & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled"
+		);
+
+	for (i = 0; i < CHIPSELECT_COUNT; i++) {
+		csrow = &mci->csrows[i];
+
+		if ((pvt->dcsb0[i] & K8_DCSB_CS_ENABLE) == 0) {
+			debugf1("----CSROW %d EMPTY for node %d\n", i,
+				pvt->mc_node_id);
+			continue;
+		}
+
+		debugf1("----CSROW %d VALID for MC node %d\n",
+			i, pvt->mc_node_id);
+
+		empty = 0;
+		csrow->nr_pages = amd64_csrow_nr_pages(i, pvt);
+		find_csrow_limits(mci, i, &input_addr_min, &input_addr_max);
+		sys_addr = input_addr_to_sys_addr(mci, input_addr_min);
+		csrow->first_page = (u32) (sys_addr >> PAGE_SHIFT);
+		sys_addr = input_addr_to_sys_addr(mci, input_addr_max);
+		csrow->last_page = (u32) (sys_addr >> PAGE_SHIFT);
+		csrow->page_mask = ~mask_from_dct_mask(pvt, i);
+		/* 8 bytes of resolution */
+
+		csrow->mtype = amd64_determine_memory_type(pvt);
+
+		debugf1("  for MC node %d csrow %d:\n", pvt->mc_node_id, i);
+		debugf1("    input_addr_min: 0x%lx input_addr_max: 0x%lx\n",
+			(unsigned long)input_addr_min,
+			(unsigned long)input_addr_max);
+		debugf1("    sys_addr: 0x%lx  page_mask: 0x%lx\n",
+			(unsigned long)sys_addr, csrow->page_mask);
+		debugf1("    nr_pages: %u  first_page: 0x%lx "
+			"last_page: 0x%lx\n",
+			(unsigned)csrow->nr_pages,
+			csrow->first_page, csrow->last_page);
+
+		/*
+		 * determine whether CHIPKILL or JUST ECC or NO ECC is operating
+		 */
+		if (pvt->nbcfg & K8_NBCFG_ECC_ENABLE)
+			csrow->edac_mode =
+			    (pvt->nbcfg & K8_NBCFG_CHIPKILL) ?
+			    EDAC_S4ECD4ED : EDAC_SECDED;
+		else
+			csrow->edac_mode = EDAC_NONE;
+	}
+
+	return empty;
+}
 
-- 
GitLab


From f9431992b6227069bc54800d55531c6f78d276a7 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 19:46:08 +0200
Subject: [PATCH 1866/2198] amd64_edac: add ECC reporting initializers

Borislav:
- convert to the new {rd|wr}msr_on_cpus interfaces.
- convert pvt->old_mcgctl to a bitmask thus saving some bytes
- fix/cleanup comments
- fix function return value patterns
- add a proper bugfix found by Doug to amd64_check_ecc_enabled where we
  missed checking for the ECC enabled bit in NB CFG.
- cleanup debug calls

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/amd64_edac.c | 207 ++++++++++++++++++++++++++++++++++++++
 drivers/edac/amd64_edac.h |   3 +-
 2 files changed, 209 insertions(+), 1 deletion(-)

diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 5a6e714b115ed..3b6c421531cf2 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -2771,3 +2771,210 @@ static int amd64_init_csrows(struct mem_ctl_info *mci)
 	return empty;
 }
 
+/*
+ * Only if 'ecc_enable_override' is set AND BIOS had ECC disabled, do "we"
+ * enable it.
+ */
+static void amd64_enable_ecc_error_reporting(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+	const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
+	int cpu, idx = 0, err = 0;
+	struct msr msrs[cpumask_weight(cpumask)];
+	u32 value;
+	u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
+
+	if (!ecc_enable_override)
+		return;
+
+	memset(msrs, 0, sizeof(msrs));
+
+	amd64_printk(KERN_WARNING,
+		"'ecc_enable_override' parameter is active, "
+		"Enabling AMD ECC hardware now: CAUTION\n");
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+
+	/* turn on UECCn and CECCEn bits */
+	pvt->old_nbctl = value & mask;
+	pvt->nbctl_mcgctl_saved = 1;
+
+	value |= mask;
+	pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCTL, value);
+
+	rdmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	for_each_cpu(cpu, cpumask) {
+		if (msrs[idx].l & K8_MSR_MCGCTL_NBE)
+			set_bit(idx, &pvt->old_mcgctl);
+
+		msrs[idx].l |= K8_MSR_MCGCTL_NBE;
+		idx++;
+	}
+	wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+	if (err)
+		debugf0("Reading K8_NBCFG failed\n");
+
+	debugf0("NBCFG(1)= 0x%x  CHIPKILL= %s ECC_ENABLE= %s\n", value,
+		(value & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
+
+	if (!(value & K8_NBCFG_ECC_ENABLE)) {
+		amd64_printk(KERN_WARNING,
+			"This node reports that DRAM ECC is "
+			"currently Disabled; ENABLING now\n");
+
+		/* Attempt to turn on DRAM ECC Enable */
+		value |= K8_NBCFG_ECC_ENABLE;
+		pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCFG, value);
+
+		err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+		if (err)
+			debugf0("Reading K8_NBCFG failed\n");
+
+		if (!(value & K8_NBCFG_ECC_ENABLE)) {
+			amd64_printk(KERN_WARNING,
+				"Hardware rejects Enabling DRAM ECC checking\n"
+				"Check memory DIMM configuration\n");
+		} else {
+			amd64_printk(KERN_DEBUG,
+				"Hardware accepted DRAM ECC Enable\n");
+		}
+	}
+	debugf0("NBCFG(2)= 0x%x  CHIPKILL= %s ECC_ENABLE= %s\n", value,
+		(value & K8_NBCFG_CHIPKILL) ? "Enabled" : "Disabled",
+		(value & K8_NBCFG_ECC_ENABLE) ? "Enabled" : "Disabled");
+
+	pvt->ctl_error_info.nbcfg = value;
+}
+
+static void amd64_restore_ecc_error_reporting(struct amd64_pvt *pvt)
+{
+	const cpumask_t *cpumask = cpumask_of_node(pvt->mc_node_id);
+	int cpu, idx = 0, err = 0;
+	struct msr msrs[cpumask_weight(cpumask)];
+	u32 value;
+	u32 mask = K8_NBCTL_CECCEn | K8_NBCTL_UECCEn;
+
+	if (!pvt->nbctl_mcgctl_saved)
+		return;
+
+	memset(msrs, 0, sizeof(msrs));
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCTL, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+	value &= ~mask;
+	value |= pvt->old_nbctl;
+
+	/* restore the NB Enable MCGCTL bit */
+	pci_write_config_dword(pvt->misc_f3_ctl, K8_NBCTL, value);
+
+	rdmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+
+	for_each_cpu(cpu, cpumask) {
+		msrs[idx].l &= ~K8_MSR_MCGCTL_NBE;
+		msrs[idx].l |=
+			test_bit(idx, &pvt->old_mcgctl) << K8_MSR_MCGCTL_NBE;
+		idx++;
+	}
+
+	wrmsr_on_cpus(cpumask, K8_MSR_MCGCTL, msrs);
+}
+
+static void check_mcg_ctl(void *ret)
+{
+	u64 msr_val = 0;
+	u8 nbe;
+
+	rdmsrl(MSR_IA32_MCG_CTL, msr_val);
+	nbe = msr_val & K8_MSR_MCGCTL_NBE;
+
+	debugf0("core: %u, MCG_CTL: 0x%llx, NB MSR is %s\n",
+		raw_smp_processor_id(), msr_val,
+		(nbe ? "enabled" : "disabled"));
+
+	if (!nbe)
+		*(int *)ret = 0;
+}
+
+/* check MCG_CTL on all the cpus on this node */
+static int amd64_mcg_ctl_enabled_on_cpus(const cpumask_t *mask)
+{
+	int ret = 1;
+	preempt_disable();
+	smp_call_function_many(mask, check_mcg_ctl, &ret, 1);
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * EDAC requires that the BIOS have ECC enabled before taking over the
+ * processing of ECC errors. This is because the BIOS can properly initialize
+ * the memory system completely. A command line option allows to force-enable
+ * hardware ECC later in amd64_enable_ecc_error_reporting().
+ */
+static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
+{
+	u32 value;
+	int err = 0, ret = 0;
+	u8 ecc_enabled = 0;
+
+	err = pci_read_config_dword(pvt->misc_f3_ctl, K8_NBCFG, &value);
+	if (err)
+		debugf0("Reading K8_NBCTL failed\n");
+
+	ecc_enabled = !!(value & K8_NBCFG_ECC_ENABLE);
+
+	ret = amd64_mcg_ctl_enabled_on_cpus(cpumask_of_node(pvt->mc_node_id));
+
+	debugf0("K8_NBCFG=0x%x,  DRAM ECC is %s\n", value,
+			(value & K8_NBCFG_ECC_ENABLE ? "enabled" : "disabled"));
+
+	if (!ecc_enabled || !ret) {
+		if (!ecc_enabled) {
+			amd64_printk(KERN_WARNING, "This node reports that "
+						   "Memory ECC is currently "
+						   "disabled.\n");
+
+			amd64_printk(KERN_WARNING, "bit 0x%lx in register "
+				"F3x%x of the MISC_CONTROL device (%s) "
+				"should be enabled\n", K8_NBCFG_ECC_ENABLE,
+				K8_NBCFG, pci_name(pvt->misc_f3_ctl));
+		}
+		if (!ret) {
+			amd64_printk(KERN_WARNING, "bit 0x%016lx in MSR 0x%08x "
+					"of node %d should be enabled\n",
+					K8_MSR_MCGCTL_NBE, MSR_IA32_MCG_CTL,
+					pvt->mc_node_id);
+		}
+		if (!ecc_enable_override) {
+			amd64_printk(KERN_WARNING, "WARNING: ECC is NOT "
+				"currently enabled by the BIOS. Module "
+				"will NOT be loaded.\n"
+				"    Either Enable ECC in the BIOS, "
+				"or use the 'ecc_enable_override' "
+				"parameter.\n"
+				"    Might be a BIOS bug, if BIOS says "
+				"ECC is enabled\n"
+				"    Use of the override can cause "
+				"unknown side effects.\n");
+			ret = -ENODEV;
+		}
+	} else {
+		amd64_printk(KERN_INFO,
+			"ECC is enabled by BIOS, Proceeding "
+			"with EDAC module initialization\n");
+
+		/* CLEAR the override, since BIOS controlled it */
+		ecc_enable_override = 0;
+	}
+
+	return ret;
+}
+
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index 6f5d5d62cefc8..e7aa760614ce3 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -70,6 +70,7 @@
 #include <linux/slab.h>
 #include <linux/mmzone.h>
 #include <linux/edac.h>
+#include <asm/msr.h>
 #include "edac_core.h"
 
 #define amd64_printk(level, fmt, arg...) \
@@ -549,7 +550,7 @@ struct amd64_pvt {
 	/* Save old hw registers' values before we modified them */
 	u32 nbctl_mcgctl_saved;		/* When true, following 2 are valid */
 	u32 old_nbctl;
-	u32 *old_mcgctl;		/* per core on this node */
+	unsigned long old_mcgctl;	/* per core on this node */
 
 	/* MC Type Index value: socket F vs Family 10h */
 	u32 mc_type_index;
-- 
GitLab


From 7d6034d3213e2dd1c0f8678e11064007413011c4 Mon Sep 17 00:00:00 2001
From: Doug Thompson <dougthompson@xmission.com>
Date: Mon, 27 Apr 2009 20:01:01 +0200
Subject: [PATCH 1867/2198] amd64_edac: add module registration routines

Also, link into Kbuild by adding Kconfig and Makefile entries.

Borislav:
- Kconfig/Makefile splitting
- use zero-sized arrays for the sysfs attrs if not enabled
- rename sysfs attrs to more conform values
- shorten CONFIG_ names
- make multiple structure members assignment vertically aligned
- fix/cleanup comments
- fix function return value patterns
- fix err labels
- fix a memleak bug caught by Ingo
- remove the NUMA dependency and use num_k8_northbrides for initializing
  a driver instance per NB.
- do not copy the pvt contents into the mci struct in
  amd64_init_2nd_stage() and save it in the mci->pvt_info void ptr
  instead.
- cleanup debug calls
- simplify amd64_setup_pci_device()

Reviewed-by: Mauro Carvalho Chehab <mchehab@redhat.com>
Signed-off-by: Doug Thompson <dougthompson@xmission.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/Kconfig      |  26 +++
 drivers/edac/Makefile     |   7 +
 drivers/edac/amd64_edac.c | 374 ++++++++++++++++++++++++++++++++++++++
 drivers/edac/amd64_edac.h |  15 ++
 4 files changed, 422 insertions(+)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 956982f8739b3..3dc650a7422c6 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -58,6 +58,32 @@ config EDAC_MM_EDAC
 	  occurred so that a particular failing memory module can be
 	  replaced.  If unsure, select 'Y'.
 
+config EDAC_AMD64
+	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
+	depends on EDAC_MM_EDAC && X86 && PCI
+	default m
+	help
+	Support for error detection and correction on the AMD 64
+	Families of Memory Controllers (K8, F10h and F11h)
+
+config EDAC_AMD64_ERROR_INJECTION
+	bool "Sysfs Error Injection facilities"
+	depends on EDAC_AMD64
+	help
+	  Recent Opterons (Family 10h and later) provide for Memory Error
+	  Injection into the ECC detection circuits. The amd64_edac module
+	  allows the operator/user to inject Uncorrectable and Correctable
+	  errors into DRAM.
+
+	  When enabled, in each of the respective memory controller directories
+	  (/sys/devices/system/edac/mc/mcX), there are 3 input files:
+
+	  - inject_section (0..3, 16-byte section of 64-byte cacheline),
+	  - inject_word (0..8, 16-bit word of 16-byte section),
+	  - inject_ecc_vector (hex ecc vector: select bits of inject word)
+
+	  In addition, there are two control files, inject_read and inject_write,
+	  which trigger the DRAM ECC Read and Write respectively.
 
 config EDAC_AMD76X
 	tristate "AMD 76x (760, 762, 768)"
diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile
index 59076819135d2..633dc5604ee32 100644
--- a/drivers/edac/Makefile
+++ b/drivers/edac/Makefile
@@ -30,6 +30,13 @@ obj-$(CONFIG_EDAC_I3000)		+= i3000_edac.o
 obj-$(CONFIG_EDAC_X38)			+= x38_edac.o
 obj-$(CONFIG_EDAC_I82860)		+= i82860_edac.o
 obj-$(CONFIG_EDAC_R82600)		+= r82600_edac.o
+
+amd64_edac_mod-y :=  amd64_edac_err_types.o amd64_edac.o
+amd64_edac_mod-$(CONFIG_EDAC_DEBUG) += amd64_edac_dbg.o
+amd64_edac_mod-$(CONFIG_EDAC_AMD64_ERROR_INJECTION) += amd64_edac_inj.o
+
+obj-$(CONFIG_EDAC_AMD64)		+= amd64_edac_mod.o
+
 obj-$(CONFIG_EDAC_PASEMI)		+= pasemi_edac.o
 obj-$(CONFIG_EDAC_MPC85XX)		+= mpc85xx_edac.o
 obj-$(CONFIG_EDAC_MV64X60)		+= mv64x60_edac.o
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 3b6c421531cf2..c36bf40568cf8 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -1,4 +1,5 @@
 #include "amd64_edac.h"
+#include <asm/k8.h>
 
 static struct edac_pci_ctl_info *amd64_ctl_pci;
 
@@ -2978,3 +2979,376 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
 	return ret;
 }
 
+struct mcidev_sysfs_attribute sysfs_attrs[ARRAY_SIZE(amd64_dbg_attrs) +
+					  ARRAY_SIZE(amd64_inj_attrs) +
+					  1];
+
+struct mcidev_sysfs_attribute terminator = { .attr = { .name = NULL } };
+
+static void amd64_set_mc_sysfs_attributes(struct mem_ctl_info *mci)
+{
+	unsigned int i = 0, j = 0;
+
+	for (; i < ARRAY_SIZE(amd64_dbg_attrs); i++)
+		sysfs_attrs[i] = amd64_dbg_attrs[i];
+
+	for (j = 0; j < ARRAY_SIZE(amd64_inj_attrs); j++, i++)
+		sysfs_attrs[i] = amd64_inj_attrs[j];
+
+	sysfs_attrs[i] = terminator;
+
+	mci->mc_driver_sysfs_attributes = sysfs_attrs;
+}
+
+static void amd64_setup_mci_misc_attributes(struct mem_ctl_info *mci)
+{
+	struct amd64_pvt *pvt = mci->pvt_info;
+
+	mci->mtype_cap		= MEM_FLAG_DDR2 | MEM_FLAG_RDDR2;
+	mci->edac_ctl_cap	= EDAC_FLAG_NONE;
+	mci->edac_cap		= EDAC_FLAG_NONE;
+
+	if (pvt->nbcap & K8_NBCAP_SECDED)
+		mci->edac_ctl_cap |= EDAC_FLAG_SECDED;
+
+	if (pvt->nbcap & K8_NBCAP_CHIPKILL)
+		mci->edac_ctl_cap |= EDAC_FLAG_S4ECD4ED;
+
+	mci->edac_cap		= amd64_determine_edac_cap(pvt);
+	mci->mod_name		= EDAC_MOD_STR;
+	mci->mod_ver		= EDAC_AMD64_VERSION;
+	mci->ctl_name		= get_amd_family_name(pvt->mc_type_index);
+	mci->dev_name		= pci_name(pvt->dram_f2_ctl);
+	mci->ctl_page_to_phys	= NULL;
+
+	/* IMPORTANT: Set the polling 'check' function in this module */
+	mci->edac_check		= amd64_check;
+
+	/* memory scrubber interface */
+	mci->set_sdram_scrub_rate = amd64_set_scrub_rate;
+	mci->get_sdram_scrub_rate = amd64_get_scrub_rate;
+}
+
+/*
+ * Init stuff for this DRAM Controller device.
+ *
+ * Due to a hardware feature on Fam10h CPUs, the Enable Extended Configuration
+ * Space feature MUST be enabled on ALL Processors prior to actually reading
+ * from the ECS registers. Since the loading of the module can occur on any
+ * 'core', and cores don't 'see' all the other processors ECS data when the
+ * others are NOT enabled. Our solution is to first enable ECS access in this
+ * routine on all processors, gather some data in a amd64_pvt structure and
+ * later come back in a finish-setup function to perform that final
+ * initialization. See also amd64_init_2nd_stage() for that.
+ */
+static int amd64_probe_one_instance(struct pci_dev *dram_f2_ctl,
+				    int mc_type_index)
+{
+	struct amd64_pvt *pvt = NULL;
+	int err = 0, ret;
+
+	ret = -ENOMEM;
+	pvt = kzalloc(sizeof(struct amd64_pvt), GFP_KERNEL);
+	if (!pvt)
+		goto err_exit;
+
+	pvt->mc_node_id = get_mc_node_id_from_pdev(dram_f2_ctl);
+
+	pvt->dram_f2_ctl	= dram_f2_ctl;
+	pvt->ext_model		= boot_cpu_data.x86_model >> 4;
+	pvt->mc_type_index	= mc_type_index;
+	pvt->ops		= family_ops(mc_type_index);
+	pvt->old_mcgctl		= 0;
+
+	/*
+	 * We have the dram_f2_ctl device as an argument, now go reserve its
+	 * sibling devices from the PCI system.
+	 */
+	ret = -ENODEV;
+	err = amd64_reserve_mc_sibling_devices(pvt, mc_type_index);
+	if (err)
+		goto err_free;
+
+	ret = -EINVAL;
+	err = amd64_check_ecc_enabled(pvt);
+	if (err)
+		goto err_put;
+
+	/*
+	 * Key operation here: setup of HW prior to performing ops on it. Some
+	 * setup is required to access ECS data. After this is performed, the
+	 * 'teardown' function must be called upon error and normal exit paths.
+	 */
+	if (boot_cpu_data.x86 >= 0x10)
+		amd64_setup(pvt);
+
+	/*
+	 * Save the pointer to the private data for use in 2nd initialization
+	 * stage
+	 */
+	pvt_lookup[pvt->mc_node_id] = pvt;
+
+	return 0;
+
+err_put:
+	amd64_free_mc_sibling_devices(pvt);
+
+err_free:
+	kfree(pvt);
+
+err_exit:
+	return ret;
+}
+
+/*
+ * This is the finishing stage of the init code. Needs to be performed after all
+ * MCs' hardware have been prepped for accessing extended config space.
+ */
+static int amd64_init_2nd_stage(struct amd64_pvt *pvt)
+{
+	int node_id = pvt->mc_node_id;
+	struct mem_ctl_info *mci;
+	int ret, err = 0;
+
+	amd64_read_mc_registers(pvt);
+
+	ret = -ENODEV;
+	if (pvt->ops->probe_valid_hardware) {
+		err = pvt->ops->probe_valid_hardware(pvt);
+		if (err)
+			goto err_exit;
+	}
+
+	/*
+	 * We need to determine how many memory channels there are. Then use
+	 * that information for calculating the size of the dynamic instance
+	 * tables in the 'mci' structure
+	 */
+	pvt->channel_count = pvt->ops->early_channel_count(pvt);
+	if (pvt->channel_count < 0)
+		goto err_exit;
+
+	ret = -ENOMEM;
+	mci = edac_mc_alloc(0, CHIPSELECT_COUNT, pvt->channel_count, node_id);
+	if (!mci)
+		goto err_exit;
+
+	mci->pvt_info = pvt;
+
+	mci->dev = &pvt->dram_f2_ctl->dev;
+	amd64_setup_mci_misc_attributes(mci);
+
+	if (amd64_init_csrows(mci))
+		mci->edac_cap = EDAC_FLAG_NONE;
+
+	amd64_enable_ecc_error_reporting(mci);
+	amd64_set_mc_sysfs_attributes(mci);
+
+	ret = -ENODEV;
+	if (edac_mc_add_mc(mci)) {
+		debugf1("failed edac_mc_add_mc()\n");
+		goto err_add_mc;
+	}
+
+	mci_lookup[node_id] = mci;
+	pvt_lookup[node_id] = NULL;
+	return 0;
+
+err_add_mc:
+	edac_mc_free(mci);
+
+err_exit:
+	debugf0("failure to init 2nd stage: ret=%d\n", ret);
+
+	amd64_restore_ecc_error_reporting(pvt);
+
+	if (boot_cpu_data.x86 > 0xf)
+		amd64_teardown(pvt);
+
+	amd64_free_mc_sibling_devices(pvt);
+
+	kfree(pvt_lookup[pvt->mc_node_id]);
+	pvt_lookup[node_id] = NULL;
+
+	return ret;
+}
+
+
+static int __devinit amd64_init_one_instance(struct pci_dev *pdev,
+				 const struct pci_device_id *mc_type)
+{
+	int ret = 0;
+
+	debugf0("(MC node=%d,mc_type='%s')\n",
+		get_mc_node_id_from_pdev(pdev),
+		get_amd_family_name(mc_type->driver_data));
+
+	ret = pci_enable_device(pdev);
+	if (ret < 0)
+		ret = -EIO;
+	else
+		ret = amd64_probe_one_instance(pdev, mc_type->driver_data);
+
+	if (ret < 0)
+		debugf0("ret=%d\n", ret);
+
+	return ret;
+}
+
+static void __devexit amd64_remove_one_instance(struct pci_dev *pdev)
+{
+	struct mem_ctl_info *mci;
+	struct amd64_pvt *pvt;
+
+	/* Remove from EDAC CORE tracking list */
+	mci = edac_mc_del_mc(&pdev->dev);
+	if (!mci)
+		return;
+
+	pvt = mci->pvt_info;
+
+	amd64_restore_ecc_error_reporting(pvt);
+
+	if (boot_cpu_data.x86 > 0xf)
+		amd64_teardown(pvt);
+
+	amd64_free_mc_sibling_devices(pvt);
+
+	kfree(pvt);
+	mci->pvt_info = NULL;
+
+	mci_lookup[pvt->mc_node_id] = NULL;
+
+	/* Free the EDAC CORE resources */
+	edac_mc_free(mci);
+}
+
+/*
+ * This table is part of the interface for loading drivers for PCI devices. The
+ * PCI core identifies what devices are on a system during boot, and then
+ * inquiry this table to see if this driver is for a given device found.
+ */
+static const struct pci_device_id amd64_pci_table[] __devinitdata = {
+	{
+		.vendor		= PCI_VENDOR_ID_AMD,
+		.device		= PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.class		= 0,
+		.class_mask	= 0,
+		.driver_data	= K8_CPUS
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_AMD,
+		.device		= PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.class		= 0,
+		.class_mask	= 0,
+		.driver_data	= F10_CPUS
+	},
+	{
+		.vendor		= PCI_VENDOR_ID_AMD,
+		.device		= PCI_DEVICE_ID_AMD_11H_NB_DRAM,
+		.subvendor	= PCI_ANY_ID,
+		.subdevice	= PCI_ANY_ID,
+		.class		= 0,
+		.class_mask	= 0,
+		.driver_data	= F11_CPUS
+	},
+	{0, }
+};
+MODULE_DEVICE_TABLE(pci, amd64_pci_table);
+
+static struct pci_driver amd64_pci_driver = {
+	.name		= EDAC_MOD_STR,
+	.probe		= amd64_init_one_instance,
+	.remove		= __devexit_p(amd64_remove_one_instance),
+	.id_table	= amd64_pci_table,
+};
+
+static void amd64_setup_pci_device(void)
+{
+	struct mem_ctl_info *mci;
+	struct amd64_pvt *pvt;
+
+	if (amd64_ctl_pci)
+		return;
+
+	mci = mci_lookup[0];
+	if (mci) {
+
+		pvt = mci->pvt_info;
+		amd64_ctl_pci =
+			edac_pci_create_generic_ctl(&pvt->dram_f2_ctl->dev,
+						    EDAC_MOD_STR);
+
+		if (!amd64_ctl_pci) {
+			pr_warning("%s(): Unable to create PCI control\n",
+				   __func__);
+
+			pr_warning("%s(): PCI error report via EDAC not set\n",
+				   __func__);
+			}
+	}
+}
+
+static int __init amd64_edac_init(void)
+{
+	int nb, err = -ENODEV;
+
+	edac_printk(KERN_INFO, EDAC_MOD_STR, EDAC_AMD64_VERSION "\n");
+
+	opstate_init();
+
+	if (cache_k8_northbridges() < 0)
+		goto err_exit;
+
+	err = pci_register_driver(&amd64_pci_driver);
+	if (err)
+		return err;
+
+	/*
+	 * At this point, the array 'pvt_lookup[]' contains pointers to alloc'd
+	 * amd64_pvt structs. These will be used in the 2nd stage init function
+	 * to finish initialization of the MC instances.
+	 */
+	for (nb = 0; nb < num_k8_northbridges; nb++) {
+		if (!pvt_lookup[nb])
+			continue;
+
+		err = amd64_init_2nd_stage(pvt_lookup[nb]);
+		if (err)
+			goto err_exit;
+	}
+
+	amd64_setup_pci_device();
+
+	return 0;
+
+err_exit:
+	debugf0("'finish_setup' stage failed\n");
+	pci_unregister_driver(&amd64_pci_driver);
+
+	return err;
+}
+
+static void __exit amd64_edac_exit(void)
+{
+	if (amd64_ctl_pci)
+		edac_pci_release_generic_ctl(amd64_ctl_pci);
+
+	pci_unregister_driver(&amd64_pci_driver);
+}
+
+module_init(amd64_edac_init);
+module_exit(amd64_edac_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("SoftwareBitMaker: Doug Thompson, "
+		"Dave Peterson, Thayne Harbaugh");
+MODULE_DESCRIPTION("MC support for AMD64 memory controllers - "
+		EDAC_AMD64_VERSION);
+
+module_param(edac_op_state, int, 0444);
+MODULE_PARM_DESC(edac_op_state, "EDAC Error Reporting state: 0=Poll,1=NMI");
diff --git a/drivers/edac/amd64_edac.h b/drivers/edac/amd64_edac.h
index e7aa760614ce3..a159957e167b1 100644
--- a/drivers/edac/amd64_edac.h
+++ b/drivers/edac/amd64_edac.h
@@ -577,6 +577,21 @@ extern const char *ii_msgs[4];
 extern const char *ext_msgs[32];
 extern const char *htlink_msgs[8];
 
+#ifdef CONFIG_EDAC_DEBUG
+#define NUM_DBG_ATTRS 9
+#else
+#define NUM_DBG_ATTRS 0
+#endif
+
+#ifdef CONFIG_EDAC_AMD64_ERROR_INJECTION
+#define NUM_INJ_ATTRS 5
+#else
+#define NUM_INJ_ATTRS 0
+#endif
+
+extern struct mcidev_sysfs_attribute amd64_dbg_attrs[NUM_DBG_ATTRS],
+				     amd64_inj_attrs[NUM_INJ_ATTRS];
+
 /*
  * Each of the PCI Device IDs types have their own set of hardware accessor
  * functions and per device encoding/decoding logic.
-- 
GitLab


From 3d373290450b10933dad7c387c42179f54417009 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 May 2009 20:18:46 +0200
Subject: [PATCH 1868/2198] amd64_edac: do not enable module by default

While at it, fix a link failure when !K8_NB.

Acked-by: Doug Thompson <dougthompson@xmission.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/Kconfig | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 3dc650a7422c6..eff363d727e3b 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -60,11 +60,10 @@ config EDAC_MM_EDAC
 
 config EDAC_AMD64
 	tristate "AMD64 (Opteron, Athlon64) K8, F10h, F11h"
-	depends on EDAC_MM_EDAC && X86 && PCI
-	default m
+	depends on EDAC_MM_EDAC && K8_NB && X86_64 && PCI
 	help
-	Support for error detection and correction on the AMD 64
-	Families of Memory Controllers (K8, F10h and F11h)
+	  Support for error detection and correction on the AMD 64
+	  Families of Memory Controllers (K8, F10h and F11h)
 
 config EDAC_AMD64_ERROR_INJECTION
 	bool "Sysfs Error Injection facilities"
-- 
GitLab


From 9456ffffcf3c2f8fafeb9dee45f1502209602253 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 May 2009 20:26:53 +0200
Subject: [PATCH 1869/2198] EDAC: do not enable modules by default

Prevent EDAC compilation units from being built by default and let the
user explicitly select the needed modules.

Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Tested-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 drivers/edac/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index eff363d727e3b..ab4f3592a11c5 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -49,7 +49,6 @@ config EDAC_DEBUG_VERBOSE
 
 config EDAC_MM_EDAC
 	tristate "Main Memory EDAC (Error Detection And Correction) reporting"
-	default y
 	help
 	  Some systems are able to detect and correct errors in main
 	  memory.  EDAC can report statistics on memory error
-- 
GitLab


From c476c23b45a41eb4e3ea63af786cc4d74762fe11 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <borislav.petkov@amd.com>
Date: Wed, 20 May 2009 20:31:58 +0200
Subject: [PATCH 1870/2198] amd64_edac: add MAINTAINERS entry

Acked-by: Doug Thompson <dougthompson@xmission.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Borislav Petkov <borislav.petkov@amd.com>
---
 MAINTAINERS | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cf4abddfc8a40..6362ee809c3a5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1978,6 +1978,16 @@ F:	Documentation/edac.txt
 F:	drivers/edac/edac_*
 F:	include/linux/edac.h
 
+EDAC-AMD64
+P:	Doug Thompson
+M:	dougthompson@xmission.com
+P:	Borislav Petkov
+M:	borislav.petkov@amd.com
+L:	bluesmoke-devel@lists.sourceforge.net (moderated for non-subscribers)
+W:	bluesmoke.sourceforge.net
+S:	Supported
+F:	drivers/edac/amd64_edac*
+
 EDAC-E752X
 P:	Mark Gross
 M:	mark.gross@intel.com
-- 
GitLab


From dc81081b2d9a6a9d64dad1bef1e5fc9fb660e53e Mon Sep 17 00:00:00 2001
From: Yong Wang <yong.y.wang@linux.intel.com>
Date: Wed, 10 Jun 2009 17:06:12 +0800
Subject: [PATCH 1871/2198] perf_counter/x86: Fix the model number of Intel
 Core2 processors

Fix the model number of Intel Core2 processors according to the
documentation: Intel Processor Identification with the CPUID
Instruction: http://www.intel.com/support/processors/sb/cs-009861.htm

Signed-off-by: Yong Wang <yong.y.wang@intel.com>
Also-Reported-by: Arnd Bergmann <arnd@arndb.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <20090610090612.GA26580@ywang-moblin2.bj.intel.com>
[ Added two more model numbers suggested by Arnd Bergmann ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 40978aac6e0f6..49f258537cbfc 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1407,7 +1407,10 @@ static int intel_pmu_init(void)
 	 * Install the hw-cache-events table:
 	 */
 	switch (boot_cpu_data.x86_model) {
-	case 17:
+	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
+	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
+	case 29: /* six-core 45 nm xeon "Dunnington" */
 		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 
-- 
GitLab


From 4da646b7b52552f3b43eae27ffa5aa2c200f6db6 Mon Sep 17 00:00:00 2001
From: Jeff Garzik <jeff@garzik.org>
Date: Wed, 8 Apr 2009 02:00:13 -0400
Subject: [PATCH 1872/2198] [libata] ahci: use less error-prone array
 initializers

Also, remove unneeded prototype.

Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ahci.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 6b91c26a46356..cd832cb69492e 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -313,7 +313,6 @@ static void ahci_error_handler(struct ata_port *ap);
 static void ahci_post_internal_cmd(struct ata_queued_cmd *qc);
 static int ahci_port_resume(struct ata_port *ap);
 static void ahci_dev_config(struct ata_device *dev);
-static unsigned int ahci_fill_sg(struct ata_queued_cmd *qc, void *cmd_tbl);
 static void ahci_fill_cmd_slot(struct ahci_port_priv *pp, unsigned int tag,
 			       u32 opts);
 #ifdef CONFIG_PM
@@ -404,14 +403,14 @@ static struct ata_port_operations ahci_sb600_ops = {
 #define AHCI_HFLAGS(flags)	.private_data	= (void *)(flags)
 
 static const struct ata_port_info ahci_port_info[] = {
-	/* board_ahci */
+	[board_ahci] =
 	{
 		.flags		= AHCI_FLAG_COMMON,
 		.pio_mask	= ATA_PIO4,
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_vt8251 */
+	[board_ahci_vt8251] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_PMP),
 		.flags		= AHCI_FLAG_COMMON,
@@ -419,7 +418,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_vt8251_ops,
 	},
-	/* board_ahci_ign_iferr */
+	[board_ahci_ign_iferr] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_IRQ_IF_ERR),
 		.flags		= AHCI_FLAG_COMMON,
@@ -427,7 +426,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_sb600 */
+	[board_ahci_sb600] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_SERR_INTERNAL |
 				 AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI |
@@ -437,7 +436,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_sb600_ops,
 	},
-	/* board_ahci_mv */
+	[board_ahci_mv] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_MSI |
 				 AHCI_HFLAG_MV_PATA | AHCI_HFLAG_NO_PMP),
@@ -447,7 +446,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_sb700, for SB700 and SB800 */
+	[board_ahci_sb700] =	/* for SB700 and SB800 */
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_SERR_INTERNAL),
 		.flags		= AHCI_FLAG_COMMON,
@@ -455,7 +454,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_sb600_ops,
 	},
-	/* board_ahci_mcp65 */
+	[board_ahci_mcp65] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_YES_NCQ),
 		.flags		= AHCI_FLAG_COMMON,
@@ -463,7 +462,7 @@ static const struct ata_port_info ahci_port_info[] = {
 		.udma_mask	= ATA_UDMA6,
 		.port_ops	= &ahci_ops,
 	},
-	/* board_ahci_nopmp */
+	[board_ahci_nopmp] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_NO_PMP),
 		.flags		= AHCI_FLAG_COMMON,
-- 
GitLab


From 2102d7497393e982bf38ffe8f5fd3d487104880d Mon Sep 17 00:00:00 2001
From: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Date: Sun, 15 Feb 2009 23:30:38 +0400
Subject: [PATCH 1873/2198] libata-sff: avoid byte swapping in
 ata_sff_data_xfer()

Handling of the trailing byte in ata_sff_data_xfer() is suboptimal bacause:

- it always initializes the padding buffer to 0 which is not really needed in
  both the read and write cases;

- it has to use memcpy() to transfer a single byte from/to the padding buffer;

- it uses io{read|write}16() accessors which swap bytes on the big endian CPUs
  and so have to additionally convert the data from/to the little endian format
  instead of using io{read|write}16_rep() accessors which are not supposed to
  change the byte ordering.

Signed-off-by: Sergei Shtylyov <sshtylyov@ru.mvista.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-sff.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c
index bb18415d3d634..bbbb1fab17557 100644
--- a/drivers/ata/libata-sff.c
+++ b/drivers/ata/libata-sff.c
@@ -727,17 +727,23 @@ unsigned int ata_sff_data_xfer(struct ata_device *dev, unsigned char *buf,
 	else
 		iowrite16_rep(data_addr, buf, words);
 
-	/* Transfer trailing 1 byte, if any. */
+	/* Transfer trailing byte, if any. */
 	if (unlikely(buflen & 0x01)) {
-		__le16 align_buf[1] = { 0 };
-		unsigned char *trailing_buf = buf + buflen - 1;
+		unsigned char pad[2];
 
+		/* Point buf to the tail of buffer */
+		buf += buflen - 1;
+
+		/*
+		 * Use io*16_rep() accessors here as well to avoid pointlessly
+		 * swapping bytes to and fro on the big endian machines...
+		 */
 		if (rw == READ) {
-			align_buf[0] = cpu_to_le16(ioread16(data_addr));
-			memcpy(trailing_buf, align_buf, 1);
+			ioread16_rep(data_addr, pad, 1);
+			*buf = pad[0];
 		} else {
-			memcpy(align_buf, trailing_buf, 1);
-			iowrite16(le16_to_cpu(align_buf[0]), data_addr);
+			pad[0] = *buf;
+			iowrite16_rep(data_addr, pad, 1);
 		}
 		words++;
 	}
-- 
GitLab


From f35b5e7c066a41f60683d2689e52a1336479913b Mon Sep 17 00:00:00 2001
From: Alexander Beregalov <a.beregalov@gmail.com>
Date: Wed, 15 Apr 2009 00:00:54 +0400
Subject: [PATCH 1874/2198] sata_sx4: speed up ECC initialization

ECC initialization takes too long. It writes zeroes by portions
of 4 byte, it takes more than 6 minutes on my machine to initialize
512Mb ECC DIMM module. Change portion to 128Kb - it significantly
reduces initialization time.

Signed-off-by: Alexander Beregalov <a.beregalov@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/sata_sx4.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/drivers/ata/sata_sx4.c b/drivers/ata/sata_sx4.c
index eb05a3c82a9ee..bbcf970068ad0 100644
--- a/drivers/ata/sata_sx4.c
+++ b/drivers/ata/sata_sx4.c
@@ -193,6 +193,7 @@ enum {
 					  PDC_TIMER_MASK_INT,
 };
 
+#define ECC_ERASE_BUF_SZ (128 * 1024)
 
 struct pdc_port_priv {
 	u8			dimm_buf[(ATA_PRD_SZ * ATA_MAX_PRD) + 512];
@@ -1280,7 +1281,6 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
 {
 	int speed, size, length;
 	u32 addr, spd0, pci_status;
-	u32 tmp = 0;
 	u32 time_period = 0;
 	u32 tcount = 0;
 	u32 ticks = 0;
@@ -1395,14 +1395,17 @@ static unsigned int pdc20621_dimm_init(struct ata_host *host)
 	pdc20621_i2c_read(host, PDC_DIMM0_SPD_DEV_ADDRESS,
 			  PDC_DIMM_SPD_TYPE, &spd0);
 	if (spd0 == 0x02) {
+		void *buf;
 		VPRINTK("Start ECC initialization\n");
 		addr = 0;
 		length = size * 1024 * 1024;
+		buf = kzalloc(ECC_ERASE_BUF_SZ, GFP_KERNEL);
 		while (addr < length) {
-			pdc20621_put_to_dimm(host, (void *) &tmp, addr,
-					     sizeof(u32));
-			addr += sizeof(u32);
+			pdc20621_put_to_dimm(host, buf, addr,
+					     ECC_ERASE_BUF_SZ);
+			addr += ECC_ERASE_BUF_SZ;
 		}
+		kfree(buf);
 		VPRINTK("Finish ECC initialization\n");
 	}
 	return 0;
-- 
GitLab


From 31f80112cc7e7ea4c220d6f62b0a7052754befb3 Mon Sep 17 00:00:00 2001
From: Robert Hancock <hancockrwd@gmail.com>
Date: Mon, 13 Apr 2009 22:57:28 -0600
Subject: [PATCH 1875/2198] sata_sil: enable 32-bit PIO

32-bit PIO seems to work fine on sata_sil hardware (tested on SiI3114) and is
listed as OK in the Silicon Image datasheets. Enable it.

Signed-off-by: Robert Hancock <hancockrwd@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/sata_sil.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/ata/sata_sil.c b/drivers/ata/sata_sil.c
index e67ce8e5caa5f..030ec079b184e 100644
--- a/drivers/ata/sata_sil.c
+++ b/drivers/ata/sata_sil.c
@@ -183,7 +183,7 @@ static struct scsi_host_template sil_sht = {
 };
 
 static struct ata_port_operations sil_ops = {
-	.inherits		= &ata_bmdma_port_ops,
+	.inherits		= &ata_bmdma32_port_ops,
 	.dev_config		= sil_dev_config,
 	.set_mode		= sil_set_mode,
 	.bmdma_setup            = sil_bmdma_setup,
-- 
GitLab


From 437681800bdaa9feb58cf943dfbbd239c21d3705 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Wed, 20 May 2009 09:44:39 +0200
Subject: [PATCH 1876/2198] [libata] get rid of ATA_MAX_QUEUE loop in
 ata_qc_complete_multiple() v2

We very rarely (if ever) complete more than one command in the
sactive mask at the time, even for extremely high IO rates. So
looping over the entire range of possible tags is pointless,
instead use __ffs() to just find the completed tags directly.

Updated to clear the tag from the done_mask instead of shifting
done_mask down as suggested by From: Tejun Heo <htejun@gmail.com>
Verified with a user space tester to produce the same results.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/libata-core.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c
index c9242301cfa14..ca4d208ddf3ba 100644
--- a/drivers/ata/libata-core.c
+++ b/drivers/ata/libata-core.c
@@ -5031,7 +5031,6 @@ int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active)
 {
 	int nr_done = 0;
 	u32 done_mask;
-	int i;
 
 	done_mask = ap->qc_active ^ qc_active;
 
@@ -5041,16 +5040,16 @@ int ata_qc_complete_multiple(struct ata_port *ap, u32 qc_active)
 		return -EINVAL;
 	}
 
-	for (i = 0; i < ATA_MAX_QUEUE; i++) {
+	while (done_mask) {
 		struct ata_queued_cmd *qc;
+		unsigned int tag = __ffs(done_mask);
 
-		if (!(done_mask & (1 << i)))
-			continue;
-
-		if ((qc = ata_qc_from_tag(ap, i))) {
+		qc = ata_qc_from_tag(ap, tag);
+		if (qc) {
 			ata_qc_complete(qc);
 			nr_done++;
 		}
+		done_mask &= ~(1 << tag);
 	}
 
 	return nr_done;
-- 
GitLab


From d50ce07d6fd8a422757a231f9d7293cbddeaec31 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 12 May 2009 10:57:41 +0900
Subject: [PATCH 1877/2198] ahci: misc cleanups for EM stuff

Make the following EM related cleanups.

* Use msleep(1) instead of udelay(100) and reduce retry count to 5.

* s/MAX_SLOTS/EM_MAX_SLOTS/, s/MAX_RETRY/EM_MAX_RETRY/

* Make EM constants enums as suggested by Jeff.

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: David Milburn <dmilburn@redhat.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ahci.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index cd832cb69492e..2141a31f176be 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -77,8 +77,6 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
 			      size_t size);
 static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state,
 					ssize_t size);
-#define MAX_SLOTS 8
-#define MAX_RETRY 15
 
 enum {
 	AHCI_PCI_BAR		= 5,
@@ -231,6 +229,10 @@ enum {
 
 	ICH_MAP				= 0x90, /* ICH MAP register */
 
+	/* em constants */
+	EM_MAX_SLOTS			= 8,
+	EM_MAX_RETRY			= 5,
+
 	/* em_ctl bits */
 	EM_CTL_RST			= (1 << 9), /* Reset */
 	EM_CTL_TM			= (1 << 8), /* Transmit Message */
@@ -282,8 +284,8 @@ struct ahci_port_priv {
 	unsigned int		ncq_saw_dmas:1;
 	unsigned int		ncq_saw_sdb:1;
 	u32 			intr_mask;	/* interrupts to enable */
-	struct ahci_em_priv	em_priv[MAX_SLOTS];/* enclosure management info
-					 	 * per PM slot */
+	/* enclosure management info per PM slot */
+	struct ahci_em_priv	em_priv[EM_MAX_SLOTS];
 };
 
 static int ahci_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val);
@@ -1140,12 +1142,12 @@ static void ahci_start_port(struct ata_port *ap)
 			emp = &pp->em_priv[link->pmp];
 
 			/* EM Transmit bit maybe busy during init */
-			for (i = 0; i < MAX_RETRY; i++) {
+			for (i = 0; i < EM_MAX_RETRY; i++) {
 				rc = ahci_transmit_led_message(ap,
 							       emp->led_state,
 							       4);
 				if (rc == -EBUSY)
-					udelay(100);
+					msleep(1);
 				else
 					break;
 			}
@@ -1339,7 +1341,7 @@ static ssize_t ahci_transmit_led_message(struct ata_port *ap, u32 state,
 
 	/* get the slot number from the message */
 	pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8;
-	if (pmp < MAX_SLOTS)
+	if (pmp < EM_MAX_SLOTS)
 		emp = &pp->em_priv[pmp];
 	else
 		return -EINVAL;
@@ -1407,7 +1409,7 @@ static ssize_t ahci_led_store(struct ata_port *ap, const char *buf,
 
 	/* get the slot number from the message */
 	pmp = (state & EM_MSG_LED_PMP_SLOT) >> 8;
-	if (pmp < MAX_SLOTS)
+	if (pmp < EM_MAX_SLOTS)
 		emp = &pp->em_priv[pmp];
 	else
 		return -EINVAL;
-- 
GitLab


From 347979a034539ab20f3bc0c30ac0ccd3c4fd4c2e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 6 May 2009 17:10:08 +0100
Subject: [PATCH 1878/2198] ata_piix: Turn on hotplugging support for older
 chips

We can't do this for the later ones as they have all sorts of magic boot
time stuff that needs reviewing and the like. However we can do it for the
older ones and it turns out we need to as some IBM docking stations have a
second PIIX series device in them and without this change you can't use it
very well

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ata_piix.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 1aeb7082b0c4e..716e369f39ba8 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1509,8 +1509,8 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
 		dev_printk(KERN_DEBUG, &pdev->dev,
 			   "version " DRV_VERSION "\n");
 
-	/* no hotplugging support (FIXME) */
-	if (!in_module_init)
+	/* no hotplugging support for later devices (FIXME) */
+	if (!in_module_init && ent->driver_data >= ich5_sata)
 		return -ENODEV;
 
 	if (piix_broken_system_poweroff(pdev)) {
-- 
GitLab


From 7654db1a9256d746ae4d229ba675f616a5d5e1a1 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Wed, 6 May 2009 17:10:17 +0100
Subject: [PATCH 1879/2198] ata_piix: Remove stale comment

Combined mode pci quirk hacks went away - so the table to keep in sync
no longer exists.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ata_piix.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 716e369f39ba8..6e165bf91d65d 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -223,10 +223,8 @@ static const struct pci_device_id piix_pci_tbl[] = {
 	/* ICH8 Mobile PATA Controller */
 	{ 0x8086, 0x2850, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich_pata_100 },
 
-	/* NOTE: The following PCI ids must be kept in sync with the
-	 * list in drivers/pci/quirks.c.
-	 */
-
+	/* SATA ports */
+	
 	/* 82801EB (ICH5) */
 	{ 0x8086, 0x24d1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, ich5_sata },
 	/* 82801EB (ICH5) */
-- 
GitLab


From ac04527f7947020c5890090b2ac87af4e98d977e Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Jun 2009 15:52:39 +0300
Subject: [PATCH 1880/2198] KVM: Disable large pages on misaligned memory slots

If a slots guest physical address and host virtual address unequal (mod
large page size), then we would erronously try to back guest large pages
with host large pages.  Detect this misalignment and diable large page
support for the trouble slot.

Cc: stable@kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5fed9bfc3cf5b..5f865ed4c4318 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1086,7 +1086,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 {
 	int r;
 	gfn_t base_gfn;
-	unsigned long npages;
+	unsigned long npages, ugfn;
 	int largepages;
 	unsigned long i;
 	struct kvm_memory_slot *memslot;
@@ -1177,6 +1177,14 @@ int __kvm_set_memory_region(struct kvm *kvm,
 			new.lpage_info[0].write_count = 1;
 		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE)
 			new.lpage_info[largepages-1].write_count = 1;
+		ugfn = new.userspace_addr >> PAGE_SHIFT;
+		/*
+		 * If the gfn and userspace address are not aligned wrt each
+		 * other, disable large page support for this slot
+		 */
+		if ((base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE - 1))
+			for (i = 0; i < largepages; ++i)
+				new.lpage_info[i].write_count = 1;
 	}
 
 	/* Allocate page dirty bitmap if needed */
-- 
GitLab


From 09f8ca74ae6c2d78b2c7f6c0751ed0cbe815a3d9 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Mon, 8 Jun 2009 15:55:21 +0300
Subject: [PATCH 1881/2198] KVM: Prevent overflow in largepages calculation

If userspace specifies a memory slot that is larger than 8 petabytes, it
could overflow the largepages variable.

Cc: stable@kernel.org
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 virt/kvm/kvm_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 5f865ed4c4318..e21194566b714 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1087,8 +1087,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	int r;
 	gfn_t base_gfn;
 	unsigned long npages, ugfn;
-	int largepages;
-	unsigned long i;
+	unsigned long largepages, i;
 	struct kvm_memory_slot *memslot;
 	struct kvm_memory_slot old, new;
 
-- 
GitLab


From bd2b5b12849a3446abad0b25e920f86f5480b309 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 13:40:57 +0200
Subject: [PATCH 1882/2198] perf_counter: More aggressive frequency adjustment

Also employ the overflow handler to adjust the frequency, this results
in a stable frequency in about 40~50 samples, instead of that many ticks.

This also means we can start sampling at a sample period of 1 without
running head-first into the throttle.

It relies on sched_clock() to accurately measure the time difference
between the overflow NMIs.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |   5 +-
 include/linux/perf_counter.h       |   1 +
 kernel/perf_counter.c              | 130 +++++++++++++++++++----------
 3 files changed, 92 insertions(+), 44 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 49f258537cbfc..240ca5630632b 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -696,10 +696,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	if (!attr->exclude_kernel)
 		hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
 
-	if (!hwc->sample_period)
+	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		atomic64_set(&hwc->period_left, hwc->sample_period);
+	}
 
-	atomic64_set(&hwc->period_left, hwc->sample_period);
 	counter->destroy = hw_perf_counter_destroy;
 
 	/*
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 3586df840f693..282d8cc489804 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -371,6 +371,7 @@ struct hw_perf_counter {
 
 	u64				freq_count;
 	u64				freq_interrupts;
+	u64				freq_stamp;
 #endif
 };
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5eacaaf3f9cdc..51c571ee4d0b1 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1184,13 +1184,33 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 static void perf_log_throttle(struct perf_counter *counter, int enable);
 static void perf_log_period(struct perf_counter *counter, u64 period);
 
-static void perf_adjust_freq(struct perf_counter_context *ctx)
+static void perf_adjust_period(struct perf_counter *counter, u64 events)
+{
+	struct hw_perf_counter *hwc = &counter->hw;
+	u64 period, sample_period;
+	s64 delta;
+
+	events *= hwc->sample_period;
+	period = div64_u64(events, counter->attr.sample_freq);
+
+	delta = (s64)(period - hwc->sample_period);
+	delta = (delta + 7) / 8; /* low pass filter */
+
+	sample_period = hwc->sample_period + delta;
+
+	if (!sample_period)
+		sample_period = 1;
+
+	perf_log_period(counter, sample_period);
+
+	hwc->sample_period = sample_period;
+}
+
+static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 {
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	u64 interrupts, sample_period;
-	u64 events, period, freq;
-	s64 delta;
+	u64 interrupts, freq;
 
 	spin_lock(&ctx->lock);
 	list_for_each_entry(counter, &ctx->counter_list, list_entry) {
@@ -1202,6 +1222,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		interrupts = hwc->interrupts;
 		hwc->interrupts = 0;
 
+		/*
+		 * unthrottle counters on the tick
+		 */
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(counter, 1);
 			counter->pmu->unthrottle(counter);
@@ -1211,6 +1234,9 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		if (!counter->attr.freq || !counter->attr.sample_freq)
 			continue;
 
+		/*
+		 * if the specified freq < HZ then we need to skip ticks
+		 */
 		if (counter->attr.sample_freq < HZ) {
 			freq = counter->attr.sample_freq;
 
@@ -1226,20 +1252,20 @@ static void perf_adjust_freq(struct perf_counter_context *ctx)
 		} else
 			freq = HZ;
 
-		events = freq * interrupts * hwc->sample_period;
-		period = div64_u64(events, counter->attr.sample_freq);
-
-		delta = (s64)(1 + period - hwc->sample_period);
-		delta >>= 1;
-
-		sample_period = hwc->sample_period + delta;
-
-		if (!sample_period)
-			sample_period = 1;
+		perf_adjust_period(counter, freq * interrupts);
 
-		perf_log_period(counter, sample_period);
-
-		hwc->sample_period = sample_period;
+		/*
+		 * In order to avoid being stalled by an (accidental) huge
+		 * sample period, force reset the sample period if we didn't
+		 * get any events in this freq period.
+		 */
+		if (!interrupts) {
+			perf_disable();
+			counter->pmu->disable(counter);
+			atomic_set(&hwc->period_left, 0);
+			counter->pmu->enable(counter);
+			perf_enable();
+		}
 	}
 	spin_unlock(&ctx->lock);
 }
@@ -1279,9 +1305,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
 	cpuctx = &per_cpu(perf_cpu_context, cpu);
 	ctx = curr->perf_counter_ctxp;
 
-	perf_adjust_freq(&cpuctx->ctx);
+	perf_ctx_adjust_freq(&cpuctx->ctx);
 	if (ctx)
-		perf_adjust_freq(ctx);
+		perf_ctx_adjust_freq(ctx);
 
 	perf_counter_cpu_sched_out(cpuctx);
 	if (ctx)
@@ -1647,10 +1673,10 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 		counter->attr.sample_freq = value;
 	} else {
+		perf_log_period(counter, value);
+
 		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
-
-		perf_log_period(counter, value);
 	}
 unlock:
 	spin_unlock_irq(&ctx->lock);
@@ -2853,35 +2879,41 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
  * event flow.
  */
 
+struct freq_event {
+	struct perf_event_header	header;
+	u64				time;
+	u64				id;
+	u64				period;
+};
+
 static void perf_log_period(struct perf_counter *counter, u64 period)
 {
 	struct perf_output_handle handle;
+	struct freq_event event;
 	int ret;
 
-	struct {
-		struct perf_event_header	header;
-		u64				time;
-		u64				id;
-		u64				period;
-	} freq_event = {
+	if (counter->hw.sample_period == period)
+		return;
+
+	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
+		return;
+
+	event = (struct freq_event) {
 		.header = {
 			.type = PERF_EVENT_PERIOD,
 			.misc = 0,
-			.size = sizeof(freq_event),
+			.size = sizeof(event),
 		},
 		.time = sched_clock(),
 		.id = counter->id,
 		.period = period,
 	};
 
-	if (counter->hw.sample_period == period)
-		return;
-
-	ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0);
+	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
 	if (ret)
 		return;
 
-	perf_output_put(&handle, freq_event);
+	perf_output_put(&handle, event);
 	perf_output_end(&handle);
 }
 
@@ -2923,15 +2955,16 @@ int perf_counter_overflow(struct perf_counter *counter,
 {
 	int events = atomic_read(&counter->event_limit);
 	int throttle = counter->pmu->unthrottle != NULL;
+	struct hw_perf_counter *hwc = &counter->hw;
 	int ret = 0;
 
 	if (!throttle) {
-		counter->hw.interrupts++;
+		hwc->interrupts++;
 	} else {
-		if (counter->hw.interrupts != MAX_INTERRUPTS) {
-			counter->hw.interrupts++;
-			if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) {
-				counter->hw.interrupts = MAX_INTERRUPTS;
+		if (hwc->interrupts != MAX_INTERRUPTS) {
+			hwc->interrupts++;
+			if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) {
+				hwc->interrupts = MAX_INTERRUPTS;
 				perf_log_throttle(counter, 0);
 				ret = 1;
 			}
@@ -2945,6 +2978,16 @@ int perf_counter_overflow(struct perf_counter *counter,
 		}
 	}
 
+	if (counter->attr.freq) {
+		u64 now = sched_clock();
+		s64 delta = now - hwc->freq_stamp;
+
+		hwc->freq_stamp = now;
+
+		if (delta > 0 && delta < TICK_NSEC)
+			perf_adjust_period(counter, NSEC_PER_SEC / (int)delta);
+	}
+
 	/*
 	 * XXX event_limit might not quite work as expected on inherited
 	 * counters
@@ -3379,7 +3422,6 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;
-	counter->hw.sample_period = counter->attr.sample_period;
 
 	return &perf_ops_generic;
 }
@@ -3483,10 +3525,11 @@ perf_counter_alloc(struct perf_counter_attr *attr,
 	pmu = NULL;
 
 	hwc = &counter->hw;
+	hwc->sample_period = attr->sample_period;
 	if (attr->freq && attr->sample_freq)
-		hwc->sample_period = div64_u64(TICK_NSEC, attr->sample_freq);
-	else
-		hwc->sample_period = attr->sample_period;
+		hwc->sample_period = 1;
+
+	atomic64_set(&hwc->period_left, hwc->sample_period);
 
 	/*
 	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters
@@ -3687,6 +3730,9 @@ inherit_counter(struct perf_counter *parent_counter,
 	else
 		child_counter->state = PERF_COUNTER_STATE_OFF;
 
+	if (parent_counter->attr.freq)
+		child_counter->hw.sample_period = parent_counter->hw.sample_period;
+
 	/*
 	 * Link it up in the child's context:
 	 */
-- 
GitLab


From 4502d77c1d8f15f20c04b92cb96c12d4e465de29 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 15:03:06 +0200
Subject: [PATCH 1883/2198] perf_counter tools: Small frequency related fixes

Create the counter in a disabled state and only enable it after we
mmap() the buffer, this allows us to see the first few samples (and
observe the frequency ramp).

Furthermore, print the period in the verbose report.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 3 +++
 tools/perf/builtin-report.c | 6 ++++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index deaee42d5eb04..a5698add2fcb7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -347,6 +347,7 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	attr->mmap		= track;
 	attr->comm		= track;
 	attr->inherit		= (cpu < 0) && inherit;
+	attr->disabled		= 1;
 
 	track = 0; /* only the first counter needs these */
 
@@ -402,6 +403,8 @@ static void create_counter(int counter, int cpu, pid_t pid)
 		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
 		exit(-1);
 	}
+
+	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
 }
 
 static void open_counters(int cpu, pid_t pid)
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 0b18cb99a858e..9a0e31e79e9df 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -47,6 +47,7 @@ struct ip_event {
 	struct perf_event_header header;
 	__u64 ip;
 	__u32 pid, tid;
+	__u64 period;
 };
 
 struct mmap_event {
@@ -943,12 +944,13 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	uint64_t ip = event->ip.ip;
 	struct map *map = NULL;
 
-	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
 		event->ip.pid,
-		(void *)(long)ip);
+		(void *)(long)ip,
+		(long long)event->ip.period);
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
-- 
GitLab


From f7b7c26e01e51fe46097e11f179dc71ce7950084 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 15:55:59 +0200
Subject: [PATCH 1884/2198] perf_counter tools: Propagate signals properly

Currently report and stat catch SIGINT (and others) without altering
their exit state. This means that things like:

   while :; do perf stat ./foo ; done

Loops become hard-to-interrupt, because bash never sees perf terminate
due to interruption. Fix this.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 12 ++++++++++++
 tools/perf/builtin-stat.c   | 13 +++++++++++++
 2 files changed, 25 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index a5698add2fcb7..c10553c3460f0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -169,10 +169,21 @@ static void mmap_read(struct mmap_data *md)
 }
 
 static volatile int done = 0;
+static volatile int signr = -1;
 
 static void sig_handler(int sig)
 {
 	done = 1;
+	signr = sig;
+}
+
+static void sig_atexit(void)
+{
+	if (signr == -1)
+		return;
+
+	signal(signr, SIG_DFL);
+	kill(getpid(), signr);
 }
 
 static void pid_synthesize_comm_event(pid_t pid, int full)
@@ -459,6 +470,7 @@ static int __cmd_record(int argc, const char **argv)
 	} else for (i = 0; i < nr_cpus; i++)
 		open_counters(i, target_pid);
 
+	atexit(sig_atexit);
 	signal(SIGCHLD, sig_handler);
 	signal(SIGINT, sig_handler);
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 80855090fd9fc..6404906924fa2 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -296,8 +296,20 @@ static int do_perf_stat(int argc, const char **argv)
 	return 0;
 }
 
+static volatile int signr = -1;
+
 static void skip_signal(int signo)
 {
+	signr = signo;
+}
+
+static void sig_atexit(void)
+{
+	if (signr == -1)
+		return;
+
+	signal(signr, SIG_DFL);
+	kill(getpid(), signr);
 }
 
 static const char * const stat_usage[] = {
@@ -345,6 +357,7 @@ int cmd_stat(int argc, const char **argv, const char *prefix)
 	 * What we want is for Ctrl-C to work in the exec()-ed
 	 * task, but being ignored by perf stat itself:
 	 */
+	atexit(sig_atexit);
 	signal(SIGINT,  skip_signal);
 	signal(SIGALRM, skip_signal);
 	signal(SIGABRT, skip_signal);
-- 
GitLab


From 58a09b38cfcd700b796ea07ae3d2e0efbb28b561 Mon Sep 17 00:00:00 2001
From: Shane Huang <shane.huang@amd.com>
Date: Wed, 27 May 2009 15:04:43 +0800
Subject: [PATCH 1885/2198] [libata] ahci: Restore SB600 SATA controller 64 bit
 DMA

Community reported one SB600 SATA issue(BZ #9412), which led to 64 bit
DMA disablement for all SB600 revisions by driver maintainers with
commits c7a42156d99bcea7f8173ba7a6034bbaa2ecb77c and
4cde32fc4b32e96a99063af3183acdfd54c563f0.

But the root cause is ASUS M2A-VM system BIOS bug in old revisions
like 0901, while forcing into 32bit DMA happens to work as workaround.
Now it's time to withdraw 4cde32fc4b32e96a99063af3183acdfd54c563f0
so as to restore the SB600 SATA 64bit DMA capability.
This patch is also adding the workaround for M2A-VM old BIOS revisions,
but users are suggested to upgrade their system BIOS to the latest one
if they meet this issue.

Signed-off-by: Shane Huang <shane.huang@amd.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ahci.c          | 52 +++++++++++++++++++++++++++++++++++--
 drivers/firmware/dmi_scan.c |  1 +
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c
index 2141a31f176be..15a23031833f4 100644
--- a/drivers/ata/ahci.c
+++ b/drivers/ata/ahci.c
@@ -431,8 +431,7 @@ static const struct ata_port_info ahci_port_info[] = {
 	[board_ahci_sb600] =
 	{
 		AHCI_HFLAGS	(AHCI_HFLAG_IGN_SERR_INTERNAL |
-				 AHCI_HFLAG_32BIT_ONLY | AHCI_HFLAG_NO_MSI |
-				 AHCI_HFLAG_SECT255),
+				 AHCI_HFLAG_NO_MSI | AHCI_HFLAG_SECT255),
 		.flags		= AHCI_FLAG_COMMON,
 		.pio_mask	= ATA_PIO4,
 		.udma_mask	= ATA_UDMA6,
@@ -2585,6 +2584,51 @@ static void ahci_p5wdh_workaround(struct ata_host *host)
 	}
 }
 
+/*
+ * SB600 ahci controller on ASUS M2A-VM can't do 64bit DMA with older
+ * BIOS.  The oldest version known to be broken is 0901 and working is
+ * 1501 which was released on 2007-10-26.  Force 32bit DMA on anything
+ * older than 1501.  Please read bko#9412 for more info.
+ */
+static bool ahci_asus_m2a_vm_32bit_only(struct pci_dev *pdev)
+{
+	static const struct dmi_system_id sysids[] = {
+		{
+			.ident = "ASUS M2A-VM",
+			.matches = {
+				DMI_MATCH(DMI_BOARD_VENDOR,
+					  "ASUSTeK Computer INC."),
+				DMI_MATCH(DMI_BOARD_NAME, "M2A-VM"),
+			},
+		},
+		{ }
+	};
+	const char *cutoff_mmdd = "10/26";
+	const char *date;
+	int year;
+
+	if (pdev->bus->number != 0 || pdev->devfn != PCI_DEVFN(0x12, 0) ||
+	    !dmi_check_system(sysids))
+		return false;
+
+	/*
+	 * Argh.... both version and date are free form strings.
+	 * Let's hope they're using the same date format across
+	 * different versions.
+	 */
+	date = dmi_get_system_info(DMI_BIOS_DATE);
+	year = dmi_get_year(DMI_BIOS_DATE);
+	if (date && strlen(date) >= 10 && date[2] == '/' && date[5] == '/' &&
+	    (year > 2007 ||
+	     (year == 2007 && strncmp(date, cutoff_mmdd, 5) >= 0)))
+		return false;
+
+	dev_printk(KERN_WARNING, &pdev->dev, "ASUS M2A-VM: BIOS too old, "
+		   "forcing 32bit DMA, update BIOS\n");
+
+	return true;
+}
+
 static bool ahci_broken_system_poweroff(struct pci_dev *pdev)
 {
 	static const struct dmi_system_id broken_systems[] = {
@@ -2745,6 +2789,10 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
 	if (board_id == board_ahci_sb700 && pdev->revision >= 0x40)
 		hpriv->flags &= ~AHCI_HFLAG_IGN_SERR_INTERNAL;
 
+	/* apply ASUS M2A_VM quirk */
+	if (ahci_asus_m2a_vm_32bit_only(pdev))
+		hpriv->flags |= AHCI_HFLAG_32BIT_ONLY;
+
 	if (!(hpriv->flags & AHCI_HFLAG_NO_MSI))
 		pci_enable_msi(pdev);
 
diff --git a/drivers/firmware/dmi_scan.c b/drivers/firmware/dmi_scan.c
index 5f1b5400d96ab..24c84ae81527b 100644
--- a/drivers/firmware/dmi_scan.c
+++ b/drivers/firmware/dmi_scan.c
@@ -596,6 +596,7 @@ int dmi_get_year(int field)
 
 	return year;
 }
+EXPORT_SYMBOL(dmi_get_year);
 
 /**
  *	dmi_walk - Walk the DMI table and get called back for every record
-- 
GitLab


From 7f4774b38ee6270bbc6c3015cb3fa6c415ffb340 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Wed, 10 Jun 2009 16:29:07 +0900
Subject: [PATCH 1886/2198] sata_nv: use hardreset only for post-boot probing

When I thought it was finally defeated, it came back with vengeance.
The failure cases are ever more convoluted.  Now there is a single
combination which fails boot probing - MCP5x + Intel SSD and there are
two hotplug failure reports on different flavors where softreset fails
to bring up the device.

Through the many bug reports after the switch to hardreset, the
following patterns emerged.

- Softreset during boot always works.

- Hardreset during boot sometimes fails to bring up the link on
  certain comibnations and device signature acquisition is unreliable.

- Hardreset is often necessary after hotplug.

It looks like the old behavior of preferring softreset was somehow
pretty close to the working reset protocol although it could have lost
a device during phy error handling by issuing hardreset.

This patch implements nv_hardreset() which kicks in only for post-boot
(!LOADING) device probing resets.  This should be able to work around
all known problem cases.  This isn't perfect but given the various
hardreset quirks on these controllers, I think this is as good as it
can get.

Tested on mcp5x (swncq), nf3 and ck804 for all both boot, warm and
hot probing cases.

Kudos to all the bug reporters and their painful hours with these damn
controllers.  ;-)

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Robert Hancock <hancockr@shaw.ca>
Reported-by: David Lang <david@lang.hm>
Reported-by: Samo Vodopivec <lament.email.si@gmail.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/sata_nv.c | 131 ++++++++++++++++++++++++++----------------
 1 file changed, 81 insertions(+), 50 deletions(-)

diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c
index 6cda12ba81225..b2d11f300c390 100644
--- a/drivers/ata/sata_nv.c
+++ b/drivers/ata/sata_nv.c
@@ -305,8 +305,8 @@ static irqreturn_t nv_ck804_interrupt(int irq, void *dev_instance);
 static int nv_scr_read(struct ata_link *link, unsigned int sc_reg, u32 *val);
 static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val);
 
-static int nv_noclassify_hardreset(struct ata_link *link, unsigned int *class,
-				   unsigned long deadline);
+static int nv_hardreset(struct ata_link *link, unsigned int *class,
+			unsigned long deadline);
 static void nv_nf2_freeze(struct ata_port *ap);
 static void nv_nf2_thaw(struct ata_port *ap);
 static void nv_ck804_freeze(struct ata_port *ap);
@@ -406,49 +406,82 @@ static struct scsi_host_template nv_swncq_sht = {
 	.slave_configure	= nv_swncq_slave_config,
 };
 
-static struct ata_port_operations nv_common_ops = {
+/*
+ * NV SATA controllers have various different problems with hardreset
+ * protocol depending on the specific controller and device.
+ *
+ * GENERIC:
+ *
+ *  bko11195 reports that link doesn't come online after hardreset on
+ *  generic nv's and there have been several other similar reports on
+ *  linux-ide.
+ *
+ *  bko12351#c23 reports that warmplug on MCP61 doesn't work with
+ *  softreset.
+ *
+ * NF2/3:
+ *
+ *  bko3352 reports nf2/3 controllers can't determine device signature
+ *  reliably after hardreset.  The following thread reports detection
+ *  failure on cold boot with the standard debouncing timing.
+ *
+ *  http://thread.gmane.org/gmane.linux.ide/34098
+ *
+ *  bko12176 reports that hardreset fails to bring up the link during
+ *  boot on nf2.
+ *
+ * CK804:
+ *
+ *  For initial probing after boot and hot plugging, hardreset mostly
+ *  works fine on CK804 but curiously, reprobing on the initial port
+ *  by rescanning or rmmod/insmod fails to acquire the initial D2H Reg
+ *  FIS in somewhat undeterministic way.
+ *
+ * SWNCQ:
+ *
+ *  bko12351 reports that when SWNCQ is enabled, for hotplug to work,
+ *  hardreset should be used and hardreset can't report proper
+ *  signature, which suggests that mcp5x is closer to nf2 as long as
+ *  reset quirkiness is concerned.
+ *
+ *  bko12703 reports that boot probing fails for intel SSD with
+ *  hardreset.  Link fails to come online.  Softreset works fine.
+ *
+ * The failures are varied but the following patterns seem true for
+ * all flavors.
+ *
+ * - Softreset during boot always works.
+ *
+ * - Hardreset during boot sometimes fails to bring up the link on
+ *   certain comibnations and device signature acquisition is
+ *   unreliable.
+ *
+ * - Hardreset is often necessary after hotplug.
+ *
+ * So, preferring softreset for boot probing and error handling (as
+ * hardreset might bring down the link) but using hardreset for
+ * post-boot probing should work around the above issues in most
+ * cases.  Define nv_hardreset() which only kicks in for post-boot
+ * probing and use it for all variants.
+ */
+static struct ata_port_operations nv_generic_ops = {
 	.inherits		= &ata_bmdma_port_ops,
 	.lost_interrupt		= ATA_OP_NULL,
 	.scr_read		= nv_scr_read,
 	.scr_write		= nv_scr_write,
+	.hardreset		= nv_hardreset,
 };
 
-/* OSDL bz11195 reports that link doesn't come online after hardreset
- * on generic nv's and there have been several other similar reports
- * on linux-ide.  Disable hardreset for generic nv's.
- */
-static struct ata_port_operations nv_generic_ops = {
-	.inherits		= &nv_common_ops,
-	.hardreset		= ATA_OP_NULL,
-};
-
-/* nf2 is ripe with hardreset related problems.
- *
- * kernel bz#3352 reports nf2/3 controllers can't determine device
- * signature reliably.  The following thread reports detection failure
- * on cold boot with the standard debouncing timing.
- *
- * http://thread.gmane.org/gmane.linux.ide/34098
- *
- * And bz#12176 reports that hardreset simply doesn't work on nf2.
- * Give up on it and just don't do hardreset.
- */
 static struct ata_port_operations nv_nf2_ops = {
 	.inherits		= &nv_generic_ops,
 	.freeze			= nv_nf2_freeze,
 	.thaw			= nv_nf2_thaw,
 };
 
-/* For initial probing after boot and hot plugging, hardreset mostly
- * works fine on CK804 but curiously, reprobing on the initial port by
- * rescanning or rmmod/insmod fails to acquire the initial D2H Reg FIS
- * in somewhat undeterministic way.  Use noclassify hardreset.
- */
 static struct ata_port_operations nv_ck804_ops = {
-	.inherits		= &nv_common_ops,
+	.inherits		= &nv_generic_ops,
 	.freeze			= nv_ck804_freeze,
 	.thaw			= nv_ck804_thaw,
-	.hardreset		= nv_noclassify_hardreset,
 	.host_stop		= nv_ck804_host_stop,
 };
 
@@ -476,19 +509,8 @@ static struct ata_port_operations nv_adma_ops = {
 	.host_stop		= nv_adma_host_stop,
 };
 
-/* Kernel bz#12351 reports that when SWNCQ is enabled, for hotplug to
- * work, hardreset should be used and hardreset can't report proper
- * signature, which suggests that mcp5x is closer to nf2 as long as
- * reset quirkiness is concerned.  Define separate ops for mcp5x with
- * nv_noclassify_hardreset().
- */
-static struct ata_port_operations nv_mcp5x_ops = {
-	.inherits		= &nv_common_ops,
-	.hardreset		= nv_noclassify_hardreset,
-};
-
 static struct ata_port_operations nv_swncq_ops = {
-	.inherits		= &nv_mcp5x_ops,
+	.inherits		= &nv_generic_ops,
 
 	.qc_defer		= ata_std_qc_defer,
 	.qc_prep		= nv_swncq_qc_prep,
@@ -557,7 +579,7 @@ static const struct ata_port_info nv_port_info[] = {
 		.pio_mask	= NV_PIO_MASK,
 		.mwdma_mask	= NV_MWDMA_MASK,
 		.udma_mask	= NV_UDMA_MASK,
-		.port_ops	= &nv_mcp5x_ops,
+		.port_ops	= &nv_generic_ops,
 		.private_data	= NV_PI_PRIV(nv_generic_interrupt, &nv_sht),
 	},
 	/* SWNCQ */
@@ -1559,15 +1581,24 @@ static int nv_scr_write(struct ata_link *link, unsigned int sc_reg, u32 val)
 	return 0;
 }
 
-static int nv_noclassify_hardreset(struct ata_link *link, unsigned int *class,
-				   unsigned long deadline)
+static int nv_hardreset(struct ata_link *link, unsigned int *class,
+			unsigned long deadline)
 {
-	bool online;
-	int rc;
+	struct ata_eh_context *ehc = &link->eh_context;
 
-	rc = sata_link_hardreset(link, sata_deb_timing_hotplug, deadline,
-				 &online, NULL);
-	return online ? -EAGAIN : rc;
+	/* Do hardreset iff it's post-boot probing, please read the
+	 * comment above port ops for details.
+	 */
+	if (!(link->ap->pflags & ATA_PFLAG_LOADING) &&
+	    !ata_dev_enabled(link->device))
+		sata_link_hardreset(link, sata_deb_timing_hotplug, deadline,
+				    NULL, NULL);
+	else if (!(ehc->i.flags & ATA_EHI_QUIET))
+		ata_link_printk(link, KERN_INFO,
+				"nv: skipping hardreset on occupied port\n");
+
+	/* device signature acquisition is unreliable */
+	return -EAGAIN;
 }
 
 static void nv_nf2_freeze(struct ata_port *ap)
-- 
GitLab


From 517d3cc15b36392e518abab6bacbb72089658313 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Wed, 13 May 2009 15:02:42 +0100
Subject: [PATCH 1887/2198] [libata] ata_piix: Enable parallel scan

This patch turns on parallel scanning for the ata_piix driver.
This driver is used on most netbooks (no AHCI for cheap storage it seems).
The scan is the dominating time factor in the kernel boot for these
devices; with this flag it gets cut in half for the device I used
for testing (eeepc).
Alan took a look at the driver source and concluded that it ought to be safe
to do for this driver.  Alan has also checked with the hardware team.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Jeff Garzik <jgarzik@redhat.com>
---
 drivers/ata/ata_piix.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/ata/ata_piix.c b/drivers/ata/ata_piix.c
index 6e165bf91d65d..d0a14cf2bd741 100644
--- a/drivers/ata/ata_piix.c
+++ b/drivers/ata/ata_piix.c
@@ -1589,6 +1589,7 @@ static int __devinit piix_init_one(struct pci_dev *pdev,
 		host->ports[1]->mwdma_mask = 0;
 		host->ports[1]->udma_mask = 0;
 	}
+	host->flags |= ATA_HOST_PARALLEL_SCAN;
 
 	pci_set_master(pdev);
 	return ata_pci_sff_activate_host(host, ata_sff_interrupt, &piix_sht);
-- 
GitLab


From 5c939df56c3ea018b58e5aa76181284c2053d699 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 27 May 2009 09:16:03 -0400
Subject: [PATCH 1888/2198] btrfs: Fix set/clear_extent_bit for 'end ==
 (u64)-1'

There are some 'start = state->end + 1;' like code in set_extent_bit
and clear_extent_bit. They overflow when end == (u64)-1.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe9eb990e4436..68260180f5871 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -476,6 +476,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	struct extent_state *state;
 	struct extent_state *prealloc = NULL;
 	struct rb_node *node;
+	u64 last_end;
 	int err;
 	int set = 0;
 
@@ -498,6 +499,7 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	if (state->start > end)
 		goto out;
 	WARN_ON(state->end < start);
+	last_end = state->end;
 
 	/*
 	 *     | ---- desired range ---- |
@@ -524,9 +526,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		if (err)
 			goto out;
 		if (state->end <= end) {
-			start = state->end + 1;
 			set |= clear_state_bit(tree, state, bits,
 					wake, delete);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
 		} else {
 			start = state->start;
 		}
@@ -552,8 +556,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		goto out;
 	}
 
-	start = state->end + 1;
 	set |= clear_state_bit(tree, state, bits, wake, delete);
+	if (last_end == (u64)-1)
+		goto out;
+	start = last_end + 1;
 	goto search_again;
 
 out:
@@ -707,8 +713,10 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			goto out;
 		}
 		set_state_bits(tree, state, bits);
-		start = state->end + 1;
 		merge_state(tree, state);
+		if (last_end == (u64)-1)
+			goto out;
+		start = last_end + 1;
 		goto search_again;
 	}
 
@@ -742,8 +750,10 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 			goto out;
 		if (state->end <= end) {
 			set_state_bits(tree, state, bits);
-			start = state->end + 1;
 			merge_state(tree, state);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
 		} else {
 			start = state->start;
 		}
-- 
GitLab


From 5d4f98a28c7d334091c1b7744f48a1acdd2a4ae0 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Wed, 10 Jun 2009 10:45:14 -0400
Subject: [PATCH 1889/2198] Btrfs: Mixed back reference  (FORWARD ROLLING
 FORMAT CHANGE)

This commit introduces a new kind of back reference for btrfs metadata.
Once a filesystem has been mounted with this commit, IT WILL NO LONGER
BE MOUNTABLE BY OLDER KERNELS.

When a tree block in subvolume tree is cow'd, the reference counts of all
extents it points to are increased by one.  At transaction commit time,
the old root of the subvolume is recorded in a "dead root" data structure,
and the btree it points to is later walked, dropping reference counts
and freeing any blocks where the reference count goes to 0.

The increments done during cow and decrements done after commit cancel out,
and the walk is a very expensive way to go about freeing the blocks that
are no longer referenced by the new btree root.  This commit reduces the
transaction overhead by avoiding the need for dead root records.

When a non-shared tree block is cow'd, we free the old block at once, and the
new block inherits old block's references. When a tree block with reference
count > 1 is cow'd, we increase the reference counts of all extents
the new block points to by one, and decrease the old block's reference count by
one.

This dead tree avoidance code removes the need to modify the reference
counts of lower level extents when a non-shared tree block is cow'd.
But we still need to update back ref for all pointers in the block.
This is because the location of the block is recorded in the back ref
item.

We can solve this by introducing a new type of back ref. The new
back ref provides information about pointer's key, level and in which
tree the pointer lives. This information allow us to find the pointer
by searching the tree. The shortcoming of the new back ref is that it
only works for pointers in tree blocks referenced by their owner trees.

This is mostly a problem for snapshots, where resolving one of these
fuzzy back references would be O(number_of_snapshots) and quite slow.
The solution used here is to use the fuzzy back references in the common
case where a given tree block is only referenced by one root,
and use the full back references when multiple roots have a reference
on a given block.

This commit adds per subvolume red-black tree to keep trace of cached
inodes. The red-black tree helps the balancing code to find cached
inodes whose inode numbers within a given range.

This commit improves the balancing code by introducing several data
structures to keep the state of balancing. The most important one
is the back ref cache. It caches how the upper level tree blocks are
referenced. This greatly reduce the overhead of checking back ref.

The improved balancing code scales significantly better with a large
number of snapshots.

This is a very large commit and was written in a number of
pieces.  But, they depend heavily on the disk format change and were
squashed together to make sure git bisect didn't end up in a
bad state wrt space balancing or the format change.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/Makefile      |    4 +-
 fs/btrfs/btrfs_inode.h |    3 +
 fs/btrfs/ctree.c       |  685 ++++----
 fs/btrfs/ctree.h       |  308 ++--
 fs/btrfs/delayed-ref.c |  509 ++++--
 fs/btrfs/delayed-ref.h |   85 +-
 fs/btrfs/disk-io.c     |   95 +-
 fs/btrfs/export.c      |    4 +-
 fs/btrfs/extent-tree.c | 2674 +++++++++++++++++++----------
 fs/btrfs/file.c        |   76 +-
 fs/btrfs/inode.c       |  132 +-
 fs/btrfs/ioctl.c       |   21 +-
 fs/btrfs/print-tree.c  |  155 +-
 fs/btrfs/relocation.c  | 3711 ++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/root-tree.c   |   17 +-
 fs/btrfs/super.c       |   26 +-
 fs/btrfs/transaction.c |  410 ++---
 fs/btrfs/transaction.h |   12 +-
 fs/btrfs/tree-log.c    |  102 +-
 fs/btrfs/volumes.c     |    2 -
 20 files changed, 6958 insertions(+), 2073 deletions(-)
 create mode 100644 fs/btrfs/relocation.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 94212844a9bcf..a35eb36b32fdc 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   transaction.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-	   ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-	   compression.o delayed-ref.o
+	   export.o tree-log.o acl.o free-space-cache.o zlib.o \
+	   compression.o delayed-ref.o relocation.o
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index b30986f00b9d7..ecf5f7d8166fb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -72,6 +72,9 @@ struct btrfs_inode {
 	 */
 	struct list_head ordered_operations;
 
+	/* node for the red-black tree that links inodes in subvolume root */
+	struct rb_node rb_node;
+
 	/* the space_info for where this inode's data allocations are done */
 	struct btrfs_space_info *space_info;
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index fedf8b9f03a2a..2b960278a2f95 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -197,14 +197,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	u32 nritems;
 	int ret = 0;
 	int level;
-	struct btrfs_root *new_root;
-
-	new_root = kmalloc(sizeof(*new_root), GFP_NOFS);
-	if (!new_root)
-		return -ENOMEM;
-
-	memcpy(new_root, root, sizeof(*new_root));
-	new_root->root_key.objectid = new_root_objectid;
+	struct btrfs_disk_key disk_key;
 
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
@@ -212,28 +205,37 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 
 	level = btrfs_header_level(buf);
 	nritems = btrfs_header_nritems(buf);
+	if (level == 0)
+		btrfs_item_key(buf, &disk_key, 0);
+	else
+		btrfs_node_key(buf, &disk_key, 0);
 
-	cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0,
-				     new_root_objectid, trans->transid,
-				     level, buf->start, 0);
-	if (IS_ERR(cow)) {
-		kfree(new_root);
+	cow = btrfs_alloc_free_block(trans, root, buf->len, 0,
+				     new_root_objectid, &disk_key, level,
+				     buf->start, 0);
+	if (IS_ERR(cow))
 		return PTR_ERR(cow);
-	}
 
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
-	btrfs_set_header_owner(cow, new_root_objectid);
-	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+				     BTRFS_HEADER_FLAG_RELOC);
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+	else
+		btrfs_set_header_owner(cow, new_root_objectid);
 
 	write_extent_buffer(cow, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL);
-	kfree(new_root);
+	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
+		ret = btrfs_inc_ref(trans, root, cow, 1);
+	else
+		ret = btrfs_inc_ref(trans, root, cow, 0);
 
 	if (ret)
 		return ret;
@@ -243,6 +245,125 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+/*
+ * check if the tree block can be shared by multiple trees
+ */
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+			      struct extent_buffer *buf)
+{
+	/*
+	 * Tree blocks not in refernece counted trees and tree roots
+	 * are never shared. If a block was allocated after the last
+	 * snapshot and the block was not allocated by tree relocation,
+	 * we know the block is not shared.
+	 */
+	if (root->ref_cows &&
+	    buf != root->node && buf != root->commit_root &&
+	    (btrfs_header_generation(buf) <=
+	     btrfs_root_last_snapshot(&root->root_item) ||
+	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+		return 1;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (root->ref_cows &&
+	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+		return 1;
+#endif
+	return 0;
+}
+
+static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
+				       struct btrfs_root *root,
+				       struct extent_buffer *buf,
+				       struct extent_buffer *cow)
+{
+	u64 refs;
+	u64 owner;
+	u64 flags;
+	u64 new_flags = 0;
+	int ret;
+
+	/*
+	 * Backrefs update rules:
+	 *
+	 * Always use full backrefs for extent pointers in tree block
+	 * allocated by tree relocation.
+	 *
+	 * If a shared tree block is no longer referenced by its owner
+	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
+	 * use full backrefs for extent pointers in tree block.
+	 *
+	 * If a tree block is been relocating
+	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
+	 * use full backrefs for extent pointers in tree block.
+	 * The reason for this is some operations (such as drop tree)
+	 * are only allowed for blocks use full backrefs.
+	 */
+
+	if (btrfs_block_can_be_shared(root, buf)) {
+		ret = btrfs_lookup_extent_info(trans, root, buf->start,
+					       buf->len, &refs, &flags);
+		BUG_ON(ret);
+		BUG_ON(refs == 0);
+	} else {
+		refs = 1;
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		else
+			flags = 0;
+	}
+
+	owner = btrfs_header_owner(buf);
+	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
+	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
+	if (refs > 1) {
+		if ((owner == root->root_key.objectid ||
+		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
+		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+			ret = btrfs_inc_ref(trans, root, buf, 1);
+			BUG_ON(ret);
+
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID) {
+				ret = btrfs_dec_ref(trans, root, buf, 0);
+				BUG_ON(ret);
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+				BUG_ON(ret);
+			}
+			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+		} else {
+
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID)
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+			else
+				ret = btrfs_inc_ref(trans, root, cow, 0);
+			BUG_ON(ret);
+		}
+		if (new_flags != 0) {
+			ret = btrfs_set_disk_extent_flags(trans, root,
+							  buf->start,
+							  buf->len,
+							  new_flags, 0);
+			BUG_ON(ret);
+		}
+	} else {
+		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
+			if (root->root_key.objectid ==
+			    BTRFS_TREE_RELOC_OBJECTID)
+				ret = btrfs_inc_ref(trans, root, cow, 1);
+			else
+				ret = btrfs_inc_ref(trans, root, cow, 0);
+			BUG_ON(ret);
+			ret = btrfs_dec_ref(trans, root, buf, 1);
+			BUG_ON(ret);
+		}
+		clean_tree_block(trans, root, buf);
+	}
+	return 0;
+}
+
 /*
  * does the dirty work in cow of a single block.  The parent block (if
  * supplied) is updated to point to the new cow copy.  The new buffer is marked
@@ -262,34 +383,39 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			     struct extent_buffer **cow_ret,
 			     u64 search_start, u64 empty_size)
 {
-	u64 parent_start;
+	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
-	u32 nritems;
-	int ret = 0;
 	int level;
 	int unlock_orig = 0;
+	u64 parent_start;
 
 	if (*cow_ret == buf)
 		unlock_orig = 1;
 
 	btrfs_assert_tree_locked(buf);
 
-	if (parent)
-		parent_start = parent->start;
-	else
-		parent_start = 0;
-
 	WARN_ON(root->ref_cows && trans->transid !=
 		root->fs_info->running_transaction->transid);
 	WARN_ON(root->ref_cows && trans->transid != root->last_trans);
 
 	level = btrfs_header_level(buf);
-	nritems = btrfs_header_nritems(buf);
 
-	cow = btrfs_alloc_free_block(trans, root, buf->len,
-				     parent_start, root->root_key.objectid,
-				     trans->transid, level,
-				     search_start, empty_size);
+	if (level == 0)
+		btrfs_item_key(buf, &disk_key, 0);
+	else
+		btrfs_node_key(buf, &disk_key, 0);
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent)
+			parent_start = parent->start;
+		else
+			parent_start = 0;
+	} else
+		parent_start = 0;
+
+	cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start,
+				     root->root_key.objectid, &disk_key,
+				     level, search_start, empty_size);
 	if (IS_ERR(cow))
 		return PTR_ERR(cow);
 
@@ -298,83 +424,53 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	copy_extent_buffer(cow, buf, 0, 0, cow->len);
 	btrfs_set_header_bytenr(cow, cow->start);
 	btrfs_set_header_generation(cow, trans->transid);
-	btrfs_set_header_owner(cow, root->root_key.objectid);
-	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN);
+	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
+	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
+				     BTRFS_HEADER_FLAG_RELOC);
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
+	else
+		btrfs_set_header_owner(cow, root->root_key.objectid);
 
 	write_extent_buffer(cow, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	WARN_ON(btrfs_header_generation(buf) > trans->transid);
-	if (btrfs_header_generation(buf) != trans->transid) {
-		u32 nr_extents;
-		ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents);
-		if (ret)
-			return ret;
-
-		ret = btrfs_cache_ref(trans, root, buf, nr_extents);
-		WARN_ON(ret);
-	} else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) {
-		/*
-		 * There are only two places that can drop reference to
-		 * tree blocks owned by living reloc trees, one is here,
-		 * the other place is btrfs_drop_subtree. In both places,
-		 * we check reference count while tree block is locked.
-		 * Furthermore, if reference count is one, it won't get
-		 * increased by someone else.
-		 */
-		u32 refs;
-		ret = btrfs_lookup_extent_ref(trans, root, buf->start,
-					      buf->len, &refs);
-		BUG_ON(ret);
-		if (refs == 1) {
-			ret = btrfs_update_ref(trans, root, buf, cow,
-					       0, nritems);
-			clean_tree_block(trans, root, buf);
-		} else {
-			ret = btrfs_inc_ref(trans, root, buf, cow, NULL);
-		}
-		BUG_ON(ret);
-	} else {
-		ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems);
-		if (ret)
-			return ret;
-		clean_tree_block(trans, root, buf);
-	}
-
-	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
-		ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start);
-		WARN_ON(ret);
-	}
+	update_ref_for_cow(trans, root, buf, cow);
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
+		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
+			parent_start = buf->start;
+		else
+			parent_start = 0;
 
 		spin_lock(&root->node_lock);
 		root->node = cow;
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		if (buf != root->commit_root) {
-			btrfs_free_extent(trans, root, buf->start,
-					  buf->len, buf->start,
-					  root->root_key.objectid,
-					  btrfs_header_generation(buf),
-					  level, 1);
-		}
+		btrfs_free_extent(trans, root, buf->start, buf->len,
+				  parent_start, root->root_key.objectid,
+				  level, 0);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+			parent_start = parent->start;
+		else
+			parent_start = 0;
+
+		WARN_ON(trans->transid != btrfs_header_generation(parent));
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
-		WARN_ON(trans->transid == 0);
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		WARN_ON(btrfs_header_generation(parent) != trans->transid);
 		btrfs_free_extent(trans, root, buf->start, buf->len,
-				  parent_start, btrfs_header_owner(parent),
-				  btrfs_header_generation(parent), level, 1);
+				  parent_start, root->root_key.objectid,
+				  level, 0);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -384,6 +480,18 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static inline int should_cow_block(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct extent_buffer *buf)
+{
+	if (btrfs_header_generation(buf) == trans->transid &&
+	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
+	    !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+	      btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
+		return 0;
+	return 1;
+}
+
 /*
  * cows a single block, see __btrfs_cow_block for the real work.
  * This version of it has extra checks so that a block isn't cow'd more than
@@ -411,9 +519,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 	}
 
-	if (btrfs_header_generation(buf) == trans->transid &&
-	    btrfs_header_owner(buf) == root->root_key.objectid &&
-	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
 		return 0;
 	}
@@ -469,7 +575,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 /*
  * same as comp_keys only with two btrfs_key's
  */
-static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
 {
 	if (k1->objectid > k2->objectid)
 		return 1;
@@ -845,6 +951,12 @@ static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return -1;
 }
 
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		     int level, int *slot)
+{
+	return bin_search(eb, key, level, slot);
+}
+
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  * NULL is returned on error.
@@ -921,13 +1033,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		root->node = child;
 		spin_unlock(&root->node_lock);
 
-		ret = btrfs_update_extent_ref(trans, root, child->start,
-					      child->len,
-					      mid->start, child->start,
-					      root->root_key.objectid,
-					      trans->transid, level - 1);
-		BUG_ON(ret);
-
 		add_root_to_dirty_list(root);
 		btrfs_tree_unlock(child);
 
@@ -938,9 +1043,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		/* once for the path */
 		free_extent_buffer(mid);
 		ret = btrfs_free_extent(trans, root, mid->start, mid->len,
-					mid->start, root->root_key.objectid,
-					btrfs_header_generation(mid),
-					level, 1);
+					0, root->root_key.objectid, level, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
 		return ret;
@@ -998,7 +1101,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
 			u64 bytenr = right->start;
-			u64 generation = btrfs_header_generation(parent);
 			u32 blocksize = right->len;
 
 			clean_tree_block(trans, root, right);
@@ -1010,9 +1112,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			if (wret)
 				ret = wret;
 			wret = btrfs_free_extent(trans, root, bytenr,
-						 blocksize, parent->start,
-						 btrfs_header_owner(parent),
-						 generation, level, 1);
+						 blocksize, 0,
+						 root->root_key.objectid,
+						 level, 0);
 			if (wret)
 				ret = wret;
 		} else {
@@ -1047,7 +1149,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	}
 	if (btrfs_header_nritems(mid) == 0) {
 		/* we've managed to empty the middle node, drop it */
-		u64 root_gen = btrfs_header_generation(parent);
 		u64 bytenr = mid->start;
 		u32 blocksize = mid->len;
 
@@ -1059,9 +1160,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret)
 			ret = wret;
 		wret = btrfs_free_extent(trans, root, bytenr, blocksize,
-					 parent->start,
-					 btrfs_header_owner(parent),
-					 root_gen, level, 1);
+					 0, root->root_key.objectid,
+					 level, 0);
 		if (wret)
 			ret = wret;
 	} else {
@@ -1437,7 +1537,7 @@ noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
 {
 	int i;
 
-	if (path->keep_locks || path->lowest_level)
+	if (path->keep_locks)
 		return;
 
 	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
@@ -1614,10 +1714,17 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 		lowest_unlock = 2;
 
 again:
-	if (p->skip_locking)
-		b = btrfs_root_node(root);
-	else
-		b = btrfs_lock_root_node(root);
+	if (p->search_commit_root) {
+		b = root->commit_root;
+		extent_buffer_get(b);
+		if (!p->skip_locking)
+			btrfs_tree_lock(b);
+	} else {
+		if (p->skip_locking)
+			b = btrfs_root_node(root);
+		else
+			b = btrfs_lock_root_node(root);
+	}
 
 	while (b) {
 		level = btrfs_header_level(b);
@@ -1638,11 +1745,9 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 			 * then we don't want to set the path blocking,
 			 * so we test it here
 			 */
-			if (btrfs_header_generation(b) == trans->transid &&
-			    btrfs_header_owner(b) == root->root_key.objectid &&
-			    !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
+			if (!should_cow_block(trans, root, b))
 				goto cow_done;
-			}
+
 			btrfs_set_path_blocking(p);
 
 			wret = btrfs_cow_block(trans, root, b,
@@ -1764,138 +1869,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	return ret;
 }
 
-int btrfs_merge_path(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_key *node_keys,
-		     u64 *nodes, int lowest_level)
-{
-	struct extent_buffer *eb;
-	struct extent_buffer *parent;
-	struct btrfs_key key;
-	u64 bytenr;
-	u64 generation;
-	u32 blocksize;
-	int level;
-	int slot;
-	int key_match;
-	int ret;
-
-	eb = btrfs_lock_root_node(root);
-	ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
-	BUG_ON(ret);
-
-	btrfs_set_lock_blocking(eb);
-
-	parent = eb;
-	while (1) {
-		level = btrfs_header_level(parent);
-		if (level == 0 || level <= lowest_level)
-			break;
-
-		ret = bin_search(parent, &node_keys[lowest_level], level,
-				 &slot);
-		if (ret && slot > 0)
-			slot--;
-
-		bytenr = btrfs_node_blockptr(parent, slot);
-		if (nodes[level - 1] == bytenr)
-			break;
-
-		blocksize = btrfs_level_size(root, level - 1);
-		generation = btrfs_node_ptr_generation(parent, slot);
-		btrfs_node_key_to_cpu(eb, &key, slot);
-		key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key));
-
-		if (generation == trans->transid) {
-			eb = read_tree_block(root, bytenr, blocksize,
-					     generation);
-			btrfs_tree_lock(eb);
-			btrfs_set_lock_blocking(eb);
-		}
-
-		/*
-		 * if node keys match and node pointer hasn't been modified
-		 * in the running transaction, we can merge the path. for
-		 * blocks owened by reloc trees, the node pointer check is
-		 * skipped, this is because these blocks are fully controlled
-		 * by the space balance code, no one else can modify them.
-		 */
-		if (!nodes[level - 1] || !key_match ||
-		    (generation == trans->transid &&
-		     btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) {
-			if (level == 1 || level == lowest_level + 1) {
-				if (generation == trans->transid) {
-					btrfs_tree_unlock(eb);
-					free_extent_buffer(eb);
-				}
-				break;
-			}
-
-			if (generation != trans->transid) {
-				eb = read_tree_block(root, bytenr, blocksize,
-						generation);
-				btrfs_tree_lock(eb);
-				btrfs_set_lock_blocking(eb);
-			}
-
-			ret = btrfs_cow_block(trans, root, eb, parent, slot,
-					      &eb);
-			BUG_ON(ret);
-
-			if (root->root_key.objectid ==
-			    BTRFS_TREE_RELOC_OBJECTID) {
-				if (!nodes[level - 1]) {
-					nodes[level - 1] = eb->start;
-					memcpy(&node_keys[level - 1], &key,
-					       sizeof(node_keys[0]));
-				} else {
-					WARN_ON(1);
-				}
-			}
-
-			btrfs_tree_unlock(parent);
-			free_extent_buffer(parent);
-			parent = eb;
-			continue;
-		}
-
-		btrfs_set_node_blockptr(parent, slot, nodes[level - 1]);
-		btrfs_set_node_ptr_generation(parent, slot, trans->transid);
-		btrfs_mark_buffer_dirty(parent);
-
-		ret = btrfs_inc_extent_ref(trans, root,
-					nodes[level - 1],
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					level - 1);
-		BUG_ON(ret);
-
-		/*
-		 * If the block was created in the running transaction,
-		 * it's possible this is the last reference to it, so we
-		 * should drop the subtree.
-		 */
-		if (generation == trans->transid) {
-			ret = btrfs_drop_subtree(trans, root, eb, parent);
-			BUG_ON(ret);
-			btrfs_tree_unlock(eb);
-			free_extent_buffer(eb);
-		} else {
-			ret = btrfs_free_extent(trans, root, bytenr,
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					level - 1, 1);
-			BUG_ON(ret);
-		}
-		break;
-	}
-	btrfs_tree_unlock(parent);
-	free_extent_buffer(parent);
-	return 0;
-}
-
 /*
  * adjust the pointers going up the tree, starting at level
  * making sure the right key of each node is points to 'key'.
@@ -2021,9 +1994,6 @@ static int push_node_left(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
 
-	ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items);
-	BUG_ON(ret);
-
 	return ret;
 }
 
@@ -2083,9 +2053,6 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(src);
 	btrfs_mark_buffer_dirty(dst);
 
-	ret = btrfs_update_ref(trans, root, src, dst, 0, push_items);
-	BUG_ON(ret);
-
 	return ret;
 }
 
@@ -2105,7 +2072,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	struct extent_buffer *c;
 	struct extent_buffer *old;
 	struct btrfs_disk_key lower_key;
-	int ret;
 
 	BUG_ON(path->nodes[level]);
 	BUG_ON(path->nodes[level-1] != root->node);
@@ -2117,16 +2083,17 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 		btrfs_node_key(lower, &lower_key, 0);
 
 	c = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
-				   root->root_key.objectid, trans->transid,
+				   root->root_key.objectid, &lower_key,
 				   level, root->node->start, 0);
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
-	memset_extent_buffer(c, 0, 0, root->nodesize);
+	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
 	btrfs_set_header_bytenr(c, c->start);
 	btrfs_set_header_generation(c, trans->transid);
+	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 
 	write_extent_buffer(c, root->fs_info->fsid,
@@ -2151,12 +2118,6 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	root->node = c;
 	spin_unlock(&root->node_lock);
 
-	ret = btrfs_update_extent_ref(trans, root, lower->start,
-				      lower->len, lower->start, c->start,
-				      root->root_key.objectid,
-				      trans->transid, level - 1);
-	BUG_ON(ret);
-
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(old);
 
@@ -2244,20 +2205,21 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	}
 
 	c_nritems = btrfs_header_nritems(c);
+	mid = (c_nritems + 1) / 2;
+	btrfs_node_key(c, &disk_key, mid);
 
-	split = btrfs_alloc_free_block(trans, root, root->nodesize,
-					path->nodes[level + 1]->start,
+	split = btrfs_alloc_free_block(trans, root, root->nodesize, 0,
 					root->root_key.objectid,
-					trans->transid, level, c->start, 0);
+					&disk_key, level, c->start, 0);
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
-	btrfs_set_header_flags(split, btrfs_header_flags(c));
+	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_level(split, btrfs_header_level(c));
 	btrfs_set_header_bytenr(split, split->start);
 	btrfs_set_header_generation(split, trans->transid);
+	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(split, root->root_key.objectid);
-	btrfs_set_header_flags(split, 0);
 	write_extent_buffer(split, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(split),
 			    BTRFS_FSID_SIZE);
@@ -2265,7 +2227,6 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
 
-	mid = (c_nritems + 1) / 2;
 
 	copy_extent_buffer(split, c,
 			   btrfs_node_key_ptr_offset(0),
@@ -2278,16 +2239,12 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(c);
 	btrfs_mark_buffer_dirty(split);
 
-	btrfs_node_key(split, &disk_key, 0);
 	wret = insert_ptr(trans, root, path, &disk_key, split->start,
 			  path->slots[level + 1] + 1,
 			  level + 1);
 	if (wret)
 		ret = wret;
 
-	ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid);
-	BUG_ON(ret);
-
 	if (path->slots[level] >= mid) {
 		path->slots[level] -= mid;
 		btrfs_tree_unlock(c);
@@ -2360,7 +2317,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 	u32 right_nritems;
 	u32 data_end;
 	u32 this_item_size;
-	int ret;
 
 	if (empty)
 		nr = 0;
@@ -2473,9 +2429,6 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 		btrfs_mark_buffer_dirty(left);
 	btrfs_mark_buffer_dirty(right);
 
-	ret = btrfs_update_ref(trans, root, left, right, 0, push_items);
-	BUG_ON(ret);
-
 	btrfs_item_key(right, &disk_key, 0);
 	btrfs_set_node_key(upper, &disk_key, slot + 1);
 	btrfs_mark_buffer_dirty(upper);
@@ -2720,10 +2673,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
 
-	ret = btrfs_update_ref(trans, root, right, left,
-			       old_left_nritems, push_items);
-	BUG_ON(ret);
-
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
 	if (wret)
@@ -2880,9 +2829,6 @@ static noinline int copy_for_split(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(l);
 	BUG_ON(path->slots[0] != slot);
 
-	ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
-	BUG_ON(ret);
-
 	if (mid <= slot) {
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
@@ -2911,6 +2857,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			       struct btrfs_path *path, int data_size,
 			       int extend)
 {
+	struct btrfs_disk_key disk_key;
 	struct extent_buffer *l;
 	u32 nritems;
 	int mid;
@@ -2918,7 +2865,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	struct extent_buffer *right;
 	int ret = 0;
 	int wret;
-	int double_split;
+	int split;
 	int num_doubles = 0;
 
 	/* first try to make some room by pushing left and right */
@@ -2945,16 +2892,53 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			return ret;
 	}
 again:
-	double_split = 0;
+	split = 1;
 	l = path->nodes[0];
 	slot = path->slots[0];
 	nritems = btrfs_header_nritems(l);
 	mid = (nritems + 1) / 2;
 
-	right = btrfs_alloc_free_block(trans, root, root->leafsize,
-					path->nodes[1]->start,
+	if (mid <= slot) {
+		if (nritems == 1 ||
+		    leaf_space_used(l, mid, nritems - mid) + data_size >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (slot >= nritems) {
+				split = 0;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					split = 2;
+				}
+			}
+		}
+	} else {
+		if (leaf_space_used(l, 0, mid) + data_size >
+			BTRFS_LEAF_DATA_SIZE(root)) {
+			if (!extend && data_size && slot == 0) {
+				split = 0;
+			} else if ((extend || !data_size) && slot == 0) {
+				mid = 1;
+			} else {
+				mid = slot;
+				if (mid != nritems &&
+				    leaf_space_used(l, mid, nritems - mid) +
+				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
+					split = 2 ;
+				}
+			}
+		}
+	}
+
+	if (split == 0)
+		btrfs_cpu_key_to_disk(&disk_key, ins_key);
+	else
+		btrfs_item_key(l, &disk_key, mid);
+
+	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
-					trans->transid, 0, l->start, 0);
+					&disk_key, 0, l->start, 0);
 	if (IS_ERR(right)) {
 		BUG_ON(1);
 		return PTR_ERR(right);
@@ -2963,6 +2947,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
 	btrfs_set_header_generation(right, trans->transid);
+	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
 	write_extent_buffer(right, root->fs_info->fsid,
@@ -2973,79 +2958,47 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
-	if (mid <= slot) {
-		if (nritems == 1 ||
-		    leaf_space_used(l, mid, nritems - mid) + data_size >
-			BTRFS_LEAF_DATA_SIZE(root)) {
-			if (slot >= nritems) {
-				struct btrfs_disk_key disk_key;
-
-				btrfs_cpu_key_to_disk(&disk_key, ins_key);
-				btrfs_set_header_nritems(right, 0);
-				wret = insert_ptr(trans, root, path,
-						  &disk_key, right->start,
-						  path->slots[1] + 1, 1);
-				if (wret)
-					ret = wret;
+	if (split == 0) {
+		if (mid <= slot) {
+			btrfs_set_header_nritems(right, 0);
+			wret = insert_ptr(trans, root, path,
+					  &disk_key, right->start,
+					  path->slots[1] + 1, 1);
+			if (wret)
+				ret = wret;
 
-				btrfs_tree_unlock(path->nodes[0]);
-				free_extent_buffer(path->nodes[0]);
-				path->nodes[0] = right;
-				path->slots[0] = 0;
-				path->slots[1] += 1;
-				btrfs_mark_buffer_dirty(right);
-				return ret;
-			}
-			mid = slot;
-			if (mid != nritems &&
-			    leaf_space_used(l, mid, nritems - mid) +
-			    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
-				double_split = 1;
-			}
-		}
-	} else {
-		if (leaf_space_used(l, 0, mid) + data_size >
-			BTRFS_LEAF_DATA_SIZE(root)) {
-			if (!extend && data_size && slot == 0) {
-				struct btrfs_disk_key disk_key;
-
-				btrfs_cpu_key_to_disk(&disk_key, ins_key);
-				btrfs_set_header_nritems(right, 0);
-				wret = insert_ptr(trans, root, path,
-						  &disk_key,
-						  right->start,
-						  path->slots[1], 1);
+			btrfs_tree_unlock(path->nodes[0]);
+			free_extent_buffer(path->nodes[0]);
+			path->nodes[0] = right;
+			path->slots[0] = 0;
+			path->slots[1] += 1;
+		} else {
+			btrfs_set_header_nritems(right, 0);
+			wret = insert_ptr(trans, root, path,
+					  &disk_key,
+					  right->start,
+					  path->slots[1], 1);
+			if (wret)
+				ret = wret;
+			btrfs_tree_unlock(path->nodes[0]);
+			free_extent_buffer(path->nodes[0]);
+			path->nodes[0] = right;
+			path->slots[0] = 0;
+			if (path->slots[1] == 0) {
+				wret = fixup_low_keys(trans, root,
+						path, &disk_key, 1);
 				if (wret)
 					ret = wret;
-				btrfs_tree_unlock(path->nodes[0]);
-				free_extent_buffer(path->nodes[0]);
-				path->nodes[0] = right;
-				path->slots[0] = 0;
-				if (path->slots[1] == 0) {
-					wret = fixup_low_keys(trans, root,
-						      path, &disk_key, 1);
-					if (wret)
-						ret = wret;
-				}
-				btrfs_mark_buffer_dirty(right);
-				return ret;
-			} else if ((extend || !data_size) && slot == 0) {
-				mid = 1;
-			} else {
-				mid = slot;
-				if (mid != nritems &&
-				    leaf_space_used(l, mid, nritems - mid) +
-				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
-					double_split = 1;
-				}
 			}
 		}
+		btrfs_mark_buffer_dirty(right);
+		return ret;
 	}
 
 	ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
 	BUG_ON(ret);
 
-	if (double_split) {
+	if (split == 2) {
 		BUG_ON(num_doubles != 0);
 		num_doubles++;
 		goto again;
@@ -3447,7 +3400,7 @@ int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
 		/* figure out how many keys we can insert in here */
 		total_data = data_size[0];
 		for (i = 1; i < nr; i++) {
-			if (comp_cpu_keys(&found_key, cpu_key + i) <= 0)
+			if (btrfs_comp_cpu_keys(&found_key, cpu_key + i) <= 0)
 				break;
 			total_data += data_size[i];
 		}
@@ -3745,9 +3698,7 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 /*
  * a helper function to delete the leaf pointed to by path->slots[1] and
- * path->nodes[1].  bytenr is the node block pointer, but since the callers
- * already know it, it is faster to have them pass it down than to
- * read it out of the node again.
+ * path->nodes[1].
  *
  * This deletes the pointer in path->nodes[1] and frees the leaf
  * block extent.  zero is returned if it all worked out, < 0 otherwise.
@@ -3755,15 +3706,14 @@ static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  * The path must have already been setup for deleting the leaf, including
  * all the proper balancing.  path->nodes[1] must be locked.
  */
-noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path, u64 bytenr)
+static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path,
+				   struct extent_buffer *leaf)
 {
 	int ret;
-	u64 root_gen = btrfs_header_generation(path->nodes[1]);
-	u64 parent_start = path->nodes[1]->start;
-	u64 parent_owner = btrfs_header_owner(path->nodes[1]);
 
+	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
 	ret = del_ptr(trans, root, path, 1, path->slots[1]);
 	if (ret)
 		return ret;
@@ -3774,10 +3724,8 @@ noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_extent(trans, root, bytenr,
-				btrfs_level_size(root, 0),
-				parent_start, parent_owner,
-				root_gen, 0, 1);
+	ret = btrfs_free_extent(trans, root, leaf->start, leaf->len,
+				0, root->root_key.objectid, 0, 0);
 	return ret;
 }
 /*
@@ -3845,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
-			ret = btrfs_del_leaf(trans, root, path, leaf->start);
+			ret = btrfs_del_leaf(trans, root, path, leaf);
 			BUG_ON(ret);
 		}
 	} else {
@@ -3884,8 +3832,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 
 			if (btrfs_header_nritems(leaf) == 0) {
 				path->slots[1] = slot;
-				ret = btrfs_del_leaf(trans, root, path,
-						     leaf->start);
+				ret = btrfs_del_leaf(trans, root, path, leaf);
 				BUG_ON(ret);
 				free_extent_buffer(leaf);
 			} else {
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4414a5d9983a9..ce3ab4e130642 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,8 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_MAX_LEVEL 8
 
+#define BTRFS_COMPAT_EXTENT_TREE_V0
+
 /*
  * files bigger than this get some pre-flushing when they are added
  * to the ordered operations list.  That way we limit the total
@@ -267,7 +269,18 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 }
 
 #define BTRFS_FSID_SIZE 16
-#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0)
+#define BTRFS_HEADER_FLAG_WRITTEN	(1ULL << 0)
+#define BTRFS_HEADER_FLAG_RELOC		(1ULL << 1)
+#define BTRFS_SUPER_FLAG_SEEDING	(1ULL << 32)
+#define BTRFS_SUPER_FLAG_METADUMP	(1ULL << 33)
+
+#define BTRFS_BACKREF_REV_MAX		256
+#define BTRFS_BACKREF_REV_SHIFT		56
+#define BTRFS_BACKREF_REV_MASK		(((u64)BTRFS_BACKREF_REV_MAX - 1) << \
+					 BTRFS_BACKREF_REV_SHIFT)
+
+#define BTRFS_OLD_BACKREF_REV		0
+#define BTRFS_MIXED_BACKREF_REV		1
 
 /*
  * every tree block (leaf or node) starts with this header.
@@ -296,7 +309,6 @@ struct btrfs_header {
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
-#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
 
 /*
  * this is a very generous portion of the super block, giving us
@@ -355,9 +367,12 @@ struct btrfs_super_block {
  * Compat flags that we support.  If any incompat flags are set other than the
  * ones specified below then we will fail to mount
  */
-#define BTRFS_FEATURE_COMPAT_SUPP	0x0
-#define BTRFS_FEATURE_COMPAT_RO_SUPP	0x0
-#define BTRFS_FEATURE_INCOMPAT_SUPP	0x0
+#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+
+#define BTRFS_FEATURE_COMPAT_SUPP		0ULL
+#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
+#define BTRFS_FEATURE_INCOMPAT_SUPP		\
+	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
 
 /*
  * A leaf is full of items. offset and size tell us where to find
@@ -421,23 +436,65 @@ struct btrfs_path {
 	unsigned int keep_locks:1;
 	unsigned int skip_locking:1;
 	unsigned int leave_spinning:1;
+	unsigned int search_commit_root:1;
 };
 
 /*
  * items in the extent btree are used to record the objectid of the
  * owner of the block and the number of references
  */
+
 struct btrfs_extent_item {
+	__le64 refs;
+	__le64 generation;
+	__le64 flags;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_item_v0 {
 	__le32 refs;
 } __attribute__ ((__packed__));
 
-struct btrfs_extent_ref {
+#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r) >> 4) - \
+					sizeof(struct btrfs_item))
+
+#define BTRFS_EXTENT_FLAG_DATA		(1ULL << 0)
+#define BTRFS_EXTENT_FLAG_TREE_BLOCK	(1ULL << 1)
+
+/* following flags only apply to tree blocks */
+
+/* use full backrefs for extent pointers in the block */
+#define BTRFS_BLOCK_FLAG_FULL_BACKREF	(1ULL << 8)
+
+struct btrfs_tree_block_info {
+	struct btrfs_disk_key key;
+	u8 level;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_data_ref {
+	__le64 root;
+	__le64 objectid;
+	__le64 offset;
+	__le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_shared_data_ref {
+	__le32 count;
+} __attribute__ ((__packed__));
+
+struct btrfs_extent_inline_ref {
+	u8 type;
+	u64 offset;
+} __attribute__ ((__packed__));
+
+/* old style backrefs item */
+struct btrfs_extent_ref_v0 {
 	__le64 root;
 	__le64 generation;
 	__le64 objectid;
-	__le32 num_refs;
+	__le32 count;
 } __attribute__ ((__packed__));
 
+
 /* dev extents record free space on individual devices.  The owner
  * field points back to the chunk allocation mapping tree that allocated
  * the extent.  The chunk tree uuid field is a way to double check the owner
@@ -695,12 +752,7 @@ struct btrfs_block_group_cache {
 	struct list_head cluster_list;
 };
 
-struct btrfs_leaf_ref_tree {
-	struct rb_root root;
-	struct list_head list;
-	spinlock_t lock;
-};
-
+struct reloc_control;
 struct btrfs_device;
 struct btrfs_fs_devices;
 struct btrfs_fs_info {
@@ -831,18 +883,11 @@ struct btrfs_fs_info {
 	struct task_struct *cleaner_kthread;
 	int thread_pool_size;
 
-	/* tree relocation relocated fields */
-	struct list_head dead_reloc_roots;
-	struct btrfs_leaf_ref_tree reloc_ref_tree;
-	struct btrfs_leaf_ref_tree shared_ref_tree;
-
 	struct kobject super_kobj;
 	struct completion kobj_unregister;
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
-	atomic_t throttles;
-	atomic_t throttle_gen;
 
 	u64 total_pinned;
 
@@ -861,6 +906,8 @@ struct btrfs_fs_info {
 	 */
 	struct list_head space_info;
 
+	struct reloc_control *reloc_ctl;
+
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
@@ -891,7 +938,6 @@ struct btrfs_fs_info {
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
  */
-struct btrfs_dirty_root;
 struct btrfs_root {
 	struct extent_buffer *node;
 
@@ -899,9 +945,6 @@ struct btrfs_root {
 	spinlock_t node_lock;
 
 	struct extent_buffer *commit_root;
-	struct btrfs_leaf_ref_tree *ref_tree;
-	struct btrfs_leaf_ref_tree ref_tree_struct;
-	struct btrfs_dirty_root *dirty_root;
 	struct btrfs_root *log_root;
 	struct btrfs_root *reloc_root;
 
@@ -952,10 +995,15 @@ struct btrfs_root {
 	/* the dirty list is only used by non-reference counted roots */
 	struct list_head dirty_list;
 
+	struct list_head root_list;
+
 	spinlock_t list_lock;
-	struct list_head dead_list;
 	struct list_head orphan_list;
 
+	spinlock_t inode_lock;
+	/* red-black tree that keeps track of in-memory inodes */
+	struct rb_root inode_tree;
+
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat.  It may be used for more later
@@ -1017,7 +1065,16 @@ struct btrfs_root {
  * are used, and how many references there are to each block
  */
 #define BTRFS_EXTENT_ITEM_KEY	168
-#define BTRFS_EXTENT_REF_KEY	180
+
+#define BTRFS_TREE_BLOCK_REF_KEY	176
+
+#define BTRFS_EXTENT_DATA_REF_KEY	178
+
+#define BTRFS_EXTENT_REF_V0_KEY		180
+
+#define BTRFS_SHARED_BLOCK_REF_KEY	182
+
+#define BTRFS_SHARED_DATA_REF_KEY	184
 
 /*
  * block groups give us hints into the extent allocation trees.  Which
@@ -1317,24 +1374,67 @@ static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 	return (u8 *)((unsigned long)dev + ptr);
 }
 
-/* struct btrfs_extent_ref */
-BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
-BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
-BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64);
-BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32);
+BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
+BTRFS_SETGET_FUNCS(extent_generation, struct btrfs_extent_item,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(extent_flags, struct btrfs_extent_item, flags, 64);
 
-BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref,
-			 generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref,
-			 objectid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref,
-			 num_refs, 32);
+BTRFS_SETGET_FUNCS(extent_refs_v0, struct btrfs_extent_item_v0, refs, 32);
+
+
+BTRFS_SETGET_FUNCS(tree_block_level, struct btrfs_tree_block_info, level, 8);
+
+static inline void btrfs_tree_block_key(struct extent_buffer *eb,
+					struct btrfs_tree_block_info *item,
+					struct btrfs_disk_key *key)
+{
+	read_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
+
+static inline void btrfs_set_tree_block_key(struct extent_buffer *eb,
+					    struct btrfs_tree_block_info *item,
+					    struct btrfs_disk_key *key)
+{
+	write_eb_member(eb, item, struct btrfs_tree_block_info, key, key);
+}
 
-/* struct btrfs_extent_item */
-BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item,
-			 refs, 32);
+BTRFS_SETGET_FUNCS(extent_data_ref_root, struct btrfs_extent_data_ref,
+		   root, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_objectid, struct btrfs_extent_data_ref,
+		   objectid, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_offset, struct btrfs_extent_data_ref,
+		   offset, 64);
+BTRFS_SETGET_FUNCS(extent_data_ref_count, struct btrfs_extent_data_ref,
+		   count, 32);
+
+BTRFS_SETGET_FUNCS(shared_data_ref_count, struct btrfs_shared_data_ref,
+		   count, 32);
+
+BTRFS_SETGET_FUNCS(extent_inline_ref_type, struct btrfs_extent_inline_ref,
+		   type, 8);
+BTRFS_SETGET_FUNCS(extent_inline_ref_offset, struct btrfs_extent_inline_ref,
+		   offset, 64);
+
+static inline u32 btrfs_extent_inline_ref_size(int type)
+{
+	if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    type == BTRFS_SHARED_BLOCK_REF_KEY)
+		return sizeof(struct btrfs_extent_inline_ref);
+	if (type == BTRFS_SHARED_DATA_REF_KEY)
+		return sizeof(struct btrfs_shared_data_ref) +
+		       sizeof(struct btrfs_extent_inline_ref);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY)
+		return sizeof(struct btrfs_extent_data_ref) +
+		       offsetof(struct btrfs_extent_inline_ref, offset);
+	BUG();
+	return 0;
+}
+
+BTRFS_SETGET_FUNCS(ref_root_v0, struct btrfs_extent_ref_v0, root, 64);
+BTRFS_SETGET_FUNCS(ref_generation_v0, struct btrfs_extent_ref_v0,
+		   generation, 64);
+BTRFS_SETGET_FUNCS(ref_objectid_v0, struct btrfs_extent_ref_v0, objectid, 64);
+BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
 
 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
@@ -1558,6 +1658,21 @@ static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag)
 	return (flags & flag) == flag;
 }
 
+static inline int btrfs_header_backref_rev(struct extent_buffer *eb)
+{
+	u64 flags = btrfs_header_flags(eb);
+	return flags >> BTRFS_BACKREF_REV_SHIFT;
+}
+
+static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
+						int rev)
+{
+	u64 flags = btrfs_header_flags(eb);
+	flags &= ~BTRFS_BACKREF_REV_MASK;
+	flags |= (u64)rev << BTRFS_BACKREF_REV_SHIFT;
+	btrfs_set_header_flags(eb, flags);
+}
+
 static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
 {
 	unsigned long ptr = offsetof(struct btrfs_header, fsid);
@@ -1790,39 +1905,32 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 objectid, u64 bytenr);
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset, u64 bytenr);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
 						 struct btrfs_fs_info *info,
 						 u64 bytenr);
+void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 u64 btrfs_find_block_group(struct btrfs_root *root,
 			   u64 search_start, u64 search_hint, int owner);
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
-					     u32 blocksize, u64 parent,
-					     u64 root_objectid,
-					     u64 ref_generation,
-					     int level,
-					     u64 hint,
-					     u64 empty_size);
+					struct btrfs_root *root, u32 blocksize,
+					u64 parent, u64 root_objectid,
+					struct btrfs_disk_key *key, int level,
+					u64 hint, u64 empty_size);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 					    struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize,
 					    int level);
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 parent, u64 min_bytes,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data);
-int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins);
-int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins);
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 root_objectid, u64 owner,
+				     u64 offset, struct btrfs_key *ins);
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   u64 root_objectid, u64 owner, u64 offset,
+				   struct btrfs_key *ins);
 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root,
 				  u64 num_bytes, u64 min_alloc_size,
@@ -1830,18 +1938,18 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 				  u64 search_end, struct btrfs_key *ins,
 				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		  struct extent_buffer *orig_buf, struct extent_buffer *buf,
-		  u32 *nr_extents);
-int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
-		    struct extent_buffer *buf, u32 nr_extents);
-int btrfs_update_ref(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct extent_buffer *orig_buf,
-		     struct extent_buffer *buf, int start_slot, int nr);
+		  struct extent_buffer *buf, int full_backref);
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref);
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 flags,
+				int is_data);
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, int pin);
+		      u64 root_objectid, u64 owner, u64 offset);
+
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
@@ -1849,13 +1957,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 			 struct btrfs_root *root,
 			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid);
-int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr, u64 num_bytes,
-			    u64 orig_parent, u64 parent,
-			    u64 root_objectid, u64 ref_generation,
-			    u64 owner_objectid);
+			 u64 root_objectid, u64 owner, u64 offset);
+
 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 				    struct btrfs_root *root);
 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr);
@@ -1867,16 +1970,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 			   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
-int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root);
-int btrfs_drop_dead_reloc_roots(struct btrfs_root *root);
-int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
-			       struct btrfs_root *root,
-			       struct extent_buffer *buf, u64 orig_start);
-int btrfs_add_dead_reloc_root(struct btrfs_root *root);
-int btrfs_cleanup_reloc_trees(struct btrfs_root *root);
-int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+				struct btrfs_block_group_cache *group);
+
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
@@ -1891,13 +1987,12 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			      u64 bytes);
 /* ctree.c */
+int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
+		     int level, int *slot);
+int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2);
 int btrfs_previous_item(struct btrfs_root *root,
 			struct btrfs_path *path, u64 min_objectid,
 			int type);
-int btrfs_merge_path(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root,
-		     struct btrfs_key *node_keys,
-		     u64 *nodes, int lowest_level);
 int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root, struct btrfs_path *path,
 			    struct btrfs_key *new_key);
@@ -1918,6 +2013,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
 		      struct extent_buffer **cow_ret, u64 new_root_objectid);
+int btrfs_block_can_be_shared(struct btrfs_root *root,
+			      struct extent_buffer *buf);
 int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root
 		      *root, struct btrfs_path *path, u32 data_size);
 int btrfs_truncate_item(struct btrfs_trans_handle *trans,
@@ -1944,9 +2041,6 @@ void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
 
 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		   struct btrfs_path *path, int slot, int nr);
-int btrfs_del_leaf(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root,
-			    struct btrfs_path *path, u64 bytenr);
 static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 struct btrfs_path *path)
@@ -2005,8 +2099,9 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
 			 btrfs_root_item *item, struct btrfs_key *key);
 int btrfs_search_root(struct btrfs_root *root, u64 search_start,
 		      u64 *found_objectid);
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
-			  struct btrfs_root *latest_root);
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
+int btrfs_set_root_node(struct btrfs_root_item *item,
+			struct extent_buffer *node);
 /* dir-item.c */
 int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, const char *name,
@@ -2139,7 +2234,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
-void btrfs_read_locked_inode(struct inode *inode);
 int btrfs_write_inode(struct inode *inode, int wait);
 void btrfs_dirty_inode(struct inode *inode);
 struct inode *btrfs_alloc_inode(struct super_block *sb);
@@ -2147,12 +2241,8 @@ void btrfs_destroy_inode(struct inode *inode);
 int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
-struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
-			    struct btrfs_root *root, int wait);
-struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
-				struct btrfs_root *root);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *is_new);
+			 struct btrfs_root *root);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2209,4 +2299,12 @@ int btrfs_check_acl(struct inode *inode, int mask);
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
+/* relocation.c */
+int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start);
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root);
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root);
+int btrfs_recover_relocation(struct btrfs_root *root);
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index d6c01c096a40b..84e6781413b17 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -29,27 +29,87 @@
  * add extents in the middle of btrfs_search_slot, and it allows
  * us to buffer up frequently modified backrefs in an rb tree instead
  * of hammering updates on the extent allocation tree.
- *
- * Right now this code is only used for reference counted trees, but
- * the long term goal is to get rid of the similar code for delayed
- * extent tree modifications.
  */
 
 /*
- * entries in the rb tree are ordered by the byte number of the extent
- * and by the byte number of the parent block.
+ * compare two delayed tree backrefs with same bytenr and type
+ */
+static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref2,
+			  struct btrfs_delayed_tree_ref *ref1)
+{
+	if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * compare two delayed data backrefs with same bytenr and type
  */
-static int comp_entry(struct btrfs_delayed_ref_node *ref,
-		      u64 bytenr, u64 parent)
+static int comp_data_refs(struct btrfs_delayed_data_ref *ref2,
+			  struct btrfs_delayed_data_ref *ref1)
 {
-	if (bytenr < ref->bytenr)
+	if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		if (ref1->root < ref2->root)
+			return -1;
+		if (ref1->root > ref2->root)
+			return 1;
+		if (ref1->objectid < ref2->objectid)
+			return -1;
+		if (ref1->objectid > ref2->objectid)
+			return 1;
+		if (ref1->offset < ref2->offset)
+			return -1;
+		if (ref1->offset > ref2->offset)
+			return 1;
+	} else {
+		if (ref1->parent < ref2->parent)
+			return -1;
+		if (ref1->parent > ref2->parent)
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent,
+ * type of the delayed backrefs and content of delayed backrefs.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref2,
+		      struct btrfs_delayed_ref_node *ref1)
+{
+	if (ref1->bytenr < ref2->bytenr)
 		return -1;
-	if (bytenr > ref->bytenr)
+	if (ref1->bytenr > ref2->bytenr)
 		return 1;
-	if (parent < ref->parent)
+	if (ref1->is_head && ref2->is_head)
+		return 0;
+	if (ref2->is_head)
 		return -1;
-	if (parent > ref->parent)
+	if (ref1->is_head)
 		return 1;
+	if (ref1->type < ref2->type)
+		return -1;
+	if (ref1->type > ref2->type)
+		return 1;
+	if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2),
+				      btrfs_delayed_node_to_tree_ref(ref1));
+	} else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		   ref1->type == BTRFS_SHARED_DATA_REF_KEY) {
+		return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2),
+				      btrfs_delayed_node_to_data_ref(ref1));
+	}
+	BUG();
 	return 0;
 }
 
@@ -59,20 +119,21 @@ static int comp_entry(struct btrfs_delayed_ref_node *ref,
  * inserted.
  */
 static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
-						  u64 bytenr, u64 parent,
 						  struct rb_node *node)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent_node = NULL;
 	struct btrfs_delayed_ref_node *entry;
+	struct btrfs_delayed_ref_node *ins;
 	int cmp;
 
+	ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	while (*p) {
 		parent_node = *p;
 		entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
 				 rb_node);
 
-		cmp = comp_entry(entry, bytenr, parent);
+		cmp = comp_entry(entry, ins);
 		if (cmp < 0)
 			p = &(*p)->rb_left;
 		else if (cmp > 0)
@@ -81,18 +142,17 @@ static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
 			return entry;
 	}
 
-	entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
 	rb_link_node(node, parent_node, p);
 	rb_insert_color(node, root);
 	return NULL;
 }
 
 /*
- * find an entry based on (bytenr,parent).  This returns the delayed
- * ref if it was able to find one, or NULL if nothing was in that spot
+ * find an head entry based on bytenr. This returns the delayed ref
+ * head if it was able to find one, or NULL if nothing was in that spot
  */
-static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
-				  u64 bytenr, u64 parent,
+static struct btrfs_delayed_ref_node *find_ref_head(struct rb_root *root,
+				  u64 bytenr,
 				  struct btrfs_delayed_ref_node **last)
 {
 	struct rb_node *n = root->rb_node;
@@ -105,7 +165,15 @@ static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
 		if (last)
 			*last = entry;
 
-		cmp = comp_entry(entry, bytenr, parent);
+		if (bytenr < entry->bytenr)
+			cmp = -1;
+		else if (bytenr > entry->bytenr)
+			cmp = 1;
+		else if (!btrfs_delayed_ref_is_head(entry))
+			cmp = 1;
+		else
+			cmp = 0;
+
 		if (cmp < 0)
 			n = n->rb_left;
 		else if (cmp > 0)
@@ -154,7 +222,7 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
 		node = rb_first(&delayed_refs->root);
 	} else {
 		ref = NULL;
-		tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+		find_ref_head(&delayed_refs->root, start, &ref);
 		if (ref) {
 			struct btrfs_delayed_ref_node *tmp;
 
@@ -234,7 +302,7 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
 	if (ref) {
 		prev_node = rb_prev(&ref->rb_node);
 		if (!prev_node)
@@ -250,25 +318,28 @@ int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
 }
 
 /*
- * helper function to lookup reference count
+ * helper function to lookup reference count and flags of extent.
  *
  * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree.  This way you
- * can check to see what the reference count would be if all of the
- * delayed refs are processed.
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags would be if all of
+ * the delayed refs are not processed.
  */
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs)
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags)
 {
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *head;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_path *path;
-	struct extent_buffer *leaf;
 	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
 	struct btrfs_key key;
-	u32 num_refs;
+	u32 item_size;
+	u64 num_refs;
+	u64 extent_flags;
 	int ret;
 
 	path = btrfs_alloc_path();
@@ -287,37 +358,60 @@ int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
 
 	if (ret == 0) {
 		leaf = path->nodes[0];
-		ei = btrfs_item_ptr(leaf, path->slots[0],
-				    struct btrfs_extent_item);
-		num_refs = btrfs_extent_refs(leaf, ei);
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			ei = btrfs_item_ptr(leaf, path->slots[0],
+					    struct btrfs_extent_item);
+			num_refs = btrfs_extent_refs(leaf, ei);
+			extent_flags = btrfs_extent_flags(leaf, ei);
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			struct btrfs_extent_item_v0 *ei0;
+			BUG_ON(item_size != sizeof(*ei0));
+			ei0 = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_item_v0);
+			num_refs = btrfs_extent_refs_v0(leaf, ei0);
+			/* FIXME: this isn't correct for data */
+			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+			BUG();
+#endif
+		}
+		BUG_ON(num_refs == 0);
 	} else {
 		num_refs = 0;
+		extent_flags = 0;
 		ret = 0;
 	}
 
 	spin_lock(&delayed_refs->lock);
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
 	if (ref) {
 		head = btrfs_delayed_node_to_head(ref);
-		if (mutex_trylock(&head->mutex)) {
-			num_refs += ref->ref_mod;
-			mutex_unlock(&head->mutex);
-			*refs = num_refs;
-			goto out;
-		}
+		if (!mutex_trylock(&head->mutex)) {
+			atomic_inc(&ref->refs);
+			spin_unlock(&delayed_refs->lock);
 
-		atomic_inc(&ref->refs);
-		spin_unlock(&delayed_refs->lock);
+			btrfs_release_path(root->fs_info->extent_root, path);
 
-		btrfs_release_path(root->fs_info->extent_root, path);
+			mutex_lock(&head->mutex);
+			mutex_unlock(&head->mutex);
+			btrfs_put_delayed_ref(ref);
+			goto again;
+		}
+		if (head->extent_op && head->extent_op->update_flags)
+			extent_flags |= head->extent_op->flags_to_set;
+		else
+			BUG_ON(num_refs == 0);
 
-		mutex_lock(&head->mutex);
+		num_refs += ref->ref_mod;
 		mutex_unlock(&head->mutex);
-		btrfs_put_delayed_ref(ref);
-		goto again;
-	} else {
-		*refs = num_refs;
 	}
+	WARN_ON(num_refs == 0);
+	if (refs)
+		*refs = num_refs;
+	if (flags)
+		*flags = extent_flags;
 out:
 	spin_unlock(&delayed_refs->lock);
 	btrfs_free_path(path);
@@ -338,16 +432,7 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 		    struct btrfs_delayed_ref_node *existing,
 		    struct btrfs_delayed_ref_node *update)
 {
-	struct btrfs_delayed_ref *existing_ref;
-	struct btrfs_delayed_ref *ref;
-
-	existing_ref = btrfs_delayed_node_to_ref(existing);
-	ref = btrfs_delayed_node_to_ref(update);
-
-	if (ref->pin)
-		existing_ref->pin = 1;
-
-	if (ref->action != existing_ref->action) {
+	if (update->action != existing->action) {
 		/*
 		 * this is effectively undoing either an add or a
 		 * drop.  We decrement the ref_mod, and if it goes
@@ -363,20 +448,13 @@ update_existing_ref(struct btrfs_trans_handle *trans,
 			delayed_refs->num_entries--;
 			if (trans->delayed_ref_updates)
 				trans->delayed_ref_updates--;
+		} else {
+			WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+				existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
 		}
 	} else {
-		if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
-			/* if we're adding refs, make sure all the
-			 * details match up.  The extent could
-			 * have been totally freed and reallocated
-			 * by a different owner before the delayed
-			 * ref entries were removed.
-			 */
-			existing_ref->owner_objectid = ref->owner_objectid;
-			existing_ref->generation = ref->generation;
-			existing_ref->root = ref->root;
-			existing->num_bytes = update->num_bytes;
-		}
+		WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY ||
+			existing->type == BTRFS_SHARED_BLOCK_REF_KEY);
 		/*
 		 * the action on the existing ref matches
 		 * the action on the ref we're trying to add.
@@ -401,6 +479,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
 	existing_ref = btrfs_delayed_node_to_head(existing);
 	ref = btrfs_delayed_node_to_head(update);
+	BUG_ON(existing_ref->is_data != ref->is_data);
 
 	if (ref->must_insert_reserved) {
 		/* if the extent was freed and then
@@ -420,6 +499,24 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 
 	}
 
+	if (ref->extent_op) {
+		if (!existing_ref->extent_op) {
+			existing_ref->extent_op = ref->extent_op;
+		} else {
+			if (ref->extent_op->update_key) {
+				memcpy(&existing_ref->extent_op->key,
+				       &ref->extent_op->key,
+				       sizeof(ref->extent_op->key));
+				existing_ref->extent_op->update_key = 1;
+			}
+			if (ref->extent_op->update_flags) {
+				existing_ref->extent_op->flags_to_set |=
+					ref->extent_op->flags_to_set;
+				existing_ref->extent_op->update_flags = 1;
+			}
+			kfree(ref->extent_op);
+		}
+	}
 	/*
 	 * update the reference mod on the head to reflect this new operation
 	 */
@@ -427,19 +524,16 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 }
 
 /*
- * helper function to actually insert a delayed ref into the rbtree.
+ * helper function to actually insert a head node into the rbtree.
  * this does all the dirty work in terms of maintaining the correct
- * overall modification count in the head node and properly dealing
- * with updating existing nodes as new modifications are queued.
+ * overall modification count.
  */
-static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-			  struct btrfs_delayed_ref_node *ref,
-			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid, int action,
-			  int pin)
+static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
+					struct btrfs_delayed_ref_node *ref,
+					u64 bytenr, u64 num_bytes,
+					int action, int is_data)
 {
 	struct btrfs_delayed_ref_node *existing;
-	struct btrfs_delayed_ref *full_ref;
 	struct btrfs_delayed_ref_head *head_ref = NULL;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int count_mod = 1;
@@ -449,12 +543,10 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * the head node stores the sum of all the mods, so dropping a ref
 	 * should drop the sum in the head node by one.
 	 */
-	if (parent == (u64)-1) {
-		if (action == BTRFS_DROP_DELAYED_REF)
-			count_mod = -1;
-		else if (action == BTRFS_UPDATE_DELAYED_HEAD)
-			count_mod = 0;
-	}
+	if (action == BTRFS_UPDATE_DELAYED_HEAD)
+		count_mod = 0;
+	else if (action == BTRFS_DROP_DELAYED_REF)
+		count_mod = -1;
 
 	/*
 	 * BTRFS_ADD_DELAYED_EXTENT means that we need to update
@@ -467,57 +559,148 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * Once we record must_insert_reserved, switch the action to
 	 * BTRFS_ADD_DELAYED_REF because other special casing is not required.
 	 */
-	if (action == BTRFS_ADD_DELAYED_EXTENT) {
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
 		must_insert_reserved = 1;
-		action = BTRFS_ADD_DELAYED_REF;
-	} else {
+	else
 		must_insert_reserved = 0;
-	}
-
 
 	delayed_refs = &trans->transaction->delayed_refs;
 
 	/* first set the basic ref node struct up */
 	atomic_set(&ref->refs, 1);
 	ref->bytenr = bytenr;
-	ref->parent = parent;
+	ref->num_bytes = num_bytes;
 	ref->ref_mod = count_mod;
+	ref->type  = 0;
+	ref->action  = 0;
+	ref->is_head = 1;
 	ref->in_tree = 1;
+
+	head_ref = btrfs_delayed_node_to_head(ref);
+	head_ref->must_insert_reserved = must_insert_reserved;
+	head_ref->is_data = is_data;
+
+	INIT_LIST_HEAD(&head_ref->cluster);
+	mutex_init(&head_ref->mutex);
+
+	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+	if (existing) {
+		update_existing_head_ref(existing, ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kfree(ref);
+	} else {
+		delayed_refs->num_heads++;
+		delayed_refs->num_heads_ready++;
+		delayed_refs->num_entries++;
+		trans->delayed_ref_updates++;
+	}
+	return 0;
+}
+
+/*
+ * helper to insert a delayed tree ref into the rbtree.
+ */
+static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+					 struct btrfs_delayed_ref_node *ref,
+					 u64 bytenr, u64 num_bytes, u64 parent,
+					 u64 ref_root, int level, int action)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_tree_ref *full_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		action = BTRFS_ADD_DELAYED_REF;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
 	ref->num_bytes = num_bytes;
+	ref->ref_mod = 1;
+	ref->action = action;
+	ref->is_head = 0;
+	ref->in_tree = 1;
 
-	if (btrfs_delayed_ref_is_head(ref)) {
-		head_ref = btrfs_delayed_node_to_head(ref);
-		head_ref->must_insert_reserved = must_insert_reserved;
-		INIT_LIST_HEAD(&head_ref->cluster);
-		mutex_init(&head_ref->mutex);
+	full_ref = btrfs_delayed_node_to_tree_ref(ref);
+	if (parent) {
+		full_ref->parent = parent;
+		ref->type = BTRFS_SHARED_BLOCK_REF_KEY;
 	} else {
-		full_ref = btrfs_delayed_node_to_ref(ref);
 		full_ref->root = ref_root;
-		full_ref->generation = ref_generation;
-		full_ref->owner_objectid = owner_objectid;
-		full_ref->pin = pin;
-		full_ref->action = action;
+		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
 	}
+	full_ref->level = level;
 
-	existing = tree_insert(&delayed_refs->root, bytenr,
-			       parent, &ref->rb_node);
+	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
 	if (existing) {
-		if (btrfs_delayed_ref_is_head(ref))
-			update_existing_head_ref(existing, ref);
-		else
-			update_existing_ref(trans, delayed_refs, existing, ref);
+		update_existing_ref(trans, delayed_refs, existing, ref);
+		/*
+		 * we've updated the existing ref, free the newly
+		 * allocated ref
+		 */
+		kfree(ref);
+	} else {
+		delayed_refs->num_entries++;
+		trans->delayed_ref_updates++;
+	}
+	return 0;
+}
+
+/*
+ * helper to insert a delayed data ref into the rbtree.
+ */
+static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
+					 struct btrfs_delayed_ref_node *ref,
+					 u64 bytenr, u64 num_bytes, u64 parent,
+					 u64 ref_root, u64 owner, u64 offset,
+					 int action)
+{
+	struct btrfs_delayed_ref_node *existing;
+	struct btrfs_delayed_data_ref *full_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+
+	if (action == BTRFS_ADD_DELAYED_EXTENT)
+		action = BTRFS_ADD_DELAYED_REF;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+
+	/* first set the basic ref node struct up */
+	atomic_set(&ref->refs, 1);
+	ref->bytenr = bytenr;
+	ref->num_bytes = num_bytes;
+	ref->ref_mod = 1;
+	ref->action = action;
+	ref->is_head = 0;
+	ref->in_tree = 1;
+
+	full_ref = btrfs_delayed_node_to_data_ref(ref);
+	if (parent) {
+		full_ref->parent = parent;
+		ref->type = BTRFS_SHARED_DATA_REF_KEY;
+	} else {
+		full_ref->root = ref_root;
+		ref->type = BTRFS_EXTENT_DATA_REF_KEY;
+	}
+	full_ref->objectid = owner;
+	full_ref->offset = offset;
 
+	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
+
+	if (existing) {
+		update_existing_ref(trans, delayed_refs, existing, ref);
 		/*
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
 		kfree(ref);
 	} else {
-		if (btrfs_delayed_ref_is_head(ref)) {
-			delayed_refs->num_heads++;
-			delayed_refs->num_heads_ready++;
-		}
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
 	}
@@ -525,37 +708,78 @@ static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 }
 
 /*
- * add a delayed ref to the tree.  This does all of the accounting required
+ * add a delayed tree ref.  This does all of the accounting required
  * to make sure the delayed ref is eventually processed before this
  * transaction commits.
  */
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid, int action,
-			  int pin)
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 ref_root,  int level, int action,
+			       struct btrfs_delayed_extent_op *extent_op)
 {
-	struct btrfs_delayed_ref *ref;
+	struct btrfs_delayed_tree_ref *ref;
 	struct btrfs_delayed_ref_head *head_ref;
 	struct btrfs_delayed_ref_root *delayed_refs;
 	int ret;
 
+	BUG_ON(extent_op && extent_op->is_data);
 	ref = kmalloc(sizeof(*ref), GFP_NOFS);
 	if (!ref)
 		return -ENOMEM;
 
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref) {
+		kfree(ref);
+		return -ENOMEM;
+	}
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
 	/*
-	 * the parent = 0 case comes from cases where we don't actually
-	 * know the parent yet.  It will get updated later via a add/drop
-	 * pair.
+	 * insert both the head node and the new ref without dropping
+	 * the spin lock
 	 */
-	if (parent == 0)
-		parent = bytenr;
+	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+				   action, 0);
+	BUG_ON(ret);
+
+	ret = add_delayed_tree_ref(trans, &ref->node, bytenr, num_bytes,
+				   parent, ref_root, level, action);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+/*
+ * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
+ */
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes,
+			       u64 parent, u64 ref_root,
+			       u64 owner, u64 offset, int action,
+			       struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	BUG_ON(extent_op && !extent_op->is_data);
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
 
 	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
 	if (!head_ref) {
 		kfree(ref);
 		return -ENOMEM;
 	}
+
+	head_ref->extent_op = extent_op;
+
 	delayed_refs = &trans->transaction->delayed_refs;
 	spin_lock(&delayed_refs->lock);
 
@@ -563,14 +787,39 @@ int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
 	 * insert both the head node and the new ref without dropping
 	 * the spin lock
 	 */
-	ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
-				      (u64)-1, 0, 0, 0, action, pin);
+	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr, num_bytes,
+				   action, 1);
 	BUG_ON(ret);
 
-	ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
-				      parent, ref_root, ref_generation,
-				      owner_objectid, action, pin);
+	ret = add_delayed_data_ref(trans, &ref->node, bytenr, num_bytes,
+				   parent, ref_root, owner, offset, action);
+	BUG_ON(ret);
+	spin_unlock(&delayed_refs->lock);
+	return 0;
+}
+
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+				u64 bytenr, u64 num_bytes,
+				struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_delayed_ref_head *head_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	int ret;
+
+	head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+	if (!head_ref)
+		return -ENOMEM;
+
+	head_ref->extent_op = extent_op;
+
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+
+	ret = add_delayed_ref_head(trans, &head_ref->node, bytenr,
+				   num_bytes, BTRFS_UPDATE_DELAYED_HEAD,
+				   extent_op->is_data);
 	BUG_ON(ret);
+
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
@@ -587,7 +836,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
 	struct btrfs_delayed_ref_root *delayed_refs;
 
 	delayed_refs = &trans->transaction->delayed_refs;
-	ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
 	if (ref)
 		return btrfs_delayed_node_to_head(ref);
 	return NULL;
@@ -603,6 +852,7 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
  *
  * It is the same as doing a ref add and delete in two separate calls.
  */
+#if 0
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -666,3 +916,4 @@ int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 	return 0;
 }
+#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 3bec2ff0b15c9..f6fc67ddad363 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -30,9 +30,6 @@ struct btrfs_delayed_ref_node {
 	/* the starting bytenr of the extent */
 	u64 bytenr;
 
-	/* the parent our backref will point to */
-	u64 parent;
-
 	/* the size of the extent */
 	u64 num_bytes;
 
@@ -50,10 +47,21 @@ struct btrfs_delayed_ref_node {
 	 */
 	int ref_mod;
 
+	unsigned int action:8;
+	unsigned int type:8;
 	/* is this node still in the rbtree? */
+	unsigned int is_head:1;
 	unsigned int in_tree:1;
 };
 
+struct btrfs_delayed_extent_op {
+	struct btrfs_disk_key key;
+	u64 flags_to_set;
+	unsigned int update_key:1;
+	unsigned int update_flags:1;
+	unsigned int is_data:1;
+};
+
 /*
  * the head refs are used to hold a lock on a given extent, which allows us
  * to make sure that only one process is running the delayed refs
@@ -71,6 +79,7 @@ struct btrfs_delayed_ref_head {
 
 	struct list_head cluster;
 
+	struct btrfs_delayed_extent_op *extent_op;
 	/*
 	 * when a new extent is allocated, it is just reserved in memory
 	 * The actual extent isn't inserted into the extent allocation tree
@@ -84,27 +93,26 @@ struct btrfs_delayed_ref_head {
 	 * the free has happened.
 	 */
 	unsigned int must_insert_reserved:1;
+	unsigned int is_data:1;
 };
 
-struct btrfs_delayed_ref {
+struct btrfs_delayed_tree_ref {
 	struct btrfs_delayed_ref_node node;
+	union {
+		u64 root;
+		u64 parent;
+	};
+	int level;
+};
 
-	/* the root objectid our ref will point to */
-	u64 root;
-
-	/* the generation for the backref */
-	u64 generation;
-
-	/* owner_objectid of the backref  */
-	u64 owner_objectid;
-
-	/* operation done by this entry in the rbtree */
-	u8 action;
-
-	/* if pin == 1, when the extent is freed it will be pinned until
-	 * transaction commit
-	 */
-	unsigned int pin:1;
+struct btrfs_delayed_data_ref {
+	struct btrfs_delayed_ref_node node;
+	union {
+		u64 root;
+		u64 parent;
+	};
+	u64 objectid;
+	u64 offset;
 };
 
 struct btrfs_delayed_ref_root {
@@ -143,17 +151,25 @@ static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
 	}
 }
 
-int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
-			  u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid, int action,
-			  int pin);
+int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes, u64 parent,
+			       u64 ref_root, int level, int action,
+			       struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
+			       u64 bytenr, u64 num_bytes,
+			       u64 parent, u64 ref_root,
+			       u64 owner, u64 offset, int action,
+			       struct btrfs_delayed_extent_op *extent_op);
+int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
+				u64 bytenr, u64 num_bytes,
+				struct btrfs_delayed_extent_op *extent_op);
 
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u32 *refs);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
@@ -169,18 +185,24 @@ int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
  */
 static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
 {
-	return node->parent == (u64)-1;
+	return node->is_head;
 }
 
 /*
  * helper functions to cast a node into its container
  */
-static inline struct btrfs_delayed_ref *
-btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+static inline struct btrfs_delayed_tree_ref *
+btrfs_delayed_node_to_tree_ref(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(btrfs_delayed_ref_is_head(node));
-	return container_of(node, struct btrfs_delayed_ref, node);
+	return container_of(node, struct btrfs_delayed_tree_ref, node);
+}
 
+static inline struct btrfs_delayed_data_ref *
+btrfs_delayed_node_to_data_ref(struct btrfs_delayed_ref_node *node)
+{
+	WARN_ON(btrfs_delayed_ref_is_head(node));
+	return container_of(node, struct btrfs_delayed_data_ref, node);
 }
 
 static inline struct btrfs_delayed_ref_head *
@@ -188,6 +210,5 @@ btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
 {
 	WARN_ON(!btrfs_delayed_ref_is_head(node));
 	return container_of(node, struct btrfs_delayed_ref_head, node);
-
 }
 #endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4b0ea0b80c234..7f5c6e3e99928 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -36,7 +36,6 @@
 #include "print-tree.h"
 #include "async-thread.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
 
@@ -884,7 +883,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 {
 	root->node = NULL;
 	root->commit_root = NULL;
-	root->ref_tree = NULL;
 	root->sectorsize = sectorsize;
 	root->nodesize = nodesize;
 	root->leafsize = leafsize;
@@ -899,12 +897,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->last_inode_alloc = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
+	root->inode_tree.rb_node = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
-	INIT_LIST_HEAD(&root->dead_list);
+	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
 	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->inode_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -918,9 +918,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	extent_io_tree_init(&root->dirty_log_pages,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 
-	btrfs_leaf_ref_tree_init(&root->ref_tree_struct);
-	root->ref_tree = &root->ref_tree_struct;
-
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -959,6 +956,7 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
+	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 	return 0;
 }
@@ -1025,20 +1023,19 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	 */
 	root->ref_cows = 0;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
-				      0, BTRFS_TREE_LOG_OBJECTID,
-				      trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
+				      BTRFS_TREE_LOG_OBJECTID, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		kfree(root);
 		return ERR_CAST(leaf);
 	}
 
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
+	btrfs_set_header_bytenr(leaf, leaf->start);
+	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
+	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
 	root->node = leaf;
-	btrfs_set_header_nritems(root->node, 0);
-	btrfs_set_header_level(root->node, 0);
-	btrfs_set_header_bytenr(root->node, root->node->start);
-	btrfs_set_header_generation(root->node, trans->transid);
-	btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID);
 
 	write_extent_buffer(root->node, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(root->node),
@@ -1081,8 +1078,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	inode_item->nbytes = cpu_to_le64(root->leafsize);
 	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
 
-	btrfs_set_root_bytenr(&log_root->root_item, log_root->node->start);
-	btrfs_set_root_generation(&log_root->root_item, trans->transid);
+	btrfs_set_root_node(&log_root->root_item, log_root->node);
 
 	WARN_ON(root->log_root);
 	root->log_root = log_root;
@@ -1144,6 +1140,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
+	root->commit_root = btrfs_root_node(root);
 	BUG_ON(!root->node);
 insert:
 	if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
@@ -1210,7 +1207,7 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	}
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_find_dead_roots(fs_info->tree_root,
-					    root->root_key.objectid, root);
+					    root->root_key.objectid);
 		BUG_ON(ret);
 		btrfs_orphan_cleanup(root);
 	}
@@ -1569,8 +1566,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
 	atomic_set(&fs_info->nr_async_bios, 0);
-	atomic_set(&fs_info->throttles, 0);
-	atomic_set(&fs_info->throttle_gen, 0);
 	fs_info->sb = sb;
 	fs_info->max_extent = (u64)-1;
 	fs_info->max_inline = 8192 * 1024;
@@ -1598,6 +1593,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
 	fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
 
+	RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
 	extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
 			     fs_info->btree_inode->i_mapping,
 			     GFP_NOFS);
@@ -1613,10 +1609,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			     fs_info->btree_inode->i_mapping, GFP_NOFS);
 	fs_info->do_barriers = 1;
 
-	INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
-	btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree);
-	btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree);
-
 	BTRFS_I(fs_info->btree_inode)->root = tree_root;
 	memset(&BTRFS_I(fs_info->btree_inode)->location, 0,
 	       sizeof(struct btrfs_key));
@@ -1674,6 +1666,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_iput;
 	}
 
+	features = btrfs_super_incompat_flags(disk_super);
+	if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) {
+		features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
+		btrfs_set_super_incompat_flags(disk_super, features);
+	}
+
 	features = btrfs_super_compat_ro_flags(disk_super) &
 		~BTRFS_FEATURE_COMPAT_RO_SUPP;
 	if (!(sb->s_flags & MS_RDONLY) && features) {
@@ -1771,7 +1769,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read the system "
 		       "array on %s\n", sb->s_id);
-		goto fail_sys_array;
+		goto fail_sb_buffer;
 	}
 
 	blocksize = btrfs_level_size(tree_root,
@@ -1785,6 +1783,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
+	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
+	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
 	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
@@ -1810,7 +1810,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
-
+	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
+	tree_root->commit_root = btrfs_root_node(tree_root);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
@@ -1820,14 +1821,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
-	dev_root->track_dirty = 1;
 	if (ret)
 		goto fail_extent_root;
+	dev_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_extent_root;
+		goto fail_dev_root;
 
 	csum_root->track_dirty = 1;
 
@@ -1881,7 +1882,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
-		ret = btrfs_cleanup_reloc_trees(tree_root);
+		ret = btrfs_recover_relocation(tree_root);
 		BUG_ON(ret);
 	}
 
@@ -1908,14 +1909,19 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 fail_csum_root:
 	free_extent_buffer(csum_root->node);
+	free_extent_buffer(csum_root->commit_root);
+fail_dev_root:
+	free_extent_buffer(dev_root->node);
+	free_extent_buffer(dev_root->commit_root);
 fail_extent_root:
 	free_extent_buffer(extent_root->node);
+	free_extent_buffer(extent_root->commit_root);
 fail_tree_root:
 	free_extent_buffer(tree_root->node);
+	free_extent_buffer(tree_root->commit_root);
 fail_chunk_root:
 	free_extent_buffer(chunk_root->node);
-fail_sys_array:
-	free_extent_buffer(dev_root->node);
+	free_extent_buffer(chunk_root->commit_root);
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
@@ -2173,6 +2179,7 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
 
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
 {
+	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
 	radix_tree_delete(&fs_info->fs_roots_radix,
 			  (unsigned long)root->root_key.objectid);
 	if (root->anon_super.s_dev) {
@@ -2219,10 +2226,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
 					     ARRAY_SIZE(gang));
 		if (!ret)
 			break;
+
+		root_objectid = gang[ret - 1]->root_key.objectid + 1;
 		for (i = 0; i < ret; i++) {
 			root_objectid = gang[i]->root_key.objectid;
 			ret = btrfs_find_dead_roots(fs_info->tree_root,
-						    root_objectid, gang[i]);
+						    root_objectid);
 			BUG_ON(ret);
 			btrfs_orphan_cleanup(gang[i]);
 		}
@@ -2278,20 +2287,16 @@ int close_ctree(struct btrfs_root *root)
 		       (unsigned long long)fs_info->total_ref_cache_size);
 	}
 
-	if (fs_info->extent_root->node)
-		free_extent_buffer(fs_info->extent_root->node);
-
-	if (fs_info->tree_root->node)
-		free_extent_buffer(fs_info->tree_root->node);
-
-	if (root->fs_info->chunk_root->node)
-		free_extent_buffer(root->fs_info->chunk_root->node);
-
-	if (root->fs_info->dev_root->node)
-		free_extent_buffer(root->fs_info->dev_root->node);
-
-	if (root->fs_info->csum_root->node)
-		free_extent_buffer(root->fs_info->csum_root->node);
+	free_extent_buffer(fs_info->extent_root->node);
+	free_extent_buffer(fs_info->extent_root->commit_root);
+	free_extent_buffer(fs_info->tree_root->node);
+	free_extent_buffer(fs_info->tree_root->commit_root);
+	free_extent_buffer(root->fs_info->chunk_root->node);
+	free_extent_buffer(root->fs_info->chunk_root->commit_root);
+	free_extent_buffer(root->fs_info->dev_root->node);
+	free_extent_buffer(root->fs_info->dev_root->commit_root);
+	free_extent_buffer(root->fs_info->csum_root->node);
+	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
 
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 85315d2c90de0..9596b40caa4ea 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -78,7 +78,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root, NULL);
+	inode = btrfs_iget(sb, &key, root);
 	if (IS_ERR(inode))
 		return (void *)inode;
 
@@ -192,7 +192,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
+	return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
 }
 
 const struct export_operations btrfs_export_ops = {
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 35af933550630..a42419c276e21 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -30,43 +30,33 @@
 #include "transaction.h"
 #include "volumes.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "free-space-cache.h"
 
-#define PENDING_EXTENT_INSERT 0
-#define PENDING_EXTENT_DELETE 1
-#define PENDING_BACKREF_UPDATE 2
-
-struct pending_extent_op {
-	int type;
-	u64 bytenr;
-	u64 num_bytes;
-	u64 parent;
-	u64 orig_parent;
-	u64 generation;
-	u64 orig_generation;
-	int level;
-	struct list_head list;
-	int del;
-};
-
-static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root, u64 parent,
-					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins,
-					 int ref_mod);
 static int update_reserved_extents(struct btrfs_root *root,
 				   u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
 			      int mark_free);
-static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					u64 bytenr, u64 num_bytes, u64 parent,
-					u64 root_objectid, u64 ref_generation,
-					u64 owner_objectid, int pin,
-					int ref_to_drop);
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 parent,
+				u64 root_objectid, u64 owner_objectid,
+				u64 owner_offset, int refs_to_drop,
+				struct btrfs_delayed_extent_op *extra_op);
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+				    struct extent_buffer *leaf,
+				    struct btrfs_extent_item *ei);
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      u64 parent, u64 root_objectid,
+				      u64 flags, u64 owner, u64 offset,
+				      struct btrfs_key *ins, int ref_mod);
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 parent, u64 root_objectid,
+				     u64 flags, struct btrfs_disk_key *key,
+				     int level, struct btrfs_key *ins);
 
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -453,348 +443,1078 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
  *    maintenance.  This is actually the same as #2, but with a slightly
  *    different use case.
  *
+ * There are two kinds of back refs. The implicit back refs is optimized
+ * for pointers in non-shared tree blocks. For a given pointer in a block,
+ * back refs of this kind provide information about the block's owner tree
+ * and the pointer's key. These information allow us to find the block by
+ * b-tree searching. The full back refs is for pointers in tree blocks not
+ * referenced by their owner trees. The location of tree block is recorded
+ * in the back refs. Actually the full back refs is generic, and can be
+ * used in all cases the implicit back refs is used. The major shortcoming
+ * of the full back refs is its overhead. Every time a tree block gets
+ * COWed, we have to update back refs entry for all pointers in it.
+ *
+ * For a newly allocated tree block, we use implicit back refs for
+ * pointers in it. This means most tree related operations only involve
+ * implicit back refs. For a tree block created in old transaction, the
+ * only way to drop a reference to it is COW it. So we can detect the
+ * event that tree block loses its owner tree's reference and do the
+ * back refs conversion.
+ *
+ * When a tree block is COW'd through a tree, there are four cases:
+ *
+ * The reference count of the block is one and the tree is the block's
+ * owner tree. Nothing to do in this case.
+ *
+ * The reference count of the block is one and the tree is not the
+ * block's owner tree. In this case, full back refs is used for pointers
+ * in the block. Remove these full back refs, add implicit back refs for
+ * every pointers in the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * the block's owner tree. In this case, implicit back refs is used for
+ * pointers in the block. Add full back refs for every pointers in the
+ * block, increase lower level extents' reference counts. The original
+ * implicit back refs are entailed to the new block.
+ *
+ * The reference count of the block is greater than one and the tree is
+ * not the block's owner tree. Add implicit back refs for every pointer in
+ * the new block, increase lower level extents' reference count.
+ *
+ * Back Reference Key composing:
+ *
+ * The key objectid corresponds to the first byte in the extent,
+ * The key type is used to differentiate between types of back refs.
+ * There are different meanings of the key offset for different types
+ * of back refs.
+ *
  * File extents can be referenced by:
  *
  * - multiple snapshots, subvolumes, or different generations in one subvol
  * - different files inside a single subvolume
  * - different offsets inside a file (bookend extents in file.c)
  *
- * The extent ref structure has fields for:
+ * The extent ref structure for the implicit back refs has fields for:
  *
  * - Objectid of the subvolume root
- * - Generation number of the tree holding the reference
  * - objectid of the file holding the reference
- * - number of references holding by parent node (alway 1 for tree blocks)
- *
- * Btree leaf may hold multiple references to a file extent. In most cases,
- * these references are from same file and the corresponding offsets inside
- * the file are close together.
- *
- * When a file extent is allocated the fields are filled in:
- *     (root_key.objectid, trans->transid, inode objectid, 1)
+ * - original offset in the file
+ * - how many bookend extents
  *
- * When a leaf is cow'd new references are added for every file extent found
- * in the leaf.  It looks similar to the create case, but trans->transid will
- * be different when the block is cow'd.
+ * The key offset for the implicit back refs is hash of the first
+ * three fields.
  *
- *     (root_key.objectid, trans->transid, inode objectid,
- *      number of references in the leaf)
+ * The extent ref structure for the full back refs has field for:
  *
- * When a file extent is removed either during snapshot deletion or
- * file truncation, we find the corresponding back reference and check
- * the following fields:
+ * - number of pointers in the tree leaf
  *
- *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
- *      inode objectid)
+ * The key offset for the implicit back refs is the first byte of
+ * the tree leaf
  *
- * Btree extents can be referenced by:
- *
- * - Different subvolumes
- * - Different generations of the same subvolume
- *
- * When a tree block is created, back references are inserted:
+ * When a file extent is allocated, The implicit back refs is used.
+ * the fields are filled in:
  *
- * (root->root_key.objectid, trans->transid, level, 1)
+ *     (root_key.objectid, inode objectid, offset in file, 1)
  *
- * When a tree block is cow'd, new back references are added for all the
- * blocks it points to. If the tree block isn't in reference counted root,
- * the old back references are removed. These new back references are of
- * the form (trans->transid will have increased since creation):
+ * When a file extent is removed file truncation, we find the
+ * corresponding implicit back refs and check the following fields:
  *
- * (root->root_key.objectid, trans->transid, level, 1)
+ *     (btrfs_header_owner(leaf), inode objectid, offset in file)
  *
- * When a backref is in deleting, the following fields are checked:
+ * Btree extents can be referenced by:
  *
- * if backref was for a tree root:
- *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
- * else
- *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
+ * - Different subvolumes
  *
- * Back Reference Key composing:
+ * Both the implicit back refs and the full back refs for tree blocks
+ * only consist of key. The key offset for the implicit back refs is
+ * objectid of block's owner tree. The key offset for the full back refs
+ * is the first byte of parent block.
  *
- * The key objectid corresponds to the first byte in the extent, the key
- * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
- * byte of parent extent. If a extent is tree root, the key offset is set
- * to the key objectid.
+ * When implicit back refs is used, information about the lowest key and
+ * level of the tree block are required. These information are stored in
+ * tree block info structure.
  */
 
-static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, u64 parent,
-					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid, int del)
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_path *path,
+				  u64 owner, u32 extra_size)
 {
+	struct btrfs_extent_item *item;
+	struct btrfs_extent_item_v0 *ei0;
+	struct btrfs_extent_ref_v0 *ref0;
+	struct btrfs_tree_block_info *bi;
+	struct extent_buffer *leaf;
 	struct btrfs_key key;
-	struct btrfs_extent_ref *ref;
+	struct btrfs_key found_key;
+	u32 new_size = sizeof(*item);
+	u64 refs;
+	int ret;
+
+	leaf = path->nodes[0];
+	BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
+
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	ei0 = btrfs_item_ptr(leaf, path->slots[0],
+			     struct btrfs_extent_item_v0);
+	refs = btrfs_extent_refs_v0(leaf, ei0);
+
+	if (owner == (u64)-1) {
+		while (1) {
+			if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+				ret = btrfs_next_leaf(root, path);
+				if (ret < 0)
+					return ret;
+				BUG_ON(ret > 0);
+				leaf = path->nodes[0];
+			}
+			btrfs_item_key_to_cpu(leaf, &found_key,
+					      path->slots[0]);
+			BUG_ON(key.objectid != found_key.objectid);
+			if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
+				path->slots[0]++;
+				continue;
+			}
+			ref0 = btrfs_item_ptr(leaf, path->slots[0],
+					      struct btrfs_extent_ref_v0);
+			owner = btrfs_ref_objectid_v0(leaf, ref0);
+			break;
+		}
+	}
+	btrfs_release_path(root, path);
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID)
+		new_size += sizeof(*bi);
+
+	new_size -= sizeof(*ei0);
+	ret = btrfs_search_slot(trans, root, &key, path,
+				new_size + extra_size, 1);
+	if (ret < 0)
+		return ret;
+	BUG_ON(ret);
+
+	ret = btrfs_extend_item(trans, root, path, new_size);
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, item, refs);
+	/* FIXME: get real generation */
+	btrfs_set_extent_generation(leaf, item, 0);
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		btrfs_set_extent_flags(leaf, item,
+				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
+				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
+		bi = (struct btrfs_tree_block_info *)(item + 1);
+		/* FIXME: get first key of the block */
+		memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
+		btrfs_set_tree_block_level(leaf, bi, (int)owner);
+	} else {
+		btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+#endif
+
+static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
+{
+	u32 high_crc = ~(u32)0;
+	u32 low_crc = ~(u32)0;
+	__le64 lenum;
+
+	lenum = cpu_to_le64(root_objectid);
+	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(owner);
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	lenum = cpu_to_le64(offset);
+	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+
+	return ((u64)high_crc << 31) ^ (u64)low_crc;
+}
+
+static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
+				     struct btrfs_extent_data_ref *ref)
+{
+	return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
+				    btrfs_extent_data_ref_objectid(leaf, ref),
+				    btrfs_extent_data_ref_offset(leaf, ref));
+}
+
+static int match_extent_data_ref(struct extent_buffer *leaf,
+				 struct btrfs_extent_data_ref *ref,
+				 u64 root_objectid, u64 owner, u64 offset)
+{
+	if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
+	    btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
+	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
+		return 0;
+	return 1;
+}
+
+static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   u64 bytenr, u64 parent,
+					   u64 root_objectid,
+					   u64 owner, u64 offset)
+{
+	struct btrfs_key key;
+	struct btrfs_extent_data_ref *ref;
 	struct extent_buffer *leaf;
-	u64 ref_objectid;
+	u32 nritems;
 	int ret;
+	int recow;
+	int err = -ENOENT;
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = parent;
+	if (parent) {
+		key.type = BTRFS_SHARED_DATA_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_EXTENT_DATA_REF_KEY;
+		key.offset = hash_extent_data_ref(root_objectid,
+						  owner, offset);
+	}
+again:
+	recow = 0;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret < 0) {
+		err = ret;
+		goto fail;
+	}
 
-	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
-	if (ret < 0)
-		goto out;
-	if (ret > 0) {
-		ret = -ENOENT;
-		goto out;
+	if (parent) {
+		if (!ret)
+			return 0;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		key.type = BTRFS_EXTENT_REF_V0_KEY;
+		btrfs_release_path(root, path);
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0) {
+			err = ret;
+			goto fail;
+		}
+		if (!ret)
+			return 0;
+#endif
+		goto fail;
 	}
 
 	leaf = path->nodes[0];
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	ref_objectid = btrfs_ref_objectid(leaf, ref);
-	if (btrfs_ref_root(leaf, ref) != ref_root ||
-	    btrfs_ref_generation(leaf, ref) != ref_generation ||
-	    (ref_objectid != owner_objectid &&
-	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-		ret = -EIO;
-		WARN_ON(1);
-		goto out;
+	nritems = btrfs_header_nritems(leaf);
+	while (1) {
+		if (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0)
+				err = ret;
+			if (ret)
+				goto fail;
+
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			recow = 1;
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != bytenr ||
+		    key.type != BTRFS_EXTENT_DATA_REF_KEY)
+			goto fail;
+
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_data_ref);
+
+		if (match_extent_data_ref(leaf, ref, root_objectid,
+					  owner, offset)) {
+			if (recow) {
+				btrfs_release_path(root, path);
+				goto again;
+			}
+			err = 0;
+			break;
+		}
+		path->slots[0]++;
 	}
-	ret = 0;
-out:
-	return ret;
+fail:
+	return err;
 }
 
-static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  u64 bytenr, u64 parent,
-					  u64 ref_root, u64 ref_generation,
-					  u64 owner_objectid,
-					  int refs_to_add)
+static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   u64 bytenr, u64 parent,
+					   u64 root_objectid, u64 owner,
+					   u64 offset, int refs_to_add)
 {
 	struct btrfs_key key;
 	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *ref;
+	u32 size;
 	u32 num_refs;
 	int ret;
 
 	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_REF_KEY;
-	key.offset = parent;
+	if (parent) {
+		key.type = BTRFS_SHARED_DATA_REF_KEY;
+		key.offset = parent;
+		size = sizeof(struct btrfs_shared_data_ref);
+	} else {
+		key.type = BTRFS_EXTENT_DATA_REF_KEY;
+		key.offset = hash_extent_data_ref(root_objectid,
+						  owner, offset);
+		size = sizeof(struct btrfs_extent_data_ref);
+	}
 
-	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		btrfs_set_ref_root(leaf, ref, ref_root);
-		btrfs_set_ref_generation(leaf, ref, ref_generation);
-		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-		btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
-	} else if (ret == -EEXIST) {
-		u64 existing_owner;
-
-		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
-		leaf = path->nodes[0];
+	ret = btrfs_insert_empty_item(trans, root, path, &key, size);
+	if (ret && ret != -EEXIST)
+		goto fail;
+
+	leaf = path->nodes[0];
+	if (parent) {
+		struct btrfs_shared_data_ref *ref;
 		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		if (btrfs_ref_root(leaf, ref) != ref_root ||
-		    btrfs_ref_generation(leaf, ref) != ref_generation) {
-			ret = -EIO;
-			WARN_ON(1);
-			goto out;
+				     struct btrfs_shared_data_ref);
+		if (ret == 0) {
+			btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
+		} else {
+			num_refs = btrfs_shared_data_ref_count(leaf, ref);
+			num_refs += refs_to_add;
+			btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
 		}
+	} else {
+		struct btrfs_extent_data_ref *ref;
+		while (ret == -EEXIST) {
+			ref = btrfs_item_ptr(leaf, path->slots[0],
+					     struct btrfs_extent_data_ref);
+			if (match_extent_data_ref(leaf, ref, root_objectid,
+						  owner, offset))
+				break;
+			btrfs_release_path(root, path);
+			key.offset++;
+			ret = btrfs_insert_empty_item(trans, root, path, &key,
+						      size);
+			if (ret && ret != -EEXIST)
+				goto fail;
 
-		num_refs = btrfs_ref_num_refs(leaf, ref);
-		BUG_ON(num_refs == 0);
-		btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
-
-		existing_owner = btrfs_ref_objectid(leaf, ref);
-		if (existing_owner != owner_objectid &&
-		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
-			btrfs_set_ref_objectid(leaf, ref,
-					BTRFS_MULTIPLE_OBJECTIDS);
+			leaf = path->nodes[0];
+		}
+		ref = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_data_ref);
+		if (ret == 0) {
+			btrfs_set_extent_data_ref_root(leaf, ref,
+						       root_objectid);
+			btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+			btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+			btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
+		} else {
+			num_refs = btrfs_extent_data_ref_count(leaf, ref);
+			num_refs += refs_to_add;
+			btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
 		}
-		ret = 0;
-	} else {
-		goto out;
 	}
-	btrfs_unlock_up_safe(path, 1);
-	btrfs_mark_buffer_dirty(path->nodes[0]);
-out:
+	btrfs_mark_buffer_dirty(leaf);
+	ret = 0;
+fail:
 	btrfs_release_path(root, path);
 	return ret;
 }
 
-static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
-					  struct btrfs_root *root,
-					  struct btrfs_path *path,
-					  int refs_to_drop)
+static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
+					   struct btrfs_root *root,
+					   struct btrfs_path *path,
+					   int refs_to_drop)
 {
+	struct btrfs_key key;
+	struct btrfs_extent_data_ref *ref1 = NULL;
+	struct btrfs_shared_data_ref *ref2 = NULL;
 	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *ref;
-	u32 num_refs;
+	u32 num_refs = 0;
 	int ret = 0;
 
 	leaf = path->nodes[0];
-	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-	num_refs = btrfs_ref_num_refs(leaf, ref);
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+
+	if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		ref1 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_data_ref);
+		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+		ref2 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_shared_data_ref);
+		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+		struct btrfs_extent_ref_v0 *ref0;
+		ref0 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_ref_v0);
+		num_refs = btrfs_ref_count_v0(leaf, ref0);
+#endif
+	} else {
+		BUG();
+	}
+
 	BUG_ON(num_refs < refs_to_drop);
 	num_refs -= refs_to_drop;
+
 	if (num_refs == 0) {
 		ret = btrfs_del_item(trans, root, path);
 	} else {
-		btrfs_set_ref_num_refs(leaf, ref, num_refs);
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
+			btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
+		else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
+			btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		else {
+			struct btrfs_extent_ref_v0 *ref0;
+			ref0 = btrfs_item_ptr(leaf, path->slots[0],
+					struct btrfs_extent_ref_v0);
+			btrfs_set_ref_count_v0(leaf, ref0, num_refs);
+		}
+#endif
 		btrfs_mark_buffer_dirty(leaf);
 	}
-	btrfs_release_path(root, path);
 	return ret;
 }
 
-#ifdef BIO_RW_DISCARD
-static void btrfs_issue_discard(struct block_device *bdev,
-				u64 start, u64 len)
+static noinline u32 extent_data_ref_count(struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  struct btrfs_extent_inline_ref *iref)
 {
-	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
-}
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_data_ref *ref1;
+	struct btrfs_shared_data_ref *ref2;
+	u32 num_refs = 0;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	if (iref) {
+		if (btrfs_extent_inline_ref_type(leaf, iref) ==
+		    BTRFS_EXTENT_DATA_REF_KEY) {
+			ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
+			num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+		} else {
+			ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
+			num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+		}
+	} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+		ref1 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_data_ref);
+		num_refs = btrfs_extent_data_ref_count(leaf, ref1);
+	} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+		ref2 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_shared_data_ref);
+		num_refs = btrfs_shared_data_ref_count(leaf, ref2);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	} else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
+		struct btrfs_extent_ref_v0 *ref0;
+		ref0 = btrfs_item_ptr(leaf, path->slots[0],
+				      struct btrfs_extent_ref_v0);
+		num_refs = btrfs_ref_count_v0(leaf, ref0);
 #endif
+	} else {
+		WARN_ON(1);
+	}
+	return num_refs;
+}
 
-static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
-				u64 num_bytes)
+static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 root_objectid)
 {
-#ifdef BIO_RW_DISCARD
+	struct btrfs_key key;
 	int ret;
-	u64 map_length = num_bytes;
-	struct btrfs_multi_bio *multi = NULL;
-
-	/* Tell the block device(s) that the sectors can be discarded */
-	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
-			      bytenr, &map_length, &multi, 0);
-	if (!ret) {
-		struct btrfs_bio_stripe *stripe = multi->stripes;
-		int i;
-
-		if (map_length > num_bytes)
-			map_length = num_bytes;
 
-		for (i = 0; i < multi->num_stripes; i++, stripe++) {
-			btrfs_issue_discard(stripe->dev->bdev,
-					    stripe->physical,
-					    map_length);
-		}
-		kfree(multi);
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
 	}
 
-	return ret;
-#else
-	return 0;
+	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+	if (ret > 0)
+		ret = -ENOENT;
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (ret == -ENOENT && parent) {
+		btrfs_release_path(root, path);
+		key.type = BTRFS_EXTENT_REF_V0_KEY;
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret > 0)
+			ret = -ENOENT;
+	}
 #endif
+	return ret;
 }
 
-static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *root, u64 bytenr,
-				     u64 num_bytes,
-				     u64 orig_parent, u64 parent,
-				     u64 orig_root, u64 ref_root,
-				     u64 orig_generation, u64 ref_generation,
-				     u64 owner_objectid)
+static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
+					  struct btrfs_root *root,
+					  struct btrfs_path *path,
+					  u64 bytenr, u64 parent,
+					  u64 root_objectid)
 {
+	struct btrfs_key key;
 	int ret;
-	int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
 
-	ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
-				       orig_parent, parent, orig_root,
-				       ref_root, orig_generation,
-				       ref_generation, owner_objectid, pin);
-	BUG_ON(ret);
+	key.objectid = bytenr;
+	if (parent) {
+		key.type = BTRFS_SHARED_BLOCK_REF_KEY;
+		key.offset = parent;
+	} else {
+		key.type = BTRFS_TREE_BLOCK_REF_KEY;
+		key.offset = root_objectid;
+	}
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+	btrfs_release_path(root, path);
 	return ret;
 }
 
-int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root, u64 bytenr,
-			    u64 num_bytes, u64 orig_parent, u64 parent,
-			    u64 ref_root, u64 ref_generation,
-			    u64 owner_objectid)
+static inline int extent_ref_type(u64 parent, u64 owner)
 {
-	int ret;
-	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
-		return 0;
-
-	ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
-					orig_parent, parent, ref_root,
-					ref_root, ref_generation,
-					ref_generation, owner_objectid);
-	return ret;
+	int type;
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		if (parent > 0)
+			type = BTRFS_SHARED_BLOCK_REF_KEY;
+		else
+			type = BTRFS_TREE_BLOCK_REF_KEY;
+	} else {
+		if (parent > 0)
+			type = BTRFS_SHARED_DATA_REF_KEY;
+		else
+			type = BTRFS_EXTENT_DATA_REF_KEY;
+	}
+	return type;
 }
-static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root, u64 bytenr,
-				  u64 num_bytes,
-				  u64 orig_parent, u64 parent,
-				  u64 orig_root, u64 ref_root,
-				  u64 orig_generation, u64 ref_generation,
-				  u64 owner_objectid)
-{
-	int ret;
 
-	ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
-				    ref_generation, owner_objectid,
-				    BTRFS_ADD_DELAYED_REF, 0);
-	BUG_ON(ret);
-	return ret;
-}
+static int find_next_key(struct btrfs_path *path, struct btrfs_key *key)
 
-static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 bytenr,
-			  u64 num_bytes, u64 parent, u64 ref_root,
-			  u64 ref_generation, u64 owner_objectid,
-			  int refs_to_add)
 {
-	struct btrfs_path *path;
-	int ret;
-	struct btrfs_key key;
-	struct extent_buffer *l;
-	struct btrfs_extent_item *item;
-	u32 refs;
+	int level;
+	BUG_ON(!path->keep_locks);
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		if (!path->nodes[level])
+			break;
+		btrfs_assert_tree_locked(path->nodes[level]);
+		if (path->slots[level] + 1 >=
+		    btrfs_header_nritems(path->nodes[level]))
+			continue;
+		if (level == 0)
+			btrfs_item_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+		else
+			btrfs_node_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+		return 0;
+	}
+	return 1;
+}
 
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
+/*
+ * look for inline back ref. if back ref is found, *ref_ret is set
+ * to the address of inline back ref, and 0 is returned.
+ *
+ * if back ref isn't found, *ref_ret is set to the address where it
+ * should be inserted, and -ENOENT is returned.
+ *
+ * if insert is true and there are too many inline back refs, the path
+ * points to the extent item, and -EAGAIN is returned.
+ *
+ * NOTE: inline back refs are ordered in the same way that back ref
+ *	 items in the tree are ordered.
+ */
+static noinline_for_stack
+int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes,
+				 u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int insert)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	u64 flags;
+	u64 item_size;
+	unsigned long ptr;
+	unsigned long end;
+	int extra_size;
+	int type;
+	int want;
+	int ret;
+	int err = 0;
 
-	path->reada = 1;
-	path->leave_spinning = 1;
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 	key.offset = num_bytes;
 
-	/* first find the extent item and update its reference count */
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
-				path, 0, 1);
+	want = extent_ref_type(parent, owner);
+	if (insert) {
+		extra_size = btrfs_extent_inline_ref_size(want);
+		if (owner >= BTRFS_FIRST_FREE_OBJECTID)
+			path->keep_locks = 1;
+	} else
+		extra_size = -1;
+	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
 	if (ret < 0) {
-		btrfs_set_path_blocking(path);
+		err = ret;
+		goto out;
+	}
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		if (!insert) {
+			err = -ENOENT;
+			goto out;
+		}
+		ret = convert_extent_item_v0(trans, root, path, owner,
+					     extra_size);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID && insert &&
+	    item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	flags = btrfs_extent_flags(leaf, ei);
+
+	ptr = (unsigned long)(ei + 1);
+	end = (unsigned long)ei + item_size;
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		ptr += sizeof(struct btrfs_tree_block_info);
+		BUG_ON(ptr > end);
+	} else {
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
+	}
+
+	err = -ENOENT;
+	while (1) {
+		if (ptr >= end) {
+			WARN_ON(ptr > end);
+			break;
+		}
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(leaf, iref);
+		if (want < type)
+			break;
+		if (want > type) {
+			ptr += btrfs_extent_inline_ref_size(type);
+			continue;
+		}
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			struct btrfs_extent_data_ref *dref;
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			if (match_extent_data_ref(leaf, dref, root_objectid,
+						  owner, offset)) {
+				err = 0;
+				break;
+			}
+			if (hash_extent_data_ref_item(leaf, dref) <
+			    hash_extent_data_ref(root_objectid, owner, offset))
+				break;
+		} else {
+			u64 ref_offset;
+			ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
+			if (parent > 0) {
+				if (parent == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < parent)
+					break;
+			} else {
+				if (root_objectid == ref_offset) {
+					err = 0;
+					break;
+				}
+				if (ref_offset < root_objectid)
+					break;
+			}
+		}
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+	if (err == -ENOENT && insert) {
+		if (item_size + extra_size >=
+		    BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
+			err = -EAGAIN;
+			goto out;
+		}
+		/*
+		 * To add new inline back ref, we have to make sure
+		 * there is no corresponding back ref item.
+		 * For simplicity, we just do not add new inline back
+		 * ref if there is any kind of item for this block
+		 */
+		if (owner >= BTRFS_FIRST_FREE_OBJECTID &&
+		    find_next_key(path, &key) == 0 && key.objectid == bytenr) {
+			err = -EAGAIN;
+			goto out;
+		}
+	}
+	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
+out:
+	if (insert && owner >= BTRFS_FIRST_FREE_OBJECTID) {
+		path->keep_locks = 0;
+		btrfs_unlock_up_safe(path, 1);
+	}
+	return err;
+}
+
+/*
+ * helper to add new inline back ref
+ */
+static noinline_for_stack
+int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_path *path,
+				struct btrfs_extent_inline_ref *iref,
+				u64 parent, u64 root_objectid,
+				u64 owner, u64 offset, int refs_to_add,
+				struct btrfs_delayed_extent_op *extent_op)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	unsigned long ptr;
+	unsigned long end;
+	unsigned long item_offset;
+	u64 refs;
+	int size;
+	int type;
+	int ret;
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	item_offset = (unsigned long)iref - (unsigned long)ei;
+
+	type = extent_ref_type(parent, owner);
+	size = btrfs_extent_inline_ref_size(type);
+
+	ret = btrfs_extend_item(trans, root, path, size);
+	BUG_ON(ret);
+
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	refs += refs_to_add;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
+
+	ptr = (unsigned long)ei + item_offset;
+	end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
+	if (ptr < end - size)
+		memmove_extent_buffer(leaf, ptr + size, ptr,
+				      end - size - ptr);
+
+	iref = (struct btrfs_extent_inline_ref *)ptr;
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		struct btrfs_extent_data_ref *dref;
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, dref, offset);
+		btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		struct btrfs_shared_data_ref *sref;
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+
+static int lookup_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref **ref_ret,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner, u64 offset)
+{
+	int ret;
+
+	ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset, 0);
+	if (ret != -ENOENT)
 		return ret;
+
+	btrfs_release_path(root, path);
+	*ref_ret = NULL;
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
+					    root_objectid);
+	} else {
+		ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
+					     root_objectid, owner, offset);
 	}
+	return ret;
+}
 
-	if (ret > 0) {
-		WARN_ON(1);
-		btrfs_free_path(path);
-		return -EIO;
+/*
+ * helper to update/remove inline back ref
+ */
+static noinline_for_stack
+int update_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 int refs_to_mod,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_data_ref *dref = NULL;
+	struct btrfs_shared_data_ref *sref = NULL;
+	unsigned long ptr;
+	unsigned long end;
+	u32 item_size;
+	int size;
+	int type;
+	int ret;
+	u64 refs;
+
+	leaf = path->nodes[0];
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, ei);
+	WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
+	refs += refs_to_mod;
+	btrfs_set_extent_refs(leaf, ei, refs);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, ei);
+
+	type = btrfs_extent_inline_ref_type(leaf, iref);
+
+	if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+		dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		refs = btrfs_extent_data_ref_count(leaf, dref);
+	} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+		sref = (struct btrfs_shared_data_ref *)(iref + 1);
+		refs = btrfs_shared_data_ref_count(leaf, sref);
+	} else {
+		refs = 1;
+		BUG_ON(refs_to_mod != -1);
 	}
-	l = path->nodes[0];
 
-	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	if (key.objectid != bytenr) {
-		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
-		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
-		       (unsigned long long)bytenr,
-		       (unsigned long long)key.objectid);
-		BUG();
+	BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
+	refs += refs_to_mod;
+
+	if (refs > 0) {
+		if (type == BTRFS_EXTENT_DATA_REF_KEY)
+			btrfs_set_extent_data_ref_count(leaf, dref, refs);
+		else
+			btrfs_set_shared_data_ref_count(leaf, sref, refs);
+	} else {
+		size =  btrfs_extent_inline_ref_size(type);
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+		ptr = (unsigned long)iref;
+		end = (unsigned long)ei + item_size;
+		if (ptr + size < end)
+			memmove_extent_buffer(leaf, ptr, ptr + size,
+					      end - ptr - size);
+		item_size -= size;
+		ret = btrfs_truncate_item(trans, root, path, item_size, 1);
+		BUG_ON(ret);
+	}
+	btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+
+static noinline_for_stack
+int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 num_bytes, u64 parent,
+				 u64 root_objectid, u64 owner,
+				 u64 offset, int refs_to_add,
+				 struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_extent_inline_ref *iref;
+	int ret;
+
+	ret = lookup_inline_extent_backref(trans, root, path, &iref,
+					   bytenr, num_bytes, parent,
+					   root_objectid, owner, offset, 1);
+	if (ret == 0) {
+		BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
+		ret = update_inline_extent_backref(trans, root, path, iref,
+						   refs_to_add, extent_op);
+	} else if (ret == -ENOENT) {
+		ret = setup_inline_extent_backref(trans, root, path, iref,
+						  parent, root_objectid,
+						  owner, offset, refs_to_add,
+						  extent_op);
 	}
-	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
+	return ret;
+}
 
-	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+static int insert_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 u64 bytenr, u64 parent, u64 root_objectid,
+				 u64 owner, u64 offset, int refs_to_add)
+{
+	int ret;
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		BUG_ON(refs_to_add != 1);
+		ret = insert_tree_block_ref(trans, root, path, bytenr,
+					    parent, root_objectid);
+	} else {
+		ret = insert_extent_data_ref(trans, root, path, bytenr,
+					     parent, root_objectid,
+					     owner, offset, refs_to_add);
+	}
+	return ret;
+}
 
-	refs = btrfs_extent_refs(l, item);
-	btrfs_set_extent_refs(l, item, refs + refs_to_add);
-	btrfs_unlock_up_safe(path, 1);
+static int remove_extent_backref(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_path *path,
+				 struct btrfs_extent_inline_ref *iref,
+				 int refs_to_drop, int is_data)
+{
+	int ret;
 
-	btrfs_mark_buffer_dirty(path->nodes[0]);
+	BUG_ON(!is_data && refs_to_drop != 1);
+	if (iref) {
+		ret = update_inline_extent_backref(trans, root, path, iref,
+						   -refs_to_drop, NULL);
+	} else if (is_data) {
+		ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
+	} else {
+		ret = btrfs_del_item(trans, root, path);
+	}
+	return ret;
+}
+
+#ifdef BIO_RW_DISCARD
+static void btrfs_issue_discard(struct block_device *bdev,
+				u64 start, u64 len)
+{
+	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+}
+#endif
+
+static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
+				u64 num_bytes)
+{
+#ifdef BIO_RW_DISCARD
+	int ret;
+	u64 map_length = num_bytes;
+	struct btrfs_multi_bio *multi = NULL;
+
+	/* Tell the block device(s) that the sectors can be discarded */
+	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
+			      bytenr, &map_length, &multi, 0);
+	if (!ret) {
+		struct btrfs_bio_stripe *stripe = multi->stripes;
+		int i;
+
+		if (map_length > num_bytes)
+			map_length = num_bytes;
+
+		for (i = 0; i < multi->num_stripes; i++, stripe++) {
+			btrfs_issue_discard(stripe->dev->bdev,
+					    stripe->physical,
+					    map_length);
+		}
+		kfree(multi);
+	}
+
+	return ret;
+#else
+	return 0;
+#endif
+}
+
+int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+			 struct btrfs_root *root,
+			 u64 bytenr, u64 num_bytes, u64 parent,
+			 u64 root_objectid, u64 owner, u64 offset)
+{
+	int ret;
+	BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
+	       root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, (int)owner,
+					BTRFS_ADD_DELAYED_REF, NULL);
+	} else {
+		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, owner, offset,
+					BTRFS_ADD_DELAYED_REF, NULL);
+	}
+	return ret;
+}
+
+static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 bytenr, u64 num_bytes,
+				  u64 parent, u64 root_objectid,
+				  u64 owner, u64 offset, int refs_to_add,
+				  struct btrfs_delayed_extent_op *extent_op)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_item *item;
+	u64 refs;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+	/* this will setup the path even if it fails to insert the back ref */
+	ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
+					   path, bytenr, num_bytes, parent,
+					   root_objectid, owner, offset,
+					   refs_to_add, extent_op);
+	if (ret == 0)
+		goto out;
+
+	if (ret != -EAGAIN) {
+		err = ret;
+		goto out;
+	}
 
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	refs = btrfs_extent_refs(leaf, item);
+	btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
+	if (extent_op)
+		__run_delayed_extent_op(extent_op, leaf, item);
+
+	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(root->fs_info->extent_root, path);
 
 	path->reada = 1;
@@ -802,56 +1522,197 @@ static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
 
 	/* now insert the actual backref */
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
-				    path, bytenr, parent,
-				    ref_root, ref_generation,
-				    owner_objectid, refs_to_add);
+				    path, bytenr, parent, root_objectid,
+				    owner, offset, refs_to_add);
 	BUG_ON(ret);
+out:
 	btrfs_free_path(path);
-	return 0;
+	return err;
 }
 
-int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 ref_root, u64 ref_generation,
-			 u64 owner_objectid)
+static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
+{
+	int ret = 0;
+	struct btrfs_delayed_data_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
+	u64 flags = 0;
+
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+	ref = btrfs_delayed_node_to_data_ref(node);
+	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
+		parent = ref->parent;
+	else
+		ref_root = ref->root;
+
+	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		if (extent_op) {
+			BUG_ON(extent_op->update_key);
+			flags |= extent_op->flags_to_set;
+		}
+		ret = alloc_reserved_file_extent(trans, root,
+						 parent, ref_root, flags,
+						 ref->objectid, ref->offset,
+						 &ins, node->ref_mod);
+		update_reserved_extents(root, ins.objectid, ins.offset, 0);
+	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
+		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+					     node->num_bytes, parent,
+					     ref_root, ref->objectid,
+					     ref->offset, node->ref_mod,
+					     extent_op);
+	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		ret = __btrfs_free_extent(trans, root, node->bytenr,
+					  node->num_bytes, parent,
+					  ref_root, ref->objectid,
+					  ref->offset, node->ref_mod,
+					  extent_op);
+	} else {
+		BUG();
+	}
+	return ret;
+}
+
+static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
+				    struct extent_buffer *leaf,
+				    struct btrfs_extent_item *ei)
+{
+	u64 flags = btrfs_extent_flags(leaf, ei);
+	if (extent_op->update_flags) {
+		flags |= extent_op->flags_to_set;
+		btrfs_set_extent_flags(leaf, ei, flags);
+	}
+
+	if (extent_op->update_key) {
+		struct btrfs_tree_block_info *bi;
+		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
+	}
+}
+
+static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 struct btrfs_delayed_ref_node *node,
+				 struct btrfs_delayed_extent_op *extent_op)
 {
+	struct btrfs_key key;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct extent_buffer *leaf;
+	u32 item_size;
 	int ret;
-	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
-		return 0;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = node->bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = node->num_bytes;
+
+	path->reada = 1;
+	path->leave_spinning = 1;
+	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+				path, 0, 1);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	if (ret > 0) {
+		err = -EIO;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
+					     path, (u64)-1, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
+	__run_delayed_extent_op(extent_op, leaf, ei);
 
-	ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
-				     0, ref_root, 0, ref_generation,
-				     owner_objectid);
-	return ret;
+	btrfs_mark_buffer_dirty(leaf);
+out:
+	btrfs_free_path(path);
+	return err;
 }
 
-static int drop_delayed_ref(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_delayed_ref_node *node)
+static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				struct btrfs_delayed_ref_node *node,
+				struct btrfs_delayed_extent_op *extent_op,
+				int insert_reserved)
 {
 	int ret = 0;
-	struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+	struct btrfs_delayed_tree_ref *ref;
+	struct btrfs_key ins;
+	u64 parent = 0;
+	u64 ref_root = 0;
 
-	BUG_ON(node->ref_mod == 0);
-	ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
-				  node->parent, ref->root, ref->generation,
-				  ref->owner_objectid, ref->pin, node->ref_mod);
+	ins.objectid = node->bytenr;
+	ins.offset = node->num_bytes;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
+	ref = btrfs_delayed_node_to_tree_ref(node);
+	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+		parent = ref->parent;
+	else
+		ref_root = ref->root;
+
+	BUG_ON(node->ref_mod != 1);
+	if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
+		BUG_ON(!extent_op || !extent_op->update_flags ||
+		       !extent_op->update_key);
+		ret = alloc_reserved_tree_block(trans, root,
+						parent, ref_root,
+						extent_op->flags_to_set,
+						&extent_op->key,
+						ref->level, &ins);
+		update_reserved_extents(root, ins.objectid, ins.offset, 0);
+	} else if (node->action == BTRFS_ADD_DELAYED_REF) {
+		ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
+					     node->num_bytes, parent, ref_root,
+					     ref->level, 0, 1, extent_op);
+	} else if (node->action == BTRFS_DROP_DELAYED_REF) {
+		ret = __btrfs_free_extent(trans, root, node->bytenr,
+					  node->num_bytes, parent, ref_root,
+					  ref->level, 0, 1, extent_op);
+	} else {
+		BUG();
+	}
 	return ret;
 }
 
+
 /* helper function to actually process a single delayed ref entry */
-static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					struct btrfs_delayed_ref_node *node,
-					int insert_reserved)
+static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct btrfs_delayed_ref_node *node,
+			       struct btrfs_delayed_extent_op *extent_op,
+			       int insert_reserved)
 {
 	int ret;
-	struct btrfs_delayed_ref *ref;
-
-	if (node->parent == (u64)-1) {
+	if (btrfs_delayed_ref_is_head(node)) {
 		struct btrfs_delayed_ref_head *head;
 		/*
 		 * we've hit the end of the chain and we were supposed
@@ -859,44 +1720,35 @@ static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		 * deleted before we ever needed to insert it, so all
 		 * we have to do is clean up the accounting
 		 */
+		BUG_ON(extent_op);
+		head = btrfs_delayed_node_to_head(node);
 		if (insert_reserved) {
+			if (head->is_data) {
+				ret = btrfs_del_csums(trans, root,
+						      node->bytenr,
+						      node->num_bytes);
+				BUG_ON(ret);
+			}
+			btrfs_update_pinned_extents(root, node->bytenr,
+						    node->num_bytes, 1);
 			update_reserved_extents(root, node->bytenr,
 						node->num_bytes, 0);
 		}
-		head = btrfs_delayed_node_to_head(node);
 		mutex_unlock(&head->mutex);
 		return 0;
 	}
 
-	ref = btrfs_delayed_node_to_ref(node);
-	if (ref->action == BTRFS_ADD_DELAYED_REF) {
-		if (insert_reserved) {
-			struct btrfs_key ins;
-
-			ins.objectid = node->bytenr;
-			ins.offset = node->num_bytes;
-			ins.type = BTRFS_EXTENT_ITEM_KEY;
-
-			/* record the full extent allocation */
-			ret = __btrfs_alloc_reserved_extent(trans, root,
-					node->parent, ref->root,
-					ref->generation, ref->owner_objectid,
-					&ins, node->ref_mod);
-			update_reserved_extents(root, node->bytenr,
-						node->num_bytes, 0);
-		} else {
-			/* just add one backref */
-			ret = add_extent_ref(trans, root, node->bytenr,
-				     node->num_bytes,
-				     node->parent, ref->root, ref->generation,
-				     ref->owner_objectid, node->ref_mod);
-		}
-		BUG_ON(ret);
-	} else if (ref->action == BTRFS_DROP_DELAYED_REF) {
-		WARN_ON(insert_reserved);
-		ret = drop_delayed_ref(trans, root, node);
-	}
-	return 0;
+	if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
+	    node->type == BTRFS_SHARED_BLOCK_REF_KEY)
+		ret = run_delayed_tree_ref(trans, root, node, extent_op,
+					   insert_reserved);
+	else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
+		 node->type == BTRFS_SHARED_DATA_REF_KEY)
+		ret = run_delayed_data_ref(trans, root, node, extent_op,
+					   insert_reserved);
+	else
+		BUG();
+	return ret;
 }
 
 static noinline struct btrfs_delayed_ref_node *
@@ -919,7 +1771,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
 				rb_node);
 		if (ref->bytenr != head->node.bytenr)
 			break;
-		if (btrfs_delayed_node_to_ref(ref)->action == action)
+		if (ref->action == action)
 			return ref;
 		node = rb_prev(node);
 	}
@@ -937,6 +1789,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 	struct btrfs_delayed_ref_root *delayed_refs;
 	struct btrfs_delayed_ref_node *ref;
 	struct btrfs_delayed_ref_head *locked_ref = NULL;
+	struct btrfs_delayed_extent_op *extent_op;
 	int ret;
 	int count = 0;
 	int must_insert_reserved = 0;
@@ -975,6 +1828,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		must_insert_reserved = locked_ref->must_insert_reserved;
 		locked_ref->must_insert_reserved = 0;
 
+		extent_op = locked_ref->extent_op;
+		locked_ref->extent_op = NULL;
+
 		/*
 		 * locked_ref is the head node, so we have to go one
 		 * node back for any delayed ref updates
@@ -986,6 +1842,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			 * so that any accounting fixes can happen
 			 */
 			ref = &locked_ref->node;
+
+			if (extent_op && must_insert_reserved) {
+				kfree(extent_op);
+				extent_op = NULL;
+			}
+
+			if (extent_op) {
+				spin_unlock(&delayed_refs->lock);
+
+				ret = run_delayed_extent_op(trans, root,
+							    ref, extent_op);
+				BUG_ON(ret);
+				kfree(extent_op);
+
+				cond_resched();
+				spin_lock(&delayed_refs->lock);
+				continue;
+			}
+
 			list_del_init(&locked_ref->cluster);
 			locked_ref = NULL;
 		}
@@ -993,14 +1868,17 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		ref->in_tree = 0;
 		rb_erase(&ref->rb_node, &delayed_refs->root);
 		delayed_refs->num_entries--;
+
 		spin_unlock(&delayed_refs->lock);
 
-		ret = run_one_delayed_ref(trans, root, ref,
+		ret = run_one_delayed_ref(trans, root, ref, extent_op,
 					  must_insert_reserved);
 		BUG_ON(ret);
-		btrfs_put_delayed_ref(ref);
 
+		btrfs_put_delayed_ref(ref);
+		kfree(extent_op);
 		count++;
+
 		cond_resched();
 		spin_lock(&delayed_refs->lock);
 	}
@@ -1095,25 +1973,112 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root, u64 objectid, u64 bytenr)
+int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 flags,
+				int is_data)
+{
+	struct btrfs_delayed_extent_op *extent_op;
+	int ret;
+
+	extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+	if (!extent_op)
+		return -ENOMEM;
+
+	extent_op->flags_to_set = flags;
+	extent_op->update_flags = 1;
+	extent_op->update_key = 0;
+	extent_op->is_data = is_data ? 1 : 0;
+
+	ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
+	if (ret)
+		kfree(extent_op);
+	return ret;
+}
+
+static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      struct btrfs_path *path,
+				      u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_delayed_ref_head *head;
+	struct btrfs_delayed_ref_node *ref;
+	struct btrfs_delayed_data_ref *data_ref;
+	struct btrfs_delayed_ref_root *delayed_refs;
+	struct rb_node *node;
+	int ret = 0;
+
+	ret = -ENOENT;
+	delayed_refs = &trans->transaction->delayed_refs;
+	spin_lock(&delayed_refs->lock);
+	head = btrfs_find_delayed_ref_head(trans, bytenr);
+	if (!head)
+		goto out;
+
+	if (!mutex_trylock(&head->mutex)) {
+		atomic_inc(&head->node.refs);
+		spin_unlock(&delayed_refs->lock);
+
+		btrfs_release_path(root->fs_info->extent_root, path);
+
+		mutex_lock(&head->mutex);
+		mutex_unlock(&head->mutex);
+		btrfs_put_delayed_ref(&head->node);
+		return -EAGAIN;
+	}
+
+	node = rb_prev(&head->node.rb_node);
+	if (!node)
+		goto out_unlock;
+
+	ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+
+	if (ref->bytenr != bytenr)
+		goto out_unlock;
+
+	ret = 1;
+	if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
+		goto out_unlock;
+
+	data_ref = btrfs_delayed_node_to_data_ref(ref);
+
+	node = rb_prev(node);
+	if (node) {
+		ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+		if (ref->bytenr == bytenr)
+			goto out_unlock;
+	}
+
+	if (data_ref->root != root->root_key.objectid ||
+	    data_ref->objectid != objectid || data_ref->offset != offset)
+		goto out_unlock;
+
+	ret = 0;
+out_unlock:
+	mutex_unlock(&head->mutex);
+out:
+	spin_unlock(&delayed_refs->lock);
+	return ret;
+}
+
+static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					u64 objectid, u64 offset, u64 bytenr)
 {
 	struct btrfs_root *extent_root = root->fs_info->extent_root;
-	struct btrfs_path *path;
 	struct extent_buffer *leaf;
-	struct btrfs_extent_ref *ref_item;
+	struct btrfs_extent_data_ref *ref;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_item *ei;
 	struct btrfs_key key;
-	struct btrfs_key found_key;
-	u64 ref_root;
-	u64 last_snapshot;
-	u32 nritems;
+	u32 item_size;
 	int ret;
 
 	key.objectid = bytenr;
 	key.offset = (u64)-1;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
 
-	path = btrfs_alloc_path();
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
@@ -1125,55 +2090,83 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 
 	path->slots[0]--;
 	leaf = path->nodes[0];
-	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 
-	if (found_key.objectid != bytenr ||
-	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
+	if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
 		goto out;
 
-	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
-	while (1) {
-		leaf = path->nodes[0];
-		nritems = btrfs_header_nritems(leaf);
-		if (path->slots[0] >= nritems) {
-			ret = btrfs_next_leaf(extent_root, path);
-			if (ret < 0)
-				goto out;
-			if (ret == 0)
-				continue;
-			break;
-		}
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.objectid != bytenr)
-			break;
+	ret = 1;
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		goto out;
+	}
+#endif
+	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 
-		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
-			path->slots[0]++;
-			continue;
-		}
+	if (item_size != sizeof(*ei) +
+	    btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
+		goto out;
 
-		ref_item = btrfs_item_ptr(leaf, path->slots[0],
-					  struct btrfs_extent_ref);
-		ref_root = btrfs_ref_root(leaf, ref_item);
-		if ((ref_root != root->root_key.objectid &&
-		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
-		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
-			ret = 1;
-			goto out;
-		}
-		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
-			ret = 1;
+	if (btrfs_extent_generation(leaf, ei) <=
+	    btrfs_root_last_snapshot(&root->root_item))
+		goto out;
+
+	iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	if (btrfs_extent_inline_ref_type(leaf, iref) !=
+	    BTRFS_EXTENT_DATA_REF_KEY)
+		goto out;
+
+	ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+	if (btrfs_extent_refs(leaf, ei) !=
+	    btrfs_extent_data_ref_count(leaf, ref) ||
+	    btrfs_extent_data_ref_root(leaf, ref) !=
+	    root->root_key.objectid ||
+	    btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
+	    btrfs_extent_data_ref_offset(leaf, ref) != offset)
+		goto out;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  u64 objectid, u64 offset, u64 bytenr)
+{
+	struct btrfs_path *path;
+	int ret;
+	int ret2;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOENT;
+
+	do {
+		ret = check_committed_ref(trans, root, path, objectid,
+					  offset, bytenr);
+		if (ret && ret != -ENOENT)
 			goto out;
-		}
 
-		path->slots[0]++;
+		ret2 = check_delayed_ref(trans, root, path, objectid,
+					 offset, bytenr);
+	} while (ret2 == -EAGAIN);
+
+	if (ret2 && ret2 != -ENOENT) {
+		ret = ret2;
+		goto out;
 	}
-	ret = 0;
+
+	if (ret != -ENOENT || ret2 != -ENOENT)
+		ret = 0;
 out:
 	btrfs_free_path(path);
 	return ret;
 }
 
+#if 0
 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		    struct extent_buffer *buf, u32 nr_extents)
 {
@@ -1291,191 +2284,49 @@ static int refsort_cmp(const void *a_void, const void *b_void)
 		return 1;
 	return 0;
 }
+#endif
 
-
-noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
+static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
-			   struct extent_buffer *orig_buf,
-			   struct extent_buffer *buf, u32 *nr_extents)
+			   struct extent_buffer *buf,
+			   int full_backref, int inc)
 {
 	u64 bytenr;
+	u64 num_bytes;
+	u64 parent;
 	u64 ref_root;
-	u64 orig_root;
-	u64 ref_generation;
-	u64 orig_generation;
-	struct refsort *sorted;
 	u32 nritems;
-	u32 nr_file_extents = 0;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
 	int i;
 	int level;
 	int ret = 0;
-	int faili = 0;
-	int refi = 0;
-	int slot;
 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-			    u64, u64, u64, u64, u64, u64, u64, u64, u64);
+			    u64, u64, u64, u64, u64, u64);
 
 	ref_root = btrfs_header_owner(buf);
-	ref_generation = btrfs_header_generation(buf);
-	orig_root = btrfs_header_owner(orig_buf);
-	orig_generation = btrfs_header_generation(orig_buf);
-
 	nritems = btrfs_header_nritems(buf);
 	level = btrfs_header_level(buf);
 
-	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
-	BUG_ON(!sorted);
-
-	if (root->ref_cows) {
-		process_func = __btrfs_inc_extent_ref;
-	} else {
-		if (level == 0 &&
-		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
-			goto out;
-		if (level != 0 &&
-		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
-			goto out;
-		process_func = __btrfs_update_extent_ref;
-	}
-
-	/*
-	 * we make two passes through the items.  In the first pass we
-	 * only record the byte number and slot.  Then we sort based on
-	 * byte number and do the actual work based on the sorted results
-	 */
-	for (i = 0; i < nritems; i++) {
-		cond_resched();
-		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, i);
-			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-				continue;
-			fi = btrfs_item_ptr(buf, i,
-					    struct btrfs_file_extent_item);
-			if (btrfs_file_extent_type(buf, fi) ==
-			    BTRFS_FILE_EXTENT_INLINE)
-				continue;
-			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (bytenr == 0)
-				continue;
-
-			nr_file_extents++;
-			sorted[refi].bytenr = bytenr;
-			sorted[refi].slot = i;
-			refi++;
-		} else {
-			bytenr = btrfs_node_blockptr(buf, i);
-			sorted[refi].bytenr = bytenr;
-			sorted[refi].slot = i;
-			refi++;
-		}
-	}
-	/*
-	 * if refi == 0, we didn't actually put anything into the sorted
-	 * array and we're done
-	 */
-	if (refi == 0)
-		goto out;
-
-	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
-
-	for (i = 0; i < refi; i++) {
-		cond_resched();
-		slot = sorted[i].slot;
-		bytenr = sorted[i].bytenr;
-
-		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, slot);
-			fi = btrfs_item_ptr(buf, slot,
-					    struct btrfs_file_extent_item);
-
-			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
-			if (bytenr == 0)
-				continue;
-
-			ret = process_func(trans, root, bytenr,
-				   btrfs_file_extent_disk_num_bytes(buf, fi),
-				   orig_buf->start, buf->start,
-				   orig_root, ref_root,
-				   orig_generation, ref_generation,
-				   key.objectid);
-
-			if (ret) {
-				faili = slot;
-				WARN_ON(1);
-				goto fail;
-			}
-		} else {
-			ret = process_func(trans, root, bytenr, buf->len,
-					   orig_buf->start, buf->start,
-					   orig_root, ref_root,
-					   orig_generation, ref_generation,
-					   level - 1);
-			if (ret) {
-				faili = slot;
-				WARN_ON(1);
-				goto fail;
-			}
-		}
-	}
-out:
-	kfree(sorted);
-	if (nr_extents) {
-		if (level == 0)
-			*nr_extents = nr_file_extents;
-		else
-			*nr_extents = nritems;
-	}
-	return 0;
-fail:
-	kfree(sorted);
-	WARN_ON(1);
-	return ret;
-}
-
-int btrfs_update_ref(struct btrfs_trans_handle *trans,
-		     struct btrfs_root *root, struct extent_buffer *orig_buf,
-		     struct extent_buffer *buf, int start_slot, int nr)
-
-{
-	u64 bytenr;
-	u64 ref_root;
-	u64 orig_root;
-	u64 ref_generation;
-	u64 orig_generation;
-	struct btrfs_key key;
-	struct btrfs_file_extent_item *fi;
-	int i;
-	int ret;
-	int slot;
-	int level;
-
-	BUG_ON(start_slot < 0);
-	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
-
-	ref_root = btrfs_header_owner(buf);
-	ref_generation = btrfs_header_generation(buf);
-	orig_root = btrfs_header_owner(orig_buf);
-	orig_generation = btrfs_header_generation(orig_buf);
-	level = btrfs_header_level(buf);
+	if (!root->ref_cows && level == 0)
+		return 0;
 
-	if (!root->ref_cows) {
-		if (level == 0 &&
-		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
-			return 0;
-		if (level != 0 &&
-		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
-			return 0;
-	}
+	if (inc)
+		process_func = btrfs_inc_extent_ref;
+	else
+		process_func = btrfs_free_extent;
 
-	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
-		cond_resched();
+	if (full_backref)
+		parent = buf->start;
+	else
+		parent = 0;
+
+	for (i = 0; i < nritems; i++) {
 		if (level == 0) {
-			btrfs_item_key_to_cpu(buf, &key, slot);
+			btrfs_item_key_to_cpu(buf, &key, i);
 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
 				continue;
-			fi = btrfs_item_ptr(buf, slot,
+			fi = btrfs_item_ptr(buf, i,
 					    struct btrfs_file_extent_item);
 			if (btrfs_file_extent_type(buf, fi) ==
 			    BTRFS_FILE_EXTENT_INLINE)
@@ -1483,28 +2334,39 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
 			if (bytenr == 0)
 				continue;
-			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-				    btrfs_file_extent_disk_num_bytes(buf, fi),
-				    orig_buf->start, buf->start,
-				    orig_root, ref_root, orig_generation,
-				    ref_generation, key.objectid);
+
+			num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
+			key.offset -= btrfs_file_extent_offset(buf, fi);
+			ret = process_func(trans, root, bytenr, num_bytes,
+					   parent, ref_root, key.objectid,
+					   key.offset);
 			if (ret)
 				goto fail;
 		} else {
-			bytenr = btrfs_node_blockptr(buf, slot);
-			ret = __btrfs_update_extent_ref(trans, root, bytenr,
-					    buf->len, orig_buf->start,
-					    buf->start, orig_root, ref_root,
-					    orig_generation, ref_generation,
-					    level - 1);
+			bytenr = btrfs_node_blockptr(buf, i);
+			num_bytes = btrfs_level_size(root, level - 1);
+			ret = process_func(trans, root, bytenr, num_bytes,
+					   parent, ref_root, level - 1, 0);
 			if (ret)
 				goto fail;
 		}
 	}
 	return 0;
 fail:
-	WARN_ON(1);
-	return -1;
+	BUG();
+	return ret;
+}
+
+int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref)
+{
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
+}
+
+int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
+		  struct extent_buffer *buf, int full_backref)
+{
+	return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
 }
 
 static int write_one_cache_group(struct btrfs_trans_handle *trans,
@@ -2007,6 +2869,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 	u64 old_val;
 	u64 byte_in_group;
 
+	/* block accounting for super block */
+	spin_lock(&info->delalloc_lock);
+	old_val = btrfs_super_bytes_used(&info->super_copy);
+	if (alloc)
+		old_val += num_bytes;
+	else
+		old_val -= num_bytes;
+	btrfs_set_super_bytes_used(&info->super_copy, old_val);
+
+	/* block accounting for root item */
+	old_val = btrfs_root_used(&root->root_item);
+	if (alloc)
+		old_val += num_bytes;
+	else
+		old_val -= num_bytes;
+	btrfs_set_root_used(&root->root_item, old_val);
+	spin_unlock(&info->delalloc_lock);
+
 	while (total) {
 		cache = btrfs_lookup_block_group(info, bytenr);
 		if (!cache)
@@ -2216,8 +3096,6 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 		u64 header_owner = btrfs_header_owner(buf);
 		u64 header_transid = btrfs_header_generation(buf);
 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
-		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
-		    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
 		    header_transid == trans->transid &&
 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			*must_clean = buf;
@@ -2235,63 +3113,77 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-/*
- * remove an extent from the root, returns 0 on success
- */
-static int __free_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
-			 u64 bytenr, u64 num_bytes, u64 parent,
-			 u64 root_objectid, u64 ref_generation,
-			 u64 owner_objectid, int pin, int mark_free,
-			 int refs_to_drop)
+
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 bytenr, u64 num_bytes, u64 parent,
+				u64 root_objectid, u64 owner_objectid,
+				u64 owner_offset, int refs_to_drop,
+				struct btrfs_delayed_extent_op *extent_op)
 {
-	struct btrfs_path *path;
 	struct btrfs_key key;
+	struct btrfs_path *path;
 	struct btrfs_fs_info *info = root->fs_info;
 	struct btrfs_root *extent_root = info->extent_root;
 	struct extent_buffer *leaf;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
 	int ret;
+	int is_data;
 	int extent_slot = 0;
 	int found_extent = 0;
 	int num_to_del = 1;
-	struct btrfs_extent_item *ei;
-	u32 refs;
+	u32 item_size;
+	u64 refs;
 
-	key.objectid = bytenr;
-	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
-	key.offset = num_bytes;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	path->reada = 1;
 	path->leave_spinning = 1;
-	ret = lookup_extent_backref(trans, extent_root, path,
-				    bytenr, parent, root_objectid,
-				    ref_generation, owner_objectid, 1);
+
+	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
+	BUG_ON(!is_data && refs_to_drop != 1);
+
+	ret = lookup_extent_backref(trans, extent_root, path, &iref,
+				    bytenr, num_bytes, parent,
+				    root_objectid, owner_objectid,
+				    owner_offset);
 	if (ret == 0) {
-		struct btrfs_key found_key;
 		extent_slot = path->slots[0];
-		while (extent_slot > 0) {
-			extent_slot--;
-			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+		while (extent_slot >= 0) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
 					      extent_slot);
-			if (found_key.objectid != bytenr)
+			if (key.objectid != bytenr)
 				break;
-			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-			    found_key.offset == num_bytes) {
+			if (key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == num_bytes) {
 				found_extent = 1;
 				break;
 			}
 			if (path->slots[0] - extent_slot > 5)
 				break;
+			extent_slot--;
 		}
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
+		if (found_extent && item_size < sizeof(*ei))
+			found_extent = 0;
+#endif
 		if (!found_extent) {
+			BUG_ON(iref);
 			ret = remove_extent_backref(trans, extent_root, path,
-						    refs_to_drop);
+						    NULL, refs_to_drop,
+						    is_data);
 			BUG_ON(ret);
 			btrfs_release_path(extent_root, path);
 			path->leave_spinning = 1;
+
+			key.objectid = bytenr;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = num_bytes;
+
 			ret = btrfs_search_slot(trans, extent_root,
 						&key, path, -1, 1);
 			if (ret) {
@@ -2307,82 +3199,98 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 		btrfs_print_leaf(extent_root, path->nodes[0]);
 		WARN_ON(1);
 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-		       "parent %llu root %llu gen %llu owner %llu\n",
+		       "parent %llu root %llu  owner %llu offset %llu\n",
 		       (unsigned long long)bytenr,
 		       (unsigned long long)parent,
 		       (unsigned long long)root_objectid,
-		       (unsigned long long)ref_generation,
-		       (unsigned long long)owner_objectid);
+		       (unsigned long long)owner_objectid,
+		       (unsigned long long)owner_offset);
 	}
 
 	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, extent_slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		BUG_ON(found_extent || extent_slot != path->slots[0]);
+		ret = convert_extent_item_v0(trans, extent_root, path,
+					     owner_objectid, 0);
+		BUG_ON(ret < 0);
+
+		btrfs_release_path(extent_root, path);
+		path->leave_spinning = 1;
+
+		key.objectid = bytenr;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = num_bytes;
+
+		ret = btrfs_search_slot(trans, extent_root, &key, path,
+					-1, 1);
+		if (ret) {
+			printk(KERN_ERR "umm, got %d back from search"
+			       ", was looking for %llu\n", ret,
+			       (unsigned long long)bytenr);
+			btrfs_print_leaf(extent_root, path->nodes[0]);
+		}
+		BUG_ON(ret);
+		extent_slot = path->slots[0];
+		leaf = path->nodes[0];
+		item_size = btrfs_item_size_nr(leaf, extent_slot);
+	}
+#endif
+	BUG_ON(item_size < sizeof(*ei));
 	ei = btrfs_item_ptr(leaf, extent_slot,
 			    struct btrfs_extent_item);
-	refs = btrfs_extent_refs(leaf, ei);
-
-	/*
-	 * we're not allowed to delete the extent item if there
-	 * are other delayed ref updates pending
-	 */
+	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+		struct btrfs_tree_block_info *bi;
+		BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
+	}
 
+	refs = btrfs_extent_refs(leaf, ei);
 	BUG_ON(refs < refs_to_drop);
 	refs -= refs_to_drop;
-	btrfs_set_extent_refs(leaf, ei, refs);
-	btrfs_mark_buffer_dirty(leaf);
 
-	if (refs == 0 && found_extent &&
-	    path->slots[0] == extent_slot + 1) {
-		struct btrfs_extent_ref *ref;
-		ref = btrfs_item_ptr(leaf, path->slots[0],
-				     struct btrfs_extent_ref);
-		BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
-		/* if the back ref and the extent are next to each other
-		 * they get deleted below in one shot
+	if (refs > 0) {
+		if (extent_op)
+			__run_delayed_extent_op(extent_op, leaf, ei);
+		/*
+		 * In the case of inline back ref, reference count will
+		 * be updated by remove_extent_backref
 		 */
-		path->slots[0] = extent_slot;
-		num_to_del = 2;
-	} else if (found_extent) {
-		/* otherwise delete the extent back ref */
-		ret = remove_extent_backref(trans, extent_root, path,
-					    refs_to_drop);
-		BUG_ON(ret);
-		/* if refs are 0, we need to setup the path for deletion */
-		if (refs == 0) {
-			btrfs_release_path(extent_root, path);
-			path->leave_spinning = 1;
-			ret = btrfs_search_slot(trans, extent_root, &key, path,
-						-1, 1);
+		if (iref) {
+			BUG_ON(!found_extent);
+		} else {
+			btrfs_set_extent_refs(leaf, ei, refs);
+			btrfs_mark_buffer_dirty(leaf);
+		}
+		if (found_extent) {
+			ret = remove_extent_backref(trans, extent_root, path,
+						    iref, refs_to_drop,
+						    is_data);
 			BUG_ON(ret);
 		}
-	}
-
-	if (refs == 0) {
-		u64 super_used;
-		u64 root_used;
+	} else {
+		int mark_free = 0;
 		struct extent_buffer *must_clean = NULL;
 
-		if (pin) {
-			ret = pin_down_bytes(trans, root, path,
-				bytenr, num_bytes,
-				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
-				&must_clean);
-			if (ret > 0)
-				mark_free = 1;
-			BUG_ON(ret < 0);
+		if (found_extent) {
+			BUG_ON(is_data && refs_to_drop !=
+			       extent_data_ref_count(root, path, iref));
+			if (iref) {
+				BUG_ON(path->slots[0] != extent_slot);
+			} else {
+				BUG_ON(path->slots[0] != extent_slot + 1);
+				path->slots[0] = extent_slot;
+				num_to_del = 2;
+			}
 		}
 
-		/* block accounting for super block */
-		spin_lock(&info->delalloc_lock);
-		super_used = btrfs_super_bytes_used(&info->super_copy);
-		btrfs_set_super_bytes_used(&info->super_copy,
-					   super_used - num_bytes);
-
-		/* block accounting for root item */
-		root_used = btrfs_root_used(&root->root_item);
-		btrfs_set_root_used(&root->root_item,
-					   root_used - num_bytes);
-		spin_unlock(&info->delalloc_lock);
-
+		ret = pin_down_bytes(trans, root, path, bytenr,
+				     num_bytes, is_data, &must_clean);
+		if (ret > 0)
+			mark_free = 1;
+		BUG_ON(ret < 0);
 		/*
 		 * it is going to be very rare for someone to be waiting
 		 * on the block we're freeing.  del_items might need to
@@ -2403,7 +3311,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 			free_extent_buffer(must_clean);
 		}
 
-		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
+		if (is_data) {
 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
 			BUG_ON(ret);
 		} else {
@@ -2420,34 +3328,6 @@ static int __free_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-/*
- * remove an extent from the root, returns 0 on success
- */
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-					struct btrfs_root *root,
-					u64 bytenr, u64 num_bytes, u64 parent,
-					u64 root_objectid, u64 ref_generation,
-					u64 owner_objectid, int pin,
-					int refs_to_drop)
-{
-	WARN_ON(num_bytes < root->sectorsize);
-
-	/*
-	 * if metadata always pin
-	 * if data pin when any transaction has committed this
-	 */
-	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
-	    ref_generation != trans->transid)
-		pin = 1;
-
-	if (ref_generation != trans->transid)
-		pin = 1;
-
-	return __free_extent(trans, root, bytenr, num_bytes, parent,
-			    root_objectid, ref_generation,
-			    owner_objectid, pin, pin == 0, refs_to_drop);
-}
-
 /*
  * when we free an extent, it is possible (and likely) that we free the last
  * delayed ref for that extent as well.  This searches the delayed ref tree for
@@ -2479,6 +3359,13 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	if (ref->bytenr == bytenr)
 		goto out;
 
+	if (head->extent_op) {
+		if (!head->must_insert_reserved)
+			goto out;
+		kfree(head->extent_op);
+		head->extent_op = NULL;
+	}
+
 	/*
 	 * waiting for the lock here would deadlock.  If someone else has it
 	 * locked they are already in the process of dropping it anyway
@@ -2507,7 +3394,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 	spin_unlock(&delayed_refs->lock);
 
 	ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
-				  &head->node, head->must_insert_reserved);
+				  &head->node, head->extent_op,
+				  head->must_insert_reserved);
 	BUG_ON(ret);
 	btrfs_put_delayed_ref(&head->node);
 	return 0;
@@ -2519,32 +3407,32 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      u64 bytenr, u64 num_bytes, u64 parent,
-		      u64 root_objectid, u64 ref_generation,
-		      u64 owner_objectid, int pin)
+		      u64 root_objectid, u64 owner, u64 offset)
 {
 	int ret;
 
 	/*
 	 * tree log blocks never actually go into the extent allocation
 	 * tree, just update pinning info and exit early.
-	 *
-	 * data extents referenced by the tree log do need to have
-	 * their reference counts bumped.
 	 */
-	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
-	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+	if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
+		WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
 		/* unlocks the pinned mutex */
 		btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
 		update_reserved_extents(root, bytenr, num_bytes, 0);
 		ret = 0;
-	} else {
-		ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
-				       root_objectid, ref_generation,
-				       owner_objectid,
-				       BTRFS_DROP_DELAYED_REF, 1);
+	} else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
+		ret = btrfs_add_delayed_tree_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, (int)owner,
+					BTRFS_DROP_DELAYED_REF, NULL);
 		BUG_ON(ret);
 		ret = check_ref_cleanup(trans, root, bytenr);
 		BUG_ON(ret);
+	} else {
+		ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
+					parent, root_objectid, owner,
+					offset, BTRFS_DROP_DELAYED_REF, NULL);
+		BUG_ON(ret);
 	}
 	return ret;
 }
@@ -2969,99 +3857,147 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-					 struct btrfs_root *root, u64 parent,
-					 u64 root_objectid, u64 ref_generation,
-					 u64 owner, struct btrfs_key *ins,
-					 int ref_mod)
+static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *root,
+				      u64 parent, u64 root_objectid,
+				      u64 flags, u64 owner, u64 offset,
+				      struct btrfs_key *ins, int ref_mod)
 {
 	int ret;
-	u64 super_used;
-	u64 root_used;
-	u64 num_bytes = ins->offset;
-	u32 sizes[2];
-	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_root *extent_root = info->extent_root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_extent_item *extent_item;
-	struct btrfs_extent_ref *ref;
+	struct btrfs_extent_inline_ref *iref;
 	struct btrfs_path *path;
-	struct btrfs_key keys[2];
-
-	if (parent == 0)
-		parent = ins->objectid;
-
-	/* block accounting for super block */
-	spin_lock(&info->delalloc_lock);
-	super_used = btrfs_super_bytes_used(&info->super_copy);
-	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
+	struct extent_buffer *leaf;
+	int type;
+	u32 size;
 
-	/* block accounting for root item */
-	root_used = btrfs_root_used(&root->root_item);
-	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
-	spin_unlock(&info->delalloc_lock);
+	if (parent > 0)
+		type = BTRFS_SHARED_DATA_REF_KEY;
+	else
+		type = BTRFS_EXTENT_DATA_REF_KEY;
 
-	memcpy(&keys[0], ins, sizeof(*ins));
-	keys[1].objectid = ins->objectid;
-	keys[1].type = BTRFS_EXTENT_REF_KEY;
-	keys[1].offset = parent;
-	sizes[0] = sizeof(*extent_item);
-	sizes[1] = sizeof(*ref);
+	size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
 	path->leave_spinning = 1;
-	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
-				       sizes, 2);
+	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+				      ins, size);
 	BUG_ON(ret);
 
-	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
 				     struct btrfs_extent_item);
-	btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
-	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
-			     struct btrfs_extent_ref);
-
-	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
-	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
-	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-	btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
+	btrfs_set_extent_refs(leaf, extent_item, ref_mod);
+	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_extent_flags(leaf, extent_item,
+			       flags | BTRFS_EXTENT_FLAG_DATA);
+
+	iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
+	btrfs_set_extent_inline_ref_type(leaf, iref, type);
+	if (parent > 0) {
+		struct btrfs_shared_data_ref *ref;
+		ref = (struct btrfs_shared_data_ref *)(iref + 1);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+		btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
+	} else {
+		struct btrfs_extent_data_ref *ref;
+		ref = (struct btrfs_extent_data_ref *)(&iref->offset);
+		btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
+		btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
+		btrfs_set_extent_data_ref_offset(leaf, ref, offset);
+		btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
+	}
 
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-
-	trans->alloc_exclude_start = 0;
-	trans->alloc_exclude_nr = 0;
 	btrfs_free_path(path);
 
-	if (ret)
-		goto out;
-
-	ret = update_block_group(trans, root, ins->objectid,
-				 ins->offset, 1, 0);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset,
+				 1, 0);
 	if (ret) {
 		printk(KERN_ERR "btrfs update block group failed for %llu "
 		       "%llu\n", (unsigned long long)ins->objectid,
 		       (unsigned long long)ins->offset);
 		BUG();
 	}
-out:
 	return ret;
 }
 
-int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins)
+static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 parent, u64 root_objectid,
+				     u64 flags, struct btrfs_disk_key *key,
+				     int level, struct btrfs_key *ins)
 {
 	int ret;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_tree_block_info *block_info;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
 
-	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
-		return 0;
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
 
-	ret = btrfs_add_delayed_ref(trans, ins->objectid,
-				    ins->offset, parent, root_objectid,
-				    ref_generation, owner,
-				    BTRFS_ADD_DELAYED_EXTENT, 0);
+	path->leave_spinning = 1;
+	ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
+				      ins, size);
 	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	extent_item = btrfs_item_ptr(leaf, path->slots[0],
+				     struct btrfs_extent_item);
+	btrfs_set_extent_refs(leaf, extent_item, 1);
+	btrfs_set_extent_generation(leaf, extent_item, trans->transid);
+	btrfs_set_extent_flags(leaf, extent_item,
+			       flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
+	block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
+
+	btrfs_set_tree_block_key(leaf, block_info, key);
+	btrfs_set_tree_block_level(leaf, block_info, level);
+
+	iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
+	if (parent > 0) {
+		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_SHARED_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
+	} else {
+		btrfs_set_extent_inline_ref_type(leaf, iref,
+						 BTRFS_TREE_BLOCK_REF_KEY);
+		btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
+	}
+
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_free_path(path);
+
+	ret = update_block_group(trans, root, ins->objectid, ins->offset,
+				 1, 0);
+	if (ret) {
+		printk(KERN_ERR "btrfs update block group failed for %llu "
+		       "%llu\n", (unsigned long long)ins->objectid,
+		       (unsigned long long)ins->offset);
+		BUG();
+	}
+	return ret;
+}
+
+int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
+				     struct btrfs_root *root,
+				     u64 root_objectid, u64 owner,
+				     u64 offset, struct btrfs_key *ins)
+{
+	int ret;
+
+	BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
+
+	ret = btrfs_add_delayed_data_ref(trans, ins->objectid, ins->offset,
+					 0, root_objectid, owner, offset,
+					 BTRFS_ADD_DELAYED_EXTENT, NULL);
 	return ret;
 }
 
@@ -3070,10 +4006,10 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
  * an extent has been allocated and makes sure to clear the free
  * space cache bits as well
  */
-int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root, u64 parent,
-				u64 root_objectid, u64 ref_generation,
-				u64 owner, struct btrfs_key *ins)
+int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   u64 root_objectid, u64 owner, u64 offset,
+				   struct btrfs_key *ins)
 {
 	int ret;
 	struct btrfs_block_group_cache *block_group;
@@ -3087,8 +4023,8 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
 				      ins->offset);
 	BUG_ON(ret);
 	btrfs_put_block_group(block_group);
-	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-					    ref_generation, owner, ins, 1);
+	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
+					 0, owner, offset, ins, 1);
 	return ret;
 }
 
@@ -3099,26 +4035,48 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
  *
  * returns 0 if everything worked, non-zero otherwise.
  */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 parent, u64 min_alloc_size,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data)
+static int alloc_tree_block(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root,
+			    u64 num_bytes, u64 parent, u64 root_objectid,
+			    struct btrfs_disk_key *key, int level,
+			    u64 empty_size, u64 hint_byte, u64 search_end,
+			    struct btrfs_key *ins)
 {
 	int ret;
-	ret = __btrfs_reserve_extent(trans, root, num_bytes,
-				     min_alloc_size, empty_size, hint_byte,
-				     search_end, ins, data);
+	u64 flags = 0;
+
+	ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
+				     empty_size, hint_byte, search_end,
+				     ins, 0);
 	BUG_ON(ret);
+
+	if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		if (parent == 0)
+			parent = ins->objectid;
+		flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+	} else
+		BUG_ON(parent > 0);
+
+	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-		ret = btrfs_add_delayed_ref(trans, ins->objectid,
-					    ins->offset, parent, root_objectid,
-					    ref_generation, owner_objectid,
-					    BTRFS_ADD_DELAYED_EXTENT, 0);
+		struct btrfs_delayed_extent_op *extent_op;
+		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+		BUG_ON(!extent_op);
+		if (key)
+			memcpy(&extent_op->key, key, sizeof(extent_op->key));
+		else
+			memset(&extent_op->key, 0, sizeof(extent_op->key));
+		extent_op->flags_to_set = flags;
+		extent_op->update_key = 1;
+		extent_op->update_flags = 1;
+		extent_op->is_data = 0;
+
+		ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
+					ins->offset, parent, root_objectid,
+					level, BTRFS_ADD_DELAYED_EXTENT,
+					extent_op);
 		BUG_ON(ret);
 	}
-	update_reserved_extents(root, ins->objectid, ins->offset, 1);
 	return ret;
 }
 
@@ -3157,21 +4115,17 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
  * returns the tree buffer or NULL.
  */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
-					     struct btrfs_root *root,
-					     u32 blocksize, u64 parent,
-					     u64 root_objectid,
-					     u64 ref_generation,
-					     int level,
-					     u64 hint,
-					     u64 empty_size)
+					struct btrfs_root *root, u32 blocksize,
+					u64 parent, u64 root_objectid,
+					struct btrfs_disk_key *key, int level,
+					u64 hint, u64 empty_size)
 {
 	struct btrfs_key ins;
 	int ret;
 	struct extent_buffer *buf;
 
-	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
-				 root_objectid, ref_generation, level,
-				 empty_size, hint, (u64)-1, &ins, 0);
+	ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
+			       key, level, empty_size, hint, (u64)-1, &ins);
 	if (ret) {
 		BUG_ON(ret > 0);
 		return ERR_PTR(ret);
@@ -3185,32 +4139,19 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root, struct extent_buffer *leaf)
 {
-	u64 leaf_owner;
-	u64 leaf_generation;
-	struct refsort *sorted;
+	u64 disk_bytenr;
+	u64 num_bytes;
 	struct btrfs_key key;
 	struct btrfs_file_extent_item *fi;
+	u32 nritems;
 	int i;
-	int nritems;
 	int ret;
-	int refi = 0;
-	int slot;
 
 	BUG_ON(!btrfs_is_leaf(leaf));
 	nritems = btrfs_header_nritems(leaf);
-	leaf_owner = btrfs_header_owner(leaf);
-	leaf_generation = btrfs_header_generation(leaf);
 
-	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
-	/* we do this loop twice.  The first time we build a list
-	 * of the extents we have a reference on, then we sort the list
-	 * by bytenr.  The second time around we actually do the
-	 * extent freeing.
-	 */
 	for (i = 0; i < nritems; i++) {
-		u64 disk_bytenr;
 		cond_resched();
-
 		btrfs_item_key_to_cpu(leaf, &key, i);
 
 		/* only extents have references, skip everything else */
@@ -3230,45 +4171,16 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
 		if (disk_bytenr == 0)
 			continue;
 
-		sorted[refi].bytenr = disk_bytenr;
-		sorted[refi].slot = i;
-		refi++;
-	}
-
-	if (refi == 0)
-		goto out;
-
-	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
-
-	for (i = 0; i < refi; i++) {
-		u64 disk_bytenr;
-
-		disk_bytenr = sorted[i].bytenr;
-		slot = sorted[i].slot;
-
-		cond_resched();
-
-		btrfs_item_key_to_cpu(leaf, &key, slot);
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
-			continue;
-
-		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
-
-		ret = btrfs_free_extent(trans, root, disk_bytenr,
-				btrfs_file_extent_disk_num_bytes(leaf, fi),
-				leaf->start, leaf_owner, leaf_generation,
-				key.objectid, 0);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		ret = btrfs_free_extent(trans, root, disk_bytenr, num_bytes,
+					leaf->start, 0, key.objectid, 0);
 		BUG_ON(ret);
-
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
-		cond_resched();
 	}
-out:
-	kfree(sorted);
 	return 0;
 }
 
+#if 0
+
 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 					struct btrfs_root *root,
 					struct btrfs_leaf_ref *ref)
@@ -3311,13 +4223,14 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+
 static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, u64 start,
 				     u64 len, u32 *refs)
 {
 	int ret;
 
-	ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
+	ret = btrfs_lookup_extent_refs(trans, root, start, len, refs);
 	BUG_ON(ret);
 
 #if 0 /* some debugging code in case we see problems here */
@@ -3352,6 +4265,7 @@ static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+
 /*
  * this is used while deleting old snapshots, and it drops the refs
  * on a whole subtree starting from a level 1 node.
@@ -3645,32 +4559,36 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
 	cond_resched();
 	return 0;
 }
+#endif
 
 /*
  * helper function for drop_subtree, this function is similar to
  * walk_down_tree. The main difference is that it checks reference
  * counts while tree blocks are locked.
  */
-static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path, int *level)
+static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
+				   struct btrfs_root *root,
+				   struct btrfs_path *path, int *level)
 {
 	struct extent_buffer *next;
 	struct extent_buffer *cur;
 	struct extent_buffer *parent;
 	u64 bytenr;
 	u64 ptr_gen;
+	u64 refs;
+	u64 flags;
 	u32 blocksize;
-	u32 refs;
 	int ret;
 
 	cur = path->nodes[*level];
-	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
-				      &refs);
+	ret = btrfs_lookup_extent_info(trans, root, cur->start, cur->len,
+				       &refs, &flags);
 	BUG_ON(ret);
 	if (refs > 1)
 		goto out;
 
+	BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
 	while (*level >= 0) {
 		cur = path->nodes[*level];
 		if (*level == 0) {
@@ -3692,16 +4610,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(next);
 		btrfs_set_lock_blocking(next);
 
-		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
-					      &refs);
+		ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
+					       &refs, &flags);
 		BUG_ON(ret);
 		if (refs > 1) {
 			parent = path->nodes[*level];
 			ret = btrfs_free_extent(trans, root, bytenr,
-					blocksize, parent->start,
-					btrfs_header_owner(parent),
-					btrfs_header_generation(parent),
-					*level - 1, 1);
+						blocksize, parent->start,
+						btrfs_header_owner(parent),
+						*level - 1, 0);
 			BUG_ON(ret);
 			path->slots[*level]++;
 			btrfs_tree_unlock(next);
@@ -3709,6 +4626,8 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 			continue;
 		}
 
+		BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
+
 		*level = btrfs_header_level(next);
 		path->nodes[*level] = next;
 		path->slots[*level] = 0;
@@ -3716,13 +4635,15 @@ static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 out:
-	parent = path->nodes[*level + 1];
+	if (path->nodes[*level] == root->node)
+		parent = path->nodes[*level];
+	else
+		parent = path->nodes[*level + 1];
 	bytenr = path->nodes[*level]->start;
 	blocksize = path->nodes[*level]->len;
 
-	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
-			parent->start, btrfs_header_owner(parent),
-			btrfs_header_generation(parent), *level, 1);
+	ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent->start,
+				btrfs_header_owner(parent), *level, 0);
 	BUG_ON(ret);
 
 	if (path->locks[*level]) {
@@ -3746,8 +4667,6 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 				 struct btrfs_path *path,
 				 int *level, int max_level)
 {
-	u64 root_owner;
-	u64 root_gen;
 	struct btrfs_root_item *root_item = &root->root_item;
 	int i;
 	int slot;
@@ -3755,24 +4674,22 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 
 	for (i = *level; i < max_level && path->nodes[i]; i++) {
 		slot = path->slots[i];
-		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
-			struct extent_buffer *node;
-			struct btrfs_disk_key disk_key;
-
+		if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
 			/*
 			 * there is more work to do in this level.
 			 * Update the drop_progress marker to reflect
 			 * the work we've done so far, and then bump
 			 * the slot number
 			 */
-			node = path->nodes[i];
 			path->slots[i]++;
-			*level = i;
 			WARN_ON(*level == 0);
-			btrfs_node_key(node, &disk_key, path->slots[i]);
-			memcpy(&root_item->drop_progress,
-			       &disk_key, sizeof(disk_key));
-			root_item->drop_level = i;
+			if (max_level == BTRFS_MAX_LEVEL) {
+				btrfs_node_key(path->nodes[i],
+					       &root_item->drop_progress,
+					       path->slots[i]);
+				root_item->drop_level = i;
+			}
+			*level = i;
 			return 0;
 		} else {
 			struct extent_buffer *parent;
@@ -3786,22 +4703,20 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 			else
 				parent = path->nodes[*level + 1];
 
-			root_owner = btrfs_header_owner(parent);
-			root_gen = btrfs_header_generation(parent);
-
-			clean_tree_block(trans, root, path->nodes[*level]);
+			clean_tree_block(trans, root, path->nodes[i]);
 			ret = btrfs_free_extent(trans, root,
-						path->nodes[*level]->start,
-						path->nodes[*level]->len,
-						parent->start, root_owner,
-						root_gen, *level, 1);
+						path->nodes[i]->start,
+						path->nodes[i]->len,
+						parent->start,
+						btrfs_header_owner(parent),
+						*level, 0);
 			BUG_ON(ret);
 			if (path->locks[*level]) {
-				btrfs_tree_unlock(path->nodes[*level]);
-				path->locks[*level] = 0;
+				btrfs_tree_unlock(path->nodes[i]);
+				path->locks[i] = 0;
 			}
-			free_extent_buffer(path->nodes[*level]);
-			path->nodes[*level] = NULL;
+			free_extent_buffer(path->nodes[i]);
+			path->nodes[i] = NULL;
 			*level = i + 1;
 		}
 	}
@@ -3820,21 +4735,18 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int wret;
 	int level;
 	struct btrfs_path *path;
-	int i;
-	int orig_level;
 	int update_count;
 	struct btrfs_root_item *root_item = &root->root_item;
 
-	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 
 	level = btrfs_header_level(root->node);
-	orig_level = level;
 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
-		path->nodes[level] = root->node;
-		extent_buffer_get(root->node);
+		path->nodes[level] = btrfs_lock_root_node(root);
+		btrfs_set_lock_blocking(path->nodes[level]);
 		path->slots[level] = 0;
+		path->locks[level] = 1;
 	} else {
 		struct btrfs_key key;
 		struct btrfs_disk_key found_key;
@@ -3856,12 +4768,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 		 * unlock our path, this is safe because only this
 		 * function is allowed to delete this snapshot
 		 */
-		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
-			if (path->nodes[i] && path->locks[i]) {
-				path->locks[i] = 0;
-				btrfs_tree_unlock(path->nodes[i]);
-			}
-		}
+		btrfs_unlock_up_safe(path, 0);
 	}
 	while (1) {
 		unsigned long update;
@@ -3882,8 +4789,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 			ret = -EAGAIN;
 			break;
 		}
-		atomic_inc(&root->fs_info->throttle_gen);
-		wake_up(&root->fs_info->transaction_throttle);
 		for (update_count = 0; update_count < 16; update_count++) {
 			update = trans->delayed_ref_updates;
 			trans->delayed_ref_updates = 0;
@@ -3893,12 +4798,6 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
 				break;
 		}
 	}
-	for (i = 0; i <= orig_level; i++) {
-		if (path->nodes[i]) {
-			free_extent_buffer(path->nodes[i]);
-			path->nodes[i] = NULL;
-		}
-	}
 out:
 	btrfs_free_path(path);
 	return ret;
@@ -3931,7 +4830,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	path->slots[level] = 0;
 
 	while (1) {
-		wret = walk_down_subtree(trans, root, path, &level);
+		wret = walk_down_tree(trans, root, path, &level);
 		if (wret < 0)
 			ret = wret;
 		if (wret != 0)
@@ -3948,6 +4847,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+#if 0
 static unsigned long calc_ra(unsigned long start, unsigned long last,
 			     unsigned long nr)
 {
@@ -5429,6 +6329,7 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root,
 	kfree(ref_path);
 	return ret;
 }
+#endif
 
 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
 {
@@ -5477,7 +6378,8 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	u64 calc;
 
 	spin_lock(&shrink_block_group->lock);
-	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
+	if (btrfs_block_group_used(&shrink_block_group->item) +
+	    shrink_block_group->reserved > 0) {
 		spin_unlock(&shrink_block_group->lock);
 
 		trans = btrfs_start_transaction(root, 1);
@@ -5502,6 +6404,17 @@ static int __alloc_chunk_for_shrink(struct btrfs_root *root,
 	return 0;
 }
 
+
+int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+					 struct btrfs_block_group_cache *group)
+
+{
+	__alloc_chunk_for_shrink(root, group, 1);
+	set_block_group_readonly(group);
+	return 0;
+}
+
+#if 0
 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
 				 u64 objectid, u64 size)
@@ -5781,6 +6694,7 @@ int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
 	btrfs_free_path(path);
 	return ret;
 }
+#endif
 
 static int find_first_block_group(struct btrfs_root *root,
 		struct btrfs_path *path, struct btrfs_key *key)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 1d51dc38bb497..0726a734ee388 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -291,16 +291,12 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 {
 	u64 extent_end = 0;
 	u64 search_start = start;
-	u64 leaf_start;
 	u64 ram_bytes = 0;
-	u64 orig_parent = 0;
 	u64 disk_bytenr = 0;
 	u64 orig_locked_end = locked_end;
 	u8 compression;
 	u8 encryption;
 	u16 other_encoding = 0;
-	u64 root_gen;
-	u64 root_owner;
 	struct extent_buffer *leaf;
 	struct btrfs_file_extent_item *extent;
 	struct btrfs_path *path;
@@ -340,9 +336,6 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		bookend = 0;
 		found_extent = 0;
 		found_inline = 0;
-		leaf_start = 0;
-		root_gen = 0;
-		root_owner = 0;
 		compression = 0;
 		encryption = 0;
 		extent = NULL;
@@ -417,9 +410,6 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		if (found_extent) {
 			read_extent_buffer(leaf, &old, (unsigned long)extent,
 					   sizeof(old));
-			root_gen = btrfs_header_generation(leaf);
-			root_owner = btrfs_header_owner(leaf);
-			leaf_start = leaf->start;
 		}
 
 		if (end < extent_end && end >= key.offset) {
@@ -443,14 +433,14 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 				}
 				locked_end = extent_end;
 			}
-			orig_parent = path->nodes[0]->start;
 			disk_bytenr = le64_to_cpu(old.disk_bytenr);
 			if (disk_bytenr != 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 					   disk_bytenr,
-					   le64_to_cpu(old.disk_num_bytes),
-					   orig_parent, root->root_key.objectid,
-					   trans->transid, inode->i_ino);
+					   le64_to_cpu(old.disk_num_bytes), 0,
+					   root->root_key.objectid,
+					   key.objectid, key.offset -
+					   le64_to_cpu(old.offset));
 				BUG_ON(ret);
 			}
 		}
@@ -568,17 +558,6 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 			btrfs_mark_buffer_dirty(path->nodes[0]);
 			btrfs_set_lock_blocking(path->nodes[0]);
 
-			if (disk_bytenr != 0) {
-				ret = btrfs_update_extent_ref(trans, root,
-						disk_bytenr,
-						le64_to_cpu(old.disk_num_bytes),
-						orig_parent,
-						leaf->start,
-						root->root_key.objectid,
-						trans->transid, ins.objectid);
-
-				BUG_ON(ret);
-			}
 			path->leave_spinning = 0;
 			btrfs_release_path(root, path);
 			if (disk_bytenr != 0)
@@ -594,8 +573,9 @@ noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 				ret = btrfs_free_extent(trans, root,
 						old_disk_bytenr,
 						le64_to_cpu(old.disk_num_bytes),
-						leaf_start, root_owner,
-						root_gen, key.objectid, 0);
+						0, root->root_key.objectid,
+						key.objectid, key.offset -
+						le64_to_cpu(old.offset));
 				BUG_ON(ret);
 				*hint_byte = old_disk_bytenr;
 			}
@@ -664,12 +644,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	u64 bytenr;
 	u64 num_bytes;
 	u64 extent_end;
-	u64 extent_offset;
+	u64 orig_offset;
 	u64 other_start;
 	u64 other_end;
 	u64 split = start;
 	u64 locked_end = end;
-	u64 orig_parent;
 	int extent_type;
 	int split_end = 1;
 	int ret;
@@ -703,7 +682,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 
 	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
-	extent_offset = btrfs_file_extent_offset(leaf, fi);
+	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 
 	if (key.offset == start)
 		split = end;
@@ -711,8 +690,6 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	if (key.offset == start && extent_end == end) {
 		int del_nr = 0;
 		int del_slot = 0;
-		u64 leaf_owner = btrfs_header_owner(leaf);
-		u64 leaf_gen = btrfs_header_generation(leaf);
 		other_start = end;
 		other_end = 0;
 		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
@@ -721,8 +698,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			del_slot = path->slots[0] + 1;
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-						leaf->start, leaf_owner,
-						leaf_gen, inode->i_ino, 0);
+						0, root->root_key.objectid,
+						inode->i_ino, orig_offset);
 			BUG_ON(ret);
 		}
 		other_start = 0;
@@ -733,8 +710,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			del_slot = path->slots[0];
 			del_nr++;
 			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
-						leaf->start, leaf_owner,
-						leaf_gen, inode->i_ino, 0);
+						0, root->root_key.objectid,
+						inode->i_ino, orig_offset);
 			BUG_ON(ret);
 		}
 		split_end = 0;
@@ -768,13 +745,12 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 			locked_end = extent_end;
 		}
 		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
-		extent_offset += split - key.offset;
 	} else  {
 		BUG_ON(key.offset != start);
-		btrfs_set_file_extent_offset(leaf, fi, extent_offset +
-					     split - key.offset);
-		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
 		key.offset = split;
+		btrfs_set_file_extent_offset(leaf, fi, key.offset -
+					     orig_offset);
+		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
 		btrfs_set_item_key_safe(trans, root, path, &key);
 		extent_end = split;
 	}
@@ -793,7 +769,8 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 					    struct btrfs_file_extent_item);
 			key.offset = split;
 			btrfs_set_item_key_safe(trans, root, path, &key);
-			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+			btrfs_set_file_extent_offset(leaf, fi, key.offset -
+						     orig_offset);
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							other_end - split);
 			goto done;
@@ -815,10 +792,9 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 
 	btrfs_mark_buffer_dirty(leaf);
 
-	orig_parent = leaf->start;
-	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
-				   orig_parent, root->root_key.objectid,
-				   trans->transid, inode->i_ino);
+	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
+				   root->root_key.objectid,
+				   inode->i_ino, orig_offset);
 	BUG_ON(ret);
 	btrfs_release_path(root, path);
 
@@ -833,20 +809,12 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 	btrfs_set_file_extent_type(leaf, fi, extent_type);
 	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
-	btrfs_set_file_extent_offset(leaf, fi, extent_offset);
+	btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
 	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
 	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
 	btrfs_set_file_extent_compression(leaf, fi, 0);
 	btrfs_set_file_extent_encryption(leaf, fi, 0);
 	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
-
-	if (orig_parent != leaf->start) {
-		ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
-					      orig_parent, leaf->start,
-					      root->root_key.objectid,
-					      trans->transid, inode->i_ino);
-		BUG_ON(ret);
-	}
 done:
 	btrfs_mark_buffer_dirty(leaf);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1c8b0190d0317..917bf10597c60 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -48,7 +48,6 @@
 #include "ordered-data.h"
 #include "xattr.h"
 #include "tree-log.h"
-#include "ref-cache.h"
 #include "compression.h"
 #include "locking.h"
 
@@ -944,6 +943,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 	u64 cow_start;
 	u64 cur_offset;
 	u64 extent_end;
+	u64 extent_offset;
 	u64 disk_bytenr;
 	u64 num_bytes;
 	int extent_type;
@@ -1005,6 +1005,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+			extent_offset = btrfs_file_extent_offset(leaf, fi);
 			extent_end = found_key.offset +
 				btrfs_file_extent_num_bytes(leaf, fi);
 			if (extent_end <= start) {
@@ -1022,9 +1023,10 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 			if (btrfs_extent_readonly(root, disk_bytenr))
 				goto out_check;
 			if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
-						  disk_bytenr))
+						  found_key.offset -
+						  extent_offset, disk_bytenr))
 				goto out_check;
-			disk_bytenr += btrfs_file_extent_offset(leaf, fi);
+			disk_bytenr += extent_offset;
 			disk_bytenr += cur_offset - found_key.offset;
 			num_bytes = min(end + 1, extent_end) - cur_offset;
 			/*
@@ -1489,9 +1491,9 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ins.objectid = disk_bytenr;
 	ins.offset = disk_num_bytes;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
-	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
-					  root->root_key.objectid,
-					  trans->transid, inode->i_ino, &ins);
+	ret = btrfs_alloc_reserved_file_extent(trans, root,
+					root->root_key.objectid,
+					inode->i_ino, file_pos, &ins);
 	BUG_ON(ret);
 	btrfs_free_path(path);
 
@@ -1956,23 +1958,13 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * crossing root thing.  we store the inode number in the
 		 * offset of the orphan item.
 		 */
-		inode = btrfs_iget_locked(root->fs_info->sb,
-					  found_key.offset, root);
-		if (!inode)
+		found_key.objectid = found_key.offset;
+		found_key.type = BTRFS_INODE_ITEM_KEY;
+		found_key.offset = 0;
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		if (IS_ERR(inode))
 			break;
 
-		if (inode->i_state & I_NEW) {
-			BTRFS_I(inode)->root = root;
-
-			/* have to set the location manually */
-			BTRFS_I(inode)->location.objectid = inode->i_ino;
-			BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-			BTRFS_I(inode)->location.offset = 0;
-
-			btrfs_read_locked_inode(inode);
-			unlock_new_inode(inode);
-		}
-
 		/*
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
@@ -2069,7 +2061,7 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 /*
  * read an inode from the btree into the in-memory inode
  */
-void btrfs_read_locked_inode(struct inode *inode)
+static void btrfs_read_locked_inode(struct inode *inode)
 {
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -2599,9 +2591,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	struct btrfs_file_extent_item *fi;
 	u64 extent_start = 0;
 	u64 extent_num_bytes = 0;
+	u64 extent_offset = 0;
 	u64 item_end = 0;
-	u64 root_gen = 0;
-	u64 root_owner = 0;
 	int found_extent;
 	int del_item;
 	int pending_del_nr = 0;
@@ -2716,6 +2707,9 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				extent_num_bytes =
 					btrfs_file_extent_disk_num_bytes(leaf,
 									 fi);
+				extent_offset = found_key.offset -
+					btrfs_file_extent_offset(leaf, fi);
+
 				/* FIXME blocksize != 4096 */
 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
 				if (extent_start != 0) {
@@ -2723,8 +2717,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 					if (root->ref_cows)
 						inode_sub_bytes(inode, num_dec);
 				}
-				root_gen = btrfs_header_generation(leaf);
-				root_owner = btrfs_header_owner(leaf);
 			}
 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 			/*
@@ -2768,12 +2760,12 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 		} else {
 			break;
 		}
-		if (found_extent) {
+		if (found_extent && root->ref_cows) {
 			btrfs_set_path_blocking(path);
 			ret = btrfs_free_extent(trans, root, extent_start,
-						extent_num_bytes,
-						leaf->start, root_owner,
-						root_gen, inode->i_ino, 0);
+						extent_num_bytes, 0,
+						btrfs_header_owner(leaf),
+						inode->i_ino, extent_offset);
 			BUG_ON(ret);
 		}
 next:
@@ -3105,6 +3097,45 @@ static int fixup_tree_root_location(struct btrfs_root *root,
 	return 0;
 }
 
+static void inode_tree_add(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_inode *entry;
+	struct rb_node **p = &root->inode_tree.rb_node;
+	struct rb_node *parent = NULL;
+
+	spin_lock(&root->inode_lock);
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct btrfs_inode, rb_node);
+
+		if (inode->i_ino < entry->vfs_inode.i_ino)
+			p = &(*p)->rb_left;
+		else if (inode->i_ino > entry->vfs_inode.i_ino)
+			p = &(*p)->rb_right;
+		else {
+			WARN_ON(!(entry->vfs_inode.i_state &
+				  (I_WILL_FREE | I_FREEING | I_CLEAR)));
+			break;
+		}
+	}
+	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
+	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+	spin_unlock(&root->inode_lock);
+}
+
+static void inode_tree_del(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+
+	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
+		spin_lock(&root->inode_lock);
+		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
+		spin_unlock(&root->inode_lock);
+		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
+	}
+}
+
 static noinline void init_btrfs_i(struct inode *inode)
 {
 	struct btrfs_inode *bi = BTRFS_I(inode);
@@ -3130,6 +3161,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 			     inode->i_mapping, GFP_NOFS);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
 	INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
+	RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
 	btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
 	mutex_init(&BTRFS_I(inode)->extent_mutex);
 	mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3152,26 +3184,9 @@ static int btrfs_find_actor(struct inode *inode, void *opaque)
 		args->root == BTRFS_I(inode)->root;
 }
 
-struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
-			    struct btrfs_root *root, int wait)
-{
-	struct inode *inode;
-	struct btrfs_iget_args args;
-	args.ino = objectid;
-	args.root = root;
-
-	if (wait) {
-		inode = ilookup5(s, objectid, btrfs_find_actor,
-				 (void *)&args);
-	} else {
-		inode = ilookup5_nowait(s, objectid, btrfs_find_actor,
-					(void *)&args);
-	}
-	return inode;
-}
-
-struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
-				struct btrfs_root *root)
+static struct inode *btrfs_iget_locked(struct super_block *s,
+				       u64 objectid,
+				       struct btrfs_root *root)
 {
 	struct inode *inode;
 	struct btrfs_iget_args args;
@@ -3188,24 +3203,21 @@ struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root, int *is_new)
+			 struct btrfs_root *root)
 {
 	struct inode *inode;
 
 	inode = btrfs_iget_locked(s, location->objectid, root);
 	if (!inode)
-		return ERR_PTR(-EACCES);
+		return ERR_PTR(-ENOMEM);
 
 	if (inode->i_state & I_NEW) {
 		BTRFS_I(inode)->root = root;
 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
 		btrfs_read_locked_inode(inode);
+
+		inode_tree_add(inode);
 		unlock_new_inode(inode);
-		if (is_new)
-			*is_new = 1;
-	} else {
-		if (is_new)
-			*is_new = 0;
 	}
 
 	return inode;
@@ -3218,7 +3230,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 	struct btrfs_root *root = bi->root;
 	struct btrfs_root *sub_root = root;
 	struct btrfs_key location;
-	int ret, new;
+	int ret;
 
 	if (dentry->d_name.len > BTRFS_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
@@ -3236,7 +3248,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
 			return ERR_PTR(ret);
 		if (ret > 0)
 			return ERR_PTR(-ENOENT);
-		inode = btrfs_iget(dir->i_sb, &location, sub_root, &new);
+		inode = btrfs_iget(dir->i_sb, &location, sub_root);
 		if (IS_ERR(inode))
 			return ERR_CAST(inode);
 	}
@@ -3631,6 +3643,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
 	insert_inode_hash(inode);
+	inode_tree_add(inode);
 	return inode;
 fail:
 	if (dir)
@@ -4683,6 +4696,7 @@ void btrfs_destroy_inode(struct inode *inode)
 			btrfs_put_ordered_extent(ordered);
 		}
 	}
+	inode_tree_del(inode);
 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2624b53ea7830..54dfd45cc5917 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -82,22 +82,25 @@ static noinline int create_subvol(struct btrfs_root *root,
 	if (ret)
 		goto fail;
 
-	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
-				      objectid, trans->transid, 0, 0, 0);
+	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
+				      0, objectid, NULL, 0, 0, 0);
 	if (IS_ERR(leaf)) {
 		ret = PTR_ERR(leaf);
 		goto fail;
 	}
 
-	btrfs_set_header_nritems(leaf, 0);
-	btrfs_set_header_level(leaf, 0);
+	memset_extent_buffer(leaf, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(leaf, leaf->start);
 	btrfs_set_header_generation(leaf, trans->transid);
+	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(leaf, objectid);
 
 	write_extent_buffer(leaf, root->fs_info->fsid,
 			    (unsigned long)btrfs_header_fsid(leaf),
 			    BTRFS_FSID_SIZE);
+	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
+			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
 	inode_item = &root_item.inode;
@@ -125,7 +128,7 @@ static noinline int create_subvol(struct btrfs_root *root,
 	btrfs_set_root_dirid(&root_item, new_dirid);
 
 	key.objectid = objectid;
-	key.offset = 1;
+	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&root_item);
@@ -911,10 +914,10 @@ static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 				if (disko) {
 					inode_add_bytes(inode, datal);
 					ret = btrfs_inc_extent_ref(trans, root,
-						   disko, diskl, leaf->start,
-						   root->root_key.objectid,
-						   trans->transid,
-						   inode->i_ino);
+							disko, diskl, 0,
+							root->root_key.objectid,
+							inode->i_ino,
+							new_key.offset - datao);
 					BUG_ON(ret);
 				}
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 5f8f218c10057..6d6523da0a30e 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -45,22 +45,132 @@ static void print_dev_item(struct extent_buffer *eb,
 	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
 	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
 }
+static void print_extent_data_ref(struct extent_buffer *eb,
+				  struct btrfs_extent_data_ref *ref)
+{
+	printk(KERN_INFO "\t\textent data backref root %llu "
+	       "objectid %llu offset %llu count %u\n",
+	       (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
+	       (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
+	       (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
+	       btrfs_extent_data_ref_count(eb, ref));
+}
+
+static void print_extent_item(struct extent_buffer *eb, int slot)
+{
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *iref;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_shared_data_ref *sref;
+	struct btrfs_disk_key key;
+	unsigned long end;
+	unsigned long ptr;
+	int type;
+	u32 item_size = btrfs_item_size_nr(eb, slot);
+	u64 flags;
+	u64 offset;
+
+	if (item_size < sizeof(*ei)) {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		struct btrfs_extent_item_v0 *ei0;
+		BUG_ON(item_size != sizeof(*ei0));
+		ei0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_item_v0);
+		printk(KERN_INFO "\t\textent refs %u\n",
+		       btrfs_extent_refs_v0(eb, ei0));
+		return;
+#else
+		BUG();
+#endif
+	}
+
+	ei = btrfs_item_ptr(eb, slot, struct btrfs_extent_item);
+	flags = btrfs_extent_flags(eb, ei);
+
+	printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
+	       (unsigned long long)btrfs_extent_refs(eb, ei),
+	       (unsigned long long)btrfs_extent_generation(eb, ei),
+	       (unsigned long long)flags);
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		struct btrfs_tree_block_info *info;
+		info = (struct btrfs_tree_block_info *)(ei + 1);
+		btrfs_tree_block_key(eb, info, &key);
+		printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
+		       "level %d\n",
+		       (unsigned long long)btrfs_disk_key_objectid(&key),
+		       key.type,
+		       (unsigned long long)btrfs_disk_key_offset(&key),
+		       btrfs_tree_block_level(eb, info));
+		iref = (struct btrfs_extent_inline_ref *)(info + 1);
+	} else {
+		iref = (struct btrfs_extent_inline_ref *)(ei + 1);
+	}
+
+	ptr = (unsigned long)iref;
+	end = (unsigned long)ei + item_size;
+	while (ptr < end) {
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		type = btrfs_extent_inline_ref_type(eb, iref);
+		offset = btrfs_extent_inline_ref_offset(eb, iref);
+		switch (type) {
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\ttree block backref "
+				"root %llu\n", (unsigned long long)offset);
+			break;
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\tshared block backref "
+				"parent %llu\n", (unsigned long long)offset);
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			print_extent_data_ref(eb, dref);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = (struct btrfs_shared_data_ref *)(iref + 1);
+			printk(KERN_INFO "\t\tshared data backref "
+			       "parent %llu count %u\n",
+			       (unsigned long long)offset,
+			       btrfs_shared_data_ref_count(eb, sref));
+			break;
+		default:
+			BUG();
+		}
+		ptr += btrfs_extent_inline_ref_size(type);
+	}
+	WARN_ON(ptr > end);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
+{
+	struct btrfs_extent_ref_v0 *ref0;
+
+	ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
+	printk("\t\textent back ref root %llu gen %llu "
+		"owner %llu num_refs %lu\n",
+		(unsigned long long)btrfs_ref_root_v0(eb, ref0),
+		(unsigned long long)btrfs_ref_generation_v0(eb, ref0),
+		(unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
+		(unsigned long)btrfs_ref_count_v0(eb, ref0));
+}
+#endif
+
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
+	u32 type;
 	u32 nr = btrfs_header_nritems(l);
 	struct btrfs_item *item;
-	struct btrfs_extent_item *ei;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
 	struct btrfs_inode_item *ii;
 	struct btrfs_block_group_item *bi;
 	struct btrfs_file_extent_item *fi;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_shared_data_ref *sref;
+	struct btrfs_dev_extent *dev_extent;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_dev_extent *dev_extent;
-	u32 type;
 
 	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
@@ -100,20 +210,25 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 				btrfs_disk_root_refs(l, ri));
 			break;
 		case BTRFS_EXTENT_ITEM_KEY:
-			ei = btrfs_item_ptr(l, i, struct btrfs_extent_item);
-			printk(KERN_INFO "\t\textent data refs %u\n",
-				btrfs_extent_refs(l, ei));
-			break;
-		case BTRFS_EXTENT_REF_KEY:
-			ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref);
-			printk(KERN_INFO "\t\textent back ref root %llu "
-			       "gen %llu owner %llu num_refs %lu\n",
-			       (unsigned long long)btrfs_ref_root(l, ref),
-			       (unsigned long long)btrfs_ref_generation(l, ref),
-			       (unsigned long long)btrfs_ref_objectid(l, ref),
-			       (unsigned long)btrfs_ref_num_refs(l, ref));
+			print_extent_item(l, i);
+			break;
+		case BTRFS_TREE_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\ttree block backref\n");
+			break;
+		case BTRFS_SHARED_BLOCK_REF_KEY:
+			printk(KERN_INFO "\t\tshared block backref\n");
+			break;
+		case BTRFS_EXTENT_DATA_REF_KEY:
+			dref = btrfs_item_ptr(l, i,
+					      struct btrfs_extent_data_ref);
+			print_extent_data_ref(l, dref);
+			break;
+		case BTRFS_SHARED_DATA_REF_KEY:
+			sref = btrfs_item_ptr(l, i,
+					      struct btrfs_shared_data_ref);
+			printk(KERN_INFO "\t\tshared data backref count %u\n",
+			       btrfs_shared_data_ref_count(l, sref));
 			break;
-
 		case BTRFS_EXTENT_DATA_KEY:
 			fi = btrfs_item_ptr(l, i,
 					    struct btrfs_file_extent_item);
@@ -139,6 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			       (unsigned long long)
 			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
+		case BTRFS_EXTENT_REF_V0_KEY:
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			print_extent_ref_v0(l, i);
+#else
+			BUG();
+#endif
 		case BTRFS_BLOCK_GROUP_ITEM_KEY:
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
new file mode 100644
index 0000000000000..b23dc209ae103
--- /dev/null
+++ b/fs/btrfs/relocation.c
@@ -0,0 +1,3711 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "volumes.h"
+#include "locking.h"
+#include "btrfs_inode.h"
+#include "async-thread.h"
+
+/*
+ * backref_node, mapping_node and tree_block start with this
+ */
+struct tree_entry {
+	struct rb_node rb_node;
+	u64 bytenr;
+};
+
+/*
+ * present a tree block in the backref cache
+ */
+struct backref_node {
+	struct rb_node rb_node;
+	u64 bytenr;
+	/* objectid tree block owner */
+	u64 owner;
+	/* list of upper level blocks reference this block */
+	struct list_head upper;
+	/* list of child blocks in the cache */
+	struct list_head lower;
+	/* NULL if this node is not tree root */
+	struct btrfs_root *root;
+	/* extent buffer got by COW the block */
+	struct extent_buffer *eb;
+	/* level of tree block */
+	unsigned int level:8;
+	/* 1 if the block is root of old snapshot */
+	unsigned int old_root:1;
+	/* 1 if no child blocks in the cache */
+	unsigned int lowest:1;
+	/* is the extent buffer locked */
+	unsigned int locked:1;
+	/* has the block been processed */
+	unsigned int processed:1;
+	/* have backrefs of this block been checked */
+	unsigned int checked:1;
+};
+
+/*
+ * present a block pointer in the backref cache
+ */
+struct backref_edge {
+	struct list_head list[2];
+	struct backref_node *node[2];
+	u64 blockptr;
+};
+
+#define LOWER	0
+#define UPPER	1
+
+struct backref_cache {
+	/* red black tree of all backref nodes in the cache */
+	struct rb_root rb_root;
+	/* list of backref nodes with no child block in the cache */
+	struct list_head pending[BTRFS_MAX_LEVEL];
+	spinlock_t lock;
+};
+
+/*
+ * map address of tree root to tree
+ */
+struct mapping_node {
+	struct rb_node rb_node;
+	u64 bytenr;
+	void *data;
+};
+
+struct mapping_tree {
+	struct rb_root rb_root;
+	spinlock_t lock;
+};
+
+/*
+ * present a tree block to process
+ */
+struct tree_block {
+	struct rb_node rb_node;
+	u64 bytenr;
+	struct btrfs_key key;
+	unsigned int level:8;
+	unsigned int key_ready:1;
+};
+
+/* inode vector */
+#define INODEVEC_SIZE 16
+
+struct inodevec {
+	struct list_head list;
+	struct inode *inode[INODEVEC_SIZE];
+	int nr;
+};
+
+struct reloc_control {
+	/* block group to relocate */
+	struct btrfs_block_group_cache *block_group;
+	/* extent tree */
+	struct btrfs_root *extent_root;
+	/* inode for moving data */
+	struct inode *data_inode;
+	struct btrfs_workers workers;
+	/* tree blocks have been processed */
+	struct extent_io_tree processed_blocks;
+	/* map start of tree root to corresponding reloc tree */
+	struct mapping_tree reloc_root_tree;
+	/* list of reloc trees */
+	struct list_head reloc_roots;
+	u64 search_start;
+	u64 extents_found;
+	u64 extents_skipped;
+	int stage;
+	int create_reloc_root;
+	unsigned int found_file_extent:1;
+	unsigned int found_old_snapshot:1;
+};
+
+/* stages of data relocation */
+#define MOVE_DATA_EXTENTS	0
+#define UPDATE_DATA_PTRS	1
+
+/*
+ * merge reloc tree to corresponding fs tree in worker threads
+ */
+struct async_merge {
+	struct btrfs_work work;
+	struct reloc_control *rc;
+	struct btrfs_root *root;
+	struct completion *done;
+	atomic_t *num_pending;
+};
+
+static void mapping_tree_init(struct mapping_tree *tree)
+{
+	tree->rb_root.rb_node = NULL;
+	spin_lock_init(&tree->lock);
+}
+
+static void backref_cache_init(struct backref_cache *cache)
+{
+	int i;
+	cache->rb_root.rb_node = NULL;
+	for (i = 0; i < BTRFS_MAX_LEVEL; i++)
+		INIT_LIST_HEAD(&cache->pending[i]);
+	spin_lock_init(&cache->lock);
+}
+
+static void backref_node_init(struct backref_node *node)
+{
+	memset(node, 0, sizeof(*node));
+	INIT_LIST_HEAD(&node->upper);
+	INIT_LIST_HEAD(&node->lower);
+	RB_CLEAR_NODE(&node->rb_node);
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
+				   struct rb_node *node)
+{
+	struct rb_node **p = &root->rb_node;
+	struct rb_node *parent = NULL;
+	struct tree_entry *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		if (bytenr < entry->bytenr)
+			p = &(*p)->rb_left;
+		else if (bytenr > entry->bytenr)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
+{
+	struct rb_node *n = root->rb_node;
+	struct tree_entry *entry;
+
+	while (n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+
+		if (bytenr < entry->bytenr)
+			n = n->rb_left;
+		else if (bytenr > entry->bytenr)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	return NULL;
+}
+
+/*
+ * walk up backref nodes until reach node presents tree root
+ */
+static struct backref_node *walk_up_backref(struct backref_node *node,
+					    struct backref_edge *edges[],
+					    int *index)
+{
+	struct backref_edge *edge;
+	int idx = *index;
+
+	while (!list_empty(&node->upper)) {
+		edge = list_entry(node->upper.next,
+				  struct backref_edge, list[LOWER]);
+		edges[idx++] = edge;
+		node = edge->node[UPPER];
+	}
+	*index = idx;
+	return node;
+}
+
+/*
+ * walk down backref nodes to find start of next reference path
+ */
+static struct backref_node *walk_down_backref(struct backref_edge *edges[],
+					      int *index)
+{
+	struct backref_edge *edge;
+	struct backref_node *lower;
+	int idx = *index;
+
+	while (idx > 0) {
+		edge = edges[idx - 1];
+		lower = edge->node[LOWER];
+		if (list_is_last(&edge->list[LOWER], &lower->upper)) {
+			idx--;
+			continue;
+		}
+		edge = list_entry(edge->list[LOWER].next,
+				  struct backref_edge, list[LOWER]);
+		edges[idx - 1] = edge;
+		*index = idx;
+		return edge->node[UPPER];
+	}
+	*index = 0;
+	return NULL;
+}
+
+static void drop_node_buffer(struct backref_node *node)
+{
+	if (node->eb) {
+		if (node->locked) {
+			btrfs_tree_unlock(node->eb);
+			node->locked = 0;
+		}
+		free_extent_buffer(node->eb);
+		node->eb = NULL;
+	}
+}
+
+static void drop_backref_node(struct backref_cache *tree,
+			      struct backref_node *node)
+{
+	BUG_ON(!node->lowest);
+	BUG_ON(!list_empty(&node->upper));
+
+	drop_node_buffer(node);
+	list_del(&node->lower);
+
+	rb_erase(&node->rb_node, &tree->rb_root);
+	kfree(node);
+}
+
+/*
+ * remove a backref node from the backref cache
+ */
+static void remove_backref_node(struct backref_cache *cache,
+				struct backref_node *node)
+{
+	struct backref_node *upper;
+	struct backref_edge *edge;
+
+	if (!node)
+		return;
+
+	BUG_ON(!node->lowest);
+	while (!list_empty(&node->upper)) {
+		edge = list_entry(node->upper.next, struct backref_edge,
+				  list[LOWER]);
+		upper = edge->node[UPPER];
+		list_del(&edge->list[LOWER]);
+		list_del(&edge->list[UPPER]);
+		kfree(edge);
+		/*
+		 * add the node to pending list if no other
+		 * child block cached.
+		 */
+		if (list_empty(&upper->lower)) {
+			list_add_tail(&upper->lower,
+				      &cache->pending[upper->level]);
+			upper->lowest = 1;
+		}
+	}
+	drop_backref_node(cache, node);
+}
+
+/*
+ * find reloc tree by address of tree root
+ */
+static struct btrfs_root *find_reloc_root(struct reloc_control *rc,
+					  u64 bytenr)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node;
+	struct btrfs_root *root = NULL;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root, bytenr);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		root = (struct btrfs_root *)node->data;
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+	return root;
+}
+
+static int is_cowonly_root(u64 root_objectid)
+{
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
+	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
+	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
+	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
+	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
+static struct btrfs_root *read_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid)
+{
+	struct btrfs_key key;
+
+	key.objectid = root_objectid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	if (is_cowonly_root(root_objectid))
+		key.offset = 0;
+	else
+		key.offset = (u64)-1;
+
+	return btrfs_read_fs_root_no_name(fs_info, &key);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static noinline_for_stack
+struct btrfs_root *find_tree_root(struct reloc_control *rc,
+				  struct extent_buffer *leaf,
+				  struct btrfs_extent_ref_v0 *ref0)
+{
+	struct btrfs_root *root;
+	u64 root_objectid = btrfs_ref_root_v0(leaf, ref0);
+	u64 generation = btrfs_ref_generation_v0(leaf, ref0);
+
+	BUG_ON(root_objectid == BTRFS_TREE_RELOC_OBJECTID);
+
+	root = read_fs_root(rc->extent_root->fs_info, root_objectid);
+	BUG_ON(IS_ERR(root));
+
+	if (root->ref_cows &&
+	    generation != btrfs_root_generation(&root->root_item))
+		return NULL;
+
+	return root;
+}
+#endif
+
+static noinline_for_stack
+int find_inline_backref(struct extent_buffer *leaf, int slot,
+			unsigned long *ptr, unsigned long *end)
+{
+	struct btrfs_extent_item *ei;
+	struct btrfs_tree_block_info *bi;
+	u32 item_size;
+
+	item_size = btrfs_item_size_nr(leaf, slot);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (item_size < sizeof(*ei)) {
+		WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		return 1;
+	}
+#endif
+	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
+	WARN_ON(!(btrfs_extent_flags(leaf, ei) &
+		  BTRFS_EXTENT_FLAG_TREE_BLOCK));
+
+	if (item_size <= sizeof(*ei) + sizeof(*bi)) {
+		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
+		return 1;
+	}
+
+	bi = (struct btrfs_tree_block_info *)(ei + 1);
+	*ptr = (unsigned long)(bi + 1);
+	*end = (unsigned long)ei + item_size;
+	return 0;
+}
+
+/*
+ * build backref tree for a given tree block. root of the backref tree
+ * corresponds the tree block, leaves of the backref tree correspond
+ * roots of b-trees that reference the tree block.
+ *
+ * the basic idea of this function is check backrefs of a given block
+ * to find upper level blocks that refernece the block, and then check
+ * bakcrefs of these upper level blocks recursively. the recursion stop
+ * when tree root is reached or backrefs for the block is cached.
+ *
+ * NOTE: if we find backrefs for a block are cached, we know backrefs
+ * for all upper level blocks that directly/indirectly reference the
+ * block are also cached.
+ */
+static struct backref_node *build_backref_tree(struct reloc_control *rc,
+					       struct backref_cache *cache,
+					       struct btrfs_key *node_key,
+					       int level, u64 bytenr)
+{
+	struct btrfs_path *path1;
+	struct btrfs_path *path2;
+	struct extent_buffer *eb;
+	struct btrfs_root *root;
+	struct backref_node *cur;
+	struct backref_node *upper;
+	struct backref_node *lower;
+	struct backref_node *node = NULL;
+	struct backref_node *exist = NULL;
+	struct backref_edge *edge;
+	struct rb_node *rb_node;
+	struct btrfs_key key;
+	unsigned long end;
+	unsigned long ptr;
+	LIST_HEAD(list);
+	int ret;
+	int err = 0;
+
+	path1 = btrfs_alloc_path();
+	path2 = btrfs_alloc_path();
+	if (!path1 || !path2) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	node = kmalloc(sizeof(*node), GFP_NOFS);
+	if (!node) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	backref_node_init(node);
+	node->bytenr = bytenr;
+	node->owner = 0;
+	node->level = level;
+	node->lowest = 1;
+	cur = node;
+again:
+	end = 0;
+	ptr = 0;
+	key.objectid = cur->bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	path1->search_commit_root = 1;
+	path1->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path1,
+				0, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+	BUG_ON(!ret || !path1->slots[0]);
+
+	path1->slots[0]--;
+
+	WARN_ON(cur->checked);
+	if (!list_empty(&cur->upper)) {
+		/*
+		 * the backref was added previously when processsing
+		 * backref of type BTRFS_TREE_BLOCK_REF_KEY
+		 */
+		BUG_ON(!list_is_singular(&cur->upper));
+		edge = list_entry(cur->upper.next, struct backref_edge,
+				  list[LOWER]);
+		BUG_ON(!list_empty(&edge->list[UPPER]));
+		exist = edge->node[UPPER];
+		/*
+		 * add the upper level block to pending list if we need
+		 * check its backrefs
+		 */
+		if (!exist->checked)
+			list_add_tail(&edge->list[UPPER], &list);
+	} else {
+		exist = NULL;
+	}
+
+	while (1) {
+		cond_resched();
+		eb = path1->nodes[0];
+
+		if (ptr >= end) {
+			if (path1->slots[0] >= btrfs_header_nritems(eb)) {
+				ret = btrfs_next_leaf(rc->extent_root, path1);
+				if (ret < 0) {
+					err = ret;
+					goto out;
+				}
+				if (ret > 0)
+					break;
+				eb = path1->nodes[0];
+			}
+
+			btrfs_item_key_to_cpu(eb, &key, path1->slots[0]);
+			if (key.objectid != cur->bytenr) {
+				WARN_ON(exist);
+				break;
+			}
+
+			if (key.type == BTRFS_EXTENT_ITEM_KEY) {
+				ret = find_inline_backref(eb, path1->slots[0],
+							  &ptr, &end);
+				if (ret)
+					goto next;
+			}
+		}
+
+		if (ptr < end) {
+			/* update key for inline back ref */
+			struct btrfs_extent_inline_ref *iref;
+			iref = (struct btrfs_extent_inline_ref *)ptr;
+			key.type = btrfs_extent_inline_ref_type(eb, iref);
+			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+			WARN_ON(key.type != BTRFS_TREE_BLOCK_REF_KEY &&
+				key.type != BTRFS_SHARED_BLOCK_REF_KEY);
+		}
+
+		if (exist &&
+		    ((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
+		      exist->owner == key.offset) ||
+		     (key.type == BTRFS_SHARED_BLOCK_REF_KEY &&
+		      exist->bytenr == key.offset))) {
+			exist = NULL;
+			goto next;
+		}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
+		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+			if (key.objectid == key.offset &&
+			    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+				struct btrfs_extent_ref_v0 *ref0;
+				ref0 = btrfs_item_ptr(eb, path1->slots[0],
+						struct btrfs_extent_ref_v0);
+				root = find_tree_root(rc, eb, ref0);
+				if (root)
+					cur->root = root;
+				else
+					cur->old_root = 1;
+				break;
+			}
+#else
+		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+		if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
+#endif
+			if (key.objectid == key.offset) {
+				/*
+				 * only root blocks of reloc trees use
+				 * backref of this type.
+				 */
+				root = find_reloc_root(rc, cur->bytenr);
+				BUG_ON(!root);
+				cur->root = root;
+				break;
+			}
+
+			edge = kzalloc(sizeof(*edge), GFP_NOFS);
+			if (!edge) {
+				err = -ENOMEM;
+				goto out;
+			}
+			rb_node = tree_search(&cache->rb_root, key.offset);
+			if (!rb_node) {
+				upper = kmalloc(sizeof(*upper), GFP_NOFS);
+				if (!upper) {
+					kfree(edge);
+					err = -ENOMEM;
+					goto out;
+				}
+				backref_node_init(upper);
+				upper->bytenr = key.offset;
+				upper->owner = 0;
+				upper->level = cur->level + 1;
+				/*
+				 *  backrefs for the upper level block isn't
+				 *  cached, add the block to pending list
+				 */
+				list_add_tail(&edge->list[UPPER], &list);
+			} else {
+				upper = rb_entry(rb_node, struct backref_node,
+						 rb_node);
+				INIT_LIST_HEAD(&edge->list[UPPER]);
+			}
+			list_add(&edge->list[LOWER], &cur->upper);
+			edge->node[UPPER] = upper;
+			edge->node[LOWER] = cur;
+
+			goto next;
+		} else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
+			goto next;
+		}
+
+		/* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+		root = read_fs_root(rc->extent_root->fs_info, key.offset);
+		if (IS_ERR(root)) {
+			err = PTR_ERR(root);
+			goto out;
+		}
+
+		if (btrfs_root_level(&root->root_item) == cur->level) {
+			/* tree root */
+			BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+			       cur->bytenr);
+			cur->root = root;
+			break;
+		}
+
+		level = cur->level + 1;
+
+		/*
+		 * searching the tree to find upper level blocks
+		 * reference the block.
+		 */
+		path2->search_commit_root = 1;
+		path2->skip_locking = 1;
+		path2->lowest_level = level;
+		ret = btrfs_search_slot(NULL, root, node_key, path2, 0, 0);
+		path2->lowest_level = 0;
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		eb = path2->nodes[level];
+		WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) !=
+			cur->bytenr);
+
+		lower = cur;
+		for (; level < BTRFS_MAX_LEVEL; level++) {
+			if (!path2->nodes[level]) {
+				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
+				       lower->bytenr);
+				lower->root = root;
+				break;
+			}
+
+			edge = kzalloc(sizeof(*edge), GFP_NOFS);
+			if (!edge) {
+				err = -ENOMEM;
+				goto out;
+			}
+
+			eb = path2->nodes[level];
+			rb_node = tree_search(&cache->rb_root, eb->start);
+			if (!rb_node) {
+				upper = kmalloc(sizeof(*upper), GFP_NOFS);
+				if (!upper) {
+					kfree(edge);
+					err = -ENOMEM;
+					goto out;
+				}
+				backref_node_init(upper);
+				upper->bytenr = eb->start;
+				upper->owner = btrfs_header_owner(eb);
+				upper->level = lower->level + 1;
+
+				/*
+				 * if we know the block isn't shared
+				 * we can void checking its backrefs.
+				 */
+				if (btrfs_block_can_be_shared(root, eb))
+					upper->checked = 0;
+				else
+					upper->checked = 1;
+
+				/*
+				 * add the block to pending list if we
+				 * need check its backrefs. only block
+				 * at 'cur->level + 1' is added to the
+				 * tail of pending list. this guarantees
+				 * we check backrefs from lower level
+				 * blocks to upper level blocks.
+				 */
+				if (!upper->checked &&
+				    level == cur->level + 1) {
+					list_add_tail(&edge->list[UPPER],
+						      &list);
+				} else
+					INIT_LIST_HEAD(&edge->list[UPPER]);
+			} else {
+				upper = rb_entry(rb_node, struct backref_node,
+						 rb_node);
+				BUG_ON(!upper->checked);
+				INIT_LIST_HEAD(&edge->list[UPPER]);
+			}
+			list_add_tail(&edge->list[LOWER], &lower->upper);
+			edge->node[UPPER] = upper;
+			edge->node[LOWER] = lower;
+
+			if (rb_node)
+				break;
+			lower = upper;
+			upper = NULL;
+		}
+		btrfs_release_path(root, path2);
+next:
+		if (ptr < end) {
+			ptr += btrfs_extent_inline_ref_size(key.type);
+			if (ptr >= end) {
+				WARN_ON(ptr > end);
+				ptr = 0;
+				end = 0;
+			}
+		}
+		if (ptr >= end)
+			path1->slots[0]++;
+	}
+	btrfs_release_path(rc->extent_root, path1);
+
+	cur->checked = 1;
+	WARN_ON(exist);
+
+	/* the pending list isn't empty, take the first block to process */
+	if (!list_empty(&list)) {
+		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+		list_del_init(&edge->list[UPPER]);
+		cur = edge->node[UPPER];
+		goto again;
+	}
+
+	/*
+	 * everything goes well, connect backref nodes and insert backref nodes
+	 * into the cache.
+	 */
+	BUG_ON(!node->checked);
+	rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+	BUG_ON(rb_node);
+
+	list_for_each_entry(edge, &node->upper, list[LOWER])
+		list_add_tail(&edge->list[UPPER], &list);
+
+	while (!list_empty(&list)) {
+		edge = list_entry(list.next, struct backref_edge, list[UPPER]);
+		list_del_init(&edge->list[UPPER]);
+		upper = edge->node[UPPER];
+
+		if (!RB_EMPTY_NODE(&upper->rb_node)) {
+			if (upper->lowest) {
+				list_del_init(&upper->lower);
+				upper->lowest = 0;
+			}
+
+			list_add_tail(&edge->list[UPPER], &upper->lower);
+			continue;
+		}
+
+		BUG_ON(!upper->checked);
+		rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+				      &upper->rb_node);
+		BUG_ON(rb_node);
+
+		list_add_tail(&edge->list[UPPER], &upper->lower);
+
+		list_for_each_entry(edge, &upper->upper, list[LOWER])
+			list_add_tail(&edge->list[UPPER], &list);
+	}
+out:
+	btrfs_free_path(path1);
+	btrfs_free_path(path2);
+	if (err) {
+		INIT_LIST_HEAD(&list);
+		upper = node;
+		while (upper) {
+			if (RB_EMPTY_NODE(&upper->rb_node)) {
+				list_splice_tail(&upper->upper, &list);
+				kfree(upper);
+			}
+
+			if (list_empty(&list))
+				break;
+
+			edge = list_entry(list.next, struct backref_edge,
+					  list[LOWER]);
+			upper = edge->node[UPPER];
+			kfree(edge);
+		}
+		return ERR_PTR(err);
+	}
+	return node;
+}
+
+/*
+ * helper to add 'address of tree root -> reloc tree' mapping
+ */
+static int __add_reloc_root(struct btrfs_root *root)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	node = kmalloc(sizeof(*node), GFP_NOFS);
+	BUG_ON(!node);
+
+	node->bytenr = root->node->start;
+	node->data = root;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+			      node->bytenr, &node->rb_node);
+	spin_unlock(&rc->reloc_root_tree.lock);
+	BUG_ON(rb_node);
+
+	list_add_tail(&root->root_list, &rc->reloc_roots);
+	return 0;
+}
+
+/*
+ * helper to update/delete the 'address of tree root -> reloc tree'
+ * mapping
+ */
+static int __update_reloc_root(struct btrfs_root *root, int del)
+{
+	struct rb_node *rb_node;
+	struct mapping_node *node = NULL;
+	struct reloc_control *rc = root->fs_info->reloc_ctl;
+
+	spin_lock(&rc->reloc_root_tree.lock);
+	rb_node = tree_search(&rc->reloc_root_tree.rb_root,
+			      root->commit_root->start);
+	if (rb_node) {
+		node = rb_entry(rb_node, struct mapping_node, rb_node);
+		rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
+	}
+	spin_unlock(&rc->reloc_root_tree.lock);
+
+	BUG_ON((struct btrfs_root *)node->data != root);
+
+	if (!del) {
+		spin_lock(&rc->reloc_root_tree.lock);
+		node->bytenr = root->node->start;
+		rb_node = tree_insert(&rc->reloc_root_tree.rb_root,
+				      node->bytenr, &node->rb_node);
+		spin_unlock(&rc->reloc_root_tree.lock);
+		BUG_ON(rb_node);
+	} else {
+		list_del_init(&root->root_list);
+		kfree(node);
+	}
+	return 0;
+}
+
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct extent_buffer *eb;
+	struct btrfs_root_item *root_item;
+	struct btrfs_key root_key;
+	int ret;
+
+	if (root->reloc_root) {
+		reloc_root = root->reloc_root;
+		reloc_root->last_trans = trans->transid;
+		return 0;
+	}
+
+	if (!root->fs_info->reloc_ctl ||
+	    !root->fs_info->reloc_ctl->create_reloc_root ||
+	    root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		return 0;
+
+	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
+	BUG_ON(!root_item);
+
+	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = root->root_key.objectid;
+
+	ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+			      BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(ret);
+
+	btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
+	memcpy(root_item, &root->root_item, sizeof(*root_item));
+	btrfs_set_root_refs(root_item, 1);
+	btrfs_set_root_bytenr(root_item, eb->start);
+	btrfs_set_root_level(root_item, btrfs_header_level(eb));
+	btrfs_set_root_generation(root_item, trans->transid);
+	memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
+	root_item->drop_level = 0;
+
+	btrfs_tree_unlock(eb);
+	free_extent_buffer(eb);
+
+	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
+				&root_key, root_item);
+	BUG_ON(ret);
+	kfree(root_item);
+
+	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
+						 &root_key);
+	BUG_ON(IS_ERR(reloc_root));
+	reloc_root->last_trans = trans->transid;
+
+	__add_reloc_root(reloc_root);
+	root->reloc_root = reloc_root;
+	return 0;
+}
+
+/*
+ * update root item of reloc tree
+ */
+int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
+			    struct btrfs_root *root)
+{
+	struct btrfs_root *reloc_root;
+	struct btrfs_root_item *root_item;
+	int del = 0;
+	int ret;
+
+	if (!root->reloc_root)
+		return 0;
+
+	reloc_root = root->reloc_root;
+	root_item = &reloc_root->root_item;
+
+	if (btrfs_root_refs(root_item) == 0) {
+		root->reloc_root = NULL;
+		del = 1;
+	}
+
+	__update_reloc_root(reloc_root, del);
+
+	if (reloc_root->commit_root != reloc_root->node) {
+		btrfs_set_root_node(root_item, reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		reloc_root->commit_root = btrfs_root_node(reloc_root);
+	}
+
+	ret = btrfs_update_root(trans, root->fs_info->tree_root,
+				&reloc_root->root_key, root_item);
+	BUG_ON(ret);
+	return 0;
+}
+
+/*
+ * helper to find first cached inode with inode number >= objectid
+ * in a subvolume
+ */
+static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
+{
+	struct rb_node *node;
+	struct rb_node *prev;
+	struct btrfs_inode *entry;
+	struct inode *inode;
+
+	spin_lock(&root->inode_lock);
+again:
+	node = root->inode_tree.rb_node;
+	prev = NULL;
+	while (node) {
+		prev = node;
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+
+		if (objectid < entry->vfs_inode.i_ino)
+			node = node->rb_left;
+		else if (objectid > entry->vfs_inode.i_ino)
+			node = node->rb_right;
+		else
+			break;
+	}
+	if (!node) {
+		while (prev) {
+			entry = rb_entry(prev, struct btrfs_inode, rb_node);
+			if (objectid <= entry->vfs_inode.i_ino) {
+				node = prev;
+				break;
+			}
+			prev = rb_next(prev);
+		}
+	}
+	while (node) {
+		entry = rb_entry(node, struct btrfs_inode, rb_node);
+		inode = igrab(&entry->vfs_inode);
+		if (inode) {
+			spin_unlock(&root->inode_lock);
+			return inode;
+		}
+
+		objectid = entry->vfs_inode.i_ino + 1;
+		if (cond_resched_lock(&root->inode_lock))
+			goto again;
+
+		node = rb_next(node);
+	}
+	spin_unlock(&root->inode_lock);
+	return NULL;
+}
+
+static int in_block_group(u64 bytenr,
+			  struct btrfs_block_group_cache *block_group)
+{
+	if (bytenr >= block_group->key.objectid &&
+	    bytenr < block_group->key.objectid + block_group->key.offset)
+		return 1;
+	return 0;
+}
+
+/*
+ * get new location of data
+ */
+static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
+			    u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
+	struct btrfs_path *path;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	bytenr -= BTRFS_I(reloc_inode)->index_cnt;
+	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
+				       bytenr, 0);
+	if (ret < 0)
+		goto out;
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	fi = btrfs_item_ptr(leaf, path->slots[0],
+			    struct btrfs_file_extent_item);
+
+	BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
+	       btrfs_file_extent_compression(leaf, fi) ||
+	       btrfs_file_extent_encryption(leaf, fi) ||
+	       btrfs_file_extent_other_encoding(leaf, fi));
+
+	if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
+		ret = 1;
+		goto out;
+	}
+
+	if (new_bytenr)
+		*new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * update file extent items in the tree leaf to point to
+ * the new locations.
+ */
+static int replace_file_extents(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct btrfs_root *root,
+				struct extent_buffer *leaf,
+				struct list_head *inode_list)
+{
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	struct inode *inode = NULL;
+	struct inodevec *ivec = NULL;
+	u64 parent;
+	u64 bytenr;
+	u64 new_bytenr;
+	u64 num_bytes;
+	u64 end;
+	u32 nritems;
+	u32 i;
+	int ret;
+	int first = 1;
+	int dirty = 0;
+
+	if (rc->stage != UPDATE_DATA_PTRS)
+		return 0;
+
+	/* reloc trees always use full backref */
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+		parent = leaf->start;
+	else
+		parent = 0;
+
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		btrfs_item_key_to_cpu(leaf, &key, i);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
+		if (bytenr == 0)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group))
+			continue;
+
+		/*
+		 * if we are modifying block in fs tree, wait for readpage
+		 * to complete and drop the extent cache
+		 */
+		if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
+			if (!ivec || ivec->nr == INODEVEC_SIZE) {
+				ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
+				BUG_ON(!ivec);
+				ivec->nr = 0;
+				list_add_tail(&ivec->list, inode_list);
+			}
+			if (first) {
+				inode = find_next_inode(root, key.objectid);
+				if (inode)
+					ivec->inode[ivec->nr++] = inode;
+				first = 0;
+			} else if (inode && inode->i_ino < key.objectid) {
+				inode = find_next_inode(root, key.objectid);
+				if (inode)
+					ivec->inode[ivec->nr++] = inode;
+			}
+			if (inode && inode->i_ino == key.objectid) {
+				end = key.offset +
+				      btrfs_file_extent_num_bytes(leaf, fi);
+				WARN_ON(!IS_ALIGNED(key.offset,
+						    root->sectorsize));
+				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+				end--;
+				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
+						      key.offset, end,
+						      GFP_NOFS);
+				if (!ret)
+					continue;
+
+				btrfs_drop_extent_cache(inode, key.offset, end,
+							1);
+				unlock_extent(&BTRFS_I(inode)->io_tree,
+					      key.offset, end, GFP_NOFS);
+			}
+		}
+
+		ret = get_new_location(rc->data_inode, &new_bytenr,
+				       bytenr, num_bytes);
+		if (ret > 0)
+			continue;
+		BUG_ON(ret < 0);
+
+		btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
+		dirty = 1;
+
+		key.offset -= btrfs_file_extent_offset(leaf, fi);
+		ret = btrfs_inc_extent_ref(trans, root, new_bytenr,
+					   num_bytes, parent,
+					   btrfs_header_owner(leaf),
+					   key.objectid, key.offset);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
+					parent, btrfs_header_owner(leaf),
+					key.objectid, key.offset);
+		BUG_ON(ret);
+	}
+	if (dirty)
+		btrfs_mark_buffer_dirty(leaf);
+	return 0;
+}
+
+static noinline_for_stack
+int memcmp_node_keys(struct extent_buffer *eb, int slot,
+		     struct btrfs_path *path, int level)
+{
+	struct btrfs_disk_key key1;
+	struct btrfs_disk_key key2;
+	btrfs_node_key(eb, &key1, slot);
+	btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
+	return memcmp(&key1, &key2, sizeof(key1));
+}
+
+/*
+ * try to replace tree blocks in fs tree with the new blocks
+ * in reloc tree. tree blocks haven't been modified since the
+ * reloc tree was create can be replaced.
+ *
+ * if a block was replaced, level of the block + 1 is returned.
+ * if no block got replaced, 0 is returned. if there are other
+ * errors, a negative error number is returned.
+ */
+static int replace_path(struct btrfs_trans_handle *trans,
+			struct btrfs_root *dest, struct btrfs_root *src,
+			struct btrfs_path *path, struct btrfs_key *next_key,
+			struct extent_buffer **leaf,
+			int lowest_level, int max_level)
+{
+	struct extent_buffer *eb;
+	struct extent_buffer *parent;
+	struct btrfs_key key;
+	u64 old_bytenr;
+	u64 new_bytenr;
+	u64 old_ptr_gen;
+	u64 new_ptr_gen;
+	u64 last_snapshot;
+	u32 blocksize;
+	int level;
+	int ret;
+	int slot;
+
+	BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+	BUG_ON(lowest_level > 1 && leaf);
+
+	last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+
+	slot = path->slots[lowest_level];
+	btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
+
+	eb = btrfs_lock_root_node(dest);
+	btrfs_set_lock_blocking(eb);
+	level = btrfs_header_level(eb);
+
+	if (level < lowest_level) {
+		btrfs_tree_unlock(eb);
+		free_extent_buffer(eb);
+		return 0;
+	}
+
+	ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+	BUG_ON(ret);
+	btrfs_set_lock_blocking(eb);
+
+	if (next_key) {
+		next_key->objectid = (u64)-1;
+		next_key->type = (u8)-1;
+		next_key->offset = (u64)-1;
+	}
+
+	parent = eb;
+	while (1) {
+		level = btrfs_header_level(parent);
+		BUG_ON(level < lowest_level);
+
+		ret = btrfs_bin_search(parent, &key, level, &slot);
+		if (ret && slot > 0)
+			slot--;
+
+		if (next_key && slot + 1 < btrfs_header_nritems(parent))
+			btrfs_node_key_to_cpu(parent, next_key, slot + 1);
+
+		old_bytenr = btrfs_node_blockptr(parent, slot);
+		blocksize = btrfs_level_size(dest, level - 1);
+		old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
+
+		if (level <= max_level) {
+			eb = path->nodes[level];
+			new_bytenr = btrfs_node_blockptr(eb,
+							path->slots[level]);
+			new_ptr_gen = btrfs_node_ptr_generation(eb,
+							path->slots[level]);
+		} else {
+			new_bytenr = 0;
+			new_ptr_gen = 0;
+		}
+
+		if (new_bytenr > 0 && new_bytenr == old_bytenr) {
+			WARN_ON(1);
+			ret = level;
+			break;
+		}
+
+		if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
+		    memcmp_node_keys(parent, slot, path, level)) {
+			if (level <= lowest_level && !leaf) {
+				ret = 0;
+				break;
+			}
+
+			eb = read_tree_block(dest, old_bytenr, blocksize,
+					     old_ptr_gen);
+			btrfs_tree_lock(eb);
+			ret = btrfs_cow_block(trans, dest, eb, parent,
+					      slot, &eb);
+			BUG_ON(ret);
+			btrfs_set_lock_blocking(eb);
+
+			if (level <= lowest_level) {
+				*leaf = eb;
+				ret = 0;
+				break;
+			}
+
+			btrfs_tree_unlock(parent);
+			free_extent_buffer(parent);
+
+			parent = eb;
+			continue;
+		}
+
+		btrfs_node_key_to_cpu(path->nodes[level], &key,
+				      path->slots[level]);
+		btrfs_release_path(src, path);
+
+		path->lowest_level = level;
+		ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
+		path->lowest_level = 0;
+		BUG_ON(ret);
+
+		/*
+		 * swap blocks in fs tree and reloc tree.
+		 */
+		btrfs_set_node_blockptr(parent, slot, new_bytenr);
+		btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
+		btrfs_mark_buffer_dirty(parent);
+
+		btrfs_set_node_blockptr(path->nodes[level],
+					path->slots[level], old_bytenr);
+		btrfs_set_node_ptr_generation(path->nodes[level],
+					      path->slots[level], old_ptr_gen);
+		btrfs_mark_buffer_dirty(path->nodes[level]);
+
+		ret = btrfs_inc_extent_ref(trans, src, old_bytenr, blocksize,
+					path->nodes[level]->start,
+					src->root_key.objectid, level - 1, 0);
+		BUG_ON(ret);
+		ret = btrfs_inc_extent_ref(trans, dest, new_bytenr, blocksize,
+					0, dest->root_key.objectid, level - 1,
+					0);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, src, new_bytenr, blocksize,
+					path->nodes[level]->start,
+					src->root_key.objectid, level - 1, 0);
+		BUG_ON(ret);
+
+		ret = btrfs_free_extent(trans, dest, old_bytenr, blocksize,
+					0, dest->root_key.objectid, level - 1,
+					0);
+		BUG_ON(ret);
+
+		btrfs_unlock_up_safe(path, 0);
+
+		ret = level;
+		break;
+	}
+	btrfs_tree_unlock(parent);
+	free_extent_buffer(parent);
+	return ret;
+}
+
+/*
+ * helper to find next relocated block in reloc tree
+ */
+static noinline_for_stack
+int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+		       int *level)
+{
+	struct extent_buffer *eb;
+	int i;
+	u64 last_snapshot;
+	u32 nritems;
+
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+	for (i = 0; i < *level; i++) {
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+
+	for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
+		eb = path->nodes[i];
+		nritems = btrfs_header_nritems(eb);
+		while (path->slots[i] + 1 < nritems) {
+			path->slots[i]++;
+			if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
+			    last_snapshot)
+				continue;
+
+			*level = i;
+			return 0;
+		}
+		free_extent_buffer(path->nodes[i]);
+		path->nodes[i] = NULL;
+	}
+	return 1;
+}
+
+/*
+ * walk down reloc tree to find relocated block of lowest level
+ */
+static noinline_for_stack
+int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
+			 int *level)
+{
+	struct extent_buffer *eb = NULL;
+	int i;
+	u64 bytenr;
+	u64 ptr_gen = 0;
+	u64 last_snapshot;
+	u32 blocksize;
+	u32 nritems;
+
+	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
+
+	for (i = *level; i > 0; i--) {
+		eb = path->nodes[i];
+		nritems = btrfs_header_nritems(eb);
+		while (path->slots[i] < nritems) {
+			ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
+			if (ptr_gen > last_snapshot)
+				break;
+			path->slots[i]++;
+		}
+		if (path->slots[i] >= nritems) {
+			if (i == *level)
+				break;
+			*level = i + 1;
+			return 0;
+		}
+		if (i == 1) {
+			*level = i;
+			return 0;
+		}
+
+		bytenr = btrfs_node_blockptr(eb, path->slots[i]);
+		blocksize = btrfs_level_size(root, i - 1);
+		eb = read_tree_block(root, bytenr, blocksize, ptr_gen);
+		BUG_ON(btrfs_header_level(eb) != i - 1);
+		path->nodes[i - 1] = eb;
+		path->slots[i - 1] = 0;
+	}
+	return 1;
+}
+
+/*
+ * invalidate extent cache for file extents whose key in range of
+ * [min_key, max_key)
+ */
+static int invalidate_extent_cache(struct btrfs_root *root,
+				   struct btrfs_key *min_key,
+				   struct btrfs_key *max_key)
+{
+	struct inode *inode = NULL;
+	u64 objectid;
+	u64 start, end;
+
+	objectid = min_key->objectid;
+	while (1) {
+		cond_resched();
+		iput(inode);
+
+		if (objectid > max_key->objectid)
+			break;
+
+		inode = find_next_inode(root, objectid);
+		if (!inode)
+			break;
+
+		if (inode->i_ino > max_key->objectid) {
+			iput(inode);
+			break;
+		}
+
+		objectid = inode->i_ino + 1;
+		if (!S_ISREG(inode->i_mode))
+			continue;
+
+		if (unlikely(min_key->objectid == inode->i_ino)) {
+			if (min_key->type > BTRFS_EXTENT_DATA_KEY)
+				continue;
+			if (min_key->type < BTRFS_EXTENT_DATA_KEY)
+				start = 0;
+			else {
+				start = min_key->offset;
+				WARN_ON(!IS_ALIGNED(start, root->sectorsize));
+			}
+		} else {
+			start = 0;
+		}
+
+		if (unlikely(max_key->objectid == inode->i_ino)) {
+			if (max_key->type < BTRFS_EXTENT_DATA_KEY)
+				continue;
+			if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
+				end = (u64)-1;
+			} else {
+				if (max_key->offset == 0)
+					continue;
+				end = max_key->offset;
+				WARN_ON(!IS_ALIGNED(end, root->sectorsize));
+				end--;
+			}
+		} else {
+			end = (u64)-1;
+		}
+
+		/* the lock_extent waits for readpage to complete */
+		lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+		btrfs_drop_extent_cache(inode, start, end, 1);
+		unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	}
+	return 0;
+}
+
+static int find_next_key(struct btrfs_path *path, int level,
+			 struct btrfs_key *key)
+
+{
+	while (level < BTRFS_MAX_LEVEL) {
+		if (!path->nodes[level])
+			break;
+		if (path->slots[level] + 1 <
+		    btrfs_header_nritems(path->nodes[level])) {
+			btrfs_node_key_to_cpu(path->nodes[level], key,
+					      path->slots[level] + 1);
+			return 0;
+		}
+		level++;
+	}
+	return 1;
+}
+
+/*
+ * merge the relocated tree blocks in reloc tree with corresponding
+ * fs tree.
+ */
+static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
+					       struct btrfs_root *root)
+{
+	LIST_HEAD(inode_list);
+	struct btrfs_key key;
+	struct btrfs_key next_key;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *reloc_root;
+	struct btrfs_root_item *root_item;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf = NULL;
+	unsigned long nr;
+	int level;
+	int max_level;
+	int replaced = 0;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	reloc_root = root->reloc_root;
+	root_item = &reloc_root->root_item;
+
+	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
+		level = btrfs_root_level(root_item);
+		extent_buffer_get(reloc_root->node);
+		path->nodes[level] = reloc_root->node;
+		path->slots[level] = 0;
+	} else {
+		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
+
+		level = root_item->drop_level;
+		BUG_ON(level == 0);
+		path->lowest_level = level;
+		ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
+		if (ret < 0) {
+			btrfs_free_path(path);
+			return ret;
+		}
+
+		btrfs_node_key_to_cpu(path->nodes[level], &next_key,
+				      path->slots[level]);
+		WARN_ON(memcmp(&key, &next_key, sizeof(key)));
+
+		btrfs_unlock_up_safe(path, 0);
+	}
+
+	if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+		trans = btrfs_start_transaction(root, 1);
+
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, 0);
+		btrfs_release_path(reloc_root, path);
+
+		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		leaf = path->nodes[0];
+		btrfs_unlock_up_safe(path, 1);
+		ret = replace_file_extents(trans, rc, root, leaf,
+					   &inode_list);
+		if (ret < 0)
+			err = ret;
+		goto out;
+	}
+
+	memset(&next_key, 0, sizeof(next_key));
+
+	while (1) {
+		leaf = NULL;
+		replaced = 0;
+		trans = btrfs_start_transaction(root, 1);
+		max_level = level;
+
+		ret = walk_down_reloc_tree(reloc_root, path, &level);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0)
+			break;
+
+		if (!find_next_key(path, level, &key) &&
+		    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
+			ret = 0;
+		} else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
+			ret = replace_path(trans, root, reloc_root,
+					   path, &next_key, &leaf,
+					   level, max_level);
+		} else {
+			ret = replace_path(trans, root, reloc_root,
+					   path, &next_key, NULL,
+					   level, max_level);
+		}
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+
+		if (ret > 0) {
+			level = ret;
+			btrfs_node_key_to_cpu(path->nodes[level], &key,
+					      path->slots[level]);
+			replaced = 1;
+		} else if (leaf) {
+			/*
+			 * no block got replaced, try replacing file extents
+			 */
+			btrfs_item_key_to_cpu(leaf, &key, 0);
+			ret = replace_file_extents(trans, rc, root, leaf,
+						   &inode_list);
+			btrfs_tree_unlock(leaf);
+			free_extent_buffer(leaf);
+			BUG_ON(ret < 0);
+		}
+
+		ret = walk_up_reloc_tree(reloc_root, path, &level);
+		if (ret > 0)
+			break;
+
+		BUG_ON(level == 0);
+		/*
+		 * save the merging progress in the drop_progress.
+		 * this is OK since root refs == 1 in this case.
+		 */
+		btrfs_node_key(path->nodes[level], &root_item->drop_progress,
+			       path->slots[level]);
+		root_item->drop_level = level;
+
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, root);
+
+		btrfs_btree_balance_dirty(root, nr);
+
+		if (replaced && rc->stage == UPDATE_DATA_PTRS)
+			invalidate_extent_cache(root, &key, &next_key);
+	}
+
+	/*
+	 * handle the case only one block in the fs tree need to be
+	 * relocated and the block is tree root.
+	 */
+	leaf = btrfs_lock_root_node(root);
+	ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf);
+	btrfs_tree_unlock(leaf);
+	free_extent_buffer(leaf);
+	if (ret < 0)
+		err = ret;
+out:
+	btrfs_free_path(path);
+
+	if (err == 0) {
+		memset(&root_item->drop_progress, 0,
+		       sizeof(root_item->drop_progress));
+		root_item->drop_level = 0;
+		btrfs_set_root_refs(root_item, 0);
+	}
+
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+
+	btrfs_btree_balance_dirty(root, nr);
+
+	/*
+	 * put inodes while we aren't holding the tree locks
+	 */
+	while (!list_empty(&inode_list)) {
+		struct inodevec *ivec;
+		ivec = list_entry(inode_list.next, struct inodevec, list);
+		list_del(&ivec->list);
+		while (ivec->nr > 0) {
+			ivec->nr--;
+			iput(ivec->inode[ivec->nr]);
+		}
+		kfree(ivec);
+	}
+
+	if (replaced && rc->stage == UPDATE_DATA_PTRS)
+		invalidate_extent_cache(root, &key, &next_key);
+
+	return err;
+}
+
+/*
+ * callback for the work threads.
+ * this function merges reloc tree with corresponding fs tree,
+ * and then drops the reloc tree.
+ */
+static void merge_func(struct btrfs_work *work)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_root *reloc_root;
+	struct async_merge *async;
+
+	async = container_of(work, struct async_merge, work);
+	reloc_root = async->root;
+
+	if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+		root = read_fs_root(reloc_root->fs_info,
+				    reloc_root->root_key.offset);
+		BUG_ON(IS_ERR(root));
+		BUG_ON(root->reloc_root != reloc_root);
+
+		merge_reloc_root(async->rc, root);
+
+		trans = btrfs_start_transaction(root, 1);
+		btrfs_update_reloc_root(trans, root);
+		btrfs_end_transaction(trans, root);
+	}
+
+	btrfs_drop_dead_root(reloc_root);
+
+	if (atomic_dec_and_test(async->num_pending))
+		complete(async->done);
+
+	kfree(async);
+}
+
+static int merge_reloc_roots(struct reloc_control *rc)
+{
+	struct async_merge *async;
+	struct btrfs_root *root;
+	struct completion done;
+	atomic_t num_pending;
+
+	init_completion(&done);
+	atomic_set(&num_pending, 1);
+
+	while (!list_empty(&rc->reloc_roots)) {
+		root = list_entry(rc->reloc_roots.next,
+				  struct btrfs_root, root_list);
+		list_del_init(&root->root_list);
+
+		async = kmalloc(sizeof(*async), GFP_NOFS);
+		BUG_ON(!async);
+		async->work.func = merge_func;
+		async->work.flags = 0;
+		async->rc = rc;
+		async->root = root;
+		async->done = &done;
+		async->num_pending = &num_pending;
+		atomic_inc(&num_pending);
+		btrfs_queue_worker(&rc->workers, &async->work);
+	}
+
+	if (!atomic_dec_and_test(&num_pending))
+		wait_for_completion(&done);
+
+	BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
+	return 0;
+}
+
+static void free_block_list(struct rb_root *blocks)
+{
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	while ((rb_node = rb_first(blocks))) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		rb_erase(rb_node, blocks);
+		kfree(block);
+	}
+}
+
+static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
+				      struct btrfs_root *reloc_root)
+{
+	struct btrfs_root *root;
+
+	if (reloc_root->last_trans == trans->transid)
+		return 0;
+
+	root = read_fs_root(reloc_root->fs_info, reloc_root->root_key.offset);
+	BUG_ON(IS_ERR(root));
+	BUG_ON(root->reloc_root != reloc_root);
+
+	return btrfs_record_root_in_trans(trans, root);
+}
+
+/*
+ * select one tree from trees that references the block.
+ * for blocks in refernce counted trees, we preper reloc tree.
+ * if no reloc tree found and reloc_only is true, NULL is returned.
+ */
+static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
+					    struct backref_node *node,
+					    struct backref_edge *edges[],
+					    int *nr, int reloc_only)
+{
+	struct backref_node *next;
+	struct btrfs_root *root;
+	int index;
+	int loop = 0;
+again:
+	index = 0;
+	next = node;
+	while (1) {
+		cond_resched();
+		next = walk_up_backref(next, edges, &index);
+		root = next->root;
+		if (!root) {
+			BUG_ON(!node->old_root);
+			goto skip;
+		}
+
+		/* no other choice for non-refernce counted tree */
+		if (!root->ref_cows) {
+			BUG_ON(reloc_only);
+			break;
+		}
+
+		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+			record_reloc_root_in_trans(trans, root);
+			break;
+		}
+
+		if (loop) {
+			btrfs_record_root_in_trans(trans, root);
+			break;
+		}
+
+		if (reloc_only || next != node) {
+			if (!root->reloc_root)
+				btrfs_record_root_in_trans(trans, root);
+			root = root->reloc_root;
+			/*
+			 * if the reloc tree was created in current
+			 * transation, there is no node in backref tree
+			 * corresponds to the root of the reloc tree.
+			 */
+			if (btrfs_root_last_snapshot(&root->root_item) ==
+			    trans->transid - 1)
+				break;
+		}
+skip:
+		root = NULL;
+		next = walk_down_backref(edges, &index);
+		if (!next || next->level <= node->level)
+			break;
+	}
+
+	if (!root && !loop && !reloc_only) {
+		loop = 1;
+		goto again;
+	}
+
+	if (root)
+		*nr = index;
+	else
+		*nr = 0;
+
+	return root;
+}
+
+static noinline_for_stack
+struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
+				   struct backref_node *node)
+{
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	int nr;
+	return __select_one_root(trans, node, edges, &nr, 0);
+}
+
+static noinline_for_stack
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+				     struct backref_node *node,
+				     struct backref_edge *edges[], int *nr)
+{
+	return __select_one_root(trans, node, edges, nr, 1);
+}
+
+static void grab_path_buffers(struct btrfs_path *path,
+			      struct backref_node *node,
+			      struct backref_edge *edges[], int nr)
+{
+	int i = 0;
+	while (1) {
+		drop_node_buffer(node);
+		node->eb = path->nodes[node->level];
+		BUG_ON(!node->eb);
+		if (path->locks[node->level])
+			node->locked = 1;
+		path->nodes[node->level] = NULL;
+		path->locks[node->level] = 0;
+
+		if (i >= nr)
+			break;
+
+		edges[i]->blockptr = node->eb->start;
+		node = edges[i]->node[UPPER];
+		i++;
+	}
+}
+
+/*
+ * relocate a block tree, and then update pointers in upper level
+ * blocks that reference the block to point to the new location.
+ *
+ * if called by link_to_upper, the block has already been relocated.
+ * in that case this function just updates pointers.
+ */
+static int do_relocation(struct btrfs_trans_handle *trans,
+			 struct backref_node *node,
+			 struct btrfs_key *key,
+			 struct btrfs_path *path, int lowest)
+{
+	struct backref_node *upper;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	struct btrfs_root *root;
+	struct extent_buffer *eb;
+	u32 blocksize;
+	u64 bytenr;
+	u64 generation;
+	int nr;
+	int slot;
+	int ret;
+	int err = 0;
+
+	BUG_ON(lowest && node->eb);
+
+	path->lowest_level = node->level + 1;
+	list_for_each_entry(edge, &node->upper, list[LOWER]) {
+		cond_resched();
+		if (node->eb && node->eb->start == edge->blockptr)
+			continue;
+
+		upper = edge->node[UPPER];
+		root = select_reloc_root(trans, upper, edges, &nr);
+		if (!root)
+			continue;
+
+		if (upper->eb && !upper->locked)
+			drop_node_buffer(upper);
+
+		if (!upper->eb) {
+			ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			BUG_ON(ret > 0);
+
+			slot = path->slots[upper->level];
+
+			btrfs_unlock_up_safe(path, upper->level + 1);
+			grab_path_buffers(path, upper, edges, nr);
+
+			btrfs_release_path(NULL, path);
+		} else {
+			ret = btrfs_bin_search(upper->eb, key, upper->level,
+					       &slot);
+			BUG_ON(ret);
+		}
+
+		bytenr = btrfs_node_blockptr(upper->eb, slot);
+		if (!lowest) {
+			if (node->eb->start == bytenr) {
+				btrfs_tree_unlock(upper->eb);
+				upper->locked = 0;
+				continue;
+			}
+		} else {
+			BUG_ON(node->bytenr != bytenr);
+		}
+
+		blocksize = btrfs_level_size(root, node->level);
+		generation = btrfs_node_ptr_generation(upper->eb, slot);
+		eb = read_tree_block(root, bytenr, blocksize, generation);
+		btrfs_tree_lock(eb);
+		btrfs_set_lock_blocking(eb);
+
+		if (!node->eb) {
+			ret = btrfs_cow_block(trans, root, eb, upper->eb,
+					      slot, &eb);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			btrfs_set_lock_blocking(eb);
+			node->eb = eb;
+			node->locked = 1;
+		} else {
+			btrfs_set_node_blockptr(upper->eb, slot,
+						node->eb->start);
+			btrfs_set_node_ptr_generation(upper->eb, slot,
+						      trans->transid);
+			btrfs_mark_buffer_dirty(upper->eb);
+
+			ret = btrfs_inc_extent_ref(trans, root,
+						node->eb->start, blocksize,
+						upper->eb->start,
+						btrfs_header_owner(upper->eb),
+						node->level, 0);
+			BUG_ON(ret);
+
+			ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
+			BUG_ON(ret);
+
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		}
+		if (!lowest) {
+			btrfs_tree_unlock(upper->eb);
+			upper->locked = 0;
+		}
+	}
+	path->lowest_level = 0;
+	return err;
+}
+
+static int link_to_upper(struct btrfs_trans_handle *trans,
+			 struct backref_node *node,
+			 struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	if (!node->eb || list_empty(&node->upper))
+		return 0;
+
+	btrfs_node_key_to_cpu(node->eb, &key, 0);
+	return do_relocation(trans, node, &key, path, 0);
+}
+
+static int finish_pending_nodes(struct btrfs_trans_handle *trans,
+				struct backref_cache *cache,
+				struct btrfs_path *path)
+{
+	struct backref_node *node;
+	int level;
+	int ret;
+	int err = 0;
+
+	for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+		while (!list_empty(&cache->pending[level])) {
+			node = list_entry(cache->pending[level].next,
+					  struct backref_node, lower);
+			BUG_ON(node->level != level);
+
+			ret = link_to_upper(trans, node, path);
+			if (ret < 0)
+				err = ret;
+			/*
+			 * this remove the node from the pending list and
+			 * may add some other nodes to the level + 1
+			 * pending list
+			 */
+			remove_backref_node(cache, node);
+		}
+	}
+	BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+	return err;
+}
+
+static void mark_block_processed(struct reloc_control *rc,
+				 struct backref_node *node)
+{
+	u32 blocksize;
+	if (node->level == 0 ||
+	    in_block_group(node->bytenr, rc->block_group)) {
+		blocksize = btrfs_level_size(rc->extent_root, node->level);
+		set_extent_bits(&rc->processed_blocks, node->bytenr,
+				node->bytenr + blocksize - 1, EXTENT_DIRTY,
+				GFP_NOFS);
+	}
+	node->processed = 1;
+}
+
+/*
+ * mark a block and all blocks directly/indirectly reference the block
+ * as processed.
+ */
+static void update_processed_blocks(struct reloc_control *rc,
+				    struct backref_node *node)
+{
+	struct backref_node *next = node;
+	struct backref_edge *edge;
+	struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+	int index = 0;
+
+	while (next) {
+		cond_resched();
+		while (1) {
+			if (next->processed)
+				break;
+
+			mark_block_processed(rc, next);
+
+			if (list_empty(&next->upper))
+				break;
+
+			edge = list_entry(next->upper.next,
+					  struct backref_edge, list[LOWER]);
+			edges[index++] = edge;
+			next = edge->node[UPPER];
+		}
+		next = walk_down_backref(edges, &index);
+	}
+}
+
+static int tree_block_processed(u64 bytenr, u32 blocksize,
+				struct reloc_control *rc)
+{
+	if (test_range_bit(&rc->processed_blocks, bytenr,
+			   bytenr + blocksize - 1, EXTENT_DIRTY, 1))
+		return 1;
+	return 0;
+}
+
+/*
+ * check if there are any file extent pointers in the leaf point to
+ * data require processing
+ */
+static int check_file_extents(struct reloc_control *rc,
+			      u64 bytenr, u32 blocksize, u64 ptr_gen)
+{
+	struct btrfs_key found_key;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *leaf;
+	u32 nritems;
+	int i;
+	int ret = 0;
+
+	leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
+
+	nritems = btrfs_header_nritems(leaf);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		btrfs_item_key_to_cpu(leaf, &found_key, i);
+		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			continue;
+		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+		if (bytenr == 0)
+			continue;
+		if (in_block_group(bytenr, rc->block_group)) {
+			ret = 1;
+			break;
+		}
+	}
+	free_extent_buffer(leaf);
+	return ret;
+}
+
+/*
+ * scan child blocks of a given block to find blocks require processing
+ */
+static int add_child_blocks(struct btrfs_trans_handle *trans,
+			    struct reloc_control *rc,
+			    struct backref_node *node,
+			    struct rb_root *blocks)
+{
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	u64 bytenr;
+	u64 ptr_gen;
+	u32 blocksize;
+	u32 nritems;
+	int i;
+	int err = 0;
+
+	nritems = btrfs_header_nritems(node->eb);
+	blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		bytenr = btrfs_node_blockptr(node->eb, i);
+		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
+		if (ptr_gen == trans->transid)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group) &&
+		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
+			continue;
+		if (tree_block_processed(bytenr, blocksize, rc))
+			continue;
+
+		readahead_tree_block(rc->extent_root,
+				     bytenr, blocksize, ptr_gen);
+	}
+
+	for (i = 0; i < nritems; i++) {
+		cond_resched();
+		bytenr = btrfs_node_blockptr(node->eb, i);
+		ptr_gen = btrfs_node_ptr_generation(node->eb, i);
+		if (ptr_gen == trans->transid)
+			continue;
+		if (!in_block_group(bytenr, rc->block_group) &&
+		    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
+			continue;
+		if (tree_block_processed(bytenr, blocksize, rc))
+			continue;
+		if (!in_block_group(bytenr, rc->block_group) &&
+		    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
+			continue;
+
+		block = kmalloc(sizeof(*block), GFP_NOFS);
+		if (!block) {
+			err = -ENOMEM;
+			break;
+		}
+		block->bytenr = bytenr;
+		btrfs_node_key_to_cpu(node->eb, &block->key, i);
+		block->level = node->level - 1;
+		block->key_ready = 1;
+		rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+		BUG_ON(rb_node);
+	}
+	if (err)
+		free_block_list(blocks);
+	return err;
+}
+
+/*
+ * find adjacent blocks require processing
+ */
+static noinline_for_stack
+int add_adjacent_blocks(struct btrfs_trans_handle *trans,
+			struct reloc_control *rc,
+			struct backref_cache *cache,
+			struct rb_root *blocks, int level,
+			struct backref_node **upper)
+{
+	struct backref_node *node;
+	int ret = 0;
+
+	WARN_ON(!list_empty(&cache->pending[level]));
+
+	if (list_empty(&cache->pending[level + 1]))
+		return 1;
+
+	node = list_entry(cache->pending[level + 1].next,
+			  struct backref_node, lower);
+	if (node->eb)
+		ret = add_child_blocks(trans, rc, node, blocks);
+
+	*upper = node;
+	return ret;
+}
+
+static int get_tree_block_key(struct reloc_control *rc,
+			      struct tree_block *block)
+{
+	struct extent_buffer *eb;
+
+	BUG_ON(block->key_ready);
+	eb = read_tree_block(rc->extent_root, block->bytenr,
+			     block->key.objectid, block->key.offset);
+	WARN_ON(btrfs_header_level(eb) != block->level);
+	if (block->level == 0)
+		btrfs_item_key_to_cpu(eb, &block->key, 0);
+	else
+		btrfs_node_key_to_cpu(eb, &block->key, 0);
+	free_extent_buffer(eb);
+	block->key_ready = 1;
+	return 0;
+}
+
+static int reada_tree_block(struct reloc_control *rc,
+			    struct tree_block *block)
+{
+	BUG_ON(block->key_ready);
+	readahead_tree_block(rc->extent_root, block->bytenr,
+			     block->key.objectid, block->key.offset);
+	return 0;
+}
+
+/*
+ * helper function to relocate a tree block
+ */
+static int relocate_tree_block(struct btrfs_trans_handle *trans,
+				struct reloc_control *rc,
+				struct backref_node *node,
+				struct btrfs_key *key,
+				struct btrfs_path *path)
+{
+	struct btrfs_root *root;
+	int ret;
+
+	root = select_one_root(trans, node);
+	if (unlikely(!root)) {
+		rc->found_old_snapshot = 1;
+		update_processed_blocks(rc, node);
+		return 0;
+	}
+
+	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+		ret = do_relocation(trans, node, key, path, 1);
+		if (ret < 0)
+			goto out;
+		if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+			ret = replace_file_extents(trans, rc, root,
+						   node->eb, NULL);
+			if (ret < 0)
+				goto out;
+		}
+		drop_node_buffer(node);
+	} else if (!root->ref_cows) {
+		path->lowest_level = node->level;
+		ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+		btrfs_release_path(root, path);
+		if (ret < 0)
+			goto out;
+	} else if (root != node->root) {
+		WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
+	}
+
+	update_processed_blocks(rc, node);
+	ret = 0;
+out:
+	drop_node_buffer(node);
+	return ret;
+}
+
+/*
+ * relocate a list of blocks
+ */
+static noinline_for_stack
+int relocate_tree_blocks(struct btrfs_trans_handle *trans,
+			 struct reloc_control *rc, struct rb_root *blocks)
+{
+	struct backref_cache *cache;
+	struct backref_node *node;
+	struct btrfs_path *path;
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	int level = -1;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	cache = kmalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	backref_cache_init(cache);
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		if (level == -1)
+			level = block->level;
+		else
+			BUG_ON(level != block->level);
+		if (!block->key_ready)
+			reada_tree_block(rc, block);
+		rb_node = rb_next(rb_node);
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+		if (!block->key_ready)
+			get_tree_block_key(rc, block);
+		rb_node = rb_next(rb_node);
+	}
+
+	rb_node = rb_first(blocks);
+	while (rb_node) {
+		block = rb_entry(rb_node, struct tree_block, rb_node);
+
+		node = build_backref_tree(rc, cache, &block->key,
+					  block->level, block->bytenr);
+		if (IS_ERR(node)) {
+			err = PTR_ERR(node);
+			goto out;
+		}
+
+		ret = relocate_tree_block(trans, rc, node, &block->key,
+					  path);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		remove_backref_node(cache, node);
+		rb_node = rb_next(rb_node);
+	}
+
+	if (level > 0)
+		goto out;
+
+	free_block_list(blocks);
+
+	/*
+	 * now backrefs of some upper level tree blocks have been cached,
+	 * try relocating blocks referenced by these upper level blocks.
+	 */
+	while (1) {
+		struct backref_node *upper = NULL;
+		if (trans->transaction->in_commit ||
+		    trans->transaction->delayed_refs.flushing)
+			break;
+
+		ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
+					  &upper);
+		if (ret < 0)
+			err = ret;
+		if (ret != 0)
+			break;
+
+		rb_node = rb_first(blocks);
+		while (rb_node) {
+			block = rb_entry(rb_node, struct tree_block, rb_node);
+			if (trans->transaction->in_commit ||
+			    trans->transaction->delayed_refs.flushing)
+				goto out;
+			BUG_ON(!block->key_ready);
+			node = build_backref_tree(rc, cache, &block->key,
+						  level, block->bytenr);
+			if (IS_ERR(node)) {
+				err = PTR_ERR(node);
+				goto out;
+			}
+
+			ret = relocate_tree_block(trans, rc, node,
+						  &block->key, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
+			}
+			remove_backref_node(cache, node);
+			rb_node = rb_next(rb_node);
+		}
+		free_block_list(blocks);
+
+		if (upper) {
+			ret = link_to_upper(trans, upper, path);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			remove_backref_node(cache, upper);
+		}
+	}
+out:
+	free_block_list(blocks);
+
+	ret = finish_pending_nodes(trans, cache, path);
+	if (ret < 0)
+		err = ret;
+
+	kfree(cache);
+	btrfs_free_path(path);
+	return err;
+}
+
+static noinline_for_stack
+int relocate_inode_pages(struct inode *inode, u64 start, u64 len)
+{
+	u64 page_start;
+	u64 page_end;
+	unsigned long i;
+	unsigned long first_index;
+	unsigned long last_index;
+	unsigned int total_read = 0;
+	unsigned int total_dirty = 0;
+	struct page *page;
+	struct file_ra_state *ra;
+	struct btrfs_ordered_extent *ordered;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int ret = 0;
+
+	ra = kzalloc(sizeof(*ra), GFP_NOFS);
+	if (!ra)
+		return -ENOMEM;
+
+	mutex_lock(&inode->i_mutex);
+	first_index = start >> PAGE_CACHE_SHIFT;
+	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
+
+	/* make sure the dirty trick played by the caller work */
+	ret = invalidate_inode_pages2_range(inode->i_mapping,
+					    first_index, last_index);
+	if (ret)
+		goto out_unlock;
+
+	file_ra_state_init(ra, inode->i_mapping);
+
+	for (i = first_index ; i <= last_index; i++) {
+		if (total_read % ra->ra_pages == 0) {
+			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
+				min(last_index, ra->ra_pages + i - 1));
+		}
+		total_read++;
+again:
+		if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
+			BUG_ON(1);
+		page = grab_cache_page(inode->i_mapping, i);
+		if (!page) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+		if (!PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				unlock_page(page);
+				page_cache_release(page);
+				ret = -EIO;
+				goto out_unlock;
+			}
+		}
+		wait_on_page_writeback(page);
+
+		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+		page_end = page_start + PAGE_CACHE_SIZE - 1;
+		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+
+		ordered = btrfs_lookup_ordered_extent(inode, page_start);
+		if (ordered) {
+			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+			unlock_page(page);
+			page_cache_release(page);
+			btrfs_start_ordered_extent(inode, ordered, 1);
+			btrfs_put_ordered_extent(ordered);
+			goto again;
+		}
+		set_page_extent_mapped(page);
+
+		if (i == first_index)
+			set_extent_bits(io_tree, page_start, page_end,
+					EXTENT_BOUNDARY, GFP_NOFS);
+		btrfs_set_extent_delalloc(inode, page_start, page_end);
+
+		set_page_dirty(page);
+		total_dirty++;
+
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+	}
+out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	kfree(ra);
+	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
+	return ret;
+}
+
+static noinline_for_stack
+int relocate_data_extent(struct inode *inode, struct btrfs_key *extent_key)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 start = extent_key->objectid - BTRFS_I(inode)->index_cnt;
+	u64 end = start + extent_key->offset - 1;
+
+	em = alloc_extent_map(GFP_NOFS);
+	em->start = start;
+	em->len = extent_key->offset;
+	em->block_len = extent_key->offset;
+	em->block_start = extent_key->objectid;
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
+	set_bit(EXTENT_FLAG_PINNED, &em->flags);
+
+	/* setup extent map to cheat btrfs_readpage */
+	lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+	while (1) {
+		int ret;
+		spin_lock(&em_tree->lock);
+		ret = add_extent_mapping(em_tree, em);
+		spin_unlock(&em_tree->lock);
+		if (ret != -EEXIST) {
+			free_extent_map(em);
+			break;
+		}
+		btrfs_drop_extent_cache(inode, start, end, 0);
+	}
+	unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+
+	return relocate_inode_pages(inode, start, extent_key->offset);
+}
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+static int get_ref_objectid_v0(struct reloc_control *rc,
+			       struct btrfs_path *path,
+			       struct btrfs_key *extent_key,
+			       u64 *ref_objectid, int *path_change)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	struct btrfs_extent_ref_v0 *ref0;
+	int ret;
+	int slot;
+
+	leaf = path->nodes[0];
+	slot = path->slots[0];
+	while (1) {
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret < 0)
+				return ret;
+			BUG_ON(ret > 0);
+			leaf = path->nodes[0];
+			slot = path->slots[0];
+			if (path_change)
+				*path_change = 1;
+		}
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+		if (key.objectid != extent_key->objectid)
+			return -ENOENT;
+
+		if (key.type != BTRFS_EXTENT_REF_V0_KEY) {
+			slot++;
+			continue;
+		}
+		ref0 = btrfs_item_ptr(leaf, slot,
+				struct btrfs_extent_ref_v0);
+		*ref_objectid = btrfs_ref_objectid_v0(leaf, ref0);
+		break;
+	}
+	return 0;
+}
+#endif
+
+/*
+ * helper to add a tree block to the list.
+ * the major work is getting the generation and level of the block
+ */
+static int add_tree_block(struct reloc_control *rc,
+			  struct btrfs_key *extent_key,
+			  struct btrfs_path *path,
+			  struct rb_root *blocks)
+{
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_tree_block_info *bi;
+	struct tree_block *block;
+	struct rb_node *rb_node;
+	u32 item_size;
+	int level = -1;
+	int generation;
+
+	eb =  path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	if (item_size >= sizeof(*ei) + sizeof(*bi)) {
+		ei = btrfs_item_ptr(eb, path->slots[0],
+				struct btrfs_extent_item);
+		bi = (struct btrfs_tree_block_info *)(ei + 1);
+		generation = btrfs_extent_generation(eb, ei);
+		level = btrfs_tree_block_level(eb, bi);
+	} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		u64 ref_owner;
+		int ret;
+
+		BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
+		ret = get_ref_objectid_v0(rc, path, extent_key,
+					  &ref_owner, NULL);
+		BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
+		level = (int)ref_owner;
+		/* FIXME: get real generation */
+		generation = 0;
+#else
+		BUG();
+#endif
+	}
+
+	btrfs_release_path(rc->extent_root, path);
+
+	BUG_ON(level == -1);
+
+	block = kmalloc(sizeof(*block), GFP_NOFS);
+	if (!block)
+		return -ENOMEM;
+
+	block->bytenr = extent_key->objectid;
+	block->key.objectid = extent_key->offset;
+	block->key.offset = generation;
+	block->level = level;
+	block->key_ready = 0;
+
+	rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
+	BUG_ON(rb_node);
+
+	return 0;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
+ */
+static int __add_tree_block(struct reloc_control *rc,
+			    u64 bytenr, u32 blocksize,
+			    struct rb_root *blocks)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int ret;
+
+	if (tree_block_processed(bytenr, blocksize, rc))
+		return 0;
+
+	if (tree_search(blocks, bytenr))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = bytenr;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = blocksize;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		goto out;
+	BUG_ON(ret);
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+	ret = add_tree_block(rc, &key, path, blocks);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to check if the block use full backrefs for pointers in it
+ */
+static int block_use_full_backref(struct reloc_control *rc,
+				  struct extent_buffer *eb)
+{
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+	u64 flags;
+	int ret;
+
+	if (btrfs_header_flag(eb, BTRFS_HEADER_FLAG_RELOC) ||
+	    btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
+		return 1;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = eb->start;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = eb->len;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, rc->extent_root,
+				&key, path, 0, 0);
+	BUG_ON(ret);
+
+	ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+			    struct btrfs_extent_item);
+	flags = btrfs_extent_flags(path->nodes[0], ei);
+	BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
+	if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
+		ret = 1;
+	else
+		ret = 0;
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
+ * this function scans fs tree to find blocks reference the data extent
+ */
+static int find_data_references(struct reloc_control *rc,
+				struct btrfs_key *extent_key,
+				struct extent_buffer *leaf,
+				struct btrfs_extent_data_ref *ref,
+				struct rb_root *blocks)
+{
+	struct btrfs_path *path;
+	struct tree_block *block;
+	struct btrfs_root *root;
+	struct btrfs_file_extent_item *fi;
+	struct rb_node *rb_node;
+	struct btrfs_key key;
+	u64 ref_root;
+	u64 ref_objectid;
+	u64 ref_offset;
+	u32 ref_count;
+	u32 nritems;
+	int err = 0;
+	int added = 0;
+	int counted;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ref_root = btrfs_extent_data_ref_root(leaf, ref);
+	ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
+	ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
+	ref_count = btrfs_extent_data_ref_count(leaf, ref);
+
+	root = read_fs_root(rc->extent_root->fs_info, ref_root);
+	if (IS_ERR(root)) {
+		err = PTR_ERR(root);
+		goto out;
+	}
+
+	key.objectid = ref_objectid;
+	key.offset = ref_offset;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+
+	path->search_commit_root = 1;
+	path->skip_locking = 1;
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0) {
+		err = ret;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	nritems = btrfs_header_nritems(leaf);
+	/*
+	 * the references in tree blocks that use full backrefs
+	 * are not counted in
+	 */
+	if (block_use_full_backref(rc, leaf))
+		counted = 0;
+	else
+		counted = 1;
+	rb_node = tree_search(blocks, leaf->start);
+	if (rb_node) {
+		if (counted)
+			added = 1;
+		else
+			path->slots[0] = nritems;
+	}
+
+	while (ref_count > 0) {
+		while (path->slots[0] >= nritems) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret < 0) {
+				err = ret;
+				goto out;
+			}
+			if (ret > 0) {
+				WARN_ON(1);
+				goto out;
+			}
+
+			leaf = path->nodes[0];
+			nritems = btrfs_header_nritems(leaf);
+			added = 0;
+
+			if (block_use_full_backref(rc, leaf))
+				counted = 0;
+			else
+				counted = 1;
+			rb_node = tree_search(blocks, leaf->start);
+			if (rb_node) {
+				if (counted)
+					added = 1;
+				else
+					path->slots[0] = nritems;
+			}
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid != ref_objectid ||
+		    key.type != BTRFS_EXTENT_DATA_KEY) {
+			WARN_ON(1);
+			break;
+		}
+
+		fi = btrfs_item_ptr(leaf, path->slots[0],
+				    struct btrfs_file_extent_item);
+
+		if (btrfs_file_extent_type(leaf, fi) ==
+		    BTRFS_FILE_EXTENT_INLINE)
+			goto next;
+
+		if (btrfs_file_extent_disk_bytenr(leaf, fi) !=
+		    extent_key->objectid)
+			goto next;
+
+		key.offset -= btrfs_file_extent_offset(leaf, fi);
+		if (key.offset != ref_offset)
+			goto next;
+
+		if (counted)
+			ref_count--;
+		if (added)
+			goto next;
+
+		if (!tree_block_processed(leaf->start, leaf->len, rc)) {
+			block = kmalloc(sizeof(*block), GFP_NOFS);
+			if (!block) {
+				err = -ENOMEM;
+				break;
+			}
+			block->bytenr = leaf->start;
+			btrfs_item_key_to_cpu(leaf, &block->key, 0);
+			block->level = 0;
+			block->key_ready = 1;
+			rb_node = tree_insert(blocks, block->bytenr,
+					      &block->rb_node);
+			BUG_ON(rb_node);
+		}
+		if (counted)
+			added = 1;
+		else
+			path->slots[0] = nritems;
+next:
+		path->slots[0]++;
+
+	}
+out:
+	btrfs_free_path(path);
+	return err;
+}
+
+/*
+ * hepler to find all tree blocks that reference a given data extent
+ */
+static noinline_for_stack
+int add_data_references(struct reloc_control *rc,
+			struct btrfs_key *extent_key,
+			struct btrfs_path *path,
+			struct rb_root *blocks)
+{
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_data_ref *dref;
+	struct btrfs_extent_inline_ref *iref;
+	unsigned long ptr;
+	unsigned long end;
+	u32 blocksize;
+	int ret;
+	int err = 0;
+
+	ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
+			       extent_key->offset);
+	BUG_ON(ret < 0);
+	if (ret > 0) {
+		/* the relocated data is fragmented */
+		rc->extents_skipped++;
+		btrfs_release_path(rc->extent_root, path);
+		return 0;
+	}
+
+	blocksize = btrfs_level_size(rc->extent_root, 0);
+
+	eb = path->nodes[0];
+	ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
+	end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+	if (ptr + sizeof(struct btrfs_extent_item_v0) == end)
+		ptr = end;
+	else
+#endif
+		ptr += sizeof(struct btrfs_extent_item);
+
+	while (ptr < end) {
+		iref = (struct btrfs_extent_inline_ref *)ptr;
+		key.type = btrfs_extent_inline_ref_type(eb, iref);
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			key.offset = btrfs_extent_inline_ref_offset(eb, iref);
+			ret = __add_tree_block(rc, key.offset, blocksize,
+					       blocks);
+		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
+			ret = find_data_references(rc, extent_key,
+						   eb, dref, blocks);
+		} else {
+			BUG();
+		}
+		ptr += btrfs_extent_inline_ref_size(key.type);
+	}
+	WARN_ON(ptr > end);
+
+	while (1) {
+		cond_resched();
+		eb = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+			if (ret > 0)
+				break;
+			eb = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+		if (key.objectid != extent_key->objectid)
+			break;
+
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY ||
+		    key.type == BTRFS_EXTENT_REF_V0_KEY) {
+#else
+		BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
+		if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+#endif
+			ret = __add_tree_block(rc, key.offset, blocksize,
+					       blocks);
+		} else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = btrfs_item_ptr(eb, path->slots[0],
+					      struct btrfs_extent_data_ref);
+			ret = find_data_references(rc, extent_key,
+						   eb, dref, blocks);
+		} else {
+			ret = 0;
+		}
+		if (ret) {
+			err = ret;
+			break;
+		}
+		path->slots[0]++;
+	}
+	btrfs_release_path(rc->extent_root, path);
+	if (err)
+		free_block_list(blocks);
+	return err;
+}
+
+/*
+ * hepler to find next unprocessed extent
+ */
+static noinline_for_stack
+int find_next_extent(struct btrfs_trans_handle *trans,
+		     struct reloc_control *rc, struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	u64 start, end, last;
+	int ret;
+
+	last = rc->block_group->key.objectid + rc->block_group->key.offset;
+	while (1) {
+		cond_resched();
+		if (rc->search_start >= last) {
+			ret = 1;
+			break;
+		}
+
+		key.objectid = rc->search_start;
+		key.type = BTRFS_EXTENT_ITEM_KEY;
+		key.offset = 0;
+
+		path->search_commit_root = 1;
+		path->skip_locking = 1;
+		ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
+					0, 0);
+		if (ret < 0)
+			break;
+next:
+		leaf = path->nodes[0];
+		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(rc->extent_root, path);
+			if (ret != 0)
+				break;
+			leaf = path->nodes[0];
+		}
+
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		if (key.objectid >= last) {
+			ret = 1;
+			break;
+		}
+
+		if (key.type != BTRFS_EXTENT_ITEM_KEY ||
+		    key.objectid + key.offset <= rc->search_start) {
+			path->slots[0]++;
+			goto next;
+		}
+
+		ret = find_first_extent_bit(&rc->processed_blocks,
+					    key.objectid, &start, &end,
+					    EXTENT_DIRTY);
+
+		if (ret == 0 && start <= key.objectid) {
+			btrfs_release_path(rc->extent_root, path);
+			rc->search_start = end + 1;
+		} else {
+			rc->search_start = key.objectid + key.offset;
+			return 0;
+		}
+	}
+	btrfs_release_path(rc->extent_root, path);
+	return ret;
+}
+
+static void set_reloc_control(struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+	mutex_lock(&fs_info->trans_mutex);
+	fs_info->reloc_ctl = rc;
+	mutex_unlock(&fs_info->trans_mutex);
+}
+
+static void unset_reloc_control(struct reloc_control *rc)
+{
+	struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
+	mutex_lock(&fs_info->trans_mutex);
+	fs_info->reloc_ctl = NULL;
+	mutex_unlock(&fs_info->trans_mutex);
+}
+
+static int check_extent_flags(u64 flags)
+{
+	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+		return 1;
+	if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
+		return 1;
+	if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
+	    (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
+		return 1;
+	return 0;
+}
+
+static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
+{
+	struct rb_root blocks = RB_ROOT;
+	struct btrfs_key key;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_path *path;
+	struct btrfs_extent_item *ei;
+	unsigned long nr;
+	u64 flags;
+	u32 item_size;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	rc->search_start = rc->block_group->key.objectid;
+	clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+			  GFP_NOFS);
+
+	rc->create_reloc_root = 1;
+	set_reloc_control(rc);
+
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+
+	while (1) {
+		trans = btrfs_start_transaction(rc->extent_root, 1);
+
+		ret = find_next_extent(trans, rc, path);
+		if (ret < 0)
+			err = ret;
+		if (ret != 0)
+			break;
+
+		rc->extents_found++;
+
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_extent_item);
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		item_size = btrfs_item_size_nr(path->nodes[0],
+					       path->slots[0]);
+		if (item_size >= sizeof(*ei)) {
+			flags = btrfs_extent_flags(path->nodes[0], ei);
+			ret = check_extent_flags(flags);
+			BUG_ON(ret);
+
+		} else {
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+			u64 ref_owner;
+			int path_change = 0;
+
+			BUG_ON(item_size !=
+			       sizeof(struct btrfs_extent_item_v0));
+			ret = get_ref_objectid_v0(rc, path, &key, &ref_owner,
+						  &path_change);
+			if (ref_owner < BTRFS_FIRST_FREE_OBJECTID)
+				flags = BTRFS_EXTENT_FLAG_TREE_BLOCK;
+			else
+				flags = BTRFS_EXTENT_FLAG_DATA;
+
+			if (path_change) {
+				btrfs_release_path(rc->extent_root, path);
+
+				path->search_commit_root = 1;
+				path->skip_locking = 1;
+				ret = btrfs_search_slot(NULL, rc->extent_root,
+							&key, path, 0, 0);
+				if (ret < 0) {
+					err = ret;
+					break;
+				}
+				BUG_ON(ret > 0);
+			}
+#else
+			BUG();
+#endif
+		}
+
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			ret = add_tree_block(rc, &key, path, &blocks);
+		} else if (rc->stage == UPDATE_DATA_PTRS &&
+			 (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			ret = add_data_references(rc, &key, path, &blocks);
+		} else {
+			btrfs_release_path(rc->extent_root, path);
+			ret = 0;
+		}
+		if (ret < 0) {
+			err = 0;
+			break;
+		}
+
+		if (!RB_EMPTY_ROOT(&blocks)) {
+			ret = relocate_tree_blocks(trans, rc, &blocks);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+		}
+
+		nr = trans->blocks_used;
+		btrfs_end_transaction_throttle(trans, rc->extent_root);
+		trans = NULL;
+		btrfs_btree_balance_dirty(rc->extent_root, nr);
+
+		if (rc->stage == MOVE_DATA_EXTENTS &&
+		    (flags & BTRFS_EXTENT_FLAG_DATA)) {
+			rc->found_file_extent = 1;
+			ret = relocate_data_extent(rc->data_inode, &key);
+			if (ret < 0) {
+				err = ret;
+				break;
+			}
+		}
+	}
+	btrfs_free_path(path);
+
+	if (trans) {
+		nr = trans->blocks_used;
+		btrfs_end_transaction(trans, rc->extent_root);
+		btrfs_btree_balance_dirty(rc->extent_root, nr);
+	}
+
+	rc->create_reloc_root = 0;
+	smp_mb();
+
+	if (rc->extents_found > 0) {
+		trans = btrfs_start_transaction(rc->extent_root, 1);
+		btrfs_commit_transaction(trans, rc->extent_root);
+	}
+
+	merge_reloc_roots(rc);
+
+	unset_reloc_control(rc);
+
+	/* get rid of pinned extents */
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+
+	return err;
+}
+
+static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 u64 objectid, u64 size)
+{
+	struct btrfs_path *path;
+	struct btrfs_inode_item *item;
+	struct extent_buffer *leaf;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
+	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
+	btrfs_set_inode_generation(leaf, item, 1);
+	btrfs_set_inode_size(leaf, item, size);
+	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
+	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(root, path);
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+/*
+ * helper to create inode for data relocation.
+ * the inode is in data relocation tree and its link count is 0
+ */
+static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *group)
+{
+	struct inode *inode = NULL;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	unsigned long nr;
+	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
+	int err = 0;
+
+	root = read_fs_root(fs_info, BTRFS_DATA_RELOC_TREE_OBJECTID);
+	if (IS_ERR(root))
+		return ERR_CAST(root);
+
+	trans = btrfs_start_transaction(root, 1);
+	BUG_ON(!trans);
+
+	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
+	if (err)
+		goto out;
+
+	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
+	BUG_ON(err);
+
+	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
+				       group->key.offset, 0, group->key.offset,
+				       0, 0, 0);
+	BUG_ON(err);
+
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
+	BTRFS_I(inode)->index_cnt = group->key.objectid;
+
+	err = btrfs_orphan_add(trans, inode);
+out:
+	nr = trans->blocks_used;
+	btrfs_end_transaction(trans, root);
+
+	btrfs_btree_balance_dirty(root, nr);
+	if (err) {
+		if (inode)
+			iput(inode);
+		inode = ERR_PTR(err);
+	}
+	return inode;
+}
+
+/*
+ * function to relocate all extents in a block group.
+ */
+int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
+{
+	struct btrfs_fs_info *fs_info = extent_root->fs_info;
+	struct reloc_control *rc;
+	int ret;
+	int err = 0;
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc)
+		return -ENOMEM;
+
+	mapping_tree_init(&rc->reloc_root_tree);
+	extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+	INIT_LIST_HEAD(&rc->reloc_roots);
+
+	rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
+	BUG_ON(!rc->block_group);
+
+	btrfs_init_workers(&rc->workers, "relocate",
+			   fs_info->thread_pool_size);
+
+	rc->extent_root = extent_root;
+	btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+
+	rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
+	if (IS_ERR(rc->data_inode)) {
+		err = PTR_ERR(rc->data_inode);
+		rc->data_inode = NULL;
+		goto out;
+	}
+
+	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
+	       (unsigned long long)rc->block_group->key.objectid,
+	       (unsigned long long)rc->block_group->flags);
+
+	btrfs_start_delalloc_inodes(fs_info->tree_root);
+	btrfs_wait_ordered_extents(fs_info->tree_root, 0);
+
+	while (1) {
+		mutex_lock(&fs_info->cleaner_mutex);
+		btrfs_clean_old_snapshots(fs_info->tree_root);
+		mutex_unlock(&fs_info->cleaner_mutex);
+
+		rc->extents_found = 0;
+		rc->extents_skipped = 0;
+
+		ret = relocate_block_group(rc);
+		if (ret < 0) {
+			err = ret;
+			break;
+		}
+
+		if (rc->extents_found == 0)
+			break;
+
+		printk(KERN_INFO "btrfs: found %llu extents\n",
+			(unsigned long long)rc->extents_found);
+
+		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
+			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
+			invalidate_mapping_pages(rc->data_inode->i_mapping,
+						 0, -1);
+			rc->stage = UPDATE_DATA_PTRS;
+		} else if (rc->stage == UPDATE_DATA_PTRS &&
+			   rc->extents_skipped >= rc->extents_found) {
+			iput(rc->data_inode);
+			rc->data_inode = create_reloc_inode(fs_info,
+							    rc->block_group);
+			if (IS_ERR(rc->data_inode)) {
+				err = PTR_ERR(rc->data_inode);
+				rc->data_inode = NULL;
+				break;
+			}
+			rc->stage = MOVE_DATA_EXTENTS;
+			rc->found_file_extent = 0;
+		}
+	}
+
+	filemap_fdatawrite_range(fs_info->btree_inode->i_mapping,
+				 rc->block_group->key.objectid,
+				 rc->block_group->key.objectid +
+				 rc->block_group->key.offset - 1);
+
+	WARN_ON(rc->block_group->pinned > 0);
+	WARN_ON(rc->block_group->reserved > 0);
+	WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
+out:
+	iput(rc->data_inode);
+	btrfs_stop_workers(&rc->workers);
+	btrfs_put_block_group(rc->block_group);
+	kfree(rc);
+	return err;
+}
+
+/*
+ * recover relocation interrupted by system crash.
+ *
+ * this function resumes merging reloc trees with corresponding fs trees.
+ * this is important for keeping the sharing of tree blocks
+ */
+int btrfs_recover_relocation(struct btrfs_root *root)
+{
+	LIST_HEAD(reloc_roots);
+	struct btrfs_key key;
+	struct btrfs_root *fs_root;
+	struct btrfs_root *reloc_root;
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct reloc_control *rc = NULL;
+	struct btrfs_trans_handle *trans;
+	int ret;
+	int err = 0;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	while (1) {
+		ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key,
+					path, 0, 0);
+		if (ret < 0) {
+			err = ret;
+			goto out;
+		}
+		if (ret > 0) {
+			if (path->slots[0] == 0)
+				break;
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+		btrfs_release_path(root->fs_info->tree_root, path);
+
+		if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
+		    key.type != BTRFS_ROOT_ITEM_KEY)
+			break;
+
+		reloc_root = btrfs_read_fs_root_no_radix(root, &key);
+		if (IS_ERR(reloc_root)) {
+			err = PTR_ERR(reloc_root);
+			goto out;
+		}
+
+		list_add(&reloc_root->root_list, &reloc_roots);
+
+		if (btrfs_root_refs(&reloc_root->root_item) > 0) {
+			fs_root = read_fs_root(root->fs_info,
+					       reloc_root->root_key.offset);
+			if (IS_ERR(fs_root)) {
+				err = PTR_ERR(fs_root);
+				goto out;
+			}
+		}
+
+		if (key.offset == 0)
+			break;
+
+		key.offset--;
+	}
+	btrfs_release_path(root->fs_info->tree_root, path);
+
+	if (list_empty(&reloc_roots))
+		goto out;
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	mapping_tree_init(&rc->reloc_root_tree);
+	INIT_LIST_HEAD(&rc->reloc_roots);
+	btrfs_init_workers(&rc->workers, "relocate",
+			   root->fs_info->thread_pool_size);
+	rc->extent_root = root->fs_info->extent_root;
+
+	set_reloc_control(rc);
+
+	while (!list_empty(&reloc_roots)) {
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del(&reloc_root->root_list);
+
+		if (btrfs_root_refs(&reloc_root->root_item) == 0) {
+			list_add_tail(&reloc_root->root_list,
+				      &rc->reloc_roots);
+			continue;
+		}
+
+		fs_root = read_fs_root(root->fs_info,
+				       reloc_root->root_key.offset);
+		BUG_ON(IS_ERR(fs_root));
+
+		__add_reloc_root(reloc_root);
+		fs_root->reloc_root = reloc_root;
+	}
+
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+
+	merge_reloc_roots(rc);
+
+	unset_reloc_control(rc);
+
+	trans = btrfs_start_transaction(rc->extent_root, 1);
+	btrfs_commit_transaction(trans, rc->extent_root);
+out:
+	if (rc) {
+		btrfs_stop_workers(&rc->workers);
+		kfree(rc);
+	}
+	while (!list_empty(&reloc_roots)) {
+		reloc_root = list_entry(reloc_roots.next,
+					struct btrfs_root, root_list);
+		list_del(&reloc_root->root_list);
+		free_extent_buffer(reloc_root->node);
+		free_extent_buffer(reloc_root->commit_root);
+		kfree(reloc_root);
+	}
+	btrfs_free_path(path);
+
+	if (err == 0) {
+		/* cleanup orphan inode in data relocation tree */
+		fs_root = read_fs_root(root->fs_info,
+				       BTRFS_DATA_RELOC_TREE_OBJECTID);
+		if (IS_ERR(fs_root))
+			err = PTR_ERR(fs_root);
+	}
+	return err;
+}
+
+/*
+ * helper to add ordered checksum for data relocation.
+ *
+ * cloning checksum properly handles the nodatasum extents.
+ * it also saves CPU time to re-calculate the checksum.
+ */
+int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
+{
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
+	struct btrfs_ordered_extent *ordered;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	size_t offset;
+	int ret;
+	u64 disk_bytenr;
+	LIST_HEAD(list);
+
+	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
+	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
+
+	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
+	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
+				       disk_bytenr + len - 1, &list);
+
+	while (!list_empty(&list)) {
+		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
+		list_del_init(&sums->list);
+
+		sector_sum = sums->sums;
+		sums->bytenr = ordered->start;
+
+		offset = 0;
+		while (offset < sums->len) {
+			sector_sum->bytenr += ordered->start - disk_bytenr;
+			sector_sum++;
+			offset += root->sectorsize;
+		}
+
+		btrfs_add_ordered_sum(inode, ordered, sums);
+	}
+	btrfs_put_ordered_extent(ordered);
+	return 0;
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index b48650de44725..0ddc6d61c55a7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -111,6 +111,15 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
 	return ret;
 }
 
+int btrfs_set_root_node(struct btrfs_root_item *item,
+			struct extent_buffer *node)
+{
+	btrfs_set_root_bytenr(item, node->start);
+	btrfs_set_root_level(item, btrfs_header_level(node));
+	btrfs_set_root_generation(item, btrfs_header_generation(node));
+	return 0;
+}
+
 /*
  * copy the data in 'item' into the btree
  */
@@ -164,8 +173,7 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
  * offset lower than the latest root.  They need to be queued for deletion to
  * finish what was happening when we crashed.
  */
-int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
-			  struct btrfs_root *latest)
+int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
 {
 	struct btrfs_root *dead_root;
 	struct btrfs_item *item;
@@ -227,10 +235,7 @@ int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid,
 			goto err;
 		}
 
-		if (objectid == BTRFS_TREE_RELOC_OBJECTID)
-			ret = btrfs_add_dead_reloc_root(dead_root);
-		else
-			ret = btrfs_add_dead_root(dead_root, latest);
+		ret = btrfs_add_dead_root(dead_root);
 		if (ret)
 			goto err;
 		goto again;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2ff7cd2db25f4..e9ef8c3307fe0 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -52,7 +52,6 @@
 #include "export.h"
 #include "compression.h"
 
-
 static struct super_operations btrfs_super_ops;
 
 static void btrfs_put_super(struct super_block *sb)
@@ -322,7 +321,7 @@ static int btrfs_fill_super(struct super_block *sb,
 	struct dentry *root_dentry;
 	struct btrfs_super_block *disk_super;
 	struct btrfs_root *tree_root;
-	struct btrfs_inode *bi;
+	struct btrfs_key key;
 	int err;
 
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -341,23 +340,15 @@ static int btrfs_fill_super(struct super_block *sb,
 	}
 	sb->s_fs_info = tree_root;
 	disk_super = &tree_root->fs_info->super_copy;
-	inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID,
-				  tree_root->fs_info->fs_root);
-	bi = BTRFS_I(inode);
-	bi->location.objectid = inode->i_ino;
-	bi->location.offset = 0;
-	bi->root = tree_root->fs_info->fs_root;
-
-	btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
 
-	if (!inode) {
-		err = -ENOMEM;
+	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root);
+	if (IS_ERR(inode)) {
+		err = PTR_ERR(inode);
 		goto fail_close;
 	}
-	if (inode->i_state & I_NEW) {
-		btrfs_read_locked_inode(inode);
-		unlock_new_inode(inode);
-	}
 
 	root_dentry = d_alloc_root(inode);
 	if (!root_dentry) {
@@ -584,7 +575,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
-		ret = btrfs_cleanup_reloc_trees(root);
+		/* recover relocation */
+		ret = btrfs_recover_relocation(root);
 		WARN_ON(ret);
 
 		ret = btrfs_cleanup_fs_roots(root->fs_info);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 01b143605ec1b..2e177d7f4bb94 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -25,7 +25,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "locking.h"
-#include "ref-cache.h"
 #include "tree-log.h"
 
 #define BTRFS_ROOT_TRANS_TAG 0
@@ -94,45 +93,37 @@ static noinline int join_transaction(struct btrfs_root *root)
  * to make sure the old root from before we joined the transaction is deleted
  * when the transaction commits
  */
-noinline int btrfs_record_root_in_trans(struct btrfs_root *root)
+static noinline int record_root_in_trans(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
-	u64 running_trans_id = root->fs_info->running_transaction->transid;
-	if (root->ref_cows && root->last_trans < running_trans_id) {
+	if (root->ref_cows && root->last_trans < trans->transid) {
 		WARN_ON(root == root->fs_info->extent_root);
-		if (root->root_item.refs != 0) {
-			radix_tree_tag_set(&root->fs_info->fs_roots_radix,
-				   (unsigned long)root->root_key.objectid,
-				   BTRFS_ROOT_TRANS_TAG);
-
-			dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-			BUG_ON(!dirty);
-			dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS);
-			BUG_ON(!dirty->root);
-			dirty->latest_root = root;
-			INIT_LIST_HEAD(&dirty->list);
-
-			root->commit_root = btrfs_root_node(root);
-
-			memcpy(dirty->root, root, sizeof(*root));
-			spin_lock_init(&dirty->root->node_lock);
-			spin_lock_init(&dirty->root->list_lock);
-			mutex_init(&dirty->root->objectid_mutex);
-			mutex_init(&dirty->root->log_mutex);
-			INIT_LIST_HEAD(&dirty->root->dead_list);
-			dirty->root->node = root->commit_root;
-			dirty->root->commit_root = NULL;
+		WARN_ON(root->root_item.refs == 0);
+		WARN_ON(root->commit_root != root->node);
+
+		radix_tree_tag_set(&root->fs_info->fs_roots_radix,
+			   (unsigned long)root->root_key.objectid,
+			   BTRFS_ROOT_TRANS_TAG);
+		root->last_trans = trans->transid;
+		btrfs_init_reloc_root(trans, root);
+	}
+	return 0;
+}
 
-			spin_lock(&root->list_lock);
-			list_add(&dirty->root->dead_list, &root->dead_list);
-			spin_unlock(&root->list_lock);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	if (!root->ref_cows)
+		return 0;
 
-			root->dirty_root = dirty;
-		} else {
-			WARN_ON(1);
-		}
-		root->last_trans = running_trans_id;
+	mutex_lock(&root->fs_info->trans_mutex);
+	if (root->last_trans == trans->transid) {
+		mutex_unlock(&root->fs_info->trans_mutex);
+		return 0;
 	}
+
+	record_root_in_trans(trans, root);
+	mutex_unlock(&root->fs_info->trans_mutex);
 	return 0;
 }
 
@@ -181,7 +172,6 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	ret = join_transaction(root);
 	BUG_ON(ret);
 
-	btrfs_record_root_in_trans(root);
 	h->transid = root->fs_info->running_transaction->transid;
 	h->transaction = root->fs_info->running_transaction;
 	h->blocks_reserved = num_blocks;
@@ -192,6 +182,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	h->delayed_ref_updates = 0;
 
 	root->fs_info->running_transaction->use_count++;
+	record_root_in_trans(h, root);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return h;
 }
@@ -233,6 +224,7 @@ static noinline int wait_for_commit(struct btrfs_root *root,
 	return 0;
 }
 
+#if 0
 /*
  * rate limit against the drop_snapshot code.  This helps to slow down new
  * operations if the drop_snapshot code isn't able to keep up.
@@ -273,6 +265,7 @@ static void throttle_on_drops(struct btrfs_root *root)
 			goto harder;
 	}
 }
+#endif
 
 void btrfs_throttle(struct btrfs_root *root)
 {
@@ -280,7 +273,6 @@ void btrfs_throttle(struct btrfs_root *root)
 	if (!root->fs_info->open_ioctl_trans)
 		wait_current_trans(root);
 	mutex_unlock(&root->fs_info->trans_mutex);
-	throttle_on_drops(root);
 }
 
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
@@ -323,9 +315,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 	memset(trans, 0, sizeof(*trans));
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
-	if (throttle)
-		throttle_on_drops(root);
-
 	return 0;
 }
 
@@ -462,12 +451,8 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
 		if (old_root_bytenr == root->node->start)
 			break;
-		btrfs_set_root_bytenr(&root->root_item,
-				       root->node->start);
-		btrfs_set_root_level(&root->root_item,
-				     btrfs_header_level(root->node));
-		btrfs_set_root_generation(&root->root_item, trans->transid);
 
+		btrfs_set_root_node(&root->root_item, root->node);
 		ret = btrfs_update_root(trans, tree_root,
 					&root->root_key,
 					&root->root_item);
@@ -477,14 +462,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
 		ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
 		BUG_ON(ret);
 	}
+	free_extent_buffer(root->commit_root);
+	root->commit_root = btrfs_root_node(root);
 	return 0;
 }
 
 /*
  * update all the cowonly tree roots on disk
  */
-int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
-			    struct btrfs_root *root)
+static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root)
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *next;
@@ -520,118 +507,54 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
  * a dirty root struct and adds it into the list of dead roots that need to
  * be deleted
  */
-int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest)
+int btrfs_add_dead_root(struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
-
-	dirty = kmalloc(sizeof(*dirty), GFP_NOFS);
-	if (!dirty)
-		return -ENOMEM;
-	dirty->root = root;
-	dirty->latest_root = latest;
-
 	mutex_lock(&root->fs_info->trans_mutex);
-	list_add(&dirty->list, &latest->fs_info->dead_roots);
+	list_add(&root->root_list, &root->fs_info->dead_roots);
 	mutex_unlock(&root->fs_info->trans_mutex);
 	return 0;
 }
 
 /*
- * at transaction commit time we need to schedule the old roots for
- * deletion via btrfs_drop_snapshot.  This runs through all the
- * reference counted roots that were modified in the current
- * transaction and puts them into the drop list
+ * update all the cowonly tree roots on disk
  */
-static noinline int add_dirty_roots(struct btrfs_trans_handle *trans,
-				    struct radix_tree_root *radix,
-				    struct list_head *list)
+static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
 	struct btrfs_root *gang[8];
-	struct btrfs_root *root;
+	struct btrfs_fs_info *fs_info = root->fs_info;
 	int i;
 	int ret;
 	int err = 0;
-	u32 refs;
 
 	while (1) {
-		ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0,
+		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+						 (void **)gang, 0,
 						 ARRAY_SIZE(gang),
 						 BTRFS_ROOT_TRANS_TAG);
 		if (ret == 0)
 			break;
 		for (i = 0; i < ret; i++) {
 			root = gang[i];
-			radix_tree_tag_clear(radix,
-				     (unsigned long)root->root_key.objectid,
-				     BTRFS_ROOT_TRANS_TAG);
-
-			BUG_ON(!root->ref_tree);
-			dirty = root->dirty_root;
+			radix_tree_tag_clear(&fs_info->fs_roots_radix,
+					(unsigned long)root->root_key.objectid,
+					BTRFS_ROOT_TRANS_TAG);
 
 			btrfs_free_log(trans, root);
-			btrfs_free_reloc_root(trans, root);
-
-			if (root->commit_root == root->node) {
-				WARN_ON(root->node->start !=
-					btrfs_root_bytenr(&root->root_item));
-
-				free_extent_buffer(root->commit_root);
-				root->commit_root = NULL;
-				root->dirty_root = NULL;
-
-				spin_lock(&root->list_lock);
-				list_del_init(&dirty->root->dead_list);
-				spin_unlock(&root->list_lock);
+			btrfs_update_reloc_root(trans, root);
 
-				kfree(dirty->root);
-				kfree(dirty);
-
-				/* make sure to update the root on disk
-				 * so we get any updates to the block used
-				 * counts
-				 */
-				err = btrfs_update_root(trans,
-						root->fs_info->tree_root,
-						&root->root_key,
-						&root->root_item);
+			if (root->commit_root == root->node)
 				continue;
-			}
 
-			memset(&root->root_item.drop_progress, 0,
-			       sizeof(struct btrfs_disk_key));
-			root->root_item.drop_level = 0;
-			root->commit_root = NULL;
-			root->dirty_root = NULL;
-			root->root_key.offset = root->fs_info->generation;
-			btrfs_set_root_bytenr(&root->root_item,
-					      root->node->start);
-			btrfs_set_root_level(&root->root_item,
-					     btrfs_header_level(root->node));
-			btrfs_set_root_generation(&root->root_item,
-						  root->root_key.offset);
-
-			err = btrfs_insert_root(trans, root->fs_info->tree_root,
+			free_extent_buffer(root->commit_root);
+			root->commit_root = btrfs_root_node(root);
+
+			btrfs_set_root_node(&root->root_item, root->node);
+			err = btrfs_update_root(trans, fs_info->tree_root,
 						&root->root_key,
 						&root->root_item);
 			if (err)
 				break;
-
-			refs = btrfs_root_refs(&dirty->root->root_item);
-			btrfs_set_root_refs(&dirty->root->root_item, refs - 1);
-			err = btrfs_update_root(trans, root->fs_info->tree_root,
-						&dirty->root->root_key,
-						&dirty->root->root_item);
-
-			BUG_ON(err);
-			if (refs == 1) {
-				list_add(&dirty->list, list);
-			} else {
-				WARN_ON(1);
-				free_extent_buffer(dirty->root->node);
-				kfree(dirty->root);
-				kfree(dirty);
-			}
 		}
 	}
 	return err;
@@ -688,12 +611,8 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
 				TASK_UNINTERRUPTIBLE);
 		mutex_unlock(&info->trans_mutex);
 
-		atomic_dec(&info->throttles);
-		wake_up(&info->transaction_throttle);
-
 		schedule();
 
-		atomic_inc(&info->throttles);
 		mutex_lock(&info->trans_mutex);
 		finish_wait(&info->transaction_wait, &wait);
 	}
@@ -705,111 +624,61 @@ static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
  * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
  * all of them
  */
-static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
-				     struct list_head *list)
+int btrfs_drop_dead_root(struct btrfs_root *root)
 {
-	struct btrfs_dirty_root *dirty;
 	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
 	unsigned long nr;
-	u64 num_bytes;
-	u64 bytes_used;
-	u64 max_useless;
-	int ret = 0;
-	int err;
-
-	while (!list_empty(list)) {
-		struct btrfs_root *root;
-
-		dirty = list_entry(list->prev, struct btrfs_dirty_root, list);
-		list_del_init(&dirty->list);
-
-		num_bytes = btrfs_root_used(&dirty->root->root_item);
-		root = dirty->latest_root;
-		atomic_inc(&root->fs_info->throttles);
-
-		while (1) {
-			/*
-			 * we don't want to jump in and create a bunch of
-			 * delayed refs if the transaction is starting to close
-			 */
-			wait_transaction_pre_flush(tree_root->fs_info);
-			trans = btrfs_start_transaction(tree_root, 1);
-
-			/*
-			 * we've joined a transaction, make sure it isn't
-			 * closing right now
-			 */
-			if (trans->transaction->delayed_refs.flushing) {
-				btrfs_end_transaction(trans, tree_root);
-				continue;
-			}
-
-			mutex_lock(&root->fs_info->drop_mutex);
-			ret = btrfs_drop_snapshot(trans, dirty->root);
-			if (ret != -EAGAIN)
-				break;
-			mutex_unlock(&root->fs_info->drop_mutex);
+	int ret;
 
-			err = btrfs_update_root(trans,
-					tree_root,
-					&dirty->root->root_key,
-					&dirty->root->root_item);
-			if (err)
-				ret = err;
-			nr = trans->blocks_used;
-			ret = btrfs_end_transaction(trans, tree_root);
-			BUG_ON(ret);
+	while (1) {
+		/*
+		 * we don't want to jump in and create a bunch of
+		 * delayed refs if the transaction is starting to close
+		 */
+		wait_transaction_pre_flush(tree_root->fs_info);
+		trans = btrfs_start_transaction(tree_root, 1);
 
-			btrfs_btree_balance_dirty(tree_root, nr);
-			cond_resched();
+		/*
+		 * we've joined a transaction, make sure it isn't
+		 * closing right now
+		 */
+		if (trans->transaction->delayed_refs.flushing) {
+			btrfs_end_transaction(trans, tree_root);
+			continue;
 		}
-		BUG_ON(ret);
-		atomic_dec(&root->fs_info->throttles);
-		wake_up(&root->fs_info->transaction_throttle);
 
-		num_bytes -= btrfs_root_used(&dirty->root->root_item);
-		bytes_used = btrfs_root_used(&root->root_item);
-		if (num_bytes) {
-			mutex_lock(&root->fs_info->trans_mutex);
-			btrfs_record_root_in_trans(root);
-			mutex_unlock(&root->fs_info->trans_mutex);
-			btrfs_set_root_used(&root->root_item,
-					    bytes_used - num_bytes);
-		}
+		ret = btrfs_drop_snapshot(trans, root);
+		if (ret != -EAGAIN)
+			break;
 
-		ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key);
-		if (ret) {
-			BUG();
+		ret = btrfs_update_root(trans, tree_root,
+					&root->root_key,
+					&root->root_item);
+		if (ret)
 			break;
-		}
-		mutex_unlock(&root->fs_info->drop_mutex);
-
-		spin_lock(&root->list_lock);
-		list_del_init(&dirty->root->dead_list);
-		if (!list_empty(&root->dead_list)) {
-			struct btrfs_root *oldest;
-			oldest = list_entry(root->dead_list.prev,
-					    struct btrfs_root, dead_list);
-			max_useless = oldest->root_key.offset - 1;
-		} else {
-			max_useless = root->root_key.offset - 1;
-		}
-		spin_unlock(&root->list_lock);
 
 		nr = trans->blocks_used;
 		ret = btrfs_end_transaction(trans, tree_root);
 		BUG_ON(ret);
 
-		ret = btrfs_remove_leaf_refs(root, max_useless, 0);
-		BUG_ON(ret);
-
-		free_extent_buffer(dirty->root->node);
-		kfree(dirty->root);
-		kfree(dirty);
-
 		btrfs_btree_balance_dirty(tree_root, nr);
 		cond_resched();
 	}
+	BUG_ON(ret);
+
+	ret = btrfs_del_root(trans, tree_root, &root->root_key);
+	BUG_ON(ret);
+
+	nr = trans->blocks_used;
+	ret = btrfs_end_transaction(trans, tree_root);
+	BUG_ON(ret);
+
+	free_extent_buffer(root->node);
+	free_extent_buffer(root->commit_root);
+	kfree(root);
+
+	btrfs_btree_balance_dirty(tree_root, nr);
 	return ret;
 }
 
@@ -839,24 +708,23 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto fail;
 
-	btrfs_record_root_in_trans(root);
+	record_root_in_trans(trans, root);
 	btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
 	memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
 
 	key.objectid = objectid;
-	key.offset = trans->transid;
+	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 
 	old = btrfs_lock_root_node(root);
 	btrfs_cow_block(trans, root, old, NULL, 0, &old);
+	btrfs_set_lock_blocking(old);
 
 	btrfs_copy_root(trans, root, old, &tmp, objectid);
 	btrfs_tree_unlock(old);
 	free_extent_buffer(old);
 
-	btrfs_set_root_bytenr(new_root_item, tmp->start);
-	btrfs_set_root_level(new_root_item, btrfs_header_level(tmp));
-	btrfs_set_root_generation(new_root_item, trans->transid);
+	btrfs_set_root_node(new_root_item, tmp);
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				new_root_item);
 	btrfs_tree_unlock(tmp);
@@ -964,6 +832,24 @@ static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
+static void update_super_roots(struct btrfs_root *root)
+{
+	struct btrfs_root_item *root_item;
+	struct btrfs_super_block *super;
+
+	super = &root->fs_info->super_copy;
+
+	root_item = &root->fs_info->chunk_root->root_item;
+	super->chunk_root = root_item->bytenr;
+	super->chunk_root_generation = root_item->generation;
+	super->chunk_root_level = root_item->level;
+
+	root_item = &root->fs_info->tree_root->root_item;
+	super->root = root_item->bytenr;
+	super->generation = root_item->generation;
+	super->root_level = root_item->level;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -971,8 +857,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
-	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
-	struct list_head dirty_fs_roots;
 	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
 	int ret;
@@ -999,7 +883,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 
 	mutex_lock(&root->fs_info->trans_mutex);
-	INIT_LIST_HEAD(&dirty_fs_roots);
 	if (cur_trans->in_commit) {
 		cur_trans->use_count++;
 		mutex_unlock(&root->fs_info->trans_mutex);
@@ -1105,41 +988,36 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	 * with the tree-log code.
 	 */
 	mutex_lock(&root->fs_info->tree_log_mutex);
-	/*
-	 * keep tree reloc code from adding new reloc trees
-	 */
-	mutex_lock(&root->fs_info->tree_reloc_mutex);
-
 
-	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
-			      &dirty_fs_roots);
+	ret = commit_fs_roots(trans, root);
 	BUG_ON(ret);
 
-	/* add_dirty_roots gets rid of all the tree log roots, it is now
+	/* commit_fs_roots gets rid of all the tree log roots, it is now
 	 * safe to free the root of tree log roots
 	 */
 	btrfs_free_log_root_tree(trans, root->fs_info);
 
-	ret = btrfs_commit_tree_roots(trans, root);
+	ret = commit_cowonly_roots(trans, root);
 	BUG_ON(ret);
 
 	cur_trans = root->fs_info->running_transaction;
 	spin_lock(&root->fs_info->new_trans_lock);
 	root->fs_info->running_transaction = NULL;
 	spin_unlock(&root->fs_info->new_trans_lock);
-	btrfs_set_super_generation(&root->fs_info->super_copy,
-				   cur_trans->transid);
-	btrfs_set_super_root(&root->fs_info->super_copy,
-			     root->fs_info->tree_root->node->start);
-	btrfs_set_super_root_level(&root->fs_info->super_copy,
-			   btrfs_header_level(root->fs_info->tree_root->node));
-
-	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
-				   chunk_root->node->start);
-	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
-					 btrfs_header_level(chunk_root->node));
-	btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy,
-				btrfs_header_generation(chunk_root->node));
+
+	btrfs_set_root_node(&root->fs_info->tree_root->root_item,
+			    root->fs_info->tree_root->node);
+	free_extent_buffer(root->fs_info->tree_root->commit_root);
+	root->fs_info->tree_root->commit_root =
+				btrfs_root_node(root->fs_info->tree_root);
+
+	btrfs_set_root_node(&root->fs_info->chunk_root->root_item,
+			    root->fs_info->chunk_root->node);
+	free_extent_buffer(root->fs_info->chunk_root->commit_root);
+	root->fs_info->chunk_root->commit_root =
+				btrfs_root_node(root->fs_info->chunk_root);
+
+	update_super_roots(root);
 
 	if (!root->fs_info->log_root_recovering) {
 		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
@@ -1153,7 +1031,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	trans->transaction->blocked = 0;
 
-	wake_up(&root->fs_info->transaction_throttle);
 	wake_up(&root->fs_info->transaction_wait);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
@@ -1170,9 +1047,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	btrfs_finish_extent_commit(trans, root, pinned_copy);
 	kfree(pinned_copy);
 
-	btrfs_drop_dead_reloc_roots(root);
-	mutex_unlock(&root->fs_info->tree_reloc_mutex);
-
 	/* do the directory inserts of any pending snapshot creations */
 	finish_pending_snapshots(trans, root->fs_info);
 
@@ -1186,16 +1060,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	put_transaction(cur_trans);
 	put_transaction(cur_trans);
 
-	list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
-	if (root->fs_info->closing)
-		list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots);
-
 	mutex_unlock(&root->fs_info->trans_mutex);
 
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
-
-	if (root->fs_info->closing)
-		drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots);
 	return ret;
 }
 
@@ -1204,16 +1071,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
  */
 int btrfs_clean_old_snapshots(struct btrfs_root *root)
 {
-	struct list_head dirty_roots;
-	INIT_LIST_HEAD(&dirty_roots);
-again:
-	mutex_lock(&root->fs_info->trans_mutex);
-	list_splice_init(&root->fs_info->dead_roots, &dirty_roots);
-	mutex_unlock(&root->fs_info->trans_mutex);
+	LIST_HEAD(list);
+	struct btrfs_fs_info *fs_info = root->fs_info;
+
+	mutex_lock(&fs_info->trans_mutex);
+	list_splice_init(&fs_info->dead_roots, &list);
+	mutex_unlock(&fs_info->trans_mutex);
 
-	if (!list_empty(&dirty_roots)) {
-		drop_dirty_roots(root, &dirty_roots);
-		goto again;
+	while (!list_empty(&list)) {
+		root = list_entry(list.next, struct btrfs_root, root_list);
+		list_del_init(&root->root_list);
+		btrfs_drop_dead_root(root);
 	}
 	return 0;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 94f5bde2b58d4..961c3ee5a2e10 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -62,12 +62,6 @@ struct btrfs_pending_snapshot {
 	struct list_head list;
 };
 
-struct btrfs_dirty_root {
-	struct list_head list;
-	struct btrfs_root *root;
-	struct btrfs_root *latest_root;
-};
-
 static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
 					       struct inode *inode)
 {
@@ -100,7 +94,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root);
 
-int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest);
+int btrfs_add_dead_root(struct btrfs_root *root);
+int btrfs_drop_dead_root(struct btrfs_root *root);
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
@@ -108,7 +103,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
-int btrfs_record_root_in_trans(struct btrfs_root *root);
+int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
 int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 					struct extent_io_tree *dirty_pages);
 #endif
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index db5e212e84452..2b41fc08c34aa 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -430,18 +430,16 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
 static noinline struct inode *read_one_inode(struct btrfs_root *root,
 					     u64 objectid)
 {
+	struct btrfs_key key;
 	struct inode *inode;
-	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
-	if (inode->i_state & I_NEW) {
-		BTRFS_I(inode)->root = root;
-		BTRFS_I(inode)->location.objectid = objectid;
-		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
-		BTRFS_I(inode)->location.offset = 0;
-		btrfs_read_locked_inode(inode);
-		unlock_new_inode(inode);
 
-	}
-	if (is_bad_inode(inode)) {
+	key.objectid = objectid;
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.offset = 0;
+	inode = btrfs_iget(root->fs_info->sb, &key, root);
+	if (IS_ERR(inode)) {
+		inode = NULL;
+	} else if (is_bad_inode(inode)) {
 		iput(inode);
 		inode = NULL;
 	}
@@ -541,6 +539,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 
 	if (found_type == BTRFS_FILE_EXTENT_REG ||
 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
+		u64 offset;
 		unsigned long dest_offset;
 		struct btrfs_key ins;
 
@@ -555,6 +554,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 		ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
 		ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
 		ins.type = BTRFS_EXTENT_ITEM_KEY;
+		offset = key->offset - btrfs_file_extent_offset(eb, item);
 
 		if (ins.objectid > 0) {
 			u64 csum_start;
@@ -569,19 +569,16 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
 			if (ret == 0) {
 				ret = btrfs_inc_extent_ref(trans, root,
 						ins.objectid, ins.offset,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid);
+						0, root->root_key.objectid,
+						key->objectid, offset);
 			} else {
 				/*
 				 * insert the extent pointer in the extent
 				 * allocation tree
 				 */
-				ret = btrfs_alloc_logged_extent(trans, root,
-						path->nodes[0]->start,
-						root->root_key.objectid,
-						trans->transid, key->objectid,
-						&ins);
+				ret = btrfs_alloc_logged_file_extent(trans,
+						root, root->root_key.objectid,
+						key->objectid, offset, &ins);
 				BUG_ON(ret);
 			}
 			btrfs_release_path(root, path);
@@ -1706,9 +1703,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
-				ret = btrfs_drop_leaf_ref(trans, root, next);
-				BUG_ON(ret);
-
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_reserved_extent(root,
@@ -1753,10 +1747,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 		btrfs_wait_tree_block_writeback(next);
 		btrfs_tree_unlock(next);
 
-		if (*level == 0) {
-			ret = btrfs_drop_leaf_ref(trans, root, next);
-			BUG_ON(ret);
-		}
 		WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
 		ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
 		BUG_ON(ret);
@@ -1811,12 +1801,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				btrfs_wait_tree_block_writeback(next);
 				btrfs_tree_unlock(next);
 
-				if (*level == 0) {
-					ret = btrfs_drop_leaf_ref(trans, root,
-								  next);
-					BUG_ON(ret);
-				}
-
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
 				ret = btrfs_free_reserved_extent(root,
 						path->nodes[*level]->start,
@@ -1884,11 +1868,6 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 			btrfs_wait_tree_block_writeback(next);
 			btrfs_tree_unlock(next);
 
-			if (orig_level == 0) {
-				ret = btrfs_drop_leaf_ref(trans, log,
-							  next);
-				BUG_ON(ret);
-			}
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
 			ret = btrfs_free_reserved_extent(log, next->start,
@@ -2027,9 +2006,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
 	BUG_ON(ret);
 
-	btrfs_set_root_bytenr(&log->root_item, log->node->start);
-	btrfs_set_root_generation(&log->root_item, trans->transid);
-	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+	btrfs_set_root_node(&log->root_item, log->node);
 
 	root->log_batch = 0;
 	root->log_transid++;
@@ -2581,7 +2558,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 				       ins_keys, ins_sizes, nr);
 	BUG_ON(ret);
 
-	for (i = 0; i < nr; i++) {
+	for (i = 0; i < nr; i++, dst_path->slots[0]++) {
 		dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
 						   dst_path->slots[0]);
 
@@ -2617,36 +2594,31 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 			found_type = btrfs_file_extent_type(src, extent);
 			if (found_type == BTRFS_FILE_EXTENT_REG ||
 			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
-				u64 ds = btrfs_file_extent_disk_bytenr(src,
-								   extent);
-				u64 dl = btrfs_file_extent_disk_num_bytes(src,
-								      extent);
-				u64 cs = btrfs_file_extent_offset(src, extent);
-				u64 cl = btrfs_file_extent_num_bytes(src,
-								     extent);;
+				u64 ds, dl, cs, cl;
+				ds = btrfs_file_extent_disk_bytenr(src,
+								extent);
+				/* ds == 0 is a hole */
+				if (ds == 0)
+					continue;
+
+				dl = btrfs_file_extent_disk_num_bytes(src,
+								extent);
+				cs = btrfs_file_extent_offset(src, extent);
+				cl = btrfs_file_extent_num_bytes(src,
+								extent);;
 				if (btrfs_file_extent_compression(src,
 								  extent)) {
 					cs = 0;
 					cl = dl;
 				}
-				/* ds == 0 is a hole */
-				if (ds != 0) {
-					ret = btrfs_inc_extent_ref(trans, log,
-						   ds, dl,
-						   dst_path->nodes[0]->start,
-						   BTRFS_TREE_LOG_OBJECTID,
-						   trans->transid,
-						   ins_keys[i].objectid);
-					BUG_ON(ret);
-					ret = btrfs_lookup_csums_range(
-						   log->fs_info->csum_root,
-						   ds + cs, ds + cs + cl - 1,
-						   &ordered_sums);
-					BUG_ON(ret);
-				}
+
+				ret = btrfs_lookup_csums_range(
+						log->fs_info->csum_root,
+						ds + cs, ds + cs + cl - 1,
+						&ordered_sums);
+				BUG_ON(ret);
 			}
 		}
-		dst_path->slots[0]++;
 	}
 
 	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
@@ -3029,9 +3001,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 		BUG_ON(!wc.replay_dest);
 
 		wc.replay_dest->log_root = log;
-		mutex_lock(&fs_info->trans_mutex);
-		btrfs_record_root_in_trans(wc.replay_dest);
-		mutex_unlock(&fs_info->trans_mutex);
+		btrfs_record_root_in_trans(trans, wc.replay_dest);
 		ret = walk_log_tree(trans, log, &wc);
 		BUG_ON(ret);
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index a6d35b0054ca7..8bc6a88074821 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1671,8 +1671,6 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
 	int ret;
 	int i;
 
-	printk(KERN_INFO "btrfs relocating chunk %llu\n",
-	       (unsigned long long)chunk_offset);
 	root = root->fs_info->chunk_root;
 	extent_root = root->fs_info->extent_root;
 	em_tree = &root->fs_info->mapping_tree.map_tree;
-- 
GitLab


From b36124210248706186a02093427bdff4b3f548e8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 13 May 2009 19:12:15 -0400
Subject: [PATCH 1890/2198] Btrfs: stop avoiding balancing at the end of the
 transaction.

When the delayed reference code was added, some checks were added
to avoid extra balancing while the delayed references were being flushed.
This made for less efficient btrees, but it reduced the chances of
loops where no forward progress was made because the balances made
more delayed ref updates.

With the new dead root removal code and the mixed back references,
the extent allocation tree is no longer using precise back refs, and
the delayed reference updates don't carry the risk of looping forever
anymore.  So, the balance avoidance is no longer required.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2b960278a2f95..2f633e751198b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1052,8 +1052,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (trans->transaction->delayed_refs.flushing &&
-	    btrfs_header_nritems(mid) > 2)
+	if (btrfs_header_nritems(mid) > 2)
 		return 0;
 
 	if (btrfs_header_nritems(mid) < 2)
@@ -2194,7 +2193,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 		ret = insert_new_root(trans, root, path, level + 1);
 		if (ret)
 			return ret;
-	} else if (!trans->transaction->delayed_refs.flushing) {
+	} else {
 		ret = push_nodes_for_insert(trans, root, path, level);
 		c = path->nodes[level];
 		if (!ret && btrfs_header_nritems(c) <
@@ -2869,8 +2868,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
 	int num_doubles = 0;
 
 	/* first try to make some room by pushing left and right */
-	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
-	    !trans->transaction->delayed_refs.flushing) {
+	if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
 		wret = push_leaf_right(trans, root, path, data_size, 0);
 		if (wret < 0)
 			return wret;
@@ -3809,8 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
-		    !trans->transaction->delayed_refs.flushing) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
-- 
GitLab


From cfbb9308463f6dad1334884db046ccf0f1a77918 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 May 2009 10:41:58 -0400
Subject: [PATCH 1891/2198] Btrfs: balance btree more often

With the new back reference code, the cost of a balance has gone down
in terms of the number of back reference updates done.  This commit
makes us more aggressively balance leaves and nodes as they become
less full.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2f633e751198b..60a45f3a4e916 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1651,7 +1651,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
 		}
 		b = p->nodes[level];
 	} else if (ins_len < 0 && btrfs_header_nritems(b) <
-		   BTRFS_NODEPTRS_PER_BLOCK(root) / 4) {
+		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
 		int sret;
 
 		sret = reada_for_balance(root, p, level);
@@ -3807,7 +3807,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		}
 
 		/* delete the leaf if it is mostly empty */
-		if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+		if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
 			/* push_leaf_left fixes the path.
 			 * make sure the path still points to our leaf
 			 * for possible call to del_ptr below
-- 
GitLab


From 2c943de6ad795a174dcc424c293bb77f15ae3b8c Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 May 2009 10:41:58 -0400
Subject: [PATCH 1892/2198] Btrfs: reduce mount -o ssd CPU usage

The block allocator in SSD mode will try to find groups of free blocks
that are close together.  This commit makes it loop less on a given
group size before bumping it.

The end result is that we are less likely to fill small holes in the
available free space, but we don't waste as much CPU building the
large cluster used by ssd mode.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0bc93657b4601..280165581c572 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -652,7 +652,7 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 			last = entry;
 			max_extent = 0;
 			total_retries++;
-			if (total_retries % 256 == 0) {
+			if (total_retries % 64 == 0) {
 				if (min_bytes >= (bytes + empty_size)) {
 					ret = -ENOSPC;
 					goto out;
-- 
GitLab


From 585ad2c3797dcaa643aeba75b9f072778adf3490 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 18 May 2009 10:41:58 -0400
Subject: [PATCH 1893/2198] Btrfs: fix metadata dirty throttling limits

Once a metadata block has been written, it must be recowed, so the
btrfs dirty balancing call has a check to make sure a fair amount of metadata
was actually dirty before it started writing it back to disk.

A previous commit had changed the dirty tracking for metadata without
updating the btrfs dirty balancing checks.  This commit switches it
to use the correct counter.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7f5c6e3e99928..e572cf478a5d4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2378,17 +2378,14 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	 * looks as though older kernels can get into trouble with
 	 * this code, they end up stuck in balance_dirty_pages forever
 	 */
-	struct extent_io_tree *tree;
 	u64 num_dirty;
-	u64 start = 0;
 	unsigned long thresh = 32 * 1024 * 1024;
-	tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 
 	if (current->flags & PF_MEMALLOC)
 		return;
 
-	num_dirty = count_range_bits(tree, &start, (u64)-1,
-				     thresh, EXTENT_DIRTY);
+	num_dirty = root->fs_info->dirty_metadata_bytes;
+
 	if (num_dirty > thresh) {
 		balance_dirty_pages_ratelimited_nr(
 				   root->fs_info->btree_inode->i_mapping, 1);
-- 
GitLab


From d84275c938e1a5e2dc5b89eb9b878e0ddb2c55e0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 15:39:08 -0400
Subject: [PATCH 1894/2198] Btrfs: don't allow WRITE_SYNC bios to starve out
 regular writes

Btrfs uses dedicated threads to submit bios when checksumming is on,
which allows us to make sure the threads dedicated to checksumming don't get
stuck waiting for requests.  For each btrfs device, there are
two lists of bios.  One list is for WRITE_SYNC bios and the other
is for regular priority bios.

The IO submission threads used to process all of the WRITE_SYNC bios first and
then switch to the regular bios.  This commit makes sure we don't completely
starve the regular bios by rotating between the two lists.

WRITE_SYNC bios are still favored 2:1 over the regular bios, and this tries
to run in batches to avoid seeking.  Benchmarking shows this eliminates
stalls during streaming buffered writes on both multi-device and
single device filesystems.

If the regular bios starve, the system can end up with a large amount of ram
pinned down in writeback pages.  If we are a little more fair between the two
classes, we're able to keep throughput up and make progress on the bulk of
our dirty ram.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8bc6a88074821..9d3618192009f 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -163,6 +163,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	unsigned long num_sync_run;
 	unsigned long limit;
 	unsigned long last_waited = 0;
+	int force_reg = 0;
 
 	bdi = blk_get_backing_dev_info(device->bdev);
 	fs_info = device->dev_root->fs_info;
@@ -176,19 +177,22 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 
 loop:
 	spin_lock(&device->io_lock);
-	num_run = 0;
 
 loop_lock:
+	num_run = 0;
 
 	/* take all the bios off the list at once and process them
 	 * later on (without the lock held).  But, remember the
 	 * tail and other pointers so the bios can be properly reinserted
 	 * into the list if we hit congestion
 	 */
-	if (device->pending_sync_bios.head)
+	if (!force_reg && device->pending_sync_bios.head) {
 		pending_bios = &device->pending_sync_bios;
-	else
+		force_reg = 1;
+	} else {
 		pending_bios = &device->pending_bios;
+		force_reg = 0;
+	}
 
 	pending = pending_bios->head;
 	tail = pending_bios->tail;
@@ -228,10 +232,14 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	while (pending) {
 
 		rmb();
-		if (pending_bios != &device->pending_sync_bios &&
-		    device->pending_sync_bios.head &&
-		    num_run > 16) {
-			cond_resched();
+		/* we want to work on both lists, but do more bios on the
+		 * sync list than the regular list
+		 */
+		if ((num_run > 32 &&
+		    pending_bios != &device->pending_sync_bios &&
+		    device->pending_sync_bios.head) ||
+		   (num_run > 64 && pending_bios == &device->pending_sync_bios &&
+		    device->pending_bios.head)) {
 			spin_lock(&device->io_lock);
 			requeue_list(pending_bios, pending, tail);
 			goto loop_lock;
-- 
GitLab


From d644d8a1e30b88a93bcfb63cada2ae628462ddba Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 15:59:22 -0400
Subject: [PATCH 1895/2198] Btrfs: avoid IO stalls behind congested devices in
 a multi-device FS

The btrfs IO submission threads try to service a bunch of devices with a small
number of threads.  They do a congestion check to try and avoid waiting
on requests for a busy device.

The checks make sure we've sent a few requests down to a given device just so
that we aren't bouncing between busy devices without actually sending down
any IO.  The counter used to decide if we can switch to the next device
is somewhat overloaded.  It is also being used to decide if we've done
a good batch of requests between the WRITE_SYNC or regular priority lists.
It may get reset to zero often, leaving us hammering on a busy device
instead of moving on to another disk.

This commit adds a new counter for the number of bios sent while
servicing a device.  It doesn't get reset or fiddled with.  On
multi-device filesystems, this fixes IO stalls in streaming
write workloads.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/volumes.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9d3618192009f..27d5f37b845fe 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -161,6 +161,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 	int again = 0;
 	unsigned long num_run;
 	unsigned long num_sync_run;
+	unsigned long batch_run = 0;
 	unsigned long limit;
 	unsigned long last_waited = 0;
 	int force_reg = 0;
@@ -257,6 +258,8 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 		BUG_ON(atomic_read(&cur->bi_cnt) == 0);
 		submit_bio(cur->bi_rw, cur);
 		num_run++;
+		batch_run++;
+
 		if (bio_sync(cur))
 			num_sync_run++;
 
@@ -273,7 +276,7 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
 		 * is now congested.  Back off and let other work structs
 		 * run instead
 		 */
-		if (pending && bdi_write_congested(bdi) && num_run > 16 &&
+		if (pending && bdi_write_congested(bdi) && batch_run > 32 &&
 		    fs_info->fs_devices->open_devices > 1) {
 			struct io_context *ioc;
 
-- 
GitLab


From 3b30c22f64a6bb297719c60e494af1d26563f584 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 16:42:22 -0400
Subject: [PATCH 1896/2198] Btrfs: Add mount -o nossd

This allows you to turn off the ssd mode via remount.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index e9ef8c3307fe0..22855a18eb487 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_thread_pool, Opt_noacl,  Opt_compress, Opt_notreelog,
-	Opt_ratio, Opt_flushoncommit, Opt_err,
+	Opt_ssd, Opt_nossd, Opt_thread_pool, Opt_noacl,  Opt_compress,
+	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +83,7 @@ static match_table_t tokens = {
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
+	{Opt_nossd, "nossd"},
 	{Opt_noacl, "noacl"},
 	{Opt_notreelog, "notreelog"},
 	{Opt_flushoncommit, "flushoncommit"},
@@ -173,6 +174,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
 			break;
+		case Opt_nossd:
+			printk(KERN_INFO "btrfs: not using ssd allocation scheme\n");
+			btrfs_clear_opt(info->mount_opt, SSD);
+			break;
 		case Opt_nobarrier:
 			printk(KERN_INFO "btrfs: turning off barriers\n");
 			btrfs_set_opt(info->mount_opt, NOBARRIER);
-- 
GitLab


From c604480171c510c1beeb81b82418e5bc4de8f1ae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 18:35:15 -0400
Subject: [PATCH 1897/2198] Btrfs: avoid allocation clusters that are too
 spread out

In SSD mode for data, and all the time for metadata the allocator
will try to find a cluster of nearby blocks for allocations.  This
commit adds extra checks to make sure that each free block in the
cluster is close to the last one.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 280165581c572..ac23476beb6ea 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -645,7 +645,8 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 		 * we haven't filled the empty size and the window is
 		 * very large.  reset and try again
 		 */
-		if (next->offset - window_start > (bytes + empty_size) * 2) {
+		if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
+		    next->offset - window_start > (bytes + empty_size) * 2) {
 			entry = next;
 			window_start = entry->offset;
 			window_free = entry->bytes;
-- 
GitLab


From 451d7585a8bb1b9bec0d676ce3dece1923164e55 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 9 Jun 2009 20:28:34 -0400
Subject: [PATCH 1898/2198] Btrfs: add mount -o ssd_spread to spread
 allocations out

Some SSDs perform best when reusing block numbers often, while
others perform much better when clustering strictly allocates
big chunks of unused space.

The default mount -o ssd will find rough groupings of blocks
where there are a bunch of free blocks that might have some
allocated blocks mixed in.

mount -o ssd_spread will make sure there are no allocated blocks
mixed in.  It should perform better on lower end SSDs.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h            |  1 +
 fs/btrfs/extent-tree.c      |  2 +-
 fs/btrfs/free-space-cache.c |  5 ++++-
 fs/btrfs/free-space-cache.h |  1 +
 fs/btrfs/super.c            | 19 +++++++++++++++----
 5 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ce3ab4e130642..b9d8788b299ec 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1100,6 +1100,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_COMPRESS		(1 << 5)
 #define BTRFS_MOUNT_NOTREELOG           (1 << 6)
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
+#define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a42419c276e21..3355d7ea83081 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3607,7 +3607,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 			last_ptr_loop = 0;
 
 			/* allocate a cluster in this block group */
-			ret = btrfs_find_space_cluster(trans,
+			ret = btrfs_find_space_cluster(trans, root,
 					       block_group, last_ptr,
 					       offset, num_bytes,
 					       empty_cluster + empty_size);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ac23476beb6ea..4538e48581a51 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -579,6 +579,7 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
  * it returns -enospc
  */
 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size)
@@ -595,7 +596,9 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
 	int ret;
 
 	/* for metadata, allow allocates with more holes */
-	if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
+	if (btrfs_test_opt(root, SSD_SPREAD)) {
+		min_bytes = bytes + empty_size;
+	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		/*
 		 * we want to do larger allocations when we are
 		 * flushing out the delayed refs, it helps prevent
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index ab0bdc0a63ce2..266fb87640544 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -31,6 +31,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
 u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 22855a18eb487..7f5b2889949a1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -66,8 +66,8 @@ static void btrfs_put_super(struct super_block *sb)
 enum {
 	Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow,
 	Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier,
-	Opt_ssd, Opt_nossd, Opt_thread_pool, Opt_noacl,  Opt_compress,
-	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
+	Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl,
+	Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -83,6 +83,7 @@ static match_table_t tokens = {
 	{Opt_thread_pool, "thread_pool=%d"},
 	{Opt_compress, "compress"},
 	{Opt_ssd, "ssd"},
+	{Opt_ssd_spread, "ssd_spread"},
 	{Opt_nossd, "nossd"},
 	{Opt_noacl, "noacl"},
 	{Opt_notreelog, "notreelog"},
@@ -174,9 +175,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
 			btrfs_set_opt(info->mount_opt, SSD);
 			break;
+		case Opt_ssd_spread:
+			printk(KERN_INFO "btrfs: use spread ssd "
+			       "allocation scheme\n");
+			btrfs_set_opt(info->mount_opt, SSD);
+			btrfs_set_opt(info->mount_opt, SSD_SPREAD);
+			break;
 		case Opt_nossd:
-			printk(KERN_INFO "btrfs: not using ssd allocation scheme\n");
+			printk(KERN_INFO "btrfs: not using ssd allocation "
+			       "scheme\n");
 			btrfs_clear_opt(info->mount_opt, SSD);
+			btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
 			break;
 		case Opt_nobarrier:
 			printk(KERN_INFO "btrfs: turning off barriers\n");
@@ -429,7 +438,9 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
 	if (btrfs_test_opt(root, COMPRESS))
 		seq_puts(seq, ",compress");
-	if (btrfs_test_opt(root, SSD))
+	if (btrfs_test_opt(root, SSD_SPREAD))
+		seq_puts(seq, ",ssd_spread");
+	else if (btrfs_test_opt(root, SSD))
 		seq_puts(seq, ",ssd");
 	if (btrfs_test_opt(root, NOTREELOG))
 		seq_puts(seq, ",notreelog");
-- 
GitLab


From c289811cc096c57ff35550ee8132793a4f9b5b59 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Jun 2009 09:51:32 -0400
Subject: [PATCH 1899/2198] Btrfs: autodetect SSD devices

During mount, btrfs will check the queue nonrot flag
for all the devices found in the FS.  If they are all
non-rotating, SSD mode is enabled by default.

If the FS was mounted with -o nossd, the non-rotating
flag is ignored.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   | 1 +
 fs/btrfs/disk-io.c | 9 +++++++++
 fs/btrfs/super.c   | 3 +++
 fs/btrfs/volumes.c | 6 ++++++
 fs/btrfs/volumes.h | 5 +++++
 5 files changed, 24 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index b9d8788b299ec..5fa7d7d287a4b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1101,6 +1101,7 @@ struct btrfs_root {
 #define BTRFS_MOUNT_NOTREELOG           (1 << 6)
 #define BTRFS_MOUNT_FLUSHONCOMMIT       (1 << 7)
 #define BTRFS_MOUNT_SSD_SPREAD		(1 << 8)
+#define BTRFS_MOUNT_NOSSD		(1 << 9)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e572cf478a5d4..f4dfbb7ab4965 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1850,6 +1850,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (IS_ERR(fs_info->transaction_kthread))
 		goto fail_cleaner;
 
+	if (!btrfs_test_opt(tree_root, SSD) &&
+	    !btrfs_test_opt(tree_root, NOSSD) &&
+	    !fs_info->fs_devices->rotating) {
+		printk(KERN_INFO "Btrfs detected SSD devices, enabling SSD "
+		       "mode\n");
+		btrfs_set_opt(fs_info->mount_opt, SSD);
+	}
+
 	if (btrfs_super_log_root(disk_super) != 0) {
 		u64 bytenr = btrfs_super_log_root(disk_super);
 
@@ -1893,6 +1901,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
+
 	return tree_root;
 
 fail_trans_kthread:
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 7f5b2889949a1..3427db28f6fe9 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -184,6 +184,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_nossd:
 			printk(KERN_INFO "btrfs: not using ssd allocation "
 			       "scheme\n");
+			btrfs_set_opt(info->mount_opt, NOSSD);
 			btrfs_clear_opt(info->mount_opt, SSD);
 			btrfs_clear_opt(info->mount_opt, SSD_SPREAD);
 			break;
@@ -438,6 +439,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
 	if (btrfs_test_opt(root, COMPRESS))
 		seq_puts(seq, ",compress");
+	if (btrfs_test_opt(root, NOSSD))
+		seq_puts(seq, ",nossd");
 	if (btrfs_test_opt(root, SSD_SPREAD))
 		seq_puts(seq, ",ssd_spread");
 	else if (btrfs_test_opt(root, SSD))
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 27d5f37b845fe..3f4a5932eac9e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -605,6 +605,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		device->in_fs_metadata = 0;
 		device->mode = flags;
 
+		if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+			fs_devices->rotating = 1;
+
 		fs_devices->open_devices++;
 		if (device->writeable) {
 			fs_devices->rw_devices++;
@@ -1473,6 +1476,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	root->fs_info->fs_devices->rw_devices++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
+	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
+		root->fs_info->fs_devices->rotating = 1;
+
 	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 5c3ff6d02fd72..3c1f7310421e2 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -107,6 +107,11 @@ struct btrfs_fs_devices {
 	int seeding;
 
 	int opened;
+
+	/* set when we find or add a device that doesn't have the
+	 * nonrot flag set
+	 */
+	int rotating;
 };
 
 struct btrfs_bio_stripe {
-- 
GitLab


From 6cbff00f4632c8060b06bfc9585805217f11e12e Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 17 Apr 2009 10:37:41 +0200
Subject: [PATCH 1900/2198] Btrfs: implement
 FS_IOC_GETFLAGS/SETFLAGS/GETVERSION

Add support for the standard attributes set via chattr and read via
lsattr.  Currently we store the attributes in the flags value in
the btrfs inode, but I wonder whether we should split it into two so
that we don't have to keep converting between the two formats.

Remove the btrfs_clear_flag/btrfs_set_flag/btrfs_test_flag macros
as they were confusing the existing code and got in the way of the
new additions.

Also add the FS_IOC_GETVERSION ioctl for getting i_generation as it's
trivial.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/btrfs_inode.h |   1 -
 fs/btrfs/compression.c |   6 +-
 fs/btrfs/ctree.h       |  16 ++--
 fs/btrfs/inode.c       |  27 ++++---
 fs/btrfs/ioctl.c       | 171 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 200 insertions(+), 21 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index ecf5f7d8166fb..acb4f35175825 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -157,5 +157,4 @@ static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 	BTRFS_I(inode)->disk_i_size = size;
 }
 
-
 #endif
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index ab07627084f13..de1e2fd32080f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -123,7 +123,7 @@ static int check_compressed_csum(struct inode *inode,
 	u32 csum;
 	u32 *cb_sum = &cb->sums;
 
-	if (btrfs_test_flag(inode, NODATASUM))
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
 	for (i = 0; i < cb->nr_pages; i++) {
@@ -670,7 +670,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 			 */
 			atomic_inc(&cb->pending_bios);
 
-			if (!btrfs_test_flag(inode, NODATASUM)) {
+			if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
 				btrfs_lookup_bio_sums(root, inode, comp_bio,
 						      sums);
 			}
@@ -697,7 +697,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
 	BUG_ON(ret);
 
-	if (!btrfs_test_flag(inode, NODATASUM))
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
 		btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
 
 	ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5fa7d7d287a4b..4d6e0b6f21eac 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1115,12 +1115,14 @@ struct btrfs_root {
 #define BTRFS_INODE_READONLY		(1 << 2)
 #define BTRFS_INODE_NOCOMPRESS		(1 << 3)
 #define BTRFS_INODE_PREALLOC		(1 << 4)
-#define btrfs_clear_flag(inode, flag)	(BTRFS_I(inode)->flags &= \
-					 ~BTRFS_INODE_##flag)
-#define btrfs_set_flag(inode, flag)	(BTRFS_I(inode)->flags |= \
-					 BTRFS_INODE_##flag)
-#define btrfs_test_flag(inode, flag)	(BTRFS_I(inode)->flags & \
-					 BTRFS_INODE_##flag)
+#define BTRFS_INODE_SYNC		(1 << 5)
+#define BTRFS_INODE_IMMUTABLE		(1 << 6)
+#define BTRFS_INODE_APPEND		(1 << 7)
+#define BTRFS_INODE_NODUMP		(1 << 8)
+#define BTRFS_INODE_NOATIME		(1 << 9)
+#define BTRFS_INODE_DIRSYNC		(1 << 10)
+
+
 /* some macros to generate set/get funcs for the struct fields.  This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -2260,6 +2262,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size);
 
 /* ioctl.c */
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+void btrfs_update_iflags(struct inode *inode);
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 917bf10597c60..5b68330f85852 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -368,7 +368,7 @@ static noinline int compress_file_range(struct inode *inode,
 	 * inode has not been flagged as nocompress.  This flag can
 	 * change at any time if we discover bad compression ratios.
 	 */
-	if (!btrfs_test_flag(inode, NOCOMPRESS) &&
+	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
 	    btrfs_test_opt(root, COMPRESS)) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
@@ -469,7 +469,7 @@ static noinline int compress_file_range(struct inode *inode,
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		btrfs_set_flag(inode, NOCOMPRESS);
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -862,7 +862,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 		async_cow->locked_page = locked_page;
 		async_cow->start = start;
 
-		if (btrfs_test_flag(inode, NOCOMPRESS))
+		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
 			cur_end = end;
 		else
 			cur_end = min(end, start + 512 * 1024 - 1);
@@ -1133,10 +1133,10 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 
-	if (btrfs_test_flag(inode, NODATACOW))
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 1, nr_written);
-	else if (btrfs_test_flag(inode, PREALLOC))
+	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
 					 page_started, 0, nr_written);
 	else if (!btrfs_test_opt(root, COMPRESS))
@@ -1290,7 +1290,7 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	int ret = 0;
 	int skip_sum;
 
-	skip_sum = btrfs_test_flag(inode, NODATASUM);
+	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
 	BUG_ON(ret);
@@ -1790,7 +1790,8 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		ClearPageChecked(page);
 		goto good;
 	}
-	if (btrfs_test_flag(inode, NODATASUM))
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
 		return 0;
 
 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
@@ -2156,6 +2157,8 @@ static void btrfs_read_locked_inode(struct inode *inode)
 		init_special_inode(inode, inode->i_mode, rdev);
 		break;
 	}
+
+	btrfs_update_iflags(inode);
 	return;
 
 make_bad:
@@ -3586,9 +3589,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			btrfs_find_block_group(root, 0, alloc_hint, owner);
 	if ((mode & S_IFREG)) {
 		if (btrfs_test_opt(root, NODATASUM))
-			btrfs_set_flag(inode, NODATASUM);
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
 		if (btrfs_test_opt(root, NODATACOW))
-			btrfs_set_flag(inode, NODATACOW);
+			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
 	}
 
 	key[0].objectid = objectid;
@@ -3642,6 +3645,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	location->offset = 0;
 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
 
+	btrfs_inherit_iflags(inode, dir);
+
 	insert_inode_hash(inode);
 	inode_tree_add(inode);
 	return inode;
@@ -5075,7 +5080,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
 out:
 	if (cur_offset > start) {
 		inode->i_ctime = CURRENT_TIME;
-		btrfs_set_flag(inode, PREALLOC);
+		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    cur_offset > i_size_read(inode))
 			btrfs_i_size_write(inode, cur_offset);
@@ -5196,7 +5201,7 @@ static int btrfs_set_page_dirty(struct page *page)
 
 static int btrfs_permission(struct inode *inode, int mask)
 {
-	if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE))
+	if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
 		return -EACCES;
 	return generic_permission(inode, mask, btrfs_check_acl);
 }
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 54dfd45cc5917..926332a73cdef 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -50,7 +50,172 @@
 #include "volumes.h"
 #include "locking.h"
 
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
+{
+	if (S_ISDIR(mode))
+		return flags;
+	else if (S_ISREG(mode))
+		return flags & ~FS_DIRSYNC_FL;
+	else
+		return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
+}
+
+/*
+ * Export inode flags to the format expected by the FS_IOC_GETFLAGS ioctl.
+ */
+static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
+{
+	unsigned int iflags = 0;
+
+	if (flags & BTRFS_INODE_SYNC)
+		iflags |= FS_SYNC_FL;
+	if (flags & BTRFS_INODE_IMMUTABLE)
+		iflags |= FS_IMMUTABLE_FL;
+	if (flags & BTRFS_INODE_APPEND)
+		iflags |= FS_APPEND_FL;
+	if (flags & BTRFS_INODE_NODUMP)
+		iflags |= FS_NODUMP_FL;
+	if (flags & BTRFS_INODE_NOATIME)
+		iflags |= FS_NOATIME_FL;
+	if (flags & BTRFS_INODE_DIRSYNC)
+		iflags |= FS_DIRSYNC_FL;
+
+	return iflags;
+}
+
+/*
+ * Update inode->i_flags based on the btrfs internal flags.
+ */
+void btrfs_update_iflags(struct inode *inode)
+{
+	struct btrfs_inode *ip = BTRFS_I(inode);
+
+	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+
+	if (ip->flags & BTRFS_INODE_SYNC)
+		inode->i_flags |= S_SYNC;
+	if (ip->flags & BTRFS_INODE_IMMUTABLE)
+		inode->i_flags |= S_IMMUTABLE;
+	if (ip->flags & BTRFS_INODE_APPEND)
+		inode->i_flags |= S_APPEND;
+	if (ip->flags & BTRFS_INODE_NOATIME)
+		inode->i_flags |= S_NOATIME;
+	if (ip->flags & BTRFS_INODE_DIRSYNC)
+		inode->i_flags |= S_DIRSYNC;
+}
+
+/*
+ * Inherit flags from the parent inode.
+ *
+ * Unlike extN we don't have any flags we don't want to inherit currently.
+ */
+void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
+{
+	unsigned int flags = BTRFS_I(dir)->flags;
+
+	if (S_ISREG(inode->i_mode))
+		flags &= ~BTRFS_INODE_DIRSYNC;
+	else if (!S_ISDIR(inode->i_mode))
+		flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+
+	BTRFS_I(inode)->flags = flags;
+	btrfs_update_iflags(inode);
+}
+
+static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
+{
+	struct btrfs_inode *ip = BTRFS_I(file->f_path.dentry->d_inode);
+	unsigned int flags = btrfs_flags_to_ioctl(ip->flags);
+
+	if (copy_to_user(arg, &flags, sizeof(flags)))
+		return -EFAULT;
+	return 0;
+}
+
+static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct btrfs_inode *ip = BTRFS_I(inode);
+	struct btrfs_root *root = ip->root;
+	struct btrfs_trans_handle *trans;
+	unsigned int flags, oldflags;
+	int ret;
+
+	if (copy_from_user(&flags, arg, sizeof(flags)))
+		return -EFAULT;
+
+	if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
+		      FS_NOATIME_FL | FS_NODUMP_FL | \
+		      FS_SYNC_FL | FS_DIRSYNC_FL))
+		return -EOPNOTSUPP;
 
+	if (!is_owner_or_cap(inode))
+		return -EACCES;
+
+	mutex_lock(&inode->i_mutex);
+
+	flags = btrfs_mask_flags(inode->i_mode, flags);
+	oldflags = btrfs_flags_to_ioctl(ip->flags);
+	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE)) {
+			ret = -EPERM;
+			goto out_unlock;
+		}
+	}
+
+	ret = mnt_want_write(file->f_path.mnt);
+	if (ret)
+		goto out_unlock;
+
+	if (flags & FS_SYNC_FL)
+		ip->flags |= BTRFS_INODE_SYNC;
+	else
+		ip->flags &= ~BTRFS_INODE_SYNC;
+	if (flags & FS_IMMUTABLE_FL)
+		ip->flags |= BTRFS_INODE_IMMUTABLE;
+	else
+		ip->flags &= ~BTRFS_INODE_IMMUTABLE;
+	if (flags & FS_APPEND_FL)
+		ip->flags |= BTRFS_INODE_APPEND;
+	else
+		ip->flags &= ~BTRFS_INODE_APPEND;
+	if (flags & FS_NODUMP_FL)
+		ip->flags |= BTRFS_INODE_NODUMP;
+	else
+		ip->flags &= ~BTRFS_INODE_NODUMP;
+	if (flags & FS_NOATIME_FL)
+		ip->flags |= BTRFS_INODE_NOATIME;
+	else
+		ip->flags &= ~BTRFS_INODE_NOATIME;
+	if (flags & FS_DIRSYNC_FL)
+		ip->flags |= BTRFS_INODE_DIRSYNC;
+	else
+		ip->flags &= ~BTRFS_INODE_DIRSYNC;
+
+
+	trans = btrfs_join_transaction(root, 1);
+	BUG_ON(!trans);
+
+	ret = btrfs_update_inode(trans, root, inode);
+	BUG_ON(ret);
+
+	btrfs_update_iflags(inode);
+	inode->i_ctime = CURRENT_TIME;
+	btrfs_end_transaction(trans, root);
+
+	mnt_drop_write(file->f_path.mnt);
+ out_unlock:
+	mutex_unlock(&inode->i_mutex);
+	return 0;
+}
+
+static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	return put_user(inode->i_generation, arg);
+}
 
 static noinline int create_subvol(struct btrfs_root *root,
 				  struct dentry *dentry,
@@ -1077,6 +1242,12 @@ long btrfs_ioctl(struct file *file, unsigned int
 	void __user *argp = (void __user *)arg;
 
 	switch (cmd) {
+	case FS_IOC_GETFLAGS:
+		return btrfs_ioctl_getflags(file, argp);
+	case FS_IOC_SETFLAGS:
+		return btrfs_ioctl_setflags(file, argp);
+	case FS_IOC_GETVERSION:
+		return btrfs_ioctl_getversion(file, argp);
 	case BTRFS_IOC_SNAP_CREATE:
 		return btrfs_ioctl_snap_create(file, argp, 0);
 	case BTRFS_IOC_SUBVOL_CREATE:
-- 
GitLab


From 163e783e6a8b1e8bcb4c9084d438091386b589df Mon Sep 17 00:00:00 2001
From: David Woodhouse <dwmw2@infradead.org>
Date: Sun, 19 Apr 2009 13:02:41 +0100
Subject: [PATCH 1901/2198] Btrfs: remove crc32c.h and use libcrc32c directly.

There's no need to preserve this abstraction; it used to let us use
hardware crc32c support directly, but libcrc32c is already doing that for us
through the crypto API -- so we're already using the Intel crc32c
acceleration where appropriate.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/crc32c.h      | 29 -----------------------------
 fs/btrfs/disk-io.c     |  4 ++--
 fs/btrfs/extent-tree.c |  7 +++----
 fs/btrfs/hash.h        |  4 ++--
 4 files changed, 7 insertions(+), 37 deletions(-)
 delete mode 100644 fs/btrfs/crc32c.h

diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h
deleted file mode 100644
index 6e1b3de367004..0000000000000
--- a/fs/btrfs/crc32c.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * Copyright (C) 2008 Oracle.  All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License v2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- */
-
-#ifndef __BTRFS_CRC32C__
-#define __BTRFS_CRC32C__
-#include <linux/crc32c.h>
-
-/*
- * this file used to do more for selecting the HW version of crc32c,
- * perhaps it will one day again soon.
- */
-#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length)
-#endif
-
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index f4dfbb7ab4965..6c54c210dfd07 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -26,8 +26,8 @@
 #include <linux/workqueue.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/crc32c.h>
 #include "compat.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
@@ -171,7 +171,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
 
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
 {
-	return btrfs_crc32c(seed, data, len);
+	return crc32c(seed, data, len);
 }
 
 void btrfs_csum_final(u32 crc, char *result)
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3355d7ea83081..33a65f2c8a378 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,7 +23,6 @@
 #include <linux/rcupdate.h>
 #include "compat.h"
 #include "hash.h"
-#include "crc32c.h"
 #include "ctree.h"
 #include "disk-io.h"
 #include "print-tree.h"
@@ -625,11 +624,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
 	__le64 lenum;
 
 	lenum = cpu_to_le64(root_objectid);
-	high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
+	high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(owner);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 	lenum = cpu_to_le64(offset);
-	low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
+	low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
 
 	return ((u64)high_crc << 31) ^ (u64)low_crc;
 }
diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h
index 2a020b2767683..db2ff9773b99d 100644
--- a/fs/btrfs/hash.h
+++ b/fs/btrfs/hash.h
@@ -19,9 +19,9 @@
 #ifndef __HASH__
 #define __HASH__
 
-#include "crc32c.h"
+#include <linux/crc32c.h>
 static inline u64 btrfs_name_hash(const char *name, int len)
 {
-	return btrfs_crc32c((u32)~1, name, len);
+	return crc32c((u32)~1, name, len);
 }
 #endif
-- 
GitLab


From 524724ed1f224875a117be593540591ed050c73d Mon Sep 17 00:00:00 2001
From: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Date: Wed, 10 Jun 2009 11:13:17 -0400
Subject: [PATCH 1902/2198] Btrfs: fdatasync should skip metadata writeout

In btrfs, fdatasync and fsync are identical, but
fdatasync should skip committing transaction when
inode->i_state is set just I_DIRTY_SYNC and this indicates
only atime or/and mtime updates.
Following patch improves fdatasync throughput.

--file-block-size=4K --file-total-size=16G --file-test-mode=rndwr
--file-fsync-mode=fdatasync run

Results:
-2.6.30-rc8
Test execution summary:
    total time:                          1980.6540s
    total number of events:              10001
    total time taken by event execution: 1192.9804
    per-request statistics:
         min:                            0.0000s
         avg:                            0.1193s
         max:                            15.3720s
         approx.  95 percentile:         0.7257s

Threads fairness:
    events (avg/stddev):           625.0625/151.32
    execution time (avg/stddev):   74.5613/9.46

-2.6.30-rc8-patched
Test execution summary:
    total time:                          1695.9118s
    total number of events:              10000
    total time taken by event execution: 871.3214
    per-request statistics:
         min:                            0.0000s
         avg:                            0.0871s
         max:                            10.4644s
         approx.  95 percentile:         0.4787s

Threads fairness:
    events (avg/stddev):           625.0000/131.86
    execution time (avg/stddev):   54.4576/8.98

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/file.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0726a734ee388..126477eaecf56 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1157,6 +1157,8 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	root->log_batch++;
 
+	if (datasync && !(inode->i_state & I_DIRTY_PAGES))
+		goto out;
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-- 
GitLab


From 7df336ec1266dccbb253bac52c529d3dcc7c22d0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 10 Jun 2009 11:36:43 -0400
Subject: [PATCH 1903/2198] Fix btrfs when ACLs are configured out

... otherwise generic_permission() will allow *anything* for all
files you don't own and that have some group permissions.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/acl.c   | 5 -----
 fs/btrfs/ctree.h | 4 ++++
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index cbba000dccbea..603972576f0f3 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -351,9 +351,4 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir)
 	return 0;
 }
 
-int btrfs_check_acl(struct inode *inode, int mask)
-{
-	return 0;
-}
-
 #endif /* CONFIG_FS_POSIX_ACL */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4d6e0b6f21eac..03441a99ea38a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2301,7 +2301,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
 /* acl.c */
+#ifdef CONFIG_FS_POSIX_ACL
 int btrfs_check_acl(struct inode *inode, int mask);
+#else
+#define btrfs_check_acl NULL
+#endif
 int btrfs_init_acl(struct inode *inode, struct inode *dir);
 int btrfs_acl_chmod(struct inode *inode);
 
-- 
GitLab


From 58f7f68f228c3aba2ba4468d92e2cec35724ba2e Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Jun 2009 09:57:55 -0400
Subject: [PATCH 1904/2198] cifs: add addr= mount option alias for ip=

When you look in /proc/mounts, the address of the server gets displayed
as "addr=". That's really a better option to use anyway since it's more
generic. What if we eventually want to support non-IP transports? It
also makes CIFS option consistent with the NFS option of the same name.

Begin the migration to that option name by adding an alias for ip=
called addr=.

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 10151f8d84955..6298dc32adebd 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -958,7 +958,8 @@ cifs_parse_mount_options(char *options, const char *devname,
 				}
 				strcpy(vol->password, value);
 			}
-		} else if (strnicmp(data, "ip", 2) == 0) {
+		} else if (!strnicmp(data, "ip", 2) ||
+			   !strnicmp(data, "addr", 4)) {
 			if (!value || !*value) {
 				vol->UNCip = NULL;
 			} else if (strnlen(value, INET6_ADDRSTRLEN) <
-- 
GitLab


From bc5c6c043d8381676339fb3da59cc4cc5921d368 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Wed, 10 Jun 2009 04:48:41 -0400
Subject: [PATCH 1905/2198] ftrace/documentation: fix typo in function grapher
 name

The function graph tracer is called just "function_graph" (no trailing
"_tracer" needed).

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
LKML-Reference: <1244623722-6325-1-git-send-email-vapier@gentoo.org>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 Documentation/trace/ftrace.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index fd9a3e6938135..5ad2ded8aa636 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured.
 
 	Function call tracer to trace all kernel functions.
 
-  "function_graph_tracer"
+  "function_graph"
 
 	Similar to the function tracer except that the
 	function tracer probes the functions on their entry
-- 
GitLab


From a41f20716975910d9beb90b7efc61107901492b8 Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V" <aneesh.kumar@linux.vnet.ibm.com>
Date: Wed, 10 Jun 2009 14:22:55 -0400
Subject: [PATCH 1906/2198] ext4: Avoid corrupting the uninitialized bit in the
 extent during truncate

The unitialized bit was not properly getting preserved in in an extent
which is partially truncated because the it was geting set to the
value of the first extent to be removed or truncated as part of the
truncate operation, and if there are multiple extents are getting
removed or modified as part of the truncate operation, it is only the
last extent which will might be partially truncated, and its
uninitalized bit is not necessarily the same as the first extent to be
truncated.

Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
---
 fs/ext4/extents.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 9c35a7b1f0ae7..2593f748c3a48 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2083,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
-	if (ext4_ext_is_uninitialized(ex))
-		uninitialized = 1;
 	ex_ee_len = ext4_ext_get_actual_len(ex);
 
 	while (ex >= EXT_FIRST_EXTENT(eh) &&
 			ex_ee_block + ex_ee_len > start) {
+
+		if (ext4_ext_is_uninitialized(ex))
+			uninitialized = 1;
+		else
+			uninitialized = 0;
+
 		ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
 		path[depth].p_ext = ex;
 
-- 
GitLab


From 6ff9a64d2aaa6eae396adc95e9c91c0cbfa6dbe4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 10 Jun 2009 14:28:34 -0400
Subject: [PATCH 1907/2198] tracing: do not translate event helper macros in
 print format

By moving the macro that creates the print format code above the
defining of the event macro helpers (__get_str, __print_symbolic,
and __get_dynamic_array), we get a little cleaner print format.

Instead of:

  (char *)((void *)REC + REC->__data_loc_name)

we get:

   __get_str(name)

Instead of:

   ({ static const struct trace_print_flags symbols[] = { { HI_SOFTIRQ, "HI" }, {

we get:

   __print_symbolic(REC->vec, { HI_SOFTIRQ, "HI" }, {

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/trace/ftrace.h | 158 +++++++++++++++++++++--------------------
 1 file changed, 81 insertions(+), 77 deletions(-)

diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 40ede4db4d885..1867553c61e59 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -80,6 +80,87 @@
 
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)				       \
+	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
+			       "offset:%u;\tsize:%u;\n",		       \
+			       (unsigned int)offsetof(typeof(field),	       \
+					__data_loc_##item),		       \
+			       (unsigned int)sizeof(field.__data_loc_##item)); \
+	if (!ret)							       \
+		return 0;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef __entry
+#define __entry REC
+
+#undef __print_symbolic
+#undef __get_dynamic_array
+#undef __get_str
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field __attribute__((unused));		\
+	int ret = 0;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
 /*
  * Stage 3 of the trace events.
  *
@@ -179,83 +260,6 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 	
 #include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
 
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " offset:%u; size:%u;\n",
- *			       offsetof(struct ftrace_raw_##call, item),
- *			       sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __array
-#define __array(type, item, len)						\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef __dynamic_array
-#define __dynamic_array(type, item, len)				       \
-	ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"	       \
-			       "offset:%u;\tsize:%u;\n",		       \
-			       (unsigned int)offsetof(typeof(field),	       \
-					__data_loc_##item),		       \
-			       (unsigned int)sizeof(field.__data_loc_##item)); \
-	if (!ret)							       \
-		return 0;
-
-#undef __string
-#define __string(item, src) __dynamic_array(char, item, -1)
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field __attribute__((unused));		\
-	int ret = 0;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: " print);			\
-									\
-	return ret;							\
-}
-
-#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
-
 #undef __field
 #define __field(type, item)						\
 	ret = trace_define_field(event_call, #type, #item,		\
-- 
GitLab


From 61b6bc525a34931bb73e4c95bfe009cd9572a288 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@redhat.com>
Date: Wed, 10 Jun 2009 10:04:58 -0400
Subject: [PATCH 1908/2198] cifs: remove never-used in6_addr option

This option was never used to my knowledge. Remove it before someone
does...

Signed-off-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Steve French <sfrench@us.ibm.com>
---
 fs/cifs/connect.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 6298dc32adebd..97f4311b9a8ea 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -62,7 +62,6 @@ struct smb_vol {
 	char *domainname;
 	char *UNC;
 	char *UNCip;
-	char *in6_addr;   /* ipv6 address as human readable form of in6_addr */
 	char *iocharset;  /* local code page for mapping to and from Unicode */
 	char source_rfc1001_name[16]; /* netbios name of client */
 	char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -1320,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname,
 			vol->direct_io = 1;
 		} else if (strnicmp(data, "forcedirectio", 13) == 0) {
 			vol->direct_io = 1;
-		} else if (strnicmp(data, "in6_addr", 8) == 0) {
-			if (!value || !*value) {
-				vol->in6_addr = NULL;
-			} else if (strnlen(value, 49) == 48) {
-				vol->in6_addr = value;
-			} else {
-				printk(KERN_WARNING "CIFS: ip v6 address not "
-						    "48 characters long\n");
-				return 1;
-			}
 		} else if (strnicmp(data, "noac", 4) == 0) {
 			printk(KERN_WARNING "CIFS: Mount option noac not "
 				"supported. Instead set "
-- 
GitLab


From e5e9a5206a171b2c467e494aebcdcf70c47289bc Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 10 Jun 2009 15:17:02 -0400
Subject: [PATCH 1909/2198] Btrfs: avoid races between super writeout and
 device list updates

On multi-device filesystems, btrfs writes supers to all of the devices
before considering a sync complete.  There wasn't any additional
locking between super writeout and the device list management code
because device management was done inside a transaction and
super writeout only happened  with no transation writers running.

With the btrfs fsync log and other async transaction updates, this
has been racey for some time.  This adds a mutex to protect
the device list.  The existing volume mutex could not be reused due to
transaction lock ordering requirements.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c |  6 +++++-
 fs/btrfs/volumes.c | 34 ++++++++++++++++++++++++++++++++++
 fs/btrfs/volumes.h |  7 ++++++-
 3 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6c54c210dfd07..b7ddc77fa568f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2111,7 +2111,7 @@ static int write_dev_supers(struct btrfs_device *device,
 
 int write_all_supers(struct btrfs_root *root, int max_mirrors)
 {
-	struct list_head *head = &root->fs_info->fs_devices->devices;
+	struct list_head *head;
 	struct btrfs_device *dev;
 	struct btrfs_super_block *sb;
 	struct btrfs_dev_item *dev_item;
@@ -2126,6 +2126,9 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	sb = &root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
+
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+	head = &root->fs_info->fs_devices->devices;
 	list_for_each_entry(dev, head, dev_list) {
 		if (!dev->bdev) {
 			total_errors++;
@@ -2169,6 +2172,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		if (ret)
 			total_errors++;
 	}
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	if (total_errors > max_errors) {
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3f4a5932eac9e..3ab80e9cd7676 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -377,6 +377,7 @@ static noinline int device_list_add(const char *path,
 		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
+		mutex_init(&fs_devices->device_list_mutex);
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
@@ -403,7 +404,11 @@ static noinline int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		INIT_LIST_HEAD(&device->dev_alloc_list);
+
+		mutex_lock(&fs_devices->device_list_mutex);
 		list_add(&device->dev_list, &fs_devices->devices);
+		mutex_unlock(&fs_devices->device_list_mutex);
+
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
@@ -429,10 +434,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	INIT_LIST_HEAD(&fs_devices->devices);
 	INIT_LIST_HEAD(&fs_devices->alloc_list);
 	INIT_LIST_HEAD(&fs_devices->list);
+	mutex_init(&fs_devices->device_list_mutex);
 	fs_devices->latest_devid = orig->latest_devid;
 	fs_devices->latest_trans = orig->latest_trans;
 	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
 
+	mutex_lock(&orig->device_list_mutex);
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 		device = kzalloc(sizeof(*device), GFP_NOFS);
 		if (!device)
@@ -454,8 +461,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
 	}
+	mutex_unlock(&orig->device_list_mutex);
 	return fs_devices;
 error:
+	mutex_unlock(&orig->device_list_mutex);
 	free_fs_devices(fs_devices);
 	return ERR_PTR(-ENOMEM);
 }
@@ -466,6 +475,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 
 	mutex_lock(&uuid_mutex);
 again:
+	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->in_fs_metadata)
 			continue;
@@ -485,6 +495,7 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
 		kfree(device->name);
 		kfree(device);
 	}
+	mutex_unlock(&fs_devices->device_list_mutex);
 
 	if (fs_devices->seed) {
 		fs_devices = fs_devices->seed;
@@ -1135,12 +1146,14 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 
 		device = NULL;
 		devices = &root->fs_info->fs_devices->devices;
+		mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 		list_for_each_entry(tmp, devices, dev_list) {
 			if (tmp->in_fs_metadata && !tmp->bdev) {
 				device = tmp;
 				break;
 			}
 		}
+		mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 		bdev = NULL;
 		bh = NULL;
 		disk_super = NULL;
@@ -1195,7 +1208,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		goto error_brelse;
 
 	device->in_fs_metadata = 0;
+
+	/*
+	 * the device list mutex makes sure that we don't change
+	 * the device list while someone else is writing out all
+	 * the device supers.
+	 */
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_del_init(&device->dev_list);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	device->fs_devices->num_devices--;
 
 	next_device = list_entry(root->fs_info->fs_devices->devices.next,
@@ -1289,6 +1311,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 	seed_devices->opened = 1;
 	INIT_LIST_HEAD(&seed_devices->devices);
 	INIT_LIST_HEAD(&seed_devices->alloc_list);
+	mutex_init(&seed_devices->device_list_mutex);
 	list_splice_init(&fs_devices->devices, &seed_devices->devices);
 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1414,6 +1437,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	mutex_lock(&root->fs_info->volume_mutex);
 
 	devices = &root->fs_info->fs_devices->devices;
+	/*
+	 * we have the volume lock, so we don't need the extra
+	 * device list mutex while reading the list here.
+	 */
 	list_for_each_entry(device, devices, dev_list) {
 		if (device->bdev == bdev) {
 			ret = -EEXIST;
@@ -1468,6 +1495,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 
 	device->fs_devices = root->fs_info->fs_devices;
+
+	/*
+	 * we don't want write_supers to jump in here with our device
+	 * half setup
+	 */
+	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_add(&device->dev_list, &root->fs_info->fs_devices->devices);
 	list_add(&device->dev_alloc_list,
 		 &root->fs_info->fs_devices->alloc_list);
@@ -1486,6 +1519,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
 	btrfs_set_super_num_devices(&root->fs_info->super_copy,
 				    total_bytes + 1);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	if (seeding_dev) {
 		ret = init_first_rw_device(trans, root, device);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 3c1f7310421e2..5139a833f7213 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -96,7 +96,12 @@ struct btrfs_fs_devices {
 	u64 rw_devices;
 	u64 total_rw_bytes;
 	struct block_device *latest_bdev;
-	/* all of the devices in the FS */
+
+	/* all of the devices in the FS, protected by a mutex
+	 * so we can safely walk it to write out the supers without
+	 * worrying about add/remove by the multi-device code
+	 */
+	struct mutex device_list_mutex;
 	struct list_head devices;
 
 	/* devices not currently being allocated */
-- 
GitLab


From 4eedeb75e7f15ffdb12d1ad559b565e7505bdbaf Mon Sep 17 00:00:00 2001
From: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Date: Wed, 10 Jun 2009 15:28:55 -0400
Subject: [PATCH 1910/2198] Btrfs: pin buffers during write_dev_supers

write_dev_supers is called in sequence.  First is it called with wait == 0,
which starts IO on all of the super blocks for a given device.  Then it is
called with wait == 1 to make sure they all reach the disk.

It doesn't currently pin the buffers between the two calls, and it also
assumes the buffers won't go away between the two calls, leading to
an oops if the VM manages to free the buffers in the middle of the sync.

This fixes that assumption and updates the code to return an error if things
are not up to date when the wait == 1 run is done.

Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 43 +++++++++++++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 14 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b7ddc77fa568f..0d50d49d990ab 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2020,6 +2020,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	return latest;
 }
 
+/*
+ * this should be called twice, once with wait == 0 and
+ * once with wait == 1.  When wait == 0 is done, all the buffer heads
+ * we write are pinned.
+ *
+ * They are released when wait == 1 is done.
+ * max_mirrors must be the same for both runs, and it indicates how
+ * many supers on this one device should be written.
+ *
+ * max_mirrors == 0 means to write them all.
+ */
 static int write_dev_supers(struct btrfs_device *device,
 			    struct btrfs_super_block *sb,
 			    int do_barriers, int wait, int max_mirrors)
@@ -2055,12 +2066,16 @@ static int write_dev_supers(struct btrfs_device *device,
 			bh = __find_get_block(device->bdev, bytenr / 4096,
 					      BTRFS_SUPER_INFO_SIZE);
 			BUG_ON(!bh);
-			brelse(bh);
 			wait_on_buffer(bh);
-			if (buffer_uptodate(bh)) {
-				brelse(bh);
-				continue;
-			}
+			if (!buffer_uptodate(bh))
+				errors++;
+
+			/* drop our reference */
+			brelse(bh);
+
+			/* drop the reference from the wait == 0 run */
+			brelse(bh);
+			continue;
 		} else {
 			btrfs_set_super_bytenr(sb, bytenr);
 
@@ -2071,12 +2086,18 @@ static int write_dev_supers(struct btrfs_device *device,
 					      BTRFS_CSUM_SIZE);
 			btrfs_csum_final(crc, sb->csum);
 
+			/*
+			 * one reference for us, and we leave it for the
+			 * caller
+			 */
 			bh = __getblk(device->bdev, bytenr / 4096,
 				      BTRFS_SUPER_INFO_SIZE);
 			memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE);
 
-			set_buffer_uptodate(bh);
+			/* one reference for submit_bh */
 			get_bh(bh);
+
+			set_buffer_uptodate(bh);
 			lock_buffer(bh);
 			bh->b_end_io = btrfs_end_buffer_write_sync;
 		}
@@ -2088,6 +2109,7 @@ static int write_dev_supers(struct btrfs_device *device,
 				       device->name);
 				set_buffer_uptodate(bh);
 				device->barriers = 0;
+				/* one reference for submit_bh */
 				get_bh(bh);
 				lock_buffer(bh);
 				ret = submit_bh(WRITE_SYNC, bh);
@@ -2096,15 +2118,8 @@ static int write_dev_supers(struct btrfs_device *device,
 			ret = submit_bh(WRITE_SYNC, bh);
 		}
 
-		if (!ret && wait) {
-			wait_on_buffer(bh);
-			if (!buffer_uptodate(bh))
-				errors++;
-		} else if (ret) {
+		if (ret)
 			errors++;
-		}
-		if (wait)
-			brelse(bh);
 	}
 	return errors < i ? 0 : -1;
 }
-- 
GitLab


From d9c7d394a8ebacb60097b192939ae9f15235225e Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@novell.com>
Date: Wed, 10 Jun 2009 12:57:06 -0700
Subject: [PATCH 1911/2198] block: prevent possible io_context->refcount
 overflow

Currently io_context has an atomic_t(32-bit) as refcount.  In the case of
cfq, for each device against whcih a task does I/O, a reference to the
io_context would be taken.  And when there are multiple process sharing
io_contexts(CLONE_IO) would also have a reference to the same io_context.

Theoretically the possible maximum number of processes sharing the same
io_context + the number of disks/cfq_data referring to the same io_context
can overflow the 32-bit counter on a very high-end machine.

Even though it is an improbable case, let us make it atomic_long_t.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-ioc.c           | 12 ++++++------
 block/cfq-iosched.c       |  2 +-
 include/linux/iocontext.h |  6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065ac8e24..d4ed6000147d1 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -35,9 +35,9 @@ int put_io_context(struct io_context *ioc)
 	if (ioc == NULL)
 		return 1;
 
-	BUG_ON(atomic_read(&ioc->refcount) == 0);
+	BUG_ON(atomic_long_read(&ioc->refcount) == 0);
 
-	if (atomic_dec_and_test(&ioc->refcount)) {
+	if (atomic_long_dec_and_test(&ioc->refcount)) {
 		rcu_read_lock();
 		if (ioc->aic && ioc->aic->dtor)
 			ioc->aic->dtor(ioc->aic);
@@ -90,7 +90,7 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 
 	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
 	if (ret) {
-		atomic_set(&ret->refcount, 1);
+		atomic_long_set(&ret->refcount, 1);
 		atomic_set(&ret->nr_tasks, 1);
 		spin_lock_init(&ret->lock);
 		ret->ioprio_changed = 0;
@@ -151,7 +151,7 @@ struct io_context *get_io_context(gfp_t gfp_flags, int node)
 		ret = current_io_context(gfp_flags, node);
 		if (unlikely(!ret))
 			break;
-	} while (!atomic_inc_not_zero(&ret->refcount));
+	} while (!atomic_long_inc_not_zero(&ret->refcount));
 
 	return ret;
 }
@@ -163,8 +163,8 @@ void copy_io_context(struct io_context **pdst, struct io_context **psrc)
 	struct io_context *dst = *pdst;
 
 	if (src) {
-		BUG_ON(atomic_read(&src->refcount) == 0);
-		atomic_inc(&src->refcount);
+		BUG_ON(atomic_long_read(&src->refcount) == 0);
+		atomic_long_inc(&src->refcount);
 		put_io_context(dst);
 		*pdst = src;
 	}
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 99ac4304d711e..ef2f72d424340 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1282,7 +1282,7 @@ static void cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 	if (!cfqd->active_cic) {
 		struct cfq_io_context *cic = RQ_CIC(rq);
 
-		atomic_inc(&cic->ioc->refcount);
+		atomic_long_inc(&cic->ioc->refcount);
 		cfqd->active_cic = cic;
 	}
 }
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987bccf89a..dd05434fa45f2 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -64,7 +64,7 @@ struct cfq_io_context {
  * and kmalloc'ed. These could be shared between processes.
  */
 struct io_context {
-	atomic_t refcount;
+	atomic_long_t refcount;
 	atomic_t nr_tasks;
 
 	/* all the fields below are protected by this lock */
@@ -91,8 +91,8 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc)
 	 * if ref count is zero, don't allow sharing (ioc is going away, it's
 	 * a race).
 	 */
-	if (ioc && atomic_inc_not_zero(&ioc->refcount)) {
-		atomic_inc(&ioc->nr_tasks);
+	if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) {
+		atomic_long_inc(&ioc->refcount);
 		return ioc;
 	}
 
-- 
GitLab


From 0e0c62123b517d2b3c26922342c0cc5bb63a93f8 Mon Sep 17 00:00:00 2001
From: Michal Simek <monstr@monstr.eu>
Date: Wed, 10 Jun 2009 12:57:07 -0700
Subject: [PATCH 1912/2198] fs/bio.c: add missing __user annotation

As reported by sparse:

fs/bio.c:720:13: warning: incorrect type in assignment (different address spaces)
fs/bio.c:720:13:    expected char *iov_addr
fs/bio.c:720:13:    got void [noderef] <asn:1>*
fs/bio.c:724:36: warning: incorrect type in argument 2 (different address spaces)
fs/bio.c:724:36:    expected void const [noderef] <asn:1>*from
fs/bio.c:724:36:    got char *iov_addr

Signed-off-by: Michal Simek <monstr@monstr.eu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 fs/bio.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/bio.c b/fs/bio.c
index ab423a1024ab8..533266a5e5842 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -722,7 +722,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
 
 		while (bv_len && iov_idx < iov_count) {
 			unsigned int bytes;
-			char *iov_addr;
+			char __user *iov_addr;
 
 			bytes = min_t(unsigned int,
 				      iov[iov_idx].iov_len - iov_off, bv_len);
-- 
GitLab


From 1adbee50fd6fce5af4feb34d2db93cfe4d2066a4 Mon Sep 17 00:00:00 2001
From: "Robert P. J. Day" <rpjday@crashcourse.ca>
Date: Wed, 10 Jun 2009 12:57:08 -0700
Subject: [PATCH 1913/2198] ramdisk: remove long-deprecated "ramdisk="
 boot-time parameter

The "ramdisk" parameter was removed from the defunct rd.c file quite some
time ago, in favour of the more specific "ramdisk_size" parameter so, for
consistency, the same should be done here.

Signed-off-by: Robert P. J. Day <rpjday@crashcourse.ca>
Acked-by: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/brd.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 5f7e64ba87e50..4bf8705b3aced 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -407,12 +407,7 @@ static int __init ramdisk_size(char *str)
 	rd_size = simple_strtol(str, NULL, 0);
 	return 1;
 }
-static int __init ramdisk_size2(char *str)
-{
-	return ramdisk_size(str);
-}
-__setup("ramdisk=", ramdisk_size);
-__setup("ramdisk_size=", ramdisk_size2);
+__setup("ramdisk_size=", ramdisk_size);
 #endif
 
 /*
-- 
GitLab


From 5e50b9ef975219304cc91d601530994861585bfe Mon Sep 17 00:00:00 2001
From: Ondrej Zary <linux@rainbow-software.org>
Date: Wed, 10 Jun 2009 12:57:09 -0700
Subject: [PATCH 1914/2198] floppy: fix hibernation

Based on Ingo Molnar's patch from 2006, this makes the floppy work after
resume from hibernation, at least on my machine.

This fix resets the floppy controller on resume.  It was experimentally
determined to bring the controller back to life - we don't really know why
it works.

floppy_init() does the same thing at boot/modprobe time.

Signed-off-by: Ondrej Zary <linux@rainbow-software.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 drivers/block/floppy.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 90877fee0ee00..862b40c90181b 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4148,6 +4148,24 @@ static void floppy_device_release(struct device *dev)
 {
 }
 
+static int floppy_resume(struct platform_device *dev)
+{
+	int fdc;
+
+	for (fdc = 0; fdc < N_FDC; fdc++)
+		if (FDCS->address != -1)
+			user_reset_fdc(-1, FD_RESET_ALWAYS, 0);
+
+	return 0;
+}
+
+static struct platform_driver floppy_driver = {
+	.resume = floppy_resume,
+	.driver = {
+		.name = "floppy",
+	},
+};
+
 static struct platform_device floppy_device[N_DRIVE];
 
 static struct kobject *floppy_find(dev_t dev, int *part, void *data)
@@ -4196,10 +4214,14 @@ static int __init floppy_init(void)
 	if (err)
 		goto out_put_disk;
 
+	err = platform_driver_register(&floppy_driver);
+	if (err)
+		goto out_unreg_blkdev;
+
 	floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
 	if (!floppy_queue) {
 		err = -ENOMEM;
-		goto out_unreg_blkdev;
+		goto out_unreg_driver;
 	}
 	blk_queue_max_sectors(floppy_queue, 64);
 
@@ -4346,6 +4368,8 @@ static int __init floppy_init(void)
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	blk_cleanup_queue(floppy_queue);
+out_unreg_driver:
+	platform_driver_unregister(&floppy_driver);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
@@ -4566,6 +4590,7 @@ static void __exit floppy_module_exit(void)
 
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
+	platform_driver_unregister(&floppy_driver);
 
 	for (drive = 0; drive < N_DRIVE; drive++) {
 		del_timer_sync(&motor_off_timer[drive]);
-- 
GitLab


From 0de51088e6a82bc8413d3ca9e28bbca2788b5b53 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Mon, 8 Jun 2009 18:27:54 +0800
Subject: [PATCH 1915/2198] CPUFREQ: Enable acpi-cpufreq driver for VIA/Centaur
 CPUs

The VIA/Centaur C7, C7-M and Nano CPU's all support ACPI based cpu p-states
using a MSR interface.  The Linux driver just never made use of it, since in
addition to the check for the EST flag it also checked if the vendor is Intel.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
[ Removed the vendor checks entirely  - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 752e8c6b2c7e2..ae9b503220caf 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
 {
 	struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
 
-	if (cpu->x86_vendor != X86_VENDOR_INTEL ||
-	    !cpu_has(cpu, X86_FEATURE_EST))
-		return 0;
-
-	return 1;
+	return cpu_has(cpu, X86_FEATURE_EST);
 }
 
 static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
-- 
GitLab


From 0fea615e526b4b7eff0363ee02d5753e5f924089 Mon Sep 17 00:00:00 2001
From: Harald Welte <HaraldWelte@viatech.com>
Date: Mon, 8 Jun 2009 18:29:36 +0800
Subject: [PATCH 1916/2198] CPUFREQ: Mark e_powersaver driver as EXPERIMENTAL
 and DANGEROUS

The e_powersaver driver for VIA's C7 CPU's needs to be marked as
DANGEROUS as it configures the CPU to power states that are out
of specification.

According to Centaur, all systems with C7 and Nano CPU's support
the ACPI p-state method.  Thus, the acpi-cpufreq driver should
be used instead.

Signed-off-by: Harald Welte <HaraldWelte@viatech.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/cpufreq/Kconfig | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index 52c8398754785..f138c6c389b92 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -220,11 +220,14 @@ config X86_LONGHAUL
 	  If in doubt, say N.
 
 config X86_E_POWERSAVER
-	tristate "VIA C7 Enhanced PowerSaver"
+	tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
 	select CPU_FREQ_TABLE
-	depends on X86_32
+	depends on X86_32 && EXPERIMENTAL
 	help
-	  This adds the CPUFreq driver for VIA C7 processors.
+	  This adds the CPUFreq driver for VIA C7 processors.  However, this driver
+	  does not have any safeguards to prevent operating the CPU out of spec
+	  and is thus considered dangerous.  Please use the regular ACPI cpufreq
+	  driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
 
 	  If in doubt, say N.
 
-- 
GitLab


From 511b01bdf64ad8a38414096eab283c7784aebfc4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 00:32:00 +0200
Subject: [PATCH 1917/2198] Revert "x86, bts: reenable ptrace branch trace
 support"

This reverts commit 7e0bfad24d85de7cf2202a7b0ce51de11a077b21.

A late objection to the ABI has arrived:

   http://lkml.org/lkml/2009/6/10/253

Keep the ABI disabled out of caution, to not create premature
user-space expectations.

While the hw-branch-tracing variant uses and tests the BTS code.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Markus Metzger <markus.t.metzger@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.cpu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 924e156a85abd..8130334329c06 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -506,6 +506,7 @@ config X86_PTRACE_BTS
 	bool "Branch Trace Store"
 	default y
 	depends on X86_DEBUGCTLMSR
+	depends on BROKEN
 	---help---
 	  This adds a ptrace interface to the hardware's branch trace store.
 
-- 
GitLab


From fd0fb038d5a308c7faddd1701be5e70aaffec98b Mon Sep 17 00:00:00 2001
From: Shin Hong <hongshin@gmail.com>
Date: Wed, 10 Jun 2009 20:11:29 -0400
Subject: [PATCH 1918/2198] Btrfs: init worker struct fields before kthread-run

This patch fixes a bug which may result race condition
between btrfs_start_workers() and worker_loop().

btrfs_start_workers() executed in a parent thread writes
on workers->worker and worker_loop() in a child thread
reads workers->worker. However, there is no synchronization
enforcing the order of two operations.

This patch makes btrfs_start_workers() fill workers->worker
before it starts a child thread with worker_loop()

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/async-thread.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 502c3d61de625..7f88628a1a72e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -294,10 +294,10 @@ int btrfs_start_workers(struct btrfs_workers *workers, int num_workers)
 		INIT_LIST_HEAD(&worker->worker_list);
 		spin_lock_init(&worker->lock);
 		atomic_set(&worker->num_pending, 0);
+		worker->workers = workers;
 		worker->task = kthread_run(worker_loop, worker,
 					   "btrfs-%s-%d", workers->name,
 					   workers->num_workers + i);
-		worker->workers = workers;
 		if (IS_ERR(worker->task)) {
 			kfree(worker);
 			ret = PTR_ERR(worker->task);
-- 
GitLab


From 66fff22483d8542dfb4d61a28d21277bbde321e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 22:53:37 +0200
Subject: [PATCH 1919/2198] perf_counter: Annotate exit ctx recursion

Ever since Paul fixed it to unclone the context before taking the
ctx->lock this became a false positive, annotate it away.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 51c571ee4d0b1..ae591a1275a66 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3879,7 +3879,18 @@ void perf_counter_exit_task(struct task_struct *child)
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 
-	mutex_lock(&child_ctx->mutex);
+	/*
+	 * We can recurse on the same lock type through:
+	 *
+	 *   __perf_counter_exit_task()
+	 *     sync_child_counter()
+	 *       fput(parent_counter->filp)
+	 *         perf_release()
+	 *           mutex_lock(&ctx->mutex)
+	 *
+	 * But since its the parent context it won't be the same instance.
+	 */
+	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING);
 
 again:
 	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
-- 
GitLab


From ea1900e571d40a3ce60c835c2f21e1fd8c5cb663 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:45:22 +0200
Subject: [PATCH 1920/2198] perf_counter tools: Normalize data using per sample
 period data

When we use variable period sampling, add the period to the sample
data and use that to normalize the samples.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c |  3 ++-
 tools/perf/builtin-report.c | 18 +++++++++++-------
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index c10553c3460f0..919f23ca41996 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -350,8 +350,9 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	struct perf_counter_attr *attr = attrs + counter;
 	int track = 1;
 
-	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_PERIOD;
+	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 	if (freq) {
+		attr->sample_type	|= PERF_SAMPLE_PERIOD;
 		attr->freq		= 1;
 		attr->sample_freq	= freq;
 	}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 9a0e31e79e9df..f57fd5c5531ad 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -456,7 +456,7 @@ struct hist_entry {
 	uint64_t	 ip;
 	char		 level;
 
-	uint32_t	 count;
+	uint64_t	 count;
 };
 
 /*
@@ -726,7 +726,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level)
+		struct symbol *sym, uint64_t ip, char level, uint64_t count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -738,7 +738,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		.sym	= sym,
 		.ip	= ip,
 		.level	= level,
-		.count	= 1,
+		.count	= count,
 	};
 	int cmp;
 
@@ -749,7 +749,7 @@ hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
 		cmp = hist_entry__cmp(&entry, he);
 
 		if (!cmp) {
-			he->count++;
+			he->count += count;
 			return 0;
 		}
 
@@ -942,15 +942,19 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
 	uint64_t ip = event->ip.ip;
+	uint64_t period = 1;
 	struct map *map = NULL;
 
+	if (event->header.type & PERF_SAMPLE_PERIOD)
+		period = event->ip.period;
+
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->header.misc,
 		event->ip.pid,
 		(void *)(long)ip,
-		(long long)event->ip.period);
+		(long long)period);
 
 	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
 
@@ -1001,13 +1005,13 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 		if (dso)
 			sym = dso->find_symbol(dso, ip);
 
-		if (hist_entry__add(thread, map, dso, sym, ip, level)) {
+		if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
 			fprintf(stderr,
 		"problem incrementing symbol count, skipping event\n");
 			return -1;
 		}
 	}
-	total++;
+	total += period;
 
 	return 0;
 }
-- 
GitLab


From df1a132bf3d3508f863336c80a27806a2ac947e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:02:22 +0200
Subject: [PATCH 1921/2198] perf_counter: Introduce struct for sample data

For easy extension of the sample data, put it in a structure.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c | 10 +++++---
 arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++----
 include/linux/perf_counter.h       | 10 ++++++--
 kernel/perf_counter.c              | 38 +++++++++++++++++-------------
 4 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4786ad9a28877..5e0bf399c4335 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1001,7 +1001,11 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 * Finally record data if requested.
 	 */
 	if (record) {
-		addr = 0;
+		struct perf_sample_data data = {
+			.regs = regs,
+			.addr = 0,
+		};
+
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
 			/*
 			 * The user wants a data address recorded.
@@ -1016,9 +1020,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ?
 				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC;
 			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync))
-				addr = mfspr(SPRN_SDAR);
+				data.addr = mfspr(SPRN_SDAR);
 		}
-		if (perf_counter_overflow(counter, nmi, regs, addr)) {
+		if (perf_counter_overflow(counter, nmi, &data)) {
 			/*
 			 * Interrupts are coming too fast - throttle them
 			 * by setting the counter to 0, so it will be
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 240ca5630632b..82a23d487f928 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1173,11 +1173,14 @@ static void intel_pmu_reset(void)
  */
 static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
-	struct cpu_hw_counters;
 	int bit, cpu, loops;
 	u64 ack, status;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
@@ -1210,7 +1213,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 		if (!intel_pmu_save_and_restart(counter))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1230,12 +1233,16 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 
 static int amd_pmu_handle_irq(struct pt_regs *regs)
 {
-	int cpu, idx, handled = 0;
+	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
+	int cpu, idx, handled = 0;
 	u64 val;
 
+	data.regs = regs;
+	data.addr = 0;
+
 	cpu = smp_processor_id();
 	cpuc = &per_cpu(cpu_hw_counters, cpu);
 
@@ -1256,7 +1263,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, regs, 0))
+		if (perf_counter_overflow(counter, 1, &data))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 282d8cc489804..d8c0eb480f9a6 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -605,8 +605,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 	       struct perf_counter_context *ctx, int cpu);
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
-extern int perf_counter_overflow(struct perf_counter *counter,
-				 int nmi, struct pt_regs *regs, u64 addr);
+struct perf_sample_data {
+	struct pt_regs	*regs;
+	u64		addr;
+};
+
+extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
+				 struct perf_sample_data *data);
+
 /*
  * Return 1 for a software counter, 0 for a hardware counter
  */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ae591a1275a66..4fe85e804f43c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2378,8 +2378,8 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p)
 	return task_pid_nr_ns(p, counter->ns);
 }
 
-static void perf_counter_output(struct perf_counter *counter,
-				int nmi, struct pt_regs *regs, u64 addr)
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data)
 {
 	int ret;
 	u64 sample_type = counter->attr.sample_type;
@@ -2404,10 +2404,10 @@ static void perf_counter_output(struct perf_counter *counter,
 	header.size = sizeof(header);
 
 	header.misc = PERF_EVENT_MISC_OVERFLOW;
-	header.misc |= perf_misc_flags(regs);
+	header.misc |= perf_misc_flags(data->regs);
 
 	if (sample_type & PERF_SAMPLE_IP) {
-		ip = perf_instruction_pointer(regs);
+		ip = perf_instruction_pointer(data->regs);
 		header.type |= PERF_SAMPLE_IP;
 		header.size += sizeof(ip);
 	}
@@ -2460,7 +2460,7 @@ static void perf_counter_output(struct perf_counter *counter,
 	}
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(regs);
+		callchain = perf_callchain(data->regs);
 
 		if (callchain) {
 			callchain_size = (1 + callchain->nr) * sizeof(u64);
@@ -2486,7 +2486,7 @@ static void perf_counter_output(struct perf_counter *counter,
 		perf_output_put(&handle, time);
 
 	if (sample_type & PERF_SAMPLE_ADDR)
-		perf_output_put(&handle, addr);
+		perf_output_put(&handle, data->addr);
 
 	if (sample_type & PERF_SAMPLE_ID)
 		perf_output_put(&handle, counter->id);
@@ -2950,8 +2950,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  * Generic counter overflow handling.
  */
 
-int perf_counter_overflow(struct perf_counter *counter,
-			  int nmi, struct pt_regs *regs, u64 addr)
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+			  struct perf_sample_data *data)
 {
 	int events = atomic_read(&counter->event_limit);
 	int throttle = counter->pmu->unthrottle != NULL;
@@ -3005,7 +3005,7 @@ int perf_counter_overflow(struct perf_counter *counter,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, regs, addr);
+	perf_counter_output(counter, nmi, data);
 	return ret;
 }
 
@@ -3054,24 +3054,25 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	enum hrtimer_restart ret = HRTIMER_RESTART;
+	struct perf_sample_data data;
 	struct perf_counter *counter;
-	struct pt_regs *regs;
 	u64 period;
 
 	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer);
 	counter->pmu->read(counter);
 
-	regs = get_irq_regs();
+	data.addr = 0;
+	data.regs = get_irq_regs();
 	/*
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->attr.exclude_kernel || !regs) &&
+	if ((counter->attr.exclude_kernel || !data.regs) &&
 			!counter->attr.exclude_user)
-		regs = task_pt_regs(current);
+		data.regs = task_pt_regs(current);
 
-	if (regs) {
-		if (perf_counter_overflow(counter, 0, regs, 0))
+	if (data.regs) {
+		if (perf_counter_overflow(counter, 0, &data))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -3084,9 +3085,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct pt_regs *regs, u64 addr)
 {
+	struct perf_sample_data data = {
+		.regs = regs,
+		.addr = addr,
+	};
+
 	perf_swcounter_update(counter);
 	perf_swcounter_set_period(counter);
-	if (perf_counter_overflow(counter, nmi, regs, addr))
+	if (perf_counter_overflow(counter, nmi, &data))
 		/* soft-disable the counter */
 		;
 
-- 
GitLab


From 9e350de37ac9607012fcf9c5314a28fbddf8f43c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 10 Jun 2009 21:34:59 +0200
Subject: [PATCH 1922/2198] perf_counter: Accurate period data

We currently log hw.sample_period for PERF_SAMPLE_PERIOD, however this is
incorrect. When we adjust the period, it will only take effect the next
cycle but report it for the current cycle. So when we adjust the period
for every cycle, we're always wrong.

Solve this by keeping track of the last_period.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/perf_counter.c |  9 ++++++---
 arch/x86/kernel/cpu/perf_counter.c | 15 ++++++++++++---
 include/linux/perf_counter.h       |  6 ++++--
 kernel/perf_counter.c              |  9 ++++++---
 4 files changed, 28 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5e0bf399c4335..4990ce2e5f087 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -767,6 +767,7 @@ static void power_pmu_unthrottle(struct perf_counter *counter)
 	perf_disable();
 	power_pmu_read(counter);
 	left = counter->hw.sample_period;
+	counter->hw.last_period = left;
 	val = 0;
 	if (left < 0x80000000L)
 		val = 0x80000000L - left;
@@ -937,7 +938,8 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	counter->hw.config = events[n];
 	counter->hw.counter_base = cflags[n];
-	atomic64_set(&counter->hw.period_left, counter->hw.sample_period);
+	counter->hw.last_period = counter->hw.sample_period;
+	atomic64_set(&counter->hw.period_left, counter->hw.last_period);
 
 	/*
 	 * See if we need to reserve the PMU.
@@ -1002,8 +1004,9 @@ static void record_and_restart(struct perf_counter *counter, long val,
 	 */
 	if (record) {
 		struct perf_sample_data data = {
-			.regs = regs,
-			.addr = 0,
+			.regs	= regs,
+			.addr	= 0,
+			.period	= counter->hw.last_period,
 		};
 
 		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) {
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 82a23d487f928..57ae1bec81bea 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -698,6 +698,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!hwc->sample_period) {
 		hwc->sample_period = x86_pmu.max_period;
+		hwc->last_period = hwc->sample_period;
 		atomic64_set(&hwc->period_left, hwc->sample_period);
 	}
 
@@ -880,12 +881,14 @@ x86_perf_counter_set_period(struct perf_counter *counter,
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 		ret = 1;
 	}
 	/*
@@ -1257,9 +1260,12 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
 			continue;
 
-		/* counter overflow */
-		handled = 1;
-		inc_irq_stat(apic_perf_irqs);
+		/*
+		 * counter overflow
+		 */
+		handled		= 1;
+		data.period	= counter->hw.last_period;
+
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
@@ -1267,6 +1273,9 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
 	return handled;
 }
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d8c0eb480f9a6..5b966472b4580 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -366,6 +366,7 @@ struct hw_perf_counter {
 	};
 	atomic64_t			prev_count;
 	u64				sample_period;
+	u64				last_period;
 	atomic64_t			period_left;
 	u64				interrupts;
 
@@ -606,8 +607,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs	*regs;
-	u64		addr;
+	struct pt_regs		*regs;
+	u64			addr;
+	u64			period;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 4fe85e804f43c..8b89b40bd0f04 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2495,7 +2495,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		perf_output_put(&handle, cpu_entry);
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-		perf_output_put(&handle, counter->hw.sample_period);
+		perf_output_put(&handle, data->period);
 
 	/*
 	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult.
@@ -3040,11 +3040,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter)
 	if (unlikely(left <= -period)) {
 		left = period;
 		atomic64_set(&hwc->period_left, left);
+		hwc->last_period = period;
 	}
 
 	if (unlikely(left <= 0)) {
 		left += period;
 		atomic64_add(period, &hwc->period_left);
+		hwc->last_period = period;
 	}
 
 	atomic64_set(&hwc->prev_count, -left);
@@ -3086,8 +3088,9 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data = {
-		.regs = regs,
-		.addr = addr,
+		.regs	= regs,
+		.addr	= addr,
+		.period	= counter->hw.last_period,
 	};
 
 	perf_swcounter_update(counter);
-- 
GitLab


From 7f72b47cedd1efc301576ff1050ec0a26c57eb48 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Tue, 28 Apr 2009 14:13:21 +1000
Subject: [PATCH 1923/2198] m68knommu: fix system reset for ColdFire 527x
 family

The sofwtare reset control for the 527x ColdFire family is based on
the same Reset Control Unit as the 528x ColdFire family. So use the
same reset code for both.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/system_no.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/m68k/include/asm/system_no.h b/arch/m68k/include/asm/system_no.h
index 4496c0aa83792..5fbc96d933cef 100644
--- a/arch/m68k/include/asm/system_no.h
+++ b/arch/m68k/include/asm/system_no.h
@@ -264,18 +264,18 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 	: /* No output */		\
 	: "o" (*(char *)MCF_MBAR) );	\
 })
-#elif defined(CONFIG_M528x)
+#elif defined(CONFIG_M528x) || defined(CONFIG_M527x)
 /*
- * The MCF528x has a bit (SOFTRST) in memory (Reset Control Register RCR),
- * that when set, resets the MCF528x.
+ * Most of the newer ColdFire family members have a proper RESET unit.
+ * Use the software reset control bit in the Reset Control Register (RCR).
  */
 #define HARD_RESET_NOW() \
-({						\
-	unsigned char volatile *reset;		\
-	asm("move.w	#0x2700, %sr");		\
+({									\
+	unsigned char volatile *reset;					\
+	asm("move.w #0x2700, %sr");					\
 	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000));	\
-	while(1)				\
-	*reset |= (0x01 << 7);\
+	while (1)							\
+		*reset |= (0x01 << 7);					\
 })
 #elif defined(CONFIG_M523x)
 #define HARD_RESET_NOW() ({		\
-- 
GitLab


From 293ca0f75415dd8373c716bf9755569daca7ba5d Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Tue, 28 Apr 2009 16:56:03 +1000
Subject: [PATCH 1924/2198] m68knommu: merge system reset for code ColdFire
 523x family

The sofwtare reset control code for the 523x ColdFire family uses the
same Reset unit hardware as the 527x and 528x ColdFire parts. So they
should all use the same code. Merge them.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/system_no.h | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/m68k/include/asm/system_no.h b/arch/m68k/include/asm/system_no.h
index 5fbc96d933cef..a0a1ae8b15241 100644
--- a/arch/m68k/include/asm/system_no.h
+++ b/arch/m68k/include/asm/system_no.h
@@ -264,7 +264,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 	: /* No output */		\
 	: "o" (*(char *)MCF_MBAR) );	\
 })
-#elif defined(CONFIG_M528x) || defined(CONFIG_M527x)
+#elif defined(CONFIG_M523x) || defined(CONFIG_M528x) || defined(CONFIG_M527x)
 /*
  * Most of the newer ColdFire family members have a proper RESET unit.
  * Use the software reset control bit in the Reset Control Register (RCR).
@@ -275,16 +275,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 	asm("move.w #0x2700, %sr");					\
 	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000));	\
 	while (1)							\
-		*reset |= (0x01 << 7);					\
-})
-#elif defined(CONFIG_M523x)
-#define HARD_RESET_NOW() ({		\
-	asm("				\
-	movew #0x2700, %sr;		\
-	movel #0x01000000, %sp;		\
-	moveal #0x40110000, %a0;	\
-	moveb #0x80, (%a0);		\
-	");				\
+		*reset |= 0x80;						\
 })
 #elif defined(CONFIG_M520x)
 	/*
-- 
GitLab


From c18e52c7696e39c7cb23cd083bbda5652c338b79 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:03:09 +1000
Subject: [PATCH 1925/2198] m68knommu: add CPU reset code for the 5307 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/platform/5307/config.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/m68knommu/platform/5307/config.c b/arch/m68knommu/platform/5307/config.c
index 44803bf70a6e0..39da9e9ff674e 100644
--- a/arch/m68knommu/platform/5307/config.c
+++ b/arch/m68knommu/platform/5307/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -22,8 +21,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -119,6 +116,17 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5307_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -134,7 +142,7 @@ void __init config_BSP(char *commandp, int size)
 	mcf_timerlevel = 6;
 #endif
 
-	mach_reset = coldfire_reset;
+	mach_reset = m5307_cpu_reset;
 
 #ifdef CONFIG_BDM_DISABLE
 	/*
-- 
GitLab


From eb7c874d5c81dae6d7b80857e9f0dab1a5d05ebd Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:15:18 +1000
Subject: [PATCH 1926/2198] m68knommu: add CPU reset code for the 5407 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/platform/5407/config.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/m68knommu/platform/5407/config.c b/arch/m68knommu/platform/5407/config.c
index 0ee8c1a200c87..b41d942bf8d07 100644
--- a/arch/m68knommu/platform/5407/config.c
+++ b/arch/m68knommu/platform/5407/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,8 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -110,6 +107,17 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5407_cpu_reset(void)
+{
+	local_irq_disable();
+	/* set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -121,7 +129,7 @@ void __init config_BSP(char *commandp, int size)
 	mcf_timerlevel = 6;
 #endif
 
-	mach_reset = coldfire_reset;
+	mach_reset = m5407_cpu_reset;
 }
 
 /***************************************************************************/
-- 
GitLab


From 851377bca613175473b6dafd6c7d1bfbcf24efe2 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:22:05 +1000
Subject: [PATCH 1927/2198] m68knommu: add CPU reset code for the 5206 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/platform/5206/config.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/m68knommu/platform/5206/config.c b/arch/m68knommu/platform/5206/config.c
index 53a5920c2b71a..f6f79874e9af7 100644
--- a/arch/m68knommu/platform/5206/config.c
+++ b/arch/m68knommu/platform/5206/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,10 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m5206_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -109,10 +104,21 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5206_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
-	mach_reset = coldfire_reset;
+	mach_reset = m5206_cpu_reset;
 }
 
 /***************************************************************************/
-- 
GitLab


From b61a7260ff70d10ca27edcc74535af0d46a277fa Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:22:56 +1000
Subject: [PATCH 1928/2198] m68knommu: add CPU reset code for the 5206e
 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/platform/5206e/config.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/m68knommu/platform/5206e/config.c b/arch/m68knommu/platform/5206e/config.c
index db902540bf2c9..65887799db812 100644
--- a/arch/m68knommu/platform/5206e/config.c
+++ b/arch/m68knommu/platform/5206e/config.c
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,10 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m5206e_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -109,6 +104,17 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5206e_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -119,7 +125,7 @@ void __init config_BSP(char *commandp, int size)
 	commandp[size-1] = 0;
 #endif /* CONFIG_NETtel */
 
-	mach_reset = coldfire_reset;
+	mach_reset = m5206e_cpu_reset;
 }
 
 /***************************************************************************/
-- 
GitLab


From 6f5aa7ce38dfb9c04eecac1222fc4b2521f02b83 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 17:31:19 +1000
Subject: [PATCH 1929/2198] m68knommu: add CPU reset code for the 5249 ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/platform/5249/config.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/m68knommu/platform/5249/config.c b/arch/m68knommu/platform/5249/config.c
index 9eab19d01eb10..93d9988259250 100644
--- a/arch/m68knommu/platform/5249/config.c
+++ b/arch/m68knommu/platform/5249/config.c
@@ -11,7 +11,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -20,10 +19,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m5249_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -106,10 +101,21 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+void m5249_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to soft reset, and enabled */
+	__raw_writeb(0xc0, MCF_MBAR + MCFSIM_SYPCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
-	mach_reset = coldfire_reset;
+	mach_reset = m5249_cpu_reset;
 }
 
 /***************************************************************************/
-- 
GitLab


From 384feb91319766298c6358f23f54873d44c9d8cb Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 22:03:43 +1000
Subject: [PATCH 1930/2198] m68knommu: add CPU reset code for the 532x ColdFire

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/m532xsim.h      | 12 ++++++++++++
 arch/m68knommu/platform/532x/config.c | 12 +++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/arch/m68k/include/asm/m532xsim.h b/arch/m68k/include/asm/m532xsim.h
index ce603451b55e3..eb7fd44489474 100644
--- a/arch/m68k/include/asm/m532xsim.h
+++ b/arch/m68k/include/asm/m532xsim.h
@@ -125,6 +125,18 @@
 #define	ACR_CM_OFF_IMP		(3<<5)
 #define	ACR_WPROTECT		(1<<2)
 
+/*********************************************************************
+ *
+ * Reset Controller Module
+ *
+ *********************************************************************/
+
+#define	MCF_RCR			0xFC0A0000
+#define	MCF_RSR			0xFC0A0001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /*********************************************************************
  *
  * Inter-IC (I2C) Module
diff --git a/arch/m68knommu/platform/532x/config.c b/arch/m68knommu/platform/532x/config.c
index 591f2f801134b..cdb761971f7a7 100644
--- a/arch/m68knommu/platform/532x/config.c
+++ b/arch/m68knommu/platform/532x/config.c
@@ -31,8 +31,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -164,6 +162,14 @@ void mcf_settimericr(unsigned int timer, unsigned int level)
 
 /***************************************************************************/
 
+static void m532x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_RCR);
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_setimr(MCFSIM_IMR_MASKALL);
@@ -181,7 +187,7 @@ void __init config_BSP(char *commandp, int size)
 
 	mcf_timervector = 64+32;
 	mcf_profilevector = 64+33;
-	mach_reset = coldfire_reset;
+	mach_reset = m532x_cpu_reset;
 
 #ifdef CONFIG_BDM_DISABLE
 	/*
-- 
GitLab


From 25ce4a908abebf8c7d2e89908d36050e1de9cbec Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 22:39:50 +1000
Subject: [PATCH 1931/2198] m68knommu: move CPU reset code for the 520x
 ColdFire into its platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/m520xsim.h      |  9 +++++++++
 arch/m68knommu/platform/520x/config.c | 15 +++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/include/asm/m520xsim.h b/arch/m68k/include/asm/m520xsim.h
index 49d016e6391a3..83bbcfd6e8f2b 100644
--- a/arch/m68k/include/asm/m520xsim.h
+++ b/arch/m68k/include/asm/m520xsim.h
@@ -59,5 +59,14 @@
 #define MCFPIT_IMR		MCFINTC_IMRL
 #define MCFPIT_IMR_IBIT		(1 << MCFINT_PIT1)
 
+/*
+ *  Reset Controll Unit.
+ */
+#define	MCF_RCR			0xFC0A0000
+#define	MCF_RSR			0xFC0A0001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /****************************************************************************/
 #endif  /* m520xsim_h */
diff --git a/arch/m68knommu/platform/520x/config.c b/arch/m68knommu/platform/520x/config.c
index 855fc6a79d72f..1c43a8aec69b9 100644
--- a/arch/m68knommu/platform/520x/config.c
+++ b/arch/m68knommu/platform/520x/config.c
@@ -14,7 +14,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -23,10 +22,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m520x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -169,9 +164,17 @@ void mcf_autovector(unsigned int vec)
 
 /***************************************************************************/
 
+static void m520x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_RCR);
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
-	mach_reset = coldfire_reset;
+	mach_reset = m520x_cpu_reset;
 	m520x_uarts_init();
 	m520x_fec_init();
 }
-- 
GitLab


From 55b33f316d25c1ffb13a65de7f846b950b5ef5a6 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 22:58:35 +1000
Subject: [PATCH 1932/2198] m68knommu: move CPU reset code for the 523x
 ColdFire into its platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/m523xsim.h      |  9 +++++++++
 arch/m68knommu/platform/523x/config.c | 14 ++++++++------
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/include/asm/m523xsim.h b/arch/m68k/include/asm/m523xsim.h
index bf397313e93f1..55183b5df1b87 100644
--- a/arch/m68k/include/asm/m523xsim.h
+++ b/arch/m68k/include/asm/m523xsim.h
@@ -41,5 +41,14 @@
 #define	MCFSIM_DACR1		0x50		/* SDRAM base address 1 */
 #define	MCFSIM_DMR1		0x54		/* SDRAM address mask 1 */
 
+/*
+ *  Reset Controll Unit (relative to IPSBAR).
+ */
+#define	MCF_RCR			0x110000
+#define	MCF_RSR			0x110001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /****************************************************************************/
 #endif	/* m523xsim_h */
diff --git a/arch/m68knommu/platform/523x/config.c b/arch/m68knommu/platform/523x/config.c
index 74133f27b30c4..961fefebca14e 100644
--- a/arch/m68knommu/platform/523x/config.c
+++ b/arch/m68knommu/platform/523x/config.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -24,10 +23,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m523x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -145,13 +140,20 @@ void mcf_autovector(unsigned int vec)
 {
 	/* Everything is auto-vectored on the 523x */
 }
+/***************************************************************************/
+
+static void m523x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR);
+}
 
 /***************************************************************************/
 
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_disableall();
-	mach_reset = coldfire_reset;
+	mach_reset = m523x_cpu_reset;
 	m523x_uarts_init();
 	m523x_fec_init();
 }
-- 
GitLab


From 4c0b008d49e728735f54419e60446480017bfe1b Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 23:06:45 +1000
Subject: [PATCH 1933/2198] m68knommu: move CPU reset code for the 527x
 ColdFire into its platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/m527xsim.h      |  9 +++++++++
 arch/m68knommu/platform/527x/config.c | 15 +++++++++------
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/arch/m68k/include/asm/m527xsim.h b/arch/m68k/include/asm/m527xsim.h
index 1f63ab3fb3e67..95f4f8ee8f7c4 100644
--- a/arch/m68k/include/asm/m527xsim.h
+++ b/arch/m68k/include/asm/m527xsim.h
@@ -70,5 +70,14 @@
 #define UART2_ENABLE_MASK	0x3f00 
 #endif
 
+/*
+ *  Reset Controll Unit (relative to IPSBAR).
+ */
+#define	MCF_RCR			0x110000
+#define	MCF_RSR			0x110001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
+
 /****************************************************************************/
 #endif	/* m527xsim_h */
diff --git a/arch/m68knommu/platform/527x/config.c b/arch/m68knommu/platform/527x/config.c
index 428b15922ef59..f746439cfd3ef 100644
--- a/arch/m68knommu/platform/527x/config.c
+++ b/arch/m68knommu/platform/527x/config.c
@@ -15,7 +15,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -24,10 +23,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m527x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -227,10 +222,18 @@ void mcf_autovector(unsigned int vec)
 
 /***************************************************************************/
 
+static void m527x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR);
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 	mcf_disableall();
-	mach_reset = coldfire_reset;
+	mach_reset = m527x_cpu_reset;
 	m527x_uarts_init();
 	m527x_fec_init();
 }
-- 
GitLab


From dd65b1de553ddfd85e8fea9d3aa9db7479ccf2aa Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 23:15:56 +1000
Subject: [PATCH 1934/2198] m68knommu: move CPU reset code for the 528x
 ColdFire into its platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/m528xsim.h      |  8 ++++++++
 arch/m68knommu/platform/528x/config.c | 13 +++++++++----
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/arch/m68k/include/asm/m528xsim.h b/arch/m68k/include/asm/m528xsim.h
index 28bf783a5d6df..d79c49f8134ab 100644
--- a/arch/m68k/include/asm/m528xsim.h
+++ b/arch/m68k/include/asm/m528xsim.h
@@ -56,6 +56,14 @@
 #define MCF5282_INTC0_ICR17     (volatile u8 *) (MCF_IPSBAR + 0x0C51)
 
 
+/*
+ *  Reset Control Unit (relative to IPSBAR).
+ */
+#define	MCF_RCR			0x110000
+#define	MCF_RSR			0x110001
+
+#define	MCF_RCR_SWRESET		0x80		/* Software reset bit */
+#define	MCF_RCR_FRCSTOUT	0x40		/* Force external reset */
 
 /*********************************************************************
 *
diff --git a/arch/m68knommu/platform/528x/config.c b/arch/m68knommu/platform/528x/config.c
index bee526f4d1af7..a1d1a61c4fe66 100644
--- a/arch/m68knommu/platform/528x/config.c
+++ b/arch/m68knommu/platform/528x/config.c
@@ -31,10 +31,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
-/***************************************************************************/
-
 static struct mcf_platform_uart m528x_uart_platform[] = {
 	{
 		.mapbase	= MCF_MBAR + MCFUART_BASE1,
@@ -171,6 +167,14 @@ void mcf_autovector(unsigned int vec)
 
 /***************************************************************************/
 
+static void m528x_cpu_reset(void)
+{
+	local_irq_disable();
+	__raw_writeb(MCF_RCR_SWRESET, MCF_IPSBAR + MCF_RCR);
+}
+
+/***************************************************************************/
+
 #ifdef CONFIG_WILDFIRE
 void wildfire_halt(void)
 {
@@ -214,6 +218,7 @@ void __init config_BSP(char *commandp, int size)
 
 static int __init init_BSP(void)
 {
+	mach_reset = m528x_cpu_reset;
 	m528x_uarts_init();
 	m528x_fec_init();
 	platform_add_devices(m528x_devices, ARRAY_SIZE(m528x_devices));
-- 
GitLab


From 05728aec8b05ec3fa859add8b22ba46432611e9e Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 30 Apr 2009 23:32:52 +1000
Subject: [PATCH 1935/2198] m68knommu: move CPU reset code for the 5272
 ColdFire into its platform code

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/platform/5272/config.c | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/arch/m68knommu/platform/5272/config.c b/arch/m68knommu/platform/5272/config.c
index e049245f4092a..5f95fcde05fd8 100644
--- a/arch/m68knommu/platform/5272/config.c
+++ b/arch/m68knommu/platform/5272/config.c
@@ -12,7 +12,6 @@
 #include <linux/kernel.h>
 #include <linux/param.h>
 #include <linux/init.h>
-#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <asm/machdep.h>
 #include <asm/coldfire.h>
@@ -21,8 +20,6 @@
 
 /***************************************************************************/
 
-void coldfire_reset(void);
-
 extern unsigned int mcf_timervector;
 extern unsigned int mcf_profilevector;
 extern unsigned int mcf_timerlevel;
@@ -170,6 +167,19 @@ void mcf_settimericr(int timer, int level)
 
 /***************************************************************************/
 
+static void m5272_cpu_reset(void)
+{
+	local_irq_disable();
+	/* Set watchdog to reset, and enabled */
+	__raw_writew(0, MCF_MBAR + MCFSIM_WIRR);
+	__raw_writew(1, MCF_MBAR + MCFSIM_WRRR);
+	__raw_writew(0, MCF_MBAR + MCFSIM_WCR);
+	for (;;)
+		/* wait for watchdog to timeout */;
+}
+
+/***************************************************************************/
+
 void __init config_BSP(char *commandp, int size)
 {
 #if defined (CONFIG_MOD5272)
@@ -194,7 +204,7 @@ void __init config_BSP(char *commandp, int size)
 
 	mcf_timervector = 69;
 	mcf_profilevector = 70;
-	mach_reset = coldfire_reset;
+	mach_reset = m5272_cpu_reset;
 }
 
 /***************************************************************************/
-- 
GitLab


From fb29ad7949aeed3d464ba8d2c9772835f456d4c4 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Fri, 1 May 2009 16:09:17 +1000
Subject: [PATCH 1936/2198] m68knommu: remove obsolete reset code

All ColdFire and non-MMU 68k code has custom reset routines.
Remove the obsolete and now un-used reset macros.

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/system_no.h          | 98 ----------------------
 arch/m68knommu/platform/coldfire/vectors.c |  7 --
 2 files changed, 105 deletions(-)

diff --git a/arch/m68k/include/asm/system_no.h b/arch/m68k/include/asm/system_no.h
index a0a1ae8b15241..3c0718d74398b 100644
--- a/arch/m68k/include/asm/system_no.h
+++ b/arch/m68k/include/asm/system_no.h
@@ -203,104 +203,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz
 #include <asm-generic/cmpxchg.h>
 #endif
 
-#if defined( CONFIG_M68328 ) || defined( CONFIG_M68EZ328 ) || \
-	defined (CONFIG_M68360) || defined( CONFIG_M68VZ328 )
-#define HARD_RESET_NOW() ({		\
-        local_irq_disable();		\
-        asm("				\
-        moveal #0x10c00000, %a0;	\
-        moveb #0, 0xFFFFF300;		\
-        moveal 0(%a0), %sp;		\
-        moveal 4(%a0), %a0;		\
-        jmp (%a0);			\
-        ");				\
-})
-#endif
-
-#ifdef CONFIG_COLDFIRE
-#if defined(CONFIG_M5272) && defined(CONFIG_NETtel)
-/*
- * Need to account for broken early mask of 5272 silicon. So don't
- * jump through the original start address. Jump strait into the
- * known start of the FLASH code.
- */
-#define HARD_RESET_NOW() ({		\
-        asm("				\
-	movew #0x2700, %sr;		\
-        jmp 0xf0000400;			\
-        ");				\
-})
-#elif defined(CONFIG_NETtel) || \
-      defined(CONFIG_SECUREEDGEMP3) || defined(CONFIG_CLEOPATRA)
-#define HARD_RESET_NOW() ({		\
-        asm("				\
-	movew #0x2700, %sr;		\
-	moveal #0x10000044, %a0;	\
-	movel #0xffffffff, (%a0);	\
-	moveal #0x10000001, %a0;	\
-	moveb #0x00, (%a0);		\
-        moveal #0xf0000004, %a0;	\
-        moveal (%a0), %a0;		\
-        jmp (%a0);			\
-        ");				\
-})
-#elif defined(CONFIG_M5272)
-/*
- * Retrieve the boot address in flash using CSBR0 and CSOR0
- * find the reset vector at flash_address + 4 (e.g. 0x400)
- * remap it in the flash's current location (e.g. 0xf0000400)
- * and jump there.
- */ 
-#define HARD_RESET_NOW() ({		\
-	asm("				\
-	movew #0x2700, %%sr;		\
-	move.l	%0+0x40,%%d0;		\
-	and.l	%0+0x44,%%d0;		\
-	andi.l	#0xfffff000,%%d0;	\
-	mov.l	%%d0,%%a0;		\
-	or.l	4(%%a0),%%d0;		\
-	mov.l	%%d0,%%a0;		\
-	jmp (%%a0);"			\
-	: /* No output */		\
-	: "o" (*(char *)MCF_MBAR) );	\
-})
-#elif defined(CONFIG_M523x) || defined(CONFIG_M528x) || defined(CONFIG_M527x)
-/*
- * Most of the newer ColdFire family members have a proper RESET unit.
- * Use the software reset control bit in the Reset Control Register (RCR).
- */
-#define HARD_RESET_NOW() \
-({									\
-	unsigned char volatile *reset;					\
-	asm("move.w #0x2700, %sr");					\
-	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0x110000));	\
-	while (1)							\
-		*reset |= 0x80;						\
-})
-#elif defined(CONFIG_M520x)
-	/*
-	 * The MCF5208 has a bit (SOFTRST) in memory (Reset Control Register 
-	 * RCR), that when set, resets the MCF5208.
-	 */
-#define HARD_RESET_NOW() 		\
-({					\
-	unsigned char volatile *reset;	\
-	asm("move.w     #0x2700, %sr");	\
-	reset = ((volatile unsigned char *)(MCF_IPSBAR + 0xA0000));	\
-	while(1)			\
-		*reset |= 0x80;		\
-})
-#else
-#define HARD_RESET_NOW() ({		\
-        asm("				\
-	movew #0x2700, %sr;		\
-        moveal #0x4, %a0;		\
-        moveal (%a0), %a0;		\
-        jmp (%a0);			\
-        ");				\
-})
-#endif
-#endif
 #define arch_align_stack(x) (x)
 
 
diff --git a/arch/m68knommu/platform/coldfire/vectors.c b/arch/m68knommu/platform/coldfire/vectors.c
index 6cf894620234c..bdca0297fa9a1 100644
--- a/arch/m68knommu/platform/coldfire/vectors.c
+++ b/arch/m68knommu/platform/coldfire/vectors.c
@@ -96,10 +96,3 @@ void ack_vector(unsigned int irq)
 }
 
 /***************************************************************************/
-
-void coldfire_reset(void)
-{
-	HARD_RESET_NOW();
-}
-
-/***************************************************************************/
-- 
GitLab


From 308e610e8c435d7875842b427dbc99de43a4aec9 Mon Sep 17 00:00:00 2001
From: Joe Perches <joe@perches.com>
Date: Fri, 15 May 2009 10:59:37 -0700
Subject: [PATCH 1937/2198] arch/m68knommu: Convert #ifdef DEBUG
 printk(KERN_DEBUG to pr_debug(

Signed-off-by: Joe Perches <joe@perches.com>
Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/kernel/setup.c | 16 +++++++---------
 arch/m68knommu/mm/init.c      |  4 +---
 2 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/arch/m68knommu/kernel/setup.c b/arch/m68knommu/kernel/setup.c
index 5985f19890211..5c2bb3eeaaa2e 100644
--- a/arch/m68knommu/kernel/setup.c
+++ b/arch/m68knommu/kernel/setup.c
@@ -166,15 +166,13 @@ void __init setup_arch(char **cmdline_p)
 	printk(KERN_INFO "Motorola M5235EVB support (C)2005 Syn-tech Systems, Inc. (Jate Sujjavanich)\n");
 #endif
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x "
-		"BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext,
-		(int) &_sdata, (int) &_edata,
-		(int) &_sbss, (int) &_ebss);
-	printk(KERN_DEBUG "MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ",
-		(int) &_ebss, (int) memory_start,
-		(int) memory_start, (int) memory_end);
-#endif
+	pr_debug("KERNEL -> TEXT=0x%06x-0x%06x DATA=0x%06x-0x%06x "
+		 "BSS=0x%06x-0x%06x\n", (int) &_stext, (int) &_etext,
+		 (int) &_sdata, (int) &_edata,
+		 (int) &_sbss, (int) &_ebss);
+	pr_debug("MEMORY -> ROMFS=0x%06x-0x%06x MEM=0x%06x-0x%06x\n ",
+		 (int) &_ebss, (int) memory_start,
+		 (int) memory_start, (int) memory_end);
 
 	/* Keep a copy of command line */
 	*cmdline_p = &command_line[0];
diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c
index 7befc0c357e08..b1703c67a4f19 100644
--- a/arch/m68knommu/mm/init.c
+++ b/arch/m68knommu/mm/init.c
@@ -126,9 +126,7 @@ void __init mem_init(void)
 	unsigned long start_mem = memory_start; /* DAVIDM - these must start at end of kernel */
 	unsigned long end_mem   = memory_end; /* DAVIDM - this must not include kernel stack at top */
 
-#ifdef DEBUG
-	printk(KERN_DEBUG "Mem_init: start=%lx, end=%lx\n", start_mem, end_mem);
-#endif
+	pr_debug("Mem_init: start=%lx, end=%lx\n", start_mem, end_mem);
 
 	end_mem &= PAGE_MASK;
 	high_memory = (void *) end_mem;
-- 
GitLab


From 4f308e35a9bde9d6b671e8409157edb9065f117c Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Date: Wed, 10 Jun 2009 18:47:47 +0530
Subject: [PATCH 1938/2198] headers_check fix: m68k, swab.h

fix the following 'make headers_check' warnings:

  usr/include/asm-m68k/swab.h:4: include of <linux/types.h> is preferred over <asm/types.h>
  usr/include/asm-m68k/swab.h:10: found __[us]{8,16,32,64} type without #include <linux/types.h>

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/swab.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/m68k/include/asm/swab.h b/arch/m68k/include/asm/swab.h
index 9e3054ea59e93..5b754aace7449 100644
--- a/arch/m68k/include/asm/swab.h
+++ b/arch/m68k/include/asm/swab.h
@@ -1,7 +1,7 @@
 #ifndef _M68K_SWAB_H
 #define _M68K_SWAB_H
 
-#include <asm/types.h>
+#include <linux/types.h>
 #include <linux/compiler.h>
 
 #define __SWAB_64_THRU_32__
-- 
GitLab


From 9d4f94135385b565e2ce4a37f71b53d0fd3664e8 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 11 Jun 2009 13:05:00 +1000
Subject: [PATCH 1939/2198] m68knommu: enumerate INIT_THREAD fields properly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use proper field value setting init INIT_THREAD macro.
Fixes this:

arch/m68knommu/kernel/init_task.c:27: warning: excess elements in array initializer
arch/m68knommu/kernel/init_task.c:27: warning: (near initialization for ‘init_task.thread.fpstate’)

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68k/include/asm/processor_no.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/m68k/include/asm/processor_no.h b/arch/m68k/include/asm/processor_no.h
index 91cba18acdd32..7a1e0ba35f5ad 100644
--- a/arch/m68k/include/asm/processor_no.h
+++ b/arch/m68k/include/asm/processor_no.h
@@ -72,10 +72,10 @@ struct thread_struct {
 	unsigned char  fpstate[FPSTATESIZE];  /* floating point state */
 };
 
-#define INIT_THREAD  { \
-	sizeof(init_stack) + (unsigned long) init_stack, 0, \
-	PS_S, __KERNEL_DS, \
-	{0, 0}, 0, {0,}, {0, 0, 0}, {0,}, \
+#define INIT_THREAD  {							\
+	.ksp	= sizeof(init_stack) + (unsigned long) init_stack,	\
+	.sr	= PS_S,							\
+	.fs	= __KERNEL_DS,						\
 }
 
 /*
-- 
GitLab


From 193e98401a89707ea374e9c32c1b9a5a9dd87a48 Mon Sep 17 00:00:00 2001
From: Greg Ungerer <gerg@uclinux.org>
Date: Thu, 23 Apr 2009 11:24:03 +1000
Subject: [PATCH 1940/2198] m68knommu: remove unecessary include of
 thread_info.h in entry.S

Signed-off-by: Greg Ungerer <gerg@uclinux.org>
---
 arch/m68knommu/kernel/entry.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/m68knommu/kernel/entry.S b/arch/m68knommu/kernel/entry.S
index f4782d2dce8f7..f56faa5c9cd91 100644
--- a/arch/m68knommu/kernel/entry.S
+++ b/arch/m68knommu/kernel/entry.S
@@ -26,7 +26,6 @@
 
 #include <linux/sys.h>
 #include <linux/linkage.h>
-#include <asm/thread_info.h>
 #include <asm/errno.h>
 #include <asm/setup.h>
 #include <asm/segment.h>
-- 
GitLab


From a50de78dc6d21ee074e9561c800d194bec12128b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 08:43:59 +0000
Subject: [PATCH 1941/2198] sh: clock div4 frequency table offset fix

This patch fixes the per clock offset calculation in
sh_clk_div4_register(). Without this patch the offset
to the frequency table for each clock is incorrect.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/clock-cpg.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index 88fc30d2f5fd9..e604a6f80194a 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -71,8 +71,9 @@ int __init sh_clk_div4_register(struct clk *clks, int nr,
 	int ret = 0;
 	int k;
 
-	k = nr_divs + 1;
-	freq_table = alloc_bootmem(freq_table_size * nr * (nr_divs + 1));
+	freq_table_size *= (nr_divs + 1);
+
+	freq_table = alloc_bootmem(freq_table_size * nr);
 	if (!freq_table)
 		return -ENOMEM;
 
-- 
GitLab


From 2693e2740ddae364a80e6083043ba760b6366b69 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 08:53:54 +0000
Subject: [PATCH 1942/2198] sh: clock div6 helper code

This patch adds div6 clock helper code. The div6 clocks
are simply 6-bit divide-by-n modules where n is 1 to 64.

Needed for vclk on sh7722, sh7723, sh7343 and sh7366.
sh7724 needs this even more for vclk, fclka, fclkb,
irdaclk and spuclk.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h    | 10 +++++
 arch/sh/kernel/cpu/clock-cpg.c | 71 +++++++++++++++++++++++++++++++---
 2 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 7435e40022e67..026b9daa5584d 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -145,4 +145,14 @@ int sh_clk_mstp32_register(struct clk *clks, int nr);
 int sh_clk_div4_register(struct clk *clks, int nr,
 			 struct clk_div_mult_table *table);
 
+#define SH_CLK_DIV6(_name, _parent, _reg, _flags)	\
+{							\
+	.name = _name,					\
+	.parent = _parent,				\
+	.enable_reg = (void __iomem *)_reg,		\
+	.flags = _flags,				\
+}
+
+int sh_clk_div6_register(struct clk *clks, int nr);
+
 #endif /* __ASM_SH_CLOCK_H */
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index e604a6f80194a..fedc8b84db4c2 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -38,6 +38,70 @@ int __init sh_clk_mstp32_register(struct clk *clks, int nr)
 	return ret;
 }
 
+static long sh_clk_div_round_rate(struct clk *clk, unsigned long rate)
+{
+	return clk_rate_table_round(clk, clk->freq_table, rate);
+}
+
+static int sh_clk_div6_divisors[64] = {
+	1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+	17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+	33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+	49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+};
+
+static struct clk_div_mult_table sh_clk_div6_table = {
+	.divisors = sh_clk_div6_divisors,
+	.nr_divisors = ARRAY_SIZE(sh_clk_div6_divisors),
+};
+
+static unsigned long sh_clk_div6_recalc(struct clk *clk)
+{
+	struct clk_div_mult_table *table = &sh_clk_div6_table;
+	unsigned int idx;
+
+	clk_rate_table_build(clk, clk->freq_table, table->nr_divisors,
+			     table, NULL);
+
+	idx = __raw_readl(clk->enable_reg) & 0x003f;
+
+	return clk->freq_table[idx].frequency;
+}
+
+static struct clk_ops sh_clk_div6_clk_ops = {
+	.recalc		= sh_clk_div6_recalc,
+	.round_rate	= sh_clk_div_round_rate,
+};
+
+int __init sh_clk_div6_register(struct clk *clks, int nr)
+{
+	struct clk *clkp;
+	void *freq_table;
+	int nr_divs = sh_clk_div6_table.nr_divisors;
+	int freq_table_size = sizeof(struct cpufreq_frequency_table);
+	int ret = 0;
+	int k;
+
+	freq_table_size *= (nr_divs + 1);
+
+	freq_table = alloc_bootmem(freq_table_size * nr);
+	if (!freq_table)
+		return -ENOMEM;
+
+	for (k = 0; !ret && (k < nr); k++) {
+		clkp = clks + k;
+
+		clkp->ops = &sh_clk_div6_clk_ops;
+		clkp->id = -1;
+		clkp->freq_table = freq_table + (k * freq_table_size);
+		clkp->freq_table[nr_divs].frequency = CPUFREQ_TABLE_END;
+
+		ret = clk_register(clkp);
+	}
+
+	return ret;
+}
+
 static unsigned long sh_clk_div4_recalc(struct clk *clk)
 {
 	struct clk_div_mult_table *table = clk->priv;
@@ -51,14 +115,9 @@ static unsigned long sh_clk_div4_recalc(struct clk *clk)
 	return clk->freq_table[idx].frequency;
 }
 
-static long sh_clk_div4_round_rate(struct clk *clk, unsigned long rate)
-{
-	return clk_rate_table_round(clk, clk->freq_table, rate);
-}
-
 static struct clk_ops sh_clk_div4_clk_ops = {
 	.recalc		= sh_clk_div4_recalc,
-	.round_rate	= sh_clk_div4_round_rate,
+	.round_rate	= sh_clk_div_round_rate,
 };
 
 int __init sh_clk_div4_register(struct clk *clks, int nr,
-- 
GitLab


From 0d4fdbb64f472ef31195714993f1263f77cf85ca Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 09:22:02 +0000
Subject: [PATCH 1943/2198] sh: rework mode pin code

This patch reworks the mode pin code to keep the pin
definitions in one place. The mode pins values are now
the value of the bit instead of bit number.

With this patch in place the sh7785 header file contains
mode pin comments. The sh7785 clock code and the sh7785lcr
board code are updated to reflect the new shared mode pins.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/board-sh7785lcr.c       | 14 ++++-----
 arch/sh/include/asm/processor.h        | 17 +++++++++++
 arch/sh/include/cpu-sh4/cpu/sh7785.h   | 39 +++++++++++++-------------
 arch/sh/kernel/cpu/sh4a/clock-sh7785.c |  2 +-
 arch/sh/kernel/setup.c                 |  2 +-
 5 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c
index c2894c5b71ea0..7be56fb06c1f0 100644
--- a/arch/sh/boards/board-sh7785lcr.c
+++ b/arch/sh/boards/board-sh7785lcr.c
@@ -330,13 +330,13 @@ static int sh7785lcr_mode_pins(void)
 	 * If you change these dip switches then you will need to
 	 * adjust the values below as well.
 	 */
-	value |= 1 << MODE_PIN_MODE4; /* Clock Mode 16 */
-	value |= 1 << MODE_PIN_MODE5; /* 32-bit Area0 bus width */
-	value |= 1 << MODE_PIN_MODE6; /* 32-bit Area0 bus width */
-	value |= 1 << MODE_PIN_MODE7; /* Area 0 SRAM interface [fixed] */
-	value |= 1 << MODE_PIN_MODE8; /* Little Endian */
-	value |= 1 << MODE_PIN_MODE9; /* Master Mode */
-	value |= 1 << MODE_PIN_MODE14; /* No PLL step-up */
+	value |= MODE_PIN4; /* Clock Mode 16 */
+	value |= MODE_PIN5; /* 32-bit Area0 bus width */
+	value |= MODE_PIN6; /* 32-bit Area0 bus width */
+	value |= MODE_PIN7; /* Area 0 SRAM interface [fixed] */
+	value |= MODE_PIN8; /* Little Endian */
+	value |= MODE_PIN9; /* Master Mode */
+	value |= MODE_PIN14; /* No PLL step-up */
 
 	return value;
 }
diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h
index fb67482e47eb4..ff7daaf9a6202 100644
--- a/arch/sh/include/asm/processor.h
+++ b/arch/sh/include/asm/processor.h
@@ -95,6 +95,23 @@ const char *get_cpu_subtype(struct sh_cpuinfo *c);
 extern const struct seq_operations cpuinfo_op;
 
 /* processor boot mode configuration */
+#define MODE_PIN0 (1 << 0)
+#define MODE_PIN1 (1 << 1)
+#define MODE_PIN2 (1 << 2)
+#define MODE_PIN3 (1 << 3)
+#define MODE_PIN4 (1 << 4)
+#define MODE_PIN5 (1 << 5)
+#define MODE_PIN6 (1 << 6)
+#define MODE_PIN7 (1 << 7)
+#define MODE_PIN8 (1 << 8)
+#define MODE_PIN9 (1 << 9)
+#define MODE_PIN10 (1 << 10)
+#define MODE_PIN11 (1 << 11)
+#define MODE_PIN12 (1 << 12)
+#define MODE_PIN13 (1 << 13)
+#define MODE_PIN14 (1 << 14)
+#define MODE_PIN15 (1 << 15)
+
 int generic_mode_pins(void);
 int test_mode_pin(int pin);
 
diff --git a/arch/sh/include/cpu-sh4/cpu/sh7785.h b/arch/sh/include/cpu-sh4/cpu/sh7785.h
index 89afaa6dc2d8f..9dc9d91e0a8e1 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7785.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7785.h
@@ -1,25 +1,26 @@
 #ifndef __ASM_SH7785_H__
 #define __ASM_SH7785_H__
 
-/* Boot Mode Pins, more information in sh7785 manual Rev.1.00, page 1628 */
-enum {
-	MODE_PIN_MODE0,  /* CPG - Initial Pck/Bck Frequency [FRQMR1] */
-	MODE_PIN_MODE1,  /* CPG - Initial Uck/SHck/DDRck Frequency [FRQMR1] */
-	MODE_PIN_MODE2,  /* CPG - Reserved (L: Normal operation) */
-	MODE_PIN_MODE3,  /* CPG - Reserved (L: Normal operation) */
-	MODE_PIN_MODE4,  /* CPG - Initial PLL setting (72x/36x) */
-	MODE_PIN_MODE5,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.8] */
-	MODE_PIN_MODE6,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.9] */
-	MODE_PIN_MODE7,  /* LBSC - Area0 Memory Type / Bus Width [CS0BCR.3] */
-	MODE_PIN_MODE8,  /* LBSC - Endian Mode (L: Big, H: Little) [BCR.31] */
-	MODE_PIN_MODE9,  /* LBSC - Master/Slave Mode (L: Slave) [BCR.30] */
-	MODE_PIN_MODE10, /* CPG - Clock Input (L: Ext Clk, H: Crystal) */
-	MODE_PIN_MODE11, /* PCI - Pin Mode (LL: PCI host, LH: PCI slave) */
-	MODE_PIN_MODE12, /* PCI - Pin Mode (HL: Local bus, HH: DU) */
-	MODE_PIN_MODE13, /* Boot Address Mode (L: 29-bit, H: 32-bit) */
-	MODE_PIN_MODE14, /* Reserved (H: Normal operation) */
-	MODE_PIN_MPMD,   /* Emulation Mode (L: Emulation mode, H: LSI mode) */
-};
+/* Boot Mode Pins:
+ *
+ * MODE0: CPG - Initial Pck/Bck Frequency [FRQMR1]
+ * MODE1: CPG - Initial Uck/SHck/DDRck Frequency [FRQMR1]
+ * MODE2: CPG - Reserved (L: Normal operation)
+ * MODE3: CPG - Reserved (L: Normal operation)
+ * MODE4: CPG - Initial PLL setting (72x/36x)
+ * MODE5: LBSC - Area0 Memory Type / Bus Width [CS0BCR.8]
+ * MODE6: LBSC - Area0 Memory Type / Bus Width [CS0BCR.9]
+ * MODE7: LBSC - Area0 Memory Type / Bus Width [CS0BCR.3]
+ * MODE8: LBSC - Endian Mode (L: Big, H: Little) [BCR.31]
+ * MODE9: LBSC - Master/Slave Mode (L: Slave) [BCR.30]
+ * MODE10: CPG - Clock Input (L: Ext Clk, H: Crystal)
+ * MODE11: PCI - Pin Mode (LL: PCI host, LH: PCI slave)
+ * MODE12: PCI - Pin Mode (HL: Local bus, HH: DU)
+ * MODE13: Boot Address Mode (L: 29-bit, H: 32-bit)
+ * MODE14: Reserved (H: Normal operation)
+ *
+ * More information in sh7785 manual Rev.1.00, page 1628.
+ */
 
 /* Pin Function Controller:
  * GPIO_FN_xx - GPIO used to select pin function
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
index dae20aca536d0..73abfbf2f16d2 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c
@@ -32,7 +32,7 @@ static unsigned long pll_recalc(struct clk *clk)
 {
 	int multiplier;
 
-	multiplier = test_mode_pin(MODE_PIN_MODE4) ? 36 : 72;
+	multiplier = test_mode_pin(MODE_PIN4) ? 36 : 72;
 
 	return clk->parent->rate * multiplier;
 }
diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c
index 050131eec7738..dd38338553ef5 100644
--- a/arch/sh/kernel/setup.c
+++ b/arch/sh/kernel/setup.c
@@ -429,7 +429,7 @@ int generic_mode_pins(void)
 
 int test_mode_pin(int pin)
 {
-	return sh_mv.mv_mode_pins() & (1 << pin);
+	return sh_mv.mv_mode_pins() & pin;
 }
 
 static const char *cpu_name[] = {
-- 
GitLab


From e4218ef506a2f8b0b5bf1a4b6effcc2f8983f86b Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 09:45:16 +0000
Subject: [PATCH 1944/2198] sh: sh7723 mode pin V2

This patch is sh7723 mode pin V2. Mode pins and
pin function controller comments are added.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7723.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7723.h b/arch/sh/include/cpu-sh4/cpu/sh7723.h
index 9d2f6d7aa938d..14c8ca936781c 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7723.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7723.h
@@ -1,6 +1,20 @@
 #ifndef __ASM_SH7723_H__
 #define __ASM_SH7723_H__
 
+/* Boot Mode Pins:
+ *
+ * MD0: CPG - Clock Mode 0->3
+ * MD1: CPG - Clock Mode 0->3
+ * MD2: CPG - Reserved (L: Normal operation)
+ * MD3: BSC - Area0 Bus Width (16/32-bit) [CS0BCR.9,10]
+ * MD5: BSC - Endian Mode (L: Big, H: Little) [CMNCR.3]
+ * MD8: Test Mode
+ */
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PTA */
 	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
-- 
GitLab


From 36e5b26bdac0ee3791071d240334853ddd7b9790 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 2 Jun 2009 10:47:32 +0000
Subject: [PATCH 1945/2198] sh: sh7724 mode pin comments

This patch adds comments for the sh7724 mode pins
and pin function controller.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7724.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7724.h b/arch/sh/include/cpu-sh4/cpu/sh7724.h
index 34605c9e354db..66fd1184359ec 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7724.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7724.h
@@ -1,6 +1,20 @@
 #ifndef __ASM_SH7724_H__
 #define __ASM_SH7724_H__
 
+/* Boot Mode Pins:
+ *
+ * MD0: CPG - Clock Mode 0->7
+ * MD1: CPG - Clock Mode 0->7
+ * MD2: CPG - Clock Mode 0->7
+ * MD3: BSC - Area0 Bus Width (16/32-bit) [CS0BCR.9,10]
+ * MD5: BSC - Endian Mode (L: Big, H: Little) [CMNCR.3]
+ * MD8: Test Mode
+ */
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PTA */
 	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
-- 
GitLab


From ed740cb9b7f6eaee7bb45a07ce53ff9e73a92798 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:18:56 +0000
Subject: [PATCH 1946/2198] sh: sh7722 mode pin definitions

This patch adds sh7722 mode pin and pin function
controller comments.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/cpu-sh4/cpu/sh7722.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/sh/include/cpu-sh4/cpu/sh7722.h b/arch/sh/include/cpu-sh4/cpu/sh7722.h
index 4b3096f5307b8..738ea43c5038d 100644
--- a/arch/sh/include/cpu-sh4/cpu/sh7722.h
+++ b/arch/sh/include/cpu-sh4/cpu/sh7722.h
@@ -1,6 +1,20 @@
 #ifndef __ASM_SH7722_H__
 #define __ASM_SH7722_H__
 
+/* Boot Mode Pins:
+ *
+ * MD0: CPG - Clock Mode 0->3
+ * MD1: CPG - Clock Mode 0->3
+ * MD2: CPG - Reserved (L: Normal operation)
+ * MD3: BSC - Area0 Bus Width (16/32-bit) [CS0BCR.9,10]
+ * MD5: BSC - Endian Mode (L: Big, H: Little) [CMNCR.3]
+ * MD8: Test Mode
+ */
+
+/* Pin Function Controller:
+ * GPIO_FN_xx - GPIO used to select pin function
+ * GPIO_Pxx - GPIO mapped to real I/O pin on CPU
+ */
 enum {
 	/* PTA */
 	GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4,
-- 
GitLab


From 0ec80fddf1674579d52b5ff94774ba73107cad31 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:22:50 +0000
Subject: [PATCH 1947/2198] sh: add Migo-R mode pin configuration

This patch adds mode pin configuration and
a machvec structure to Migo-R.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/mach-migor/setup.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/arch/sh/boards/mach-migor/setup.c b/arch/sh/boards/mach-migor/setup.c
index 1ee1de0bc1c3c..6ed401cd3156a 100644
--- a/arch/sh/boards/mach-migor/setup.c
+++ b/arch/sh/boards/mach-migor/setup.c
@@ -584,3 +584,22 @@ static int __init migor_devices_setup(void)
 	return platform_add_devices(migor_devices, ARRAY_SIZE(migor_devices));
 }
 __initcall(migor_devices_setup);
+
+/* Return the board specific boot mode pin configuration */
+static int migor_mode_pins(void)
+{
+	/* MD0=1, MD1=1, MD2=0: Clock Mode 3
+	 * MD3=0: 16-bit Area0 Bus Width
+	 * MD5=1: Little Endian
+	 * TSTMD=1, MD8=0: Test Mode Disabled
+	 */
+	return MODE_PIN0 | MODE_PIN1 | MODE_PIN5;
+}
+
+/*
+ * The Machine Vector
+ */
+static struct sh_machine_vector mv_migor __initmv = {
+	.mv_name		= "Migo-R",
+	.mv_mode_pins		= migor_mode_pins,
+};
-- 
GitLab


From c01641b42a88c475fd4b72cff2b10e42262a80fe Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:26:40 +0000
Subject: [PATCH 1948/2198] sh: add AP325RXA mode pin configuration

This patch adds mode pin configuration to ap325rxa.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/boards/board-ap325rxa.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/sh/boards/board-ap325rxa.c b/arch/sh/boards/board-ap325rxa.c
index f2a29641b6a32..1c4d83ef2a472 100644
--- a/arch/sh/boards/board-ap325rxa.c
+++ b/arch/sh/boards/board-ap325rxa.c
@@ -535,6 +535,18 @@ static int __init ap325rxa_devices_setup(void)
 }
 device_initcall(ap325rxa_devices_setup);
 
+/* Return the board specific boot mode pin configuration */
+static int ap325rxa_mode_pins(void)
+{
+	/* MD0=0, MD1=0, MD2=0: Clock Mode 0
+	 * MD3=0: 16-bit Area0 Bus Width
+	 * MD5=1: Little Endian
+	 * TSTMD=1, MD8=1: Test Mode Disabled
+	 */
+	return MODE_PIN5 | MODE_PIN8;
+}
+
 static struct sh_machine_vector mv_ap325rxa __initmv = {
 	.mv_name = "AP-325RXA",
+	.mv_mode_pins = ap325rxa_mode_pins,
 };
-- 
GitLab


From 098dee99d14e8324d3793df442d6078d0c134140 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 05:31:41 +0000
Subject: [PATCH 1949/2198] sh: add enable()/disable()/set_rate() to div6 code

This patch updates the div6 clock helper code to add support
for enable(), disable() and set_rate() callbacks.

Needed by the camera clock enabling board code on Migo-R.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/clock.h    |  4 ++++
 arch/sh/kernel/cpu/clock-cpg.c | 44 ++++++++++++++++++++++++++++++++++
 arch/sh/kernel/cpu/clock.c     | 19 +++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h
index 026b9daa5584d..9fe7d7f8af404 100644
--- a/arch/sh/include/asm/clock.h
+++ b/arch/sh/include/asm/clock.h
@@ -119,6 +119,10 @@ long clk_rate_table_round(struct clk *clk,
 			  struct cpufreq_frequency_table *freq_table,
 			  unsigned long rate);
 
+int clk_rate_table_find(struct clk *clk,
+			struct cpufreq_frequency_table *freq_table,
+			unsigned long rate);
+
 #define SH_CLK_MSTP32(_name, _id, _parent, _enable_reg,	\
 	    _enable_bit, _flags)			\
 {							\
diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c
index fedc8b84db4c2..275942e58e4f1 100644
--- a/arch/sh/kernel/cpu/clock-cpg.c
+++ b/arch/sh/kernel/cpu/clock-cpg.c
@@ -68,9 +68,53 @@ static unsigned long sh_clk_div6_recalc(struct clk *clk)
 	return clk->freq_table[idx].frequency;
 }
 
+static int sh_clk_div6_set_rate(struct clk *clk,
+				unsigned long rate, int algo_id)
+{
+	unsigned long value;
+	int idx;
+
+	idx = clk_rate_table_find(clk, clk->freq_table, rate);
+	if (idx < 0)
+		return idx;
+
+	value = __raw_readl(clk->enable_reg);
+	value &= ~0x3f;
+	value |= idx;
+	__raw_writel(value, clk->enable_reg);
+	return 0;
+}
+
+static int sh_clk_div6_enable(struct clk *clk)
+{
+	unsigned long value;
+	int ret;
+
+	ret = sh_clk_div6_set_rate(clk, clk->rate, 0);
+	if (ret == 0) {
+		value = __raw_readl(clk->enable_reg);
+		value &= ~0x100; /* clear stop bit to enable clock */
+		__raw_writel(value, clk->enable_reg);
+	}
+	return ret;
+}
+
+static void sh_clk_div6_disable(struct clk *clk)
+{
+	unsigned long value;
+
+	value = __raw_readl(clk->enable_reg);
+	value |= 0x100; /* stop clock */
+	value |= 0x3f; /* VDIV bits must be non-zero, overwrite divider */
+	__raw_writel(value, clk->enable_reg);
+}
+
 static struct clk_ops sh_clk_div6_clk_ops = {
 	.recalc		= sh_clk_div6_recalc,
 	.round_rate	= sh_clk_div_round_rate,
+	.set_rate	= sh_clk_div6_set_rate,
+	.enable		= sh_clk_div6_enable,
+	.disable	= sh_clk_div6_disable,
 };
 
 int __init sh_clk_div6_register(struct clk *clks, int nr)
diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c
index aa0fd0893585b..f3a46be2ae81c 100644
--- a/arch/sh/kernel/cpu/clock.c
+++ b/arch/sh/kernel/cpu/clock.c
@@ -111,6 +111,25 @@ long clk_rate_table_round(struct clk *clk,
 	return rate_best_fit;
 }
 
+int clk_rate_table_find(struct clk *clk,
+			struct cpufreq_frequency_table *freq_table,
+			unsigned long rate)
+{
+	int i;
+
+	for (i = 0; freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
+		unsigned long freq = freq_table[i].frequency;
+
+		if (freq == CPUFREQ_ENTRY_INVALID)
+			continue;
+
+		if (freq == rate)
+			return i;
+	}
+
+	return -ENOENT;
+}
+
 /* Used for clocks that always have same value as the parent clock */
 unsigned long followparent_recalc(struct clk *clk)
 {
-- 
GitLab


From c521dc02034df3681394a30b428bf081cfa22253 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 3 Jun 2009 08:39:59 +0000
Subject: [PATCH 1950/2198] sh: sh7723 clock framework rewrite V2

This patch contains V2 of the sh7723 clock framework
rewrite. The new code makes use of the recently merged
div4, div6 and mstp32 helper code. Both extal and dll
are supported as input clocks to the pll.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   2 +-
 arch/sh/kernel/cpu/sh4a/Makefile       |   2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |  57 +------
 arch/sh/kernel/cpu/sh4a/clock-sh7723.c | 222 +++++++++++++++++++++++++
 4 files changed, 225 insertions(+), 58 deletions(-)
 create mode 100644 arch/sh/kernel/cpu/sh4a/clock-sh7723.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index dec8b7b8dc6cf..d13390b81aa58 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index afd6fba47849c..25b2833f74462 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -26,7 +26,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7785)	:= clock-sh7785.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index c090c9a373f66..ccefd7dde78c1 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7343, SH7722, SH7723 & SH7366 support for the clock framework
+ * SH7343, SH7722 & SH7366 support for the clock framework
  *
  * Copyright (c) 2006-2007 Nomad Global Solutions Inc
  * Based on code for sh7343 by Paul Mundt
@@ -176,11 +176,6 @@ static unsigned long module_clk_recalc(struct clk *clk)
 #define STCMASK		0x3f
 #define DIVCALC(div)	(div/2-1)
 #define FRQCRKICK	0x80000000
-#elif defined(CONFIG_CPU_SUBTYPE_SH7723)
-#define MASTERDIVS	{ 6, 8, 12, 16 }
-#define STCMASK		0x1f
-#define DIVCALC(div)	(div-1)
-#define FRQCRKICK	0x00000000
 #else
 #define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
 #define STCMASK		0x1f
@@ -681,56 +676,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7723)
-	/* See page 60 of Datasheet V1.0: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
-	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
-	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
-	MSTPCR("l2c0", "sh_clk", 0, 28, 0),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
-	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
-	MSTPCR("intc0", "cpu_clk", 0, 22, 0),
-	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
-	MSTPCR("sh0", "sh_clk", 0, 20, 0),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
-	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
-	MSTPCR("tmu1", "peripheral_clk", 0, 11, 0),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif3", "bus_clk", 0, 6, 0),
-	MSTPCR("scif4", "bus_clk", 0, 5, 0),
-	MSTPCR("scif5", "bus_clk", 0, 4, 0),
-	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
-	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
-	MSTPCR("meram0", "sh_clk", 0, 0, CLK_ENABLE_ON_INIT),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("rtc0", "r_clk", 1, 8, 0),
-	MSTPCR("atapi0", "sh_clk", 2, 28, 0),
-	MSTPCR("adc0", "peripheral_clk", 2, 28, 0),
-	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
-	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
-	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
-	MSTPCR("icb0", "bus_clk", 2, 21, 0),
-	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
-	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
-	MSTPCR("keysc0", "r_clk", 2, 14, 0),
-	MSTPCR("usb0", "bus_clk", 2, 11, 0),
-	MSTPCR("2dg0", "bus_clk", 2, 10, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("veu1", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7724)
 	/* See Datasheet : Overview -> Block Diagram */
 	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7723.c b/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
new file mode 100644
index 0000000000000..e67c2678b8ae6
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7723.c
@@ -0,0 +1,222 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7723.c
+ *
+ * SH7723 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7723 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define IRDACLKCR	0xa4150018
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The dll multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
+{
+	unsigned long mult;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
+	else
+		mult = 0;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
+};
+
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+	unsigned long div = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+	else
+		div = 2;
+
+	return (clk->parent->rate * mult) / div;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
+};
+
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
+};
+
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_IRDA, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x0dbf, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x0db4, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x0dbf, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x0dbf, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x0dbf, 0),
+	[DIV4_IRDA] = DIV4("irda_clk", IRDACLKCR, 0, 0x0dbf, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _force_on, _need_cpg, _need_ram) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _force_on * CLK_ENABLE_ON_INIT)
+
+static struct clk mstp_clks[] = {
+	/* See page 60 of Datasheet V1.0: Overview -> Block Diagram */
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, 1, 1, 0),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, 1, 1, 0),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, 1, 1, 0),
+	MSTP("l2c0", &div4_clks[DIV4_SH], MSTPCR0, 28, 1, 1, 0),
+	MSTP("ilmem0", &div4_clks[DIV4_I], MSTPCR0, 27, 1, 1, 0),
+	MSTP("fpu0", &div4_clks[DIV4_I], MSTPCR0, 24, 1, 1, 0),
+	MSTP("intc0", &div4_clks[DIV4_I], MSTPCR0, 22, 1, 1, 0),
+	MSTP("dmac0", &div4_clks[DIV4_B], MSTPCR0, 21, 0, 1, 1),
+	MSTP("sh0", &div4_clks[DIV4_SH], MSTPCR0, 20, 0, 1, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0, 1, 0),
+	MSTP("ubc0", &div4_clks[DIV4_I], MSTPCR0, 17, 0, 1, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0, 1, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0, 0, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0, 0, 0),
+	MSTP("dmac1", &div4_clks[DIV4_B], MSTPCR0, 12, 0, 1, 1),
+	MSTP("tmu1", &div4_clks[DIV4_P], MSTPCR0, 11, 0, 1, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0, 1, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 9, 0, 1, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 8, 0, 1, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 7, 0, 1, 0),
+	MSTP("scif3", &div4_clks[DIV4_B], MSTPCR0, 6, 0, 1, 0),
+	MSTP("scif4", &div4_clks[DIV4_B], MSTPCR0, 5, 0, 1, 0),
+	MSTP("scif5", &div4_clks[DIV4_B], MSTPCR0, 4, 0, 1, 0),
+	MSTP("msiof0", &div4_clks[DIV4_B], MSTPCR0, 2, 0, 1, 0),
+	MSTP("msiof1", &div4_clks[DIV4_B], MSTPCR0, 1, 0, 1, 0),
+	MSTP("meram0", &div4_clks[DIV4_SH], MSTPCR0, 0, 1, 1, 0),
+
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0, 1, 0),
+	MSTP("rtc0", &r_clk, MSTPCR1, 8, 0, 0, 0),
+
+	MSTP("atapi0", &div4_clks[DIV4_SH], MSTPCR2, 28, 0, 1, 0),
+	MSTP("adc0", &div4_clks[DIV4_P], MSTPCR2, 27, 0, 1, 0),
+	MSTP("tpu0", &div4_clks[DIV4_B], MSTPCR2, 25, 0, 1, 0),
+	MSTP("irda0", &div4_clks[DIV4_P], MSTPCR2, 24, 0, 1, 0),
+	MSTP("tsif0", &div4_clks[DIV4_B], MSTPCR2, 22, 0, 1, 0),
+	MSTP("icb0", &div4_clks[DIV4_B], MSTPCR2, 21, 0, 1, 1),
+	MSTP("sdhi0", &div4_clks[DIV4_B], MSTPCR2, 18, 0, 1, 0),
+	MSTP("sdhi1", &div4_clks[DIV4_B], MSTPCR2, 17, 0, 1, 0),
+	MSTP("keysc0", &r_clk, MSTPCR2, 14, 0, 0, 0),
+	MSTP("usb0", &div4_clks[DIV4_B], MSTPCR2, 11, 0, 1, 0),
+	MSTP("2dg0", &div4_clks[DIV4_B], MSTPCR2, 10, 0, 1, 1),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 8, 0, 1, 0),
+	MSTP("veu1", &div4_clks[DIV4_B], MSTPCR2, 6, 1, 1, 1),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0, 1, 1),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0, 1, 1),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0, 1, 1),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, 1, 1, 1),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, 1, 1, 1),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0, 1, 1),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
-- 
GitLab


From b621370a3505f8bd42acc41736cae47d5ce8bd06 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Wed, 10 Jun 2009 11:31:16 +0000
Subject: [PATCH 1951/2198] sh: sh7724 clock framework rewrite V3

This patch contains V3 of the sh7724 clock framework
rewrite. The new code makes use of the recently merged
div4, div6 and mstp32 helper code. Both extal and fll are
supported as input clocks to the pll. The div6 clocks are
fed through a divide-by-3 block.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   2 +-
 arch/sh/kernel/cpu/sh4a/Makefile       |   2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |  85 +--------
 arch/sh/kernel/cpu/sh4a/clock-sh7724.c | 242 +++++++++++++++++++++++++
 4 files changed, 249 insertions(+), 82 deletions(-)
 create mode 100644 arch/sh/kernel/cpu/sh4a/clock-sh7724.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index d13390b81aa58..7b564bddfb536 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 25b2833f74462..02a0b17347be3 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -27,7 +27,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7724.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
 
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index ccefd7dde78c1..5e08504da3a60 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -130,11 +130,7 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * is quite simple..
  */
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-#define STCPLL(frqcr) ((((frqcr >> 24) & 0x3f) + 1) * 2)
-#else
 #define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1)
-#endif
 
 /*
  * Instead of having two separate multipliers/divisors set, like this:
@@ -145,11 +141,7 @@ static void adjust_clocks(int originate, int *l, unsigned long v[],
  * I created the divisors2 array, which is used to calculate rate like
  *   rate = parent * 2 / divisors2[ divisor ];
 */
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 };
-#else
 static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
-#endif
 
 static unsigned long master_clk_recalc(struct clk *clk)
 {
@@ -171,17 +163,10 @@ static unsigned long module_clk_recalc(struct clk *clk)
 	return clk->parent->rate / STCPLL(frqcr);
 }
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-#define MASTERDIVS	{ 12, 16, 24, 30, 32, 36, 48 }
-#define STCMASK		0x3f
-#define DIVCALC(div)	(div/2-1)
-#define FRQCRKICK	0x80000000
-#else
 #define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
 #define STCMASK		0x1f
 #define DIVCALC(div)	(div-1)
 #define FRQCRKICK	0x00000000
-#endif
 
 static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
 {
@@ -557,8 +542,7 @@ static struct clk sh7722_r_clock = {
 	.rate = 32768,
 };
 
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
-    !defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 /*
  * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
  * methods of clk_ops determine which register they should access by
@@ -575,10 +559,9 @@ static struct clk sh7722_siu_b_clock = {
 	.arch_flags = SCLKBCR,
 	.ops = &sh7722_siu_clk_ops,
 };
-#endif /* CONFIG_CPU_SUBTYPE_SH7343, SH7724 */
+#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7722) ||\
-    defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if defined(CONFIG_CPU_SUBTYPE_SH7722)
 static struct clk sh7722_irda_clock = {
 	.name = "irda_clk",
 	.arch_flags = IrDACLKCR,
@@ -676,61 +659,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7724)
-	/* See Datasheet : Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
-	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
-	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
-	MSTPCR("rs0", "bus_clk", 0, 28, 0),
-	MSTPCR("ilmem0", "cpu_clk", 0, 27, 0),
-	MSTPCR("l2c0", "sh_clk", 0, 26, 0),
-	MSTPCR("fpu0", "cpu_clk", 0, 24, 0),
-	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
-	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
-	MSTPCR("sh0", "sh_clk", 0, 20, 0),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
-	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("dmac1", "bus_clk", 0, 12, 0),
-	MSTPCR("tmu1", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 9, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 8, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif3", "bus_clk", 0, 6, 0),
-	MSTPCR("scif4", "bus_clk", 0, 5, 0),
-	MSTPCR("scif5", "bus_clk", 0, 4, 0),
-	MSTPCR("msiof0", "bus_clk", 0, 2, 0),
-	MSTPCR("msiof1", "bus_clk", 0, 1, 0),
-	MSTPCR("keysc0", "r_clk", 1, 12, 0),
-	MSTPCR("rtc0", "r_clk", 1, 11, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
-	MSTPCR("mmc0", "bus_clk", 2, 29, 0),
-	MSTPCR("eth0", "bus_clk", 2, 28, 0),
-	MSTPCR("atapi0", "bus_clk", 2, 26, 0),
-	MSTPCR("tpu0", "bus_clk", 2, 25, 0),
-	MSTPCR("irda0", "peripheral_clk", 2, 24, 0),
-	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
-	MSTPCR("usb1", "bus_clk", 2, 21, 0),
-	MSTPCR("usb0", "bus_clk", 2, 20, 0),
-	MSTPCR("2dg0", "bus_clk", 2, 19, 0),
-	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
-	MSTPCR("sdhi1", "bus_clk", 2, 17, 0),
-	MSTPCR("veu1", "bus_clk", 2, 15, CLK_ENABLE_ON_INIT),
-	MSTPCR("ceu1", "bus_clk", 2, 13, 0),
-	MSTPCR("beu1", "bus_clk", 2, 12, 0),
-	MSTPCR("2ddmac0", "sh_clk", 2, 10, 0),
-	MSTPCR("spu0", "bus_clk", 2, 9, 0),
-	MSTPCR("jpu0", "bus_clk", 2, 6, 0),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7343)
 	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
 	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
@@ -802,14 +730,11 @@ static struct clk *sh7722_clocks[] = {
 	&sh7722_sh_clock,
 	&sh7722_peripheral_clock,
 	&sh7722_sdram_clock,
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\
-    !defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 	&sh7722_siu_a_clock,
 	&sh7722_siu_b_clock,
 #endif
-/* 7724 should support FSI clock */
-#if defined(CONFIG_CPU_SUBTYPE_SH7722) || \
-    defined(CONFIG_CPU_SUBTYPE_SH7724)
+#if defined(CONFIG_CPU_SUBTYPE_SH7722)
 	&sh7722_irda_clock,
 #endif
 	&sh7722_video_clock,
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7724.c b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
new file mode 100644
index 0000000000000..5d5c9b952883a
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7724.c
@@ -0,0 +1,242 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7724.c
+ *
+ * SH7724 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7724 registers */
+#define FRQCRA		0xa4150000
+#define FRQCRB		0xa4150004
+#define VCLKCR		0xa4150048
+#define FCLKACR		0xa4150008
+#define FCLKBCR		0xa415000c
+#define IRDACLKCR	0xa4150018
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define SPUCLKCR	0xa415003c
+#define FLLFRQ		0xa4150050
+#define LSTATS		0xa4150060
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The fll multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long fll_recalc(struct clk *clk)
+{
+	unsigned long mult = 0;
+	unsigned long div = 1;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(FLLFRQ) & 0x3ff;
+
+	if (__raw_readl(FLLFRQ) & 0x4000)
+		div = 2;
+
+	return (clk->parent->rate * mult) / div;
+}
+
+static struct clk_ops fll_clk_ops = {
+	.recalc		= fll_recalc,
+};
+
+static struct clk fll_clk = {
+	.name           = "fll_clk",
+	.id             = -1,
+	.ops		= &fll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCRA) >> 24) & 0x3f) + 1) * 2;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+/* A fixed divide-by-3 block use by the div6 clocks */
+static unsigned long div3_recalc(struct clk *clk)
+{
+	return clk->parent->rate / 3;
+}
+
+static struct clk_ops div3_clk_ops = {
+	.recalc		= div3_recalc,
+};
+
+static struct clk div3_clk = {
+	.name		= "div3_clk",
+	.id		= -1,
+	.ops		= &div3_clk_ops,
+	.parent		= &pll_clk,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&fll_clk,
+	&pll_clk,
+	&div3_clk,
+};
+
+static int divisors[] = { 2, 0, 4, 6, 8, 12, 16, 0, 24, 32, 36, 48, 0, 72 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+};
+
+enum { DIV4_I, DIV4_SH, DIV4_B, DIV4_P, DIV4_M1, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCRA, 20, 0x2f7d, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCRA, 12, 0x2f7c, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCRA, 8, 0x2f7c, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCRA, 0, 0x2f7c, 0),
+	[DIV4_M1] = DIV4("vpu_clk", FRQCRB, 4, 0x2f7c, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &div3_clk, VCLKCR, 0),
+	SH_CLK_DIV6("fsia_clk", &div3_clk, FCLKACR, 0),
+	SH_CLK_DIV6("fsib_clk", &div3_clk, FCLKBCR, 0),
+	SH_CLK_DIV6("irda_clk", &div3_clk, IRDACLKCR, 0),
+	SH_CLK_DIV6("spu_clk", &div3_clk, SPUCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _force_on, _need_cpg, _need_ram) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _force_on * CLK_ENABLE_ON_INIT)
+
+static struct clk mstp_clks[] = {
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, 1, 1, 0),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, 1, 1, 0),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, 1, 1, 0),
+	MSTP("rs0", &div4_clks[DIV4_B], MSTPCR0, 28, 1, 1, 0),
+	MSTP("ilmem0", &div4_clks[DIV4_I], MSTPCR0, 27, 1, 1, 0),
+	MSTP("l2c0", &div4_clks[DIV4_SH], MSTPCR0, 26, 1, 1, 0),
+	MSTP("fpu0", &div4_clks[DIV4_I], MSTPCR0, 24, 1, 1, 0),
+	MSTP("intc0", &div4_clks[DIV4_P], MSTPCR0, 22, 1, 1, 0),
+	MSTP("dmac0", &div4_clks[DIV4_B], MSTPCR0, 21, 0, 1, 1),
+	MSTP("sh0", &div4_clks[DIV4_SH], MSTPCR0, 20, 0, 1, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0, 1, 0),
+	MSTP("ubc0", &div4_clks[DIV4_I], MSTPCR0, 17, 0, 1, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0, 1, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0, 0, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0, 0, 0),
+	MSTP("dmac1", &div4_clks[DIV4_B], MSTPCR0, 12, 0, 1, 1),
+	MSTP("tmu1", &div4_clks[DIV4_P], MSTPCR0, 10, 0, 1, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 9, 0, 1, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 8, 0, 1, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 7, 0, 1, 0),
+	MSTP("scif3", &div4_clks[DIV4_B], MSTPCR0, 6, 0, 1, 0),
+	MSTP("scif4", &div4_clks[DIV4_B], MSTPCR0, 5, 0, 1, 0),
+	MSTP("scif5", &div4_clks[DIV4_B], MSTPCR0, 4, 0, 1, 0),
+	MSTP("msiof0", &div4_clks[DIV4_B], MSTPCR0, 2, 0, 1, 0),
+	MSTP("msiof1", &div4_clks[DIV4_B], MSTPCR0, 1, 0, 1, 0),
+
+	MSTP("keysc0", &r_clk, MSTPCR1, 12, 0, 0, 0),
+	MSTP("rtc0", &r_clk, MSTPCR1, 11, 0, 0, 0),
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0, 1, 0),
+	MSTP("i2c1", &div4_clks[DIV4_P], MSTPCR1, 8, 0, 1, 0),
+
+	MSTP("mmc0", &div4_clks[DIV4_B], MSTPCR2, 29, 0, 1, 0),
+	MSTP("eth0", &div4_clks[DIV4_B], MSTPCR2, 28, 0, 1, 0),
+	MSTP("atapi0", &div4_clks[DIV4_B], MSTPCR2, 26, 0, 1, 0),
+	MSTP("tpu0", &div4_clks[DIV4_B], MSTPCR2, 25, 0, 1, 0),
+	MSTP("irda0", &div4_clks[DIV4_P], MSTPCR2, 24, 0, 1, 0),
+	MSTP("tsif0", &div4_clks[DIV4_B], MSTPCR2, 22, 0, 1, 0),
+	MSTP("usb1", &div4_clks[DIV4_B], MSTPCR2, 21, 0, 1, 1),
+	MSTP("usb0", &div4_clks[DIV4_B], MSTPCR2, 20, 0, 1, 1),
+	MSTP("2dg0", &div4_clks[DIV4_B], MSTPCR2, 19, 0, 1, 1),
+	MSTP("sdhi0", &div4_clks[DIV4_B], MSTPCR2, 18, 0, 1, 0),
+	MSTP("sdhi1", &div4_clks[DIV4_B], MSTPCR2, 17, 0, 1, 0),
+	MSTP("veu1", &div4_clks[DIV4_B], MSTPCR2, 15, 1, 1, 1),
+	MSTP("ceu1", &div4_clks[DIV4_B], MSTPCR2, 13, 0, 1, 1),
+	MSTP("beu1", &div4_clks[DIV4_B], MSTPCR2, 12, 0, 1, 1),
+	MSTP("2ddmac0", &div4_clks[DIV4_SH], MSTPCR2, 10, 0, 1, 1),
+	MSTP("spu0", &div4_clks[DIV4_B], MSTPCR2, 9, 0, 1, 0),
+	MSTP("jpu0", &div4_clks[DIV4_B], MSTPCR2, 6, 1, 1, 1),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0, 1, 1),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0, 1, 1),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0, 1, 1),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, 1, 1, 1),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, 1, 1, 1),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0, 1, 1),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or fll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &fll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
-- 
GitLab


From bc49b6eaac6eff86f902a36d846c310e1e0beedf Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 07:32:11 +0000
Subject: [PATCH 1952/2198] sh: sh7343 clock framework rewrite

This patch rewrites the sh7343 clock framework code.
The new code makes use of the recently merged div4,
div6 and mstp32 helper code. Both extal and dll are
supported as input clocks to the pll.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   2 +-
 arch/sh/kernel/cpu/sh4a/Makefile       |   2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7343.c | 211 +++++++++++++++++++++++++
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |  33 +---
 4 files changed, 214 insertions(+), 34 deletions(-)
 create mode 100644 arch/sh/kernel/cpu/sh4a/clock-sh7343.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 7b564bddfb536..39807146d2591 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index 02a0b17347be3..f4415f9ebac73 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -24,7 +24,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7770)	:= clock-sh7770.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7780)	:= clock-sh7780.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7785)	:= clock-sh7785.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7786)	:= clock-sh7786.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7343.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7724.o
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7343.c b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
new file mode 100644
index 0000000000000..0ee3ee8612524
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7343.c
@@ -0,0 +1,211 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7343.c
+ *
+ * SH7343 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7343 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The dll block multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
+{
+	unsigned long mult;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
+	else
+		mult = 0;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
+};
+
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
+};
+
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
+};
+
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x1fff, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x1fff, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x1fff, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _flags) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _flags)
+
+static struct clk mstp_clks[] = {
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, CLK_ENABLE_ON_INIT),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, CLK_ENABLE_ON_INIT),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, CLK_ENABLE_ON_INIT),
+	MSTP("uram0", &div4_clks[DIV4_U], MSTPCR0, 28, CLK_ENABLE_ON_INIT),
+	MSTP("xymem0", &div4_clks[DIV4_B], MSTPCR0, 26, CLK_ENABLE_ON_INIT),
+	MSTP("intc3", &div4_clks[DIV4_P], MSTPCR0, 23, 0),
+	MSTP("intc0", &div4_clks[DIV4_P], MSTPCR0, 22, 0),
+	MSTP("dmac0", &div4_clks[DIV4_P], MSTPCR0, 21, 0),
+	MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0),
+	MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
+	MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0),
+	MSTP("scif3", &div4_clks[DIV4_P], MSTPCR0, 4, 0),
+	MSTP("sio0", &div4_clks[DIV4_P], MSTPCR0, 3, 0),
+	MSTP("siof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0),
+	MSTP("siof1", &div4_clks[DIV4_P], MSTPCR0, 1, 0),
+
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0),
+	MSTP("i2c1", &div4_clks[DIV4_P], MSTPCR1, 8, 0),
+
+	MSTP("tpu0", &div4_clks[DIV4_P], MSTPCR2, 25, 0),
+	MSTP("irda0", &div4_clks[DIV4_P], MSTPCR2, 24, 0),
+	MSTP("sdhi0", &div4_clks[DIV4_P], MSTPCR2, 18, 0),
+	MSTP("mmcif0", &div4_clks[DIV4_P], MSTPCR2, 17, 0),
+	MSTP("sim0", &div4_clks[DIV4_P], MSTPCR2, 16, 0),
+	MSTP("keysc0", &r_clk, MSTPCR2, 14, 0),
+	MSTP("tsif0", &div4_clks[DIV4_P], MSTPCR2, 13, 0),
+	MSTP("s3d40", &div4_clks[DIV4_P], MSTPCR2, 12, 0),
+	MSTP("usbf0", &div4_clks[DIV4_P], MSTPCR2, 11, 0),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 8, 0),
+	MSTP("jpu0", &div4_clks[DIV4_B], MSTPCR2, 6, CLK_ENABLE_ON_INIT),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, CLK_ENABLE_ON_INIT),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, CLK_ENABLE_ON_INIT),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 5e08504da3a60..8aaaac240ada2 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7343, SH7722 & SH7366 support for the clock framework
+ * SH7722 & SH7366 support for the clock framework
  *
  * Copyright (c) 2006-2007 Nomad Global Solutions Inc
  * Based on code for sh7343 by Paul Mundt
@@ -417,7 +417,6 @@ static struct clk_ops sh7722_frqcr_clk_ops = {
 /*
  * clock ops methods for SIU A/B and IrDA clock
  */
-#ifndef CONFIG_CPU_SUBTYPE_SH7343
 static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
 {
 	unsigned long r;
@@ -469,8 +468,6 @@ static struct clk_ops sh7722_siu_clk_ops = {
 	.disable = sh7722_siu_disable,
 };
 
-#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
-
 static int sh7722_video_enable(struct clk *clk)
 {
 	unsigned long r;
@@ -542,7 +539,6 @@ static struct clk sh7722_r_clock = {
 	.rate = 32768,
 };
 
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 /*
  * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
  * methods of clk_ops determine which register they should access by
@@ -559,7 +555,6 @@ static struct clk sh7722_siu_b_clock = {
 	.arch_flags = SCLKBCR,
 	.ops = &sh7722_siu_clk_ops,
 };
-#endif /* CONFIG_CPU_SUBTYPE_SH7343 */
 
 #if defined(CONFIG_CPU_SUBTYPE_SH7722)
 static struct clk sh7722_irda_clock = {
@@ -659,30 +654,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7343)
-	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
-	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
-	MSTPCR("scif3", "peripheral_clk", 0, 4, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("i2c1", "peripheral_clk", 1, 8, 0),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
-	MSTPCR("keysc0", "r_clk", 2, 14, 0),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7366)
 	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
 	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
@@ -730,10 +701,8 @@ static struct clk *sh7722_clocks[] = {
 	&sh7722_sh_clock,
 	&sh7722_peripheral_clock,
 	&sh7722_sdram_clock,
-#if !defined(CONFIG_CPU_SUBTYPE_SH7343)
 	&sh7722_siu_a_clock,
 	&sh7722_siu_b_clock,
-#endif
 #if defined(CONFIG_CPU_SUBTYPE_SH7722)
 	&sh7722_irda_clock,
 #endif
-- 
GitLab


From 4ed37394842e5f9c0c4ea087e8716d4458d03bfb Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 07:35:25 +0000
Subject: [PATCH 1953/2198] sh: sh7366 clock framework rewrite

This patch rewrites the sh7366 clock framework code.
The new code makes use of the recently merged div4,
div6 and mstp32 helper code. Both extal and dll are
supported as input clocks to the pll.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   2 +-
 arch/sh/kernel/cpu/sh4a/Makefile       |   2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7366.c | 211 +++++++++++++++++++++++++
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c |  42 +----
 4 files changed, 214 insertions(+), 43 deletions(-)
 create mode 100644 arch/sh/kernel/cpu/sh4a/clock-sh7366.c

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 39807146d2591..dbc10ff495a9a 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343
+	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343 && !CPU_SUBTYPE_SH7366
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile
index f4415f9ebac73..96ea09ca8cc11 100644
--- a/arch/sh/kernel/cpu/sh4a/Makefile
+++ b/arch/sh/kernel/cpu/sh4a/Makefile
@@ -28,7 +28,7 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7343)	:= clock-sh7343.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7722)	:= clock-sh7722.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7723)	:= clock-sh7723.o
 clock-$(CONFIG_CPU_SUBTYPE_SH7724)	:= clock-sh7724.o
-clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7722.o
+clock-$(CONFIG_CPU_SUBTYPE_SH7366)	:= clock-sh7366.o
 clock-$(CONFIG_CPU_SUBTYPE_SHX3)	:= clock-shx3.o
 
 # Pinmux setup
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7366.c b/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
new file mode 100644
index 0000000000000..a95ebaba095c0
--- /dev/null
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7366.c
@@ -0,0 +1,211 @@
+/*
+ * arch/sh/kernel/cpu/sh4a/clock-sh7366.c
+ *
+ * SH7366 clock framework support
+ *
+ * Copyright (C) 2009 Magnus Damm
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/io.h>
+#include <asm/clock.h>
+
+/* SH7366 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
+};
+
+/*
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
+ */
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
+};
+
+/* The dll block multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
+{
+	unsigned long mult;
+
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
+	else
+		mult = 0;
+
+	return clk->parent->rate * mult;
+}
+
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
+};
+
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+static unsigned long pll_recalc(struct clk *clk)
+{
+	unsigned long mult = 1;
+	unsigned long div = 1;
+
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+	else
+		div = 2;
+
+	return (clk->parent->rate * mult) / div;
+}
+
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
+};
+
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
+
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
+};
+
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
+
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
+};
+
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_NR };
+
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
+
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x1fef, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x1fff, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x1fff, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x1fff, 0),
+};
+
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
+};
+
+#define MSTP(_str, _parent, _reg, _bit, _flags) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _flags)
+
+static struct clk mstp_clks[] = {
+	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
+	MSTP("tlb0", &div4_clks[DIV4_I], MSTPCR0, 31, CLK_ENABLE_ON_INIT),
+	MSTP("ic0", &div4_clks[DIV4_I], MSTPCR0, 30, CLK_ENABLE_ON_INIT),
+	MSTP("oc0", &div4_clks[DIV4_I], MSTPCR0, 29, CLK_ENABLE_ON_INIT),
+	MSTP("rsmem0", &div4_clks[DIV4_SH], MSTPCR0, 28, CLK_ENABLE_ON_INIT),
+	MSTP("xymem0", &div4_clks[DIV4_B], MSTPCR0, 26, CLK_ENABLE_ON_INIT),
+	MSTP("intc3", &div4_clks[DIV4_P], MSTPCR0, 23, 0),
+	MSTP("intc0", &div4_clks[DIV4_P], MSTPCR0, 22, 0),
+	MSTP("dmac0", &div4_clks[DIV4_P], MSTPCR0, 21, 0),
+	MSTP("sh0", &div4_clks[DIV4_P], MSTPCR0, 20, 0),
+	MSTP("hudi0", &div4_clks[DIV4_P], MSTPCR0, 19, 0),
+	MSTP("ubc0", &div4_clks[DIV4_P], MSTPCR0, 17, 0),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
+	MSTP("mfi0", &div4_clks[DIV4_P], MSTPCR0, 11, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0),
+	MSTP("msiof0", &div4_clks[DIV4_P], MSTPCR0, 2, 0),
+	MSTP("sbr0", &div4_clks[DIV4_P], MSTPCR0, 1, 0),
+
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0),
+
+	MSTP("icb0", &div4_clks[DIV4_P], MSTPCR2, 27, 0),
+	MSTP("meram0", &div4_clks[DIV4_P], MSTPCR2, 26, 0),
+	MSTP("dacy1", &div4_clks[DIV4_P], MSTPCR2, 24, 0),
+	MSTP("dacy0", &div4_clks[DIV4_P], MSTPCR2, 23, 0),
+	MSTP("tsif0", &div4_clks[DIV4_P], MSTPCR2, 22, 0),
+	MSTP("sdhi0", &div4_clks[DIV4_P], MSTPCR2, 18, 0),
+	MSTP("mmcif0", &div4_clks[DIV4_P], MSTPCR2, 17, 0),
+	MSTP("usbf0", &div4_clks[DIV4_P], MSTPCR2, 11, 0),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 9, 0),
+	MSTP("veu1", &div4_clks[DIV4_B], MSTPCR2, 7, CLK_ENABLE_ON_INIT),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, CLK_ENABLE_ON_INIT),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, CLK_ENABLE_ON_INIT),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0),
+};
+
+int __init arch_clk_init(void)
+{
+	int k, ret = 0;
+
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
+
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
+
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
+
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
+
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
+
+	return ret;
+}
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 8aaaac240ada2..6ca7c745580dc 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,7 +1,7 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7722 & SH7366 support for the clock framework
+ * SH7722 support for the clock framework
  *
  * Copyright (c) 2006-2007 Nomad Global Solutions Inc
  * Based on code for sh7343 by Paul Mundt
@@ -654,46 +654,6 @@ static struct clk sh7722_mstpcr_clocks[] = {
 	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
 	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
 #endif
-#if defined(CONFIG_CPU_SUBTYPE_SH7366)
-	/* See page 52 of Datasheet V0.40: Overview -> Block Diagram */
-	MSTPCR("tlb0", "cpu_clk", 0, 31, 0),
-	MSTPCR("ic0", "cpu_clk", 0, 30, 0),
-	MSTPCR("oc0", "cpu_clk", 0, 29, 0),
-	MSTPCR("rsmem0", "sh_clk", 0, 28, CLK_ENABLE_ON_INIT),
-	MSTPCR("xymem0", "cpu_clk", 0, 26, CLK_ENABLE_ON_INIT),
-	MSTPCR("intc30", "peripheral_clk", 0, 23, 0),
-	MSTPCR("intc0", "peripheral_clk", 0, 22, 0),
-	MSTPCR("dmac0", "bus_clk", 0, 21, 0),
-	MSTPCR("sh0", "sh_clk", 0, 20, 0),
-	MSTPCR("hudi0", "peripheral_clk", 0, 19, 0),
-	MSTPCR("ubc0", "cpu_clk", 0, 17, 0),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif1", "bus_clk", 0, 6, 0),
-	MSTPCR("scif2", "bus_clk", 0, 5, 0),
-	MSTPCR("msiof0", "peripheral_clk", 0, 2, 0),
-	MSTPCR("sbr0", "peripheral_clk", 0, 1, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("icb0", "bus_clk", 2, 27, 0),
-	MSTPCR("meram0", "sh_clk", 2, 26, 0),
-	MSTPCR("dacc0", "peripheral_clk", 2, 24, 0),
-	MSTPCR("dacy0", "peripheral_clk", 2, 23, 0),
-	MSTPCR("tsif0", "bus_clk", 2, 22, 0),
-	MSTPCR("sdhi0", "bus_clk", 2, 18, 0),
-	MSTPCR("mmcif0", "bus_clk", 2, 17, 0),
-	MSTPCR("usb0", "bus_clk", 2, 11, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("veu1", "bus_clk", 2, 7, CLK_ENABLE_ON_INIT),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
 };
 
 static struct clk *sh7722_clocks[] = {
-- 
GitLab


From 46e9371c0eaade8391b3969431a72272b9007158 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 4 Jun 2009 07:37:59 +0000
Subject: [PATCH 1954/2198] sh: sh7722 clock framework rewrite

This patch rewrites the sh7722 clock framework code.
The new code makes use of the recently merged div4,
div6 and mstp32 helper code. Both extal and dll are
supported as input clocks to the pll.

While at it, now when all SuperH Mobile processors
are converted, fix CONFIG_SH_CLK_CPG_LEGACY to depend
on CONFIG_ARCH_SHMOBILE.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig                        |   2 +-
 arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 796 +++++--------------------
 2 files changed, 138 insertions(+), 660 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index dbc10ff495a9a..b5629cdc37a01 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -516,7 +516,7 @@ config SH_CLK_CPG
 
 config SH_CLK_CPG_LEGACY
 	depends on SH_CLK_CPG
-	def_bool y if !CPU_SUBTYPE_SH7785 && !CPU_SUBTYPE_SH7723 && !CPU_SUBTYPE_SH7724 && !CPU_SUBTYPE_SH7343 && !CPU_SUBTYPE_SH7366
+	def_bool y if !CPU_SUBTYPE_SH7785 && !ARCH_SHMOBILE
 
 config SH_CLK_MD
 	int "CPU Mode Pin Setting"
diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
index 6ca7c745580dc..40f859354f793 100644
--- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
+++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c
@@ -1,719 +1,197 @@
 /*
  * arch/sh/kernel/cpu/sh4a/clock-sh7722.c
  *
- * SH7722 support for the clock framework
+ * SH7722 clock framework support
  *
- * Copyright (c) 2006-2007 Nomad Global Solutions Inc
- * Based on code for sh7343 by Paul Mundt
+ * Copyright (C) 2009 Magnus Damm
  *
- * This file is subject to the terms and conditions of the GNU General Public
- * License.  See the file "COPYING" in the main directory of this archive
- * for more details.
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  */
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/io.h>
-#include <linux/errno.h>
-#include <linux/stringify.h>
 #include <asm/clock.h>
-#include <asm/freq.h>
-
-#define N  (-1)
-#define NM (-2)
-#define ROUND_NEAREST 0
-#define ROUND_DOWN -1
-#define ROUND_UP   +1
-
-static int adjust_algos[][3] = {
-	{},	/* NO_CHANGE */
-	{ NM, N, 1 },   /* N:1, N:1 */
-	{ 3, 2, 2 },	/* 3:2:2 */
-	{ 5, 2, 2 },    /* 5:2:2 */
-	{ N, 1, 1 },	/* N:1:1 */
-
-	{ N, 1 },	/* N:1 */
 
-	{ N, 1 },	/* N:1 */
-	{ 3, 2 },
-	{ 4, 3 },
-	{ 5, 4 },
-
-	{ N, 1 }
+/* SH7722 registers */
+#define FRQCR		0xa4150000
+#define VCLKCR		0xa4150004
+#define SCLKACR		0xa4150008
+#define SCLKBCR		0xa415000c
+#define IRDACLKCR	0xa4150018
+#define PLLCR		0xa4150024
+#define MSTPCR0		0xa4150030
+#define MSTPCR1		0xa4150034
+#define MSTPCR2		0xa4150038
+#define DLLFRQ		0xa4150050
+
+/* Fixed 32 KHz root clock for RTC and Power Management purposes */
+static struct clk r_clk = {
+	.name           = "rclk",
+	.id             = -1,
+	.rate           = 32768,
 };
 
-static unsigned long adjust_pair_of_clocks(unsigned long r1, unsigned long r2,
-			int m1, int m2, int round_flag)
-{
-	unsigned long rem, div;
-	int the_one = 0;
-
-	pr_debug( "Actual values: r1 = %ld\n", r1);
-	pr_debug( "...............r2 = %ld\n", r2);
-
-	if (m1 == m2) {
-		r2 = r1;
-		pr_debug( "setting equal rates: r2 now %ld\n", r2);
-	} else if ((m2 == N  && m1 == 1) ||
-		   (m2 == NM && m1 == N)) { /* N:1 or NM:N */
-		pr_debug( "Setting rates as 1:N (N:N*M)\n");
-		rem = r2 % r1;
-		pr_debug( "...remainder = %ld\n", rem);
-		if (rem) {
-			div = r2 / r1;
-			pr_debug( "...div = %ld\n", div);
-			switch (round_flag) {
-			case ROUND_NEAREST:
-				the_one = rem >= r1/2 ? 1 : 0; break;
-			case ROUND_UP:
-				the_one = 1; break;
-			case ROUND_DOWN:
-				the_one = 0; break;
-			}
-
-			r2 = r1 * (div + the_one);
-			pr_debug( "...setting r2 to %ld\n", r2);
-		}
-	} else if ((m2 == 1  && m1 == N) ||
-		   (m2 == N && m1 == NM)) { /* 1:N or N:NM */
-		pr_debug( "Setting rates as N:1 (N*M:N)\n");
-		rem = r1 % r2;
-		pr_debug( "...remainder = %ld\n", rem);
-		if (rem) {
-			div = r1 / r2;
-			pr_debug( "...div = %ld\n", div);
-			switch (round_flag) {
-			case ROUND_NEAREST:
-				the_one = rem > r2/2 ? 1 : 0; break;
-			case ROUND_UP:
-				the_one = 0; break;
-			case ROUND_DOWN:
-				the_one = 1; break;
-			}
-
-			r2 = r1 / (div + the_one);
-			pr_debug( "...setting r2 to %ld\n", r2);
-		}
-	} else { /* value:value */
-		pr_debug( "Setting rates as %d:%d\n", m1, m2);
-		div = r1 / m1;
-		r2 = div * m2;
-		pr_debug( "...div = %ld\n", div);
-		pr_debug( "...setting r2 to %ld\n", r2);
-	}
-
-	return r2;
-}
-
-static void adjust_clocks(int originate, int *l, unsigned long v[],
-			  int n_in_line)
-{
-	int x;
-
-	pr_debug( "Go down from %d...\n", originate);
-	/* go up recalculation clocks */
-	for (x = originate; x>0; x -- )
-		v[x-1] = adjust_pair_of_clocks(v[x], v[x-1],
-					l[x], l[x-1],
-					ROUND_UP);
-
-	pr_debug( "Go up from %d...\n", originate);
-	/* go down recalculation clocks */
-	for (x = originate; x<n_in_line - 1; x ++ )
-		v[x+1] = adjust_pair_of_clocks(v[x], v[x+1],
-					l[x], l[x+1],
-					ROUND_UP);
-}
-
-
 /*
- * SH7722 uses a common set of multipliers and divisors, so this
- * is quite simple..
+ * Default rate for the root input clock, reset this with clk_set_rate()
+ * from the platform code.
  */
-
-#define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1)
-
-/*
- * Instead of having two separate multipliers/divisors set, like this:
- *
- * static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
- * static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
- *
- * I created the divisors2 array, which is used to calculate rate like
- *   rate = parent * 2 / divisors2[ divisor ];
-*/
-static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 };
-
-static unsigned long master_clk_recalc(struct clk *clk)
-{
-	unsigned frqcr = ctrl_inl(FRQCR);
-
-	return CONFIG_SH_PCLK_FREQ * STCPLL(frqcr);
-}
-
-static void master_clk_init(struct clk *clk)
-{
-	clk->parent = NULL;
-	clk->rate = master_clk_recalc(clk);
-}
-
-static unsigned long module_clk_recalc(struct clk *clk)
-{
-	unsigned long frqcr = ctrl_inl(FRQCR);
-
-	return clk->parent->rate / STCPLL(frqcr);
-}
-
-#define MASTERDIVS	{ 2, 3, 4, 6, 8, 16 }
-#define STCMASK		0x1f
-#define DIVCALC(div)	(div-1)
-#define FRQCRKICK	0x00000000
-
-static int master_clk_setrate(struct clk *clk, unsigned long rate, int id)
-{
-	int div = rate / clk->rate;
-	int master_divs[] = MASTERDIVS;
-	int index;
-	unsigned long frqcr;
-
-	for (index = 1; index < ARRAY_SIZE(master_divs); index++)
-		if (div >= master_divs[index - 1] && div < master_divs[index])
-			break;
-
-	if (index >= ARRAY_SIZE(master_divs))
-		index = ARRAY_SIZE(master_divs);
-	div = master_divs[index - 1];
-
-	frqcr = ctrl_inl(FRQCR);
-	frqcr &= ~(STCMASK << 24);
-	frqcr |= (DIVCALC(div) << 24);
-	frqcr |= FRQCRKICK;
-	ctrl_outl(frqcr, FRQCR);
-
-	return 0;
-}
-
-static struct clk_ops sh7722_master_clk_ops = {
-	.init = master_clk_init,
-	.recalc = master_clk_recalc,
-	.set_rate = master_clk_setrate,
-};
-
-static struct clk_ops sh7722_module_clk_ops = {
-       .recalc = module_clk_recalc,
-};
-
-struct frqcr_context {
-	unsigned mask;
-	unsigned shift;
-};
-
-struct frqcr_context sh7722_get_clk_context(const char *name)
-{
-	struct frqcr_context ctx = { 0, };
-
-	if (!strcmp(name, "peripheral_clk")) {
-		ctx.shift = 0;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "sdram_clk")) {
-		ctx.shift = 4;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "bus_clk")) {
-		ctx.shift = 8;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "sh_clk")) {
-		ctx.shift = 12;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "umem_clk")) {
-		ctx.shift = 16;
-		ctx.mask = 0xF;
-	} else if (!strcmp(name, "cpu_clk")) {
-		ctx.shift = 20;
-		ctx.mask = 7;
-	}
-	return ctx;
-}
-
-/**
- * sh7722_find_div_index - find divisor for setting rate
- *
- * All sh7722 clocks use the same set of multipliers/divisors. This function
- * chooses correct divisor to set the rate of clock with parent clock that
- * generates frequency of 'parent_rate'
- *
- * @parent_rate: rate of parent clock
- * @rate: requested rate to be set
- */
-static int sh7722_find_div_index(unsigned long parent_rate, unsigned rate)
-{
-	unsigned div2 = parent_rate * 2 / rate;
-	int index;
-
-	if (rate > parent_rate)
-		return -EINVAL;
-
-	for (index = 1; index < ARRAY_SIZE(divisors2); index++) {
-		if (div2 > divisors2[index - 1] && div2 <= divisors2[index])
-			break;
-	}
-	if (index >= ARRAY_SIZE(divisors2))
-		index = ARRAY_SIZE(divisors2) - 1;
-	return index;
-}
-
-static unsigned long sh7722_frqcr_recalc(struct clk *clk)
-{
-	struct frqcr_context ctx = sh7722_get_clk_context(clk->name);
-	unsigned long frqcr = ctrl_inl(FRQCR);
-	int index;
-
-	index = (frqcr >> ctx.shift) & ctx.mask;
-	return clk->parent->rate * 2 / divisors2[index];
-}
-
-static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate,
-				 int algo_id)
-{
-	struct frqcr_context ctx = sh7722_get_clk_context(clk->name);
-	unsigned long parent_rate = clk->parent->rate;
-	int div;
-	unsigned long frqcr;
-	int err = 0;
-
-	/* pretty invalid */
-	if (parent_rate < rate)
-		return -EINVAL;
-
-	/* look for multiplier/divisor pair */
-	div = sh7722_find_div_index(parent_rate, rate);
-	if (div<0)
-		return div;
-
-	/* calculate new value of clock rate */
-	clk->rate = parent_rate * 2 / divisors2[div];
-	frqcr = ctrl_inl(FRQCR);
-
-	/* FIXME: adjust as algo_id specifies */
-	if (algo_id != NO_CHANGE) {
-		int originator;
-		char *algo_group_1[] = { "cpu_clk", "umem_clk", "sh_clk" };
-		char *algo_group_2[] = { "sh_clk", "bus_clk" };
-		char *algo_group_3[] = { "sh_clk", "sdram_clk" };
-		char *algo_group_4[] = { "bus_clk", "peripheral_clk" };
-		char *algo_group_5[] = { "cpu_clk", "peripheral_clk" };
-		char **algo_current = NULL;
-		/* 3 is the maximum number of clocks in relation */
-		struct clk *ck[3];
-		unsigned long values[3]; /* the same comment as above */
-		int part_length = -1;
-		int i;
-
-		/*
-		 * all the steps below only required if adjustion was
-		 * requested
-		 */
-		if (algo_id == IUS_N1_N1 ||
-		    algo_id == IUS_322 ||
-		    algo_id == IUS_522 ||
-		    algo_id == IUS_N11) {
-			algo_current = algo_group_1;
-			part_length = 3;
-		}
-		if (algo_id == SB_N1) {
-			algo_current = algo_group_2;
-			part_length = 2;
-		}
-		if (algo_id == SB3_N1 ||
-		    algo_id == SB3_32 ||
-		    algo_id == SB3_43 ||
-		    algo_id == SB3_54) {
-			algo_current = algo_group_3;
-			part_length = 2;
-		}
-		if (algo_id == BP_N1) {
-			algo_current = algo_group_4;
-			part_length = 2;
-		}
-		if (algo_id == IP_N1) {
-			algo_current = algo_group_5;
-			part_length = 2;
-		}
-		if (!algo_current)
-			goto incorrect_algo_id;
-
-		originator = -1;
-		for (i = 0; i < part_length; i ++ ) {
-			if (originator >= 0 && !strcmp(clk->name,
-						       algo_current[i]))
-				originator = i;
-			ck[i] = clk_get(NULL, algo_current[i]);
-			values[i] = clk_get_rate(ck[i]);
-		}
-
-		if (originator >= 0)
-			adjust_clocks(originator, adjust_algos[algo_id],
-				      values, part_length);
-
-		for (i = 0; i < part_length; i ++ ) {
-			struct frqcr_context part_ctx;
-			int part_div;
-
-			if (likely(!err)) {
-				part_div = sh7722_find_div_index(parent_rate,
-								rate);
-				if (part_div > 0) {
-					part_ctx = sh7722_get_clk_context(
-								ck[i]->name);
-					frqcr &= ~(part_ctx.mask <<
-						   part_ctx.shift);
-					frqcr |= part_div << part_ctx.shift;
-				} else
-					err = part_div;
-			}
-
-			ck[i]->ops->recalc(ck[i]);
-			clk_put(ck[i]);
-		}
-	}
-
-	/* was there any error during recalculation ? If so, bail out.. */
-	if (unlikely(err!=0))
-		goto out_err;
-
-	/* clear FRQCR bits */
-	frqcr &= ~(ctx.mask << ctx.shift);
-	frqcr |= div << ctx.shift;
-	frqcr |= FRQCRKICK;
-
-	/* ...and perform actual change */
-	ctrl_outl(frqcr, FRQCR);
-	return 0;
-
-incorrect_algo_id:
-	return -EINVAL;
-out_err:
-	return err;
-}
-
-static long sh7722_frqcr_round_rate(struct clk *clk, unsigned long rate)
-{
-	unsigned long parent_rate = clk->parent->rate;
-	int div;
-
-	/* look for multiplier/divisor pair */
-	div = sh7722_find_div_index(parent_rate, rate);
-	if (div < 0)
-		return clk->rate;
-
-	/* calculate new value of clock rate */
-	return parent_rate * 2 / divisors2[div];
-}
-
-static struct clk_ops sh7722_frqcr_clk_ops = {
-	.recalc = sh7722_frqcr_recalc,
-	.set_rate = sh7722_frqcr_set_rate,
-	.round_rate = sh7722_frqcr_round_rate,
+struct clk extal_clk = {
+	.name		= "extal",
+	.id		= -1,
+	.rate		= 33333333,
 };
 
-/*
- * clock ops methods for SIU A/B and IrDA clock
- */
-static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id)
-{
-	unsigned long r;
-	int div;
-
-	r = ctrl_inl(clk->arch_flags);
-	div = sh7722_find_div_index(clk->parent->rate, rate);
-	if (div < 0)
-		return div;
-	r = (r & ~0xF) | div;
-	ctrl_outl(r, clk->arch_flags);
-	return 0;
-}
-
-static unsigned long sh7722_siu_recalc(struct clk *clk)
-{
-	unsigned long r;
-
-	r = ctrl_inl(clk->arch_flags);
-	return clk->parent->rate * 2 / divisors2[r & 0xF];
-}
-
-static int sh7722_siu_start_stop(struct clk *clk, int enable)
+/* The dll block multiplies the 32khz r_clk, may be used instead of extal */
+static unsigned long dll_recalc(struct clk *clk)
 {
-	unsigned long r;
+	unsigned long mult;
 
-	r = ctrl_inl(clk->arch_flags);
-	if (enable)
-		ctrl_outl(r & ~(1 << 8), clk->arch_flags);
+	if (__raw_readl(PLLCR) & 0x1000)
+		mult = __raw_readl(DLLFRQ);
 	else
-		ctrl_outl(r | (1 << 8), clk->arch_flags);
-	return 0;
-}
+		mult = 0;
 
-static int sh7722_siu_enable(struct clk *clk)
-{
-	return sh7722_siu_start_stop(clk, 1);
-}
-
-static void sh7722_siu_disable(struct clk *clk)
-{
-	sh7722_siu_start_stop(clk, 0);
+	return clk->parent->rate * mult;
 }
 
-static struct clk_ops sh7722_siu_clk_ops = {
-	.recalc = sh7722_siu_recalc,
-	.set_rate = sh7722_siu_set_rate,
-	.enable = sh7722_siu_enable,
-	.disable = sh7722_siu_disable,
+static struct clk_ops dll_clk_ops = {
+	.recalc		= dll_recalc,
 };
 
-static int sh7722_video_enable(struct clk *clk)
-{
-	unsigned long r;
-
-	r = ctrl_inl(VCLKCR);
-	ctrl_outl( r & ~(1<<8), VCLKCR);
-	return 0;
-}
-
-static void sh7722_video_disable(struct clk *clk)
-{
-	unsigned long r;
-
-	r = ctrl_inl(VCLKCR);
-	ctrl_outl( r | (1<<8), VCLKCR);
-}
+static struct clk dll_clk = {
+	.name           = "dll_clk",
+	.id             = -1,
+	.ops		= &dll_clk_ops,
+	.parent		= &r_clk,
+	.flags		= CLK_ENABLE_ON_INIT,
+};
 
-static int sh7722_video_set_rate(struct clk *clk, unsigned long rate,
-				 int algo_id)
+static unsigned long pll_recalc(struct clk *clk)
 {
-	unsigned long r;
+	unsigned long mult = 1;
+	unsigned long div = 1;
 
-	r = ctrl_inl(VCLKCR);
-	r &= ~0x3F;
-	r |= ((clk->parent->rate / rate - 1) & 0x3F);
-	ctrl_outl(r, VCLKCR);
-	return 0;
-}
-
-static unsigned long sh7722_video_recalc(struct clk *clk)
-{
-	unsigned long r;
+	if (__raw_readl(PLLCR) & 0x4000)
+		mult = (((__raw_readl(FRQCR) >> 24) & 0x1f) + 1);
+	else
+		div = 2;
 
-	r = ctrl_inl(VCLKCR);
-	return clk->parent->rate / ((r & 0x3F) + 1);
+	return (clk->parent->rate * mult) / div;
 }
 
-static struct clk_ops sh7722_video_clk_ops = {
-	.recalc = sh7722_video_recalc,
-	.set_rate = sh7722_video_set_rate,
-	.enable = sh7722_video_enable,
-	.disable = sh7722_video_disable,
-};
-/*
- * and at last, clock definitions themselves
- */
-static struct clk sh7722_umem_clock = {
-	.name = "umem_clk",
-	.ops = &sh7722_frqcr_clk_ops,
+static struct clk_ops pll_clk_ops = {
+	.recalc		= pll_recalc,
 };
 
-static struct clk sh7722_sh_clock = {
-	.name = "sh_clk",
-	.ops = &sh7722_frqcr_clk_ops,
+static struct clk pll_clk = {
+	.name		= "pll_clk",
+	.id		= -1,
+	.ops		= &pll_clk_ops,
+	.flags		= CLK_ENABLE_ON_INIT,
 };
 
-static struct clk sh7722_peripheral_clock = {
-	.name = "peripheral_clk",
-	.ops = &sh7722_frqcr_clk_ops,
+struct clk *main_clks[] = {
+	&r_clk,
+	&extal_clk,
+	&dll_clk,
+	&pll_clk,
 };
 
-static struct clk sh7722_sdram_clock = {
-	.name = "sdram_clk",
-	.ops = &sh7722_frqcr_clk_ops,
-};
+static int multipliers[] = { 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+static int divisors[] = { 1, 3, 2, 5, 3, 4, 5, 6, 8, 10, 12, 16, 20 };
 
-static struct clk sh7722_r_clock = {
-	.name = "r_clk",
-	.rate = 32768,
+static struct clk_div_mult_table div4_table = {
+	.divisors = divisors,
+	.nr_divisors = ARRAY_SIZE(divisors),
+	.multipliers = multipliers,
+	.nr_multipliers = ARRAY_SIZE(multipliers),
 };
 
-/*
- * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops
- * methods of clk_ops determine which register they should access by
- * examining clk->name field
- */
-static struct clk sh7722_siu_a_clock = {
-	.name = "siu_a_clk",
-	.arch_flags = SCLKACR,
-	.ops = &sh7722_siu_clk_ops,
-};
+enum { DIV4_I, DIV4_U, DIV4_SH, DIV4_B, DIV4_B3, DIV4_P,
+       DIV4_SIUA, DIV4_SIUB, DIV4_IRDA, DIV4_NR };
 
-static struct clk sh7722_siu_b_clock = {
-	.name = "siu_b_clk",
-	.arch_flags = SCLKBCR,
-	.ops = &sh7722_siu_clk_ops,
-};
+#define DIV4(_str, _reg, _bit, _mask, _flags) \
+  SH_CLK_DIV4(_str, &pll_clk, _reg, _bit, _mask, _flags)
 
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-static struct clk sh7722_irda_clock = {
-	.name = "irda_clk",
-	.arch_flags = IrDACLKCR,
-	.ops = &sh7722_siu_clk_ops,
+struct clk div4_clks[DIV4_NR] = {
+	[DIV4_I] = DIV4("cpu_clk", FRQCR, 20, 0x1fef, CLK_ENABLE_ON_INIT),
+	[DIV4_U] = DIV4("umem_clk", FRQCR, 16, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_SH] = DIV4("shyway_clk", FRQCR, 12, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B] = DIV4("bus_clk", FRQCR, 8, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_B3] = DIV4("b3_clk", FRQCR, 4, 0x1fff, CLK_ENABLE_ON_INIT),
+	[DIV4_P] = DIV4("peripheral_clk", FRQCR, 0, 0x1fff, 0),
+	[DIV4_SIUA] = DIV4("siua_clk", SCLKACR, 0, 0x1fff, 0),
+	[DIV4_SIUB] = DIV4("siub_clk", SCLKBCR, 0, 0x1fff, 0),
+	[DIV4_IRDA] = DIV4("irda_clk", IRDACLKCR, 0, 0x1fff, 0),
 };
-#endif
 
-static struct clk sh7722_video_clock = {
-	.name = "video_clk",
-	.ops = &sh7722_video_clk_ops,
+struct clk div6_clks[] = {
+	SH_CLK_DIV6("video_clk", &pll_clk, VCLKCR, 0),
 };
 
-#define MSTPCR_ARCH_FLAGS(reg, bit) (((reg) << 8) | (bit))
-#define MSTPCR_ARCH_FLAGS_REG(value) ((value) >> 8)
-#define MSTPCR_ARCH_FLAGS_BIT(value) ((value) & 0xff)
-
-static int sh7722_mstpcr_start_stop(struct clk *clk, int enable)
-{
-	unsigned long bit = MSTPCR_ARCH_FLAGS_BIT(clk->arch_flags);
-	unsigned long reg;
-	unsigned long r;
+#define MSTP(_str, _parent, _reg, _bit, _flags) \
+  SH_CLK_MSTP32(_str, -1, _parent, _reg, _bit, _flags)
 
-	switch(MSTPCR_ARCH_FLAGS_REG(clk->arch_flags)) {
-	case 0:
-		reg = MSTPCR0;
-		break;
-	case 1:
-		reg = MSTPCR1;
-		break;
-	case 2:
-		reg = MSTPCR2;
-		break;
-	default:
-		return -EINVAL;
-	}
+static struct clk mstp_clks[] = {
+	MSTP("uram0", &div4_clks[DIV4_U], MSTPCR0, 28, CLK_ENABLE_ON_INIT),
+	MSTP("xymem0", &div4_clks[DIV4_B], MSTPCR0, 26, CLK_ENABLE_ON_INIT),
+	MSTP("tmu0", &div4_clks[DIV4_P], MSTPCR0, 15, 0),
+	MSTP("cmt0", &r_clk, MSTPCR0, 14, 0),
+	MSTP("rwdt0", &r_clk, MSTPCR0, 13, 0),
+	MSTP("flctl0", &div4_clks[DIV4_P], MSTPCR0, 10, 0),
+	MSTP("scif0", &div4_clks[DIV4_P], MSTPCR0, 7, 0),
+	MSTP("scif1", &div4_clks[DIV4_P], MSTPCR0, 6, 0),
+	MSTP("scif2", &div4_clks[DIV4_P], MSTPCR0, 5, 0),
 
-	r = ctrl_inl(reg);
-
-	if (enable)
-		r &= ~(1 << bit);
-	else
-		r |= (1 << bit);
+	MSTP("i2c0", &div4_clks[DIV4_P], MSTPCR1, 9, 0),
+	MSTP("rtc0", &r_clk, MSTPCR1, 8, 0),
 
-	ctrl_outl(r, reg);
-	return 0;
-}
-
-static int sh7722_mstpcr_enable(struct clk *clk)
-{
-	return sh7722_mstpcr_start_stop(clk, 1);
-}
-
-static void sh7722_mstpcr_disable(struct clk *clk)
-{
-	sh7722_mstpcr_start_stop(clk, 0);
-}
-
-static struct clk_ops sh7722_mstpcr_clk_ops = {
-	.enable = sh7722_mstpcr_enable,
-	.disable = sh7722_mstpcr_disable,
-	.recalc = followparent_recalc,
-};
-
-#define MSTPCR(_name, _parent, regnr, bitnr, _flags) \
-{						\
-	.name = _name,				\
-	.flags = _flags,			\
-	.arch_flags = MSTPCR_ARCH_FLAGS(regnr, bitnr),	\
-	.ops = (void *)_parent,		\
-}
-
-static struct clk sh7722_mstpcr_clocks[] = {
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT),
-	MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT),
-	MSTPCR("tmu0", "peripheral_clk", 0, 15, 0),
-	MSTPCR("cmt0", "r_clk", 0, 14, 0),
-	MSTPCR("rwdt0", "r_clk", 0, 13, 0),
-	MSTPCR("flctl0", "peripheral_clk", 0, 10, 0),
-	MSTPCR("scif0", "peripheral_clk", 0, 7, 0),
-	MSTPCR("scif1", "peripheral_clk", 0, 6, 0),
-	MSTPCR("scif2", "peripheral_clk", 0, 5, 0),
-	MSTPCR("i2c0", "peripheral_clk", 1, 9, 0),
-	MSTPCR("rtc0", "r_clk", 1, 8, 0),
-	MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0),
-	MSTPCR("keysc0", "r_clk", 2, 14, 0),
-	MSTPCR("usbf0", "peripheral_clk", 2, 11, 0),
-	MSTPCR("2dg0", "bus_clk", 2, 9, 0),
-	MSTPCR("siu0", "bus_clk", 2, 8, 0),
-	MSTPCR("vou0", "bus_clk", 2, 5, 0),
-	MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT),
-	MSTPCR("beu0", "bus_clk", 2, 4, 0),
-	MSTPCR("ceu0", "bus_clk", 2, 3, 0),
-	MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT),
-	MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT),
-	MSTPCR("lcdc0", "bus_clk", 2, 0, 0),
-#endif
-};
-
-static struct clk *sh7722_clocks[] = {
-	&sh7722_umem_clock,
-	&sh7722_sh_clock,
-	&sh7722_peripheral_clock,
-	&sh7722_sdram_clock,
-	&sh7722_siu_a_clock,
-	&sh7722_siu_b_clock,
-#if defined(CONFIG_CPU_SUBTYPE_SH7722)
-	&sh7722_irda_clock,
-#endif
-	&sh7722_video_clock,
-};
-
-/*
- * init in order: master, module, bus, cpu
- */
-struct clk_ops *onchip_ops[] = {
-	&sh7722_master_clk_ops,
-	&sh7722_module_clk_ops,
-	&sh7722_frqcr_clk_ops,
-	&sh7722_frqcr_clk_ops,
+	MSTP("sdhi0", &div4_clks[DIV4_P], MSTPCR2, 18, 0),
+	MSTP("keysc0", &r_clk, MSTPCR2, 14, 0),
+	MSTP("usbf0", &div4_clks[DIV4_P], MSTPCR2, 11, 0),
+	MSTP("2dg0", &div4_clks[DIV4_B], MSTPCR2, 9, 0),
+	MSTP("siu0", &div4_clks[DIV4_B], MSTPCR2, 8, 0),
+	MSTP("vou0", &div4_clks[DIV4_B], MSTPCR2, 5, 0),
+	MSTP("jpu0", &div4_clks[DIV4_B], MSTPCR2, 6, CLK_ENABLE_ON_INIT),
+	MSTP("beu0", &div4_clks[DIV4_B], MSTPCR2, 4, 0),
+	MSTP("ceu0", &div4_clks[DIV4_B], MSTPCR2, 3, 0),
+	MSTP("veu0", &div4_clks[DIV4_B], MSTPCR2, 2, CLK_ENABLE_ON_INIT),
+	MSTP("vpu0", &div4_clks[DIV4_B], MSTPCR2, 1, CLK_ENABLE_ON_INIT),
+	MSTP("lcdc0", &div4_clks[DIV4_B], MSTPCR2, 0, 0),
 };
 
-void __init
-arch_init_clk_ops(struct clk_ops **ops, int type)
-{
-	BUG_ON(type < 0 || type >= ARRAY_SIZE(onchip_ops));
-	*ops = onchip_ops[type];
-}
-
 int __init arch_clk_init(void)
 {
-	struct clk *clk;
-	int i;
+	int k, ret = 0;
 
-	cpg_clk_init();
+	/* autodetect extal or dll configuration */
+	if (__raw_readl(PLLCR) & 0x1000)
+		pll_clk.parent = &dll_clk;
+	else
+		pll_clk.parent = &extal_clk;
 
-	clk = clk_get(NULL, "master_clk");
-	for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) {
-		pr_debug( "Registering clock '%s'\n", sh7722_clocks[i]->name);
-		sh7722_clocks[i]->parent = clk;
-		clk_register(sh7722_clocks[i]);
-	}
-	clk_put(clk);
+	for (k = 0; !ret && (k < ARRAY_SIZE(main_clks)); k++)
+		ret = clk_register(main_clks[k]);
 
-	clk_register(&sh7722_r_clock);
+	if (!ret)
+		ret = sh_clk_div4_register(div4_clks, DIV4_NR, &div4_table);
 
-	for (i = 0; i < ARRAY_SIZE(sh7722_mstpcr_clocks); i++) {
-		pr_debug( "Registering mstpcr clock '%s'\n",
-			  sh7722_mstpcr_clocks[i].name);
-		clk = clk_get(NULL, (void *) sh7722_mstpcr_clocks[i].ops);
-		sh7722_mstpcr_clocks[i].parent = clk;
-		sh7722_mstpcr_clocks[i].ops = &sh7722_mstpcr_clk_ops;
-		clk_register(&sh7722_mstpcr_clocks[i]);
-		clk_put(clk);
-	}
+	if (!ret)
+		ret = sh_clk_div6_register(div6_clks, ARRAY_SIZE(div6_clks));
 
-	propagate_rate(&sh7722_r_clock); /* make sure rate gets propagated */
+	if (!ret)
+		ret = sh_clk_mstp32_register(mstp_clks, ARRAY_SIZE(mstp_clks));
 
-	return 0;
+	return ret;
 }
-- 
GitLab


From c5eeff1f8ecbc4bc7c1dd8e97a8610bc4dd3def8 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Mon, 8 Jun 2009 10:05:12 +0000
Subject: [PATCH 1955/2198] sh: sh7724: INTC setting update

This patch follows Rev 0.50 manual

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 171 ++++++++++++-------------
 1 file changed, 82 insertions(+), 89 deletions(-)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 000f3b82669b6..585dd85c6c4eb 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -511,46 +511,46 @@ enum {
 	IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7,
 	HUDI,
 	DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3,
-	_2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK,
+	_2DG_TRI, _2DG_INI, _2DG_CEI,
 	DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3,
-	VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI,
-	SCIFA_SCIFA0,
-	VPU_VPUI,
-	TPU_TPUI,
-	CEU21I,
-	BEU21I,
-	USB_USI0,
+	VIO_CEU0, VIO_BEU0, VIO_VEU1, VIO_VOU,
+	SCIFA3,
+	VPU,
+	TPU,
+	CEU1,
+	BEU1,
+	USB0, USB1,
 	ATAPI,
 	RTC_ATI, RTC_PRI, RTC_CUI,
 	DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR,
 	DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR,
-	KEYSC_KEYI,
+	KEYSC,
 	SCIF_SCIF0, SCIF_SCIF1, SCIF_SCIF2,
-	VEU3F0I,
+	VEU0,
 	MSIOF_MSIOFI0, MSIOF_MSIOFI1,
 	SPU_SPUI0, SPU_SPUI1,
-	SCIFA_SCIFA1,
-/*	ICB_ICBI, */
+	SCIFA4,
+	ICB,
 	ETHI,
 	I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI,
 	I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI,
-	SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2,
-	CMT_CMTI,
-	TSIF_TSIFI,
-/*	ICB_LMBI, */
-	FSI_FSI,
-	SCIFA_SCIFA2,
+	SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2, SDHI0_SDHII3,
+	CMT,
+	TSIF,
+	FSI,
+	SCIFA5,
 	TMU0_TUNI0, TMU0_TUNI1, TMU0_TUNI2,
-	IRDA_IRDAI,
+	IRDA,
 	SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2,
-	JPU_JPUI,
-	MMC_MMCI0, MMC_MMCI1, MMC_MMCI2,
-	LCDC_LCDCI,
+	JPU,
+	_2DDMAC,
+	MMC_MMC2I, MMC_MMC3I,
+	LCDC,
 	TMU1_TUNI0, TMU1_TUNI1, TMU1_TUNI2,
 
 	/* interrupt groups */
-	DMAC1A, _2DG, DMAC0A, VIO, RTC,
-	DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMC,
+	DMAC1A, _2DG, DMAC0A, VIO, USB, RTC,
+	DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMCIF,
 };
 
 static struct intc_vect vectors[] __initdata = {
@@ -567,25 +567,25 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(_2DG_TRI, 0x780),
 	INTC_VECT(_2DG_INI, 0x7A0),
 	INTC_VECT(_2DG_CEI, 0x7C0),
-	INTC_VECT(_2DG_BRK, 0x7E0),
 
 	INTC_VECT(DMAC0A_DEI0, 0x800),
 	INTC_VECT(DMAC0A_DEI1, 0x820),
 	INTC_VECT(DMAC0A_DEI2, 0x840),
 	INTC_VECT(DMAC0A_DEI3, 0x860),
 
-	INTC_VECT(VIO_CEU20I, 0x880),
-	INTC_VECT(VIO_BEU20I, 0x8A0),
-	INTC_VECT(VIO_VEU3F1, 0x8C0),
-	INTC_VECT(VIO_VOUI, 0x8E0),
+	INTC_VECT(VIO_CEU0, 0x880),
+	INTC_VECT(VIO_BEU0, 0x8A0),
+	INTC_VECT(VIO_VEU1, 0x8C0),
+	INTC_VECT(VIO_VOU,  0x8E0),
 
-	INTC_VECT(SCIFA_SCIFA0, 0x900),
-	INTC_VECT(VPU_VPUI, 0x980),
-	INTC_VECT(TPU_TPUI, 0x9A0),
-	INTC_VECT(CEU21I, 0x9E0),
-	INTC_VECT(BEU21I, 0xA00),
-	INTC_VECT(USB_USI0, 0xA20),
-	INTC_VECT(ATAPI, 0xA60),
+	INTC_VECT(SCIFA3, 0x900),
+	INTC_VECT(VPU,    0x980),
+	INTC_VECT(TPU,    0x9A0),
+	INTC_VECT(CEU1,   0x9E0),
+	INTC_VECT(BEU1,   0xA00),
+	INTC_VECT(USB0,   0xA20),
+	INTC_VECT(USB1,   0xA40),
+	INTC_VECT(ATAPI,  0xA60),
 
 	INTC_VECT(RTC_ATI, 0xA80),
 	INTC_VECT(RTC_PRI, 0xAA0),
@@ -599,18 +599,18 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(DMAC0B_DEI5, 0xBA0),
 	INTC_VECT(DMAC0B_DADERR, 0xBC0),
 
-	INTC_VECT(KEYSC_KEYI, 0xBE0),
+	INTC_VECT(KEYSC,      0xBE0),
 	INTC_VECT(SCIF_SCIF0, 0xC00),
 	INTC_VECT(SCIF_SCIF1, 0xC20),
 	INTC_VECT(SCIF_SCIF2, 0xC40),
-	INTC_VECT(VEU3F0I, 0xC60),
+	INTC_VECT(VEU0,       0xC60),
 	INTC_VECT(MSIOF_MSIOFI0, 0xC80),
 	INTC_VECT(MSIOF_MSIOFI1, 0xCA0),
 	INTC_VECT(SPU_SPUI0, 0xCC0),
 	INTC_VECT(SPU_SPUI1, 0xCE0),
-	INTC_VECT(SCIFA_SCIFA1, 0xD00),
+	INTC_VECT(SCIFA4,    0xD00),
 
-/*	INTC_VECT(ICB_ICBI, 0xD20), */
+	INTC_VECT(ICB,  0xD20),
 	INTC_VECT(ETHI, 0xD60),
 
 	INTC_VECT(I2C1_ALI, 0xD80),
@@ -626,30 +626,30 @@ static struct intc_vect vectors[] __initdata = {
 	INTC_VECT(SDHI0_SDHII0, 0xE80),
 	INTC_VECT(SDHI0_SDHII1, 0xEA0),
 	INTC_VECT(SDHI0_SDHII2, 0xEC0),
+	INTC_VECT(SDHI0_SDHII3, 0xEE0),
 
-	INTC_VECT(CMT_CMTI, 0xF00),
-	INTC_VECT(TSIF_TSIFI, 0xF20),
-/*	INTC_VECT(ICB_LMBI, 0xF60), */
-	INTC_VECT(FSI_FSI, 0xF80),
-	INTC_VECT(SCIFA_SCIFA2, 0xFA0),
+	INTC_VECT(CMT,    0xF00),
+	INTC_VECT(TSIF,   0xF20),
+	INTC_VECT(FSI,    0xF80),
+	INTC_VECT(SCIFA5, 0xFA0),
 
 	INTC_VECT(TMU0_TUNI0, 0x400),
 	INTC_VECT(TMU0_TUNI1, 0x420),
 	INTC_VECT(TMU0_TUNI2, 0x440),
 
-	INTC_VECT(IRDA_IRDAI, 0x480),
+	INTC_VECT(IRDA,    0x480),
 
 	INTC_VECT(SDHI1_SDHII0, 0x4E0),
 	INTC_VECT(SDHI1_SDHII1, 0x500),
 	INTC_VECT(SDHI1_SDHII2, 0x520),
 
-	INTC_VECT(JPU_JPUI, 0x560),
+	INTC_VECT(JPU, 0x560),
+	INTC_VECT(_2DDMAC, 0x4A0),
 
-	INTC_VECT(MMC_MMCI0, 0x580),
-	INTC_VECT(MMC_MMCI1, 0x5A0),
-	INTC_VECT(MMC_MMCI2, 0x5C0),
+	INTC_VECT(MMC_MMC2I, 0x5A0),
+	INTC_VECT(MMC_MMC3I, 0x5C0),
 
-	INTC_VECT(LCDC_LCDCI, 0xF40),
+	INTC_VECT(LCDC, 0xF40),
 
 	INTC_VECT(TMU1_TUNI0, 0x920),
 	INTC_VECT(TMU1_TUNI1, 0x940),
@@ -658,86 +658,79 @@ static struct intc_vect vectors[] __initdata = {
 
 static struct intc_group groups[] __initdata = {
 	INTC_GROUP(DMAC1A, DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3),
-	INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK),
+	INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI),
 	INTC_GROUP(DMAC0A, DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3),
-	INTC_GROUP(VIO, VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI),
+	INTC_GROUP(VIO, VIO_CEU0, VIO_BEU0, VIO_VEU1, VIO_VOU),
+	INTC_GROUP(USB, USB0, USB1),
 	INTC_GROUP(RTC, RTC_ATI, RTC_PRI, RTC_CUI),
 	INTC_GROUP(DMAC1B, DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR),
 	INTC_GROUP(DMAC0B, DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR),
 	INTC_GROUP(I2C0, I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI),
 	INTC_GROUP(I2C1, I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI),
-	INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2),
+	INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2, SDHI0_SDHII3),
 	INTC_GROUP(SDHI1, SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2),
 	INTC_GROUP(SPU, SPU_SPUI0, SPU_SPUI1),
-	INTC_GROUP(MMC, MMC_MMCI0, MMC_MMCI1, MMC_MMCI2),
+	INTC_GROUP(MMCIF, MMC_MMC2I, MMC_MMC3I),
 };
 
-/* FIXMEEEEEEEEEEEEEEEEEEE !!!!! */
-/* very bad manual !! */
 static struct intc_mask_reg mask_registers[] __initdata = {
 	{ 0xa4080080, 0xa40800c0, 8, /* IMR0 / IMCR0 */
 	  { 0, TMU1_TUNI2, TMU1_TUNI1, TMU1_TUNI0,
-	    /*SDHII3?*/0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } },
+	    0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } },
 	{ 0xa4080084, 0xa40800c4, 8, /* IMR1 / IMCR1 */
-	  { VIO_VOUI, VIO_VEU3F1, VIO_BEU20I, VIO_CEU20I,
+	  { VIO_VOU, VIO_VEU1, VIO_BEU0, VIO_CEU0,
 	    DMAC0A_DEI3, DMAC0A_DEI2, DMAC0A_DEI1, DMAC0A_DEI0 } },
 	{ 0xa4080088, 0xa40800c8, 8, /* IMR2 / IMCR2 */
-	  { 0, 0, 0, VPU_VPUI, ATAPI, ETHI, 0, /*SCIFA3*/SCIFA_SCIFA0 } },
+	  { 0, 0, 0, VPU, ATAPI, ETHI, 0, SCIFA3 } },
 	{ 0xa408008c, 0xa40800cc, 8, /* IMR3 / IMCR3 */
 	  { DMAC1A_DEI3, DMAC1A_DEI2, DMAC1A_DEI1, DMAC1A_DEI0,
-	    SPU_SPUI1, SPU_SPUI0, BEU21I, IRDA_IRDAI } },
+	    SPU_SPUI1, SPU_SPUI0, BEU1, IRDA } },
 	{ 0xa4080090, 0xa40800d0, 8, /* IMR4 / IMCR4 */
 	  { 0, TMU0_TUNI2, TMU0_TUNI1, TMU0_TUNI0,
-	    JPU_JPUI, 0, 0, LCDC_LCDCI } },
+	    JPU, 0, 0, LCDC } },
 	{ 0xa4080094, 0xa40800d4, 8, /* IMR5 / IMCR5 */
-	  { KEYSC_KEYI, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4,
-	    VEU3F0I, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } },
+	  { KEYSC, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4,
+	    VEU0, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } },
 	{ 0xa4080098, 0xa40800d8, 8, /* IMR6 / IMCR6 */
-	  { 0, 0, /*ICB_ICBI*/0, /*SCIFA4*/SCIFA_SCIFA1,
-	    CEU21I, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } },
+	  { 0, 0, ICB, SCIFA4,
+	    CEU1, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } },
 	{ 0xa408009c, 0xa40800dc, 8, /* IMR7 / IMCR7 */
 	  { I2C0_DTEI, I2C0_WAITI, I2C0_TACKI, I2C0_ALI,
 	    I2C1_DTEI, I2C1_WAITI, I2C1_TACKI, I2C1_ALI } },
 	{ 0xa40800a0, 0xa40800e0, 8, /* IMR8 / IMCR8 */
-	  { /*SDHII3*/0, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0,
-	    0, 0, /*SCIFA5*/SCIFA_SCIFA2, FSI_FSI } },
+	  { SDHI0_SDHII3, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0,
+	    0, 0, SCIFA5, FSI } },
 	{ 0xa40800a4, 0xa40800e4, 8, /* IMR9 / IMCR9 */
-	  { 0, 0, 0, CMT_CMTI, 0, /*USB1*/0, USB_USI0, 0 } },
+	  { 0, 0, 0, CMT, 0, USB1, USB0, 0 } },
 	{ 0xa40800a8, 0xa40800e8, 8, /* IMR10 / IMCR10 */
 	  { 0, DMAC1B_DADERR, DMAC1B_DEI5, DMAC1B_DEI4,
-	    0, RTC_ATI, RTC_PRI, RTC_CUI } },
+	    0, RTC_CUI, RTC_PRI, RTC_ATI } },
 	{ 0xa40800ac, 0xa40800ec, 8, /* IMR11 / IMCR11 */
-	  { _2DG_BRK, _2DG_CEI, _2DG_INI, _2DG_TRI,
-	    0, TPU_TPUI, /*ICB_LMBI*/0, TSIF_TSIFI } },
+	  { 0, _2DG_CEI, _2DG_INI, _2DG_TRI,
+	    0, TPU, 0, TSIF } },
 	{ 0xa40800b0, 0xa40800f0, 8, /* IMR12 / IMCR12 */
-	  { 0, 0, 0, 0, 0, 0, 0, 0/*2DDMAC*/ } },
+	  { 0, 0, MMC_MMC3I, MMC_MMC2I, 0, 0, 0, _2DDMAC } },
 	{ 0xa4140044, 0xa4140064, 8, /* INTMSK00 / INTMSKCLR00 */
 	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
 };
 
 static struct intc_prio_reg prio_registers[] __initdata = {
 	{ 0xa4080000, 0, 16, 4, /* IPRA */ { TMU0_TUNI0, TMU0_TUNI1,
-					     TMU0_TUNI2, IRDA_IRDAI } },
-	{ 0xa4080004, 0, 16, 4, /* IPRB */ { JPU_JPUI, LCDC_LCDCI,
-					     DMAC1A, BEU21I } },
+					     TMU0_TUNI2, IRDA } },
+	{ 0xa4080004, 0, 16, 4, /* IPRB */ { JPU, LCDC, DMAC1A, BEU1 } },
 	{ 0xa4080008, 0, 16, 4, /* IPRC */ { TMU1_TUNI0, TMU1_TUNI1,
 					     TMU1_TUNI2, SPU } },
-	{ 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMC, 0, ATAPI } },
-	{ 0xa4080010, 0, 16, 4, /* IPRE */
-	  { DMAC0A, /*BEU?VEU?*/VIO, /*SCIFA3*/SCIFA_SCIFA0, /*VPU5F*/
-	    VPU_VPUI } },
-	{ 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC_KEYI, DMAC0B,
-					     USB_USI0, CMT_CMTI } },
+	{ 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMCIF, 0, ATAPI } },
+	{ 0xa4080010, 0, 16, 4, /* IPRE */ { DMAC0A, VIO, SCIFA3, VPU } },
+	{ 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC, DMAC0B, USB, CMT } },
 	{ 0xa4080018, 0, 16, 4, /* IPRG */ { SCIF_SCIF0, SCIF_SCIF1,
-					     SCIF_SCIF2, VEU3F0I } },
+					     SCIF_SCIF2, VEU0 } },
 	{ 0xa408001c, 0, 16, 4, /* IPRH */ { MSIOF_MSIOFI0, MSIOF_MSIOFI1,
 					     I2C1, I2C0 } },
-	{ 0xa4080020, 0, 16, 4, /* IPRI */ { /*SCIFA4*/SCIFA_SCIFA1, /*ICB*/0,
-					     TSIF_TSIFI, _2DG/*ICB?*/ } },
-	{ 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU21I, ETHI, FSI_FSI, SDHI1 } },
-	{ 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, /*ICB?*/0, SDHI0 } },
-	{ 0xa408002c, 0, 16, 4, /* IPRL */ { /*SCIFA5*/SCIFA_SCIFA2, 0,
-					     TPU_TPUI, /*2DDMAC*/0 } },
+	{ 0xa4080020, 0, 16, 4, /* IPRI */ { SCIFA4, ICB, TSIF, _2DG } },
+	{ 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU1, ETHI, FSI, SDHI1 } },
+	{ 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, 0, SDHI0 } },
+	{ 0xa408002c, 0, 16, 4, /* IPRL */ { SCIFA5, 0, TPU, _2DDMAC } },
 	{ 0xa4140010, 0, 32, 4, /* INTPRI00 */
 	  { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } },
 };
-- 
GitLab


From f168dd00a9440a6f644db73bda47718fd12008e4 Mon Sep 17 00:00:00 2001
From: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Date: Wed, 10 Jun 2009 05:04:32 +0000
Subject: [PATCH 1956/2198] sh: sh7724: Add JPU support

Signed-off-by: Kuninori Morimoto <morimoto.kuninori@renesas.com>
Acked-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 31 ++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
index 585dd85c6c4eb..e5ac9eb11c63a 100644
--- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
+++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c
@@ -451,6 +451,35 @@ static struct platform_device tmu5_device = {
 	.num_resources	= ARRAY_SIZE(tmu5_resources),
 };
 
+/* JPU */
+static struct uio_info jpu_platform_data = {
+	.name = "JPU",
+	.version = "0",
+	.irq = 27,
+};
+
+static struct resource jpu_resources[] = {
+	[0] = {
+		.name	= "JPU",
+		.start	= 0xfe980000,
+		.end	= 0xfe9902d3,
+		.flags	= IORESOURCE_MEM,
+	},
+	[1] = {
+		/* place holder for contiguous memory */
+	},
+};
+
+static struct platform_device jpu_device = {
+	.name		= "uio_pdrv_genirq",
+	.id		= 3,
+	.dev = {
+		.platform_data	= &jpu_platform_data,
+	},
+	.resource	= jpu_resources,
+	.num_resources	= ARRAY_SIZE(jpu_resources),
+};
+
 static struct platform_device *sh7724_devices[] __initdata = {
 	&cmt_device,
 	&tmu0_device,
@@ -466,6 +495,7 @@ static struct platform_device *sh7724_devices[] __initdata = {
 	&vpu_device,
 	&veu0_device,
 	&veu1_device,
+	&jpu_device,
 };
 
 static int __init sh7724_devices_setup(void)
@@ -473,6 +503,7 @@ static int __init sh7724_devices_setup(void)
 	platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20);
 	platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20);
 	platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20);
+	platform_resource_setup_memory(&jpu_device,  "jpu",  2 << 20);
 
 	return platform_add_devices(sh7724_devices,
 				    ARRAY_SIZE(sh7724_devices));
-- 
GitLab


From 4c7c99788631bab177bd51e15e893be4689bb085 Mon Sep 17 00:00:00 2001
From: Aoi Shinkai <shinkoi2005@gmail.com>
Date: Wed, 10 Jun 2009 16:15:42 +0000
Subject: [PATCH 1957/2198] sh: Fix sh4a llsc-based cmpxchg()

This fixes up a typo in the ll/sc based cmpxchg code which apparently
wasn't getting a lot of testing due to the swapped old/new pair. With
that fixed up, the ll/sc code also starts using it and provides its own
atomic_add_unless().

Signed-off-by: Aoi Shinkai <shinkoi2005@gmail.com>
Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/atomic-llsc.h  | 27 +++++++++++++++++++++++++++
 arch/sh/include/asm/atomic.h       |  4 ++--
 arch/sh/include/asm/cmpxchg-llsc.h |  2 +-
 arch/sh/include/asm/spinlock.h     |  2 +-
 4 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/arch/sh/include/asm/atomic-llsc.h b/arch/sh/include/asm/atomic-llsc.h
index 4b00b78e3f4f3..b040e1e086108 100644
--- a/arch/sh/include/asm/atomic-llsc.h
+++ b/arch/sh/include/asm/atomic-llsc.h
@@ -104,4 +104,31 @@ static inline void atomic_set_mask(unsigned int mask, atomic_t *v)
 	: "t");
 }
 
+#define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+
+/**
+ * atomic_add_unless - add unless the number is a given value
+ * @v: pointer of type atomic_t
+ * @a: the amount to add to v...
+ * @u: ...unless v is equal to u.
+ *
+ * Atomically adds @a to @v, so long as it was not @u.
+ * Returns non-zero if @v was not @u, and zero otherwise.
+ */
+static inline int atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int c, old;
+	c = atomic_read(v);
+	for (;;) {
+		if (unlikely(c == (u)))
+			break;
+		old = atomic_cmpxchg((v), c, c + (a));
+		if (likely(old == c))
+			break;
+		c = old;
+	}
+
+	return c != (u);
+}
+
 #endif /* __ASM_SH_ATOMIC_LLSC_H */
diff --git a/arch/sh/include/asm/atomic.h b/arch/sh/include/asm/atomic.h
index 6327ffbb19928..978b58efb1e92 100644
--- a/arch/sh/include/asm/atomic.h
+++ b/arch/sh/include/asm/atomic.h
@@ -45,7 +45,7 @@
 #define atomic_inc(v) atomic_add(1,(v))
 #define atomic_dec(v) atomic_sub(1,(v))
 
-#ifndef CONFIG_GUSA_RB
+#if !defined(CONFIG_GUSA_RB) && !defined(CONFIG_CPU_SH4A)
 static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
 {
 	int ret;
@@ -73,7 +73,7 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 
 	return ret != u;
 }
-#endif
+#endif /* !CONFIG_GUSA_RB && !CONFIG_CPU_SH4A */
 
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
 #define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
diff --git a/arch/sh/include/asm/cmpxchg-llsc.h b/arch/sh/include/asm/cmpxchg-llsc.h
index 0fac3da536ca0..47136661a203a 100644
--- a/arch/sh/include/asm/cmpxchg-llsc.h
+++ b/arch/sh/include/asm/cmpxchg-llsc.h
@@ -55,7 +55,7 @@ __cmpxchg_u32(volatile int *m, unsigned long old, unsigned long new)
 		"mov		%0, %1				\n\t"
 		"cmp/eq		%1, %3				\n\t"
 		"bf		2f				\n\t"
-		"mov		%3, %0				\n\t"
+		"mov		%4, %0				\n\t"
 		"2:						\n\t"
 		"movco.l	%0, @%2				\n\t"
 		"bf		1b				\n\t"
diff --git a/arch/sh/include/asm/spinlock.h b/arch/sh/include/asm/spinlock.h
index 60283565f89b5..a28c9f0053fd4 100644
--- a/arch/sh/include/asm/spinlock.h
+++ b/arch/sh/include/asm/spinlock.h
@@ -26,7 +26,7 @@
 #define __raw_spin_is_locked(x)		((x)->lock <= 0)
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 #define __raw_spin_unlock_wait(x) \
-	do { cpu_relax(); } while ((x)->lock)
+	do { while (__raw_spin_is_locked(x)) cpu_relax(); } while (0)
 
 /*
  * Simple spin lock operations.  There are two variants, one clears IRQ's
-- 
GitLab


From 75c936aec050113c12f74581f51779ffc5502a51 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 11 Jun 2009 09:33:53 +0300
Subject: [PATCH 1958/2198] sh: Fix sys_pwritev() syscall table entry for sh32.

There was a typo here that had this as sys_writev() instead of
sys_pwritev(), fix this up. sh64 got this right, as did the preadv()
case.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/kernel/syscalls_32.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 05202edd8e212..6118f358f9500 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -350,4 +350,4 @@ ENTRY(sys_call_table)
 	.long sys_pipe2
 	.long sys_inotify_init1
 	.long sys_preadv
-	.long sys_writev
+	.long sys_pwritev
-- 
GitLab


From 6a1555fdde407dad23b8a119cf5feeb7c6466de9 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 11 Jun 2009 09:38:05 +0300
Subject: [PATCH 1959/2198] sh: Wire up sys_rt_tgsigqueueinfo.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/include/asm/unistd_32.h | 3 ++-
 arch/sh/include/asm/unistd_64.h | 3 ++-
 arch/sh/kernel/syscalls_32.S    | 1 +
 arch/sh/kernel/syscalls_64.S    | 1 +
 4 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h
index 2efb819e2db3e..65197086a1c53 100644
--- a/arch/sh/include/asm/unistd_32.h
+++ b/arch/sh/include/asm/unistd_32.h
@@ -343,8 +343,9 @@
 #define __NR_inotify_init1	332
 #define __NR_preadv		333
 #define __NR_pwritev		334
+#define __NR_rt_tgsigqueueinfo	335
 
-#define NR_syscalls 335
+#define NR_syscalls 336
 
 #ifdef __KERNEL__
 
diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h
index 6eb9d2934c0f0..8014aea88ec3c 100644
--- a/arch/sh/include/asm/unistd_64.h
+++ b/arch/sh/include/asm/unistd_64.h
@@ -383,10 +383,11 @@
 #define __NR_inotify_init1	360
 #define __NR_preadv		361
 #define __NR_pwritev		362
+#define __NR_rt_tgsigqueueinfo	363
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 363
+#define NR_syscalls 364
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S
index 6118f358f9500..a9fff9f731ec1 100644
--- a/arch/sh/kernel/syscalls_32.S
+++ b/arch/sh/kernel/syscalls_32.S
@@ -351,3 +351,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init1
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_rt_tgsigqueueinfo	/* 335 */
diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S
index a083609f92847..75c1889af1ed5 100644
--- a/arch/sh/kernel/syscalls_64.S
+++ b/arch/sh/kernel/syscalls_64.S
@@ -389,3 +389,4 @@ sys_call_table:
 	.long sys_inotify_init1		/* 360 */
 	.long sys_preadv
 	.long sys_pwritev
+	.long sys_rt_tgsigqueueinfo
-- 
GitLab


From 54ff328b46e58568c4b3350c2fa3223ef862e5a4 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 11 Jun 2009 10:33:09 +0300
Subject: [PATCH 1960/2198] sh: Tie sparseirq in to Kconfig.

Now that the dependent patches are merged, we are ready to enable
sparseirq support. This simply adds the Kconfig option, and then converts
from the _cpu to the _node allocation routines to follow the upstream
sparseirq API changes.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
---
 arch/sh/Kconfig              | 12 ++++++++++++
 arch/sh/kernel/cpu/irq/ipr.c |  7 ++-----
 drivers/sh/intc.c            | 10 ++++------
 3 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index b5629cdc37a01..586cd045e2db2 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -678,6 +678,18 @@ config GUSA_RB
 	  LLSC, this should be more efficient than the other alternative of
 	  disabling interrupts around the atomic sequence.
 
+config SPARSE_IRQ
+	bool "Support sparse irq numbering"
+	depends on EXPERIMENTAL
+	help
+	  This enables support for sparse irqs. This is useful in general
+	  as most CPUs have a fairly sparse array of IRQ vectors, which
+	  the irq_desc then maps directly on to. Systems with a high
+	  number of off-chip IRQs will want to treat this as
+	  experimental until they have been independently verified.
+
+	  If you don't know what to do here, say N.
+
 endmenu
 
 menu "Boot options"
diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c
index 6ad40dbad8812..808d99a48efb6 100644
--- a/arch/sh/kernel/cpu/irq/ipr.c
+++ b/arch/sh/kernel/cpu/irq/ipr.c
@@ -21,6 +21,7 @@
 #include <linux/module.h>
 #include <linux/io.h>
 #include <linux/interrupt.h>
+#include <linux/topology.h>
 
 static inline struct ipr_desc *get_ipr_desc(unsigned int irq)
 {
@@ -59,21 +60,17 @@ void register_ipr_controller(struct ipr_desc *desc)
 
 	for (i = 0; i < desc->nr_irqs; i++) {
 		struct ipr_data *p = desc->ipr_data + i;
-#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
-#endif
 
 		BUG_ON(p->ipr_idx >= desc->nr_offsets);
 		BUG_ON(!desc->ipr_offsets[p->ipr_idx]);
 
-#ifdef CONFIG_SPARSE_IRQ
-		irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id());
+		irq_desc = irq_to_desc_alloc_node(p->irq, numa_node_id());
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n",
 			       p->irq);
 			continue;
 		}
-#endif
 
 		disable_irq_nosync(p->irq);
 		set_irq_chip_and_handler_name(p->irq, &desc->chip,
diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c
index caf0656940424..d687a9b93d03b 100644
--- a/drivers/sh/intc.c
+++ b/drivers/sh/intc.c
@@ -24,6 +24,7 @@
 #include <linux/sh_intc.h>
 #include <linux/sysdev.h>
 #include <linux/list.h>
+#include <linux/topology.h>
 
 #define _INTC_MK(fn, mode, addr_e, addr_d, width, shift) \
 	((shift) | ((width) << 5) | ((fn) << 9) | ((mode) << 13) | \
@@ -671,7 +672,7 @@ unsigned int intc_evt2irq(unsigned int vector)
 
 void __init register_intc_controller(struct intc_desc *desc)
 {
-	unsigned int i, k, smp, cpu = smp_processor_id();
+	unsigned int i, k, smp;
 	struct intc_desc_int *d;
 
 	d = alloc_bootmem(sizeof(*d));
@@ -771,19 +772,16 @@ void __init register_intc_controller(struct intc_desc *desc)
 	for (i = 0; i < desc->nr_vectors; i++) {
 		struct intc_vect *vect = desc->vectors + i;
 		unsigned int irq = evt2irq(vect->vect);
-#ifdef CONFIG_SPARSE_IRQ
 		struct irq_desc *irq_desc;
-#endif
+
 		if (!vect->enum_id)
 			continue;
 
-#ifdef CONFIG_SPARSE_IRQ
-		irq_desc = irq_to_desc_alloc_cpu(irq, cpu);
+		irq_desc = irq_to_desc_alloc_node(irq, numa_node_id());
 		if (unlikely(!irq_desc)) {
 			printk(KERN_INFO "can not get irq_desc for %d\n", irq);
 			continue;
 		}
-#endif
 
 		intc_register_irq(desc, d, vect->enum_id, irq);
 	}
-- 
GitLab


From b0fd271d5fba0b2d00888363f3869e3f9b26caa9 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Thu, 11 Jun 2009 13:10:16 +0200
Subject: [PATCH 1961/2198] block: add request clone interface (v2)

This patch adds the following 2 interfaces for request-stacking drivers:

  - blk_rq_prep_clone(struct request *clone, struct request *orig,
		      struct bio_set *bs, gfp_t gfp_mask,
		      int (*bio_ctr)(struct bio *, struct bio*, void *),
		      void *data)
      * Clones bios in the original request to the clone request
        (bio_ctr is called for each cloned bios.)
      * Copies attributes of the original request to the clone request.
        The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not
        copied.

  - blk_rq_unprep_clone(struct request *clone)
      * Frees cloned bios from the clone request.

Request stacking drivers (e.g. request-based dm) need to make a clone
request for a submitted request and dispatch it to other devices.

To allocate request for the clone, request stacking drivers may not
be able to use blk_get_request() because the allocation may be done
in an irq-disabled context.
So blk_rq_prep_clone() takes a request allocated by the caller
as an argument.

For each clone bio in the clone request, request stacking drivers
should be able to set up their own completion handler.
So blk_rq_prep_clone() takes a callback function which is called
for each clone bio, and a pointer for private data which is passed
to the callback.

NOTE:
blk_rq_prep_clone() doesn't copy any actual data of the original
request.  Pages are shared between original bios and cloned bios.
So caller must not complete the original request before the clone
request.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Boaz Harrosh <bharrosh@panasas.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c       | 100 +++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |   5 +++
 2 files changed, 105 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index 03c5a64b6ccbd..02a9252107ab2 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2295,6 +2295,106 @@ int blk_lld_busy(struct request_queue *q)
 }
 EXPORT_SYMBOL_GPL(blk_lld_busy);
 
+/**
+ * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
+ * @rq: the clone request to be cleaned up
+ *
+ * Description:
+ *     Free all bios in @rq for a cloned request.
+ */
+void blk_rq_unprep_clone(struct request *rq)
+{
+	struct bio *bio;
+
+	while ((bio = rq->bio) != NULL) {
+		rq->bio = bio->bi_next;
+
+		bio_put(bio);
+	}
+}
+EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
+
+/*
+ * Copy attributes of the original request to the clone request.
+ * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
+ */
+static void __blk_rq_prep_clone(struct request *dst, struct request *src)
+{
+	dst->cpu = src->cpu;
+	dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
+	dst->cmd_type = src->cmd_type;
+	dst->__sector = blk_rq_pos(src);
+	dst->__data_len = blk_rq_bytes(src);
+	dst->nr_phys_segments = src->nr_phys_segments;
+	dst->ioprio = src->ioprio;
+	dst->extra_len = src->extra_len;
+}
+
+/**
+ * blk_rq_prep_clone - Helper function to setup clone request
+ * @rq: the request to be setup
+ * @rq_src: original request to be cloned
+ * @bs: bio_set that bios for clone are allocated from
+ * @gfp_mask: memory allocation mask for bio
+ * @bio_ctr: setup function to be called for each clone bio.
+ *           Returns %0 for success, non %0 for failure.
+ * @data: private data to be passed to @bio_ctr
+ *
+ * Description:
+ *     Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
+ *     The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
+ *     are not copied, and copying such parts is the caller's responsibility.
+ *     Also, pages which the original bios are pointing to are not copied
+ *     and the cloned bios just point same pages.
+ *     So cloned bios must be completed before original bios, which means
+ *     the caller must complete @rq before @rq_src.
+ */
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+		      struct bio_set *bs, gfp_t gfp_mask,
+		      int (*bio_ctr)(struct bio *, struct bio *, void *),
+		      void *data)
+{
+	struct bio *bio, *bio_src;
+
+	if (!bs)
+		bs = fs_bio_set;
+
+	blk_rq_init(NULL, rq);
+
+	__rq_for_each_bio(bio_src, rq_src) {
+		bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
+		if (!bio)
+			goto free_and_out;
+
+		__bio_clone(bio, bio_src);
+
+		if (bio_integrity(bio_src) &&
+		    bio_integrity_clone(bio, bio_src, gfp_mask))
+			goto free_and_out;
+
+		if (bio_ctr && bio_ctr(bio, bio_src, data))
+			goto free_and_out;
+
+		if (rq->bio) {
+			rq->biotail->bi_next = bio;
+			rq->biotail = bio;
+		} else
+			rq->bio = rq->biotail = bio;
+	}
+
+	__blk_rq_prep_clone(rq, rq_src);
+
+	return 0;
+
+free_and_out:
+	if (bio)
+		bio_free(bio, bs);
+	blk_rq_unprep_clone(rq);
+
+	return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
 {
 	return queue_work(kblockd_workqueue, work);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5e740a135e733..ebdfde8fe5561 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -765,6 +765,11 @@ extern void blk_insert_request(struct request_queue *, struct request *, int, vo
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
+extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+			     struct bio_set *bs, gfp_t gfp_mask,
+			     int (*bio_ctr)(struct bio *, struct bio *, void *),
+			     void *data);
+extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_plug_device(struct request_queue *);
-- 
GitLab


From 85d4198e40c289dd623cecd16601fa613559bed7 Mon Sep 17 00:00:00 2001
From: Yan Zheng <zheng.yan@oracle.com>
Date: Thu, 11 Jun 2009 08:51:10 -0400
Subject: [PATCH 1962/2198] Btrfs: check duplicate backrefs for both data and
 metadata

lookup_inline_extent_backref only checks for duplicate backref for data
extents. It assumes backrefs for tree block never conflict.

This patch makes lookup_inline_extent_backref check for duplicate backrefs
for both data and tree block, so that we can detect potential bug earlier.
This is a safety check, strictly speaking it is not required.

Signed-off-by: Yan Zheng <zheng.yan@oracle.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 33a65f2c8a378..edc7d208c5ce6 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1056,8 +1056,7 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 	want = extent_ref_type(parent, owner);
 	if (insert) {
 		extra_size = btrfs_extent_inline_ref_size(want);
-		if (owner >= BTRFS_FIRST_FREE_OBJECTID)
-			path->keep_locks = 1;
+		path->keep_locks = 1;
 	} else
 		extra_size = -1;
 	ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
@@ -1087,12 +1086,6 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 #endif
 	BUG_ON(item_size < sizeof(*ei));
 
-	if (owner < BTRFS_FIRST_FREE_OBJECTID && insert &&
-	    item_size + extra_size >= BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
-		err = -EAGAIN;
-		goto out;
-	}
-
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
 
@@ -1165,15 +1158,15 @@ int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
 		 * For simplicity, we just do not add new inline back
 		 * ref if there is any kind of item for this block
 		 */
-		if (owner >= BTRFS_FIRST_FREE_OBJECTID &&
-		    find_next_key(path, &key) == 0 && key.objectid == bytenr) {
+		if (find_next_key(path, &key) == 0 && key.objectid == bytenr &&
+		    key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
 			err = -EAGAIN;
 			goto out;
 		}
 	}
 	*ref_ret = (struct btrfs_extent_inline_ref *)ptr;
 out:
-	if (insert && owner >= BTRFS_FIRST_FREE_OBJECTID) {
+	if (insert) {
 		path->keep_locks = 0;
 		btrfs_unlock_up_safe(path, 1);
 	}
-- 
GitLab


From 067c28adc53807514ac0c6ebb6af3243cbd071fa Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Jun 2009 09:30:13 -0400
Subject: [PATCH 1963/2198] Btrfs: fix -o nodatasum printk spelling

It was printing nodatacsum, which was not the correct option name.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 3427db28f6fe9..708ac06b953be 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -159,7 +159,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			 */
 			break;
 		case Opt_nodatasum:
-			printk(KERN_INFO "btrfs: setting nodatacsum\n");
+			printk(KERN_INFO "btrfs: setting nodatasum\n");
 			btrfs_set_opt(info->mount_opt, NODATASUM);
 			break;
 		case Opt_nodatacow:
-- 
GitLab


From 9866b7e86a2ce4daa677be750e3ccbfc65d187f5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Thu, 11 Jun 2009 16:25:01 +0200
Subject: [PATCH 1964/2198] x86: memtest: use pointers of equal type for
 comparison

Commit c9690998ef48ffefeccb91c70a7739eebdea57f9 (x86: memtest: remove
64-bit division) introduced following compile warning:

arch/x86/mm/memtest.c: In function 'memtest':
arch/x86/mm/memtest.c:56: warning: comparison of distinct pointer types lacks a cast
arch/x86/mm/memtest.c:58: warning: comparison of distinct pointer types lacks a cast

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/memtest.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index c0bedcd10f973..18d244f702059 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -40,21 +40,20 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
 
 static void __init memtest(u64 pattern, u64 start_phys, u64 size)
 {
-	u64 *p;
-	void *start, *end;
+	u64 *p, *start, *end;
 	u64 start_bad, last_bad;
 	u64 start_phys_aligned;
-	size_t incr;
+	const size_t incr = sizeof(pattern);
 
-	incr = sizeof(pattern);
 	start_phys_aligned = ALIGN(start_phys, incr);
 	start = __va(start_phys_aligned);
-	end = start + size - (start_phys_aligned - start_phys);
+	end = start + (size - (start_phys_aligned - start_phys)) / incr;
 	start_bad = 0;
 	last_bad = 0;
 
 	for (p = start; p < end; p++)
 		*p = pattern;
+
 	for (p = start; p < end; p++, start_phys_aligned += incr) {
 		if (*p == pattern)
 			continue;
-- 
GitLab


From 4da52960fd1ae3ddd14901bc88b608cbeaa4b9a6 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 Jun 2009 14:54:01 +1000
Subject: [PATCH 1965/2198] perf_counters: powerpc: Add support for POWER7
 processors

This adds the back-end for the PMU on POWER7 processors.  POWER7
has 4 fully-programmable counters and two fixed-function counters
(which do respect the freeze conditions, can generate interrupts,
and are writable, unlike PMC5/6 on POWER5+/6).

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18992.36329.189378.17992@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/Makefile       |   3 +-
 arch/powerpc/kernel/perf_counter.c |   4 +
 arch/powerpc/kernel/power7-pmu.c   | 316 +++++++++++++++++++++++++++++
 3 files changed, 322 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/kernel/power7-pmu.c

diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 9ba1bb731fccd..a2c683403c2be 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -95,7 +95,8 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o
 obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \
-				   power5-pmu.o power5+-pmu.o power6-pmu.o
+				   power5-pmu.o power5+-pmu.o power6-pmu.o \
+				   power7-pmu.o
 
 obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o
 
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 4990ce2e5f087..5d12e68aac1c6 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -1181,6 +1181,7 @@ extern struct power_pmu ppc970_pmu;
 extern struct power_pmu power5_pmu;
 extern struct power_pmu power5p_pmu;
 extern struct power_pmu power6_pmu;
+extern struct power_pmu power7_pmu;
 
 static int init_perf_counters(void)
 {
@@ -1207,6 +1208,9 @@ static int init_perf_counters(void)
 	case 0x3e:
 		ppmu = &power6_pmu;
 		break;
+	case 0x3f:
+		ppmu = &power7_pmu;
+		break;
 	}
 
 	/*
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
new file mode 100644
index 0000000000000..dfac48d8ff451
--- /dev/null
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -0,0 +1,316 @@
+/*
+ * Performance counter support for POWER7 processors.
+ *
+ * Copyright 2009 Paul Mackerras, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/perf_counter.h>
+#include <asm/reg.h>
+
+/*
+ * Bits in event code for POWER7
+ */
+#define PM_PMC_SH	16	/* PMC number (1-based) for direct events */
+#define PM_PMC_MSK	0xf
+#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH)
+#define PM_UNIT_SH	12	/* TTMMUX number and setting - unit select */
+#define PM_UNIT_MSK	0xf
+#define PM_COMBINE_SH	11	/* Combined event bit */
+#define PM_COMBINE_MSK	1
+#define PM_COMBINE_MSKS	0x800
+#define PM_L2SEL_SH	8	/* L2 event select */
+#define PM_L2SEL_MSK	7
+#define PM_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCR1 for POWER7
+ */
+#define MMCR1_TTM0SEL_SH	60
+#define MMCR1_TTM1SEL_SH	56
+#define MMCR1_TTM2SEL_SH	52
+#define MMCR1_TTM3SEL_SH	48
+#define MMCR1_TTMSEL_MSK	0xf
+#define MMCR1_L2SEL_SH		45
+#define MMCR1_L2SEL_MSK		7
+#define MMCR1_PMC1_COMBINE_SH	35
+#define MMCR1_PMC2_COMBINE_SH	34
+#define MMCR1_PMC3_COMBINE_SH	33
+#define MMCR1_PMC4_COMBINE_SH	32
+#define MMCR1_PMC1SEL_SH	24
+#define MMCR1_PMC2SEL_SH	16
+#define MMCR1_PMC3SEL_SH	8
+#define MMCR1_PMC4SEL_SH	0
+#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8)
+#define MMCR1_PMCSEL_MSK	0xff
+
+/*
+ * Bits in MMCRA
+ */
+
+/*
+ * Layout of constraint bits:
+ * 6666555555555544444444443333333333222222222211111111110000000000
+ * 3210987654321098765432109876543210987654321098765432109876543210
+ *                                                 [  ><><><><><><>
+ *                                                  NC P6P5P4P3P2P1
+ *
+ * NC - number of counters
+ *     15: NC error 0x8000
+ *     12-14: number of events needing PMC1-4 0x7000
+ *
+ * P6
+ *     11: P6 error 0x800
+ *     10-11: Count of events needing PMC6
+ *
+ * P1..P5
+ *     0-9: Count of events needing PMC1..PMC5
+ */
+
+static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, sh;
+	u64 mask = 0, value = 0;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 6)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4))
+			return -1;
+	}
+	if (pmc < 5) {
+		/* need a counter from PMC1-4 set */
+		mask  |= 0x8000;
+		value |= 0x1000;
+	}
+	*maskp = mask;
+	*valp = value;
+	return 0;
+}
+
+#define MAX_ALT	2	/* at most 2 alternatives for any event */
+
+static const unsigned int event_alternatives[][MAX_ALT] = {
+	{ 0x200f2, 0x300f2 },		/* PM_INST_DISP */
+	{ 0x200f4, 0x600f4 },		/* PM_RUN_CYC */
+	{ 0x400fa, 0x500fa },		/* PM_RUN_INST_CMPL */
+};
+
+/*
+ * Scan the alternatives table for a match and return the
+ * index into the alternatives table if found, else -1.
+ */
+static int find_alternative(u64 event)
+{
+	int i, j;
+
+	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
+		if (event < event_alternatives[i][0])
+			break;
+		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j)
+			if (event == event_alternatives[i][j])
+				return i;
+	}
+	return -1;
+}
+
+static s64 find_alternative_decode(u64 event)
+{
+	int pmc, psel;
+
+	/* this only handles the 4x decode events */
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40)
+		return event - (1 << PM_PMC_SH) + 8;
+	if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48)
+		return event + (1 << PM_PMC_SH) - 8;
+	return -1;
+}
+
+static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[])
+{
+	int i, j, nalt = 1;
+	s64 ae;
+
+	alt[0] = event;
+	nalt = 1;
+	i = find_alternative(event);
+	if (i >= 0) {
+		for (j = 0; j < MAX_ALT; ++j) {
+			ae = event_alternatives[i][j];
+			if (ae && ae != event)
+				alt[nalt++] = ae;
+		}
+	} else {
+		ae = find_alternative_decode(event);
+		if (ae > 0)
+			alt[nalt++] = ae;
+	}
+
+	if (flags & PPMU_ONLY_COUNT_RUN) {
+		/*
+		 * We're only counting in RUN state,
+		 * so PM_CYC is equivalent to PM_RUN_CYC
+		 * and PM_INST_CMPL === PM_RUN_INST_CMPL.
+		 * This doesn't include alternatives that don't provide
+		 * any extra flexibility in assigning PMCs.
+		 */
+		j = nalt;
+		for (i = 0; i < nalt; ++i) {
+			switch (alt[i]) {
+			case 0x1e:	/* PM_CYC */
+				alt[j++] = 0x600f4;	/* PM_RUN_CYC */
+				break;
+			case 0x600f4:	/* PM_RUN_CYC */
+				alt[j++] = 0x1e;
+				break;
+			case 0x2:	/* PM_PPC_CMPL */
+				alt[j++] = 0x500fa;	/* PM_RUN_INST_CMPL */
+				break;
+			case 0x500fa:	/* PM_RUN_INST_CMPL */
+				alt[j++] = 0x2;	/* PM_PPC_CMPL */
+				break;
+			}
+		}
+		nalt = j;
+	}
+
+	return nalt;
+}
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int power7_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int unit;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	psel = event & PM_PMCSEL_MSK & ~1;	/* trim off edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	switch (psel >> 4) {
+	case 2:
+		return pmc == 2 || pmc == 4;
+	case 3:
+		if (psel == 0x3c)
+			return pmc == 1;
+		if (psel == 0x3e)
+			return pmc != 2;
+		return 1;
+	case 4:
+	case 5:
+		return unit == 0xd;
+	case 6:
+		if (psel == 0x64)
+			return pmc >= 3;
+	case 8:
+		return unit == 0xd;
+	}
+	return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, combine, l2sel, psel;
+	unsigned int pmc_inuse = 0;
+	int i;
+
+	/* First pass to count resource use */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+
+	/* Second pass: assign PMCs, set all MMCR1 fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+		l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct or decoded event */
+			--pmc;
+		}
+		if (pmc <= 3) {
+			mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+			mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+			if (unit == 6)	/* L2 events */
+				mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+		}
+		if (power7_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+	[PERF_COUNT_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_INSTRUCTIONS] = 2,
+	[PERF_COUNT_CACHE_REFERENCES] = 0xc880,		/* LD_REF_L1_LSU */
+	[PERF_COUNT_CACHE_MISSES] = 0x400f0,		/* LD_MISS_L1 */
+	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x10068,	/* BRU_FIN */
+	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,		/* BR_MPRED */
+};
+
+struct power_pmu power7_pmu = {
+	.n_counter = 6,
+	.max_alternatives = MAX_ALT + 1,
+	.add_fields = 0x1555ull,
+	.test_adder = 0x3000ull,
+	.compute_mmcr = power7_compute_mmcr,
+	.get_constraint = power7_get_constraint,
+	.get_alternatives = power7_get_alternatives,
+	.disable_pmc = power7_disable_pmc,
+	.n_generic = ARRAY_SIZE(power7_generic_events),
+	.generic_events = power7_generic_events,
+};
-- 
GitLab


From 106b506c3a8b74daa5751e83ed3e46438fcf9a52 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 11 Jun 2009 14:55:42 +1000
Subject: [PATCH 1966/2198] perf_counter: powerpc: Implement generalized cache
 events for POWER processors

This adds tables of event codes for the generalized cache events for
all the currently supported powerpc processors: POWER{4,5,5+,6,7} and
PPC970*, plus powerpc-specific code to use these tables when a
generalized cache event is requested.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <18992.36430.933526.742969@drongo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/include/asm/perf_counter.h |  3 ++
 arch/powerpc/kernel/perf_counter.c      | 42 ++++++++++++++++++++--
 arch/powerpc/kernel/power4-pmu.c        | 41 ++++++++++++++++++++++
 arch/powerpc/kernel/power5+-pmu.c       | 45 ++++++++++++++++++++++--
 arch/powerpc/kernel/power5-pmu.c        | 41 ++++++++++++++++++++++
 arch/powerpc/kernel/power6-pmu.c        | 46 +++++++++++++++++++++++--
 arch/powerpc/kernel/power7-pmu.c        | 41 ++++++++++++++++++++++
 arch/powerpc/kernel/ppc970-pmu.c        | 41 ++++++++++++++++++++++
 8 files changed, 294 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h
index 1c60f0ca79201..cc7c887705b87 100644
--- a/arch/powerpc/include/asm/perf_counter.h
+++ b/arch/powerpc/include/asm/perf_counter.h
@@ -33,6 +33,9 @@ struct power_pmu {
 	u32	flags;
 	int	n_generic;
 	int	*generic_events;
+	int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX]
+			       [PERF_COUNT_HW_CACHE_OP_MAX]
+			       [PERF_COUNT_HW_CACHE_RESULT_MAX];
 };
 
 extern struct power_pmu *ppmu;
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index 5d12e68aac1c6..bb202388170e7 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -856,6 +856,36 @@ static void hw_perf_counter_destroy(struct perf_counter *counter)
 	}
 }
 
+/*
+ * Translate a generic cache event config to a raw event code.
+ */
+static int hw_perf_cache_event(u64 config, u64 *eventp)
+{
+	unsigned long type, op, result;
+	int ev;
+
+	if (!ppmu->cache_events)
+		return -EINVAL;
+
+	/* unpack config */
+	type = config & 0xff;
+	op = (config >> 8) & 0xff;
+	result = (config >> 16) & 0xff;
+
+	if (type >= PERF_COUNT_HW_CACHE_MAX ||
+	    op >= PERF_COUNT_HW_CACHE_OP_MAX ||
+	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX)
+		return -EINVAL;
+
+	ev = (*ppmu->cache_events)[type][op][result];
+	if (ev == 0)
+		return -EOPNOTSUPP;
+	if (ev == -1)
+		return -EINVAL;
+	*eventp = ev;
+	return 0;
+}
+
 const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 {
 	u64 ev;
@@ -868,13 +898,21 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter)
 
 	if (!ppmu)
 		return ERR_PTR(-ENXIO);
-	if (counter->attr.type != PERF_TYPE_RAW) {
+	switch (counter->attr.type) {
+	case PERF_TYPE_HARDWARE:
 		ev = counter->attr.config;
 		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0)
 			return ERR_PTR(-EOPNOTSUPP);
 		ev = ppmu->generic_events[ev];
-	} else {
+		break;
+	case PERF_TYPE_HW_CACHE:
+		err = hw_perf_cache_event(counter->attr.config, &ev);
+		if (err)
+			return ERR_PTR(err);
+		break;
+	case PERF_TYPE_RAW:
 		ev = counter->attr.config;
+		break;
 	}
 	counter->hw.config_base = ev;
 	counter->hw.idx = 0;
diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 836fa118eb1e6..0e94b6857220d 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -543,6 +543,46 @@ static int p4_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8c10,		0x3c10	},
+		[C(OP_WRITE)] = {	0x7c10,		0xc13	},
+		[C(OP_PREFETCH)] = {	0xc35,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0xc34,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x904	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x900	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x330,		0x331	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power4_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 5,
@@ -554,4 +594,5 @@ struct power_pmu power4_pmu = {
 	.disable_pmc = p4_disable_pmc,
 	.n_generic = ARRAY_SIZE(p4_generic_events),
 	.generic_events = p4_generic_events,
+	.cache_events = &power4_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 8471e3c2e4658..bbf2cbb07388c 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -614,6 +614,46 @@ static int power5p_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x1c10a8,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x2c10a8,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	-1		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0xc20e4,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power5p_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -623,8 +663,9 @@ struct power_pmu power5p_pmu = {
 	.get_constraint = power5p_get_constraint,
 	.get_alternatives = power5p_get_alternatives,
 	.disable_pmc = power5p_disable_pmc,
+	.limited_pmc_event = power5p_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6,
 	.n_generic = ARRAY_SIZE(power5p_generic_events),
 	.generic_events = power5p_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6,
-	.limited_pmc_event = power5p_limited_pmc_event,
+	.cache_events = &power5p_cache_events,
 };
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 1b44c5fca1898..670cf10b91e8a 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -556,6 +556,46 @@ static int power5_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x4c1090,	0x3c1088	},
+		[C(OP_WRITE)] = {	0x3c1090,	0xc10c3		},
+		[C(OP_PREFETCH)] = {	0xc70e7,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0,		0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x3c309b	},
+		[C(OP_WRITE)] = {	0,		0		},
+		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x2c4090,	0x800c4		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x800c0		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x230e4,	0x230e5		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power5_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -567,4 +607,5 @@ struct power_pmu power5_pmu = {
 	.disable_pmc = power5_disable_pmc,
 	.n_generic = ARRAY_SIZE(power5_generic_events),
 	.generic_events = power5_generic_events,
+	.cache_events = &power5_cache_events,
 };
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index cd4fbe06c35d4..4da707866097e 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -474,6 +474,47 @@ static int power6_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ * The "DTLB" and "ITLB" events relate to the DERAT and IERAT.
+ */
+static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x80082,	0x80080		},
+		[C(OP_WRITE)] = {	0x80086,	0x80088		},
+		[C(OP_PREFETCH)] = {	0x810a4,	0		},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x100056 	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	0x4008c,	0		},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x150730,	0x250532	},
+		[C(OP_WRITE)] = {	0x250432,	0x150432	},
+		[C(OP_PREFETCH)] = {	0x810a6,	0		},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x20000e	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x420ce		},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x430e6,	0x400052	},
+		[C(OP_WRITE)] = {	-1,		-1		},
+		[C(OP_PREFETCH)] = {	-1,		-1		},
+	},
+};
+
 struct power_pmu power6_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT,
@@ -483,8 +524,9 @@ struct power_pmu power6_pmu = {
 	.get_constraint = p6_get_constraint,
 	.get_alternatives = p6_get_alternatives,
 	.disable_pmc = p6_disable_pmc,
+	.limited_pmc_event = p6_limited_pmc_event,
+	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
 	.n_generic = ARRAY_SIZE(power6_generic_events),
 	.generic_events = power6_generic_events,
-	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR,
-	.limited_pmc_event = p6_limited_pmc_event,
+	.cache_events = &power6_cache_events,
 };
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index dfac48d8ff451..060e0deb399e6 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -302,6 +302,46 @@ static int power7_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x400f6,		/* BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x400f0,	0xc880	},
+		[C(OP_WRITE)] = {	0,		0x300f0	},
+		[C(OP_PREFETCH)] = {	0xd8b8,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x200fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0x408a,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x6080,		0x6084	},
+		[C(OP_WRITE)] = {	0x6082,		0x6086	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x300fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x400fc	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x10068,	0x400f6	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu power7_pmu = {
 	.n_counter = 6,
 	.max_alternatives = MAX_ALT + 1,
@@ -313,4 +353,5 @@ struct power_pmu power7_pmu = {
 	.disable_pmc = power7_disable_pmc,
 	.n_generic = ARRAY_SIZE(power7_generic_events),
 	.generic_events = power7_generic_events,
+	.cache_events = &power7_cache_events,
 };
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index eed47c4523f17..336adf1736af8 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -427,6 +427,46 @@ static int ppc970_generic_events[] = {
 	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
 };
 
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
+ */
+static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
+	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x8810,		0x3810	},
+		[C(OP_WRITE)] = {	0x7810,		0x813	},
+		[C(OP_PREFETCH)] = {	0x731,		0	},
+	},
+	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	0,		0	},
+	},
+	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0	},
+		[C(OP_WRITE)] = {	0,		0	},
+		[C(OP_PREFETCH)] = {	0x733,		0	},
+	},
+	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x704	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0,		0x700	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+		[C(OP_READ)] = {	0x431,		0x327	},
+		[C(OP_WRITE)] = {	-1,		-1	},
+		[C(OP_PREFETCH)] = {	-1,		-1	},
+	},
+};
+
 struct power_pmu ppc970_pmu = {
 	.n_counter = 8,
 	.max_alternatives = 2,
@@ -438,4 +478,5 @@ struct power_pmu ppc970_pmu = {
 	.disable_pmc = p970_disable_pmc,
 	.n_generic = ARRAY_SIZE(ppc970_generic_events),
 	.generic_events = ppc970_generic_events,
+	.cache_events = &ppc970_cache_events,
 };
-- 
GitLab


From 0764771dab80d7b84b9a271bee7f1b21a04a3f0c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 11:18:36 +0200
Subject: [PATCH 1967/2198] perf_counter: More paranoia settings

Rename the perf_counter_priv knob to perf_counter_paranoia (because
priv can be read as private, as opposed to privileged) and provide
one more level:

 0 - permissive
 1 - restrict cpu counters to privilidged contexts
 2 - restrict kernel-mode code counting and profiling

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 +-
 kernel/perf_counter.c        | 25 +++++++++++++++++++++++--
 kernel/sysctl.c              |  6 +++---
 3 files changed, 27 insertions(+), 6 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5b966472b4580..386be915baa1b 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -648,7 +648,7 @@ struct perf_callchain_entry {
 
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
-extern int sysctl_perf_counter_priv;
+extern int sysctl_perf_counter_paranoid;
 extern int sysctl_perf_counter_mlock;
 extern int sysctl_perf_counter_limit;
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8b89b40bd0f04..63f1987c1c1cc 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -43,7 +43,23 @@ static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
 
-int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
+/*
+ * 0 - not paranoid
+ * 1 - disallow cpu counters to unpriv
+ * 2 - disallow kernel profiling to unpriv
+ */
+int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */
+
+static inline bool perf_paranoid_cpu(void)
+{
+	return sysctl_perf_counter_paranoid > 0;
+}
+
+static inline bool perf_paranoid_kernel(void)
+{
+	return sysctl_perf_counter_paranoid > 1;
+}
+
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
 
@@ -1385,7 +1401,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 	 */
 	if (cpu != -1) {
 		/* Must be root to operate on a CPU counter: */
-		if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
 		if (cpu < 0 || cpu > num_possible_cpus())
@@ -3618,6 +3634,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
 		return -EFAULT;
 
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
 	/*
 	 * Get the target context (task or percpu):
 	 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 0c4bf863afa32..344a65981dee4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -916,9 +916,9 @@ static struct ctl_table kern_table[] = {
 #ifdef CONFIG_PERF_COUNTERS
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_privileged",
-		.data		= &sysctl_perf_counter_priv,
-		.maxlen		= sizeof(sysctl_perf_counter_priv),
+		.procname	= "perf_counter_paranoid",
+		.data		= &sysctl_perf_counter_paranoid,
+		.maxlen		= sizeof(sysctl_perf_counter_paranoid),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-- 
GitLab


From df58ab24bf26b166874bfb18b3b5a2e0a8e63179 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 11:25:05 +0200
Subject: [PATCH 1968/2198] perf_counter: Rename perf_counter_limit sysctl

Rename perf_counter_limit to perf_counter_max_sample_rate and
prohibit creation of counters with a known higher sample
frequency.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 +-
 kernel/perf_counter.c        | 27 +++++++++++++++++++--------
 kernel/sysctl.c              |  6 +++---
 3 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 386be915baa1b..95c797c480e8d 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -650,7 +650,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
 
 extern int sysctl_perf_counter_paranoid;
 extern int sysctl_perf_counter_mlock;
-extern int sysctl_perf_counter_limit;
+extern int sysctl_perf_counter_sample_rate;
 
 extern void perf_counter_init(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 63f1987c1c1cc..3b2829de5590e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -44,11 +44,12 @@ static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
 
 /*
- * 0 - not paranoid
- * 1 - disallow cpu counters to unpriv
- * 2 - disallow kernel profiling to unpriv
+ * perf counter paranoia level:
+ *  0 - not paranoid
+ *  1 - disallow cpu counters to unpriv
+ *  2 - disallow kernel profiling to unpriv
  */
-int sysctl_perf_counter_paranoid __read_mostly; /* do we need to be privileged */
+int sysctl_perf_counter_paranoid __read_mostly;
 
 static inline bool perf_paranoid_cpu(void)
 {
@@ -61,7 +62,11 @@ static inline bool perf_paranoid_kernel(void)
 }
 
 int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
-int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */
+
+/*
+ * max perf counter sample rate
+ */
+int sysctl_perf_counter_sample_rate __read_mostly = 100000;
 
 static atomic64_t perf_counter_id;
 
@@ -1244,7 +1249,7 @@ static void perf_ctx_adjust_freq(struct perf_counter_context *ctx)
 		if (interrupts == MAX_INTERRUPTS) {
 			perf_log_throttle(counter, 1);
 			counter->pmu->unthrottle(counter);
-			interrupts = 2*sysctl_perf_counter_limit/HZ;
+			interrupts = 2*sysctl_perf_counter_sample_rate/HZ;
 		}
 
 		if (!counter->attr.freq || !counter->attr.sample_freq)
@@ -1682,7 +1687,7 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 	spin_lock_irq(&ctx->lock);
 	if (counter->attr.freq) {
-		if (value > sysctl_perf_counter_limit) {
+		if (value > sysctl_perf_counter_sample_rate) {
 			ret = -EINVAL;
 			goto unlock;
 		}
@@ -2979,7 +2984,8 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
 	} else {
 		if (hwc->interrupts != MAX_INTERRUPTS) {
 			hwc->interrupts++;
-			if (HZ * hwc->interrupts > (u64)sysctl_perf_counter_limit) {
+			if (HZ * hwc->interrupts >
+					(u64)sysctl_perf_counter_sample_rate) {
 				hwc->interrupts = MAX_INTERRUPTS;
 				perf_log_throttle(counter, 0);
 				ret = 1;
@@ -3639,6 +3645,11 @@ SYSCALL_DEFINE5(perf_counter_open,
 			return -EACCES;
 	}
 
+	if (attr.freq) {
+		if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+			return -EINVAL;
+	}
+
 	/*
 	 * Get the target context (task or percpu):
 	 */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 344a65981dee4..9fd4e436b6961 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -932,9 +932,9 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
-		.procname	= "perf_counter_int_limit",
-		.data		= &sysctl_perf_counter_limit,
-		.maxlen		= sizeof(sysctl_perf_counter_limit),
+		.procname	= "perf_counter_max_sample_rate",
+		.data		= &sysctl_perf_counter_sample_rate,
+		.maxlen		= sizeof(sysctl_perf_counter_sample_rate),
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
-- 
GitLab


From 729ff5e2aaf181f5d3ab849337fce406cd19b1d9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 14:16:15 +0200
Subject: [PATCH 1969/2198] perf_counter tools: Clean up u64 usage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A build error slipped in:

 builtin-report.c: In function ‘hist_entry__fprintf’:
 builtin-report.c:711: error: format ‘%12d’ expects type ‘int’, but argument 3 has type ‘uint64_t’

Because we got a bit sloppy with those types. uint64_t really sucks,
because there's no printf format for it. So standardize on __u64
instead - for all types that go to or come from the ABI (which is __u64),
or for values that need to be large enough even on 32-bit.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-annotate.c | 32 +++++++++++++++----------------
 tools/perf/builtin-record.c   |  4 ++--
 tools/perf/builtin-report.c   | 36 +++++++++++++++++------------------
 tools/perf/builtin-top.c      |  8 ++++----
 tools/perf/util/symbol.c      | 10 +++++-----
 tools/perf/util/symbol.h      |  4 ++--
 6 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 3334a8bb1d513..b1ed5f766cb3f 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -145,7 +145,7 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
 {
 	return dso__find_symbol(kernel_dso, ip);
 }
@@ -178,19 +178,19 @@ static int load_kernel(void)
 
 struct map {
 	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 pgoff;
-	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	__u64	 start;
+	__u64	 end;
+	__u64	 pgoff;
+	__u64	 (*map_ip)(struct map *, __u64);
 	struct dso	 *dso;
 };
 
-static uint64_t map__map_ip(struct map *map, uint64_t ip)
+static __u64 map__map_ip(struct map *map, __u64 ip)
 {
 	return ip - map->start + map->pgoff;
 }
 
-static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
 {
 	return ip;
 }
@@ -249,7 +249,7 @@ static int map__overlap(struct map *l, struct map *r)
 
 static size_t map__fprintf(struct map *self, FILE *fp)
 {
-	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+	return fprintf(fp, " %Lx-%Lx %Lx %s\n",
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
@@ -373,7 +373,7 @@ static int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-static struct map *thread__find_map(struct thread *self, uint64_t ip)
+static struct map *thread__find_map(struct thread *self, __u64 ip)
 {
 	struct map *pos;
 
@@ -414,7 +414,7 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
-	uint64_t	 ip;
+	__u64	 ip;
 	char		 level;
 
 	uint32_t	 count;
@@ -533,7 +533,7 @@ static struct sort_entry sort_dso = {
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	uint64_t ip_l, ip_r;
+	__u64 ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -647,7 +647,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 /*
  * collect histogram counts
  */
-static void hist_hit(struct hist_entry *he, uint64_t ip)
+static void hist_hit(struct hist_entry *he, __u64 ip)
 {
 	unsigned int sym_size, offset;
 	struct symbol *sym = he->sym;
@@ -676,7 +676,7 @@ static void hist_hit(struct hist_entry *he, uint64_t ip)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level)
+		struct symbol *sym, __u64 ip, char level)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -848,7 +848,7 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	int show = 0;
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
-	uint64_t ip = event->ip.ip;
+	__u64 ip = event->ip.ip;
 	struct map *map = NULL;
 
 	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n",
@@ -1031,7 +1031,7 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 }
 
 static int
-parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
+parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len)
 {
 	char *line = NULL, *tmp, *tmp2;
 	unsigned int offset;
@@ -1112,7 +1112,7 @@ parse_line(FILE *file, struct symbol *sym, uint64_t start, uint64_t len)
 static void annotate_sym(struct dso *dso, struct symbol *sym)
 {
 	char *filename = dso->name;
-	uint64_t start, end, len;
+	__u64 start, end, len;
 	char command[PATH_MAX*2];
 	FILE *file;
 
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 919f23ca41996..84cd336ae79bc 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -223,7 +223,7 @@ static void pid_synthesize_comm_event(pid_t pid, int full)
 
 	comm_ev.pid = pid;
 	comm_ev.header.type = PERF_EVENT_COMM;
-	size = ALIGN(size, sizeof(uint64_t));
+	size = ALIGN(size, sizeof(__u64));
 	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size);
 
 	if (!full) {
@@ -304,7 +304,7 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 			size = strlen(execname);
 			execname[size - 1] = '\0'; /* Remove \n */
 			memcpy(mmap_ev.filename, execname, size);
-			size = ALIGN(size, sizeof(uint64_t));
+			size = ALIGN(size, sizeof(__u64));
 			mmap_ev.len -= mmap_ev.start;
 			mmap_ev.header.size = (sizeof(mmap_ev) -
 					       (sizeof(mmap_ev.filename) - size));
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f57fd5c5531ad..82fa93b4db999 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -146,7 +146,7 @@ static void dsos__fprintf(FILE *fp)
 		dso__fprintf(pos, fp);
 }
 
-static struct symbol *vdso__find_symbol(struct dso *dso, uint64_t ip)
+static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip)
 {
 	return dso__find_symbol(kernel_dso, ip);
 }
@@ -193,19 +193,19 @@ static int strcommon(const char *pathname)
 
 struct map {
 	struct list_head node;
-	uint64_t	 start;
-	uint64_t	 end;
-	uint64_t	 pgoff;
-	uint64_t	 (*map_ip)(struct map *, uint64_t);
+	__u64	 start;
+	__u64	 end;
+	__u64	 pgoff;
+	__u64	 (*map_ip)(struct map *, __u64);
 	struct dso	 *dso;
 };
 
-static uint64_t map__map_ip(struct map *map, uint64_t ip)
+static __u64 map__map_ip(struct map *map, __u64 ip)
 {
 	return ip - map->start + map->pgoff;
 }
 
-static uint64_t vdso__map_ip(struct map *map, uint64_t ip)
+static __u64 vdso__map_ip(struct map *map, __u64 ip)
 {
 	return ip;
 }
@@ -288,7 +288,7 @@ static int map__overlap(struct map *l, struct map *r)
 
 static size_t map__fprintf(struct map *self, FILE *fp)
 {
-	return fprintf(fp, " %"PRIx64"-%"PRIx64" %"PRIx64" %s\n",
+	return fprintf(fp, " %Lx-%Lx %Lx %s\n",
 		       self->start, self->end, self->pgoff, self->dso->name);
 }
 
@@ -412,7 +412,7 @@ static int thread__fork(struct thread *self, struct thread *parent)
 	return 0;
 }
 
-static struct map *thread__find_map(struct thread *self, uint64_t ip)
+static struct map *thread__find_map(struct thread *self, __u64 ip)
 {
 	struct map *pos;
 
@@ -453,10 +453,10 @@ struct hist_entry {
 	struct map	 *map;
 	struct dso	 *dso;
 	struct symbol	 *sym;
-	uint64_t	 ip;
+	__u64		 ip;
 	char		 level;
 
-	uint64_t	 count;
+	__u64		 count;
 };
 
 /*
@@ -572,7 +572,7 @@ static struct sort_entry sort_dso = {
 static int64_t
 sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 {
-	uint64_t ip_l, ip_r;
+	__u64 ip_l, ip_r;
 
 	if (left->sym == right->sym)
 		return 0;
@@ -684,7 +684,7 @@ hist_entry__collapse(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
+hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples)
 {
 	struct sort_entry *se;
 	size_t ret;
@@ -708,7 +708,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 		ret = color_fprintf(fp, color, "   %6.2f%%",
 				(self->count * 100.0) / total_samples);
 	} else
-		ret = fprintf(fp, "%12d ", self->count);
+		ret = fprintf(fp, "%12Ld ", self->count);
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		fprintf(fp, "  ");
@@ -726,7 +726,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, uint64_t total_samples)
 
 static int
 hist_entry__add(struct thread *thread, struct map *map, struct dso *dso,
-		struct symbol *sym, uint64_t ip, char level, uint64_t count)
+		struct symbol *sym, __u64 ip, char level, __u64 count)
 {
 	struct rb_node **p = &hist.rb_node;
 	struct rb_node *parent = NULL;
@@ -873,7 +873,7 @@ static void output__resort(void)
 	}
 }
 
-static size_t output__fprintf(FILE *fp, uint64_t total_samples)
+static size_t output__fprintf(FILE *fp, __u64 total_samples)
 {
 	struct hist_entry *pos;
 	struct sort_entry *se;
@@ -941,8 +941,8 @@ process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
 	int show = 0;
 	struct dso *dso = NULL;
 	struct thread *thread = threads__findnew(event->ip.pid);
-	uint64_t ip = event->ip.ip;
-	uint64_t period = 1;
+	__u64 ip = event->ip.ip;
+	__u64 period = 1;
 	struct map *map = NULL;
 
 	if (event->header.type & PERF_SAMPLE_PERIOD)
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 8ba24808a3921..309dbc76ec88c 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -79,8 +79,8 @@ static int			dump_symtab;
  * Symbols
  */
 
-static uint64_t			min_ip;
-static uint64_t			max_ip = -1ll;
+static __u64			min_ip;
+static __u64			max_ip = -1ll;
 
 struct sym_entry {
 	struct rb_node		rb_node;
@@ -372,7 +372,7 @@ static int parse_symbols(void)
 /*
  * Binary search in the histogram table and record the hit:
  */
-static void record_ip(uint64_t ip, int counter)
+static void record_ip(__u64 ip, int counter)
 {
 	struct symbol *sym = dso__find_symbol(kernel_dso, ip);
 
@@ -392,7 +392,7 @@ static void record_ip(uint64_t ip, int counter)
 	samples--;
 }
 
-static void process_event(uint64_t ip, int counter)
+static void process_event(__u64 ip, int counter)
 {
 	samples++;
 
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 32dd47d60d9ca..49a55f8137121 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -9,9 +9,9 @@
 
 const char *sym_hist_filter;
 
-static struct symbol *symbol__new(uint64_t start, uint64_t len,
+static struct symbol *symbol__new(__u64 start, __u64 len,
 				  const char *name, unsigned int priv_size,
-				  uint64_t obj_start, int verbose)
+				  __u64 obj_start, int verbose)
 {
 	size_t namelen = strlen(name) + 1;
 	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen);
@@ -89,7 +89,7 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 {
 	struct rb_node **p = &self->syms.rb_node;
 	struct rb_node *parent = NULL;
-	const uint64_t ip = sym->start;
+	const __u64 ip = sym->start;
 	struct symbol *s;
 
 	while (*p != NULL) {
@@ -104,7 +104,7 @@ static void dso__insert_symbol(struct dso *self, struct symbol *sym)
 	rb_insert_color(&sym->rb_node, &self->syms);
 }
 
-struct symbol *dso__find_symbol(struct dso *self, uint64_t ip)
+struct symbol *dso__find_symbol(struct dso *self, __u64 ip)
 {
 	struct rb_node *n;
 
@@ -523,7 +523,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
-		uint64_t obj_start;
+		__u64 obj_start;
 
 		if (!elf_sym__is_function(&sym))
 			continue;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 4839d68f14f08..0d1292bd82706 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -19,7 +19,7 @@ struct dso {
 	struct list_head node;
 	struct rb_root	 syms;
 	unsigned int	 sym_priv_size;
-	struct symbol    *(*find_symbol)(struct dso *, uint64_t ip);
+	struct symbol    *(*find_symbol)(struct dso *, __u64 ip);
 	char		 name[0];
 };
 
@@ -35,7 +35,7 @@ static inline void *dso__sym_priv(struct dso *self, struct symbol *sym)
 	return ((void *)sym) - self->sym_priv_size;
 }
 
-struct symbol *dso__find_symbol(struct dso *self, uint64_t ip);
+struct symbol *dso__find_symbol(struct dso *self, __u64 ip);
 
 int dso__load_kernel(struct dso *self, const char *vmlinux,
 		     symbol_filter_t filter, int verbose);
-- 
GitLab


From 0b4dcea579a1b6f4d249d61f5bc8adeaa7c895d8 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Jun 2009 11:13:35 -0400
Subject: [PATCH 1970/2198] Btrfs: fix oops when btrfs_inherit_iflags called
 with a NULL dir

This happens during subvol creation.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ioctl.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 926332a73cdef..eff18f5b53620 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -112,7 +112,12 @@ void btrfs_update_iflags(struct inode *inode)
  */
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 {
-	unsigned int flags = BTRFS_I(dir)->flags;
+	unsigned int flags;
+
+	if (!dir)
+		return;
+
+	flags = BTRFS_I(dir)->flags;
 
 	if (S_ISREG(inode->i_mode))
 		flags &= ~BTRFS_INODE_DIRSYNC;
-- 
GitLab


From b263c2c8bf13c273485bd99dbbeba79c844409dd Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 11 Jun 2009 11:24:47 -0400
Subject: [PATCH 1971/2198] Btrfs: fix extent_buffer leak during tree log
 replay

During tree log replay, we read in the tree log roots,
process them and then free them.  A recent change
takes an extra reference on the root node of the tree
when the root is read in, and stores that reference
in root->commit_root.

This reference was not being freed, leaving us with
one buffer pinned in ram for each subvol with
a tree log root after a crash.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 2b41fc08c34aa..c13922206d1be 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -3019,6 +3019,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
 		key.offset = found_key.offset - 1;
 		wc.replay_dest->log_root = NULL;
 		free_extent_buffer(log->node);
+		free_extent_buffer(log->commit_root);
 		kfree(log);
 
 		if (found_key.offset == 0)
-- 
GitLab


From ace08c3c4403140e5ce82116c8f2acb38f58f61d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:20:38 +0100
Subject: [PATCH 1972/2198] tty:cyclades, load firmware even on Ze

Ze needs firmware to be loaded as well as Zo. Move cyz_load_fw one
level upper to achieve that.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 1fdb9f657d8f8..bde24583bcfcb 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -5043,6 +5043,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			nchan = ZE_V1_NPORTS;
 		} else {
 			card_name = "Cyclades-8Zo";
+			nchan = 8;
 
 #ifdef CY_PCI_DEBUG
 			if (mailbox == ZO_V1) {
@@ -5065,15 +5066,11 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			 */
 			if ((mailbox == ZO_V1) || (mailbox == ZO_V2))
 				cy_writel(addr2 + ID_ADDRESS, 0L);
-
-			retval = cyz_load_fw(pdev, addr2, addr0, irq);
-			if (retval)
-				goto err_unmap;
-			/* This must be a Cyclades-8Zo/PCI.  The extendable
-			   version will have a different device_id and will
-			   be allocated its maximum number of ports. */
-			nchan = 8;
 		}
+
+		retval = cyz_load_fw(pdev, addr2, addr0, irq);
+		if (retval)
+			goto err_unmap;
 	}
 
 	if ((cy_next_channel + nchan) > NR_PORTS) {
-- 
GitLab


From eca1b592d600bb2b7d24b66e4106de1e751ae510 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:21:30 +0100
Subject: [PATCH 1973/2198] tty: cyclades, don't kill FW

Don't reset the PLX chip after FW load, which effectively kills
the FW, so that user had to boot manually.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index bde24583bcfcb..6d524391e9948 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -4932,8 +4932,6 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
 			0x00030800UL);
 
-	plx_init(pdev, irq, ctl_addr);
-
 	return 0;
 err_rel:
 	release_firmware(fw);
-- 
GitLab


From 65a29f60e121ae5116ac1736b50a237bf2db3225 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:22:27 +0100
Subject: [PATCH 1974/2198] tty: cyclades, remove spurious check in ISR

No need to check if dev_id is NULL, it never is.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 6d524391e9948..ccf68a9e24b75 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -1737,14 +1737,6 @@ static irqreturn_t cyz_interrupt(int irq, void *dev_id)
 {
 	struct cyclades_card *cinfo = dev_id;
 
-	if (unlikely(cinfo == NULL)) {
-#ifdef CY_DEBUG_INTERRUPTS
-		printk(KERN_DEBUG "cyz_interrupt: spurious interrupt %d\n",
-									irq);
-#endif
-		return IRQ_NONE;	/* spurious interrupt */
-	}
-
 	if (unlikely(!ISZLOADED(*cinfo))) {
 #ifdef CY_DEBUG_INTERRUPTS
 		printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
-- 
GitLab


From fcc8ac1825d3d0fb81f73bc1a80ebc863168bb56 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 11 Jun 2009 12:24:17 +0100
Subject: [PATCH 1975/2198] tty: Add carrier processing on close to the
 tty_port core

Some drivers implement this internally, others miss it out. Push the
behaviour into the core code as that way everyone will do it consistently.

Update the dtr rts method to raise or lower depending upon flags. Having a
single method in this style fits most of the implementations more cleanly than
two funtions.

We need this in place before we tackle the USB side

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/epca.c               |  4 ++--
 drivers/char/isicom.c             | 19 +++++++++++++------
 drivers/char/istallion.c          |  8 ++++----
 drivers/char/mxser.c              | 12 ++++++++----
 drivers/char/pcmcia/synclink_cs.c | 11 +++++++----
 drivers/char/rocket.c             | 13 +++++++++----
 drivers/char/stallion.c           |  6 +++---
 drivers/char/synclink.c           |  9 ++++++---
 drivers/char/synclink_gt.c        |  9 ++++++---
 drivers/char/synclinkmp.c         |  9 ++++++---
 drivers/char/tty_port.c           | 27 +++++++++++++++++++++++----
 include/linux/tty.h               |  3 ++-
 12 files changed, 89 insertions(+), 41 deletions(-)

diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index af7c13ca94937..8797b77423505 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -745,7 +745,7 @@ static int epca_carrier_raised(struct tty_port *port)
 	return 0;
 }
 
-static void epca_raise_dtr_rts(struct tty_port *port)
+static void epca_dtr_rts(struct tty_port *port, int onoff)
 {
 }
 
@@ -925,7 +925,7 @@ static const struct tty_operations pc_ops = {
 
 static const struct tty_port_operations epca_port_ops = {
 	.carrier_raised = epca_carrier_raised,
-	.raise_dtr_rts = epca_raise_dtr_rts,
+	.dtr_rts = epca_dtr_rts,
 };
 
 static int info_open(struct tty_struct *tty, struct file *filp)
diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c
index a59eac584d162..4d745a89504f1 100644
--- a/drivers/char/isicom.c
+++ b/drivers/char/isicom.c
@@ -329,7 +329,7 @@ static inline void drop_rts(struct isi_port *port)
 
 /* card->lock MUST NOT be held */
 
-static void isicom_raise_dtr_rts(struct tty_port *port)
+static void isicom_dtr_rts(struct tty_port *port, int on)
 {
 	struct isi_port *ip = container_of(port, struct isi_port, port);
 	struct isi_board *card = ip->card;
@@ -339,10 +339,17 @@ static void isicom_raise_dtr_rts(struct tty_port *port)
 	if (!lock_card(card))
 		return;
 
-	outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-	outw(0x0f04, base);
-	InterruptTheCard(base);
-	ip->status |= (ISI_DTR | ISI_RTS);
+	if (on) {
+		outw(0x8000 | (channel << card->shift_count) | 0x02, base);
+		outw(0x0f04, base);
+		InterruptTheCard(base);
+		ip->status |= (ISI_DTR | ISI_RTS);
+	} else {
+		outw(0x8000 | (channel << card->shift_count) | 0x02, base);
+		outw(0x0C04, base);
+		InterruptTheCard(base);
+		ip->status &= ~(ISI_DTR | ISI_RTS);
+	}
 	unlock_card(card);
 }
 
@@ -1339,7 +1346,7 @@ static const struct tty_operations isicom_ops = {
 
 static const struct tty_port_operations isicom_port_ops = {
 	.carrier_raised		= isicom_carrier_raised,
-	.raise_dtr_rts		= isicom_raise_dtr_rts,
+	.dtr_rts		= isicom_dtr_rts,
 };
 
 static int __devinit reset_card(struct pci_dev *pdev,
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index fff19f7e29d25..e18800c400b10 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -1140,14 +1140,14 @@ static int stli_carrier_raised(struct tty_port *port)
 	return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-static void stli_raise_dtr_rts(struct tty_port *port)
+static void stli_dtr_rts(struct tty_port *port, int on)
 {
 	struct stliport *portp = container_of(port, struct stliport, port);
 	struct stlibrd *brdp = stli_brds[portp->brdnr];
-	stli_mkasysigs(&portp->asig, 1, 1);
+	stli_mkasysigs(&portp->asig, on, on);
 	if (stli_cmdwait(brdp, portp, A_SETSIGNALS, &portp->asig,
 		sizeof(asysigs_t), 0) < 0)
-			printk(KERN_WARNING "istallion: dtr raise failed.\n");
+			printk(KERN_WARNING "istallion: dtr set failed.\n");
 }
 
 
@@ -4417,7 +4417,7 @@ static const struct tty_operations stli_ops = {
 
 static const struct tty_port_operations stli_port_ops = {
 	.carrier_raised = stli_carrier_raised,
-	.raise_dtr_rts = stli_raise_dtr_rts,
+	.dtr_rts = stli_dtr_rts,
 };
 
 /*****************************************************************************/
diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c
index 13f8871e5b217..9533f43a30bb0 100644
--- a/drivers/char/mxser.c
+++ b/drivers/char/mxser.c
@@ -547,14 +547,18 @@ static int mxser_carrier_raised(struct tty_port *port)
 	return (inb(mp->ioaddr + UART_MSR) & UART_MSR_DCD)?1:0;
 }
 
-static void mxser_raise_dtr_rts(struct tty_port *port)
+static void mxser_dtr_rts(struct tty_port *port, int on)
 {
 	struct mxser_port *mp = container_of(port, struct mxser_port, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&mp->slock, flags);
-	outb(inb(mp->ioaddr + UART_MCR) |
-		UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+	if (on)
+		outb(inb(mp->ioaddr + UART_MCR) |
+			UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+	else
+		outb(inb(mp->ioaddr + UART_MCR)&~(UART_MCR_DTR | UART_MCR_RTS),
+			mp->ioaddr + UART_MCR);
 	spin_unlock_irqrestore(&mp->slock, flags);
 }
 
@@ -2356,7 +2360,7 @@ static const struct tty_operations mxser_ops = {
 
 struct tty_port_operations mxser_port_ops = {
 	.carrier_raised = mxser_carrier_raised,
-	.raise_dtr_rts = mxser_raise_dtr_rts,
+	.dtr_rts = mxser_dtr_rts,
 };
 
 /*
diff --git a/drivers/char/pcmcia/synclink_cs.c b/drivers/char/pcmcia/synclink_cs.c
index 19d79fc54461c..77b3648892249 100644
--- a/drivers/char/pcmcia/synclink_cs.c
+++ b/drivers/char/pcmcia/synclink_cs.c
@@ -383,7 +383,7 @@ static void async_mode(MGSLPC_INFO *info);
 static void tx_timeout(unsigned long context);
 
 static int carrier_raised(struct tty_port *port);
-static void raise_dtr_rts(struct tty_port *port);
+static void dtr_rts(struct tty_port *port, int onoff);
 
 #if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
@@ -513,7 +513,7 @@ static void ldisc_receive_buf(struct tty_struct *tty,
 
 static const struct tty_port_operations mgslpc_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts
+	.dtr_rts = dtr_rts
 };
 
 static int mgslpc_probe(struct pcmcia_device *link)
@@ -2528,13 +2528,16 @@ static int carrier_raised(struct tty_port *port)
 	return 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int onoff)
 {
 	MGSLPC_INFO *info = container_of(port, MGSLPC_INFO, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock,flags);
-	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (onoff)
+		info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->serial_signals &= ~SerialSignal_RTS + SerialSignal_DTR;
 	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index f59fc5cea0673..7399188049d84 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -872,11 +872,16 @@ static int carrier_raised(struct tty_port *port)
 	return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	struct r_port *info = container_of(port, struct r_port, port);
-	sSetDTR(&info->channel);
-	sSetRTS(&info->channel);
+	if (on) {
+		sSetDTR(&info->channel);
+		sSetRTS(&info->channel);
+	} else {
+		sClrDTR(&info->channel);
+		sClrRTS(&info->channel);
+	}
 }
 
 /*
@@ -2250,7 +2255,7 @@ static const struct tty_operations rocket_ops = {
 
 static const struct tty_port_operations rocket_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 /*
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index 2ad813a801dc3..53e504f41b204 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -772,11 +772,11 @@ static int stl_carrier_raised(struct tty_port *port)
 	return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-static void stl_raise_dtr_rts(struct tty_port *port)
+static void stl_dtr_rts(struct tty_port *port, int on)
 {
 	struct stlport *portp = container_of(port, struct stlport, port);
 	/* Takes brd_lock internally */
-	stl_setsignals(portp, 1, 1);
+	stl_setsignals(portp, on, on);
 }
 
 /*****************************************************************************/
@@ -2547,7 +2547,7 @@ static const struct tty_operations stl_ops = {
 
 static const struct tty_port_operations stl_port_ops = {
 	.carrier_raised = stl_carrier_raised,
-	.raise_dtr_rts = stl_raise_dtr_rts,
+	.dtr_rts = stl_dtr_rts,
 };
 
 /*****************************************************************************/
diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c
index afd0b26ca0568..afded3a2379c5 100644
--- a/drivers/char/synclink.c
+++ b/drivers/char/synclink.c
@@ -3247,13 +3247,16 @@ static int carrier_raised(struct tty_port *port)
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	struct mgsl_struct *info = container_of(port, struct mgsl_struct, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->irq_spinlock,flags);
-	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (on)
+		info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
  	usc_set_serial_signals(info);
 	spin_unlock_irqrestore(&info->irq_spinlock,flags);
 }
@@ -4258,7 +4261,7 @@ static void mgsl_add_device( struct mgsl_struct *info )
 
 static const struct tty_port_operations mgsl_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 
diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 5e256494686a8..67986ea0d479c 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -3099,13 +3099,16 @@ static int carrier_raised(struct tty_port *port)
 	return (info->signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	unsigned long flags;
 	struct slgt_info *info = container_of(port, struct slgt_info, port);
 
 	spin_lock_irqsave(&info->lock,flags);
-	info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (on)
+		info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
  	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
@@ -3419,7 +3422,7 @@ static void add_device(struct slgt_info *info)
 
 static const struct tty_port_operations slgt_port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 /*
diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c
index 26de60efe4b24..6f727e3c53ade 100644
--- a/drivers/char/synclinkmp.c
+++ b/drivers/char/synclinkmp.c
@@ -3277,13 +3277,16 @@ static int carrier_raised(struct tty_port *port)
 	return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
 	SLMP_INFO *info = container_of(port, SLMP_INFO, port);
 	unsigned long flags;
 
 	spin_lock_irqsave(&info->lock,flags);
-	info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	if (on)
+		info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+	else
+		info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
  	set_signals(info);
 	spin_unlock_irqrestore(&info->lock,flags);
 }
@@ -3746,7 +3749,7 @@ static void add_device(SLMP_INFO *info)
 
 static const struct tty_port_operations port_ops = {
 	.carrier_raised = carrier_raised,
-	.raise_dtr_rts = raise_dtr_rts,
+	.dtr_rts = dtr_rts,
 };
 
 /* Allocate and initialize a device instance structure
diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 9b8004c72686a..926d4a5593fac 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -137,7 +137,7 @@ int tty_port_carrier_raised(struct tty_port *port)
 EXPORT_SYMBOL(tty_port_carrier_raised);
 
 /**
- *	tty_port_raise_dtr_rts	-	Riase DTR/RTS
+ *	tty_port_raise_dtr_rts	-	Raise DTR/RTS
  *	@port: tty port
  *
  *	Wrapper for the DTR/RTS raise logic. For the moment this is used
@@ -147,11 +147,27 @@ EXPORT_SYMBOL(tty_port_carrier_raised);
 
 void tty_port_raise_dtr_rts(struct tty_port *port)
 {
-	if (port->ops->raise_dtr_rts)
-		port->ops->raise_dtr_rts(port);
+	if (port->ops->dtr_rts)
+		port->ops->dtr_rts(port, 1);
 }
 EXPORT_SYMBOL(tty_port_raise_dtr_rts);
 
+/**
+ *	tty_port_lower_dtr_rts	-	Lower DTR/RTS
+ *	@port: tty port
+ *
+ *	Wrapper for the DTR/RTS raise logic. For the moment this is used
+ *	to hide some internal details. This will eventually become entirely
+ *	internal to the tty port.
+ */
+
+void tty_port_lower_dtr_rts(struct tty_port *port)
+{
+	if (port->ops->dtr_rts)
+		port->ops->dtr_rts(port, 0);
+}
+EXPORT_SYMBOL(tty_port_lower_dtr_rts);
+
 /**
  *	tty_port_block_til_ready	-	Waiting logic for tty open
  *	@port: the tty port being opened
@@ -167,7 +183,7 @@ EXPORT_SYMBOL(tty_port_raise_dtr_rts);
  *		- port flags and counts
  *
  *	The passed tty_port must implement the carrier_raised method if it can
- *	do carrier detect and the raise_dtr_rts method if it supports software
+ *	do carrier detect and the dtr_rts method if it supports software
  *	management of these lines. Note that the dtr/rts raise is done each
  *	iteration as a hangup may have previously dropped them while we wait.
  */
@@ -302,6 +318,9 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
 
 	tty_ldisc_flush(tty);
 
+	if (tty->termios->c_cflag & HUPCL)
+		tty_port_lower_dtr_rts(port);
+
 	spin_lock_irqsave(&port->lock, flags);
 	tty->closing = 0;
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index fc39db95499fb..98694364c96fd 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -185,7 +185,7 @@ struct tty_port;
 struct tty_port_operations {
 	/* Return 1 if the carrier is raised */
 	int (*carrier_raised)(struct tty_port *port);
-	void (*raise_dtr_rts)(struct tty_port *port);
+	void (*dtr_rts)(struct tty_port *port, int raise);
 };
 	
 struct tty_port {
@@ -438,6 +438,7 @@ extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
 extern void tty_port_raise_dtr_rts(struct tty_port *port);
+extern void tty_port_lower_dtr_rts(struct tty_port *port);
 extern void tty_port_hangup(struct tty_port *port);
 extern int tty_port_block_til_ready(struct tty_port *port,
 				struct tty_struct *tty, struct file *filp);
-- 
GitLab


From 1ec739be75a6cb961a46ba0b1982d0edb7f27558 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 11 Jun 2009 12:25:25 +0100
Subject: [PATCH 1976/2198] tty: Implement a drain delay in the tty port

We need this for devices that cannot flush and wait, but which do not order
data and modem events. Without it we will hang up before all the data
clears the hardware. Needed for the USB changes.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_port.c | 11 +++++++++++
 include/linux/tty.h     |  3 +++
 2 files changed, 14 insertions(+)

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 926d4a5593fac..4d08b6d27c28b 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -308,6 +308,17 @@ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct f
 	if (port->flags & ASYNC_INITIALIZED &&
 			port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
 		tty_wait_until_sent(tty, port->closing_wait);
+	if (port->drain_delay) {
+		unsigned int bps = tty_get_baud_rate(tty);
+		long timeout;
+
+		if (bps > 1200)
+			timeout = max_t(long, (HZ * 10 * port->drain_delay) / bps,
+								HZ / 10);
+		else
+			timeout = 2 * HZ;
+		schedule_timeout_interruptible(timeout);
+	}
 	return 1;
 }
 EXPORT_SYMBOL(tty_port_close_start);
diff --git a/include/linux/tty.h b/include/linux/tty.h
index 98694364c96fd..bed5a3d403074 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -201,6 +201,9 @@ struct tty_port {
 	unsigned char		*xmit_buf;	/* Optional buffer */
 	int			close_delay;	/* Close port delay */
 	int			closing_wait;	/* Delay for output */
+	int			drain_delay;	/* Set to zero if no pure time
+						   based drain is needed else
+						   set to size of fifo */
 };
 
 /*
-- 
GitLab


From 335f8514f200e63d689113d29cb7253a5c282967 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@lxorguk.ukuu.org.uk>
Date: Thu, 11 Jun 2009 12:26:29 +0100
Subject: [PATCH 1977/2198] tty: Bring the usb tty port structure into more use

This allows us to clean stuff up, but is probably also going to cause
some app breakage with buggy apps as we now implement proper POSIX behaviour
for USB ports matching all the other ports. This does also mean other apps
that break on USB will now work properly.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/belkin_sa.c        |   6 +-
 drivers/usb/serial/ch341.c            |  46 ++++----
 drivers/usb/serial/console.c          |   6 +-
 drivers/usb/serial/cp210x.c           |   6 +-
 drivers/usb/serial/cyberjack.c        |   6 +-
 drivers/usb/serial/cypress_m8.c       |  81 ++++-----------
 drivers/usb/serial/digi_acceleport.c  |  75 ++++----------
 drivers/usb/serial/empeg.c            |   6 +-
 drivers/usb/serial/ftdi_sio.c         |  50 +++++----
 drivers/usb/serial/garmin_gps.c       |   3 +-
 drivers/usb/serial/generic.c          |   3 +-
 drivers/usb/serial/io_edgeport.c      |  10 +-
 drivers/usb/serial/io_ti.c            |   3 +-
 drivers/usb/serial/ipaq.c             |   6 +-
 drivers/usb/serial/ipw.c              |  18 ++--
 drivers/usb/serial/ir-usb.c           |   6 +-
 drivers/usb/serial/iuu_phoenix.c      |  25 +----
 drivers/usb/serial/keyspan.c          |  13 ++-
 drivers/usb/serial/keyspan.h          |   8 +-
 drivers/usb/serial/keyspan_pda.c      |  48 ++++++---
 drivers/usb/serial/kl5kusb105.c       |   6 +-
 drivers/usb/serial/kobil_sct.c        |   9 +-
 drivers/usb/serial/mct_u232.c         |  37 +++----
 drivers/usb/serial/mos7720.c          |   3 +-
 drivers/usb/serial/mos7840.c          |  48 +--------
 drivers/usb/serial/navman.c           |   3 +-
 drivers/usb/serial/omninet.c          |   6 +-
 drivers/usb/serial/opticon.c          |   3 +-
 drivers/usb/serial/option.c           |  68 ++++++------
 drivers/usb/serial/oti6858.c          |  57 +---------
 drivers/usb/serial/pl2303.c           |  79 ++++++--------
 drivers/usb/serial/sierra.c           |  97 ++++++++---------
 drivers/usb/serial/spcp8x5.c          |  85 ++++++---------
 drivers/usb/serial/symbolserial.c     |   3 +-
 drivers/usb/serial/ti_usb_3410_5052.c |   6 +-
 drivers/usb/serial/usb-serial.c       | 144 ++++++++++++++++++--------
 drivers/usb/serial/visor.c            |   6 +-
 drivers/usb/serial/whiteheat.c        |  33 +-----
 include/linux/usb/serial.h            |  10 +-
 39 files changed, 467 insertions(+), 661 deletions(-)

diff --git a/drivers/usb/serial/belkin_sa.c b/drivers/usb/serial/belkin_sa.c
index b7eacad4d48cb..2bfd6dd85b5ad 100644
--- a/drivers/usb/serial/belkin_sa.c
+++ b/drivers/usb/serial/belkin_sa.c
@@ -93,8 +93,7 @@ static int  belkin_sa_startup(struct usb_serial *serial);
 static void belkin_sa_shutdown(struct usb_serial *serial);
 static int  belkin_sa_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void belkin_sa_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void belkin_sa_close(struct usb_serial_port *port);
 static void belkin_sa_read_int_callback(struct urb *urb);
 static void belkin_sa_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios * old);
@@ -244,8 +243,7 @@ static int  belkin_sa_open(struct tty_struct *tty,
 } /* belkin_sa_open */
 
 
-static void belkin_sa_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void belkin_sa_close(struct usb_serial_port *port)
 {
 	dbg("%s port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/ch341.c b/drivers/usb/serial/ch341.c
index ab4cc277aa659..2830766f5b391 100644
--- a/drivers/usb/serial/ch341.c
+++ b/drivers/usb/serial/ch341.c
@@ -262,13 +262,33 @@ error:	kfree(priv);
 	return r;
 }
 
-static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static int ch341_carrier_raised(struct usb_serial_port *port)
+{
+	struct ch341_private *priv = usb_get_serial_port_data(port);
+	if (priv->line_status & CH341_BIT_DCD)
+		return 1;
+	return 0;
+}
+
+static void ch341_dtr_rts(struct usb_serial_port *port, int on)
 {
 	struct ch341_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	unsigned int c_cflag;
 
+	dbg("%s - port %d", __func__, port->number);
+	/* drop DTR and RTS */
+	spin_lock_irqsave(&priv->lock, flags);
+	if (on)
+		priv->line_control |= CH341_BIT_RTS | CH341_BIT_DTR;
+	else
+		priv->line_control &= ~(CH341_BIT_RTS | CH341_BIT_DTR);
+	spin_unlock_irqrestore(&priv->lock, flags);
+	ch341_set_handshake(port->serial->dev, priv->line_control);
+	wake_up_interruptible(&priv->delta_msr_wait);
+}
+
+static void ch341_close(struct usb_serial_port *port)
+{
 	dbg("%s - port %d", __func__, port->number);
 
 	/* shutdown our urbs */
@@ -276,18 +296,6 @@ static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port,
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 	usb_kill_urb(port->interrupt_in_urb);
-
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			/* drop DTR and RTS */
-			spin_lock_irqsave(&priv->lock, flags);
-			priv->line_control = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
-			ch341_set_handshake(port->serial->dev, 0);
-		}
-	}
-	wake_up_interruptible(&priv->delta_msr_wait);
 }
 
 
@@ -302,7 +310,6 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
 	dbg("ch341_open()");
 
 	priv->baud_rate = DEFAULT_BAUD_RATE;
-	priv->line_control = CH341_BIT_RTS | CH341_BIT_DTR;
 
 	r = ch341_configure(serial->dev, priv);
 	if (r)
@@ -322,7 +329,7 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
 	if (r) {
 		dev_err(&port->dev, "%s - failed submitting interrupt urb,"
 			" error %d\n", __func__, r);
-		ch341_close(tty, port, NULL);
+		ch341_close(port);
 		return -EPROTO;
 	}
 
@@ -343,9 +350,6 @@ static void ch341_set_termios(struct tty_struct *tty,
 
 	dbg("ch341_set_termios()");
 
-	if (!tty || !tty->termios)
-		return;
-
 	baud_rate = tty_get_baud_rate(tty);
 
 	priv->baud_rate = baud_rate;
@@ -568,6 +572,8 @@ static struct usb_serial_driver ch341_device = {
 	.usb_driver        = &ch341_driver,
 	.num_ports         = 1,
 	.open              = ch341_open,
+	.dtr_rts	   = ch341_dtr_rts,
+	.carrier_raised	   = ch341_carrier_raised,
 	.close             = ch341_close,
 	.ioctl             = ch341_ioctl,
 	.set_termios       = ch341_set_termios,
diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c
index 19e24045b137a..247b61bfb7f45 100644
--- a/drivers/usb/serial/console.c
+++ b/drivers/usb/serial/console.c
@@ -169,7 +169,9 @@ static int usb_console_setup(struct console *co, char *options)
 			kfree(tty);
 		}
 	}
-
+	/* So we know not to kill the hardware on a hangup on this
+	   port. We have also bumped the use count by one so it won't go
+	   idle */
 	port->console = 1;
 	retval = 0;
 
@@ -182,7 +184,7 @@ static int usb_console_setup(struct console *co, char *options)
 	kfree(tty);
 reset_open_count:
 	port->port.count = 0;
-goto out;
+	goto out;
 }
 
 static void usb_console_write(struct console *co,
diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index e8d5133ce9c84..d9f586dc6eccd 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -36,8 +36,7 @@
 static int cp2101_open(struct tty_struct *, struct usb_serial_port *,
 							struct file *);
 static void cp2101_cleanup(struct usb_serial_port *);
-static void cp2101_close(struct tty_struct *, struct usb_serial_port *,
-							struct file*);
+static void cp2101_close(struct usb_serial_port *);
 static void cp2101_get_termios(struct tty_struct *,
 	struct usb_serial_port *port);
 static void cp2101_get_termios_port(struct usb_serial_port *port,
@@ -398,8 +397,7 @@ static void cp2101_cleanup(struct usb_serial_port *port)
 	}
 }
 
-static void cp2101_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp)
+static void cp2101_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c
index dd501bb63ed61..933ba913e66c5 100644
--- a/drivers/usb/serial/cyberjack.c
+++ b/drivers/usb/serial/cyberjack.c
@@ -61,8 +61,7 @@ static int cyberjack_startup(struct usb_serial *serial);
 static void cyberjack_shutdown(struct usb_serial *serial);
 static int  cyberjack_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void cyberjack_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void cyberjack_close(struct usb_serial_port *port);
 static int cyberjack_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
 static int cyberjack_write_room(struct tty_struct *tty);
@@ -185,8 +184,7 @@ static int  cyberjack_open(struct tty_struct *tty,
 	return result;
 }
 
-static void cyberjack_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void cyberjack_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/cypress_m8.c b/drivers/usb/serial/cypress_m8.c
index e568710b263fa..669f938485395 100644
--- a/drivers/usb/serial/cypress_m8.c
+++ b/drivers/usb/serial/cypress_m8.c
@@ -174,8 +174,8 @@ static int  cypress_ca42v2_startup(struct usb_serial *serial);
 static void cypress_shutdown(struct usb_serial *serial);
 static int  cypress_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void cypress_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void cypress_close(struct usb_serial_port *port);
+static void cypress_dtr_rts(struct usb_serial_port *port, int on);
 static int  cypress_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 static void cypress_send(struct usb_serial_port *port);
@@ -218,6 +218,7 @@ static struct usb_serial_driver cypress_earthmate_device = {
 	.shutdown =			cypress_shutdown,
 	.open =				cypress_open,
 	.close =			cypress_close,
+	.dtr_rts =			cypress_dtr_rts,
 	.write =			cypress_write,
 	.write_room =			cypress_write_room,
 	.ioctl =			cypress_ioctl,
@@ -244,6 +245,7 @@ static struct usb_serial_driver cypress_hidcom_device = {
 	.shutdown =			cypress_shutdown,
 	.open =				cypress_open,
 	.close =			cypress_close,
+	.dtr_rts =			cypress_dtr_rts,
 	.write =			cypress_write,
 	.write_room =			cypress_write_room,
 	.ioctl =			cypress_ioctl,
@@ -270,6 +272,7 @@ static struct usb_serial_driver cypress_ca42v2_device = {
 	.shutdown =			cypress_shutdown,
 	.open =				cypress_open,
 	.close =			cypress_close,
+	.dtr_rts =			cypress_dtr_rts,
 	.write =			cypress_write,
 	.write_room =			cypress_write_room,
 	.ioctl =			cypress_ioctl,
@@ -656,11 +659,7 @@ static int cypress_open(struct tty_struct *tty,
 	priv->rx_flags = 0;
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* raise both lines and set termios */
-	spin_lock_irqsave(&priv->lock, flags);
-	priv->line_control = CONTROL_DTR | CONTROL_RTS;
-	priv->cmd_ctrl = 1;
-	spin_unlock_irqrestore(&priv->lock, flags);
+	/* Set termios */
 	result = cypress_write(tty, port, NULL, 0);
 
 	if (result) {
@@ -694,76 +693,42 @@ static int cypress_open(struct tty_struct *tty,
 							__func__, result);
 		cypress_set_dead(port);
 	}
-
+	port->port.drain_delay = 256;
 	return result;
 } /* cypress_open */
 
+static void cypress_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct cypress_private *priv = usb_get_serial_port_data(port);
+	/* drop dtr and rts */
+	priv = usb_get_serial_port_data(port);
+	spin_lock_irq(&priv->lock);
+	if (on == 0)
+		priv->line_control = 0;
+	else 
+		priv->line_control = CONTROL_DTR | CONTROL_RTS;
+	priv->cmd_ctrl = 1;
+	spin_unlock_irq(&priv->lock);
+	cypress_write(NULL, port, NULL, 0);
+}
 
-static void cypress_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void cypress_close(struct usb_serial_port *port)
 {
 	struct cypress_private *priv = usb_get_serial_port_data(port);
-	unsigned int c_cflag;
-	int bps;
-	long timeout;
-	wait_queue_t wait;
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* wait for data to drain from buffer */
-	spin_lock_irq(&priv->lock);
-	timeout = CYPRESS_CLOSING_WAIT;
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (cypress_buf_data_avail(priv->buf) == 0
-		|| timeout == 0 || signal_pending(current)
-		/* without mutex, allowed due to harmless failure mode */
-		|| port->serial->disconnected)
-			break;
-		spin_unlock_irq(&priv->lock);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irq(&priv->lock);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
-	/* clear out any remaining data in the buffer */
-	cypress_buf_clear(priv->buf);
-	spin_unlock_irq(&priv->lock);
-
 	/* writing is potentially harmful, lock must be taken */
 	mutex_lock(&port->serial->disc_mutex);
 	if (port->serial->disconnected) {
 		mutex_unlock(&port->serial->disc_mutex);
 		return;
 	}
-	/* wait for characters to drain from device */
-	if (tty) {
-		bps = tty_get_baud_rate(tty);
-		if (bps > 1200)
-			timeout = max((HZ * 2560) / bps, HZ / 10);
-		else
-			timeout = 2 * HZ;
-		schedule_timeout_interruptible(timeout);
-	}
-
+	cypress_buf_clear(priv->buf);
 	dbg("%s - stopping urbs", __func__);
 	usb_kill_urb(port->interrupt_in_urb);
 	usb_kill_urb(port->interrupt_out_urb);
 
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			/* drop dtr and rts */
-			priv = usb_get_serial_port_data(port);
-			spin_lock_irq(&priv->lock);
-			priv->line_control = 0;
-			priv->cmd_ctrl = 1;
-			spin_unlock_irq(&priv->lock);
-			cypress_write(tty, port, NULL, 0);
-		}
-	}
 
 	if (stats)
 		dev_info(&port->dev, "Statistics: %d Bytes In | %d Bytes Out | %d Commands Issued\n",
diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c
index 38ba4ea8b6bfd..30f5140eff03a 100644
--- a/drivers/usb/serial/digi_acceleport.c
+++ b/drivers/usb/serial/digi_acceleport.c
@@ -422,7 +422,6 @@ struct digi_port {
 	int dp_throttled;
 	int dp_throttle_restart;
 	wait_queue_head_t dp_flush_wait;
-	int dp_in_close;			/* close in progress */
 	wait_queue_head_t dp_close_wait;	/* wait queue for close */
 	struct work_struct dp_wakeup_work;
 	struct usb_serial_port *dp_port;
@@ -456,8 +455,9 @@ static int digi_write_room(struct tty_struct *tty);
 static int digi_chars_in_buffer(struct tty_struct *tty);
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 	struct file *filp);
-static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
-	struct file *filp);
+static void digi_close(struct usb_serial_port *port);
+static int digi_carrier_raised(struct usb_serial_port *port);
+static void digi_dtr_rts(struct usb_serial_port *port, int on);
 static int digi_startup_device(struct usb_serial *serial);
 static int digi_startup(struct usb_serial *serial);
 static void digi_shutdown(struct usb_serial *serial);
@@ -510,6 +510,8 @@ static struct usb_serial_driver digi_acceleport_2_device = {
 	.num_ports =			3,
 	.open =				digi_open,
 	.close =			digi_close,
+	.dtr_rts =			digi_dtr_rts,
+	.carrier_raised =		digi_carrier_raised,
 	.write =			digi_write,
 	.write_room =			digi_write_room,
 	.write_bulk_callback = 		digi_write_bulk_callback,
@@ -1328,6 +1330,19 @@ static int digi_chars_in_buffer(struct tty_struct *tty)
 
 }
 
+static void digi_dtr_rts(struct usb_serial_port *port, int on)
+{
+	/* Adjust DTR and RTS */
+	digi_set_modem_signals(port, on * (TIOCM_DTR|TIOCM_RTS), 1);
+}
+
+static int digi_carrier_raised(struct usb_serial_port *port)
+{
+	struct digi_port *priv = usb_get_serial_port_data(port);
+	if (priv->dp_modem_signals & TIOCM_CD)
+		return 1;
+	return 0;
+}
 
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 				struct file *filp)
@@ -1336,7 +1351,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 	unsigned char buf[32];
 	struct digi_port *priv = usb_get_serial_port_data(port);
 	struct ktermios not_termios;
-	unsigned long flags = 0;
 
 	dbg("digi_open: TOP: port=%d, open_count=%d",
 		priv->dp_port_num, port->port.count);
@@ -1345,26 +1359,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 	if (digi_startup_device(port->serial) != 0)
 		return -ENXIO;
 
-	spin_lock_irqsave(&priv->dp_port_lock, flags);
-
-	/* don't wait on a close in progress for non-blocking opens */
-	if (priv->dp_in_close && (filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0) {
-		spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-		return -EAGAIN;
-	}
-
-	/* wait for a close in progress to finish */
-	while (priv->dp_in_close) {
-		cond_wait_interruptible_timeout_irqrestore(
-			&priv->dp_close_wait, DIGI_RETRY_TIMEOUT,
-			&priv->dp_port_lock, flags);
-		if (signal_pending(current))
-			return -EINTR;
-		spin_lock_irqsave(&priv->dp_port_lock, flags);
-	}
-
-	spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-
 	/* read modem signals automatically whenever they change */
 	buf[0] = DIGI_CMD_READ_INPUT_SIGNALS;
 	buf[1] = priv->dp_port_num;
@@ -1387,16 +1381,11 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
 		not_termios.c_iflag = ~tty->termios->c_iflag;
 		digi_set_termios(tty, port, &not_termios);
 	}
-
-	/* set DTR and RTS */
-	digi_set_modem_signals(port, TIOCM_DTR|TIOCM_RTS, 1);
-
 	return 0;
 }
 
 
-static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static void digi_close(struct usb_serial_port *port)
 {
 	DEFINE_WAIT(wait);
 	int ret;
@@ -1411,28 +1400,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
 	if (port->serial->disconnected)
 		goto exit;
 
-	/* do cleanup only after final close on this port */
-	spin_lock_irq(&priv->dp_port_lock);
-	priv->dp_in_close = 1;
-	spin_unlock_irq(&priv->dp_port_lock);
-
-	/* tell line discipline to process only XON/XOFF */
-	tty->closing = 1;
-
-	/* wait for output to drain */
-	if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0)
-		tty_wait_until_sent(tty, DIGI_CLOSE_TIMEOUT);
-
-	/* flush driver and line discipline buffers */
-	tty_driver_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-
 	if (port->serial->dev) {
-		/* wait for transmit idle */
-		if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0)
-			digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT);
-		/* drop DTR and RTS */
-		digi_set_modem_signals(port, 0, 0);
+		/* FIXME: Transmit idle belongs in the wait_unti_sent path */
+		digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT);
 
 		/* disable input flow control */
 		buf[0] = DIGI_CMD_SET_INPUT_FLOW_CONTROL;
@@ -1477,11 +1447,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
 		/* shutdown any outstanding bulk writes */
 		usb_kill_urb(port->write_urb);
 	}
-	tty->closing = 0;
 exit:
 	spin_lock_irq(&priv->dp_port_lock);
 	priv->dp_write_urb_in_use = 0;
-	priv->dp_in_close = 0;
 	wake_up_interruptible(&priv->dp_close_wait);
 	spin_unlock_irq(&priv->dp_port_lock);
 	mutex_unlock(&port->serial->disc_mutex);
@@ -1560,7 +1528,6 @@ static int digi_startup(struct usb_serial *serial)
 		priv->dp_throttled = 0;
 		priv->dp_throttle_restart = 0;
 		init_waitqueue_head(&priv->dp_flush_wait);
-		priv->dp_in_close = 0;
 		init_waitqueue_head(&priv->dp_close_wait);
 		INIT_WORK(&priv->dp_wakeup_work, digi_wakeup_write_lock);
 		priv->dp_port = serial->port[i];
diff --git a/drivers/usb/serial/empeg.c b/drivers/usb/serial/empeg.c
index c709ec474a80b..2b141ccb0cd95 100644
--- a/drivers/usb/serial/empeg.c
+++ b/drivers/usb/serial/empeg.c
@@ -81,8 +81,7 @@ static int debug;
 /* function prototypes for an empeg-car player */
 static int  empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
 						struct file *filp);
-static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port,
-						struct file *filp);
+static void empeg_close(struct usb_serial_port *port);
 static int  empeg_write(struct tty_struct *tty, struct usb_serial_port *port,
 						const unsigned char *buf,
 						int count);
@@ -181,8 +180,7 @@ static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port,
-				struct file *filp)
+static void empeg_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index d9fcdaedf389d..d9d87111f9a9c 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -719,8 +719,8 @@ static int  ftdi_sio_port_probe(struct usb_serial_port *port);
 static int  ftdi_sio_port_remove(struct usb_serial_port *port);
 static int  ftdi_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void ftdi_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void ftdi_close(struct usb_serial_port *port);
+static void ftdi_dtr_rts(struct usb_serial_port *port, int on);
 static int  ftdi_write(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 static int  ftdi_write_room(struct tty_struct *tty);
@@ -758,6 +758,7 @@ static struct usb_serial_driver ftdi_sio_device = {
 	.port_remove =		ftdi_sio_port_remove,
 	.open =			ftdi_open,
 	.close =		ftdi_close,
+	.dtr_rts =		ftdi_dtr_rts,
 	.throttle =		ftdi_throttle,
 	.unthrottle =		ftdi_unthrottle,
 	.write =		ftdi_write,
@@ -1558,6 +1559,30 @@ static int ftdi_open(struct tty_struct *tty,
 } /* ftdi_open */
 
 
+static void ftdi_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct ftdi_private *priv = usb_get_serial_port_data(port);
+	char buf[1];
+
+	mutex_lock(&port->serial->disc_mutex);
+	if (!port->serial->disconnected) {
+		/* Disable flow control */
+		if (!on && usb_control_msg(port->serial->dev,
+			    usb_sndctrlpipe(port->serial->dev, 0),
+			    FTDI_SIO_SET_FLOW_CTRL_REQUEST,
+			    FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE,
+			    0, priv->interface, buf, 0,
+			    WDR_TIMEOUT) < 0) {
+			    dev_err(&port->dev, "error from flowcontrol urb\n");
+		}
+		/* drop RTS and DTR */
+		if (on)
+			set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+		else
+			clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+	}
+	mutex_unlock(&port->serial->disc_mutex);
+}
 
 /*
  * usbserial:__serial_close  only calls ftdi_close if the point is open
@@ -1567,31 +1592,12 @@ static int ftdi_open(struct tty_struct *tty,
  *
  */
 
-static void ftdi_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void ftdi_close(struct usb_serial_port *port)
 { /* ftdi_close */
-	unsigned int c_cflag = tty->termios->c_cflag;
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
-	char buf[1];
 
 	dbg("%s", __func__);
 
-	mutex_lock(&port->serial->disc_mutex);
-	if (c_cflag & HUPCL && !port->serial->disconnected) {
-		/* Disable flow control */
-		if (usb_control_msg(port->serial->dev,
-				    usb_sndctrlpipe(port->serial->dev, 0),
-				    FTDI_SIO_SET_FLOW_CTRL_REQUEST,
-				    FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE,
-				    0, priv->interface, buf, 0,
-				    WDR_TIMEOUT) < 0) {
-			dev_err(&port->dev, "error from flowcontrol urb\n");
-		}
-
-		/* drop RTS and DTR */
-		clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
-	} /* Note change no line if hupcl is off */
-	mutex_unlock(&port->serial->disc_mutex);
 
 	/* cancel any scheduled reading */
 	cancel_delayed_work_sync(&priv->rx_work);
diff --git a/drivers/usb/serial/garmin_gps.c b/drivers/usb/serial/garmin_gps.c
index 586d30ff450b2..ee25a3fe3b09c 100644
--- a/drivers/usb/serial/garmin_gps.c
+++ b/drivers/usb/serial/garmin_gps.c
@@ -993,8 +993,7 @@ static int garmin_open(struct tty_struct *tty,
 }
 
 
-static void garmin_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void garmin_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct garmin_data *garmin_data_p = usb_get_serial_port_data(port);
diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c
index 4cec9906ccf39..be82ea9567201 100644
--- a/drivers/usb/serial/generic.c
+++ b/drivers/usb/serial/generic.c
@@ -184,8 +184,7 @@ int usb_serial_generic_resume(struct usb_serial *serial)
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_resume);
 
-void usb_serial_generic_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+void usb_serial_generic_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 	generic_cleanup(port);
diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c
index fb4a73d090f6b..53ef5996e33de 100644
--- a/drivers/usb/serial/io_edgeport.c
+++ b/drivers/usb/serial/io_edgeport.c
@@ -207,8 +207,7 @@ static void edge_bulk_out_cmd_callback(struct urb *urb);
 /* function prototypes for the usbserial callbacks */
 static int edge_open(struct tty_struct *tty, struct usb_serial_port *port,
 					struct file *filp);
-static void edge_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp);
+static void edge_close(struct usb_serial_port *port);
 static int edge_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
 static int edge_write_room(struct tty_struct *tty);
@@ -965,7 +964,7 @@ static int edge_open(struct tty_struct *tty,
 
 	if (!edge_port->txfifo.fifo) {
 		dbg("%s - no memory", __func__);
-		edge_close(tty, port, filp);
+		edge_close(port);
 		return -ENOMEM;
 	}
 
@@ -975,7 +974,7 @@ static int edge_open(struct tty_struct *tty,
 
 	if (!edge_port->write_urb) {
 		dbg("%s - no memory", __func__);
-		edge_close(tty, port, filp);
+		edge_close(port);
 		return -ENOMEM;
 	}
 
@@ -1099,8 +1098,7 @@ static void block_until_tx_empty(struct edgeport_port *edge_port)
  * edge_close
  *	this function is called by the tty driver when a port is closed
  *****************************************************************************/
-static void edge_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void edge_close(struct usb_serial_port *port)
 {
 	struct edgeport_serial *edge_serial;
 	struct edgeport_port *edge_port;
diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c
index 513b25e044c16..eabf20eeb370e 100644
--- a/drivers/usb/serial/io_ti.c
+++ b/drivers/usb/serial/io_ti.c
@@ -2009,8 +2009,7 @@ static int edge_open(struct tty_struct *tty,
 	return status;
 }
 
-static void edge_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void edge_close(struct usb_serial_port *port)
 {
 	struct edgeport_serial *edge_serial;
 	struct edgeport_port *edge_port;
diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c
index cd62825a9ac32..c610a99fa4774 100644
--- a/drivers/usb/serial/ipaq.c
+++ b/drivers/usb/serial/ipaq.c
@@ -76,8 +76,7 @@ static int initial_wait;
 /* Function prototypes for an ipaq */
 static int  ipaq_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void ipaq_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void ipaq_close(struct usb_serial_port *port);
 static int  ipaq_calc_num_ports(struct usb_serial *serial);
 static int  ipaq_startup(struct usb_serial *serial);
 static void ipaq_shutdown(struct usb_serial *serial);
@@ -714,8 +713,7 @@ static int ipaq_open(struct tty_struct *tty,
 }
 
 
-static void ipaq_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void ipaq_close(struct usb_serial_port *port)
 {
 	struct ipaq_private	*priv = usb_get_serial_port_data(port);
 
diff --git a/drivers/usb/serial/ipw.c b/drivers/usb/serial/ipw.c
index da2a2b46644a5..29ad038b9c8de 100644
--- a/drivers/usb/serial/ipw.c
+++ b/drivers/usb/serial/ipw.c
@@ -302,23 +302,17 @@ static int ipw_open(struct tty_struct *tty,
 	return 0;
 }
 
-static void ipw_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void ipw_dtr_rts(struct usb_serial_port *port, int on)
 {
 	struct usb_device *dev = port->serial->dev;
 	int result;
 
-	if (tty_hung_up_p(filp)) {
-		dbg("%s: tty_hung_up_p ...", __func__);
-		return;
-	}
-
 	/*--1: drop the dtr */
 	dbg("%s:dropping dtr", __func__);
 	result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
 			 IPW_SIO_SET_PIN,
 			 USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT,
-			 IPW_PIN_CLRDTR,
+			 on ? IPW_PIN_SETDTR : IPW_PIN_CLRDTR,
 			 0,
 			 NULL,
 			 0,
@@ -332,7 +326,7 @@ static void ipw_close(struct tty_struct *tty,
 	result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
 			 IPW_SIO_SET_PIN, USB_TYPE_VENDOR |
 			 		USB_RECIP_INTERFACE | USB_DIR_OUT,
-			 IPW_PIN_CLRRTS,
+			 on ? IPW_PIN_SETRTS : IPW_PIN_CLRRTS,
 			 0,
 			 NULL,
 			 0,
@@ -340,7 +334,12 @@ static void ipw_close(struct tty_struct *tty,
 	if (result < 0)
 		dev_err(&port->dev,
 				"dropping rts failed (error = %d)\n", result);
+}
 
+static void ipw_close(struct usb_serial_port *port)
+{
+	struct usb_device *dev = port->serial->dev;
+	int result;
 
 	/*--3: purge */
 	dbg("%s:sending purge", __func__);
@@ -461,6 +460,7 @@ static struct usb_serial_driver ipw_device = {
 	.num_ports =		1,
 	.open =			ipw_open,
 	.close =		ipw_close,
+	.dtr_rts =		ipw_dtr_rts,
 	.port_probe = 		ipw_probe,
 	.port_remove =		ipw_disconnect,
 	.write =		ipw_write,
diff --git a/drivers/usb/serial/ir-usb.c b/drivers/usb/serial/ir-usb.c
index 4e2cda93da596..66009b6b763a2 100644
--- a/drivers/usb/serial/ir-usb.c
+++ b/drivers/usb/serial/ir-usb.c
@@ -88,8 +88,7 @@ static int xbof = -1;
 static int  ir_startup (struct usb_serial *serial);
 static int  ir_open(struct tty_struct *tty, struct usb_serial_port *port,
 					struct file *filep);
-static void ir_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filep);
+static void ir_close(struct usb_serial_port *port);
 static int  ir_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
 static void ir_write_bulk_callback (struct urb *urb);
@@ -346,8 +345,7 @@ static int ir_open(struct tty_struct *tty,
 	return result;
 }
 
-static void ir_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file * filp)
+static void ir_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index 4473d442b2aa4..bb572cee6ecdd 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -70,7 +70,6 @@ static void read_rxcmd_callback(struct urb *urb);
 struct iuu_private {
 	spinlock_t lock;	/* store irq state */
 	wait_queue_head_t delta_msr_wait;
-	u8 line_control;
 	u8 line_status;
 	u8 termios_initialized;
 	int tiostatus;		/* store IUART SIGNAL for tiocmget call */
@@ -946,19 +945,10 @@ static int iuu_uart_baud(struct usb_serial_port *port, u32 baud,
 	return status;
 }
 
-static int set_control_lines(struct usb_device *dev, u8 value)
-{
-	return 0;
-}
-
-static void iuu_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void iuu_close(struct usb_serial_port *port)
 {
 	/* iuu_led (port,255,0,0,0); */
 	struct usb_serial *serial;
-	struct iuu_private *priv = usb_get_serial_port_data(port);
-	unsigned long flags;
-	unsigned int c_cflag;
 
 	serial = port->serial;
 	if (!serial)
@@ -968,17 +958,6 @@ static void iuu_close(struct tty_struct *tty,
 
 	iuu_uart_off(port);
 	if (serial->dev) {
-		if (tty) {
-			c_cflag = tty->termios->c_cflag;
-			if (c_cflag & HUPCL) {
-				/* drop DTR and RTS */
-				priv = usb_get_serial_port_data(port);
-				spin_lock_irqsave(&priv->lock, flags);
-				priv->line_control = 0;
-				spin_unlock_irqrestore(&priv->lock, flags);
-				set_control_lines(port->serial->dev, 0);
-			}
-		}
 		/* free writebuf */
 		/* shutdown our urbs */
 		dbg("%s - shutting down urbs", __func__);
@@ -1154,7 +1133,7 @@ static int iuu_open(struct tty_struct *tty,
 	if (result) {
 		dev_err(&port->dev, "%s - failed submitting read urb,"
 			" error %d\n", __func__, result);
-		iuu_close(tty, port, NULL);
+		iuu_close(port);
 		return -EPROTO;
 	} else {
 		dbg("%s - rxcmd OK", __func__);
diff --git a/drivers/usb/serial/keyspan.c b/drivers/usb/serial/keyspan.c
index 00daa8f7759a9..f1195a98f316d 100644
--- a/drivers/usb/serial/keyspan.c
+++ b/drivers/usb/serial/keyspan.c
@@ -1298,8 +1298,16 @@ static inline void stop_urb(struct urb *urb)
 		usb_kill_urb(urb);
 }
 
-static void keyspan_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void keyspan_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct keyspan_port_private *p_priv = usb_get_serial_port_data(port);
+
+	p_priv->rts_state = on;
+	p_priv->dtr_state = on;
+	keyspan_send_setup(port, 0);
+}
+
+static void keyspan_close(struct usb_serial_port *port)
 {
 	int			i;
 	struct usb_serial	*serial = port->serial;
@@ -1336,7 +1344,6 @@ static void keyspan_close(struct tty_struct *tty,
 			stop_urb(p_priv->out_urbs[i]);
 		}
 	}
-	tty_port_tty_set(&port->port, NULL);
 }
 
 /* download the firmware to a pre-renumeration device */
diff --git a/drivers/usb/serial/keyspan.h b/drivers/usb/serial/keyspan.h
index 38b4582e07344..0d4569b60768a 100644
--- a/drivers/usb/serial/keyspan.h
+++ b/drivers/usb/serial/keyspan.h
@@ -38,9 +38,8 @@
 static int  keyspan_open		(struct tty_struct *tty,
 					 struct usb_serial_port *port,
 					 struct file *filp);
-static void keyspan_close		(struct tty_struct *tty,
-					 struct usb_serial_port *port,
-					 struct file *filp);
+static void keyspan_close		(struct usb_serial_port *port);
+static void keyspan_dtr_rts		(struct usb_serial_port *port, int on);
 static int  keyspan_startup		(struct usb_serial *serial);
 static void keyspan_shutdown		(struct usb_serial *serial);
 static int  keyspan_write_room		(struct tty_struct *tty);
@@ -562,6 +561,7 @@ static struct usb_serial_driver keyspan_1port_device = {
 	.num_ports		= 1,
 	.open			= keyspan_open,
 	.close			= keyspan_close,
+	.dtr_rts		= keyspan_dtr_rts,
 	.write			= keyspan_write,
 	.write_room		= keyspan_write_room,
 	.set_termios		= keyspan_set_termios,
@@ -582,6 +582,7 @@ static struct usb_serial_driver keyspan_2port_device = {
 	.num_ports		= 2,
 	.open			= keyspan_open,
 	.close			= keyspan_close,
+	.dtr_rts		= keyspan_dtr_rts,
 	.write			= keyspan_write,
 	.write_room		= keyspan_write_room,
 	.set_termios		= keyspan_set_termios,
@@ -602,6 +603,7 @@ static struct usb_serial_driver keyspan_4port_device = {
 	.num_ports		= 4,
 	.open			= keyspan_open,
 	.close			= keyspan_close,
+	.dtr_rts		= keyspan_dtr_rts,
 	.write			= keyspan_write,
 	.write_room		= keyspan_write_room,
 	.set_termios		= keyspan_set_termios,
diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c
index bf1ae247da661..ab769dbea1b3e 100644
--- a/drivers/usb/serial/keyspan_pda.c
+++ b/drivers/usb/serial/keyspan_pda.c
@@ -651,6 +651,35 @@ static int keyspan_pda_chars_in_buffer(struct tty_struct *tty)
 }
 
 
+static void keyspan_pda_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct usb_serial *serial = port->serial;
+
+	if (serial->dev) {
+		if (on)
+			keyspan_pda_set_modem_info(serial, (1<<7) | (1<< 2));
+		else
+			keyspan_pda_set_modem_info(serial, 0);
+	}
+}
+
+static int keyspan_pda_carrier_raised(struct usb_serial_port *port)
+{
+	struct usb_serial *serial = port->serial;
+	unsigned char modembits;
+
+	/* If we can read the modem status and the DCD is low then
+	   carrier is not raised yet */
+	if (keyspan_pda_get_modem_info(serial, &modembits) >= 0) {
+		if (!(modembits & (1>>6)))
+			return 0;
+	}
+	/* Carrier raised, or we failed (eg disconnected) so
+	   progress accordingly */
+	return 1;
+}
+
+
 static int keyspan_pda_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp)
 {
@@ -682,13 +711,6 @@ static int keyspan_pda_open(struct tty_struct *tty,
 	priv->tx_room = room;
 	priv->tx_throttled = room ? 0 : 1;
 
-	/* the normal serial device seems to always turn on DTR and RTS here,
-	   so do the same */
-	if (tty && (tty->termios->c_cflag & CBAUD))
-		keyspan_pda_set_modem_info(serial, (1<<7) | (1<<2));
-	else
-		keyspan_pda_set_modem_info(serial, 0);
-
 	/*Start reading from the device*/
 	port->interrupt_in_urb->dev = serial->dev;
 	rc = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
@@ -700,19 +722,11 @@ static int keyspan_pda_open(struct tty_struct *tty,
 error:
 	return rc;
 }
-
-
-static void keyspan_pda_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void keyspan_pda_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 
 	if (serial->dev) {
-		/* the normal serial device seems to always shut
-		   off DTR and RTS now */
-		if (tty->termios->c_cflag & HUPCL)
-			keyspan_pda_set_modem_info(serial, 0);
-
 		/* shutdown our bulk reads and writes */
 		usb_kill_urb(port->write_urb);
 		usb_kill_urb(port->interrupt_in_urb);
@@ -839,6 +853,8 @@ static struct usb_serial_driver keyspan_pda_device = {
 	.usb_driver = 		&keyspan_pda_driver,
 	.id_table =		id_table_std,
 	.num_ports =		1,
+	.dtr_rts =		keyspan_pda_dtr_rts,
+	.carrier_raised	=	keyspan_pda_carrier_raised,
 	.open =			keyspan_pda_open,
 	.close =		keyspan_pda_close,
 	.write =		keyspan_pda_write,
diff --git a/drivers/usb/serial/kl5kusb105.c b/drivers/usb/serial/kl5kusb105.c
index fcd9082f3e7f0..fa817c66b3e8c 100644
--- a/drivers/usb/serial/kl5kusb105.c
+++ b/drivers/usb/serial/kl5kusb105.c
@@ -76,8 +76,7 @@ static int  klsi_105_startup(struct usb_serial *serial);
 static void klsi_105_shutdown(struct usb_serial *serial);
 static int  klsi_105_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void klsi_105_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void klsi_105_close(struct usb_serial_port *port);
 static int  klsi_105_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
 static void klsi_105_write_bulk_callback(struct urb *urb);
@@ -447,8 +446,7 @@ static int  klsi_105_open(struct tty_struct *tty,
 } /* klsi_105_open */
 
 
-static void klsi_105_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void klsi_105_close(struct usb_serial_port *port)
 {
 	struct klsi_105_private *priv = usb_get_serial_port_data(port);
 	int rc;
diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c
index c148544953b37..6b570498287f3 100644
--- a/drivers/usb/serial/kobil_sct.c
+++ b/drivers/usb/serial/kobil_sct.c
@@ -72,8 +72,7 @@ static int  kobil_startup(struct usb_serial *serial);
 static void kobil_shutdown(struct usb_serial *serial);
 static int  kobil_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void kobil_close(struct tty_struct *tty, struct usb_serial_port *port,
-			struct file *filp);
+static void kobil_close(struct usb_serial_port *port);
 static int  kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
 			 const unsigned char *buf, int count);
 static int  kobil_write_room(struct tty_struct *tty);
@@ -209,7 +208,7 @@ static void kobil_shutdown(struct usb_serial *serial)
 
 	for (i = 0; i < serial->num_ports; ++i) {
 		while (serial->port[i]->port.count > 0)
-			kobil_close(NULL, serial->port[i], NULL);
+			kobil_close(serial->port[i]);
 		kfree(usb_get_serial_port_data(serial->port[i]));
 		usb_set_serial_port_data(serial->port[i], NULL);
 	}
@@ -346,11 +345,11 @@ static int kobil_open(struct tty_struct *tty,
 }
 
 
-static void kobil_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void kobil_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
+	/* FIXME: Add rts/dtr methods */
 	if (port->write_urb) {
 		usb_kill_urb(port->write_urb);
 		usb_free_urb(port->write_urb);
diff --git a/drivers/usb/serial/mct_u232.c b/drivers/usb/serial/mct_u232.c
index 82930a7d50932..873795548fc0a 100644
--- a/drivers/usb/serial/mct_u232.c
+++ b/drivers/usb/serial/mct_u232.c
@@ -95,8 +95,8 @@ static int  mct_u232_startup(struct usb_serial *serial);
 static void mct_u232_shutdown(struct usb_serial *serial);
 static int  mct_u232_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void mct_u232_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void mct_u232_close(struct usb_serial_port *port);
+static void mct_u232_dtr_rts(struct usb_serial_port *port, int on);
 static void mct_u232_read_int_callback(struct urb *urb);
 static void mct_u232_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
@@ -140,6 +140,7 @@ static struct usb_serial_driver mct_u232_device = {
 	.num_ports =	     1,
 	.open =		     mct_u232_open,
 	.close =	     mct_u232_close,
+	.dtr_rts =	     mct_u232_dtr_rts,
 	.throttle =	     mct_u232_throttle,
 	.unthrottle =	     mct_u232_unthrottle,
 	.read_int_callback = mct_u232_read_int_callback,
@@ -496,29 +497,29 @@ static int  mct_u232_open(struct tty_struct *tty,
 	return retval;
 } /* mct_u232_open */
 
-
-static void mct_u232_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void mct_u232_dtr_rts(struct usb_serial_port *port, int on)
 {
-	unsigned int c_cflag;
 	unsigned int control_state;
 	struct mct_u232_private *priv = usb_get_serial_port_data(port);
-	dbg("%s port %d", __func__, port->number);
 
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		mutex_lock(&port->serial->disc_mutex);
-		if (c_cflag & HUPCL && !port->serial->disconnected) {
-			/* drop DTR and RTS */
-			spin_lock_irq(&priv->lock);
+	mutex_lock(&port->serial->disc_mutex);
+	if (!port->serial->disconnected) {
+		/* drop DTR and RTS */
+		spin_lock_irq(&priv->lock);
+		if (on)
+			priv->control_state |= TIOCM_DTR | TIOCM_RTS;
+		else
 			priv->control_state &= ~(TIOCM_DTR | TIOCM_RTS);
-			control_state = priv->control_state;
-			spin_unlock_irq(&priv->lock);
-			mct_u232_set_modem_ctrl(port->serial, control_state);
-		}
-		mutex_unlock(&port->serial->disc_mutex);
+		control_state = priv->control_state;
+		spin_unlock_irq(&priv->lock);
+		mct_u232_set_modem_ctrl(port->serial, control_state);
 	}
+	mutex_unlock(&port->serial->disc_mutex);
+}
 
+static void mct_u232_close(struct usb_serial_port *port)
+{
+	dbg("%s port %d", __func__, port->number);
 
 	if (port->serial->dev) {
 		/* shutdown our urbs */
diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c
index 24e3b5d4b4d49..9e1a013ee7f67 100644
--- a/drivers/usb/serial/mos7720.c
+++ b/drivers/usb/serial/mos7720.c
@@ -533,8 +533,7 @@ static int mos7720_chars_in_buffer(struct tty_struct *tty)
 	return chars;
 }
 
-static void mos7720_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void mos7720_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial;
 	struct moschip_port *mos7720_port;
diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c
index 84fb1dcd30dc3..10b78a37214f6 100644
--- a/drivers/usb/serial/mos7840.c
+++ b/drivers/usb/serial/mos7840.c
@@ -1135,54 +1135,12 @@ static int mos7840_chars_in_buffer(struct tty_struct *tty)
 
 }
 
-/************************************************************************
- *
- * mos7840_block_until_tx_empty
- *
- *	This function will block the close until one of the following:
- *		1. TX count are 0
- *		2. The mos7840 has stopped
- *		3. A timeout of 3 seconds without activity has expired
- *
- ************************************************************************/
-static void mos7840_block_until_tx_empty(struct tty_struct *tty,
-				struct moschip_port *mos7840_port)
-{
-	int timeout = HZ / 10;
-	int wait = 30;
-	int count;
-
-	while (1) {
-
-		count = mos7840_chars_in_buffer(tty);
-
-		/* Check for Buffer status */
-		if (count <= 0)
-			return;
-
-		/* Block the thread for a while */
-		interruptible_sleep_on_timeout(&mos7840_port->wait_chase,
-					       timeout);
-
-		/* No activity.. count down section */
-		wait--;
-		if (wait == 0) {
-			dbg("%s - TIMEOUT", __func__);
-			return;
-		} else {
-			/* Reset timeout value back to seconds */
-			wait = 30;
-		}
-	}
-}
-
 /*****************************************************************************
  * mos7840_close
  *	this function is called by the tty driver when a port is closed
  *****************************************************************************/
 
-static void mos7840_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void mos7840_close(struct usb_serial_port *port)
 {
 	struct usb_serial *serial;
 	struct moschip_port *mos7840_port;
@@ -1223,10 +1181,6 @@ static void mos7840_close(struct tty_struct *tty,
 		}
 	}
 
-	if (serial->dev)
-		/* flush and block until tx is empty */
-		mos7840_block_until_tx_empty(tty, mos7840_port);
-
 	/* While closing port, shutdown all bulk read, write  *
 	 * and interrupt read if they exists                  */
 	if (serial->dev) {
diff --git a/drivers/usb/serial/navman.c b/drivers/usb/serial/navman.c
index bcdcbb822705f..f5f3751a888ce 100644
--- a/drivers/usb/serial/navman.c
+++ b/drivers/usb/serial/navman.c
@@ -98,8 +98,7 @@ static int navman_open(struct tty_struct *tty,
 	return result;
 }
 
-static void navman_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void navman_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c
index df6539712726b..1104617334f50 100644
--- a/drivers/usb/serial/omninet.c
+++ b/drivers/usb/serial/omninet.c
@@ -66,8 +66,7 @@ static int debug;
 /* function prototypes */
 static int  omninet_open(struct tty_struct *tty, struct usb_serial_port *port,
 							struct file *filp);
-static void omninet_close(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp);
+static void omninet_close(struct usb_serial_port *port);
 static void omninet_read_bulk_callback(struct urb *urb);
 static void omninet_write_bulk_callback(struct urb *urb);
 static int  omninet_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -189,8 +188,7 @@ static int omninet_open(struct tty_struct *tty,
 	return result;
 }
 
-static void omninet_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void omninet_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 	usb_kill_urb(port->read_urb);
diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c
index b500ad10b7589..c20480aa97555 100644
--- a/drivers/usb/serial/opticon.c
+++ b/drivers/usb/serial/opticon.c
@@ -173,8 +173,7 @@ static int opticon_open(struct tty_struct *tty, struct usb_serial_port *port,
 	return result;
 }
 
-static void opticon_close(struct tty_struct *tty, struct usb_serial_port *port,
-			  struct file *filp)
+static void opticon_close(struct usb_serial_port *port)
 {
 	struct opticon_private *priv = usb_get_serial_data(port->serial);
 
diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c
index 7817b82889ca5..a16d69fadba1c 100644
--- a/drivers/usb/serial/option.c
+++ b/drivers/usb/serial/option.c
@@ -45,8 +45,9 @@
 /* Function prototypes */
 static int  option_open(struct tty_struct *tty, struct usb_serial_port *port,
 							struct file *filp);
-static void option_close(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *filp);
+static void option_close(struct usb_serial_port *port);
+static void option_dtr_rts(struct usb_serial_port *port, int on);
+
 static int  option_startup(struct usb_serial *serial);
 static void option_shutdown(struct usb_serial *serial);
 static int  option_write_room(struct tty_struct *tty);
@@ -61,7 +62,7 @@ static void option_set_termios(struct tty_struct *tty,
 static int  option_tiocmget(struct tty_struct *tty, struct file *file);
 static int  option_tiocmset(struct tty_struct *tty, struct file *file,
 				unsigned int set, unsigned int clear);
-static int  option_send_setup(struct tty_struct *tty, struct usb_serial_port *port);
+static int  option_send_setup(struct usb_serial_port *port);
 static int  option_suspend(struct usb_serial *serial, pm_message_t message);
 static int  option_resume(struct usb_serial *serial);
 
@@ -551,6 +552,7 @@ static struct usb_serial_driver option_1port_device = {
 	.num_ports         = 1,
 	.open              = option_open,
 	.close             = option_close,
+	.dtr_rts	   = option_dtr_rts,
 	.write             = option_write,
 	.write_room        = option_write_room,
 	.chars_in_buffer   = option_chars_in_buffer,
@@ -630,7 +632,7 @@ static void option_set_termios(struct tty_struct *tty,
 	dbg("%s", __func__);
 	/* Doesn't support option setting */
 	tty_termios_copy_hw(tty->termios, old_termios);
-	option_send_setup(tty, port);
+	option_send_setup(port);
 }
 
 static int option_tiocmget(struct tty_struct *tty, struct file *file)
@@ -669,7 +671,7 @@ static int option_tiocmset(struct tty_struct *tty, struct file *file,
 		portdata->rts_state = 0;
 	if (clear & TIOCM_DTR)
 		portdata->dtr_state = 0;
-	return option_send_setup(tty, port);
+	return option_send_setup(port);
 }
 
 /* Write */
@@ -897,10 +899,6 @@ static int option_open(struct tty_struct *tty,
 
 	dbg("%s", __func__);
 
-	/* Set some sane defaults */
-	portdata->rts_state = 1;
-	portdata->dtr_state = 1;
-
 	/* Reset low level data toggle and start reading from endpoints */
 	for (i = 0; i < N_IN_URB; i++) {
 		urb = portdata->in_urbs[i];
@@ -936,37 +934,43 @@ static int option_open(struct tty_struct *tty,
 				usb_pipeout(urb->pipe), 0); */
 	}
 
-	option_send_setup(tty, port);
+	option_send_setup(port);
 
 	return 0;
 }
 
-static void option_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void option_dtr_rts(struct usb_serial_port *port, int on)
 {
-	int i;
 	struct usb_serial *serial = port->serial;
 	struct option_port_private *portdata;
 
 	dbg("%s", __func__);
 	portdata = usb_get_serial_port_data(port);
+	mutex_lock(&serial->disc_mutex);
+	portdata->rts_state = on;
+	portdata->dtr_state = on;
+	if (serial->dev)
+		option_send_setup(port);
+	mutex_unlock(&serial->disc_mutex);
+}
 
-	portdata->rts_state = 0;
-	portdata->dtr_state = 0;
 
-	if (serial->dev) {
-		mutex_lock(&serial->disc_mutex);
-		if (!serial->disconnected)
-			option_send_setup(tty, port);
-		mutex_unlock(&serial->disc_mutex);
+static void option_close(struct usb_serial_port *port)
+{
+	int i;
+	struct usb_serial *serial = port->serial;
+	struct option_port_private *portdata;
+
+	dbg("%s", __func__);
+	portdata = usb_get_serial_port_data(port);
 
+	if (serial->dev) {
 		/* Stop reading/writing urbs */
 		for (i = 0; i < N_IN_URB; i++)
 			usb_kill_urb(portdata->in_urbs[i]);
 		for (i = 0; i < N_OUT_URB; i++)
 			usb_kill_urb(portdata->out_urbs[i]);
 	}
-	tty_port_tty_set(&port->port, NULL);
 }
 
 /* Helper functions used by option_setup_urbs */
@@ -1032,28 +1036,24 @@ static void option_setup_urbs(struct usb_serial *serial)
  * This is exactly the same as SET_CONTROL_LINE_STATE from the PSTN
  * CDC.
 */
-static int option_send_setup(struct tty_struct *tty,
-						struct usb_serial_port *port)
+static int option_send_setup(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct option_port_private *portdata;
 	int ifNum = serial->interface->cur_altsetting->desc.bInterfaceNumber;
+	int val = 0;
 	dbg("%s", __func__);
 
 	portdata = usb_get_serial_port_data(port);
 
-	if (tty) {
-		int val = 0;
-		if (portdata->dtr_state)
-			val |= 0x01;
-		if (portdata->rts_state)
-			val |= 0x02;
+	if (portdata->dtr_state)
+		val |= 0x01;
+	if (portdata->rts_state)
+		val |= 0x02;
 
-		return usb_control_msg(serial->dev,
-			usb_rcvctrlpipe(serial->dev, 0),
-			0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT);
-	}
-	return 0;
+	return usb_control_msg(serial->dev,
+		usb_rcvctrlpipe(serial->dev, 0),
+		0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT);
 }
 
 static int option_startup(struct usb_serial *serial)
diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c
index ba551f00f16ff..7de54781fe614 100644
--- a/drivers/usb/serial/oti6858.c
+++ b/drivers/usb/serial/oti6858.c
@@ -143,8 +143,7 @@ struct oti6858_control_pkt {
 /* function prototypes */
 static int oti6858_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void oti6858_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void oti6858_close(struct usb_serial_port *port);
 static void oti6858_set_termios(struct tty_struct *tty,
 			struct usb_serial_port *port, struct ktermios *old);
 static int oti6858_ioctl(struct tty_struct *tty, struct file *file,
@@ -622,67 +621,30 @@ static int oti6858_open(struct tty_struct *tty,
 	if (result != 0) {
 		dev_err(&port->dev, "%s(): usb_submit_urb() failed"
 			       " with error %d\n", __func__, result);
-		oti6858_close(tty, port, NULL);
+		oti6858_close(port);
 		return -EPROTO;
 	}
 
 	/* setup termios */
 	if (tty)
 		oti6858_set_termios(tty, port, &tmp_termios);
-
+	port->port.drain_delay = 256;	/* FIXME: check the FIFO length */
 	return 0;
 }
 
-static void oti6858_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void oti6858_close(struct usb_serial_port *port)
 {
 	struct oti6858_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	long timeout;
-	wait_queue_t wait;
 
 	dbg("%s(port = %d)", __func__, port->number);
 
-	/* wait for data to drain from the buffer */
 	spin_lock_irqsave(&priv->lock, flags);
-	timeout = 30 * HZ;	/* PL2303_CLOSING_WAIT */
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	dbg("%s(): entering wait loop", __func__);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (oti6858_buf_data_avail(priv->buf) == 0
-		|| timeout == 0 || signal_pending(current)
-		|| port->serial->disconnected)
-			break;
-		spin_unlock_irqrestore(&priv->lock, flags);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irqsave(&priv->lock, flags);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
-	dbg("%s(): after wait loop", __func__);
-
 	/* clear out any remaining data in the buffer */
 	oti6858_buf_clear(priv->buf);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* wait for characters to drain from the device */
-	/* (this is long enough for the entire 256 byte */
-	/* pl2303 hardware buffer to drain with no flow */
-	/* control for data rates of 1200 bps or more, */
-	/* for lower rates we should really know how much */
-	/* data is in the buffer to compute a delay */
-	/* that is not unnecessarily long) */
-	/* FIXME
-	bps = tty_get_baud_rate(tty);
-	if (bps > 1200)
-		timeout = max((HZ*2560)/bps,HZ/10);
-	else
-	*/
-		timeout = 2*HZ;
-	schedule_timeout_interruptible(timeout);
-	dbg("%s(): after schedule_timeout_interruptible()", __func__);
+	dbg("%s(): after buf_clear()", __func__);
 
 	/* cancel scheduled setup */
 	cancel_delayed_work(&priv->delayed_setup_work);
@@ -694,15 +656,6 @@ static void oti6858_close(struct tty_struct *tty,
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 	usb_kill_urb(port->interrupt_in_urb);
-
-	/*
-	if (tty && (tty->termios->c_cflag) & HUPCL) {
-		// drop DTR and RTS
-		spin_lock_irqsave(&priv->lock, flags);
-		priv->pending_setup.control &= ~CONTROL_MASK;
-		spin_unlock_irqrestore(&priv->lock, flags);
-	}
-	*/
 }
 
 static int oti6858_tiocmset(struct tty_struct *tty, struct file *file,
diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c
index 751a533a4347e..e02dc3d643c7a 100644
--- a/drivers/usb/serial/pl2303.c
+++ b/drivers/usb/serial/pl2303.c
@@ -652,69 +652,41 @@ static void pl2303_set_termios(struct tty_struct *tty,
 	kfree(buf);
 }
 
-static void pl2303_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void pl2303_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct pl2303_private *priv = usb_get_serial_port_data(port);
+	unsigned long flags;
+	u8 control;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	/* Change DTR and RTS */
+	if (on)
+		priv->line_control |= (CONTROL_DTR | CONTROL_RTS);
+	else
+		priv->line_control &= ~(CONTROL_DTR | CONTROL_RTS);
+	control = priv->line_control;
+	spin_unlock_irqrestore(&priv->lock, flags);
+	set_control_lines(port->serial->dev, control);
+}
+
+static void pl2303_close(struct usb_serial_port *port)
 {
 	struct pl2303_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	unsigned int c_cflag;
-	int bps;
-	long timeout;
-	wait_queue_t wait;
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* wait for data to drain from the buffer */
 	spin_lock_irqsave(&priv->lock, flags);
-	timeout = PL2303_CLOSING_WAIT;
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (pl2303_buf_data_avail(priv->buf) == 0 ||
-		    timeout == 0 || signal_pending(current) ||
-		    port->serial->disconnected)
-			break;
-		spin_unlock_irqrestore(&priv->lock, flags);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irqsave(&priv->lock, flags);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
 	/* clear out any remaining data in the buffer */
 	pl2303_buf_clear(priv->buf);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* wait for characters to drain from the device */
-	/* (this is long enough for the entire 256 byte */
-	/* pl2303 hardware buffer to drain with no flow */
-	/* control for data rates of 1200 bps or more, */
-	/* for lower rates we should really know how much */
-	/* data is in the buffer to compute a delay */
-	/* that is not unnecessarily long) */
-	bps = tty_get_baud_rate(tty);
-	if (bps > 1200)
-		timeout = max((HZ*2560)/bps, HZ/10);
-	else
-		timeout = 2*HZ;
-	schedule_timeout_interruptible(timeout);
-
 	/* shutdown our urbs */
 	dbg("%s - shutting down urbs", __func__);
 	usb_kill_urb(port->write_urb);
 	usb_kill_urb(port->read_urb);
 	usb_kill_urb(port->interrupt_in_urb);
 
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			/* drop DTR and RTS */
-			spin_lock_irqsave(&priv->lock, flags);
-			priv->line_control = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
-			set_control_lines(port->serial->dev, 0);
-		}
-	}
 }
 
 static int pl2303_open(struct tty_struct *tty,
@@ -748,7 +720,7 @@ static int pl2303_open(struct tty_struct *tty,
 	if (result) {
 		dev_err(&port->dev, "%s - failed submitting read urb,"
 			" error %d\n", __func__, result);
-		pl2303_close(tty, port, NULL);
+		pl2303_close(port);
 		return -EPROTO;
 	}
 
@@ -758,9 +730,10 @@ static int pl2303_open(struct tty_struct *tty,
 	if (result) {
 		dev_err(&port->dev, "%s - failed submitting interrupt urb,"
 			" error %d\n", __func__, result);
-		pl2303_close(tty, port, NULL);
+		pl2303_close(port);
 		return -EPROTO;
 	}
+	port->port.drain_delay = 256;
 	return 0;
 }
 
@@ -821,6 +794,14 @@ static int pl2303_tiocmget(struct tty_struct *tty, struct file *file)
 	return result;
 }
 
+static int pl2303_carrier_raised(struct usb_serial_port *port)
+{
+	struct pl2303_private *priv = usb_get_serial_port_data(port);
+	if (priv->line_status & UART_DCD)
+		return 1;
+	return 0;
+}
+
 static int wait_modem_info(struct usb_serial_port *port, unsigned int arg)
 {
 	struct pl2303_private *priv = usb_get_serial_port_data(port);
@@ -1125,6 +1106,8 @@ static struct usb_serial_driver pl2303_device = {
 	.num_ports =		1,
 	.open =			pl2303_open,
 	.close =		pl2303_close,
+	.dtr_rts = 		pl2303_dtr_rts,
+	.carrier_raised =	pl2303_carrier_raised,
 	.write =		pl2303_write,
 	.ioctl =		pl2303_ioctl,
 	.break_ctl =		pl2303_break_ctl,
diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 913225c616103..1319b8968d82e 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -240,57 +240,39 @@ struct sierra_port_private {
 	int ri_state;
 };
 
-static int sierra_send_setup(struct tty_struct *tty,
-						struct usb_serial_port *port)
+static int sierra_send_setup(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 	__u16 interface = 0;
+	int val = 0;
 
 	dev_dbg(&port->dev, "%s", __func__);
 
 	portdata = usb_get_serial_port_data(port);
 
-	if (tty) {
-		int val = 0;
-		if (portdata->dtr_state)
-			val |= 0x01;
-		if (portdata->rts_state)
-			val |= 0x02;
-
-		/* If composite device then properly report interface */
-		if (serial->num_ports == 1) {
-			interface = sierra_calc_interface(serial);
-
-			/* Control message is sent only to interfaces with
-			 * interrupt_in endpoints
-			 */
-			if (port->interrupt_in_urb) {
-				/* send control message */
-				return usb_control_msg(serial->dev,
-					usb_rcvctrlpipe(serial->dev, 0),
-					0x22, 0x21, val, interface,
-					NULL, 0, USB_CTRL_SET_TIMEOUT);
-			}
-		}
-
-		/* Otherwise the need to do non-composite mapping */
-		else {
-			if (port->bulk_out_endpointAddress == 2)
-				interface = 0;
-			else if (port->bulk_out_endpointAddress == 4)
-				interface = 1;
-			else if (port->bulk_out_endpointAddress == 5)
-				interface = 2;
-
-			return usb_control_msg(serial->dev,
-				usb_rcvctrlpipe(serial->dev, 0),
-				0x22, 0x21, val, interface,
-				NULL, 0, USB_CTRL_SET_TIMEOUT);
-
-		}
+	if (portdata->dtr_state)
+		val |= 0x01;
+	if (portdata->rts_state)
+		val |= 0x02;
+
+	/* If composite device then properly report interface */
+	if (serial->num_ports == 1)
+		interface = sierra_calc_interface(serial);
+
+	/* Otherwise the need to do non-composite mapping */
+	else {
+		if (port->bulk_out_endpointAddress == 2)
+			interface = 0;
+		else if (port->bulk_out_endpointAddress == 4)
+			interface = 1;
+		else if (port->bulk_out_endpointAddress == 5)
+			interface = 2;
 	}
-
+	return usb_control_msg(serial->dev,
+			usb_rcvctrlpipe(serial->dev, 0),
+			0x22, 0x21, val, interface,
+			NULL, 0, USB_CTRL_SET_TIMEOUT);
 	return 0;
 }
 
@@ -299,7 +281,7 @@ static void sierra_set_termios(struct tty_struct *tty,
 {
 	dev_dbg(&port->dev, "%s", __func__);
 	tty_termios_copy_hw(tty->termios, old_termios);
-	sierra_send_setup(tty, port);
+	sierra_send_setup(port);
 }
 
 static int sierra_tiocmget(struct tty_struct *tty, struct file *file)
@@ -338,7 +320,7 @@ static int sierra_tiocmset(struct tty_struct *tty, struct file *file,
 		portdata->rts_state = 0;
 	if (clear & TIOCM_DTR)
 		portdata->dtr_state = 0;
-	return sierra_send_setup(tty, port);
+	return sierra_send_setup(port);
 }
 
 static void sierra_outdat_callback(struct urb *urb)
@@ -598,7 +580,7 @@ static int sierra_open(struct tty_struct *tty,
 		}
 	}
 
-	sierra_send_setup(tty, port);
+	sierra_send_setup(port);
 
 	/* start up the interrupt endpoint if we have one */
 	if (port->interrupt_in_urb) {
@@ -610,32 +592,38 @@ static int sierra_open(struct tty_struct *tty,
 	return 0;
 }
 
-static void sierra_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void sierra_dtr_rts(struct usb_serial_port *port, int on)
 {
-	int i;
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 
-	dev_dbg(&port->dev, "%s", __func__);
 	portdata = usb_get_serial_port_data(port);
-
-	portdata->rts_state = 0;
-	portdata->dtr_state = 0;
+	portdata->rts_state = on;
+	portdata->dtr_state = on;
 
 	if (serial->dev) {
 		mutex_lock(&serial->disc_mutex);
 		if (!serial->disconnected)
-			sierra_send_setup(tty, port);
+			sierra_send_setup(port);
 		mutex_unlock(&serial->disc_mutex);
+	}
+}
+
+static void sierra_close(struct usb_serial_port *port)
+{
+	int i;
+	struct usb_serial *serial = port->serial;
+	struct sierra_port_private *portdata;
 
+	dev_dbg(&port->dev, "%s", __func__);
+	portdata = usb_get_serial_port_data(port);
+
+	if (serial->dev) {
 		/* Stop reading/writing urbs */
 		for (i = 0; i < N_IN_URB; i++)
 			usb_kill_urb(portdata->in_urbs[i]);
 	}
-
 	usb_kill_urb(port->interrupt_in_urb);
-	tty_port_tty_set(&port->port, NULL);
 }
 
 static int sierra_startup(struct usb_serial *serial)
@@ -737,6 +725,7 @@ static struct usb_serial_driver sierra_device = {
 	.probe		   = sierra_probe,
 	.open              = sierra_open,
 	.close             = sierra_close,
+	.dtr_rts	   = sierra_dtr_rts,
 	.write             = sierra_write,
 	.write_room        = sierra_write_room,
 	.set_termios       = sierra_set_termios,
diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c
index 5e7528cc81a8e..8f7ed8f139961 100644
--- a/drivers/usb/serial/spcp8x5.c
+++ b/drivers/usb/serial/spcp8x5.c
@@ -446,66 +446,47 @@ static void spcp8x5_set_workMode(struct usb_device *dev, u16 value,
 			"RTSCTS usb_control_msg(enable flowctrl) = %d\n", ret);
 }
 
+static int spcp8x5_carrier_raised(struct usb_serial_port *port)
+{
+	struct spcp8x5_private *priv = usb_get_serial_port_data(port);
+	if (priv->line_status & MSR_STATUS_LINE_DCD)
+		return 1;
+	return 0;
+}
+
+static void spcp8x5_dtr_rts(struct usb_serial_port *port, int on)
+{
+	struct spcp8x5_private *priv = usb_get_serial_port_data(port);
+	unsigned long flags;
+	u8 control;
+
+	spin_lock_irqsave(&priv->lock, flags);
+	if (on)
+		priv->line_control = MCR_CONTROL_LINE_DTR
+						| MCR_CONTROL_LINE_RTS;
+	else
+		priv->line_control &= ~ (MCR_CONTROL_LINE_DTR
+						| MCR_CONTROL_LINE_RTS);
+	control = priv->line_control;
+	spin_unlock_irqrestore(&priv->lock, flags);
+	spcp8x5_set_ctrlLine(port->serial->dev, control , priv->type);
+}
+
 /* close the serial port. We should wait for data sending to device 1st and
  * then kill all urb. */
-static void spcp8x5_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void spcp8x5_close(struct usb_serial_port *port)
 {
 	struct spcp8x5_private *priv = usb_get_serial_port_data(port);
 	unsigned long flags;
-	unsigned int c_cflag;
-	int bps;
-	long timeout;
-	wait_queue_t wait;
 	int result;
 
 	dbg("%s - port %d", __func__, port->number);
 
-	/* wait for data to drain from the buffer */
 	spin_lock_irqsave(&priv->lock, flags);
-	timeout = SPCP8x5_CLOSING_WAIT;
-	init_waitqueue_entry(&wait, current);
-	add_wait_queue(&tty->write_wait, &wait);
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (ringbuf_avail_data(priv->buf) == 0 ||
-		    timeout == 0 || signal_pending(current))
-			break;
-		spin_unlock_irqrestore(&priv->lock, flags);
-		timeout = schedule_timeout(timeout);
-		spin_lock_irqsave(&priv->lock, flags);
-	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&tty->write_wait, &wait);
-
 	/* clear out any remaining data in the buffer */
 	clear_ringbuf(priv->buf);
 	spin_unlock_irqrestore(&priv->lock, flags);
 
-	/* wait for characters to drain from the device (this is long enough
-	 * for the entire all byte spcp8x5 hardware buffer to drain with no
-	 * flow control for data rates of 1200 bps or more, for lower rates we
-	 * should really know how much data is in the buffer to compute a delay
-	 * that is not unnecessarily long) */
-	bps = tty_get_baud_rate(tty);
-	if (bps > 1200)
-		timeout = max((HZ*2560) / bps, HZ/10);
-	else
-		timeout = 2*HZ;
-	set_current_state(TASK_INTERRUPTIBLE);
-	schedule_timeout(timeout);
-
-	/* clear control lines */
-	if (tty) {
-		c_cflag = tty->termios->c_cflag;
-		if (c_cflag & HUPCL) {
-			spin_lock_irqsave(&priv->lock, flags);
-			priv->line_control = 0;
-			spin_unlock_irqrestore(&priv->lock, flags);
-			spcp8x5_set_ctrlLine(port->serial->dev, 0 , priv->type);
-		}
-	}
-
 	/* kill urb */
 	if (port->write_urb != NULL) {
 		result = usb_unlink_urb(port->write_urb);
@@ -665,13 +646,6 @@ static int spcp8x5_open(struct tty_struct *tty,
 	if (ret)
 		return ret;
 
-	spin_lock_irqsave(&priv->lock, flags);
-	if (tty && (tty->termios->c_cflag & CBAUD))
-		priv->line_control = MCR_DTR | MCR_RTS;
-	else
-		priv->line_control = 0;
-	spin_unlock_irqrestore(&priv->lock, flags);
-
 	spcp8x5_set_ctrlLine(serial->dev, priv->line_control , priv->type);
 
 	/* Setup termios */
@@ -691,9 +665,10 @@ static int spcp8x5_open(struct tty_struct *tty,
 	port->read_urb->dev = serial->dev;
 	ret = usb_submit_urb(port->read_urb, GFP_KERNEL);
 	if (ret) {
-		spcp8x5_close(tty, port, NULL);
+		spcp8x5_close(port);
 		return -EPROTO;
 	}
+	port->port.drain_delay = 256;
 	return 0;
 }
 
@@ -1033,6 +1008,8 @@ static struct usb_serial_driver spcp8x5_device = {
 	.num_ports		= 1,
 	.open 			= spcp8x5_open,
 	.close 			= spcp8x5_close,
+	.dtr_rts		= spcp8x5_dtr_rts,
+	.carrier_raised		= spcp8x5_carrier_raised,
 	.write 			= spcp8x5_write,
 	.set_termios 		= spcp8x5_set_termios,
 	.ioctl 			= spcp8x5_ioctl,
diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c
index 69879e4379402..8b07ebc6baeb4 100644
--- a/drivers/usb/serial/symbolserial.c
+++ b/drivers/usb/serial/symbolserial.c
@@ -152,8 +152,7 @@ static int symbol_open(struct tty_struct *tty, struct usb_serial_port *port,
 	return result;
 }
 
-static void symbol_close(struct tty_struct *tty, struct usb_serial_port *port,
-			  struct file *filp)
+static void symbol_close(struct usb_serial_port *port)
 {
 	struct symbol_private *priv = usb_get_serial_data(port->serial);
 
diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c
index 0a64bac306ee6..42cb04c403bee 100644
--- a/drivers/usb/serial/ti_usb_3410_5052.c
+++ b/drivers/usb/serial/ti_usb_3410_5052.c
@@ -100,8 +100,7 @@ static int ti_startup(struct usb_serial *serial);
 static void ti_shutdown(struct usb_serial *serial);
 static int ti_open(struct tty_struct *tty, struct usb_serial_port *port,
 		struct file *file);
-static void ti_close(struct tty_struct *tty, struct usb_serial_port *port,
-		struct file *file);
+static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
 		const unsigned char *data, int count);
 static int ti_write_room(struct tty_struct *tty);
@@ -647,8 +646,7 @@ static int ti_open(struct tty_struct *tty,
 }
 
 
-static void ti_close(struct tty_struct *tty, struct usb_serial_port *port,
-							struct file *file)
+static void ti_close(struct usb_serial_port *port)
 {
 	struct ti_device *tdev;
 	struct ti_port *tport;
diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c
index f331e2bde88ac..1967a7edc10c5 100644
--- a/drivers/usb/serial/usb-serial.c
+++ b/drivers/usb/serial/usb-serial.c
@@ -238,9 +238,11 @@ static int serial_open (struct tty_struct *tty, struct file *filp)
 			goto bailout_interface_put;
 		mutex_unlock(&serial->disc_mutex);
 	}
-
 	mutex_unlock(&port->mutex);
-	return 0;
+	/* Now do the correct tty layer semantics */
+	retval = tty_port_block_til_ready(&port->port, tty, filp);
+	if (retval == 0)
+		return 0;
 
 bailout_interface_put:
 	usb_autopm_put_interface(serial->interface);
@@ -259,64 +261,89 @@ static int serial_open (struct tty_struct *tty, struct file *filp)
 	return retval;
 }
 
-static void serial_close(struct tty_struct *tty, struct file *filp)
+/**
+ *	serial_do_down		-	shut down hardware
+ *	@port: port to shut down
+ *
+ *	Shut down a USB port unless it is the console. We never shut down the
+ *	console hardware as it will always be in use.
+ *
+ *	Don't free any resources at this point
+ */
+static void serial_do_down(struct usb_serial_port *port)
 {
-	struct usb_serial_port *port = tty->driver_data;
+	struct usb_serial_driver *drv = port->serial->type;
 	struct usb_serial *serial;
 	struct module *owner;
-	int count;
 
-	if (!port)
+	/* The console is magical, do not hang up the console hardware
+	   or there will be tears */
+	if (port->console)
 		return;
 
-	dbg("%s - port %d", __func__, port->number);
-
 	mutex_lock(&port->mutex);
 	serial = port->serial;
 	owner = serial->type->driver.owner;
 
-	if (port->port.count == 0) {
-		mutex_unlock(&port->mutex);
-		return;
-	}
-
-	if (port->port.count == 1)
-		/* only call the device specific close if this
-		 * port is being closed by the last owner. Ensure we do
-		 * this before we drop the port count. The call is protected
-		 * by the port mutex
-		 */
-		serial->type->close(tty, port, filp);
-
-	if (port->port.count == (port->console ? 2 : 1)) {
-		struct tty_struct *tty = tty_port_tty_get(&port->port);
-		if (tty) {
-			/* We must do this before we drop the port count to
-			   zero. */
-			if (tty->driver_data)
-				tty->driver_data = NULL;
-			tty_port_tty_set(&port->port, NULL);
-			tty_kref_put(tty);
-		}
-	}
+	if (drv->close)
+		drv->close(port);
 
-	--port->port.count;
-	count = port->port.count;
 	mutex_unlock(&port->mutex);
-	put_device(&port->dev);
+}
+
+/**
+ *	serial_do_free		-	free resources post close/hangup
+ *	@port: port to free up
+ *
+ *	Do the resource freeing and refcount dropping for the port. We must
+ *	be careful about ordering and we must avoid freeing up the console.
+ */
 
+static void serial_do_free(struct usb_serial_port *port)
+{
+	struct usb_serial *serial;
+	struct module *owner;
+
+	/* The console is magical, do not hang up the console hardware
+	   or there will be tears */
+	if (port->console)
+		return;
+
+	serial = port->serial;
+	owner = serial->type->driver.owner;
+	put_device(&port->dev);
 	/* Mustn't dereference port any more */
-	if (count == 0) {
-		mutex_lock(&serial->disc_mutex);
-		if (!serial->disconnected)
-			usb_autopm_put_interface(serial->interface);
-		mutex_unlock(&serial->disc_mutex);
-	}
+	mutex_lock(&serial->disc_mutex);
+	if (!serial->disconnected)
+		usb_autopm_put_interface(serial->interface);
+	mutex_unlock(&serial->disc_mutex);
 	usb_serial_put(serial);
-
 	/* Mustn't dereference serial any more */
-	if (count == 0)
-		module_put(owner);
+	module_put(owner);
+}
+
+static void serial_close(struct tty_struct *tty, struct file *filp)
+{
+	struct usb_serial_port *port = tty->driver_data;
+
+	dbg("%s - port %d", __func__, port->number);
+
+
+	if (tty_port_close_start(&port->port, tty, filp) == 0)
+		return;
+
+	serial_do_down(port);		
+	tty_port_close_end(&port->port, tty);
+	tty_port_tty_set(&port->port, NULL);
+	serial_do_free(port);
+}
+
+static void serial_hangup(struct tty_struct *tty)
+{
+	struct usb_serial_port *port = tty->driver_data;
+	serial_do_down(port);
+	tty_port_hangup(&port->port);
+	serial_do_free(port);
 }
 
 static int serial_write(struct tty_struct *tty, const unsigned char *buf,
@@ -648,6 +675,29 @@ static struct usb_serial_driver *search_serial_device(
 	return NULL;
 }
 
+static int serial_carrier_raised(struct tty_port *port)
+{
+	struct usb_serial_port *p = container_of(port, struct usb_serial_port, port);
+	struct usb_serial_driver *drv = p->serial->type;
+	if (drv->carrier_raised)
+		return drv->carrier_raised(p);
+	/* No carrier control - don't block */
+	return 1;	
+}
+
+static void serial_dtr_rts(struct tty_port *port, int on)
+{
+	struct usb_serial_port *p = container_of(port, struct usb_serial_port, port);
+	struct usb_serial_driver *drv = p->serial->type;
+	if (drv->dtr_rts)
+		drv->dtr_rts(p, on);
+}
+
+static const struct tty_port_operations serial_port_ops = {
+	.carrier_raised = serial_carrier_raised,
+	.dtr_rts = serial_dtr_rts,
+};
+
 int usb_serial_probe(struct usb_interface *interface,
 			       const struct usb_device_id *id)
 {
@@ -841,6 +891,7 @@ int usb_serial_probe(struct usb_interface *interface,
 		if (!port)
 			goto probe_error;
 		tty_port_init(&port->port);
+		port->port.ops = &serial_port_ops;
 		port->serial = serial;
 		spin_lock_init(&port->lock);
 		mutex_init(&port->mutex);
@@ -1071,6 +1122,9 @@ void usb_serial_disconnect(struct usb_interface *interface)
 		if (port) {
 			struct tty_struct *tty = tty_port_tty_get(&port->port);
 			if (tty) {
+				/* The hangup will occur asynchronously but
+				   the object refcounts will sort out all the
+				   cleanup */
 				tty_hangup(tty);
 				tty_kref_put(tty);
 			}
@@ -1135,6 +1189,7 @@ static const struct tty_operations serial_ops = {
 	.open =			serial_open,
 	.close =		serial_close,
 	.write =		serial_write,
+	.hangup = 		serial_hangup,
 	.write_room =		serial_write_room,
 	.ioctl =		serial_ioctl,
 	.set_termios =		serial_set_termios,
@@ -1147,6 +1202,7 @@ static const struct tty_operations serial_ops = {
 	.proc_fops =		&serial_proc_fops,
 };
 
+
 struct tty_driver *usb_serial_tty_driver;
 
 static int __init usb_serial_init(void)
diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c
index 5ac414bda7181..b15f1c0e1d4ac 100644
--- a/drivers/usb/serial/visor.c
+++ b/drivers/usb/serial/visor.c
@@ -38,8 +38,7 @@
 /* function prototypes for a handspring visor */
 static int  visor_open(struct tty_struct *tty, struct usb_serial_port *port,
 					struct file *filp);
-static void visor_close(struct tty_struct *tty, struct usb_serial_port *port,
-					struct file *filp);
+static void visor_close(struct usb_serial_port *port);
 static int  visor_write(struct tty_struct *tty, struct usb_serial_port *port,
 					const unsigned char *buf, int count);
 static int  visor_write_room(struct tty_struct *tty);
@@ -324,8 +323,7 @@ static int visor_open(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static void visor_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void visor_close(struct usb_serial_port *port)
 {
 	struct visor_private *priv = usb_get_serial_port_data(port);
 	unsigned char *transfer_buffer;
diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c
index 5335d3211c073..7c7295d09f344 100644
--- a/drivers/usb/serial/whiteheat.c
+++ b/drivers/usb/serial/whiteheat.c
@@ -147,8 +147,7 @@ static int  whiteheat_attach(struct usb_serial *serial);
 static void whiteheat_shutdown(struct usb_serial *serial);
 static int  whiteheat_open(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-static void whiteheat_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+static void whiteheat_close(struct usb_serial_port *port);
 static int  whiteheat_write(struct tty_struct *tty,
 			struct usb_serial_port *port,
 			const unsigned char *buf, int count);
@@ -712,8 +711,7 @@ static int whiteheat_open(struct tty_struct *tty,
 }
 
 
-static void whiteheat_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void whiteheat_close(struct usb_serial_port *port)
 {
 	struct whiteheat_private *info = usb_get_serial_port_data(port);
 	struct whiteheat_urb_wrap *wrap;
@@ -723,31 +721,7 @@ static void whiteheat_close(struct tty_struct *tty,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	mutex_lock(&port->serial->disc_mutex);
-	/* filp is NULL when called from usb_serial_disconnect */
-	if ((filp && (tty_hung_up_p(filp))) || port->serial->disconnected) {
-		mutex_unlock(&port->serial->disc_mutex);
-		return;
-	}
-	mutex_unlock(&port->serial->disc_mutex);
-
-	tty->closing = 1;
-
-/*
- * Not currently in use; tty_wait_until_sent() calls
- * serial_chars_in_buffer() which deadlocks on the second semaphore
- * acquisition. This should be fixed at some point. Greg's been
- * notified.
-	if ((filp->f_flags & (O_NDELAY | O_NONBLOCK)) == 0) {
-		tty_wait_until_sent(tty, CLOSING_DELAY);
-	}
-*/
-
-	tty_driver_flush_buffer(tty);
-	tty_ldisc_flush(tty);
-
 	firm_report_tx_done(port);
-
 	firm_close(port);
 
 	/* shutdown our bulk reads and writes */
@@ -775,10 +749,7 @@ static void whiteheat_close(struct tty_struct *tty,
 	}
 	spin_unlock_irq(&info->lock);
 	mutex_unlock(&info->deathwarrant);
-
 	stop_command_port(port->serial);
-
-	tty->closing = 0;
 }
 
 
diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h
index 625e9e4639c68..8cdfed738fe4b 100644
--- a/include/linux/usb/serial.h
+++ b/include/linux/usb/serial.h
@@ -224,8 +224,7 @@ struct usb_serial_driver {
 	/* Called by console with tty = NULL and by tty */
 	int  (*open)(struct tty_struct *tty,
 			struct usb_serial_port *port, struct file *filp);
-	void (*close)(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+	void (*close)(struct usb_serial_port *port);
 	int  (*write)(struct tty_struct *tty, struct usb_serial_port *port,
 			const unsigned char *buf, int count);
 	/* Called only by the tty layer */
@@ -241,6 +240,10 @@ struct usb_serial_driver {
 	int  (*tiocmget)(struct tty_struct *tty, struct file *file);
 	int  (*tiocmset)(struct tty_struct *tty, struct file *file,
 			 unsigned int set, unsigned int clear);
+	/* Called by the tty layer for port level work. There may or may not
+	   be an attached tty at this point */
+	void (*dtr_rts)(struct usb_serial_port *port, int on);
+	int  (*carrier_raised)(struct usb_serial_port *port);
 	/* USB events */
 	void (*read_int_callback)(struct urb *urb);
 	void (*write_int_callback)(struct urb *urb);
@@ -283,8 +286,7 @@ extern int usb_serial_generic_open(struct tty_struct *tty,
 		struct usb_serial_port *port, struct file *filp);
 extern int usb_serial_generic_write(struct tty_struct *tty,
 	struct usb_serial_port *port, const unsigned char *buf, int count);
-extern void usb_serial_generic_close(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp);
+extern void usb_serial_generic_close(struct usb_serial_port *port);
 extern int usb_serial_generic_resume(struct usb_serial *serial);
 extern int usb_serial_generic_write_room(struct tty_struct *tty);
 extern int usb_serial_generic_chars_in_buffer(struct tty_struct *tty);
-- 
GitLab


From 739e0285cbb162c8ddd0061fda581ee54a34c19a Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:27:50 +0100
Subject: [PATCH 1978/2198] tty: Update cdc_acm

The CDC ACM driver uses the tty layer correctly so needs conversion. Start by
adding and initializing the port structures.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/class/cdc-acm.c | 5 +++++
 drivers/usb/class/cdc-acm.h | 3 ++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 7a1164dd1d37f..41d4ca527f823 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -89,6 +89,9 @@ static DEFINE_MUTEX(open_mutex);
 
 #define ACM_READY(acm)	(acm && acm->dev && acm->used)
 
+static const struct tty_port_operations acm_port_ops = {
+};
+
 #ifdef VERBOSE_DEBUG
 #define verbose	1
 #else
@@ -1082,6 +1085,8 @@ static int acm_probe (struct usb_interface *intf,
 	spin_lock_init(&acm->read_lock);
 	mutex_init(&acm->mutex);
 	acm->rx_endpoint = usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress);
+	tty_port_init(&acm->port);
+	acm->port.ops = &acm_port_ops;
 
 	buf = usb_buffer_alloc(usb_dev, ctrlsize, GFP_KERNEL, &acm->ctrl_dma);
 	if (!buf) {
diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h
index 1f95e7aa1b665..19967cdb2691d 100644
--- a/drivers/usb/class/cdc-acm.h
+++ b/drivers/usb/class/cdc-acm.h
@@ -90,7 +90,8 @@ struct acm {
 	struct usb_interface *control;			/* control interface */
 	struct usb_interface *data;			/* data interface */
 	struct tty_struct *tty;				/* the corresponding tty */
-	struct urb *ctrlurb;			/* urbs */
+	struct tty_port port;			 	/* our tty port data */
+	struct urb *ctrlurb;				/* urbs */
 	u8 *ctrl_buffer;				/* buffers of urbs */
 	dma_addr_t ctrl_dma;				/* dma handles of buffers */
 	u8 *country_codes;				/* country codes from device */
-- 
GitLab


From 5ba5a5d21204f61ff655b322a90ed91c75194e4b Mon Sep 17 00:00:00 2001
From: Paul Fulghum <paulkf@microgate.com>
Date: Thu, 11 Jun 2009 12:28:37 +0100
Subject: [PATCH 1979/2198] tty: synclink_gt add receive pio mode

Add receive programmed IO mode to reduce receive latency
when using low data rates. The receive FIFO trigger
level of 128 bytes used in DMA mode creates excessive latency
when operating at low data rates. PIO mode is selected when user
application requests data in blocks of less than 128 bytes.

Signed-off-by: Paul Fulghum <paulkf@microgate.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/synclink_gt.c | 77 +++++++++++++++++++++++++++++++++-----
 1 file changed, 68 insertions(+), 9 deletions(-)

diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c
index 67986ea0d479c..1386625fc4caa 100644
--- a/drivers/char/synclink_gt.c
+++ b/drivers/char/synclink_gt.c
@@ -214,6 +214,7 @@ struct slgt_desc
 #define set_desc_next(a,b) (a).next   = cpu_to_le32((unsigned int)(b))
 #define set_desc_count(a,b)(a).count  = cpu_to_le16((unsigned short)(b))
 #define set_desc_eof(a,b)  (a).status = cpu_to_le16((b) ? (le16_to_cpu((a).status) | BIT0) : (le16_to_cpu((a).status) & ~BIT0))
+#define set_desc_status(a, b) (a).status = cpu_to_le16((unsigned short)(b))
 #define desc_count(a)      (le16_to_cpu((a).count))
 #define desc_status(a)     (le16_to_cpu((a).status))
 #define desc_complete(a)   (le16_to_cpu((a).status) & BIT15)
@@ -297,6 +298,7 @@ struct slgt_info {
 	u32 max_frame_size;       /* as set by device config */
 
 	unsigned int rbuf_fill_level;
+	unsigned int rx_pio;
 	unsigned int if_mode;
 	unsigned int base_clock;
 
@@ -331,6 +333,8 @@ struct slgt_info {
 	struct slgt_desc *rbufs;
 	unsigned int rbuf_current;
 	unsigned int rbuf_index;
+	unsigned int rbuf_fill_index;
+	unsigned short rbuf_fill_count;
 
 	unsigned int tbuf_count;
 	struct slgt_desc *tbufs;
@@ -2110,6 +2114,40 @@ static void ri_change(struct slgt_info *info, unsigned short status)
 	info->pending_bh |= BH_STATUS;
 }
 
+static void isr_rxdata(struct slgt_info *info)
+{
+	unsigned int count = info->rbuf_fill_count;
+	unsigned int i = info->rbuf_fill_index;
+	unsigned short reg;
+
+	while (rd_reg16(info, SSR) & IRQ_RXDATA) {
+		reg = rd_reg16(info, RDR);
+		DBGISR(("isr_rxdata %s RDR=%04X\n", info->device_name, reg));
+		if (desc_complete(info->rbufs[i])) {
+			/* all buffers full */
+			rx_stop(info);
+			info->rx_restart = 1;
+			continue;
+		}
+		info->rbufs[i].buf[count++] = (unsigned char)reg;
+		/* async mode saves status byte to buffer for each data byte */
+		if (info->params.mode == MGSL_MODE_ASYNC)
+			info->rbufs[i].buf[count++] = (unsigned char)(reg >> 8);
+		if (count == info->rbuf_fill_level || (reg & BIT10)) {
+			/* buffer full or end of frame */
+			set_desc_count(info->rbufs[i], count);
+			set_desc_status(info->rbufs[i], BIT15 | (reg >> 8));
+			info->rbuf_fill_count = count = 0;
+			if (++i == info->rbuf_count)
+				i = 0;
+			info->pending_bh |= BH_RECEIVE;
+		}
+	}
+
+	info->rbuf_fill_index = i;
+	info->rbuf_fill_count = count;
+}
+
 static void isr_serial(struct slgt_info *info)
 {
 	unsigned short status = rd_reg16(info, SSR);
@@ -2125,6 +2163,8 @@ static void isr_serial(struct slgt_info *info)
 			if (info->tx_count)
 				isr_txeom(info, status);
 		}
+		if (info->rx_pio && (status & IRQ_RXDATA))
+			isr_rxdata(info);
 		if ((status & IRQ_RXBREAK) && (status & RXBREAK)) {
 			info->icount.brk++;
 			/* process break detection if tty control allows */
@@ -2141,7 +2181,8 @@ static void isr_serial(struct slgt_info *info)
 	} else {
 		if (status & (IRQ_TXIDLE + IRQ_TXUNDER))
 			isr_txeom(info, status);
-
+		if (info->rx_pio && (status & IRQ_RXDATA))
+			isr_rxdata(info);
 		if (status & IRQ_RXIDLE) {
 			if (status & RXIDLE)
 				info->icount.rxidle++;
@@ -2642,6 +2683,10 @@ static int rx_enable(struct slgt_info *info, int enable)
 			return -EINVAL;
 		}
 		info->rbuf_fill_level = rbuf_fill_level;
+		if (rbuf_fill_level < 128)
+			info->rx_pio = 1; /* PIO mode */
+		else
+			info->rx_pio = 0; /* DMA mode */
 		rx_stop(info); /* restart receiver to use new fill level */
 	}
 
@@ -3844,15 +3889,27 @@ static void rx_start(struct slgt_info *info)
 	rdma_reset(info);
 	reset_rbufs(info);
 
-	/* set 1st descriptor address */
-	wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
-
-	if (info->params.mode != MGSL_MODE_ASYNC) {
-		/* enable rx DMA and DMA interrupt */
-		wr_reg32(info, RDCSR, (BIT2 + BIT0));
+	if (info->rx_pio) {
+		/* rx request when rx FIFO not empty */
+		wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) & ~BIT14));
+		slgt_irq_on(info, IRQ_RXDATA);
+		if (info->params.mode == MGSL_MODE_ASYNC) {
+			/* enable saving of rx status */
+			wr_reg32(info, RDCSR, BIT6);
+		}
 	} else {
-		/* enable saving of rx status, rx DMA and DMA interrupt */
-		wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+		/* rx request when rx FIFO half full */
+		wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) | BIT14));
+		/* set 1st descriptor address */
+		wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
+
+		if (info->params.mode != MGSL_MODE_ASYNC) {
+			/* enable rx DMA and DMA interrupt */
+			wr_reg32(info, RDCSR, (BIT2 + BIT0));
+		} else {
+			/* enable saving of rx status, rx DMA and DMA interrupt */
+			wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+		}
 	}
 
 	slgt_irq_on(info, IRQ_RXOVER);
@@ -4470,6 +4527,8 @@ static void free_rbufs(struct slgt_info *info, unsigned int i, unsigned int last
 static void reset_rbufs(struct slgt_info *info)
 {
 	free_rbufs(info, 0, info->rbuf_count - 1);
+	info->rbuf_fill_index = 0;
+	info->rbuf_fill_count = 0;
 }
 
 /*
-- 
GitLab


From 97e87f8ebe978e881c7325ba490574bd5500b133 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:29:27 +0100
Subject: [PATCH 1980/2198] tty: cyclades, plx9060 casts cleanup

Remove ugly all-over-the-code casts of ctl_addr to 9060 space.
Add an union to the cyclades_card structure, which contains
a pointer to both 9050 and 9060 spaces.

The 9050 space layout is unknown, so let it still as a void
__iomem pointer.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c  | 65 ++++++++++++++++++++--------------------
 include/linux/cyclades.h | 23 +++++++-------
 2 files changed, 46 insertions(+), 42 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index ccf68a9e24b75..2cbf74134f1b1 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -666,12 +666,10 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define IS_CYC_Z(card) ((card).num_chips == (unsigned int)-1)
 
 #define Z_FPGA_CHECK(card) \
-	((readl(&((struct RUNTIME_9060 __iomem *) \
-		((card).ctl_addr))->init_ctrl) & (1<<17)) != 0)
+	((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0)
 
-#define ISZLOADED(card)	(((ZO_V1 == readl(&((struct RUNTIME_9060 __iomem *) \
-			((card).ctl_addr))->mail_box_0)) || \
-			Z_FPGA_CHECK(card)) && \
+#define ISZLOADED(card)	(((ZO_V1 == readl(&(card).ctl_addr.p9060->mail_box_0)) \
+			|| Z_FPGA_CHECK(card)) && \
 			(ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
 			((card).base_addr+ID_ADDRESS))->signature)))
 
@@ -1400,14 +1398,12 @@ cyz_fetch_msg(struct cyclades_card *cinfo,
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 
-	loc_doorbell = readl(&((struct RUNTIME_9060 __iomem *)
-				  (cinfo->ctl_addr))->loc_doorbell);
+	loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell);
 	if (loc_doorbell) {
 		*cmd = (char)(0xff & loc_doorbell);
 		*channel = readl(&board_ctrl->fwcmd_channel);
 		*param = (__u32) readl(&board_ctrl->fwcmd_param);
-		cy_writel(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->
-			  loc_doorbell, 0xffffffff);
+		cy_writel(&cinfo->ctl_addr.p9060->loc_doorbell, 0xffffffff);
 		return 1;
 	}
 	return 0;
@@ -1431,8 +1427,7 @@ cyz_issue_cmd(struct cyclades_card *cinfo,
 	board_ctrl = &zfw_ctrl->board_ctrl;
 
 	index = 0;
-	pci_doorbell =
-	    &((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->pci_doorbell;
+	pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell;
 	while ((readl(pci_doorbell) & 0xff) != 0) {
 		if (index++ == 1000)
 			return (int)(readl(pci_doorbell) & 0xff);
@@ -1635,8 +1630,7 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 	fw_ver = readl(&board_ctrl->fw_version);
-	hw_ver = readl(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->
-			mail_box_0);
+	hw_ver = readl(&cinfo->ctl_addr.p9060->mail_box_0);
 
 	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
 		special_count = 0;
@@ -2394,8 +2388,8 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
 		if (!ISZLOADED(*cinfo)) {
-			if (((ZE_V1 == readl(&((struct RUNTIME_9060 __iomem *)
-					 (cinfo->ctl_addr))->mail_box_0)) &&
+			if (((ZE_V1 == readl(&cinfo->ctl_addr.p9060->
+							mail_box_0)) &&
 					Z_FPGA_CHECK(*cinfo)) &&
 					(ZFIRM_HLT == readl(
 						&firm_id->signature))) {
@@ -2417,6 +2411,7 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 			if (!cinfo->intr_enabled) {
 				struct ZFW_CTRL __iomem *zfw_ctrl;
 				struct BOARD_CTRL __iomem *board_ctrl;
+				u16 intr;
 
 				zfw_ctrl = cinfo->base_addr +
 					(readl(&firm_id->zfwctrl_addr) &
@@ -2425,8 +2420,10 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 				board_ctrl = &zfw_ctrl->board_ctrl;
 
 				/* Enable interrupts on the PLX chip */
-				cy_writew(cinfo->ctl_addr + 0x68,
-					readw(cinfo->ctl_addr + 0x68) | 0x0900);
+				intr = readw(&cinfo->ctl_addr.p9060->
+						intr_ctrl_stat) | 0x0900;
+				cy_writew(&cinfo->ctl_addr.p9060->
+						intr_ctrl_stat, intr);
 				/* Enable interrupts on the FW */
 				retval = cyz_issue_cmd(cinfo, 0,
 						C_CM_IRQ_ENBL, 0L);
@@ -4347,8 +4344,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 	spin_lock_init(&cinfo->card_lock);
 
 	if (IS_CYC_Z(*cinfo)) {	/* Cyclades-Z */
-		mailbox = readl(&((struct RUNTIME_9060 __iomem *)
-				     cinfo->ctl_addr)->mail_box_0);
+		mailbox = readl(&cinfo->ctl_addr.p9060->mail_box_0);
 		nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8;
 		cinfo->intr_enabled = 0;
 		cinfo->nports = 0;	/* Will be correctly set later, after
@@ -4613,7 +4609,7 @@ static int __init cy_detect_isa(void)
 
 		/* set cy_card */
 		cy_card[j].base_addr = cy_isa_address;
-		cy_card[j].ctl_addr = NULL;
+		cy_card[j].ctl_addr.p9050 = NULL;
 		cy_card[j].irq = (int)cy_isa_irq;
 		cy_card[j].bus_index = 0;
 		cy_card[j].first_line = cy_next_channel;
@@ -5013,7 +5009,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 
 		/* Disable interrupts on the PLX before resetting it */
-		cy_writew(addr0 + 0x68, readw(addr0 + 0x68) & ~0x0900);
+		cy_writew(&ctl_addr->intr_ctrl_stat,
+				readw(&ctl_addr->intr_ctrl_stat) & ~0x0900);
 
 		plx_init(pdev, irq, addr0);
 
@@ -5109,7 +5106,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
 	/* set cy_card */
 	cy_card[card_no].base_addr = addr2;
-	cy_card[card_no].ctl_addr = addr0;
+	cy_card[card_no].ctl_addr.p9050 = addr0;
 	cy_card[card_no].irq = irq;
 	cy_card[card_no].bus_index = 1;
 	cy_card[card_no].first_line = cy_next_channel;
@@ -5125,17 +5122,20 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		plx_ver = readb(addr2 + CyPLX_VER) & 0x0f;
 		switch (plx_ver) {
 		case PLX_9050:
-
 			cy_writeb(addr0 + 0x4c, 0x43);
 			break;
 
 		case PLX_9060:
 		case PLX_9080:
 		default:	/* Old boards, use PLX_9060 */
-			plx_init(pdev, irq, addr0);
-			cy_writew(addr0 + 0x68, readw(addr0 + 0x68) | 0x0900);
+		{
+			struct RUNTIME_9060 __iomem *ctl_addr = addr0;
+			plx_init(pdev, irq, ctl_addr);
+			cy_writew(&ctl_addr->intr_ctrl_stat,
+				readw(&ctl_addr->intr_ctrl_stat) | 0x0900);
 			break;
 		}
+		}
 	}
 
 	dev_info(&pdev->dev, "%s/PCI #%d found: %d channels starting from "
@@ -5168,17 +5168,18 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
 	/* non-Z with old PLX */
 	if (!IS_CYC_Z(*cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
 			PLX_9050)
-		cy_writeb(cinfo->ctl_addr + 0x4c, 0);
+		cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
 	else
 #ifndef CONFIG_CYZ_INTR
 		if (!IS_CYC_Z(*cinfo))
 #endif
-		cy_writew(cinfo->ctl_addr + 0x68,
-				readw(cinfo->ctl_addr + 0x68) & ~0x0900);
+		cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
+			readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
+			~0x0900);
 
 	iounmap(cinfo->base_addr);
-	if (cinfo->ctl_addr)
-		iounmap(cinfo->ctl_addr);
+	if (cinfo->ctl_addr.p9050)
+		iounmap(cinfo->ctl_addr.p9050);
 	if (cinfo->irq
 #ifndef CONFIG_CYZ_INTR
 		&& !IS_CYC_Z(*cinfo)
@@ -5373,8 +5374,8 @@ static void __exit cy_cleanup_module(void)
 			/* clear interrupt */
 			cy_writeb(card->base_addr + Cy_ClrIntr, 0);
 			iounmap(card->base_addr);
-			if (card->ctl_addr)
-				iounmap(card->ctl_addr);
+			if (card->ctl_addr.p9050)
+				iounmap(card->ctl_addr.p9050);
 			if (card->irq
 #ifndef CONFIG_CYZ_INTR
 				&& !IS_CYC_Z(*card)
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index 788850ba4e757..9ae03d5b35904 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -507,16 +507,19 @@ struct ZFW_CTRL {
 
 /* Per card data structure */
 struct cyclades_card {
-    void __iomem *base_addr;
-    void __iomem *ctl_addr;
-    int irq;
-    unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
-    unsigned int first_line;	/* minor number of first channel on card */
-    unsigned int nports;	/* Number of ports in the card */
-    int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
-    int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
-    spinlock_t card_lock;
-    struct cyclades_port *ports;
+	void __iomem *base_addr;
+	union {
+		void __iomem *p9050;
+		struct RUNTIME_9060 __iomem *p9060;
+	} ctl_addr;
+	int irq;
+	unsigned int num_chips;	/* 0 if card absent, -1 if Z/PCI, else Y */
+	unsigned int first_line;	/* minor number of first channel on card */
+	unsigned int nports;	/* Number of ports in the card */
+	int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
+	int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
+	spinlock_t card_lock;
+	struct cyclades_port *ports;
 };
 
 /***************************************
-- 
GitLab


From 101b81590d8df0a74c33cf739886247c0a13f4af Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:30:10 +0100
Subject: [PATCH 1981/2198] tty: cyclades, cache HW version

Store HW version locally to not read it all the time in interrupts
and alike.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c  | 32 +++++++++++---------------------
 include/linux/cyclades.h |  1 +
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 2cbf74134f1b1..cf191cc1c921a 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -668,8 +668,7 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define Z_FPGA_CHECK(card) \
 	((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0)
 
-#define ISZLOADED(card)	(((ZO_V1 == readl(&(card).ctl_addr.p9060->mail_box_0)) \
-			|| Z_FPGA_CHECK(card)) && \
+#define ISZLOADED(card)	((ZO_V1 == (card).hw_ver || Z_FPGA_CHECK(card)) && \
 			(ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
 			((card).base_addr+ID_ADDRESS))->signature)))
 
@@ -1393,8 +1392,6 @@ cyz_fetch_msg(struct cyclades_card *cinfo,
 	unsigned long loc_doorbell;
 
 	firm_id = cinfo->base_addr + ID_ADDRESS;
-	if (!ISZLOADED(*cinfo))
-		return -1;
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 
@@ -1619,10 +1616,8 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 	static struct BOARD_CTRL __iomem *board_ctrl;
 	static struct CH_CTRL __iomem *ch_ctrl;
 	static struct BUF_CTRL __iomem *buf_ctrl;
-	__u32 channel;
+	__u32 channel, param, fw_ver;
 	__u8 cmd;
-	__u32 param;
-	__u32 hw_ver, fw_ver;
 	int special_count;
 	int delta_count;
 
@@ -1630,7 +1625,6 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
 	board_ctrl = &zfw_ctrl->board_ctrl;
 	fw_ver = readl(&board_ctrl->fw_version);
-	hw_ver = readl(&cinfo->ctl_addr.p9060->mail_box_0);
 
 	while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
 		special_count = 0;
@@ -2388,11 +2382,9 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
 		if (!ISZLOADED(*cinfo)) {
-			if (((ZE_V1 == readl(&cinfo->ctl_addr.p9060->
-							mail_box_0)) &&
-					Z_FPGA_CHECK(*cinfo)) &&
-					(ZFIRM_HLT == readl(
-						&firm_id->signature))) {
+			if (cinfo->hw_ver == ZE_V1 && Z_FPGA_CHECK(*cinfo) &&
+					readl(&firm_id->signature) ==
+					ZFIRM_HLT) {
 				printk(KERN_ERR "cyc:Cyclades-Z Error: you "
 					"need an external power supply for "
 					"this number of ports.\nFirmware "
@@ -4336,7 +4328,6 @@ static void cy_hangup(struct tty_struct *tty)
 static int __devinit cy_init_card(struct cyclades_card *cinfo)
 {
 	struct cyclades_port *info;
-	u32 uninitialized_var(mailbox);
 	unsigned int nports, port;
 	unsigned short chip_number;
 	int uninitialized_var(index);
@@ -4344,8 +4335,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 	spin_lock_init(&cinfo->card_lock);
 
 	if (IS_CYC_Z(*cinfo)) {	/* Cyclades-Z */
-		mailbox = readl(&cinfo->ctl_addr.p9060->mail_box_0);
-		nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8;
+		nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8;
 		cinfo->intr_enabled = 0;
 		cinfo->nports = 0;	/* Will be correctly set later, after
 					   Z FW is loaded */
@@ -4377,7 +4367,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 
 		if (IS_CYC_Z(*cinfo)) {
 			info->type = PORT_STARTECH;
-			if (mailbox == ZO_V1)
+			if (cinfo->hw_ver == ZO_V1)
 				info->xmit_fifo_size = CYZ_FIFO_SIZE;
 			else
 				info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE;
@@ -4932,7 +4922,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 {
 	void __iomem *addr0 = NULL, *addr2 = NULL;
 	char *card_name = NULL;
-	u32 mailbox;
+	u32 uninitialized_var(mailbox);
 	unsigned int device_id, nchan = 0, card_no, i;
 	unsigned char plx_ver;
 	int retval, irq;
@@ -5014,7 +5004,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
 		plx_init(pdev, irq, addr0);
 
-		mailbox = (u32)readl(&ctl_addr->mail_box_0);
+		mailbox = readl(&ctl_addr->mail_box_0);
 
 		addr2 = ioremap_nocache(pci_resource_start(pdev, 2),
 				mailbox == ZE_V1 ? CyPCI_Ze_win : CyPCI_Zwin);
@@ -5026,7 +5016,6 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		if (mailbox == ZE_V1) {
 			card_name = "Cyclades-Ze";
 
-			readl(&ctl_addr->mail_box_0);
 			nchan = ZE_V1_NPORTS;
 		} else {
 			card_name = "Cyclades-8Zo";
@@ -5089,6 +5078,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 		cy_card[card_no].num_chips = nchan / 4;
 	} else {
+		cy_card[card_no].hw_ver = mailbox;
+		cy_card[card_no].num_chips = (unsigned int)-1;
 #ifdef CONFIG_CYZ_INTR
 		/* allocate IRQ only if board has an IRQ */
 		if (irq != 0 && irq != 255) {
@@ -5101,7 +5092,6 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			}
 		}
 #endif				/* CONFIG_CYZ_INTR */
-		cy_card[card_no].num_chips = (unsigned int)-1;
 	}
 
 	/* set cy_card */
diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index 9ae03d5b35904..a14fe3079761a 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -518,6 +518,7 @@ struct cyclades_card {
 	unsigned int nports;	/* Number of ports in the card */
 	int bus_index;		/* address shift - 0 for ISA, 1 for PCI */
 	int intr_enabled;		/* FW Interrupt flag - 0 disabled, 1 enabled */
+	u32 hw_ver;
 	spinlock_t card_lock;
 	struct cyclades_port *ports;
 };
-- 
GitLab


From 2693f485c22d18474c077f12fd0f797ee8679b33 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:31:06 +0100
Subject: [PATCH 1982/2198] tty: cyclades, convert macros to inlines

Remove ugly macros and add inlines instead of them. This improves
readability and type checking a much.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 130 ++++++++++++++++++++++------------------
 1 file changed, 71 insertions(+), 59 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index cf191cc1c921a..c9a503f4724a2 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -663,15 +663,6 @@
 static void cy_throttle(struct tty_struct *tty);
 static void cy_send_xchar(struct tty_struct *tty, char ch);
 
-#define IS_CYC_Z(card) ((card).num_chips == (unsigned int)-1)
-
-#define Z_FPGA_CHECK(card) \
-	((readl(&(card).ctl_addr.p9060->init_ctrl) & (1<<17)) != 0)
-
-#define ISZLOADED(card)	((ZO_V1 == (card).hw_ver || Z_FPGA_CHECK(card)) && \
-			(ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
-			((card).base_addr+ID_ADDRESS))->signature)))
-
 #ifndef SERIAL_XMIT_SIZE
 #define	SERIAL_XMIT_SIZE	(min(PAGE_SIZE, 4096))
 #endif
@@ -684,8 +675,6 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define DRIVER_VERSION	0x02010203
 #define RAM_SIZE 0x80000
 
-#define Z_FPGA_LOADED(X)	((readl(&(X)->init_ctrl) & (1<<17)) != 0)
-
 enum zblock_type {
 	ZBLOCK_PRG = 0,
 	ZBLOCK_FPGA = 1
@@ -880,6 +869,29 @@ static void cyz_rx_restart(unsigned long);
 static struct timer_list cyz_rx_full_timer[NR_PORTS];
 #endif				/* CONFIG_CYZ_INTR */
 
+static inline bool cy_is_Z(struct cyclades_card *card)
+{
+	return card->num_chips == (unsigned int)-1;
+}
+
+static inline bool __cyz_fpga_loaded(struct RUNTIME_9060 __iomem *ctl_addr)
+{
+	return readl(&ctl_addr->init_ctrl) & (1 << 17);
+}
+
+static inline bool cyz_fpga_loaded(struct cyclades_card *card)
+{
+	return __cyz_fpga_loaded(card->ctl_addr.p9060);
+}
+
+static inline bool cyz_is_loaded(struct cyclades_card *card)
+{
+	struct FIRM_ID __iomem *fw_id = card->base_addr + ID_ADDRESS;
+
+	return (card->hw_ver == ZO_V1 || cyz_fpga_loaded(card)) &&
+			readl(&fw_id->signature) == ZFIRM_ID;
+}
+
 static inline int serial_paranoia_check(struct cyclades_port *info,
 		char *name, const char *routine)
 {
@@ -1417,7 +1429,7 @@ cyz_issue_cmd(struct cyclades_card *cinfo,
 	unsigned int index;
 
 	firm_id = cinfo->base_addr + ID_ADDRESS;
-	if (!ISZLOADED(*cinfo))
+	if (!cyz_is_loaded(cinfo))
 		return -1;
 
 	zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
@@ -1725,7 +1737,7 @@ static irqreturn_t cyz_interrupt(int irq, void *dev_id)
 {
 	struct cyclades_card *cinfo = dev_id;
 
-	if (unlikely(!ISZLOADED(*cinfo))) {
+	if (unlikely(!cyz_is_loaded(cinfo))) {
 #ifdef CY_DEBUG_INTERRUPTS
 		printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
 				"(IRQ%d).\n", irq);
@@ -1773,9 +1785,9 @@ static void cyz_poll(unsigned long arg)
 	for (card = 0; card < NR_CARDS; card++) {
 		cinfo = &cy_card[card];
 
-		if (!IS_CYC_Z(*cinfo))
+		if (!cy_is_Z(cinfo))
 			continue;
-		if (!ISZLOADED(*cinfo))
+		if (!cyz_is_loaded(cinfo))
 			continue;
 
 		firm_id = cinfo->base_addr + ID_ADDRESS;
@@ -1854,7 +1866,7 @@ static int startup(struct cyclades_port *info)
 
 	set_line_char(info);
 
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -1911,7 +1923,7 @@ static int startup(struct cyclades_port *info)
 		base_addr = card->base_addr;
 
 		firm_id = base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*card))
+		if (!cyz_is_loaded(card))
 			return -ENODEV;
 
 		zfw_ctrl = card->base_addr +
@@ -2006,7 +2018,7 @@ static void start_xmit(struct cyclades_port *info)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -2050,7 +2062,7 @@ static void shutdown(struct cyclades_port *info)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -2106,7 +2118,7 @@ static void shutdown(struct cyclades_port *info)
 #endif
 
 		firm_id = base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*card))
+		if (!cyz_is_loaded(card))
 			return;
 
 		zfw_ctrl = card->base_addr +
@@ -2213,7 +2225,7 @@ block_til_ready(struct tty_struct *tty, struct file *filp,
 #endif
 	info->port.blocked_open++;
 
-	if (!IS_CYC_Z(*cinfo)) {
+	if (!cy_is_Z(cinfo)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = cinfo->bus_index;
@@ -2276,7 +2288,7 @@ block_til_ready(struct tty_struct *tty, struct file *filp,
 
 		base_addr = cinfo->base_addr;
 		firm_id = base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*cinfo)) {
+		if (!cyz_is_loaded(cinfo)) {
 			__set_current_state(TASK_RUNNING);
 			remove_wait_queue(&info->port.open_wait, &wait);
 			return -EINVAL;
@@ -2377,12 +2389,12 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 	   treat it as absent from the system.  This
 	   will make the user pay attention.
 	 */
-	if (IS_CYC_Z(*info->card)) {
+	if (cy_is_Z(info->card)) {
 		struct cyclades_card *cinfo = info->card;
 		struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
-		if (!ISZLOADED(*cinfo)) {
-			if (cinfo->hw_ver == ZE_V1 && Z_FPGA_CHECK(*cinfo) &&
+		if (!cyz_is_loaded(cinfo)) {
+			if (cinfo->hw_ver == ZE_V1 && cyz_fpga_loaded(cinfo) &&
 					readl(&firm_id->signature) ==
 					ZFIRM_HLT) {
 				printk(KERN_ERR "cyc:Cyclades-Z Error: you "
@@ -2537,7 +2549,7 @@ static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
 #endif
 	card = info->card;
 	channel = (info->line) - (card->first_line);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -2582,7 +2594,7 @@ static void cy_flush_buffer(struct tty_struct *tty)
 	info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
 	spin_unlock_irqrestore(&card->card_lock, flags);
 
-	if (IS_CYC_Z(*card)) {	/* If it is a Z card, flush the on-board
+	if (cy_is_Z(card)) {	/* If it is a Z card, flush the on-board
 					   buffers as well */
 		spin_lock_irqsave(&card->card_lock, flags);
 		retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_TX, 0L);
@@ -2663,7 +2675,7 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 
 	spin_lock_irqsave(&card->card_lock, flags);
 
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		int channel = info->line - card->first_line;
 		int index = card->bus_index;
 		void __iomem *base_addr = card->base_addr +
@@ -2883,7 +2895,7 @@ static int cy_chars_in_buffer(struct tty_struct *tty)
 	channel = (info->line) - (card->first_line);
 
 #ifdef Z_EXT_CHARS_IN_BUFFER
-	if (!IS_CYC_Z(cy_card[card])) {
+	if (!cy_is_Z(card)) {
 #endif				/* Z_EXT_CHARS_IN_BUFFER */
 #ifdef CY_DEBUG_IO
 		printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
@@ -2996,7 +3008,7 @@ static void set_line_char(struct cyclades_port *info)
 	channel = info->line - card->first_line;
 	chip_number = channel / 4;
 
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 
 		index = card->bus_index;
 
@@ -3221,7 +3233,7 @@ static void set_line_char(struct cyclades_port *info)
 		int retval;
 
 		firm_id = card->base_addr + ID_ADDRESS;
-		if (!ISZLOADED(*card))
+		if (!cyz_is_loaded(card))
 			return;
 
 		zfw_ctrl = card->base_addr +
@@ -3438,7 +3450,7 @@ static int get_lsr_info(struct cyclades_port *info, unsigned int __user *value)
 
 	card = info->card;
 	channel = (info->line) - (card->first_line);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3478,7 +3490,7 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3504,7 +3516,7 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 	} else {
 		base_addr = card->base_addr;
 		firm_id = card->base_addr + ID_ADDRESS;
-		if (ISZLOADED(*card)) {
+		if (cyz_is_loaded(card)) {
 			zfw_ctrl = card->base_addr +
 				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
 			board_ctrl = &zfw_ctrl->board_ctrl;
@@ -3547,7 +3559,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 
 	card = info->card;
 	channel = (info->line) - (card->first_line);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3622,7 +3634,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 		base_addr = card->base_addr;
 
 		firm_id = card->base_addr + ID_ADDRESS;
-		if (ISZLOADED(*card)) {
+		if (cyz_is_loaded(card)) {
 			zfw_ctrl = card->base_addr +
 				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
 			board_ctrl = &zfw_ctrl->board_ctrl;
@@ -3694,7 +3706,7 @@ static int cy_break(struct tty_struct *tty, int break_state)
 	card = info->card;
 
 	spin_lock_irqsave(&card->card_lock, flags);
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		/* Let the transmit ISR take care of this (since it
 		   requires stuffing characters into the output stream).
 		 */
@@ -3763,7 +3775,7 @@ static int set_threshold(struct cyclades_port *info, unsigned long value)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3791,7 +3803,7 @@ static int get_threshold(struct cyclades_port *info,
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3825,7 +3837,7 @@ static int set_timeout(struct cyclades_port *info, unsigned long value)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -3848,7 +3860,7 @@ static int get_timeout(struct cyclades_port *info,
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	if (!IS_CYC_Z(*card)) {
+	if (!cy_is_Z(card)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		index = card->bus_index;
@@ -4102,7 +4114,7 @@ static void cy_send_xchar(struct tty_struct *tty, char ch)
 	card = info->card;
 	channel = info->line - card->first_line;
 
-	if (IS_CYC_Z(*card)) {
+	if (cy_is_Z(card)) {
 		if (ch == STOP_CHAR(tty))
 			cyz_issue_cmd(card, channel, C_CM_SENDXOFF, 0L);
 		else if (ch == START_CHAR(tty))
@@ -4135,7 +4147,7 @@ static void cy_throttle(struct tty_struct *tty)
 	card = info->card;
 
 	if (I_IXOFF(tty)) {
-		if (!IS_CYC_Z(*card))
+		if (!cy_is_Z(card))
 			cy_send_xchar(tty, STOP_CHAR(tty));
 		else
 			info->throttle = 1;
@@ -4143,7 +4155,7 @@ static void cy_throttle(struct tty_struct *tty)
 
 	if (tty->termios->c_cflag & CRTSCTS) {
 		channel = info->line - card->first_line;
-		if (!IS_CYC_Z(*card)) {
+		if (!cy_is_Z(card)) {
 			chip = channel >> 2;
 			channel &= 0x03;
 			index = card->bus_index;
@@ -4200,7 +4212,7 @@ static void cy_unthrottle(struct tty_struct *tty)
 	if (tty->termios->c_cflag & CRTSCTS) {
 		card = info->card;
 		channel = info->line - card->first_line;
-		if (!IS_CYC_Z(*card)) {
+		if (!cy_is_Z(card)) {
 			chip = channel >> 2;
 			channel &= 0x03;
 			index = card->bus_index;
@@ -4244,7 +4256,7 @@ static void cy_stop(struct tty_struct *tty)
 
 	cinfo = info->card;
 	channel = info->line - cinfo->first_line;
-	if (!IS_CYC_Z(*cinfo)) {
+	if (!cy_is_Z(cinfo)) {
 		index = cinfo->bus_index;
 		chip = channel >> 2;
 		channel &= 0x03;
@@ -4277,7 +4289,7 @@ static void cy_start(struct tty_struct *tty)
 	cinfo = info->card;
 	channel = info->line - cinfo->first_line;
 	index = cinfo->bus_index;
-	if (!IS_CYC_Z(*cinfo)) {
+	if (!cy_is_Z(cinfo)) {
 		chip = channel >> 2;
 		channel &= 0x03;
 		base_addr = cinfo->base_addr + (cy_chip_offset[chip] << index);
@@ -4334,7 +4346,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 
 	spin_lock_init(&cinfo->card_lock);
 
-	if (IS_CYC_Z(*cinfo)) {	/* Cyclades-Z */
+	if (cy_is_Z(cinfo)) {	/* Cyclades-Z */
 		nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8;
 		cinfo->intr_enabled = 0;
 		cinfo->nports = 0;	/* Will be correctly set later, after
@@ -4365,7 +4377,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 		init_completion(&info->shutdown_wait);
 		init_waitqueue_head(&info->delta_msr_wait);
 
-		if (IS_CYC_Z(*cinfo)) {
+		if (cy_is_Z(cinfo)) {
 			info->type = PORT_STARTECH;
 			if (cinfo->hw_ver == ZO_V1)
 				info->xmit_fifo_size = CYZ_FIFO_SIZE;
@@ -4408,7 +4420,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 	}
 
 #ifndef CONFIG_CYZ_INTR
-	if (IS_CYC_Z(*cinfo) && !timer_pending(&cyz_timerlist)) {
+	if (cy_is_Z(cinfo) && !timer_pending(&cyz_timerlist)) {
 		mod_timer(&cyz_timerlist, jiffies + 1);
 #ifdef CY_PCI_DEBUG
 		printk(KERN_DEBUG "Cyclades-Z polling initialized\n");
@@ -4771,7 +4783,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 
 	/* Check whether the firmware is already loaded and running. If
 	   positive, skip this board */
-	if (Z_FPGA_LOADED(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
+	if (__cyz_fpga_loaded(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
 		u32 cntval = readl(base_addr + 0x190);
 
 		udelay(100);
@@ -4790,7 +4802,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 
 	mailbox = readl(&ctl_addr->mail_box_0);
 
-	if (mailbox == 0 || Z_FPGA_LOADED(ctl_addr)) {
+	if (mailbox == 0 || __cyz_fpga_loaded(ctl_addr)) {
 		/* stops CPU and set window to beginning of RAM */
 		cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
 		cy_writel(&cust->cpu_stop, 0);
@@ -4806,7 +4818,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 				base_addr);
 		if (retval)
 			goto err_rel;
-		if (!Z_FPGA_LOADED(ctl_addr)) {
+		if (!__cyz_fpga_loaded(ctl_addr)) {
 			dev_err(&pdev->dev, "fw upload successful, but fw is "
 					"not loaded\n");
 			goto err_rel;
@@ -4865,7 +4877,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 				"system before loading the new FW to the "
 				"Cyclades-Z.\n");
 
-			if (Z_FPGA_LOADED(ctl_addr))
+			if (__cyz_fpga_loaded(ctl_addr))
 				plx_init(pdev, irq, ctl_addr);
 
 			retval = -EIO;
@@ -4889,7 +4901,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 			"check the connection between the Z host card and the "
 			"serial expanders.\n");
 
-		if (Z_FPGA_LOADED(ctl_addr))
+		if (__cyz_fpga_loaded(ctl_addr))
 			plx_init(pdev, irq, ctl_addr);
 
 		dev_info(&pdev->dev, "Null number of ports detected. Board "
@@ -5156,12 +5168,12 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
 	unsigned int i;
 
 	/* non-Z with old PLX */
-	if (!IS_CYC_Z(*cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
+	if (!cy_is_Z(cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
 			PLX_9050)
 		cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
 	else
 #ifndef CONFIG_CYZ_INTR
-		if (!IS_CYC_Z(*cinfo))
+		if (!cy_is_Z(cinfo))
 #endif
 		cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
 			readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
@@ -5172,7 +5184,7 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
 		iounmap(cinfo->ctl_addr.p9050);
 	if (cinfo->irq
 #ifndef CONFIG_CYZ_INTR
-		&& !IS_CYC_Z(*cinfo)
+		&& !cy_is_Z(cinfo)
 #endif /* CONFIG_CYZ_INTR */
 		)
 		free_irq(cinfo->irq, cinfo);
@@ -5368,7 +5380,7 @@ static void __exit cy_cleanup_module(void)
 				iounmap(card->ctl_addr.p9050);
 			if (card->irq
 #ifndef CONFIG_CYZ_INTR
-				&& !IS_CYC_Z(*card)
+				&& !cy_is_Z(card)
 #endif /* CONFIG_CYZ_INTR */
 				)
 				free_irq(card->irq, card);
-- 
GitLab


From 08a951e1693e324bd035b194002a82b9057fd9c5 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:33:40 +0100
Subject: [PATCH 1983/2198] tty: cyclades, remove typedefs

They are unused anyway.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cyclades.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/include/linux/cyclades.h b/include/linux/cyclades.h
index a14fe3079761a..1fbdea4f08ebb 100644
--- a/include/linux/cyclades.h
+++ b/include/linux/cyclades.h
@@ -142,19 +142,6 @@ struct CYZ_BOOT_CTRL {
 
 
 #ifndef DP_WINDOW_SIZE
-/* #include "cyclomz.h" */
-/****************** ****************** *******************/
-/*
- *	The data types defined below are used in all ZFIRM interface
- *	data structures. They accomodate differences between HW
- *	architectures and compilers.
- */
-
-typedef __u64  ucdouble;		/* 64 bits, unsigned */
-typedef __u32  uclong;			/* 32 bits, unsigned */
-typedef __u16  ucshort;		/* 16 bits, unsigned */
-typedef __u8   ucchar;			/* 8 bits, unsigned */
-
 /*
  *	Memory Window Sizes
  */
-- 
GitLab


From 963118eef9e6706e2b5356309fb0cdd9c9eba81d Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:34:27 +0100
Subject: [PATCH 1984/2198] tty: cyclades, fix nports handling

Set up ports right after FW load so that we won't allocate maximal
(64) ports when we use few.

Also remove reading of nports in irq context, since we know it from
initialisation now.

This also fixes a tty ports unregistration on some fail paths and for
Ze which registered 64 and unregistered real port count.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 58 ++++++++++++-----------------------------
 1 file changed, 17 insertions(+), 41 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index c9a503f4724a2..6adbc97596614 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -604,7 +604,6 @@
 
 #define NR_PORTS	256
 
-#define ZE_V1_NPORTS	64
 #define ZO_V1	0
 #define ZO_V2	1
 #define ZE_V1	2
@@ -1777,7 +1776,6 @@ static void cyz_poll(unsigned long arg)
 	struct tty_struct *tty;
 	struct FIRM_ID __iomem *firm_id;
 	struct ZFW_CTRL __iomem *zfw_ctrl;
-	struct BOARD_CTRL __iomem *board_ctrl;
 	struct BUF_CTRL __iomem *buf_ctrl;
 	unsigned long expires = jiffies + HZ;
 	unsigned int port, card;
@@ -1793,11 +1791,9 @@ static void cyz_poll(unsigned long arg)
 		firm_id = cinfo->base_addr + ID_ADDRESS;
 		zfw_ctrl = cinfo->base_addr +
 				(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		board_ctrl = &(zfw_ctrl->board_ctrl);
 
 	/* Skip first polling cycle to avoid racing conditions with the FW */
 		if (!cinfo->intr_enabled) {
-			cinfo->nports = (int)readl(&board_ctrl->n_channel);
 			cinfo->intr_enabled = 1;
 			continue;
 		}
@@ -2413,16 +2409,8 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 		   interrupts should be enabled as soon as the first open
 		   happens to one of its ports. */
 			if (!cinfo->intr_enabled) {
-				struct ZFW_CTRL __iomem *zfw_ctrl;
-				struct BOARD_CTRL __iomem *board_ctrl;
 				u16 intr;
 
-				zfw_ctrl = cinfo->base_addr +
-					(readl(&firm_id->zfwctrl_addr) &
-					 0xfffff);
-
-				board_ctrl = &zfw_ctrl->board_ctrl;
-
 				/* Enable interrupts on the PLX chip */
 				intr = readw(&cinfo->ctl_addr.p9060->
 						intr_ctrl_stat) | 0x0900;
@@ -2435,8 +2423,6 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
 					printk(KERN_ERR "cyc:IRQ enable retval "
 						"was %x\n", retval);
 				}
-				cinfo->nports =
-					(int)readl(&board_ctrl->n_channel);
 				cinfo->intr_enabled = 1;
 			}
 		}
@@ -4340,30 +4326,20 @@ static void cy_hangup(struct tty_struct *tty)
 static int __devinit cy_init_card(struct cyclades_card *cinfo)
 {
 	struct cyclades_port *info;
-	unsigned int nports, port;
+	unsigned int port;
 	unsigned short chip_number;
-	int uninitialized_var(index);
 
 	spin_lock_init(&cinfo->card_lock);
+	cinfo->intr_enabled = 0;
 
-	if (cy_is_Z(cinfo)) {	/* Cyclades-Z */
-		nports = (cinfo->hw_ver == ZE_V1) ? ZE_V1_NPORTS : 8;
-		cinfo->intr_enabled = 0;
-		cinfo->nports = 0;	/* Will be correctly set later, after
-					   Z FW is loaded */
-	} else {
-		index = cinfo->bus_index;
-		nports = cinfo->nports = CyPORTS_PER_CHIP * cinfo->num_chips;
-	}
-
-	cinfo->ports = kzalloc(sizeof(*cinfo->ports) * nports, GFP_KERNEL);
+	cinfo->ports = kcalloc(cinfo->nports, sizeof(*cinfo->ports),
+			GFP_KERNEL);
 	if (cinfo->ports == NULL) {
 		printk(KERN_ERR "Cyclades: cannot allocate ports\n");
-		cinfo->nports = 0;
 		return -ENOMEM;
 	}
 
-	for (port = cinfo->first_line; port < cinfo->first_line + nports;
+	for (port = cinfo->first_line; port < cinfo->first_line + cinfo->nports;
 			port++) {
 		info = &cinfo->ports[port - cinfo->first_line];
 		tty_port_init(&info->port);
@@ -4388,6 +4364,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
 				cyz_rx_restart, (unsigned long)info);
 #endif
 		} else {
+			int index = cinfo->bus_index;
 			info->type = PORT_CIRRUS;
 			info->xmit_fifo_size = CyMAX_CHAR_FIFO;
 			info->cor1 = CyPARITY_NONE | Cy_1_STOP | Cy_8_BITS;
@@ -4615,7 +4592,8 @@ static int __init cy_detect_isa(void)
 		cy_card[j].irq = (int)cy_isa_irq;
 		cy_card[j].bus_index = 0;
 		cy_card[j].first_line = cy_next_channel;
-		cy_card[j].num_chips = cy_isa_nchan / 4;
+		cy_card[j].num_chips = cy_isa_nchan / CyPORTS_PER_CHIP;
+		cy_card[j].nports = cy_isa_nchan;
 		if (cy_init_card(&cy_card[j])) {
 			cy_card[j].base_addr = NULL;
 			free_irq(cy_isa_irq, &cy_card[j]);
@@ -4771,7 +4749,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 	struct CUSTOM_REG __iomem *cust = base_addr;
 	struct ZFW_CTRL __iomem *pt_zfwctrl;
 	void __iomem *tmp;
-	u32 mailbox, status;
+	u32 mailbox, status, nchan;
 	unsigned int i;
 	int retval;
 
@@ -4892,11 +4870,11 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 			base_addr + ID_ADDRESS, readl(&fid->zfwctrl_addr),
 			base_addr + readl(&fid->zfwctrl_addr));
 
+	nchan = readl(&pt_zfwctrl->board_ctrl.n_channel);
 	dev_info(&pdev->dev, "Cyclades-Z FW loaded: version = %x, ports = %u\n",
-		readl(&pt_zfwctrl->board_ctrl.fw_version),
-		readl(&pt_zfwctrl->board_ctrl.n_channel));
+		readl(&pt_zfwctrl->board_ctrl.fw_version), nchan);
 
-	if (readl(&pt_zfwctrl->board_ctrl.n_channel) == 0) {
+	if (nchan == 0) {
 		dev_warn(&pdev->dev, "no Cyclades-Z ports were found. Please "
 			"check the connection between the Z host card and the "
 			"serial expanders.\n");
@@ -4922,7 +4900,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 	cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
 			0x00030800UL);
 
-	return 0;
+	return nchan;
 err_rel:
 	release_firmware(fw);
 err:
@@ -5027,12 +5005,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
 		if (mailbox == ZE_V1) {
 			card_name = "Cyclades-Ze";
-
-			nchan = ZE_V1_NPORTS;
 		} else {
 			card_name = "Cyclades-8Zo";
-			nchan = 8;
-
 #ifdef CY_PCI_DEBUG
 			if (mailbox == ZO_V1) {
 				cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
@@ -5057,8 +5031,9 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 		}
 
 		retval = cyz_load_fw(pdev, addr2, addr0, irq);
-		if (retval)
+		if (retval <= 0)
 			goto err_unmap;
+		nchan = retval;
 	}
 
 	if ((cy_next_channel + nchan) > NR_PORTS) {
@@ -5088,7 +5063,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 			dev_err(&pdev->dev, "could not allocate IRQ\n");
 			goto err_unmap;
 		}
-		cy_card[card_no].num_chips = nchan / 4;
+		cy_card[card_no].num_chips = nchan / CyPORTS_PER_CHIP;
 	} else {
 		cy_card[card_no].hw_ver = mailbox;
 		cy_card[card_no].num_chips = (unsigned int)-1;
@@ -5112,6 +5087,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 	cy_card[card_no].irq = irq;
 	cy_card[card_no].bus_index = 1;
 	cy_card[card_no].first_line = cy_next_channel;
+	cy_card[card_no].nports = nchan;
 	retval = cy_init_card(&cy_card[card_no]);
 	if (retval)
 		goto err_null;
-- 
GitLab


From b39933fbd304021580800796683b8ddaa3dd0a6a Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:35:21 +0100
Subject: [PATCH 1985/2198] tty: cyclades, remove unused variables

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/cyclades.c | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 6adbc97596614..4560190517422 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -2963,7 +2963,6 @@ static void set_line_char(struct cyclades_port *info)
 	void __iomem *base_addr;
 	int chip, channel, index;
 	unsigned cflag, iflag;
-	unsigned short chip_number;
 	int baud, baud_rate = 0;
 	int i;
 
@@ -2992,7 +2991,6 @@ static void set_line_char(struct cyclades_port *info)
 
 	card = info->card;
 	channel = info->line - card->first_line;
-	chip_number = channel / 4;
 
 	if (!cy_is_Z(card)) {
 
@@ -3212,9 +3210,7 @@ static void set_line_char(struct cyclades_port *info)
 	} else {
 		struct FIRM_ID __iomem *firm_id;
 		struct ZFW_CTRL __iomem *zfw_ctrl;
-		struct BOARD_CTRL __iomem *board_ctrl;
 		struct CH_CTRL __iomem *ch_ctrl;
-		struct BUF_CTRL __iomem *buf_ctrl;
 		__u32 sw_flow;
 		int retval;
 
@@ -3224,9 +3220,7 @@ static void set_line_char(struct cyclades_port *info)
 
 		zfw_ctrl = card->base_addr +
 			(readl(&firm_id->zfwctrl_addr) & 0xfffff);
-		board_ctrl = &zfw_ctrl->board_ctrl;
 		ch_ctrl = &(zfw_ctrl->ch_ctrl[channel]);
-		buf_ctrl = &zfw_ctrl->buf_ctrl[channel];
 
 		/* baud rate */
 		baud = tty_get_baud_rate(info->port.tty);
-- 
GitLab


From 10077d4a6674f535abdbe25cdecb1202af7948f1 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:36:09 +0100
Subject: [PATCH 1986/2198] tty: cdc_acm add krefs

Now we have a port structure begin using the fields and kref counts

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/class/cdc-acm.c | 164 +++++++++++++++++++++---------------
 drivers/usb/class/cdc-acm.h |   2 -
 2 files changed, 98 insertions(+), 68 deletions(-)

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index 41d4ca527f823..b4e73aa759d6a 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -62,7 +62,7 @@
 #include <linux/tty_flip.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/usb.h>
 #include <linux/usb/cdc.h>
 #include <asm/byteorder.h>
@@ -87,7 +87,7 @@ static struct acm *acm_table[ACM_TTY_MINORS];
 
 static DEFINE_MUTEX(open_mutex);
 
-#define ACM_READY(acm)	(acm && acm->dev && acm->used)
+#define ACM_READY(acm)	(acm && acm->dev && acm->port.count)
 
 static const struct tty_port_operations acm_port_ops = {
 };
@@ -265,6 +265,7 @@ static void acm_ctrl_irq(struct urb *urb)
 {
 	struct acm *acm = urb->context;
 	struct usb_cdc_notification *dr = urb->transfer_buffer;
+	struct tty_struct *tty;
 	unsigned char *data;
 	int newctrl;
 	int retval;
@@ -297,12 +298,16 @@ static void acm_ctrl_irq(struct urb *urb)
 			break;
 
 		case USB_CDC_NOTIFY_SERIAL_STATE:
-
+			tty = tty_port_tty_get(&acm->port);
 			newctrl = get_unaligned_le16(data);
 
-			if (acm->tty && !acm->clocal && (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
-				dbg("calling hangup");
-				tty_hangup(acm->tty);
+			if (tty) {
+				if (!acm->clocal &&
+				    (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
+					dbg("calling hangup");
+					tty_hangup(tty);
+				}
+				tty_kref_put(tty);
 			}
 
 			acm->ctrlin = newctrl;
@@ -374,15 +379,14 @@ static void acm_rx_tasklet(unsigned long _acm)
 {
 	struct acm *acm = (void *)_acm;
 	struct acm_rb *buf;
-	struct tty_struct *tty = acm->tty;
+	struct tty_struct *tty;
 	struct acm_ru *rcv;
 	unsigned long flags;
 	unsigned char throttled;
 
 	dbg("Entering acm_rx_tasklet");
 
-	if (!ACM_READY(acm))
-	{
+	if (!ACM_READY(acm)) {
 		dbg("acm_rx_tasklet: ACM not ready");
 		return;
 	}
@@ -390,12 +394,13 @@ static void acm_rx_tasklet(unsigned long _acm)
 	spin_lock_irqsave(&acm->throttle_lock, flags);
 	throttled = acm->throttle;
 	spin_unlock_irqrestore(&acm->throttle_lock, flags);
-	if (throttled)
-	{
+	if (throttled) {
 		dbg("acm_rx_tasklet: throttled");
 		return;
 	}
 
+	tty = tty_port_tty_get(&acm->port);
+
 next_buffer:
 	spin_lock_irqsave(&acm->read_lock, flags);
 	if (list_empty(&acm->filled_read_bufs)) {
@@ -409,20 +414,22 @@ static void acm_rx_tasklet(unsigned long _acm)
 
 	dbg("acm_rx_tasklet: procesing buf 0x%p, size = %d", buf, buf->size);
 
-	tty_buffer_request_room(tty, buf->size);
-	spin_lock_irqsave(&acm->throttle_lock, flags);
-	throttled = acm->throttle;
-	spin_unlock_irqrestore(&acm->throttle_lock, flags);
-	if (!throttled)
-		tty_insert_flip_string(tty, buf->base, buf->size);
-	tty_flip_buffer_push(tty);
-
-	if (throttled) {
-		dbg("Throttling noticed");
-		spin_lock_irqsave(&acm->read_lock, flags);
-		list_add(&buf->list, &acm->filled_read_bufs);
-		spin_unlock_irqrestore(&acm->read_lock, flags);
-		return;
+	if (tty) {
+		spin_lock_irqsave(&acm->throttle_lock, flags);
+		throttled = acm->throttle;
+		spin_unlock_irqrestore(&acm->throttle_lock, flags);
+		if (!throttled) {
+			tty_buffer_request_room(tty, buf->size);
+			tty_insert_flip_string(tty, buf->base, buf->size);
+			tty_flip_buffer_push(tty);
+		} else {
+			tty_kref_put(tty);
+			dbg("Throttling noticed");
+			spin_lock_irqsave(&acm->read_lock, flags);
+			list_add(&buf->list, &acm->filled_read_bufs);
+			spin_unlock_irqrestore(&acm->read_lock, flags);
+			return;
+		}
 	}
 
 	spin_lock_irqsave(&acm->read_lock, flags);
@@ -431,6 +438,8 @@ static void acm_rx_tasklet(unsigned long _acm)
 	goto next_buffer;
 
 urbs:
+	tty_kref_put(tty);
+
 	while (!list_empty(&acm->spare_read_bufs)) {
 		spin_lock_irqsave(&acm->read_lock, flags);
 		if (list_empty(&acm->spare_read_urbs)) {
@@ -502,11 +511,14 @@ static void acm_write_bulk(struct urb *urb)
 static void acm_softint(struct work_struct *work)
 {
 	struct acm *acm = container_of(work, struct acm, work);
+	struct tty_struct *tty;
 
 	dev_vdbg(&acm->data->dev, "tx work\n");
 	if (!ACM_READY(acm))
 		return;
-	tty_wakeup(acm->tty);
+	tty = tty_port_tty_get(&acm->port);
+	tty_wakeup(tty);
+	tty_kref_put(tty);
 }
 
 static void acm_waker(struct work_struct *waker)
@@ -546,8 +558,9 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 		rv = 0;
 
 	set_bit(TTY_NO_WRITE_SPLIT, &tty->flags);
+
 	tty->driver_data = acm;
-	acm->tty = tty;
+	tty_port_tty_set(&acm->port, tty);
 
 	if (usb_autopm_get_interface(acm->control) < 0)
 		goto early_bail;
@@ -555,11 +568,10 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 		acm->control->needs_remote_wakeup = 1;
 
 	mutex_lock(&acm->mutex);
-	if (acm->used++) {
+	if (acm->port.count++) {
 		usb_autopm_put_interface(acm->control);
 		goto done;
-        }
-
+	}
 
 	acm->ctrlurb->dev = acm->dev;
 	if (usb_submit_urb(acm->ctrlurb, GFP_KERNEL)) {
@@ -570,6 +582,7 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	if (0 > acm_set_control(acm, acm->ctrlout = ACM_CTRL_DTR | ACM_CTRL_RTS) &&
 	    (acm->ctrl_caps & USB_CDC_CAP_LINE))
 		goto full_bailout;
+
 	usb_autopm_put_interface(acm->control);
 
 	INIT_LIST_HEAD(&acm->spare_read_urbs);
@@ -585,7 +598,7 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	acm->throttle = 0;
 
 	tasklet_schedule(&acm->urb_task);
-
+	rv = tty_port_block_til_ready(&acm->port, tty, filp);
 done:
 	mutex_unlock(&acm->mutex);
 err_out:
@@ -596,16 +609,17 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	usb_kill_urb(acm->ctrlurb);
 bail_out:
 	usb_autopm_put_interface(acm->control);
-	acm->used--;
+	acm->port.count--;
 	mutex_unlock(&acm->mutex);
 early_bail:
 	mutex_unlock(&open_mutex);
+	tty_port_tty_set(&acm->port, NULL);
 	return -EIO;
 }
 
 static void acm_tty_unregister(struct acm *acm)
 {
-	int i,nr;
+	int i, nr;
 
 	nr = acm->rx_buflimit;
 	tty_unregister_device(acm_tty_driver, acm->minor);
@@ -622,37 +636,51 @@ static void acm_tty_unregister(struct acm *acm)
 
 static int acm_tty_chars_in_buffer(struct tty_struct *tty);
 
+static void acm_port_down(struct acm *acm, int drain)
+{
+	int i, nr = acm->rx_buflimit;
+	mutex_lock(&open_mutex);
+	if (acm->dev) {
+		usb_autopm_get_interface(acm->control);
+		acm_set_control(acm, acm->ctrlout = 0);
+		/* try letting the last writes drain naturally */
+		if (drain) {
+			wait_event_interruptible_timeout(acm->drain_wait,
+				(ACM_NW == acm_wb_is_avail(acm)) || !acm->dev,
+					ACM_CLOSE_TIMEOUT * HZ);
+		}
+		usb_kill_urb(acm->ctrlurb);
+		for (i = 0; i < ACM_NW; i++)
+			usb_kill_urb(acm->wb[i].urb);
+		for (i = 0; i < nr; i++)
+			usb_kill_urb(acm->ru[i].urb);
+		acm->control->needs_remote_wakeup = 0;
+		usb_autopm_put_interface(acm->control);
+	}
+	mutex_unlock(&open_mutex);
+}
+
+static void acm_tty_hangup(struct tty_struct *tty)
+{
+	struct acm *acm = tty->driver_data;
+	tty_port_hangup(&acm->port);
+	acm_port_down(acm, 0);
+}
+
 static void acm_tty_close(struct tty_struct *tty, struct file *filp)
 {
 	struct acm *acm = tty->driver_data;
-	int i,nr;
 
-	if (!acm || !acm->used)
+	/* Perform the closing process and see if we need to do the hardware
+	   shutdown */
+	if (tty_port_close_start(&acm->port, tty, filp) == 0)
 		return;
-
-	nr = acm->rx_buflimit;
+	acm_port_down(acm, 0);
+	tty_port_close_end(&acm->port, tty);
 	mutex_lock(&open_mutex);
-	if (!--acm->used) {
-		if (acm->dev) {
-			usb_autopm_get_interface(acm->control);
-			acm_set_control(acm, acm->ctrlout = 0);
-
-			/* try letting the last writes drain naturally */
-			wait_event_interruptible_timeout(acm->drain_wait,
-					(ACM_NW == acm_wb_is_avail(acm))
-						|| !acm->dev,
-					ACM_CLOSE_TIMEOUT * HZ);
-
-			usb_kill_urb(acm->ctrlurb);
-			for (i = 0; i < ACM_NW; i++)
-				usb_kill_urb(acm->wb[i].urb);
-			for (i = 0; i < nr; i++)
-				usb_kill_urb(acm->ru[i].urb);
-			acm->control->needs_remote_wakeup = 0;
-			usb_autopm_put_interface(acm->control);
-		} else
-			acm_tty_unregister(acm);
-	}
+	tty_port_tty_set(&acm->port, NULL);
+	if (!acm->dev)
+		acm_tty_unregister(acm);
 	mutex_unlock(&open_mutex);
 }
 
@@ -885,8 +913,8 @@ static int acm_write_buffers_alloc(struct acm *acm)
 	return 0;
 }
 
-static int acm_probe (struct usb_interface *intf,
-		      const struct usb_device_id *id)
+static int acm_probe(struct usb_interface *intf,
+		     const struct usb_device_id *id)
 {
 	struct usb_cdc_union_desc *union_header = NULL;
 	struct usb_cdc_country_functional_desc *cfd = NULL;
@@ -1232,6 +1260,7 @@ static void acm_disconnect(struct usb_interface *intf)
 {
 	struct acm *acm = usb_get_intfdata(intf);
 	struct usb_device *usb_dev = interface_to_usbdev(intf);
+	struct tty_struct *tty;
 
 	/* sibling interface is already cleaning up */
 	if (!acm)
@@ -1258,16 +1287,18 @@ static void acm_disconnect(struct usb_interface *intf)
 	usb_driver_release_interface(&acm_driver, intf == acm->control ?
 					acm->data : acm->control);
 
-	if (!acm->used) {
+	if (acm->port.count == 0) {
 		acm_tty_unregister(acm);
 		mutex_unlock(&open_mutex);
 		return;
 	}
 
 	mutex_unlock(&open_mutex);
-
-	if (acm->tty)
-		tty_hangup(acm->tty);
+	tty = tty_port_tty_get(&acm->port);
+	if (tty) {
+		tty_hangup(tty);
+		tty_kref_put(tty);
+	}
 }
 
 #ifdef CONFIG_PM
@@ -1302,7 +1333,7 @@ static int acm_suspend(struct usb_interface *intf, pm_message_t message)
 	*/
 	mutex_lock(&acm->mutex);
 
-	if (acm->used)
+	if (acm->port.count)
 		stop_data_traffic(acm);
 
 	mutex_unlock(&acm->mutex);
@@ -1324,7 +1355,7 @@ static int acm_resume(struct usb_interface *intf)
 		return 0;
 
 	mutex_lock(&acm->mutex);
-	if (acm->used) {
+	if (acm->port.count) {
 		rv = usb_submit_urb(acm->ctrlurb, GFP_NOIO);
 		if (rv < 0)
 			goto err_out;
@@ -1434,6 +1465,7 @@ static struct usb_driver acm_driver = {
 static const struct tty_operations acm_ops = {
 	.open =			acm_tty_open,
 	.close =		acm_tty_close,
+	.hangup =		acm_tty_hangup,
 	.write =		acm_tty_write,
 	.write_room =		acm_tty_write_room,
 	.ioctl =		acm_tty_ioctl,
diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h
index 19967cdb2691d..4c3856420adde 100644
--- a/drivers/usb/class/cdc-acm.h
+++ b/drivers/usb/class/cdc-acm.h
@@ -89,7 +89,6 @@ struct acm {
 	struct usb_device *dev;				/* the corresponding usb device */
 	struct usb_interface *control;			/* control interface */
 	struct usb_interface *data;			/* data interface */
-	struct tty_struct *tty;				/* the corresponding tty */
 	struct tty_port port;			 	/* our tty port data */
 	struct urb *ctrlurb;				/* urbs */
 	u8 *ctrl_buffer;				/* buffers of urbs */
@@ -121,7 +120,6 @@ struct acm {
 	unsigned int ctrlout;				/* output control lines (DTR, RTS) */
 	unsigned int writesize;				/* max packet size for the output bulk endpoint */
 	unsigned int readsize,ctrlsize;			/* buffer sizes for freeing */
-	unsigned int used;				/* someone has this acm's device open */
 	unsigned int minor;				/* acm minor number */
 	unsigned char throttle;				/* throttled by tty layer */
 	unsigned char clocal;				/* termios CLOCAL */
-- 
GitLab


From 6e47e069eb4dffa88ad91ddfc3fd85f32c35654b Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:37:06 +0100
Subject: [PATCH 1987/2198] tty: Clean up the ACM driver to CodingStyle

Or at least most of it. There are further clean ups possible and there are
are also thing checkpatch moans about that would be silly to "fix".

Also note some FIXME points found as the cleanup was done.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/class/cdc-acm.c | 291 ++++++++++++++++++++----------------
 1 file changed, 160 insertions(+), 131 deletions(-)

diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c
index b4e73aa759d6a..ddeb691925373 100644
--- a/drivers/usb/class/cdc-acm.c
+++ b/drivers/usb/class/cdc-acm.c
@@ -16,7 +16,8 @@
  *	v0.9  - thorough cleaning, URBification, almost a rewrite
  *	v0.10 - some more cleanups
  *	v0.11 - fixed flow control, read error doesn't stop reads
- *	v0.12 - added TIOCM ioctls, added break handling, made struct acm kmalloced
+ *	v0.12 - added TIOCM ioctls, added break handling, made struct acm
+ *		kmalloced
  *	v0.13 - added termios, added hangup
  *	v0.14 - sized down struct acm
  *	v0.15 - fixed flow control again - characters could be lost
@@ -102,13 +103,15 @@ static const struct tty_port_operations acm_port_ops = {
  * Functions for ACM control messages.
  */
 
-static int acm_ctrl_msg(struct acm *acm, int request, int value, void *buf, int len)
+static int acm_ctrl_msg(struct acm *acm, int request, int value,
+							void *buf, int len)
 {
 	int retval = usb_control_msg(acm->dev, usb_sndctrlpipe(acm->dev, 0),
 		request, USB_RT_ACM, value,
 		acm->control->altsetting[0].desc.bInterfaceNumber,
 		buf, len, 5000);
-	dbg("acm_control_msg: rq: 0x%02x val: %#x len: %#x result: %d", request, value, len, retval);
+	dbg("acm_control_msg: rq: 0x%02x val: %#x len: %#x result: %d",
+						request, value, len, retval);
 	return retval < 0 ? retval : 0;
 }
 
@@ -153,9 +156,8 @@ static int acm_wb_is_avail(struct acm *acm)
 
 	n = ACM_NW;
 	spin_lock_irqsave(&acm->write_lock, flags);
-	for (i = 0; i < ACM_NW; i++) {
+	for (i = 0; i < ACM_NW; i++)
 		n -= acm->wb[i].use;
-	}
 	spin_unlock_irqrestore(&acm->write_lock, flags);
 	return n;
 }
@@ -186,7 +188,8 @@ static int acm_start_wb(struct acm *acm, struct acm_wb *wb)
 	wb->urb->transfer_buffer_length = wb->len;
 	wb->urb->dev = acm->dev;
 
-	if ((rc = usb_submit_urb(wb->urb, GFP_ATOMIC)) < 0) {
+	rc = usb_submit_urb(wb->urb, GFP_ATOMIC);
+	if (rc < 0) {
 		dbg("usb_submit_urb(write bulk) failed: %d", rc);
 		acm_write_done(acm, wb);
 	}
@@ -291,44 +294,45 @@ static void acm_ctrl_irq(struct urb *urb)
 
 	data = (unsigned char *)(dr + 1);
 	switch (dr->bNotificationType) {
+	case USB_CDC_NOTIFY_NETWORK_CONNECTION:
+		dbg("%s network", dr->wValue ?
+					"connected to" : "disconnected from");
+		break;
 
-		case USB_CDC_NOTIFY_NETWORK_CONNECTION:
+	case USB_CDC_NOTIFY_SERIAL_STATE:
+		tty = tty_port_tty_get(&acm->port);
+		newctrl = get_unaligned_le16(data);
 
-			dbg("%s network", dr->wValue ? "connected to" : "disconnected from");
-			break;
-
-		case USB_CDC_NOTIFY_SERIAL_STATE:
-			tty = tty_port_tty_get(&acm->port);
-			newctrl = get_unaligned_le16(data);
-
-			if (tty) {
-				if (!acm->clocal &&
-				    (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
-					dbg("calling hangup");
-					tty_hangup(tty);
-				}
-				tty_kref_put(tty);
+		if (tty) {
+			if (!acm->clocal &&
+				(acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
+				dbg("calling hangup");
+				tty_hangup(tty);
 			}
+			tty_kref_put(tty);
+		}
 
-			acm->ctrlin = newctrl;
-
-			dbg("input control lines: dcd%c dsr%c break%c ring%c framing%c parity%c overrun%c",
-				acm->ctrlin & ACM_CTRL_DCD ? '+' : '-',	acm->ctrlin & ACM_CTRL_DSR ? '+' : '-',
-				acm->ctrlin & ACM_CTRL_BRK ? '+' : '-',	acm->ctrlin & ACM_CTRL_RI  ? '+' : '-',
-				acm->ctrlin & ACM_CTRL_FRAMING ? '+' : '-',	acm->ctrlin & ACM_CTRL_PARITY ? '+' : '-',
-				acm->ctrlin & ACM_CTRL_OVERRUN ? '+' : '-');
+		acm->ctrlin = newctrl;
 
+		dbg("input control lines: dcd%c dsr%c break%c ring%c framing%c parity%c overrun%c",
+			acm->ctrlin & ACM_CTRL_DCD ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_DSR ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_BRK ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_RI  ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_FRAMING ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_PARITY ? '+' : '-',
+			acm->ctrlin & ACM_CTRL_OVERRUN ? '+' : '-');
 			break;
 
-		default:
-			dbg("unknown notification %d received: index %d len %d data0 %d data1 %d",
-				dr->bNotificationType, dr->wIndex,
-				dr->wLength, data[0], data[1]);
-			break;
+	default:
+		dbg("unknown notification %d received: index %d len %d data0 %d data1 %d",
+			dr->bNotificationType, dr->wIndex,
+			dr->wLength, data[0], data[1]);
+		break;
 	}
 exit:
 	usb_mark_last_busy(acm->dev);
-	retval = usb_submit_urb (urb, GFP_ATOMIC);
+	retval = usb_submit_urb(urb, GFP_ATOMIC);
 	if (retval)
 		dev_err(&urb->dev->dev, "%s - usb_submit_urb failed with "
 			"result %d", __func__, retval);
@@ -466,10 +470,11 @@ static void acm_rx_tasklet(unsigned long _acm)
 		rcv->urb->transfer_dma = buf->dma;
 		rcv->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 
-		/* This shouldn't kill the driver as unsuccessful URBs are returned to the
-		   free-urbs-pool and resubmited ASAP */
+		/* This shouldn't kill the driver as unsuccessful URBs are
+		   returned to the free-urbs-pool and resubmited ASAP */
 		spin_lock_irqsave(&acm->read_lock, flags);
-		if (acm->susp_count || usb_submit_urb(rcv->urb, GFP_ATOMIC) < 0) {
+		if (acm->susp_count ||
+				usb_submit_urb(rcv->urb, GFP_ATOMIC) < 0) {
 			list_add(&buf->list, &acm->spare_read_bufs);
 			list_add(&rcv->list, &acm->spare_read_urbs);
 			acm->processing = 0;
@@ -588,12 +593,11 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
 	INIT_LIST_HEAD(&acm->spare_read_urbs);
 	INIT_LIST_HEAD(&acm->spare_read_bufs);
 	INIT_LIST_HEAD(&acm->filled_read_bufs);
-	for (i = 0; i < acm->rx_buflimit; i++) {
+
+	for (i = 0; i < acm->rx_buflimit; i++)
 		list_add(&(acm->ru[i].list), &acm->spare_read_urbs);
-	}
-	for (i = 0; i < acm->rx_buflimit; i++) {
+	for (i = 0; i < acm->rx_buflimit; i++)
 		list_add(&(acm->rb[i].list), &acm->spare_read_bufs);
-	}
 
 	acm->throttle = 0;
 
@@ -684,7 +688,8 @@ static void acm_tty_close(struct tty_struct *tty, struct file *filp)
 	mutex_unlock(&open_mutex);
 }
 
-static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+static int acm_tty_write(struct tty_struct *tty,
+					const unsigned char *buf, int count)
 {
 	struct acm *acm = tty->driver_data;
 	int stat;
@@ -700,7 +705,8 @@ static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int c
 		return 0;
 
 	spin_lock_irqsave(&acm->write_lock, flags);
-	if ((wbn = acm_wb_alloc(acm)) < 0) {
+	wbn = acm_wb_alloc(acm);
+	if (wbn < 0) {
 		spin_unlock_irqrestore(&acm->write_lock, flags);
 		return 0;
 	}
@@ -712,7 +718,8 @@ static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int c
 	wb->len = count;
 	spin_unlock_irqrestore(&acm->write_lock, flags);
 
-	if ((stat = acm_write_start(acm, wbn)) < 0)
+	stat = acm_write_start(acm, wbn);
+	if (stat < 0)
 		return stat;
 	return count;
 }
@@ -798,8 +805,10 @@ static int acm_tty_tiocmset(struct tty_struct *tty, struct file *file,
 		return -EINVAL;
 
 	newctrl = acm->ctrlout;
-	set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (set & TIOCM_RTS ? ACM_CTRL_RTS : 0);
-	clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (clear & TIOCM_RTS ? ACM_CTRL_RTS : 0);
+	set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) |
+					(set & TIOCM_RTS ? ACM_CTRL_RTS : 0);
+	clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) |
+					(clear & TIOCM_RTS ? ACM_CTRL_RTS : 0);
 
 	newctrl = (newctrl & ~clear) | set;
 
@@ -808,7 +817,8 @@ static int acm_tty_tiocmset(struct tty_struct *tty, struct file *file,
 	return acm_set_control(acm, acm->ctrlout = newctrl);
 }
 
-static int acm_tty_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg)
+static int acm_tty_ioctl(struct tty_struct *tty, struct file *file,
+					unsigned int cmd, unsigned long arg)
 {
 	struct acm *acm = tty->driver_data;
 
@@ -830,7 +840,8 @@ static const __u8 acm_tty_size[] = {
 	5, 6, 7, 8
 };
 
-static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios_old)
+static void acm_tty_set_termios(struct tty_struct *tty,
+						struct ktermios *termios_old)
 {
 	struct acm *acm = tty->driver_data;
 	struct ktermios *termios = tty->termios;
@@ -840,19 +851,23 @@ static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios
 	if (!ACM_READY(acm))
 		return;
 
+	/* FIXME: Needs to support the tty_baud interface */
+	/* FIXME: Broken on sparc */
 	newline.dwDTERate = cpu_to_le32p(acm_tty_speed +
 		(termios->c_cflag & CBAUD & ~CBAUDEX) + (termios->c_cflag & CBAUDEX ? 15 : 0));
 	newline.bCharFormat = termios->c_cflag & CSTOPB ? 2 : 0;
 	newline.bParityType = termios->c_cflag & PARENB ?
-		(termios->c_cflag & PARODD ? 1 : 2) + (termios->c_cflag & CMSPAR ? 2 : 0) : 0;
+				(termios->c_cflag & PARODD ? 1 : 2) +
+				(termios->c_cflag & CMSPAR ? 2 : 0) : 0;
 	newline.bDataBits = acm_tty_size[(termios->c_cflag & CSIZE) >> 4];
-
+	/* FIXME: Needs to clear unsupported bits in the termios */
 	acm->clocal = ((termios->c_cflag & CLOCAL) != 0);
 
 	if (!newline.dwDTERate) {
 		newline.dwDTERate = acm->line.dwDTERate;
 		newctrl &= ~ACM_CTRL_DTR;
-	} else  newctrl |=  ACM_CTRL_DTR;
+	} else
+		newctrl |=  ACM_CTRL_DTR;
 
 	if (newctrl != acm->ctrlout)
 		acm_set_control(acm, acm->ctrlout = newctrl);
@@ -877,9 +892,8 @@ static void acm_write_buffers_free(struct acm *acm)
 	struct acm_wb *wb;
 	struct usb_device *usb_dev = interface_to_usbdev(acm->control);
 
-	for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++) {
+	for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++)
 		usb_buffer_free(usb_dev, acm->writesize, wb->buf, wb->dmah);
-	}
 }
 
 static void acm_read_buffers_free(struct acm *acm)
@@ -888,7 +902,8 @@ static void acm_read_buffers_free(struct acm *acm)
 	int i, n = acm->rx_buflimit;
 
 	for (i = 0; i < n; i++)
-		usb_buffer_free(usb_dev, acm->readsize, acm->rb[i].base, acm->rb[i].dma);
+		usb_buffer_free(usb_dev, acm->readsize,
+					acm->rb[i].base, acm->rb[i].dma);
 }
 
 /* Little helper: write buffers allocate */
@@ -928,7 +943,7 @@ static int acm_probe(struct usb_interface *intf,
 	struct usb_device *usb_dev = interface_to_usbdev(intf);
 	struct acm *acm;
 	int minor;
-	int ctrlsize,readsize;
+	int ctrlsize, readsize;
 	u8 *buf;
 	u8 ac_management_function = 0;
 	u8 call_management_function = 0;
@@ -948,7 +963,7 @@ static int acm_probe(struct usb_interface *intf,
 		control_interface = usb_ifnum_to_if(usb_dev, 0);
 		goto skip_normal_probe;
 	}
-	
+
 	/* normal probing*/
 	if (!buffer) {
 		dev_err(&intf->dev, "Weird descriptor references\n");
@@ -956,8 +971,10 @@ static int acm_probe(struct usb_interface *intf,
 	}
 
 	if (!buflen) {
-		if (intf->cur_altsetting->endpoint->extralen && intf->cur_altsetting->endpoint->extra) {
-			dev_dbg(&intf->dev,"Seeking extra descriptors on endpoint\n");
+		if (intf->cur_altsetting->endpoint->extralen &&
+				intf->cur_altsetting->endpoint->extra) {
+			dev_dbg(&intf->dev,
+				"Seeking extra descriptors on endpoint\n");
 			buflen = intf->cur_altsetting->endpoint->extralen;
 			buffer = intf->cur_altsetting->endpoint->extra;
 		} else {
@@ -968,47 +985,43 @@ static int acm_probe(struct usb_interface *intf,
 	}
 
 	while (buflen > 0) {
-		if (buffer [1] != USB_DT_CS_INTERFACE) {
+		if (buffer[1] != USB_DT_CS_INTERFACE) {
 			dev_err(&intf->dev, "skipping garbage\n");
 			goto next_desc;
 		}
 
-		switch (buffer [2]) {
-			case USB_CDC_UNION_TYPE: /* we've found it */
-				if (union_header) {
-					dev_err(&intf->dev, "More than one "
-						"union descriptor, "
-						"skipping ...\n");
-					goto next_desc;
-				}
-				union_header = (struct usb_cdc_union_desc *)
-							buffer;
-				break;
-			case USB_CDC_COUNTRY_TYPE: /* export through sysfs*/
-				cfd = (struct usb_cdc_country_functional_desc *)buffer;
-				break;
-			case USB_CDC_HEADER_TYPE: /* maybe check version */ 
-				break; /* for now we ignore it */ 
-			case USB_CDC_ACM_TYPE:
-				ac_management_function = buffer[3];
-				break;
-			case USB_CDC_CALL_MANAGEMENT_TYPE:
-				call_management_function = buffer[3];
-				call_interface_num = buffer[4];
-				if ((call_management_function & 3) != 3)
-					dev_err(&intf->dev, "This device "
-						"cannot do calls on its own. "
-						"It is no modem.\n");
-				break;
-			default:
-				/* there are LOTS more CDC descriptors that
-				 * could legitimately be found here.
-				 */
-				dev_dbg(&intf->dev, "Ignoring descriptor: "
-						"type %02x, length %d\n",
-						buffer[2], buffer[0]);
-				break;
+		switch (buffer[2]) {
+		case USB_CDC_UNION_TYPE: /* we've found it */
+			if (union_header) {
+				dev_err(&intf->dev, "More than one "
+					"union descriptor, skipping ...\n");
+				goto next_desc;
 			}
+			union_header = (struct usb_cdc_union_desc *)buffer;
+			break;
+		case USB_CDC_COUNTRY_TYPE: /* export through sysfs*/
+			cfd = (struct usb_cdc_country_functional_desc *)buffer;
+			break;
+		case USB_CDC_HEADER_TYPE: /* maybe check version */
+			break; /* for now we ignore it */
+		case USB_CDC_ACM_TYPE:
+			ac_management_function = buffer[3];
+			break;
+		case USB_CDC_CALL_MANAGEMENT_TYPE:
+			call_management_function = buffer[3];
+			call_interface_num = buffer[4];
+			if ((call_management_function & 3) != 3)
+				dev_err(&intf->dev, "This device cannot do calls on its own. It is not a modem.\n");
+			break;
+		default:
+			/* there are LOTS more CDC descriptors that
+			 * could legitimately be found here.
+			 */
+			dev_dbg(&intf->dev, "Ignoring descriptor: "
+					"type %02x, length %d\n",
+					buffer[2], buffer[0]);
+			break;
+		}
 next_desc:
 		buflen -= buffer[0];
 		buffer += buffer[0];
@@ -1016,33 +1029,36 @@ static int acm_probe(struct usb_interface *intf,
 
 	if (!union_header) {
 		if (call_interface_num > 0) {
-			dev_dbg(&intf->dev,"No union descriptor, using call management descriptor\n");
+			dev_dbg(&intf->dev, "No union descriptor, using call management descriptor\n");
 			data_interface = usb_ifnum_to_if(usb_dev, (data_interface_num = call_interface_num));
 			control_interface = intf;
 		} else {
-			dev_dbg(&intf->dev,"No union descriptor, giving up\n");
+			dev_dbg(&intf->dev,
+					"No union descriptor, giving up\n");
 			return -ENODEV;
 		}
 	} else {
 		control_interface = usb_ifnum_to_if(usb_dev, union_header->bMasterInterface0);
 		data_interface = usb_ifnum_to_if(usb_dev, (data_interface_num = union_header->bSlaveInterface0));
 		if (!control_interface || !data_interface) {
-			dev_dbg(&intf->dev,"no interfaces\n");
+			dev_dbg(&intf->dev, "no interfaces\n");
 			return -ENODEV;
 		}
 	}
-	
+
 	if (data_interface_num != call_interface_num)
-		dev_dbg(&intf->dev,"Separate call control interface. That is not fully supported.\n");
+		dev_dbg(&intf->dev, "Separate call control interface. That is not fully supported.\n");
 
 skip_normal_probe:
 
 	/*workaround for switched interfaces */
-	if (data_interface->cur_altsetting->desc.bInterfaceClass != CDC_DATA_INTERFACE_TYPE) {
-		if (control_interface->cur_altsetting->desc.bInterfaceClass == CDC_DATA_INTERFACE_TYPE) {
+	if (data_interface->cur_altsetting->desc.bInterfaceClass
+						!= CDC_DATA_INTERFACE_TYPE) {
+		if (control_interface->cur_altsetting->desc.bInterfaceClass
+						== CDC_DATA_INTERFACE_TYPE) {
 			struct usb_interface *t;
-			dev_dbg(&intf->dev,"Your device has switched interfaces.\n");
-
+			dev_dbg(&intf->dev,
+				"Your device has switched interfaces.\n");
 			t = control_interface;
 			control_interface = data_interface;
 			data_interface = t;
@@ -1054,9 +1070,9 @@ static int acm_probe(struct usb_interface *intf,
 	/* Accept probe requests only for the control interface */
 	if (intf != control_interface)
 		return -ENODEV;
-	
+
 	if (usb_interface_claimed(data_interface)) { /* valid in this context */
-		dev_dbg(&intf->dev,"The data interface isn't available\n");
+		dev_dbg(&intf->dev, "The data interface isn't available\n");
 		return -EBUSY;
 	}
 
@@ -1073,8 +1089,8 @@ static int acm_probe(struct usb_interface *intf,
 	if (!usb_endpoint_dir_in(epread)) {
 		/* descriptors are swapped */
 		struct usb_endpoint_descriptor *t;
-		dev_dbg(&intf->dev,"The data interface has switched endpoints\n");
-		
+		dev_dbg(&intf->dev,
+			"The data interface has switched endpoints\n");
 		t = epread;
 		epread = epwrite;
 		epwrite = t;
@@ -1087,13 +1103,15 @@ static int acm_probe(struct usb_interface *intf,
 		return -ENODEV;
 	}
 
-	if (!(acm = kzalloc(sizeof(struct acm), GFP_KERNEL))) {
+	acm = kzalloc(sizeof(struct acm), GFP_KERNEL);
+	if (acm == NULL) {
 		dev_dbg(&intf->dev, "out of memory (acm kzalloc)\n");
 		goto alloc_fail;
 	}
 
 	ctrlsize = le16_to_cpu(epctrl->wMaxPacketSize);
-	readsize = le16_to_cpu(epread->wMaxPacketSize)* ( quirks == SINGLE_RX_URB ? 1 : 2);
+	readsize = le16_to_cpu(epread->wMaxPacketSize) *
+				(quirks == SINGLE_RX_URB ? 1 : 2);
 	acm->writesize = le16_to_cpu(epwrite->wMaxPacketSize) * 20;
 	acm->control = control_interface;
 	acm->data = data_interface;
@@ -1136,8 +1154,10 @@ static int acm_probe(struct usb_interface *intf,
 	for (i = 0; i < num_rx_buf; i++) {
 		struct acm_ru *rcv = &(acm->ru[i]);
 
-		if (!(rcv->urb = usb_alloc_urb(0, GFP_KERNEL))) {
-			dev_dbg(&intf->dev, "out of memory (read urbs usb_alloc_urb)\n");
+		rcv->urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (rcv->urb == NULL) {
+			dev_dbg(&intf->dev,
+				"out of memory (read urbs usb_alloc_urb)\n");
 			goto alloc_fail7;
 		}
 
@@ -1150,26 +1170,29 @@ static int acm_probe(struct usb_interface *intf,
 		rb->base = usb_buffer_alloc(acm->dev, readsize,
 				GFP_KERNEL, &rb->dma);
 		if (!rb->base) {
-			dev_dbg(&intf->dev, "out of memory (read bufs usb_buffer_alloc)\n");
+			dev_dbg(&intf->dev,
+				"out of memory (read bufs usb_buffer_alloc)\n");
 			goto alloc_fail7;
 		}
 	}
-	for(i = 0; i < ACM_NW; i++)
-	{
+	for (i = 0; i < ACM_NW; i++) {
 		struct acm_wb *snd = &(acm->wb[i]);
 
-		if (!(snd->urb = usb_alloc_urb(0, GFP_KERNEL))) {
-			dev_dbg(&intf->dev, "out of memory (write urbs usb_alloc_urb)");
+		snd->urb = usb_alloc_urb(0, GFP_KERNEL);
+		if (snd->urb == NULL) {
+			dev_dbg(&intf->dev,
+				"out of memory (write urbs usb_alloc_urb)");
 			goto alloc_fail7;
 		}
 
-		usb_fill_bulk_urb(snd->urb, usb_dev, usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
-				NULL, acm->writesize, acm_write_bulk, snd);
+		usb_fill_bulk_urb(snd->urb, usb_dev,
+			usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
+			NULL, acm->writesize, acm_write_bulk, snd);
 		snd->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 		snd->instance = acm;
 	}
 
-	usb_set_intfdata (intf, acm);
+	usb_set_intfdata(intf, acm);
 
 	i = device_create_file(&intf->dev, &dev_attr_bmCapabilities);
 	if (i < 0)
@@ -1180,7 +1203,8 @@ static int acm_probe(struct usb_interface *intf,
 		if (!acm->country_codes)
 			goto skip_countries;
 		acm->country_code_size = cfd->bLength - 4;
-		memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0, cfd->bLength - 4);
+		memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0,
+							cfd->bLength - 4);
 		acm->country_rel_date = cfd->iCountryCodeRelDate;
 
 		i = device_create_file(&intf->dev, &dev_attr_wCountryCodes);
@@ -1189,7 +1213,8 @@ static int acm_probe(struct usb_interface *intf,
 			goto skip_countries;
 		}
 
-		i = device_create_file(&intf->dev, &dev_attr_iCountryCodeRelDate);
+		i = device_create_file(&intf->dev,
+						&dev_attr_iCountryCodeRelDate);
 		if (i < 0) {
 			kfree(acm->country_codes);
 			goto skip_countries;
@@ -1197,8 +1222,10 @@ static int acm_probe(struct usb_interface *intf,
 	}
 
 skip_countries:
-	usb_fill_int_urb(acm->ctrlurb, usb_dev, usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress),
-			 acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm, epctrl->bInterval);
+	usb_fill_int_urb(acm->ctrlurb, usb_dev,
+			usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress),
+			acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm,
+			epctrl->bInterval);
 	acm->ctrlurb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 	acm->ctrlurb->transfer_dma = acm->ctrl_dma;
 
@@ -1245,7 +1272,7 @@ static void stop_data_traffic(struct acm *acm)
 	tasklet_disable(&acm->urb_task);
 
 	usb_kill_urb(acm->ctrlurb);
-	for(i = 0; i < ACM_NW; i++)
+	for (i = 0; i < ACM_NW; i++)
 		usb_kill_urb(acm->wb[i].urb);
 	for (i = 0; i < acm->rx_buflimit; i++)
 		usb_kill_urb(acm->ru[i].urb);
@@ -1267,7 +1294,7 @@ static void acm_disconnect(struct usb_interface *intf)
 		return;
 
 	mutex_lock(&open_mutex);
-	if (acm->country_codes){
+	if (acm->country_codes) {
 		device_remove_file(&acm->control->dev,
 				&dev_attr_wCountryCodes);
 		device_remove_file(&acm->control->dev,
@@ -1281,7 +1308,8 @@ static void acm_disconnect(struct usb_interface *intf)
 	stop_data_traffic(acm);
 
 	acm_write_buffers_free(acm);
-	usb_buffer_free(usb_dev, acm->ctrlsize, acm->ctrl_buffer, acm->ctrl_dma);
+	usb_buffer_free(usb_dev, acm->ctrlsize, acm->ctrl_buffer,
+								acm->ctrl_dma);
 	acm_read_buffers_free(acm);
 
 	usb_driver_release_interface(&acm_driver, intf == acm->control ?
@@ -1434,7 +1462,7 @@ static struct usb_device_id acm_ids[] = {
 	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
 		USB_CDC_ACM_PROTO_AT_GSM) },
 	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
-		USB_CDC_ACM_PROTO_AT_3G	) },
+		USB_CDC_ACM_PROTO_AT_3G) },
 	{ USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
 		USB_CDC_ACM_PROTO_AT_CDMA) },
 
@@ -1442,7 +1470,7 @@ static struct usb_device_id acm_ids[] = {
 	{ }
 };
 
-MODULE_DEVICE_TABLE (usb, acm_ids);
+MODULE_DEVICE_TABLE(usb, acm_ids);
 
 static struct usb_driver acm_driver = {
 	.name =		"cdc_acm",
@@ -1497,7 +1525,8 @@ static int __init acm_init(void)
 	acm_tty_driver->subtype = SERIAL_TYPE_NORMAL,
 	acm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
 	acm_tty_driver->init_termios = tty_std_termios;
-	acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+	acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD |
+								HUPCL | CLOCAL;
 	tty_set_operations(acm_tty_driver, &acm_ops);
 
 	retval = tty_register_driver(acm_tty_driver);
@@ -1529,7 +1558,7 @@ static void __exit acm_exit(void)
 module_init(acm_init);
 module_exit(acm_exit);
 
-MODULE_AUTHOR( DRIVER_AUTHOR );
-MODULE_DESCRIPTION( DRIVER_DESC );
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_CHARDEV_MAJOR(ACM_TTY_MAJOR);
-- 
GitLab


From 70beaed22cbe12979e55d99b370e147e2e168562 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:39:12 +0100
Subject: [PATCH 1988/2198] serial: refactor ASYNC_ flags

Define ASYNCB_* flags which are bit numbers of the ASYNC_* flags.
This is useful for {test,set,clear}_bit.

Also convert each ASYNC_% to be (1 << ASYNCB_%) and define masks
with the macros, not constants.

Tested with:
#include "PATH_TO_KERNEL/include/linux/serial.h"
static struct {
        unsigned int new, old;
} as[] = {
        { ASYNC_HUP_NOTIFY, 0x0001 },
        { ASYNC_FOURPORT, 0x0002 },
...
	{ ASYNC_BOOT_ONLYMCA, 0x00400000 },
        { ASYNC_INTERNAL_FLAGS, 0xFFC00000 }
};
...
        for (a = 0; a < ARRAY_SIZE(as); a++)
                if (as[a].old != as[a].new)
                        printf("%.8x != %.8x\n", as[a].old, as[a].new);

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/serial.h | 116 ++++++++++++++++++++++++-----------------
 1 file changed, 69 insertions(+), 47 deletions(-)

diff --git a/include/linux/serial.h b/include/linux/serial.h
index 9136cc5608c3d..e5bb75a638021 100644
--- a/include/linux/serial.h
+++ b/include/linux/serial.h
@@ -96,54 +96,76 @@ struct serial_uart_config {
 
 /*
  * Definitions for async_struct (and serial_struct) flags field
+ *
+ * Define ASYNCB_* for convenient use with {test,set,clear}_bit.
  */
-#define ASYNC_HUP_NOTIFY 0x0001 /* Notify getty on hangups and closes 
-				   on the callout port */
-#define ASYNC_FOURPORT  0x0002	/* Set OU1, OUT2 per AST Fourport settings */
-#define ASYNC_SAK	0x0004	/* Secure Attention Key (Orange book) */
-#define ASYNC_SPLIT_TERMIOS 0x0008 /* Separate termios for dialin/callout */
-
-#define ASYNC_SPD_MASK	0x1030
-#define ASYNC_SPD_HI	0x0010	/* Use 56000 instead of 38400 bps */
-
-#define ASYNC_SPD_VHI	0x0020  /* Use 115200 instead of 38400 bps */
-#define ASYNC_SPD_CUST	0x0030  /* Use user-specified divisor */
-
-#define ASYNC_SKIP_TEST	0x0040 /* Skip UART test during autoconfiguration */
-#define ASYNC_AUTO_IRQ  0x0080 /* Do automatic IRQ during autoconfiguration */
-#define ASYNC_SESSION_LOCKOUT 0x0100 /* Lock out cua opens based on session */
-#define ASYNC_PGRP_LOCKOUT    0x0200 /* Lock out cua opens based on pgrp */
-#define ASYNC_CALLOUT_NOHUP   0x0400 /* Don't do hangups for cua device */
-
-#define ASYNC_HARDPPS_CD	0x0800	/* Call hardpps when CD goes high  */
-
-#define ASYNC_SPD_SHI	0x1000	/* Use 230400 instead of 38400 bps */
-#define ASYNC_SPD_WARP	0x1010	/* Use 460800 instead of 38400 bps */
-
-#define ASYNC_LOW_LATENCY 0x2000 /* Request low latency behaviour */
-
-#define ASYNC_BUGGY_UART  0x4000 /* This is a buggy UART, skip some safety
-				  * checks.  Note: can be dangerous! */
-
-#define ASYNC_AUTOPROBE	 0x8000 /* Port was autoprobed by PCI or PNP code */
-
-#define ASYNC_FLAGS	0x7FFF	/* Possible legal async flags */
-#define ASYNC_USR_MASK	0x3430	/* Legal flags that non-privileged
-				 * users can set or reset */
-
-/* Internal flags used only by kernel/chr_drv/serial.c */
-#define ASYNC_INITIALIZED	0x80000000 /* Serial port was initialized */
-#define ASYNC_NORMAL_ACTIVE	0x20000000 /* Normal device is active */
-#define ASYNC_BOOT_AUTOCONF	0x10000000 /* Autoconfigure port on bootup */
-#define ASYNC_CLOSING		0x08000000 /* Serial port is closing */
-#define ASYNC_CTS_FLOW		0x04000000 /* Do CTS flow control */
-#define ASYNC_CHECK_CD		0x02000000 /* i.e., CLOCAL */
-#define ASYNC_SHARE_IRQ		0x01000000 /* for multifunction cards
-					     --- no longer used */
-#define ASYNC_CONS_FLOW		0x00800000 /* flow control for console  */
-
-#define ASYNC_BOOT_ONLYMCA	0x00400000 /* Probe only if MCA bus */
-#define ASYNC_INTERNAL_FLAGS	0xFFC00000 /* Internal flags */
+#define ASYNCB_HUP_NOTIFY	 0 /* Notify getty on hangups and closes
+				    * on the callout port */
+#define ASYNCB_FOURPORT		 1 /* Set OU1, OUT2 per AST Fourport settings */
+#define ASYNCB_SAK		 2 /* Secure Attention Key (Orange book) */
+#define ASYNCB_SPLIT_TERMIOS	 3 /* Separate termios for dialin/callout */
+#define ASYNCB_SPD_HI		 4 /* Use 56000 instead of 38400 bps */
+#define ASYNCB_SPD_VHI		 5 /* Use 115200 instead of 38400 bps */
+#define ASYNCB_SKIP_TEST	 6 /* Skip UART test during autoconfiguration */
+#define ASYNCB_AUTO_IRQ		 7 /* Do automatic IRQ during
+				    * autoconfiguration */
+#define ASYNCB_SESSION_LOCKOUT	 8 /* Lock out cua opens based on session */
+#define ASYNCB_PGRP_LOCKOUT	 9 /* Lock out cua opens based on pgrp */
+#define ASYNCB_CALLOUT_NOHUP	10 /* Don't do hangups for cua device */
+#define ASYNCB_HARDPPS_CD	11 /* Call hardpps when CD goes high  */
+#define ASYNCB_SPD_SHI		12 /* Use 230400 instead of 38400 bps */
+#define ASYNCB_LOW_LATENCY	13 /* Request low latency behaviour */
+#define ASYNCB_BUGGY_UART	14 /* This is a buggy UART, skip some safety
+				    * checks.  Note: can be dangerous! */
+#define ASYNCB_AUTOPROBE	15 /* Port was autoprobed by PCI or PNP code */
+#define ASYNCB_LAST_USER	15
+
+/* Internal flags used only by kernel */
+#define ASYNCB_INITIALIZED	31 /* Serial port was initialized */
+#define ASYNCB_NORMAL_ACTIVE	29 /* Normal device is active */
+#define ASYNCB_BOOT_AUTOCONF	28 /* Autoconfigure port on bootup */
+#define ASYNCB_CLOSING		27 /* Serial port is closing */
+#define ASYNCB_CTS_FLOW		26 /* Do CTS flow control */
+#define ASYNCB_CHECK_CD		25 /* i.e., CLOCAL */
+#define ASYNCB_SHARE_IRQ	24 /* for multifunction cards, no longer used */
+#define ASYNCB_CONS_FLOW	23 /* flow control for console  */
+#define ASYNCB_BOOT_ONLYMCA	22 /* Probe only if MCA bus */
+#define ASYNCB_FIRST_KERNEL	22
+
+#define ASYNC_HUP_NOTIFY	(1U << ASYNCB_HUP_NOTIFY)
+#define ASYNC_FOURPORT		(1U << ASYNCB_FOURPORT)
+#define ASYNC_SAK		(1U << ASYNCB_SAK)
+#define ASYNC_SPLIT_TERMIOS	(1U << ASYNCB_SPLIT_TERMIOS)
+#define ASYNC_SPD_HI		(1U << ASYNCB_SPD_HI)
+#define ASYNC_SPD_VHI		(1U << ASYNCB_SPD_VHI)
+#define ASYNC_SKIP_TEST		(1U << ASYNCB_SKIP_TEST)
+#define ASYNC_AUTO_IRQ		(1U << ASYNCB_AUTO_IRQ)
+#define ASYNC_SESSION_LOCKOUT	(1U << ASYNCB_SESSION_LOCKOUT)
+#define ASYNC_PGRP_LOCKOUT	(1U << ASYNCB_PGRP_LOCKOUT)
+#define ASYNC_CALLOUT_NOHUP	(1U << ASYNCB_CALLOUT_NOHUP)
+#define ASYNC_HARDPPS_CD	(1U << ASYNCB_HARDPPS_CD)
+#define ASYNC_SPD_SHI		(1U << ASYNCB_SPD_SHI)
+#define ASYNC_LOW_LATENCY	(1U << ASYNCB_LOW_LATENCY)
+#define ASYNC_BUGGY_UART	(1U << ASYNCB_BUGGY_UART)
+#define ASYNC_AUTOPROBE		(1U << ASYNCB_AUTOPROBE)
+
+#define ASYNC_FLAGS		((1U << ASYNCB_LAST_USER) - 1)
+#define ASYNC_USR_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI| \
+		ASYNC_CALLOUT_NOHUP|ASYNC_SPD_SHI|ASYNC_LOW_LATENCY)
+#define ASYNC_SPD_CUST		(ASYNC_SPD_HI|ASYNC_SPD_VHI)
+#define ASYNC_SPD_WARP		(ASYNC_SPD_HI|ASYNC_SPD_SHI)
+#define ASYNC_SPD_MASK		(ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI)
+
+#define ASYNC_INITIALIZED	(1U << ASYNCB_INITIALIZED)
+#define ASYNC_NORMAL_ACTIVE	(1U << ASYNCB_NORMAL_ACTIVE)
+#define ASYNC_BOOT_AUTOCONF	(1U << ASYNCB_BOOT_AUTOCONF)
+#define ASYNC_CLOSING		(1U << ASYNCB_CLOSING)
+#define ASYNC_CTS_FLOW		(1U << ASYNCB_CTS_FLOW)
+#define ASYNC_CHECK_CD		(1U << ASYNCB_CHECK_CD)
+#define ASYNC_SHARE_IRQ		(1U << ASYNCB_SHARE_IRQ)
+#define ASYNC_CONS_FLOW		(1U << ASYNCB_CONS_FLOW)
+#define ASYNC_BOOT_ONLYMCA	(1U << ASYNCB_BOOT_ONLYMCA)
+#define ASYNC_INTERNAL_FLAGS	(~((1U << ASYNCB_FIRST_KERNEL) - 1))
 
 /*
  * Multiport serial configuration structure --- external structure
-- 
GitLab


From a391ad0f09014856bbc4eeea309593eba977b3b0 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:40:17 +0100
Subject: [PATCH 1989/2198] rocket: fix test_bit parameters

Switch from ASYNC_* to ASYNCB_*, because {test,set}_bit expect
bit number, not mask.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/rocket.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c
index 7399188049d84..63d5b628477a7 100644
--- a/drivers/char/rocket.c
+++ b/drivers/char/rocket.c
@@ -939,7 +939,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 	/*
 	 * Info->count is now 1; so it's safe to sleep now.
 	 */
-	if (!test_bit(ASYNC_INITIALIZED, &port->flags)) {
+	if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) {
 		cp = &info->channel;
 		sSetRxTrigger(cp, TRIG_1);
 		if (sGetChanStatus(cp) & CD_ACT)
@@ -963,7 +963,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
 		sEnRxFIFO(cp);
 		sEnTransmit(cp);
 
-		set_bit(ASYNC_INITIALIZED, &info->port.flags);
+		set_bit(ASYNCB_INITIALIZED, &info->port.flags);
 
 		/*
 		 * Set up the tty->alt_speed kludge
@@ -1646,7 +1646,7 @@ static int rp_write(struct tty_struct *tty,
 	/*  Write remaining data into the port's xmit_buf */
 	while (1) {
 		/* Hung up ? */
-		if (!test_bit(ASYNC_NORMAL_ACTIVE, &info->port.flags))
+		if (!test_bit(ASYNCB_NORMAL_ACTIVE, &info->port.flags))
 			goto end;
 		c = min(count, XMIT_BUF_SIZE - info->xmit_cnt - 1);
 		c = min(c, XMIT_BUF_SIZE - info->xmit_head);
-- 
GitLab


From c3301a5c04800bcf8afc8a815bf9e570a4e25a08 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 12:41:05 +0100
Subject: [PATCH 1990/2198] epca: fix test_bit parameters

Switch from ASYNC_* to ASYNCB_*, because test_bit expects
bit number, not mask.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/epca.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 8797b77423505..710ee93330578 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -1518,7 +1518,7 @@ static void doevent(int crd)
 		if (event & MODEMCHG_IND) {
 			/* A modem signal change has been indicated */
 			ch->imodem = mstat;
-			if (test_bit(ASYNC_CHECK_CD, &ch->port.flags)) {
+			if (test_bit(ASYNCB_CHECK_CD, &ch->port.flags)) {
 				/* We are now receiving dcd */
 				if (mstat & ch->dcd)
 					wake_up_interruptible(&ch->port.open_wait);
@@ -1765,9 +1765,9 @@ static void epcaparam(struct tty_struct *tty, struct channel *ch)
 		 * that the driver will wait on carrier detect.
 		 */
 		if (ts->c_cflag & CLOCAL)
-			clear_bit(ASYNC_CHECK_CD, &ch->port.flags);
+			clear_bit(ASYNCB_CHECK_CD, &ch->port.flags);
 		else
-			set_bit(ASYNC_CHECK_CD, &ch->port.flags);
+			set_bit(ASYNCB_CHECK_CD, &ch->port.flags);
 		mval = ch->m_dtr | ch->m_rts;
 	} /* End CBAUD not detected */
 	iflag = termios2digi_i(ch, ts->c_iflag);
@@ -2244,7 +2244,8 @@ static void do_softint(struct work_struct *work)
 			if (test_and_clear_bit(EPCA_EVENT_HANGUP, &ch->event)) {
 				tty_hangup(tty);
 				wake_up_interruptible(&ch->port.open_wait);
-				clear_bit(ASYNC_NORMAL_ACTIVE, &ch->port.flags);
+				clear_bit(ASYNCB_NORMAL_ACTIVE,
+						&ch->port.flags);
 			}
 		}
 		tty_kref_put(tty);
-- 
GitLab


From 70fd8fdecc4430ffcede7704dd812d4054d1faf9 Mon Sep 17 00:00:00 2001
From: Andre Przywara <andre.przywara@amd.com>
Date: Thu, 11 Jun 2009 12:41:57 +0100
Subject: [PATCH 1991/2198] 8250_pci: add the OXCB950 chip to the 8250 PCI
 driver.

This adds support for the following serial controller chip:
Oxford Semiconductor OXCB950 for PCI Cardbus interface
http://www.transdimension.com/products/serial/OXCB950.html

on this card:
ExSys EX-1370 1 port high-speed serial card for ExpressCard/34 slot

Signed-off-by: Andre Przywara <andre.przywara@amd.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250_pci.c | 3 +++
 include/linux/pci_ids.h   | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/drivers/serial/8250_pci.c b/drivers/serial/8250_pci.c
index 938bc1b6c3faa..e371a9c15341e 100644
--- a/drivers/serial/8250_pci.c
+++ b/drivers/serial/8250_pci.c
@@ -2776,6 +2776,9 @@ static struct pci_device_id serial_pci_tbl[] = {
 	{	PCI_VENDOR_ID_OXSEMI, 0x950a,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_2_1130000 },
+	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_C950,
+		PCI_VENDOR_ID_OXSEMI, PCI_SUBDEVICE_ID_OXSEMI_C950, 0, 0,
+		pbn_b0_1_921600 },
 	{	PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
 		PCI_ANY_ID, PCI_ANY_ID, 0, 0,
 		pbn_b0_4_115200 },
diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h
index 0f71812d67d32..d7d1c41a0b17d 100644
--- a/include/linux/pci_ids.h
+++ b/include/linux/pci_ids.h
@@ -1996,10 +1996,12 @@
 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_U	0xC118
 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_GU	0xC11C
 #define PCI_DEVICE_ID_OXSEMI_16PCI954	0x9501
+#define PCI_DEVICE_ID_OXSEMI_C950	0x950B
 #define PCI_DEVICE_ID_OXSEMI_16PCI95N	0x9511
 #define PCI_DEVICE_ID_OXSEMI_16PCI954PP	0x9513
 #define PCI_DEVICE_ID_OXSEMI_16PCI952	0x9521
 #define PCI_DEVICE_ID_OXSEMI_16PCI952PP	0x9523
+#define PCI_SUBDEVICE_ID_OXSEMI_C950	0x0001
 
 #define PCI_VENDOR_ID_CHELSIO		0x1425
 
-- 
GitLab


From 5b0ed5263cb089500052f8c1ab6e0706bebf0d83 Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Thu, 11 Jun 2009 12:42:49 +0100
Subject: [PATCH 1992/2198] x86: fix ktermios-termio conversion

The legacy TCSETA{,W,F} ioctls failed to set the termio->c_line field
on x86. This adds a missing get_user.

The same ioctls also fail to report faulting user pointers, which
we keep ignoring.

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/termios.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
index f72956331c493..c4ee8056bacaf 100644
--- a/arch/x86/include/asm/termios.h
+++ b/arch/x86/include/asm/termios.h
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
 	SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
 	SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
 	SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+	get_user(termios->c_line, &termio->c_line);
 	return copy_from_user(termios->c_cc, termio->c_cc, NCC);
 }
 
-- 
GitLab


From 38db89799bdf11625a831c5af33938dcb11908b6 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:44:17 +0100
Subject: [PATCH 1993/2198] tty: throttling race fix

The tty throttling code can race due to the lock drops. It takes very high
loads but this has been observed and verified by Rob Duncan.

The basic problem is that on an SMP box we can go

	CPU #1				CPU #2
	need to throttle ?
	suppose we should		buffer space cleared
					are we throttled
					yes ? - unthrottle
	call throttle method

This changeet take the termios lock to protect against this. The termios
lock isn't the initial obvious candidate but many implementations of throttle
methods already need to poke around their own termios structures (and nobody
really locks them against a racing change of flow control).

This does mean that anyone who is setting tty->low_latency = 1 and then
calling tty_flip_buffer_push from their unthrottle method is going to end up
collapsing in a pile of locks. However we've removed all the known bogus
users of low_latency = 1 and such use isn't safe anyway for other reasons so
catching it would be an improvement.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_ioctl.c   | 13 +++++++++++++
 include/linux/tty_driver.h |  6 ++++--
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index 6f4c7d0a53bf3..2401dbcbee9c1 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -97,14 +97,19 @@ EXPORT_SYMBOL(tty_driver_flush_buffer);
  *	@tty: terminal
  *
  *	Indicate that a tty should stop transmitting data down the stack.
+ *	Takes the termios mutex to protect against parallel throttle/unthrottle
+ *	and also to ensure the driver can consistently reference its own
+ *	termios data at this point when implementing software flow control.
  */
 
 void tty_throttle(struct tty_struct *tty)
 {
+	mutex_lock(&tty->termios_mutex);
 	/* check TTY_THROTTLED first so it indicates our state */
 	if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) &&
 	    tty->ops->throttle)
 		tty->ops->throttle(tty);
+	mutex_unlock(&tty->termios_mutex);
 }
 EXPORT_SYMBOL(tty_throttle);
 
@@ -113,13 +118,21 @@ EXPORT_SYMBOL(tty_throttle);
  *	@tty: terminal
  *
  *	Indicate that a tty may continue transmitting data down the stack.
+ *	Takes the termios mutex to protect against parallel throttle/unthrottle
+ *	and also to ensure the driver can consistently reference its own
+ *	termios data at this point when implementing software flow control.
+ *
+ *	Drivers should however remember that the stack can issue a throttle,
+ *	then change flow control method, then unthrottle.
  */
 
 void tty_unthrottle(struct tty_struct *tty)
 {
+	mutex_lock(&tty->termios_mutex);
 	if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) &&
 	    tty->ops->unthrottle)
 		tty->ops->unthrottle(tty);
+	mutex_unlock(&tty->termios_mutex);
 }
 EXPORT_SYMBOL(tty_unthrottle);
 
diff --git a/include/linux/tty_driver.h b/include/linux/tty_driver.h
index bcba84ea2d86a..3566129384a4d 100644
--- a/include/linux/tty_driver.h
+++ b/include/linux/tty_driver.h
@@ -127,7 +127,8 @@
  * 	the line discipline are close to full, and it should somehow
  * 	signal that no more characters should be sent to the tty.
  *
- *	Optional: Always invoke via tty_throttle();
+ *	Optional: Always invoke via tty_throttle(), called under the
+ *	termios lock.
  * 
  * void (*unthrottle)(struct tty_struct * tty);
  *
@@ -135,7 +136,8 @@
  * 	that characters can now be sent to the tty without fear of
  * 	overrunning the input buffers of the line disciplines.
  * 
- *	Optional: Always invoke via tty_unthrottle();
+ *	Optional: Always invoke via tty_unthrottle(), called under the
+ *	termios lock.
  *
  * void (*stop)(struct tty_struct *tty);
  *
-- 
GitLab


From 4b6ae89ef40d0d5d4c85098c1d0a012333a68729 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:45:08 +0100
Subject: [PATCH 1994/2198] serial: update maintainers file

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index cf4abddfc8a40..1979167c16dc0 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -159,7 +159,8 @@ F:	drivers/net/r8169.c
 8250/16?50 (AND CLONE UARTS) SERIAL DRIVER
 L:	linux-serial@vger.kernel.org
 W:	http://serial.sourceforge.net
-S:	Orphan
+M:	alan@lxorguk.ukuu.org.uk
+S:	Odd Fixes
 F:	drivers/serial/8250*
 F:	include/linux/serial_8250.h
 
-- 
GitLab


From 620df3c0a5b70656c4de6049825de214f108218e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:45:53 +0100
Subject: [PATCH 1995/2198] pty: Fix a comment

We fixed the globals, so now fix the comment

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/pty.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index 31038a0052a2c..da2cb8c70c110 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -30,7 +30,6 @@
 
 #include <asm/system.h>
 
-/* These are global because they are accessed in tty_io.c */
 #ifdef CONFIG_UNIX98_PTYS
 static struct tty_driver *ptm_driver;
 static struct tty_driver *pts_driver;
-- 
GitLab


From 5f0878acba7db24323f5ba4055ec9a96895bb150 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:46:41 +0100
Subject: [PATCH 1996/2198] tty: Fix oops when scanning the polling list for
 kgdb

Costantino Leandro found a bug in tty_find_polling_driver and provided a
patch that fixed the crash but not the underlying bug. This fixes the
underlying bug where the list walk corrupts the values it is using on a
match but then reuses them if the open fails.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_io.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 66b99a2049e37..6c817398232e4 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -295,7 +295,7 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
 	struct tty_driver *p, *res = NULL;
 	int tty_line = 0;
 	int len;
-	char *str;
+	char *str, *stp;
 
 	for (str = name; *str; str++)
 		if ((*str >= '0' && *str <= '9') || *str == ',')
@@ -311,13 +311,14 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
 	list_for_each_entry(p, &tty_drivers, tty_drivers) {
 		if (strncmp(name, p->name, len) != 0)
 			continue;
-		if (*str == ',')
-			str++;
-		if (*str == '\0')
-			str = NULL;
+		stp = str;
+		if (*stp == ',')
+			stp++;
+		if (*stp == '\0')
+			stp = NULL;
 
 		if (tty_line >= 0 && tty_line <= p->num && p->ops &&
-		    p->ops->poll_init && !p->ops->poll_init(p, tty_line, str)) {
+		    p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) {
 			res = tty_driver_kref_get(p);
 			*line = tty_line;
 			break;
-- 
GitLab


From e8b70e7d3e86319a8b2aaabde3866833d92cd80f Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:48:02 +0100
Subject: [PATCH 1997/2198] tty: Extract various bits of ldisc code

Before trying to tackle the ldisc bugs the code needs to be a good deal
more readable, so do the simple extractions of routines first.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_io.c    | 24 +++++++---
 drivers/char/tty_ldisc.c | 97 +++++++++++++++++++++++++++-------------
 include/linux/tty.h      |  3 ++
 3 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 6c817398232e4..be49d0730bb99 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2481,6 +2481,24 @@ static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int
 	return tty->ops->tiocmset(tty, file, set, clear);
 }
 
+struct tty_struct *tty_pair_get_tty(struct tty_struct *tty)
+{
+	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+	    tty->driver->subtype == PTY_TYPE_MASTER)
+		tty = tty->link;
+	return tty;
+}
+EXPORT_SYMBOL(tty_pair_get_tty);
+
+struct tty_struct *tty_pair_get_pty(struct tty_struct *tty)
+{
+	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+	    tty->driver->subtype == PTY_TYPE_MASTER)
+	    return tty;
+	return tty->link;
+}
+EXPORT_SYMBOL(tty_pair_get_pty);
+
 /*
  * Split this up, as gcc can choke on it otherwise..
  */
@@ -2496,11 +2514,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	if (tty_paranoia_check(tty, inode, "tty_ioctl"))
 		return -EINVAL;
 
-	real_tty = tty;
-	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
-	    tty->driver->subtype == PTY_TYPE_MASTER)
-		real_tty = tty->link;
-
+	real_tty = tty_pair_get_tty(tty);
 
 	/*
 	 * Factor out some common prep work
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index f78f5b0127a88..e3c6416aa86db 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -443,6 +443,50 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 	}
 }
 
+/**
+ *	tty_ldisc_halt		-	shutdown the line discipline
+ *	@tty: tty device
+ *
+ *	Shut down the line discipline and work queue for this tty device.
+ *	The TTY_LDISC flag being cleared ensures no further references can
+ *	be obtained while the delayed work queue halt ensures that no more
+ *	data is fed to the ldisc.
+ *
+ *	In order to wait for any existing references to complete see 
+ *	tty_ldisc_wait_idle.
+ */
+
+static void tty_ldisc_halt(struct tty_struct *tty)
+{
+	clear_bit(TTY_LDISC, &tty->flags);
+	cancel_delayed_work(&tty->buf.work);
+	/*
+	 * Wait for ->hangup_work and ->buf.work handlers to terminate
+	 */
+	flush_scheduled_work();
+}
+
+/**
+ *	tty_ldisc_wait_idle	-	wait for the ldisc to become idle
+ *	@tty: tty to wait for
+ *
+ *	Wait for the line discipline to become idle. The discipline must
+ *	have been halted for this to guarantee it remains idle.
+ *
+ */
+
+static void tty_ldisc_wait_idle(struct tty_struct *tty)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&tty_ldisc_lock, flags);
+	while (tty->ldisc.refcount) {
+		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+		wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
+		spin_lock_irqsave(&tty_ldisc_lock, flags);
+	}
+	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+}
+
 /**
  *	tty_set_ldisc		-	set line discipline
  *	@tty: the terminal to set
@@ -636,6 +680,21 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 	return 0;
 }
 
+static void tty_ldisc_reinit(struct tty_struct *tty)
+{
+	struct tty_ldisc ld;
+
+	if (tty->ldisc.ops->close)
+		(tty->ldisc.ops->close)(tty);
+	tty_ldisc_put(tty->ldisc.ops);
+	/*
+	 *	Switch the line discipline back
+	 */
+	WARN_ON(tty_ldisc_get(N_TTY, &ld));
+	tty_ldisc_assign(tty, &ld);
+	tty_set_termios_ldisc(tty, N_TTY);
+}
+
 /**
  *	tty_ldisc_release		-	release line discipline
  *	@tty: tty being shut down
@@ -647,58 +706,34 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 
 void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-	unsigned long flags;
-	struct tty_ldisc ld;
+
 	/*
 	 * Prevent flush_to_ldisc() from rescheduling the work for later.  Then
 	 * kill any delayed work. As this is the final close it does not
 	 * race with the set_ldisc code path.
 	 */
-	clear_bit(TTY_LDISC, &tty->flags);
-	cancel_delayed_work(&tty->buf.work);
 
-	/*
-	 * Wait for ->hangup_work and ->buf.work handlers to terminate
-	 */
-
-	flush_scheduled_work();
+	tty_ldisc_halt(tty);
 
 	/*
 	 * Wait for any short term users (we know they are just driver
 	 * side waiters as the file is closing so user count on the file
 	 * side is zero.
 	 */
-	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	while (tty->ldisc.refcount) {
-		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-		wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
-		spin_lock_irqsave(&tty_ldisc_lock, flags);
-	}
-	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+
+	tty_ldisc_wait_idle(tty);
+
 	/*
 	 * Shutdown the current line discipline, and reset it to N_TTY.
 	 *
 	 * FIXME: this MUST get fixed for the new reflocking
 	 */
-	if (tty->ldisc.ops->close)
-		(tty->ldisc.ops->close)(tty);
-	tty_ldisc_put(tty->ldisc.ops);
 
-	/*
-	 *	Switch the line discipline back
-	 */
-	WARN_ON(tty_ldisc_get(N_TTY, &ld));
-	tty_ldisc_assign(tty, &ld);
-	tty_set_termios_ldisc(tty, N_TTY);
+	tty_ldisc_reinit(tty);
 	if (o_tty) {
 		/* FIXME: could o_tty be in setldisc here ? */
 		clear_bit(TTY_LDISC, &o_tty->flags);
-		if (o_tty->ldisc.ops->close)
-			(o_tty->ldisc.ops->close)(o_tty);
-		tty_ldisc_put(o_tty->ldisc.ops);
-		WARN_ON(tty_ldisc_get(N_TTY, &ld));
-		tty_ldisc_assign(o_tty, &ld);
-		tty_set_termios_ldisc(o_tty, N_TTY);
+		tty_ldisc_reinit(o_tty);
 	}
 }
 
diff --git a/include/linux/tty.h b/include/linux/tty.h
index bed5a3d403074..f9c13c83790c0 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -428,6 +428,9 @@ extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
 extern void tty_release_dev(struct file *filp);
 extern int tty_init_termios(struct tty_struct *tty);
 
+extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty);
+extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty);
+
 extern struct mutex tty_mutex;
 
 extern void tty_write_unlock(struct tty_struct *tty);
-- 
GitLab


From c65c9bc3efa5589f691276bb9db689119a711222 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:50:12 +0100
Subject: [PATCH 1998/2198] tty: rewrite the ldisc locking

There are several pretty much unfixable races in the old ldisc code, especially
with respect to pty behaviour and also to hangup. It's easier to rewrite the
code than simply try and patch it up.

This patch
- splits the ldisc from the tty (so we will be able to refcount it more cleanly
  later)
- introduces a mutex lock for ldisc changing on an active device
- fixes the complete mess that hangup caused
- implements hopefully correct setldisc/close/hangup locking

There are still some problems around pty pairs that have always been there but
at least it is now possible to understand the code and fix further problems.

This fixes the following known bugs
- hang up can leak ldisc references
- hang up may not call open/close on ldisc in a matched way
- pty/tty pairs can deadlock during an ldisc change
- reading the ldisc proc files can cause every ldisc to be loaded

and probably a few other of the mysterious ldisc race reports.

I'm sure it also adds the odd new one.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/bluetooth/hci_ldisc.c |   4 +-
 drivers/char/cyclades.c       |   2 +-
 drivers/char/epca.c           |   4 +-
 drivers/char/ip2/i2lib.c      |   4 +-
 drivers/char/ip2/ip2main.c    |   4 +-
 drivers/char/n_hdlc.c         |   4 +-
 drivers/char/pty.c            |  10 +-
 drivers/char/selection.c      |   2 +-
 drivers/char/tty_io.c         |  64 +----
 drivers/char/tty_ldisc.c      | 477 +++++++++++++++++++++-------------
 include/linux/tty.h           |   9 +-
 11 files changed, 330 insertions(+), 254 deletions(-)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index af761dc434f63..688015128594a 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -277,8 +277,8 @@ static int hci_uart_tty_open(struct tty_struct *tty)
 	/* FIXME: why is this needed. Note don't use ldisc_ref here as the
 	   open path is before the ldisc is referencable */
 
-	if (tty->ldisc.ops->flush_buffer)
-		tty->ldisc.ops->flush_buffer(tty);
+	if (tty->ldisc->ops->flush_buffer)
+		tty->ldisc->ops->flush_buffer(tty);
 	tty_driver_flush_buffer(tty);
 
 	return 0;
diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c
index 4560190517422..f3366d3f06cfe 100644
--- a/drivers/char/cyclades.c
+++ b/drivers/char/cyclades.c
@@ -5200,7 +5200,7 @@ static int cyclades_proc_show(struct seq_file *m, void *v)
 					(cur_jifs - info->idle_stats.recv_idle)/
 					HZ, info->idle_stats.overruns,
 					/* FIXME: double check locking */
-					(long)info->port.tty->ldisc.ops->num);
+					(long)info->port.tty->ldisc->ops->num);
 			else
 				seq_printf(m, "%3d %8lu %10lu %8lu "
 					"%10lu %8lu %9lu %6ld\n",
diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 710ee93330578..abef1f7d84fef 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -2114,8 +2114,8 @@ static int pc_ioctl(struct tty_struct *tty, struct file *file,
 			tty_wait_until_sent(tty, 0);
 		} else {
 			/* ldisc lock already held in ioctl */
-			if (tty->ldisc.ops->flush_buffer)
-				tty->ldisc.ops->flush_buffer(tty);
+			if (tty->ldisc->ops->flush_buffer)
+				tty->ldisc->ops->flush_buffer(tty);
 		}
 		unlock_kernel();
 		/* Fall Thru */
diff --git a/drivers/char/ip2/i2lib.c b/drivers/char/ip2/i2lib.c
index 0061e18aff604..0d10b89218ed1 100644
--- a/drivers/char/ip2/i2lib.c
+++ b/drivers/char/ip2/i2lib.c
@@ -868,11 +868,11 @@ i2Input(i2ChanStrPtr pCh)
 		amountToMove = count;
 	}
 	// Move the first block
-	pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY,
+	pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY,
 		 &(pCh->Ibuf[stripIndex]), NULL, amountToMove );
 	// If we needed to wrap, do the second data move
 	if (count > amountToMove) {
-		pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY,
+		pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY,
 		 pCh->Ibuf, NULL, count - amountToMove );
 	}
 	// Bump and wrap the stripIndex all at once by the amount of data read. This
diff --git a/drivers/char/ip2/ip2main.c b/drivers/char/ip2/ip2main.c
index afd9247cf082d..517271c762e6b 100644
--- a/drivers/char/ip2/ip2main.c
+++ b/drivers/char/ip2/ip2main.c
@@ -1315,8 +1315,8 @@ static inline void  isig(int sig, struct tty_struct *tty, int flush)
 	if (tty->pgrp)
 		kill_pgrp(tty->pgrp, sig, 1);
 	if (flush || !L_NOFLSH(tty)) {
-		if ( tty->ldisc.ops->flush_buffer )  
-			tty->ldisc.ops->flush_buffer(tty);
+		if ( tty->ldisc->ops->flush_buffer )  
+			tty->ldisc->ops->flush_buffer(tty);
 		i2InputFlush( tty->driver_data );
 	}
 }
diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c
index bacb3e2872ae4..461ece591a5bf 100644
--- a/drivers/char/n_hdlc.c
+++ b/drivers/char/n_hdlc.c
@@ -342,8 +342,8 @@ static int n_hdlc_tty_open (struct tty_struct *tty)
 #endif
 	
 	/* Flush any pending characters in the driver and discipline. */
-	if (tty->ldisc.ops->flush_buffer)
-		tty->ldisc.ops->flush_buffer(tty);
+	if (tty->ldisc->ops->flush_buffer)
+		tty->ldisc->ops->flush_buffer(tty);
 
 	tty_driver_flush_buffer(tty);
 		
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index da2cb8c70c110..5acd29e6e0430 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -110,7 +110,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf,
 	c = to->receive_room;
 	if (c > count)
 		c = count;
-	to->ldisc.ops->receive_buf(to, buf, NULL, c);
+	to->ldisc->ops->receive_buf(to, buf, NULL, c);
 
 	return c;
 }
@@ -148,11 +148,11 @@ static int pty_chars_in_buffer(struct tty_struct *tty)
 	int count;
 
 	/* We should get the line discipline lock for "tty->link" */
-	if (!to || !to->ldisc.ops->chars_in_buffer)
+	if (!to || !to->ldisc->ops->chars_in_buffer)
 		return 0;
 
 	/* The ldisc must report 0 if no characters available to be read */
-	count = to->ldisc.ops->chars_in_buffer(to);
+	count = to->ldisc->ops->chars_in_buffer(to);
 
 	if (tty->driver->subtype == PTY_TYPE_SLAVE)
 		return count;
@@ -186,8 +186,8 @@ static void pty_flush_buffer(struct tty_struct *tty)
 	if (!to)
 		return;
 
-	if (to->ldisc.ops->flush_buffer)
-		to->ldisc.ops->flush_buffer(to);
+	if (to->ldisc->ops->flush_buffer)
+		to->ldisc->ops->flush_buffer(to);
 
 	if (to->packet) {
 		spin_lock_irqsave(&tty->ctrl_lock, flags);
diff --git a/drivers/char/selection.c b/drivers/char/selection.c
index cb8ca56989630..f97b9e8480645 100644
--- a/drivers/char/selection.c
+++ b/drivers/char/selection.c
@@ -327,7 +327,7 @@ int paste_selection(struct tty_struct *tty)
 		}
 		count = sel_buffer_lth - pasted;
 		count = min(count, tty->receive_room);
-		tty->ldisc.ops->receive_buf(tty, sel_buffer + pasted,
+		tty->ldisc->ops->receive_buf(tty, sel_buffer + pasted,
 								NULL, count);
 		pasted += count;
 	}
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index be49d0730bb99..2f44b0b241c3a 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -491,22 +491,6 @@ void tty_ldisc_flush(struct tty_struct *tty)
 
 EXPORT_SYMBOL_GPL(tty_ldisc_flush);
 
-/**
- *	tty_reset_termios	-	reset terminal state
- *	@tty: tty to reset
- *
- *	Restore a terminal to the driver default state
- */
-
-static void tty_reset_termios(struct tty_struct *tty)
-{
-	mutex_lock(&tty->termios_mutex);
-	*tty->termios = tty->driver->init_termios;
-	tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
-	tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
-	mutex_unlock(&tty->termios_mutex);
-}
-
 /**
  *	do_tty_hangup		-	actual handler for hangup events
  *	@work: tty device
@@ -536,7 +520,6 @@ static void do_tty_hangup(struct work_struct *work)
 	struct file *cons_filp = NULL;
 	struct file *filp, *f = NULL;
 	struct task_struct *p;
-	struct tty_ldisc *ld;
 	int    closecount = 0, n;
 	unsigned long flags;
 	int refs = 0;
@@ -567,40 +550,8 @@ static void do_tty_hangup(struct work_struct *work)
 		filp->f_op = &hung_up_tty_fops;
 	}
 	file_list_unlock();
-	/*
-	 * FIXME! What are the locking issues here? This may me overdoing
-	 * things... This question is especially important now that we've
-	 * removed the irqlock.
-	 */
-	ld = tty_ldisc_ref(tty);
-	if (ld != NULL) {
-		/* We may have no line discipline at this point */
-		if (ld->ops->flush_buffer)
-			ld->ops->flush_buffer(tty);
-		tty_driver_flush_buffer(tty);
-		if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
-		    ld->ops->write_wakeup)
-			ld->ops->write_wakeup(tty);
-		if (ld->ops->hangup)
-			ld->ops->hangup(tty);
-	}
-	/*
-	 * FIXME: Once we trust the LDISC code better we can wait here for
-	 * ldisc completion and fix the driver call race
-	 */
-	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
-	wake_up_interruptible_poll(&tty->read_wait, POLLIN);
-	/*
-	 * Shutdown the current line discipline, and reset it to
-	 * N_TTY.
-	 */
-	if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS)
-		tty_reset_termios(tty);
-	/* Defer ldisc switch */
-	/* tty_deferred_ldisc_switch(N_TTY);
 
-	  This should get done automatically when the port closes and
-	  tty_release is called */
+	tty_ldisc_hangup(tty);
 
 	read_lock(&tasklist_lock);
 	if (tty->session) {
@@ -629,12 +580,15 @@ static void do_tty_hangup(struct work_struct *work)
 	read_unlock(&tasklist_lock);
 
 	spin_lock_irqsave(&tty->ctrl_lock, flags);
-	tty->flags = 0;
+	clear_bit(TTY_THROTTLED, &tty->flags);
+	clear_bit(TTY_PUSH, &tty->flags);
+	clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
 	put_pid(tty->session);
 	put_pid(tty->pgrp);
 	tty->session = NULL;
 	tty->pgrp = NULL;
 	tty->ctrl_status = 0;
+	set_bit(TTY_HUPPED, &tty->flags);
 	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 
 	/* Account for the p->signal references we killed */
@@ -660,10 +614,7 @@ static void do_tty_hangup(struct work_struct *work)
 	 * can't yet guarantee all that.
 	 */
 	set_bit(TTY_HUPPED, &tty->flags);
-	if (ld) {
-		tty_ldisc_enable(tty);
-		tty_ldisc_deref(ld);
-	}
+	tty_ldisc_enable(tty);
 	unlock_kernel();
 	if (f)
 		fput(f);
@@ -2570,7 +2521,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCGSID:
 		return tiocgsid(tty, real_tty, p);
 	case TIOCGETD:
-		return put_user(tty->ldisc.ops->num, (int __user *)p);
+		return put_user(tty->ldisc->ops->num, (int __user *)p);
 	case TIOCSETD:
 		return tiocsetd(tty, p);
 	/*
@@ -2785,6 +2736,7 @@ void initialize_tty_struct(struct tty_struct *tty,
 	tty->buf.head = tty->buf.tail = NULL;
 	tty_buffer_init(tty);
 	mutex_init(&tty->termios_mutex);
+	mutex_init(&tty->ldisc_mutex);
 	init_waitqueue_head(&tty->write_wait);
 	init_waitqueue_head(&tty->read_wait);
 	INIT_WORK(&tty->hangup_work, do_tty_hangup);
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index e3c6416aa86db..a58a19a6a5e10 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -115,19 +115,22 @@ EXPORT_SYMBOL(tty_unregister_ldisc);
 /**
  *	tty_ldisc_try_get	-	try and reference an ldisc
  *	@disc: ldisc number
- *	@ld: tty ldisc structure to complete
  *
  *	Attempt to open and lock a line discipline into place. Return
- *	the line discipline refcounted and assigned in ld. On an error
- *	report the error code back
+ *	the line discipline refcounted or an error.
  */
 
-static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
+static struct tty_ldisc *tty_ldisc_try_get(int disc)
 {
 	unsigned long flags;
+	struct tty_ldisc *ld;
 	struct tty_ldisc_ops *ldops;
 	int err = -EINVAL;
 	
+	ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL);
+	if (ld == NULL)
+		return ERR_PTR(-ENOMEM);
+
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
 	ld->ops = NULL;
 	ldops = tty_ldiscs[disc];
@@ -140,17 +143,19 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
 			/* lock it */
 			ldops->refcount++;
 			ld->ops = ldops;
+			ld->refcount = 0;
 			err = 0;
 		}
 	}
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-	return err;
+	if (err)
+		return ERR_PTR(err);
+	return ld;
 }
 
 /**
  *	tty_ldisc_get		-	take a reference to an ldisc
  *	@disc: ldisc number
- *	@ld: tty line discipline structure to use
  *
  *	Takes a reference to a line discipline. Deals with refcounts and
  *	module locking counts. Returns NULL if the discipline is not available.
@@ -161,44 +166,46 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
  *		takes tty_ldisc_lock to guard against ldisc races
  */
 
-static int tty_ldisc_get(int disc, struct tty_ldisc *ld)
+static struct tty_ldisc *tty_ldisc_get(int disc)
 {
-	int err;
+	struct tty_ldisc *ld;
 
 	if (disc < N_TTY || disc >= NR_LDISCS)
-		return -EINVAL;
-	err = tty_ldisc_try_get(disc, ld);
-	if (err < 0) {
+		return ERR_PTR(-EINVAL);
+	ld = tty_ldisc_try_get(disc);
+	if (IS_ERR(ld)) {
 		request_module("tty-ldisc-%d", disc);
-		err = tty_ldisc_try_get(disc, ld);
+		ld = tty_ldisc_try_get(disc);
 	}
-	return err;
+	return ld;
 }
 
 /**
  *	tty_ldisc_put		-	drop ldisc reference
- *	@disc: ldisc number
+ *	@ld: ldisc
  *
  *	Drop a reference to a line discipline. Manage refcounts and
- *	module usage counts
+ *	module usage counts. Free the ldisc once the recount hits zero.
  *
  *	Locking:
  *		takes tty_ldisc_lock to guard against ldisc races
  */
 
-static void tty_ldisc_put(struct tty_ldisc_ops *ld)
+static void tty_ldisc_put(struct tty_ldisc *ld)
 {
 	unsigned long flags;
-	int disc = ld->num;
+	int disc = ld->ops->num;
+	struct tty_ldisc_ops *ldo;
 
 	BUG_ON(disc < N_TTY || disc >= NR_LDISCS);
 
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	ld = tty_ldiscs[disc];
-	BUG_ON(ld->refcount == 0);
-	ld->refcount--;
-	module_put(ld->owner);
+	ldo = tty_ldiscs[disc];
+	BUG_ON(ldo->refcount == 0);
+	ldo->refcount--;
+	module_put(ldo->owner);
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+	kfree(ld);
 }
 
 static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
@@ -219,12 +226,13 @@ static void tty_ldiscs_seq_stop(struct seq_file *m, void *v)
 static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
 {
 	int i = *(loff_t *)v;
-	struct tty_ldisc ld;
+	struct tty_ldisc *ld;
 	
-	if (tty_ldisc_get(i, &ld) < 0)
+	ld = tty_ldisc_try_get(i);
+	if (IS_ERR(ld))
 		return 0;
-	seq_printf(m, "%-10s %2d\n", ld.ops->name ? ld.ops->name : "???", i);
-	tty_ldisc_put(ld.ops);
+	seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i);
+	tty_ldisc_put(ld);
 	return 0;
 }
 
@@ -263,8 +271,7 @@ const struct file_operations tty_ldiscs_proc_fops = {
 
 static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld)
 {
-	ld->refcount = 0;
-	tty->ldisc = *ld;
+	tty->ldisc = ld;
 }
 
 /**
@@ -286,7 +293,7 @@ static int tty_ldisc_try(struct tty_struct *tty)
 	int ret = 0;
 
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	ld = &tty->ldisc;
+	ld = tty->ldisc;
 	if (test_bit(TTY_LDISC, &tty->flags)) {
 		ld->refcount++;
 		ret = 1;
@@ -315,8 +322,8 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
 {
 	/* wait_event is a macro */
 	wait_event(tty_ldisc_wait, tty_ldisc_try(tty));
-	WARN_ON(tty->ldisc.refcount == 0);
-	return &tty->ldisc;
+	WARN_ON(tty->ldisc->refcount == 0);
+	return tty->ldisc;
 }
 
 EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
@@ -335,7 +342,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
 struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
 {
 	if (tty_ldisc_try(tty))
-		return &tty->ldisc;
+		return tty->ldisc;
 	return NULL;
 }
 
@@ -407,6 +414,39 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
 	mutex_unlock(&tty->termios_mutex);
 }
 
+/**
+ *	tty_ldisc_open		-	open a line discipline
+ *	@tty: tty we are opening the ldisc on
+ *	@ld: discipline to open
+ *
+ *	A helper opening method. Also a convenient debugging and check
+ *	point.
+ */
+
+static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
+{
+	WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags));
+	if (ld->ops->open)
+		return ld->ops->open(tty);
+	return 0;
+}
+
+/**
+ *	tty_ldisc_close		-	close a line discipline
+ *	@tty: tty we are opening the ldisc on
+ *	@ld: discipline to close
+ *
+ *	A helper close method. Also a convenient debugging and check
+ *	point.
+ */
+
+static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
+{
+	WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags));
+	clear_bit(TTY_LDISC_OPEN, &tty->flags);
+	if (ld->ops->close)
+		ld->ops->close(tty);
+}
 
 /**
  *	tty_ldisc_restore	-	helper for tty ldisc change
@@ -420,31 +460,32 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
 static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 {
 	char buf[64];
-	struct tty_ldisc new_ldisc;
+	struct tty_ldisc *new_ldisc;
+	int r;
 
 	/* There is an outstanding reference here so this is safe */
-	tty_ldisc_get(old->ops->num, old);
+	old = tty_ldisc_get(old->ops->num);
+	WARN_ON(IS_ERR(old));
 	tty_ldisc_assign(tty, old);
 	tty_set_termios_ldisc(tty, old->ops->num);
-	if (old->ops->open && (old->ops->open(tty) < 0)) {
-		tty_ldisc_put(old->ops);
+	if (tty_ldisc_open(tty, old) < 0) {
+		tty_ldisc_put(old);
 		/* This driver is always present */
-		if (tty_ldisc_get(N_TTY, &new_ldisc) < 0)
+		new_ldisc =tty_ldisc_get(N_TTY);
+		if (IS_ERR(new_ldisc))
 			panic("n_tty: get");
-		tty_ldisc_assign(tty, &new_ldisc);
+		tty_ldisc_assign(tty, new_ldisc);
 		tty_set_termios_ldisc(tty, N_TTY);
-		if (new_ldisc.ops->open) {
-			int r = new_ldisc.ops->open(tty);
-				if (r < 0)
-				panic("Couldn't open N_TTY ldisc for "
-				      "%s --- error %d.",
-				      tty_name(tty, buf), r);
-		}
+		r = tty_ldisc_open(tty, new_ldisc);
+		if (r < 0)
+			panic("Couldn't open N_TTY ldisc for "
+			      "%s --- error %d.",
+			      tty_name(tty, buf), r);
 	}
 }
 
 /**
- *	tty_ldisc_halt		-	shutdown the line discipline
+ *	tty_ldisc_halt		-	shut down the line discipline
  *	@tty: tty device
  *
  *	Shut down the line discipline and work queue for this tty device.
@@ -456,14 +497,10 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
  *	tty_ldisc_wait_idle.
  */
 
-static void tty_ldisc_halt(struct tty_struct *tty)
+static int tty_ldisc_halt(struct tty_struct *tty)
 {
 	clear_bit(TTY_LDISC, &tty->flags);
-	cancel_delayed_work(&tty->buf.work);
-	/*
-	 * Wait for ->hangup_work and ->buf.work handlers to terminate
-	 */
-	flush_scheduled_work();
+	return cancel_delayed_work(&tty->buf.work);
 }
 
 /**
@@ -473,18 +510,22 @@ static void tty_ldisc_halt(struct tty_struct *tty)
  *	Wait for the line discipline to become idle. The discipline must
  *	have been halted for this to guarantee it remains idle.
  *
+ *	tty_ldisc_lock protects the ref counts currently.
  */
 
-static void tty_ldisc_wait_idle(struct tty_struct *tty)
+static int tty_ldisc_wait_idle(struct tty_struct *tty)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	while (tty->ldisc.refcount) {
+	while (tty->ldisc->refcount) {
 		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-		wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
+		if (wait_event_timeout(tty_ldisc_wait,
+				tty->ldisc->refcount == 0, 5 * HZ) == 0)
+			return -EBUSY;
 		spin_lock_irqsave(&tty_ldisc_lock, flags);
 	}
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+	return 0;
 }
 
 /**
@@ -493,38 +534,63 @@ static void tty_ldisc_wait_idle(struct tty_struct *tty)
  *	@ldisc: the line discipline
  *
  *	Set the discipline of a tty line. Must be called from a process
- *	context.
+ *	context. The ldisc change logic has to protect itself against any
+ *	overlapping ldisc change (including on the other end of pty pairs),
+ *	the close of one side of a tty/pty pair, and eventually hangup.
  *
- *	Locking: takes tty_ldisc_lock.
- *		 called functions take termios_mutex
+ *	Locking: takes tty_ldisc_lock, termios_mutex
  */
 
 int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 {
 	int retval;
-	struct tty_ldisc o_ldisc, new_ldisc;
-	int work;
-	unsigned long flags;
+	struct tty_ldisc *o_ldisc, *new_ldisc;
+	int work, o_work = 0;
 	struct tty_struct *o_tty;
 
-restart:
-	/* This is a bit ugly for now but means we can break the 'ldisc
-	   is part of the tty struct' assumption later */
-	retval = tty_ldisc_get(ldisc, &new_ldisc);
-	if (retval)
-		return retval;
+	new_ldisc = tty_ldisc_get(ldisc);
+	if (IS_ERR(new_ldisc))
+		return PTR_ERR(new_ldisc);
 
 	/*
-	 *	Problem: What do we do if this blocks ?
+	 *	We need to look at the tty locking here for pty/tty pairs
+	 *	when both sides try to change in parallel.
 	 */
 
-	tty_wait_until_sent(tty, 0);
+	o_tty = tty->link;	/* o_tty is the pty side or NULL */
+
 
-	if (tty->ldisc.ops->num == ldisc) {
-		tty_ldisc_put(new_ldisc.ops);
+	/*
+	 *	Check the no-op case
+	 */
+
+	if (tty->ldisc->ops->num == ldisc) {
+		tty_ldisc_put(new_ldisc);
 		return 0;
 	}
 
+	/*
+	 *	Problem: What do we do if this blocks ?
+	 *	We could deadlock here
+	 */
+
+	tty_wait_until_sent(tty, 0);
+
+	mutex_lock(&tty->ldisc_mutex);
+
+	/*
+	 *	We could be midstream of another ldisc change which has
+	 *	dropped the lock during processing. If so we need to wait.
+	 */
+
+	while (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
+		mutex_unlock(&tty->ldisc_mutex);
+		wait_event(tty_ldisc_wait,
+			test_bit(TTY_LDISC_CHANGING, &tty->flags) == 0);
+		mutex_lock(&tty->ldisc_mutex);
+	}
+	set_bit(TTY_LDISC_CHANGING, &tty->flags);
+		
 	/*
 	 *	No more input please, we are switching. The new ldisc
 	 *	will update this value in the ldisc open function
@@ -533,8 +599,6 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 	tty->receive_room = 0;
 
 	o_ldisc = tty->ldisc;
-	o_tty = tty->link;
-
 	/*
 	 *	Make sure we don't change while someone holds a
 	 *	reference to the line discipline. The TTY_LDISC bit
@@ -545,108 +609,181 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 	 *	with a userspace app continually trying to use the tty in
 	 *	parallel to the change and re-referencing the tty.
 	 */
-	clear_bit(TTY_LDISC, &tty->flags);
-	if (o_tty)
-		clear_bit(TTY_LDISC, &o_tty->flags);
 
-	spin_lock_irqsave(&tty_ldisc_lock, flags);
-	if (tty->ldisc.refcount || (o_tty && o_tty->ldisc.refcount)) {
-		if (tty->ldisc.refcount) {
-			/* Free the new ldisc we grabbed. Must drop the lock
-			   first. */
-			spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-			tty_ldisc_put(o_ldisc.ops);
-			/*
-			 * There are several reasons we may be busy, including
-			 * random momentary I/O traffic. We must therefore
-			 * retry. We could distinguish between blocking ops
-			 * and retries if we made tty_ldisc_wait() smarter.
-			 * That is up for discussion.
-			 */
-			if (wait_event_interruptible(tty_ldisc_wait, tty->ldisc.refcount == 0) < 0)
-				return -ERESTARTSYS;
-			goto restart;
-		}
-		if (o_tty && o_tty->ldisc.refcount) {
-			spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-			tty_ldisc_put(o_tty->ldisc.ops);
-			if (wait_event_interruptible(tty_ldisc_wait, o_tty->ldisc.refcount == 0) < 0)
-				return -ERESTARTSYS;
-			goto restart;
-		}
-	}
-	/*
-	 *	If the TTY_LDISC bit is set, then we are racing against
-	 *	another ldisc change
-	 */
-	if (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
-		struct tty_ldisc *ld;
-		spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-		tty_ldisc_put(new_ldisc.ops);
-		ld = tty_ldisc_ref_wait(tty);
-		tty_ldisc_deref(ld);
-		goto restart;
-	}
-	/*
-	 *	This flag is used to avoid two parallel ldisc changes. Once
-	 *	open and close are fine grained locked this may work better
-	 *	as a mutex shared with the open/close/hup paths
-	 */
-	set_bit(TTY_LDISC_CHANGING, &tty->flags);
+	work = tty_ldisc_halt(tty);
 	if (o_tty)
-		set_bit(TTY_LDISC_CHANGING, &o_tty->flags);
-	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-	
-	/*
-	 *	From this point on we know nobody has an ldisc
-	 *	usage reference, nor can they obtain one until
-	 *	we say so later on.
-	 */
+		o_work = tty_ldisc_halt(o_tty);
 
-	work = cancel_delayed_work(&tty->buf.work);
 	/*
-	 * Wait for ->hangup_work and ->buf.work handlers to terminate
-	 * MUST NOT hold locks here.
+	 * Wait for ->hangup_work and ->buf.work handlers to terminate.
+	 * We must drop the mutex here in case a hangup is also in process.
 	 */
+
+	mutex_unlock(&tty->ldisc_mutex);
+
 	flush_scheduled_work();
+
+	/* Let any existing reference holders finish */
+	retval = tty_ldisc_wait_idle(tty);
+	if (retval < 0) {
+		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
+		tty_ldisc_put(new_ldisc);
+		return retval;
+	}
+
+	mutex_lock(&tty->ldisc_mutex);
+	if (test_bit(TTY_HUPPED, &tty->flags)) {
+		/* We were raced by the hangup method. It will have stomped
+		   the ldisc data and closed the ldisc down */
+		clear_bit(TTY_LDISC_CHANGING, &tty->flags);
+		mutex_unlock(&tty->ldisc_mutex);
+		tty_ldisc_put(new_ldisc);
+		return -EIO;
+	}
+
 	/* Shutdown the current discipline. */
-	if (o_ldisc.ops->close)
-		(o_ldisc.ops->close)(tty);
+	tty_ldisc_close(tty, o_ldisc);
 
 	/* Now set up the new line discipline. */
-	tty_ldisc_assign(tty, &new_ldisc);
+	tty_ldisc_assign(tty, new_ldisc);
 	tty_set_termios_ldisc(tty, ldisc);
-	if (new_ldisc.ops->open)
-		retval = (new_ldisc.ops->open)(tty);
+
+	retval = tty_ldisc_open(tty, new_ldisc);
 	if (retval < 0) {
-		tty_ldisc_put(new_ldisc.ops);
-		tty_ldisc_restore(tty, &o_ldisc);
+		/* Back to the old one or N_TTY if we can't */
+		tty_ldisc_put(new_ldisc);
+		tty_ldisc_restore(tty, o_ldisc);
 	}
+
 	/* At this point we hold a reference to the new ldisc and a
 	   a reference to the old ldisc. If we ended up flipping back
 	   to the existing ldisc we have two references to it */
 
-	if (tty->ldisc.ops->num != o_ldisc.ops->num && tty->ops->set_ldisc)
+	if (tty->ldisc->ops->num != o_ldisc->ops->num && tty->ops->set_ldisc)
 		tty->ops->set_ldisc(tty);
 
-	tty_ldisc_put(o_ldisc.ops);
+	tty_ldisc_put(o_ldisc);
 
 	/*
-	 *	Allow ldisc referencing to occur as soon as the driver
-	 *	ldisc callback completes.
+	 *	Allow ldisc referencing to occur again
 	 */
 
 	tty_ldisc_enable(tty);
 	if (o_tty)
 		tty_ldisc_enable(o_tty);
 
-	/* Restart it in case no characters kick it off. Safe if
+	/* Restart the work queue in case no characters kick it off. Safe if
 	   already running */
 	if (work)
 		schedule_delayed_work(&tty->buf.work, 1);
+	if (o_work)
+		schedule_delayed_work(&o_tty->buf.work, 1);
+	mutex_unlock(&tty->ldisc_mutex);
 	return retval;
 }
 
+/**
+ *	tty_reset_termios	-	reset terminal state
+ *	@tty: tty to reset
+ *
+ *	Restore a terminal to the driver default state.
+ */
+
+static void tty_reset_termios(struct tty_struct *tty)
+{
+	mutex_lock(&tty->termios_mutex);
+	*tty->termios = tty->driver->init_termios;
+	tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
+	tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
+	mutex_unlock(&tty->termios_mutex);
+}
+
+
+/**
+ *	tty_ldisc_reinit	-	reinitialise the tty ldisc
+ *	@tty: tty to reinit
+ *
+ *	Switch the tty back to N_TTY line discipline and leave the
+ *	ldisc state closed
+ */
+
+static void tty_ldisc_reinit(struct tty_struct *tty)
+{
+	struct tty_ldisc *ld;
+
+	tty_ldisc_close(tty, tty->ldisc);
+	tty_ldisc_put(tty->ldisc);
+	tty->ldisc = NULL;
+	/*
+	 *	Switch the line discipline back
+	 */
+	ld = tty_ldisc_get(N_TTY);
+	BUG_ON(IS_ERR(ld));
+	tty_ldisc_assign(tty, ld);
+	tty_set_termios_ldisc(tty, N_TTY);
+}
+
+/**
+ *	tty_ldisc_hangup		-	hangup ldisc reset
+ *	@tty: tty being hung up
+ *
+ *	Some tty devices reset their termios when they receive a hangup
+ *	event. In that situation we must also switch back to N_TTY properly
+ *	before we reset the termios data.
+ *
+ *	Locking: We can take the ldisc mutex as the rest of the code is
+ *	careful to allow for this.
+ *
+ *	In the pty pair case this occurs in the close() path of the
+ *	tty itself so we must be careful about locking rules.
+ */
+
+void tty_ldisc_hangup(struct tty_struct *tty)
+{
+	struct tty_ldisc *ld;
+
+	/*
+	 * FIXME! What are the locking issues here? This may me overdoing
+	 * things... This question is especially important now that we've
+	 * removed the irqlock.
+	 */
+	ld = tty_ldisc_ref(tty);
+	if (ld != NULL) {
+		/* We may have no line discipline at this point */
+		if (ld->ops->flush_buffer)
+			ld->ops->flush_buffer(tty);
+		tty_driver_flush_buffer(tty);
+		if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
+		    ld->ops->write_wakeup)
+			ld->ops->write_wakeup(tty);
+		if (ld->ops->hangup)
+			ld->ops->hangup(tty);
+		tty_ldisc_deref(ld);
+	}
+	/*
+	 * FIXME: Once we trust the LDISC code better we can wait here for
+	 * ldisc completion and fix the driver call race
+	 */
+	wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
+	wake_up_interruptible_poll(&tty->read_wait, POLLIN);
+	/*
+	 * Shutdown the current line discipline, and reset it to
+	 * N_TTY.
+	 */
+	if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) {
+		/* Avoid racing set_ldisc */
+		mutex_lock(&tty->ldisc_mutex);
+		/* Switch back to N_TTY */
+		tty_ldisc_reinit(tty);
+		/* At this point we have a closed ldisc and we want to
+		   reopen it. We could defer this to the next open but
+		   it means auditing a lot of other paths so this is a FIXME */
+		WARN_ON(tty_ldisc_open(tty, tty->ldisc));
+		tty_ldisc_enable(tty);
+		mutex_unlock(&tty->ldisc_mutex);
+		tty_reset_termios(tty);
+	}
+}
 
 /**
  *	tty_ldisc_setup			-	open line discipline
@@ -654,24 +791,23 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
  *	@o_tty: pair tty for pty/tty pairs
  *
  *	Called during the initial open of a tty/pty pair in order to set up the
- *	line discplines and bind them to the tty.
+ *	line disciplines and bind them to the tty. This has no locking issues
+ *	as the device isn't yet active.
  */
 
 int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-	struct tty_ldisc *ld = &tty->ldisc;
+	struct tty_ldisc *ld = tty->ldisc;
 	int retval;
 
-	if (ld->ops->open) {
-		retval = (ld->ops->open)(tty);
-		if (retval)
-			return retval;
-	}
-	if (o_tty && o_tty->ldisc.ops->open) {
-		retval = (o_tty->ldisc.ops->open)(o_tty);
+	retval = tty_ldisc_open(tty, ld);
+	if (retval)
+		return retval;
+
+	if (o_tty) {
+		retval = tty_ldisc_open(o_tty, o_tty->ldisc);
 		if (retval) {
-			if (ld->ops->close)
-				(ld->ops->close)(tty);
+			tty_ldisc_close(tty, ld);
 			return retval;
 		}
 		tty_ldisc_enable(o_tty);
@@ -679,34 +815,18 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 	tty_ldisc_enable(tty);
 	return 0;
 }
-
-static void tty_ldisc_reinit(struct tty_struct *tty)
-{
-	struct tty_ldisc ld;
-
-	if (tty->ldisc.ops->close)
-		(tty->ldisc.ops->close)(tty);
-	tty_ldisc_put(tty->ldisc.ops);
-	/*
-	 *	Switch the line discipline back
-	 */
-	WARN_ON(tty_ldisc_get(N_TTY, &ld));
-	tty_ldisc_assign(tty, &ld);
-	tty_set_termios_ldisc(tty, N_TTY);
-}
-
 /**
  *	tty_ldisc_release		-	release line discipline
  *	@tty: tty being shut down
  *	@o_tty: pair tty for pty/tty pairs
  *
  *	Called during the final close of a tty/pty pair in order to shut down the
- *	line discpline layer.
+ *	line discpline layer. On exit the ldisc assigned is N_TTY and the
+ *	ldisc has not been opened.
  */
 
 void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-
 	/*
 	 * Prevent flush_to_ldisc() from rescheduling the work for later.  Then
 	 * kill any delayed work. As this is the final close it does not
@@ -714,6 +834,7 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 	 */
 
 	tty_ldisc_halt(tty);
+	flush_scheduled_work();
 
 	/*
 	 * Wait for any short term users (we know they are just driver
@@ -730,11 +851,9 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 	 */
 
 	tty_ldisc_reinit(tty);
-	if (o_tty) {
-		/* FIXME: could o_tty be in setldisc here ? */
-		clear_bit(TTY_LDISC, &o_tty->flags);
-		tty_ldisc_reinit(o_tty);
-	}
+	/* This will need doing differently if we need to lock */
+	if (o_tty)
+		tty_ldisc_release(o_tty, NULL);
 }
 
 /**
@@ -747,10 +866,10 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 
 void tty_ldisc_init(struct tty_struct *tty)
 {
-	struct tty_ldisc ld;
-	if (tty_ldisc_get(N_TTY, &ld) < 0)
+	struct tty_ldisc *ld = tty_ldisc_get(N_TTY);
+	if (IS_ERR(ld))
 		panic("n_tty: init_tty");
-	tty_ldisc_assign(tty, &ld);
+	tty_ldisc_assign(tty, ld);
 }
 
 void tty_ldisc_begin(void)
diff --git a/include/linux/tty.h b/include/linux/tty.h
index f9c13c83790c0..1488d8c81aac6 100644
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -226,8 +226,11 @@ struct tty_struct {
 	struct tty_driver *driver;
 	const struct tty_operations *ops;
 	int index;
-	/* The ldisc objects are protected by tty_ldisc_lock at the moment */
-	struct tty_ldisc ldisc;
+
+	/* Protects ldisc changes: Lock tty not pty */
+	struct mutex ldisc_mutex;
+	struct tty_ldisc *ldisc;
+
 	struct mutex termios_mutex;
 	spinlock_t ctrl_lock;
 	/* Termios values are protected by the termios mutex */
@@ -314,6 +317,7 @@ struct tty_struct {
 #define TTY_CLOSING 		7	/* ->close() in progress */
 #define TTY_LDISC 		9	/* Line discipline attached */
 #define TTY_LDISC_CHANGING 	10	/* Line discipline changing */
+#define TTY_LDISC_OPEN	 	11	/* Line discipline is open */
 #define TTY_HW_COOK_OUT 	14	/* Hardware can do output cooking */
 #define TTY_HW_COOK_IN 		15	/* Hardware can do input cooking */
 #define TTY_PTY_LOCK 		16	/* pty private */
@@ -406,6 +410,7 @@ extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b);
 extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
 extern void tty_ldisc_deref(struct tty_ldisc *);
 extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
+extern void tty_ldisc_hangup(struct tty_struct *tty);
 extern const struct file_operations tty_ldiscs_proc_fops;
 
 extern void tty_wakeup(struct tty_struct *tty);
-- 
GitLab


From f2c4c65c8350d885d74dcdea7061dcecc1223c36 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:50:58 +0100
Subject: [PATCH 1999/2198] tty: Move ldisc_flush

We have a tty_ldisc file now so put tty_ldisc_flush in the right place

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_io.c    | 21 ---------------------
 drivers/char/tty_ldisc.c | 21 +++++++++++++++++++++
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 2f44b0b241c3a..939e198d7670a 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -470,27 +470,6 @@ void tty_wakeup(struct tty_struct *tty)
 
 EXPORT_SYMBOL_GPL(tty_wakeup);
 
-/**
- *	tty_ldisc_flush	-	flush line discipline queue
- *	@tty: tty
- *
- *	Flush the line discipline queue (if any) for this tty. If there
- *	is no line discipline active this is a no-op.
- */
-
-void tty_ldisc_flush(struct tty_struct *tty)
-{
-	struct tty_ldisc *ld = tty_ldisc_ref(tty);
-	if (ld) {
-		if (ld->ops->flush_buffer)
-			ld->ops->flush_buffer(tty);
-		tty_ldisc_deref(ld);
-	}
-	tty_buffer_flush(tty);
-}
-
-EXPORT_SYMBOL_GPL(tty_ldisc_flush);
-
 /**
  *	do_tty_hangup		-	actual handler for hangup events
  *	@work: tty device
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index a58a19a6a5e10..01c70c01f64a6 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -395,6 +395,27 @@ void tty_ldisc_enable(struct tty_struct *tty)
 	wake_up(&tty_ldisc_wait);
 }
 
+/**
+ *	tty_ldisc_flush	-	flush line discipline queue
+ *	@tty: tty
+ *
+ *	Flush the line discipline queue (if any) for this tty. If there
+ *	is no line discipline active this is a no-op.
+ */
+
+void tty_ldisc_flush(struct tty_struct *tty)
+{
+	struct tty_ldisc *ld = tty_ldisc_ref(tty);
+	if (ld) {
+		if (ld->ops->flush_buffer)
+			ld->ops->flush_buffer(tty);
+		tty_ldisc_deref(ld);
+	}
+	tty_buffer_flush(tty);
+}
+
+EXPORT_SYMBOL_GPL(tty_ldisc_flush);
+
 /**
  *	tty_set_termios_ldisc		-	set ldisc field
  *	@tty: tty structure
-- 
GitLab


From 852e99d22f2231d232c45216b027565e3bae7add Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 12:51:41 +0100
Subject: [PATCH 2000/2198] tty: bring ldisc into CodingStyle

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_ldisc.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index 01c70c01f64a6..39c8f86dedd49 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -126,7 +126,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc)
 	struct tty_ldisc *ld;
 	struct tty_ldisc_ops *ldops;
 	int err = -EINVAL;
-	
+
 	ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL);
 	if (ld == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -208,12 +208,12 @@ static void tty_ldisc_put(struct tty_ldisc *ld)
 	kfree(ld);
 }
 
-static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
+static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
 {
 	return (*pos < NR_LDISCS) ? pos : NULL;
 }
 
-static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
+static void *tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
 	return (*pos < NR_LDISCS) ? pos : NULL;
@@ -227,7 +227,7 @@ static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
 {
 	int i = *(loff_t *)v;
 	struct tty_ldisc *ld;
-	
+
 	ld = tty_ldisc_try_get(i);
 	if (IS_ERR(ld))
 		return 0;
@@ -325,7 +325,6 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
 	WARN_ON(tty->ldisc->refcount == 0);
 	return tty->ldisc;
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
 
 /**
@@ -345,7 +344,6 @@ struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
 		return tty->ldisc;
 	return NULL;
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_ref);
 
 /**
@@ -373,7 +371,6 @@ void tty_ldisc_deref(struct tty_ldisc *ld)
 		wake_up(&tty_ldisc_wait);
 	spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_deref);
 
 /**
@@ -413,7 +410,6 @@ void tty_ldisc_flush(struct tty_struct *tty)
 	}
 	tty_buffer_flush(tty);
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_flush);
 
 /**
@@ -492,7 +488,7 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 	if (tty_ldisc_open(tty, old) < 0) {
 		tty_ldisc_put(old);
 		/* This driver is always present */
-		new_ldisc =tty_ldisc_get(N_TTY);
+		new_ldisc = tty_ldisc_get(N_TTY);
 		if (IS_ERR(new_ldisc))
 			panic("n_tty: get");
 		tty_ldisc_assign(tty, new_ldisc);
@@ -514,7 +510,7 @@ static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
  *	be obtained while the delayed work queue halt ensures that no more
  *	data is fed to the ldisc.
  *
- *	In order to wait for any existing references to complete see 
+ *	In order to wait for any existing references to complete see
  *	tty_ldisc_wait_idle.
  */
 
@@ -611,7 +607,7 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 		mutex_lock(&tty->ldisc_mutex);
 	}
 	set_bit(TTY_LDISC_CHANGING, &tty->flags);
-		
+
 	/*
 	 *	No more input please, we are switching. The new ldisc
 	 *	will update this value in the ldisc open function
@@ -841,8 +837,8 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
  *	@tty: tty being shut down
  *	@o_tty: pair tty for pty/tty pairs
  *
- *	Called during the final close of a tty/pty pair in order to shut down the
- *	line discpline layer. On exit the ldisc assigned is N_TTY and the
+ *	Called during the final close of a tty/pty pair in order to shut down
+ *	the line discpline layer. On exit the ldisc assigned is N_TTY and the
  *	ldisc has not been opened.
  */
 
-- 
GitLab


From 5fcf62b0f1f24ee25931636216f28bc87448a60f Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:52:26 +0100
Subject: [PATCH 2001/2198] tty: iuu_phoenix: fix locking.

Bring in the relevant bits of the 0.9 vendor driver.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/iuu_phoenix.c | 29 +++++++++++++----------------
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index bb572cee6ecdd..637d0ddbfb428 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -650,32 +650,33 @@ static int iuu_bulk_write(struct usb_serial_port *port)
 	unsigned long flags;
 	int result;
 	int i;
+	int buf_len;
 	char *buf_ptr = port->write_urb->transfer_buffer;
 	dbg("%s - enter", __func__);
 
+	spin_lock_irqsave(&priv->lock, flags);
 	*buf_ptr++ = IUU_UART_ESC;
 	*buf_ptr++ = IUU_UART_TX;
 	*buf_ptr++ = priv->writelen;
 
-	memcpy(buf_ptr, priv->writebuf,
-	       priv->writelen);
+	memcpy(buf_ptr, priv->writebuf, priv->writelen);
+	buf_len = priv->writelen;
+	priv->writelen = 0;
+	spin_unlock_irqrestore(&priv->lock, flags);
 	if (debug == 1) {
-		for (i = 0; i < priv->writelen; i++)
+		for (i = 0; i < buf_len; i++)
 			sprintf(priv->dbgbuf + i*2 ,
 				"%02X", priv->writebuf[i]);
-		priv->dbgbuf[priv->writelen+i*2] = 0;
+		priv->dbgbuf[buf_len+i*2] = 0;
 		dbg("%s - writing %i chars : %s", __func__,
-		    priv->writelen, priv->dbgbuf);
+		    buf_len, priv->dbgbuf);
 	}
 	usb_fill_bulk_urb(port->write_urb, port->serial->dev,
 			  usb_sndbulkpipe(port->serial->dev,
 					  port->bulk_out_endpointAddress),
-			  port->write_urb->transfer_buffer, priv->writelen + 3,
+			  port->write_urb->transfer_buffer, buf_len + 3,
 			  iuu_rxcmd, port);
 	result = usb_submit_urb(port->write_urb, GFP_ATOMIC);
-	spin_lock_irqsave(&priv->lock, flags);
-	priv->writelen = 0;
-	spin_unlock_irqrestore(&priv->lock, flags);
 	usb_serial_port_softint(port);
 	return result;
 }
@@ -769,14 +770,10 @@ static int iuu_uart_write(struct tty_struct *tty, struct usb_serial_port *port,
 		return -ENOMEM;
 
 	spin_lock_irqsave(&priv->lock, flags);
-	if (priv->writelen > 0) {
-		/* buffer already filled but not commited */
-		spin_unlock_irqrestore(&priv->lock, flags);
-		return 0;
-	}
+
 	/* fill the buffer */
-	memcpy(priv->writebuf, buf, count);
-	priv->writelen = count;
+	memcpy(priv->writebuf + priv->writelen, buf, count);
+	priv->writelen += count;
 	spin_unlock_irqrestore(&priv->lock, flags);
 
 	return count;
-- 
GitLab


From cc3447d179d8a5e16807e52b77d7f4c095ffedb7 Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:53:24 +0100
Subject: [PATCH 2002/2198] tty: iuu_phoenix: Fix stopbit when uart goes on.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/iuu_phoenix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index 637d0ddbfb428..ef6d12fd579c5 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -815,7 +815,7 @@ static int iuu_uart_on(struct usb_serial_port *port)
 	buf[0] = IUU_UART_ENABLE;
 	buf[1] = (u8) ((IUU_BAUD_9600 >> 8) & 0x00FF);
 	buf[2] = (u8) (0x00FF & IUU_BAUD_9600);
-	buf[3] = (u8) (0x0F0 & IUU_TWO_STOP_BITS) | (0x07 & IUU_PARITY_EVEN);
+	buf[3] = (u8) (0x0F0 & IUU_ONE_STOP_BIT) | (0x07 & IUU_PARITY_EVEN);
 
 	status = bulk_immediate(port, buf, 4);
 	if (status != IUU_OPERATION_OK) {
-- 
GitLab


From 96dab77ebf3868cc8723ac95e048e8a9c1dccf22 Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:54:20 +0100
Subject: [PATCH 2003/2198] tty: iuu_phoenix: set termios.

set_termios can now be used for setting the parity and the stopbits. This is
needed to use with cards which use a different parity then the parity used at
start (even).

If the iuu_uart_baud function return an error, we will return the old_termios
instead of the new one.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>

This was then revamped to use the various helpers, not copy non-hardware
bits any to add mark/space parity and csize reporting

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/iuu_phoenix.c | 50 ++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index ef6d12fd579c5..fa86f6e7415f3 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -942,6 +942,55 @@ static int iuu_uart_baud(struct usb_serial_port *port, u32 baud,
 	return status;
 }
 
+static void iuu_set_termios(struct tty_struct *tty,
+		struct usb_serial_port *port, struct ktermios *old_termios)
+{
+	const u32 supported_mask = CMSPAR|PARENB|PARODD;
+
+	unsigned int cflag = tty->termios->c_cflag;
+	int status;
+	u32 actual;
+	u32 parity;
+	int csize = CS7;
+	int baud = 9600;	/* Fixed for the moment */
+	u32 newval = cflag & supported_mask;
+
+	/* compute the parity parameter */
+	parity = 0;
+	if (cflag & CMSPAR) {	/* Using mark space */
+		if (cflag & PARODD)
+			parity |= IUU_PARITY_SPACE;
+		else
+			parity |= IUU_PARITY_MARK;
+	} else if (!(cflag & PARENB)) {
+		parity |= IUU_PARITY_NONE;
+		csize = CS8;
+	} else if (cflag & PARODD)
+		parity |= IUU_PARITY_ODD;
+	else
+		parity |= IUU_PARITY_EVEN;
+
+	parity |= (cflag & CSTOPB ? IUU_TWO_STOP_BITS : IUU_ONE_STOP_BIT);
+
+	/* set it */
+	status = iuu_uart_baud(port,
+			(clockmode == 2) ? 16457 : 9600 * boost / 100,
+			&actual, parity);
+
+	/* set the termios value to the real one, so the user now what has
+	 * changed. We support few fields so its easies to copy the old hw
+	 * settings back over and then adjust them
+	 */
+ 	if (old_termios)
+ 		tty_termios_copy_hw(tty->termios, old_termios);
+	if (status != 0)	/* Set failed - return old bits */
+		return;
+	/* Re-encode speed, parity and csize */
+	tty_encode_baud_rate(tty, baud, baud);
+	tty->termios->c_cflag &= ~(supported_mask|CSIZE);
+	tty->termios->c_cflag |= newval | csize;
+}
+
 static void iuu_close(struct usb_serial_port *port)
 {
 	/* iuu_led (port,255,0,0,0); */
@@ -1151,6 +1200,7 @@ static struct usb_serial_driver iuu_device = {
 	.read_bulk_callback = iuu_uart_read_callback,
 	.tiocmget = iuu_tiocmget,
 	.tiocmset = iuu_tiocmset,
+	.set_termios = iuu_set_termios,
 	.attach = iuu_startup,
 	.shutdown = iuu_shutdown,
 };
-- 
GitLab


From 9bb41699ad5c74519dc054bfe469a8074799c863 Mon Sep 17 00:00:00 2001
From: Olivier Bornet <Olivier.Bornet@puck.ch>
Date: Thu, 11 Jun 2009 12:55:01 +0100
Subject: [PATCH 2004/2198] tty: iuu_phoenix: update version number.

Signed-off-by: Olivier Bornet <Olivier.Bornet@puck.ch>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/iuu_phoenix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c
index fa86f6e7415f3..76a3cc327bb9c 100644
--- a/drivers/usb/serial/iuu_phoenix.c
+++ b/drivers/usb/serial/iuu_phoenix.c
@@ -40,7 +40,7 @@ static int debug;
 /*
  * Version Information
  */
-#define DRIVER_VERSION "v0.5"
+#define DRIVER_VERSION "v0.10"
 #define DRIVER_DESC "Infinity USB Unlimited Phoenix driver"
 
 static struct usb_device_id id_table[] = {
-- 
GitLab


From 13858d36903990441a89eb342247b71436808eeb Mon Sep 17 00:00:00 2001
From: "Alexander Y. Fomichev" <git.user@gmail.com>
Date: Thu, 11 Jun 2009 12:56:00 +0100
Subject: [PATCH 2005/2198] jsm: correctly support multiple 4/8-port boards

If there are more then one 4/8-port board jsm_uart_port_init
allocate a line numbers of the second and further boards
from range of previous one.

This patch fixed the problem.

Signed-off-by: Alexander Y. Fomichev <git.user@gmail.com>

[printks fixed to add jsm: ]

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/jsm/jsm.h     |  1 +
 drivers/serial/jsm/jsm_tty.c | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/jsm/jsm.h b/drivers/serial/jsm/jsm.h
index c0a3e2734e240..4e5f3bde0461c 100644
--- a/drivers/serial/jsm/jsm.h
+++ b/drivers/serial/jsm/jsm.h
@@ -61,6 +61,7 @@ enum {
 	if ((DBG_##nlevel & jsm_debug))			\
 	dev_printk(KERN_##klevel, pdev->dev, fmt, ## args)
 
+#define	MAXLINES	256
 #define MAXPORTS	8
 #define MAX_STOPS_SENT	5
 
diff --git a/drivers/serial/jsm/jsm_tty.c b/drivers/serial/jsm/jsm_tty.c
index 31496dc0a0d17..107ce2e187b8f 100644
--- a/drivers/serial/jsm/jsm_tty.c
+++ b/drivers/serial/jsm/jsm_tty.c
@@ -33,6 +33,8 @@
 
 #include "jsm.h"
 
+static DECLARE_BITMAP(linemap, MAXLINES);
+
 static void jsm_carrier(struct jsm_channel *ch);
 
 static inline int jsm_get_mstat(struct jsm_channel *ch)
@@ -433,6 +435,7 @@ int __devinit jsm_tty_init(struct jsm_board *brd)
 int __devinit jsm_uart_port_init(struct jsm_board *brd)
 {
 	int i;
+	unsigned int line;
 	struct jsm_channel *ch;
 
 	if (!brd)
@@ -459,9 +462,15 @@ int __devinit jsm_uart_port_init(struct jsm_board *brd)
 		brd->channels[i]->uart_port.membase = brd->re_map_membase;
 		brd->channels[i]->uart_port.fifosize = 16;
 		brd->channels[i]->uart_port.ops = &jsm_ops;
-		brd->channels[i]->uart_port.line = brd->channels[i]->ch_portnum + brd->boardnum * 2;
+		line = find_first_zero_bit(linemap, MAXLINES);
+		if (line >= MAXLINES) {
+			printk(KERN_INFO "jsm: linemap is full, added device failed\n");
+			continue;
+		} else
+			set_bit((int)line, linemap);
+		brd->channels[i]->uart_port.line = line;
 		if (uart_add_one_port (&jsm_uart_driver, &brd->channels[i]->uart_port))
-			printk(KERN_INFO "Added device failed\n");
+			printk(KERN_INFO "jsm: add device failed\n");
 		else
 			printk(KERN_INFO "Added device \n");
 	}
@@ -494,6 +503,7 @@ int jsm_remove_uart_port(struct jsm_board *brd)
 
 		ch = brd->channels[i];
 
+		clear_bit((int)(ch->uart_port.line), linemap);
 		uart_remove_one_port(&jsm_uart_driver, &brd->channels[i]->uart_port);
 	}
 
-- 
GitLab


From aba6593bf77371e71331ba76dacc98b47760cba3 Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@linux.vnet.ibm.com>
Date: Thu, 11 Jun 2009 13:02:59 +0100
Subject: [PATCH 2006/2198] icom: fixing a if clause spaghetti

adapter->version can only be ADAPTER_V2 or ADAPTER_V1. So,
that OR operand in the "if" clause is non-sense and can be removed.

Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/icom.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index a461b3b2c72dc..fd5fd23804bc6 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -408,7 +408,7 @@ static void load_code(struct icom_port *icom_port)
 	release_firmware(fw);
 
 	/* Set Hardware level */
-	if ((icom_port->adapter->version | ADAPTER_V2) == ADAPTER_V2)
+	if (icom_port->adapter->version == ADAPTER_V2)
 		writeb(V2_HARDWARE, &(icom_port->dram->misc_flags));
 
 	/* Start the processor in Adapter */
@@ -861,7 +861,7 @@ static irqreturn_t icom_interrupt(int irq, void *dev_id)
 	/* find icom_port for this interrupt */
 	icom_adapter = (struct icom_adapter *) dev_id;
 
-	if ((icom_adapter->version | ADAPTER_V2) == ADAPTER_V2) {
+	if (icom_adapter->version == ADAPTER_V2) {
 		int_reg = icom_adapter->base_addr + 0x8024;
 
 		adapter_interrupts = readl(int_reg);
-- 
GitLab


From c481c707fe4b07783d9a2499a9bbbb94497e9b18 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:04:27 +0100
Subject: [PATCH 2007/2198] tty: remove buffer special casing

Long long ago a 4K kmalloc allocated two pages so the tty layer used the
page allocator, except on some machines where the page size was huge. This was
removed from the core tty layer with the tty buffer re-implementation but not
from tty_audit or the n_tty ldisc.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/n_tty.c     | 11 ++---------
 drivers/char/tty_audit.c | 10 ++--------
 2 files changed, 4 insertions(+), 17 deletions(-)

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index f6f0e4ec2b510..b4b12b4f0ac4a 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -76,19 +76,12 @@
 static inline unsigned char *alloc_buf(void)
 {
 	gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
-
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		return kmalloc(N_TTY_BUF_SIZE, prio);
-	else
-		return (unsigned char *)__get_free_page(prio);
+	return kmalloc(N_TTY_BUF_SIZE, prio);
 }
 
 static inline void free_buf(unsigned char *buf)
 {
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		kfree(buf);
-	else
-		free_page((unsigned long) buf);
+	kfree(buf);
 }
 
 static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
diff --git a/drivers/char/tty_audit.c b/drivers/char/tty_audit.c
index 55ba6f142883f..ac16fbec72d03 100644
--- a/drivers/char/tty_audit.c
+++ b/drivers/char/tty_audit.c
@@ -29,10 +29,7 @@ static struct tty_audit_buf *tty_audit_buf_alloc(int major, int minor,
 	buf = kmalloc(sizeof(*buf), GFP_KERNEL);
 	if (!buf)
 		goto err;
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
-	else
-		buf->data = (unsigned char *)__get_free_page(GFP_KERNEL);
+	buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
 	if (!buf->data)
 		goto err_buf;
 	atomic_set(&buf->count, 1);
@@ -52,10 +49,7 @@ static struct tty_audit_buf *tty_audit_buf_alloc(int major, int minor,
 static void tty_audit_buf_free(struct tty_audit_buf *buf)
 {
 	WARN_ON(buf->valid != 0);
-	if (PAGE_SIZE != N_TTY_BUF_SIZE)
-		kfree(buf->data);
-	else
-		free_page((unsigned long)buf->data);
+	kfree(buf->data);
 	kfree(buf);
 }
 
-- 
GitLab


From 0b4068a1287b02018d1b3159e7be6f27f3e3e68c Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:05:49 +0100
Subject: [PATCH 2008/2198] tty: simplify buffer allocator cleanups

Having cleaned up the allocators we might as well remove the inline helpers
for some of it

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/n_tty.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c
index b4b12b4f0ac4a..94a5d5020abce 100644
--- a/drivers/char/n_tty.c
+++ b/drivers/char/n_tty.c
@@ -73,17 +73,6 @@
 #define ECHO_OP_SET_CANON_COL 0x81
 #define ECHO_OP_ERASE_TAB 0x82
 
-static inline unsigned char *alloc_buf(void)
-{
-	gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
-	return kmalloc(N_TTY_BUF_SIZE, prio);
-}
-
-static inline void free_buf(unsigned char *buf)
-{
-	kfree(buf);
-}
-
 static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
 			       unsigned char __user *ptr)
 {
@@ -1551,11 +1540,11 @@ static void n_tty_close(struct tty_struct *tty)
 {
 	n_tty_flush_buffer(tty);
 	if (tty->read_buf) {
-		free_buf(tty->read_buf);
+		kfree(tty->read_buf);
 		tty->read_buf = NULL;
 	}
 	if (tty->echo_buf) {
-		free_buf(tty->echo_buf);
+		kfree(tty->echo_buf);
 		tty->echo_buf = NULL;
 	}
 }
@@ -1577,17 +1566,16 @@ static int n_tty_open(struct tty_struct *tty)
 
 	/* These are ugly. Currently a malloc failure here can panic */
 	if (!tty->read_buf) {
-		tty->read_buf = alloc_buf();
+		tty->read_buf = kzalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
 		if (!tty->read_buf)
 			return -ENOMEM;
 	}
 	if (!tty->echo_buf) {
-		tty->echo_buf = alloc_buf();
+		tty->echo_buf = kzalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
+
 		if (!tty->echo_buf)
 			return -ENOMEM;
 	}
-	memset(tty->read_buf, 0, N_TTY_BUF_SIZE);
-	memset(tty->echo_buf, 0, N_TTY_BUF_SIZE);
 	reset_buffer_flags(tty);
 	tty->column = 0;
 	n_tty_set_termios(tty, NULL);
-- 
GitLab


From 73e0d48b8c28fb39a0bb6713c875e9919a9af546 Mon Sep 17 00:00:00 2001
From: Michael Buesch <mb@bu3sch.de>
Date: Thu, 11 Jun 2009 13:06:31 +0100
Subject: [PATCH 2009/2198] parport_pc: Fix subscription bugs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This patch fixes array subscription bugs in the parport_pc driver.

drivers/parport/parport_pc.c: In function ‘parport_irq_probe’:
drivers/parport/parport_pc.c:1589: warning: array subscript is above array bounds
drivers/parport/parport_pc.c: In function ‘parport_pc_probe_port’:
drivers/parport/parport_pc.c:1579: warning: array subscript is above array bounds

The patch also fixes a few other array bugs, which the compiler was
unable to find. Coding style violations are also fixed.

Signed-off-by: Michael Buesch <mb@bu3sch.de>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/parport_pc.c | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 4e63cc9e27782..24984c4a1ba4f 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1246,17 +1246,17 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 		       (cr1 & 0x08 ) ? "Standard mode only (SPP)" : modes[cr4 & 0x03], 
 		       (cr4 & 0x40) ? "1.7" : "1.9");
 	}
-		
+
 	/* Heuristics !  BIOS setup for this mainboard device limits
 	   the choices to standard settings, i.e. io-address and IRQ
 	   are related, however DMA can be 1 or 3, assume DMA_A=DMA1,
 	   DMA_C=DMA3 (this is true e.g. for TYAN 1564D Tomcat IV) */
-	if(cr23*4 >=0x100) { /* if active */
-		while((superios[i].io!= 0) && (i<NR_SUPERIOS))
+	if (cr23 * 4 >= 0x100) { /* if active */
+		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
 			i++;
-		if(i==NR_SUPERIOS)
+		if (i == NR_SUPERIOS) {
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		else {
+		} else {
 			int d;
 			switch (cr23*4) {
 				case 0x3bc:
@@ -1332,12 +1332,12 @@ static void __devinit show_parconfig_winbond(int io, int key)
 		printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n", modes[crf0 & 0x07]);
 	}
 
-	if(cr30 & 0x01) { /* the settings can be interrogated later ... */
-		while((superios[i].io!= 0) && (i<NR_SUPERIOS))
+	if (cr30 & 0x01) { /* the settings can be interrogated later ... */
+		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
 			i++;
-		if(i==NR_SUPERIOS) 
+		if (i == NR_SUPERIOS) {
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		else {
+		} else {
 			superios[i].io = (cr60<<8)|cr61;
 			superios[i].irq = cr70&0x0f;
 			superios[i].dma = (((cr74 & 0x07) > 3) ?
@@ -1575,24 +1575,26 @@ static void __devinit detect_and_report_it87(void)
 
 static int get_superio_dma (struct parport *p)
 {
-	int i=0;
-	while( (superios[i].io != p->base) && (i<NR_SUPERIOS))
+	int i = 0;
+
+	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
 		i++;
-	if (i!=NR_SUPERIOS)
+	if (i != NR_SUPERIOS)
 		return superios[i].dma;
 	return PARPORT_DMA_NONE;
 }
 
 static int get_superio_irq (struct parport *p)
 {
-	int i=0;
-        while( (superios[i].io != p->base) && (i<NR_SUPERIOS))
+	int i = 0;
+
+        while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
                 i++;
-        if (i!=NR_SUPERIOS)
+        if (i != NR_SUPERIOS)
                 return superios[i].irq;
         return PARPORT_IRQ_NONE;
 }
-	
+
 
 /* --- Mode detection ------------------------------------- */
 
-- 
GitLab


From 3aeda9bc95d064308a70664e6f4a158c96cc1918 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:07:29 +0100
Subject: [PATCH 2010/2198] parport_pc: Coding style

Michael's patch fixed some of the coding style so the style is now
inconsistent. Sort the rest out

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/parport_pc.c | 1746 ++++++++++++++++++----------------
 1 file changed, 916 insertions(+), 830 deletions(-)

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index 24984c4a1ba4f..edf83e945853b 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -1,5 +1,5 @@
 /* Low-level parallel-port routines for 8255-based PC-style hardware.
- * 
+ *
  * Authors: Phil Blundell <philb@gnu.org>
  *          Tim Waugh <tim@cyberelk.demon.co.uk>
  *	    Jose Renau <renau@acm.org>
@@ -11,7 +11,7 @@
  * Cleaned up include files - Russell King <linux@arm.uk.linux.org>
  * DMA support - Bert De Jonghe <bert@sophis.be>
  * Many ECP bugs fixed.  Fred Barnes & Jamie Lokier, 1999
- * More PCI support now conditional on CONFIG_PCI, 03/2001, Paul G. 
+ * More PCI support now conditional on CONFIG_PCI, 03/2001, Paul G.
  * Various hacks, Fred Barnes, 04/2001
  * Updated probing logic - Adam Belay <ambx1@neo.rr.com>
  */
@@ -56,10 +56,10 @@
 #include <linux/pnp.h>
 #include <linux/platform_device.h>
 #include <linux/sysctl.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
 
-#include <asm/io.h>
 #include <asm/dma.h>
-#include <asm/uaccess.h>
 
 #include <linux/parport.h>
 #include <linux/parport_pc.h>
@@ -82,7 +82,7 @@
 #define ECR_TST 06
 #define ECR_CNF 07
 #define ECR_MODE_MASK 0xe0
-#define ECR_WRITE(p,v) frob_econtrol((p),0xff,(v))
+#define ECR_WRITE(p, v) frob_econtrol((p), 0xff, (v))
 
 #undef DEBUG
 
@@ -109,27 +109,27 @@ static int pci_registered_parport;
 static int pnp_registered_parport;
 
 /* frob_control, but for ECR */
-static void frob_econtrol (struct parport *pb, unsigned char m,
+static void frob_econtrol(struct parport *pb, unsigned char m,
 			   unsigned char v)
 {
 	unsigned char ectr = 0;
 
 	if (m != 0xff)
-		ectr = inb (ECONTROL (pb));
+		ectr = inb(ECONTROL(pb));
 
-	DPRINTK (KERN_DEBUG "frob_econtrol(%02x,%02x): %02x -> %02x\n",
+	DPRINTK(KERN_DEBUG "frob_econtrol(%02x,%02x): %02x -> %02x\n",
 		m, v, ectr, (ectr & ~m) ^ v);
 
-	outb ((ectr & ~m) ^ v, ECONTROL (pb));
+	outb((ectr & ~m) ^ v, ECONTROL(pb));
 }
 
-static __inline__ void frob_set_mode (struct parport *p, int mode)
+static inline void frob_set_mode(struct parport *p, int mode)
 {
-	frob_econtrol (p, ECR_MODE_MASK, mode << 5);
+	frob_econtrol(p, ECR_MODE_MASK, mode << 5);
 }
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-/* Safely change the mode bits in the ECR 
+/* Safely change the mode bits in the ECR
    Returns:
 	    0    : Success
 	   -EBUSY: Could not drain FIFO in some finite amount of time,
@@ -141,17 +141,18 @@ static int change_mode(struct parport *p, int m)
 	unsigned char oecr;
 	int mode;
 
-	DPRINTK(KERN_INFO "parport change_mode ECP-ISA to mode 0x%02x\n",m);
+	DPRINTK(KERN_INFO "parport change_mode ECP-ISA to mode 0x%02x\n", m);
 
 	if (!priv->ecr) {
-		printk (KERN_DEBUG "change_mode: but there's no ECR!\n");
+		printk(KERN_DEBUG "change_mode: but there's no ECR!\n");
 		return 0;
 	}
 
 	/* Bits <7:5> contain the mode. */
-	oecr = inb (ECONTROL (p));
+	oecr = inb(ECONTROL(p));
 	mode = (oecr >> 5) & 0x7;
-	if (mode == m) return 0;
+	if (mode == m)
+		return 0;
 
 	if (mode >= 2 && !(priv->ctr & 0x20)) {
 		/* This mode resets the FIFO, so we may
@@ -163,19 +164,21 @@ static int change_mode(struct parport *p, int m)
 		case ECR_ECP: /* ECP Parallel Port mode */
 			/* Busy wait for 200us */
 			for (counter = 0; counter < 40; counter++) {
-				if (inb (ECONTROL (p)) & 0x01)
+				if (inb(ECONTROL(p)) & 0x01)
 					break;
-				if (signal_pending (current)) break;
-				udelay (5);
+				if (signal_pending(current))
+					break;
+				udelay(5);
 			}
 
 			/* Poll slowly. */
-			while (!(inb (ECONTROL (p)) & 0x01)) {
-				if (time_after_eq (jiffies, expire))
+			while (!(inb(ECONTROL(p)) & 0x01)) {
+				if (time_after_eq(jiffies, expire))
 					/* The FIFO is stuck. */
 					return -EBUSY;
-				schedule_timeout_interruptible(msecs_to_jiffies(10));
-				if (signal_pending (current))
+				schedule_timeout_interruptible(
+							msecs_to_jiffies(10));
+				if (signal_pending(current))
 					break;
 			}
 		}
@@ -185,20 +188,20 @@ static int change_mode(struct parport *p, int m)
 		/* We have to go through mode 001 */
 		oecr &= ~(7 << 5);
 		oecr |= ECR_PS2 << 5;
-		ECR_WRITE (p, oecr);
+		ECR_WRITE(p, oecr);
 	}
 
 	/* Set the mode. */
 	oecr &= ~(7 << 5);
 	oecr |= m << 5;
-	ECR_WRITE (p, oecr);
+	ECR_WRITE(p, oecr);
 	return 0;
 }
 
 #ifdef CONFIG_PARPORT_1284
 /* Find FIFO lossage; FIFO is reset */
 #if 0
-static int get_fifo_residue (struct parport *p)
+static int get_fifo_residue(struct parport *p)
 {
 	int residue;
 	int cnfga;
@@ -206,26 +209,26 @@ static int get_fifo_residue (struct parport *p)
 
 	/* Adjust for the contents of the FIFO. */
 	for (residue = priv->fifo_depth; ; residue--) {
-		if (inb (ECONTROL (p)) & 0x2)
+		if (inb(ECONTROL(p)) & 0x2)
 				/* Full up. */
 			break;
 
-		outb (0, FIFO (p));
+		outb(0, FIFO(p));
 	}
 
-	printk (KERN_DEBUG "%s: %d PWords were left in FIFO\n", p->name,
+	printk(KERN_DEBUG "%s: %d PWords were left in FIFO\n", p->name,
 		residue);
 
 	/* Reset the FIFO. */
-	frob_set_mode (p, ECR_PS2);
+	frob_set_mode(p, ECR_PS2);
 
 	/* Now change to config mode and clean up. FIXME */
-	frob_set_mode (p, ECR_CNF);
-	cnfga = inb (CONFIGA (p));
-	printk (KERN_DEBUG "%s: cnfgA contains 0x%02x\n", p->name, cnfga);
+	frob_set_mode(p, ECR_CNF);
+	cnfga = inb(CONFIGA(p));
+	printk(KERN_DEBUG "%s: cnfgA contains 0x%02x\n", p->name, cnfga);
 
 	if (!(cnfga & (1<<2))) {
-		printk (KERN_DEBUG "%s: Accounting for extra byte\n", p->name);
+		printk(KERN_DEBUG "%s: Accounting for extra byte\n", p->name);
 		residue++;
 	}
 
@@ -233,9 +236,11 @@ static int get_fifo_residue (struct parport *p)
 	 * PWord != 1 byte. */
 
 	/* Back to PS2 mode. */
-	frob_set_mode (p, ECR_PS2);
+	frob_set_mode(p, ECR_PS2);
 
-	DPRINTK (KERN_DEBUG "*** get_fifo_residue: done residue collecting (ecr = 0x%2.2x)\n", inb (ECONTROL (p)));
+	DPRINTK(KERN_DEBUG
+	     "*** get_fifo_residue: done residue collecting (ecr = 0x%2.2x)\n",
+							inb(ECONTROL(p)));
 	return residue;
 }
 #endif  /*  0 */
@@ -257,8 +262,8 @@ static int clear_epp_timeout(struct parport *pb)
 	/* To clear timeout some chips require double read */
 	parport_pc_read_status(pb);
 	r = parport_pc_read_status(pb);
-	outb (r | 0x01, STATUS (pb)); /* Some reset by writing 1 */
-	outb (r & 0xfe, STATUS (pb)); /* Others by writing 0 */
+	outb(r | 0x01, STATUS(pb)); /* Some reset by writing 1 */
+	outb(r & 0xfe, STATUS(pb)); /* Others by writing 0 */
 	r = parport_pc_read_status(pb);
 
 	return !(r & 0x01);
@@ -272,7 +277,8 @@ static int clear_epp_timeout(struct parport *pb)
  * of these are in parport_pc.h.
  */
 
-static void parport_pc_init_state(struct pardevice *dev, struct parport_state *s)
+static void parport_pc_init_state(struct pardevice *dev,
+						struct parport_state *s)
 {
 	s->u.pc.ctr = 0xc;
 	if (dev->irq_func &&
@@ -289,22 +295,23 @@ static void parport_pc_save_state(struct parport *p, struct parport_state *s)
 	const struct parport_pc_private *priv = p->physport->private_data;
 	s->u.pc.ctr = priv->ctr;
 	if (priv->ecr)
-		s->u.pc.ecr = inb (ECONTROL (p));
+		s->u.pc.ecr = inb(ECONTROL(p));
 }
 
-static void parport_pc_restore_state(struct parport *p, struct parport_state *s)
+static void parport_pc_restore_state(struct parport *p,
+						struct parport_state *s)
 {
 	struct parport_pc_private *priv = p->physport->private_data;
 	register unsigned char c = s->u.pc.ctr & priv->ctr_writable;
-	outb (c, CONTROL (p));
+	outb(c, CONTROL(p));
 	priv->ctr = c;
 	if (priv->ecr)
-		ECR_WRITE (p, s->u.pc.ecr);
+		ECR_WRITE(p, s->u.pc.ecr);
 }
 
 #ifdef CONFIG_PARPORT_1284
-static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
-					size_t length, int flags)
+static size_t parport_pc_epp_read_data(struct parport *port, void *buf,
+				       size_t length, int flags)
 {
 	size_t got = 0;
 
@@ -316,54 +323,52 @@ static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
 		 *  nFault is 0 if there is at least 1 byte in the Warp's FIFO
 		 *  pError is 1 if there are 16 bytes in the Warp's FIFO
 		 */
-		status = inb (STATUS (port));
+		status = inb(STATUS(port));
 
-		while (!(status & 0x08) && (got < length)) {
-			if ((left >= 16) && (status & 0x20) && !(status & 0x08)) {
+		while (!(status & 0x08) && got < length) {
+			if (left >= 16 && (status & 0x20) && !(status & 0x08)) {
 				/* can grab 16 bytes from warp fifo */
-				if (!((long)buf & 0x03)) {
-					insl (EPPDATA (port), buf, 4);
-				} else {
-					insb (EPPDATA (port), buf, 16);
-				}
+				if (!((long)buf & 0x03))
+					insl(EPPDATA(port), buf, 4);
+				else
+					insb(EPPDATA(port), buf, 16);
 				buf += 16;
 				got += 16;
 				left -= 16;
 			} else {
 				/* grab single byte from the warp fifo */
-				*((char *)buf) = inb (EPPDATA (port));
+				*((char *)buf) = inb(EPPDATA(port));
 				buf++;
 				got++;
 				left--;
 			}
-			status = inb (STATUS (port));
+			status = inb(STATUS(port));
 			if (status & 0x01) {
 				/* EPP timeout should never occur... */
-				printk (KERN_DEBUG "%s: EPP timeout occurred while talking to "
-					"w91284pic (should not have done)\n", port->name);
-				clear_epp_timeout (port);
+				printk(KERN_DEBUG
+"%s: EPP timeout occurred while talking to w91284pic (should not have done)\n", port->name);
+				clear_epp_timeout(port);
 			}
 		}
 		return got;
 	}
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		if (!(((long)buf | length) & 0x03)) {
-			insl (EPPDATA (port), buf, (length >> 2));
-		} else {
-			insb (EPPDATA (port), buf, length);
-		}
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (!(((long)buf | length) & 0x03))
+			insl(EPPDATA(port), buf, (length >> 2));
+		else
+			insb(EPPDATA(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; got < length; got++) {
-		*((char*)buf) = inb (EPPDATA(port));
+		*((char *)buf) = inb(EPPDATA(port));
 		buf++;
-		if (inb (STATUS (port)) & 0x01) {
+		if (inb(STATUS(port)) & 0x01) {
 			/* EPP timeout */
-			clear_epp_timeout (port);
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -371,28 +376,27 @@ static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
 	return got;
 }
 
-static size_t parport_pc_epp_write_data (struct parport *port, const void *buf,
-					 size_t length, int flags)
+static size_t parport_pc_epp_write_data(struct parport *port, const void *buf,
+					size_t length, int flags)
 {
 	size_t written = 0;
 
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		if (!(((long)buf | length) & 0x03)) {
-			outsl (EPPDATA (port), buf, (length >> 2));
-		} else {
-			outsb (EPPDATA (port), buf, length);
-		}
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (!(((long)buf | length) & 0x03))
+			outsl(EPPDATA(port), buf, (length >> 2));
+		else
+			outsb(EPPDATA(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; written < length; written++) {
-		outb (*((char*)buf), EPPDATA(port));
+		outb(*((char *)buf), EPPDATA(port));
 		buf++;
-		if (inb (STATUS(port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -400,24 +404,24 @@ static size_t parport_pc_epp_write_data (struct parport *port, const void *buf,
 	return written;
 }
 
-static size_t parport_pc_epp_read_addr (struct parport *port, void *buf,
+static size_t parport_pc_epp_read_addr(struct parport *port, void *buf,
 					size_t length, int flags)
 {
 	size_t got = 0;
 
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		insb (EPPADDR (port), buf, length);
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		insb(EPPADDR(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; got < length; got++) {
-		*((char*)buf) = inb (EPPADDR (port));
+		*((char *)buf) = inb(EPPADDR(port));
 		buf++;
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -425,25 +429,25 @@ static size_t parport_pc_epp_read_addr (struct parport *port, void *buf,
 	return got;
 }
 
-static size_t parport_pc_epp_write_addr (struct parport *port,
+static size_t parport_pc_epp_write_addr(struct parport *port,
 					 const void *buf, size_t length,
 					 int flags)
 {
 	size_t written = 0;
 
 	if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-		outsb (EPPADDR (port), buf, length);
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		outsb(EPPADDR(port), buf, length);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			return -EIO;
 		}
 		return length;
 	}
 	for (; written < length; written++) {
-		outb (*((char*)buf), EPPADDR (port));
+		outb(*((char *)buf), EPPADDR(port));
 		buf++;
-		if (inb (STATUS (port)) & 0x01) {
-			clear_epp_timeout (port);
+		if (inb(STATUS(port)) & 0x01) {
+			clear_epp_timeout(port);
 			break;
 		}
 	}
@@ -451,74 +455,74 @@ static size_t parport_pc_epp_write_addr (struct parport *port,
 	return written;
 }
 
-static size_t parport_pc_ecpepp_read_data (struct parport *port, void *buf,
-					   size_t length, int flags)
+static size_t parport_pc_ecpepp_read_data(struct parport *port, void *buf,
+					  size_t length, int flags)
 {
 	size_t got;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_data_reverse (port);
-	parport_pc_write_control (port, 0x4);
-	got = parport_pc_epp_read_data (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_data_reverse(port);
+	parport_pc_write_control(port, 0x4);
+	got = parport_pc_epp_read_data(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return got;
 }
 
-static size_t parport_pc_ecpepp_write_data (struct parport *port,
-					    const void *buf, size_t length,
-					    int flags)
+static size_t parport_pc_ecpepp_write_data(struct parport *port,
+					   const void *buf, size_t length,
+					   int flags)
 {
 	size_t written;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_write_control (port, 0x4);
-	parport_pc_data_forward (port);
-	written = parport_pc_epp_write_data (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_write_control(port, 0x4);
+	parport_pc_data_forward(port);
+	written = parport_pc_epp_write_data(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return written;
 }
 
-static size_t parport_pc_ecpepp_read_addr (struct parport *port, void *buf,
-					   size_t length, int flags)
+static size_t parport_pc_ecpepp_read_addr(struct parport *port, void *buf,
+					  size_t length, int flags)
 {
 	size_t got;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_data_reverse (port);
-	parport_pc_write_control (port, 0x4);
-	got = parport_pc_epp_read_addr (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_data_reverse(port);
+	parport_pc_write_control(port, 0x4);
+	got = parport_pc_epp_read_addr(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return got;
 }
 
-static size_t parport_pc_ecpepp_write_addr (struct parport *port,
+static size_t parport_pc_ecpepp_write_addr(struct parport *port,
 					    const void *buf, size_t length,
 					    int flags)
 {
 	size_t written;
 
-	frob_set_mode (port, ECR_EPP);
-	parport_pc_write_control (port, 0x4);
-	parport_pc_data_forward (port);
-	written = parport_pc_epp_write_addr (port, buf, length, flags);
-	frob_set_mode (port, ECR_PS2);
+	frob_set_mode(port, ECR_EPP);
+	parport_pc_write_control(port, 0x4);
+	parport_pc_data_forward(port);
+	written = parport_pc_epp_write_addr(port, buf, length, flags);
+	frob_set_mode(port, ECR_PS2);
 
 	return written;
 }
 #endif /* IEEE 1284 support */
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-static size_t parport_pc_fifo_write_block_pio (struct parport *port,
+static size_t parport_pc_fifo_write_block_pio(struct parport *port,
 					       const void *buf, size_t length)
 {
 	int ret = 0;
 	const unsigned char *bufp = buf;
 	size_t left = length;
 	unsigned long expire = jiffies + port->physport->cad->timeout;
-	const int fifo = FIFO (port);
+	const int fifo = FIFO(port);
 	int poll_for = 8; /* 80 usecs */
 	const struct parport_pc_private *priv = port->physport->private_data;
 	const int fifo_depth = priv->fifo_depth;
@@ -526,25 +530,25 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
 	port = port->physport;
 
 	/* We don't want to be interrupted every character. */
-	parport_pc_disable_irq (port);
+	parport_pc_disable_irq(port);
 	/* set nErrIntrEn and serviceIntr */
-	frob_econtrol (port, (1<<4) | (1<<2), (1<<4) | (1<<2));
+	frob_econtrol(port, (1<<4) | (1<<2), (1<<4) | (1<<2));
 
 	/* Forward mode. */
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
 
 	while (left) {
 		unsigned char byte;
-		unsigned char ecrval = inb (ECONTROL (port));
+		unsigned char ecrval = inb(ECONTROL(port));
 		int i = 0;
 
-		if (need_resched() && time_before (jiffies, expire))
+		if (need_resched() && time_before(jiffies, expire))
 			/* Can't yield the port. */
-			schedule ();
+			schedule();
 
 		/* Anyone else waiting for the port? */
 		if (port->waithead) {
-			printk (KERN_DEBUG "Somebody wants the port\n");
+			printk(KERN_DEBUG "Somebody wants the port\n");
 			break;
 		}
 
@@ -552,21 +556,22 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
 			/* FIFO is full. Wait for interrupt. */
 
 			/* Clear serviceIntr */
-			ECR_WRITE (port, ecrval & ~(1<<2));
-		false_alarm:
-			ret = parport_wait_event (port, HZ);
-			if (ret < 0) break;
+			ECR_WRITE(port, ecrval & ~(1<<2));
+false_alarm:
+			ret = parport_wait_event(port, HZ);
+			if (ret < 0)
+				break;
 			ret = 0;
-			if (!time_before (jiffies, expire)) {
+			if (!time_before(jiffies, expire)) {
 				/* Timed out. */
-				printk (KERN_DEBUG "FIFO write timed out\n");
+				printk(KERN_DEBUG "FIFO write timed out\n");
 				break;
 			}
-			ecrval = inb (ECONTROL (port));
+			ecrval = inb(ECONTROL(port));
 			if (!(ecrval & (1<<2))) {
 				if (need_resched() &&
-				    time_before (jiffies, expire))
-					schedule ();
+				    time_before(jiffies, expire))
+					schedule();
 
 				goto false_alarm;
 			}
@@ -577,38 +582,38 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
 		/* Can't fail now. */
 		expire = jiffies + port->cad->timeout;
 
-	poll:
-		if (signal_pending (current))
+poll:
+		if (signal_pending(current))
 			break;
 
 		if (ecrval & 0x01) {
 			/* FIFO is empty. Blast it full. */
 			const int n = left < fifo_depth ? left : fifo_depth;
-			outsb (fifo, bufp, n);
+			outsb(fifo, bufp, n);
 			bufp += n;
 			left -= n;
 
 			/* Adjust the poll time. */
-			if (i < (poll_for - 2)) poll_for--;
+			if (i < (poll_for - 2))
+				poll_for--;
 			continue;
 		} else if (i++ < poll_for) {
-			udelay (10);
-			ecrval = inb (ECONTROL (port));
+			udelay(10);
+			ecrval = inb(ECONTROL(port));
 			goto poll;
 		}
 
-		/* Half-full (call me an optimist) */
+		/* Half-full(call me an optimist) */
 		byte = *bufp++;
-		outb (byte, fifo);
+		outb(byte, fifo);
 		left--;
-        }
-
-dump_parport_state ("leave fifo_write_block_pio", port);
+	}
+	dump_parport_state("leave fifo_write_block_pio", port);
 	return length - left;
 }
 
 #ifdef HAS_DMA
-static size_t parport_pc_fifo_write_block_dma (struct parport *port,
+static size_t parport_pc_fifo_write_block_dma(struct parport *port,
 					       const void *buf, size_t length)
 {
 	int ret = 0;
@@ -621,7 +626,7 @@ static size_t parport_pc_fifo_write_block_dma (struct parport *port,
 	unsigned long start = (unsigned long) buf;
 	unsigned long end = (unsigned long) buf + length - 1;
 
-dump_parport_state ("enter fifo_write_block_dma", port);
+dump_parport_state("enter fifo_write_block_dma", port);
 	if (end < MAX_DMA_ADDRESS) {
 		/* If it would cross a 64k boundary, cap it at the end. */
 		if ((start ^ end) & ~0xffffUL)
@@ -629,8 +634,9 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 
 		dma_addr = dma_handle = dma_map_single(dev, (void *)buf, length,
 						       DMA_TO_DEVICE);
-        } else {
-		/* above 16 MB we use a bounce buffer as ISA-DMA is not possible */
+	} else {
+		/* above 16 MB we use a bounce buffer as ISA-DMA
+		   is not possible */
 		maxlen   = PAGE_SIZE;          /* sizeof(priv->dma_buf) */
 		dma_addr = priv->dma_handle;
 		dma_handle = 0;
@@ -639,12 +645,12 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 	port = port->physport;
 
 	/* We don't want to be interrupted every character. */
-	parport_pc_disable_irq (port);
+	parport_pc_disable_irq(port);
 	/* set nErrIntrEn and serviceIntr */
-	frob_econtrol (port, (1<<4) | (1<<2), (1<<4) | (1<<2));
+	frob_econtrol(port, (1<<4) | (1<<2), (1<<4) | (1<<2));
 
 	/* Forward mode. */
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
 
 	while (left) {
 		unsigned long expire = jiffies + port->physport->cad->timeout;
@@ -665,10 +671,10 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 		set_dma_count(port->dma, count);
 
 		/* Set DMA mode */
-		frob_econtrol (port, 1<<3, 1<<3);
+		frob_econtrol(port, 1<<3, 1<<3);
 
 		/* Clear serviceIntr */
-		frob_econtrol (port, 1<<2, 0);
+		frob_econtrol(port, 1<<2, 0);
 
 		enable_dma(port->dma);
 		release_dma_lock(dmaflag);
@@ -676,20 +682,22 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 		/* assume DMA will be successful */
 		left -= count;
 		buf  += count;
-		if (dma_handle) dma_addr += count;
+		if (dma_handle)
+			dma_addr += count;
 
 		/* Wait for interrupt. */
-	false_alarm:
-		ret = parport_wait_event (port, HZ);
-		if (ret < 0) break;
+false_alarm:
+		ret = parport_wait_event(port, HZ);
+		if (ret < 0)
+			break;
 		ret = 0;
-		if (!time_before (jiffies, expire)) {
+		if (!time_before(jiffies, expire)) {
 			/* Timed out. */
-			printk (KERN_DEBUG "DMA write timed out\n");
+			printk(KERN_DEBUG "DMA write timed out\n");
 			break;
 		}
 		/* Is serviceIntr set? */
-		if (!(inb (ECONTROL (port)) & (1<<2))) {
+		if (!(inb(ECONTROL(port)) & (1<<2))) {
 			cond_resched();
 
 			goto false_alarm;
@@ -705,14 +713,15 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 
 		/* Anyone else waiting for the port? */
 		if (port->waithead) {
-			printk (KERN_DEBUG "Somebody wants the port\n");
+			printk(KERN_DEBUG "Somebody wants the port\n");
 			break;
 		}
 
 		/* update for possible DMA residue ! */
 		buf  -= count;
 		left += count;
-		if (dma_handle) dma_addr -= count;
+		if (dma_handle)
+			dma_addr -= count;
 	}
 
 	/* Maybe got here through break, so adjust for DMA residue! */
@@ -723,12 +732,12 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 	release_dma_lock(dmaflag);
 
 	/* Turn off DMA mode */
-	frob_econtrol (port, 1<<3, 0);
+	frob_econtrol(port, 1<<3, 0);
 
 	if (dma_handle)
 		dma_unmap_single(dev, dma_handle, length, DMA_TO_DEVICE);
 
-dump_parport_state ("leave fifo_write_block_dma", port);
+dump_parport_state("leave fifo_write_block_dma", port);
 	return length - left;
 }
 #endif
@@ -738,13 +747,13 @@ static inline size_t parport_pc_fifo_write_block(struct parport *port,
 {
 #ifdef HAS_DMA
 	if (port->dma != PARPORT_DMA_NONE)
-		return parport_pc_fifo_write_block_dma (port, buf, length);
+		return parport_pc_fifo_write_block_dma(port, buf, length);
 #endif
-	return parport_pc_fifo_write_block_pio (port, buf, length);
+	return parport_pc_fifo_write_block_pio(port, buf, length);
 }
 
 /* Parallel Port FIFO mode (ECP chipsets) */
-static size_t parport_pc_compat_write_block_pio (struct parport *port,
+static size_t parport_pc_compat_write_block_pio(struct parport *port,
 						 const void *buf, size_t length,
 						 int flags)
 {
@@ -756,14 +765,16 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
 	if (port->physport->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-		return parport_ieee1284_write_compat (port, buf,
+		return parport_ieee1284_write_compat(port, buf,
 						      length, flags);
 
 	/* Set up parallel port FIFO mode.*/
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
-	parport_pc_frob_control (port, PARPORT_CONTROL_STROBE, 0);
-	r = change_mode (port, ECR_PPF); /* Parallel port FIFO */
-	if (r)  printk (KERN_DEBUG "%s: Warning change_mode ECR_PPF failed\n", port->name);
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
+	parport_pc_frob_control(port, PARPORT_CONTROL_STROBE, 0);
+	r = change_mode(port, ECR_PPF); /* Parallel port FIFO */
+	if (r)
+		printk(KERN_DEBUG "%s: Warning change_mode ECR_PPF failed\n",
+								port->name);
 
 	port->physport->ieee1284.phase = IEEE1284_PH_FWD_DATA;
 
@@ -775,40 +786,39 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 	 * the FIFO is empty, so allow 4 seconds for each position
 	 * in the fifo.
 	 */
-        expire = jiffies + (priv->fifo_depth * HZ * 4);
+	expire = jiffies + (priv->fifo_depth * HZ * 4);
 	do {
 		/* Wait for the FIFO to empty */
-		r = change_mode (port, ECR_PS2);
-		if (r != -EBUSY) {
+		r = change_mode(port, ECR_PS2);
+		if (r != -EBUSY)
 			break;
-		}
-	} while (time_before (jiffies, expire));
+	} while (time_before(jiffies, expire));
 	if (r == -EBUSY) {
 
-		printk (KERN_DEBUG "%s: FIFO is stuck\n", port->name);
+		printk(KERN_DEBUG "%s: FIFO is stuck\n", port->name);
 
 		/* Prevent further data transfer. */
-		frob_set_mode (port, ECR_TST);
+		frob_set_mode(port, ECR_TST);
 
 		/* Adjust for the contents of the FIFO. */
 		for (written -= priv->fifo_depth; ; written++) {
-			if (inb (ECONTROL (port)) & 0x2) {
+			if (inb(ECONTROL(port)) & 0x2) {
 				/* Full up. */
 				break;
 			}
-			outb (0, FIFO (port));
+			outb(0, FIFO(port));
 		}
 
 		/* Reset the FIFO and return to PS2 mode. */
-		frob_set_mode (port, ECR_PS2);
+		frob_set_mode(port, ECR_PS2);
 	}
 
-	r = parport_wait_peripheral (port,
+	r = parport_wait_peripheral(port,
 				     PARPORT_STATUS_BUSY,
 				     PARPORT_STATUS_BUSY);
 	if (r)
-		printk (KERN_DEBUG
-			"%s: BUSY timeout (%d) in compat_write_block_pio\n", 
+		printk(KERN_DEBUG
+			"%s: BUSY timeout (%d) in compat_write_block_pio\n",
 			port->name, r);
 
 	port->physport->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
@@ -818,7 +828,7 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 
 /* ECP */
 #ifdef CONFIG_PARPORT_1284
-static size_t parport_pc_ecp_write_block_pio (struct parport *port,
+static size_t parport_pc_ecp_write_block_pio(struct parport *port,
 					      const void *buf, size_t length,
 					      int flags)
 {
@@ -830,36 +840,38 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
 	if (port->physport->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-		return parport_ieee1284_ecp_write_data (port, buf,
+		return parport_ieee1284_ecp_write_data(port, buf,
 							length, flags);
 
 	/* Switch to forward mode if necessary. */
 	if (port->physport->ieee1284.phase != IEEE1284_PH_FWD_IDLE) {
 		/* Event 47: Set nInit high. */
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_INIT
 				      | PARPORT_CONTROL_AUTOFD,
 				      PARPORT_CONTROL_INIT
 				      | PARPORT_CONTROL_AUTOFD);
 
 		/* Event 49: PError goes high. */
-		r = parport_wait_peripheral (port,
+		r = parport_wait_peripheral(port,
 					     PARPORT_STATUS_PAPEROUT,
 					     PARPORT_STATUS_PAPEROUT);
 		if (r) {
-			printk (KERN_DEBUG "%s: PError timeout (%d) "
+			printk(KERN_DEBUG "%s: PError timeout (%d) "
 				"in ecp_write_block_pio\n", port->name, r);
 		}
 	}
 
 	/* Set up ECP parallel port mode.*/
-	parport_pc_data_forward (port); /* Must be in PS2 mode */
-	parport_pc_frob_control (port,
+	parport_pc_data_forward(port); /* Must be in PS2 mode */
+	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_STROBE |
 				 PARPORT_CONTROL_AUTOFD,
 				 0);
-	r = change_mode (port, ECR_ECP); /* ECP FIFO */
-	if (r) printk (KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n", port->name);
+	r = change_mode(port, ECR_ECP); /* ECP FIFO */
+	if (r)
+		printk(KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n",
+								port->name);
 	port->physport->ieee1284.phase = IEEE1284_PH_FWD_DATA;
 
 	/* Write the data to the FIFO. */
@@ -873,55 +885,54 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 	expire = jiffies + (priv->fifo_depth * (HZ * 4));
 	do {
 		/* Wait for the FIFO to empty */
-		r = change_mode (port, ECR_PS2);
-		if (r != -EBUSY) {
+		r = change_mode(port, ECR_PS2);
+		if (r != -EBUSY)
 			break;
-		}
-	} while (time_before (jiffies, expire));
+	} while (time_before(jiffies, expire));
 	if (r == -EBUSY) {
 
-		printk (KERN_DEBUG "%s: FIFO is stuck\n", port->name);
+		printk(KERN_DEBUG "%s: FIFO is stuck\n", port->name);
 
 		/* Prevent further data transfer. */
-		frob_set_mode (port, ECR_TST);
+		frob_set_mode(port, ECR_TST);
 
 		/* Adjust for the contents of the FIFO. */
 		for (written -= priv->fifo_depth; ; written++) {
-			if (inb (ECONTROL (port)) & 0x2) {
+			if (inb(ECONTROL(port)) & 0x2) {
 				/* Full up. */
 				break;
 			}
-			outb (0, FIFO (port));
+			outb(0, FIFO(port));
 		}
 
 		/* Reset the FIFO and return to PS2 mode. */
-		frob_set_mode (port, ECR_PS2);
+		frob_set_mode(port, ECR_PS2);
 
 		/* Host transfer recovery. */
-		parport_pc_data_reverse (port); /* Must be in PS2 mode */
-		udelay (5);
-		parport_frob_control (port, PARPORT_CONTROL_INIT, 0);
-		r = parport_wait_peripheral (port, PARPORT_STATUS_PAPEROUT, 0);
+		parport_pc_data_reverse(port); /* Must be in PS2 mode */
+		udelay(5);
+		parport_frob_control(port, PARPORT_CONTROL_INIT, 0);
+		r = parport_wait_peripheral(port, PARPORT_STATUS_PAPEROUT, 0);
 		if (r)
-			printk (KERN_DEBUG "%s: PE,1 timeout (%d) "
+			printk(KERN_DEBUG "%s: PE,1 timeout (%d) "
 				"in ecp_write_block_pio\n", port->name, r);
 
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_INIT,
 				      PARPORT_CONTROL_INIT);
-		r = parport_wait_peripheral (port,
+		r = parport_wait_peripheral(port,
 					     PARPORT_STATUS_PAPEROUT,
 					     PARPORT_STATUS_PAPEROUT);
-                if (r)
-                        printk (KERN_DEBUG "%s: PE,2 timeout (%d) "
+		if (r)
+			printk(KERN_DEBUG "%s: PE,2 timeout (%d) "
 				"in ecp_write_block_pio\n", port->name, r);
 	}
 
-	r = parport_wait_peripheral (port,
-				     PARPORT_STATUS_BUSY, 
+	r = parport_wait_peripheral(port,
+				     PARPORT_STATUS_BUSY,
 				     PARPORT_STATUS_BUSY);
-	if(r)
-		printk (KERN_DEBUG
+	if (r)
+		printk(KERN_DEBUG
 			"%s: BUSY timeout (%d) in ecp_write_block_pio\n",
 			port->name, r);
 
@@ -931,7 +942,7 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 }
 
 #if 0
-static size_t parport_pc_ecp_read_block_pio (struct parport *port,
+static size_t parport_pc_ecp_read_block_pio(struct parport *port,
 					     void *buf, size_t length,
 					     int flags)
 {
@@ -944,13 +955,13 @@ static size_t parport_pc_ecp_read_block_pio (struct parport *port,
 	char *bufp = buf;
 
 	port = port->physport;
-DPRINTK (KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
-dump_parport_state ("enter fcn", port);
+DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
+dump_parport_state("enter fcn", port);
 
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
 	if (port->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-		return parport_ieee1284_ecp_read_data (port, buf,
+		return parport_ieee1284_ecp_read_data(port, buf,
 						       length, flags);
 
 	if (port->ieee1284.mode == IEEE1284_MODE_ECPRLE) {
@@ -966,173 +977,177 @@ dump_parport_state ("enter fcn", port);
 	 * go through software emulation.  Otherwise we may have to throw
 	 * away data. */
 	if (length < fifofull)
-		return parport_ieee1284_ecp_read_data (port, buf,
+		return parport_ieee1284_ecp_read_data(port, buf,
 						       length, flags);
 
 	if (port->ieee1284.phase != IEEE1284_PH_REV_IDLE) {
 		/* change to reverse-idle phase (must be in forward-idle) */
 
 		/* Event 38: Set nAutoFd low (also make sure nStrobe is high) */
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_AUTOFD
 				      | PARPORT_CONTROL_STROBE,
 				      PARPORT_CONTROL_AUTOFD);
-		parport_pc_data_reverse (port); /* Must be in PS2 mode */
-		udelay (5);
+		parport_pc_data_reverse(port); /* Must be in PS2 mode */
+		udelay(5);
 		/* Event 39: Set nInit low to initiate bus reversal */
-		parport_frob_control (port,
+		parport_frob_control(port,
 				      PARPORT_CONTROL_INIT,
 				      0);
 		/* Event 40: Wait for  nAckReverse (PError) to go low */
-		r = parport_wait_peripheral (port, PARPORT_STATUS_PAPEROUT, 0);
-                if (r) {
-                        printk (KERN_DEBUG "%s: PE timeout Event 40 (%d) "
+		r = parport_wait_peripheral(port, PARPORT_STATUS_PAPEROUT, 0);
+		if (r) {
+			printk(KERN_DEBUG "%s: PE timeout Event 40 (%d) "
 				"in ecp_read_block_pio\n", port->name, r);
 			return 0;
 		}
 	}
 
 	/* Set up ECP FIFO mode.*/
-/*	parport_pc_frob_control (port,
+/*	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_STROBE |
 				 PARPORT_CONTROL_AUTOFD,
 				 PARPORT_CONTROL_AUTOFD); */
-	r = change_mode (port, ECR_ECP); /* ECP FIFO */
-	if (r) printk (KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n", port->name);
+	r = change_mode(port, ECR_ECP); /* ECP FIFO */
+	if (r)
+		printk(KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n",
+								port->name);
 
 	port->ieee1284.phase = IEEE1284_PH_REV_DATA;
 
 	/* the first byte must be collected manually */
-dump_parport_state ("pre 43", port);
+	dump_parport_state("pre 43", port);
 	/* Event 43: Wait for nAck to go low */
-	r = parport_wait_peripheral (port, PARPORT_STATUS_ACK, 0);
+	r = parport_wait_peripheral(port, PARPORT_STATUS_ACK, 0);
 	if (r) {
 		/* timed out while reading -- no data */
-		printk (KERN_DEBUG "PIO read timed out (initial byte)\n");
+		printk(KERN_DEBUG "PIO read timed out (initial byte)\n");
 		goto out_no_data;
 	}
 	/* read byte */
-	*bufp++ = inb (DATA (port));
+	*bufp++ = inb(DATA(port));
 	left--;
-dump_parport_state ("43-44", port);
+	dump_parport_state("43-44", port);
 	/* Event 44: nAutoFd (HostAck) goes high to acknowledge */
-	parport_pc_frob_control (port,
+	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_AUTOFD,
 				 0);
-dump_parport_state ("pre 45", port);
+	dump_parport_state("pre 45", port);
 	/* Event 45: Wait for nAck to go high */
-/*	r = parport_wait_peripheral (port, PARPORT_STATUS_ACK, PARPORT_STATUS_ACK); */
-dump_parport_state ("post 45", port);
-r = 0;
+	/* r = parport_wait_peripheral(port, PARPORT_STATUS_ACK,
+						PARPORT_STATUS_ACK); */
+	dump_parport_state("post 45", port);
+	r = 0;
 	if (r) {
 		/* timed out while waiting for peripheral to respond to ack */
-		printk (KERN_DEBUG "ECP PIO read timed out (waiting for nAck)\n");
+		printk(KERN_DEBUG "ECP PIO read timed out (waiting for nAck)\n");
 
 		/* keep hold of the byte we've got already */
 		goto out_no_data;
 	}
 	/* Event 46: nAutoFd (HostAck) goes low to accept more data */
-	parport_pc_frob_control (port,
+	parport_pc_frob_control(port,
 				 PARPORT_CONTROL_AUTOFD,
 				 PARPORT_CONTROL_AUTOFD);
 
 
-dump_parport_state ("rev idle", port);
+	dump_parport_state("rev idle", port);
 	/* Do the transfer. */
 	while (left > fifofull) {
 		int ret;
 		unsigned long expire = jiffies + port->cad->timeout;
-		unsigned char ecrval = inb (ECONTROL (port));
+		unsigned char ecrval = inb(ECONTROL(port));
 
-		if (need_resched() && time_before (jiffies, expire))
+		if (need_resched() && time_before(jiffies, expire))
 			/* Can't yield the port. */
-			schedule ();
+			schedule();
 
 		/* At this point, the FIFO may already be full. In
-                 * that case ECP is already holding back the
-                 * peripheral (assuming proper design) with a delayed
-                 * handshake.  Work fast to avoid a peripheral
-                 * timeout.  */
+		 * that case ECP is already holding back the
+		 * peripheral (assuming proper design) with a delayed
+		 * handshake.  Work fast to avoid a peripheral
+		 * timeout.  */
 
 		if (ecrval & 0x01) {
 			/* FIFO is empty. Wait for interrupt. */
-dump_parport_state ("FIFO empty", port);
+			dump_parport_state("FIFO empty", port);
 
 			/* Anyone else waiting for the port? */
 			if (port->waithead) {
-				printk (KERN_DEBUG "Somebody wants the port\n");
+				printk(KERN_DEBUG "Somebody wants the port\n");
 				break;
 			}
 
 			/* Clear serviceIntr */
-			ECR_WRITE (port, ecrval & ~(1<<2));
-		false_alarm:
-dump_parport_state ("waiting", port);
-			ret = parport_wait_event (port, HZ);
-DPRINTK (KERN_DEBUG "parport_wait_event returned %d\n", ret);
+			ECR_WRITE(port, ecrval & ~(1<<2));
+false_alarm:
+			dump_parport_state("waiting", port);
+			ret = parport_wait_event(port, HZ);
+			DPRINTK(KERN_DEBUG "parport_wait_event returned %d\n",
+									ret);
 			if (ret < 0)
 				break;
 			ret = 0;
-			if (!time_before (jiffies, expire)) {
+			if (!time_before(jiffies, expire)) {
 				/* Timed out. */
-dump_parport_state ("timeout", port);
-				printk (KERN_DEBUG "PIO read timed out\n");
+				dump_parport_state("timeout", port);
+				printk(KERN_DEBUG "PIO read timed out\n");
 				break;
 			}
-			ecrval = inb (ECONTROL (port));
+			ecrval = inb(ECONTROL(port));
 			if (!(ecrval & (1<<2))) {
 				if (need_resched() &&
-				    time_before (jiffies, expire)) {
-					schedule ();
+				    time_before(jiffies, expire)) {
+					schedule();
 				}
 				goto false_alarm;
 			}
 
 			/* Depending on how the FIFO threshold was
-                         * set, how long interrupt service took, and
-                         * how fast the peripheral is, we might be
-                         * lucky and have a just filled FIFO. */
+			 * set, how long interrupt service took, and
+			 * how fast the peripheral is, we might be
+			 * lucky and have a just filled FIFO. */
 			continue;
 		}
 
 		if (ecrval & 0x02) {
 			/* FIFO is full. */
-dump_parport_state ("FIFO full", port);
-			insb (fifo, bufp, fifo_depth);
+dump_parport_state("FIFO full", port);
+			insb(fifo, bufp, fifo_depth);
 			bufp += fifo_depth;
 			left -= fifo_depth;
 			continue;
 		}
 
-DPRINTK (KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
+DPRINTK(KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 
 		/* FIFO not filled.  We will cycle this loop for a while
-                 * and either the peripheral will fill it faster,
-                 * tripping a fast empty with insb, or we empty it. */
-		*bufp++ = inb (fifo);
+		 * and either the peripheral will fill it faster,
+		 * tripping a fast empty with insb, or we empty it. */
+		*bufp++ = inb(fifo);
 		left--;
 	}
 
 	/* scoop up anything left in the FIFO */
-	while (left && !(inb (ECONTROL (port) & 0x01))) {
-		*bufp++ = inb (fifo);
+	while (left && !(inb(ECONTROL(port) & 0x01))) {
+		*bufp++ = inb(fifo);
 		left--;
 	}
 
 	port->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-dump_parport_state ("rev idle2", port);
+dump_parport_state("rev idle2", port);
 
 out_no_data:
 
 	/* Go to forward idle mode to shut the peripheral up (event 47). */
-	parport_frob_control (port, PARPORT_CONTROL_INIT, PARPORT_CONTROL_INIT);
+	parport_frob_control(port, PARPORT_CONTROL_INIT, PARPORT_CONTROL_INIT);
 
 	/* event 49: PError goes high */
-	r = parport_wait_peripheral (port,
+	r = parport_wait_peripheral(port,
 				     PARPORT_STATUS_PAPEROUT,
 				     PARPORT_STATUS_PAPEROUT);
 	if (r) {
-		printk (KERN_DEBUG
+		printk(KERN_DEBUG
 			"%s: PE timeout FWDIDLE (%d) in ecp_read_block_pio\n",
 			port->name, r);
 	}
@@ -1141,14 +1156,14 @@ dump_parport_state ("rev idle2", port);
 
 	/* Finish up. */
 	{
-		int lost = get_fifo_residue (port);
+		int lost = get_fifo_residue(port);
 		if (lost)
 			/* Shouldn't happen with compliant peripherals. */
-			printk (KERN_DEBUG "%s: DATA LOSS (%d bytes)!\n",
+			printk(KERN_DEBUG "%s: DATA LOSS (%d bytes)!\n",
 				port->name, lost);
 	}
 
-dump_parport_state ("fwd idle", port);
+dump_parport_state("fwd idle", port);
 	return length - left;
 }
 #endif  /*  0  */
@@ -1164,8 +1179,7 @@ dump_parport_state ("fwd idle", port);
 
 /* GCC is not inlining extern inline function later overwriten to non-inline,
    so we use outlined_ variants here.  */
-static const struct parport_operations parport_pc_ops =
-{
+static const struct parport_operations parport_pc_ops = {
 	.write_data	= parport_pc_write_data,
 	.read_data	= parport_pc_read_data,
 
@@ -1205,46 +1219,52 @@ static const struct parport_operations parport_pc_ops =
 /* Super-IO chipset detection, Winbond, SMSC */
 static void __devinit show_parconfig_smsc37c669(int io, int key)
 {
-	int cr1,cr4,cra,cr23,cr26,cr27,i=0;
-	static const char *const modes[]={
+	int cr1, cr4, cra, cr23, cr26, cr27, i = 0;
+	static const char *const modes[] = {
 		"SPP and Bidirectional (PS/2)",
 		"EPP and SPP",
 		"ECP",
 		"ECP and EPP" };
 
-	outb(key,io);
-	outb(key,io);
-	outb(1,io);
-	cr1=inb(io+1);
-	outb(4,io);
-	cr4=inb(io+1);
-	outb(0x0a,io);
-	cra=inb(io+1);
-	outb(0x23,io);
-	cr23=inb(io+1);
-	outb(0x26,io);
-	cr26=inb(io+1);
-	outb(0x27,io);
-	cr27=inb(io+1);
-	outb(0xaa,io);
+	outb(key, io);
+	outb(key, io);
+	outb(1, io);
+	cr1 = inb(io + 1);
+	outb(4, io);
+	cr4 = inb(io + 1);
+	outb(0x0a, io);
+	cra = inb(io + 1);
+	outb(0x23, io);
+	cr23 = inb(io + 1);
+	outb(0x26, io);
+	cr26 = inb(io + 1);
+	outb(0x27, io);
+	cr27 = inb(io + 1);
+	outb(0xaa, io);
 
 	if (verbose_probing) {
-		printk (KERN_INFO "SMSC 37c669 LPT Config: cr_1=0x%02x, 4=0x%02x, "
+		printk(KERN_INFO
+			"SMSC 37c669 LPT Config: cr_1=0x%02x, 4=0x%02x, "
 			"A=0x%2x, 23=0x%02x, 26=0x%02x, 27=0x%02x\n",
-			cr1,cr4,cra,cr23,cr26,cr27);
-		
+			cr1, cr4, cra, cr23, cr26, cr27);
+
 		/* The documentation calls DMA and IRQ-Lines by letters, so
 		   the board maker can/will wire them
 		   appropriately/randomly...  G=reserved H=IDE-irq, */
-		printk (KERN_INFO "SMSC LPT Config: io=0x%04x, irq=%c, dma=%c, "
-			"fifo threshold=%d\n", cr23*4,
-			(cr27 &0x0f) ? 'A'-1+(cr27 &0x0f): '-',
-			(cr26 &0x0f) ? 'A'-1+(cr26 &0x0f): '-', cra & 0x0f);
+		printk(KERN_INFO
+	"SMSC LPT Config: io=0x%04x, irq=%c, dma=%c, fifo threshold=%d\n",
+				cr23 * 4,
+				(cr27 & 0x0f) ? 'A' - 1 + (cr27 & 0x0f) : '-',
+				(cr26 & 0x0f) ? 'A' - 1 + (cr26 & 0x0f) : '-',
+				cra & 0x0f);
 		printk(KERN_INFO "SMSC LPT Config: enabled=%s power=%s\n",
-		       (cr23*4 >=0x100) ?"yes":"no", (cr1 & 4) ? "yes" : "no");
-		printk(KERN_INFO "SMSC LPT Config: Port mode=%s, EPP version =%s\n",
-		       (cr1 & 0x08 ) ? "Standard mode only (SPP)" : modes[cr4 & 0x03], 
-		       (cr4 & 0x40) ? "1.7" : "1.9");
+		       (cr23 * 4 >= 0x100) ? "yes" : "no",
+		       (cr1 & 4) ? "yes" : "no");
+		printk(KERN_INFO
+			"SMSC LPT Config: Port mode=%s, EPP version =%s\n",
+				(cr1 & 0x08) ? "Standard mode only (SPP)"
+					      : modes[cr4 & 0x03],
+				(cr4 & 0x40) ? "1.7" : "1.9");
 	}
 
 	/* Heuristics !  BIOS setup for this mainboard device limits
@@ -1258,32 +1278,32 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 			printk(KERN_INFO "Super-IO: too many chips!\n");
 		} else {
 			int d;
-			switch (cr23*4) {
-				case 0x3bc:
-					superios[i].io = 0x3bc;
-					superios[i].irq = 7;
-					break;
-				case 0x378:
-					superios[i].io = 0x378;
-					superios[i].irq = 7;
-					break;
-				case 0x278:
-					superios[i].io = 0x278;
-					superios[i].irq = 5;
+			switch (cr23 * 4) {
+			case 0x3bc:
+				superios[i].io = 0x3bc;
+				superios[i].irq = 7;
+				break;
+			case 0x378:
+				superios[i].io = 0x378;
+				superios[i].irq = 7;
+				break;
+			case 0x278:
+				superios[i].io = 0x278;
+				superios[i].irq = 5;
 			}
-			d=(cr26 &0x0f);
-			if((d==1) || (d==3)) 
-				superios[i].dma= d;
+			d = (cr26 & 0x0f);
+			if (d == 1 || d == 3)
+				superios[i].dma = d;
 			else
-				superios[i].dma= PARPORT_DMA_NONE;
+				superios[i].dma = PARPORT_DMA_NONE;
 		}
- 	}
+	}
 }
 
 
 static void __devinit show_parconfig_winbond(int io, int key)
 {
-	int cr30,cr60,cr61,cr70,cr74,crf0,i=0;
+	int cr30, cr60, cr61, cr70, cr74, crf0, i = 0;
 	static const char *const modes[] = {
 		"Standard (SPP) and Bidirectional(PS/2)", /* 0 */
 		"EPP-1.9 and SPP",
@@ -1296,40 +1316,43 @@ static void __devinit show_parconfig_winbond(int io, int key)
 	static char *const irqtypes[] = {
 		"pulsed low, high-Z",
 		"follows nACK" };
-		
+
 	/* The registers are called compatible-PnP because the
-           register layout is modelled after ISA-PnP, the access
-           method is just another ... */
-	outb(key,io);
-	outb(key,io);
-	outb(0x07,io);   /* Register 7: Select Logical Device */
-	outb(0x01,io+1); /* LD1 is Parallel Port */
-	outb(0x30,io);
-	cr30=inb(io+1);
-	outb(0x60,io);
-	cr60=inb(io+1);
-	outb(0x61,io);
-	cr61=inb(io+1);
-	outb(0x70,io);
-	cr70=inb(io+1);
-	outb(0x74,io);
-	cr74=inb(io+1);
-	outb(0xf0,io);
-	crf0=inb(io+1);
-	outb(0xaa,io);
+	   register layout is modelled after ISA-PnP, the access
+	   method is just another ... */
+	outb(key, io);
+	outb(key, io);
+	outb(0x07, io);   /* Register 7: Select Logical Device */
+	outb(0x01, io + 1); /* LD1 is Parallel Port */
+	outb(0x30, io);
+	cr30 = inb(io + 1);
+	outb(0x60, io);
+	cr60 = inb(io + 1);
+	outb(0x61, io);
+	cr61 = inb(io + 1);
+	outb(0x70, io);
+	cr70 = inb(io + 1);
+	outb(0x74, io);
+	cr74 = inb(io + 1);
+	outb(0xf0, io);
+	crf0 = inb(io + 1);
+	outb(0xaa, io);
 
 	if (verbose_probing) {
-		printk(KERN_INFO "Winbond LPT Config: cr_30=%02x 60,61=%02x%02x "
-		       "70=%02x 74=%02x, f0=%02x\n", cr30,cr60,cr61,cr70,cr74,crf0);
-		printk(KERN_INFO "Winbond LPT Config: active=%s, io=0x%02x%02x irq=%d, ", 
-		       (cr30 & 0x01) ? "yes":"no", cr60,cr61,cr70&0x0f );
+		printk(KERN_INFO
+    "Winbond LPT Config: cr_30=%02x 60,61=%02x%02x 70=%02x 74=%02x, f0=%02x\n",
+					cr30, cr60, cr61, cr70, cr74, crf0);
+		printk(KERN_INFO "Winbond LPT Config: active=%s, io=0x%02x%02x irq=%d, ",
+		       (cr30 & 0x01) ? "yes" : "no", cr60, cr61, cr70 & 0x0f);
 		if ((cr74 & 0x07) > 3)
 			printk("dma=none\n");
 		else
-			printk("dma=%d\n",cr74 & 0x07);
-		printk(KERN_INFO "Winbond LPT Config: irqtype=%s, ECP fifo threshold=%d\n",
-		       irqtypes[crf0>>7], (crf0>>3)&0x0f);
-		printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n", modes[crf0 & 0x07]);
+			printk("dma=%d\n", cr74 & 0x07);
+		printk(KERN_INFO
+		    "Winbond LPT Config: irqtype=%s, ECP fifo threshold=%d\n",
+					irqtypes[crf0>>7], (crf0>>3)&0x0f);
+		printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n",
+					modes[crf0 & 0x07]);
 	}
 
 	if (cr30 & 0x01) { /* the settings can be interrogated later ... */
@@ -1346,60 +1369,82 @@ static void __devinit show_parconfig_winbond(int io, int key)
 	}
 }
 
-static void __devinit decode_winbond(int efer, int key, int devid, int devrev, int oldid)
+static void __devinit decode_winbond(int efer, int key, int devid,
+							int devrev, int oldid)
 {
 	const char *type = "unknown";
-	int id,progif=2;
+	int id, progif = 2;
 
 	if (devid == devrev)
 		/* simple heuristics, we happened to read some
-                   non-winbond register */
+		   non-winbond register */
 		return;
 
-	id=(devid<<8) | devrev;
+	id = (devid << 8) | devrev;
 
 	/* Values are from public data sheets pdf files, I can just
-           confirm 83977TF is correct :-) */
-	if      (id == 0x9771) type="83977F/AF";
-	else if (id == 0x9773) type="83977TF / SMSC 97w33x/97w34x";
-	else if (id == 0x9774) type="83977ATF";
-	else if ((id & ~0x0f) == 0x5270) type="83977CTF / SMSC 97w36x";
-	else if ((id & ~0x0f) == 0x52f0) type="83977EF / SMSC 97w35x";
-	else if ((id & ~0x0f) == 0x5210) type="83627";
-	else if ((id & ~0x0f) == 0x6010) type="83697HF";
-	else if ((oldid &0x0f ) == 0x0a) { type="83877F"; progif=1;}
-	else if ((oldid &0x0f ) == 0x0b) { type="83877AF"; progif=1;}
-	else if ((oldid &0x0f ) == 0x0c) { type="83877TF"; progif=1;}
-	else if ((oldid &0x0f ) == 0x0d) { type="83877ATF"; progif=1;}
-	else progif=0;
+	   confirm 83977TF is correct :-) */
+	if (id == 0x9771)
+		type = "83977F/AF";
+	else if (id == 0x9773)
+		type = "83977TF / SMSC 97w33x/97w34x";
+	else if (id == 0x9774)
+		type = "83977ATF";
+	else if ((id & ~0x0f) == 0x5270)
+		type = "83977CTF / SMSC 97w36x";
+	else if ((id & ~0x0f) == 0x52f0)
+		type = "83977EF / SMSC 97w35x";
+	else if ((id & ~0x0f) == 0x5210)
+		type = "83627";
+	else if ((id & ~0x0f) == 0x6010)
+		type = "83697HF";
+	else if ((oldid & 0x0f) == 0x0a) {
+		type = "83877F";
+		progif = 1;
+	} else if ((oldid & 0x0f) == 0x0b) {
+		type = "83877AF";
+		progif = 1;
+	} else if ((oldid & 0x0f) == 0x0c) {
+		type = "83877TF";
+		progif = 1;
+	} else if ((oldid & 0x0f) == 0x0d) {
+		type = "83877ATF";
+		progif = 1;
+	} else
+		progif = 0;
 
 	if (verbose_probing)
 		printk(KERN_INFO "Winbond chip at EFER=0x%x key=0x%02x "
-		       "devid=%02x devrev=%02x oldid=%02x type=%s\n", 
+		       "devid=%02x devrev=%02x oldid=%02x type=%s\n",
 		       efer, key, devid, devrev, oldid, type);
 
 	if (progif == 2)
-		show_parconfig_winbond(efer,key);
+		show_parconfig_winbond(efer, key);
 }
 
 static void __devinit decode_smsc(int efer, int key, int devid, int devrev)
 {
-        const char *type = "unknown";
+	const char *type = "unknown";
 	void (*func)(int io, int key);
-        int id;
+	int id;
 
-        if (devid == devrev)
+	if (devid == devrev)
 		/* simple heuristics, we happened to read some
-                   non-smsc register */
+		   non-smsc register */
 		return;
 
-	func=NULL;
-        id=(devid<<8) | devrev;
+	func = NULL;
+	id = (devid << 8) | devrev;
 
-	if	(id==0x0302) {type="37c669"; func=show_parconfig_smsc37c669;}
-	else if	(id==0x6582) type="37c665IR";
-	else if	(devid==0x65) type="37c665GT";
-	else if	(devid==0x66) type="37c666GT";
+	if (id == 0x0302) {
+		type = "37c669";
+		func = show_parconfig_smsc37c669;
+	} else if (id == 0x6582)
+		type = "37c665IR";
+	else if	(devid == 0x65)
+		type = "37c665GT";
+	else if	(devid == 0x66)
+		type = "37c666GT";
 
 	if (verbose_probing)
 		printk(KERN_INFO "SMSC chip at EFER=0x%x "
@@ -1407,138 +1452,138 @@ static void __devinit decode_smsc(int efer, int key, int devid, int devrev)
 		       efer, key, devid, devrev, type);
 
 	if (func)
-		func(efer,key);
+		func(efer, key);
 }
 
 
 static void __devinit winbond_check(int io, int key)
 {
-	int devid,devrev,oldid,x_devid,x_devrev,x_oldid;
+	int devid, devrev, oldid, x_devid, x_devrev, x_oldid;
 
 	if (!request_region(io, 3, __func__))
 		return;
 
 	/* First probe without key */
-	outb(0x20,io);
-	x_devid=inb(io+1);
-	outb(0x21,io);
-	x_devrev=inb(io+1);
-	outb(0x09,io);
-	x_oldid=inb(io+1);
-
-	outb(key,io);
-	outb(key,io);     /* Write Magic Sequence to EFER, extended
-                             funtion enable register */
-	outb(0x20,io);    /* Write EFIR, extended function index register */
-	devid=inb(io+1);  /* Read EFDR, extended function data register */
-	outb(0x21,io);
-	devrev=inb(io+1);
-	outb(0x09,io);
-	oldid=inb(io+1);
-	outb(0xaa,io);    /* Magic Seal */
+	outb(0x20, io);
+	x_devid = inb(io + 1);
+	outb(0x21, io);
+	x_devrev = inb(io + 1);
+	outb(0x09, io);
+	x_oldid = inb(io + 1);
+
+	outb(key, io);
+	outb(key, io);     /* Write Magic Sequence to EFER, extended
+			      funtion enable register */
+	outb(0x20, io);    /* Write EFIR, extended function index register */
+	devid = inb(io + 1);  /* Read EFDR, extended function data register */
+	outb(0x21, io);
+	devrev = inb(io + 1);
+	outb(0x09, io);
+	oldid = inb(io + 1);
+	outb(0xaa, io);    /* Magic Seal */
 
 	if ((x_devid == devid) && (x_devrev == devrev) && (x_oldid == oldid))
 		goto out; /* protection against false positives */
 
-	decode_winbond(io,key,devid,devrev,oldid);
+	decode_winbond(io, key, devid, devrev, oldid);
 out:
 	release_region(io, 3);
 }
 
-static void __devinit winbond_check2(int io,int key)
+static void __devinit winbond_check2(int io, int key)
 {
-        int devid,devrev,oldid,x_devid,x_devrev,x_oldid;
+	int devid, devrev, oldid, x_devid, x_devrev, x_oldid;
 
 	if (!request_region(io, 3, __func__))
 		return;
 
 	/* First probe without the key */
-	outb(0x20,io+2);
-	x_devid=inb(io+2);
-	outb(0x21,io+1);
-	x_devrev=inb(io+2);
-	outb(0x09,io+1);
-	x_oldid=inb(io+2);
-
-        outb(key,io);     /* Write Magic Byte to EFER, extended
-                             funtion enable register */
-        outb(0x20,io+2);  /* Write EFIR, extended function index register */
-        devid=inb(io+2);  /* Read EFDR, extended function data register */
-        outb(0x21,io+1);
-        devrev=inb(io+2);
-        outb(0x09,io+1);
-        oldid=inb(io+2);
-        outb(0xaa,io);    /* Magic Seal */
-
-	if ((x_devid == devid) && (x_devrev == devrev) && (x_oldid == oldid))
+	outb(0x20, io + 2);
+	x_devid = inb(io + 2);
+	outb(0x21, io + 1);
+	x_devrev = inb(io + 2);
+	outb(0x09, io + 1);
+	x_oldid = inb(io + 2);
+
+	outb(key, io);     /* Write Magic Byte to EFER, extended
+			      funtion enable register */
+	outb(0x20, io + 2);  /* Write EFIR, extended function index register */
+	devid = inb(io + 2);  /* Read EFDR, extended function data register */
+	outb(0x21, io + 1);
+	devrev = inb(io + 2);
+	outb(0x09, io + 1);
+	oldid = inb(io + 2);
+	outb(0xaa, io);    /* Magic Seal */
+
+	if (x_devid == devid && x_devrev == devrev && x_oldid == oldid)
 		goto out; /* protection against false positives */
 
-	decode_winbond(io,key,devid,devrev,oldid);
+	decode_winbond(io, key, devid, devrev, oldid);
 out:
 	release_region(io, 3);
 }
 
 static void __devinit smsc_check(int io, int key)
 {
-        int id,rev,oldid,oldrev,x_id,x_rev,x_oldid,x_oldrev;
+	int id, rev, oldid, oldrev, x_id, x_rev, x_oldid, x_oldrev;
 
 	if (!request_region(io, 3, __func__))
 		return;
 
 	/* First probe without the key */
-	outb(0x0d,io);
-	x_oldid=inb(io+1);
-	outb(0x0e,io);
-	x_oldrev=inb(io+1);
-	outb(0x20,io);
-	x_id=inb(io+1);
-	outb(0x21,io);
-	x_rev=inb(io+1);
-
-        outb(key,io);
-        outb(key,io);     /* Write Magic Sequence to EFER, extended
-                             funtion enable register */
-        outb(0x0d,io);    /* Write EFIR, extended function index register */
-        oldid=inb(io+1);  /* Read EFDR, extended function data register */
-        outb(0x0e,io);
-        oldrev=inb(io+1);
-	outb(0x20,io);
-	id=inb(io+1);
-	outb(0x21,io);
-	rev=inb(io+1);
-        outb(0xaa,io);    /* Magic Seal */
-
-	if ((x_id == id) && (x_oldrev == oldrev) &&
-	    (x_oldid == oldid) && (x_rev == rev))
+	outb(0x0d, io);
+	x_oldid = inb(io + 1);
+	outb(0x0e, io);
+	x_oldrev = inb(io + 1);
+	outb(0x20, io);
+	x_id = inb(io + 1);
+	outb(0x21, io);
+	x_rev = inb(io + 1);
+
+	outb(key, io);
+	outb(key, io);     /* Write Magic Sequence to EFER, extended
+			      funtion enable register */
+	outb(0x0d, io);    /* Write EFIR, extended function index register */
+	oldid = inb(io + 1);  /* Read EFDR, extended function data register */
+	outb(0x0e, io);
+	oldrev = inb(io + 1);
+	outb(0x20, io);
+	id = inb(io + 1);
+	outb(0x21, io);
+	rev = inb(io + 1);
+	outb(0xaa, io);    /* Magic Seal */
+
+	if (x_id == id && x_oldrev == oldrev &&
+	    x_oldid == oldid && x_rev == rev)
 		goto out; /* protection against false positives */
 
-        decode_smsc(io,key,oldid,oldrev);
+	decode_smsc(io, key, oldid, oldrev);
 out:
 	release_region(io, 3);
 }
 
 
-static void __devinit detect_and_report_winbond (void)
-{ 
+static void __devinit detect_and_report_winbond(void)
+{
 	if (verbose_probing)
 		printk(KERN_DEBUG "Winbond Super-IO detection, now testing ports 3F0,370,250,4E,2E ...\n");
-	winbond_check(0x3f0,0x87);
-	winbond_check(0x370,0x87);
-	winbond_check(0x2e ,0x87);
-	winbond_check(0x4e ,0x87);
-	winbond_check(0x3f0,0x86);
-	winbond_check2(0x250,0x88); 
-	winbond_check2(0x250,0x89);
+	winbond_check(0x3f0, 0x87);
+	winbond_check(0x370, 0x87);
+	winbond_check(0x2e , 0x87);
+	winbond_check(0x4e , 0x87);
+	winbond_check(0x3f0, 0x86);
+	winbond_check2(0x250, 0x88);
+	winbond_check2(0x250, 0x89);
 }
 
-static void __devinit detect_and_report_smsc (void)
+static void __devinit detect_and_report_smsc(void)
 {
 	if (verbose_probing)
 		printk(KERN_DEBUG "SMSC Super-IO detection, now testing Ports 2F0, 370 ...\n");
-	smsc_check(0x3f0,0x55);
-	smsc_check(0x370,0x55);
-	smsc_check(0x3f0,0x44);
-	smsc_check(0x370,0x44);
+	smsc_check(0x3f0, 0x55);
+	smsc_check(0x370, 0x55);
+	smsc_check(0x3f0, 0x44);
+	smsc_check(0x370, 0x44);
 }
 
 static void __devinit detect_and_report_it87(void)
@@ -1573,7 +1618,7 @@ static void __devinit detect_and_report_it87(void)
 }
 #endif /* CONFIG_PARPORT_PC_SUPERIO */
 
-static int get_superio_dma (struct parport *p)
+static int get_superio_dma(struct parport *p)
 {
 	int i = 0;
 
@@ -1584,15 +1629,15 @@ static int get_superio_dma (struct parport *p)
 	return PARPORT_DMA_NONE;
 }
 
-static int get_superio_irq (struct parport *p)
+static int get_superio_irq(struct parport *p)
 {
 	int i = 0;
 
-        while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
-                i++;
-        if (i != NR_SUPERIOS)
-                return superios[i].irq;
-        return PARPORT_IRQ_NONE;
+	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
+		i++;
+	if (i != NR_SUPERIOS)
+		return superios[i].irq;
+	return PARPORT_IRQ_NONE;
 }
 
 
@@ -1600,9 +1645,9 @@ static int get_superio_irq (struct parport *p)
 
 /*
  * Checks for port existence, all ports support SPP MODE
- * Returns: 
+ * Returns:
  *         0           :  No parallel port at this address
- *  PARPORT_MODE_PCSPP :  SPP port detected 
+ *  PARPORT_MODE_PCSPP :  SPP port detected
  *                        (if the user specified an ioport himself,
  *                         this shall always be the case!)
  *
@@ -1612,7 +1657,7 @@ static int parport_SPP_supported(struct parport *pb)
 	unsigned char r, w;
 
 	/*
-	 * first clear an eventually pending EPP timeout 
+	 * first clear an eventually pending EPP timeout
 	 * I (sailer@ife.ee.ethz.ch) have an SMSC chipset
 	 * that does not even respond to SPP cycles if an EPP
 	 * timeout is pending
@@ -1621,19 +1666,19 @@ static int parport_SPP_supported(struct parport *pb)
 
 	/* Do a simple read-write test to make sure the port exists. */
 	w = 0xc;
-	outb (w, CONTROL (pb));
+	outb(w, CONTROL(pb));
 
 	/* Is there a control register that we can read from?  Some
 	 * ports don't allow reads, so read_control just returns a
 	 * software copy. Some ports _do_ allow reads, so bypass the
 	 * software copy here.  In addition, some bits aren't
 	 * writable. */
-	r = inb (CONTROL (pb));
+	r = inb(CONTROL(pb));
 	if ((r & 0xf) == w) {
 		w = 0xe;
-		outb (w, CONTROL (pb));
-		r = inb (CONTROL (pb));
-		outb (0xc, CONTROL (pb));
+		outb(w, CONTROL(pb));
+		r = inb(CONTROL(pb));
+		outb(0xc, CONTROL(pb));
 		if ((r & 0xf) == w)
 			return PARPORT_MODE_PCSPP;
 	}
@@ -1641,18 +1686,18 @@ static int parport_SPP_supported(struct parport *pb)
 	if (user_specified)
 		/* That didn't work, but the user thinks there's a
 		 * port here. */
-		printk (KERN_INFO "parport 0x%lx (WARNING): CTR: "
+		printk(KERN_INFO "parport 0x%lx (WARNING): CTR: "
 			"wrote 0x%02x, read 0x%02x\n", pb->base, w, r);
 
 	/* Try the data register.  The data lines aren't tri-stated at
 	 * this stage, so we expect back what we wrote. */
 	w = 0xaa;
-	parport_pc_write_data (pb, w);
-	r = parport_pc_read_data (pb);
+	parport_pc_write_data(pb, w);
+	r = parport_pc_read_data(pb);
 	if (r == w) {
 		w = 0x55;
-		parport_pc_write_data (pb, w);
-		r = parport_pc_read_data (pb);
+		parport_pc_write_data(pb, w);
+		r = parport_pc_read_data(pb);
 		if (r == w)
 			return PARPORT_MODE_PCSPP;
 	}
@@ -1660,9 +1705,9 @@ static int parport_SPP_supported(struct parport *pb)
 	if (user_specified) {
 		/* Didn't work, but the user is convinced this is the
 		 * place. */
-		printk (KERN_INFO "parport 0x%lx (WARNING): DATA: "
+		printk(KERN_INFO "parport 0x%lx (WARNING): DATA: "
 			"wrote 0x%02x, read 0x%02x\n", pb->base, w, r);
-		printk (KERN_INFO "parport 0x%lx: You gave this address, "
+		printk(KERN_INFO "parport 0x%lx: You gave this address, "
 			"but there is probably no parallel port there!\n",
 			pb->base);
 	}
@@ -1693,33 +1738,33 @@ static int parport_ECR_present(struct parport *pb)
 	struct parport_pc_private *priv = pb->private_data;
 	unsigned char r = 0xc;
 
-	outb (r, CONTROL (pb));
-	if ((inb (ECONTROL (pb)) & 0x3) == (r & 0x3)) {
-		outb (r ^ 0x2, CONTROL (pb)); /* Toggle bit 1 */
+	outb(r, CONTROL(pb));
+	if ((inb(ECONTROL(pb)) & 0x3) == (r & 0x3)) {
+		outb(r ^ 0x2, CONTROL(pb)); /* Toggle bit 1 */
 
-		r = inb (CONTROL (pb));
-		if ((inb (ECONTROL (pb)) & 0x2) == (r & 0x2))
+		r = inb(CONTROL(pb));
+		if ((inb(ECONTROL(pb)) & 0x2) == (r & 0x2))
 			goto no_reg; /* Sure that no ECR register exists */
 	}
-	
-	if ((inb (ECONTROL (pb)) & 0x3 ) != 0x1)
+
+	if ((inb(ECONTROL(pb)) & 0x3) != 0x1)
 		goto no_reg;
 
-	ECR_WRITE (pb, 0x34);
-	if (inb (ECONTROL (pb)) != 0x35)
+	ECR_WRITE(pb, 0x34);
+	if (inb(ECONTROL(pb)) != 0x35)
 		goto no_reg;
 
 	priv->ecr = 1;
-	outb (0xc, CONTROL (pb));
-	
+	outb(0xc, CONTROL(pb));
+
 	/* Go to mode 000 */
-	frob_set_mode (pb, ECR_SPP);
+	frob_set_mode(pb, ECR_SPP);
 
 	return 1;
 
  no_reg:
-	outb (0xc, CONTROL (pb));
-	return 0; 
+	outb(0xc, CONTROL(pb));
+	return 0;
 }
 
 #ifdef CONFIG_PARPORT_1284
@@ -1729,7 +1774,7 @@ static int parport_ECR_present(struct parport *pb)
  * allows us to read data from the data lines.  In theory we would get back
  * 0xff but any peripheral attached to the port may drag some or all of the
  * lines down to zero.  So if we get back anything that isn't the contents
- * of the data register we deem PS/2 support to be present. 
+ * of the data register we deem PS/2 support to be present.
  *
  * Some SPP ports have "half PS/2" ability - you can't turn off the line
  * drivers, but an external peripheral with sufficiently beefy drivers of
@@ -1737,26 +1782,28 @@ static int parport_ECR_present(struct parport *pb)
  * where they can then be read back as normal.  Ports with this property
  * and the right type of device attached are likely to fail the SPP test,
  * (as they will appear to have stuck bits) and so the fact that they might
- * be misdetected here is rather academic. 
+ * be misdetected here is rather academic.
  */
 
 static int parport_PS2_supported(struct parport *pb)
 {
 	int ok = 0;
-  
+
 	clear_epp_timeout(pb);
 
 	/* try to tri-state the buffer */
-	parport_pc_data_reverse (pb);
-	
+	parport_pc_data_reverse(pb);
+
 	parport_pc_write_data(pb, 0x55);
-	if (parport_pc_read_data(pb) != 0x55) ok++;
+	if (parport_pc_read_data(pb) != 0x55)
+		ok++;
 
 	parport_pc_write_data(pb, 0xaa);
-	if (parport_pc_read_data(pb) != 0xaa) ok++;
+	if (parport_pc_read_data(pb) != 0xaa)
+		ok++;
 
 	/* cancel input mode */
-	parport_pc_data_forward (pb);
+	parport_pc_data_forward(pb);
 
 	if (ok) {
 		pb->modes |= PARPORT_MODE_TRISTATE;
@@ -1775,68 +1822,68 @@ static int parport_ECP_supported(struct parport *pb)
 	int config, configb;
 	int pword;
 	struct parport_pc_private *priv = pb->private_data;
-	/* Translate ECP intrLine to ISA irq value */	
-	static const int intrline[]= { 0, 7, 9, 10, 11, 14, 15, 5 }; 
+	/* Translate ECP intrLine to ISA irq value */
+	static const int intrline[] = { 0, 7, 9, 10, 11, 14, 15, 5 };
 
 	/* If there is no ECR, we have no hope of supporting ECP. */
 	if (!priv->ecr)
 		return 0;
 
 	/* Find out FIFO depth */
-	ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-	ECR_WRITE (pb, ECR_TST << 5); /* TEST FIFO */
-	for (i=0; i < 1024 && !(inb (ECONTROL (pb)) & 0x02); i++)
-		outb (0xaa, FIFO (pb));
+	ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+	ECR_WRITE(pb, ECR_TST << 5); /* TEST FIFO */
+	for (i = 0; i < 1024 && !(inb(ECONTROL(pb)) & 0x02); i++)
+		outb(0xaa, FIFO(pb));
 
 	/*
 	 * Using LGS chipset it uses ECR register, but
 	 * it doesn't support ECP or FIFO MODE
 	 */
 	if (i == 1024) {
-		ECR_WRITE (pb, ECR_SPP << 5);
+		ECR_WRITE(pb, ECR_SPP << 5);
 		return 0;
 	}
 
 	priv->fifo_depth = i;
 	if (verbose_probing)
-		printk (KERN_DEBUG "0x%lx: FIFO is %d bytes\n", pb->base, i);
+		printk(KERN_DEBUG "0x%lx: FIFO is %d bytes\n", pb->base, i);
 
 	/* Find out writeIntrThreshold */
-	frob_econtrol (pb, 1<<2, 1<<2);
-	frob_econtrol (pb, 1<<2, 0);
+	frob_econtrol(pb, 1<<2, 1<<2);
+	frob_econtrol(pb, 1<<2, 0);
 	for (i = 1; i <= priv->fifo_depth; i++) {
-		inb (FIFO (pb));
-		udelay (50);
-		if (inb (ECONTROL (pb)) & (1<<2))
+		inb(FIFO(pb));
+		udelay(50);
+		if (inb(ECONTROL(pb)) & (1<<2))
 			break;
 	}
 
 	if (i <= priv->fifo_depth) {
 		if (verbose_probing)
-			printk (KERN_DEBUG "0x%lx: writeIntrThreshold is %d\n",
+			printk(KERN_DEBUG "0x%lx: writeIntrThreshold is %d\n",
 				pb->base, i);
 	} else
 		/* Number of bytes we know we can write if we get an
-                   interrupt. */
+		   interrupt. */
 		i = 0;
 
 	priv->writeIntrThreshold = i;
 
 	/* Find out readIntrThreshold */
-	frob_set_mode (pb, ECR_PS2); /* Reset FIFO and enable PS2 */
-	parport_pc_data_reverse (pb); /* Must be in PS2 mode */
-	frob_set_mode (pb, ECR_TST); /* Test FIFO */
-	frob_econtrol (pb, 1<<2, 1<<2);
-	frob_econtrol (pb, 1<<2, 0);
+	frob_set_mode(pb, ECR_PS2); /* Reset FIFO and enable PS2 */
+	parport_pc_data_reverse(pb); /* Must be in PS2 mode */
+	frob_set_mode(pb, ECR_TST); /* Test FIFO */
+	frob_econtrol(pb, 1<<2, 1<<2);
+	frob_econtrol(pb, 1<<2, 0);
 	for (i = 1; i <= priv->fifo_depth; i++) {
-		outb (0xaa, FIFO (pb));
-		if (inb (ECONTROL (pb)) & (1<<2))
+		outb(0xaa, FIFO(pb));
+		if (inb(ECONTROL(pb)) & (1<<2))
 			break;
 	}
 
 	if (i <= priv->fifo_depth) {
 		if (verbose_probing)
-			printk (KERN_INFO "0x%lx: readIntrThreshold is %d\n",
+			printk(KERN_INFO "0x%lx: readIntrThreshold is %d\n",
 				pb->base, i);
 	} else
 		/* Number of bytes we can read if we get an interrupt. */
@@ -1844,23 +1891,23 @@ static int parport_ECP_supported(struct parport *pb)
 
 	priv->readIntrThreshold = i;
 
-	ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-	ECR_WRITE (pb, 0xf4); /* Configuration mode */
-	config = inb (CONFIGA (pb));
+	ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+	ECR_WRITE(pb, 0xf4); /* Configuration mode */
+	config = inb(CONFIGA(pb));
 	pword = (config >> 4) & 0x7;
 	switch (pword) {
 	case 0:
 		pword = 2;
-		printk (KERN_WARNING "0x%lx: Unsupported pword size!\n",
+		printk(KERN_WARNING "0x%lx: Unsupported pword size!\n",
 			pb->base);
 		break;
 	case 2:
 		pword = 4;
-		printk (KERN_WARNING "0x%lx: Unsupported pword size!\n",
+		printk(KERN_WARNING "0x%lx: Unsupported pword size!\n",
 			pb->base);
 		break;
 	default:
-		printk (KERN_WARNING "0x%lx: Unknown implementation ID\n",
+		printk(KERN_WARNING "0x%lx: Unknown implementation ID\n",
 			pb->base);
 		/* Assume 1 */
 	case 1:
@@ -1869,28 +1916,29 @@ static int parport_ECP_supported(struct parport *pb)
 	priv->pword = pword;
 
 	if (verbose_probing) {
-		printk (KERN_DEBUG "0x%lx: PWord is %d bits\n", pb->base, 8 * pword);
-		
-		printk (KERN_DEBUG "0x%lx: Interrupts are ISA-%s\n", pb->base,
+		printk(KERN_DEBUG "0x%lx: PWord is %d bits\n",
+			pb->base, 8 * pword);
+
+		printk(KERN_DEBUG "0x%lx: Interrupts are ISA-%s\n", pb->base,
 			config & 0x80 ? "Level" : "Pulses");
 
-		configb = inb (CONFIGB (pb));
-		printk (KERN_DEBUG "0x%lx: ECP port cfgA=0x%02x cfgB=0x%02x\n",
+		configb = inb(CONFIGB(pb));
+		printk(KERN_DEBUG "0x%lx: ECP port cfgA=0x%02x cfgB=0x%02x\n",
 			pb->base, config, configb);
-		printk (KERN_DEBUG "0x%lx: ECP settings irq=", pb->base);
-		if ((configb >>3) & 0x07)
-			printk("%d",intrline[(configb >>3) & 0x07]);
+		printk(KERN_DEBUG "0x%lx: ECP settings irq=", pb->base);
+		if ((configb >> 3) & 0x07)
+			printk("%d", intrline[(configb >> 3) & 0x07]);
 		else
 			printk("<none or set by other means>");
-		printk (" dma=");
-		if( (configb & 0x03 ) == 0x00)
+		printk(" dma=");
+		if ((configb & 0x03) == 0x00)
 			printk("<none or set by other means>\n");
 		else
-			printk("%d\n",configb & 0x07);
+			printk("%d\n", configb & 0x07);
 	}
 
 	/* Go back to mode 000 */
-	frob_set_mode (pb, ECR_SPP);
+	frob_set_mode(pb, ECR_SPP);
 
 	return 1;
 }
@@ -1905,10 +1953,10 @@ static int parport_ECPPS2_supported(struct parport *pb)
 	if (!priv->ecr)
 		return 0;
 
-	oecr = inb (ECONTROL (pb));
-	ECR_WRITE (pb, ECR_PS2 << 5);
+	oecr = inb(ECONTROL(pb));
+	ECR_WRITE(pb, ECR_PS2 << 5);
 	result = parport_PS2_supported(pb);
-	ECR_WRITE (pb, oecr);
+	ECR_WRITE(pb, oecr);
 	return result;
 }
 
@@ -1932,16 +1980,15 @@ static int parport_EPP_supported(struct parport *pb)
 	 */
 
 	/* If EPP timeout bit clear then EPP available */
-	if (!clear_epp_timeout(pb)) {
+	if (!clear_epp_timeout(pb))
 		return 0;  /* No way to clear timeout */
-	}
 
 	/* Check for Intel bug. */
 	if (priv->ecr) {
 		unsigned char i;
 		for (i = 0x00; i < 0x80; i += 0x20) {
-			ECR_WRITE (pb, i);
-			if (clear_epp_timeout (pb)) {
+			ECR_WRITE(pb, i);
+			if (clear_epp_timeout(pb)) {
 				/* Phony EPP in ECP. */
 				return 0;
 			}
@@ -1965,17 +2012,16 @@ static int parport_ECPEPP_supported(struct parport *pb)
 	int result;
 	unsigned char oecr;
 
-	if (!priv->ecr) {
+	if (!priv->ecr)
 		return 0;
-	}
 
-	oecr = inb (ECONTROL (pb));
+	oecr = inb(ECONTROL(pb));
 	/* Search for SMC style EPP+ECP mode */
-	ECR_WRITE (pb, 0x80);
-	outb (0x04, CONTROL (pb));
+	ECR_WRITE(pb, 0x80);
+	outb(0x04, CONTROL(pb));
 	result = parport_EPP_supported(pb);
 
-	ECR_WRITE (pb, oecr);
+	ECR_WRITE(pb, oecr);
 
 	if (result) {
 		/* Set up access functions to use ECP+EPP hardware. */
@@ -1993,11 +2039,25 @@ static int parport_ECPEPP_supported(struct parport *pb)
 /* Don't bother probing for modes we know we won't use. */
 static int __devinit parport_PS2_supported(struct parport *pb) { return 0; }
 #ifdef CONFIG_PARPORT_PC_FIFO
-static int parport_ECP_supported(struct parport *pb) { return 0; }
+static int parport_ECP_supported(struct parport *pb)
+{
+	return 0;
+}
 #endif
-static int __devinit parport_EPP_supported(struct parport *pb) { return 0; }
-static int __devinit parport_ECPEPP_supported(struct parport *pb){return 0;}
-static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
+static int __devinit parport_EPP_supported(struct parport *pb)
+{
+	return 0;
+}
+
+static int __devinit parport_ECPEPP_supported(struct parport *pb)
+{
+	return 0;
+}
+
+static int __devinit parport_ECPPS2_supported(struct parport *pb)
+{
+	return 0;
+}
 
 #endif /* No IEEE 1284 support */
 
@@ -2007,17 +2067,17 @@ static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
 static int programmable_irq_support(struct parport *pb)
 {
 	int irq, intrLine;
-	unsigned char oecr = inb (ECONTROL (pb));
+	unsigned char oecr = inb(ECONTROL(pb));
 	static const int lookup[8] = {
 		PARPORT_IRQ_NONE, 7, 9, 10, 11, 14, 15, 5
 	};
 
-	ECR_WRITE (pb, ECR_CNF << 5); /* Configuration MODE */
+	ECR_WRITE(pb, ECR_CNF << 5); /* Configuration MODE */
 
-	intrLine = (inb (CONFIGB (pb)) >> 3) & 0x07;
+	intrLine = (inb(CONFIGB(pb)) >> 3) & 0x07;
 	irq = lookup[intrLine];
 
-	ECR_WRITE (pb, oecr);
+	ECR_WRITE(pb, oecr);
 	return irq;
 }
 
@@ -2027,17 +2087,17 @@ static int irq_probe_ECP(struct parport *pb)
 	unsigned long irqs;
 
 	irqs = probe_irq_on();
-		
-	ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-	ECR_WRITE (pb, (ECR_TST << 5) | 0x04);
-	ECR_WRITE (pb, ECR_TST << 5);
+
+	ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+	ECR_WRITE(pb, (ECR_TST << 5) | 0x04);
+	ECR_WRITE(pb, ECR_TST << 5);
 
 	/* If Full FIFO sure that writeIntrThreshold is generated */
-	for (i=0; i < 1024 && !(inb (ECONTROL (pb)) & 0x02) ; i++) 
-		outb (0xaa, FIFO (pb));
-		
+	for (i = 0; i < 1024 && !(inb(ECONTROL(pb)) & 0x02) ; i++)
+		outb(0xaa, FIFO(pb));
+
 	pb->irq = probe_irq_off(irqs);
-	ECR_WRITE (pb, ECR_SPP << 5);
+	ECR_WRITE(pb, ECR_SPP << 5);
 
 	if (pb->irq <= 0)
 		pb->irq = PARPORT_IRQ_NONE;
@@ -2047,7 +2107,7 @@ static int irq_probe_ECP(struct parport *pb)
 
 /*
  * This detection seems that only works in National Semiconductors
- * This doesn't work in SMC, LGS, and Winbond 
+ * This doesn't work in SMC, LGS, and Winbond
  */
 static int irq_probe_EPP(struct parport *pb)
 {
@@ -2058,16 +2118,16 @@ static int irq_probe_EPP(struct parport *pb)
 	unsigned char oecr;
 
 	if (pb->modes & PARPORT_MODE_PCECR)
-		oecr = inb (ECONTROL (pb));
+		oecr = inb(ECONTROL(pb));
 
 	irqs = probe_irq_on();
 
 	if (pb->modes & PARPORT_MODE_PCECR)
-		frob_econtrol (pb, 0x10, 0x10);
-	
+		frob_econtrol(pb, 0x10, 0x10);
+
 	clear_epp_timeout(pb);
-	parport_pc_frob_control (pb, 0x20, 0x20);
-	parport_pc_frob_control (pb, 0x10, 0x10);
+	parport_pc_frob_control(pb, 0x20, 0x20);
+	parport_pc_frob_control(pb, 0x10, 0x10);
 	clear_epp_timeout(pb);
 
 	/* Device isn't expecting an EPP read
@@ -2076,9 +2136,9 @@ static int irq_probe_EPP(struct parport *pb)
 	parport_pc_read_epp(pb);
 	udelay(20);
 
-	pb->irq = probe_irq_off (irqs);
+	pb->irq = probe_irq_off(irqs);
 	if (pb->modes & PARPORT_MODE_PCECR)
-		ECR_WRITE (pb, oecr);
+		ECR_WRITE(pb, oecr);
 	parport_pc_write_control(pb, 0xc);
 
 	if (pb->irq <= 0)
@@ -2135,28 +2195,28 @@ static int parport_irq_probe(struct parport *pb)
 /* --- DMA detection -------------------------------------- */
 
 /* Only if chipset conforms to ECP ISA Interface Standard */
-static int programmable_dma_support (struct parport *p)
+static int programmable_dma_support(struct parport *p)
 {
-	unsigned char oecr = inb (ECONTROL (p));
+	unsigned char oecr = inb(ECONTROL(p));
 	int dma;
 
-	frob_set_mode (p, ECR_CNF);
-	
-	dma = inb (CONFIGB(p)) & 0x07;
+	frob_set_mode(p, ECR_CNF);
+
+	dma = inb(CONFIGB(p)) & 0x07;
 	/* 000: Indicates jumpered 8-bit DMA if read-only.
 	   100: Indicates jumpered 16-bit DMA if read-only. */
 	if ((dma & 0x03) == 0)
 		dma = PARPORT_DMA_NONE;
 
-	ECR_WRITE (p, oecr);
+	ECR_WRITE(p, oecr);
 	return dma;
 }
 
-static int parport_dma_probe (struct parport *p)
+static int parport_dma_probe(struct parport *p)
 {
 	const struct parport_pc_private *priv = p->private_data;
-	if (priv->ecr)
-		p->dma = programmable_dma_support(p); /* ask ECP chipset first */
+	if (priv->ecr)		/* ask ECP chipset first */
+		p->dma = programmable_dma_support(p);
 	if (p->dma == PARPORT_DMA_NONE) {
 		/* ask known Super-IO chips proper, although these
 		   claim ECP compatible, some don't report their DMA
@@ -2214,7 +2274,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	if (!base_res)
 		goto out4;
 
-	memcpy(ops, &parport_pc_ops, sizeof (struct parport_operations));
+	memcpy(ops, &parport_pc_ops, sizeof(struct parport_operations));
 	priv->ctr = 0xc;
 	priv->ctr_writable = ~0x10;
 	priv->ecr = 0;
@@ -2241,7 +2301,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 			if (!parport_EPP_supported(p))
 				parport_ECPEPP_supported(p);
 	}
-	if (!parport_SPP_supported (p))
+	if (!parport_SPP_supported(p))
 		/* No port. */
 		goto out5;
 	if (priv->ecr)
@@ -2249,7 +2309,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	else
 		parport_PS2_supported(p);
 
-	p->size = (p->modes & PARPORT_MODE_EPP)?8:3;
+	p->size = (p->modes & PARPORT_MODE_EPP) ? 8 : 3;
 
 	printk(KERN_INFO "%s: PC-style at 0x%lx", p->name, p->base);
 	if (p->base_hi && priv->ecr)
@@ -2273,7 +2333,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 		}
 	}
 	if (p->dma == PARPORT_DMA_AUTO) /* To use DMA, giving the irq
-                                           is mandatory (see above) */
+					   is mandatory (see above) */
 		p->dma = PARPORT_DMA_NONE;
 
 #ifdef CONFIG_PARPORT_PC_FIFO
@@ -2290,16 +2350,23 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 		if (p->dma != PARPORT_DMA_NONE) {
 			printk(", dma %d", p->dma);
 			p->modes |= PARPORT_MODE_DMA;
-		}
-		else printk(", using FIFO");
-	}
-	else
+		} else
+			printk(", using FIFO");
+	} else
 		/* We can't use the DMA channel after all. */
 		p->dma = PARPORT_DMA_NONE;
 #endif /* Allowed to use FIFO/DMA */
 
 	printk(" [");
-#define printmode(x) {if(p->modes&PARPORT_MODE_##x){printk("%s%s",f?",":"",#x);f++;}}
+
+#define printmode(x) \
+	{\
+		if (p->modes & PARPORT_MODE_##x) {\
+			printk("%s%s", f ? "," : "", #x);\
+			f++;\
+		} \
+	}
+
 	{
 		int f = 0;
 		printmode(PCSPP);
@@ -2311,10 +2378,10 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	}
 #undef printmode
 #ifndef CONFIG_PARPORT_1284
-	printk ("(,...)");
+	printk("(,...)");
 #endif /* CONFIG_PARPORT_1284 */
 	printk("]\n");
-	if (probedirq != PARPORT_IRQ_NONE) 
+	if (probedirq != PARPORT_IRQ_NONE)
 		printk(KERN_INFO "%s: irq %d detected\n", p->name, probedirq);
 
 	/* If No ECP release the ports grabbed above. */
@@ -2330,7 +2397,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	if (p->irq != PARPORT_IRQ_NONE) {
 		if (request_irq(p->irq, parport_irq_handler,
 				 irqflags, p->name, p)) {
-			printk (KERN_WARNING "%s: irq %d in use, "
+			printk(KERN_WARNING "%s: irq %d in use, "
 				"resorting to polled operation\n",
 				p->name, p->irq);
 			p->irq = PARPORT_IRQ_NONE;
@@ -2340,8 +2407,8 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 #ifdef CONFIG_PARPORT_PC_FIFO
 #ifdef HAS_DMA
 		if (p->dma != PARPORT_DMA_NONE) {
-			if (request_dma (p->dma, p->name)) {
-				printk (KERN_WARNING "%s: dma %d in use, "
+			if (request_dma(p->dma, p->name)) {
+				printk(KERN_WARNING "%s: dma %d in use, "
 					"resorting to PIO operation\n",
 					p->name, p->dma);
 				p->dma = PARPORT_DMA_NONE;
@@ -2351,8 +2418,8 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 						       PAGE_SIZE,
 						       &priv->dma_handle,
 						       GFP_KERNEL);
-				if (! priv->dma_buf) {
-					printk (KERN_WARNING "%s: "
+				if (!priv->dma_buf) {
+					printk(KERN_WARNING "%s: "
 						"cannot get buffer for DMA, "
 						"resorting to PIO operation\n",
 						p->name);
@@ -2371,10 +2438,10 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 		 * Put the ECP detected port in PS2 mode.
 		 * Do this also for ports that have ECR but don't do ECP.
 		 */
-		ECR_WRITE (p, 0x34);
+		ECR_WRITE(p, 0x34);
 
 	parport_pc_write_data(p, 0);
-	parport_pc_data_forward (p);
+	parport_pc_data_forward(p);
 
 	/* Now that we've told the sharing engine about the port, and
 	   found out its characteristics, let the high-level drivers
@@ -2382,7 +2449,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 	spin_lock(&ports_lock);
 	list_add(&priv->list, &ports_list);
 	spin_unlock(&ports_lock);
-	parport_announce_port (p);
+	parport_announce_port(p);
 
 	return p;
 
@@ -2395,18 +2462,17 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 out4:
 	parport_put_port(p);
 out3:
-	kfree (priv);
+	kfree(priv);
 out2:
-	kfree (ops);
+	kfree(ops);
 out1:
 	if (pdev)
 		platform_device_unregister(pdev);
 	return NULL;
 }
+EXPORT_SYMBOL(parport_pc_probe_port);
 
-EXPORT_SYMBOL (parport_pc_probe_port);
-
-void parport_pc_unregister_port (struct parport *p)
+void parport_pc_unregister_port(struct parport *p)
 {
 	struct parport_pc_private *priv = p->private_data;
 	struct parport_operations *ops = p->ops;
@@ -2432,17 +2498,16 @@ void parport_pc_unregister_port (struct parport *p)
 				    priv->dma_buf,
 				    priv->dma_handle);
 #endif
-	kfree (p->private_data);
+	kfree(p->private_data);
 	parport_put_port(p);
-	kfree (ops); /* hope no-one cached it */
+	kfree(ops); /* hope no-one cached it */
 }
-
-EXPORT_SYMBOL (parport_pc_unregister_port);
+EXPORT_SYMBOL(parport_pc_unregister_port);
 
 #ifdef CONFIG_PCI
 
 /* ITE support maintained by Rich Liu <richliu@poorman.org> */
-static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
+static int __devinit sio_ite_8872_probe(struct pci_dev *pdev, int autoirq,
 					 int autodma,
 					 const struct parport_pc_via_data *via)
 {
@@ -2454,73 +2519,74 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 	int irq;
 	int i;
 
-	DPRINTK (KERN_DEBUG "sio_ite_8872_probe()\n");
-	
-	// make sure which one chip
-	for(i = 0; i < 5; i++) {
+	DPRINTK(KERN_DEBUG "sio_ite_8872_probe()\n");
+
+	/* make sure which one chip */
+	for (i = 0; i < 5; i++) {
 		base_res = request_region(inta_addr[i], 32, "it887x");
 		if (base_res) {
 			int test;
-			pci_write_config_dword (pdev, 0x60,
+			pci_write_config_dword(pdev, 0x60,
 						0xe5000000 | inta_addr[i]);
-			pci_write_config_dword (pdev, 0x78,
+			pci_write_config_dword(pdev, 0x78,
 						0x00000000 | inta_addr[i]);
-			test = inb (inta_addr[i]);
-			if (test != 0xff) break;
+			test = inb(inta_addr[i]);
+			if (test != 0xff)
+				break;
 			release_region(inta_addr[i], 0x8);
 		}
 	}
-	if(i >= 5) {
-		printk (KERN_INFO "parport_pc: cannot find ITE8872 INTA\n");
+	if (i >= 5) {
+		printk(KERN_INFO "parport_pc: cannot find ITE8872 INTA\n");
 		return 0;
 	}
 
-	type = inb (inta_addr[i] + 0x18);
+	type = inb(inta_addr[i] + 0x18);
 	type &= 0x0f;
 
 	switch (type) {
 	case 0x2:
-		printk (KERN_INFO "parport_pc: ITE8871 found (1P)\n");
+		printk(KERN_INFO "parport_pc: ITE8871 found (1P)\n");
 		ite8872set = 0x64200000;
 		break;
 	case 0xa:
-		printk (KERN_INFO "parport_pc: ITE8875 found (1P)\n");
+		printk(KERN_INFO "parport_pc: ITE8875 found (1P)\n");
 		ite8872set = 0x64200000;
 		break;
 	case 0xe:
-		printk (KERN_INFO "parport_pc: ITE8872 found (2S1P)\n");
+		printk(KERN_INFO "parport_pc: ITE8872 found (2S1P)\n");
 		ite8872set = 0x64e00000;
 		break;
 	case 0x6:
-		printk (KERN_INFO "parport_pc: ITE8873 found (1S)\n");
+		printk(KERN_INFO "parport_pc: ITE8873 found (1S)\n");
 		return 0;
 	case 0x8:
-		DPRINTK (KERN_DEBUG "parport_pc: ITE8874 found (2S)\n");
+		DPRINTK(KERN_DEBUG "parport_pc: ITE8874 found (2S)\n");
 		return 0;
 	default:
-		printk (KERN_INFO "parport_pc: unknown ITE887x\n");
-		printk (KERN_INFO "parport_pc: please mail 'lspci -nvv' "
+		printk(KERN_INFO "parport_pc: unknown ITE887x\n");
+		printk(KERN_INFO "parport_pc: please mail 'lspci -nvv' "
 			"output to Rich.Liu@ite.com.tw\n");
 		return 0;
 	}
 
-	pci_read_config_byte (pdev, 0x3c, &ite8872_irq);
-	pci_read_config_dword (pdev, 0x1c, &ite8872_lpt);
+	pci_read_config_byte(pdev, 0x3c, &ite8872_irq);
+	pci_read_config_dword(pdev, 0x1c, &ite8872_lpt);
 	ite8872_lpt &= 0x0000ff00;
-	pci_read_config_dword (pdev, 0x20, &ite8872_lpthi);
+	pci_read_config_dword(pdev, 0x20, &ite8872_lpthi);
 	ite8872_lpthi &= 0x0000ff00;
-	pci_write_config_dword (pdev, 0x6c, 0xe3000000 | ite8872_lpt);
-	pci_write_config_dword (pdev, 0x70, 0xe3000000 | ite8872_lpthi);
-	pci_write_config_dword (pdev, 0x80, (ite8872_lpthi<<16) | ite8872_lpt);
-	// SET SPP&EPP , Parallel Port NO DMA , Enable All Function
-	// SET Parallel IRQ
-	pci_write_config_dword (pdev, 0x9c,
+	pci_write_config_dword(pdev, 0x6c, 0xe3000000 | ite8872_lpt);
+	pci_write_config_dword(pdev, 0x70, 0xe3000000 | ite8872_lpthi);
+	pci_write_config_dword(pdev, 0x80, (ite8872_lpthi<<16) | ite8872_lpt);
+	/* SET SPP&EPP , Parallel Port NO DMA , Enable All Function */
+	/* SET Parallel IRQ */
+	pci_write_config_dword(pdev, 0x9c,
 				ite8872set | (ite8872_irq * 0x11111));
 
-	DPRINTK (KERN_DEBUG "ITE887x: The IRQ is %d.\n", ite8872_irq);
-	DPRINTK (KERN_DEBUG "ITE887x: The PARALLEL I/O port is 0x%x.\n",
+	DPRINTK(KERN_DEBUG "ITE887x: The IRQ is %d.\n", ite8872_irq);
+	DPRINTK(KERN_DEBUG "ITE887x: The PARALLEL I/O port is 0x%x.\n",
 		 ite8872_lpt);
-	DPRINTK (KERN_DEBUG "ITE887x: The PARALLEL I/O porthi is 0x%x.\n",
+	DPRINTK(KERN_DEBUG "ITE887x: The PARALLEL I/O porthi is 0x%x.\n",
 		 ite8872_lpthi);
 
 	/* Let the user (or defaults) steer us away from interrupts */
@@ -2532,14 +2598,14 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 	 * Release the resource so that parport_pc_probe_port can get it.
 	 */
 	release_resource(base_res);
-	if (parport_pc_probe_port (ite8872_lpt, ite8872_lpthi,
+	if (parport_pc_probe_port(ite8872_lpt, ite8872_lpthi,
 				   irq, PARPORT_DMA_NONE, &pdev->dev, 0)) {
-		printk (KERN_INFO
+		printk(KERN_INFO
 			"parport_pc: ITE 8872 parallel port: io=0x%X",
-			ite8872_lpt);
+								ite8872_lpt);
 		if (irq != PARPORT_IRQ_NONE)
-			printk (", irq=%d", irq);
-		printk ("\n");
+			printk(", irq=%d", irq);
+		printk("\n");
 		return 1;
 	}
 
@@ -2548,7 +2614,7 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 
 /* VIA 8231 support by Pavel Fedin <sonic_amiga@rambler.ru>
    based on VIA 686a support code by Jeff Garzik <jgarzik@pobox.com> */
-static int __devinitdata parport_init_mode = 0;
+static int __devinitdata parport_init_mode;
 
 /* Data for two known VIA chips */
 static struct parport_pc_via_data via_686a_data __devinitdata = {
@@ -2570,7 +2636,7 @@ static struct parport_pc_via_data via_8231_data __devinitdata = {
 	0xF6
 };
 
-static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
+static int __devinit sio_via_probe(struct pci_dev *pdev, int autoirq,
 				    int autodma,
 				    const struct parport_pc_via_data *via)
 {
@@ -2582,38 +2648,38 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 	printk(KERN_DEBUG "parport_pc: VIA 686A/8231 detected\n");
 
-	switch(parport_init_mode)
-	{
+	switch (parport_init_mode) {
 	case 1:
-	    printk(KERN_DEBUG "parport_pc: setting SPP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_SPP;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting SPP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_SPP;
+		break;
 	case 2:
-	    printk(KERN_DEBUG "parport_pc: setting PS/2 mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_SPP;
-	    ppcontrol = VIA_PARPORT_BIDIR;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting PS/2 mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_SPP;
+		ppcontrol = VIA_PARPORT_BIDIR;
+		break;
 	case 3:
-	    printk(KERN_DEBUG "parport_pc: setting EPP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_EPP;
-	    ppcontrol = VIA_PARPORT_BIDIR;
-	    have_epp = 1;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting EPP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_EPP;
+		ppcontrol = VIA_PARPORT_BIDIR;
+		have_epp = 1;
+		break;
 	case 4:
-	    printk(KERN_DEBUG "parport_pc: setting ECP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_ECP;
-	    ppcontrol = VIA_PARPORT_BIDIR;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting ECP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_ECP;
+		ppcontrol = VIA_PARPORT_BIDIR;
+		break;
 	case 5:
-	    printk(KERN_DEBUG "parport_pc: setting EPP+ECP mode\n");
-	    siofunc = VIA_FUNCTION_PARPORT_ECP;
-	    ppcontrol = VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP;
-	    have_epp = 1;
-	    break;
-	 default:
-	    printk(KERN_DEBUG "parport_pc: probing current configuration\n");
-	    siofunc = VIA_FUNCTION_PROBE;
-	    break;
+		printk(KERN_DEBUG "parport_pc: setting EPP+ECP mode\n");
+		siofunc = VIA_FUNCTION_PARPORT_ECP;
+		ppcontrol = VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP;
+		have_epp = 1;
+		break;
+	default:
+		printk(KERN_DEBUG
+			"parport_pc: probing current configuration\n");
+		siofunc = VIA_FUNCTION_PROBE;
+		break;
 	}
 	/*
 	 * unlock super i/o configuration
@@ -2624,38 +2690,36 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 	/* Bits 1-0: Parallel Port Mode / Enable */
 	outb(via->viacfg_function, VIA_CONFIG_INDEX);
-	tmp = inb (VIA_CONFIG_DATA);
+	tmp = inb(VIA_CONFIG_DATA);
 	/* Bit 5: EPP+ECP enable; bit 7: PS/2 bidirectional port enable */
 	outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
-	tmp2 = inb (VIA_CONFIG_DATA);
-	if (siofunc == VIA_FUNCTION_PROBE)
-	{
-	    siofunc = tmp & VIA_FUNCTION_PARPORT_DISABLE;
-	    ppcontrol = tmp2;
+	tmp2 = inb(VIA_CONFIG_DATA);
+	if (siofunc == VIA_FUNCTION_PROBE) {
+		siofunc = tmp & VIA_FUNCTION_PARPORT_DISABLE;
+		ppcontrol = tmp2;
+	} else {
+		tmp &= ~VIA_FUNCTION_PARPORT_DISABLE;
+		tmp |= siofunc;
+		outb(via->viacfg_function, VIA_CONFIG_INDEX);
+		outb(tmp, VIA_CONFIG_DATA);
+		tmp2 &= ~(VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP);
+		tmp2 |= ppcontrol;
+		outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
+		outb(tmp2, VIA_CONFIG_DATA);
 	}
-	else
-	{
-	    tmp &= ~VIA_FUNCTION_PARPORT_DISABLE;
-	    tmp |= siofunc;
-	    outb(via->viacfg_function, VIA_CONFIG_INDEX);
-	    outb(tmp, VIA_CONFIG_DATA);
-	    tmp2 &= ~(VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP);
-	    tmp2 |= ppcontrol;
-	    outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
-	    outb(tmp2, VIA_CONFIG_DATA);
-	}
-	
+
 	/* Parallel Port I/O Base Address, bits 9-2 */
 	outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
 	port1 = inb(VIA_CONFIG_DATA) << 2;
-	
-	printk (KERN_DEBUG "parport_pc: Current parallel port base: 0x%X\n",port1);
-	if ((port1 == 0x3BC) && have_epp)
-	{
-	    outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
-	    outb((0x378 >> 2), VIA_CONFIG_DATA);
-	    printk(KERN_DEBUG "parport_pc: Parallel port base changed to 0x378\n");
-	    port1 = 0x378;
+
+	printk(KERN_DEBUG "parport_pc: Current parallel port base: 0x%X\n",
+									port1);
+	if (port1 == 0x3BC && have_epp) {
+		outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
+		outb((0x378 >> 2), VIA_CONFIG_DATA);
+		printk(KERN_DEBUG
+			"parport_pc: Parallel port base changed to 0x378\n");
+		port1 = 0x378;
 	}
 
 	/*
@@ -2669,36 +2733,39 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 		printk(KERN_INFO "parport_pc: VIA parallel port disabled in BIOS\n");
 		return 0;
 	}
-	
+
 	/* Bits 7-4: PnP Routing for Parallel Port IRQ */
 	pci_read_config_byte(pdev, via->via_pci_parport_irq_reg, &tmp);
 	irq = ((tmp & VIA_IRQCONTROL_PARALLEL) >> 4);
 
-	if (siofunc == VIA_FUNCTION_PARPORT_ECP)
-	{
-	    /* Bits 3-2: PnP Routing for Parallel Port DMA */
-	    pci_read_config_byte(pdev, via->via_pci_parport_dma_reg, &tmp);
-	    dma = ((tmp & VIA_DMACONTROL_PARALLEL) >> 2);
-	}
-	else
-	    /* if ECP not enabled, DMA is not enabled, assumed bogus 'dma' value */
-	    dma = PARPORT_DMA_NONE;
+	if (siofunc == VIA_FUNCTION_PARPORT_ECP) {
+		/* Bits 3-2: PnP Routing for Parallel Port DMA */
+		pci_read_config_byte(pdev, via->via_pci_parport_dma_reg, &tmp);
+		dma = ((tmp & VIA_DMACONTROL_PARALLEL) >> 2);
+	} else
+		/* if ECP not enabled, DMA is not enabled, assumed
+		   bogus 'dma' value */
+		dma = PARPORT_DMA_NONE;
 
 	/* Let the user (or defaults) steer us away from interrupts and DMA */
 	if (autoirq == PARPORT_IRQ_NONE) {
-	    irq = PARPORT_IRQ_NONE;
-	    dma = PARPORT_DMA_NONE;
+		irq = PARPORT_IRQ_NONE;
+		dma = PARPORT_DMA_NONE;
 	}
 	if (autodma == PARPORT_DMA_NONE)
-	    dma = PARPORT_DMA_NONE;
+		dma = PARPORT_DMA_NONE;
 
 	switch (port1) {
-	case 0x3bc: port2 = 0x7bc; break;
-	case 0x378: port2 = 0x778; break;
-	case 0x278: port2 = 0x678; break;
+	case 0x3bc:
+		port2 = 0x7bc; break;
+	case 0x378:
+		port2 = 0x778; break;
+	case 0x278:
+		port2 = 0x678; break;
 	default:
-		printk(KERN_INFO "parport_pc: Weird VIA parport base 0x%X, ignoring\n",
-			port1);
+		printk(KERN_INFO
+			"parport_pc: Weird VIA parport base 0x%X, ignoring\n",
+									port1);
 		return 0;
 	}
 
@@ -2716,17 +2783,17 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 	}
 
 	/* finally, do the probe with values obtained */
-	if (parport_pc_probe_port (port1, port2, irq, dma, &pdev->dev, 0)) {
-		printk (KERN_INFO
+	if (parport_pc_probe_port(port1, port2, irq, dma, &pdev->dev, 0)) {
+		printk(KERN_INFO
 			"parport_pc: VIA parallel port: io=0x%X", port1);
 		if (irq != PARPORT_IRQ_NONE)
-			printk (", irq=%d", irq);
+			printk(", irq=%d", irq);
 		if (dma != PARPORT_DMA_NONE)
-			printk (", dma=%d", dma);
-		printk ("\n");
+			printk(", dma=%d", dma);
+		printk("\n");
 		return 1;
 	}
-	
+
 	printk(KERN_WARNING "parport_pc: Strange, can't probe VIA parallel port: io=0x%X, irq=%d, dma=%d\n",
 		port1, irq, dma);
 	return 0;
@@ -2734,8 +2801,8 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 
 enum parport_pc_sio_types {
-	sio_via_686a = 0,	/* Via VT82C686A motherboard Super I/O */
-	sio_via_8231,		/* Via VT8231 south bridge integrated Super IO */
+	sio_via_686a = 0,   /* Via VT82C686A motherboard Super I/O */
+	sio_via_8231,	    /* Via VT8231 south bridge integrated Super IO */
 	sio_ite_8872,
 	last_sio
 };
@@ -2806,15 +2873,15 @@ enum parport_pc_pci_cards {
 };
 
 
-/* each element directly indexed from enum list, above 
+/* each element directly indexed from enum list, above
  * (but offset by last_sio) */
 static struct parport_pc_pci {
 	int numports;
 	struct { /* BAR (base address registers) numbers in the config
-                    space header */
+		    space header */
 		int lo;
-		int hi; /* -1 if not there, >6 for offset-method (max
-                           BAR is 6) */
+		int hi;
+		/* -1 if not there, >6 for offset-method (max BAR is 6) */
 	} addr[4];
 
 	/* If set, this is called immediately after pci_enable_device.
@@ -2859,7 +2926,7 @@ static struct parport_pc_pci {
 	/* timedia_4018  */             { 2, { { 0, 1 }, { 2, 3 }, } },
 	/* timedia_9018a */             { 2, { { 0, 1 }, { 2, 3 }, } },
 					/* SYBA uses fixed offsets in
-                                           a 1K io window */
+					   a 1K io window */
 	/* syba_2p_epp AP138B */	{ 2, { { 0, 0x078 }, { 0, 0x178 }, } },
 	/* syba_1p_ecp W83787 */	{ 1, { { 0, 0x078 }, } },
 	/* titan_010l */		{ 1, { { 3, -1 }, } },
@@ -2875,11 +2942,14 @@ static struct parport_pc_pci {
 	/* oxsemi_pcie_pport */		{ 1, { { 0, 1 }, } },
 	/* aks_0100 */                  { 1, { { 0, -1 }, } },
 	/* mobility_pp */		{ 1, { { 0, 1 }, } },
-	/* netmos_9705 */               { 1, { { 0, -1 }, } }, /* untested */
-        /* netmos_9715 */               { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */
-        /* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */
-	/* netmos_9805 */               { 1, { { 0, -1 }, } }, /* untested */
-	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */
+
+	/* The netmos entries below are untested */
+	/* netmos_9705 */               { 1, { { 0, -1 }, } },
+	/* netmos_9715 */               { 2, { { 0, 1 }, { 2, 3 },} },
+	/* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} },
+	/* netmos_9805 */               { 1, { { 0, -1 }, } },
+	/* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } },
+
 	/* quatech_sppxp100 */		{ 1, { { 0, 1 }, } },
 };
 
@@ -2908,7 +2978,7 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	{ PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_BOCA_IOPPAR,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, boca_ioppar },
 	{ PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050,
-	  PCI_SUBVENDOR_ID_EXSYS, PCI_SUBDEVICE_ID_EXSYS_4014, 0,0, plx_9050 },
+	  PCI_SUBVENDOR_ID_EXSYS, PCI_SUBDEVICE_ID_EXSYS_4014, 0, 0, plx_9050 },
 	/* PCI_VENDOR_ID_TIMEDIA/SUNIX has many differing cards ...*/
 	{ 0x1409, 0x7168, 0x1409, 0x4078, 0, 0, timedia_4078a },
 	{ 0x1409, 0x7168, 0x1409, 0x4079, 0, 0, timedia_4079h },
@@ -2942,7 +3012,8 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	{ 0x9710, 0x9805, 0x1000, 0x0010, 0, 0, titan_1284p1 },
 	{ 0x9710, 0x9815, 0x1000, 0x0020, 0, 0, titan_1284p2 },
 	/* PCI_VENDOR_ID_AVLAB/Intek21 has another bunch of cards ...*/
-	{ 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p}, /* AFAVLAB_TK9902 */
+	/* AFAVLAB_TK9902 */
+	{ 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p},
 	{ 0x14db, 0x2121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_2p},
 	{ PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI952PP,
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_952 },
@@ -2985,14 +3056,14 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
 	  PCI_ANY_ID, PCI_ANY_ID, 0, 0, quatech_sppxp100 },
 	{ 0, } /* terminate list */
 };
-MODULE_DEVICE_TABLE(pci,parport_pc_pci_tbl);
+MODULE_DEVICE_TABLE(pci, parport_pc_pci_tbl);
 
 struct pci_parport_data {
 	int num;
 	struct parport *ports[2];
 };
 
-static int parport_pc_pci_probe (struct pci_dev *dev,
+static int parport_pc_pci_probe(struct pci_dev *dev,
 					   const struct pci_device_id *id)
 {
 	int err, count, n, i = id->driver_data;
@@ -3005,7 +3076,8 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 	/* This is a PCI card */
 	i -= last_sio;
 	count = 0;
-	if ((err = pci_enable_device (dev)) != 0)
+	err = pci_enable_device(dev);
+	if (err)
 		return err;
 
 	data = kmalloc(sizeof(struct pci_parport_data), GFP_KERNEL);
@@ -3013,7 +3085,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 		return -ENOMEM;
 
 	if (cards[i].preinit_hook &&
-	    cards[i].preinit_hook (dev, PARPORT_IRQ_NONE, PARPORT_DMA_NONE)) {
+	    cards[i].preinit_hook(dev, PARPORT_IRQ_NONE, PARPORT_DMA_NONE)) {
 		kfree(data);
 		return -ENODEV;
 	}
@@ -3023,25 +3095,25 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 		int hi = cards[i].addr[n].hi;
 		int irq;
 		unsigned long io_lo, io_hi;
-		io_lo = pci_resource_start (dev, lo);
+		io_lo = pci_resource_start(dev, lo);
 		io_hi = 0;
 		if ((hi >= 0) && (hi <= 6))
-			io_hi = pci_resource_start (dev, hi);
+			io_hi = pci_resource_start(dev, hi);
 		else if (hi > 6)
 			io_lo += hi; /* Reinterpret the meaning of
-                                        "hi" as an offset (see SYBA
-                                        def.) */
+					"hi" as an offset (see SYBA
+					def.) */
 		/* TODO: test if sharing interrupts works */
 		irq = dev->irq;
 		if (irq == IRQ_NONE) {
-			printk (KERN_DEBUG
+			printk(KERN_DEBUG
 	"PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx)\n",
 				parport_pc_pci_tbl[i + last_sio].vendor,
 				parport_pc_pci_tbl[i + last_sio].device,
 				io_lo, io_hi);
 			irq = PARPORT_IRQ_NONE;
 		} else {
-			printk (KERN_DEBUG
+			printk(KERN_DEBUG
 	"PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx), IRQ %d\n",
 				parport_pc_pci_tbl[i + last_sio].vendor,
 				parport_pc_pci_tbl[i + last_sio].device,
@@ -3058,7 +3130,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
 	data->num = count;
 
 	if (cards[i].postinit_hook)
-		cards[i].postinit_hook (dev, count == 0);
+		cards[i].postinit_hook(dev, count == 0);
 
 	if (count) {
 		pci_set_drvdata(dev, data);
@@ -3092,7 +3164,7 @@ static struct pci_driver parport_pc_pci_driver = {
 	.remove		= __devexit_p(parport_pc_pci_remove),
 };
 
-static int __init parport_pc_init_superio (int autoirq, int autodma)
+static int __init parport_pc_init_superio(int autoirq, int autodma)
 {
 	const struct pci_device_id *id;
 	struct pci_dev *pdev = NULL;
@@ -3103,8 +3175,9 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
 		if (id == NULL || id->driver_data >= last_sio)
 			continue;
 
-		if (parport_pc_superio_info[id->driver_data].probe
-			(pdev, autoirq, autodma,parport_pc_superio_info[id->driver_data].via)) {
+		if (parport_pc_superio_info[id->driver_data].probe(
+			pdev, autoirq, autodma,
+			parport_pc_superio_info[id->driver_data].via)) {
 			ret++;
 		}
 	}
@@ -3113,7 +3186,10 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
 }
 #else
 static struct pci_driver parport_pc_pci_driver;
-static int __init parport_pc_init_superio(int autoirq, int autodma) {return 0;}
+static int __init parport_pc_init_superio(int autoirq, int autodma)
+{
+	return 0;
+}
 #endif /* CONFIG_PCI */
 
 #ifdef CONFIG_PNP
@@ -3126,44 +3202,45 @@ static const struct pnp_device_id parport_pc_pnp_tbl[] = {
 	{ }
 };
 
-MODULE_DEVICE_TABLE(pnp,parport_pc_pnp_tbl);
+MODULE_DEVICE_TABLE(pnp, parport_pc_pnp_tbl);
 
-static int parport_pc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id)
+static int parport_pc_pnp_probe(struct pnp_dev *dev,
+						const struct pnp_device_id *id)
 {
 	struct parport *pdata;
 	unsigned long io_lo, io_hi;
 	int dma, irq;
 
-	if (pnp_port_valid(dev,0) &&
-		!(pnp_port_flags(dev,0) & IORESOURCE_DISABLED)) {
-		io_lo = pnp_port_start(dev,0);
+	if (pnp_port_valid(dev, 0) &&
+		!(pnp_port_flags(dev, 0) & IORESOURCE_DISABLED)) {
+		io_lo = pnp_port_start(dev, 0);
 	} else
 		return -EINVAL;
 
-	if (pnp_port_valid(dev,1) &&
-		!(pnp_port_flags(dev,1) & IORESOURCE_DISABLED)) {
-		io_hi = pnp_port_start(dev,1);
+	if (pnp_port_valid(dev, 1) &&
+		!(pnp_port_flags(dev, 1) & IORESOURCE_DISABLED)) {
+		io_hi = pnp_port_start(dev, 1);
 	} else
 		io_hi = 0;
 
-	if (pnp_irq_valid(dev,0) &&
-		!(pnp_irq_flags(dev,0) & IORESOURCE_DISABLED)) {
-		irq = pnp_irq(dev,0);
+	if (pnp_irq_valid(dev, 0) &&
+		!(pnp_irq_flags(dev, 0) & IORESOURCE_DISABLED)) {
+		irq = pnp_irq(dev, 0);
 	} else
 		irq = PARPORT_IRQ_NONE;
 
-	if (pnp_dma_valid(dev,0) &&
-		!(pnp_dma_flags(dev,0) & IORESOURCE_DISABLED)) {
-		dma = pnp_dma(dev,0);
+	if (pnp_dma_valid(dev, 0) &&
+		!(pnp_dma_flags(dev, 0) & IORESOURCE_DISABLED)) {
+		dma = pnp_dma(dev, 0);
 	} else
 		dma = PARPORT_DMA_NONE;
 
 	dev_info(&dev->dev, "reported by %s\n", dev->protocol->name);
-	if (!(pdata = parport_pc_probe_port(io_lo, io_hi,
-					irq, dma, &dev->dev, 0)))
+	pdata = parport_pc_probe_port(io_lo, io_hi, irq, dma, &dev->dev, 0);
+	if (pdata == NULL)
 		return -ENODEV;
 
-	pnp_set_drvdata(dev,pdata);
+	pnp_set_drvdata(dev, pdata);
 	return 0;
 }
 
@@ -3205,7 +3282,7 @@ static struct platform_driver parport_pc_platform_driver = {
 
 /* This is called by parport_pc_find_nonpci_ports (in asm/parport.h) */
 static int __devinit __attribute__((unused))
-parport_pc_find_isa_ports (int autoirq, int autodma)
+parport_pc_find_isa_ports(int autoirq, int autodma)
 {
 	int count = 0;
 
@@ -3229,7 +3306,7 @@ parport_pc_find_isa_ports (int autoirq, int autodma)
  * autoirq is PARPORT_IRQ_NONE, PARPORT_IRQ_AUTO, or PARPORT_IRQ_PROBEONLY
  * autodma is PARPORT_DMA_NONE or PARPORT_DMA_AUTO
  */
-static void __init parport_pc_find_ports (int autoirq, int autodma)
+static void __init parport_pc_find_ports(int autoirq, int autodma)
 {
 	int count = 0, err;
 
@@ -3263,11 +3340,18 @@ static void __init parport_pc_find_ports (int autoirq, int autodma)
  *	syntax and keep in mind that code below is a cleaned up version.
  */
 
-static int __initdata io[PARPORT_PC_MAX_PORTS+1] = { [0 ... PARPORT_PC_MAX_PORTS] = 0 };
-static int __initdata io_hi[PARPORT_PC_MAX_PORTS+1] =
-	{ [0 ... PARPORT_PC_MAX_PORTS] = PARPORT_IOHI_AUTO };
-static int __initdata dmaval[PARPORT_PC_MAX_PORTS] = { [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_DMA_NONE };
-static int __initdata irqval[PARPORT_PC_MAX_PORTS] = { [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_IRQ_PROBEONLY };
+static int __initdata io[PARPORT_PC_MAX_PORTS+1] = {
+	[0 ... PARPORT_PC_MAX_PORTS] = 0
+};
+static int __initdata io_hi[PARPORT_PC_MAX_PORTS+1] = {
+	[0 ... PARPORT_PC_MAX_PORTS] = PARPORT_IOHI_AUTO
+};
+static int __initdata dmaval[PARPORT_PC_MAX_PORTS] = {
+	[0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_DMA_NONE
+};
+static int __initdata irqval[PARPORT_PC_MAX_PORTS] = {
+	[0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_IRQ_PROBEONLY
+};
 
 static int __init parport_parse_param(const char *s, int *val,
 				int automatic, int none, int nofifo)
@@ -3308,18 +3392,19 @@ static int __init parport_parse_dma(const char *dmastr, int *val)
 #ifdef CONFIG_PCI
 static int __init parport_init_mode_setup(char *str)
 {
-	printk(KERN_DEBUG "parport_pc.c: Specified parameter parport_init_mode=%s\n", str);
-
-	if (!strcmp (str, "spp"))
-		parport_init_mode=1;
-	if (!strcmp (str, "ps2"))
-		parport_init_mode=2;
-	if (!strcmp (str, "epp"))
-		parport_init_mode=3;
-	if (!strcmp (str, "ecp"))
-		parport_init_mode=4;
-	if (!strcmp (str, "ecpepp"))
-		parport_init_mode=5;
+	printk(KERN_DEBUG
+	     "parport_pc.c: Specified parameter parport_init_mode=%s\n", str);
+
+	if (!strcmp(str, "spp"))
+		parport_init_mode = 1;
+	if (!strcmp(str, "ps2"))
+		parport_init_mode = 2;
+	if (!strcmp(str, "epp"))
+		parport_init_mode = 3;
+	if (!strcmp(str, "ecp"))
+		parport_init_mode = 4;
+	if (!strcmp(str, "ecpepp"))
+		parport_init_mode = 5;
 	return 1;
 }
 #endif
@@ -3343,7 +3428,8 @@ module_param(verbose_probing, int, 0644);
 #endif
 #ifdef CONFIG_PCI
 static char *init_mode;
-MODULE_PARM_DESC(init_mode, "Initialise mode for VIA VT8231 port (spp, ps2, epp, ecp or ecpepp)");
+MODULE_PARM_DESC(init_mode,
+	"Initialise mode for VIA VT8231 port (spp, ps2, epp, ecp or ecpepp)");
 module_param(init_mode, charp, 0);
 #endif
 
@@ -3374,7 +3460,7 @@ static int __init parse_parport_params(void)
 				irqval[0] = val;
 				break;
 			default:
-				printk (KERN_WARNING
+				printk(KERN_WARNING
 					"parport_pc: irq specified "
 					"without base address.  Use 'io=' "
 					"to specify one\n");
@@ -3387,7 +3473,7 @@ static int __init parse_parport_params(void)
 				dmaval[0] = val;
 				break;
 			default:
-				printk (KERN_WARNING
+				printk(KERN_WARNING
 					"parport_pc: dma specified "
 					"without base address.  Use 'io=' "
 					"to specify one\n");
@@ -3398,7 +3484,7 @@ static int __init parse_parport_params(void)
 
 #else
 
-static int parport_setup_ptr __initdata = 0;
+static int parport_setup_ptr __initdata;
 
 /*
  * Acceptable parameters:
@@ -3409,7 +3495,7 @@ static int parport_setup_ptr __initdata = 0;
  *
  * IRQ/DMA may be numeric or 'auto' or 'none'
  */
-static int __init parport_setup (char *str)
+static int __init parport_setup(char *str)
 {
 	char *endptr;
 	char *sep;
@@ -3421,15 +3507,15 @@ static int __init parport_setup (char *str)
 		return 1;
 	}
 
-	if (!strncmp (str, "auto", 4)) {
+	if (!strncmp(str, "auto", 4)) {
 		irqval[0] = PARPORT_IRQ_AUTO;
 		dmaval[0] = PARPORT_DMA_AUTO;
 		return 1;
 	}
 
-	val = simple_strtoul (str, &endptr, 0);
+	val = simple_strtoul(str, &endptr, 0);
 	if (endptr == str) {
-		printk (KERN_WARNING "parport=%s not understood\n", str);
+		printk(KERN_WARNING "parport=%s not understood\n", str);
 		return 1;
 	}
 
@@ -3463,7 +3549,7 @@ static int __init parse_parport_params(void)
 	return io[0] == PARPORT_DISABLE;
 }
 
-__setup ("parport=", parport_setup);
+__setup("parport=", parport_setup);
 
 /*
  * Acceptable parameters:
@@ -3471,7 +3557,7 @@ __setup ("parport=", parport_setup);
  * parport_init_mode=[spp|ps2|epp|ecp|ecpepp]
  */
 #ifdef CONFIG_PCI
-__setup("parport_init_mode=",parport_init_mode_setup);
+__setup("parport_init_mode=", parport_init_mode_setup);
 #endif
 #endif
 
@@ -3495,13 +3581,13 @@ static int __init parport_pc_init(void)
 		for (i = 0; i < PARPORT_PC_MAX_PORTS; i++) {
 			if (!io[i])
 				break;
-			if ((io_hi[i]) == PARPORT_IOHI_AUTO)
-			       io_hi[i] = 0x400 + io[i];
+			if (io_hi[i] == PARPORT_IOHI_AUTO)
+				io_hi[i] = 0x400 + io[i];
 			parport_pc_probe_port(io[i], io_hi[i],
-					  irqval[i], dmaval[i], NULL, 0);
+					irqval[i], dmaval[i], NULL, 0);
 		}
 	} else
-		parport_pc_find_ports (irqval[0], dmaval[0]);
+		parport_pc_find_ports(irqval[0], dmaval[0]);
 
 	return 0;
 }
@@ -3509,9 +3595,9 @@ static int __init parport_pc_init(void)
 static void __exit parport_pc_exit(void)
 {
 	if (pci_registered_parport)
-		pci_unregister_driver (&parport_pc_pci_driver);
+		pci_unregister_driver(&parport_pc_pci_driver);
 	if (pnp_registered_parport)
-		pnp_unregister_driver (&parport_pc_pnp_driver);
+		pnp_unregister_driver(&parport_pc_pnp_driver);
 	platform_driver_unregister(&parport_pc_platform_driver);
 
 	while (!list_empty(&ports_list)) {
-- 
GitLab


From 181bf1e815a2ad2a9a8b30ef6e92583d1530b255 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:08:10 +0100
Subject: [PATCH 2011/2198] parport_pc: clean up the modified while loops using
 for

And tidy up a few bits coding style detectors missed

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/parport/parport_pc.c | 98 +++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 41 deletions(-)

diff --git a/drivers/parport/parport_pc.c b/drivers/parport/parport_pc.c
index edf83e945853b..151bf5bc8afe9 100644
--- a/drivers/parport/parport_pc.c
+++ b/drivers/parport/parport_pc.c
@@ -626,7 +626,7 @@ static size_t parport_pc_fifo_write_block_dma(struct parport *port,
 	unsigned long start = (unsigned long) buf;
 	unsigned long end = (unsigned long) buf + length - 1;
 
-dump_parport_state("enter fifo_write_block_dma", port);
+	dump_parport_state("enter fifo_write_block_dma", port);
 	if (end < MAX_DMA_ADDRESS) {
 		/* If it would cross a 64k boundary, cap it at the end. */
 		if ((start ^ end) & ~0xffffUL)
@@ -737,7 +737,7 @@ dump_parport_state("enter fifo_write_block_dma", port);
 	if (dma_handle)
 		dma_unmap_single(dev, dma_handle, length, DMA_TO_DEVICE);
 
-dump_parport_state("leave fifo_write_block_dma", port);
+	dump_parport_state("leave fifo_write_block_dma", port);
 	return length - left;
 }
 #endif
@@ -955,8 +955,8 @@ static size_t parport_pc_ecp_read_block_pio(struct parport *port,
 	char *bufp = buf;
 
 	port = port->physport;
-DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
-dump_parport_state("enter fcn", port);
+	DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
+	dump_parport_state("enter fcn", port);
 
 	/* Special case: a timeout of zero means we cannot call schedule().
 	 * Also if O_NONBLOCK is set then use the default implementation. */
@@ -1112,14 +1112,15 @@ dump_parport_state("enter fcn", port);
 
 		if (ecrval & 0x02) {
 			/* FIFO is full. */
-dump_parport_state("FIFO full", port);
+			dump_parport_state("FIFO full", port);
 			insb(fifo, bufp, fifo_depth);
 			bufp += fifo_depth;
 			left -= fifo_depth;
 			continue;
 		}
 
-DPRINTK(KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
+		DPRINTK(KERN_DEBUG
+		  "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 
 		/* FIFO not filled.  We will cycle this loop for a while
 		 * and either the peripheral will fill it faster,
@@ -1135,7 +1136,7 @@ DPRINTK(KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 	}
 
 	port->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-dump_parport_state("rev idle2", port);
+	dump_parport_state("rev idle2", port);
 
 out_no_data:
 
@@ -1163,7 +1164,7 @@ dump_parport_state("rev idle2", port);
 				port->name, lost);
 	}
 
-dump_parport_state("fwd idle", port);
+	dump_parport_state("fwd idle", port);
 	return length - left;
 }
 #endif  /*  0  */
@@ -1216,10 +1217,23 @@ static const struct parport_operations parport_pc_ops = {
 };
 
 #ifdef CONFIG_PARPORT_PC_SUPERIO
+
+static struct superio_struct *find_free_superio(void)
+{
+	int i;
+	for (i = 0; i < NR_SUPERIOS; i++)
+		if (superios[i].io == 0)
+			return &superios[i];
+	return NULL;
+}
+
+
 /* Super-IO chipset detection, Winbond, SMSC */
 static void __devinit show_parconfig_smsc37c669(int io, int key)
 {
-	int cr1, cr4, cra, cr23, cr26, cr27, i = 0;
+	int cr1, cr4, cra, cr23, cr26, cr27;
+	struct superio_struct *s;
+
 	static const char *const modes[] = {
 		"SPP and Bidirectional (PS/2)",
 		"EPP and SPP",
@@ -1272,30 +1286,29 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 	   are related, however DMA can be 1 or 3, assume DMA_A=DMA1,
 	   DMA_C=DMA3 (this is true e.g. for TYAN 1564D Tomcat IV) */
 	if (cr23 * 4 >= 0x100) { /* if active */
-		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
-			i++;
-		if (i == NR_SUPERIOS) {
+		s = find_free_superio();
+		if (s == NULL)
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		} else {
+		else {
 			int d;
 			switch (cr23 * 4) {
 			case 0x3bc:
-				superios[i].io = 0x3bc;
-				superios[i].irq = 7;
+				s->io = 0x3bc;
+				s->irq = 7;
 				break;
 			case 0x378:
-				superios[i].io = 0x378;
-				superios[i].irq = 7;
+				s->io = 0x378;
+				s->irq = 7;
 				break;
 			case 0x278:
-				superios[i].io = 0x278;
-				superios[i].irq = 5;
+				s->io = 0x278;
+				s->irq = 5;
 			}
 			d = (cr26 & 0x0f);
 			if (d == 1 || d == 3)
-				superios[i].dma = d;
+				s->dma = d;
 			else
-				superios[i].dma = PARPORT_DMA_NONE;
+				s->dma = PARPORT_DMA_NONE;
 		}
 	}
 }
@@ -1303,7 +1316,8 @@ static void __devinit show_parconfig_smsc37c669(int io, int key)
 
 static void __devinit show_parconfig_winbond(int io, int key)
 {
-	int cr30, cr60, cr61, cr70, cr74, crf0, i = 0;
+	int cr30, cr60, cr61, cr70, cr74, crf0;
+	struct superio_struct *s;
 	static const char *const modes[] = {
 		"Standard (SPP) and Bidirectional(PS/2)", /* 0 */
 		"EPP-1.9 and SPP",
@@ -1356,14 +1370,13 @@ static void __devinit show_parconfig_winbond(int io, int key)
 	}
 
 	if (cr30 & 0x01) { /* the settings can be interrogated later ... */
-		while ((i < NR_SUPERIOS) && (superios[i].io != 0))
-			i++;
-		if (i == NR_SUPERIOS) {
+		s = find_free_superio();
+		if (s == NULL)
 			printk(KERN_INFO "Super-IO: too many chips!\n");
-		} else {
-			superios[i].io = (cr60<<8)|cr61;
-			superios[i].irq = cr70&0x0f;
-			superios[i].dma = (((cr74 & 0x07) > 3) ?
+		else {
+			s->io = (cr60 << 8) | cr61;
+			s->irq = cr70 & 0x0f;
+			s->dma = (((cr74 & 0x07) > 3) ?
 					   PARPORT_DMA_NONE : (cr74 & 0x07));
 		}
 	}
@@ -1618,25 +1631,28 @@ static void __devinit detect_and_report_it87(void)
 }
 #endif /* CONFIG_PARPORT_PC_SUPERIO */
 
-static int get_superio_dma(struct parport *p)
+static struct superio_struct *find_superio(struct parport *p)
 {
-	int i = 0;
+	int i;
+	for (i = 0; i < NR_SUPERIOS; i++)
+		if (superios[i].io != p->base)
+			return &superios[i];
+	return NULL;
+}
 
-	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
-		i++;
-	if (i != NR_SUPERIOS)
-		return superios[i].dma;
+static int get_superio_dma(struct parport *p)
+{
+	struct superio_struct *s = find_superio(p);
+	if (s)
+		return s->dma;
 	return PARPORT_DMA_NONE;
 }
 
 static int get_superio_irq(struct parport *p)
 {
-	int i = 0;
-
-	while ((i < NR_SUPERIOS) && (superios[i].io != p->base))
-		i++;
-	if (i != NR_SUPERIOS)
-		return superios[i].irq;
+	struct superio_struct *s = find_superio(p);
+	if (s)
+		return s->irq;
 	return PARPORT_IRQ_NONE;
 }
 
-- 
GitLab


From 257a6e8cc7f9274f0af090494a3f1ee06548b5bd Mon Sep 17 00:00:00 2001
From: Breno Leitao <leitao@linux.vnet.ibm.com>
Date: Thu, 11 Jun 2009 13:20:09 +0100
Subject: [PATCH 2012/2198] icom: fix compile errors when defining ICOM_TRACE
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

As it is, defining ICOM_TRACE produces some compile errors, as
"parameter name omitted" and "redefinition of ‘trace’"

This patch removes the wrong trace definition.

Signed-off-by: Breno Leitao <leitao@linux.vnet.ibm.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/icom.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/drivers/serial/icom.c b/drivers/serial/icom.c
index fd5fd23804bc6..9f2891c2c4a21 100644
--- a/drivers/serial/icom.c
+++ b/drivers/serial/icom.c
@@ -137,7 +137,12 @@ static LIST_HEAD(icom_adapter_head);
 static spinlock_t icom_lock;
 
 #ifdef ICOM_TRACE
-static inline void trace(struct icom_port *, char *, unsigned long) {};
+static inline void trace(struct icom_port *icom_port, char *trace_pt,
+			unsigned long trace_data)
+{
+	dev_info(&icom_port->adapter->pci_dev->dev, ":%d:%s - %lx\n",
+	icom_port->port, trace_pt, trace_data);
+}
 #else
 static inline void trace(struct icom_port *icom_port, char *trace_pt, unsigned long trace_data) {};
 #endif
@@ -1647,15 +1652,6 @@ static void __exit icom_exit(void)
 module_init(icom_init);
 module_exit(icom_exit);
 
-#ifdef ICOM_TRACE
-static inline void trace(struct icom_port *icom_port, char *trace_pt,
-		  unsigned long trace_data)
-{
-	dev_info(&icom_port->adapter->pci_dev->dev, ":%d:%s - %lx\n",
-		 icom_port->port, trace_pt, trace_data);
-}
-#endif
-
 MODULE_AUTHOR("Michael Anderson <mjanders@us.ibm.com>");
 MODULE_DESCRIPTION("IBM iSeries Serial IOA driver");
 MODULE_SUPPORTED_DEVICE
-- 
GitLab


From 08e0992f60ad44025a8a8b8a821838ca4a562686 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <florian@openwrt.org>
Date: Thu, 11 Jun 2009 13:21:24 +0100
Subject: [PATCH 2013/2198] serial: add support for the TI AR7 internal UART

This patch adds support for the TI AR7 internal UART.

Signed-off-by: Florian Fainelli <florian@openwrt.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/8250.c       | 7 +++++++
 include/linux/serial_core.h | 3 ++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/drivers/serial/8250.c b/drivers/serial/8250.c
index a0127e93ade0a..fb867a9f55e94 100644
--- a/drivers/serial/8250.c
+++ b/drivers/serial/8250.c
@@ -287,6 +287,13 @@ static const struct serial8250_config uart_config[] = {
 		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
 		.flags		= UART_CAP_FIFO,
 	},
+	[PORT_AR7] = {
+		.name		= "AR7",
+		.fifo_size	= 16,
+		.tx_loadsz	= 16,
+		.fcr		= UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
+		.flags		= UART_CAP_FIFO | UART_CAP_AFE,
+	},
 };
 
 #if defined (CONFIG_SERIAL_8250_AU1X00)
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 57a97e52e58d0..48766ea845cfb 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -41,7 +41,8 @@
 #define PORT_XSCALE	15
 #define PORT_RM9000	16	/* PMC-Sierra RM9xxx internal UART */
 #define PORT_OCTEON	17	/* Cavium OCTEON internal UART */
-#define PORT_MAX_8250	17	/* max port ID */
+#define PORT_AR7	18	/* Texas Instruments AR7 internal UART */
+#define PORT_MAX_8250	18	/* max port ID */
 
 /*
  * ARM specific type numbers.  These are not currently guaranteed
-- 
GitLab


From b5c6794fe4256fd63664aa185c468647c28bfd4a Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 11 Jun 2009 13:23:02 +0100
Subject: [PATCH 2014/2198] Blackfin SPORT UART: fix typo in sport_set_termios
 prototype

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_sport_uart.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index 529c0ff7952ce..fd5cb9576a06e 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -419,7 +419,7 @@ static void sport_shutdown(struct uart_port *port)
 }
 
 static void sport_set_termios(struct uart_port *port,
-		struct termios *termios, struct termios *old)
+		struct ktermios *termios, struct ktermios *old)
 {
 	pr_debug("%s enter, c_cflag:%08x\n", __func__, termios->c_cflag);
 	uart_update_timeout(port, CS8 ,port->uartclk);
-- 
GitLab


From a19e8b205915b2925aca75b2d2bf0c3104c8be14 Mon Sep 17 00:00:00 2001
From: Michael Hennerich <michael.hennerich@analog.com>
Date: Thu, 11 Jun 2009 13:23:42 +0100
Subject: [PATCH 2015/2198] Blackfin SPORT UART: fix data misses while using
 transmit frame sync

SPORT transmit frame sync (TFS) isn't used as an electrical signal during
normal SPORT UART emulation.  However, it is useful in EIA RS-485
emulation as RS-485 Transceiver Driver Enable DE strobe.

This patch configures:
TFS to be active high in order to drive an DE strobe of
an eventually connected RS-485 Transceiver.

Late frame sync mode (LATFS) gating the entire TX shift cycle.

Signed-off-by: Michael Hennerich <michael.hennerich@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_sport_uart.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index fd5cb9576a06e..6687ccd422b78 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -149,7 +149,7 @@ static int sport_uart_setup(struct sport_uart_port *up, int sclk, int baud_rate)
 	int tclkdiv, tfsdiv, rclkdiv;
 
 	/* Set TCR1 and TCR2 */
-	SPORT_PUT_TCR1(up, (LTFS | ITFS | TFSR | TLSBIT | ITCLK));
+	SPORT_PUT_TCR1(up, (LATFS | ITFS | TFSR | TLSBIT | ITCLK));
 	SPORT_PUT_TCR2(up, 10);
 	pr_debug("%s TCR1:%x, TCR2:%x\n", __func__, SPORT_GET_TCR1(up), SPORT_GET_TCR2(up));
 
-- 
GitLab


From 4328e3e5ef1ae3427a4f6863aa65916a68ec2dd9 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 11 Jun 2009 13:37:11 +0100
Subject: [PATCH 2016/2198] Blackfin SPORT UART: rewrite inline assembly

Hopefuly the new version is easier to read, but in the process it declares
proper clobber lists and better constraints so that GCC can do a better
job at allocating free registers.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_sport_uart.c | 54 +++++++++++++++++---------------
 1 file changed, 29 insertions(+), 25 deletions(-)

diff --git a/drivers/serial/bfin_sport_uart.c b/drivers/serial/bfin_sport_uart.c
index 6687ccd422b78..34b4ae0fe7604 100644
--- a/drivers/serial/bfin_sport_uart.c
+++ b/drivers/serial/bfin_sport_uart.c
@@ -101,15 +101,16 @@ static inline void tx_one_byte(struct sport_uart_port *up, unsigned int value)
 {
 	pr_debug("%s value:%x\n", __func__, value);
 	/* Place a Start and Stop bit */
-	__asm__ volatile (
-		"R2 = b#01111111100;\n\t"
-		"R3 = b#10000000001;\n\t"
-		"%0 <<= 2;\n\t"
-		"%0 = %0 & R2;\n\t"
-		"%0 = %0 | R3;\n\t"
-		:"=r"(value)
-		:"0"(value)
-		:"R2", "R3");
+	__asm__ __volatile__ (
+		"R2 = b#01111111100;"
+		"R3 = b#10000000001;"
+		"%0 <<= 2;"
+		"%0 = %0 & R2;"
+		"%0 = %0 | R3;"
+		: "=d"(value)
+		: "d"(value)
+		: "ASTAT", "R2", "R3"
+	);
 	pr_debug("%s value:%x\n", __func__, value);
 
 	SPORT_PUT_TX(up, value);
@@ -118,27 +119,30 @@ static inline void tx_one_byte(struct sport_uart_port *up, unsigned int value)
 static inline unsigned int rx_one_byte(struct sport_uart_port *up)
 {
 	unsigned int value, extract;
+	u32 tmp_mask1, tmp_mask2, tmp_shift, tmp;
 
 	value = SPORT_GET_RX32(up);
 	pr_debug("%s value:%x\n", __func__, value);
 
 	/* Extract 8 bits data */
-	__asm__ volatile (
-		"R5 = 0;\n\t"
-		"P0 = 8;\n\t"
-		"R1 = 0x1801(Z);\n\t"
-		"R3 = 0x0300(Z);\n\t"
-		"R4 = 0;\n\t"
-		"LSETUP(loop_s, loop_e) LC0 = P0;\nloop_s:\t"
-		"R2 = extract(%1, R1.L)(Z);\n\t"
-		"R2 <<= R4;\n\t"
-		"R5 = R5 | R2;\n\t"
-		"R1 = R1 - R3;\nloop_e:\t"
-		"R4 += 1;\n\t"
-		"%0 = R5;\n\t"
-		:"=r"(extract)
-		:"r"(value)
-		:"P0", "R1", "R2","R3","R4", "R5");
+	__asm__ __volatile__ (
+		"%[extr] = 0;"
+		"%[mask1] = 0x1801(Z);"
+		"%[mask2] = 0x0300(Z);"
+		"%[shift] = 0;"
+		"LSETUP(.Lloop_s, .Lloop_e) LC0 = %[lc];"
+		".Lloop_s:"
+		"%[tmp] = extract(%[val], %[mask1].L)(Z);"
+		"%[tmp] <<= %[shift];"
+		"%[extr] = %[extr] | %[tmp];"
+		"%[mask1] = %[mask1] - %[mask2];"
+		".Lloop_e:"
+		"%[shift] += 1;"
+		: [val]"=d"(value), [extr]"=d"(extract), [shift]"=d"(tmp_shift), [tmp]"=d"(tmp),
+		  [mask1]"=d"(tmp_mask1), [mask2]"=d"(tmp_mask2)
+		: "d"(value), [lc]"a"(8)
+		: "ASTAT", "LB0", "LC0", "LT0"
+	);
 
 	pr_debug("	extract:%x\n", extract);
 	return extract;
-- 
GitLab


From 8516c568f25666a012ec4c859e640a76fc9b6ec0 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:38:16 +0100
Subject: [PATCH 2017/2198] Blackfin Serial Driver: fix error while
 transferring large files

Ignore receiving data if new position is in the same line of current
buffer tail and is small.  This should decrease overruns.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index d86123e03391e..65a4c07f61996 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -477,6 +477,15 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 
 	spin_lock_irqsave(&uart->port.lock, flags);
 
+	/* 2D DMA RX buffer ring is used. Because curr_y_count and
+	 * curr_x_count can't be read as an atomic operation,
+	 * curr_y_count should be read before curr_x_count. When
+	 * curr_x_count is read, curr_y_count may already indicate
+	 * next buffer line. But, the position calculated here is
+	 * still indicate the old line. The wrong position data may
+	 * be smaller than current buffer tail, which cause garbages
+	 * are received if it is not prohibit.
+	 */
 	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
 	x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
 	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
@@ -487,7 +496,11 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 		x_pos = 0;
 
 	pos = uart->rx_dma_nrows * DMA_RX_XCOUNT + x_pos;
-	if (pos != uart->rx_dma_buf.tail) {
+	/* Ignore receiving data if new position is in the same line of
+	 * current buffer tail and small.
+	 */
+	if (pos > uart->rx_dma_buf.tail ||
+		uart->rx_dma_nrows < (uart->rx_dma_buf.tail/DMA_RX_XCOUNT)) {
 		uart->rx_dma_buf.head = pos;
 		bfin_serial_dma_rx_chars(uart);
 		uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
@@ -532,11 +545,25 @@ static irqreturn_t bfin_serial_dma_rx_int(int irq, void *dev_id)
 {
 	struct bfin_serial_port *uart = dev_id;
 	unsigned short irqstat;
+	int pos;
 
 	spin_lock(&uart->port.lock);
 	irqstat = get_dma_curr_irqstat(uart->rx_dma_channel);
 	clear_dma_irqstat(uart->rx_dma_channel);
-	bfin_serial_dma_rx_chars(uart);
+
+	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
+	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
+	if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+		uart->rx_dma_nrows = 0;
+
+	pos = uart->rx_dma_nrows * DMA_RX_XCOUNT;
+	if (pos > uart->rx_dma_buf.tail ||
+		uart->rx_dma_nrows < (uart->rx_dma_buf.tail/DMA_RX_XCOUNT)) {
+		uart->rx_dma_buf.head = pos;
+		bfin_serial_dma_rx_chars(uart);
+		uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
+	}
+
 	spin_unlock(&uart->port.lock);
 
 	return IRQ_HANDLED;
-- 
GitLab


From 7de7c55bf54dede2bd2262349fc7b558bcc8e413 Mon Sep 17 00:00:00 2001
From: Robin Getz <rgetz@blackfin.uclinux.org>
Date: Thu, 11 Jun 2009 13:38:57 +0100
Subject: [PATCH 2018/2198] Blackfin Serial Driver: fix baudrate for
 early_printk

Since we already setup the early console UART in
arch/blackfin/kernel/early_printk.c, and common functions which are
enabled from the .setup will override the proper settings later, don't
fill in these structures.  Otherwise we get mangled baudrate settings when
using early_printk.

Signed-off-by: Robin Getz <rgetz@blackfin.uclinux.org>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 65a4c07f61996..dfae22d47bf78 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -1272,12 +1272,17 @@ static __init void early_serial_write(struct console *con, const char *s,
 	}
 }
 
+/*
+ * This should have a .setup or .early_setup in it, but then things get called
+ * without the command line options, and the baud rate gets messed up - so
+ * don't let the common infrastructure play with things. (see calls to setup
+ * & earlysetup in ./kernel/printk.c:register_console()
+ */
 static struct __initdata console bfin_early_serial_console = {
 	.name = "early_BFuart",
 	.write = early_serial_write,
 	.device = uart_console_device,
 	.flags = CON_PRINTBUFFER,
-	.setup = bfin_serial_console_setup,
 	.index = -1,
 	.data  = &bfin_serial_reg,
 };
-- 
GitLab


From f9d36da9cdc2504cd9bb6034cfaba0673ce2d6df Mon Sep 17 00:00:00 2001
From: Graf Yang <graf.yang@analog.com>
Date: Thu, 11 Jun 2009 13:42:17 +0100
Subject: [PATCH 2019/2198] Blackfin Serial Driver: fix missing new lines when
 under load

Add a SSYNC() into bfin_serial_dma_tx_chars() to ensure DMA registers are
written with new data otherwise we might miss a byte or two when the
system is under load.  PIO mode is OK though.

Signed-off-by: Graf Yang <graf.yang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index dfae22d47bf78..676efdad3712e 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -415,6 +415,7 @@ static void bfin_serial_dma_tx_chars(struct bfin_serial_port *uart)
 	set_dma_start_addr(uart->tx_dma_channel, (unsigned long)(xmit->buf+xmit->tail));
 	set_dma_x_count(uart->tx_dma_channel, uart->tx_count);
 	set_dma_x_modify(uart->tx_dma_channel, 1);
+	SSYNC();
 	enable_dma(uart->tx_dma_channel);
 
 	UART_SET_IER(uart, ETBEI);
-- 
GitLab


From 35ff69357949cfff5c3d8e3038b77146872e3bd3 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:42:57 +0100
Subject: [PATCH 2020/2198] Blackfin Serial Driver: handle irregular DMA
 register status in auto start mode

This bug is caused by irregular behavior of DMA register CURR_X_COUNT
and CURR_Y_COUNT when an auto restart uart rx DMA run to last byte in
DMA buffer, trigger the interrupt and stay at this possiton. The status
of current x and y is 0:7 instead of 512:8 or 0:8. The driver doesn't
take care of this case when calculating the position.

URL: http://blackfin.uclinux.org/gf/tracker/5063
Reported-by: Tomasz Motylewski <t.motylewski@bfad.de>

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 676efdad3712e..854e96dfb0705 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -490,7 +490,7 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
 	x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
 	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
-	if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+	if (uart->rx_dma_nrows == DMA_RX_YCOUNT || x_pos == 0)
 		uart->rx_dma_nrows = 0;
 	x_pos = DMA_RX_XCOUNT - x_pos;
 	if (x_pos == DMA_RX_XCOUNT)
@@ -546,15 +546,16 @@ static irqreturn_t bfin_serial_dma_rx_int(int irq, void *dev_id)
 {
 	struct bfin_serial_port *uart = dev_id;
 	unsigned short irqstat;
-	int pos;
+	int x_pos, pos;
 
 	spin_lock(&uart->port.lock);
 	irqstat = get_dma_curr_irqstat(uart->rx_dma_channel);
 	clear_dma_irqstat(uart->rx_dma_channel);
 
 	uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
+	x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
 	uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
-	if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+	if (uart->rx_dma_nrows == DMA_RX_YCOUNT || x_pos == 0)
 		uart->rx_dma_nrows = 0;
 
 	pos = uart->rx_dma_nrows * DMA_RX_XCOUNT;
-- 
GitLab


From 0efa4f2c944fabffc81918cc86d4d17dba39a021 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:45:07 +0100
Subject: [PATCH 2021/2198] Blackfin Serial Driver: annotate anomalies 05000215
 and 05000099

Add some comments for how these anomalies are addressed:

05000215 - UART TX Interrupt Masked Erroneously
We always clear ETBEI within last UART TX interrupt to end
a string.  It is always set when starting a new tx transfer.

05000099 - UART Line Status Register (UART_LSR) Bits Are Not Updated at
the Same Time
This anomaly affects driver only in POLL code where multi bits of
UART_LSR are checked. It doesn't affect current bfin_5xx.c driver.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 854e96dfb0705..ab583ef3a305a 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -330,6 +330,11 @@ static void bfin_serial_tx_chars(struct bfin_serial_port *uart)
 		/* Clear TFI bit */
 		UART_PUT_LSR(uart, TFI);
 #endif
+		/* Anomaly notes:
+		 *  05000215 -	we always clear ETBEI within last UART TX
+		 *		interrupt to end a string. It is always set
+		 *		when start a new tx.
+		 */
 		UART_CLEAR_IER(uart, ETBEI);
 		return;
 	}
@@ -528,6 +533,11 @@ static irqreturn_t bfin_serial_dma_tx_int(int irq, void *dev_id)
 	if (!(get_dma_curr_irqstat(uart->tx_dma_channel)&DMA_RUN)) {
 		disable_dma(uart->tx_dma_channel);
 		clear_dma_irqstat(uart->tx_dma_channel);
+		/* Anomaly notes:
+		 *  05000215 -	we always clear ETBEI within last UART TX
+		 *		interrupt to end a string. It is always set
+		 *		when start a new tx.
+		 */
 		UART_CLEAR_IER(uart, ETBEI);
 		xmit->tail = (xmit->tail + uart->tx_count) & (UART_XMIT_SIZE - 1);
 		uart->port.icount.tx += uart->tx_count;
@@ -969,6 +979,10 @@ static void bfin_serial_reset_irda(struct uart_port *port)
 }
 
 #ifdef CONFIG_CONSOLE_POLL
+/* Anomaly notes:
+ *  05000099 -  Because we only use THRE in poll_put and DR in poll_get,
+ *		losing other bits of UART_LSR is not a problem here.
+ */
 static void bfin_serial_poll_put_char(struct uart_port *port, unsigned char chr)
 {
 	struct bfin_serial_port *uart = (struct bfin_serial_port *)port;
-- 
GitLab


From 84507794a9d5d4decd6bc9c111480076dba0d301 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:50:20 +0100
Subject: [PATCH 2022/2198] Blackfin Serial Driver: handle anomaly 05000231

05000231 - UART STB Bit Incorrectly Affects Receiver Setting
For processors affected by this, we cannot safely allow CSTOPB to be set
as the UART will then be unable to properly clock in bytes.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index ab583ef3a305a..64603f5bb7039 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -828,8 +828,16 @@ bfin_serial_set_termios(struct uart_port *port, struct ktermios *termios,
 			__func__);
 	}
 
-	if (termios->c_cflag & CSTOPB)
-		lcr |= STB;
+	/* Anomaly notes:
+	 *  05000231 -  STOP bit is always set to 1 whatever the user is set.
+	 */
+	if (termios->c_cflag & CSTOPB) {
+		if (ANOMALY_05000231)
+			printk(KERN_WARNING "STOP bits other than 1 is not "
+				"supported in case of anomaly 05000231.\n");
+		else
+			lcr |= STB;
+	}
 	if (termios->c_cflag & PARENB)
 		lcr |= PEN;
 	if (!(termios->c_cflag & PARODD))
-- 
GitLab


From 2860b7911137eabb01c159abefb506e538ff3cb7 Mon Sep 17 00:00:00 2001
From: Sonic Zhang <sonic.zhang@analog.com>
Date: Thu, 11 Jun 2009 13:51:33 +0100
Subject: [PATCH 2023/2198] Blackfin Serial Driver: disable dma rx interrupt
 only rather than all irqs

The UART RX handling code isn't exactly speeding, so don't go disabling
all interrupts when processing the buffer.  Just disable the relevant DMA
interrupt.  This greatly improves latency of the system when utilizing the
UART.

Signed-off-by: Sonic Zhang <sonic.zhang@analog.com>
Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/bfin_5xx.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/serial/bfin_5xx.c b/drivers/serial/bfin_5xx.c
index 64603f5bb7039..e2f6b1bfac98c 100644
--- a/drivers/serial/bfin_5xx.c
+++ b/drivers/serial/bfin_5xx.c
@@ -479,9 +479,9 @@ static void bfin_serial_dma_rx_chars(struct bfin_serial_port *uart)
 void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 {
 	int x_pos, pos;
-	unsigned long flags;
 
-	spin_lock_irqsave(&uart->port.lock, flags);
+	dma_disable_irq(uart->rx_dma_channel);
+	spin_lock_bh(&uart->port.lock);
 
 	/* 2D DMA RX buffer ring is used. Because curr_y_count and
 	 * curr_x_count can't be read as an atomic operation,
@@ -512,7 +512,8 @@ void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 		uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
 	}
 
-	spin_unlock_irqrestore(&uart->port.lock, flags);
+	spin_unlock_bh(&uart->port.lock);
+	dma_enable_irq(uart->rx_dma_channel);
 
 	mod_timer(&(uart->rx_dma_timer), jiffies + DMA_RX_FLUSH_JIFFIES);
 }
-- 
GitLab


From b7c7cbc898e8a97829f33ad3bcd1b5e91690d8f4 Mon Sep 17 00:00:00 2001
From: Craig Shelley <craig@microtron.org.uk>
Date: Thu, 11 Jun 2009 13:52:31 +0100
Subject: [PATCH 2024/2198] USB: CP210X Add device IDs

Signed-off-by: Craig Shelley <craig@microtron.org.uk>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/cp210x.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index d9f586dc6eccd..ac4944a815215 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -57,13 +57,16 @@ static int debug;
 static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x0471, 0x066A) }, /* AKTAKOM ACE-1001 cable */
 	{ USB_DEVICE(0x0489, 0xE000) }, /* Pirelli Broadband S.p.A, DP-L10 SIP/GSM Mobile */
+	{ USB_DEVICE(0x0745, 0x1000) }, /* CipherLab USB CCD Barcode Scanner 1000 */
 	{ USB_DEVICE(0x08e6, 0x5501) }, /* Gemalto Prox-PU/CU contactless smartcard reader */
+	{ USB_DEVICE(0x08FD, 0x000A) }, /* Digianswer A/S , ZigBee/802.15.4 MAC Device */
 	{ USB_DEVICE(0x0FCF, 0x1003) }, /* Dynastream ANT development board */
 	{ USB_DEVICE(0x0FCF, 0x1004) }, /* Dynastream ANT2USB */
 	{ USB_DEVICE(0x0FCF, 0x1006) }, /* Dynastream ANT development board */
 	{ USB_DEVICE(0x10A6, 0xAA26) }, /* Knock-off DCU-11 cable */
 	{ USB_DEVICE(0x10AB, 0x10C5) }, /* Siemens MC60 Cable */
 	{ USB_DEVICE(0x10B5, 0xAC70) }, /* Nokia CA-42 USB */
+	{ USB_DEVICE(0x10C4, 0x0F91) }, /* Vstabi */
 	{ USB_DEVICE(0x10C4, 0x800A) }, /* SPORTident BSM7-D-USB main station */
 	{ USB_DEVICE(0x10C4, 0x803B) }, /* Pololu USB-serial converter */
 	{ USB_DEVICE(0x10C4, 0x8053) }, /* Enfora EDG1228 */
@@ -84,10 +87,12 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x10C4, 0x81C8) }, /* Lipowsky Industrie Elektronik GmbH, Baby-JTAG */
 	{ USB_DEVICE(0x10C4, 0x81E2) }, /* Lipowsky Industrie Elektronik GmbH, Baby-LIN */
 	{ USB_DEVICE(0x10C4, 0x81E7) }, /* Aerocomm Radio */
+	{ USB_DEVICE(0x10C4, 0x81F2) }, /* C1007 HF band RFID controller */
 	{ USB_DEVICE(0x10C4, 0x8218) }, /* Lipowsky Industrie Elektronik GmbH, HARP-1 */
 	{ USB_DEVICE(0x10C4, 0x822B) }, /* Modem EDGE(GSM) Comander 2 */
 	{ USB_DEVICE(0x10C4, 0x826B) }, /* Cygnal Integrated Products, Inc., Fasttrax GPS demostration module */
 	{ USB_DEVICE(0x10c4, 0x8293) }, /* Telegesys ETRX2USB */
+	{ USB_DEVICE(0x10C4, 0x82F9) }, /* Procyon AVS */
 	{ USB_DEVICE(0x10C4, 0x8341) }, /* Siemens MC35PU GPRS Modem */
 	{ USB_DEVICE(0x10C4, 0x83A8) }, /* Amber Wireless AMB2560 */
 	{ USB_DEVICE(0x10C4, 0x846E) }, /* BEI USB Sensor Interface (VCP) */
@@ -98,7 +103,9 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x10C4, 0xF003) }, /* Elan Digital Systems USBpulse100 */
 	{ USB_DEVICE(0x10C4, 0xF004) }, /* Elan Digital Systems USBcount50 */
 	{ USB_DEVICE(0x10C5, 0xEA61) }, /* Silicon Labs MobiData GPRS USB Modem */
+	{ USB_DEVICE(0x10CE, 0xEA6A) }, /* Silicon Labs MobiData GPRS USB Modem 100EU */
 	{ USB_DEVICE(0x13AD, 0x9999) }, /* Baltech card reader */
+	{ USB_DEVICE(0x1555, 0x0004) }, /* Owen AC4 USB-RS485 Converter */
 	{ USB_DEVICE(0x166A, 0x0303) }, /* Clipsal 5500PCU C-Bus USB interface */
 	{ USB_DEVICE(0x16D6, 0x0001) }, /* Jablotron serial interface */
 	{ USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */
-- 
GitLab


From 93ef1f1fbce37f14666e4856ff933d4a1b735d02 Mon Sep 17 00:00:00 2001
From: Craig Shelley <craig@microtron.org.uk>
Date: Thu, 11 Jun 2009 13:53:30 +0100
Subject: [PATCH 2025/2198] USB: CP210X Use official request code definitions

The CP210X driver was developed without official device specifications.
This has lead to an incorrect assumption that all GET request codes are
equal to the corresponding SET request code +1.
This patch removes this incorrect assumption, and uses request code
definitions based on the updated GPL driver from SiLabs.
This modification is needed before extended functionality such as GPIO
on CP2103 can be supported.

Signed-off-by: Craig Shelley <craig@microtron.org.uk>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/cp210x.c | 98 +++++++++++++++++++++----------------
 1 file changed, 56 insertions(+), 42 deletions(-)

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index ac4944a815215..b50f27fa11d0a 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -27,7 +27,7 @@
 /*
  * Version Information
  */
-#define DRIVER_VERSION "v0.08"
+#define DRIVER_VERSION "v0.09"
 #define DRIVER_DESC "Silicon Labs CP2101/CP2102 RS232 serial adaptor driver"
 
 /*
@@ -144,23 +144,40 @@ static struct usb_serial_driver cp2101_device = {
 #define REQTYPE_HOST_TO_DEVICE	0x41
 #define REQTYPE_DEVICE_TO_HOST	0xc1
 
-/* Config SET requests. To GET, add 1 to the request number */
-#define CP2101_UART 		0x00	/* Enable / Disable */
-#define CP2101_BAUDRATE		0x01	/* (BAUD_RATE_GEN_FREQ / baudrate) */
-#define CP2101_BITS		0x03	/* 0x(0)(databits)(parity)(stopbits) */
-#define CP2101_BREAK		0x05	/* On / Off */
-#define CP2101_CONTROL		0x07	/* Flow control line states */
-#define CP2101_MODEMCTL		0x13	/* Modem controls */
-#define CP2101_CONFIG_6		0x19	/* 6 bytes of config data ??? */
-
-/* CP2101_UART */
+/* Config request codes */
+#define CP210X_IFC_ENABLE	0x00
+#define CP210X_SET_BAUDDIV	0x01
+#define CP210X_GET_BAUDDIV	0x02
+#define CP210X_SET_LINE_CTL	0x03
+#define CP210X_GET_LINE_CTL	0x04
+#define CP210X_SET_BREAK	0x05
+#define CP210X_IMM_CHAR		0x06
+#define CP210X_SET_MHS		0x07
+#define CP210X_GET_MDMSTS	0x08
+#define CP210X_SET_XON		0x09
+#define CP210X_SET_XOFF		0x0A
+#define CP210X_SET_EVENTMASK	0x0B
+#define CP210X_GET_EVENTMASK	0x0C
+#define CP210X_SET_CHAR		0x0D
+#define CP210X_GET_CHARS	0x0E
+#define CP210X_GET_PROPS	0x0F
+#define CP210X_GET_COMM_STATUS	0x10
+#define CP210X_RESET		0x11
+#define CP210X_PURGE		0x12
+#define CP210X_SET_FLOW		0x13
+#define CP210X_GET_FLOW		0x14
+#define CP210X_EMBED_EVENTS	0x15
+#define CP210X_GET_EVENTSTATE	0x16
+#define CP210X_SET_CHARS	0x19
+
+/* CP210X_IFC_ENABLE */
 #define UART_ENABLE		0x0001
 #define UART_DISABLE		0x0000
 
-/* CP2101_BAUDRATE */
+/* CP210X_(SET|GET)_BAUDDIV */
 #define BAUD_RATE_GEN_FREQ	0x384000
 
-/* CP2101_BITS */
+/* CP210X_(SET|GET)_LINE_CTL */
 #define BITS_DATA_MASK		0X0f00
 #define BITS_DATA_5		0X0500
 #define BITS_DATA_6		0X0600
@@ -180,11 +197,11 @@ static struct usb_serial_driver cp2101_device = {
 #define BITS_STOP_1_5		0x0001
 #define BITS_STOP_2		0x0002
 
-/* CP2101_BREAK */
+/* CP210X_SET_BREAK */
 #define BREAK_ON		0x0000
 #define BREAK_OFF		0x0001
 
-/* CP2101_CONTROL */
+/* CP210X_(SET_MHS|GET_MDMSTS) */
 #define CONTROL_DTR		0x0001
 #define CONTROL_RTS		0x0002
 #define CONTROL_CTS		0x0010
@@ -217,9 +234,6 @@ static int cp2101_get_config(struct usb_serial_port *port, u8 request,
 		return -ENOMEM;
 	}
 
-	/* For get requests, the request number must be incremented */
-	request++;
-
 	/* Issue the request, attempting to read 'size' bytes */
 	result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
 				request, REQTYPE_DEVICE_TO_HOST, 0x0000,
@@ -357,7 +371,7 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	if (cp2101_set_config_single(port, CP2101_UART, UART_ENABLE)) {
+	if (cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
 		dev_err(&port->dev, "%s - Unable to enable UART\n",
 				__func__);
 		return -EPROTO;
@@ -415,7 +429,7 @@ static void cp2101_close(struct usb_serial_port *port)
 
 	mutex_lock(&port->serial->disc_mutex);
 	if (!port->serial->disconnected)
-		cp2101_set_config_single(port, CP2101_UART, UART_DISABLE);
+		cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
 	mutex_unlock(&port->serial->disc_mutex);
 }
 
@@ -456,7 +470,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP2101_BAUDRATE, &baud, 2);
+	cp2101_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
 	/* Convert to baudrate */
 	if (baud)
 		baud = cp2101_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
@@ -466,7 +480,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
 	cflag = *cflagp;
 
-	cp2101_get_config(port, CP2101_BITS, &bits, 2);
+	cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 	cflag &= ~CSIZE;
 	switch (bits & BITS_DATA_MASK) {
 	case BITS_DATA_5:
@@ -491,14 +505,14 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown number of data bits, using 8", __func__);
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -521,20 +535,20 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_PARITY_SPACE:
 		dbg("%s - parity = SPACE (not supported, disabling parity)",
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown parity mode, disabling parity", __func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -547,7 +561,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - stop bits = 1.5 (not supported, using 1 stop bit)",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_STOP_2:
 		dbg("%s - stop bits = 2", __func__);
@@ -557,11 +571,11 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - Unknown number of stop bits, using 1 stop bit",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP2101_BITS, &bits, 2);
+		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
-	cp2101_get_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+	cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 	if (modem_ctl[0] & 0x0008) {
 		dbg("%s - flow control = CRTSCTS", __func__);
 		cflag |= CRTSCTS;
@@ -594,7 +608,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 	if (baud != tty_termios_baud_rate(old_termios) && baud != 0) {
 		dbg("%s - Setting baud rate to %d baud", __func__,
 				baud);
-		if (cp2101_set_config_single(port, CP2101_BAUDRATE,
+		if (cp2101_set_config_single(port, CP210X_SET_BAUDDIV,
 					((BAUD_RATE_GEN_FREQ + baud/2) / baud))) {
 			dbg("Baud rate requested not supported by device\n");
 			baud = tty_termios_baud_rate(old_termios);
@@ -605,7 +619,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 
 	/* If the number of data bits is to be updated */
 	if ((cflag & CSIZE) != (old_cflag & CSIZE)) {
-		cp2101_get_config(port, CP2101_BITS, &bits, 2);
+		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_DATA_MASK;
 		switch (cflag & CSIZE) {
 		case CS5:
@@ -635,13 +649,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 				bits |= BITS_DATA_8;
 				break;
 		}
-		if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of data bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & (PARENB|PARODD)) != (old_cflag & (PARENB|PARODD))) {
-		cp2101_get_config(port, CP2101_BITS, &bits, 2);
+		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_PARITY_MASK;
 		if (cflag & PARENB) {
 			if (cflag & PARODD) {
@@ -652,13 +666,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 				dbg("%s - parity = EVEN", __func__);
 			}
 		}
-		if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Parity mode not supported "
 					"by device\n");
 	}
 
 	if ((cflag & CSTOPB) != (old_cflag & CSTOPB)) {
-		cp2101_get_config(port, CP2101_BITS, &bits, 2);
+		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_STOP_MASK;
 		if (cflag & CSTOPB) {
 			bits |= BITS_STOP_2;
@@ -667,13 +681,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 			bits |= BITS_STOP_1;
 			dbg("%s - stop bits = 1", __func__);
 		}
-		if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of stop bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & CRTSCTS) != (old_cflag & CRTSCTS)) {
-		cp2101_get_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+		cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 		dbg("%s - read modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
@@ -693,7 +707,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 		dbg("%s - write modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
-		cp2101_set_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+		cp2101_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
 	}
 
 }
@@ -731,7 +745,7 @@ static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
 
 	dbg("%s - control = 0x%.4x", __func__, control);
 
-	return cp2101_set_config(port, CP2101_CONTROL, &control, 2);
+	return cp2101_set_config(port, CP210X_SET_MHS, &control, 2);
 }
 
 static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
@@ -742,7 +756,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP2101_CONTROL, &control, 1);
+	cp2101_get_config(port, CP210X_GET_MDMSTS, &control, 1);
 
 	result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0)
 		|((control & CONTROL_RTS) ? TIOCM_RTS : 0)
@@ -768,7 +782,7 @@ static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
 		state = BREAK_ON;
 	dbg("%s - turning break %s", __func__,
 			state == BREAK_OFF ? "off" : "on");
-	cp2101_set_config(port, CP2101_BREAK, &state, 2);
+	cp2101_set_config(port, CP210X_SET_BREAK, &state, 2);
 }
 
 static int cp2101_startup(struct usb_serial *serial)
-- 
GitLab


From 4cc27bd6d7d6750dba33b4ccb4585c00b8fca7d2 Mon Sep 17 00:00:00 2001
From: Craig Shelley <craig@microtron.org.uk>
Date: Thu, 11 Jun 2009 13:54:40 +0100
Subject: [PATCH 2026/2198] USB: CP210X Replace CP2101 with CP210x

This patch replaces the string "CP2101" with "CP210x" within cp210x.c
This is to reduce confusion about the fact that the driver is actually
compatible with CP2101, CP2102 and CP2103 devices.

Signed-off-by: Craig Shelley <craig@microtron.org.uk>

(Fixed some collisions merging)

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/cp210x.c | 194 ++++++++++++++++++------------------
 1 file changed, 97 insertions(+), 97 deletions(-)

diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c
index b50f27fa11d0a..16a154d3b2fee 100644
--- a/drivers/usb/serial/cp210x.c
+++ b/drivers/usb/serial/cp210x.c
@@ -1,5 +1,5 @@
 /*
- * Silicon Laboratories CP2101/CP2102 USB to RS232 serial adaptor driver
+ * Silicon Laboratories CP210x USB to RS232 serial adaptor driver
  *
  * Copyright (C) 2005 Craig Shelley (craig@microtron.org.uk)
  *
@@ -28,29 +28,29 @@
  * Version Information
  */
 #define DRIVER_VERSION "v0.09"
-#define DRIVER_DESC "Silicon Labs CP2101/CP2102 RS232 serial adaptor driver"
+#define DRIVER_DESC "Silicon Labs CP210x RS232 serial adaptor driver"
 
 /*
  * Function Prototypes
  */
-static int cp2101_open(struct tty_struct *, struct usb_serial_port *,
+static int cp210x_open(struct tty_struct *, struct usb_serial_port *,
 							struct file *);
-static void cp2101_cleanup(struct usb_serial_port *);
-static void cp2101_close(struct usb_serial_port *);
-static void cp2101_get_termios(struct tty_struct *,
+static void cp210x_cleanup(struct usb_serial_port *);
+static void cp210x_close(struct usb_serial_port *);
+static void cp210x_get_termios(struct tty_struct *,
 	struct usb_serial_port *port);
-static void cp2101_get_termios_port(struct usb_serial_port *port,
+static void cp210x_get_termios_port(struct usb_serial_port *port,
 	unsigned int *cflagp, unsigned int *baudp);
-static void cp2101_set_termios(struct tty_struct *, struct usb_serial_port *,
+static void cp210x_set_termios(struct tty_struct *, struct usb_serial_port *,
 							struct ktermios*);
-static int cp2101_tiocmget(struct tty_struct *, struct file *);
-static int cp2101_tiocmset(struct tty_struct *, struct file *,
+static int cp210x_tiocmget(struct tty_struct *, struct file *);
+static int cp210x_tiocmset(struct tty_struct *, struct file *,
 		unsigned int, unsigned int);
-static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *,
+static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *,
 		unsigned int, unsigned int);
-static void cp2101_break_ctl(struct tty_struct *, int);
-static int cp2101_startup(struct usb_serial *);
-static void cp2101_shutdown(struct usb_serial *);
+static void cp210x_break_ctl(struct tty_struct *, int);
+static int cp210x_startup(struct usb_serial *);
+static void cp210x_shutdown(struct usb_serial *);
 
 static int debug;
 
@@ -114,30 +114,30 @@ static struct usb_device_id id_table [] = {
 
 MODULE_DEVICE_TABLE(usb, id_table);
 
-static struct usb_driver cp2101_driver = {
-	.name		= "cp2101",
+static struct usb_driver cp210x_driver = {
+	.name		= "cp210x",
 	.probe		= usb_serial_probe,
 	.disconnect	= usb_serial_disconnect,
 	.id_table	= id_table,
 	.no_dynamic_id	= 	1,
 };
 
-static struct usb_serial_driver cp2101_device = {
+static struct usb_serial_driver cp210x_device = {
 	.driver = {
 		.owner =	THIS_MODULE,
-		.name = 	"cp2101",
+		.name = 	"cp210x",
 	},
-	.usb_driver		= &cp2101_driver,
+	.usb_driver		= &cp210x_driver,
 	.id_table		= id_table,
 	.num_ports		= 1,
-	.open			= cp2101_open,
-	.close			= cp2101_close,
-	.break_ctl		= cp2101_break_ctl,
-	.set_termios		= cp2101_set_termios,
-	.tiocmget 		= cp2101_tiocmget,
-	.tiocmset		= cp2101_tiocmset,
-	.attach			= cp2101_startup,
-	.shutdown		= cp2101_shutdown,
+	.open			= cp210x_open,
+	.close			= cp210x_close,
+	.break_ctl		= cp210x_break_ctl,
+	.set_termios		= cp210x_set_termios,
+	.tiocmget 		= cp210x_tiocmget,
+	.tiocmset		= cp210x_tiocmset,
+	.attach			= cp210x_startup,
+	.shutdown		= cp210x_shutdown,
 };
 
 /* Config request types */
@@ -212,13 +212,13 @@ static struct usb_serial_driver cp2101_device = {
 #define CONTROL_WRITE_RTS	0x0200
 
 /*
- * cp2101_get_config
- * Reads from the CP2101 configuration registers
+ * cp210x_get_config
+ * Reads from the CP210x configuration registers
  * 'size' is specified in bytes.
  * 'data' is a pointer to a pre-allocated array of integers large
  * enough to hold 'size' bytes (with 4 bytes to each integer)
  */
-static int cp2101_get_config(struct usb_serial_port *port, u8 request,
+static int cp210x_get_config(struct usb_serial_port *port, u8 request,
 		unsigned int *data, int size)
 {
 	struct usb_serial *serial = port->serial;
@@ -256,12 +256,12 @@ static int cp2101_get_config(struct usb_serial_port *port, u8 request,
 }
 
 /*
- * cp2101_set_config
- * Writes to the CP2101 configuration registers
+ * cp210x_set_config
+ * Writes to the CP210x configuration registers
  * Values less than 16 bits wide are sent directly
  * 'size' is specified in bytes.
  */
-static int cp2101_set_config(struct usb_serial_port *port, u8 request,
+static int cp210x_set_config(struct usb_serial_port *port, u8 request,
 		unsigned int *data, int size)
 {
 	struct usb_serial *serial = port->serial;
@@ -312,21 +312,21 @@ static int cp2101_set_config(struct usb_serial_port *port, u8 request,
 }
 
 /*
- * cp2101_set_config_single
- * Convenience function for calling cp2101_set_config on single data values
+ * cp210x_set_config_single
+ * Convenience function for calling cp210x_set_config on single data values
  * without requiring an integer pointer
  */
-static inline int cp2101_set_config_single(struct usb_serial_port *port,
+static inline int cp210x_set_config_single(struct usb_serial_port *port,
 		u8 request, unsigned int data)
 {
-	return cp2101_set_config(port, request, &data, 2);
+	return cp210x_set_config(port, request, &data, 2);
 }
 
 /*
- * cp2101_quantise_baudrate
+ * cp210x_quantise_baudrate
  * Quantises the baud rate as per AN205 Table 1
  */
-static unsigned int cp2101_quantise_baudrate(unsigned int baud) {
+static unsigned int cp210x_quantise_baudrate(unsigned int baud) {
 	if      (baud <= 56)       baud = 0;
 	else if (baud <= 300)      baud = 300;
 	else if (baud <= 600)      baud = 600;
@@ -363,7 +363,7 @@ static unsigned int cp2101_quantise_baudrate(unsigned int baud) {
 	return baud;
 }
 
-static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
+static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *port,
 				struct file *filp)
 {
 	struct usb_serial *serial = port->serial;
@@ -371,7 +371,7 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	if (cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
+	if (cp210x_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
 		dev_err(&port->dev, "%s - Unable to enable UART\n",
 				__func__);
 		return -EPROTO;
@@ -393,17 +393,17 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 	}
 
 	/* Configure the termios structure */
-	cp2101_get_termios(tty, port);
+	cp210x_get_termios(tty, port);
 
 	/* Set the DTR and RTS pins low */
-	cp2101_tiocmset_port(tty ? (struct usb_serial_port *) tty->driver_data
+	cp210x_tiocmset_port(tty ? (struct usb_serial_port *) tty->driver_data
 			: port,
 		NULL, TIOCM_DTR | TIOCM_RTS, 0);
 
 	return 0;
 }
 
-static void cp2101_cleanup(struct usb_serial_port *port)
+static void cp210x_cleanup(struct usb_serial_port *port)
 {
 	struct usb_serial *serial = port->serial;
 
@@ -418,7 +418,7 @@ static void cp2101_cleanup(struct usb_serial_port *port)
 	}
 }
 
-static void cp2101_close(struct usb_serial_port *port)
+static void cp210x_close(struct usb_serial_port *port)
 {
 	dbg("%s - port %d", __func__, port->number);
 
@@ -429,23 +429,23 @@ static void cp2101_close(struct usb_serial_port *port)
 
 	mutex_lock(&port->serial->disc_mutex);
 	if (!port->serial->disconnected)
-		cp2101_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
+		cp210x_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
 	mutex_unlock(&port->serial->disc_mutex);
 }
 
 /*
- * cp2101_get_termios
+ * cp210x_get_termios
  * Reads the baud rate, data bits, parity, stop bits and flow control mode
  * from the device, corrects any unsupported values, and configures the
  * termios structure to reflect the state of the device
  */
-static void cp2101_get_termios(struct tty_struct *tty,
+static void cp210x_get_termios(struct tty_struct *tty,
 	struct usb_serial_port *port)
 {
 	unsigned int baud;
 
 	if (tty) {
-		cp2101_get_termios_port(tty->driver_data,
+		cp210x_get_termios_port(tty->driver_data,
 			&tty->termios->c_cflag, &baud);
 		tty_encode_baud_rate(tty, baud, baud);
 	}
@@ -453,15 +453,15 @@ static void cp2101_get_termios(struct tty_struct *tty,
 	else {
 		unsigned int cflag;
 		cflag = 0;
-		cp2101_get_termios_port(port, &cflag, &baud);
+		cp210x_get_termios_port(port, &cflag, &baud);
 	}
 }
 
 /*
- * cp2101_get_termios_port
- * This is the heart of cp2101_get_termios which always uses a &usb_serial_port.
+ * cp210x_get_termios_port
+ * This is the heart of cp210x_get_termios which always uses a &usb_serial_port.
  */
-static void cp2101_get_termios_port(struct usb_serial_port *port,
+static void cp210x_get_termios_port(struct usb_serial_port *port,
 	unsigned int *cflagp, unsigned int *baudp)
 {
 	unsigned int cflag, modem_ctl[4];
@@ -470,17 +470,17 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
+	cp210x_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
 	/* Convert to baudrate */
 	if (baud)
-		baud = cp2101_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
+		baud = cp210x_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
 
 	dbg("%s - baud rate = %d", __func__, baud);
 	*baudp = baud;
 
 	cflag = *cflagp;
 
-	cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+	cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 	cflag &= ~CSIZE;
 	switch (bits & BITS_DATA_MASK) {
 	case BITS_DATA_5:
@@ -505,14 +505,14 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown number of data bits, using 8", __func__);
 		cflag |= CS8;
 		bits &= ~BITS_DATA_MASK;
 		bits |= BITS_DATA_8;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -535,20 +535,20 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_PARITY_SPACE:
 		dbg("%s - parity = SPACE (not supported, disabling parity)",
 				__func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	default:
 		dbg("%s - Unknown parity mode, disabling parity", __func__);
 		cflag &= ~PARENB;
 		bits &= ~BITS_PARITY_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
@@ -561,7 +561,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - stop bits = 1.5 (not supported, using 1 stop bit)",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	case BITS_STOP_2:
 		dbg("%s - stop bits = 2", __func__);
@@ -571,11 +571,11 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 		dbg("%s - Unknown number of stop bits, using 1 stop bit",
 								__func__);
 		bits &= ~BITS_STOP_MASK;
-		cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
+		cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
 		break;
 	}
 
-	cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
+	cp210x_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 	if (modem_ctl[0] & 0x0008) {
 		dbg("%s - flow control = CRTSCTS", __func__);
 		cflag |= CRTSCTS;
@@ -587,7 +587,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 	*cflagp = cflag;
 }
 
-static void cp2101_set_termios(struct tty_struct *tty,
+static void cp210x_set_termios(struct tty_struct *tty,
 		struct usb_serial_port *port, struct ktermios *old_termios)
 {
 	unsigned int cflag, old_cflag;
@@ -602,13 +602,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 	tty->termios->c_cflag &= ~CMSPAR;
 	cflag = tty->termios->c_cflag;
 	old_cflag = old_termios->c_cflag;
-	baud = cp2101_quantise_baudrate(tty_get_baud_rate(tty));
+	baud = cp210x_quantise_baudrate(tty_get_baud_rate(tty));
 
 	/* If the baud rate is to be updated*/
 	if (baud != tty_termios_baud_rate(old_termios) && baud != 0) {
 		dbg("%s - Setting baud rate to %d baud", __func__,
 				baud);
-		if (cp2101_set_config_single(port, CP210X_SET_BAUDDIV,
+		if (cp210x_set_config_single(port, CP210X_SET_BAUDDIV,
 					((BAUD_RATE_GEN_FREQ + baud/2) / baud))) {
 			dbg("Baud rate requested not supported by device\n");
 			baud = tty_termios_baud_rate(old_termios);
@@ -619,7 +619,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 
 	/* If the number of data bits is to be updated */
 	if ((cflag & CSIZE) != (old_cflag & CSIZE)) {
-		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+		cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_DATA_MASK;
 		switch (cflag & CSIZE) {
 		case CS5:
@@ -643,19 +643,19 @@ static void cp2101_set_termios(struct tty_struct *tty,
 			dbg("%s - data bits = 9", __func__);
 			break;*/
 		default:
-			dbg("cp2101 driver does not "
+			dbg("cp210x driver does not "
 					"support the number of bits requested,"
 					" using 8 bit mode\n");
 				bits |= BITS_DATA_8;
 				break;
 		}
-		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
+		if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of data bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & (PARENB|PARODD)) != (old_cflag & (PARENB|PARODD))) {
-		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+		cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_PARITY_MASK;
 		if (cflag & PARENB) {
 			if (cflag & PARODD) {
@@ -666,13 +666,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 				dbg("%s - parity = EVEN", __func__);
 			}
 		}
-		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
+		if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Parity mode not supported "
 					"by device\n");
 	}
 
 	if ((cflag & CSTOPB) != (old_cflag & CSTOPB)) {
-		cp2101_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
+		cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
 		bits &= ~BITS_STOP_MASK;
 		if (cflag & CSTOPB) {
 			bits |= BITS_STOP_2;
@@ -681,13 +681,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
 			bits |= BITS_STOP_1;
 			dbg("%s - stop bits = 1", __func__);
 		}
-		if (cp2101_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
+		if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
 			dbg("Number of stop bits requested "
 					"not supported by device\n");
 	}
 
 	if ((cflag & CRTSCTS) != (old_cflag & CRTSCTS)) {
-		cp2101_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
+		cp210x_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
 		dbg("%s - read modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
@@ -707,19 +707,19 @@ static void cp2101_set_termios(struct tty_struct *tty,
 		dbg("%s - write modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
 				__func__, modem_ctl[0], modem_ctl[1],
 				modem_ctl[2], modem_ctl[3]);
-		cp2101_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
+		cp210x_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
 	}
 
 }
 
-static int cp2101_tiocmset (struct tty_struct *tty, struct file *file,
+static int cp210x_tiocmset (struct tty_struct *tty, struct file *file,
 		unsigned int set, unsigned int clear)
 {
 	struct usb_serial_port *port = tty->driver_data;
-	return cp2101_tiocmset_port(port, file, set, clear);
+	return cp210x_tiocmset_port(port, file, set, clear);
 }
 
-static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
+static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *file,
 		unsigned int set, unsigned int clear)
 {
 	unsigned int control = 0;
@@ -745,10 +745,10 @@ static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
 
 	dbg("%s - control = 0x%.4x", __func__, control);
 
-	return cp2101_set_config(port, CP210X_SET_MHS, &control, 2);
+	return cp210x_set_config(port, CP210X_SET_MHS, &control, 2);
 }
 
-static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
+static int cp210x_tiocmget (struct tty_struct *tty, struct file *file)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned int control;
@@ -756,7 +756,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 
 	dbg("%s - port %d", __func__, port->number);
 
-	cp2101_get_config(port, CP210X_GET_MDMSTS, &control, 1);
+	cp210x_get_config(port, CP210X_GET_MDMSTS, &control, 1);
 
 	result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0)
 		|((control & CONTROL_RTS) ? TIOCM_RTS : 0)
@@ -770,7 +770,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 	return result;
 }
 
-static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
+static void cp210x_break_ctl (struct tty_struct *tty, int break_state)
 {
 	struct usb_serial_port *port = tty->driver_data;
 	unsigned int state;
@@ -782,17 +782,17 @@ static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
 		state = BREAK_ON;
 	dbg("%s - turning break %s", __func__,
 			state == BREAK_OFF ? "off" : "on");
-	cp2101_set_config(port, CP210X_SET_BREAK, &state, 2);
+	cp210x_set_config(port, CP210X_SET_BREAK, &state, 2);
 }
 
-static int cp2101_startup(struct usb_serial *serial)
+static int cp210x_startup(struct usb_serial *serial)
 {
-	/* CP2101 buffers behave strangely unless device is reset */
+	/* cp210x buffers behave strangely unless device is reset */
 	usb_reset_device(serial->dev);
 	return 0;
 }
 
-static void cp2101_shutdown(struct usb_serial *serial)
+static void cp210x_shutdown(struct usb_serial *serial)
 {
 	int i;
 
@@ -800,21 +800,21 @@ static void cp2101_shutdown(struct usb_serial *serial)
 
 	/* Stop reads and writes on all ports */
 	for (i = 0; i < serial->num_ports; ++i)
-		cp2101_cleanup(serial->port[i]);
+		cp210x_cleanup(serial->port[i]);
 }
 
-static int __init cp2101_init(void)
+static int __init cp210x_init(void)
 {
 	int retval;
 
-	retval = usb_serial_register(&cp2101_device);
+	retval = usb_serial_register(&cp210x_device);
 	if (retval)
 		return retval; /* Failed to register */
 
-	retval = usb_register(&cp2101_driver);
+	retval = usb_register(&cp210x_driver);
 	if (retval) {
 		/* Failed to register */
-		usb_serial_deregister(&cp2101_device);
+		usb_serial_deregister(&cp210x_device);
 		return retval;
 	}
 
@@ -824,14 +824,14 @@ static int __init cp2101_init(void)
 	return 0;
 }
 
-static void __exit cp2101_exit(void)
+static void __exit cp210x_exit(void)
 {
-	usb_deregister(&cp2101_driver);
-	usb_serial_deregister(&cp2101_device);
+	usb_deregister(&cp210x_driver);
+	usb_serial_deregister(&cp210x_device);
 }
 
-module_init(cp2101_init);
-module_exit(cp2101_exit);
+module_init(cp210x_init);
+module_exit(cp210x_exit);
 
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_VERSION(DRIVER_VERSION);
-- 
GitLab


From 557aaa7ffab639d0190b935a041b16ae44606342 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:55:34 +0100
Subject: [PATCH 2027/2198] ft232: support the ASYNC_LOW_LATENCY flag

This allows users to use the standard setserial command with this FT232
feature as well as obscure chip specific interfaces we have now. We keep
track of and respect the sysfs value for non-low-latency cases. In theory we
could do smart stuff with VTIME and the like but this seems of questionable
worth.

Closes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=9120
Signed-off-by: Alan Cox <alan@linux.intel.com)
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/ftdi_sio.c | 94 ++++++++++++++++++++++-------------
 1 file changed, 60 insertions(+), 34 deletions(-)

diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index d9d87111f9a9c..21c053c31b9ef 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -89,6 +89,7 @@ struct ftdi_private {
 	int force_rtscts;	/* if non-zero, force RTS-CTS to always
 				   be enabled */
 
+	unsigned int latency;		/* latency setting in use */
 	spinlock_t tx_lock;	/* spinlock for transmit state */
 	unsigned long tx_bytes;
 	unsigned long tx_outstanding_bytes;
@@ -1038,7 +1039,54 @@ static int change_speed(struct tty_struct *tty, struct usb_serial_port *port)
 	return rv;
 }
 
+static int write_latency_timer(struct usb_serial_port *port)
+{
+	struct ftdi_private *priv = usb_get_serial_port_data(port);
+	struct usb_device *udev = port->serial->dev;
+	char buf[1];
+	int rv = 0;
+	int l = priv->latency;
+
+	if (priv->flags & ASYNC_LOW_LATENCY)
+		l = 1;
+
+	dbg("%s: setting latency timer = %i", __func__, l);
+
+	rv = usb_control_msg(udev,
+			     usb_sndctrlpipe(udev, 0),
+			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
+			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
+			     l, priv->interface,
+			     buf, 0, WDR_TIMEOUT);
 
+	if (rv < 0)
+		dev_err(&port->dev, "Unable to write latency timer: %i\n", rv);
+	return rv;
+}
+
+static int read_latency_timer(struct usb_serial_port *port)
+{
+	struct ftdi_private *priv = usb_get_serial_port_data(port);
+	struct usb_device *udev = port->serial->dev;
+	unsigned short latency = 0;
+	int rv = 0;
+
+
+	dbg("%s", __func__);
+
+	rv = usb_control_msg(udev,
+			     usb_rcvctrlpipe(udev, 0),
+			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST,
+			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST_TYPE,
+			     0, priv->interface,
+			     (char *) &latency, 1, WDR_TIMEOUT);
+
+	if (rv < 0) {
+		dev_err(&port->dev, "Unable to read latency timer: %i\n", rv);
+		return -EIO;
+	}
+	return latency;
+}
 
 static int get_serial_info(struct usb_serial_port *port,
 				struct serial_struct __user *retinfo)
@@ -1098,6 +1146,7 @@ static int set_serial_info(struct tty_struct *tty,
 	priv->custom_divisor = new_serial.custom_divisor;
 
 	tty->low_latency = (priv->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+	write_latency_timer(port);
 
 check_and_exit:
 	if ((old_priv.flags & ASYNC_SPD_MASK) !=
@@ -1193,27 +1242,13 @@ static ssize_t show_latency_timer(struct device *dev,
 {
 	struct usb_serial_port *port = to_usb_serial_port(dev);
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
-	struct usb_device *udev = port->serial->dev;
-	unsigned short latency = 0;
-	int rv = 0;
-
-
-	dbg("%s", __func__);
-
-	rv = usb_control_msg(udev,
-			     usb_rcvctrlpipe(udev, 0),
-			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST,
-			     FTDI_SIO_GET_LATENCY_TIMER_REQUEST_TYPE,
-			     0, priv->interface,
-			     (char *) &latency, 1, WDR_TIMEOUT);
-
-	if (rv < 0) {
-		dev_err(dev, "Unable to read latency timer: %i\n", rv);
-		return -EIO;
-	}
-	return sprintf(buf, "%i\n", latency);
+	if (priv->flags & ASYNC_LOW_LATENCY)
+		return sprintf(buf, "1\n");
+	else
+		return sprintf(buf, "%i\n", priv->latency);
 }
 
+
 /* Write a new value of the latency timer, in units of milliseconds. */
 static ssize_t store_latency_timer(struct device *dev,
 			struct device_attribute *attr, const char *valbuf,
@@ -1221,25 +1256,13 @@ static ssize_t store_latency_timer(struct device *dev,
 {
 	struct usb_serial_port *port = to_usb_serial_port(dev);
 	struct ftdi_private *priv = usb_get_serial_port_data(port);
-	struct usb_device *udev = port->serial->dev;
-	char buf[1];
 	int v = simple_strtoul(valbuf, NULL, 10);
 	int rv = 0;
 
-	dbg("%s: setting latency timer = %i", __func__, v);
-
-	rv = usb_control_msg(udev,
-			     usb_sndctrlpipe(udev, 0),
-			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
-			     FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
-			     v, priv->interface,
-			     buf, 0, WDR_TIMEOUT);
-
-	if (rv < 0) {
-		dev_err(dev, "Unable to write latency timer: %i\n", rv);
+	priv->latency = v;
+	rv = write_latency_timer(port);
+	if (rv < 0)
 		return -EIO;
-	}
-
 	return count;
 }
 
@@ -1393,6 +1416,7 @@ static int ftdi_sio_port_probe(struct usb_serial_port *port)
 	usb_set_serial_port_data(port, priv);
 
 	ftdi_determine_type(port);
+	read_latency_timer(port);
 	create_sysfs_attrs(port);
 	return 0;
 }
@@ -1515,6 +1539,8 @@ static int ftdi_open(struct tty_struct *tty,
 	if (tty)
 		tty->low_latency = (priv->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
+	write_latency_timer(port);
+
 	/* No error checking for this (will get errors later anyway) */
 	/* See ftdi_sio.h for description of what is reset */
 	usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
-- 
GitLab


From 7f8d09eae26a8108406583192996561665b36371 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:56:16 +0100
Subject: [PATCH 2028/2198] tty: fix bluetooth scribbling on low latency flags

Bluetooth shouldn't be doing this as most drivers don't support the flag,
furthermore it shouldn't be needed with newer buffering. This becomes rather
more visible as the locking fixes make the abuse of low_latency visible as
spew on the users console/dmesg.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/bluetooth/hci_ldisc.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/bluetooth/hci_ldisc.c b/drivers/bluetooth/hci_ldisc.c
index 688015128594a..4895f0e053229 100644
--- a/drivers/bluetooth/hci_ldisc.c
+++ b/drivers/bluetooth/hci_ldisc.c
@@ -463,7 +463,6 @@ static int hci_uart_tty_ioctl(struct tty_struct *tty, struct file * file,
 				clear_bit(HCI_UART_PROTO_SET, &hu->flags);
 				return err;
 			}
-			tty->low_latency = 1;
 		} else
 			return -EBUSY;
 		break;
-- 
GitLab


From 7e9cd3a617414cfe74342659ceeb4e92975c1efa Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 13:57:01 +0100
Subject: [PATCH 2029/2198] ftdi_sio: don't override modem bits

The new open/close logic handles DTR and friends, so don't do it in our own
open routine as well.

Signed-off-by: Alan Cox <alan@lxorguk.ukuu.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/ftdi_sio.c | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c
index 21c053c31b9ef..683304d60615a 100644
--- a/drivers/usb/serial/ftdi_sio.c
+++ b/drivers/usb/serial/ftdi_sio.c
@@ -1556,11 +1556,6 @@ static int ftdi_open(struct tty_struct *tty,
 	if (tty)
 		ftdi_set_termios(tty, port, tty->termios);
 
-	/* FIXME: Flow control might be enabled, so it should be checked -
-	   we have no control of defaults! */
-	/* Turn on RTS and DTR since we are not flow controlling by default */
-	set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
-
 	/* Not throttled */
 	spin_lock_irqsave(&priv->rx_lock, flags);
 	priv->rx_flags &= ~(THROTTLED | ACTUALLY_THROTTLED);
-- 
GitLab


From 0b91421857414f525690ee452c0b0acb6ab1dba3 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Thu, 11 Jun 2009 14:01:45 +0100
Subject: [PATCH 2030/2198] tty: bfin_jtag_comm: emulate a TTY over the
 Blackfin EMUDAT/JTAG interface

The Blackfin JTAG interface has a 4 byte generic data field (EMUDAT).  With
a little creative thinking, we can turn this into a TTY device.

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Bryan Wu <cooloney@kernel.org>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/Kconfig          |  13 ++
 drivers/char/Makefile         |   1 +
 drivers/char/bfin_jtag_comm.c | 365 ++++++++++++++++++++++++++++++++++
 3 files changed, 379 insertions(+)
 create mode 100644 drivers/char/bfin_jtag_comm.c

diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig
index 735bbe2be51aa..02ecfd5fa61c5 100644
--- a/drivers/char/Kconfig
+++ b/drivers/char/Kconfig
@@ -97,6 +97,19 @@ config DEVKMEM
 	  kind of kernel debugging operations.
 	  When in doubt, say "N".
 
+config BFIN_JTAG_COMM
+	tristate "Blackfin JTAG Communication"
+	depends on BLACKFIN
+	help
+	  Add support for emulating a TTY device over the Blackfin JTAG.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called bfin_jtag_comm.
+
+config BFIN_JTAG_COMM_CONSOLE
+	bool "Console on Blackfin JTAG"
+	depends on BFIN_JTAG_COMM=y
+
 config SERIAL_NONSTANDARD
 	bool "Non-standard serial port support"
 	depends on HAS_IOMEM
diff --git a/drivers/char/Makefile b/drivers/char/Makefile
index 9caf5b5ad1c05..189efcff08ce2 100644
--- a/drivers/char/Makefile
+++ b/drivers/char/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_LEGACY_PTYS)	+= pty.o
 obj-$(CONFIG_UNIX98_PTYS)	+= pty.o
 obj-y				+= misc.o
 obj-$(CONFIG_VT)		+= vt_ioctl.o vc_screen.o selection.o keyboard.o
+obj-$(CONFIG_BFIN_JTAG_COMM)	+= bfin_jtag_comm.o
 obj-$(CONFIG_CONSOLE_TRANSLATIONS) += consolemap.o consolemap_deftbl.o
 obj-$(CONFIG_HW_CONSOLE)	+= vt.o defkeymap.o
 obj-$(CONFIG_AUDIT)		+= tty_audit.o
diff --git a/drivers/char/bfin_jtag_comm.c b/drivers/char/bfin_jtag_comm.c
new file mode 100644
index 0000000000000..44c113d560450
--- /dev/null
+++ b/drivers/char/bfin_jtag_comm.c
@@ -0,0 +1,365 @@
+/*
+ * TTY over Blackfin JTAG Communication
+ *
+ * Copyright 2008-2009 Analog Devices Inc.
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/circ_buf.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
+#include <asm/atomic.h>
+
+/* See the Debug/Emulation chapter in the HRM */
+#define EMUDOF   0x00000001	/* EMUDAT_OUT full & valid */
+#define EMUDIF   0x00000002	/* EMUDAT_IN full & valid */
+#define EMUDOOVF 0x00000004	/* EMUDAT_OUT overflow */
+#define EMUDIOVF 0x00000008	/* EMUDAT_IN overflow */
+
+#define DRV_NAME "bfin-jtag-comm"
+#define DEV_NAME "ttyBFJC"
+
+#define pr_init(fmt, args...) ({ static const __initdata char __fmt[] = fmt; printk(__fmt, ## args); })
+#define debug(fmt, args...) pr_debug(DRV_NAME ": " fmt, ## args)
+
+static inline uint32_t bfin_write_emudat(uint32_t emudat)
+{
+	__asm__ __volatile__("emudat = %0;" : : "d"(emudat));
+	return emudat;
+}
+
+static inline uint32_t bfin_read_emudat(void)
+{
+	uint32_t emudat;
+	__asm__ __volatile__("%0 = emudat;" : "=d"(emudat));
+	return emudat;
+}
+
+static inline uint32_t bfin_write_emudat_chars(char a, char b, char c, char d)
+{
+	return bfin_write_emudat((a << 0) | (b << 8) | (c << 16) | (d << 24));
+}
+
+#define CIRC_SIZE 2048	/* see comment in tty_io.c:do_tty_write() */
+#define CIRC_MASK (CIRC_SIZE - 1)
+#define circ_empty(circ)     ((circ)->head == (circ)->tail)
+#define circ_free(circ)      CIRC_SPACE((circ)->head, (circ)->tail, CIRC_SIZE)
+#define circ_cnt(circ)       CIRC_CNT((circ)->head, (circ)->tail, CIRC_SIZE)
+#define circ_byte(circ, idx) ((circ)->buf[(idx) & CIRC_MASK])
+
+static struct tty_driver *bfin_jc_driver;
+static struct task_struct *bfin_jc_kthread;
+static struct tty_struct * volatile bfin_jc_tty;
+static unsigned long bfin_jc_count;
+static DEFINE_MUTEX(bfin_jc_tty_mutex);
+static volatile struct circ_buf bfin_jc_write_buf;
+
+static int
+bfin_jc_emudat_manager(void *arg)
+{
+	uint32_t inbound_len = 0, outbound_len = 0;
+
+	while (!kthread_should_stop()) {
+		/* no one left to give data to, so sleep */
+		if (bfin_jc_tty == NULL && circ_empty(&bfin_jc_write_buf)) {
+			debug("waiting for readers\n");
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule();
+			__set_current_state(TASK_RUNNING);
+		}
+
+		/* no data available, so just chill */
+		if (!(bfin_read_DBGSTAT() & EMUDIF) && circ_empty(&bfin_jc_write_buf)) {
+			debug("waiting for data (in_len = %i) (circ: %i %i)\n",
+				inbound_len, bfin_jc_write_buf.tail, bfin_jc_write_buf.head);
+			if (inbound_len)
+				schedule();
+			else
+				schedule_timeout_interruptible(HZ);
+			continue;
+		}
+
+		/* if incoming data is ready, eat it */
+		if (bfin_read_DBGSTAT() & EMUDIF) {
+			struct tty_struct *tty;
+			mutex_lock(&bfin_jc_tty_mutex);
+			tty = (struct tty_struct *)bfin_jc_tty;
+			if (tty != NULL) {
+				uint32_t emudat = bfin_read_emudat();
+				if (inbound_len == 0) {
+					debug("incoming length: 0x%08x\n", emudat);
+					inbound_len = emudat;
+				} else {
+					size_t num_chars = (4 <= inbound_len ? 4 : inbound_len);
+					debug("  incoming data: 0x%08x (pushing %zu)\n", emudat, num_chars);
+					inbound_len -= num_chars;
+					tty_insert_flip_string(tty, (unsigned char *)&emudat, num_chars);
+					tty_flip_buffer_push(tty);
+				}
+			}
+			mutex_unlock(&bfin_jc_tty_mutex);
+		}
+
+		/* if outgoing data is ready, post it */
+		if (!(bfin_read_DBGSTAT() & EMUDOF) && !circ_empty(&bfin_jc_write_buf)) {
+			if (outbound_len == 0) {
+				outbound_len = circ_cnt(&bfin_jc_write_buf);
+				bfin_write_emudat(outbound_len);
+				debug("outgoing length: 0x%08x\n", outbound_len);
+			} else {
+				struct tty_struct *tty;
+				int tail = bfin_jc_write_buf.tail;
+				size_t ate = (4 <= outbound_len ? 4 : outbound_len);
+				uint32_t emudat =
+				bfin_write_emudat_chars(
+					circ_byte(&bfin_jc_write_buf, tail + 0),
+					circ_byte(&bfin_jc_write_buf, tail + 1),
+					circ_byte(&bfin_jc_write_buf, tail + 2),
+					circ_byte(&bfin_jc_write_buf, tail + 3)
+				);
+				bfin_jc_write_buf.tail += ate;
+				outbound_len -= ate;
+				mutex_lock(&bfin_jc_tty_mutex);
+				tty = (struct tty_struct *)bfin_jc_tty;
+				if (tty)
+					tty_wakeup(tty);
+				mutex_unlock(&bfin_jc_tty_mutex);
+				debug("  outgoing data: 0x%08x (pushing %zu)\n", emudat, ate);
+			}
+		}
+	}
+
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int
+bfin_jc_open(struct tty_struct *tty, struct file *filp)
+{
+	mutex_lock(&bfin_jc_tty_mutex);
+	debug("open %lu\n", bfin_jc_count);
+	++bfin_jc_count;
+	bfin_jc_tty = tty;
+	wake_up_process(bfin_jc_kthread);
+	mutex_unlock(&bfin_jc_tty_mutex);
+	return 0;
+}
+
+static void
+bfin_jc_close(struct tty_struct *tty, struct file *filp)
+{
+	mutex_lock(&bfin_jc_tty_mutex);
+	debug("close %lu\n", bfin_jc_count);
+	if (--bfin_jc_count == 0)
+		bfin_jc_tty = NULL;
+	wake_up_process(bfin_jc_kthread);
+	mutex_unlock(&bfin_jc_tty_mutex);
+}
+
+/* XXX: we dont handle the put_char() case where we must handle count = 1 */
+static int
+bfin_jc_circ_write(const unsigned char *buf, int count)
+{
+	int i;
+	count = min(count, circ_free(&bfin_jc_write_buf));
+	debug("going to write chunk of %i bytes\n", count);
+	for (i = 0; i < count; ++i)
+		circ_byte(&bfin_jc_write_buf, bfin_jc_write_buf.head + i) = buf[i];
+	bfin_jc_write_buf.head += i;
+	return i;
+}
+
+#ifndef CONFIG_BFIN_JTAG_COMM_CONSOLE
+# define acquire_console_sem()
+# define release_console_sem()
+#endif
+static int
+bfin_jc_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+	int i;
+	acquire_console_sem();
+	i = bfin_jc_circ_write(buf, count);
+	release_console_sem();
+	wake_up_process(bfin_jc_kthread);
+	return i;
+}
+
+static void
+bfin_jc_flush_chars(struct tty_struct *tty)
+{
+	wake_up_process(bfin_jc_kthread);
+}
+
+static int
+bfin_jc_write_room(struct tty_struct *tty)
+{
+	return circ_free(&bfin_jc_write_buf);
+}
+
+static int
+bfin_jc_chars_in_buffer(struct tty_struct *tty)
+{
+	return circ_cnt(&bfin_jc_write_buf);
+}
+
+static void
+bfin_jc_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+	unsigned long expire = jiffies + timeout;
+	while (!circ_empty(&bfin_jc_write_buf)) {
+		if (signal_pending(current))
+			break;
+		if (time_after(jiffies, expire))
+			break;
+	}
+}
+
+static struct tty_operations bfin_jc_ops = {
+	.open            = bfin_jc_open,
+	.close           = bfin_jc_close,
+	.write           = bfin_jc_write,
+	/*.put_char        = bfin_jc_put_char,*/
+	.flush_chars     = bfin_jc_flush_chars,
+	.write_room      = bfin_jc_write_room,
+	.chars_in_buffer = bfin_jc_chars_in_buffer,
+	.wait_until_sent = bfin_jc_wait_until_sent,
+};
+
+static int __init bfin_jc_init(void)
+{
+	int ret;
+
+	bfin_jc_kthread = kthread_create(bfin_jc_emudat_manager, NULL, DRV_NAME);
+	if (IS_ERR(bfin_jc_kthread))
+		return PTR_ERR(bfin_jc_kthread);
+
+	ret = -ENOMEM;
+
+	bfin_jc_write_buf.head = bfin_jc_write_buf.tail = 0;
+	bfin_jc_write_buf.buf = kmalloc(CIRC_SIZE, GFP_KERNEL);
+	if (!bfin_jc_write_buf.buf)
+		goto err;
+
+	bfin_jc_driver = alloc_tty_driver(1);
+	if (!bfin_jc_driver)
+		goto err;
+
+	bfin_jc_driver->owner        = THIS_MODULE;
+	bfin_jc_driver->driver_name  = DRV_NAME;
+	bfin_jc_driver->name         = DEV_NAME;
+	bfin_jc_driver->type         = TTY_DRIVER_TYPE_SERIAL;
+	bfin_jc_driver->subtype      = SERIAL_TYPE_NORMAL;
+	bfin_jc_driver->init_termios = tty_std_termios;
+	tty_set_operations(bfin_jc_driver, &bfin_jc_ops);
+
+	ret = tty_register_driver(bfin_jc_driver);
+	if (ret)
+		goto err;
+
+	pr_init(KERN_INFO DRV_NAME ": initialized\n");
+
+	return 0;
+
+ err:
+	put_tty_driver(bfin_jc_driver);
+	kfree(bfin_jc_write_buf.buf);
+	kthread_stop(bfin_jc_kthread);
+	return ret;
+}
+module_init(bfin_jc_init);
+
+static void __exit bfin_jc_exit(void)
+{
+	kthread_stop(bfin_jc_kthread);
+	kfree(bfin_jc_write_buf.buf);
+	tty_unregister_driver(bfin_jc_driver);
+	put_tty_driver(bfin_jc_driver);
+}
+module_exit(bfin_jc_exit);
+
+#if defined(CONFIG_BFIN_JTAG_COMM_CONSOLE) || defined(CONFIG_EARLY_PRINTK)
+static void
+bfin_jc_straight_buffer_write(const char *buf, unsigned count)
+{
+	unsigned ate = 0;
+	while (bfin_read_DBGSTAT() & EMUDOF)
+		continue;
+	bfin_write_emudat(count);
+	while (ate < count) {
+		while (bfin_read_DBGSTAT() & EMUDOF)
+			continue;
+		bfin_write_emudat_chars(buf[ate], buf[ate+1], buf[ate+2], buf[ate+3]);
+		ate += 4;
+	}
+}
+#endif
+
+#ifdef CONFIG_BFIN_JTAG_COMM_CONSOLE
+static void
+bfin_jc_console_write(struct console *co, const char *buf, unsigned count)
+{
+	if (bfin_jc_kthread == NULL)
+		bfin_jc_straight_buffer_write(buf, count);
+	else
+		bfin_jc_circ_write(buf, count);
+}
+
+static struct tty_driver *
+bfin_jc_console_device(struct console *co, int *index)
+{
+	*index = co->index;
+	return bfin_jc_driver;
+}
+
+static struct console bfin_jc_console = {
+	.name    = DEV_NAME,
+	.write   = bfin_jc_console_write,
+	.device  = bfin_jc_console_device,
+	.flags   = CON_ANYTIME | CON_PRINTBUFFER,
+	.index   = -1,
+};
+
+static int __init bfin_jc_console_init(void)
+{
+	register_console(&bfin_jc_console);
+	return 0;
+}
+console_initcall(bfin_jc_console_init);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+static void __init
+bfin_jc_early_write(struct console *co, const char *buf, unsigned int count)
+{
+	bfin_jc_straight_buffer_write(buf, count);
+}
+
+static struct __initdata console bfin_jc_early_console = {
+	.name   = "early_BFJC",
+	.write   = bfin_jc_early_write,
+	.flags   = CON_ANYTIME | CON_PRINTBUFFER,
+	.index   = -1,
+};
+
+struct console * __init
+bfin_jc_early_init(unsigned int port, unsigned int cflag)
+{
+	return &bfin_jc_early_console;
+}
+#endif
+
+MODULE_AUTHOR("Mike Frysinger <vapier@gentoo.org>");
+MODULE_DESCRIPTION("TTY over Blackfin JTAG Communication");
+MODULE_LICENSE("GPL");
-- 
GitLab


From 26a2e20f4a966bebc312836aeb4423fbfd4e33e2 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:03:13 +0100
Subject: [PATCH 2031/2198] tty: Untangle termios and mm mutex dependencies

Although this doesn't cause any problems it could potentially do so for
future mmap using devices. No real work is needed to sort it out so untangle
it before it causes problems

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_ioctl.c | 75 ++++++++++++++++++++++++----------------
 1 file changed, 46 insertions(+), 29 deletions(-)

diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c
index 2401dbcbee9c1..8116bb1c8f801 100644
--- a/drivers/char/tty_ioctl.c
+++ b/drivers/char/tty_ioctl.c
@@ -626,9 +626,25 @@ static int set_termios(struct tty_struct *tty, void __user *arg, int opt)
 	return 0;
 }
 
+static void copy_termios(struct tty_struct *tty, struct ktermios *kterm)
+{
+	mutex_lock(&tty->termios_mutex);
+	memcpy(kterm, tty->termios, sizeof(struct ktermios));
+	mutex_unlock(&tty->termios_mutex);
+}
+
+static void copy_termios_locked(struct tty_struct *tty, struct ktermios *kterm)
+{
+	mutex_lock(&tty->termios_mutex);
+	memcpy(kterm, tty->termios_locked, sizeof(struct ktermios));
+	mutex_unlock(&tty->termios_mutex);
+}
+
 static int get_termio(struct tty_struct *tty, struct termio __user *termio)
 {
-	if (kernel_termios_to_user_termio(termio, tty->termios))
+	struct ktermios kterm;
+	copy_termios(tty, &kterm);
+	if (kernel_termios_to_user_termio(termio, &kterm))
 		return -EFAULT;
 	return 0;
 }
@@ -930,6 +946,8 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 	struct tty_struct *real_tty;
 	void __user *p = (void __user *)arg;
 	int ret = 0;
+	struct ktermios kterm;
+	struct termiox ktermx;
 
 	if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
 	    tty->driver->subtype == PTY_TYPE_MASTER)
@@ -965,23 +983,20 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		return set_termios(real_tty, p, TERMIOS_OLD);
 #ifndef TCGETS2
 	case TCGETS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios))
+		copy_termios(real_tty, &kterm);
+		if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 #else
 	case TCGETS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios))
+		copy_termios(real_tty, &kterm);
+		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TCGETS2:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios((struct termios2 __user *)arg, real_tty->termios))
+		copy_termios(real_tty, &kterm);
+		if (kernel_termios_to_user_termios((struct termios2 __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TCSETSF2:
 		return set_termios(real_tty, p,  TERMIOS_FLUSH | TERMIOS_WAIT);
@@ -1000,34 +1015,36 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		return set_termios(real_tty, p, TERMIOS_TERMIO);
 #ifndef TCGETS2
 	case TIOCGLCKTRMIOS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios_locked))
+		copy_termios_locked(real_tty, &kterm);
+		if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TIOCSLCKTRMIOS:
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
-		mutex_lock(&real_tty->termios_mutex);
-		if (user_termios_to_kernel_termios(real_tty->termios_locked,
+		copy_termios_locked(real_tty, &kterm);
+		if (user_termios_to_kernel_termios(&kterm,
 					       (struct termios __user *) arg))
-			ret = -EFAULT;
+			return -EFAULT;
+		mutex_lock(&real_tty->termios_mutex);
+		memcpy(real_tty->termios_locked, &kterm, sizeof(struct ktermios));
 		mutex_unlock(&real_tty->termios_mutex);
-		return ret;
+		return 0;
 #else
 	case TIOCGLCKTRMIOS:
-		mutex_lock(&real_tty->termios_mutex);
-		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios_locked))
+		copy_termios_locked(real_tty, &kterm);
+		if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm))
 			ret = -EFAULT;
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TIOCSLCKTRMIOS:
 		if (!capable(CAP_SYS_ADMIN))
-			ret = -EPERM;
-		mutex_lock(&real_tty->termios_mutex);
-		if (user_termios_to_kernel_termios_1(real_tty->termios_locked,
+			return -EPERM;
+		copy_termios_locked(real_tty, &kterm);
+		if (user_termios_to_kernel_termios_1(&kterm,
 					       (struct termios __user *) arg))
-			ret = -EFAULT;
+			return -EFAULT;
+		mutex_lock(&real_tty->termios_mutex);
+		memcpy(real_tty->termios_locked, &kterm, sizeof(struct ktermios));
 		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 #endif
@@ -1036,9 +1053,10 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		if (real_tty->termiox == NULL)
 			return -EINVAL;
 		mutex_lock(&real_tty->termios_mutex);
-		if (copy_to_user(p, real_tty->termiox, sizeof(struct termiox)))
-			ret = -EFAULT;
+		memcpy(&ktermx, real_tty->termiox, sizeof(struct termiox));
 		mutex_unlock(&real_tty->termios_mutex);
+		if (copy_to_user(p, &ktermx, sizeof(struct termiox)))
+			ret = -EFAULT;
 		return ret;
 	case TCSETX:
 		return set_termiox(real_tty, p, 0);
@@ -1048,10 +1066,9 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
 		return set_termiox(real_tty, p, TERMIOS_FLUSH);
 #endif		
 	case TIOCGSOFTCAR:
-		mutex_lock(&real_tty->termios_mutex);
-		ret = put_user(C_CLOCAL(real_tty) ? 1 : 0,
+		copy_termios(real_tty, &kterm);
+		ret = put_user((kterm.c_cflag & CLOCAL) ? 1 : 0,
 						(int __user *)arg);
-		mutex_unlock(&real_tty->termios_mutex);
 		return ret;
 	case TIOCSSOFTCAR:
 		if (get_user(arg, (unsigned int __user *) arg))
-- 
GitLab


From 93d5581e20600593ec3236921b6620225fb76034 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:03:55 +0100
Subject: [PATCH 2032/2198] devpts: unregister the file system on error

Closes-bug: http://bugzilla.kernel.org/show_bug.cgi?id=13429

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/devpts/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c68edb969441c..9b1d285f9fe6e 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -557,8 +557,10 @@ static int __init init_devpts_fs(void)
 	int err = register_filesystem(&devpts_fs_type);
 	if (!err) {
 		devpts_mnt = kern_mount(&devpts_fs_type);
-		if (IS_ERR(devpts_mnt))
+		if (IS_ERR(devpts_mnt)) {
 			err = PTR_ERR(devpts_mnt);
+			unregister_filesystem(&devpts_fs_type);
+		}
 	}
 	return err;
 }
-- 
GitLab


From e960bf73ddd73714bcfbadb1717e53ecda9924cb Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:04:57 +0100
Subject: [PATCH 2033/2198] tty: Add URL for ttydev queue

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 MAINTAINERS | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 1979167c16dc0..84285b5ba359c 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -71,7 +71,7 @@ P: Person
 M: Mail patches to
 L: Mailing list that is relevant to this area
 W: Web-page with status/info
-T: SCM tree type and location.  Type is one of: git, hg, quilt.
+T: SCM tree type and location.  Type is one of: git, hg, quilt, stgit.
 S: Status, one of the following:
 
 	Supported:	Someone is actually paid to look after this.
@@ -5630,6 +5630,7 @@ P:	Alan Cox
 M:	alan@lxorguk.ukuu.org.uk
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
+T:	stgit http://zeniv.linux.org.uk/~alan/ttydev/
 
 TULIP NETWORK DRIVERS
 P:	Grant Grundler
-- 
GitLab


From 34aec591847c696339189b070cce2a11f901cfea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Richard=20R=C3=B6jfors?=
 <richard.rojfors.ext@mocean-labs.com>
Date: Thu, 11 Jun 2009 14:05:39 +0100
Subject: [PATCH 2034/2198] serial: Added Timberdale UART driver
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Driver for the UART found in the Timberdale FPGA

Signed-off-by: Richard Röjfors <richard.rojfors.ext@mocean-labs.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/Kconfig      |   7 +
 drivers/serial/Makefile     |   1 +
 drivers/serial/timbuart.c   | 520 ++++++++++++++++++++++++++++++++++++
 drivers/serial/timbuart.h   |  58 ++++
 include/linux/serial_core.h |   3 +
 5 files changed, 589 insertions(+)
 create mode 100644 drivers/serial/timbuart.c
 create mode 100644 drivers/serial/timbuart.h

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 343e3a35b6a37..3391ef0d761da 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1433,4 +1433,11 @@ config SPORT_BAUD_RATE
 	default 19200 if (SERIAL_SPORT_BAUD_RATE_19200)
 	default 9600 if (SERIAL_SPORT_BAUD_RATE_9600)
 
+config SERIAL_TIMBERDALE
+	tristate "Support for timberdale UART"
+	depends on MFD_TIMBERDALE
+	select SERIAL_CORE
+	---help---
+	Add support for UART controller on timberdale.
+
 endmenu
diff --git a/drivers/serial/Makefile b/drivers/serial/Makefile
index d438eb2a73def..45a8658f54d51 100644
--- a/drivers/serial/Makefile
+++ b/drivers/serial/Makefile
@@ -77,3 +77,4 @@ obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o
 obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o
 obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o
 obj-$(CONFIG_SERIAL_QE) += ucc_uart.o
+obj-$(CONFIG_SERIAL_TIMBERDALE)	+= timbuart.o
diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
new file mode 100644
index 0000000000000..30ba3c4cc180a
--- /dev/null
+++ b/drivers/serial/timbuart.c
@@ -0,0 +1,520 @@
+/*
+ * timbuart.c timberdale FPGA UART driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA UART
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/serial_core.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/ioport.h>
+
+#include "timbuart.h"
+
+struct timbuart_port {
+	struct uart_port	port;
+	struct tasklet_struct	tasklet;
+	int			usedma;
+	u8			last_ier;
+	struct platform_device  *dev;
+};
+
+static int baudrates[] = {9600, 19200, 38400, 57600, 115200, 230400, 460800,
+	921600, 1843200, 3250000};
+
+static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier);
+
+static irqreturn_t timbuart_handleinterrupt(int irq, void *devid);
+
+static void timbuart_stop_rx(struct uart_port *port)
+{
+	/* spin lock held by upper layer, disable all RX interrupts */
+	u8 ier = ioread8(port->membase + TIMBUART_IER) & ~RXFLAGS;
+	iowrite8(ier, port->membase + TIMBUART_IER);
+}
+
+static void timbuart_stop_tx(struct uart_port *port)
+{
+	/* spinlock held by upper layer, disable TX interrupt */
+	u8 ier = ioread8(port->membase + TIMBUART_IER) & ~TXBAE;
+	iowrite8(ier, port->membase + TIMBUART_IER);
+}
+
+static void timbuart_start_tx(struct uart_port *port)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+
+	/* do not transfer anything here -> fire off the tasklet */
+	tasklet_schedule(&uart->tasklet);
+}
+
+static void timbuart_flush_buffer(struct uart_port *port)
+{
+	u8 ctl = ioread8(port->membase + TIMBUART_CTRL) | TIMBUART_CTRL_FLSHTX;
+
+	iowrite8(ctl, port->membase + TIMBUART_CTRL);
+	iowrite8(TXBF, port->membase + TIMBUART_ISR);
+}
+
+static void timbuart_rx_chars(struct uart_port *port)
+{
+	struct tty_struct *tty = port->info->port.tty;
+
+	while (ioread8(port->membase + TIMBUART_ISR) & RXDP) {
+		u8 ch = ioread8(port->membase + TIMBUART_RXFIFO);
+		port->icount.rx++;
+		tty_insert_flip_char(tty, ch, TTY_NORMAL);
+	}
+
+	spin_unlock(&port->lock);
+	tty_flip_buffer_push(port->info->port.tty);
+	spin_lock(&port->lock);
+
+	dev_dbg(port->dev, "%s - total read %d bytes\n",
+		__func__, port->icount.rx);
+}
+
+static void timbuart_tx_chars(struct uart_port *port)
+{
+	struct circ_buf *xmit = &port->info->xmit;
+
+	while (!(ioread8(port->membase + TIMBUART_ISR) & TXBF) &&
+		!uart_circ_empty(xmit)) {
+		iowrite8(xmit->buf[xmit->tail],
+			port->membase + TIMBUART_TXFIFO);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+		port->icount.tx++;
+	}
+
+	dev_dbg(port->dev,
+		"%s - total written %d bytes, CTL: %x, RTS: %x, baud: %x\n",
+		 __func__,
+		port->icount.tx,
+		ioread8(port->membase + TIMBUART_CTRL),
+		port->mctrl & TIOCM_RTS,
+		ioread8(port->membase + TIMBUART_BAUDRATE));
+}
+
+static void timbuart_handle_tx_port(struct uart_port *port, u8 isr, u8 *ier)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+	struct circ_buf *xmit = &port->info->xmit;
+
+	if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+		return;
+
+	if (port->x_char)
+		return;
+
+	if (isr & TXFLAGS) {
+		timbuart_tx_chars(port);
+		/* clear all TX interrupts */
+		iowrite8(TXFLAGS, port->membase + TIMBUART_ISR);
+
+		if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+			uart_write_wakeup(port);
+	} else
+		/* Re-enable any tx interrupt */
+		*ier |= uart->last_ier & TXFLAGS;
+
+	/* enable interrupts if there are chars in the transmit buffer,
+	 * Or if we delivered some bytes and want the almost empty interrupt
+	 * we wake up the upper layer later when we got the interrupt
+	 * to give it some time to go out...
+	 */
+	if (!uart_circ_empty(xmit))
+		*ier |= TXBAE;
+
+	dev_dbg(port->dev, "%s - leaving\n", __func__);
+}
+
+void timbuart_handle_rx_port(struct uart_port *port, u8 isr, u8 *ier)
+{
+	if (isr & RXFLAGS) {
+		/* Some RX status is set */
+		if (isr & RXBF) {
+			u8 ctl = ioread8(port->membase + TIMBUART_CTRL) |
+				TIMBUART_CTRL_FLSHRX;
+			iowrite8(ctl, port->membase + TIMBUART_CTRL);
+			port->icount.overrun++;
+		} else if (isr & (RXDP))
+			timbuart_rx_chars(port);
+
+		/* ack all RX interrupts */
+		iowrite8(RXFLAGS, port->membase + TIMBUART_ISR);
+	}
+
+	/* always have the RX interrupts enabled */
+	*ier |= RXBAF | RXBF | RXTT;
+
+	dev_dbg(port->dev, "%s - leaving\n", __func__);
+}
+
+void timbuart_tasklet(unsigned long arg)
+{
+	struct timbuart_port *uart = (struct timbuart_port *)arg;
+	u8 isr, ier = 0;
+
+	spin_lock(&uart->port.lock);
+
+	isr = ioread8(uart->port.membase + TIMBUART_ISR);
+	dev_dbg(uart->port.dev, "%s ISR: %x\n", __func__, isr);
+
+	if (!uart->usedma)
+		timbuart_handle_tx_port(&uart->port, isr, &ier);
+
+	timbuart_mctrl_check(&uart->port, isr, &ier);
+
+	if (!uart->usedma)
+		timbuart_handle_rx_port(&uart->port, isr, &ier);
+
+	iowrite8(ier, uart->port.membase + TIMBUART_IER);
+
+	spin_unlock(&uart->port.lock);
+	dev_dbg(uart->port.dev, "%s leaving\n", __func__);
+}
+
+static unsigned int timbuart_tx_empty(struct uart_port *port)
+{
+	u8 isr = ioread8(port->membase + TIMBUART_ISR);
+
+	return (isr & TXBAE) ? TIOCSER_TEMT : 0;
+}
+
+static unsigned int timbuart_get_mctrl(struct uart_port *port)
+{
+	u8 cts = ioread8(port->membase + TIMBUART_CTRL);
+	dev_dbg(port->dev, "%s - cts %x\n", __func__, cts);
+
+	if (cts & TIMBUART_CTRL_CTS)
+		return TIOCM_CTS | TIOCM_DSR | TIOCM_CAR;
+	else
+		return TIOCM_DSR | TIOCM_CAR;
+}
+
+static void timbuart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+	dev_dbg(port->dev, "%s - %x\n", __func__, mctrl);
+
+	if (mctrl & TIOCM_RTS)
+		iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL);
+	else
+		iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL);
+}
+
+static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier)
+{
+	unsigned int cts;
+
+	if (isr & CTS_DELTA) {
+		/* ack */
+		iowrite8(CTS_DELTA, port->membase + TIMBUART_ISR);
+		cts = timbuart_get_mctrl(port);
+		uart_handle_cts_change(port, cts & TIOCM_CTS);
+		wake_up_interruptible(&port->info->delta_msr_wait);
+	}
+
+	*ier |= CTS_DELTA;
+}
+
+static void timbuart_enable_ms(struct uart_port *port)
+{
+	/* N/A */
+}
+
+static void timbuart_break_ctl(struct uart_port *port, int ctl)
+{
+	/* N/A */
+}
+
+static int timbuart_startup(struct uart_port *port)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+
+	dev_dbg(port->dev, "%s\n", __func__);
+
+	iowrite8(TIMBUART_CTRL_FLSHRX, port->membase + TIMBUART_CTRL);
+	iowrite8(0xff, port->membase + TIMBUART_ISR);
+	/* Enable all but TX interrupts */
+	iowrite8(RXBAF | RXBF | RXTT | CTS_DELTA,
+		port->membase + TIMBUART_IER);
+
+	return request_irq(port->irq, timbuart_handleinterrupt, IRQF_SHARED,
+		"timb-uart", uart);
+}
+
+static void timbuart_shutdown(struct uart_port *port)
+{
+	struct timbuart_port *uart =
+		container_of(port, struct timbuart_port, port);
+	dev_dbg(port->dev, "%s\n", __func__);
+	free_irq(port->irq, uart);
+	iowrite8(0, port->membase + TIMBUART_IER);
+}
+
+static int get_bindex(int baud)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(baudrates); i++)
+		if (baud == baudrates[i])
+			return i;
+
+	return -1;
+}
+
+static void timbuart_set_termios(struct uart_port *port,
+	struct ktermios *termios,
+	struct ktermios *old)
+{
+	unsigned int baud;
+	short bindex;
+	unsigned long flags;
+
+	baud = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16);
+	bindex = get_bindex(baud);
+	dev_dbg(port->dev, "%s - bindex %d\n", __func__, bindex);
+
+	if (bindex < 0) {
+		printk(KERN_ALERT "timbuart: Unsupported baud rate\n");
+	} else {
+		spin_lock_irqsave(&port->lock, flags);
+		iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE);
+		uart_update_timeout(port, termios->c_cflag, baud);
+		spin_unlock_irqrestore(&port->lock, flags);
+	}
+}
+
+static const char *timbuart_type(struct uart_port *port)
+{
+	return port->type == PORT_UNKNOWN ? "timbuart" : NULL;
+}
+
+/* We do not request/release mappings of the registers here,
+ * currently it's done in the proble function.
+ */
+static void timbuart_release_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	int size =
+		resource_size(platform_get_resource(pdev, IORESOURCE_MEM, 0));
+
+	if (port->flags & UPF_IOREMAP) {
+		iounmap(port->membase);
+		port->membase = NULL;
+	}
+
+	release_mem_region(port->mapbase, size);
+}
+
+static int timbuart_request_port(struct uart_port *port)
+{
+	struct platform_device *pdev = to_platform_device(port->dev);
+	int size =
+		resource_size(platform_get_resource(pdev, IORESOURCE_MEM, 0));
+
+	if (!request_mem_region(port->mapbase, size, "timb-uart"))
+		return -EBUSY;
+
+	if (port->flags & UPF_IOREMAP) {
+		port->membase = ioremap(port->mapbase, size);
+		if (port->membase == NULL) {
+			release_mem_region(port->mapbase, size);
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+static irqreturn_t timbuart_handleinterrupt(int irq, void *devid)
+{
+	struct timbuart_port *uart = (struct timbuart_port *)devid;
+
+	if (ioread8(uart->port.membase + TIMBUART_IPR)) {
+		uart->last_ier = ioread8(uart->port.membase + TIMBUART_IER);
+
+		/* disable interrupts, the tasklet enables them again */
+		iowrite8(0, uart->port.membase + TIMBUART_IER);
+
+		/* fire off bottom half */
+		tasklet_schedule(&uart->tasklet);
+
+		return IRQ_HANDLED;
+	} else
+		return IRQ_NONE;
+}
+
+/*
+ * Configure/autoconfigure the port.
+ */
+static void timbuart_config_port(struct uart_port *port, int flags)
+{
+	if (flags & UART_CONFIG_TYPE) {
+		port->type = PORT_TIMBUART;
+		timbuart_request_port(port);
+	}
+}
+
+static int timbuart_verify_port(struct uart_port *port,
+	struct serial_struct *ser)
+{
+	/* we don't want the core code to modify any port params */
+	return -EINVAL;
+}
+
+static struct uart_ops timbuart_ops = {
+	.tx_empty = timbuart_tx_empty,
+	.set_mctrl = timbuart_set_mctrl,
+	.get_mctrl = timbuart_get_mctrl,
+	.stop_tx = timbuart_stop_tx,
+	.start_tx = timbuart_start_tx,
+	.flush_buffer = timbuart_flush_buffer,
+	.stop_rx = timbuart_stop_rx,
+	.enable_ms = timbuart_enable_ms,
+	.break_ctl = timbuart_break_ctl,
+	.startup = timbuart_startup,
+	.shutdown = timbuart_shutdown,
+	.set_termios = timbuart_set_termios,
+	.type = timbuart_type,
+	.release_port = timbuart_release_port,
+	.request_port = timbuart_request_port,
+	.config_port = timbuart_config_port,
+	.verify_port = timbuart_verify_port
+};
+
+static struct uart_driver timbuart_driver = {
+	.owner = THIS_MODULE,
+	.driver_name = "timberdale_uart",
+	.dev_name = "ttyTU",
+	.major = TIMBUART_MAJOR,
+	.minor = TIMBUART_MINOR,
+	.nr = 1
+};
+
+static int timbuart_probe(struct platform_device *dev)
+{
+	int err;
+	struct timbuart_port *uart;
+	struct resource *iomem;
+
+	dev_dbg(&dev->dev, "%s\n", __func__);
+
+	uart = kzalloc(sizeof(*uart), GFP_KERNEL);
+	if (!uart) {
+		err = -EINVAL;
+		goto err_mem;
+	}
+
+	uart->usedma = 0;
+
+	uart->port.uartclk = 3250000 * 16;
+	uart->port.fifosize  = TIMBUART_FIFO_SIZE;
+	uart->port.regshift  = 2;
+	uart->port.iotype  = UPIO_MEM;
+	uart->port.ops = &timbuart_ops;
+	uart->port.irq = 0;
+	uart->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP;
+	uart->port.line  = 0;
+	uart->port.dev	= &dev->dev;
+
+	iomem = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!iomem) {
+		err = -ENOMEM;
+		goto err_register;
+	}
+	uart->port.mapbase = iomem->start;
+	uart->port.membase = NULL;
+
+	uart->port.irq = platform_get_irq(dev, 0);
+	if (uart->port.irq < 0) {
+		err = -EINVAL;
+		goto err_register;
+	}
+
+	tasklet_init(&uart->tasklet, timbuart_tasklet, (unsigned long)uart);
+
+	err = uart_register_driver(&timbuart_driver);
+	if (err)
+		goto err_register;
+
+	err = uart_add_one_port(&timbuart_driver, &uart->port);
+	if (err)
+		goto err_add_port;
+
+	platform_set_drvdata(dev, uart);
+
+	return 0;
+
+err_add_port:
+	uart_unregister_driver(&timbuart_driver);
+err_register:
+	kfree(uart);
+err_mem:
+	printk(KERN_ERR "timberdale: Failed to register Timberdale UART: %d\n",
+		err);
+
+	return err;
+}
+
+static int timbuart_remove(struct platform_device *dev)
+{
+	struct timbuart_port *uart = platform_get_drvdata(dev);
+
+	tasklet_kill(&uart->tasklet);
+	uart_remove_one_port(&timbuart_driver, &uart->port);
+	uart_unregister_driver(&timbuart_driver);
+	kfree(uart);
+
+	return 0;
+}
+
+static struct platform_driver timbuart_platform_driver = {
+	.driver = {
+		.name	= "timb-uart",
+		.owner	= THIS_MODULE,
+	},
+	.probe		= timbuart_probe,
+	.remove		= timbuart_remove,
+};
+
+/*--------------------------------------------------------------------------*/
+
+static int __init timbuart_init(void)
+{
+	return platform_driver_register(&timbuart_platform_driver);
+}
+
+static void __exit timbuart_exit(void)
+{
+	platform_driver_unregister(&timbuart_platform_driver);
+}
+
+module_init(timbuart_init);
+module_exit(timbuart_exit);
+
+MODULE_DESCRIPTION("Timberdale UART driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:timb-uart");
+
diff --git a/drivers/serial/timbuart.h b/drivers/serial/timbuart.h
new file mode 100644
index 0000000000000..7e566766bc439
--- /dev/null
+++ b/drivers/serial/timbuart.h
@@ -0,0 +1,58 @@
+/*
+ * timbuart.c timberdale FPGA GPIO driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA UART
+ */
+
+#ifndef _TIMBUART_H
+#define _TIMBUART_H
+
+#define TIMBUART_FIFO_SIZE	2048
+
+#define TIMBUART_RXFIFO		0x08
+#define TIMBUART_TXFIFO		0x0c
+#define TIMBUART_IER		0x10
+#define TIMBUART_IPR		0x14
+#define TIMBUART_ISR		0x18
+#define TIMBUART_CTRL		0x1c
+#define TIMBUART_BAUDRATE	0x20
+
+#define TIMBUART_CTRL_RTS	0x01
+#define TIMBUART_CTRL_CTS	0x02
+#define TIMBUART_CTRL_FLSHTX	0x40
+#define TIMBUART_CTRL_FLSHRX	0x80
+
+#define TXBF		0x01
+#define TXBAE		0x02
+#define CTS_DELTA	0x04
+#define RXDP		0x08
+#define RXBAF		0x10
+#define RXBF		0x20
+#define RXTT		0x40
+#define RXBNAE		0x80
+#define TXBE		0x100
+
+#define RXFLAGS (RXDP | RXBAF | RXBF | RXTT | RXBNAE)
+#define TXFLAGS (TXBF | TXBAE)
+
+#define TIMBUART_MAJOR 204
+#define TIMBUART_MINOR 192
+
+#endif /* _TIMBUART_H */
+
diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h
index 48766ea845cfb..6fd80c4243f1b 100644
--- a/include/linux/serial_core.h
+++ b/include/linux/serial_core.h
@@ -168,6 +168,9 @@
 /* MAX3100 */
 #define PORT_MAX3100    86
 
+/* Timberdale UART */
+#define PORT_TIMBUART	87
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
-- 
GitLab


From 7d55deaf50182c47c1e805dc8cc85f2769f0673e Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:27:13 +0100
Subject: [PATCH 2035/2198] timbuart: Fix the termios logic

The driver only handles speeds but it fails to return the current values
for the hardware features it does not support.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/timbuart.c | 24 +++++++++++++++---------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
index 30ba3c4cc180a..ac9e5d5f742ef 100644
--- a/drivers/serial/timbuart.c
+++ b/drivers/serial/timbuart.c
@@ -278,7 +278,7 @@ static int get_bindex(int baud)
 	int i;
 
 	for (i = 0; i < ARRAY_SIZE(baudrates); i++)
-		if (baud == baudrates[i])
+		if (baud <= baudrates[i])
 			return i;
 
 	return -1;
@@ -296,14 +296,20 @@ static void timbuart_set_termios(struct uart_port *port,
 	bindex = get_bindex(baud);
 	dev_dbg(port->dev, "%s - bindex %d\n", __func__, bindex);
 
-	if (bindex < 0) {
-		printk(KERN_ALERT "timbuart: Unsupported baud rate\n");
-	} else {
-		spin_lock_irqsave(&port->lock, flags);
-		iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE);
-		uart_update_timeout(port, termios->c_cflag, baud);
-		spin_unlock_irqrestore(&port->lock, flags);
-	}
+	if (bindex < 0)
+		bindex = 0;
+	baud = baudrates[bindex];
+
+	/* The serial layer calls into this once with old = NULL when setting
+	   up initially */
+	if (old)
+		tty_termios_copy_hw(termios, old);
+	tty_termios_encode_baud_rate(termios, baud, baud);
+
+	spin_lock_irqsave(&port->lock, flags);
+	iowrite8((u8)bindex, port->membase + TIMBUART_BAUDRATE);
+	uart_update_timeout(port, termios->c_cflag, baud);
+	spin_unlock_irqrestore(&port->lock, flags);
 }
 
 static const char *timbuart_type(struct uart_port *port)
-- 
GitLab


From 00b040deca907a113f5bef67a6cc7a4f65a5ace9 Mon Sep 17 00:00:00 2001
From: Alan Cox <alan@linux.intel.com>
Date: Thu, 11 Jun 2009 14:29:29 +0100
Subject: [PATCH 2036/2198] tty: resolve some sierra breakage

The various merges into the sierra driver inadvertently undid
commit 212b8f0c3f5a2280bfa1d6ab13a6fe98552becaa by Elina Pasheva
<epasheva@sierrawireless.com>. Put it back so the OBEX port works again.

Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/sierra.c | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 1319b8968d82e..222a6230276b4 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -257,8 +257,19 @@ static int sierra_send_setup(struct usb_serial_port *port)
 		val |= 0x02;
 
 	/* If composite device then properly report interface */
-	if (serial->num_ports == 1)
+	if (serial->num_ports == 1) {
 		interface = sierra_calc_interface(serial);
+		/* Control message is sent only to interfaces with
+		 * interrupt_in endpoints
+		 */
+		if (port->interrupt_in_urb) {
+			/* send control message */
+			return usb_control_msg(serial->dev,
+				usb_rcvctrlpipe(serial->dev, 0),
+				0x22, 0x21, val, interface,
+				NULL, 0, USB_CTRL_SET_TIMEOUT);
+		}
+	}
 
 	/* Otherwise the need to do non-composite mapping */
 	else {
@@ -268,11 +279,11 @@ static int sierra_send_setup(struct usb_serial_port *port)
 			interface = 1;
 		else if (port->bulk_out_endpointAddress == 5)
 			interface = 2;
-	}
-	return usb_control_msg(serial->dev,
+		return usb_control_msg(serial->dev,
 			usb_rcvctrlpipe(serial->dev, 0),
 			0x22, 0x21, val, interface,
 			NULL, 0, USB_CTRL_SET_TIMEOUT);
+	}
 	return 0;
 }
 
-- 
GitLab


From b9a44bc19f48fd82b8f411500a9bb0ea4153d23c Mon Sep 17 00:00:00 2001
From: Elina Pasheva <epasheva@sierrawireless.com>
Date: Thu, 11 Jun 2009 14:30:21 +0100
Subject: [PATCH 2037/2198] sierra: driver urb handling improvements

[Folded from eight patches into one as the original set according to the
 author "All of the patches need to be applied to obtain a working product"
 so keeping them split seems unhelpful

 Merge fixes done versus other conflicting changes and moved the
 spin_lock_init from open to setup time -- Alan]

Summary of the changes and code re-organization in this patch:

- The memory for urbs is allocated and urbs are submitted only for the active
  interfaces (instead of pre-allocating these for all interfaces). This will
  save memory especially in the case of using composite devices.
- The code has been re-organized and functionality has been extracted from
  sierra_startup(), sierra_shutdown(), sierra_open(), sierra_close() and added
  in helper functions sierra_release_urb(), sierra_stop_rx_urbs(),
  sierra_submit_rx_urbs() and sierra_setup_urb()

- Added function sierra_release_urb() to free an urb and its transfer
buffer.
- Removed unecessary include file reference and comment.
- Added function sierra_stop_rx_urbs() that takes care of the release of
receive and interrupt urbs. This function is to be called by sierra_close()
whenever an interface is de-activated.
- Added new function sierra_submit_rx_urbs() that handles the submission of
receive urbs and interrupt urbs (if any) during the interface activation.
This function is to be called by sierra_open(). Added a second parameter to
pass the memory allocation (as suggested by Oliver Neukum) so that this
function can be used in post_reset() and resume().
- Added new function sierra_setup_urb() that contains the functionality to
allocate an urb, fill bulk urb using the supplied memory allocation flag
and release urb upon error. Added parameter so that the caller pass the
memory allocation flag for flexibility.
- Moved sierra_close() before sierra_open() to resolve dependencies
introduced by the code reorganization.
- Modified sierra_close() to call sierra_stop_rx_urbs() and
sierra_release_urb() functions added in previous patch.
- Modified sierra_open() to call sierra_setup_urb() and sierra_submit_rx_urbs()
functions; note urbs are allocated and submitted for each activated interface.
- Modified sierra_startup() so that allocation of urbs happens whenever an
interface is activated (urb allocation is moved to sierra_open()).
- Modified sierra_shutdown() so that urbs are freed whenever an interface is
de-activated (urb freeing moved to sierra_close() as shown in previous patch
from the series)
- Removed unecessary data structure from sierra_port_private_data
- Suppress an entry in logs by not re-submitting an urb when usb_submit_urb()
returns -EPERM, as this shows that usb_kill_urb() is running (as suggested by
Oliver Neukum)

Signed-off-by: Elina Pasheva <epasheva@sierrawireless.com>
Signed-off-by: Alan Cox <alan.cox@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/sierra.c | 228 +++++++++++++++++++++++-------------
 1 file changed, 144 insertions(+), 84 deletions(-)

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 222a6230276b4..88ec7bfd9d354 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -26,12 +26,10 @@
 #include <linux/module.h>
 #include <linux/usb.h>
 #include <linux/usb/serial.h>
-#include <linux/usb/ch9.h>
 
 #define SWIMS_USB_REQUEST_SetPower	0x00
 #define SWIMS_USB_REQUEST_SetNmea	0x07
 
-/* per port private data */
 #define N_IN_URB	4
 #define N_OUT_URB	4
 #define IN_BUFLEN	4096
@@ -229,7 +227,6 @@ struct sierra_port_private {
 
 	/* Input endpoints and buffers for this port */
 	struct urb *in_urbs[N_IN_URB];
-	char *in_buffer[N_IN_URB];
 
 	/* Settings for the port */
 	int rts_state;	/* Handshaking pins (outputs) */
@@ -334,6 +331,17 @@ static int sierra_tiocmset(struct tty_struct *tty, struct file *file,
 	return sierra_send_setup(port);
 }
 
+static void sierra_release_urb(struct urb *urb)
+{
+	struct usb_serial_port *port;
+	if (urb) {
+		port =  urb->context;
+		dev_dbg(&port->dev, "%s: %p\n", __func__, urb);
+		kfree(urb->transfer_buffer);
+		usb_free_urb(urb);
+	}
+}
+
 static void sierra_outdat_callback(struct urb *urb)
 {
 	struct usb_serial_port *port = urb->context;
@@ -458,7 +466,7 @@ static void sierra_indat_callback(struct urb *urb)
 				" received", __func__);
 
 		/* Resubmit urb so we continue receiving */
-		if (port->port.count && status != -ESHUTDOWN) {
+		if (port->port.count && status != -ESHUTDOWN && status != -EPERM) {
 			err = usb_submit_urb(urb, GFP_ATOMIC);
 			if (err)
 				dev_err(&port->dev, "resubmit read urb failed."
@@ -550,100 +558,184 @@ static int sierra_write_room(struct tty_struct *tty)
 	return 2048;
 }
 
-static int sierra_open(struct tty_struct *tty,
-			struct usb_serial_port *port, struct file *filp)
+static void sierra_stop_rx_urbs(struct usb_serial_port *port)
 {
-	struct sierra_port_private *portdata;
-	struct usb_serial *serial = port->serial;
 	int i;
-	struct urb *urb;
-	int result;
+	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 
-	portdata = usb_get_serial_port_data(port);
+	for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++)
+		usb_kill_urb(portdata->in_urbs[i]);
 
-	dev_dbg(&port->dev, "%s", __func__);
+	usb_kill_urb(port->interrupt_in_urb);
+}
 
-	/* Set some sane defaults */
-	portdata->rts_state = 1;
-	portdata->dtr_state = 1;
+static int sierra_submit_rx_urbs(struct usb_serial_port *port, gfp_t mem_flags)
+{
+	int ok_cnt;
+	int err = -EINVAL;
+	int i;
+	struct urb *urb;
+	struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 
-	/* Reset low level data toggle and start reading from endpoints */
-	for (i = 0; i < N_IN_URB; i++) {
+	ok_cnt = 0;
+	for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++) {
 		urb = portdata->in_urbs[i];
 		if (!urb)
 			continue;
-		if (urb->dev != serial->dev) {
-			dev_dbg(&port->dev, "%s: dev %p != %p",
-				 __func__, urb->dev, serial->dev);
-			continue;
+		err = usb_submit_urb(urb, mem_flags);
+		if (err) {
+			dev_err(&port->dev, "%s: submit urb failed: %d\n",
+				__func__, err);
+		} else {
+			ok_cnt++;
 		}
+	}
 
-		/*
-		 * make sure endpoint data toggle is synchronized with the
-		 * device
-		 */
-		usb_clear_halt(urb->dev, urb->pipe);
-
-		result = usb_submit_urb(urb, GFP_KERNEL);
-		if (result) {
-			dev_err(&port->dev, "submit urb %d failed (%d) %d\n",
-				i, result, urb->transfer_buffer_length);
+	if (ok_cnt && port->interrupt_in_urb) {
+		err = usb_submit_urb(port->interrupt_in_urb, mem_flags);
+		if (err) {
+			dev_err(&port->dev, "%s: submit intr urb failed: %d\n",
+				__func__, err);
 		}
 	}
 
-	sierra_send_setup(port);
+	if (ok_cnt > 0) /* at least one rx urb submitted */
+		return 0;
+	else
+		return err;
+}
+
+static struct urb *sierra_setup_urb(struct usb_serial *serial, int endpoint,
+					int dir, void *ctx, int len,
+					gfp_t mem_flags,
+					usb_complete_t callback)
+{
+	struct urb	*urb;
+	u8		*buf;
+
+	if (endpoint == -1)
+		return NULL;
 
-	/* start up the interrupt endpoint if we have one */
-	if (port->interrupt_in_urb) {
-		result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
-		if (result)
-			dev_err(&port->dev, "submit irq_in urb failed %d\n",
-				result);
+	urb = usb_alloc_urb(0, mem_flags);
+	if (urb == NULL) {
+		dev_dbg(&serial->dev->dev, "%s: alloc for endpoint %d failed\n",
+			__func__, endpoint);
+		return NULL;
 	}
-	return 0;
+
+	buf = kmalloc(len, mem_flags);
+	if (buf) {
+		/* Fill URB using supplied data */
+		usb_fill_bulk_urb(urb, serial->dev,
+			usb_sndbulkpipe(serial->dev, endpoint) | dir,
+			buf, len, callback, ctx);
+
+		/* debug */
+		dev_dbg(&serial->dev->dev, "%s %c u : %p d:%p\n", __func__,
+				dir == USB_DIR_IN ? 'i' : 'o', urb, buf);
+	} else {
+		dev_dbg(&serial->dev->dev, "%s %c u:%p d:%p\n", __func__,
+				dir == USB_DIR_IN ? 'i' : 'o', urb, buf);
+
+		sierra_release_urb(urb);
+		urb = NULL;
+	}
+
+	return urb;
 }
 
-static void sierra_dtr_rts(struct usb_serial_port *port, int on)
+static void sierra_close(struct usb_serial_port *port)
 {
+	int i;
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 
+	dev_dbg(&port->dev, "%s\n", __func__);
 	portdata = usb_get_serial_port_data(port);
-	portdata->rts_state = on;
-	portdata->dtr_state = on;
+
+	portdata->rts_state = 0;
+	portdata->dtr_state = 0;
 
 	if (serial->dev) {
 		mutex_lock(&serial->disc_mutex);
 		if (!serial->disconnected)
 			sierra_send_setup(port);
 		mutex_unlock(&serial->disc_mutex);
+
+		/* Stop reading urbs */
+		sierra_stop_rx_urbs(port);
+		/* .. and release them */
+		for (i = 0; i < N_IN_URB; i++) {
+			sierra_release_urb(portdata->in_urbs[i]);
+			portdata->in_urbs[i] = NULL;
+		}
 	}
 }
 
-static void sierra_close(struct usb_serial_port *port)
+static int sierra_open(struct tty_struct *tty,
+			struct usb_serial_port *port, struct file *filp)
 {
+	struct sierra_port_private *portdata;
+	struct usb_serial *serial = port->serial;
 	int i;
+	int err;
+	int endpoint;
+	struct urb *urb;
+
+	portdata = usb_get_serial_port_data(port);
+
+	dev_dbg(&port->dev, "%s", __func__);
+
+	/* Set some sane defaults */
+	portdata->rts_state = 1;
+	portdata->dtr_state = 1;
+
+
+	endpoint = port->bulk_in_endpointAddress;
+	for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++) {
+		urb = sierra_setup_urb(serial, endpoint, USB_DIR_IN, port,
+					IN_BUFLEN, GFP_KERNEL,
+					sierra_indat_callback);
+		portdata->in_urbs[i] = urb;
+	}
+	/* clear halt condition */
+	usb_clear_halt(serial->dev,
+			usb_sndbulkpipe(serial->dev, endpoint) | USB_DIR_IN);
+
+	err = sierra_submit_rx_urbs(port, GFP_KERNEL);
+	if (err) {
+		/* get rid of everything as in close */
+		sierra_close(port);
+		return err;
+	}
+	sierra_send_setup(port);
+
+	return 0;
+}
+
+
+static void sierra_dtr_rts(struct usb_serial_port *port, int on)
+{
 	struct usb_serial *serial = port->serial;
 	struct sierra_port_private *portdata;
 
-	dev_dbg(&port->dev, "%s", __func__);
 	portdata = usb_get_serial_port_data(port);
+	portdata->rts_state = on;
+	portdata->dtr_state = on;
 
 	if (serial->dev) {
-		/* Stop reading/writing urbs */
-		for (i = 0; i < N_IN_URB; i++)
-			usb_kill_urb(portdata->in_urbs[i]);
+		mutex_lock(&serial->disc_mutex);
+		if (!serial->disconnected)
+			sierra_send_setup(port);
+		mutex_unlock(&serial->disc_mutex);
 	}
-	usb_kill_urb(port->interrupt_in_urb);
 }
 
 static int sierra_startup(struct usb_serial *serial)
 {
 	struct usb_serial_port *port;
 	struct sierra_port_private *portdata;
-	struct urb *urb;
 	int i;
-	int j;
 
 	dev_dbg(&serial->dev->dev, "%s", __func__);
 
@@ -665,34 +757,8 @@ static int sierra_startup(struct usb_serial *serial)
 			return -ENOMEM;
 		}
 		spin_lock_init(&portdata->lock);
-		for (j = 0; j < N_IN_URB; j++) {
-			portdata->in_buffer[j] = kmalloc(IN_BUFLEN, GFP_KERNEL);
-			if (!portdata->in_buffer[j]) {
-				for (--j; j >= 0; j--)
-					kfree(portdata->in_buffer[j]);
-				kfree(portdata);
-				return -ENOMEM;
-			}
-		}
-
+		/* Set the port private data pointer */
 		usb_set_serial_port_data(port, portdata);
-
-		/* initialize the in urbs */
-		for (j = 0; j < N_IN_URB; ++j) {
-			urb = usb_alloc_urb(0, GFP_KERNEL);
-			if (urb == NULL) {
-				dev_dbg(&port->dev, "%s: alloc for in "
-					"port failed.", __func__);
-				continue;
-			}
-			/* Fill URB using supplied data. */
-			usb_fill_bulk_urb(urb, serial->dev,
-					  usb_rcvbulkpipe(serial->dev,
-						port->bulk_in_endpointAddress),
-					  portdata->in_buffer[j], IN_BUFLEN,
-					  sierra_indat_callback, port);
-			portdata->in_urbs[j] = urb;
-		}
 	}
 
 	return 0;
@@ -700,7 +766,7 @@ static int sierra_startup(struct usb_serial *serial)
 
 static void sierra_shutdown(struct usb_serial *serial)
 {
-	int i, j;
+	int i;
 	struct usb_serial_port *port;
 	struct sierra_port_private *portdata;
 
@@ -713,12 +779,6 @@ static void sierra_shutdown(struct usb_serial *serial)
 		portdata = usb_get_serial_port_data(port);
 		if (!portdata)
 			continue;
-
-		for (j = 0; j < N_IN_URB; j++) {
-			usb_kill_urb(portdata->in_urbs[j]);
-			usb_free_urb(portdata->in_urbs[j]);
-			kfree(portdata->in_buffer[j]);
-		}
 		kfree(portdata);
 		usb_set_serial_port_data(port, NULL);
 	}
-- 
GitLab


From 4db2299da213d1ba8cf7f4c0a197ae7ba49db5cb Mon Sep 17 00:00:00 2001
From: Elina Pasheva <epasheva@sierrawireless.com>
Date: Thu, 11 Jun 2009 14:32:01 +0100
Subject: [PATCH 2038/2198] sierra: driver interface blacklisting

Interface blacklisting is necessary for non-serial interfaces that are handled
by a different driver. The interface blacklisting is implemented in sierra
driver per device. Each device in need of a blacklist has a static information
array kept in the driver. This array contains the interface numbers that are
blacklisted. The pointer for each blacklist array and the length
of that blacklist are 'bundled' in data structure sierra_iface_info. A pointer
to this information is set in id_table when the device is added to the id_table.

The following is summary of changes we have made to sierra.c driver in
this patch dealing with interface blacklisting support:
- Added data structure sierra_iface_info and function is_blacklisted()
to support blacklisting
- Modified sierra_probe() to handle blacklisted interfaces accordingly
- Improved comments in id_table
- Added new device in id_table with blacklist interface support

Signed-off-by: Elina Pasheva <epasheva@sierrawireless.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/usb/serial/sierra.c | 51 ++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 3 deletions(-)

diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c
index 88ec7bfd9d354..17ac34f4d6682 100644
--- a/drivers/usb/serial/sierra.c
+++ b/drivers/usb/serial/sierra.c
@@ -37,6 +37,12 @@
 static int debug;
 static int nmea;
 
+/* Used in interface blacklisting */
+struct sierra_iface_info {
+	const u32 infolen;	/* number of interface numbers on blacklist */
+	const u8  *ifaceinfo;	/* pointer to the array holding the numbers */
+};
+
 static int sierra_set_power_state(struct usb_device *udev, __u16 swiState)
 {
 	int result;
@@ -83,6 +89,23 @@ static int sierra_calc_num_ports(struct usb_serial *serial)
 	return result;
 }
 
+static int is_blacklisted(const u8 ifnum,
+				const struct sierra_iface_info *blacklist)
+{
+	const u8  *info;
+	int i;
+
+	if (blacklist) {
+		info = blacklist->ifaceinfo;
+
+		for (i = 0; i < blacklist->infolen; i++) {
+			if (info[i] == ifnum)
+				return 1;
+		}
+	}
+	return 0;
+}
+
 static int sierra_calc_interface(struct usb_serial *serial)
 {
 	int interface;
@@ -151,9 +174,25 @@ static int sierra_probe(struct usb_serial *serial,
 	 */
 	usb_set_serial_data(serial, (void *)num_ports);
 
+	/* ifnum could have changed - by calling usb_set_interface */
+	ifnum = sierra_calc_interface(serial);
+
+	if (is_blacklisted(ifnum,
+				(struct sierra_iface_info *)id->driver_info)) {
+		dev_dbg(&serial->dev->dev,
+			"Ignoring blacklisted interface #%d\n", ifnum);
+		return -ENODEV;
+	}
+
 	return result;
 }
 
+static const u8 direct_ip_non_serial_ifaces[] = { 7, 8, 9, 10, 11 };
+static const struct sierra_iface_info direct_ip_interface_blacklist = {
+	.infolen = ARRAY_SIZE(direct_ip_non_serial_ifaces),
+	.ifaceinfo = direct_ip_non_serial_ifaces,
+};
+
 static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1199, 0x0017) },	/* Sierra Wireless EM5625 */
 	{ USB_DEVICE(0x1199, 0x0018) },	/* Sierra Wireless MC5720 */
@@ -186,9 +225,11 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1199, 0x6833) },	/* Sierra Wireless MC8781 */
 	{ USB_DEVICE(0x1199, 0x683A) },	/* Sierra Wireless MC8785 */
 	{ USB_DEVICE(0x1199, 0x683B) },	/* Sierra Wireless MC8785 Composite */
-	{ USB_DEVICE(0x1199, 0x683C) },	/* Sierra Wireless MC8790 */
-	{ USB_DEVICE(0x1199, 0x683D) },	/* Sierra Wireless MC8790 */
-	{ USB_DEVICE(0x1199, 0x683E) },	/* Sierra Wireless MC8790 */
+	/* Sierra Wireless MC8790, MC8791, MC8792 Composite */
+	{ USB_DEVICE(0x1199, 0x683C) },
+	{ USB_DEVICE(0x1199, 0x683D) },	/* Sierra Wireless MC8791 Composite */
+	/* Sierra Wireless MC8790, MC8791, MC8792 */
+	{ USB_DEVICE(0x1199, 0x683E) },
 	{ USB_DEVICE(0x1199, 0x6850) },	/* Sierra Wireless AirCard 880 */
 	{ USB_DEVICE(0x1199, 0x6851) },	/* Sierra Wireless AirCard 881 */
 	{ USB_DEVICE(0x1199, 0x6852) },	/* Sierra Wireless AirCard 880 E */
@@ -209,6 +250,10 @@ static struct usb_device_id id_table [] = {
 	{ USB_DEVICE(0x1199, 0x0112) }, /* Sierra Wireless AirCard 580 */
 	{ USB_DEVICE(0x0F3D, 0x0112) }, /* Airprime/Sierra PC 5220 */
 
+	{ USB_DEVICE(0x1199, 0x68A3), 	/* Sierra Wireless Direct IP modems */
+	  .driver_info = (kernel_ulong_t)&direct_ip_interface_blacklist
+	},
+
 	{ }
 };
 MODULE_DEVICE_TABLE(usb, id_table);
-- 
GitLab


From 5fc5b42a3bb564f0b6e03f0f1b522ed9100250ad Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 14:32:42 +0100
Subject: [PATCH 2039/2198] tty: remove sleep_on

Use wait_event instead of sleep_on in tty_block_til_ready.

Wait for ASYNC_CLOSING flag being 0.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_port.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 4d08b6d27c28b..931af10307409 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -198,7 +198,8 @@ int tty_port_block_til_ready(struct tty_port *port,
 
 	/* block if port is in the process of being closed */
 	if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-		interruptible_sleep_on(&port->close_wait);
+		wait_event_interruptible(port->close_wait,
+				!(port->flags & ASYNC_CLOSING));
 		if (port->flags & ASYNC_HUP_NOTIFY)
 			return -EAGAIN;
 		else
-- 
GitLab


From 3e3b5c087799e536871c8261b05bc28e4783c8da Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Thu, 11 Jun 2009 14:33:37 +0100
Subject: [PATCH 2040/2198] tty: use prepare/finish_wait

Use prepare_to_wait and finish_wait instead of add_wait_queue and
remove_wait_queue.

This avoids us setting a task state.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_port.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/char/tty_port.c b/drivers/char/tty_port.c
index 931af10307409..62dadfc95e341 100644
--- a/drivers/char/tty_port.c
+++ b/drivers/char/tty_port.c
@@ -222,7 +222,6 @@ int tty_port_block_til_ready(struct tty_port *port,
 	   before the next open may complete */
 
 	retval = 0;
-	add_wait_queue(&port->open_wait, &wait);
 
 	/* The port lock protects the port counts */
 	spin_lock_irqsave(&port->lock, flags);
@@ -236,7 +235,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 		if (tty->termios->c_cflag & CBAUD)
 			tty_port_raise_dtr_rts(port);
 
-		set_current_state(TASK_INTERRUPTIBLE);
+		prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE);
 		/* Check for a hangup or uninitialised port. Return accordingly */
 		if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
 			if (port->flags & ASYNC_HUP_NOTIFY)
@@ -257,8 +256,7 @@ int tty_port_block_til_ready(struct tty_port *port,
 		}
 		schedule();
 	}
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&port->open_wait, &wait);
+	finish_wait(&port->open_wait, &wait);
 
 	/* Update counts. A parallel hangup will have set count to zero and
 	   we must not mess that up further */
-- 
GitLab


From d3810cd4d7064b109574374f73c41b11b481dbf2 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:35:01 +0100
Subject: [PATCH 2041/2198] imx: serial: fix whitespaces (no changes in
 functionality)

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/imx.c | 48 +++++++++++++++++++++-----------------------
 1 file changed, 23 insertions(+), 25 deletions(-)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 5f0be40dfdab7..76c8fa1884ecc 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -302,8 +302,7 @@ static inline void imx_transmit_buffer(struct imx_port *sport)
 		/* send xmit->buf[xmit->tail]
 		 * out the port here */
 		writel(xmit->buf[xmit->tail], sport->port.membase + URTX0);
-		xmit->tail = (xmit->tail + 1) &
-		         (UART_XMIT_SIZE - 1);
+		xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
 		sport->port.icount.tx++;
 		if (uart_circ_empty(xmit))
 			break;
@@ -395,8 +394,7 @@ static irqreturn_t imx_rxint(int irq, void *dev_id)
 				continue;
 		}
 
-		if (uart_handle_sysrq_char
-		            (&sport->port, (unsigned char)rx))
+		if (uart_handle_sysrq_char(&sport->port, (unsigned char)rx))
 			continue;
 
 		if (rx & (URXD_PRERR | URXD_OVRRUN | URXD_FRMERR) ) {
@@ -471,26 +469,26 @@ static unsigned int imx_tx_empty(struct uart_port *port)
  */
 static unsigned int imx_get_mctrl(struct uart_port *port)
 {
-        struct imx_port *sport = (struct imx_port *)port;
-        unsigned int tmp = TIOCM_DSR | TIOCM_CAR;
+	struct imx_port *sport = (struct imx_port *)port;
+	unsigned int tmp = TIOCM_DSR | TIOCM_CAR;
 
-        if (readl(sport->port.membase + USR1) & USR1_RTSS)
-                tmp |= TIOCM_CTS;
+	if (readl(sport->port.membase + USR1) & USR1_RTSS)
+		tmp |= TIOCM_CTS;
 
-        if (readl(sport->port.membase + UCR2) & UCR2_CTS)
-                tmp |= TIOCM_RTS;
+	if (readl(sport->port.membase + UCR2) & UCR2_CTS)
+		tmp |= TIOCM_RTS;
 
-        return tmp;
+	return tmp;
 }
 
 static void imx_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
-        struct imx_port *sport = (struct imx_port *)port;
+	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
 	temp = readl(sport->port.membase + UCR2) & ~UCR2_CTS;
 
-        if (mctrl & TIOCM_RTS)
+	if (mctrl & TIOCM_RTS)
 		temp |= UCR2_CTS;
 
 	writel(temp, sport->port.membase + UCR2);
@@ -1072,22 +1070,22 @@ static struct uart_driver imx_reg = {
 
 static int serial_imx_suspend(struct platform_device *dev, pm_message_t state)
 {
-        struct imx_port *sport = platform_get_drvdata(dev);
+	struct imx_port *sport = platform_get_drvdata(dev);
 
-        if (sport)
-                uart_suspend_port(&imx_reg, &sport->port);
+	if (sport)
+		uart_suspend_port(&imx_reg, &sport->port);
 
-        return 0;
+	return 0;
 }
 
 static int serial_imx_resume(struct platform_device *dev)
 {
-        struct imx_port *sport = platform_get_drvdata(dev);
+	struct imx_port *sport = platform_get_drvdata(dev);
 
-        if (sport)
-                uart_resume_port(&imx_reg, &sport->port);
+	if (sport)
+		uart_resume_port(&imx_reg, &sport->port);
 
-        return 0;
+	return 0;
 }
 
 static int serial_imx_probe(struct platform_device *pdev)
@@ -1143,7 +1141,7 @@ static int serial_imx_probe(struct platform_device *pdev)
 	imx_ports[pdev->id] = sport;
 
 	pdata = pdev->dev.platform_data;
-	if(pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
+	if (pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
 		sport->have_rtscts = 1;
 
 	if (pdata->init) {
@@ -1193,13 +1191,13 @@ static int serial_imx_remove(struct platform_device *pdev)
 }
 
 static struct platform_driver serial_imx_driver = {
-        .probe          = serial_imx_probe,
-        .remove         = serial_imx_remove,
+	.probe		= serial_imx_probe,
+	.remove		= serial_imx_remove,
 
 	.suspend	= serial_imx_suspend,
 	.resume		= serial_imx_resume,
 	.driver		= {
-	        .name	= "imx-uart",
+		.name	= "imx-uart",
 		.owner	= THIS_MODULE,
 	},
 };
-- 
GitLab


From 26bbb3ff1ff6163d6a233055766e26af8054a212 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Thu, 11 Jun 2009 14:36:29 +0100
Subject: [PATCH 2042/2198] imx: serial: fix one bit field type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

"have_rtscts" is assigned 1, while it is declared
int:1, two's complement, which can hold 0 and -1
only. The code works, as the upper bits are cut
off, and tests are done against 0 only.

Nonetheless, correctly declaring the bit field
as unsigned int:1 renders the code more robust.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/imx.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 76c8fa1884ecc..6b8f12f4a7054 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -211,7 +211,7 @@ struct imx_port {
 	struct timer_list	timer;
 	unsigned int		old_status;
 	int			txirq,rxirq,rtsirq;
-	int			have_rtscts:1;
+	unsigned int		have_rtscts:1;
 	struct clk		*clk;
 };
 
-- 
GitLab


From 977757311e50dc5d832c9fef34e7555411f7ccd8 Mon Sep 17 00:00:00 2001
From: Fabian Godehardt <fg@emlix.com>
Date: Thu, 11 Jun 2009 14:37:19 +0100
Subject: [PATCH 2043/2198] imx: serial: notify higher layers in case xmit IRQ
 was not called

upper layers, namely line discipline, need to be notified
when transmission of more data is possible. For spurious
cases, where IRQ handling does not supply notification
for sure, it is given additionally here, when data has just
been transmitted and space in the buffer will most probably
be available.

Signed-off-by: Fabian Godehardt <fg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/imx.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 6b8f12f4a7054..49f2e12ba58d2 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -308,6 +308,9 @@ static inline void imx_transmit_buffer(struct imx_port *sport)
 			break;
 	}
 
+	if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+		uart_write_wakeup(&sport->port);
+
 	if (uart_circ_empty(xmit))
 		imx_stop_tx(&sport->port);
 }
-- 
GitLab


From 2e1463922a35584c863f71d4021e1e71f76eaed0 Mon Sep 17 00:00:00 2001
From: Fabian Godehardt <fg@emlix.com>
Date: Thu, 11 Jun 2009 14:38:38 +0100
Subject: [PATCH 2044/2198] imx: serial: be sure to stop xmit upon shutdown

needed to avoid continued transmission by hardware
while software already shuts down, which might
cause dangling characters to show up in hardware
queues when restarting the device.

Signed-off-by: Fabian Godehardt <fg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/imx.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 49f2e12ba58d2..e6c2ba26dcbb1 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -634,6 +634,10 @@ static void imx_shutdown(struct uart_port *port)
 	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
+	temp = readl(sport->port.membase + UCR2);
+	temp &= ~(UCR2_TXEN);
+	writel(temp, sport->port.membase + UCR2);
+
 	/*
 	 * Stop our timer.
 	 */
-- 
GitLab


From 9f322ad064f9210e7d472dfe77e702274d5c9dba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Gl=C3=B6ckner?= <dg@emlix.com>
Date: Thu, 11 Jun 2009 14:39:21 +0100
Subject: [PATCH 2045/2198] imx: serial: handle initialisation failure
 correctly
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

correctly de-initialise device when setting up failed,
call to pdata->exit() was missing.

Signed-off-by: Daniel Glöckner <dg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/imx.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index e6c2ba26dcbb1..cbd4f32246432 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -1157,10 +1157,15 @@ static int serial_imx_probe(struct platform_device *pdev)
 			goto clkput;
 	}
 
-	uart_add_one_port(&imx_reg, &sport->port);
+	ret = uart_add_one_port(&imx_reg, &sport->port);
+	if (ret)
+		goto deinit;
 	platform_set_drvdata(pdev, &sport->port);
 
 	return 0;
+deinit:
+	if (pdata->exit)
+		pdata->exit(pdev);
 clkput:
 	clk_put(sport->clk);
 	clk_disable(sport->clk);
-- 
GitLab


From 8759ef32d992fc6c0bcbe40fca7aa302190918a5 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:51:15 +0100
Subject: [PATCH 2046/2198] lib: isolate rational fractions helper function

Provide a helper function to determine optimum numerator
denominator value pairs taking into account restricted
register size. Useful especially with PLL and other clock
configurations.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/rational.h | 19 ++++++++++++
 lib/Kconfig              |  3 ++
 lib/Makefile             |  1 +
 lib/rational.c           | 62 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 85 insertions(+)
 create mode 100644 include/linux/rational.h
 create mode 100644 lib/rational.c

diff --git a/include/linux/rational.h b/include/linux/rational.h
new file mode 100644
index 0000000000000..4f532fcd9eeae
--- /dev/null
+++ b/include/linux/rational.h
@@ -0,0 +1,19 @@
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <os@emlix.com>
+ *
+ * helper functions when coping with rational numbers,
+ * e.g. when calculating optimum numerator/denominator pairs for
+ * pll configuration taking into account restricted register size
+ */
+
+#ifndef _LINUX_RATIONAL_H
+#define _LINUX_RATIONAL_H
+
+void rational_best_approximation(
+	unsigned long given_numerator, unsigned long given_denominator,
+	unsigned long max_numerator, unsigned long max_denominator,
+	unsigned long *best_numerator, unsigned long *best_denominator);
+
+#endif /* _LINUX_RATIONAL_H */
diff --git a/lib/Kconfig b/lib/Kconfig
index 8ade0a7a91e09..9960be04cbbe6 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -10,6 +10,9 @@ menu "Library routines"
 config BITREVERSE
 	tristate
 
+config RATIONAL
+	boolean
+
 config GENERIC_FIND_FIRST_BIT
 	bool
 
diff --git a/lib/Makefile b/lib/Makefile
index 33a40e40e3ee4..1f6edefebffe3 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -50,6 +50,7 @@ ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
 endif
 
 obj-$(CONFIG_BITREVERSE) += bitrev.o
+obj-$(CONFIG_RATIONAL)	+= rational.o
 obj-$(CONFIG_CRC_CCITT)	+= crc-ccitt.o
 obj-$(CONFIG_CRC16)	+= crc16.o
 obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
diff --git a/lib/rational.c b/lib/rational.c
new file mode 100644
index 0000000000000..b3c099b5478e3
--- /dev/null
+++ b/lib/rational.c
@@ -0,0 +1,62 @@
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <os@emlix.com>
+ *
+ * helper functions when coping with rational numbers
+ */
+
+#include <linux/rational.h>
+
+/*
+ * calculate best rational approximation for a given fraction
+ * taking into account restricted register size, e.g. to find
+ * appropriate values for a pll with 5 bit denominator and
+ * 8 bit numerator register fields, trying to set up with a
+ * frequency ratio of 3.1415, one would say:
+ *
+ * rational_best_approximation(31415, 10000,
+ *		(1 << 8) - 1, (1 << 5) - 1, &n, &d);
+ *
+ * you may look at given_numerator as a fixed point number,
+ * with the fractional part size described in given_denominator.
+ *
+ * for theoretical background, see:
+ * http://en.wikipedia.org/wiki/Continued_fraction
+ */
+
+void rational_best_approximation(
+	unsigned long given_numerator, unsigned long given_denominator,
+	unsigned long max_numerator, unsigned long max_denominator,
+	unsigned long *best_numerator, unsigned long *best_denominator)
+{
+	unsigned long n, d, n0, d0, n1, d1;
+	n = given_numerator;
+	d = given_denominator;
+	n0 = d1 = 0;
+	n1 = d0 = 1;
+	for (;;) {
+		unsigned long t, a;
+		if ((n1 > max_numerator) || (d1 > max_denominator)) {
+			n1 = n0;
+			d1 = d0;
+			break;
+		}
+		if (d == 0)
+			break;
+		t = d;
+		a = n / d;
+		d = n % d;
+		n = t;
+		t = n0 + a * n1;
+		n0 = n1;
+		n1 = t;
+		t = d0 + a * d1;
+		d0 = d1;
+		d1 = t;
+	}
+	*best_numerator = n1;
+	*best_denominator = d1;
+}
+
+EXPORT_SYMBOL(rational_best_approximation);
-- 
GitLab


From 534fca068ec8063ec8b67626b3eb34ba6ec86967 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:52:23 +0100
Subject: [PATCH 2047/2198] imx: serial: use rational library function

for calculation of numerator and denominator
used in baud rate setting, use generic library function
for optimum settings.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/Kconfig |  1 +
 drivers/serial/imx.c   | 30 ++++++++++--------------------
 2 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 3391ef0d761da..641e800ed6933 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -833,6 +833,7 @@ config SERIAL_IMX
 	bool "IMX serial port support"
 	depends on ARM && (ARCH_IMX || ARCH_MXC)
 	select SERIAL_CORE
+	select RATIONAL
 	help
 	  If you have a machine based on a Motorola IMX CPU you
 	  can enable its onboard serial port by enabling this option.
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index cbd4f32246432..0de81f71f8848 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -41,6 +41,7 @@
 #include <linux/serial_core.h>
 #include <linux/serial.h>
 #include <linux/clk.h>
+#include <linux/rational.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
@@ -670,7 +671,8 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	unsigned long flags;
 	unsigned int ucr2, old_ucr1, old_txrxen, baud, quot;
 	unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8;
-	unsigned int div, num, denom, ufcr;
+	unsigned int div, ufcr;
+	unsigned long num, denom;
 
 	/*
 	 * If we don't support modem control lines, don't allow
@@ -772,32 +774,20 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	if (!div)
 		div = 1;
 
-	num = baud;
-	denom = port->uartclk / div / 16;
+	rational_best_approximation(16 * div * baud, sport->port.uartclk,
+		1 << 16, 1 << 16, &num, &denom);
 
-	/* shift num and denom right until they fit into 16 bits */
-	while (num > 0x10000 || denom > 0x10000) {
-		num >>= 1;
-		denom >>= 1;
-	}
-	if (num > 0)
-		num -= 1;
-	if (denom > 0)
-		denom -= 1;
-
-	writel(num, sport->port.membase + UBIR);
-	writel(denom, sport->port.membase + UBMR);
-
-	if (div == 7)
-		div = 6; /* 6 in RFDIV means divide by 7 */
-	else
-		div = 6 - div;
+	num -= 1;
+	denom -= 1;
 
 	ufcr = readl(sport->port.membase + UFCR);
 	ufcr = (ufcr & (~UFCR_RFDIV)) |
 	    (div << 7);
 	writel(ufcr, sport->port.membase + UFCR);
 
+	writel(num, sport->port.membase + UBIR);
+	writel(denom, sport->port.membase + UBMR);
+
 #ifdef ONEMS
 	writel(sport->port.uartclk / div / 1000, sport->port.membase + ONEMS);
 #endif
-- 
GitLab


From b6e4913834cd032082e5c76dfade61050212dc98 Mon Sep 17 00:00:00 2001
From: Fabian Godehardt <fg@emlix.com>
Date: Thu, 11 Jun 2009 14:53:18 +0100
Subject: [PATCH 2048/2198] imx: serial: add IrDA support to serial driver

Using the iMX serial driver with an IrDA device
needs extra peripheral settings and specific
timing depending on the transmitter circuitry used.

Signed-off-by: Fabian Godehardt <fg@emlix.com>
Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/plat-mxc/include/mach/imx-uart.h |   5 +
 drivers/serial/imx.c                      | 195 +++++++++++++++++++---
 2 files changed, 181 insertions(+), 19 deletions(-)

diff --git a/arch/arm/plat-mxc/include/mach/imx-uart.h b/arch/arm/plat-mxc/include/mach/imx-uart.h
index 599217b2e13f9..f9bd17dd8dd71 100644
--- a/arch/arm/plat-mxc/include/mach/imx-uart.h
+++ b/arch/arm/plat-mxc/include/mach/imx-uart.h
@@ -20,11 +20,16 @@
 #define ASMARM_ARCH_UART_H
 
 #define IMXUART_HAVE_RTSCTS (1<<0)
+#define IMXUART_IRDA        (1<<1)
 
 struct imxuart_platform_data {
 	int (*init)(struct platform_device *pdev);
 	int (*exit)(struct platform_device *pdev);
 	unsigned int flags;
+	void (*irda_enable)(int enable);
+	unsigned int irda_inv_rx:1;
+	unsigned int irda_inv_tx:1;
+	unsigned short transceiver_delay;
 };
 
 #endif
diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 0de81f71f8848..8c79e8c2fd4e7 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -8,6 +8,9 @@
  *  Author: Sascha Hauer <sascha@saschahauer.de>
  *  Copyright (C) 2004 Pengutronix
  *
+ *  Copyright (C) 2009 emlix GmbH
+ *  Author: Fabian Godehardt (added IrDA support for iMX)
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -41,6 +44,7 @@
 #include <linux/serial_core.h>
 #include <linux/serial.h>
 #include <linux/clk.h>
+#include <linux/delay.h>
 #include <linux/rational.h>
 
 #include <asm/io.h>
@@ -149,6 +153,7 @@
 #define  UCR4_DREN  	 (1<<0)  /* Recv data ready interrupt enable */
 #define  UFCR_RXTL_SHF   0       /* Receiver trigger level shift */
 #define  UFCR_RFDIV      (7<<7)  /* Reference freq divider mask */
+#define  UFCR_RFDIV_REG(x)	(((x) < 7 ? 6 - (x) : 6) << 7)
 #define  UFCR_TXTL_SHF   10      /* Transmitter trigger level shift */
 #define  USR1_PARITYERR  (1<<15) /* Parity error interrupt flag */
 #define  USR1_RTSS  	 (1<<14) /* RTS pin status */
@@ -213,9 +218,19 @@ struct imx_port {
 	unsigned int		old_status;
 	int			txirq,rxirq,rtsirq;
 	unsigned int		have_rtscts:1;
+	unsigned int		use_irda:1;
+	unsigned int		irda_inv_rx:1;
+	unsigned int		irda_inv_tx:1;
+	unsigned short		trcv_delay; /* transceiver delay */
 	struct clk		*clk;
 };
 
+#ifdef CONFIG_IRDA
+#define USE_IRDA(sport)	((sport)->use_irda)
+#else
+#define USE_IRDA(sport)	(0)
+#endif
+
 /*
  * Handle any change of modem status signal since we were last called.
  */
@@ -269,6 +284,48 @@ static void imx_stop_tx(struct uart_port *port)
 	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
+	if (USE_IRDA(sport)) {
+		/* half duplex - wait for end of transmission */
+		int n = 256;
+		while ((--n > 0) &&
+		      !(readl(sport->port.membase + USR2) & USR2_TXDC)) {
+			udelay(5);
+			barrier();
+		}
+		/*
+		 * irda transceiver - wait a bit more to avoid
+		 * cutoff, hardware dependent
+		 */
+		udelay(sport->trcv_delay);
+
+		/*
+		 * half duplex - reactivate receive mode,
+		 * flush receive pipe echo crap
+		 */
+		if (readl(sport->port.membase + USR2) & USR2_TXDC) {
+			temp = readl(sport->port.membase + UCR1);
+			temp &= ~(UCR1_TXMPTYEN | UCR1_TRDYEN);
+			writel(temp, sport->port.membase + UCR1);
+
+			temp = readl(sport->port.membase + UCR4);
+			temp &= ~(UCR4_TCEN);
+			writel(temp, sport->port.membase + UCR4);
+
+			while (readl(sport->port.membase + URXD0) &
+			       URXD_CHARRDY)
+				barrier();
+
+			temp = readl(sport->port.membase + UCR1);
+			temp |= UCR1_RRDYEN;
+			writel(temp, sport->port.membase + UCR1);
+
+			temp = readl(sport->port.membase + UCR4);
+			temp |= UCR4_DREN;
+			writel(temp, sport->port.membase + UCR4);
+		}
+		return;
+	}
+
 	temp = readl(sport->port.membase + UCR1);
 	writel(temp & ~UCR1_TXMPTYEN, sport->port.membase + UCR1);
 }
@@ -324,9 +381,30 @@ static void imx_start_tx(struct uart_port *port)
 	struct imx_port *sport = (struct imx_port *)port;
 	unsigned long temp;
 
+	if (USE_IRDA(sport)) {
+		/* half duplex in IrDA mode; have to disable receive mode */
+		temp = readl(sport->port.membase + UCR4);
+		temp &= ~(UCR4_DREN);
+		writel(temp, sport->port.membase + UCR4);
+
+		temp = readl(sport->port.membase + UCR1);
+		temp &= ~(UCR1_RRDYEN);
+		writel(temp, sport->port.membase + UCR1);
+	}
+
 	temp = readl(sport->port.membase + UCR1);
 	writel(temp | UCR1_TXMPTYEN, sport->port.membase + UCR1);
 
+	if (USE_IRDA(sport)) {
+		temp = readl(sport->port.membase + UCR1);
+		temp |= UCR1_TRDYEN;
+		writel(temp, sport->port.membase + UCR1);
+
+		temp = readl(sport->port.membase + UCR4);
+		temp |= UCR4_TCEN;
+		writel(temp, sport->port.membase + UCR4);
+	}
+
 	if (readl(sport->port.membase + UTS) & UTS_TXEMPTY)
 		imx_transmit_buffer(sport);
 }
@@ -536,12 +614,7 @@ static int imx_setup_ufcr(struct imx_port *sport, unsigned int mode)
 	if(!ufcr_rfdiv)
 		ufcr_rfdiv = 1;
 
-	if(ufcr_rfdiv >= 7)
-		ufcr_rfdiv = 6;
-	else
-		ufcr_rfdiv = 6 - ufcr_rfdiv;
-
-	val |= UFCR_RFDIV & (ufcr_rfdiv << 7);
+	val |= UFCR_RFDIV_REG(ufcr_rfdiv);
 
 	writel(val, sport->port.membase + UFCR);
 
@@ -560,8 +633,24 @@ static int imx_startup(struct uart_port *port)
 	 * requesting IRQs
 	 */
 	temp = readl(sport->port.membase + UCR4);
+
+	if (USE_IRDA(sport))
+		temp |= UCR4_IRSC;
+
 	writel(temp & ~UCR4_DREN, sport->port.membase + UCR4);
 
+	if (USE_IRDA(sport)) {
+		/* reset fifo's and state machines */
+		int i = 100;
+		temp = readl(sport->port.membase + UCR2);
+		temp &= ~UCR2_SRST;
+		writel(temp, sport->port.membase + UCR2);
+		while (!(readl(sport->port.membase + UCR2) & UCR2_SRST) &&
+		    (--i > 0)) {
+			udelay(1);
+		}
+	}
+
 	/*
 	 * Allocate the IRQ(s) i.MX1 has three interrupts whereas later
 	 * chips only have one interrupt.
@@ -577,12 +666,16 @@ static int imx_startup(struct uart_port *port)
 		if (retval)
 			goto error_out2;
 
-		retval = request_irq(sport->rtsirq, imx_rtsint,
-			     (sport->rtsirq < MAX_INTERNAL_IRQ) ? 0 :
-			       IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
-				DRIVER_NAME, sport);
-		if (retval)
-			goto error_out3;
+		/* do not use RTS IRQ on IrDA */
+		if (!USE_IRDA(sport)) {
+			retval = request_irq(sport->rtsirq, imx_rtsint,
+				     (sport->rtsirq < MAX_INTERNAL_IRQ) ? 0 :
+				       IRQF_TRIGGER_FALLING |
+				       IRQF_TRIGGER_RISING,
+					DRIVER_NAME, sport);
+			if (retval)
+				goto error_out3;
+		}
 	} else {
 		retval = request_irq(sport->port.irq, imx_int, 0,
 				DRIVER_NAME, sport);
@@ -599,18 +692,49 @@ static int imx_startup(struct uart_port *port)
 
 	temp = readl(sport->port.membase + UCR1);
 	temp |= UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN;
+
+	if (USE_IRDA(sport)) {
+		temp |= UCR1_IREN;
+		temp &= ~(UCR1_RTSDEN);
+	}
+
 	writel(temp, sport->port.membase + UCR1);
 
 	temp = readl(sport->port.membase + UCR2);
 	temp |= (UCR2_RXEN | UCR2_TXEN);
 	writel(temp, sport->port.membase + UCR2);
 
+	if (USE_IRDA(sport)) {
+		/* clear RX-FIFO */
+		int i = 64;
+		while ((--i > 0) &&
+			(readl(sport->port.membase + URXD0) & URXD_CHARRDY)) {
+			barrier();
+		}
+	}
+
 #if defined CONFIG_ARCH_MX2 || defined CONFIG_ARCH_MX3
 	temp = readl(sport->port.membase + UCR3);
 	temp |= UCR3_RXDMUXSEL;
 	writel(temp, sport->port.membase + UCR3);
 #endif
 
+	if (USE_IRDA(sport)) {
+		temp = readl(sport->port.membase + UCR4);
+		if (sport->irda_inv_rx)
+			temp |= UCR4_INVR;
+		else
+			temp &= ~(UCR4_INVR);
+		writel(temp | UCR4_DREN, sport->port.membase + UCR4);
+
+		temp = readl(sport->port.membase + UCR3);
+		if (sport->irda_inv_tx)
+			temp |= UCR3_INVT;
+		else
+			temp &= ~(UCR3_INVT);
+		writel(temp, sport->port.membase + UCR3);
+	}
+
 	/*
 	 * Enable modem status interrupts
 	 */
@@ -618,6 +742,16 @@ static int imx_startup(struct uart_port *port)
 	imx_enable_ms(&sport->port);
 	spin_unlock_irqrestore(&sport->port.lock,flags);
 
+	if (USE_IRDA(sport)) {
+		struct imxuart_platform_data *pdata;
+		pdata = sport->port.dev->platform_data;
+		sport->irda_inv_rx = pdata->irda_inv_rx;
+		sport->irda_inv_tx = pdata->irda_inv_tx;
+		sport->trcv_delay = pdata->transceiver_delay;
+		if (pdata->irda_enable)
+			pdata->irda_enable(1);
+	}
+
 	return 0;
 
 error_out3:
@@ -639,6 +773,13 @@ static void imx_shutdown(struct uart_port *port)
 	temp &= ~(UCR2_TXEN);
 	writel(temp, sport->port.membase + UCR2);
 
+	if (USE_IRDA(sport)) {
+		struct imxuart_platform_data *pdata;
+		pdata = sport->port.dev->platform_data;
+		if (pdata->irda_enable)
+			pdata->irda_enable(0);
+	}
+
 	/*
 	 * Stop our timer.
 	 */
@@ -648,7 +789,8 @@ static void imx_shutdown(struct uart_port *port)
 	 * Free the interrupts
 	 */
 	if (sport->txirq > 0) {
-		free_irq(sport->rtsirq, sport);
+		if (!USE_IRDA(sport))
+			free_irq(sport->rtsirq, sport);
 		free_irq(sport->txirq, sport);
 		free_irq(sport->rxirq, sport);
 	} else
@@ -660,6 +802,9 @@ static void imx_shutdown(struct uart_port *port)
 
 	temp = readl(sport->port.membase + UCR1);
 	temp &= ~(UCR1_TXMPTYEN | UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN);
+	if (USE_IRDA(sport))
+		temp &= ~(UCR1_IREN);
+
 	writel(temp, sport->port.membase + UCR1);
 }
 
@@ -768,11 +913,19 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 			sport->port.membase + UCR2);
 	old_txrxen &= (UCR2_TXEN | UCR2_RXEN);
 
-	div = sport->port.uartclk / (baud * 16);
-	if (div > 7)
-		div = 7;
-	if (!div)
+	if (USE_IRDA(sport)) {
+		/*
+		 * use maximum available submodule frequency to
+		 * avoid missing short pulses due to low sampling rate
+		 */
 		div = 1;
+	} else {
+		div = sport->port.uartclk / (baud * 16);
+		if (div > 7)
+			div = 7;
+		if (!div)
+			div = 1;
+	}
 
 	rational_best_approximation(16 * div * baud, sport->port.uartclk,
 		1 << 16, 1 << 16, &num, &denom);
@@ -781,8 +934,7 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	denom -= 1;
 
 	ufcr = readl(sport->port.membase + UFCR);
-	ufcr = (ufcr & (~UFCR_RFDIV)) |
-	    (div << 7);
+	ufcr = (ufcr & (~UFCR_RFDIV)) | UFCR_RFDIV_REG(div);
 	writel(ufcr, sport->port.membase + UFCR);
 
 	writel(num, sport->port.membase + UBIR);
@@ -1141,6 +1293,11 @@ static int serial_imx_probe(struct platform_device *pdev)
 	if (pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
 		sport->have_rtscts = 1;
 
+#ifdef CONFIG_IRDA
+	if (pdata && (pdata->flags & IMXUART_IRDA))
+		sport->use_irda = 1;
+#endif
+
 	if (pdata->init) {
 		ret = pdata->init(pdev);
 		if (ret)
-- 
GitLab


From d7f8d437bda0ec409e26cffb846bc28a40603ee3 Mon Sep 17 00:00:00 2001
From: Oskar Schirmer <os@emlix.com>
Date: Thu, 11 Jun 2009 14:55:22 +0100
Subject: [PATCH 2049/2198] imx: serial: use tty_encode_baud_rate to set true
 rate

real baud rate may be different from the one requested.
for upper layers, set the nearest value to the real rate
in favour of the rate previously requested.

Signed-off-by: Oskar Schirmer <os@emlix.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/serial/imx.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/drivers/serial/imx.c b/drivers/serial/imx.c
index 8c79e8c2fd4e7..7b5d1de9cfe39 100644
--- a/drivers/serial/imx.c
+++ b/drivers/serial/imx.c
@@ -818,6 +818,7 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8;
 	unsigned int div, ufcr;
 	unsigned long num, denom;
+	uint64_t tdiv64;
 
 	/*
 	 * If we don't support modem control lines, don't allow
@@ -930,6 +931,12 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
 	rational_best_approximation(16 * div * baud, sport->port.uartclk,
 		1 << 16, 1 << 16, &num, &denom);
 
+	tdiv64 = sport->port.uartclk;
+	tdiv64 *= num;
+	do_div(tdiv64, denom * 16 * div);
+	tty_encode_baud_rate(sport->port.info->port.tty,
+		(speed_t)tdiv64, (speed_t)tdiv64);
+
 	num -= 1;
 	denom -= 1;
 
-- 
GitLab


From f0e8527726b9e56649b9eafde3bc0fbc4dd2dd47 Mon Sep 17 00:00:00 2001
From: Dirk Eibach <eibach@gdsys.de>
Date: Thu, 11 Jun 2009 14:56:44 +0100
Subject: [PATCH 2050/2198] moxa: prevent opening unavailable ports

In moxa.c there are 32 minor numbers reserved for each device. The
number of ports actually available per device is stored in
moxa_board_conf->numPorts. This number is not considered in moxa_open().
Opening a port that is not available results in a kernel oops.
This patch adds a test to moxa_open() that prevents opening unavailable
ports.

Signed-off-by: Dirk Eibach <eibach@gdsys.de>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Alan Cox <alan@linux.intel.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/moxa.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c
index 4a4cab73d0be1..65b6ff2442c6d 100644
--- a/drivers/char/moxa.c
+++ b/drivers/char/moxa.c
@@ -1184,6 +1184,11 @@ static int moxa_open(struct tty_struct *tty, struct file *filp)
 		return -ENODEV;
 	}
 
+	if (port % MAX_PORTS_PER_BOARD >= brd->numPorts) {
+		mutex_unlock(&moxa_openlock);
+		return -ENODEV;
+	}
+
 	ch = &brd->ports[port % MAX_PORTS_PER_BOARD];
 	ch->port.count++;
 	tty->driver_data = ch;
-- 
GitLab


From 1c432d899d32d36371ee4ee310fa3609cf0e5742 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 13:19:29 +0200
Subject: [PATCH 2051/2198] perf_counter: Rename enums

Rename the perf enums to be in the 'perf_' namespace and strictly
enumerate the ABI bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 53 +++++++++++++++++-------------------
 kernel/perf_counter.c        |  6 ++--
 2 files changed, 28 insertions(+), 31 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 95c797c480e8d..d5911b02bc8c3 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -24,24 +24,21 @@
 /*
  * attr.type
  */
-enum perf_event_types {
+enum perf_type_id {
 	PERF_TYPE_HARDWARE		= 0,
 	PERF_TYPE_SOFTWARE		= 1,
 	PERF_TYPE_TRACEPOINT		= 2,
 	PERF_TYPE_HW_CACHE		= 3,
+	PERF_TYPE_RAW			= 4,
 
-	/*
-	 * available TYPE space, raw is the max value.
-	 */
-
-	PERF_TYPE_RAW			= 128,
+	PERF_TYPE_MAX,			/* non ABI */
 };
 
 /*
  * Generalized performance counter event types, used by the attr.event_id
  * parameter of the sys_perf_counter_open() syscall:
  */
-enum attr_ids {
+enum perf_hw_id {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
@@ -53,7 +50,7 @@ enum attr_ids {
 	PERF_COUNT_BRANCH_MISSES	= 5,
 	PERF_COUNT_BUS_CYCLES		= 6,
 
-	PERF_HW_EVENTS_MAX		= 7,
+	PERF_HW_EVENTS_MAX,		/* non ABI */
 };
 
 /*
@@ -63,30 +60,30 @@ enum attr_ids {
  *       { read, write, prefetch } x
  *       { accesses, misses }
  */
-enum hw_cache_id {
-	PERF_COUNT_HW_CACHE_L1D,
-	PERF_COUNT_HW_CACHE_L1I,
-	PERF_COUNT_HW_CACHE_L2,
-	PERF_COUNT_HW_CACHE_DTLB,
-	PERF_COUNT_HW_CACHE_ITLB,
-	PERF_COUNT_HW_CACHE_BPU,
-
-	PERF_COUNT_HW_CACHE_MAX,
+enum perf_hw_cache_id {
+	PERF_COUNT_HW_CACHE_L1D		= 0,
+	PERF_COUNT_HW_CACHE_L1I		= 1,
+	PERF_COUNT_HW_CACHE_L2		= 2,
+	PERF_COUNT_HW_CACHE_DTLB	= 3,
+	PERF_COUNT_HW_CACHE_ITLB	= 4,
+	PERF_COUNT_HW_CACHE_BPU		= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,	/* non ABI */
 };
 
-enum hw_cache_op_id {
-	PERF_COUNT_HW_CACHE_OP_READ,
-	PERF_COUNT_HW_CACHE_OP_WRITE,
-	PERF_COUNT_HW_CACHE_OP_PREFETCH,
+enum perf_hw_cache_op_id {
+	PERF_COUNT_HW_CACHE_OP_READ	= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,
 
-	PERF_COUNT_HW_CACHE_OP_MAX,
+	PERF_COUNT_HW_CACHE_OP_MAX,	/* non ABI */
 };
 
-enum hw_cache_op_result_id {
-	PERF_COUNT_HW_CACHE_RESULT_ACCESS,
-	PERF_COUNT_HW_CACHE_RESULT_MISS,
+enum perf_hw_cache_op_result_id {
+	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
+	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
 
-	PERF_COUNT_HW_CACHE_RESULT_MAX,
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non ABI */
 };
 
 /*
@@ -95,7 +92,7 @@ enum hw_cache_op_result_id {
  * physical and sw events of the kernel (and allow the profiling of them as
  * well):
  */
-enum sw_event_ids {
+enum perf_sw_ids {
 	PERF_COUNT_CPU_CLOCK		= 0,
 	PERF_COUNT_TASK_CLOCK		= 1,
 	PERF_COUNT_PAGE_FAULTS		= 2,
@@ -104,7 +101,7 @@ enum sw_event_ids {
 	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
 	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
 
-	PERF_SW_EVENTS_MAX		= 7,
+	PERF_SW_EVENTS_MAX,		/* non ABI */
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b2829de5590e..c02535bed26ff 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3162,7 +3162,7 @@ static int perf_swcounter_is_counting(struct perf_counter *counter)
 }
 
 static int perf_swcounter_match(struct perf_counter *counter,
-				enum perf_event_types type,
+				enum perf_type_id type,
 				u32 event, struct pt_regs *regs)
 {
 	if (!perf_swcounter_is_counting(counter))
@@ -3194,7 +3194,7 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 }
 
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
-				     enum perf_event_types type, u32 event,
+				     enum perf_type_id type, u32 event,
 				     u64 nr, int nmi, struct pt_regs *regs,
 				     u64 addr)
 {
@@ -3225,7 +3225,7 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 	return &cpuctx->recursion[0];
 }
 
-static void __perf_swcounter_event(enum perf_event_types type, u32 event,
+static void __perf_swcounter_event(enum perf_type_id type, u32 event,
 				   u64 nr, int nmi, struct pt_regs *regs,
 				   u64 addr)
 {
-- 
GitLab


From f4dbfa8f3131a84257223393905f7efad0ca5996 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:06:28 +0200
Subject: [PATCH 2052/2198] perf_counter: Standardize event names

Pure renames only, to PERF_COUNT_HW_* and PERF_COUNT_SW_*.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/power4-pmu.c   | 12 ++++----
 arch/powerpc/kernel/power5+-pmu.c  | 12 ++++----
 arch/powerpc/kernel/power5-pmu.c   | 12 ++++----
 arch/powerpc/kernel/power6-pmu.c   | 12 ++++----
 arch/powerpc/kernel/ppc970-pmu.c   | 12 ++++----
 arch/powerpc/mm/fault.c            |  6 ++--
 arch/x86/kernel/cpu/perf_counter.c | 32 +++++++++++-----------
 arch/x86/mm/fault.c                |  6 ++--
 include/linux/perf_counter.h       | 36 ++++++++++++------------
 kernel/perf_counter.c              | 20 +++++++-------
 tools/perf/builtin-record.c        |  4 +--
 tools/perf/builtin-stat.c          | 31 +++++++++++----------
 tools/perf/builtin-top.c           |  4 +--
 tools/perf/design.txt              | 28 +++++++++----------
 tools/perf/util/parse-events.c     | 44 +++++++++++++++---------------
 15 files changed, 136 insertions(+), 135 deletions(-)

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 0e94b6857220d..73956f084b295 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -535,12 +535,12 @@ static void p4_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int p4_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 0x1001,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8c10,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c10,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x331,		/* PM_BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x1001,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8c10, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c10, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x330,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x331,  /* PM_BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index bbf2cbb07388c..5f8b7741e9702 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -606,12 +606,12 @@ static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5p_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x1c10a8, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index 670cf10b91e8a..d54723ab627dc 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -548,12 +548,12 @@ static void power5_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power5_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0xf,
-	[PERF_COUNT_INSTRUCTIONS] = 0x100009,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x4c1090,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3c1088,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4,	/* BR_ISSUED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x230e5,		/* BR_MPRED_CR */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4c1090, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 4da707866097e..0cd406ee765b6 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -466,12 +466,12 @@ static void p6_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int power6_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 0x1e,
-	[PERF_COUNT_INSTRUCTIONS] = 2,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x280030,	/* LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x30000c,		/* LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0,	/* BR_PRED */ 
-	[PERF_COUNT_BRANCH_MISSES] = 0x400052,		/* BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x280030, /* LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x30000c, /* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x410a0,  /* BR_PRED */
+	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x400052, /* BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 336adf1736af8..46a2064094203 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -419,12 +419,12 @@ static void p970_disable_pmc(unsigned int pmc, u64 mmcr[])
 }
 
 static int ppc970_generic_events[] = {
-	[PERF_COUNT_CPU_CYCLES] = 7,
-	[PERF_COUNT_INSTRUCTIONS] = 1,
-	[PERF_COUNT_CACHE_REFERENCES] = 0x8810,		/* PM_LD_REF_L1 */
-	[PERF_COUNT_CACHE_MISSES] = 0x3810,		/* PM_LD_MISS_L1 */
-	[PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431,	/* PM_BR_ISSUED */
-	[PERF_COUNT_BRANCH_MISSES] = 0x327,		/* PM_GRP_BR_MPRED */
+	[PERF_COUNT_HW_CPU_CYCLES]		= 7,
+	[PERF_COUNT_HW_INSTRUCTIONS]		= 1,
+	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8810, /* PM_LD_REF_L1 */
+	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3810, /* PM_LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x431,  /* PM_BR_ISSUED */
+	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x327,  /* PM_GRP_BR_MPRED */
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index ac0e112031b29..5beffc8f481e7 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 		die("Weird page fault", regs, SIGSEGV);
 	}
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/* When running in the kernel we expect faults to occur only to
 	 * addresses in user space.  All other faults represent errors in the
@@ -312,7 +312,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 	}
 	if (ret & VM_FAULT_MAJOR) {
 		current->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 #ifdef CONFIG_PPC_SMLPAR
 		if (firmware_has_feature(FW_FEATURE_CMO)) {
@@ -323,7 +323,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
 #endif
 	} else {
 		current->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 	up_read(&mm->mmap_sem);
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 57ae1bec81bea..572fb434a6663 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -69,13 +69,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
  */
 static const u64 intel_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x003c,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x4f2e,
-  [PERF_COUNT_CACHE_MISSES]		= 0x412e,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
-  [PERF_COUNT_BUS_CYCLES]		= 0x013c,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c,
 };
 
 static u64 intel_pmu_event_map(int event)
@@ -485,12 +485,12 @@ static const u64 amd_0f_hw_cache_event_ids
  */
 static const u64 amd_perfmon_event_map[] =
 {
-  [PERF_COUNT_CPU_CYCLES]		= 0x0076,
-  [PERF_COUNT_INSTRUCTIONS]		= 0x00c0,
-  [PERF_COUNT_CACHE_REFERENCES]		= 0x0080,
-  [PERF_COUNT_CACHE_MISSES]		= 0x0081,
-  [PERF_COUNT_BRANCH_INSTRUCTIONS]	= 0x00c4,
-  [PERF_COUNT_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
 };
 
 static u64 amd_pmu_event_map(int event)
@@ -970,11 +970,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc)
 
 	event = hwc->config & ARCH_PERFMON_EVENT_MASK;
 
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS)))
 		return X86_PMC_IDX_FIXED_INSTRUCTIONS;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
 		return X86_PMC_IDX_FIXED_CPU_CYCLES;
-	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES)))
+	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
 		return X86_PMC_IDX_FIXED_BUS_CYCLES;
 
 	return -1;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 6f9df2babe487..5c6d816f30b4e 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	if (unlikely(error_code & PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
-	perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address);
+	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
 
 	/*
 	 * If we're in an interrupt, have no user context or are running
@@ -1142,11 +1142,11 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
 
 	if (fault & VM_FAULT_MAJOR) {
 		tsk->maj_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
 				     regs, address);
 	} else {
 		tsk->min_flt++;
-		perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0,
+		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
 				     regs, address);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index d5911b02bc8c3..887df88a9c2ae 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -42,15 +42,15 @@ enum perf_hw_id {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CPU_CYCLES		= 0,
-	PERF_COUNT_INSTRUCTIONS		= 1,
-	PERF_COUNT_CACHE_REFERENCES	= 2,
-	PERF_COUNT_CACHE_MISSES		= 3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
-	PERF_COUNT_BRANCH_MISSES	= 5,
-	PERF_COUNT_BUS_CYCLES		= 6,
-
-	PERF_HW_EVENTS_MAX,		/* non ABI */
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES		= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES		= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
+
+	PERF_COUNT_HW_MAX,		/* non ABI */
 };
 
 /*
@@ -93,15 +93,15 @@ enum perf_hw_cache_op_result_id {
  * well):
  */
 enum perf_sw_ids {
-	PERF_COUNT_CPU_CLOCK		= 0,
-	PERF_COUNT_TASK_CLOCK		= 1,
-	PERF_COUNT_PAGE_FAULTS		= 2,
-	PERF_COUNT_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
-
-	PERF_SW_EVENTS_MAX,		/* non ABI */
+	PERF_COUNT_SW_CPU_CLOCK		= 0,
+	PERF_COUNT_SW_TASK_CLOCK	= 1,
+	PERF_COUNT_SW_PAGE_FAULTS	= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
+
+	PERF_COUNT_SW_MAX,		/* non ABI */
 };
 
 /*
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c02535bed26ff..8859b97390ecf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1024,7 +1024,7 @@ void perf_counter_task_sched_out(struct task_struct *task,
 	int do_switch = 1;
 
 	regs = task_pt_regs(task);
-	perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0);
+	perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0);
 
 	if (likely(!ctx || !cpuctx->task_ctx))
 		return;
@@ -3411,13 +3411,13 @@ void perf_counter_task_migration(struct task_struct *task, int cpu)
 	struct perf_counter_context *ctx;
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE,
-				 PERF_COUNT_CPU_MIGRATIONS,
+				 PERF_COUNT_SW_CPU_MIGRATIONS,
 				 1, 1, NULL, 0);
 
 	ctx = perf_pin_task_context(task);
 	if (ctx) {
 		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE,
-					 PERF_COUNT_CPU_MIGRATIONS,
+					 PERF_COUNT_SW_CPU_MIGRATIONS,
 					 1, 1, NULL, 0);
 		perf_unpin_context(ctx);
 	}
@@ -3475,11 +3475,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 	 * events.
 	 */
 	switch (counter->attr.config) {
-	case PERF_COUNT_CPU_CLOCK:
+	case PERF_COUNT_SW_CPU_CLOCK:
 		pmu = &perf_ops_cpu_clock;
 
 		break;
-	case PERF_COUNT_TASK_CLOCK:
+	case PERF_COUNT_SW_TASK_CLOCK:
 		/*
 		 * If the user instantiates this as a per-cpu counter,
 		 * use the cpu_clock counter instead.
@@ -3490,11 +3490,11 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 			pmu = &perf_ops_cpu_clock;
 
 		break;
-	case PERF_COUNT_PAGE_FAULTS:
-	case PERF_COUNT_PAGE_FAULTS_MIN:
-	case PERF_COUNT_PAGE_FAULTS_MAJ:
-	case PERF_COUNT_CONTEXT_SWITCHES:
-	case PERF_COUNT_CPU_MIGRATIONS:
+	case PERF_COUNT_SW_PAGE_FAULTS:
+	case PERF_COUNT_SW_PAGE_FAULTS_MIN:
+	case PERF_COUNT_SW_PAGE_FAULTS_MAJ:
+	case PERF_COUNT_SW_CONTEXT_SWITCHES:
+	case PERF_COUNT_SW_CPU_MIGRATIONS:
 		pmu = &perf_ops_generic;
 		break;
 	}
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 84cd336ae79bc..29259e74dcfa7 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -378,12 +378,12 @@ static void create_counter(int counter, int cpu, pid_t pid)
 		 * is always available even if no PMU support:
 		 */
 		if (attr->type == PERF_TYPE_HARDWARE
-			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
 
 			if (verbose)
 				warning(" ... trying to fall back to cpu-clock-ticks\n");
 			attr->type = PERF_TYPE_SOFTWARE;
-			attr->config = PERF_COUNT_CPU_CLOCK;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
 			goto try_again;
 		}
 		printf("\n");
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 6404906924fa2..c43e4a97dc428 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -46,15 +46,16 @@
 
 static struct perf_counter_attr default_attrs[MAX_COUNTERS] = {
 
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_TASK_CLOCK		},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CONTEXT_SWITCHES	},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_CPU_MIGRATIONS	},
-  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_PAGE_FAULTS	},
-
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CPU_CYCLES		},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_INSTRUCTIONS	},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_REFERENCES	},
-  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_CACHE_MISSES	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS	},
+  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS	},
+
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS	},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
+  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES	},
+
 };
 
 static int			system_wide			=  0;
@@ -120,10 +121,10 @@ static inline int nsec_counter(int counter)
 	if (attrs[counter].type != PERF_TYPE_SOFTWARE)
 		return 0;
 
-	if (attrs[counter].config == PERF_COUNT_CPU_CLOCK)
+	if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
 		return 1;
 
-	if (attrs[counter].config == PERF_COUNT_TASK_CLOCK)
+	if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
 		return 1;
 
 	return 0;
@@ -176,10 +177,10 @@ static void read_counter(int counter)
 	 * Save the full runtime - to allow normalization during printout:
 	 */
 	if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-		attrs[counter].config == PERF_COUNT_TASK_CLOCK)
+		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
 		runtime_nsecs = count[0];
 	if (attrs[counter].type == PERF_TYPE_HARDWARE &&
-		attrs[counter].config == PERF_COUNT_CPU_CYCLES)
+		attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
 		runtime_cycles = count[0];
 }
 
@@ -206,7 +207,7 @@ static void print_counter(int counter)
 		fprintf(stderr, " %14.6f  %-20s",
 			msecs, event_name(counter));
 		if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
-			attrs[counter].config == PERF_COUNT_TASK_CLOCK) {
+			attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
 
 			if (walltime_nsecs)
 				fprintf(stderr, " # %11.3f CPU utilization factor",
@@ -220,7 +221,7 @@ static void print_counter(int counter)
 				(double)count[0]/runtime_nsecs*1000.0);
 		if (runtime_cycles &&
 			attrs[counter].type == PERF_TYPE_HARDWARE &&
-				attrs[counter].config == PERF_COUNT_INSTRUCTIONS) {
+				attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
 
 			fprintf(stderr, " # %1.3f per cycle",
 				(double)count[0] / (double)runtime_cycles);
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 309dbc76ec88c..fe338d3c5d7ec 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -562,13 +562,13 @@ static void start_counter(int i, int counter)
 		 * is always available even if no PMU support:
 		 */
 		if (attr->type == PERF_TYPE_HARDWARE
-			&& attr->config == PERF_COUNT_CPU_CYCLES) {
+			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) {
 
 			if (verbose)
 				warning(" ... trying to fall back to cpu-clock-ticks\n");
 
 			attr->type = PERF_TYPE_SOFTWARE;
-			attr->config = PERF_COUNT_CPU_CLOCK;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
 			goto try_again;
 		}
 		printf("\n");
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index d3250763dc92b..860e116d979c8 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -99,13 +99,13 @@ enum hw_event_ids {
 	/*
 	 * Common hardware events, generalized by the kernel:
 	 */
-	PERF_COUNT_CPU_CYCLES		= 0,
-	PERF_COUNT_INSTRUCTIONS		= 1,
-	PERF_COUNT_CACHE_REFERENCES	= 2,
-	PERF_COUNT_CACHE_MISSES		= 3,
-	PERF_COUNT_BRANCH_INSTRUCTIONS	= 4,
-	PERF_COUNT_BRANCH_MISSES	= 5,
-	PERF_COUNT_BUS_CYCLES		= 6,
+	PERF_COUNT_HW_CPU_CYCLES		= 0,
+	PERF_COUNT_HW_INSTRUCTIONS		= 1,
+	PERF_COUNT_HW_CACHE_REFERENCES	= 2,
+	PERF_COUNT_HW_CACHE_MISSES		= 3,
+	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4,
+	PERF_COUNT_HW_BRANCH_MISSES	= 5,
+	PERF_COUNT_HW_BUS_CYCLES		= 6,
 };
 
 These are standardized types of events that work relatively uniformly
@@ -130,13 +130,13 @@ software events, selected by 'event_id':
  * well):
  */
 enum sw_event_ids {
-	PERF_COUNT_CPU_CLOCK		= 0,
-	PERF_COUNT_TASK_CLOCK		= 1,
-	PERF_COUNT_PAGE_FAULTS		= 2,
-	PERF_COUNT_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_PAGE_FAULTS_MAJ	= 6,
+	PERF_COUNT_SW_CPU_CLOCK		= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
 };
 
 Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index f18a9a006e1bd..9d5f1ca50e6fe 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -22,26 +22,26 @@ struct event_symbol {
 #define CR(x, y) .type = PERF_TYPE_##x, .config = y
 
 static struct event_symbol event_symbols[] = {
-  { C(HARDWARE, CPU_CYCLES),		"cpu-cycles",		},
-  { C(HARDWARE, CPU_CYCLES),		"cycles",		},
-  { C(HARDWARE, INSTRUCTIONS),		"instructions",		},
-  { C(HARDWARE, CACHE_REFERENCES),	"cache-references",	},
-  { C(HARDWARE, CACHE_MISSES),		"cache-misses",		},
-  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branch-instructions",	},
-  { C(HARDWARE, BRANCH_INSTRUCTIONS),	"branches",		},
-  { C(HARDWARE, BRANCH_MISSES),		"branch-misses",	},
-  { C(HARDWARE, BUS_CYCLES),		"bus-cycles",		},
-
-  { C(SOFTWARE, CPU_CLOCK),		"cpu-clock",		},
-  { C(SOFTWARE, TASK_CLOCK),		"task-clock",		},
-  { C(SOFTWARE, PAGE_FAULTS),		"page-faults",		},
-  { C(SOFTWARE, PAGE_FAULTS),		"faults",		},
-  { C(SOFTWARE, PAGE_FAULTS_MIN),	"minor-faults",		},
-  { C(SOFTWARE, PAGE_FAULTS_MAJ),	"major-faults",		},
-  { C(SOFTWARE, CONTEXT_SWITCHES),	"context-switches",	},
-  { C(SOFTWARE, CONTEXT_SWITCHES),	"cs",			},
-  { C(SOFTWARE, CPU_MIGRATIONS),	"cpu-migrations",	},
-  { C(SOFTWARE, CPU_MIGRATIONS),	"migrations",		},
+  { C(HARDWARE, HW_CPU_CYCLES),		"cpu-cycles",		},
+  { C(HARDWARE, HW_CPU_CYCLES),		"cycles",		},
+  { C(HARDWARE, HW_INSTRUCTIONS),	"instructions",		},
+  { C(HARDWARE, HW_CACHE_REFERENCES),	"cache-references",	},
+  { C(HARDWARE, HW_CACHE_MISSES),	"cache-misses",		},
+  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions",	},
+  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches",		},
+  { C(HARDWARE, HW_BRANCH_MISSES),	"branch-misses",	},
+  { C(HARDWARE, HW_BUS_CYCLES),		"bus-cycles",		},
+
+  { C(SOFTWARE, SW_CPU_CLOCK),		"cpu-clock",		},
+  { C(SOFTWARE, SW_TASK_CLOCK),		"task-clock",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS),	"page-faults",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS),	"faults",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS_MIN),	"minor-faults",		},
+  { C(SOFTWARE, SW_PAGE_FAULTS_MAJ),	"major-faults",		},
+  { C(SOFTWARE, SW_CONTEXT_SWITCHES),	"context-switches",	},
+  { C(SOFTWARE, SW_CONTEXT_SWITCHES),	"cs",			},
+  { C(SOFTWARE, SW_CPU_MIGRATIONS),	"cpu-migrations",	},
+  { C(SOFTWARE, SW_CPU_MIGRATIONS),	"migrations",		},
 };
 
 #define __PERF_COUNTER_FIELD(config, name) \
@@ -107,7 +107,7 @@ char *event_name(int counter)
 
 	switch (type) {
 	case PERF_TYPE_HARDWARE:
-		if (config < PERF_HW_EVENTS_MAX)
+		if (config < PERF_COUNT_HW_MAX)
 			return hw_event_names[config];
 		return "unknown-hardware";
 
@@ -136,7 +136,7 @@ char *event_name(int counter)
 	}
 
 	case PERF_TYPE_SOFTWARE:
-		if (config < PERF_SW_EVENTS_MAX)
+		if (config < PERF_COUNT_SW_MAX)
 			return sw_event_names[config];
 		return "unknown-software";
 
-- 
GitLab


From 8be6e8f3c3a13900169f1141870562d0c723b010 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:19:11 +0200
Subject: [PATCH 2053/2198] perf_counter: Rename L2 to LL cache

The top (fastest) and last level (biggest) caches are the most
interesting ones, performance wise.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
[ Fixed the Nehalem LL table to LLC Reference/Miss events ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/kernel/power4-pmu.c   |  2 +-
 arch/powerpc/kernel/power5+-pmu.c  |  2 +-
 arch/powerpc/kernel/power5-pmu.c   |  2 +-
 arch/powerpc/kernel/power6-pmu.c   |  2 +-
 arch/powerpc/kernel/power7-pmu.c   |  2 +-
 arch/powerpc/kernel/ppc970-pmu.c   |  2 +-
 arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------
 include/linux/perf_counter.h       |  4 ++--
 8 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c
index 73956f084b295..07bd308a5fa74 100644
--- a/arch/powerpc/kernel/power4-pmu.c
+++ b/arch/powerpc/kernel/power4-pmu.c
@@ -561,7 +561,7 @@ static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0xc34,		0	},
diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c
index 5f8b7741e9702..41e5d2d958d4c 100644
--- a/arch/powerpc/kernel/power5+-pmu.c
+++ b/arch/powerpc/kernel/power5+-pmu.c
@@ -632,7 +632,7 @@ static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0,		0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0		},
 		[C(OP_WRITE)] = {	0,		0		},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c
index d54723ab627dc..05600b66221ae 100644
--- a/arch/powerpc/kernel/power5-pmu.c
+++ b/arch/powerpc/kernel/power5-pmu.c
@@ -574,7 +574,7 @@ static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0,		0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0x3c309b	},
 		[C(OP_WRITE)] = {	0,		0		},
 		[C(OP_PREFETCH)] = {	0xc50c3,	0		},
diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c
index 0cd406ee765b6..46f74bebcfd92 100644
--- a/arch/powerpc/kernel/power6-pmu.c
+++ b/arch/powerpc/kernel/power6-pmu.c
@@ -493,7 +493,7 @@ static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1		},
 		[C(OP_PREFETCH)] = {	0x4008c,	0		},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x150730,	0x250532	},
 		[C(OP_WRITE)] = {	0x250432,	0x150432	},
 		[C(OP_PREFETCH)] = {	0x810a6,	0		},
diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c
index 060e0deb399e6..b3f7d1216bae8 100644
--- a/arch/powerpc/kernel/power7-pmu.c
+++ b/arch/powerpc/kernel/power7-pmu.c
@@ -320,7 +320,7 @@ static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0x408a,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0x6080,		0x6084	},
 		[C(OP_WRITE)] = {	0x6082,		0x6086	},
 		[C(OP_PREFETCH)] = {	0,		0	},
diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c
index 46a2064094203..ba0a357a89f40 100644
--- a/arch/powerpc/kernel/ppc970-pmu.c
+++ b/arch/powerpc/kernel/ppc970-pmu.c
@@ -445,7 +445,7 @@ static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = {
 		[C(OP_WRITE)] = {	-1,		-1	},
 		[C(OP_PREFETCH)] = {	0,		0	},
 	},
-	[C(L2)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
+	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */
 		[C(OP_READ)] = {	0,		0	},
 		[C(OP_WRITE)] = {	0,		0	},
 		[C(OP_PREFETCH)] = {	0x733,		0	},
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 572fb434a6663..895c82e784550 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -131,7 +131,7 @@ static const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */
 		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */
@@ -141,8 +141,8 @@ static const u64 nehalem_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */
 	},
 	[ C(OP_PREFETCH) ] = {
-		[ C(RESULT_ACCESS) ] = 0xc024, /* L2_RQSTS.PREFETCHES          */
-		[ C(RESULT_MISS)   ] = 0x8024, /* L2_RQSTS.PREFETCH_MISS       */
+		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */
+		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */
 	},
  },
  [ C(DTLB) ] = {
@@ -222,7 +222,7 @@ static const u64 core2_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
 		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
@@ -313,7 +313,7 @@ static const u64 atom_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */
 		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */
@@ -422,7 +422,7 @@ static const u64 amd_0f_hw_cache_event_ids
 		[ C(RESULT_MISS)   ] = 0,
 	},
  },
- [ C(L2  ) ] = {
+ [ C(LL  ) ] = {
 	[ C(OP_READ) ] = {
 		[ C(RESULT_ACCESS) ] = 0,
 		[ C(RESULT_MISS)   ] = 0,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 887df88a9c2ae..20cf5af27ade7 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -56,14 +56,14 @@ enum perf_hw_id {
 /*
  * Generalized hardware cache counters:
  *
- *       { L1-D, L1-I, L2, LLC, ITLB, DTLB, BPU } x
+ *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x
  *       { read, write, prefetch } x
  *       { accesses, misses }
  */
 enum perf_hw_cache_id {
 	PERF_COUNT_HW_CACHE_L1D		= 0,
 	PERF_COUNT_HW_CACHE_L1I		= 1,
-	PERF_COUNT_HW_CACHE_L2		= 2,
+	PERF_COUNT_HW_CACHE_LL		= 2,
 	PERF_COUNT_HW_CACHE_DTLB	= 3,
 	PERF_COUNT_HW_CACHE_ITLB	= 4,
 	PERF_COUNT_HW_CACHE_BPU		= 5,
-- 
GitLab


From a308444ceb576d3089f9ca0dfd097eba6f1e623f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 14:44:26 +0200
Subject: [PATCH 2054/2198] perf_counter: Better align code

Whitespace and comment bits. Also update copyrights.

[ Impact: cleanup ]

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
---
 include/linux/perf_counter.h | 165 ++++++++++++++++++-----------------
 1 file changed, 85 insertions(+), 80 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 20cf5af27ade7..1fa1a26cb1b35 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -1,12 +1,13 @@
 /*
  *  Performance counters:
  *
- *   Copyright(C) 2008, Thomas Gleixner <tglx@linutronix.de>
- *   Copyright(C) 2008, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de>
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar
+ *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra
  *
  *  Data type definitions, declarations, prototypes.
  *
- *  Started by: Thomas Gleixner and Ingo Molnar
+ *    Started by: Thomas Gleixner and Ingo Molnar
  *
  *  For licencing details see kernel-base/COPYING
  */
@@ -25,18 +26,19 @@
  * attr.type
  */
 enum perf_type_id {
-	PERF_TYPE_HARDWARE		= 0,
-	PERF_TYPE_SOFTWARE		= 1,
-	PERF_TYPE_TRACEPOINT		= 2,
-	PERF_TYPE_HW_CACHE		= 3,
-	PERF_TYPE_RAW			= 4,
+	PERF_TYPE_HARDWARE			= 0,
+	PERF_TYPE_SOFTWARE			= 1,
+	PERF_TYPE_TRACEPOINT			= 2,
+	PERF_TYPE_HW_CACHE			= 3,
+	PERF_TYPE_RAW				= 4,
 
-	PERF_TYPE_MAX,			/* non ABI */
+	PERF_TYPE_MAX,				/* non-ABI */
 };
 
 /*
- * Generalized performance counter event types, used by the attr.event_id
- * parameter of the sys_perf_counter_open() syscall:
+ * Generalized performance counter event types, used by the
+ * attr.event_id parameter of the sys_perf_counter_open()
+ * syscall:
  */
 enum perf_hw_id {
 	/*
@@ -50,7 +52,7 @@ enum perf_hw_id {
 	PERF_COUNT_HW_BRANCH_MISSES		= 5,
 	PERF_COUNT_HW_BUS_CYCLES		= 6,
 
-	PERF_COUNT_HW_MAX,		/* non ABI */
+	PERF_COUNT_HW_MAX,			/* non-ABI */
 };
 
 /*
@@ -61,29 +63,29 @@ enum perf_hw_id {
  *       { accesses, misses }
  */
 enum perf_hw_cache_id {
-	PERF_COUNT_HW_CACHE_L1D		= 0,
-	PERF_COUNT_HW_CACHE_L1I		= 1,
-	PERF_COUNT_HW_CACHE_LL		= 2,
-	PERF_COUNT_HW_CACHE_DTLB	= 3,
-	PERF_COUNT_HW_CACHE_ITLB	= 4,
-	PERF_COUNT_HW_CACHE_BPU		= 5,
-
-	PERF_COUNT_HW_CACHE_MAX,	/* non ABI */
+	PERF_COUNT_HW_CACHE_L1D			= 0,
+	PERF_COUNT_HW_CACHE_L1I			= 1,
+	PERF_COUNT_HW_CACHE_LL			= 2,
+	PERF_COUNT_HW_CACHE_DTLB		= 3,
+	PERF_COUNT_HW_CACHE_ITLB		= 4,
+	PERF_COUNT_HW_CACHE_BPU			= 5,
+
+	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */
 };
 
 enum perf_hw_cache_op_id {
-	PERF_COUNT_HW_CACHE_OP_READ	= 0,
-	PERF_COUNT_HW_CACHE_OP_WRITE	= 1,
-	PERF_COUNT_HW_CACHE_OP_PREFETCH	= 2,
+	PERF_COUNT_HW_CACHE_OP_READ		= 0,
+	PERF_COUNT_HW_CACHE_OP_WRITE		= 1,
+	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2,
 
-	PERF_COUNT_HW_CACHE_OP_MAX,	/* non ABI */
+	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */
 };
 
 enum perf_hw_cache_op_result_id {
 	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0,
 	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1,
 
-	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non ABI */
+	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */
 };
 
 /*
@@ -93,15 +95,15 @@ enum perf_hw_cache_op_result_id {
  * well):
  */
 enum perf_sw_ids {
-	PERF_COUNT_SW_CPU_CLOCK		= 0,
-	PERF_COUNT_SW_TASK_CLOCK	= 1,
-	PERF_COUNT_SW_PAGE_FAULTS	= 2,
-	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3,
-	PERF_COUNT_SW_CPU_MIGRATIONS	= 4,
-	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5,
-	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6,
-
-	PERF_COUNT_SW_MAX,		/* non ABI */
+	PERF_COUNT_SW_CPU_CLOCK			= 0,
+	PERF_COUNT_SW_TASK_CLOCK		= 1,
+	PERF_COUNT_SW_PAGE_FAULTS		= 2,
+	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3,
+	PERF_COUNT_SW_CPU_MIGRATIONS		= 4,
+	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5,
+	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6,
+
+	PERF_COUNT_SW_MAX,			/* non-ABI */
 };
 
 /*
@@ -109,15 +111,15 @@ enum perf_sw_ids {
  * in the overflow packets.
  */
 enum perf_counter_sample_format {
-	PERF_SAMPLE_IP			= 1U << 0,
-	PERF_SAMPLE_TID			= 1U << 1,
-	PERF_SAMPLE_TIME		= 1U << 2,
-	PERF_SAMPLE_ADDR		= 1U << 3,
-	PERF_SAMPLE_GROUP		= 1U << 4,
-	PERF_SAMPLE_CALLCHAIN		= 1U << 5,
-	PERF_SAMPLE_ID			= 1U << 6,
-	PERF_SAMPLE_CPU			= 1U << 7,
-	PERF_SAMPLE_PERIOD		= 1U << 8,
+	PERF_SAMPLE_IP				= 1U << 0,
+	PERF_SAMPLE_TID				= 1U << 1,
+	PERF_SAMPLE_TIME			= 1U << 2,
+	PERF_SAMPLE_ADDR			= 1U << 3,
+	PERF_SAMPLE_GROUP			= 1U << 4,
+	PERF_SAMPLE_CALLCHAIN			= 1U << 5,
+	PERF_SAMPLE_ID				= 1U << 6,
+	PERF_SAMPLE_CPU				= 1U << 7,
+	PERF_SAMPLE_PERIOD			= 1U << 8,
 };
 
 /*
@@ -126,9 +128,9 @@ enum perf_counter_sample_format {
  * in increasing order of bit value, after the counter value.
  */
 enum perf_counter_read_format {
-	PERF_FORMAT_TOTAL_TIME_ENABLED	=  1U << 0,
-	PERF_FORMAT_TOTAL_TIME_RUNNING	=  1U << 1,
-	PERF_FORMAT_ID			=  1U << 2,
+	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0,
+	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1,
+	PERF_FORMAT_ID				= 1U << 2,
 };
 
 /*
@@ -229,12 +231,12 @@ struct perf_counter_mmap_page {
 	__u64   data_head;		/* head in the data section */
 };
 
-#define PERF_EVENT_MISC_CPUMODE_MASK	(3 << 0)
-#define PERF_EVENT_MISC_CPUMODE_UNKNOWN	(0 << 0)
-#define PERF_EVENT_MISC_KERNEL		(1 << 0)
-#define PERF_EVENT_MISC_USER		(2 << 0)
-#define PERF_EVENT_MISC_HYPERVISOR	(3 << 0)
-#define PERF_EVENT_MISC_OVERFLOW	(1 << 2)
+#define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0)
+#define PERF_EVENT_MISC_CPUMODE_UNKNOWN		(0 << 0)
+#define PERF_EVENT_MISC_KERNEL			(1 << 0)
+#define PERF_EVENT_MISC_USER			(2 << 0)
+#define PERF_EVENT_MISC_HYPERVISOR		(3 << 0)
+#define PERF_EVENT_MISC_OVERFLOW		(1 << 2)
 
 struct perf_event_header {
 	__u32	type;
@@ -351,14 +353,14 @@ struct hw_perf_counter {
 #ifdef CONFIG_PERF_COUNTERS
 	union {
 		struct { /* hardware */
-			u64				config;
-			unsigned long			config_base;
-			unsigned long			counter_base;
-			int				idx;
+			u64		config;
+			unsigned long	config_base;
+			unsigned long	counter_base;
+			int		idx;
 		};
 		union { /* software */
-			atomic64_t			count;
-			struct hrtimer			hrtimer;
+			atomic64_t	count;
+			struct hrtimer	hrtimer;
 		};
 	};
 	atomic64_t			prev_count;
@@ -523,37 +525,37 @@ struct perf_counter_context {
 	 * Protect the states of the counters in the list,
 	 * nr_active, and the list:
 	 */
-	spinlock_t		lock;
+	spinlock_t			lock;
 	/*
 	 * Protect the list of counters.  Locking either mutex or lock
 	 * is sufficient to ensure the list doesn't change; to change
 	 * the list you need to lock both the mutex and the spinlock.
 	 */
-	struct mutex		mutex;
+	struct mutex			mutex;
 
-	struct list_head	counter_list;
-	struct list_head	event_list;
-	int			nr_counters;
-	int			nr_active;
-	int			is_active;
-	atomic_t		refcount;
-	struct task_struct	*task;
+	struct list_head		counter_list;
+	struct list_head		event_list;
+	int				nr_counters;
+	int				nr_active;
+	int				is_active;
+	atomic_t			refcount;
+	struct task_struct		*task;
 
 	/*
 	 * Context clock, runs when context enabled.
 	 */
-	u64			time;
-	u64			timestamp;
+	u64				time;
+	u64				timestamp;
 
 	/*
 	 * These fields let us detect when two contexts have both
 	 * been cloned (inherited) from a common ancestor.
 	 */
-	struct perf_counter_context *parent_ctx;
-	u64			parent_gen;
-	u64			generation;
-	int			pin_count;
-	struct rcu_head		rcu_head;
+	struct perf_counter_context	*parent_ctx;
+	u64				parent_gen;
+	u64				generation;
+	int				pin_count;
+	struct rcu_head			rcu_head;
 };
 
 /**
@@ -604,9 +606,9 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs		*regs;
-	u64			addr;
-	u64			period;
+	struct pt_regs			*regs;
+	u64				addr;
+	u64				period;
 };
 
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
@@ -636,11 +638,14 @@ extern void perf_counter_fork(struct task_struct *tsk);
 
 extern void perf_counter_task_migration(struct task_struct *task, int cpu);
 
-#define MAX_STACK_DEPTH		255
+#define MAX_STACK_DEPTH			255
 
 struct perf_callchain_entry {
-	u16	nr, hv, kernel, user;
-	u64	ip[MAX_STACK_DEPTH];
+	u16				nr;
+	u16				hv;
+	u16				kernel;
+	u16				user;
+	u64				ip[MAX_STACK_DEPTH];
 };
 
 extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
-- 
GitLab


From cca3f454a85ff42d426401bce7ac804541b2bd03 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 11 Jun 2009 14:57:55 +0200
Subject: [PATCH 2055/2198] perf_counter: Add counter->id to the throttle event

So as to be able to distuinguish between multiple counters.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 1 +
 kernel/perf_counter.c        | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 1fa1a26cb1b35..6e133954e2e44 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -286,6 +286,7 @@ enum perf_event_type {
 	 * struct {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
+	 *	u64				id;
 	 * };
 	 */
 	PERF_EVENT_THROTTLE		= 5,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 8859b97390ecf..ef5d8a5b24539 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2950,13 +2950,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 	struct {
 		struct perf_event_header	header;
 		u64				time;
+		u64				id;
 	} throttle_event = {
 		.header = {
 			.type = PERF_EVENT_THROTTLE + 1,
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time = sched_clock(),
+		.time	= sched_clock(),
+		.id	= counter->id,
 	};
 
 	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
-- 
GitLab


From 8dc8e5e8bc0ce00b0f656bf972f67cd8a72759e5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 11 Jun 2009 16:13:24 +0200
Subject: [PATCH 2056/2198] perf_counter: Turn off by default

Perfcounters were enabled by default to help testing - but now that we
are submitting it upstream, make it default-disabled.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig | 1 -
 1 file changed, 1 deletion(-)

diff --git a/init/Kconfig b/init/Kconfig
index 8158f1f44694e..aef16f9b3d238 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -941,7 +941,6 @@ menu "Performance Counters"
 config PERF_COUNTERS
 	bool "Kernel Performance Counters"
 	depends on HAVE_PERF_COUNTERS
-	default y
 	select ANON_INODES
 	help
 	  Enable kernel support for performance counter hardware.
-- 
GitLab


From d2f11bf7fc630e27dfcede367399dddf40521878 Mon Sep 17 00:00:00 2001
From: Justin Chen <jchen@hpdst41.cup.hp.com>
Date: Thu, 11 Jun 2009 13:04:59 +0100
Subject: [PATCH 2057/2198] FRV: bitops: Change the bitmap index from int to
 unsigned long

Change the index to unsigned long in all bitops for [frv]

Signed-off-by: Justin Chen <justin.chen@hp.com>
Reviewed-by: Bjorn Helgaas <bjorn.helgaas@hp.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/include/asm/bitops.h | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/arch/frv/include/asm/bitops.h b/arch/frv/include/asm/bitops.h
index 287f6f697ce27..50ae91b296745 100644
--- a/arch/frv/include/asm/bitops.h
+++ b/arch/frv/include/asm/bitops.h
@@ -112,7 +112,7 @@ extern unsigned long atomic_test_and_XOR_mask(unsigned long mask, volatile unsig
 #define atomic_clear_mask(mask, v)	atomic_test_and_ANDNOT_mask((mask), (v))
 #define atomic_set_mask(mask, v)	atomic_test_and_OR_mask((mask), (v))
 
-static inline int test_and_clear_bit(int nr, volatile void *addr)
+static inline int test_and_clear_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *ptr = addr;
 	unsigned long mask = 1UL << (nr & 31);
@@ -120,7 +120,7 @@ static inline int test_and_clear_bit(int nr, volatile void *addr)
 	return (atomic_test_and_ANDNOT_mask(mask, ptr) & mask) != 0;
 }
 
-static inline int test_and_set_bit(int nr, volatile void *addr)
+static inline int test_and_set_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *ptr = addr;
 	unsigned long mask = 1UL << (nr & 31);
@@ -128,7 +128,7 @@ static inline int test_and_set_bit(int nr, volatile void *addr)
 	return (atomic_test_and_OR_mask(mask, ptr) & mask) != 0;
 }
 
-static inline int test_and_change_bit(int nr, volatile void *addr)
+static inline int test_and_change_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *ptr = addr;
 	unsigned long mask = 1UL << (nr & 31);
@@ -136,22 +136,22 @@ static inline int test_and_change_bit(int nr, volatile void *addr)
 	return (atomic_test_and_XOR_mask(mask, ptr) & mask) != 0;
 }
 
-static inline void clear_bit(int nr, volatile void *addr)
+static inline void clear_bit(unsigned long nr, volatile void *addr)
 {
 	test_and_clear_bit(nr, addr);
 }
 
-static inline void set_bit(int nr, volatile void *addr)
+static inline void set_bit(unsigned long nr, volatile void *addr)
 {
 	test_and_set_bit(nr, addr);
 }
 
-static inline void change_bit(int nr, volatile void * addr)
+static inline void change_bit(unsigned long nr, volatile void *addr)
 {
 	test_and_change_bit(nr, addr);
 }
 
-static inline void __clear_bit(int nr, volatile void * addr)
+static inline void __clear_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask;
@@ -161,7 +161,7 @@ static inline void __clear_bit(int nr, volatile void * addr)
 	*a &= ~mask;
 }
 
-static inline void __set_bit(int nr, volatile void * addr)
+static inline void __set_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask;
@@ -171,7 +171,7 @@ static inline void __set_bit(int nr, volatile void * addr)
 	*a |= mask;
 }
 
-static inline void __change_bit(int nr, volatile void *addr)
+static inline void __change_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask;
@@ -181,7 +181,7 @@ static inline void __change_bit(int nr, volatile void *addr)
 	*a ^= mask;
 }
 
-static inline int __test_and_clear_bit(int nr, volatile void * addr)
+static inline int __test_and_clear_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask, retval;
@@ -193,7 +193,7 @@ static inline int __test_and_clear_bit(int nr, volatile void * addr)
 	return retval;
 }
 
-static inline int __test_and_set_bit(int nr, volatile void * addr)
+static inline int __test_and_set_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask, retval;
@@ -205,7 +205,7 @@ static inline int __test_and_set_bit(int nr, volatile void * addr)
 	return retval;
 }
 
-static inline int __test_and_change_bit(int nr, volatile void * addr)
+static inline int __test_and_change_bit(unsigned long nr, volatile void *addr)
 {
 	volatile unsigned long *a = addr;
 	int mask, retval;
@@ -220,12 +220,13 @@ static inline int __test_and_change_bit(int nr, volatile void * addr)
 /*
  * This routine doesn't need to be atomic.
  */
-static inline int __constant_test_bit(int nr, const volatile void * addr)
+static inline int
+__constant_test_bit(unsigned long nr, const volatile void *addr)
 {
 	return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
 }
 
-static inline int __test_bit(int nr, const volatile void * addr)
+static inline int __test_bit(unsigned long nr, const volatile void *addr)
 {
 	int 	* a = (int *) addr;
 	int	mask;
-- 
GitLab


From db5c444eeb781788a0db36a2682b2417cf71f764 Mon Sep 17 00:00:00 2001
From: Stoyan Gaydarov <stoyboyker@gmail.com>
Date: Thu, 11 Jun 2009 13:05:04 +0100
Subject: [PATCH 2058/2198] FRV: BUG to BUG_ON changes

Change some BUG()'s to BUG_ON()'s.

Signed-off-by: Stoyan Gaydarov <stoyboyker@gmail.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/include/asm/pci.h            | 7 ++-----
 arch/frv/kernel/uaccess.c             | 6 ++----
 arch/frv/mb93090-mb00/pci-dma-nommu.c | 6 ++----
 arch/frv/mb93090-mb00/pci-dma.c       | 6 ++----
 4 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/arch/frv/include/asm/pci.h b/arch/frv/include/asm/pci.h
index 585d9b49949a7..cc685e60b0f9c 100644
--- a/arch/frv/include/asm/pci.h
+++ b/arch/frv/include/asm/pci.h
@@ -87,8 +87,7 @@ static inline void pci_dma_sync_single(struct pci_dev *hwdev,
 				       dma_addr_t dma_handle,
 				       size_t size, int direction)
 {
-	if (direction == PCI_DMA_NONE)
-                BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 
 	frv_cache_wback_inv((unsigned long)bus_to_virt(dma_handle),
 			    (unsigned long)bus_to_virt(dma_handle) + size);
@@ -105,9 +104,7 @@ static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
 				   int nelems, int direction)
 {
 	int i;
-
-	if (direction == PCI_DMA_NONE)
-                BUG();
+	BUG_ON(direction == PCI_DMA_NONE);
 
 	for (i = 0; i < nelems; i++)
 		frv_cache_wback_inv(sg_dma_address(&sg[i]),
diff --git a/arch/frv/kernel/uaccess.c b/arch/frv/kernel/uaccess.c
index 9fb771a20df3c..374f88d6cc00d 100644
--- a/arch/frv/kernel/uaccess.c
+++ b/arch/frv/kernel/uaccess.c
@@ -23,8 +23,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 	char *p, ch;
 	long err = -EFAULT;
 
-	if (count < 0)
-		BUG();
+	BUG_ON(count < 0);
 
 	p = dst;
 
@@ -76,8 +75,7 @@ long strnlen_user(const char __user *src, long count)
 	long err = 0;
 	char ch;
 
-	if (count < 0)
-		BUG();
+	BUG_ON(count < 0);
 
 #ifndef CONFIG_MMU
 	if ((unsigned long) src < memory_start)
diff --git a/arch/frv/mb93090-mb00/pci-dma-nommu.c b/arch/frv/mb93090-mb00/pci-dma-nommu.c
index 52ff9aec799d2..4e1ba0b15443a 100644
--- a/arch/frv/mb93090-mb00/pci-dma-nommu.c
+++ b/arch/frv/mb93090-mb00/pci-dma-nommu.c
@@ -116,8 +116,7 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 			  enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size);
 
@@ -151,8 +150,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 		frv_cache_wback_inv(sg_dma_address(&sg[i]),
 				    sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]));
 
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	return nents;
 }
diff --git a/arch/frv/mb93090-mb00/pci-dma.c b/arch/frv/mb93090-mb00/pci-dma.c
index 3ddedebc4eb3d..45954f0813dc9 100644
--- a/arch/frv/mb93090-mb00/pci-dma.c
+++ b/arch/frv/mb93090-mb00/pci-dma.c
@@ -48,8 +48,7 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
 			  enum dma_data_direction direction)
 {
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size);
 
@@ -81,8 +80,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
 	void *vaddr;
 	int i;
 
-	if (direction == DMA_NONE)
-                BUG();
+	BUG_ON(direction == DMA_NONE);
 
 	dampr2 = __get_DAMPR(2);
 
-- 
GitLab


From aa1913c0214a53568731617c0afbbfa3f59513fb Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:09 +0100
Subject: [PATCH 2059/2198] FRV: Remove in-kernel strace code

Remove in-kernel strace code from the FRV arch as it's not really needed any
more.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/kernel/ptrace.c | 379 ---------------------------------------
 1 file changed, 379 deletions(-)

diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index 5e7d401d21e7d..6b15e5da311a5 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -306,387 +306,8 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	return ret;
 }
 
-int __nongprelbss kstrace;
-
-static const struct {
-	const char	*name;
-	unsigned	argmask;
-} __syscall_name_table[NR_syscalls] = {
-	[0]	= { "restart_syscall"			},
-	[1]	= { "exit",		0x000001	},
-	[2]	= { "fork",		0xffffff	},
-	[3]	= { "read",		0x000141	},
-	[4]	= { "write",		0x000141	},
-	[5]	= { "open",		0x000235	},
-	[6]	= { "close",		0x000001	},
-	[7]	= { "waitpid",		0x000141	},
-	[8]	= { "creat",		0x000025	},
-	[9]	= { "link",		0x000055	},
-	[10]	= { "unlink",		0x000005	},
-	[11]	= { "execve",		0x000445	},
-	[12]	= { "chdir",		0x000005	},
-	[13]	= { "time",		0x000004	},
-	[14]	= { "mknod",		0x000325	},
-	[15]	= { "chmod",		0x000025	},
-	[16]	= { "lchown",		0x000025	},
-	[17]	= { "break" },
-	[18]	= { "oldstat",		0x000045	},
-	[19]	= { "lseek",		0x000131	},
-	[20]	= { "getpid",		0xffffff	},
-	[21]	= { "mount",		0x043555	},
-	[22]	= { "umount",		0x000005	},
-	[23]	= { "setuid",		0x000001	},
-	[24]	= { "getuid",		0xffffff	},
-	[25]	= { "stime",		0x000004	},
-	[26]	= { "ptrace",		0x004413	},
-	[27]	= { "alarm",		0x000001	},
-	[28]	= { "oldfstat",		0x000041	},
-	[29]	= { "pause",		0xffffff	},
-	[30]	= { "utime",		0x000045	},
-	[31]	= { "stty" },
-	[32]	= { "gtty" },
-	[33]	= { "access",		0x000025	},
-	[34]	= { "nice",		0x000001	},
-	[35]	= { "ftime" },
-	[36]	= { "sync",		0xffffff	},
-	[37]	= { "kill",		0x000011	},
-	[38]	= { "rename",		0x000055	},
-	[39]	= { "mkdir",		0x000025	},
-	[40]	= { "rmdir",		0x000005	},
-	[41]	= { "dup",		0x000001	},
-	[42]	= { "pipe",		0x000004	},
-	[43]	= { "times",		0x000004	},
-	[44]	= { "prof" },
-	[45]	= { "brk",		0x000004	},
-	[46]	= { "setgid",		0x000001	},
-	[47]	= { "getgid",		0xffffff	},
-	[48]	= { "signal",		0x000041	},
-	[49]	= { "geteuid",		0xffffff	},
-	[50]	= { "getegid",		0xffffff	},
-	[51]	= { "acct",		0x000005	},
-	[52]	= { "umount2",		0x000035	},
-	[53]	= { "lock" },
-	[54]	= { "ioctl",		0x000331	},
-	[55]	= { "fcntl",		0x000331	},
-	[56]	= { "mpx" },
-	[57]	= { "setpgid",		0x000011	},
-	[58]	= { "ulimit" },
-	[60]	= { "umask",		0x000002	},
-	[61]	= { "chroot",		0x000005	},
-	[62]	= { "ustat",		0x000043	},
-	[63]	= { "dup2",		0x000011	},
-	[64]	= { "getppid",		0xffffff	},
-	[65]	= { "getpgrp",		0xffffff	},
-	[66]	= { "setsid",		0xffffff	},
-	[67]	= { "sigaction" },
-	[68]	= { "sgetmask" },
-	[69]	= { "ssetmask" },
-	[70]	= { "setreuid" },
-	[71]	= { "setregid" },
-	[72]	= { "sigsuspend" },
-	[73]	= { "sigpending" },
-	[74]	= { "sethostname" },
-	[75]	= { "setrlimit" },
-	[76]	= { "getrlimit" },
-	[77]	= { "getrusage" },
-	[78]	= { "gettimeofday" },
-	[79]	= { "settimeofday" },
-	[80]	= { "getgroups" },
-	[81]	= { "setgroups" },
-	[82]	= { "select" },
-	[83]	= { "symlink" },
-	[84]	= { "oldlstat" },
-	[85]	= { "readlink" },
-	[86]	= { "uselib" },
-	[87]	= { "swapon" },
-	[88]	= { "reboot" },
-	[89]	= { "readdir" },
-	[91]	= { "munmap",		0x000034	},
-	[92]	= { "truncate" },
-	[93]	= { "ftruncate" },
-	[94]	= { "fchmod" },
-	[95]	= { "fchown" },
-	[96]	= { "getpriority" },
-	[97]	= { "setpriority" },
-	[99]	= { "statfs" },
-	[100]	= { "fstatfs" },
-	[102]	= { "socketcall" },
-	[103]	= { "syslog" },
-	[104]	= { "setitimer" },
-	[105]	= { "getitimer" },
-	[106]	= { "stat" },
-	[107]	= { "lstat" },
-	[108]	= { "fstat" },
-	[111]	= { "vhangup" },
-	[114]	= { "wait4" },
-	[115]	= { "swapoff" },
-	[116]	= { "sysinfo" },
-	[117]	= { "ipc" },
-	[118]	= { "fsync" },
-	[119]	= { "sigreturn" },
-	[120]	= { "clone" },
-	[121]	= { "setdomainname" },
-	[122]	= { "uname" },
-	[123]	= { "modify_ldt" },
-	[123]	= { "cacheflush" },
-	[124]	= { "adjtimex" },
-	[125]	= { "mprotect" },
-	[126]	= { "sigprocmask" },
-	[127]	= { "create_module" },
-	[128]	= { "init_module" },
-	[129]	= { "delete_module" },
-	[130]	= { "get_kernel_syms" },
-	[131]	= { "quotactl" },
-	[132]	= { "getpgid" },
-	[133]	= { "fchdir" },
-	[134]	= { "bdflush" },
-	[135]	= { "sysfs" },
-	[136]	= { "personality" },
-	[137]	= { "afs_syscall" },
-	[138]	= { "setfsuid" },
-	[139]	= { "setfsgid" },
-	[140]	= { "_llseek",			0x014331	},
-	[141]	= { "getdents" },
-	[142]	= { "_newselect",		0x000141	},
-	[143]	= { "flock" },
-	[144]	= { "msync" },
-	[145]	= { "readv" },
-	[146]	= { "writev" },
-	[147]	= { "getsid",			0x000001	},
-	[148]	= { "fdatasync",		0x000001	},
-	[149]	= { "_sysctl",			0x000004	},
-	[150]	= { "mlock" },
-	[151]	= { "munlock" },
-	[152]	= { "mlockall" },
-	[153]	= { "munlockall" },
-	[154]	= { "sched_setparam" },
-	[155]	= { "sched_getparam" },
-	[156]	= { "sched_setscheduler" },
-	[157]	= { "sched_getscheduler" },
-	[158]	= { "sched_yield" },
-	[159]	= { "sched_get_priority_max" },
-	[160]	= { "sched_get_priority_min" },
-	[161]	= { "sched_rr_get_interval" },
-	[162]	= { "nanosleep",		0x000044	},
-	[163]	= { "mremap" },
-	[164]	= { "setresuid" },
-	[165]	= { "getresuid" },
-	[166]	= { "vm86" },
-	[167]	= { "query_module" },
-	[168]	= { "poll" },
-	[169]	= { "nfsservctl" },
-	[170]	= { "setresgid" },
-	[171]	= { "getresgid" },
-	[172]	= { "prctl",			0x333331	},
-	[173]	= { "rt_sigreturn",		0xffffff	},
-	[174]	= { "rt_sigaction",		0x001441	},
-	[175]	= { "rt_sigprocmask",		0x001441	},
-	[176]	= { "rt_sigpending",		0x000014	},
-	[177]	= { "rt_sigtimedwait",		0x001444	},
-	[178]	= { "rt_sigqueueinfo",		0x000411	},
-	[179]	= { "rt_sigsuspend",		0x000014	},
-	[180]	= { "pread",			0x003341	},
-	[181]	= { "pwrite",			0x003341	},
-	[182]	= { "chown",			0x000115	},
-	[183]	= { "getcwd" },
-	[184]	= { "capget" },
-	[185]	= { "capset" },
-	[186]	= { "sigaltstack" },
-	[187]	= { "sendfile" },
-	[188]	= { "getpmsg" },
-	[189]	= { "putpmsg" },
-	[190]	= { "vfork",			0xffffff	},
-	[191]	= { "ugetrlimit" },
-	[192]	= { "mmap2",			0x313314	},
-	[193]	= { "truncate64" },
-	[194]	= { "ftruncate64" },
-	[195]	= { "stat64",			0x000045	},
-	[196]	= { "lstat64",			0x000045	},
-	[197]	= { "fstat64",			0x000041	},
-	[198]	= { "lchown32" },
-	[199]	= { "getuid32",			0xffffff	},
-	[200]	= { "getgid32",			0xffffff	},
-	[201]	= { "geteuid32",		0xffffff	},
-	[202]	= { "getegid32",		0xffffff	},
-	[203]	= { "setreuid32" },
-	[204]	= { "setregid32" },
-	[205]	= { "getgroups32" },
-	[206]	= { "setgroups32" },
-	[207]	= { "fchown32" },
-	[208]	= { "setresuid32" },
-	[209]	= { "getresuid32" },
-	[210]	= { "setresgid32" },
-	[211]	= { "getresgid32" },
-	[212]	= { "chown32" },
-	[213]	= { "setuid32" },
-	[214]	= { "setgid32" },
-	[215]	= { "setfsuid32" },
-	[216]	= { "setfsgid32" },
-	[217]	= { "pivot_root" },
-	[218]	= { "mincore" },
-	[219]	= { "madvise" },
-	[220]	= { "getdents64" },
-	[221]	= { "fcntl64" },
-	[223]	= { "security" },
-	[224]	= { "gettid" },
-	[225]	= { "readahead" },
-	[226]	= { "setxattr" },
-	[227]	= { "lsetxattr" },
-	[228]	= { "fsetxattr" },
-	[229]	= { "getxattr" },
-	[230]	= { "lgetxattr" },
-	[231]	= { "fgetxattr" },
-	[232]	= { "listxattr" },
-	[233]	= { "llistxattr" },
-	[234]	= { "flistxattr" },
-	[235]	= { "removexattr" },
-	[236]	= { "lremovexattr" },
-	[237]	= { "fremovexattr" },
-	[238]	= { "tkill" },
-	[239]	= { "sendfile64" },
-	[240]	= { "futex" },
-	[241]	= { "sched_setaffinity" },
-	[242]	= { "sched_getaffinity" },
-	[243]	= { "set_thread_area" },
-	[244]	= { "get_thread_area" },
-	[245]	= { "io_setup" },
-	[246]	= { "io_destroy" },
-	[247]	= { "io_getevents" },
-	[248]	= { "io_submit" },
-	[249]	= { "io_cancel" },
-	[250]	= { "fadvise64" },
-	[252]	= { "exit_group",		0x000001	},
-	[253]	= { "lookup_dcookie" },
-	[254]	= { "epoll_create" },
-	[255]	= { "epoll_ctl" },
-	[256]	= { "epoll_wait" },
-	[257]	= { "remap_file_pages" },
-	[258]	= { "set_tid_address" },
-	[259]	= { "timer_create" },
-	[260]	= { "timer_settime" },
-	[261]	= { "timer_gettime" },
-	[262]	= { "timer_getoverrun" },
-	[263]	= { "timer_delete" },
-	[264]	= { "clock_settime" },
-	[265]	= { "clock_gettime" },
-	[266]	= { "clock_getres" },
-	[267]	= { "clock_nanosleep" },
-	[268]	= { "statfs64" },
-	[269]	= { "fstatfs64" },
-	[270]	= { "tgkill" },
-	[271]	= { "utimes" },
-	[272]	= { "fadvise64_64" },
-	[273]	= { "vserver" },
-	[274]	= { "mbind" },
-	[275]	= { "get_mempolicy" },
-	[276]	= { "set_mempolicy" },
-	[277]	= { "mq_open" },
-	[278]	= { "mq_unlink" },
-	[279]	= { "mq_timedsend" },
-	[280]	= { "mq_timedreceive" },
-	[281]	= { "mq_notify" },
-	[282]	= { "mq_getsetattr" },
-	[283]	= { "sys_kexec_load" },
-};
-
 asmlinkage void do_syscall_trace(int leaving)
 {
-#if 0
-	unsigned long *argp;
-	const char *name;
-	unsigned argmask;
-	char buffer[16];
-
-	if (!kstrace)
-		return;
-
-	if (!current->mm)
-		return;
-
-	if (__frame->gr7 == __NR_close)
-		return;
-
-#if 0
-	if (__frame->gr7 != __NR_mmap2 &&
-	    __frame->gr7 != __NR_vfork &&
-	    __frame->gr7 != __NR_execve &&
-	    __frame->gr7 != __NR_exit)
-		return;
-#endif
-
-	argmask = 0;
-	name = NULL;
-	if (__frame->gr7 < NR_syscalls) {
-		name = __syscall_name_table[__frame->gr7].name;
-		argmask = __syscall_name_table[__frame->gr7].argmask;
-	}
-	if (!name) {
-		sprintf(buffer, "sys_%lx", __frame->gr7);
-		name = buffer;
-	}
-
-	if (!leaving) {
-		if (!argmask) {
-			printk(KERN_CRIT "[%d] %s(%lx,%lx,%lx,%lx,%lx,%lx)\n",
-			       current->pid,
-			       name,
-			       __frame->gr8,
-			       __frame->gr9,
-			       __frame->gr10,
-			       __frame->gr11,
-			       __frame->gr12,
-			       __frame->gr13);
-		}
-		else if (argmask == 0xffffff) {
-			printk(KERN_CRIT "[%d] %s()\n",
-			       current->pid,
-			       name);
-		}
-		else {
-			printk(KERN_CRIT "[%d] %s(",
-			       current->pid,
-			       name);
-
-			argp = &__frame->gr8;
-
-			do {
-				switch (argmask & 0xf) {
-				case 1:
-					printk("%ld", (long) *argp);
-					break;
-				case 2:
-					printk("%lo", *argp);
-					break;
-				case 3:
-					printk("%lx", *argp);
-					break;
-				case 4:
-					printk("%p", (void *) *argp);
-					break;
-				case 5:
-					printk("\"%s\"", (char *) *argp);
-					break;
-				}
-
-				argp++;
-				argmask >>= 4;
-				if (argmask)
-					printk(",");
-
-			} while (argmask);
-
-			printk(")\n");
-		}
-	}
-	else {
-		if ((int)__frame->gr8 > -4096 && (int)__frame->gr8 < 4096)
-			printk(KERN_CRIT "[%d] %s() = %ld\n", current->pid, name, __frame->gr8);
-		else
-			printk(KERN_CRIT "[%d] %s() = %lx\n", current->pid, name, __frame->gr8);
-	}
-	return;
-#endif
-
 	if (!test_thread_flag(TIF_SYSCALL_TRACE))
 		return;
 
-- 
GitLab


From b7bab880c795ec620041ef0295cbbbc5a726f414 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:14 +0100
Subject: [PATCH 2060/2198] FRV: Implement TIF_NOTIFY_RESUME

Implement the TIF_NOTIFY_RESUME thread flag, making it call do_notify_resume()
which then clears it.  This will be made use of later by tracehooks in the
new-style ptrace implementation

Also discard TIF_IRET as that's not used by FRV.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/include/asm/thread_info.h | 10 +++++-----
 arch/frv/kernel/signal.c           |  5 +++++
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h
index bb53ab753ffbf..e8a5ed7be0212 100644
--- a/arch/frv/include/asm/thread_info.h
+++ b/arch/frv/include/asm/thread_info.h
@@ -109,20 +109,20 @@ register struct thread_info *__current_thread_info asm("gr15");
  * - other flags in MSW
  */
 #define TIF_SYSCALL_TRACE	0	/* syscall trace active */
-#define TIF_SIGPENDING		1	/* signal pending */
-#define TIF_NEED_RESCHED	2	/* rescheduling necessary */
-#define TIF_SINGLESTEP		3	/* restore singlestep on return to user mode */
-#define TIF_IRET		4	/* return with iret */
+#define TIF_NOTIFY_RESUME	1	/* callback before returning to user */
+#define TIF_SIGPENDING		2	/* signal pending */
+#define TIF_NEED_RESCHED	3	/* rescheduling necessary */
+#define TIF_SINGLESTEP		4	/* restore singlestep on return to user mode */
 #define TIF_RESTORE_SIGMASK	5	/* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17	/* OOM killer killed process */
 #define TIF_FREEZE		18	/* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP		(1 << TIF_SINGLESTEP)
-#define _TIF_IRET		(1 << TIF_IRET)
 #define _TIF_RESTORE_SIGMASK	(1 << TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_FREEZE		(1 << TIF_FREEZE)
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 3bdb368292a8c..7ae290a161de6 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -564,4 +564,9 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal();
 
+	/* deal with notification on about to resume userspace execution */
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+	}
+
 } /* end do_notify_resume() */
-- 
GitLab


From 24ceb7e8a6c5dd6e32ac3d43a2424542c97989f5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:19 +0100
Subject: [PATCH 2061/2198] FRV: Don't turn on TIF_SYSCALL_TRACE
 unconditionally in syscall prologue

Don't turn on TIF_SYSCALL_TRACE unconditionally in syscall prologue in FRV's
entry.S.  This was originally for debugging stuff and should have been removed
a long time ago.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/kernel/entry.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index 1da523b3298e0..268dfbddee3ba 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -886,7 +886,6 @@ system_call:
 	bnc		icc0,#0,__syscall_badsys
 
 	ldi		@(gr15,#TI_FLAGS),gr4
-	ori		gr4,#_TIF_SYSCALL_TRACE,gr4
 	andicc		gr4,#_TIF_SYSCALL_TRACE,gr0,icc0
 	bne		icc0,#0,__syscall_trace_entry
 
-- 
GitLab


From 4a3b98932270f5d69f2c081924e356325ed704d9 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:05:24 +0100
Subject: [PATCH 2062/2198] FRV: Implement new-style ptrace

Implement the new-style ptrace for FRV, including adding appropriate
tracehooks.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/frv/Kconfig               |   1 +
 arch/frv/include/asm/elf.h     |   1 +
 arch/frv/include/asm/ptrace.h  |  11 +-
 arch/frv/include/asm/syscall.h | 123 +++++++++++
 arch/frv/kernel/entry.S        |  12 +-
 arch/frv/kernel/ptrace.c       | 378 +++++++++++++++++++--------------
 arch/frv/kernel/signal.c       |   5 +
 7 files changed, 367 insertions(+), 164 deletions(-)
 create mode 100644 arch/frv/include/asm/syscall.h

diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig
index 9d1552a9ee2c8..8a5bd7a9c6f53 100644
--- a/arch/frv/Kconfig
+++ b/arch/frv/Kconfig
@@ -6,6 +6,7 @@ config FRV
 	bool
 	default y
 	select HAVE_IDE
+	select HAVE_ARCH_TRACEHOOK
 
 config ZONE_DMA
 	bool
diff --git a/arch/frv/include/asm/elf.h b/arch/frv/include/asm/elf.h
index 7279ec07d62e3..7bbf6e47f8c8a 100644
--- a/arch/frv/include/asm/elf.h
+++ b/arch/frv/include/asm/elf.h
@@ -116,6 +116,7 @@ do {											\
 } while(0)
 
 #define USE_ELF_CORE_DUMP
+#define CORE_DUMP_USE_REGSET
 #define ELF_FDPIC_CORE_EFLAGS	EF_FRV_FDPIC
 #define ELF_EXEC_PAGESIZE	16384
 
diff --git a/arch/frv/include/asm/ptrace.h b/arch/frv/include/asm/ptrace.h
index cf6934012b64d..a54b535c9e493 100644
--- a/arch/frv/include/asm/ptrace.h
+++ b/arch/frv/include/asm/ptrace.h
@@ -65,6 +65,8 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
+struct task_struct;
+
 /*
  * we dedicate GR28 to keeping a pointer to the current exception frame
  * - gr28 is destroyed on entry to the kernel from userspace
@@ -73,11 +75,18 @@ register struct pt_regs *__frame asm("gr28");
 
 #define user_mode(regs)			(!((regs)->psr & PSR_S))
 #define instruction_pointer(regs)	((regs)->pc)
+#define user_stack_pointer(regs)	((regs)->sp)
 
 extern unsigned long user_stack(const struct pt_regs *);
 extern void show_regs(struct pt_regs *);
 #define profile_pc(regs) ((regs)->pc)
-#endif
+
+#define task_pt_regs(task) ((task)->thread.frame0)
+
+#define arch_has_single_step()	(1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
 
 #endif /* !__ASSEMBLY__ */
+#endif /* __KERNEL__ */
 #endif /* _ASM_PTRACE_H */
diff --git a/arch/frv/include/asm/syscall.h b/arch/frv/include/asm/syscall.h
new file mode 100644
index 0000000000000..70689eb29b988
--- /dev/null
+++ b/arch/frv/include/asm/syscall.h
@@ -0,0 +1,123 @@
+/* syscall parameter access functions
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_SYSCALL_H
+#define _ASM_SYSCALL_H
+
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+/*
+ * Get the system call number or -1
+ */
+static inline long syscall_get_nr(struct task_struct *task,
+				  struct pt_regs *regs)
+{
+	return regs->syscallno;
+}
+
+/*
+ * Restore the clobbered GR8 register
+ * (1st syscall arg was overwritten with syscall return or error)
+ */
+static inline void syscall_rollback(struct task_struct *task,
+				    struct pt_regs *regs)
+{
+	regs->gr8 = regs->orig_gr8;
+}
+
+/*
+ * See if the syscall return value is an error, returning it if it is and 0 if
+ * not
+ */
+static inline long syscall_get_error(struct task_struct *task,
+				     struct pt_regs *regs)
+{
+	return IS_ERR_VALUE(regs->gr8) ? regs->gr8 : 0;
+}
+
+/*
+ * Get the syscall return value
+ */
+static inline long syscall_get_return_value(struct task_struct *task,
+					    struct pt_regs *regs)
+{
+	return regs->gr8;
+}
+
+/*
+ * Set the syscall return value
+ */
+static inline void syscall_set_return_value(struct task_struct *task,
+					    struct pt_regs *regs,
+					    int error, long val)
+{
+	if (error)
+		regs->gr8 = -error;
+	else
+		regs->gr8 = val;
+}
+
+/*
+ * Retrieve the system call arguments
+ */
+static inline void syscall_get_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 unsigned int i, unsigned int n,
+					 unsigned long *args)
+{
+	/*
+	 * Do this simply for now. If we need to start supporting
+	 * fetching arguments from arbitrary indices, this will need some
+	 * extra logic. Presently there are no in-tree users that depend
+	 * on this behaviour.
+	 */
+	BUG_ON(i);
+
+	/* Argument pattern is: GR8, GR9, GR10, GR11, GR12, GR13 */
+	switch (n) {
+	case 6: args[5] = regs->gr13;
+	case 5: args[4] = regs->gr12;
+	case 4: args[3] = regs->gr11;
+	case 3: args[2] = regs->gr10;
+	case 2: args[1] = regs->gr9;
+	case 1:	args[0] = regs->gr8;
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Alter the system call arguments
+ */
+static inline void syscall_set_arguments(struct task_struct *task,
+					 struct pt_regs *regs,
+					 unsigned int i, unsigned int n,
+					 const unsigned long *args)
+{
+	/* Same note as above applies */
+	BUG_ON(i);
+
+	switch (n) {
+	case 6: regs->gr13 = args[5];
+	case 5: regs->gr12 = args[4];
+	case 4: regs->gr11 = args[3];
+	case 3: regs->gr10 = args[2];
+	case 2: regs->gr9  = args[1];
+	case 1: regs->gr8  = args[0];
+		break;
+	default:
+		BUG();
+	}
+}
+
+#endif /* _ASM_SYSCALL_H */
diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S
index 268dfbddee3ba..356e0e327a892 100644
--- a/arch/frv/kernel/entry.S
+++ b/arch/frv/kernel/entry.S
@@ -1149,11 +1149,10 @@ __entry_work_notifysig:
 	# perform syscall entry tracing
 __syscall_trace_entry:
 	LEDS		0x6320
-	setlos.p	#0,gr8
-	call		do_syscall_trace
+	call		syscall_trace_entry
 
-	ldi		@(gr28,#REG_SYSCALLNO),gr7
-	lddi		@(gr28,#REG_GR(8)) ,gr8
+	lddi.p		@(gr28,#REG_GR(8)) ,gr8
+	ori		gr8,#0,gr7		; syscall_trace_entry() returned new syscallno
 	lddi		@(gr28,#REG_GR(10)),gr10
 	lddi.p		@(gr28,#REG_GR(12)),gr12
 
@@ -1168,11 +1167,10 @@ __syscall_exit_work:
 	beq		icc0,#1,__entry_work_pending
 
 	movsg		psr,gr23
-	andi		gr23,#~PSR_PIL,gr23	; could let do_syscall_trace() call schedule()
+	andi		gr23,#~PSR_PIL,gr23	; could let syscall_trace_exit() call schedule()
 	movgs		gr23,psr
 
-	setlos.p	#1,gr8
-	call		do_syscall_trace
+	call		syscall_trace_exit
 	bra		__entry_resume_userspace
 
 __syscall_badsys:
diff --git a/arch/frv/kernel/ptrace.c b/arch/frv/kernel/ptrace.c
index 6b15e5da311a5..60eeed3694c07 100644
--- a/arch/frv/kernel/ptrace.c
+++ b/arch/frv/kernel/ptrace.c
@@ -19,6 +19,9 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/signal.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -32,6 +35,169 @@
  * in exit.c or in signal.c.
  */
 
+/*
+ * retrieve the contents of FRV userspace general registers
+ */
+static int genregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
+{
+	const struct user_int_regs *iregs = &target->thread.user->i;
+	int ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  iregs, 0, sizeof(*iregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					sizeof(*iregs), -1);
+}
+
+/*
+ * update the contents of the FRV userspace general registers
+ */
+static int genregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	struct user_int_regs *iregs = &target->thread.user->i;
+	unsigned int offs_gr0, offs_gr1;
+	int ret;
+
+	/* not allowed to set PSR or __status */
+	if (pos < offsetof(struct user_int_regs, psr) + sizeof(long) &&
+	    pos + count > offsetof(struct user_int_regs, psr))
+		return -EIO;
+
+	if (pos < offsetof(struct user_int_regs, __status) + sizeof(long) &&
+	    pos + count > offsetof(struct user_int_regs, __status))
+		return -EIO;
+
+	/* set the control regs */
+	offs_gr0 = offsetof(struct user_int_regs, gr[0]);
+	offs_gr1 = offsetof(struct user_int_regs, gr[1]);
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 iregs, 0, offs_gr0);
+	if (ret < 0)
+		return ret;
+
+	/* skip GR0/TBR */
+	ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					offs_gr0, offs_gr1);
+	if (ret < 0)
+		return ret;
+
+	/* set the general regs */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &iregs->gr[1], offs_gr1, sizeof(*iregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					sizeof(*iregs), -1);
+}
+
+/*
+ * retrieve the contents of FRV userspace FP/Media registers
+ */
+static int fpmregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
+{
+	const struct user_fpmedia_regs *fpregs = &target->thread.user->f;
+	int ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  fpregs, 0, sizeof(*fpregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					sizeof(*fpregs), -1);
+}
+
+/*
+ * update the contents of the FRV userspace FP/Media registers
+ */
+static int fpmregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	struct user_fpmedia_regs *fpregs = &target->thread.user->f;
+	int ret;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 fpregs, 0, sizeof(*fpregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					sizeof(*fpregs), -1);
+}
+
+/*
+ * determine if the FP/Media registers have actually been used
+ */
+static int fpmregs_active(struct task_struct *target,
+			  const struct user_regset *regset)
+{
+	return tsk_used_math(target) ? regset->n : 0;
+}
+
+/*
+ * Define the register sets available on the FRV under Linux
+ */
+enum frv_regset {
+	REGSET_GENERAL,
+	REGSET_FPMEDIA,
+};
+
+static const struct user_regset frv_regsets[] = {
+	/*
+	 * General register format is:
+	 *	PSR, ISR, CCR, CCCR, LR, LCR, PC, (STATUS), SYSCALLNO, ORIG_G8
+	 *	GNER0-1, IACC0, TBR, GR1-63
+	 */
+	[REGSET_GENERAL] = {
+		.core_note_type	= NT_PRSTATUS,
+		.n		= ELF_NGREG,
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= genregs_get,
+		.set		= genregs_set,
+	},
+	/*
+	 * FPU/Media register format is:
+	 *	FR0-63, FNER0-1, MSR0-1, ACC0-7, ACCG0-8, FSR
+	 */
+	[REGSET_FPMEDIA] = {
+		.core_note_type	= NT_PRFPREG,
+		.n		= sizeof(struct user_fpmedia_regs) / sizeof(long),
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= fpmregs_get,
+		.set		= fpmregs_set,
+		.active		= fpmregs_active,
+	},
+};
+
+static const struct user_regset_view user_frv_native_view = {
+	.name		= "frv",
+	.e_machine	= EM_FRV,
+	.regsets	= frv_regsets,
+	.n		= ARRAY_SIZE(frv_regsets),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &user_frv_native_view;
+}
+
 /*
  * Get contents of register REGNO in task TASK.
  */
@@ -68,41 +234,24 @@ static inline int put_reg(struct task_struct *task, int regno,
 	}
 }
 
-/*
- * check that an address falls within the bounds of the target process's memory
- * mappings
- */
-static inline int is_user_addr_valid(struct task_struct *child,
-				     unsigned long start, unsigned long len)
-{
-#ifdef CONFIG_MMU
-	if (start >= PAGE_OFFSET || len > PAGE_OFFSET - start)
-		return -EIO;
-	return 0;
-#else
-	struct vm_area_struct *vma;
-
-	vma = find_vma(child->mm, start);
-	if (vma && start >= vma->vm_start && start + len <= vma->vm_end)
-		return 0;
-
-	return -EIO;
-#endif
-}
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
  * Control h/w single stepping
  */
-void ptrace_disable(struct task_struct *child)
+void user_enable_single_step(struct task_struct *child)
+{
+	child->thread.frame0->__status |= REG__STATUS_STEP;
+}
+
+void user_disable_single_step(struct task_struct *child)
 {
 	child->thread.frame0->__status &= ~REG__STATUS_STEP;
 }
 
-void ptrace_enable(struct task_struct *child)
+void ptrace_disable(struct task_struct *child)
 {
-	child->thread.frame0->__status |= REG__STATUS_STEP;
+	user_disable_single_step(child);
 }
 
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
@@ -111,15 +260,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 	int ret;
 
 	switch (request) {
-		/* when I and D space are separate, these will need to be fixed. */
-	case PTRACE_PEEKTEXT: /* read word at location addr. */
-	case PTRACE_PEEKDATA:
-		ret = -EIO;
-		if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
-			break;
-		ret = generic_ptrace_peekdata(child, addr, data);
-		break;
-
 		/* read the word at location addr in the USER area. */
 	case PTRACE_PEEKUSR: {
 		tmp = 0;
@@ -163,15 +303,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 	}
 
-		/* when I and D space are separate, this will have to be fixed. */
-	case PTRACE_POKETEXT: /* write the word at location addr. */
-	case PTRACE_POKEDATA:
-		ret = -EIO;
-		if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
-			break;
-		ret = generic_ptrace_pokedata(child, addr, data);
-		break;
-
 	case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
 		ret = -EIO;
 		if ((addr & 3) || addr < 0)
@@ -179,7 +310,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
 		ret = 0;
 		switch (addr >> 2) {
-		case 0 ... PT__END-1:
+		case 0 ... PT__END - 1:
 			ret = put_reg(child, addr >> 2, data);
 			break;
 
@@ -189,95 +320,29 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		}
 		break;
 
-	case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-	case PTRACE_CONT: /* restart after signal. */
-		ret = -EIO;
-		if (!valid_signal(data))
-			break;
-		if (request == PTRACE_SYSCALL)
-			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		else
-			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		child->exit_code = data;
-		ptrace_disable(child);
-		wake_up_process(child);
-		ret = 0;
-		break;
-
-		/* make the child exit.  Best I can do is send it a sigkill.
-		 * perhaps it should be put in the status that it wants to
-		 * exit.
-		 */
-	case PTRACE_KILL:
-		ret = 0;
-		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
-			break;
-		child->exit_code = SIGKILL;
-		clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-		ptrace_disable(child);
-		wake_up_process(child);
-		break;
-
-	case PTRACE_SINGLESTEP:  /* set the trap flag. */
-		ret = -EIO;
-		if (!valid_signal(data))
-			break;
-		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		ptrace_enable(child);
-		child->exit_code = data;
-		wake_up_process(child);
-		ret = 0;
-		break;
-
-	case PTRACE_DETACH:	/* detach a process that was attached. */
-		ret = ptrace_detach(child, data);
-		break;
-
-	case PTRACE_GETREGS: { /* Get all integer regs from the child. */
-		int i;
-		for (i = 0; i < PT__GPEND; i++) {
-			tmp = get_reg(child, i);
-			if (put_user(tmp, (unsigned long *) data)) {
-				ret = -EFAULT;
-				break;
-			}
-			data += sizeof(long);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_SETREGS: { /* Set all integer regs in the child. */
-		int i;
-		for (i = 0; i < PT__GPEND; i++) {
-			if (get_user(tmp, (unsigned long *) data)) {
-				ret = -EFAULT;
-				break;
-			}
-			put_reg(child, i, tmp);
-			data += sizeof(long);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_GETFPREGS: { /* Get the child FP/Media state. */
-		ret = 0;
-		if (copy_to_user((void *) data,
-				 &child->thread.user->f,
-				 sizeof(child->thread.user->f)))
-			ret = -EFAULT;
-		break;
-	}
-
-	case PTRACE_SETFPREGS: { /* Set the child FP/Media state. */
-		ret = 0;
-		if (copy_from_user(&child->thread.user->f,
-				   (void *) data,
-				   sizeof(child->thread.user->f)))
-			ret = -EFAULT;
-		break;
-	}
+	case PTRACE_GETREGS:	/* Get all integer regs from the child. */
+		return copy_regset_to_user(child, &user_frv_native_view,
+					   REGSET_GENERAL,
+					   0, sizeof(child->thread.user->i),
+					   (void __user *)data);
+
+	case PTRACE_SETREGS:	/* Set all integer regs in the child. */
+		return copy_regset_from_user(child, &user_frv_native_view,
+					     REGSET_GENERAL,
+					     0, sizeof(child->thread.user->i),
+					     (const void __user *)data);
+
+	case PTRACE_GETFPREGS:	/* Get the child FP/Media state. */
+		return copy_regset_to_user(child, &user_frv_native_view,
+					   REGSET_FPMEDIA,
+					   0, sizeof(child->thread.user->f),
+					   (void __user *)data);
+
+	case PTRACE_SETFPREGS:	/* Set the child FP/Media state. */
+		return copy_regset_from_user(child, &user_frv_native_view,
+					     REGSET_FPMEDIA,
+					     0, sizeof(child->thread.user->f),
+					     (const void __user *)data);
 
 	case PTRACE_GETFDPIC:
 		tmp = 0;
@@ -300,35 +365,36 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 		break;
 
 	default:
-		ret = -EIO;
+		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
 	return ret;
 }
 
-asmlinkage void do_syscall_trace(int leaving)
+/*
+ * handle tracing of system call entry
+ * - return the revised system call number or ULONG_MAX to cause ENOSYS
+ */
+asmlinkage unsigned long syscall_trace_entry(void)
 {
-	if (!test_thread_flag(TIF_SYSCALL_TRACE))
-		return;
-
-	if (!(current->ptrace & PT_PTRACED))
-		return;
-
-	/* we need to indicate entry or exit to strace */
-	if (leaving)
-		__frame->__status |= REG__STATUS_SYSC_EXIT;
-	else
-		__frame->__status |= REG__STATUS_SYSC_ENTRY;
+	__frame->__status |= REG__STATUS_SYSC_ENTRY;
+	if (tracehook_report_syscall_entry(__frame)) {
+		/* tracing decided this syscall should not happen, so
+		 * We'll return a bogus call number to get an ENOSYS
+		 * error, but leave the original number in
+		 * __frame->syscallno
+		 */
+		return ULONG_MAX;
+	}
 
-	ptrace_notify(SIGTRAP);
+	return __frame->syscallno;
+}
 
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
+/*
+ * handle tracing of system call exit
+ */
+asmlinkage void syscall_trace_exit(void)
+{
+	__frame->__status |= REG__STATUS_SYSC_EXIT;
+	tracehook_report_syscall_exit(__frame, 0);
 }
diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c
index 7ae290a161de6..4a7a62c6e7833 100644
--- a/arch/frv/kernel/signal.c
+++ b/arch/frv/kernel/signal.c
@@ -21,6 +21,7 @@
 #include <linux/unistd.h>
 #include <linux/personality.h>
 #include <linux/freezer.h>
+#include <linux/tracehook.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -516,6 +517,9 @@ static void do_signal(void)
 			 * clear the TIF_RESTORE_SIGMASK flag */
 			if (test_thread_flag(TIF_RESTORE_SIGMASK))
 				clear_thread_flag(TIF_RESTORE_SIGMASK);
+
+			tracehook_signal_handler(signr, &info, &ka, __frame,
+						 test_thread_flag(TIF_SINGLESTEP));
 		}
 
 		return;
@@ -567,6 +571,7 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
 	/* deal with notification on about to resume userspace execution */
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(__frame);
 	}
 
 } /* end do_notify_resume() */
-- 
GitLab


From fd4f683d045e053abb093f80d81afce30ceadad2 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:08:32 +0100
Subject: [PATCH 2063/2198] MN10300: Don't set the dirty bit in the DTLB
 entries in the TLB-miss handler

Remove the special handling for the Data TLB entry dirty bit in the TLB-miss
handler.  As the code stands, all that it does is to cause us to take a second
data address exception to set the dirty bit.  Instead, we can just let
pte_mkdirty() set the bit.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/mm/tlb-mn10300.S | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/arch/mn10300/mm/tlb-mn10300.S b/arch/mn10300/mm/tlb-mn10300.S
index 789208094e985..7095147dcb8ba 100644
--- a/arch/mn10300/mm/tlb-mn10300.S
+++ b/arch/mn10300/mm/tlb-mn10300.S
@@ -165,24 +165,6 @@ ENTRY(itlb_aerror)
 ENTRY(dtlb_aerror)
 	and	~EPSW_NMID,epsw
 	add	-4,sp
-	mov	d1,(sp)
-
-	movhu	(MMUFCR_DFC),d1			# is it the initial valid write
-						# to this page?
-	and	MMUFCR_xFC_INITWR,d1
- 	beq	dtlb_pagefault			# jump if not
-
-	mov	(DPTEL),d1			# set the dirty bit
-						# (don't replace with BSET!)
-	or	_PAGE_DIRTY,d1
-	mov	d1,(DPTEL)
-	mov	(sp),d1
-	add	4,sp
- 	rti
-
-	ALIGN
-dtlb_pagefault:
-	mov	(sp),d1
 	SAVE_ALL
 	add	-4,sp				# need to pass three params
 
-- 
GitLab


From 5d289964e1f1e8a2ec4289274bf15bce6a4f8ab8 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 11 Jun 2009 13:08:37 +0100
Subject: [PATCH 2064/2198] MN10300: Add utrace/tracehooks support

Add utrace/tracehooks support to MN10300.

Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mn10300/Kconfig                 |   1 +
 arch/mn10300/include/asm/elf.h       |   3 +-
 arch/mn10300/include/asm/processor.h |   8 +-
 arch/mn10300/include/asm/ptrace.h    |   8 +
 arch/mn10300/kernel/entry.S          |  13 +-
 arch/mn10300/kernel/ptrace.c         | 454 ++++++++++++++-------------
 arch/mn10300/kernel/signal.c         |   9 +
 7 files changed, 257 insertions(+), 239 deletions(-)

diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig
index 355926730e8d9..89faacad5d17d 100644
--- a/arch/mn10300/Kconfig
+++ b/arch/mn10300/Kconfig
@@ -8,6 +8,7 @@ mainmenu "Linux Kernel Configuration"
 config MN10300
 	def_bool y
 	select HAVE_OPROFILE
+	select HAVE_ARCH_TRACEHOOK
 
 config AM33
 	def_bool y
diff --git a/arch/mn10300/include/asm/elf.h b/arch/mn10300/include/asm/elf.h
index bf09f8bb392ee..49105462e6fc9 100644
--- a/arch/mn10300/include/asm/elf.h
+++ b/arch/mn10300/include/asm/elf.h
@@ -34,7 +34,7 @@
  */
 typedef unsigned long elf_greg_t;
 
-#define ELF_NGREG (sizeof (struct pt_regs) / sizeof(elf_greg_t))
+#define ELF_NGREG ((sizeof(struct pt_regs) / sizeof(elf_greg_t)) - 1)
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
 #define ELF_NFPREG 32
@@ -76,6 +76,7 @@ do {									\
 } while (0)
 
 #define USE_ELF_CORE_DUMP
+#define CORE_DUMP_USE_REGSET
 #define ELF_EXEC_PAGESIZE	4096
 
 /*
diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h
index 73239271873da..f7d4b0d285e8d 100644
--- a/arch/mn10300/include/asm/processor.h
+++ b/arch/mn10300/include/asm/processor.h
@@ -143,13 +143,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 unsigned long get_wchan(struct task_struct *p);
 
-#define task_pt_regs(task)						\
-({									\
-       struct pt_regs *__regs__;					\
-       __regs__ = (struct pt_regs *) (KSTK_TOP(task_stack_page(task)) - 8); \
-       __regs__ - 1;							\
-})
-
+#define task_pt_regs(task) ((task)->thread.uregs)
 #define KSTK_EIP(task) (task_pt_regs(task)->pc)
 #define KSTK_ESP(task) (task_pt_regs(task)->sp)
 
diff --git a/arch/mn10300/include/asm/ptrace.h b/arch/mn10300/include/asm/ptrace.h
index 7b06cc623d8b0..921942ed1b035 100644
--- a/arch/mn10300/include/asm/ptrace.h
+++ b/arch/mn10300/include/asm/ptrace.h
@@ -91,9 +91,17 @@ extern struct pt_regs *__frame; /* current frame pointer */
 #if defined(__KERNEL__)
 
 #if !defined(__ASSEMBLY__)
+struct task_struct;
+
 #define user_mode(regs)			(((regs)->epsw & EPSW_nSL) == EPSW_nSL)
 #define instruction_pointer(regs)	((regs)->pc)
+#define user_stack_pointer(regs)	((regs)->sp)
 extern void show_regs(struct pt_regs *);
+
+#define arch_has_single_step()	(1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
 #endif  /*  !__ASSEMBLY  */
 
 #define profile_pc(regs) ((regs)->pc)
diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S
index 3dc3e462f92a4..7408a27199f34 100644
--- a/arch/mn10300/kernel/entry.S
+++ b/arch/mn10300/kernel/entry.S
@@ -76,7 +76,7 @@ ENTRY(system_call)
 	cmp	nr_syscalls,d0
 	bcc	syscall_badsys
 	btst	_TIF_SYSCALL_TRACE,(TI_flags,a2)
-	bne	syscall_trace_entry
+	bne	syscall_entry_trace
 syscall_call:
 	add	d0,d0,a1
 	add	a1,a1
@@ -104,11 +104,10 @@ restore_all:
 syscall_exit_work:
 	btst	_TIF_SYSCALL_TRACE,d2
 	beq	work_pending
-	__sti				# could let do_syscall_trace() call
+	__sti				# could let syscall_trace_exit() call
 					# schedule() instead
 	mov	fp,d0
-	mov	1,d1
-	call	do_syscall_trace[],0	# do_syscall_trace(regs,entryexit)
+	call	syscall_trace_exit[],0	# do_syscall_trace(regs)
 	jmp	resume_userspace
 
 	ALIGN
@@ -138,13 +137,11 @@ work_notifysig:
 	jmp	resume_userspace
 
 	# perform syscall entry tracing
-syscall_trace_entry:
+syscall_entry_trace:
 	mov	-ENOSYS,d0
 	mov	d0,(REG_D0,fp)
 	mov	fp,d0
-	clr	d1
-	call	do_syscall_trace[],0
-	mov	(REG_ORIG_D0,fp),d0
+	call	syscall_trace_entry[],0	# returns the syscall number to actually use
 	mov	(REG_D1,fp),d1
 	cmp	nr_syscalls,d0
 	bcs	syscall_call
diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c
index d6d6cdc75c523..e143339ad28e0 100644
--- a/arch/mn10300/kernel/ptrace.c
+++ b/arch/mn10300/kernel/ptrace.c
@@ -17,6 +17,9 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -64,12 +67,6 @@ static inline int get_stack_long(struct task_struct *task, int offset)
 		((unsigned long) task->thread.uregs + offset);
 }
 
-/*
- * this routine will put a word on the processes privileged stack.
- * the offset is how far from the base addr as stored in the TSS.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
 static inline
 int put_stack_long(struct task_struct *task, int offset, unsigned long data)
 {
@@ -80,94 +77,233 @@ int put_stack_long(struct task_struct *task, int offset, unsigned long data)
 	return 0;
 }
 
-static inline unsigned long get_fpregs(struct fpu_state_struct *buf,
-				       struct task_struct *tsk)
+/*
+ * retrieve the contents of MN10300 userspace general registers
+ */
+static int genregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
 {
-	return __copy_to_user(buf, &tsk->thread.fpu_state,
-			      sizeof(struct fpu_state_struct));
+	const struct pt_regs *regs = task_pt_regs(target);
+	int ret;
+
+	/* we need to skip regs->next */
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  regs, 0, PT_ORIG_D0 * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &regs->orig_d0, PT_ORIG_D0 * sizeof(long),
+				  NR_PTREGS * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					NR_PTREGS * sizeof(long), -1);
 }
 
-static inline unsigned long set_fpregs(struct task_struct *tsk,
-				       struct fpu_state_struct *buf)
+/*
+ * update the contents of the MN10300 userspace general registers
+ */
+static int genregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
 {
-	return __copy_from_user(&tsk->thread.fpu_state, buf,
-				sizeof(struct fpu_state_struct));
+	struct pt_regs *regs = task_pt_regs(target);
+	unsigned long tmp;
+	int ret;
+
+	/* we need to skip regs->next */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 regs, 0, PT_ORIG_D0 * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->orig_d0, PT_ORIG_D0 * sizeof(long),
+				 PT_EPSW * sizeof(long));
+	if (ret < 0)
+		return ret;
+
+	/* we need to mask off changes to EPSW */
+	tmp = regs->epsw;
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &tmp, PT_EPSW * sizeof(long),
+				 PT_PC * sizeof(long));
+	tmp &= EPSW_FLAG_V | EPSW_FLAG_C | EPSW_FLAG_N | EPSW_FLAG_Z;
+	tmp |= regs->epsw & ~(EPSW_FLAG_V | EPSW_FLAG_C | EPSW_FLAG_N |
+			      EPSW_FLAG_Z);
+	regs->epsw = tmp;
+
+	if (ret < 0)
+		return ret;
+
+	/* and finally load the PC */
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &regs->pc, PT_PC * sizeof(long),
+				 NR_PTREGS * sizeof(long));
+
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					 NR_PTREGS * sizeof(long), -1);
 }
 
-static inline void fpsave_init(struct task_struct *task)
+/*
+ * retrieve the contents of MN10300 userspace FPU registers
+ */
+static int fpuregs_get(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       void *kbuf, void __user *ubuf)
 {
-	memset(&task->thread.fpu_state, 0, sizeof(struct fpu_state_struct));
+	const struct fpu_state_struct *fpregs = &target->thread.fpu_state;
+	int ret;
+
+	unlazy_fpu(target);
+
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  fpregs, 0, sizeof(*fpregs));
+	if (ret < 0)
+		return ret;
+
+	return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+					sizeof(*fpregs), -1);
 }
 
 /*
- * make sure the single step bit is not set
+ * update the contents of the MN10300 userspace FPU registers
  */
-void ptrace_disable(struct task_struct *child)
+static int fpuregs_set(struct task_struct *target,
+		       const struct user_regset *regset,
+		       unsigned int pos, unsigned int count,
+		       const void *kbuf, const void __user *ubuf)
+{
+	struct fpu_state_struct fpu_state = target->thread.fpu_state;
+	int ret;
+
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+				 &fpu_state, 0, sizeof(fpu_state));
+	if (ret < 0)
+		return ret;
+
+	fpu_kill_state(target);
+	target->thread.fpu_state = fpu_state;
+	set_using_fpu(target);
+
+	return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+					 sizeof(fpu_state), -1);
+}
+
+/*
+ * determine if the FPU registers have actually been used
+ */
+static int fpuregs_active(struct task_struct *target,
+			  const struct user_regset *regset)
+{
+	return is_using_fpu(target) ? regset->n : 0;
+}
+
+/*
+ * Define the register sets available on the MN10300 under Linux
+ */
+enum mn10300_regset {
+	REGSET_GENERAL,
+	REGSET_FPU,
+};
+
+static const struct user_regset mn10300_regsets[] = {
+	/*
+	 * General register format is:
+	 *	A3, A2, D3, D2, MCVF, MCRL, MCRH, MDRQ
+	 *	E1, E0, E7...E2, SP, LAR, LIR, MDR
+	 *	A1, A0, D1, D0, ORIG_D0, EPSW, PC
+	 */
+	[REGSET_GENERAL] = {
+		.core_note_type	= NT_PRSTATUS,
+		.n		= ELF_NGREG,
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= genregs_get,
+		.set		= genregs_set,
+	},
+	/*
+	 * FPU register format is:
+	 *	FS0-31, FPCR
+	 */
+	[REGSET_FPU] = {
+		.core_note_type	= NT_PRFPREG,
+		.n		= sizeof(struct fpu_state_struct) / sizeof(long),
+		.size		= sizeof(long),
+		.align		= sizeof(long),
+		.get		= fpuregs_get,
+		.set		= fpuregs_set,
+		.active		= fpuregs_active,
+	},
+};
+
+static const struct user_regset_view user_mn10300_native_view = {
+	.name		= "mn10300",
+	.e_machine	= EM_MN10300,
+	.regsets	= mn10300_regsets,
+	.n		= ARRAY_SIZE(mn10300_regsets),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+	return &user_mn10300_native_view;
+}
+
+/*
+ * set the single-step bit
+ */
+void user_enable_single_step(struct task_struct *child)
 {
 #ifndef CONFIG_MN10300_USING_JTAG
 	struct user *dummy = NULL;
 	long tmp;
 
 	tmp = get_stack_long(child, (unsigned long) &dummy->regs.epsw);
-	tmp &= ~EPSW_T;
+	tmp |= EPSW_T;
 	put_stack_long(child, (unsigned long) &dummy->regs.epsw, tmp);
 #endif
 }
 
 /*
- * set the single step bit
+ * make sure the single-step bit is not set
  */
-void ptrace_enable(struct task_struct *child)
+void user_disable_single_step(struct task_struct *child)
 {
 #ifndef CONFIG_MN10300_USING_JTAG
 	struct user *dummy = NULL;
 	long tmp;
 
 	tmp = get_stack_long(child, (unsigned long) &dummy->regs.epsw);
-	tmp |= EPSW_T;
+	tmp &= ~EPSW_T;
 	put_stack_long(child, (unsigned long) &dummy->regs.epsw, tmp);
 #endif
 }
 
+void ptrace_disable(struct task_struct *child)
+{
+	user_disable_single_step(child);
+}
+
 /*
  * handle the arch-specific side of process tracing
  */
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-	struct fpu_state_struct fpu_state;
-	int i, ret;
+	unsigned long tmp;
+	int ret;
 
 	switch (request) {
-	/* read the word at location addr. */
-	case PTRACE_PEEKTEXT: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp, (unsigned long *) data);
-		break;
-	}
-
-	/* read the word at location addr. */
-	case PTRACE_PEEKDATA: {
-		unsigned long tmp;
-		int copied;
-
-		copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-		ret = -EIO;
-		if (copied != sizeof(tmp))
-			break;
-		ret = put_user(tmp, (unsigned long *) data);
-		break;
-	}
-
 	/* read the word at location addr in the USER area. */
-	case PTRACE_PEEKUSR: {
-		unsigned long tmp;
-
+	case PTRACE_PEEKUSR:
 		ret = -EIO;
 		if ((addr & 3) || addr < 0 ||
 		    addr > sizeof(struct user) - 3)
@@ -179,17 +315,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 					     ptrace_regid_to_frame[addr]);
 		ret = put_user(tmp, (unsigned long *) data);
 		break;
-	}
-
-	/* write the word at location addr. */
-	case PTRACE_POKETEXT:
-	case PTRACE_POKEDATA:
-		if (access_process_vm(child, addr, &data, sizeof(data), 1) ==
-		    sizeof(data))
-			ret = 0;
-		else
-			ret = -EIO;
-		break;
 
 		/* write the word at location addr in the USER area */
 	case PTRACE_POKEUSR:
@@ -204,132 +329,32 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 					     data);
 		break;
 
-		/* continue and stop at next (return from) syscall */
-	case PTRACE_SYSCALL:
-		/* restart after signal. */
-	case PTRACE_CONT:
-		ret = -EIO;
-		if ((unsigned long) data > _NSIG)
-			break;
-		if (request == PTRACE_SYSCALL)
-			set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		else
-			clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		child->exit_code = data;
-		ptrace_disable(child);
-		wake_up_process(child);
-		ret = 0;
-		break;
-
-		/*
-		 * make the child exit
-		 * - the best I can do is send it a sigkill
-		 * - perhaps it should be put in the status that it wants to
-		 *   exit
-		 */
-	case PTRACE_KILL:
-		ret = 0;
-		if (child->exit_state == EXIT_ZOMBIE)	/* already dead */
-			break;
-		child->exit_code = SIGKILL;
-		clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-		ptrace_disable(child);
-		wake_up_process(child);
-		break;
-
-	case PTRACE_SINGLESTEP: /* set the trap flag. */
-#ifndef CONFIG_MN10300_USING_JTAG
-		ret = -EIO;
-		if ((unsigned long) data > _NSIG)
-			break;
-		clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-		ptrace_enable(child);
-		child->exit_code = data;
-		wake_up_process(child);
-		ret = 0;
-#else
-		ret = -EINVAL;
-#endif
-		break;
-
-	case PTRACE_DETACH:	/* detach a process that was attached. */
-		ret = ptrace_detach(child, data);
-		break;
-
-		/* Get all gp regs from the child. */
-	case PTRACE_GETREGS: {
-		unsigned long tmp;
-
-		if (!access_ok(VERIFY_WRITE, (unsigned *) data, NR_PTREGS << 2)) {
-			ret = -EIO;
-			break;
-		}
-
-		for (i = 0; i < NR_PTREGS << 2; i += 4) {
-			tmp = get_stack_long(child, ptrace_regid_to_frame[i]);
-			__put_user(tmp, (unsigned long *) data);
-			data += sizeof(tmp);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-		unsigned long tmp;
-
-		if (!access_ok(VERIFY_READ, (unsigned long *)data,
-			       sizeof(struct pt_regs))) {
-			ret = -EIO;
-			break;
-		}
-
-		for (i = 0; i < NR_PTREGS << 2; i += 4) {
-			__get_user(tmp, (unsigned long *) data);
-			put_stack_long(child, ptrace_regid_to_frame[i], tmp);
-			data += sizeof(tmp);
-		}
-		ret = 0;
-		break;
-	}
-
-	case PTRACE_GETFPREGS: { /* Get the child FPU state. */
-		if (is_using_fpu(child)) {
-			unlazy_fpu(child);
-			fpu_state = child->thread.fpu_state;
-		} else {
-			memset(&fpu_state, 0, sizeof(fpu_state));
-		}
-
-		ret = -EIO;
-		if (copy_to_user((void *) data, &fpu_state,
-				 sizeof(fpu_state)) == 0)
-			ret = 0;
-		break;
-	}
-
-	case PTRACE_SETFPREGS: { /* Set the child FPU state. */
-		ret = -EFAULT;
-		if (copy_from_user(&fpu_state, (const void *) data,
-				   sizeof(fpu_state)) == 0) {
-			fpu_kill_state(child);
-			child->thread.fpu_state = fpu_state;
-			set_using_fpu(child);
-			ret = 0;
-		}
-		break;
-	}
-
-	case PTRACE_SETOPTIONS: {
-		if (data & PTRACE_O_TRACESYSGOOD)
-			child->ptrace |= PT_TRACESYSGOOD;
-		else
-			child->ptrace &= ~PT_TRACESYSGOOD;
-		ret = 0;
-		break;
-	}
+	case PTRACE_GETREGS:	/* Get all integer regs from the child. */
+		return copy_regset_to_user(child, &user_mn10300_native_view,
+					   REGSET_GENERAL,
+					   0, NR_PTREGS * sizeof(long),
+					   (void __user *)data);
+
+	case PTRACE_SETREGS:	/* Set all integer regs in the child. */
+		return copy_regset_from_user(child, &user_mn10300_native_view,
+					     REGSET_GENERAL,
+					     0, NR_PTREGS * sizeof(long),
+					     (const void __user *)data);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_mn10300_native_view,
+					   REGSET_FPU,
+					   0, sizeof(struct fpu_state_struct),
+					   (void __user *)data);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(child, &user_mn10300_native_view,
+					     REGSET_FPU,
+					     0, sizeof(struct fpu_state_struct),
+					     (const void __user *)data);
 
 	default:
-		ret = -EIO;
+		ret = ptrace_request(child, request, addr, data);
 		break;
 	}
 
@@ -337,43 +362,26 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 }
 
 /*
- * notification of system call entry/exit
- * - triggered by current->work.syscall_trace
+ * handle tracing of system call entry
+ * - return the revised system call number or ULONG_MAX to cause ENOSYS
  */
-asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
+asmlinkage unsigned long syscall_trace_entry(struct pt_regs *regs)
 {
-#if 0
-	/* just in case... */
-	printk(KERN_DEBUG "[%d] syscall_%lu(%lx,%lx,%lx,%lx) = %lx\n",
-	       current->pid,
-	       regs->orig_d0,
-	       regs->a0,
-	       regs->d1,
-	       regs->a3,
-	       regs->a2,
-	       regs->d0);
-	return;
-#endif
-
-	if (!test_thread_flag(TIF_SYSCALL_TRACE) &&
-	    !test_thread_flag(TIF_SINGLESTEP))
-		return;
-	if (!(current->ptrace & PT_PTRACED))
-		return;
+	if (tracehook_report_syscall_entry(regs))
+		/* tracing decided this syscall should not happen, so
+		 * We'll return a bogus call number to get an ENOSYS
+		 * error, but leave the original number in
+		 * regs->orig_d0
+		 */
+		return ULONG_MAX;
 
-	/* the 0x80 provides a way for the tracing parent to distinguish
-	   between a syscall stop and SIGTRAP delivery */
-	ptrace_notify(SIGTRAP |
-		      ((current->ptrace & PT_TRACESYSGOOD) &&
-		       !test_thread_flag(TIF_SINGLESTEP) ? 0x80 : 0));
+	return regs->orig_d0;
+}
 
-	/*
-	 * this isn't the same as continuing with a signal, but it will do
-	 * for normal use.  strace only continues with a signal if the
-	 * stopping signal is not SIGTRAP.  -brl
-	 */
-	if (current->exit_code) {
-		send_sig(current->exit_code, current, 1);
-		current->exit_code = 0;
-	}
+/*
+ * handle tracing of system call exit
+ */
+asmlinkage void syscall_trace_exit(struct pt_regs *regs)
+{
+	tracehook_report_syscall_exit(regs, 0);
 }
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index 841ca9955a181..9f7572a0f5784 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -23,6 +23,7 @@
 #include <linux/tty.h>
 #include <linux/personality.h>
 #include <linux/suspend.h>
+#include <linux/tracehook.h>
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
@@ -511,6 +512,9 @@ static void do_signal(struct pt_regs *regs)
 			 * clear the TIF_RESTORE_SIGMASK flag */
 			if (test_thread_flag(TIF_RESTORE_SIGMASK))
 				clear_thread_flag(TIF_RESTORE_SIGMASK);
+
+			tracehook_signal_handler(signr, &info, &ka, regs,
+						 test_thread_flag(TIF_SINGLESTEP));
 		}
 
 		return;
@@ -561,4 +565,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(__frame);
+	}
 }
-- 
GitLab


From 3c7b4e6b8be4c16f1e6e5c558e33b7ff0db2dfaf Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:22:39 +0100
Subject: [PATCH 2065/2198] kmemleak: Add the base support

This patch adds the base support for the kernel memory leak
detector. It traces the memory allocation/freeing in a way similar to
the Boehm's conservative garbage collector, the difference being that
the unreferenced objects are not freed but only shown in
/sys/kernel/debug/kmemleak. Enabling this feature introduces an
overhead to memory allocations.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Andrew Morton <akpm@linux-foundation.org>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/kmemleak.h |   96 +++
 init/main.c              |    4 +-
 mm/kmemleak.c            | 1498 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 1597 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/kmemleak.h
 create mode 100644 mm/kmemleak.c

diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h
new file mode 100644
index 0000000000000..7796aed6cdd5f
--- /dev/null
+++ b/include/linux/kmemleak.h
@@ -0,0 +1,96 @@
+/*
+ * include/linux/kmemleak.h
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef __KMEMLEAK_H
+#define __KMEMLEAK_H
+
+#ifdef CONFIG_DEBUG_KMEMLEAK
+
+extern void kmemleak_init(void);
+extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+			   gfp_t gfp);
+extern void kmemleak_free(const void *ptr);
+extern void kmemleak_padding(const void *ptr, unsigned long offset,
+			     size_t size);
+extern void kmemleak_not_leak(const void *ptr);
+extern void kmemleak_ignore(const void *ptr);
+extern void kmemleak_scan_area(const void *ptr, unsigned long offset,
+			       size_t length, gfp_t gfp);
+extern void kmemleak_no_scan(const void *ptr);
+
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+					    int min_count, unsigned long flags,
+					    gfp_t gfp)
+{
+	if (!(flags & SLAB_NOLEAKTRACE))
+		kmemleak_alloc(ptr, size, min_count, gfp);
+}
+
+static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
+{
+	if (!(flags & SLAB_NOLEAKTRACE))
+		kmemleak_free(ptr);
+}
+
+static inline void kmemleak_erase(void **ptr)
+{
+	*ptr = NULL;
+}
+
+#else
+
+static inline void kmemleak_init(void)
+{
+}
+static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+				  gfp_t gfp)
+{
+}
+static inline void kmemleak_alloc_recursive(const void *ptr, size_t size,
+					    int min_count, unsigned long flags,
+					    gfp_t gfp)
+{
+}
+static inline void kmemleak_free(const void *ptr)
+{
+}
+static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags)
+{
+}
+static inline void kmemleak_not_leak(const void *ptr)
+{
+}
+static inline void kmemleak_ignore(const void *ptr)
+{
+}
+static inline void kmemleak_scan_area(const void *ptr, unsigned long offset,
+				      size_t length, gfp_t gfp)
+{
+}
+static inline void kmemleak_erase(void **ptr)
+{
+}
+static inline void kmemleak_no_scan(const void *ptr)
+{
+}
+
+#endif	/* CONFIG_DEBUG_KMEMLEAK */
+
+#endif	/* __KMEMLEAK_H */
diff --git a/init/main.c b/init/main.c
index bb7dc57eee36e..9d759d68d7a0a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -56,6 +56,7 @@
 #include <linux/debug_locks.h>
 #include <linux/debugobjects.h>
 #include <linux/lockdep.h>
+#include <linux/kmemleak.h>
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
@@ -603,6 +604,7 @@ asmlinkage void __init start_kernel(void)
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
+	prio_tree_init();
 	pidhash_init();
 	init_timers();
 	hrtimers_init();
@@ -654,6 +656,7 @@ asmlinkage void __init start_kernel(void)
 	cpu_hotplug_init();
 	kmem_cache_init();
 	kmemtrace_init();
+	kmemleak_init();
 	debug_objects_mem_init();
 	idr_init_cache();
 	setup_per_cpu_pageset();
@@ -663,7 +666,6 @@ asmlinkage void __init start_kernel(void)
 	calibrate_delay();
 	pidmap_init();
 	pgtable_cache_init();
-	prio_tree_init();
 	anon_vma_init();
 #ifdef CONFIG_X86
 	if (efi_enabled)
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
new file mode 100644
index 0000000000000..58ec86c9e58a0
--- /dev/null
+++ b/mm/kmemleak.c
@@ -0,0 +1,1498 @@
+/*
+ * mm/kmemleak.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ *
+ * For more information on the algorithm and kmemleak usage, please see
+ * Documentation/kmemleak.txt.
+ *
+ * Notes on locking
+ * ----------------
+ *
+ * The following locks and mutexes are used by kmemleak:
+ *
+ * - kmemleak_lock (rwlock): protects the object_list modifications and
+ *   accesses to the object_tree_root. The object_list is the main list
+ *   holding the metadata (struct kmemleak_object) for the allocated memory
+ *   blocks. The object_tree_root is a priority search tree used to look-up
+ *   metadata based on a pointer to the corresponding memory block.  The
+ *   kmemleak_object structures are added to the object_list and
+ *   object_tree_root in the create_object() function called from the
+ *   kmemleak_alloc() callback and removed in delete_object() called from the
+ *   kmemleak_free() callback
+ * - kmemleak_object.lock (spinlock): protects a kmemleak_object. Accesses to
+ *   the metadata (e.g. count) are protected by this lock. Note that some
+ *   members of this structure may be protected by other means (atomic or
+ *   kmemleak_lock). This lock is also held when scanning the corresponding
+ *   memory block to avoid the kernel freeing it via the kmemleak_free()
+ *   callback. This is less heavyweight than holding a global lock like
+ *   kmemleak_lock during scanning
+ * - scan_mutex (mutex): ensures that only one thread may scan the memory for
+ *   unreferenced objects at a time. The gray_list contains the objects which
+ *   are already referenced or marked as false positives and need to be
+ *   scanned. This list is only modified during a scanning episode when the
+ *   scan_mutex is held. At the end of a scan, the gray_list is always empty.
+ *   Note that the kmemleak_object.use_count is incremented when an object is
+ *   added to the gray_list and therefore cannot be freed
+ * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs
+ *   file together with modifications to the memory scanning parameters
+ *   including the scan_thread pointer
+ *
+ * The kmemleak_object structures have a use_count incremented or decremented
+ * using the get_object()/put_object() functions. When the use_count becomes
+ * 0, this count can no longer be incremented and put_object() schedules the
+ * kmemleak_object freeing via an RCU callback. All calls to the get_object()
+ * function must be protected by rcu_read_lock() to avoid accessing a freed
+ * structure.
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/jiffies.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/prio_tree.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+#include <linux/rcupdate.h>
+#include <linux/stacktrace.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mmzone.h>
+#include <linux/slab.h>
+#include <linux/thread_info.h>
+#include <linux/err.h>
+#include <linux/uaccess.h>
+#include <linux/string.h>
+#include <linux/nodemask.h>
+#include <linux/mm.h>
+
+#include <asm/sections.h>
+#include <asm/processor.h>
+#include <asm/atomic.h>
+
+#include <linux/kmemleak.h>
+
+/*
+ * Kmemleak configuration and common defines.
+ */
+#define MAX_TRACE		16	/* stack trace length */
+#define REPORTS_NR		50	/* maximum number of reported leaks */
+#define MSECS_MIN_AGE		5000	/* minimum object age for reporting */
+#define MSECS_SCAN_YIELD	10	/* CPU yielding period */
+#define SECS_FIRST_SCAN		60	/* delay before the first scan */
+#define SECS_SCAN_WAIT		600	/* subsequent auto scanning delay */
+
+#define BYTES_PER_POINTER	sizeof(void *)
+
+/* scanning area inside a memory block */
+struct kmemleak_scan_area {
+	struct hlist_node node;
+	unsigned long offset;
+	size_t length;
+};
+
+/*
+ * Structure holding the metadata for each allocated memory block.
+ * Modifications to such objects should be made while holding the
+ * object->lock. Insertions or deletions from object_list, gray_list or
+ * tree_node are already protected by the corresponding locks or mutex (see
+ * the notes on locking above). These objects are reference-counted
+ * (use_count) and freed using the RCU mechanism.
+ */
+struct kmemleak_object {
+	spinlock_t lock;
+	unsigned long flags;		/* object status flags */
+	struct list_head object_list;
+	struct list_head gray_list;
+	struct prio_tree_node tree_node;
+	struct rcu_head rcu;		/* object_list lockless traversal */
+	/* object usage count; object freed when use_count == 0 */
+	atomic_t use_count;
+	unsigned long pointer;
+	size_t size;
+	/* minimum number of a pointers found before it is considered leak */
+	int min_count;
+	/* the total number of pointers found pointing to this object */
+	int count;
+	/* memory ranges to be scanned inside an object (empty for all) */
+	struct hlist_head area_list;
+	unsigned long trace[MAX_TRACE];
+	unsigned int trace_len;
+	unsigned long jiffies;		/* creation timestamp */
+	pid_t pid;			/* pid of the current task */
+	char comm[TASK_COMM_LEN];	/* executable name */
+};
+
+/* flag representing the memory block allocation status */
+#define OBJECT_ALLOCATED	(1 << 0)
+/* flag set after the first reporting of an unreference object */
+#define OBJECT_REPORTED		(1 << 1)
+/* flag set to not scan the object */
+#define OBJECT_NO_SCAN		(1 << 2)
+
+/* the list of all allocated objects */
+static LIST_HEAD(object_list);
+/* the list of gray-colored objects (see color_gray comment below) */
+static LIST_HEAD(gray_list);
+/* prio search tree for object boundaries */
+static struct prio_tree_root object_tree_root;
+/* rw_lock protecting the access to object_list and prio_tree_root */
+static DEFINE_RWLOCK(kmemleak_lock);
+
+/* allocation caches for kmemleak internal data */
+static struct kmem_cache *object_cache;
+static struct kmem_cache *scan_area_cache;
+
+/* set if tracing memory operations is enabled */
+static atomic_t kmemleak_enabled = ATOMIC_INIT(0);
+/* set in the late_initcall if there were no errors */
+static atomic_t kmemleak_initialized = ATOMIC_INIT(0);
+/* enables or disables early logging of the memory operations */
+static atomic_t kmemleak_early_log = ATOMIC_INIT(1);
+/* set if a fata kmemleak error has occurred */
+static atomic_t kmemleak_error = ATOMIC_INIT(0);
+
+/* minimum and maximum address that may be valid pointers */
+static unsigned long min_addr = ULONG_MAX;
+static unsigned long max_addr;
+
+/* used for yielding the CPU to other tasks during scanning */
+static unsigned long next_scan_yield;
+static struct task_struct *scan_thread;
+static unsigned long jiffies_scan_yield;
+static unsigned long jiffies_min_age;
+/* delay between automatic memory scannings */
+static signed long jiffies_scan_wait;
+/* enables or disables the task stacks scanning */
+static int kmemleak_stack_scan;
+/* mutex protecting the memory scanning */
+static DEFINE_MUTEX(scan_mutex);
+/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */
+static DEFINE_MUTEX(kmemleak_mutex);
+
+/* number of leaks reported (for limitation purposes) */
+static int reported_leaks;
+
+/*
+ * Early object allocation/freeing logging. Kkmemleak is initialized after the
+ * kernel allocator. However, both the kernel allocator and kmemleak may
+ * allocate memory blocks which need to be tracked. Kkmemleak defines an
+ * arbitrary buffer to hold the allocation/freeing information before it is
+ * fully initialized.
+ */
+
+/* kmemleak operation type for early logging */
+enum {
+	KMEMLEAK_ALLOC,
+	KMEMLEAK_FREE,
+	KMEMLEAK_NOT_LEAK,
+	KMEMLEAK_IGNORE,
+	KMEMLEAK_SCAN_AREA,
+	KMEMLEAK_NO_SCAN
+};
+
+/*
+ * Structure holding the information passed to kmemleak callbacks during the
+ * early logging.
+ */
+struct early_log {
+	int op_type;			/* kmemleak operation type */
+	const void *ptr;		/* allocated/freed memory block */
+	size_t size;			/* memory block size */
+	int min_count;			/* minimum reference count */
+	unsigned long offset;		/* scan area offset */
+	size_t length;			/* scan area length */
+};
+
+/* early logging buffer and current position */
+static struct early_log early_log[200];
+static int crt_early_log;
+
+static void kmemleak_disable(void);
+
+/*
+ * Print a warning and dump the stack trace.
+ */
+#define kmemleak_warn(x...)	do {	\
+	pr_warning(x);			\
+	dump_stack();			\
+} while (0)
+
+/*
+ * Macro invoked when a serious kmemleak condition occured and cannot be
+ * recovered from. Kkmemleak will be disabled and further allocation/freeing
+ * tracing no longer available.
+ */
+#define kmemleak_panic(x...)	do {	\
+	kmemleak_warn(x);		\
+	kmemleak_disable();		\
+} while (0)
+
+/*
+ * Object colors, encoded with count and min_count:
+ * - white - orphan object, not enough references to it (count < min_count)
+ * - gray  - not orphan, not marked as false positive (min_count == 0) or
+ *		sufficient references to it (count >= min_count)
+ * - black - ignore, it doesn't contain references (e.g. text section)
+ *		(min_count == -1). No function defined for this color.
+ * Newly created objects don't have any color assigned (object->count == -1)
+ * before the next memory scan when they become white.
+ */
+static int color_white(const struct kmemleak_object *object)
+{
+	return object->count != -1 && object->count < object->min_count;
+}
+
+static int color_gray(const struct kmemleak_object *object)
+{
+	return object->min_count != -1 && object->count >= object->min_count;
+}
+
+/*
+ * Objects are considered referenced if their color is gray and they have not
+ * been deleted.
+ */
+static int referenced_object(struct kmemleak_object *object)
+{
+	return (object->flags & OBJECT_ALLOCATED) && color_gray(object);
+}
+
+/*
+ * Objects are considered unreferenced only if their color is white, they have
+ * not be deleted and have a minimum age to avoid false positives caused by
+ * pointers temporarily stored in CPU registers.
+ */
+static int unreferenced_object(struct kmemleak_object *object)
+{
+	return (object->flags & OBJECT_ALLOCATED) && color_white(object) &&
+		time_is_before_eq_jiffies(object->jiffies + jiffies_min_age);
+}
+
+/*
+ * Printing of the (un)referenced objects information, either to the seq file
+ * or to the kernel log. The print_referenced/print_unreferenced functions
+ * must be called with the object->lock held.
+ */
+#define print_helper(seq, x...)	do {	\
+	struct seq_file *s = (seq);	\
+	if (s)				\
+		seq_printf(s, x);	\
+	else				\
+		pr_info(x);		\
+} while (0)
+
+static void print_referenced(struct kmemleak_object *object)
+{
+	pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n",
+		object->pointer, object->size);
+}
+
+static void print_unreferenced(struct seq_file *seq,
+			       struct kmemleak_object *object)
+{
+	int i;
+
+	print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n",
+		     object->pointer, object->size);
+	print_helper(seq, "  comm \"%s\", pid %d, jiffies %lu\n",
+		     object->comm, object->pid, object->jiffies);
+	print_helper(seq, "  backtrace:\n");
+
+	for (i = 0; i < object->trace_len; i++) {
+		void *ptr = (void *)object->trace[i];
+		print_helper(seq, "    [<%p>] %pS\n", ptr, ptr);
+	}
+}
+
+/*
+ * Print the kmemleak_object information. This function is used mainly for
+ * debugging special cases when kmemleak operations. It must be called with
+ * the object->lock held.
+ */
+static void dump_object_info(struct kmemleak_object *object)
+{
+	struct stack_trace trace;
+
+	trace.nr_entries = object->trace_len;
+	trace.entries = object->trace;
+
+	pr_notice("kmemleak: Object 0x%08lx (size %zu):\n",
+		  object->tree_node.start, object->size);
+	pr_notice("  comm \"%s\", pid %d, jiffies %lu\n",
+		  object->comm, object->pid, object->jiffies);
+	pr_notice("  min_count = %d\n", object->min_count);
+	pr_notice("  count = %d\n", object->count);
+	pr_notice("  backtrace:\n");
+	print_stack_trace(&trace, 4);
+}
+
+/*
+ * Look-up a memory block metadata (kmemleak_object) in the priority search
+ * tree based on a pointer value. If alias is 0, only values pointing to the
+ * beginning of the memory block are allowed. The kmemleak_lock must be held
+ * when calling this function.
+ */
+static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
+{
+	struct prio_tree_node *node;
+	struct prio_tree_iter iter;
+	struct kmemleak_object *object;
+
+	prio_tree_iter_init(&iter, &object_tree_root, ptr, ptr);
+	node = prio_tree_next(&iter);
+	if (node) {
+		object = prio_tree_entry(node, struct kmemleak_object,
+					 tree_node);
+		if (!alias && object->pointer != ptr) {
+			kmemleak_warn("kmemleak: Found object by alias");
+			object = NULL;
+		}
+	} else
+		object = NULL;
+
+	return object;
+}
+
+/*
+ * Increment the object use_count. Return 1 if successful or 0 otherwise. Note
+ * that once an object's use_count reached 0, the RCU freeing was already
+ * registered and the object should no longer be used. This function must be
+ * called under the protection of rcu_read_lock().
+ */
+static int get_object(struct kmemleak_object *object)
+{
+	return atomic_inc_not_zero(&object->use_count);
+}
+
+/*
+ * RCU callback to free a kmemleak_object.
+ */
+static void free_object_rcu(struct rcu_head *rcu)
+{
+	struct hlist_node *elem, *tmp;
+	struct kmemleak_scan_area *area;
+	struct kmemleak_object *object =
+		container_of(rcu, struct kmemleak_object, rcu);
+
+	/*
+	 * Once use_count is 0 (guaranteed by put_object), there is no other
+	 * code accessing this object, hence no need for locking.
+	 */
+	hlist_for_each_entry_safe(area, elem, tmp, &object->area_list, node) {
+		hlist_del(elem);
+		kmem_cache_free(scan_area_cache, area);
+	}
+	kmem_cache_free(object_cache, object);
+}
+
+/*
+ * Decrement the object use_count. Once the count is 0, free the object using
+ * an RCU callback. Since put_object() may be called via the kmemleak_free() ->
+ * delete_object() path, the delayed RCU freeing ensures that there is no
+ * recursive call to the kernel allocator. Lock-less RCU object_list traversal
+ * is also possible.
+ */
+static void put_object(struct kmemleak_object *object)
+{
+	if (!atomic_dec_and_test(&object->use_count))
+		return;
+
+	/* should only get here after delete_object was called */
+	WARN_ON(object->flags & OBJECT_ALLOCATED);
+
+	call_rcu(&object->rcu, free_object_rcu);
+}
+
+/*
+ * Look up an object in the prio search tree and increase its use_count.
+ */
+static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias)
+{
+	unsigned long flags;
+	struct kmemleak_object *object = NULL;
+
+	rcu_read_lock();
+	read_lock_irqsave(&kmemleak_lock, flags);
+	if (ptr >= min_addr && ptr < max_addr)
+		object = lookup_object(ptr, alias);
+	read_unlock_irqrestore(&kmemleak_lock, flags);
+
+	/* check whether the object is still available */
+	if (object && !get_object(object))
+		object = NULL;
+	rcu_read_unlock();
+
+	return object;
+}
+
+/*
+ * Create the metadata (struct kmemleak_object) corresponding to an allocated
+ * memory block and add it to the object_list and object_tree_root.
+ */
+static void create_object(unsigned long ptr, size_t size, int min_count,
+			  gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	struct prio_tree_node *node;
+	struct stack_trace trace;
+
+	object = kmem_cache_alloc(object_cache, gfp & ~GFP_SLAB_BUG_MASK);
+	if (!object) {
+		kmemleak_panic("kmemleak: Cannot allocate a kmemleak_object "
+			       "structure\n");
+		return;
+	}
+
+	INIT_LIST_HEAD(&object->object_list);
+	INIT_LIST_HEAD(&object->gray_list);
+	INIT_HLIST_HEAD(&object->area_list);
+	spin_lock_init(&object->lock);
+	atomic_set(&object->use_count, 1);
+	object->flags = OBJECT_ALLOCATED;
+	object->pointer = ptr;
+	object->size = size;
+	object->min_count = min_count;
+	object->count = -1;			/* no color initially */
+	object->jiffies = jiffies;
+
+	/* task information */
+	if (in_irq()) {
+		object->pid = 0;
+		strncpy(object->comm, "hardirq", sizeof(object->comm));
+	} else if (in_softirq()) {
+		object->pid = 0;
+		strncpy(object->comm, "softirq", sizeof(object->comm));
+	} else {
+		object->pid = current->pid;
+		/*
+		 * There is a small chance of a race with set_task_comm(),
+		 * however using get_task_comm() here may cause locking
+		 * dependency issues with current->alloc_lock. In the worst
+		 * case, the command line is not correct.
+		 */
+		strncpy(object->comm, current->comm, sizeof(object->comm));
+	}
+
+	/* kernel backtrace */
+	trace.max_entries = MAX_TRACE;
+	trace.nr_entries = 0;
+	trace.entries = object->trace;
+	trace.skip = 1;
+	save_stack_trace(&trace);
+	object->trace_len = trace.nr_entries;
+
+	INIT_PRIO_TREE_NODE(&object->tree_node);
+	object->tree_node.start = ptr;
+	object->tree_node.last = ptr + size - 1;
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+	min_addr = min(min_addr, ptr);
+	max_addr = max(max_addr, ptr + size);
+	node = prio_tree_insert(&object_tree_root, &object->tree_node);
+	/*
+	 * The code calling the kernel does not yet have the pointer to the
+	 * memory block to be able to free it.  However, we still hold the
+	 * kmemleak_lock here in case parts of the kernel started freeing
+	 * random memory blocks.
+	 */
+	if (node != &object->tree_node) {
+		unsigned long flags;
+
+		kmemleak_panic("kmemleak: Cannot insert 0x%lx into the object "
+			       "search tree (already existing)\n", ptr);
+		object = lookup_object(ptr, 1);
+		spin_lock_irqsave(&object->lock, flags);
+		dump_object_info(object);
+		spin_unlock_irqrestore(&object->lock, flags);
+
+		goto out;
+	}
+	list_add_tail_rcu(&object->object_list, &object_list);
+out:
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+}
+
+/*
+ * Remove the metadata (struct kmemleak_object) for a memory block from the
+ * object_list and object_tree_root and decrement its use_count.
+ */
+static void delete_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	write_lock_irqsave(&kmemleak_lock, flags);
+	object = lookup_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n",
+			      ptr);
+		write_unlock_irqrestore(&kmemleak_lock, flags);
+		return;
+	}
+	prio_tree_remove(&object_tree_root, &object->tree_node);
+	list_del_rcu(&object->object_list);
+	write_unlock_irqrestore(&kmemleak_lock, flags);
+
+	WARN_ON(!(object->flags & OBJECT_ALLOCATED));
+	WARN_ON(atomic_read(&object->use_count) < 1);
+
+	/*
+	 * Locking here also ensures that the corresponding memory block
+	 * cannot be freed when it is being scanned.
+	 */
+	spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_REPORTED)
+		print_referenced(object);
+	object->flags &= ~OBJECT_ALLOCATED;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Make a object permanently as gray-colored so that it can no longer be
+ * reported as a leak. This is used in general to mark a false positive.
+ */
+static void make_gray_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->min_count = 0;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Mark the object as black-colored so that it is ignored from scans and
+ * reporting.
+ */
+static void make_black_object(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n",
+			      ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->min_count = -1;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Add a scanning area to the object. If at least one such area is added,
+ * kmemleak will only scan these ranges rather than the whole memory block.
+ */
+static void add_scan_area(unsigned long ptr, unsigned long offset,
+			  size_t length, gfp_t gfp)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+	struct kmemleak_scan_area *area;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Adding scan area to unknown "
+			      "object at 0x%08lx\n", ptr);
+		return;
+	}
+
+	area = kmem_cache_alloc(scan_area_cache, gfp & ~GFP_SLAB_BUG_MASK);
+	if (!area) {
+		kmemleak_warn("kmemleak: Cannot allocate a scan area\n");
+		goto out;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	if (offset + length > object->size) {
+		kmemleak_warn("kmemleak: Scan area larger than object "
+			      "0x%08lx\n", ptr);
+		dump_object_info(object);
+		kmem_cache_free(scan_area_cache, area);
+		goto out_unlock;
+	}
+
+	INIT_HLIST_NODE(&area->node);
+	area->offset = offset;
+	area->length = length;
+
+	hlist_add_head(&area->node, &object->area_list);
+out_unlock:
+	spin_unlock_irqrestore(&object->lock, flags);
+out:
+	put_object(object);
+}
+
+/*
+ * Set the OBJECT_NO_SCAN flag for the object corresponding to the give
+ * pointer. Such object will not be scanned by kmemleak but references to it
+ * are searched.
+ */
+static void object_no_scan(unsigned long ptr)
+{
+	unsigned long flags;
+	struct kmemleak_object *object;
+
+	object = find_and_get_object(ptr, 0);
+	if (!object) {
+		kmemleak_warn("kmemleak: Not scanning unknown object at "
+			      "0x%08lx\n", ptr);
+		return;
+	}
+
+	spin_lock_irqsave(&object->lock, flags);
+	object->flags |= OBJECT_NO_SCAN;
+	spin_unlock_irqrestore(&object->lock, flags);
+	put_object(object);
+}
+
+/*
+ * Log an early kmemleak_* call to the early_log buffer. These calls will be
+ * processed later once kmemleak is fully initialized.
+ */
+static void log_early(int op_type, const void *ptr, size_t size,
+		      int min_count, unsigned long offset, size_t length)
+{
+	unsigned long flags;
+	struct early_log *log;
+
+	if (crt_early_log >= ARRAY_SIZE(early_log)) {
+		kmemleak_panic("kmemleak: Early log buffer exceeded\n");
+		return;
+	}
+
+	/*
+	 * There is no need for locking since the kernel is still in UP mode
+	 * at this stage. Disabling the IRQs is enough.
+	 */
+	local_irq_save(flags);
+	log = &early_log[crt_early_log];
+	log->op_type = op_type;
+	log->ptr = ptr;
+	log->size = size;
+	log->min_count = min_count;
+	log->offset = offset;
+	log->length = length;
+	crt_early_log++;
+	local_irq_restore(flags);
+}
+
+/*
+ * Memory allocation function callback. This function is called from the
+ * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc,
+ * vmalloc etc.).
+ */
+void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp)
+{
+	pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		create_object((unsigned long)ptr, size, min_count, gfp);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_alloc);
+
+/*
+ * Memory freeing function callback. This function is called from the kernel
+ * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.).
+ */
+void kmemleak_free(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		delete_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL_GPL(kmemleak_free);
+
+/*
+ * Mark an already allocated memory block as a false positive. This will cause
+ * the block to no longer be reported as leak and always be scanned.
+ */
+void kmemleak_not_leak(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		make_gray_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_not_leak);
+
+/*
+ * Ignore a memory block. This is usually done when it is known that the
+ * corresponding block is not a leak and does not contain any references to
+ * other allocated memory blocks.
+ */
+void kmemleak_ignore(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		make_black_object((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_ignore);
+
+/*
+ * Limit the range to be scanned in an allocated memory block.
+ */
+void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length,
+			gfp_t gfp)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		add_scan_area((unsigned long)ptr, offset, length, gfp);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length);
+}
+EXPORT_SYMBOL(kmemleak_scan_area);
+
+/*
+ * Inform kmemleak not to scan the given memory block.
+ */
+void kmemleak_no_scan(const void *ptr)
+{
+	pr_debug("%s(0x%p)\n", __func__, ptr);
+
+	if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr))
+		object_no_scan((unsigned long)ptr);
+	else if (atomic_read(&kmemleak_early_log))
+		log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0);
+}
+EXPORT_SYMBOL(kmemleak_no_scan);
+
+/*
+ * Yield the CPU so that other tasks get a chance to run.  The yielding is
+ * rate-limited to avoid excessive number of calls to the schedule() function
+ * during memory scanning.
+ */
+static void scan_yield(void)
+{
+	might_sleep();
+
+	if (time_is_before_eq_jiffies(next_scan_yield)) {
+		schedule();
+		next_scan_yield = jiffies + jiffies_scan_yield;
+	}
+}
+
+/*
+ * Memory scanning is a long process and it needs to be interruptable. This
+ * function checks whether such interrupt condition occured.
+ */
+static int scan_should_stop(void)
+{
+	if (!atomic_read(&kmemleak_enabled))
+		return 1;
+
+	/*
+	 * This function may be called from either process or kthread context,
+	 * hence the need to check for both stop conditions.
+	 */
+	if (current->mm)
+		return signal_pending(current);
+	else
+		return kthread_should_stop();
+
+	return 0;
+}
+
+/*
+ * Scan a memory block (exclusive range) for valid pointers and add those
+ * found to the gray list.
+ */
+static void scan_block(void *_start, void *_end,
+		       struct kmemleak_object *scanned)
+{
+	unsigned long *ptr;
+	unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
+	unsigned long *end = _end - (BYTES_PER_POINTER - 1);
+
+	for (ptr = start; ptr < end; ptr++) {
+		unsigned long flags;
+		unsigned long pointer = *ptr;
+		struct kmemleak_object *object;
+
+		if (scan_should_stop())
+			break;
+
+		/*
+		 * When scanning a memory block with a corresponding
+		 * kmemleak_object, the CPU yielding is handled in the calling
+		 * code since it holds the object->lock to avoid the block
+		 * freeing.
+		 */
+		if (!scanned)
+			scan_yield();
+
+		object = find_and_get_object(pointer, 1);
+		if (!object)
+			continue;
+		if (object == scanned) {
+			/* self referenced, ignore */
+			put_object(object);
+			continue;
+		}
+
+		/*
+		 * Avoid the lockdep recursive warning on object->lock being
+		 * previously acquired in scan_object(). These locks are
+		 * enclosed by scan_mutex.
+		 */
+		spin_lock_irqsave_nested(&object->lock, flags,
+					 SINGLE_DEPTH_NESTING);
+		if (!color_white(object)) {
+			/* non-orphan, ignored or new */
+			spin_unlock_irqrestore(&object->lock, flags);
+			put_object(object);
+			continue;
+		}
+
+		/*
+		 * Increase the object's reference count (number of pointers
+		 * to the memory block). If this count reaches the required
+		 * minimum, the object's color will become gray and it will be
+		 * added to the gray_list.
+		 */
+		object->count++;
+		if (color_gray(object))
+			list_add_tail(&object->gray_list, &gray_list);
+		else
+			put_object(object);
+		spin_unlock_irqrestore(&object->lock, flags);
+	}
+}
+
+/*
+ * Scan a memory block corresponding to a kmemleak_object. A condition is
+ * that object->use_count >= 1.
+ */
+static void scan_object(struct kmemleak_object *object)
+{
+	struct kmemleak_scan_area *area;
+	struct hlist_node *elem;
+	unsigned long flags;
+
+	/*
+	 * Once the object->lock is aquired, the corresponding memory block
+	 * cannot be freed (the same lock is aquired in delete_object).
+	 */
+	spin_lock_irqsave(&object->lock, flags);
+	if (object->flags & OBJECT_NO_SCAN)
+		goto out;
+	if (!(object->flags & OBJECT_ALLOCATED))
+		/* already freed object */
+		goto out;
+	if (hlist_empty(&object->area_list))
+		scan_block((void *)object->pointer,
+			   (void *)(object->pointer + object->size), object);
+	else
+		hlist_for_each_entry(area, elem, &object->area_list, node)
+			scan_block((void *)(object->pointer + area->offset),
+				   (void *)(object->pointer + area->offset
+					    + area->length), object);
+out:
+	spin_unlock_irqrestore(&object->lock, flags);
+}
+
+/*
+ * Scan data sections and all the referenced memory blocks allocated via the
+ * kernel's standard allocators. This function must be called with the
+ * scan_mutex held.
+ */
+static void kmemleak_scan(void)
+{
+	unsigned long flags;
+	struct kmemleak_object *object, *tmp;
+	struct task_struct *task;
+	int i;
+
+	/* prepare the kmemleak_object's */
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		spin_lock_irqsave(&object->lock, flags);
+#ifdef DEBUG
+		/*
+		 * With a few exceptions there should be a maximum of
+		 * 1 reference to any object at this point.
+		 */
+		if (atomic_read(&object->use_count) > 1) {
+			pr_debug("kmemleak: object->use_count = %d\n",
+				 atomic_read(&object->use_count));
+			dump_object_info(object);
+		}
+#endif
+		/* reset the reference count (whiten the object) */
+		object->count = 0;
+		if (color_gray(object) && get_object(object))
+			list_add_tail(&object->gray_list, &gray_list);
+
+		spin_unlock_irqrestore(&object->lock, flags);
+	}
+	rcu_read_unlock();
+
+	/* data/bss scanning */
+	scan_block(_sdata, _edata, NULL);
+	scan_block(__bss_start, __bss_stop, NULL);
+
+#ifdef CONFIG_SMP
+	/* per-cpu sections scanning */
+	for_each_possible_cpu(i)
+		scan_block(__per_cpu_start + per_cpu_offset(i),
+			   __per_cpu_end + per_cpu_offset(i), NULL);
+#endif
+
+	/*
+	 * Struct page scanning for each node. The code below is not yet safe
+	 * with MEMORY_HOTPLUG.
+	 */
+	for_each_online_node(i) {
+		pg_data_t *pgdat = NODE_DATA(i);
+		unsigned long start_pfn = pgdat->node_start_pfn;
+		unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
+		unsigned long pfn;
+
+		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+			struct page *page;
+
+			if (!pfn_valid(pfn))
+				continue;
+			page = pfn_to_page(pfn);
+			/* only scan if page is in use */
+			if (page_count(page) == 0)
+				continue;
+			scan_block(page, page + 1, NULL);
+		}
+	}
+
+	/*
+	 * Scanning the task stacks may introduce false negatives and it is
+	 * not enabled by default.
+	 */
+	if (kmemleak_stack_scan) {
+		read_lock(&tasklist_lock);
+		for_each_process(task)
+			scan_block(task_stack_page(task),
+				   task_stack_page(task) + THREAD_SIZE, NULL);
+		read_unlock(&tasklist_lock);
+	}
+
+	/*
+	 * Scan the objects already referenced from the sections scanned
+	 * above. More objects will be referenced and, if there are no memory
+	 * leaks, all the objects will be scanned. The list traversal is safe
+	 * for both tail additions and removals from inside the loop. The
+	 * kmemleak objects cannot be freed from outside the loop because their
+	 * use_count was increased.
+	 */
+	object = list_entry(gray_list.next, typeof(*object), gray_list);
+	while (&object->gray_list != &gray_list) {
+		scan_yield();
+
+		/* may add new objects to the list */
+		if (!scan_should_stop())
+			scan_object(object);
+
+		tmp = list_entry(object->gray_list.next, typeof(*object),
+				 gray_list);
+
+		/* remove the object from the list and release it */
+		list_del(&object->gray_list);
+		put_object(object);
+
+		object = tmp;
+	}
+	WARN_ON(!list_empty(&gray_list));
+}
+
+/*
+ * Thread function performing automatic memory scanning. Unreferenced objects
+ * at the end of a memory scan are reported but only the first time.
+ */
+static int kmemleak_scan_thread(void *arg)
+{
+	static int first_run = 1;
+
+	pr_info("kmemleak: Automatic memory scanning thread started\n");
+
+	/*
+	 * Wait before the first scan to allow the system to fully initialize.
+	 */
+	if (first_run) {
+		first_run = 0;
+		ssleep(SECS_FIRST_SCAN);
+	}
+
+	while (!kthread_should_stop()) {
+		struct kmemleak_object *object;
+		signed long timeout = jiffies_scan_wait;
+
+		mutex_lock(&scan_mutex);
+
+		kmemleak_scan();
+		reported_leaks = 0;
+
+		rcu_read_lock();
+		list_for_each_entry_rcu(object, &object_list, object_list) {
+			unsigned long flags;
+
+			if (reported_leaks >= REPORTS_NR)
+				break;
+			spin_lock_irqsave(&object->lock, flags);
+			if (!(object->flags & OBJECT_REPORTED) &&
+			    unreferenced_object(object)) {
+				print_unreferenced(NULL, object);
+				object->flags |= OBJECT_REPORTED;
+				reported_leaks++;
+			} else if ((object->flags & OBJECT_REPORTED) &&
+				   referenced_object(object)) {
+				print_referenced(object);
+				object->flags &= ~OBJECT_REPORTED;
+			}
+			spin_unlock_irqrestore(&object->lock, flags);
+		}
+		rcu_read_unlock();
+
+		mutex_unlock(&scan_mutex);
+		/* wait before the next scan */
+		while (timeout && !kthread_should_stop())
+			timeout = schedule_timeout_interruptible(timeout);
+	}
+
+	pr_info("kmemleak: Automatic memory scanning thread ended\n");
+
+	return 0;
+}
+
+/*
+ * Start the automatic memory scanning thread. This function must be called
+ * with the kmemleak_mutex held.
+ */
+void start_scan_thread(void)
+{
+	if (scan_thread)
+		return;
+	scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak");
+	if (IS_ERR(scan_thread)) {
+		pr_warning("kmemleak: Failed to create the scan thread\n");
+		scan_thread = NULL;
+	}
+}
+
+/*
+ * Stop the automatic memory scanning thread. This function must be called
+ * with the kmemleak_mutex held.
+ */
+void stop_scan_thread(void)
+{
+	if (scan_thread) {
+		kthread_stop(scan_thread);
+		scan_thread = NULL;
+	}
+}
+
+/*
+ * Iterate over the object_list and return the first valid object at or after
+ * the required position with its use_count incremented. The function triggers
+ * a memory scanning when the pos argument points to the first position.
+ */
+static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct kmemleak_object *object;
+	loff_t n = *pos;
+
+	if (!n) {
+		kmemleak_scan();
+		reported_leaks = 0;
+	}
+	if (reported_leaks >= REPORTS_NR)
+		return NULL;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list) {
+		if (n-- > 0)
+			continue;
+		if (get_object(object))
+			goto out;
+	}
+	object = NULL;
+out:
+	rcu_read_unlock();
+	return object;
+}
+
+/*
+ * Return the next object in the object_list. The function decrements the
+ * use_count of the previous object and increases that of the next one.
+ */
+static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	struct kmemleak_object *prev_obj = v;
+	struct kmemleak_object *next_obj = NULL;
+	struct list_head *n = &prev_obj->object_list;
+
+	++(*pos);
+	if (reported_leaks >= REPORTS_NR)
+		goto out;
+
+	rcu_read_lock();
+	list_for_each_continue_rcu(n, &object_list) {
+		next_obj = list_entry(n, struct kmemleak_object, object_list);
+		if (get_object(next_obj))
+			break;
+	}
+	rcu_read_unlock();
+out:
+	put_object(prev_obj);
+	return next_obj;
+}
+
+/*
+ * Decrement the use_count of the last object required, if any.
+ */
+static void kmemleak_seq_stop(struct seq_file *seq, void *v)
+{
+	if (v)
+		put_object(v);
+}
+
+/*
+ * Print the information for an unreferenced object to the seq file.
+ */
+static int kmemleak_seq_show(struct seq_file *seq, void *v)
+{
+	struct kmemleak_object *object = v;
+	unsigned long flags;
+
+	spin_lock_irqsave(&object->lock, flags);
+	if (!unreferenced_object(object))
+		goto out;
+	print_unreferenced(seq, object);
+	reported_leaks++;
+out:
+	spin_unlock_irqrestore(&object->lock, flags);
+	return 0;
+}
+
+static const struct seq_operations kmemleak_seq_ops = {
+	.start = kmemleak_seq_start,
+	.next  = kmemleak_seq_next,
+	.stop  = kmemleak_seq_stop,
+	.show  = kmemleak_seq_show,
+};
+
+static int kmemleak_open(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if (!atomic_read(&kmemleak_enabled))
+		return -EBUSY;
+
+	ret = mutex_lock_interruptible(&kmemleak_mutex);
+	if (ret < 0)
+		goto out;
+	if (file->f_mode & FMODE_READ) {
+		ret = mutex_lock_interruptible(&scan_mutex);
+		if (ret < 0)
+			goto kmemleak_unlock;
+		ret = seq_open(file, &kmemleak_seq_ops);
+		if (ret < 0)
+			goto scan_unlock;
+	}
+	return ret;
+
+scan_unlock:
+	mutex_unlock(&scan_mutex);
+kmemleak_unlock:
+	mutex_unlock(&kmemleak_mutex);
+out:
+	return ret;
+}
+
+static int kmemleak_release(struct inode *inode, struct file *file)
+{
+	int ret = 0;
+
+	if (file->f_mode & FMODE_READ) {
+		seq_release(inode, file);
+		mutex_unlock(&scan_mutex);
+	}
+	mutex_unlock(&kmemleak_mutex);
+
+	return ret;
+}
+
+/*
+ * File write operation to configure kmemleak at run-time. The following
+ * commands can be written to the /sys/kernel/debug/kmemleak file:
+ *   off	- disable kmemleak (irreversible)
+ *   stack=on	- enable the task stacks scanning
+ *   stack=off	- disable the tasks stacks scanning
+ *   scan=on	- start the automatic memory scanning thread
+ *   scan=off	- stop the automatic memory scanning thread
+ *   scan=...	- set the automatic memory scanning period in seconds (0 to
+ *		  disable it)
+ */
+static ssize_t kmemleak_write(struct file *file, const char __user *user_buf,
+			      size_t size, loff_t *ppos)
+{
+	char buf[64];
+	int buf_size;
+
+	if (!atomic_read(&kmemleak_enabled))
+		return -EBUSY;
+
+	buf_size = min(size, (sizeof(buf) - 1));
+	if (strncpy_from_user(buf, user_buf, buf_size) < 0)
+		return -EFAULT;
+	buf[buf_size] = 0;
+
+	if (strncmp(buf, "off", 3) == 0)
+		kmemleak_disable();
+	else if (strncmp(buf, "stack=on", 8) == 0)
+		kmemleak_stack_scan = 1;
+	else if (strncmp(buf, "stack=off", 9) == 0)
+		kmemleak_stack_scan = 0;
+	else if (strncmp(buf, "scan=on", 7) == 0)
+		start_scan_thread();
+	else if (strncmp(buf, "scan=off", 8) == 0)
+		stop_scan_thread();
+	else if (strncmp(buf, "scan=", 5) == 0) {
+		unsigned long secs;
+		int err;
+
+		err = strict_strtoul(buf + 5, 0, &secs);
+		if (err < 0)
+			return err;
+		stop_scan_thread();
+		if (secs) {
+			jiffies_scan_wait = msecs_to_jiffies(secs * 1000);
+			start_scan_thread();
+		}
+	} else
+		return -EINVAL;
+
+	/* ignore the rest of the buffer, only one command at a time */
+	*ppos += size;
+	return size;
+}
+
+static const struct file_operations kmemleak_fops = {
+	.owner		= THIS_MODULE,
+	.open		= kmemleak_open,
+	.read		= seq_read,
+	.write		= kmemleak_write,
+	.llseek		= seq_lseek,
+	.release	= kmemleak_release,
+};
+
+/*
+ * Perform the freeing of the kmemleak internal objects after waiting for any
+ * current memory scan to complete.
+ */
+static int kmemleak_cleanup_thread(void *arg)
+{
+	struct kmemleak_object *object;
+
+	mutex_lock(&kmemleak_mutex);
+	stop_scan_thread();
+	mutex_unlock(&kmemleak_mutex);
+
+	mutex_lock(&scan_mutex);
+	rcu_read_lock();
+	list_for_each_entry_rcu(object, &object_list, object_list)
+		delete_object(object->pointer);
+	rcu_read_unlock();
+	mutex_unlock(&scan_mutex);
+
+	return 0;
+}
+
+/*
+ * Start the clean-up thread.
+ */
+static void kmemleak_cleanup(void)
+{
+	struct task_struct *cleanup_thread;
+
+	cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL,
+				     "kmemleak-clean");
+	if (IS_ERR(cleanup_thread))
+		pr_warning("kmemleak: Failed to create the clean-up thread\n");
+}
+
+/*
+ * Disable kmemleak. No memory allocation/freeing will be traced once this
+ * function is called. Disabling kmemleak is an irreversible operation.
+ */
+static void kmemleak_disable(void)
+{
+	/* atomically check whether it was already invoked */
+	if (atomic_cmpxchg(&kmemleak_error, 0, 1))
+		return;
+
+	/* stop any memory operation tracing */
+	atomic_set(&kmemleak_early_log, 0);
+	atomic_set(&kmemleak_enabled, 0);
+
+	/* check whether it is too early for a kernel thread */
+	if (atomic_read(&kmemleak_initialized))
+		kmemleak_cleanup();
+
+	pr_info("Kernel memory leak detector disabled\n");
+}
+
+/*
+ * Allow boot-time kmemleak disabling (enabled by default).
+ */
+static int kmemleak_boot_config(char *str)
+{
+	if (!str)
+		return -EINVAL;
+	if (strcmp(str, "off") == 0)
+		kmemleak_disable();
+	else if (strcmp(str, "on") != 0)
+		return -EINVAL;
+	return 0;
+}
+early_param("kmemleak", kmemleak_boot_config);
+
+/*
+ * Kkmemleak initialization.
+ */
+void __init kmemleak_init(void)
+{
+	int i;
+	unsigned long flags;
+
+	jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD);
+	jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
+	jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
+
+	object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
+	scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
+	INIT_PRIO_TREE_ROOT(&object_tree_root);
+
+	/* the kernel is still in UP mode, so disabling the IRQs is enough */
+	local_irq_save(flags);
+	if (!atomic_read(&kmemleak_error)) {
+		atomic_set(&kmemleak_enabled, 1);
+		atomic_set(&kmemleak_early_log, 0);
+	}
+	local_irq_restore(flags);
+
+	/*
+	 * This is the point where tracking allocations is safe. Automatic
+	 * scanning is started during the late initcall. Add the early logged
+	 * callbacks to the kmemleak infrastructure.
+	 */
+	for (i = 0; i < crt_early_log; i++) {
+		struct early_log *log = &early_log[i];
+
+		switch (log->op_type) {
+		case KMEMLEAK_ALLOC:
+			kmemleak_alloc(log->ptr, log->size, log->min_count,
+				       GFP_KERNEL);
+			break;
+		case KMEMLEAK_FREE:
+			kmemleak_free(log->ptr);
+			break;
+		case KMEMLEAK_NOT_LEAK:
+			kmemleak_not_leak(log->ptr);
+			break;
+		case KMEMLEAK_IGNORE:
+			kmemleak_ignore(log->ptr);
+			break;
+		case KMEMLEAK_SCAN_AREA:
+			kmemleak_scan_area(log->ptr, log->offset, log->length,
+					   GFP_KERNEL);
+			break;
+		case KMEMLEAK_NO_SCAN:
+			kmemleak_no_scan(log->ptr);
+			break;
+		default:
+			WARN_ON(1);
+		}
+	}
+}
+
+/*
+ * Late initialization function.
+ */
+static int __init kmemleak_late_init(void)
+{
+	struct dentry *dentry;
+
+	atomic_set(&kmemleak_initialized, 1);
+
+	if (atomic_read(&kmemleak_error)) {
+		/*
+		 * Some error occured and kmemleak was disabled. There is a
+		 * small chance that kmemleak_disable() was called immediately
+		 * after setting kmemleak_initialized and we may end up with
+		 * two clean-up threads but serialized by scan_mutex.
+		 */
+		kmemleak_cleanup();
+		return -ENOMEM;
+	}
+
+	dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL,
+				     &kmemleak_fops);
+	if (!dentry)
+		pr_warning("kmemleak: Failed to create the debugfs kmemleak "
+			   "file\n");
+	mutex_lock(&kmemleak_mutex);
+	start_scan_thread();
+	mutex_unlock(&kmemleak_mutex);
+
+	pr_info("Kernel memory leak detector initialized\n");
+
+	return 0;
+}
+late_initcall(kmemleak_late_init);
-- 
GitLab


From 04f70336c80c43a15e617b36c2043dfa0ad6ed0f Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:22:39 +0100
Subject: [PATCH 2066/2198] kmemleak: Add documentation on the memory leak
 detector

This patch adds the Documentation/kmemleak.txt file with some
information about how kmemleak works.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/kernel-parameters.txt |   4 +
 Documentation/kmemleak.txt          | 142 ++++++++++++++++++++++++++++
 2 files changed, 146 insertions(+)
 create mode 100644 Documentation/kmemleak.txt

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 4a3c2209a124b..04a44cc5048ab 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1077,6 +1077,10 @@ and is between 256 and 4096 characters. It is defined in the file
 			Configure the RouterBoard 532 series on-chip
 			Ethernet adapter MAC address.
 
+	kmemleak=	[KNL] Boot-time kmemleak enable/disable
+			Valid arguments: on, off
+			Default: on
+
 	kstack=N	[X86] Print N words from the kernel stack
 			in oops dumps.
 
diff --git a/Documentation/kmemleak.txt b/Documentation/kmemleak.txt
new file mode 100644
index 0000000000000..0112da3b9ab87
--- /dev/null
+++ b/Documentation/kmemleak.txt
@@ -0,0 +1,142 @@
+Kernel Memory Leak Detector
+===========================
+
+Introduction
+------------
+
+Kmemleak provides a way of detecting possible kernel memory leaks in a
+way similar to a tracing garbage collector
+(http://en.wikipedia.org/wiki/Garbage_collection_%28computer_science%29#Tracing_garbage_collectors),
+with the difference that the orphan objects are not freed but only
+reported via /sys/kernel/debug/kmemleak. A similar method is used by the
+Valgrind tool (memcheck --leak-check) to detect the memory leaks in
+user-space applications.
+
+Usage
+-----
+
+CONFIG_DEBUG_KMEMLEAK in "Kernel hacking" has to be enabled. A kernel
+thread scans the memory every 10 minutes (by default) and prints any new
+unreferenced objects found. To trigger an intermediate scan and display
+all the possible memory leaks:
+
+  # mount -t debugfs nodev /sys/kernel/debug/
+  # cat /sys/kernel/debug/kmemleak
+
+Note that the orphan objects are listed in the order they were allocated
+and one object at the beginning of the list may cause other subsequent
+objects to be reported as orphan.
+
+Memory scanning parameters can be modified at run-time by writing to the
+/sys/kernel/debug/kmemleak file. The following parameters are supported:
+
+  off		- disable kmemleak (irreversible)
+  stack=on	- enable the task stacks scanning
+  stack=off	- disable the tasks stacks scanning
+  scan=on	- start the automatic memory scanning thread
+  scan=off	- stop the automatic memory scanning thread
+  scan=<secs>	- set the automatic memory scanning period in seconds (0
+		  to disable it)
+
+Kmemleak can also be disabled at boot-time by passing "kmemleak=off" on
+the kernel command line.
+
+Basic Algorithm
+---------------
+
+The memory allocations via kmalloc, vmalloc, kmem_cache_alloc and
+friends are traced and the pointers, together with additional
+information like size and stack trace, are stored in a prio search tree.
+The corresponding freeing function calls are tracked and the pointers
+removed from the kmemleak data structures.
+
+An allocated block of memory is considered orphan if no pointer to its
+start address or to any location inside the block can be found by
+scanning the memory (including saved registers). This means that there
+might be no way for the kernel to pass the address of the allocated
+block to a freeing function and therefore the block is considered a
+memory leak.
+
+The scanning algorithm steps:
+
+  1. mark all objects as white (remaining white objects will later be
+     considered orphan)
+  2. scan the memory starting with the data section and stacks, checking
+     the values against the addresses stored in the prio search tree. If
+     a pointer to a white object is found, the object is added to the
+     gray list
+  3. scan the gray objects for matching addresses (some white objects
+     can become gray and added at the end of the gray list) until the
+     gray set is finished
+  4. the remaining white objects are considered orphan and reported via
+     /sys/kernel/debug/kmemleak
+
+Some allocated memory blocks have pointers stored in the kernel's
+internal data structures and they cannot be detected as orphans. To
+avoid this, kmemleak can also store the number of values pointing to an
+address inside the block address range that need to be found so that the
+block is not considered a leak. One example is __vmalloc().
+
+Kmemleak API
+------------
+
+See the include/linux/kmemleak.h header for the functions prototype.
+
+kmemleak_init		 - initialize kmemleak
+kmemleak_alloc		 - notify of a memory block allocation
+kmemleak_free		 - notify of a memory block freeing
+kmemleak_not_leak	 - mark an object as not a leak
+kmemleak_ignore		 - do not scan or report an object as leak
+kmemleak_scan_area	 - add scan areas inside a memory block
+kmemleak_no_scan	 - do not scan a memory block
+kmemleak_erase		 - erase an old value in a pointer variable
+kmemleak_alloc_recursive - as kmemleak_alloc but checks the recursiveness
+kmemleak_free_recursive	 - as kmemleak_free but checks the recursiveness
+
+Dealing with false positives/negatives
+--------------------------------------
+
+The false negatives are real memory leaks (orphan objects) but not
+reported by kmemleak because values found during the memory scanning
+point to such objects. To reduce the number of false negatives, kmemleak
+provides the kmemleak_ignore, kmemleak_scan_area, kmemleak_no_scan and
+kmemleak_erase functions (see above). The task stacks also increase the
+amount of false negatives and their scanning is not enabled by default.
+
+The false positives are objects wrongly reported as being memory leaks
+(orphan). For objects known not to be leaks, kmemleak provides the
+kmemleak_not_leak function. The kmemleak_ignore could also be used if
+the memory block is known not to contain other pointers and it will no
+longer be scanned.
+
+Some of the reported leaks are only transient, especially on SMP
+systems, because of pointers temporarily stored in CPU registers or
+stacks. Kmemleak defines MSECS_MIN_AGE (defaulting to 1000) representing
+the minimum age of an object to be reported as a memory leak.
+
+Limitations and Drawbacks
+-------------------------
+
+The main drawback is the reduced performance of memory allocation and
+freeing. To avoid other penalties, the memory scanning is only performed
+when the /sys/kernel/debug/kmemleak file is read. Anyway, this tool is
+intended for debugging purposes where the performance might not be the
+most important requirement.
+
+To keep the algorithm simple, kmemleak scans for values pointing to any
+address inside a block's address range. This may lead to an increased
+number of false negatives. However, it is likely that a real memory leak
+will eventually become visible.
+
+Another source of false negatives is the data stored in non-pointer
+values. In a future version, kmemleak could only scan the pointer
+members in the allocated structures. This feature would solve many of
+the false negative cases described above.
+
+The tool can report false positives. These are cases where an allocated
+block doesn't need to be freed (some cases in the init_call functions),
+the pointer is calculated by other methods than the usual container_of
+macro or the pointer is stored in a location not scanned by kmemleak.
+
+Page allocations and ioremap are not tracked. Only the ARM and x86
+architectures are currently supported.
-- 
GitLab


From d5cff635290aec9ad7e6ee546aa4fae895361cbb Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:22:40 +0100
Subject: [PATCH 2067/2198] kmemleak: Add the slab memory allocation/freeing
 hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from
the slab allocator. The patch also adds the SLAB_NOLEAKTRACE flag to
avoid recursive calls to kmemleak when it allocates its own data
structures.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 include/linux/slab.h |  2 ++
 mm/slab.c            | 32 ++++++++++++++++++++++++++++++--
 2 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 24c5602bee99d..48803064cedf7 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -62,6 +62,8 @@
 # define SLAB_DEBUG_OBJECTS	0x00000000UL
 #endif
 
+#define SLAB_NOLEAKTRACE	0x00800000UL	/* Avoid kmemleak tracing */
+
 /* The following flags affect the page allocator grouping pages by mobility */
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL		/* Objects are reclaimable */
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
diff --git a/mm/slab.c b/mm/slab.c
index f85831da9080d..859067f8e4fdd 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -107,6 +107,7 @@
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
 #include	<linux/nodemask.h>
+#include	<linux/kmemleak.h>
 #include	<linux/mempolicy.h>
 #include	<linux/mutex.h>
 #include	<linux/fault-inject.h>
@@ -178,13 +179,13 @@
 			 SLAB_STORE_USER | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
-			 SLAB_DEBUG_OBJECTS)
+			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
 #else
 # define CREATE_MASK	(SLAB_HWCACHE_ALIGN | \
 			 SLAB_CACHE_DMA | \
 			 SLAB_RECLAIM_ACCOUNT | SLAB_PANIC | \
 			 SLAB_DESTROY_BY_RCU | SLAB_MEM_SPREAD | \
-			 SLAB_DEBUG_OBJECTS)
+			 SLAB_DEBUG_OBJECTS | SLAB_NOLEAKTRACE)
 #endif
 
 /*
@@ -964,6 +965,14 @@ static struct array_cache *alloc_arraycache(int node, int entries,
 	struct array_cache *nc = NULL;
 
 	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	/*
+	 * The array_cache structures contain pointers to free object.
+	 * However, when such objects are allocated or transfered to another
+	 * cache the pointers are not cleared and they could be counted as
+	 * valid references during a kmemleak scan. Therefore, kmemleak must
+	 * not scan such objects.
+	 */
+	kmemleak_no_scan(nc);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
@@ -2621,6 +2630,14 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
 		/* Slab management obj is off-slab. */
 		slabp = kmem_cache_alloc_node(cachep->slabp_cache,
 					      local_flags, nodeid);
+		/*
+		 * If the first object in the slab is leaked (it's allocated
+		 * but no one has a reference to it), we want to make sure
+		 * kmemleak does not treat the ->s_mem pointer as a reference
+		 * to the object. Otherwise we will not report the leak.
+		 */
+		kmemleak_scan_area(slabp, offsetof(struct slab, list),
+				   sizeof(struct list_head), local_flags);
 		if (!slabp)
 			return NULL;
 	} else {
@@ -3141,6 +3158,12 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 		STATS_INC_ALLOCMISS(cachep);
 		objp = cache_alloc_refill(cachep, flags);
 	}
+	/*
+	 * To avoid a false negative, if an object that is in one of the
+	 * per-CPU caches is leaked, we need to make sure kmemleak doesn't
+	 * treat the array pointers as a reference to the object.
+	 */
+	kmemleak_erase(&ac->entry[ac->avail]);
 	return objp;
 }
 
@@ -3360,6 +3383,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
   out:
 	local_irq_restore(save_flags);
 	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
+	kmemleak_alloc_recursive(ptr, obj_size(cachep), 1, cachep->flags,
+				 flags);
 
 	if (unlikely((flags & __GFP_ZERO) && ptr))
 		memset(ptr, 0, obj_size(cachep));
@@ -3415,6 +3440,8 @@ __cache_alloc(struct kmem_cache *cachep, gfp_t flags, void *caller)
 	objp = __do_cache_alloc(cachep, flags);
 	local_irq_restore(save_flags);
 	objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
+	kmemleak_alloc_recursive(objp, obj_size(cachep), 1, cachep->flags,
+				 flags);
 	prefetchw(objp);
 
 	if (unlikely((flags & __GFP_ZERO) && objp))
@@ -3530,6 +3557,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
 	struct array_cache *ac = cpu_cache_get(cachep);
 
 	check_irq_off();
+	kmemleak_free_recursive(objp, cachep->flags);
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
 	/*
-- 
GitLab


From 4374e616d28e65265a5b433ceece275449f3d2e3 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:17 +0100
Subject: [PATCH 2068/2198] kmemleak: Add the slob memory allocation/freeing
 hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from the
slob allocator.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Matt Mackall <mpm@selenic.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slob.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/mm/slob.c b/mm/slob.c
index 9b1737b0787bf..12f261499925a 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -67,6 +67,7 @@
 #include <linux/rcupdate.h>
 #include <linux/list.h>
 #include <linux/kmemtrace.h>
+#include <linux/kmemleak.h>
 #include <asm/atomic.h>
 
 /*
@@ -509,6 +510,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 				   size, PAGE_SIZE << order, gfp, node);
 	}
 
+	kmemleak_alloc(ret, size, 1, gfp);
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc_node);
@@ -521,6 +523,7 @@ void kfree(const void *block)
 
 	if (unlikely(ZERO_OR_NULL_PTR(block)))
 		return;
+	kmemleak_free(block);
 
 	sp = slob_page(block);
 	if (is_slob_page(sp)) {
@@ -584,12 +587,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
 	} else if (flags & SLAB_PANIC)
 		panic("Cannot create slab cache %s\n", name);
 
+	kmemleak_alloc(c, sizeof(struct kmem_cache), 1, GFP_KERNEL);
 	return c;
 }
 EXPORT_SYMBOL(kmem_cache_create);
 
 void kmem_cache_destroy(struct kmem_cache *c)
 {
+	kmemleak_free(c);
 	slob_free(c, sizeof(struct kmem_cache));
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
@@ -613,6 +618,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 	if (c->ctor)
 		c->ctor(b);
 
+	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
 	return b;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
@@ -635,6 +641,7 @@ static void kmem_rcu_free(struct rcu_head *head)
 
 void kmem_cache_free(struct kmem_cache *c, void *b)
 {
+	kmemleak_free_recursive(b, c->flags);
 	if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) {
 		struct slob_rcu *slob_rcu;
 		slob_rcu = b + (c->size - sizeof(struct slob_rcu));
-- 
GitLab


From 06f22f13f3cc2eff00db09f053218e5d4b757bc8 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:18 +0100
Subject: [PATCH 2069/2198] kmemleak: Add the slub memory allocation/freeing
 hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from the
slub allocator.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Reviewed-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/slub.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6fe36c4..6674a7907b0cb 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/kmemtrace.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
+#include <linux/kmemleak.h>
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/debugobjects.h>
@@ -143,7 +144,7 @@
  * Set of flags that will prevent slab merging
  */
 #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
-		SLAB_TRACE | SLAB_DESTROY_BY_RCU)
+		SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE)
 
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
 		SLAB_CACHE_DMA)
@@ -1617,6 +1618,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
 	if (unlikely((gfpflags & __GFP_ZERO) && object))
 		memset(object, 0, objsize);
 
+	kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags);
 	return object;
 }
 
@@ -1746,6 +1748,7 @@ static __always_inline void slab_free(struct kmem_cache *s,
 	struct kmem_cache_cpu *c;
 	unsigned long flags;
 
+	kmemleak_free_recursive(x, s->flags);
 	local_irq_save(flags);
 	c = get_cpu_slab(s, smp_processor_id());
 	debug_check_no_locks_freed(object, c->objsize);
-- 
GitLab


From 89219d37a2377c44fde7bff0bf0623453c05329a Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:19 +0100
Subject: [PATCH 2070/2198] kmemleak: Add the vmalloc memory allocation/freeing
 hooks

This patch adds the callbacks to kmemleak_(alloc|free) functions from
vmalloc/vfree.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 mm/vmalloc.c | 30 +++++++++++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716ea38c9f..b7db93572797e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,6 +25,7 @@
 #include <linux/rcupdate.h>
 #include <linux/bootmem.h>
 #include <linux/pfn.h>
+#include <linux/kmemleak.h>
 
 #include <asm/atomic.h>
 #include <asm/uaccess.h>
@@ -1327,6 +1328,9 @@ static void __vunmap(const void *addr, int deallocate_pages)
 void vfree(const void *addr)
 {
 	BUG_ON(in_interrupt());
+
+	kmemleak_free(addr);
+
 	__vunmap(addr, 1);
 }
 EXPORT_SYMBOL(vfree);
@@ -1439,8 +1443,17 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 
 void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
 {
-	return __vmalloc_area_node(area, gfp_mask, prot, -1,
-					__builtin_return_address(0));
+	void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1,
+					 __builtin_return_address(0));
+
+	/*
+	 * A ref_count = 3 is needed because the vm_struct and vmap_area
+	 * structures allocated in the __get_vm_area_node() function contain
+	 * references to the virtual address of the vmalloc'ed block.
+	 */
+	kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask);
+
+	return addr;
 }
 
 /**
@@ -1459,6 +1472,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 						int node, void *caller)
 {
 	struct vm_struct *area;
+	void *addr;
+	unsigned long real_size = size;
 
 	size = PAGE_ALIGN(size);
 	if (!size || (size >> PAGE_SHIFT) > num_physpages)
@@ -1470,7 +1485,16 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
 	if (!area)
 		return NULL;
 
-	return __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+	addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
+
+	/*
+	 * A ref_count = 3 is needed because the vm_struct and vmap_area
+	 * structures allocated in the __get_vm_area_node() function contain
+	 * references to the virtual address of the vmalloc'ed block.
+	 */
+	kmemleak_alloc(addr, real_size, 3, gfp_mask);
+
+	return addr;
 }
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
-- 
GitLab


From dbb1f81ca67a56c6cfce4c94d07c76378fd4af9e Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:19 +0100
Subject: [PATCH 2071/2198] kmemleak: Add kmemleak_alloc callback from
 alloc_large_system_hash

The alloc_large_system_hash function is called from various places in
the kernel and it contains pointers to other allocated structures. It
therefore needs to be traced by kmemleak.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 mm/page_alloc.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 474c7e9dd51ac..17d5f539a9aa5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -46,6 +46,7 @@
 #include <linux/page-isolation.h>
 #include <linux/page_cgroup.h>
 #include <linux/debugobjects.h>
+#include <linux/kmemleak.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -4546,6 +4547,16 @@ void *__init alloc_large_system_hash(const char *tablename,
 	if (_hash_mask)
 		*_hash_mask = (1 << log2qty) - 1;
 
+	/*
+	 * If hashdist is set, the table allocation is done with __vmalloc()
+	 * which invokes the kmemleak_alloc() callback. This function may also
+	 * be called before the slab and kmemleak are initialised when
+	 * kmemleak simply buffers the request to be executed later
+	 * (GFP_ATOMIC flag ignored in this case).
+	 */
+	if (!hashdist)
+		kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+
 	return table;
 }
 
-- 
GitLab


From 4f2294b6dc88d99295230d97fef2c9863cec44c3 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:23:20 +0100
Subject: [PATCH 2072/2198] kmemleak: Add modules support

This patch handles the kmemleak operations needed for modules loading so
that memory allocations from inside a module are properly tracked.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 kernel/module.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 56 insertions(+)

diff --git a/kernel/module.c b/kernel/module.c
index 2383e60fcf3fc..5cd55ab15daf9 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -53,6 +53,7 @@
 #include <linux/ftrace.h>
 #include <linux/async.h>
 #include <linux/percpu.h>
+#include <linux/kmemleak.h>
 
 #if 0
 #define DEBUGP printk
@@ -430,6 +431,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 	unsigned long extra;
 	unsigned int i;
 	void *ptr;
+	int cpu;
 
 	if (align > PAGE_SIZE) {
 		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
@@ -459,6 +461,11 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 			if (!split_block(i, size))
 				return NULL;
 
+		/* add the per-cpu scanning areas */
+		for_each_possible_cpu(cpu)
+			kmemleak_alloc(ptr + per_cpu_offset(cpu), size, 0,
+				       GFP_KERNEL);
+
 		/* Mark allocated */
 		pcpu_size[i] = -pcpu_size[i];
 		return ptr;
@@ -473,6 +480,7 @@ static void percpu_modfree(void *freeme)
 {
 	unsigned int i;
 	void *ptr = __per_cpu_start + block_size(pcpu_size[0]);
+	int cpu;
 
 	/* First entry is core kernel percpu data. */
 	for (i = 1; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) {
@@ -484,6 +492,10 @@ static void percpu_modfree(void *freeme)
 	BUG();
 
  free:
+	/* remove the per-cpu scanning areas */
+	for_each_possible_cpu(cpu)
+		kmemleak_free(freeme + per_cpu_offset(cpu));
+
 	/* Merge with previous? */
 	if (pcpu_size[i-1] >= 0) {
 		pcpu_size[i-1] += pcpu_size[i];
@@ -1876,6 +1888,36 @@ static void *module_alloc_update_bounds(unsigned long size)
 	return ret;
 }
 
+#ifdef CONFIG_DEBUG_KMEMLEAK
+static void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+				 Elf_Shdr *sechdrs, char *secstrings)
+{
+	unsigned int i;
+
+	/* only scan the sections containing data */
+	kmemleak_scan_area(mod->module_core, (unsigned long)mod -
+			   (unsigned long)mod->module_core,
+			   sizeof(struct module), GFP_KERNEL);
+
+	for (i = 1; i < hdr->e_shnum; i++) {
+		if (!(sechdrs[i].sh_flags & SHF_ALLOC))
+			continue;
+		if (strncmp(secstrings + sechdrs[i].sh_name, ".data", 5) != 0
+		    && strncmp(secstrings + sechdrs[i].sh_name, ".bss", 4) != 0)
+			continue;
+
+		kmemleak_scan_area(mod->module_core, sechdrs[i].sh_addr -
+				   (unsigned long)mod->module_core,
+				   sechdrs[i].sh_size, GFP_KERNEL);
+	}
+}
+#else
+static inline void kmemleak_load_module(struct module *mod, Elf_Ehdr *hdr,
+					Elf_Shdr *sechdrs, char *secstrings)
+{
+}
+#endif
+
 /* Allocate and load the module: note that size of section 0 is always
    zero, and we rely on this for optional sections. */
 static noinline struct module *load_module(void __user *umod,
@@ -2046,6 +2088,12 @@ static noinline struct module *load_module(void __user *umod,
 
 	/* Do the allocs. */
 	ptr = module_alloc_update_bounds(mod->core_size);
+	/*
+	 * The pointer to this block is stored in the module structure
+	 * which is inside the block. Just mark it as not being a
+	 * leak.
+	 */
+	kmemleak_not_leak(ptr);
 	if (!ptr) {
 		err = -ENOMEM;
 		goto free_percpu;
@@ -2054,6 +2102,13 @@ static noinline struct module *load_module(void __user *umod,
 	mod->module_core = ptr;
 
 	ptr = module_alloc_update_bounds(mod->init_size);
+	/*
+	 * The pointer to this block is stored in the module structure
+	 * which is inside the block. This block doesn't need to be
+	 * scanned as it contains data and code that will be freed
+	 * after the module is initialized.
+	 */
+	kmemleak_ignore(ptr);
 	if (!ptr && mod->init_size) {
 		err = -ENOMEM;
 		goto free_core;
@@ -2084,6 +2139,7 @@ static noinline struct module *load_module(void __user *umod,
 	}
 	/* Module has been moved. */
 	mod = (void *)sechdrs[modindex].sh_addr;
+	kmemleak_load_module(mod, hdr, sechdrs, secstrings);
 
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
 	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
-- 
GitLab


From 2e1483c995bbd0fa6cbd055ad76088a520799ba4 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:13 +0100
Subject: [PATCH 2073/2198] kmemleak: Remove some of the kmemleak false
 positives

There are allocations for which the main pointer cannot be found but
they are not memory leaks. This patch fixes some of them. For more
information on false positives, see Documentation/kmemleak.txt.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 drivers/char/vt.c      | 7 +++++++
 fs/block_dev.c         | 6 ++++++
 include/linux/percpu.h | 5 +++++
 3 files changed, 18 insertions(+)

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4de489b..961c1a788c613 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -104,6 +104,7 @@
 #include <linux/io.h>
 #include <asm/system.h>
 #include <linux/uaccess.h>
+#include <linux/kmemleak.h>
 
 #define MAX_NR_CON_DRIVER 16
 
@@ -2880,6 +2881,12 @@ static int __init con_init(void)
 	 */
 	for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
 		vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+		/*
+		 * Kmemleak does not track the memory allocated via
+		 * alloc_bootmem() but this block contains pointers to
+		 * other blocks allocated via kmalloc.
+		 */
+		kmemleak_alloc(vc, sizeof(struct vc_data), 1, GFP_ATOMIC);
 		INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
 		visual_init(vc, currcons, 1);
 		vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index f45dbc18dd175..d250f807fd83d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -25,6 +25,7 @@
 #include <linux/uio.h>
 #include <linux/namei.h>
 #include <linux/log2.h>
+#include <linux/kmemleak.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -492,6 +493,11 @@ void __init bdev_cache_init(void)
 	bd_mnt = kern_mount(&bd_type);
 	if (IS_ERR(bd_mnt))
 		panic("Cannot create bdev pseudo-fs");
+	/*
+	 * This vfsmount structure is only used to obtain the
+	 * blockdev_superblock, so tell kmemleak not to report it.
+	 */
+	kmemleak_not_leak(bd_mnt);
 	blockdev_superblock = bd_mnt->mnt_sb;	/* For writeback */
 }
 
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1581ff235c7e3..26fd9d12f050b 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -86,7 +86,12 @@ struct percpu_data {
 	void *ptrs[1];
 };
 
+/* pointer disguising messes up the kmemleak objects tracking */
+#ifndef CONFIG_DEBUG_KMEMLEAK
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+#else
+#define __percpu_disguise(pdata) (struct percpu_data *)(pdata)
+#endif
 
 #define per_cpu_ptr(ptr, cpu)						\
 ({									\
-- 
GitLab


From 3bba00d7bdd57cb7aa739b751fa0a1fbbb04dc18 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:13 +0100
Subject: [PATCH 2074/2198] kmemleak: Enable the building of the memory leak
 detector

This patch adds the Kconfig.debug and Makefile entries needed for
building kmemleak into the kernel.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 lib/Kconfig.debug | 22 ++++++++++++++++++++++
 mm/Makefile       |  1 +
 2 files changed, 23 insertions(+)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 6cdcf38f2da9b..5cc3506574e9d 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -336,6 +336,28 @@ config SLUB_STATS
 	  out which slabs are relevant to a particular load.
 	  Try running: slabinfo -DA
 
+config DEBUG_KMEMLEAK
+	bool "Kernel memory leak detector"
+	depends on DEBUG_KERNEL && EXPERIMENTAL && (X86 || ARM) && \
+		!MEMORY_HOTPLUG
+	select DEBUG_SLAB if SLAB
+	select SLUB_DEBUG if SLUB
+	select DEBUG_FS if SYSFS
+	select STACKTRACE if STACKTRACE_SUPPORT
+	select KALLSYMS
+	help
+	  Say Y here if you want to enable the memory leak
+	  detector. The memory allocation/freeing is traced in a way
+	  similar to the Boehm's conservative garbage collector, the
+	  difference being that the orphan objects are not freed but
+	  only shown in /sys/kernel/debug/kmemleak. Enabling this
+	  feature will introduce an overhead to memory
+	  allocations. See Documentation/kmemleak.txt for more
+	  details.
+
+	  In order to access the kmemleak file, debugfs needs to be
+	  mounted (usually at /sys/kernel/debug).
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)
diff --git a/mm/Makefile b/mm/Makefile
index ec73c68b60154..cb84a025fdbf2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -38,3 +38,4 @@ obj-$(CONFIG_SMP) += allocpercpu.o
 endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
+obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
-- 
GitLab


From 0822ee4ac1ae6af5a953f97f75553738834b10b9 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:14 +0100
Subject: [PATCH 2075/2198] kmemleak: Simple testing module for kmemleak

This patch adds a loadable module that deliberately leaks memory. It
is used for testing various memory leaking scenarios.

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 lib/Kconfig.debug  |  10 ++++
 mm/Makefile        |   1 +
 mm/kmemleak-test.c | 111 +++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 122 insertions(+)
 create mode 100644 mm/kmemleak-test.c

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 5cc3506574e9d..116a35051be6f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -358,6 +358,16 @@ config DEBUG_KMEMLEAK
 	  In order to access the kmemleak file, debugfs needs to be
 	  mounted (usually at /sys/kernel/debug).
 
+config DEBUG_KMEMLEAK_TEST
+	tristate "Simple test for the kernel memory leak detector"
+	depends on DEBUG_KMEMLEAK
+	help
+	  Say Y or M here to build a test for the kernel memory leak
+	  detector. This option enables a module that explicitly leaks
+	  memory.
+
+	  If unsure, say N.
+
 config DEBUG_PREEMPT
 	bool "Debug preemptible kernel"
 	depends on DEBUG_KERNEL && PREEMPT && (TRACE_IRQFLAGS_SUPPORT || PPC64)
diff --git a/mm/Makefile b/mm/Makefile
index cb84a025fdbf2..e89acb090b4df 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -39,3 +39,4 @@ endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
 obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
+obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
new file mode 100644
index 0000000000000..d5292fc6f5238
--- /dev/null
+++ b/mm/kmemleak-test.c
@@ -0,0 +1,111 @@
+/*
+ * mm/kmemleak-test.c
+ *
+ * Copyright (C) 2008 ARM Limited
+ * Written by Catalin Marinas <catalin.marinas@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/fdtable.h>
+
+#include <linux/kmemleak.h>
+
+struct test_node {
+	long header[25];
+	struct list_head list;
+	long footer[25];
+};
+
+static LIST_HEAD(test_list);
+static DEFINE_PER_CPU(void *, test_pointer);
+
+/*
+ * Some very simple testing. This function needs to be extended for
+ * proper testing.
+ */
+static int __init kmemleak_test_init(void)
+{
+	struct test_node *elem;
+	int i;
+
+	printk(KERN_INFO "Kmemleak testing\n");
+
+	/* make some orphan objects */
+	pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(32) = %p\n", kmalloc(32, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(1024) = %p\n", kmalloc(1024, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(2048) = %p\n", kmalloc(2048, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+	pr_info("kmemleak: kmalloc(4096) = %p\n", kmalloc(4096, GFP_KERNEL));
+#ifndef CONFIG_MODULES
+	pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+		kmem_cache_alloc(files_cachep, GFP_KERNEL));
+	pr_info("kmemleak: kmem_cache_alloc(files_cachep) = %p\n",
+		kmem_cache_alloc(files_cachep, GFP_KERNEL));
+#endif
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+	pr_info("kmemleak: vmalloc(64) = %p\n", vmalloc(64));
+
+	/*
+	 * Add elements to a list. They should only appear as orphan
+	 * after the module is removed.
+	 */
+	for (i = 0; i < 10; i++) {
+		elem = kmalloc(sizeof(*elem), GFP_KERNEL);
+		pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem);
+		if (!elem)
+			return -ENOMEM;
+		memset(elem, 0, sizeof(*elem));
+		INIT_LIST_HEAD(&elem->list);
+
+		list_add_tail(&elem->list, &test_list);
+	}
+
+	for_each_possible_cpu(i) {
+		per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL);
+		pr_info("kmemleak: kmalloc(129) = %p\n",
+			per_cpu(test_pointer, i));
+	}
+
+	return 0;
+}
+module_init(kmemleak_test_init);
+
+static void __exit kmemleak_test_exit(void)
+{
+	struct test_node *elem, *tmp;
+
+	/*
+	 * Remove the list elements without actually freeing the
+	 * memory.
+	 */
+	list_for_each_entry_safe(elem, tmp, &test_list, list)
+		list_del(&elem->list);
+}
+module_exit(kmemleak_test_exit);
+
+MODULE_LICENSE("GPL");
-- 
GitLab


From 3aa27bbe7a6536d1ec859d3a97caf3319b5081b7 Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Thu, 11 Jun 2009 13:24:14 +0100
Subject: [PATCH 2076/2198] kmemleak: Add the corresponding MAINTAINERS entry

Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 MAINTAINERS | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index cf4abddfc8a40..e5af3063e8be3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3359,6 +3359,12 @@ F:	Documentation/trace/kmemtrace.txt
 F:	include/trace/kmemtrace.h
 F:	kernel/trace/kmemtrace.c
 
+KMEMLEAK
+P:	Catalin Marinas
+M:	catalin.marinas@arm.com
+L:	linux-kernel@vger.kernel.org
+S:	Maintained
+
 KPROBES
 P:	Ananth N Mavinakayanahalli
 M:	ananth@in.ibm.com
-- 
GitLab


From 441c7e0a2ed38827b48b907bd1fa29faba2017a3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 20:05:53 +0300
Subject: [PATCH 2077/2198] bootmem: use slab if bootmem is no longer available

As a preparation for initializing the slab allocator early, make sure the
bootmem allocator does not crash and burn if someone calls it after slab is up;
otherwise we'd need a flag day for switching to early slab.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/bootmem.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index daf92713f7de0..457269c73e81c 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -532,6 +532,9 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata,
 					unsigned long size, unsigned long align,
 					unsigned long goal, unsigned long limit)
 {
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc(size, GFP_NOWAIT);
+
 #ifdef CONFIG_HAVE_ARCH_BOOTMEM
 	bootmem_data_t *p_bdata;
 
-- 
GitLab


From c91c4773b334d4d3a6d44626dc2a558ad97b86f3 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 08:10:28 +0300
Subject: [PATCH 2078/2198] bootmem: fix slab fallback on numa

If the user requested bootmem allocation on a specific node, we should use
kzalloc_node() for the fallback allocation.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/bootmem.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/mm/bootmem.c b/mm/bootmem.c
index 457269c73e81c..282df0a09e6f2 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -665,6 +665,9 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata,
 void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
 				   unsigned long align, unsigned long goal)
 {
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
 	return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0);
 }
 
@@ -696,6 +699,9 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
 {
 	void *ptr;
 
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
 	ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0);
 	if (ptr)
 		return ptr;
@@ -748,6 +754,9 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
 void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
 				       unsigned long align, unsigned long goal)
 {
+	if (WARN_ON_ONCE(slab_is_available()))
+		return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
+
 	return ___alloc_bootmem_node(pgdat->bdata, size, align,
 				goal, ARCH_LOW_ADDRESS_LIMIT);
 }
-- 
GitLab


From 83b519e8b9572c319c8e0c615ee5dd7272856090 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 19:40:04 +0300
Subject: [PATCH 2079/2198] slab: setup allocators earlier in the boot sequence

This patch makes kmalloc() available earlier in the boot sequence so we can get
rid of some bootmem allocations. The bulk of the changes are due to
kmem_cache_init() being called with interrupts disabled which requires some
changes to allocator boostrap code.

Note: 32-bit x86 does WP protect test in mem_init() so we must setup traps
before we call mem_init() during boot as reported by Ingo Molnar:

  We have a hard crash in the WP-protect code:

  [    0.000000] Checking if this processor honours the WP bit even in supervisor mode...BUG: Int 14: CR2 ffcff000
  [    0.000000]      EDI 00000188  ESI 00000ac7  EBP c17eaf9c  ESP c17eaf8c
  [    0.000000]      EBX 000014e0  EDX 0000000e  ECX 01856067  EAX 00000001
  [    0.000000]      err 00000003  EIP c10135b1   CS 00000060  flg 00010002
  [    0.000000] Stack: c17eafa8 c17fd410 c16747bc c17eafc4 c17fd7e5 000011fd f8616000 c18237cc
  [    0.000000]        00099800 c17bb000 c17eafec c17f1668 000001c5 c17f1322 c166e039 c1822bf0
  [    0.000000]        c166e033 c153a014 c18237cc 00020800 c17eaff8 c17f106a 00020800 01ba5003
  [    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #52203
  [    0.000000] Call Trace:
  [    0.000000]  [<c15357c2>] ? printk+0x14/0x16
  [    0.000000]  [<c10135b1>] ? do_test_wp_bit+0x19/0x23
  [    0.000000]  [<c17fd410>] ? test_wp_bit+0x26/0x64
  [    0.000000]  [<c17fd7e5>] ? mem_init+0x1ba/0x1d8
  [    0.000000]  [<c17f1668>] ? start_kernel+0x164/0x2f7
  [    0.000000]  [<c17f1322>] ? unknown_bootoption+0x0/0x19c
  [    0.000000]  [<c17f106a>] ? __init_begin+0x6a/0x6f

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by Linus Torvalds <torvalds@linux-foundation.org>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Matt Mackall <mpm@selenic.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 init/main.c | 36 ++++++++++++++---------
 mm/slab.c   | 85 ++++++++++++++++++++++++++++-------------------------
 mm/slub.c   | 17 ++++++-----
 3 files changed, 77 insertions(+), 61 deletions(-)

diff --git a/init/main.c b/init/main.c
index bb7dc57eee36e..0ab82a453de50 100644
--- a/init/main.c
+++ b/init/main.c
@@ -574,6 +574,28 @@ asmlinkage void __init start_kernel(void)
 	setup_nr_cpu_ids();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
+	build_all_zonelists();
+	page_alloc_init();
+
+	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
+	parse_early_param();
+	parse_args("Booting kernel", static_command_line, __start___param,
+		   __stop___param - __start___param,
+		   &unknown_bootoption);
+	/*
+	 * These use large bootmem allocations and must precede
+	 * kmem_cache_init()
+	 */
+	pidhash_init();
+	vmalloc_init();
+	vfs_caches_init_early();
+	sort_main_extable();
+	trap_init();
+	/*
+	 * Set up kernel memory allocators
+	 */
+	mem_init();
+	kmem_cache_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
@@ -585,25 +607,15 @@ asmlinkage void __init start_kernel(void)
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
-	build_all_zonelists();
-	page_alloc_init();
-	printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line);
-	parse_early_param();
-	parse_args("Booting kernel", static_command_line, __start___param,
-		   __stop___param - __start___param,
-		   &unknown_bootoption);
 	if (!irqs_disabled()) {
 		printk(KERN_WARNING "start_kernel(): bug: interrupts were "
 				"enabled *very* early, fixing it\n");
 		local_irq_disable();
 	}
-	sort_main_extable();
-	trap_init();
 	rcu_init();
 	/* init some links before init_ISA_irqs() */
 	early_irq_init();
 	init_IRQ();
-	pidhash_init();
 	init_timers();
 	hrtimers_init();
 	softirq_init();
@@ -645,14 +657,10 @@ asmlinkage void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
-	vmalloc_init();
-	vfs_caches_init_early();
 	cpuset_init_early();
 	page_cgroup_init();
-	mem_init();
 	enable_debug_pagealloc();
 	cpu_hotplug_init();
-	kmem_cache_init();
 	kmemtrace_init();
 	debug_objects_mem_init();
 	idr_init_cache();
diff --git a/mm/slab.c b/mm/slab.c
index f85831da9080d..2bd611fa87bf5 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -315,7 +315,7 @@ static int drain_freelist(struct kmem_cache *cache,
 			struct kmem_list3 *l3, int tofree);
 static void free_block(struct kmem_cache *cachep, void **objpp, int len,
 			int node);
-static int enable_cpucache(struct kmem_cache *cachep);
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp);
 static void cache_reap(struct work_struct *unused);
 
 /*
@@ -958,12 +958,12 @@ static void __cpuinit start_cpu_timer(int cpu)
 }
 
 static struct array_cache *alloc_arraycache(int node, int entries,
-					    int batchcount)
+					    int batchcount, gfp_t gfp)
 {
 	int memsize = sizeof(void *) * entries + sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
-	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	nc = kmalloc_node(memsize, gfp, node);
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
@@ -1003,7 +1003,7 @@ static int transfer_objects(struct array_cache *to,
 #define drain_alien_cache(cachep, alien) do { } while (0)
 #define reap_alien(cachep, l3) do { } while (0)
 
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
+static inline struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	return (struct array_cache **)BAD_ALIEN_MAGIC;
 }
@@ -1034,7 +1034,7 @@ static inline void *____cache_alloc_node(struct kmem_cache *cachep,
 static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
 static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
 
-static struct array_cache **alloc_alien_cache(int node, int limit)
+static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
 {
 	struct array_cache **ac_ptr;
 	int memsize = sizeof(void *) * nr_node_ids;
@@ -1042,14 +1042,14 @@ static struct array_cache **alloc_alien_cache(int node, int limit)
 
 	if (limit > 1)
 		limit = 12;
-	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
+	ac_ptr = kmalloc_node(memsize, gfp, node);
 	if (ac_ptr) {
 		for_each_node(i) {
 			if (i == node || !node_online(i)) {
 				ac_ptr[i] = NULL;
 				continue;
 			}
-			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
+			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp);
 			if (!ac_ptr[i]) {
 				for (i--; i >= 0; i--)
 					kfree(ac_ptr[i]);
@@ -1282,20 +1282,20 @@ static int __cpuinit cpuup_prepare(long cpu)
 		struct array_cache **alien = NULL;
 
 		nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount);
+					cachep->batchcount, GFP_KERNEL);
 		if (!nc)
 			goto bad;
 		if (cachep->shared) {
 			shared = alloc_arraycache(node,
 				cachep->shared * cachep->batchcount,
-				0xbaadf00d);
+				0xbaadf00d, GFP_KERNEL);
 			if (!shared) {
 				kfree(nc);
 				goto bad;
 			}
 		}
 		if (use_alien_caches) {
-			alien = alloc_alien_cache(node, cachep->limit);
+			alien = alloc_alien_cache(node, cachep->limit, GFP_KERNEL);
 			if (!alien) {
 				kfree(shared);
 				kfree(nc);
@@ -1399,10 +1399,9 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 {
 	struct kmem_list3 *ptr;
 
-	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
+	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_NOWAIT, nodeid);
 	BUG_ON(!ptr);
 
-	local_irq_disable();
 	memcpy(ptr, list, sizeof(struct kmem_list3));
 	/*
 	 * Do not assume that spinlocks can be initialized via memcpy:
@@ -1411,7 +1410,6 @@ static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list,
 
 	MAKE_ALL_LISTS(cachep, ptr, nodeid);
 	cachep->nodelists[nodeid] = ptr;
-	local_irq_enable();
 }
 
 /*
@@ -1575,9 +1573,8 @@ void __init kmem_cache_init(void)
 	{
 		struct array_cache *ptr;
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(&cache_cache) != &initarray_cache.cache);
 		memcpy(ptr, cpu_cache_get(&cache_cache),
 		       sizeof(struct arraycache_init));
@@ -1587,11 +1584,9 @@ void __init kmem_cache_init(void)
 		spin_lock_init(&ptr->lock);
 
 		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
 
-		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+		ptr = kmalloc(sizeof(struct arraycache_init), GFP_NOWAIT);
 
-		local_irq_disable();
 		BUG_ON(cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep)
 		       != &initarray_generic.cache);
 		memcpy(ptr, cpu_cache_get(malloc_sizes[INDEX_AC].cs_cachep),
@@ -1603,7 +1598,6 @@ void __init kmem_cache_init(void)
 
 		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
 		    ptr;
-		local_irq_enable();
 	}
 	/* 5) Replace the bootstrap kmem_list3's */
 	{
@@ -1627,7 +1621,7 @@ void __init kmem_cache_init(void)
 		struct kmem_cache *cachep;
 		mutex_lock(&cache_chain_mutex);
 		list_for_each_entry(cachep, &cache_chain, next)
-			if (enable_cpucache(cachep))
+			if (enable_cpucache(cachep, GFP_NOWAIT))
 				BUG();
 		mutex_unlock(&cache_chain_mutex);
 	}
@@ -2064,10 +2058,10 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 	return left_over;
 }
 
-static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
+static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	if (g_cpucache_up == FULL)
-		return enable_cpucache(cachep);
+		return enable_cpucache(cachep, gfp);
 
 	if (g_cpucache_up == NONE) {
 		/*
@@ -2089,7 +2083,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep)
 			g_cpucache_up = PARTIAL_AC;
 	} else {
 		cachep->array[smp_processor_id()] =
-			kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
+			kmalloc(sizeof(struct arraycache_init), gfp);
 
 		if (g_cpucache_up == PARTIAL_AC) {
 			set_up_list3s(cachep, SIZE_L3);
@@ -2153,6 +2147,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 {
 	size_t left_over, slab_size, ralign;
 	struct kmem_cache *cachep = NULL, *pc;
+	gfp_t gfp;
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -2168,8 +2163,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * We use cache_chain_mutex to ensure a consistent view of
 	 * cpu_online_mask as well.  Please see cpuup_callback
 	 */
-	get_online_cpus();
-	mutex_lock(&cache_chain_mutex);
+	if (slab_is_available()) {
+		get_online_cpus();
+		mutex_lock(&cache_chain_mutex);
+	}
 
 	list_for_each_entry(pc, &cache_chain, next) {
 		char tmp;
@@ -2278,8 +2275,13 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 */
 	align = ralign;
 
+	if (slab_is_available())
+		gfp = GFP_KERNEL;
+	else
+		gfp = GFP_NOWAIT;
+
 	/* Get cache's description obj. */
-	cachep = kmem_cache_zalloc(&cache_cache, GFP_KERNEL);
+	cachep = kmem_cache_zalloc(&cache_cache, gfp);
 	if (!cachep)
 		goto oops;
 
@@ -2382,7 +2384,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	cachep->ctor = ctor;
 	cachep->name = name;
 
-	if (setup_cpu_cache(cachep)) {
+	if (setup_cpu_cache(cachep, gfp)) {
 		__kmem_cache_destroy(cachep);
 		cachep = NULL;
 		goto oops;
@@ -2394,8 +2396,10 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	if (!cachep && (flags & SLAB_PANIC))
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
-	mutex_unlock(&cache_chain_mutex);
-	put_online_cpus();
+	if (slab_is_available()) {
+		mutex_unlock(&cache_chain_mutex);
+		put_online_cpus();
+	}
 	return cachep;
 }
 EXPORT_SYMBOL(kmem_cache_create);
@@ -3802,7 +3806,7 @@ EXPORT_SYMBOL_GPL(kmem_cache_name);
 /*
  * This initializes kmem_list3 or resizes various caches for all nodes.
  */
-static int alloc_kmemlist(struct kmem_cache *cachep)
+static int alloc_kmemlist(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int node;
 	struct kmem_list3 *l3;
@@ -3812,7 +3816,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 	for_each_online_node(node) {
 
                 if (use_alien_caches) {
-                        new_alien = alloc_alien_cache(node, cachep->limit);
+                        new_alien = alloc_alien_cache(node, cachep->limit, gfp);
                         if (!new_alien)
                                 goto fail;
                 }
@@ -3821,7 +3825,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 		if (cachep->shared) {
 			new_shared = alloc_arraycache(node,
 				cachep->shared*cachep->batchcount,
-					0xbaadf00d);
+					0xbaadf00d, gfp);
 			if (!new_shared) {
 				free_alien_cache(new_alien);
 				goto fail;
@@ -3850,7 +3854,7 @@ static int alloc_kmemlist(struct kmem_cache *cachep)
 			free_alien_cache(new_alien);
 			continue;
 		}
-		l3 = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, node);
+		l3 = kmalloc_node(sizeof(struct kmem_list3), gfp, node);
 		if (!l3) {
 			free_alien_cache(new_alien);
 			kfree(new_shared);
@@ -3906,18 +3910,18 @@ static void do_ccupdate_local(void *info)
 
 /* Always called with the cache_chain_mutex held */
 static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
-				int batchcount, int shared)
+				int batchcount, int shared, gfp_t gfp)
 {
 	struct ccupdate_struct *new;
 	int i;
 
-	new = kzalloc(sizeof(*new), GFP_KERNEL);
+	new = kzalloc(sizeof(*new), gfp);
 	if (!new)
 		return -ENOMEM;
 
 	for_each_online_cpu(i) {
 		new->new[i] = alloc_arraycache(cpu_to_node(i), limit,
-						batchcount);
+						batchcount, gfp);
 		if (!new->new[i]) {
 			for (i--; i >= 0; i--)
 				kfree(new->new[i]);
@@ -3944,11 +3948,11 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 		kfree(ccold);
 	}
 	kfree(new);
-	return alloc_kmemlist(cachep);
+	return alloc_kmemlist(cachep, gfp);
 }
 
 /* Called with cache_chain_mutex held always */
-static int enable_cpucache(struct kmem_cache *cachep)
+static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
 {
 	int err;
 	int limit, shared;
@@ -3994,7 +3998,7 @@ static int enable_cpucache(struct kmem_cache *cachep)
 	if (limit > 32)
 		limit = 32;
 #endif
-	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared);
+	err = do_tune_cpucache(cachep, limit, (limit + 1) / 2, shared, gfp);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
 		       cachep->name, -err);
@@ -4300,7 +4304,8 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
 				res = 0;
 			} else {
 				res = do_tune_cpucache(cachep, limit,
-						       batchcount, shared);
+						       batchcount, shared,
+						       GFP_KERNEL);
 			}
 			break;
 		}
diff --git a/mm/slub.c b/mm/slub.c
index 5e805a6fe36c4..c1815a63807a3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2557,13 +2557,16 @@ static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
 	if (gfp_flags & SLUB_DMA)
 		flags = SLAB_CACHE_DMA;
 
-	down_write(&slub_lock);
+	/*
+	 * This function is called with IRQs disabled during early-boot on
+	 * single CPU so there's no need to take slub_lock here.
+	 */
 	if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
 								flags, NULL))
 		goto panic;
 
 	list_add(&s->list, &slab_caches);
-	up_write(&slub_lock);
+
 	if (sysfs_slab_add(s))
 		goto panic;
 	return s;
@@ -3021,7 +3024,7 @@ void __init kmem_cache_init(void)
 	 * kmem_cache_open for slab_state == DOWN.
 	 */
 	create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
-		sizeof(struct kmem_cache_node), GFP_KERNEL);
+		sizeof(struct kmem_cache_node), GFP_NOWAIT);
 	kmalloc_caches[0].refcount = -1;
 	caches++;
 
@@ -3034,16 +3037,16 @@ void __init kmem_cache_init(void)
 	/* Caches that are not of the two-to-the-power-of size */
 	if (KMALLOC_MIN_SIZE <= 64) {
 		create_kmalloc_cache(&kmalloc_caches[1],
-				"kmalloc-96", 96, GFP_KERNEL);
+				"kmalloc-96", 96, GFP_NOWAIT);
 		caches++;
 		create_kmalloc_cache(&kmalloc_caches[2],
-				"kmalloc-192", 192, GFP_KERNEL);
+				"kmalloc-192", 192, GFP_NOWAIT);
 		caches++;
 	}
 
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
 		create_kmalloc_cache(&kmalloc_caches[i],
-			"kmalloc", 1 << i, GFP_KERNEL);
+			"kmalloc", 1 << i, GFP_NOWAIT);
 		caches++;
 	}
 
@@ -3080,7 +3083,7 @@ void __init kmem_cache_init(void)
 	/* Provide the correct kmalloc names now that the caches are up */
 	for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++)
 		kmalloc_caches[i]. name =
-			kasprintf(GFP_KERNEL, "kmalloc-%d", 1 << i);
+			kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
 
 #ifdef CONFIG_SMP
 	register_cpu_notifier(&slab_notifier);
-- 
GitLab


From 43ebdac42f16037263b52a5aeedcd1bfa4a9bb29 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Mon, 25 May 2009 15:01:35 +0300
Subject: [PATCH 2080/2198] vmalloc: use kzalloc() instead of alloc_bootmem()

We can call vmalloc_init() after kmem_cache_init() and use kzalloc() instead of
the bootmem allocator when initializing vmalloc data structures.

Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Nick Piggin <npiggin@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 init/main.c  | 2 +-
 mm/vmalloc.c | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/init/main.c b/init/main.c
index 0ab82a453de50..6d38f9607d149 100644
--- a/init/main.c
+++ b/init/main.c
@@ -587,7 +587,6 @@ asmlinkage void __init start_kernel(void)
 	 * kmem_cache_init()
 	 */
 	pidhash_init();
-	vmalloc_init();
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
@@ -596,6 +595,7 @@ asmlinkage void __init start_kernel(void)
 	 */
 	mem_init();
 	kmem_cache_init();
+	vmalloc_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 083716ea38c9f..323513858c200 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,7 +23,6 @@
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/rcupdate.h>
-#include <linux/bootmem.h>
 #include <linux/pfn.h>
 
 #include <asm/atomic.h>
@@ -1032,7 +1031,7 @@ void __init vmalloc_init(void)
 
 	/* Import existing vmlist entries. */
 	for (tmp = vmlist; tmp; tmp = tmp->next) {
-		va = alloc_bootmem(sizeof(struct vmap_area));
+		va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
 		va->flags = tmp->flags | VM_VM_AREA;
 		va->va_start = (unsigned long)tmp->addr;
 		va->va_end = va->va_start + tmp->size;
-- 
GitLab


From 444f478f65c7ca4606f9965b31feed13fe2bc9fa Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 18:29:06 +0300
Subject: [PATCH 2081/2198] init: introduce mm_init()

As suggested by Christoph Lameter, introduce mm_init() now that we initialize
all the kernel memory allocations together.

Cc: Christoph Lameter <cl@linux-foundation.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 init/main.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/init/main.c b/init/main.c
index 6d38f9607d149..7917695bf71ee 100644
--- a/init/main.c
+++ b/init/main.c
@@ -533,6 +533,16 @@ void __init __weak thread_info_cache_init(void)
 {
 }
 
+/*
+ * Set up kernel memory allocators
+ */
+static void __init mm_init(void)
+{
+	mem_init();
+	kmem_cache_init();
+	vmalloc_init();
+}
+
 asmlinkage void __init start_kernel(void)
 {
 	char * command_line;
@@ -590,12 +600,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	sort_main_extable();
 	trap_init();
-	/*
-	 * Set up kernel memory allocators
-	 */
-	mem_init();
-	kmem_cache_init();
-	vmalloc_init();
+	mm_init();
 	/*
 	 * Set up the scheduler prior starting any interrupts (such as the
 	 * timer interrupt). Full topology setup happens at smp_init()
-- 
GitLab


From 36b7b6d465489c4754c4fd66fcec6086eba87896 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 23:42:36 +0300
Subject: [PATCH 2082/2198] sched: use kzalloc() instead of the bootmem
 allocator

Now that kmem_cache_init() happens before sched_init(), we should use kzalloc()
and not the bootmem allocator.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 kernel/sched.c | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 14c447ae5d53a..a9ff9533355ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -68,7 +68,6 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
@@ -7782,21 +7781,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
+
 	memset(rd, 0, sizeof(*rd));
 
-	if (bootmem) {
-		alloc_bootmem_cpumask_var(&def_root_domain.span);
-		alloc_bootmem_cpumask_var(&def_root_domain.online);
-		alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-		cpupri_init(&rd->cpupri, true);
-		return 0;
-	}
+	if (bootmem)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->span, gfp))
 		goto out;
-	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->online, gfp))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
 	if (cpupri_init(&rd->cpupri, false) != 0)
@@ -9123,7 +9119,7 @@ void __init sched_init(void)
 	 * we use alloc_bootmem().
 	 */
 	if (alloc_size) {
-		ptr = (unsigned long)alloc_bootmem(alloc_size);
+		ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		init_task_group.se = (struct sched_entity **)ptr;
-- 
GitLab


From a5f4f52e82114e85aa1a066bd1a450acc19a464d Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Wed, 10 Jun 2009 23:53:37 +0300
Subject: [PATCH 2083/2198] vt: use kzalloc() instead of the bootmem allocator

Now that kmem_cache_init() happens before console_init(), we should use
kzalloc() and not the bootmem allocator.

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 drivers/char/vt.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/char/vt.c b/drivers/char/vt.c
index 08151d4de489b..c796a86ab7f36 100644
--- a/drivers/char/vt.c
+++ b/drivers/char/vt.c
@@ -95,7 +95,6 @@
 #include <linux/timer.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
-#include <linux/bootmem.h>
 #include <linux/pm.h>
 #include <linux/font.h>
 #include <linux/bitops.h>
@@ -2875,14 +2874,11 @@ static int __init con_init(void)
 		mod_timer(&console_timer, jiffies + blankinterval);
 	}
 
-	/*
-	 * kmalloc is not running yet - we use the bootmem allocator.
-	 */
 	for (currcons = 0; currcons < MIN_NR_CONSOLES; currcons++) {
-		vc_cons[currcons].d = vc = alloc_bootmem(sizeof(struct vc_data));
+		vc_cons[currcons].d = vc = kzalloc(sizeof(struct vc_data), GFP_NOWAIT);
 		INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK);
 		visual_init(vc, currcons, 1);
-		vc->vc_screenbuf = (unsigned short *)alloc_bootmem(vc->vc_screenbuf_size);
+		vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT);
 		vc->vc_kmalloced = 0;
 		vc_init(vc, vc->vc_rows, vc->vc_cols,
 			currcons || !vc->vc_sw->con_save_screen);
-- 
GitLab


From 38c7fed2f5ffee17e1fa3e0f78b0e1bf43d52d13 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 25 May 2009 15:10:58 +0300
Subject: [PATCH 2084/2198] x86: remove some alloc_bootmem_cpumask_var calling

Now that we set up the slab allocator earlier, we can get rid of some
alloc_bootmem_cpumask_var() calls in boot code.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 arch/x86/kernel/apic/io_apic.c |  4 ++--
 include/linux/irq.h            | 18 +++++++-----------
 kernel/cpuset.c                |  2 +-
 kernel/profile.c               |  6 ------
 lib/cpumask.c                  | 11 ++---------
 5 files changed, 12 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 1946fac42ab3c..139201a562bad 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -185,8 +185,8 @@ int __init arch_early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_bootmem_cpumask_var(&cfg[i].domain);
-		alloc_bootmem_cpumask_var(&cfg[i].old_domain);
+		alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT);
+		alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
diff --git a/include/linux/irq.h b/include/linux/irq.h
index eedbb8e5e0ccf..1e50c34f00628 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -430,23 +430,19 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
  * Returns true if successful (or not required).
  */
 static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
-								bool boot)
+							bool boot)
 {
-#ifdef CONFIG_CPUMASK_OFFSTACK
-	if (boot) {
-		alloc_bootmem_cpumask_var(&desc->affinity);
+	gfp_t gfp = GFP_ATOMIC;
 
-#ifdef CONFIG_GENERIC_PENDING_IRQ
-		alloc_bootmem_cpumask_var(&desc->pending_mask);
-#endif
-		return true;
-	}
+	if (boot)
+		gfp = GFP_NOWAIT;
 
-	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	if (!alloc_cpumask_var_node(&desc->affinity, gfp, node))
 		return false;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+	if (!alloc_cpumask_var_node(&desc->pending_mask, gfp, node)) {
 		free_cpumask_var(desc->affinity);
 		return false;
 	}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 026faccca869e..d5a7e17474ee3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1857,7 +1857,7 @@ struct cgroup_subsys cpuset_subsys = {
 
 int __init cpuset_init_early(void)
 {
-	alloc_bootmem_cpumask_var(&top_cpuset.cpus_allowed);
+	alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_NOWAIT);
 
 	top_cpuset.mems_generation = cpuset_mems_generation++;
 	return 0;
diff --git a/kernel/profile.c b/kernel/profile.c
index 7724e0409bae7..28cf26ad2d247 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -111,12 +111,6 @@ int __ref profile_init(void)
 	/* only text is profiled */
 	prof_len = (_etext - _stext) >> prof_shift;
 	buffer_bytes = prof_len*sizeof(atomic_t);
-	if (!slab_is_available()) {
-		prof_buffer = alloc_bootmem(buffer_bytes);
-		alloc_bootmem_cpumask_var(&prof_cpu_mask);
-		cpumask_copy(prof_cpu_mask, cpu_possible_mask);
-		return 0;
-	}
 
 	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
 		return -ENOMEM;
diff --git a/lib/cpumask.c b/lib/cpumask.c
index eb23aaa0c7b81..7bb4142a502f3 100644
--- a/lib/cpumask.c
+++ b/lib/cpumask.c
@@ -92,15 +92,8 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu)
  */
 bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node)
 {
-	if (likely(slab_is_available()))
-		*mask = kmalloc_node(cpumask_size(), flags, node);
-	else {
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-		printk(KERN_ERR
-			"=> alloc_cpumask_var: kmalloc not available!\n");
-#endif
-		*mask = NULL;
-	}
+	*mask = kmalloc_node(cpumask_size(), flags, node);
+
 #ifdef CONFIG_DEBUG_PER_CPU_MAPS
 	if (!*mask) {
 		printk(KERN_ERR "=> alloc_cpumask_var: failed!\n");
-- 
GitLab


From dad213aeb59718623fc59defeff95fe8c3feb8a0 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 28 May 2009 18:14:40 -0700
Subject: [PATCH 2085/2198] irq/cpumask: make memoryless node zero happy

Don't hardcode to node zero for early boot IRQ setup memory allocations.

[ penberg@cs.helsinki.fi: minor cleanups ]
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 arch/x86/kernel/apic/io_apic.c | 6 ++++--
 kernel/irq/handle.c            | 9 +++++----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 139201a562bad..94605e7f6a541 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -177,16 +177,18 @@ int __init arch_early_irq_init(void)
 	struct irq_cfg *cfg;
 	struct irq_desc *desc;
 	int count;
+	int node;
 	int i;
 
 	cfg = irq_cfgx;
 	count = ARRAY_SIZE(irq_cfgx);
+	node= cpu_to_node(boot_cpu_id);
 
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_cpumask_var(&cfg[i].domain, GFP_NOWAIT);
-		alloc_cpumask_var(&cfg[i].old_domain, GFP_NOWAIT);
+		alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+		alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a60018402f422..e161999b66832 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -150,6 +150,7 @@ int __init early_irq_init(void)
 {
 	struct irq_desc *desc;
 	int legacy_count;
+	int node;
 	int i;
 
 	init_irq_default_affinity();
@@ -160,20 +161,20 @@ int __init early_irq_init(void)
 
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
+ 	node = first_online_node;
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
 	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
 
 	/* allocate based on nr_cpu_ids */
-	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
-	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
-					  sizeof(int));
+	kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int), GFP_NOWAIT, node);
 
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-		alloc_desc_masks(&desc[i], 0, true);
+		alloc_desc_masks(&desc[i], node, true);
 		init_desc_masks(&desc[i]);
 		irq_desc_ptrs[i] = desc + i;
 	}
-- 
GitLab


From 959982fee4e635c61780e989c3e34267143fcc02 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 28 May 2009 18:15:16 -0700
Subject: [PATCH 2086/2198] memcg: don't use bootmem allocator in setup code

The bootmem allocator is no longer available for page_cgroup_init() because we
set up the kernel slab allocator much earlier now.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 mm/page_cgroup.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 791905c991df9..3dd4a909a1de8 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -47,6 +47,8 @@ static int __init alloc_node_page_cgroup(int nid)
 	struct page_cgroup *base, *pc;
 	unsigned long table_size;
 	unsigned long start_pfn, nr_pages, index;
+	struct page *page;
+	unsigned int order;
 
 	start_pfn = NODE_DATA(nid)->node_start_pfn;
 	nr_pages = NODE_DATA(nid)->node_spanned_pages;
@@ -55,11 +57,13 @@ static int __init alloc_node_page_cgroup(int nid)
 		return 0;
 
 	table_size = sizeof(struct page_cgroup) * nr_pages;
-
-	base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
-			table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
-	if (!base)
+	order = get_order(table_size);
+	page = alloc_pages_node(nid, GFP_NOWAIT | __GFP_ZERO, order);
+	if (!page)
+		page = alloc_pages_node(-1, GFP_NOWAIT | __GFP_ZERO, order);
+	if (!page)
 		return -ENOMEM;
+	base = page_address(page);
 	for (index = 0; index < nr_pages; index++) {
 		pc = base + index;
 		__init_page_cgroup(pc, start_pfn + index);
-- 
GitLab


From 4bdddf8ff9bbb8aa7b4d7847586202bd25842c90 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 08:35:27 +0300
Subject: [PATCH 2087/2198] sched: use alloc_cpumask_var() instead of
 alloc_bootmem_cpumask_var()

Slab is initialized when sched_init() runs now so lets use alloc_cpumask_var().

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 kernel/sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index a9ff9533355ba..12cc09cf51478 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9310,13 +9310,13 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_bootmem_cpumask_var(&nohz.cpu_mask);
-	alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
+	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_bootmem_cpumask_var(&cpu_isolated_map);
+	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	scheduler_running = 1;
-- 
GitLab


From 0fb530291621c8b8a1b16abeeab05d9262489f71 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 08:41:22 +0300
Subject: [PATCH 2088/2198] sched: use slab in cpupri_init()

Lets not use the bootmem allocator in cpupri_init() as slab is already up when
it is run.

Cc: Ingo Molnar <mingo@elte.hu>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 kernel/sched.c        | 2 +-
 kernel/sched_cpupri.c | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index 12cc09cf51478..dcf2dc28931ad 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7795,7 +7795,7 @@ static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
 	if (!alloc_cpumask_var(&rd->rto_mask, gfp))
 		goto free_online;
 
-	if (cpupri_init(&rd->cpupri, false) != 0)
+	if (cpupri_init(&rd->cpupri, bootmem) != 0)
 		goto free_rto_mask;
 	return 0;
 
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 344712a5e3edd..7deffc9f0e5f5 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -154,8 +154,12 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
  */
 int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 {
+	gfp_t gfp = GFP_KERNEL;
 	int i;
 
+	if (bootmem)
+		gfp = GFP_NOWAIT;
+
 	memset(cp, 0, sizeof(*cp));
 
 	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
@@ -163,9 +167,7 @@ int __init_refok cpupri_init(struct cpupri *cp, bool bootmem)
 
 		spin_lock_init(&vec->lock);
 		vec->count = 0;
-		if (bootmem)
-			alloc_bootmem_cpumask_var(&vec->mask);
-		else if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
+		if (!zalloc_cpumask_var(&vec->mask, gfp))
 			goto cleanup;
 	}
 
-- 
GitLab


From 22fb4e71e646695c7e0f379ada66b372c2d1aa1a Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 14:46:49 +0300
Subject: [PATCH 2089/2198] irq: use kcalloc() instead of the bootmem allocator

Fixes the following problem:

[    0.000000] Experimental hierarchical RCU init done.
[    0.000000] NR_IRQS:4352 nr_irqs:256
[    0.000000] ------------[ cut here ]------------
[    0.000000] WARNING: at mm/bootmem.c:537 alloc_arch_preferred_bootmem+0x40/0x7e()
[    0.000000] Hardware name: To Be Filled By O.E.M.
[    0.000000] Pid: 0, comm: swapper Not tainted 2.6.30-tip-02161-g7a74539-dirty #59709
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff823f8c8e>] ? alloc_arch_preferred_bootmem+0x40/0x7e
[    0.000000]  [<ffffffff81067168>] warn_slowpath_common+0x88/0xcb
[    0.000000]  [<ffffffff810671d2>] warn_slowpath_null+0x27/0x3d
[    0.000000]  [<ffffffff823f8c8e>] alloc_arch_preferred_bootmem+0x40/0x7e
[    0.000000]  [<ffffffff823f9307>] ___alloc_bootmem_nopanic+0x4e/0xec
[    0.000000]  [<ffffffff823f93c5>] ___alloc_bootmem+0x20/0x61
[    0.000000]  [<ffffffff823f962e>] __alloc_bootmem+0x1e/0x34
[    0.000000]  [<ffffffff823f757c>] early_irq_init+0x6d/0x118
[    0.000000]  [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[    0.000000]  [<ffffffff823e0cf7>] start_kernel+0x192/0x394
[    0.000000]  [<ffffffff823e0140>] ? early_idt_handler+0x0/0x71
[    0.000000]  [<ffffffff823e02ad>] x86_64_start_reservations+0xb4/0xcf
[    0.000000]  [<ffffffff823e0000>] ? __init_begin+0x0/0x140
[    0.000000]  [<ffffffff823e0420>] x86_64_start_kernel+0x158/0x17b
[    0.000000] ---[ end trace a7919e7f17c0a725 ]---
[    0.000000] Fast TSC calibration using PIT
[    0.000000] Detected 2002.510 MHz processor.
[    0.004000] Console: colour VGA+ 80x25

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 kernel/irq/handle.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e161999b66832..1045785412305 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -164,7 +164,7 @@ int __init early_irq_init(void)
  	node = first_online_node;
 
 	/* allocate irq_desc_ptrs array based on nr_irqs */
-	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+	irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT);
 
 	/* allocate based on nr_cpu_ids */
 	kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids *
-- 
GitLab


From b8ec757390282e21d349bf6b602a8cb182da0429 Mon Sep 17 00:00:00 2001
From: Pekka Enberg <penberg@cs.helsinki.fi>
Date: Thu, 11 Jun 2009 19:25:37 +0300
Subject: [PATCH 2090/2198] vgacon: use slab allocator instead of the bootmem
 allocator

Slab is initialized before the console subsystem so use the slab allocator in
vgacon_scrollback_startup().

Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
---
 drivers/video/console/vgacon.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/video/console/vgacon.c b/drivers/video/console/vgacon.c
index 38e86b84dce04..59d7d5ec17a4e 100644
--- a/drivers/video/console/vgacon.c
+++ b/drivers/video/console/vgacon.c
@@ -180,7 +180,7 @@ static inline void vga_set_mem_top(struct vc_data *c)
 }
 
 #ifdef CONFIG_VGACON_SOFT_SCROLLBACK
-#include <linux/bootmem.h>
+#include <linux/slab.h>
 /* software scrollback */
 static void *vgacon_scrollback;
 static int vgacon_scrollback_tail;
@@ -210,8 +210,7 @@ static void vgacon_scrollback_init(int pitch)
  */
 static void __init_refok vgacon_scrollback_startup(void)
 {
-	vgacon_scrollback = alloc_bootmem(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE
-					  * 1024);
+	vgacon_scrollback = kcalloc(CONFIG_VGACON_SOFT_SCROLLBACK_SIZE, 1024, GFP_NOWAIT);
 	vgacon_scrollback_init(vga_video_num_columns * 2);
 }
 
-- 
GitLab


From b415c49a864dab8ee90713833d642dd461eccae9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 11 Jun 2009 13:12:55 +0100
Subject: [PATCH 2091/2198] slow_work_thread() should do the exclusive wait

slow_work_thread() sleeps on slow_work_thread_wq without WQ_FLAG_EXCLUSIVE,
this means that slow_work_enqueue()->__wake_up(nr_exclusive => 1) wakes up all
kslowd threads.  This is not what we want, so we change slow_work_thread() to
use prepare_to_wait_exclusive() instead.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/slow-work.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b28d19135f431..521ed2004d63f 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -372,8 +372,8 @@ static int slow_work_thread(void *_data)
 		vsmax *= atomic_read(&slow_work_thread_count);
 		vsmax /= 100;
 
-		prepare_to_wait(&slow_work_thread_wq, &wait,
-				TASK_INTERRUPTIBLE);
+		prepare_to_wait_exclusive(&slow_work_thread_wq, &wait,
+					  TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
 		    !slow_work_threads_should_exit &&
 		    !slow_work_available(vsmax) &&
-- 
GitLab


From 90586523eb4b349806887c62ee70685a49415124 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:20 -0400
Subject: [PATCH 2092/2198] fsnotify: unified filesystem notification backend

fsnotify is a backend for filesystem notification.  fsnotify does
not provide any userspace interface but does provide the basis
needed for other notification schemes such as dnotify.  fsnotify
can be extended to be the backend for inotify or the upcoming
fanotify.  fsnotify provides a mechanism for "groups" to register for
some set of filesystem events and to then deliver those events to
those groups for processing.

fsnotify has a number of benefits, the first being actually shrinking the size
of an inode.  Before fsnotify to support both dnotify and inotify an inode had

        unsigned long           i_dnotify_mask; /* Directory notify events */
        struct dnotify_struct   *i_dnotify; /* for directory notifications */
        struct list_head        inotify_watches; /* watches on this inode */
        struct mutex            inotify_mutex;  /* protects the watches list

But with fsnotify this same functionallity (and more) is done with just

        __u32                   i_fsnotify_mask; /* all events for this inode */
        struct hlist_head       i_fsnotify_mark_entries; /* marks on this inode */

That's right, inotify, dnotify, and fanotify all in 64 bits.  We used that
much space just in inotify_watches alone, before this patch set.

fsnotify object lifetime and locking is MUCH better than what we have today.
inotify locking is incredibly complex.  See 8f7b0ba1c8539 as an example of
what's been busted since inception.  inotify needs to know internal semantics
of superblock destruction and unmounting to function.  The inode pinning and
vfs contortions are horrible.

no fsnotify implementers do allocation under locks.  This means things like
f04b30de3 which (due to an overabundance of caution) changes GFP_KERNEL to
GFP_NOFS can be reverted.  There are no longer any allocation rules when using
or implementing your own fsnotify listener.

fsnotify paves the way for fanotify.  In brief fanotify is a notification
mechanism that delivers the lisener both an 'event' and an open file descriptor
to the object in question.  This means that fanotify is pathname agnostic.
Some on lkml may not care for the original companies or users that pushed for
TALPA, but fanotify was designed with flexibility and input for other users in
mind.  The readahead group expressed interest in fanotify as it could be used
to profile disk access on boot without breaking the audit system.  The desktop
search groups have also expressed interest in fanotify as it solves a number
of the race conditions and problems present with managing inotify when more
than a limited number of specific files are of interest.  fanotify can provide
for a userspace access control system which makes it a clean interface for AV
vendors to hook without trying to do binary patching on the syscall table,
LSM, and everywhere else they do their things today.  With this patch series
fanotify can be implemented in less than 1200 lines of easy to review code.
Almost all of which is the socket based user interface.

This patch series builds fsnotify to the point that it can implement
dnotify and inotify_user.  Patches exist and will be sent soon after
acceptance to finish the in kernel inotify conversion (audit) and implement
fanotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/Kconfig                |  13 ++
 fs/notify/Makefile               |   2 +
 fs/notify/fsnotify.c             |  79 ++++++++++++
 fs/notify/fsnotify.h             |  15 +++
 fs/notify/group.c                | 198 +++++++++++++++++++++++++++++++
 fs/notify/inotify/inotify.c      |  20 ++++
 fs/notify/notification.c         | 121 +++++++++++++++++++
 include/linux/fsnotify.h         | 115 ++++++++++++------
 include/linux/fsnotify_backend.h | 177 +++++++++++++++++++++++++++
 9 files changed, 705 insertions(+), 35 deletions(-)
 create mode 100644 fs/notify/fsnotify.c
 create mode 100644 fs/notify/fsnotify.h
 create mode 100644 fs/notify/group.c
 create mode 100644 fs/notify/notification.c
 create mode 100644 include/linux/fsnotify_backend.h

diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 50914d7303c65..31dac7e3b0f10 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,2 +1,15 @@
+config FSNOTIFY
+	bool "Filesystem notification backend"
+	default y
+	---help---
+	   fsnotify is a backend for filesystem notification.  fsnotify does
+	   not provide any userspace interface but does provide the basis
+	   needed for other notification schemes such as dnotify, inotify,
+	   and fanotify.
+
+	   Say Y here to enable fsnotify suport.
+
+	   If unsure, say Y.
+
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index 5a95b6010ce75..db5467b5b58d4 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,2 +1,4 @@
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o
+
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
new file mode 100644
index 0000000000000..56bee0f10c382
--- /dev/null
+++ b/fs/notify/fsnotify.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/dcache.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/srcu.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+/*
+ * This is the main call to fsnotify.  The VFS calls into hook specific functions
+ * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
+ * out to all of the registered fsnotify_group.  Those groups can then use the
+ * notification event in whatever means they feel necessary.
+ */
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+{
+	struct fsnotify_group *group;
+	struct fsnotify_event *event = NULL;
+	int idx;
+
+	if (list_empty(&fsnotify_groups))
+		return;
+
+	if (!(mask & fsnotify_mask))
+		return;
+
+	/*
+	 * SRCU!!  the groups list is very very much read only and the path is
+	 * very hot.  The VAST majority of events are not going to need to do
+	 * anything other than walk the list so it's crazy to pre-allocate.
+	 */
+	idx = srcu_read_lock(&fsnotify_grp_srcu);
+	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
+		if (mask & group->mask) {
+			if (!event) {
+				event = fsnotify_create_event(to_tell, mask, data, data_is);
+				/* shit, we OOM'd and now we can't tell, maybe
+				 * someday someone else will want to do something
+				 * here */
+				if (!event)
+					break;
+			}
+			group->ops->handle_event(group, event);
+		}
+	}
+	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+	/*
+	 * fsnotify_create_event() took a reference so the event can't be cleaned
+	 * up while we are still trying to add it to lists, drop that one.
+	 */
+	if (event)
+		fsnotify_put_event(event);
+}
+EXPORT_SYMBOL_GPL(fsnotify);
+
+static __init int fsnotify_init(void)
+{
+	return init_srcu_struct(&fsnotify_grp_srcu);
+}
+subsys_initcall(fsnotify_init);
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
new file mode 100644
index 0000000000000..c6a8bd4765724
--- /dev/null
+++ b/fs/notify/fsnotify.h
@@ -0,0 +1,15 @@
+#ifndef __FS_NOTIFY_FSNOTIFY_H_
+#define __FS_NOTIFY_FSNOTIFY_H_
+
+#include <linux/list.h>
+#include <linux/fsnotify.h>
+#include <linux/srcu.h>
+#include <linux/types.h>
+
+/* protects reads of fsnotify_groups */
+extern struct srcu_struct fsnotify_grp_srcu;
+/* all groups which receive fsnotify events */
+extern struct list_head fsnotify_groups;
+/* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
+extern __u32 fsnotify_mask;
+#endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
new file mode 100644
index 0000000000000..c6812953b9683
--- /dev/null
+++ b/fs/notify/group.c
@@ -0,0 +1,198 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/srcu.h>
+#include <linux/rculist.h>
+#include <linux/wait.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+#include <asm/atomic.h>
+
+/* protects writes to fsnotify_groups and fsnotify_mask */
+static DEFINE_MUTEX(fsnotify_grp_mutex);
+/* protects reads while running the fsnotify_groups list */
+struct srcu_struct fsnotify_grp_srcu;
+/* all groups registered to receive filesystem notifications */
+LIST_HEAD(fsnotify_groups);
+/* bitwise OR of all events (FS_*) interesting to some group on this system */
+__u32 fsnotify_mask;
+
+/*
+ * When a new group registers or changes it's set of interesting events
+ * this function updates the fsnotify_mask to contain all interesting events
+ */
+void fsnotify_recalc_global_mask(void)
+{
+	struct fsnotify_group *group;
+	__u32 mask = 0;
+	int idx;
+
+	idx = srcu_read_lock(&fsnotify_grp_srcu);
+	list_for_each_entry_rcu(group, &fsnotify_groups, group_list)
+		mask |= group->mask;
+	srcu_read_unlock(&fsnotify_grp_srcu, idx);
+	fsnotify_mask = mask;
+}
+
+/*
+ * Take a reference to a group so things found under the fsnotify_grp_mutex
+ * can't get freed under us
+ */
+static void fsnotify_get_group(struct fsnotify_group *group)
+{
+	atomic_inc(&group->refcnt);
+}
+
+/*
+ * Final freeing of a group
+ */
+static void fsnotify_destroy_group(struct fsnotify_group *group)
+{
+	if (group->ops->free_group_priv)
+		group->ops->free_group_priv(group);
+
+	kfree(group);
+}
+
+/*
+ * Remove this group from the global list of groups that will get events
+ * this can be done even if there are still references and things still using
+ * this group.  This just stops the group from getting new events.
+ */
+static void __fsnotify_evict_group(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+
+	if (group->on_group_list)
+		list_del_rcu(&group->group_list);
+	group->on_group_list = 0;
+}
+
+/*
+ * Called when a group is no longer interested in getting events.  This can be
+ * used if a group is misbehaving or if for some reason a group should no longer
+ * get any filesystem events.
+ */
+void fsnotify_evict_group(struct fsnotify_group *group)
+{
+	mutex_lock(&fsnotify_grp_mutex);
+	__fsnotify_evict_group(group);
+	mutex_unlock(&fsnotify_grp_mutex);
+}
+
+/*
+ * Drop a reference to a group.  Free it if it's through.
+ */
+void fsnotify_put_group(struct fsnotify_group *group)
+{
+	if (!atomic_dec_and_mutex_lock(&group->refcnt, &fsnotify_grp_mutex))
+		return;
+
+	/*
+	 * OK, now we know that there's no other users *and* we hold mutex,
+	 * so no new references will appear
+	 */
+	__fsnotify_evict_group(group);
+
+	/*
+	 * now it's off the list, so the only thing we might care about is
+	 * srcu access....
+	 */
+	mutex_unlock(&fsnotify_grp_mutex);
+	synchronize_srcu(&fsnotify_grp_srcu);
+
+	/* and now it is really dead. _Nothing_ could be seeing it */
+	fsnotify_recalc_global_mask();
+	fsnotify_destroy_group(group);
+}
+
+/*
+ * Simply run the fsnotify_groups list and find a group which matches
+ * the given parameters.  If a group is found we take a reference to that
+ * group.
+ */
+static struct fsnotify_group *fsnotify_find_group(unsigned int group_num, __u32 mask,
+						  const struct fsnotify_ops *ops)
+{
+	struct fsnotify_group *group_iter;
+	struct fsnotify_group *group = NULL;
+
+	BUG_ON(!mutex_is_locked(&fsnotify_grp_mutex));
+
+	list_for_each_entry_rcu(group_iter, &fsnotify_groups, group_list) {
+		if (group_iter->group_num == group_num) {
+			if ((group_iter->mask == mask) &&
+			    (group_iter->ops == ops)) {
+				fsnotify_get_group(group_iter);
+				group = group_iter;
+			} else
+				group = ERR_PTR(-EEXIST);
+		}
+	}
+	return group;
+}
+
+/*
+ * Either finds an existing group which matches the group_num, mask, and ops or
+ * creates a new group and adds it to the global group list.  In either case we
+ * take a reference for the group returned.
+ */
+struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
+					     const struct fsnotify_ops *ops)
+{
+	struct fsnotify_group *group, *tgroup;
+
+	/* very low use, simpler locking if we just always alloc */
+	group = kmalloc(sizeof(struct fsnotify_group), GFP_KERNEL);
+	if (!group)
+		return ERR_PTR(-ENOMEM);
+
+	atomic_set(&group->refcnt, 1);
+
+	group->on_group_list = 0;
+	group->group_num = group_num;
+	group->mask = mask;
+
+	group->ops = ops;
+
+	mutex_lock(&fsnotify_grp_mutex);
+	tgroup = fsnotify_find_group(group_num, mask, ops);
+	if (tgroup) {
+		/* group already exists */
+		mutex_unlock(&fsnotify_grp_mutex);
+		/* destroy the new one we made */
+		fsnotify_put_group(group);
+		return tgroup;
+	}
+
+	/* group not found, add a new one */
+	list_add_rcu(&group->group_list, &fsnotify_groups);
+	group->on_group_list = 1;
+
+	mutex_unlock(&fsnotify_grp_mutex);
+
+	if (mask)
+		fsnotify_recalc_global_mask();
+
+	return group;
+}
diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c
index 220c13f0d73d2..40b1cf914ccb5 100644
--- a/fs/notify/inotify/inotify.c
+++ b/fs/notify/inotify/inotify.c
@@ -32,6 +32,7 @@
 #include <linux/list.h>
 #include <linux/writeback.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 
 static atomic_t inotify_cookie;
 
@@ -905,6 +906,25 @@ EXPORT_SYMBOL_GPL(inotify_rm_watch);
  */
 static int __init inotify_setup(void)
 {
+	BUILD_BUG_ON(IN_ACCESS != FS_ACCESS);
+	BUILD_BUG_ON(IN_MODIFY != FS_MODIFY);
+	BUILD_BUG_ON(IN_ATTRIB != FS_ATTRIB);
+	BUILD_BUG_ON(IN_CLOSE_WRITE != FS_CLOSE_WRITE);
+	BUILD_BUG_ON(IN_CLOSE_NOWRITE != FS_CLOSE_NOWRITE);
+	BUILD_BUG_ON(IN_OPEN != FS_OPEN);
+	BUILD_BUG_ON(IN_MOVED_FROM != FS_MOVED_FROM);
+	BUILD_BUG_ON(IN_MOVED_TO != FS_MOVED_TO);
+	BUILD_BUG_ON(IN_CREATE != FS_CREATE);
+	BUILD_BUG_ON(IN_DELETE != FS_DELETE);
+	BUILD_BUG_ON(IN_DELETE_SELF != FS_DELETE_SELF);
+	BUILD_BUG_ON(IN_MOVE_SELF != FS_MOVE_SELF);
+	BUILD_BUG_ON(IN_Q_OVERFLOW != FS_Q_OVERFLOW);
+
+	BUILD_BUG_ON(IN_UNMOUNT != FS_UNMOUNT);
+	BUILD_BUG_ON(IN_ISDIR != FS_IN_ISDIR);
+	BUILD_BUG_ON(IN_IGNORED != FS_IN_IGNORED);
+	BUILD_BUG_ON(IN_ONESHOT != FS_IN_ONESHOT);
+
 	atomic_set(&inotify_cookie, 0);
 
 	return 0;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
new file mode 100644
index 0000000000000..b8e9a87f8f585
--- /dev/null
+++ b/fs/notify/notification.c
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mount.h>
+#include <linux/mutex.h>
+#include <linux/namei.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+static struct kmem_cache *fsnotify_event_cachep;
+
+void fsnotify_get_event(struct fsnotify_event *event)
+{
+	atomic_inc(&event->refcnt);
+}
+
+void fsnotify_put_event(struct fsnotify_event *event)
+{
+	if (!event)
+		return;
+
+	if (atomic_dec_and_test(&event->refcnt)) {
+		if (event->data_type == FSNOTIFY_EVENT_PATH)
+			path_put(&event->path);
+
+		kmem_cache_free(fsnotify_event_cachep, event);
+	}
+}
+
+/*
+ * Allocate a new event which will be sent to each group's handle_event function
+ * if the group was interested in this particular event.
+ */
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+					     void *data, int data_type)
+{
+	struct fsnotify_event *event;
+
+	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+
+	atomic_set(&event->refcnt, 1);
+
+	spin_lock_init(&event->lock);
+
+	event->path.dentry = NULL;
+	event->path.mnt = NULL;
+	event->inode = NULL;
+
+	event->to_tell = to_tell;
+
+	switch (data_type) {
+	case FSNOTIFY_EVENT_FILE: {
+		struct file *file = data;
+		struct path *path = &file->f_path;
+		event->path.dentry = path->dentry;
+		event->path.mnt = path->mnt;
+		path_get(&event->path);
+		event->data_type = FSNOTIFY_EVENT_PATH;
+		break;
+	}
+	case FSNOTIFY_EVENT_PATH: {
+		struct path *path = data;
+		event->path.dentry = path->dentry;
+		event->path.mnt = path->mnt;
+		path_get(&event->path);
+		event->data_type = FSNOTIFY_EVENT_PATH;
+		break;
+	}
+	case FSNOTIFY_EVENT_INODE:
+		event->inode = data;
+		event->data_type = FSNOTIFY_EVENT_INODE;
+		break;
+	case FSNOTIFY_EVENT_NONE:
+		event->inode = NULL;
+		event->path.dentry = NULL;
+		event->path.mnt = NULL;
+		break;
+	default:
+		BUG();
+	}
+
+	event->mask = mask;
+
+	return event;
+}
+
+__init int fsnotify_notification_init(void)
+{
+	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
+
+	return 0;
+}
+subsys_initcall(fsnotify_notification_init);
+
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 00fbd5b245c99..6c9ebefdac8e4 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -13,6 +13,7 @@
 
 #include <linux/dnotify.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify_backend.h>
 #include <linux/audit.h>
 
 /*
@@ -34,6 +35,16 @@ static inline void fsnotify_d_move(struct dentry *entry)
 	inotify_d_move(entry);
 }
 
+/*
+ * fsnotify_link_count - inode's link count changed
+ */
+static inline void fsnotify_link_count(struct inode *inode)
+{
+	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
+
+	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE);
+}
+
 /*
  * fsnotify_move - file old_name at old_dir was moved to new_name at new_dir
  */
@@ -43,28 +54,47 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 {
 	struct inode *source = moved->d_inode;
 	u32 cookie = inotify_get_cookie();
+	__u32 old_dir_mask = 0;
+	__u32 new_dir_mask = 0;
 
-	if (old_dir == new_dir)
+	if (old_dir == new_dir) {
 		inode_dir_notify(old_dir, DN_RENAME);
-	else {
+		old_dir_mask = FS_DN_RENAME;
+	} else {
 		inode_dir_notify(old_dir, DN_DELETE);
+		old_dir_mask = FS_DELETE;
 		inode_dir_notify(new_dir, DN_CREATE);
+		new_dir_mask = FS_CREATE;
 	}
 
-	if (isdir)
+	if (isdir) {
 		isdir = IN_ISDIR;
+		old_dir_mask |= FS_IN_ISDIR;
+		new_dir_mask |= FS_IN_ISDIR;
+	}
+
+	old_dir_mask |= FS_MOVED_FROM;
+	new_dir_mask |= FS_MOVED_TO;
+
 	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name,
 				  source);
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
 				  source);
 
+	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE);
+	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE);
+
 	if (target) {
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
 		inotify_inode_is_dead(target);
+
+		/* this is really a link_count change not a removal */
+		fsnotify_link_count(target);
 	}
 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
+		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE);
 	}
 	audit_inode_child(new_name, moved, new_dir);
 }
@@ -74,10 +104,12 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
  */
 static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 {
+	__u32 mask = FS_DELETE;
+
 	if (isdir)
-		isdir = IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
-	inotify_dentry_parent_queue_event(dentry, IN_DELETE|isdir, 0, dentry->d_name.name);
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 }
 
 /*
@@ -87,14 +119,8 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 {
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
-}
 
-/*
- * fsnotify_link_count - inode's link count changed
- */
-static inline void fsnotify_link_count(struct inode *inode)
-{
-	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
+	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -106,6 +132,8 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
+
+	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -120,6 +148,8 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 				  inode);
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
+
+	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -127,10 +157,14 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
  */
 static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 {
+	__u32 mask = (FS_CREATE | FS_IN_ISDIR);
+	struct inode *d_inode = dentry->d_inode;
+
 	inode_dir_notify(inode, DN_CREATE);
-	inotify_inode_queue_event(inode, IN_CREATE | IN_ISDIR, 0, 
-				  dentry->d_name.name, dentry->d_inode);
+	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
+
+	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -139,14 +173,16 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 static inline void fsnotify_access(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_ACCESS;
+	__u32 mask = FS_ACCESS;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_ACCESS);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -155,14 +191,16 @@ static inline void fsnotify_access(struct dentry *dentry)
 static inline void fsnotify_modify(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_MODIFY;
+	__u32 mask = FS_MODIFY;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_MODIFY);
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -171,13 +209,15 @@ static inline void fsnotify_modify(struct dentry *dentry)
 static inline void fsnotify_open(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_OPEN;
+	__u32 mask = FS_OPEN;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -189,13 +229,15 @@ static inline void fsnotify_close(struct file *file)
 	struct inode *inode = dentry->d_inode;
 	const char *name = dentry->d_name.name;
 	fmode_t mode = file->f_mode;
-	u32 mask = (mode & FMODE_WRITE) ? IN_CLOSE_WRITE : IN_CLOSE_NOWRITE;
+	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
 }
 
 /*
@@ -204,13 +246,15 @@ static inline void fsnotify_close(struct file *file)
 static inline void fsnotify_xattr(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
-	u32 mask = IN_ATTRIB;
+	__u32 mask = FS_ATTRIB;
 
 	if (S_ISDIR(inode->i_mode))
-		mask |= IN_ISDIR;
+		mask |= FS_IN_ISDIR;
 
 	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
+
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
 /*
@@ -221,34 +265,34 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 {
 	struct inode *inode = dentry->d_inode;
 	int dn_mask = 0;
-	u32 in_mask = 0;
+	__u32 in_mask = 0;
 
 	if (ia_valid & ATTR_UID) {
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	}
 	if (ia_valid & ATTR_GID) {
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	}
 	if (ia_valid & ATTR_SIZE) {
-		in_mask |= IN_MODIFY;
+		in_mask |= FS_MODIFY;
 		dn_mask |= DN_MODIFY;
 	}
 	/* both times implies a utime(s) call */
 	if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME))
 	{
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	} else if (ia_valid & ATTR_ATIME) {
-		in_mask |= IN_ACCESS;
+		in_mask |= FS_ACCESS;
 		dn_mask |= DN_ACCESS;
 	} else if (ia_valid & ATTR_MTIME) {
-		in_mask |= IN_MODIFY;
+		in_mask |= FS_MODIFY;
 		dn_mask |= DN_MODIFY;
 	}
 	if (ia_valid & ATTR_MODE) {
-		in_mask |= IN_ATTRIB;
+		in_mask |= FS_ATTRIB;
 		dn_mask |= DN_ATTRIB;
 	}
 
@@ -256,14 +300,15 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		dnotify_parent(dentry, dn_mask);
 	if (in_mask) {
 		if (S_ISDIR(inode->i_mode))
-			in_mask |= IN_ISDIR;
+			in_mask |= FS_IN_ISDIR;
 		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
 		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
 						  dentry->d_name.name);
+		fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
 	}
 }
 
-#ifdef CONFIG_INOTIFY	/* inotify helpers */
+#if defined(CONFIG_INOTIFY) || defined(CONFIG_FSNOTIFY)	/* notify helpers */
 
 /*
  * fsnotify_oldname_init - save off the old filename before we change it
@@ -281,7 +326,7 @@ static inline void fsnotify_oldname_free(const char *old_name)
 	kfree(old_name);
 }
 
-#else	/* CONFIG_INOTIFY */
+#else	/* CONFIG_INOTIFY || CONFIG_FSNOTIFY */
 
 static inline const char *fsnotify_oldname_init(const char *name)
 {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
new file mode 100644
index 0000000000000..1a55718b38aab
--- /dev/null
+++ b/include/linux/fsnotify_backend.h
@@ -0,0 +1,177 @@
+/*
+ * Filesystem access notification for Linux
+ *
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ */
+
+#ifndef __LINUX_FSNOTIFY_BACKEND_H
+#define __LINUX_FSNOTIFY_BACKEND_H
+
+#ifdef __KERNEL__
+
+#include <linux/fs.h> /* struct inode */
+#include <linux/list.h>
+#include <linux/path.h> /* struct path */
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <asm/atomic.h>
+
+/*
+ * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily
+ * convert between them.  dnotify only needs conversion at watch creation
+ * so no perf loss there.  fanotify isn't defined yet, so it can use the
+ * wholes if it needs more events.
+ */
+#define FS_ACCESS		0x00000001	/* File was accessed */
+#define FS_MODIFY		0x00000002	/* File was modified */
+#define FS_ATTRIB		0x00000004	/* Metadata changed */
+#define FS_CLOSE_WRITE		0x00000008	/* Writtable file was closed */
+#define FS_CLOSE_NOWRITE	0x00000010	/* Unwrittable file closed */
+#define FS_OPEN			0x00000020	/* File was opened */
+#define FS_MOVED_FROM		0x00000040	/* File was moved from X */
+#define FS_MOVED_TO		0x00000080	/* File was moved to Y */
+#define FS_CREATE		0x00000100	/* Subfile was created */
+#define FS_DELETE		0x00000200	/* Subfile was deleted */
+#define FS_DELETE_SELF		0x00000400	/* Self was deleted */
+#define FS_MOVE_SELF		0x00000800	/* Self was moved */
+
+#define FS_UNMOUNT		0x00002000	/* inode on umount fs */
+#define FS_Q_OVERFLOW		0x00004000	/* Event queued overflowed */
+#define FS_IN_IGNORED		0x00008000	/* last inotify event here */
+
+#define FS_IN_ISDIR		0x40000000	/* event occurred against dir */
+#define FS_IN_ONESHOT		0x80000000	/* only send event once */
+
+#define FS_DN_RENAME		0x10000000	/* file renamed */
+#define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
+
+struct fsnotify_group;
+struct fsnotify_event;
+
+/*
+ * Each group much define these ops.  The fsnotify infrastructure will call
+ * these operations for each relevant group.
+ *
+ * handle_event - main call for a group to handle an fs event
+ * free_group_priv - called when a group refcnt hits 0 to clean up the private union
+ */
+struct fsnotify_ops {
+	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
+	void (*free_group_priv)(struct fsnotify_group *group);
+};
+
+/*
+ * A group is a "thing" that wants to receive notification about filesystem
+ * events.  The mask holds the subset of event types this group cares about.
+ * refcnt on a group is up to the implementor and at any moment if it goes 0
+ * everything will be cleaned up.
+ */
+struct fsnotify_group {
+	/*
+	 * global list of all groups receiving events from fsnotify.
+	 * anchored by fsnotify_groups and protected by either fsnotify_grp_mutex
+	 * or fsnotify_grp_srcu depending on write vs read.
+	 */
+	struct list_head group_list;
+
+	/*
+	 * Defines all of the event types in which this group is interested.
+	 * This mask is a bitwise OR of the FS_* events from above.  Each time
+	 * this mask changes for a group (if it changes) the correct functions
+	 * must be called to update the global structures which indicate global
+	 * interest in event types.
+	 */
+	__u32 mask;
+
+	/*
+	 * How the refcnt is used is up to each group.  When the refcnt hits 0
+	 * fsnotify will clean up all of the resources associated with this group.
+	 * As an example, the dnotify group will always have a refcnt=1 and that
+	 * will never change.  Inotify, on the other hand, has a group per
+	 * inotify_init() and the refcnt will hit 0 only when that fd has been
+	 * closed.
+	 */
+	atomic_t refcnt;		/* things with interest in this group */
+	unsigned int group_num;		/* simply prevents accidental group collision */
+
+	const struct fsnotify_ops *ops;	/* how this group handles things */
+
+	/* prevents double list_del of group_list.  protected by global fsnotify_gr_mutex */
+	bool on_group_list;
+
+	/* groups can define private fields here or use the void *private */
+	union {
+		void *private;
+	};
+};
+
+/*
+ * all of the information about the original object we want to now send to
+ * a group.  If you want to carry more info from the accessing task to the
+ * listener this structure is where you need to be adding fields.
+ */
+struct fsnotify_event {
+	spinlock_t lock;	/* protection for the associated event_holder and private_list */
+	/* to_tell may ONLY be dereferenced during handle_event(). */
+	struct inode *to_tell;	/* either the inode the event happened to or its parent */
+	/*
+	 * depending on the event type we should have either a path or inode
+	 * We hold a reference on path, but NOT on inode.  Since we have the ref on
+	 * the path, it may be dereferenced at any point during this object's
+	 * lifetime.  That reference is dropped when this object's refcnt hits
+	 * 0.  If this event contains an inode instead of a path, the inode may
+	 * ONLY be used during handle_event().
+	 */
+	union {
+		struct path path;
+		struct inode *inode;
+	};
+/* when calling fsnotify tell it if the data is a path or inode */
+#define FSNOTIFY_EVENT_NONE	0
+#define FSNOTIFY_EVENT_PATH	1
+#define FSNOTIFY_EVENT_INODE	2
+#define FSNOTIFY_EVENT_FILE	3
+	int data_type;		/* which of the above union we have */
+	atomic_t refcnt;	/* how many groups still are using/need to send this event */
+	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
+};
+
+#ifdef CONFIG_FSNOTIFY
+
+/* called from the vfs helpers */
+
+/* main fsnotify call to send events */
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+
+
+/* called from fsnotify listeners, such as fanotify or dnotify */
+
+/* must call when a group changes its ->mask */
+extern void fsnotify_recalc_global_mask(void);
+/* get a reference to an existing or create a new group */
+extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num,
+						    __u32 mask,
+						    const struct fsnotify_ops *ops);
+/* drop reference on a group from fsnotify_obtain_group */
+extern void fsnotify_put_group(struct fsnotify_group *group);
+
+/* take a reference to an event */
+extern void fsnotify_get_event(struct fsnotify_event *event);
+extern void fsnotify_put_event(struct fsnotify_event *event);
+/* find private data previously attached to an event */
+extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
+									struct fsnotify_event *event);
+
+/* put here because inotify does some weird stuff when destroying watches */
+extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+						    void *data, int data_is);
+#else
+
+static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+{}
+#endif	/* CONFIG_FSNOTIFY */
+
+#endif	/* __KERNEL __ */
+
+#endif	/* __LINUX_FSNOTIFY_BACKEND_H */
-- 
GitLab


From 3be25f49b9d6a97eae9bcb96d3292072b7658bd8 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:26 -0400
Subject: [PATCH 2093/2198] fsnotify: add marks to inodes so groups can
 interpret how to handle those inodes

This patch creates a way for fsnotify groups to attach marks to inodes.
These marks have little meaning to the generic fsnotify infrastructure
and thus their meaning should be interpreted by the group that attached
them to the inode's list.

dnotify and inotify  will make use of these markings to indicate which
inodes are of interest to their respective groups.  But this implementation
has the useful property that in the future other listeners could actually
use the marks for the exact opposite reason, aka to indicate which inodes
it had NO interest in.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/inode.c                       |   9 +
 fs/notify/Makefile               |   2 +-
 fs/notify/fsnotify.c             |  13 ++
 fs/notify/fsnotify.h             |   5 +
 fs/notify/group.c                |  49 ++++-
 fs/notify/inode_mark.c           | 329 +++++++++++++++++++++++++++++++
 include/linux/fs.h               |   5 +
 include/linux/fsnotify.h         |   9 +
 include/linux/fsnotify_backend.h |  65 +++++-
 9 files changed, 483 insertions(+), 3 deletions(-)
 create mode 100644 fs/notify/inode_mark.c

diff --git a/fs/inode.c b/fs/inode.c
index bca0c618fdb35..54c63ce3de25e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/fsnotify.h>
 #include <linux/mount.h>
 #include <linux/async.h>
 
@@ -189,6 +190,10 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_private = NULL;
 	inode->i_mapping = mapping;
 
+#ifdef CONFIG_FSNOTIFY
+	inode->i_fsnotify_mask = 0;
+#endif
+
 	return inode;
 
 out_free_security:
@@ -221,6 +226,7 @@ void destroy_inode(struct inode *inode)
 	BUG_ON(inode_has_buffers(inode));
 	ima_inode_free(inode);
 	security_inode_free(inode);
+	fsnotify_inode_delete(inode);
 	if (inode->i_sb->s_op->destroy_inode)
 		inode->i_sb->s_op->destroy_inode(inode);
 	else
@@ -252,6 +258,9 @@ void inode_init_once(struct inode *inode)
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	mutex_init(&inode->inotify_mutex);
 #endif
+#ifdef CONFIG_FSNOTIFY
+	INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+#endif
 }
 EXPORT_SYMBOL(inode_init_once);
 
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
index db5467b5b58d4..0922cc826c46b 100644
--- a/fs/notify/Makefile
+++ b/fs/notify/Makefile
@@ -1,4 +1,4 @@
-obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o
+obj-$(CONFIG_FSNOTIFY)		+= fsnotify.o notification.o group.o inode_mark.o
 
 obj-y			+= dnotify/
 obj-y			+= inotify/
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 56bee0f10c382..d5654629c659a 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -25,6 +25,15 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+/*
+ * Clear all of the marks on an inode when it is being evicted from core
+ */
+void __fsnotify_inode_delete(struct inode *inode)
+{
+	fsnotify_clear_marks_by_inode(inode);
+}
+EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
+
 /*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
  * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
@@ -43,6 +52,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 	if (!(mask & fsnotify_mask))
 		return;
 
+	if (!(mask & to_tell->i_fsnotify_mask))
+		return;
 	/*
 	 * SRCU!!  the groups list is very very much read only and the path is
 	 * very hot.  The VAST majority of events are not going to need to do
@@ -51,6 +62,8 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
 		if (mask & group->mask) {
+			if (!group->ops->should_send_event(group, to_tell, mask))
+				continue;
 			if (!event) {
 				event = fsnotify_create_event(to_tell, mask, data, data_is);
 				/* shit, we OOM'd and now we can't tell, maybe
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index c6a8bd4765724..8ebcbe893c913 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -12,4 +12,9 @@ extern struct srcu_struct fsnotify_grp_srcu;
 extern struct list_head fsnotify_groups;
 /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
 extern __u32 fsnotify_mask;
+
+/* final kfree of a group */
+extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
+/* run the list of all marks associated with inode and flag them to be freed */
+extern void fsnotify_clear_marks_by_inode(struct inode *inode);
 #endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index c6812953b9683..a29d2fa679272 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -54,6 +54,29 @@ void fsnotify_recalc_global_mask(void)
 	fsnotify_mask = mask;
 }
 
+/*
+ * Update the group->mask by running all of the marks associated with this
+ * group and finding the bitwise | of all of the mark->mask.  If we change
+ * the group->mask we need to update the global mask of events interesting
+ * to the system.
+ */
+void fsnotify_recalc_group_mask(struct fsnotify_group *group)
+{
+	__u32 mask = 0;
+	__u32 old_mask = group->mask;
+	struct fsnotify_mark_entry *entry;
+
+	spin_lock(&group->mark_lock);
+	list_for_each_entry(entry, &group->mark_entries, g_list)
+		mask |= entry->mask;
+	spin_unlock(&group->mark_lock);
+
+	group->mask = mask;
+
+	if (old_mask != mask)
+		fsnotify_recalc_global_mask();
+}
+
 /*
  * Take a reference to a group so things found under the fsnotify_grp_mutex
  * can't get freed under us
@@ -66,7 +89,7 @@ static void fsnotify_get_group(struct fsnotify_group *group)
 /*
  * Final freeing of a group
  */
-static void fsnotify_destroy_group(struct fsnotify_group *group)
+void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
@@ -74,6 +97,24 @@ static void fsnotify_destroy_group(struct fsnotify_group *group)
 	kfree(group);
 }
 
+/*
+ * Trying to get rid of a group.  We need to first get rid of any outstanding
+ * allocations and then free the group.  Remember that fsnotify_clear_marks_by_group
+ * could miss marks that are being freed by inode and those marks could still
+ * hold a reference to this group (via group->num_marks)  If we get into that
+ * situtation, the fsnotify_final_destroy_group will get called when that final
+ * mark is freed.
+ */
+static void fsnotify_destroy_group(struct fsnotify_group *group)
+{
+	/* clear all inode mark entries for this group */
+	fsnotify_clear_marks_by_group(group);
+
+	/* past the point of no return, matches the initial value of 1 */
+	if (atomic_dec_and_test(&group->num_marks))
+		fsnotify_final_destroy_group(group);
+}
+
 /*
  * Remove this group from the global list of groups that will get events
  * this can be done even if there are still references and things still using
@@ -173,6 +214,10 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	group->group_num = group_num;
 	group->mask = mask;
 
+	spin_lock_init(&group->mark_lock);
+	atomic_set(&group->num_marks, 0);
+	INIT_LIST_HEAD(&group->mark_entries);
+
 	group->ops = ops;
 
 	mutex_lock(&fsnotify_grp_mutex);
@@ -188,6 +233,8 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	/* group not found, add a new one */
 	list_add_rcu(&group->group_list, &fsnotify_groups);
 	group->on_group_list = 1;
+	/* being on the fsnotify_groups list holds one num_marks */
+	atomic_inc(&group->num_marks);
 
 	mutex_unlock(&fsnotify_grp_mutex);
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
new file mode 100644
index 0000000000000..cdc154146974e
--- /dev/null
+++ b/fs/notify/inode_mark.c
@@ -0,0 +1,329 @@
+/*
+ *  Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2, or (at your option)
+ *  any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; see the file COPYING.  If not, write to
+ *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/*
+ * fsnotify inode mark locking/lifetime/and refcnting
+ *
+ * REFCNT:
+ * The mark->refcnt tells how many "things" in the kernel currently are
+ * referencing this object.  The object typically will live inside the kernel
+ * with a refcnt of 2, one for each list it is on (i_list, g_list).  Any task
+ * which can find this object holding the appropriete locks, can take a reference
+ * and the object itself is guarenteed to survive until the reference is dropped.
+ *
+ * LOCKING:
+ * There are 3 spinlocks involved with fsnotify inode marks and they MUST
+ * be taken in order as follows:
+ *
+ * entry->lock
+ * group->mark_lock
+ * inode->i_lock
+ *
+ * entry->lock protects 2 things, entry->group and entry->inode.  You must hold
+ * that lock to dereference either of these things (they could be NULL even with
+ * the lock)
+ *
+ * group->mark_lock protects the mark_entries list anchored inside a given group
+ * and each entry is hooked via the g_list.  It also sorta protects the
+ * free_g_list, which when used is anchored by a private list on the stack of the
+ * task which held the group->mark_lock.
+ *
+ * inode->i_lock protects the i_fsnotify_mark_entries list anchored inside a
+ * given inode and each entry is hooked via the i_list. (and sorta the
+ * free_i_list)
+ *
+ *
+ * LIFETIME:
+ * Inode marks survive between when they are added to an inode and when their
+ * refcnt==0.
+ *
+ * The inode mark can be cleared for a number of different reasons including:
+ * - The inode is unlinked for the last time.  (fsnotify_inode_remove)
+ * - The inode is being evicted from cache. (fsnotify_inode_delete)
+ * - The fs the inode is on is unmounted.  (fsnotify_inode_delete/fsnotify_unmount_inodes)
+ * - Something explicitly requests that it be removed.  (fsnotify_destroy_mark_by_entry)
+ * - The fsnotify_group associated with the mark is going away and all such marks
+ *   need to be cleaned up. (fsnotify_clear_marks_by_group)
+ *
+ * Worst case we are given an inode and need to clean up all the marks on that
+ * inode.  We take i_lock and walk the i_fsnotify_mark_entries safely.  For each
+ * mark on the list we take a reference (so the mark can't disappear under us).
+ * We remove that mark form the inode's list of marks and we add this mark to a
+ * private list anchored on the stack using i_free_list;  At this point we no
+ * longer fear anything finding the mark using the inode's list of marks.
+ *
+ * We can safely and locklessly run the private list on the stack of everything
+ * we just unattached from the original inode.  For each mark on the private list
+ * we grab the mark-> and can thus dereference mark->group and mark->inode.  If
+ * we see the group and inode are not NULL we take those locks.  Now holding all
+ * 3 locks we can completely remove the mark from other tasks finding it in the
+ * future.  Remember, 10 things might already be referencing this mark, but they
+ * better be holding a ref.  We drop our reference we took before we unhooked it
+ * from the inode.  When the ref hits 0 we can free the mark.
+ *
+ * Very similarly for freeing by group, except we use free_g_list.
+ *
+ * This has the very interesting property of being able to run concurrently with
+ * any (or all) other directions.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include <asm/atomic.h>
+
+#include <linux/fsnotify_backend.h>
+#include "fsnotify.h"
+
+void fsnotify_get_mark(struct fsnotify_mark_entry *entry)
+{
+	atomic_inc(&entry->refcnt);
+}
+
+void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
+{
+	if (atomic_dec_and_test(&entry->refcnt))
+		entry->free_mark(entry);
+}
+
+/*
+ * Recalculate the mask of events relevant to a given inode locked.
+ */
+static void fsnotify_recalc_inode_mask_locked(struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry;
+	struct hlist_node *pos;
+	__u32 new_mask = 0;
+
+	assert_spin_locked(&inode->i_lock);
+
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list)
+		new_mask |= entry->mask;
+	inode->i_fsnotify_mask = new_mask;
+}
+
+/*
+ * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
+ * any notifier is interested in hearing for this inode.
+ */
+void fsnotify_recalc_inode_mask(struct inode *inode)
+{
+	spin_lock(&inode->i_lock);
+	fsnotify_recalc_inode_mask_locked(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Any time a mark is getting freed we end up here.
+ * The caller had better be holding a reference to this mark so we don't actually
+ * do the final put under the entry->lock
+ */
+void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
+{
+	struct fsnotify_group *group;
+	struct inode *inode;
+
+	spin_lock(&entry->lock);
+
+	group = entry->group;
+	inode = entry->inode;
+
+	BUG_ON(group && !inode);
+	BUG_ON(!group && inode);
+
+	/* if !group something else already marked this to die */
+	if (!group) {
+		spin_unlock(&entry->lock);
+		return;
+	}
+
+	/* 1 from caller and 1 for being on i_list/g_list */
+	BUG_ON(atomic_read(&entry->refcnt) < 2);
+
+	spin_lock(&group->mark_lock);
+	spin_lock(&inode->i_lock);
+
+	hlist_del_init(&entry->i_list);
+	entry->inode = NULL;
+
+	list_del_init(&entry->g_list);
+	entry->group = NULL;
+
+	fsnotify_put_mark(entry); /* for i_list and g_list */
+
+	/*
+	 * this mark is now off the inode->i_fsnotify_mark_entries list and we
+	 * hold the inode->i_lock, so this is the perfect time to update the
+	 * inode->i_fsnotify_mask
+	 */
+	fsnotify_recalc_inode_mask_locked(inode);
+
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&entry->lock);
+
+	/*
+	 * Some groups like to know that marks are being freed.  This is a
+	 * callback to the group function to let it know that this entry
+	 * is being freed.
+	 */
+	group->ops->freeing_mark(entry, group);
+
+	/*
+	 * it's possible that this group tried to destroy itself, but this
+	 * this mark was simultaneously being freed by inode.  If that's the
+	 * case, we finish freeing the group here.
+	 */
+	if (unlikely(atomic_dec_and_test(&group->num_marks)))
+		fsnotify_final_destroy_group(group);
+}
+
+/*
+ * Given a group, destroy all of the marks associated with that group.
+ */
+void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
+{
+	struct fsnotify_mark_entry *lentry, *entry;
+	LIST_HEAD(free_list);
+
+	spin_lock(&group->mark_lock);
+	list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
+		list_add(&entry->free_g_list, &free_list);
+		list_del_init(&entry->g_list);
+		fsnotify_get_mark(entry);
+	}
+	spin_unlock(&group->mark_lock);
+
+	list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
+	}
+}
+
+/*
+ * Given an inode, destroy all of the marks associated with that inode.
+ */
+void fsnotify_clear_marks_by_inode(struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry, *lentry;
+	struct hlist_node *pos, *n;
+	LIST_HEAD(free_list);
+
+	spin_lock(&inode->i_lock);
+	hlist_for_each_entry_safe(entry, pos, n, &inode->i_fsnotify_mark_entries, i_list) {
+		list_add(&entry->free_i_list, &free_list);
+		hlist_del_init(&entry->i_list);
+		fsnotify_get_mark(entry);
+	}
+	spin_unlock(&inode->i_lock);
+
+	list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
+	}
+}
+
+/*
+ * given a group and inode, find the mark associated with that combination.
+ * if found take a reference to that mark and return it, else return NULL
+ */
+struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group,
+						     struct inode *inode)
+{
+	struct fsnotify_mark_entry *entry;
+	struct hlist_node *pos;
+
+	assert_spin_locked(&inode->i_lock);
+
+	hlist_for_each_entry(entry, pos, &inode->i_fsnotify_mark_entries, i_list) {
+		if (entry->group == group) {
+			fsnotify_get_mark(entry);
+			return entry;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Nothing fancy, just initialize lists and locks and counters.
+ */
+void fsnotify_init_mark(struct fsnotify_mark_entry *entry,
+			void (*free_mark)(struct fsnotify_mark_entry *entry))
+
+{
+	spin_lock_init(&entry->lock);
+	atomic_set(&entry->refcnt, 1);
+	INIT_HLIST_NODE(&entry->i_list);
+	entry->group = NULL;
+	entry->mask = 0;
+	entry->inode = NULL;
+	entry->free_mark = free_mark;
+}
+
+/*
+ * Attach an initialized mark entry to a given group and inode.
+ * These marks may be used for the fsnotify backend to determine which
+ * event types should be delivered to which group and for which inodes.
+ */
+int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
+		      struct fsnotify_group *group, struct inode *inode)
+{
+	struct fsnotify_mark_entry *lentry;
+	int ret = 0;
+
+	/*
+	 * LOCKING ORDER!!!!
+	 * entry->lock
+	 * group->mark_lock
+	 * inode->i_lock
+	 */
+	spin_lock(&entry->lock);
+	spin_lock(&group->mark_lock);
+	spin_lock(&inode->i_lock);
+
+	entry->group = group;
+	entry->inode = inode;
+
+	lentry = fsnotify_find_mark_entry(group, inode);
+	if (!lentry) {
+		hlist_add_head(&entry->i_list, &inode->i_fsnotify_mark_entries);
+		list_add(&entry->g_list, &group->mark_entries);
+
+		fsnotify_get_mark(entry); /* for i_list and g_list */
+
+		atomic_inc(&group->num_marks);
+
+		fsnotify_recalc_inode_mask_locked(inode);
+	}
+
+	spin_unlock(&inode->i_lock);
+	spin_unlock(&group->mark_lock);
+	spin_unlock(&entry->lock);
+
+	if (lentry) {
+		ret = -EEXIST;
+		fsnotify_put_mark(lentry);
+	}
+
+	return ret;
+}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 83d6b4397245b..275b0860044cc 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -755,6 +755,11 @@ struct inode {
 
 	__u32			i_generation;
 
+#ifdef CONFIG_FSNOTIFY
+	__u32			i_fsnotify_mask; /* all events this inode cares about */
+	struct hlist_head	i_fsnotify_mark_entries; /* fsnotify mark entries */
+#endif
+
 #ifdef CONFIG_DNOTIFY
 	unsigned long		i_dnotify_mask; /* Directory notify events */
 	struct dnotify_struct	*i_dnotify; /* for directory notifications */
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 6c9ebefdac8e4..3856eb6e59731 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -99,6 +99,14 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	audit_inode_child(new_name, moved, new_dir);
 }
 
+/*
+ * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed
+ */
+static inline void fsnotify_inode_delete(struct inode *inode)
+{
+	__fsnotify_inode_delete(inode);
+}
+
 /*
  * fsnotify_nameremove - a filename was removed from a directory
  */
@@ -121,6 +129,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 	inotify_inode_is_dead(inode);
 
 	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
+	__fsnotify_inode_delete(inode);
 }
 
 /*
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1a55718b38aab..cad5c4d75c1d5 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -48,17 +48,25 @@
 
 struct fsnotify_group;
 struct fsnotify_event;
+struct fsnotify_mark_entry;
 
 /*
  * Each group much define these ops.  The fsnotify infrastructure will call
  * these operations for each relevant group.
  *
+ * should_send_event - given a group, inode, and mask this function determines
+ *		if the group is interested in this event.
  * handle_event - main call for a group to handle an fs event
  * free_group_priv - called when a group refcnt hits 0 to clean up the private union
+ * freeing-mark - this means that a mark has been flagged to die when everything
+ *		finishes using it.  The function is supplied with what must be a
+ *		valid group and inode to use to clean up.
  */
 struct fsnotify_ops {
+	bool (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u32 mask);
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
+	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
 };
 
 /*
@@ -97,7 +105,14 @@ struct fsnotify_group {
 
 	const struct fsnotify_ops *ops;	/* how this group handles things */
 
-	/* prevents double list_del of group_list.  protected by global fsnotify_gr_mutex */
+	/* stores all fastapth entries assoc with this group so they can be cleaned on unregister */
+	spinlock_t mark_lock;		/* protect mark_entries list */
+	atomic_t num_marks;		/* 1 for each mark entry and 1 for not being
+					 * past the point of no return when freeing
+					 * a group */
+	struct list_head mark_entries;	/* all inode mark entries for this group */
+
+	/* prevents double list_del of group_list.  protected by global fsnotify_grp_mutex */
 	bool on_group_list;
 
 	/* groups can define private fields here or use the void *private */
@@ -137,12 +152,38 @@ struct fsnotify_event {
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
 };
 
+/*
+ * a mark is simply an entry attached to an in core inode which allows an
+ * fsnotify listener to indicate they are either no longer interested in events
+ * of a type matching mask or only interested in those events.
+ *
+ * these are flushed when an inode is evicted from core and may be flushed
+ * when the inode is modified (as seen by fsnotify_access).  Some fsnotify users
+ * (such as dnotify) will flush these when the open fd is closed and not at
+ * inode eviction or modification.
+ */
+struct fsnotify_mark_entry {
+	__u32 mask;			/* mask this mark entry is for */
+	/* we hold ref for each i_list and g_list.  also one ref for each 'thing'
+	 * in kernel that found and may be using this mark. */
+	atomic_t refcnt;		/* active things looking at this mark */
+	struct inode *inode;		/* inode this entry is associated with */
+	struct fsnotify_group *group;	/* group this mark entry is for */
+	struct hlist_node i_list;	/* list of mark_entries by inode->i_fsnotify_mark_entries */
+	struct list_head g_list;	/* list of mark_entries by group->i_fsnotify_mark_entries */
+	spinlock_t lock;		/* protect group, inode, and killme */
+	struct list_head free_i_list;	/* tmp list used when freeing this mark */
+	struct list_head free_g_list;	/* tmp list used when freeing this mark */
+	void (*free_mark)(struct fsnotify_mark_entry *entry); /* called on final put+free */
+};
+
 #ifdef CONFIG_FSNOTIFY
 
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+extern void __fsnotify_inode_delete(struct inode *inode);
 
 
 /* called from fsnotify listeners, such as fanotify or dnotify */
@@ -153,6 +194,8 @@ extern void fsnotify_recalc_global_mask(void);
 extern struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num,
 						    __u32 mask,
 						    const struct fsnotify_ops *ops);
+/* run all marks associated with this group and update group->mask */
+extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
 /* drop reference on a group from fsnotify_obtain_group */
 extern void fsnotify_put_group(struct fsnotify_group *group);
 
@@ -163,6 +206,22 @@ extern void fsnotify_put_event(struct fsnotify_event *event);
 extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
 									struct fsnotify_event *event);
 
+/* functions used to manipulate the marks attached to inodes */
+
+/* run all marks associated with an inode and update inode->i_fsnotify_mask */
+extern void fsnotify_recalc_inode_mask(struct inode *inode);
+extern void fsnotify_init_mark(struct fsnotify_mark_entry *entry, void (*free_mark)(struct fsnotify_mark_entry *entry));
+/* find (and take a reference) to a mark associated with group and inode */
+extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
+/* attach the mark to both the group and the inode */
+extern int fsnotify_add_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group, struct inode *inode);
+/* given a mark, flag it to be freed when all references are dropped */
+extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry);
+/* run all the marks in a group, and flag them to be freed */
+extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
+extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
+extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
+
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 						    void *data, int data_is);
@@ -170,6 +229,10 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32
 
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 {}
+
+static inline void __fsnotify_inode_delete(struct inode *inode)
+{}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
GitLab


From c28f7e56e9d95fb531dc3be8df2e7f52bee76d21 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:29 -0400
Subject: [PATCH 2094/2198] fsnotify: parent event notification

inotify and dnotify both use a similar parent notification mechanism.  We
add a generic parent notification mechanism to fsnotify for both of these
to use.  This new machanism also adds the dentry flag optimization which
exists for inotify to dnotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/fsnotify.c             | 91 ++++++++++++++++++++++++++++++++
 fs/notify/fsnotify.h             |  5 ++
 fs/notify/inode_mark.c           | 17 ++++++
 include/linux/dcache.h           |  4 +-
 include/linux/fsnotify.h         | 34 ++++++++----
 include/linux/fsnotify_backend.h | 64 ++++++++++++++++++++++
 6 files changed, 205 insertions(+), 10 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index d5654629c659a..7fc760067a623 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -34,6 +34,97 @@ void __fsnotify_inode_delete(struct inode *inode)
 }
 EXPORT_SYMBOL_GPL(__fsnotify_inode_delete);
 
+/*
+ * Given an inode, first check if we care what happens to our children.  Inotify
+ * and dnotify both tell their parents about events.  If we care about any event
+ * on a child we run all of our children and set a dentry flag saying that the
+ * parent cares.  Thus when an event happens on a child it can quickly tell if
+ * if there is a need to find a parent and send the event to the parent.
+ */
+void __fsnotify_update_child_dentry_flags(struct inode *inode)
+{
+	struct dentry *alias;
+	int watched;
+
+	if (!S_ISDIR(inode->i_mode))
+		return;
+
+	/* determine if the children should tell inode about their events */
+	watched = fsnotify_inode_watches_children(inode);
+
+	spin_lock(&dcache_lock);
+	/* run all of the dentries associated with this inode.  Since this is a
+	 * directory, there damn well better only be one item on this list */
+	list_for_each_entry(alias, &inode->i_dentry, d_alias) {
+		struct dentry *child;
+
+		/* run all of the children of the original inode and fix their
+		 * d_flags to indicate parental interest (their parent is the
+		 * original inode) */
+		list_for_each_entry(child, &alias->d_subdirs, d_u.d_child) {
+			if (!child->d_inode)
+				continue;
+
+			spin_lock(&child->d_lock);
+			if (watched)
+				child->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+			else
+				child->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+			spin_unlock(&child->d_lock);
+		}
+	}
+	spin_unlock(&dcache_lock);
+}
+
+/* Notify this dentry's parent about a child's events. */
+void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+{
+	struct dentry *parent;
+	struct inode *p_inode;
+	bool send = false;
+	bool should_update_children = false;
+
+	if (!(dentry->d_flags & DCACHE_FSNOTIFY_PARENT_WATCHED))
+		return;
+
+	spin_lock(&dentry->d_lock);
+	parent = dentry->d_parent;
+	p_inode = parent->d_inode;
+
+	if (fsnotify_inode_watches_children(p_inode)) {
+		if (p_inode->i_fsnotify_mask & mask) {
+			dget(parent);
+			send = true;
+		}
+	} else {
+		/*
+		 * The parent doesn't care about events on it's children but
+		 * at least one child thought it did.  We need to run all the
+		 * children and update their d_flags to let them know p_inode
+		 * doesn't care about them any more.
+		 */
+		dget(parent);
+		should_update_children = true;
+	}
+
+	spin_unlock(&dentry->d_lock);
+
+	if (send) {
+		/* we are notifying a parent so come up with the new mask which
+		 * specifies these are events which came from a child. */
+		mask |= FS_EVENT_ON_CHILD;
+
+		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE);
+		dput(parent);
+	}
+
+	if (unlikely(should_update_children)) {
+		__fsnotify_update_child_dentry_flags(p_inode);
+		dput(parent);
+	}
+}
+EXPORT_SYMBOL_GPL(__fsnotify_parent);
+
 /*
  * This is the main call to fsnotify.  The VFS calls into hook specific functions
  * in linux/fsnotify.h.  Those functions then in turn call here.  Here will call
diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 8ebcbe893c913..83b8ec0a8ec2f 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -17,4 +17,9 @@ extern __u32 fsnotify_mask;
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
+/*
+ * update the dentry->d_flags of all of inode's children to indicate if inode cares
+ * about events that happen to its children.
+ */
+extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
 #endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index cdc154146974e..a39534845b282 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -131,6 +131,8 @@ void fsnotify_recalc_inode_mask(struct inode *inode)
 	spin_lock(&inode->i_lock);
 	fsnotify_recalc_inode_mask_locked(inode);
 	spin_unlock(&inode->i_lock);
+
+	__fsnotify_update_child_dentry_flags(inode);
 }
 
 /*
@@ -189,6 +191,19 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	 */
 	group->ops->freeing_mark(entry, group);
 
+	/*
+	 * __fsnotify_update_child_dentry_flags(inode);
+	 *
+	 * I really want to call that, but we can't, we have no idea if the inode
+	 * still exists the second we drop the entry->lock.
+	 *
+	 * The next time an event arrive to this inode from one of it's children
+	 * __fsnotify_parent will see that the inode doesn't care about it's
+	 * children and will update all of these flags then.  So really this
+	 * is just a lazy update (and could be a perf win...)
+	 */
+
+
 	/*
 	 * it's possible that this group tried to destroy itself, but this
 	 * this mark was simultaneously being freed by inode.  If that's the
@@ -323,6 +338,8 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	if (lentry) {
 		ret = -EEXIST;
 		fsnotify_put_mark(lentry);
+	} else {
+		__fsnotify_update_child_dentry_flags(inode);
 	}
 
 	return ret;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 15156364d196a..97978004338db 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -180,10 +180,12 @@ d_iput:		no		no		no       yes
 #define DCACHE_REFERENCED	0x0008  /* Recently used, don't discard. */
 #define DCACHE_UNHASHED		0x0010	
 
-#define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched */
+#define DCACHE_INOTIFY_PARENT_WATCHED	0x0020 /* Parent inode is watched by inotify */
 
 #define DCACHE_COOKIE		0x0040	/* For use by dcookie subsystem */
 
+#define DCACHE_FSNOTIFY_PARENT_WATCHED	0x0080 /* Parent inode is watched by some fsnotify listener */
+
 extern spinlock_t dcache_lock;
 extern seqlock_t rename_lock;
 
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 3856eb6e59731..6a662ed0bc8ac 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -23,15 +23,31 @@
 static inline void fsnotify_d_instantiate(struct dentry *entry,
 						struct inode *inode)
 {
+	__fsnotify_d_instantiate(entry, inode);
+
 	inotify_d_instantiate(entry, inode);
 }
 
+/* Notify this dentry's parent about a child's events. */
+static inline void fsnotify_parent(struct dentry *dentry, __u32 mask)
+{
+	__fsnotify_parent(dentry, mask);
+
+	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+}
+
 /*
  * fsnotify_d_move - entry has been moved
  * Called with dcache_lock and entry->d_lock held.
  */
 static inline void fsnotify_d_move(struct dentry *entry)
 {
+	/*
+	 * On move we need to update entry->d_flags to indicate if the new parent
+	 * cares about events from this entry.
+	 */
+	__fsnotify_update_dcache_flags(entry);
+
 	inotify_d_move(entry);
 }
 
@@ -117,7 +133,8 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 	if (isdir)
 		mask |= FS_IN_ISDIR;
 	dnotify_parent(dentry, DN_DELETE);
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
+
+	fsnotify_parent(dentry, mask);
 }
 
 /*
@@ -188,9 +205,9 @@ static inline void fsnotify_access(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_ACCESS);
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -206,9 +223,9 @@ static inline void fsnotify_modify(struct dentry *dentry)
 		mask |= FS_IN_ISDIR;
 
 	dnotify_parent(dentry, DN_MODIFY);
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -223,9 +240,9 @@ static inline void fsnotify_open(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -236,16 +253,15 @@ static inline void fsnotify_close(struct file *file)
 {
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
-	const char *name = dentry->d_name.name;
 	fmode_t mode = file->f_mode;
 	__u32 mask = (mode & FMODE_WRITE) ? FS_CLOSE_WRITE : FS_CLOSE_NOWRITE;
 
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_dentry_parent_queue_event(dentry, mask, 0, name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
 }
 
@@ -260,9 +276,9 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	inotify_dentry_parent_queue_event(dentry, mask, 0, dentry->d_name.name);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
+	fsnotify_parent(dentry, mask);
 	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 }
 
@@ -311,8 +327,8 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		if (S_ISDIR(inode->i_mode))
 			in_mask |= FS_IN_ISDIR;
 		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
-		inotify_dentry_parent_queue_event(dentry, in_mask, 0,
-						  dentry->d_name.name);
+
+		fsnotify_parent(dentry, in_mask);
 		fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
 	}
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index cad5c4d75c1d5..13d2dd5700491 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -46,6 +46,17 @@
 #define FS_DN_RENAME		0x10000000	/* file renamed */
 #define FS_DN_MULTISHOT		0x20000000	/* dnotify multishot */
 
+/* This inode cares about things that happen to its children.  Always set for
+ * dnotify and inotify. */
+#define FS_EVENT_ON_CHILD	0x08000000
+
+/* This is a list of all events that may get sent to a parernt based on fs event
+ * happening to inodes inside that directory */
+#define FS_EVENTS_POSS_ON_CHILD   (FS_ACCESS | FS_MODIFY | FS_ATTRIB |\
+				   FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | FS_OPEN |\
+				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
+				   FS_DELETE)
+
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
@@ -183,8 +194,52 @@ struct fsnotify_mark_entry {
 
 /* main fsnotify call to send events */
 extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 
+static inline int fsnotify_inode_watches_children(struct inode *inode)
+{
+	/* FS_EVENT_ON_CHILD is set if the inode may care */
+	if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD))
+		return 0;
+	/* this inode might care about child events, does it care about the
+	 * specific set of events that can happen on a child? */
+	return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD;
+}
+
+/*
+ * Update the dentry with a flag indicating the interest of its parent to receive
+ * filesystem events when those events happens to this dentry->d_inode.
+ */
+static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
+{
+	struct dentry *parent;
+
+	assert_spin_locked(&dcache_lock);
+	assert_spin_locked(&dentry->d_lock);
+
+	parent = dentry->d_parent;
+	if (fsnotify_inode_watches_children(parent->d_inode))
+		dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED;
+	else
+		dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
+}
+
+/*
+ * fsnotify_d_instantiate - instantiate a dentry for inode
+ * Called with dcache_lock held.
+ */
+static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode)
+{
+	if (!inode)
+		return;
+
+	assert_spin_locked(&dcache_lock);
+
+	spin_lock(&dentry->d_lock);
+	__fsnotify_update_dcache_flags(dentry);
+	spin_unlock(&dentry->d_lock);
+}
 
 /* called from fsnotify listeners, such as fanotify or dnotify */
 
@@ -230,9 +285,18 @@ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 {}
 
+static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
+{}
+
 static inline void __fsnotify_inode_delete(struct inode *inode)
 {}
 
+static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
+{}
+
+static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode)
+{}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
GitLab


From 3c5119c05d624f95f4967d16b38c9624b816bdb9 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:33 -0400
Subject: [PATCH 2095/2198] dnotify: reimplement dnotify using fsnotify

Reimplement dnotify using fsnotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS                      |   6 +-
 fs/notify/dnotify/Kconfig        |   1 +
 fs/notify/dnotify/dnotify.c      | 469 ++++++++++++++++++++++++-------
 include/linux/dnotify.h          |  29 +-
 include/linux/fs.h               |   5 -
 include/linux/fsnotify.h         |  68 ++---
 include/linux/fsnotify_backend.h |   3 +
 7 files changed, 398 insertions(+), 183 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ccdb57524e3ca..96e0c8c607965 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1802,10 +1802,10 @@ F:	drivers/char/epca*
 F:	drivers/char/digi*
 
 DIRECTORY NOTIFICATION (DNOTIFY)
-P:	Stephen Rothwell
-M:	sfr@canb.auug.org.au
+P:	Eric Paris
+M:	eparis@parisplace.org
 L:	linux-kernel@vger.kernel.org
-S:	Supported
+S:	Maintained
 F:	Documentation/filesystems/dnotify.txt
 F:	fs/notify/dnotify/
 F:	include/linux/dnotify.h
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 26adf5dfa6463..904ff8d5405a0 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,5 +1,6 @@
 config DNOTIFY
 	bool "Dnotify support"
+	depends on FSNOTIFY
 	default y
 	help
 	  Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index b0aa2cde80bd8..d9d80f502c6f9 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -3,6 +3,9 @@
  *
  * Copyright (C) 2000,2001,2002 Stephen Rothwell
  *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * dnotify was largly rewritten to use the new fsnotify infrastructure
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; either version 2, or (at your option) any
@@ -21,24 +24,178 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/fdtable.h>
+#include <linux/fsnotify_backend.h>
 
 int dir_notify_enable __read_mostly = 1;
 
-static struct kmem_cache *dn_cache __read_mostly;
+static struct kmem_cache *dnotify_struct_cache __read_mostly;
+static struct kmem_cache *dnotify_mark_entry_cache __read_mostly;
+static struct fsnotify_group *dnotify_group __read_mostly;
+static DEFINE_MUTEX(dnotify_mark_mutex);
+
+/*
+ * dnotify will attach one of these to each inode (i_fsnotify_mark_entries) which
+ * is being watched by dnotify.  If multiple userspace applications are watching
+ * the same directory with dnotify their information is chained in dn
+ */
+struct dnotify_mark_entry {
+	struct fsnotify_mark_entry fsn_entry;
+	struct dnotify_struct *dn;
+};
 
-static void redo_inode_mask(struct inode *inode)
+/*
+ * When a process starts or stops watching an inode the set of events which
+ * dnotify cares about for that inode may change.  This function runs the
+ * list of everything receiving dnotify events about this directory and calculates
+ * the set of all those events.  After it updates what dnotify is interested in
+ * it calls the fsnotify function so it can update the set of all events relevant
+ * to this inode.
+ */
+static void dnotify_recalc_inode_mask(struct fsnotify_mark_entry *entry)
 {
-	unsigned long new_mask;
+	__u32 new_mask, old_mask;
 	struct dnotify_struct *dn;
+	struct dnotify_mark_entry *dnentry  = container_of(entry,
+							   struct dnotify_mark_entry,
+							   fsn_entry);
+
+	assert_spin_locked(&entry->lock);
 
+	old_mask = entry->mask;
 	new_mask = 0;
-	for (dn = inode->i_dnotify; dn != NULL; dn = dn->dn_next)
-		new_mask |= dn->dn_mask & ~DN_MULTISHOT;
-	inode->i_dnotify_mask = new_mask;
+	for (dn = dnentry->dn; dn != NULL; dn = dn->dn_next)
+		new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT);
+	entry->mask = new_mask;
+
+	if (old_mask == new_mask)
+		return;
+
+	if (entry->inode)
+		fsnotify_recalc_inode_mask(entry->inode);
 }
 
+/*
+ * Mains fsnotify call where events are delivered to dnotify.
+ * Find the dnotify mark on the relevant inode, run the list of dnotify structs
+ * on that mark and determine which of them has expressed interest in receiving
+ * events of this type.  When found send the correct process and signal and
+ * destroy the dnotify struct if it was not registered to receive multiple
+ * events.
+ */
+static int dnotify_handle_event(struct fsnotify_group *group,
+				struct fsnotify_event *event)
+{
+	struct fsnotify_mark_entry *entry = NULL;
+	struct dnotify_mark_entry *dnentry;
+	struct inode *to_tell;
+	struct dnotify_struct *dn;
+	struct dnotify_struct **prev;
+	struct fown_struct *fown;
+
+	to_tell = event->to_tell;
+
+	spin_lock(&to_tell->i_lock);
+	entry = fsnotify_find_mark_entry(group, to_tell);
+	spin_unlock(&to_tell->i_lock);
+
+	/* unlikely since we alreay passed dnotify_should_send_event() */
+	if (unlikely(!entry))
+		return 0;
+	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+
+	spin_lock(&entry->lock);
+	prev = &dnentry->dn;
+	while ((dn = *prev) != NULL) {
+		if ((dn->dn_mask & event->mask) == 0) {
+			prev = &dn->dn_next;
+			continue;
+		}
+		fown = &dn->dn_filp->f_owner;
+		send_sigio(fown, dn->dn_fd, POLL_MSG);
+		if (dn->dn_mask & FS_DN_MULTISHOT)
+			prev = &dn->dn_next;
+		else {
+			*prev = dn->dn_next;
+			kmem_cache_free(dnotify_struct_cache, dn);
+			dnotify_recalc_inode_mask(entry);
+		}
+	}
+
+	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry);
+
+	return 0;
+}
+
+/*
+ * Given an inode and mask determine if dnotify would be interested in sending
+ * userspace notification for that pair.
+ */
+static bool dnotify_should_send_event(struct fsnotify_group *group,
+				      struct inode *inode, __u32 mask)
+{
+	struct fsnotify_mark_entry *entry;
+	bool send;
+
+	/* !dir_notify_enable should never get here, don't waste time checking
+	if (!dir_notify_enable)
+		return 0; */
+
+	/* not a dir, dnotify doesn't care */
+	if (!S_ISDIR(inode->i_mode))
+		return false;
+
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+
+	/* no mark means no dnotify watch */
+	if (!entry)
+		return false;
+
+	spin_lock(&entry->lock);
+	send = (mask & entry->mask) ? true : false;
+	spin_unlock(&entry->lock);
+	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
+
+	return send;
+}
+
+static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry,
+				 struct fsnotify_group *group)
+{
+	/* dnotify doesn't care than an inode is on the way out */
+}
+
+static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
+{
+	struct dnotify_mark_entry *dnentry = container_of(entry,
+							  struct dnotify_mark_entry,
+							  fsn_entry);
+
+	BUG_ON(dnentry->dn);
+
+	kmem_cache_free(dnotify_mark_entry_cache, dnentry);
+}
+
+static struct fsnotify_ops dnotify_fsnotify_ops = {
+	.handle_event = dnotify_handle_event,
+	.should_send_event = dnotify_should_send_event,
+	.free_group_priv = NULL,
+	.freeing_mark = dnotify_freeing_mark,
+};
+
+/*
+ * Called every time a file is closed.  Looks first for a dnotify mark on the
+ * inode.  If one is found run all of the ->dn entries attached to that
+ * mark for one relevant to this process closing the file and remove that
+ * dnotify_struct.  If that was the last dnotify_struct also remove the
+ * fsnotify_mark_entry.
+ */
 void dnotify_flush(struct file *filp, fl_owner_t id)
 {
+	struct fsnotify_mark_entry *entry;
+	struct dnotify_mark_entry *dnentry;
 	struct dnotify_struct *dn;
 	struct dnotify_struct **prev;
 	struct inode *inode;
@@ -46,145 +203,243 @@ void dnotify_flush(struct file *filp, fl_owner_t id)
 	inode = filp->f_path.dentry->d_inode;
 	if (!S_ISDIR(inode->i_mode))
 		return;
+
 	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
+	entry = fsnotify_find_mark_entry(dnotify_group, inode);
+	spin_unlock(&inode->i_lock);
+	if (!entry)
+		return;
+	dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+
+	mutex_lock(&dnotify_mark_mutex);
+
+	spin_lock(&entry->lock);
+	prev = &dnentry->dn;
 	while ((dn = *prev) != NULL) {
 		if ((dn->dn_owner == id) && (dn->dn_filp == filp)) {
 			*prev = dn->dn_next;
-			redo_inode_mask(inode);
-			kmem_cache_free(dn_cache, dn);
+			kmem_cache_free(dnotify_struct_cache, dn);
+			dnotify_recalc_inode_mask(entry);
 			break;
 		}
 		prev = &dn->dn_next;
 	}
-	spin_unlock(&inode->i_lock);
+
+	spin_unlock(&entry->lock);
+
+	/* nothing else could have found us thanks to the dnotify_mark_mutex */
+	if (dnentry->dn == NULL)
+		fsnotify_destroy_mark_by_entry(entry);
+
+	fsnotify_recalc_group_mask(dnotify_group);
+
+	mutex_unlock(&dnotify_mark_mutex);
+
+	fsnotify_put_mark(entry);
+}
+
+/* this conversion is done only at watch creation */
+static __u32 convert_arg(unsigned long arg)
+{
+	__u32 new_mask = FS_EVENT_ON_CHILD;
+
+	if (arg & DN_MULTISHOT)
+		new_mask |= FS_DN_MULTISHOT;
+	if (arg & DN_DELETE)
+		new_mask |= (FS_DELETE | FS_MOVED_FROM);
+	if (arg & DN_MODIFY)
+		new_mask |= FS_MODIFY;
+	if (arg & DN_ACCESS)
+		new_mask |= FS_ACCESS;
+	if (arg & DN_ATTRIB)
+		new_mask |= FS_ATTRIB;
+	if (arg & DN_RENAME)
+		new_mask |= FS_DN_RENAME;
+	if (arg & DN_CREATE)
+		new_mask |= (FS_CREATE | FS_MOVED_TO);
+
+	return new_mask;
 }
 
+/*
+ * If multiple processes watch the same inode with dnotify there is only one
+ * dnotify mark in inode->i_fsnotify_mark_entries but we chain a dnotify_struct
+ * onto that mark.  This function either attaches the new dnotify_struct onto
+ * that list, or it |= the mask onto an existing dnofiy_struct.
+ */
+static int attach_dn(struct dnotify_struct *dn, struct dnotify_mark_entry *dnentry,
+		     fl_owner_t id, int fd, struct file *filp, __u32 mask)
+{
+	struct dnotify_struct *odn;
+
+	odn = dnentry->dn;
+	while (odn != NULL) {
+		/* adding more events to existing dnofiy_struct? */
+		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
+			odn->dn_fd = fd;
+			odn->dn_mask |= mask;
+			return -EEXIST;
+		}
+		odn = odn->dn_next;
+	}
+
+	dn->dn_mask = mask;
+	dn->dn_fd = fd;
+	dn->dn_filp = filp;
+	dn->dn_owner = id;
+	dn->dn_next = dnentry->dn;
+	dnentry->dn = dn;
+
+	return 0;
+}
+
+/*
+ * When a process calls fcntl to attach a dnotify watch to a directory it ends
+ * up here.  Allocate both a mark for fsnotify to add and a dnotify_struct to be
+ * attached to the fsnotify_mark.
+ */
 int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 {
+	struct dnotify_mark_entry *new_dnentry, *dnentry;
+	struct fsnotify_mark_entry *new_entry, *entry;
 	struct dnotify_struct *dn;
-	struct dnotify_struct *odn;
-	struct dnotify_struct **prev;
 	struct inode *inode;
 	fl_owner_t id = current->files;
 	struct file *f;
-	int error = 0;
+	int destroy = 0, error = 0;
+	__u32 mask;
+
+	/* we use these to tell if we need to kfree */
+	new_entry = NULL;
+	dn = NULL;
 
+	if (!dir_notify_enable) {
+		error = -EINVAL;
+		goto out_err;
+	}
+
+	/* a 0 mask means we are explicitly removing the watch */
 	if ((arg & ~DN_MULTISHOT) == 0) {
 		dnotify_flush(filp, id);
-		return 0;
+		error = 0;
+		goto out_err;
 	}
-	if (!dir_notify_enable)
-		return -EINVAL;
+
+	/* dnotify only works on directories */
 	inode = filp->f_path.dentry->d_inode;
-	if (!S_ISDIR(inode->i_mode))
-		return -ENOTDIR;
-	dn = kmem_cache_alloc(dn_cache, GFP_KERNEL);
-	if (dn == NULL)
-		return -ENOMEM;
-	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((odn = *prev) != NULL) {
-		if ((odn->dn_owner == id) && (odn->dn_filp == filp)) {
-			odn->dn_fd = fd;
-			odn->dn_mask |= arg;
-			inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-			goto out_free;
-		}
-		prev = &odn->dn_next;
+	if (!S_ISDIR(inode->i_mode)) {
+		error = -ENOTDIR;
+		goto out_err;
 	}
 
-	rcu_read_lock();
-	f = fcheck(fd);
-	rcu_read_unlock();
-	/* we'd lost the race with close(), sod off silently */
-	/* note that inode->i_lock prevents reordering problems
-	 * between accesses to descriptor table and ->i_dnotify */
-	if (f != filp)
-		goto out_free;
+	/* expect most fcntl to add new rather than augment old */
+	dn = kmem_cache_alloc(dnotify_struct_cache, GFP_KERNEL);
+	if (!dn) {
+		error = -ENOMEM;
+		goto out_err;
+	}
 
-	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
-	if (error)
-		goto out_free;
+	/* new fsnotify mark, we expect most fcntl calls to add a new mark */
+	new_dnentry = kmem_cache_alloc(dnotify_mark_entry_cache, GFP_KERNEL);
+	if (!new_dnentry) {
+		error = -ENOMEM;
+		goto out_err;
+	}
 
-	dn->dn_mask = arg;
-	dn->dn_fd = fd;
-	dn->dn_filp = filp;
-	dn->dn_owner = id;
-	inode->i_dnotify_mask |= arg & ~DN_MULTISHOT;
-	dn->dn_next = inode->i_dnotify;
-	inode->i_dnotify = dn;
-	spin_unlock(&inode->i_lock);
-	return 0;
+	/* convert the userspace DN_* "arg" to the internal FS_* defines in fsnotify */
+	mask = convert_arg(arg);
 
-out_free:
-	spin_unlock(&inode->i_lock);
-	kmem_cache_free(dn_cache, dn);
-	return error;
-}
+	/* set up the new_entry and new_dnentry */
+	new_entry = &new_dnentry->fsn_entry;
+	fsnotify_init_mark(new_entry, dnotify_free_mark);
+	new_entry->mask = mask;
+	new_dnentry->dn = NULL;
 
-void __inode_dir_notify(struct inode *inode, unsigned long event)
-{
-	struct dnotify_struct *	dn;
-	struct dnotify_struct **prev;
-	struct fown_struct *	fown;
-	int			changed = 0;
+	/* this is needed to prevent the fcntl/close race described below */
+	mutex_lock(&dnotify_mark_mutex);
 
+	/* add the new_entry or find an old one. */
 	spin_lock(&inode->i_lock);
-	prev = &inode->i_dnotify;
-	while ((dn = *prev) != NULL) {
-		if ((dn->dn_mask & event) == 0) {
-			prev = &dn->dn_next;
-			continue;
-		}
-		fown = &dn->dn_filp->f_owner;
-		send_sigio(fown, dn->dn_fd, POLL_MSG);
-		if (dn->dn_mask & DN_MULTISHOT)
-			prev = &dn->dn_next;
-		else {
-			*prev = dn->dn_next;
-			changed = 1;
-			kmem_cache_free(dn_cache, dn);
-		}
-	}
-	if (changed)
-		redo_inode_mask(inode);
+	entry = fsnotify_find_mark_entry(dnotify_group, inode);
 	spin_unlock(&inode->i_lock);
-}
-
-EXPORT_SYMBOL(__inode_dir_notify);
+	if (entry) {
+		dnentry = container_of(entry, struct dnotify_mark_entry, fsn_entry);
+		spin_lock(&entry->lock);
+	} else {
+		fsnotify_add_mark(new_entry, dnotify_group, inode);
+		spin_lock(&new_entry->lock);
+		entry = new_entry;
+		dnentry = new_dnentry;
+		/* we used new_entry, so don't free it */
+		new_entry = NULL;
+	}
 
-/*
- * This is hopelessly wrong, but unfixable without API changes.  At
- * least it doesn't oops the kernel...
- *
- * To safely access ->d_parent we need to keep d_move away from it.  Use the
- * dentry's d_lock for this.
- */
-void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-	struct dentry *parent;
+	rcu_read_lock();
+	f = fcheck(fd);
+	rcu_read_unlock();
 
-	if (!dir_notify_enable)
-		return;
+	/* if (f != filp) means that we lost a race and another task/thread
+	 * actually closed the fd we are still playing with before we grabbed
+	 * the dnotify_mark_mutex and entry->lock.  Since closing the fd is the
+	 * only time we clean up the mark entries we need to get our mark off
+	 * the list. */
+	if (f != filp) {
+		/* if we added ourselves, shoot ourselves, it's possible that
+		 * the flush actually did shoot this entry.  That's fine too
+		 * since multiple calls to destroy_mark is perfectly safe, if
+		 * we found a dnentry already attached to the inode, just sod
+		 * off silently as the flush at close time dealt with it.
+		 */
+		if (dnentry == new_dnentry)
+			destroy = 1;
+		goto out;
+	}
 
-	spin_lock(&dentry->d_lock);
-	parent = dentry->d_parent;
-	if (parent->d_inode->i_dnotify_mask & event) {
-		dget(parent);
-		spin_unlock(&dentry->d_lock);
-		__inode_dir_notify(parent->d_inode, event);
-		dput(parent);
-	} else {
-		spin_unlock(&dentry->d_lock);
+	error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0);
+	if (error) {
+		/* if we added, we must shoot */
+		if (dnentry == new_dnentry)
+			destroy = 1;
+		goto out;
 	}
+
+	error = attach_dn(dn, dnentry, id, fd, filp, mask);
+	/* !error means that we attached the dn to the dnentry, so don't free it */
+	if (!error)
+		dn = NULL;
+	/* -EEXIST means that we didn't add this new dn and used an old one.
+	 * that isn't an error (and the unused dn should be freed) */
+	else if (error == -EEXIST)
+		error = 0;
+
+	dnotify_recalc_inode_mask(entry);
+out:
+	spin_unlock(&entry->lock);
+
+	if (destroy)
+		fsnotify_destroy_mark_by_entry(entry);
+
+	fsnotify_recalc_group_mask(dnotify_group);
+
+	mutex_unlock(&dnotify_mark_mutex);
+	fsnotify_put_mark(entry);
+out_err:
+	if (new_entry)
+		fsnotify_put_mark(new_entry);
+	if (dn)
+		kmem_cache_free(dnotify_struct_cache, dn);
+	return error;
 }
-EXPORT_SYMBOL_GPL(dnotify_parent);
 
 static int __init dnotify_init(void)
 {
-	dn_cache = kmem_cache_create("dnotify_cache",
-		sizeof(struct dnotify_struct), 0, SLAB_PANIC, NULL);
+	dnotify_struct_cache = KMEM_CACHE(dnotify_struct, SLAB_PANIC);
+	dnotify_mark_entry_cache = KMEM_CACHE(dnotify_mark_entry, SLAB_PANIC);
+
+	dnotify_group = fsnotify_obtain_group(DNOTIFY_GROUP_NUM,
+					      0, &dnotify_fsnotify_ops);
+	if (IS_ERR(dnotify_group))
+		panic("unable to allocate fsnotify group for dnotify\n");
 	return 0;
 }
 
diff --git a/include/linux/dnotify.h b/include/linux/dnotify.h
index 102a902b43963..ecc06286226dc 100644
--- a/include/linux/dnotify.h
+++ b/include/linux/dnotify.h
@@ -10,7 +10,7 @@
 
 struct dnotify_struct {
 	struct dnotify_struct *	dn_next;
-	unsigned long		dn_mask;
+	__u32			dn_mask;
 	int			dn_fd;
 	struct file *		dn_filp;
 	fl_owner_t		dn_owner;
@@ -21,23 +21,18 @@ struct dnotify_struct {
 
 #ifdef CONFIG_DNOTIFY
 
-extern void __inode_dir_notify(struct inode *, unsigned long);
+#define DNOTIFY_ALL_EVENTS (FS_DELETE | FS_DELETE_CHILD |\
+			    FS_MODIFY | FS_MODIFY_CHILD |\
+			    FS_ACCESS | FS_ACCESS_CHILD |\
+			    FS_ATTRIB | FS_ATTRIB_CHILD |\
+			    FS_CREATE | FS_DN_RENAME |\
+			    FS_MOVED_FROM | FS_MOVED_TO)
+
 extern void dnotify_flush(struct file *, fl_owner_t);
 extern int fcntl_dirnotify(int, struct file *, unsigned long);
-extern void dnotify_parent(struct dentry *, unsigned long);
-
-static inline void inode_dir_notify(struct inode *inode, unsigned long event)
-{
-	if (inode->i_dnotify_mask & (event))
-		__inode_dir_notify(inode, event);
-}
 
 #else
 
-static inline void __inode_dir_notify(struct inode *inode, unsigned long event)
-{
-}
-
 static inline void dnotify_flush(struct file *filp, fl_owner_t id)
 {
 }
@@ -47,14 +42,6 @@ static inline int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
 	return -EINVAL;
 }
 
-static inline void dnotify_parent(struct dentry *dentry, unsigned long event)
-{
-}
-
-static inline void inode_dir_notify(struct inode *inode, unsigned long event)
-{
-}
-
 #endif /* CONFIG_DNOTIFY */
 
 #endif /* __KERNEL __ */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 275b0860044cc..323b5ce474c15 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -760,11 +760,6 @@ struct inode {
 	struct hlist_head	i_fsnotify_mark_entries; /* fsnotify mark entries */
 #endif
 
-#ifdef CONFIG_DNOTIFY
-	unsigned long		i_dnotify_mask; /* Directory notify events */
-	struct dnotify_struct	*i_dnotify; /* for directory notifications */
-#endif
-
 #ifdef CONFIG_INOTIFY
 	struct list_head	inotify_watches; /* watches on this inode */
 	struct mutex		inotify_mutex;	/* protects the watches list */
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 6a662ed0bc8ac..db12d9de35261 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -74,13 +74,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	__u32 new_dir_mask = 0;
 
 	if (old_dir == new_dir) {
-		inode_dir_notify(old_dir, DN_RENAME);
 		old_dir_mask = FS_DN_RENAME;
-	} else {
-		inode_dir_notify(old_dir, DN_DELETE);
-		old_dir_mask = FS_DELETE;
-		inode_dir_notify(new_dir, DN_CREATE);
-		new_dir_mask = FS_CREATE;
 	}
 
 	if (isdir) {
@@ -132,7 +126,6 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
 
 	if (isdir)
 		mask |= FS_IN_ISDIR;
-	dnotify_parent(dentry, DN_DELETE);
 
 	fsnotify_parent(dentry, mask);
 }
@@ -154,7 +147,6 @@ static inline void fsnotify_inoderemove(struct inode *inode)
  */
 static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 {
-	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, IN_CREATE, 0, dentry->d_name.name,
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
@@ -169,7 +161,6 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
  */
 static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct dentry *new_dentry)
 {
-	inode_dir_notify(dir, DN_CREATE);
 	inotify_inode_queue_event(dir, IN_CREATE, 0, new_dentry->d_name.name,
 				  inode);
 	fsnotify_link_count(inode);
@@ -186,7 +177,6 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	__u32 mask = (FS_CREATE | FS_IN_ISDIR);
 	struct inode *d_inode = dentry->d_inode;
 
-	inode_dir_notify(inode, DN_CREATE);
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
@@ -204,7 +194,6 @@ static inline void fsnotify_access(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	dnotify_parent(dentry, DN_ACCESS);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
@@ -222,7 +211,6 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	if (S_ISDIR(inode->i_mode))
 		mask |= FS_IN_ISDIR;
 
-	dnotify_parent(dentry, DN_MODIFY);
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
@@ -289,47 +277,33 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 {
 	struct inode *inode = dentry->d_inode;
-	int dn_mask = 0;
-	__u32 in_mask = 0;
+	__u32 mask = 0;
+
+	if (ia_valid & ATTR_UID)
+		mask |= FS_ATTRIB;
+	if (ia_valid & ATTR_GID)
+		mask |= FS_ATTRIB;
+	if (ia_valid & ATTR_SIZE)
+		mask |= FS_MODIFY;
 
-	if (ia_valid & ATTR_UID) {
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	}
-	if (ia_valid & ATTR_GID) {
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	}
-	if (ia_valid & ATTR_SIZE) {
-		in_mask |= FS_MODIFY;
-		dn_mask |= DN_MODIFY;
-	}
 	/* both times implies a utime(s) call */
 	if ((ia_valid & (ATTR_ATIME | ATTR_MTIME)) == (ATTR_ATIME | ATTR_MTIME))
-	{
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	} else if (ia_valid & ATTR_ATIME) {
-		in_mask |= FS_ACCESS;
-		dn_mask |= DN_ACCESS;
-	} else if (ia_valid & ATTR_MTIME) {
-		in_mask |= FS_MODIFY;
-		dn_mask |= DN_MODIFY;
-	}
-	if (ia_valid & ATTR_MODE) {
-		in_mask |= FS_ATTRIB;
-		dn_mask |= DN_ATTRIB;
-	}
+		mask |= FS_ATTRIB;
+	else if (ia_valid & ATTR_ATIME)
+		mask |= FS_ACCESS;
+	else if (ia_valid & ATTR_MTIME)
+		mask |= FS_MODIFY;
+
+	if (ia_valid & ATTR_MODE)
+		mask |= FS_ATTRIB;
 
-	if (dn_mask)
-		dnotify_parent(dentry, dn_mask);
-	if (in_mask) {
+	if (mask) {
 		if (S_ISDIR(inode->i_mode))
-			in_mask |= FS_IN_ISDIR;
-		inotify_inode_queue_event(inode, in_mask, 0, NULL, NULL);
+			mask |= FS_IN_ISDIR;
+		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
-		fsnotify_parent(dentry, in_mask);
-		fsnotify(inode, in_mask, inode, FSNOTIFY_EVENT_INODE);
+		fsnotify_parent(dentry, mask);
+		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 13d2dd5700491..9ea800e840f13 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -57,6 +57,9 @@
 				   FS_MOVED_FROM | FS_MOVED_TO | FS_CREATE |\
 				   FS_DELETE)
 
+/* listeners that hard code group numbers near the top */
+#define DNOTIFY_GROUP_NUM	UINT_MAX
+
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
-- 
GitLab


From a2d8bc6cb4a3024661baf877242f123787d0c054 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:37 -0400
Subject: [PATCH 2096/2198] fsnotify: generic notification queue and waitq

inotify needs to do asyc notification in which event information is stored
on a queue until the listener is ready to receive it.  This patch
implements a generic notification queue for inotify (and later fanotify) to
store events to be sent at a later time.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/fsnotify.h             |   9 ++
 fs/notify/group.c                |   9 ++
 fs/notify/notification.c         | 230 ++++++++++++++++++++++++++++++-
 include/linux/fsnotify_backend.h |  37 +++++
 4 files changed, 278 insertions(+), 7 deletions(-)

diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
index 83b8ec0a8ec2f..4dc240824b2df 100644
--- a/fs/notify/fsnotify.h
+++ b/fs/notify/fsnotify.h
@@ -13,8 +13,12 @@ extern struct list_head fsnotify_groups;
 /* all bitwise OR of all event types (FS_*) for all fsnotify_groups */
 extern __u32 fsnotify_mask;
 
+/* destroy all events sitting in this groups notification queue */
+extern void fsnotify_flush_notify(struct fsnotify_group *group);
+
 /* final kfree of a group */
 extern void fsnotify_final_destroy_group(struct fsnotify_group *group);
+
 /* run the list of all marks associated with inode and flag them to be freed */
 extern void fsnotify_clear_marks_by_inode(struct inode *inode);
 /*
@@ -22,4 +26,9 @@ extern void fsnotify_clear_marks_by_inode(struct inode *inode);
  * about events that happen to its children.
  */
 extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
+
+/* allocate and destroy and event holder to attach events to notification/access queues */
+extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void);
+extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder);
+
 #endif	/* __FS_NOTIFY_FSNOTIFY_H_ */
diff --git a/fs/notify/group.c b/fs/notify/group.c
index a29d2fa679272..0e1677144bc54 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -91,6 +91,9 @@ static void fsnotify_get_group(struct fsnotify_group *group)
  */
 void fsnotify_final_destroy_group(struct fsnotify_group *group)
 {
+	/* clear the notification queue of all events */
+	fsnotify_flush_notify(group);
+
 	if (group->ops->free_group_priv)
 		group->ops->free_group_priv(group);
 
@@ -214,6 +217,12 @@ struct fsnotify_group *fsnotify_obtain_group(unsigned int group_num, __u32 mask,
 	group->group_num = group_num;
 	group->mask = mask;
 
+	mutex_init(&group->notification_mutex);
+	INIT_LIST_HEAD(&group->notification_list);
+	init_waitqueue_head(&group->notification_waitq);
+	group->q_len = 0;
+	group->max_events = UINT_MAX;
+
 	spin_lock_init(&group->mark_lock);
 	atomic_set(&group->num_marks, 0);
 	INIT_LIST_HEAD(&group->mark_entries);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index b8e9a87f8f585..dddecc74e63d2 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -16,6 +16,21 @@
  *  the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 
+/*
+ * Basic idea behind the notification queue: An fsnotify group (like inotify)
+ * sends the userspace notification about events asyncronously some time after
+ * the event happened.  When inotify gets an event it will need to add that
+ * event to the group notify queue.  Since a single event might need to be on
+ * multiple group's notification queues we can't add the event directly to each
+ * queue and instead add a small "event_holder" to each queue.  This event_holder
+ * has a pointer back to the original event.  Since the majority of events are
+ * going to end up on one, and only one, notification queue we embed one
+ * event_holder into each event.  This means we have a single allocation instead
+ * of always needing two.  If the embedded event_holder is already in use by
+ * another group a new event_holder (from fsnotify_event_holder_cachep) will be
+ * allocated and used.
+ */
+
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -33,6 +48,21 @@
 #include "fsnotify.h"
 
 static struct kmem_cache *fsnotify_event_cachep;
+static struct kmem_cache *fsnotify_event_holder_cachep;
+/*
+ * This is a magic event we send when the q is too full.  Since it doesn't
+ * hold real event information we just keep one system wide and use it any time
+ * it is needed.  It's refcnt is set 1 at kernel init time and will never
+ * get set to 0 so it will never get 'freed'
+ */
+static struct fsnotify_event q_overflow_event;
+
+/* return true if the notify queue is empty, false otherwise */
+bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
+{
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+	return list_empty(&group->notification_list) ? true : false;
+}
 
 void fsnotify_get_event(struct fsnotify_event *event)
 {
@@ -52,19 +82,176 @@ void fsnotify_put_event(struct fsnotify_event *event)
 	}
 }
 
+struct fsnotify_event_holder *fsnotify_alloc_event_holder(void)
+{
+	return kmem_cache_alloc(fsnotify_event_holder_cachep, GFP_KERNEL);
+}
+
+void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
+{
+	kmem_cache_free(fsnotify_event_holder_cachep, holder);
+}
+
+/*
+ * check if 2 events contain the same information.
+ */
+static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
+{
+	if ((old->mask == new->mask) &&
+	    (old->to_tell == new->to_tell) &&
+	    (old->data_type == new->data_type)) {
+		switch (old->data_type) {
+		case (FSNOTIFY_EVENT_INODE):
+			if (old->inode == new->inode)
+				return true;
+			break;
+		case (FSNOTIFY_EVENT_PATH):
+			if ((old->path.mnt == new->path.mnt) &&
+			    (old->path.dentry == new->path.dentry))
+				return true;
+		case (FSNOTIFY_EVENT_NONE):
+			return true;
+		};
+	}
+	return false;
+}
+
 /*
- * Allocate a new event which will be sent to each group's handle_event function
- * if the group was interested in this particular event.
+ * Add an event to the group notification queue.  The group can later pull this
+ * event off the queue to deal with.  If the event is successfully added to the
+ * group's notification queue, a reference is taken on event.
  */
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-					     void *data, int data_type)
+int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct fsnotify_event_holder *holder = NULL;
+	struct list_head *list = &group->notification_list;
+	struct fsnotify_event_holder *last_holder;
+	struct fsnotify_event *last_event;
+
+	/*
+	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
+	 * Check if we expect to be able to use that holder.  If not alloc a new
+	 * holder.
+	 * For the overflow event it's possible that something will use the in
+	 * event holder before we get the lock so we may need to jump back and
+	 * alloc a new holder, this can't happen for most events...
+	 */
+	if (!list_empty(&event->holder.event_list)) {
+alloc_holder:
+		holder = fsnotify_alloc_event_holder();
+		if (!holder)
+			return -ENOMEM;
+	}
+
+	mutex_lock(&group->notification_mutex);
+
+	if (group->q_len >= group->max_events)
+		event = &q_overflow_event;
+
+	spin_lock(&event->lock);
+
+	if (list_empty(&event->holder.event_list)) {
+		if (unlikely(holder))
+			fsnotify_destroy_event_holder(holder);
+		holder = &event->holder;
+	} else if (unlikely(!holder)) {
+		/* between the time we checked above and got the lock the in
+		 * event holder was used, go back and get a new one */
+		spin_unlock(&event->lock);
+		mutex_unlock(&group->notification_mutex);
+		goto alloc_holder;
+	}
+
+	if (!list_empty(list)) {
+		last_holder = list_entry(list->prev, struct fsnotify_event_holder, event_list);
+		last_event = last_holder->event;
+		if (event_compare(last_event, event)) {
+			spin_unlock(&event->lock);
+			mutex_unlock(&group->notification_mutex);
+			if (holder != &event->holder)
+				fsnotify_destroy_event_holder(holder);
+			return 0;
+		}
+	}
+
+	group->q_len++;
+	holder->event = event;
+
+	fsnotify_get_event(event);
+	list_add_tail(&holder->event_list, list);
+	spin_unlock(&event->lock);
+	mutex_unlock(&group->notification_mutex);
+
+	wake_up(&group->notification_waitq);
+	return 0;
+}
+
+/*
+ * Remove and return the first event from the notification list.  There is a
+ * reference held on this event since it was on the list.  It is the responsibility
+ * of the caller to drop this reference.
+ */
+struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
+	struct fsnotify_event_holder *holder;
 
-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
-	if (!event)
-		return NULL;
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
 
+	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
+
+	event = holder->event;
+
+	spin_lock(&event->lock);
+	holder->event = NULL;
+	list_del_init(&holder->event_list);
+	spin_unlock(&event->lock);
+
+	/* event == holder means we are referenced through the in event holder */
+	if (holder != &event->holder)
+		fsnotify_destroy_event_holder(holder);
+
+	group->q_len--;
+
+	return event;
+}
+
+/*
+ * This will not remove the event, that must be done with fsnotify_remove_notify_event()
+ */
+struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
+{
+	struct fsnotify_event *event;
+	struct fsnotify_event_holder *holder;
+
+	BUG_ON(!mutex_is_locked(&group->notification_mutex));
+
+	holder = list_first_entry(&group->notification_list, struct fsnotify_event_holder, event_list);
+	event = holder->event;
+
+	return event;
+}
+
+/*
+ * Called when a group is being torn down to clean up any outstanding
+ * event notifications.
+ */
+void fsnotify_flush_notify(struct fsnotify_group *group)
+{
+	struct fsnotify_event *event;
+
+	mutex_lock(&group->notification_mutex);
+	while (!fsnotify_notify_queue_is_empty(group)) {
+		event = fsnotify_remove_notify_event(group);
+		fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
+	}
+	mutex_unlock(&group->notification_mutex);
+}
+
+static void initialize_event(struct fsnotify_event *event)
+{
+	event->holder.event = NULL;
+	INIT_LIST_HEAD(&event->holder.event_list);
 	atomic_set(&event->refcnt, 1);
 
 	spin_lock_init(&event->lock);
@@ -72,7 +259,32 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 	event->path.dentry = NULL;
 	event->path.mnt = NULL;
 	event->inode = NULL;
+	event->data_type = FSNOTIFY_EVENT_NONE;
 
+	event->to_tell = NULL;
+}
+
+/*
+ * fsnotify_create_event - Allocate a new event which will be sent to each
+ * group's handle_event function if the group was interested in this
+ * particular event.
+ *
+ * @to_tell the inode which is supposed to receive the event (sometimes a
+ *	parent of the inode to which the event happened.
+ * @mask what actually happened.
+ * @data pointer to the object which was actually affected
+ * @data_type flag indication if the data is a file, path, inode, nothing...
+ */
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
+					     void *data, int data_type)
+{
+	struct fsnotify_event *event;
+
+	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	if (!event)
+		return NULL;
+
+	initialize_event(event);
 	event->to_tell = to_tell;
 
 	switch (data_type) {
@@ -114,6 +326,10 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 __init int fsnotify_notification_init(void)
 {
 	fsnotify_event_cachep = KMEM_CACHE(fsnotify_event, SLAB_PANIC);
+	fsnotify_event_holder_cachep = KMEM_CACHE(fsnotify_event_holder, SLAB_PANIC);
+
+	initialize_event(&q_overflow_event);
+	q_overflow_event.mask = FS_Q_OVERFLOW;
 
 	return 0;
 }
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 9ea800e840f13..15f8f82a5c57d 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -119,6 +119,13 @@ struct fsnotify_group {
 
 	const struct fsnotify_ops *ops;	/* how this group handles things */
 
+	/* needed to send notification to userspace */
+	struct mutex notification_mutex;	/* protect the notification_list */
+	struct list_head notification_list;	/* list of event_holder this group needs to send to userspace */
+	wait_queue_head_t notification_waitq;	/* read() on the notification file blocks on this waitq */
+	unsigned int q_len;			/* events on the queue */
+	unsigned int max_events;		/* maximum events allowed on the list */
+
 	/* stores all fastapth entries assoc with this group so they can be cleaned on unregister */
 	spinlock_t mark_lock;		/* protect mark_entries list */
 	atomic_t num_marks;		/* 1 for each mark entry and 1 for not being
@@ -135,12 +142,33 @@ struct fsnotify_group {
 	};
 };
 
+/*
+ * A single event can be queued in multiple group->notification_lists.
+ *
+ * each group->notification_list will point to an event_holder which in turns points
+ * to the actual event that needs to be sent to userspace.
+ *
+ * Seemed cheaper to create a refcnt'd event and a small holder for every group
+ * than create a different event for every group
+ *
+ */
+struct fsnotify_event_holder {
+	struct fsnotify_event *event;
+	struct list_head event_list;
+};
+
 /*
  * all of the information about the original object we want to now send to
  * a group.  If you want to carry more info from the accessing task to the
  * listener this structure is where you need to be adding fields.
  */
 struct fsnotify_event {
+	/*
+	 * If we create an event we are also likely going to need a holder
+	 * to link to a group.  So embed one holder in the event.  Means only
+	 * one allocation for the common case where we only have one group
+	 */
+	struct fsnotify_event_holder holder;
 	spinlock_t lock;	/* protection for the associated event_holder and private_list */
 	/* to_tell may ONLY be dereferenced during handle_event(). */
 	struct inode *to_tell;	/* either the inode the event happened to or its parent */
@@ -264,6 +292,15 @@ extern void fsnotify_put_event(struct fsnotify_event *event);
 extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
 									struct fsnotify_event *event);
 
+/* attach the event to the group notification queue */
+extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event);
+/* true if the group notification queue is empty */
+extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
+/* return, but do not dequeue the first event on the notification queue */
+extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group);
+/* reutnr AND dequeue the first event on the notification queue */
+extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group);
+
 /* functions used to manipulate the marks attached to inodes */
 
 /* run all marks associated with an inode and update inode->i_fsnotify_mask */
-- 
GitLab


From 62ffe5dfba056f7ba81d710fee9f28c58a42fdd6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:43 -0400
Subject: [PATCH 2097/2198] fsnotify: include pathnames with entries when
 possible

When inotify wants to send events to a directory about a child it includes
the name of the original file.  This patch collects that filename and makes
it available for notification.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/fsnotify.c             |  7 ++++---
 fs/notify/notification.c         | 16 +++++++++++++++-
 include/linux/fsnotify.h         | 28 ++++++++++++++--------------
 include/linux/fsnotify_backend.h | 11 ++++++++---
 4 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 7fc760067a623..675129fa9fdd7 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -114,7 +114,8 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 		 * specifies these are events which came from a child. */
 		mask |= FS_EVENT_ON_CHILD;
 
-		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE);
+		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
+			 dentry->d_name.name);
 		dput(parent);
 	}
 
@@ -131,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent);
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name)
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
@@ -156,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data, data_is);
+				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name);
 				/* shit, we OOM'd and now we can't tell, maybe
 				 * someday someone else will want to do something
 				 * here */
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index dddecc74e63d2..c69b18b9aba51 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -78,6 +78,7 @@ void fsnotify_put_event(struct fsnotify_event *event)
 		if (event->data_type == FSNOTIFY_EVENT_PATH)
 			path_put(&event->path);
 
+		kfree(event->file_name);
 		kmem_cache_free(fsnotify_event_cachep, event);
 	}
 }
@@ -262,6 +263,9 @@ static void initialize_event(struct fsnotify_event *event)
 	event->data_type = FSNOTIFY_EVENT_NONE;
 
 	event->to_tell = NULL;
+
+	event->file_name = NULL;
+	event->name_len = 0;
 }
 
 /*
@@ -274,9 +278,10 @@ static void initialize_event(struct fsnotify_event *event)
  * @mask what actually happened.
  * @data pointer to the object which was actually affected
  * @data_type flag indication if the data is a file, path, inode, nothing...
+ * @name the filename, if available
  */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-					     void *data, int data_type)
+					     void *data, int data_type, const char *name)
 {
 	struct fsnotify_event *event;
 
@@ -285,6 +290,15 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		return NULL;
 
 	initialize_event(event);
+
+	if (name) {
+		event->file_name = kstrdup(name, GFP_KERNEL);
+		if (!event->file_name) {
+			kmem_cache_free(fsnotify_event_cachep, event);
+			return NULL;
+		}
+		event->name_len = strlen(event->file_name);
+	}
 	event->to_tell = to_tell;
 
 	switch (data_type) {
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index db12d9de35261..180740e9ec826 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -58,7 +58,7 @@ static inline void fsnotify_link_count(struct inode *inode)
 {
 	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
 
-	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -91,8 +91,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
 				  source);
 
-	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE);
-	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE);
+	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name);
+	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name);
 
 	if (target) {
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
@@ -104,7 +104,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
-		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE);
+		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL);
 	}
 	audit_inode_child(new_name, moved, new_dir);
 }
@@ -138,7 +138,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
 
-	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL);
 	__fsnotify_inode_delete(inode);
 }
 
@@ -151,7 +151,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
 }
 
 /*
@@ -166,7 +166,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
 
-	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name);
 }
 
 /*
@@ -180,7 +180,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
 }
 
 /*
@@ -197,7 +197,7 @@ static inline void fsnotify_access(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -214,7 +214,7 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -231,7 +231,7 @@ static inline void fsnotify_open(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -250,7 +250,7 @@ static inline void fsnotify_close(struct file *file)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL);
 }
 
 /*
@@ -267,7 +267,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 }
 
 /*
@@ -303,7 +303,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 		fsnotify_parent(dentry, mask);
-		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE);
+		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 15f8f82a5c57d..52692f405890f 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -192,6 +192,9 @@ struct fsnotify_event {
 	int data_type;		/* which of the above union we have */
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
+
+	char *file_name;
+	size_t name_len;
 };
 
 /*
@@ -224,7 +227,7 @@ struct fsnotify_mark_entry {
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
-extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is);
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name);
 extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
 
@@ -319,10 +322,12 @@ extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-						    void *data, int data_is);
+						    void *data, int data_is, const char *name);
+
 #else
 
-static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is)
+static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+			    const char *name);
 {}
 
 static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
-- 
GitLab


From 47882c6f51e8ef41fbbe2bbb746a1ea3228dd7ca Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:47 -0400
Subject: [PATCH 2098/2198] fsnotify: add correlations between events

As part of the standard inotify events it includes a correlation cookie
between two dentry move operations.  This patch includes the same behaviour
in fsnotify events.  It is needed so that inotify userspace can be
implemented on top of fsnotify.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/fsnotify.c             |  6 +++---
 fs/notify/notification.c         | 20 ++++++++++++++++--
 include/linux/fsnotify.h         | 35 ++++++++++++++++----------------
 include/linux/fsnotify_backend.h | 15 +++++++++++---
 4 files changed, 51 insertions(+), 25 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 675129fa9fdd7..f11d75f023684 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -115,7 +115,7 @@ void __fsnotify_parent(struct dentry *dentry, __u32 mask)
 		mask |= FS_EVENT_ON_CHILD;
 
 		fsnotify(p_inode, mask, dentry->d_inode, FSNOTIFY_EVENT_INODE,
-			 dentry->d_name.name);
+			 dentry->d_name.name, 0);
 		dput(parent);
 	}
 
@@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent);
  * out to all of the registered fsnotify_group.  Those groups can then use the
  * notification event in whatever means they feel necessary.
  */
-void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name)
+void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *file_name, u32 cookie)
 {
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
@@ -157,7 +157,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name);
+				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
 				/* shit, we OOM'd and now we can't tell, maybe
 				 * someday someone else will want to do something
 				 * here */
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index c69b18b9aba51..346f6e5c35534 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/mutex.h>
 #include <linux/namei.h>
@@ -56,6 +57,17 @@ static struct kmem_cache *fsnotify_event_holder_cachep;
  * get set to 0 so it will never get 'freed'
  */
 static struct fsnotify_event q_overflow_event;
+static atomic_t fsnotify_sync_cookie = ATOMIC_INIT(0);
+
+/**
+ * fsnotify_get_cookie - return a unique cookie for use in synchronizing events.
+ * Called from fsnotify_move, which is inlined into filesystem modules.
+ */
+u32 fsnotify_get_cookie(void)
+{
+	return atomic_inc_return(&fsnotify_sync_cookie);
+}
+EXPORT_SYMBOL_GPL(fsnotify_get_cookie);
 
 /* return true if the notify queue is empty, false otherwise */
 bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group)
@@ -266,6 +278,8 @@ static void initialize_event(struct fsnotify_event *event)
 
 	event->file_name = NULL;
 	event->name_len = 0;
+
+	event->sync_cookie = 0;
 }
 
 /*
@@ -280,8 +294,8 @@ static void initialize_event(struct fsnotify_event *event)
  * @data_type flag indication if the data is a file, path, inode, nothing...
  * @name the filename, if available
  */
-struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-					     void *data, int data_type, const char *name)
+struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
+					     int data_type, const char *name, u32 cookie)
 {
 	struct fsnotify_event *event;
 
@@ -299,6 +313,8 @@ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 		}
 		event->name_len = strlen(event->file_name);
 	}
+
+	event->sync_cookie = cookie;
 	event->to_tell = to_tell;
 
 	switch (data_type) {
diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 180740e9ec826..c25b39ddd62ae 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -58,7 +58,7 @@ static inline void fsnotify_link_count(struct inode *inode)
 {
 	inotify_inode_queue_event(inode, IN_ATTRIB, 0, NULL, NULL);
 
-	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, FS_ATTRIB, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -69,7 +69,8 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 				 int isdir, struct inode *target, struct dentry *moved)
 {
 	struct inode *source = moved->d_inode;
-	u32 cookie = inotify_get_cookie();
+	u32 in_cookie = inotify_get_cookie();
+	u32 fs_cookie = fsnotify_get_cookie();
 	__u32 old_dir_mask = 0;
 	__u32 new_dir_mask = 0;
 
@@ -86,13 +87,13 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	old_dir_mask |= FS_MOVED_FROM;
 	new_dir_mask |= FS_MOVED_TO;
 
-	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir,cookie,old_name,
+	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name,
 				  source);
-	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, cookie, new_name,
+	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name,
 				  source);
 
-	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name);
-	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name);
+	fsnotify(old_dir, old_dir_mask, old_dir, FSNOTIFY_EVENT_INODE, old_name, fs_cookie);
+	fsnotify(new_dir, new_dir_mask, new_dir, FSNOTIFY_EVENT_INODE, new_name, fs_cookie);
 
 	if (target) {
 		inotify_inode_queue_event(target, IN_DELETE_SELF, 0, NULL, NULL);
@@ -104,7 +105,7 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 
 	if (source) {
 		inotify_inode_queue_event(source, IN_MOVE_SELF, 0, NULL, NULL);
-		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL);
+		fsnotify(source, FS_MOVE_SELF, moved->d_inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
 	audit_inode_child(new_name, moved, new_dir);
 }
@@ -138,7 +139,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
 	inotify_inode_queue_event(inode, IN_DELETE_SELF, 0, NULL, NULL);
 	inotify_inode_is_dead(inode);
 
-	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	__fsnotify_inode_delete(inode);
 }
 
@@ -151,7 +152,7 @@ static inline void fsnotify_create(struct inode *inode, struct dentry *dentry)
 				  dentry->d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
+	fsnotify(inode, FS_CREATE, dentry->d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
 }
 
 /*
@@ -166,7 +167,7 @@ static inline void fsnotify_link(struct inode *dir, struct inode *inode, struct
 	fsnotify_link_count(inode);
 	audit_inode_child(new_dentry->d_name.name, new_dentry, dir);
 
-	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name);
+	fsnotify(dir, FS_CREATE, inode, FSNOTIFY_EVENT_INODE, new_dentry->d_name.name, 0);
 }
 
 /*
@@ -180,7 +181,7 @@ static inline void fsnotify_mkdir(struct inode *inode, struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, dentry->d_name.name, d_inode);
 	audit_inode_child(dentry->d_name.name, dentry, inode);
 
-	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name);
+	fsnotify(inode, mask, d_inode, FSNOTIFY_EVENT_INODE, dentry->d_name.name, 0);
 }
 
 /*
@@ -197,7 +198,7 @@ static inline void fsnotify_access(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -214,7 +215,7 @@ static inline void fsnotify_modify(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -231,7 +232,7 @@ static inline void fsnotify_open(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -250,7 +251,7 @@ static inline void fsnotify_close(struct file *file)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL);
+	fsnotify(inode, mask, file, FSNOTIFY_EVENT_FILE, NULL, 0);
 }
 
 /*
@@ -267,7 +268,7 @@ static inline void fsnotify_xattr(struct dentry *dentry)
 	inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 	fsnotify_parent(dentry, mask);
-	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+	fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 }
 
 /*
@@ -303,7 +304,7 @@ static inline void fsnotify_change(struct dentry *dentry, unsigned int ia_valid)
 		inotify_inode_queue_event(inode, mask, 0, NULL, NULL);
 
 		fsnotify_parent(dentry, mask);
-		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL);
+		fsnotify(inode, mask, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
 	}
 }
 
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 52692f405890f..b78b5573d227d 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -193,6 +193,7 @@ struct fsnotify_event {
 	atomic_t refcnt;	/* how many groups still are using/need to send this event */
 	__u32 mask;		/* the type of access, bitwise OR for FS_* event types */
 
+	u32 sync_cookie;	/* used to corrolate events, namely inotify mv events */
 	char *file_name;
 	size_t name_len;
 };
@@ -227,9 +228,11 @@ struct fsnotify_mark_entry {
 /* called from the vfs helpers */
 
 /* main fsnotify call to send events */
-extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name);
+extern void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
+		     const char *name, u32 cookie);
 extern void __fsnotify_parent(struct dentry *dentry, __u32 mask);
 extern void __fsnotify_inode_delete(struct inode *inode);
+extern u32 fsnotify_get_cookie(void);
 
 static inline int fsnotify_inode_watches_children(struct inode *inode)
 {
@@ -322,12 +325,13 @@ extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
-						    void *data, int data_is, const char *name);
+						    void *data, int data_is, const char *name,
+						    u32 cookie);
 
 #else
 
 static inline void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is,
-			    const char *name);
+			    const char *name, u32 cookie)
 {}
 
 static inline void __fsnotify_parent(struct dentry *dentry, __u32 mask)
@@ -342,6 +346,11 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry)
 static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode)
 {}
 
+static inline u32 fsnotify_get_cookie(void)
+{
+	return 0;
+}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
GitLab


From e4aff117368cfdd3567ee41844d216d079b55173 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:50 -0400
Subject: [PATCH 2099/2198] fsnotify: allow groups to add private data to
 events

inotify needs per group information attached to events.  This patch allows
groups to attach private information and implements a callback so that
information can be freed when an event is being destroyed.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/dnotify/dnotify.c      |  1 +
 fs/notify/notification.c         | 52 +++++++++++++++++++++++++++++---
 include/linux/fsnotify_backend.h | 24 ++++++++++++---
 3 files changed, 68 insertions(+), 9 deletions(-)

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index d9d80f502c6f9..12f9e6b1ffe2b 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -183,6 +183,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 	.should_send_event = dnotify_should_send_event,
 	.free_group_priv = NULL,
 	.freeing_mark = dnotify_freeing_mark,
+	.free_event_priv = NULL,
 };
 
 /*
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 346f6e5c35534..959b73e756fd8 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -90,6 +90,8 @@ void fsnotify_put_event(struct fsnotify_event *event)
 		if (event->data_type == FSNOTIFY_EVENT_PATH)
 			path_put(&event->path);
 
+		BUG_ON(!list_empty(&event->private_data_list));
+
 		kfree(event->file_name);
 		kmem_cache_free(fsnotify_event_cachep, event);
 	}
@@ -106,7 +108,29 @@ void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder)
 }
 
 /*
- * check if 2 events contain the same information.
+ * Find the private data that the group previously attached to this event when
+ * the group added the event to the notification queue (fsnotify_add_notify_event)
+ */
+struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct fsnotify_event_private_data *lpriv;
+	struct fsnotify_event_private_data *priv = NULL;
+
+	assert_spin_locked(&event->lock);
+
+	list_for_each_entry(lpriv, &event->private_data_list, event_list) {
+		if (lpriv->group == group) {
+			priv = lpriv;
+			list_del(&priv->event_list);
+			break;
+		}
+	}
+	return priv;
+}
+
+/*
+ * Check if 2 events contain the same information.  We do not compare private data
+ * but at this moment that isn't a problem for any know fsnotify listeners.
  */
 static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new)
 {
@@ -134,13 +158,17 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
  * event off the queue to deal with.  If the event is successfully added to the
  * group's notification queue, a reference is taken on event.
  */
-int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event)
+int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+			      struct fsnotify_event_private_data *priv)
 {
 	struct fsnotify_event_holder *holder = NULL;
 	struct list_head *list = &group->notification_list;
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
 
+	/* easy to tell if priv was attached to the event */
+	INIT_LIST_HEAD(&priv->event_list);
+
 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
 	 * Check if we expect to be able to use that holder.  If not alloc a new
@@ -158,8 +186,11 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 
 	mutex_lock(&group->notification_mutex);
 
-	if (group->q_len >= group->max_events)
+	if (group->q_len >= group->max_events) {
 		event = &q_overflow_event;
+		/* sorry, no private data on the overflow event */
+		priv = NULL;
+	}
 
 	spin_lock(&event->lock);
 
@@ -183,7 +214,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 			mutex_unlock(&group->notification_mutex);
 			if (holder != &event->holder)
 				fsnotify_destroy_event_holder(holder);
-			return 0;
+			return -EEXIST;
 		}
 	}
 
@@ -192,6 +223,8 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 
 	fsnotify_get_event(event);
 	list_add_tail(&holder->event_list, list);
+	if (priv)
+		list_add_tail(&priv->event_list, &event->private_data_list);
 	spin_unlock(&event->lock);
 	mutex_unlock(&group->notification_mutex);
 
@@ -252,10 +285,19 @@ struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group)
 void fsnotify_flush_notify(struct fsnotify_group *group)
 {
 	struct fsnotify_event *event;
+	struct fsnotify_event_private_data *priv;
 
 	mutex_lock(&group->notification_mutex);
 	while (!fsnotify_notify_queue_is_empty(group)) {
 		event = fsnotify_remove_notify_event(group);
+		/* if they don't implement free_event_priv they better not have attached any */
+		if (group->ops->free_event_priv) {
+			spin_lock(&event->lock);
+			priv = fsnotify_remove_priv_from_event(group, event);
+			spin_unlock(&event->lock);
+			if (priv)
+				group->ops->free_event_priv(priv);
+		}
 		fsnotify_put_event(event); /* matches fsnotify_add_notify_event */
 	}
 	mutex_unlock(&group->notification_mutex);
@@ -274,6 +316,8 @@ static void initialize_event(struct fsnotify_event *event)
 	event->inode = NULL;
 	event->data_type = FSNOTIFY_EVENT_NONE;
 
+	INIT_LIST_HEAD(&event->private_data_list);
+
 	event->to_tell = NULL;
 
 	event->file_name = NULL;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index b78b5573d227d..efdf9e442d86b 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -63,6 +63,7 @@
 struct fsnotify_group;
 struct fsnotify_event;
 struct fsnotify_mark_entry;
+struct fsnotify_event_private_data;
 
 /*
  * Each group much define these ops.  The fsnotify infrastructure will call
@@ -81,6 +82,7 @@ struct fsnotify_ops {
 	int (*handle_event)(struct fsnotify_group *group, struct fsnotify_event *event);
 	void (*free_group_priv)(struct fsnotify_group *group);
 	void (*freeing_mark)(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+	void (*free_event_priv)(struct fsnotify_event_private_data *priv);
 };
 
 /*
@@ -157,6 +159,15 @@ struct fsnotify_event_holder {
 	struct list_head event_list;
 };
 
+/*
+ * Inotify needs to tack data onto an event.  This struct lets us later find the
+ * correct private data of the correct group.
+ */
+struct fsnotify_event_private_data {
+	struct fsnotify_group *group;
+	struct list_head event_list;
+};
+
 /*
  * all of the information about the original object we want to now send to
  * a group.  If you want to carry more info from the accessing task to the
@@ -196,6 +207,8 @@ struct fsnotify_event {
 	u32 sync_cookie;	/* used to corrolate events, namely inotify mv events */
 	char *file_name;
 	size_t name_len;
+
+	struct list_head private_data_list;	/* groups can store private data here */
 };
 
 /*
@@ -294,17 +307,18 @@ extern void fsnotify_put_group(struct fsnotify_group *group);
 /* take a reference to an event */
 extern void fsnotify_get_event(struct fsnotify_event *event);
 extern void fsnotify_put_event(struct fsnotify_event *event);
-/* find private data previously attached to an event */
-extern struct fsnotify_event_private_data *fsnotify_get_priv_from_event(struct fsnotify_group *group,
-									struct fsnotify_event *event);
+/* find private data previously attached to an event and unlink it */
+extern struct fsnotify_event_private_data *fsnotify_remove_priv_from_event(struct fsnotify_group *group,
+									   struct fsnotify_event *event);
 
 /* attach the event to the group notification queue */
-extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event);
+extern int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_event *event,
+				     struct fsnotify_event_private_data *priv);
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_peek_notify_event(struct fsnotify_group *group);
-/* reutnr AND dequeue the first event on the notification queue */
+/* return AND dequeue the first event on the notification queue */
 extern struct fsnotify_event *fsnotify_remove_notify_event(struct fsnotify_group *group);
 
 /* functions used to manipulate the marks attached to inodes */
-- 
GitLab


From 1ef5f13c6c8acd3fd10db9f1743f3b4cf30a4abb Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:54 -0400
Subject: [PATCH 2100/2198] fsnotify: fsnotify marks on inodes pin them in core

This patch pins any inodes with an fsnotify mark in core.  The idea is that
as soon as the mark is removed from the inode->fsnotify_mark_entries list
the inode will be iput.  In reality is doesn't quite work exactly this way.
The igrab will happen when the mark is added to an inode, but the iput will
happen when the inode pointer is NULL'd inside the mark.

It's possible that 2 racing things will try to remove the mark from
different directions.  One may try to remove the mark because of an
explicit request and one might try to remove it because the inode was
deleted.  It's possible that the removal because of inode deletion will
remove the mark from the inode's list, but the removal by explicit request
will actually set entry->inode == NULL; and call the iput.  This is safe.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/notify/inode_mark.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index a39534845b282..282150f74cfaa 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -204,6 +204,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	 */
 
 
+	iput(inode);
+
 	/*
 	 * it's possible that this group tried to destroy itself, but this
 	 * this mark was simultaneously being freed by inode.  If that's the
@@ -306,6 +308,10 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 	struct fsnotify_mark_entry *lentry;
 	int ret = 0;
 
+	inode = igrab(inode);
+	if (unlikely(!inode))
+		return -EINVAL;
+
 	/*
 	 * LOCKING ORDER!!!!
 	 * entry->lock
@@ -337,6 +343,7 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 
 	if (lentry) {
 		ret = -EEXIST;
+		iput(inode);
 		fsnotify_put_mark(lentry);
 	} else {
 		__fsnotify_update_child_dentry_flags(inode);
-- 
GitLab


From 164bc6195139047faaf5ada1278332e99494803b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:01:58 -0400
Subject: [PATCH 2101/2198] fsnotify: handle filesystem unmounts with fsnotify
 marks

When an fs is unmounted with an fsnotify mark entry attached to one of its
inodes we need to destroy that mark entry and we also (like inotify) send
an unmount event.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 fs/inode.c                       |  1 +
 fs/notify/inode_mark.c           | 72 ++++++++++++++++++++++++++++++++
 include/linux/fsnotify_backend.h |  4 ++
 3 files changed, 77 insertions(+)

diff --git a/fs/inode.c b/fs/inode.c
index 54c63ce3de25e..ca337014ae294 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -407,6 +407,7 @@ int invalidate_inodes(struct super_block *sb)
 	mutex_lock(&iprune_mutex);
 	spin_lock(&inode_lock);
 	inotify_unmount_inodes(&sb->s_inodes);
+	fsnotify_unmount_inodes(&sb->s_inodes);
 	busy = invalidate_list(&sb->s_inodes, &throw_away);
 	spin_unlock(&inode_lock);
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 282150f74cfaa..0a499d2c61918 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -89,6 +89,7 @@
 #include <linux/mutex.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
+#include <linux/writeback.h> /* for inode_lock */
 
 #include <asm/atomic.h>
 
@@ -351,3 +352,74 @@ int fsnotify_add_mark(struct fsnotify_mark_entry *entry,
 
 	return ret;
 }
+
+/**
+ * fsnotify_unmount_inodes - an sb is unmounting.  handle any watched inodes.
+ * @list: list of inodes being unmounted (sb->s_inodes)
+ *
+ * Called with inode_lock held, protecting the unmounting super block's list
+ * of inodes, and with iprune_mutex held, keeping shrink_icache_memory() at bay.
+ * We temporarily drop inode_lock, however, and CAN block.
+ */
+void fsnotify_unmount_inodes(struct list_head *list)
+{
+	struct inode *inode, *next_i, *need_iput = NULL;
+
+	list_for_each_entry_safe(inode, next_i, list, i_sb_list) {
+		struct inode *need_iput_tmp;
+
+		/*
+		 * We cannot __iget() an inode in state I_CLEAR, I_FREEING,
+		 * I_WILL_FREE, or I_NEW which is fine because by that point
+		 * the inode cannot have any associated watches.
+		 */
+		if (inode->i_state & (I_CLEAR|I_FREEING|I_WILL_FREE|I_NEW))
+			continue;
+
+		/*
+		 * If i_count is zero, the inode cannot have any watches and
+		 * doing an __iget/iput with MS_ACTIVE clear would actually
+		 * evict all inodes with zero i_count from icache which is
+		 * unnecessarily violent and may in fact be illegal to do.
+		 */
+		if (!atomic_read(&inode->i_count))
+			continue;
+
+		need_iput_tmp = need_iput;
+		need_iput = NULL;
+
+		/* In case fsnotify_inode_delete() drops a reference. */
+		if (inode != need_iput_tmp)
+			__iget(inode);
+		else
+			need_iput_tmp = NULL;
+
+		/* In case the dropping of a reference would nuke next_i. */
+		if ((&next_i->i_sb_list != list) &&
+		    atomic_read(&next_i->i_count) &&
+		    !(next_i->i_state & (I_CLEAR | I_FREEING | I_WILL_FREE))) {
+			__iget(next_i);
+			need_iput = next_i;
+		}
+
+		/*
+		 * We can safely drop inode_lock here because we hold
+		 * references on both inode and next_i.  Also no new inodes
+		 * will be added since the umount has begun.  Finally,
+		 * iprune_mutex keeps shrink_icache_memory() away.
+		 */
+		spin_unlock(&inode_lock);
+
+		if (need_iput_tmp)
+			iput(need_iput_tmp);
+
+		/* for each watch, send FS_UNMOUNT and then remove it */
+		fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
+
+		fsnotify_inode_delete(inode);
+
+		iput(inode);
+
+		spin_lock(&inode_lock);
+	}
+}
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index efdf9e442d86b..d2c0ee30e6184 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -336,6 +336,7 @@ extern void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry);
 extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark_entry *entry);
 extern void fsnotify_put_mark(struct fsnotify_mark_entry *entry);
+extern void fsnotify_unmount_inodes(struct list_head *list);
 
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
@@ -365,6 +366,9 @@ static inline u32 fsnotify_get_cookie(void)
 	return 0;
 }
 
+static inline void fsnotify_unmount_inodes(struct list_head *list)
+{}
+
 #endif	/* CONFIG_FSNOTIFY */
 
 #endif	/* __KERNEL __ */
-- 
GitLab


From 63c882a05416e18de6fb59f7dd6da48f3bbe8273 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 21 May 2009 17:02:01 -0400
Subject: [PATCH 2102/2198] inotify: reimplement inotify using fsnotify

Reimplement inotify_user using fsnotify.  This should be feature for feature
exactly the same as the original inotify_user.  This does not make any changes
to the in kernel inotify feature used by audit.  Those patches (and the eventual
removal of in kernel inotify) will come after the new inotify_user proves to be
working correctly.

Signed-off-by: Eric Paris <eparis@redhat.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
---
 MAINTAINERS                          |   2 +
 fs/notify/inotify/Kconfig            |  20 +-
 fs/notify/inotify/Makefile           |   2 +-
 fs/notify/inotify/inotify.h          |  21 +
 fs/notify/inotify/inotify_fsnotify.c | 137 +++++
 fs/notify/inotify/inotify_user.c     | 837 +++++++++++++--------------
 include/linux/fsnotify_backend.h     |  11 +
 init/Kconfig                         |   3 +-
 8 files changed, 585 insertions(+), 448 deletions(-)
 create mode 100644 fs/notify/inotify/inotify.h
 create mode 100644 fs/notify/inotify/inotify_fsnotify.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 96e0c8c607965..e697b67031a28 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2858,6 +2858,8 @@ P:	John McCutchan
 M:	john@johnmccutchan.com
 P:	Robert Love
 M:	rlove@rlove.org
+P:	Eric Paris
+M:	eparis@parisplace.org
 L:	linux-kernel@vger.kernel.org
 S:	Maintained
 F:	Documentation/filesystems/inotify.txt
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 4467928410233..5356884289a15 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,26 +1,30 @@
 config INOTIFY
 	bool "Inotify file change notification support"
-	default y
+	default n
 	---help---
-	  Say Y here to enable inotify support.  Inotify is a file change
-	  notification system and a replacement for dnotify.  Inotify fixes
-	  numerous shortcomings in dnotify and introduces several new features
-	  including multiple file events, one-shot support, and unmount
-	  notification.
+	  Say Y here to enable legacy in kernel inotify support.  Inotify is a
+	  file change notification system.  It is a replacement for dnotify.
+	  This option only provides the legacy inotify in kernel API.  There
+	  are no in tree kernel users of this interface since it is deprecated.
+	  You only need this if you are loading an out of tree kernel module
+	  that uses inotify.
 
 	  For more information, see <file:Documentation/filesystems/inotify.txt>
 
-	  If unsure, say Y.
+	  If unsure, say N.
 
 config INOTIFY_USER
 	bool "Inotify support for userspace"
-	depends on INOTIFY
+	depends on FSNOTIFY
 	default y
 	---help---
 	  Say Y here to enable inotify support for userspace, including the
 	  associated system calls.  Inotify allows monitoring of both files and
 	  directories via a single open fd.  Events are read from the file
 	  descriptor, which is also select()- and poll()-able.
+	  Inotify fixes numerous shortcomings in dnotify and introduces several
+	  new features including multiple file events, one-shot support, and
+	  unmount notification.
 
 	  For more information, see <file:Documentation/filesystems/inotify.txt>
 
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
index e290f3bb9d8d9..9438281713621 100644
--- a/fs/notify/inotify/Makefile
+++ b/fs/notify/inotify/Makefile
@@ -1,2 +1,2 @@
 obj-$(CONFIG_INOTIFY)		+= inotify.o
-obj-$(CONFIG_INOTIFY_USER)	+= inotify_user.o
+obj-$(CONFIG_INOTIFY_USER)	+= inotify_fsnotify.o inotify_user.o
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
new file mode 100644
index 0000000000000..ea2605a58b8a7
--- /dev/null
+++ b/fs/notify/inotify/inotify.h
@@ -0,0 +1,21 @@
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/slab.h> /* struct kmem_cache */
+
+extern struct kmem_cache *event_priv_cachep;
+
+struct inotify_event_private_data {
+	struct fsnotify_event_private_data fsnotify_event_priv_data;
+	int wd;
+};
+
+struct inotify_inode_mark_entry {
+	/* fsnotify_mark_entry MUST be the first thing */
+	struct fsnotify_mark_entry fsn_entry;
+	int wd;
+};
+
+extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
+
+extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
new file mode 100644
index 0000000000000..160da54868398
--- /dev/null
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -0,0 +1,137 @@
+/*
+ * fs/inotify_user.c - inotify support for userspace
+ *
+ * Authors:
+ *	John McCutchan	<ttb@tentacle.dhs.org>
+ *	Robert Love	<rml@novell.com>
+ *
+ * Copyright (C) 2005 John McCutchan
+ * Copyright 2006 Hewlett-Packard Development Company, L.P.
+ *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewriten to make use of the fsnotify infrastructure
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2, or (at your option) any
+ * later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/inotify.h>
+#include <linux/path.h> /* struct path */
+#include <linux/slab.h> /* kmem_* */
+#include <linux/types.h>
+
+#include "inotify.h"
+
+static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_event *event)
+{
+	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark_entry *ientry;
+	struct inode *to_tell;
+	struct inotify_event_private_data *event_priv;
+	struct fsnotify_event_private_data *fsn_event_priv;
+	int wd, ret;
+
+	to_tell = event->to_tell;
+
+	spin_lock(&to_tell->i_lock);
+	entry = fsnotify_find_mark_entry(group, to_tell);
+	spin_unlock(&to_tell->i_lock);
+	/* race with watch removal?  We already passes should_send */
+	if (unlikely(!entry))
+		return 0;
+	ientry = container_of(entry, struct inotify_inode_mark_entry,
+			      fsn_entry);
+	wd = ientry->wd;
+
+	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+	if (unlikely(!event_priv))
+		return -ENOMEM;
+
+	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+
+	fsn_event_priv->group = group;
+	event_priv->wd = wd;
+
+	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
+	/* EEXIST is not an error */
+	if (ret == -EEXIST)
+		ret = 0;
+
+	/* did event_priv get attached? */
+	if (list_empty(&fsn_event_priv->event_list))
+		inotify_free_event_priv(fsn_event_priv);
+
+	/*
+	 * If we hold the entry until after the event is on the queue
+	 * IN_IGNORED won't be able to pass this event in the queue
+	 */
+	fsnotify_put_mark(entry);
+
+	return ret;
+}
+
+static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+	inotify_destroy_mark_entry(entry, group);
+}
+
+static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
+{
+	struct fsnotify_mark_entry *entry;
+	bool send;
+
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+	if (!entry)
+		return false;
+
+	send = (entry->mask & mask);
+
+	/* find took a reference */
+	fsnotify_put_mark(entry);
+
+	return send;
+}
+
+static int idr_callback(int id, void *p, void *data)
+{
+	BUG();
+	return 0;
+}
+
+static void inotify_free_group_priv(struct fsnotify_group *group)
+{
+	/* ideally the idr is empty and we won't hit the BUG in teh callback */
+	idr_for_each(&group->inotify_data.idr, idr_callback, NULL);
+	idr_remove_all(&group->inotify_data.idr);
+	idr_destroy(&group->inotify_data.idr);
+}
+
+void inotify_free_event_priv(struct fsnotify_event_private_data *fsn_event_priv)
+{
+	struct inotify_event_private_data *event_priv;
+
+
+	event_priv = container_of(fsn_event_priv, struct inotify_event_private_data,
+				  fsnotify_event_priv_data);
+
+	kmem_cache_free(event_priv_cachep, event_priv);
+}
+
+const struct fsnotify_ops inotify_fsnotify_ops = {
+	.handle_event = inotify_handle_event,
+	.should_send_event = inotify_should_send_event,
+	.free_group_priv = inotify_free_group_priv,
+	.free_event_priv = inotify_free_event_priv,
+	.freeing_mark = inotify_freeing_mark,
+};
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1634319e2404d..982a412ac5bc9 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -8,6 +8,9 @@
  * Copyright (C) 2005 John McCutchan
  * Copyright 2006 Hewlett-Packard Development Company, L.P.
  *
+ * Copyright (C) 2009 Eric Paris <Red Hat Inc>
+ * inotify was largely rewriten to make use of the fsnotify infrastructure
+ *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License as published by the
  * Free Software Foundation; either version 2, or (at your option) any
@@ -19,94 +22,48 @@
  * General Public License for more details.
  */
 
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
 #include <linux/file.h>
-#include <linux/mount.h>
-#include <linux/namei.h>
-#include <linux/poll.h>
-#include <linux/init.h>
-#include <linux/list.h>
+#include <linux/fs.h> /* struct inode */
+#include <linux/fsnotify_backend.h>
+#include <linux/idr.h>
+#include <linux/init.h> /* module_init */
 #include <linux/inotify.h>
+#include <linux/kernel.h> /* roundup() */
+#include <linux/magic.h> /* superblock magic number */
+#include <linux/mount.h> /* mntget */
+#include <linux/namei.h> /* LOOKUP_FOLLOW */
+#include <linux/path.h> /* struct path */
+#include <linux/sched.h> /* struct user */
+#include <linux/slab.h> /* struct kmem_cache */
 #include <linux/syscalls.h>
-#include <linux/magic.h>
+#include <linux/types.h>
+#include <linux/uaccess.h>
+#include <linux/poll.h>
+#include <linux/wait.h>
 
-#include <asm/ioctls.h>
+#include "inotify.h"
 
-static struct kmem_cache *watch_cachep __read_mostly;
-static struct kmem_cache *event_cachep __read_mostly;
+#include <asm/ioctls.h>
 
 static struct vfsmount *inotify_mnt __read_mostly;
 
+/* this just sits here and wastes global memory.  used to just pad userspace messages with zeros */
+static struct inotify_event nul_inotify_event;
+
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
-static int inotify_max_user_watches __read_mostly;
 static int inotify_max_queued_events __read_mostly;
+int inotify_max_user_watches __read_mostly;
 
-/*
- * Lock ordering:
- *
- * inotify_dev->up_mutex (ensures we don't re-add the same watch)
- * 	inode->inotify_mutex (protects inode's watch list)
- * 		inotify_handle->mutex (protects inotify_handle's watch list)
- * 			inotify_dev->ev_mutex (protects device's event queue)
- */
+static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
+struct kmem_cache *event_priv_cachep __read_mostly;
+static struct fsnotify_event *inotify_ignored_event;
 
 /*
- * Lifetimes of the main data structures:
- *
- * inotify_device: Lifetime is managed by reference count, from
- * sys_inotify_init() until release.  Additional references can bump the count
- * via get_inotify_dev() and drop the count via put_inotify_dev().
- *
- * inotify_user_watch: Lifetime is from create_watch() to the receipt of an
- * IN_IGNORED event from inotify, or when using IN_ONESHOT, to receipt of the
- * first event, or to inotify_destroy().
+ * When inotify registers a new group it increments this and uses that
+ * value as an offset to set the fsnotify group "name" and priority.
  */
-
-/*
- * struct inotify_device - represents an inotify instance
- *
- * This structure is protected by the mutex 'mutex'.
- */
-struct inotify_device {
-	wait_queue_head_t 	wq;		/* wait queue for i/o */
-	struct mutex		ev_mutex;	/* protects event queue */
-	struct mutex		up_mutex;	/* synchronizes watch updates */
-	struct list_head 	events;		/* list of queued events */
-	struct user_struct	*user;		/* user who opened this dev */
-	struct inotify_handle	*ih;		/* inotify handle */
-	struct fasync_struct    *fa;            /* async notification */
-	atomic_t		count;		/* reference count */
-	unsigned int		queue_size;	/* size of the queue (bytes) */
-	unsigned int		event_count;	/* number of pending events */
-	unsigned int		max_events;	/* maximum number of events */
-};
-
-/*
- * struct inotify_kernel_event - An inotify event, originating from a watch and
- * queued for user-space.  A list of these is attached to each instance of the
- * device.  In read(), this list is walked and all events that can fit in the
- * buffer are returned.
- *
- * Protected by dev->ev_mutex of the device in which we are queued.
- */
-struct inotify_kernel_event {
-	struct inotify_event	event;	/* the user-space event */
-	struct list_head        list;	/* entry in inotify_device's list */
-	char			*name;	/* filename, if any */
-};
-
-/*
- * struct inotify_user_watch - our version of an inotify_watch, we add
- * a reference to the associated inotify_device.
- */
-struct inotify_user_watch {
-	struct inotify_device	*dev;	/* associated device */
-	struct inotify_watch	wdata;	/* inotify watch data */
-};
+static atomic_t inotify_grp_num;
 
 #ifdef CONFIG_SYSCTL
 
@@ -149,280 +106,36 @@ ctl_table inotify_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
-static inline void get_inotify_dev(struct inotify_device *dev)
-{
-	atomic_inc(&dev->count);
-}
-
-static inline void put_inotify_dev(struct inotify_device *dev)
-{
-	if (atomic_dec_and_test(&dev->count)) {
-		atomic_dec(&dev->user->inotify_devs);
-		free_uid(dev->user);
-		kfree(dev);
-	}
-}
-
-/*
- * free_inotify_user_watch - cleans up the watch and its references
- */
-static void free_inotify_user_watch(struct inotify_watch *w)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
-
-	atomic_dec(&dev->user->inotify_watches);
-	put_inotify_dev(dev);
-	kmem_cache_free(watch_cachep, watch);
-}
-
-/*
- * kernel_event - create a new kernel event with the given parameters
- *
- * This function can sleep.
- */
-static struct inotify_kernel_event * kernel_event(s32 wd, u32 mask, u32 cookie,
-						  const char *name)
-{
-	struct inotify_kernel_event *kevent;
-
-	kevent = kmem_cache_alloc(event_cachep, GFP_NOFS);
-	if (unlikely(!kevent))
-		return NULL;
-
-	/* we hand this out to user-space, so zero it just in case */
-	memset(&kevent->event, 0, sizeof(struct inotify_event));
-
-	kevent->event.wd = wd;
-	kevent->event.mask = mask;
-	kevent->event.cookie = cookie;
-
-	INIT_LIST_HEAD(&kevent->list);
-
-	if (name) {
-		size_t len, rem, event_size = sizeof(struct inotify_event);
-
-		/*
-		 * We need to pad the filename so as to properly align an
-		 * array of inotify_event structures.  Because the structure is
-		 * small and the common case is a small filename, we just round
-		 * up to the next multiple of the structure's sizeof.  This is
-		 * simple and safe for all architectures.
-		 */
-		len = strlen(name) + 1;
-		rem = event_size - len;
-		if (len > event_size) {
-			rem = event_size - (len % event_size);
-			if (len % event_size == 0)
-				rem = 0;
-		}
-
-		kevent->name = kmalloc(len + rem, GFP_NOFS);
-		if (unlikely(!kevent->name)) {
-			kmem_cache_free(event_cachep, kevent);
-			return NULL;
-		}
-		memcpy(kevent->name, name, len);
-		if (rem)
-			memset(kevent->name + len, 0, rem);
-		kevent->event.len = len + rem;
-	} else {
-		kevent->event.len = 0;
-		kevent->name = NULL;
-	}
-
-	return kevent;
-}
-
-/*
- * inotify_dev_get_event - return the next event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_event(struct inotify_device *dev)
-{
-	return list_entry(dev->events.next, struct inotify_kernel_event, list);
-}
-
-/*
- * inotify_dev_get_last_event - return the last event in the given dev's queue
- *
- * Caller must hold dev->ev_mutex.
- */
-static inline struct inotify_kernel_event *
-inotify_dev_get_last_event(struct inotify_device *dev)
+static inline __u32 inotify_arg_to_mask(u32 arg)
 {
-	if (list_empty(&dev->events))
-		return NULL;
-	return list_entry(dev->events.prev, struct inotify_kernel_event, list);
-}
+	__u32 mask;
 
-/*
- * inotify_dev_queue_event - event handler registered with core inotify, adds
- * a new event to the given device
- *
- * Can sleep (calls kernel_event()).
- */
-static void inotify_dev_queue_event(struct inotify_watch *w, u32 wd, u32 mask,
-				    u32 cookie, const char *name,
-				    struct inode *ignored)
-{
-	struct inotify_user_watch *watch;
-	struct inotify_device *dev;
-	struct inotify_kernel_event *kevent, *last;
+	/* everything should accept their own ignored and cares about children */
+	mask = (FS_IN_IGNORED | FS_EVENT_ON_CHILD);
 
-	watch = container_of(w, struct inotify_user_watch, wdata);
-	dev = watch->dev;
+	/* mask off the flags used to open the fd */
+	mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT));
 
-	mutex_lock(&dev->ev_mutex);
-
-	/* we can safely put the watch as we don't reference it while
-	 * generating the event
-	 */
-	if (mask & IN_IGNORED || w->mask & IN_ONESHOT)
-		put_inotify_watch(w); /* final put */
-
-	/* coalescing: drop this event if it is a dupe of the previous */
-	last = inotify_dev_get_last_event(dev);
-	if (last && last->event.mask == mask && last->event.wd == wd &&
-			last->event.cookie == cookie) {
-		const char *lastname = last->name;
-
-		if (!name && !lastname)
-			goto out;
-		if (name && lastname && !strcmp(lastname, name))
-			goto out;
-	}
-
-	/* the queue overflowed and we already sent the Q_OVERFLOW event */
-	if (unlikely(dev->event_count > dev->max_events))
-		goto out;
-
-	/* if the queue overflows, we need to notify user space */
-	if (unlikely(dev->event_count == dev->max_events))
-		kevent = kernel_event(-1, IN_Q_OVERFLOW, cookie, NULL);
-	else
-		kevent = kernel_event(wd, mask, cookie, name);
-
-	if (unlikely(!kevent))
-		goto out;
-
-	/* queue the event and wake up anyone waiting */
-	dev->event_count++;
-	dev->queue_size += sizeof(struct inotify_event) + kevent->event.len;
-	list_add_tail(&kevent->list, &dev->events);
-	wake_up_interruptible(&dev->wq);
-	kill_fasync(&dev->fa, SIGIO, POLL_IN);
-
-out:
-	mutex_unlock(&dev->ev_mutex);
-}
-
-/*
- * remove_kevent - cleans up the given kevent
- *
- * Caller must hold dev->ev_mutex.
- */
-static void remove_kevent(struct inotify_device *dev,
-			  struct inotify_kernel_event *kevent)
-{
-	list_del(&kevent->list);
-
-	dev->event_count--;
-	dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len;
-}
-
-/*
- * free_kevent - frees the given kevent.
- */
-static void free_kevent(struct inotify_kernel_event *kevent)
-{
-	kfree(kevent->name);
-	kmem_cache_free(event_cachep, kevent);
-}
-
-/*
- * inotify_dev_event_dequeue - destroy an event on the given device
- *
- * Caller must hold dev->ev_mutex.
- */
-static void inotify_dev_event_dequeue(struct inotify_device *dev)
-{
-	if (!list_empty(&dev->events)) {
-		struct inotify_kernel_event *kevent;
-		kevent = inotify_dev_get_event(dev);
-		remove_kevent(dev, kevent);
-		free_kevent(kevent);
-	}
-}
-
-/*
- * find_inode - resolve a user-given path to a specific inode
- */
-static int find_inode(const char __user *dirname, struct path *path,
-		      unsigned flags)
-{
-	int error;
-
-	error = user_path_at(AT_FDCWD, dirname, flags, path);
-	if (error)
-		return error;
-	/* you can only watch an inode if you have read permissions on it */
-	error = inode_permission(path->dentry->d_inode, MAY_READ);
-	if (error)
-		path_put(path);
-	return error;
+	return mask;
 }
 
-/*
- * create_watch - creates a watch on the given device.
- *
- * Callers must hold dev->up_mutex.
- */
-static int create_watch(struct inotify_device *dev, struct inode *inode,
-			u32 mask)
+static inline u32 inotify_mask_to_arg(__u32 mask)
 {
-	struct inotify_user_watch *watch;
-	int ret;
-
-	if (atomic_read(&dev->user->inotify_watches) >=
-			inotify_max_user_watches)
-		return -ENOSPC;
-
-	watch = kmem_cache_alloc(watch_cachep, GFP_KERNEL);
-	if (unlikely(!watch))
-		return -ENOMEM;
-
-	/* save a reference to device and bump the count to make it official */
-	get_inotify_dev(dev);
-	watch->dev = dev;
-
-	atomic_inc(&dev->user->inotify_watches);
-
-	inotify_init_watch(&watch->wdata);
-	ret = inotify_add_watch(dev->ih, &watch->wdata, inode, mask);
-	if (ret < 0)
-		free_inotify_user_watch(&watch->wdata);
-
-	return ret;
+	return mask & (IN_ALL_EVENTS | IN_ISDIR | IN_UNMOUNT | IN_IGNORED |
+		       IN_Q_OVERFLOW);
 }
 
-/* Device Interface */
-
+/* intofiy userspace file descriptor functions */
 static unsigned int inotify_poll(struct file *file, poll_table *wait)
 {
-	struct inotify_device *dev = file->private_data;
+	struct fsnotify_group *group = file->private_data;
 	int ret = 0;
 
-	poll_wait(file, &dev->wq, wait);
-	mutex_lock(&dev->ev_mutex);
-	if (!list_empty(&dev->events))
+	poll_wait(file, &group->notification_waitq, wait);
+	mutex_lock(&group->notification_mutex);
+	if (!fsnotify_notify_queue_is_empty(group))
 		ret = POLLIN | POLLRDNORM;
-	mutex_unlock(&dev->ev_mutex);
+	mutex_unlock(&group->notification_mutex);
 
 	return ret;
 }
@@ -432,26 +145,29 @@ static unsigned int inotify_poll(struct file *file, poll_table *wait)
  * enough to fit in "count". Return an error pointer if
  * not large enough.
  *
- * Called with the device ev_mutex held.
+ * Called with the group->notification_mutex held.
  */
-static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
-						  size_t count)
+static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
+					    size_t count)
 {
 	size_t event_size = sizeof(struct inotify_event);
-	struct inotify_kernel_event *kevent;
+	struct fsnotify_event *event;
 
-	if (list_empty(&dev->events))
+	if (fsnotify_notify_queue_is_empty(group))
 		return NULL;
 
-	kevent = inotify_dev_get_event(dev);
-	if (kevent->name)
-		event_size += kevent->event.len;
+	event = fsnotify_peek_notify_event(group);
+
+	event_size += roundup(event->name_len, event_size);
 
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
 
-	remove_kevent(dev, kevent);
-	return kevent;
+	/* held the notification_mutex the whole time, so this is the
+	 * same event we peeked above */
+	fsnotify_remove_notify_event(group);
+
+	return event;
 }
 
 /*
@@ -460,51 +176,90 @@ static struct inotify_kernel_event *get_one_event(struct inotify_device *dev,
  * We already checked that the event size is smaller than the
  * buffer we had in "get_one_event()" above.
  */
-static ssize_t copy_event_to_user(struct inotify_kernel_event *kevent,
+static ssize_t copy_event_to_user(struct fsnotify_group *group,
+				  struct fsnotify_event *event,
 				  char __user *buf)
 {
+	struct inotify_event inotify_event;
+	struct fsnotify_event_private_data *fsn_priv;
+	struct inotify_event_private_data *priv;
 	size_t event_size = sizeof(struct inotify_event);
+	size_t name_len;
+
+	/* we get the inotify watch descriptor from the event private data */
+	spin_lock(&event->lock);
+	fsn_priv = fsnotify_remove_priv_from_event(group, event);
+	spin_unlock(&event->lock);
+
+	if (!fsn_priv)
+		inotify_event.wd = -1;
+	else {
+		priv = container_of(fsn_priv, struct inotify_event_private_data,
+				    fsnotify_event_priv_data);
+		inotify_event.wd = priv->wd;
+		inotify_free_event_priv(fsn_priv);
+	}
+
+	/* round up event->name_len so it is a multiple of event_size */
+	name_len = roundup(event->name_len, event_size);
+	inotify_event.len = name_len;
+
+	inotify_event.mask = inotify_mask_to_arg(event->mask);
+	inotify_event.cookie = event->sync_cookie;
 
-	if (copy_to_user(buf, &kevent->event, event_size))
+	/* send the main event */
+	if (copy_to_user(buf, &inotify_event, event_size))
 		return -EFAULT;
 
-	if (kevent->name) {
-		buf += event_size;
+	buf += event_size;
 
-		if (copy_to_user(buf, kevent->name, kevent->event.len))
+	/*
+	 * fsnotify only stores the pathname, so here we have to send the pathname
+	 * and then pad that pathname out to a multiple of sizeof(inotify_event)
+	 * with zeros.  I get my zeros from the nul_inotify_event.
+	 */
+	if (name_len) {
+		unsigned int len_to_zero = name_len - event->name_len;
+		/* copy the path name */
+		if (copy_to_user(buf, event->file_name, event->name_len))
 			return -EFAULT;
+		buf += event->name_len;
 
-		event_size += kevent->event.len;
+		/* fill userspace with 0's from nul_inotify_event */
+		if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
+			return -EFAULT;
+		buf += len_to_zero;
+		event_size += name_len;
 	}
+
 	return event_size;
 }
 
 static ssize_t inotify_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *pos)
 {
-	struct inotify_device *dev;
+	struct fsnotify_group *group;
+	struct fsnotify_event *kevent;
 	char __user *start;
 	int ret;
 	DEFINE_WAIT(wait);
 
 	start = buf;
-	dev = file->private_data;
+	group = file->private_data;
 
 	while (1) {
-		struct inotify_kernel_event *kevent;
+		prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
 
-		prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE);
-
-		mutex_lock(&dev->ev_mutex);
-		kevent = get_one_event(dev, count);
-		mutex_unlock(&dev->ev_mutex);
+		mutex_lock(&group->notification_mutex);
+		kevent = get_one_event(group, count);
+		mutex_unlock(&group->notification_mutex);
 
 		if (kevent) {
 			ret = PTR_ERR(kevent);
 			if (IS_ERR(kevent))
 				break;
-			ret = copy_event_to_user(kevent, buf);
-			free_kevent(kevent);
+			ret = copy_event_to_user(group, kevent, buf);
+			fsnotify_put_event(kevent);
 			if (ret < 0)
 				break;
 			buf += ret;
@@ -525,7 +280,7 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 		schedule();
 	}
 
-	finish_wait(&dev->wq, &wait);
+	finish_wait(&group->notification_waitq, &wait);
 	if (start != buf && ret != -EFAULT)
 		ret = buf - start;
 	return ret;
@@ -533,25 +288,19 @@ static ssize_t inotify_read(struct file *file, char __user *buf,
 
 static int inotify_fasync(int fd, struct file *file, int on)
 {
-	struct inotify_device *dev = file->private_data;
+	struct fsnotify_group *group = file->private_data;
 
-	return fasync_helper(fd, file, on, &dev->fa) >= 0 ? 0 : -EIO;
+	return fasync_helper(fd, file, on, &group->inotify_data.fa) >= 0 ? 0 : -EIO;
 }
 
 static int inotify_release(struct inode *ignored, struct file *file)
 {
-	struct inotify_device *dev = file->private_data;
-
-	inotify_destroy(dev->ih);
+	struct fsnotify_group *group = file->private_data;
 
-	/* destroy all of the events on this device */
-	mutex_lock(&dev->ev_mutex);
-	while (!list_empty(&dev->events))
-		inotify_dev_event_dequeue(dev);
-	mutex_unlock(&dev->ev_mutex);
+	fsnotify_clear_marks_by_group(group);
 
-	/* free this device: the put matching the get in inotify_init() */
-	put_inotify_dev(dev);
+	/* free this group, matching get was inotify_init->fsnotify_obtain_group */
+	fsnotify_put_group(group);
 
 	return 0;
 }
@@ -559,16 +308,27 @@ static int inotify_release(struct inode *ignored, struct file *file)
 static long inotify_ioctl(struct file *file, unsigned int cmd,
 			  unsigned long arg)
 {
-	struct inotify_device *dev;
+	struct fsnotify_group *group;
+	struct fsnotify_event_holder *holder;
+	struct fsnotify_event *event;
 	void __user *p;
 	int ret = -ENOTTY;
+	size_t send_len = 0;
 
-	dev = file->private_data;
+	group = file->private_data;
 	p = (void __user *) arg;
 
 	switch (cmd) {
 	case FIONREAD:
-		ret = put_user(dev->queue_size, (int __user *) p);
+		mutex_lock(&group->notification_mutex);
+		list_for_each_entry(holder, &group->notification_list, event_list) {
+			event = holder->event;
+			send_len += sizeof(struct inotify_event);
+			send_len += roundup(event->name_len,
+					     sizeof(struct inotify_event));
+		}
+		mutex_unlock(&group->notification_mutex);
+		ret = put_user(send_len, (int __user *) p);
 		break;
 	}
 
@@ -576,23 +336,233 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 }
 
 static const struct file_operations inotify_fops = {
-	.poll           = inotify_poll,
-	.read           = inotify_read,
-	.fasync         = inotify_fasync,
-	.release        = inotify_release,
-	.unlocked_ioctl = inotify_ioctl,
+	.poll		= inotify_poll,
+	.read		= inotify_read,
+	.fasync		= inotify_fasync,
+	.release	= inotify_release,
+	.unlocked_ioctl	= inotify_ioctl,
 	.compat_ioctl	= inotify_ioctl,
 };
 
-static const struct inotify_operations inotify_user_ops = {
-	.handle_event	= inotify_dev_queue_event,
-	.destroy_watch	= free_inotify_user_watch,
-};
 
+/*
+ * find_inode - resolve a user-given path to a specific inode
+ */
+static int inotify_find_inode(const char __user *dirname, struct path *path, unsigned flags)
+{
+	int error;
+
+	error = user_path_at(AT_FDCWD, dirname, flags, path);
+	if (error)
+		return error;
+	/* you can only watch an inode if you have read permissions on it */
+	error = inode_permission(path->dentry->d_inode, MAY_READ);
+	if (error)
+		path_put(path);
+	return error;
+}
+
+/*
+ * When, for whatever reason, inotify is done with a mark (or what used to be a
+ * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
+ * for the given wd.
+ *
+ * There is a bit of recursion here.  The loop looks like:
+ * 	inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
+ *	inotify_freeing_mark -> inotify_destory_mark_entry -> restart
+ * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
+ * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
+ * test below will not call back to fsnotify again.  But even if that test wasn't
+ * there this would still be safe since fsnotify_destroy_mark_by_entry() is
+ * safe from recursion.
+ */
+void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+{
+	struct inotify_inode_mark_entry *ientry;
+	struct inotify_event_private_data *event_priv;
+	struct fsnotify_event_private_data *fsn_event_priv;
+	struct fsnotify_group *egroup;
+	struct idr *idr;
+
+	spin_lock(&entry->lock);
+	egroup = entry->group;
+
+	/* if egroup we aren't really done and something might still send events
+	 * for this inode, on the callback we'll send the IN_IGNORED */
+	if (egroup) {
+		spin_unlock(&entry->lock);
+		fsnotify_destroy_mark_by_entry(entry);
+		return;
+	}
+	spin_unlock(&entry->lock);
+
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+
+	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+	if (unlikely(!event_priv))
+		goto skip_send_ignore;
+
+	fsn_event_priv = &event_priv->fsnotify_event_priv_data;
+
+	fsn_event_priv->group = group;
+	event_priv->wd = ientry->wd;
+
+	fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+
+	/* did the private data get added? */
+	if (list_empty(&fsn_event_priv->event_list))
+		inotify_free_event_priv(fsn_event_priv);
+
+skip_send_ignore:
+
+	/* remove this entry from the idr */
+	spin_lock(&group->inotify_data.idr_lock);
+	idr = &group->inotify_data.idr;
+	idr_remove(idr, ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+
+	/* removed from idr, drop that reference */
+	fsnotify_put_mark(entry);
+}
+
+/* ding dong the mark is dead */
+static void inotify_free_mark(struct fsnotify_mark_entry *entry)
+{
+	struct inotify_inode_mark_entry *ientry = (struct inotify_inode_mark_entry *)entry;
+
+	kmem_cache_free(inotify_inode_mark_cachep, ientry);
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+	struct fsnotify_mark_entry *entry = NULL;
+	struct inotify_inode_mark_entry *ientry;
+	int ret = 0;
+	int add = (arg & IN_MASK_ADD);
+	__u32 mask;
+	__u32 old_mask, new_mask;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask = inotify_arg_to_mask(arg);
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!ientry))
+		return -ENOMEM;
+	/* we set the mask at the end after attaching it */
+	fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
+	ientry->wd = 0;
+
+find_entry:
+	spin_lock(&inode->i_lock);
+	entry = fsnotify_find_mark_entry(group, inode);
+	spin_unlock(&inode->i_lock);
+	if (entry) {
+		kmem_cache_free(inotify_inode_mark_cachep, ientry);
+		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	} else {
+		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
+			ret = -ENOSPC;
+			goto out_err;
+		}
+
+		ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
+		if (ret == -EEXIST)
+			goto find_entry;
+		else if (ret)
+			goto out_err;
+
+		entry = &ientry->fsn_entry;
+retry:
+		ret = -ENOMEM;
+		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+			goto out_err;
+
+		spin_lock(&group->inotify_data.idr_lock);
+		/* if entry is added to the idr we keep the reference obtained
+		 * through fsnotify_mark_add.  remember to drop this reference
+		 * when entry is removed from idr */
+		ret = idr_get_new_above(&group->inotify_data.idr, entry,
+					++group->inotify_data.last_wd,
+					&ientry->wd);
+		spin_unlock(&group->inotify_data.idr_lock);
+		if (ret) {
+			if (ret == -EAGAIN)
+				goto retry;
+			goto out_err;
+		}
+		atomic_inc(&group->inotify_data.user->inotify_watches);
+	}
+
+	spin_lock(&entry->lock);
+
+	old_mask = entry->mask;
+	if (add) {
+		entry->mask |= mask;
+		new_mask = entry->mask;
+	} else {
+		entry->mask = mask;
+		new_mask = entry->mask;
+	}
+
+	spin_unlock(&entry->lock);
+
+	if (old_mask != new_mask) {
+		/* more bits in old than in new? */
+		int dropped = (old_mask & ~new_mask);
+		/* more bits in this entry than the inode's mask? */
+		int do_inode = (new_mask & ~inode->i_fsnotify_mask);
+		/* more bits in this entry than the group? */
+		int do_group = (new_mask & ~group->mask);
+
+		/* update the inode with this new entry */
+		if (dropped || do_inode)
+			fsnotify_recalc_inode_mask(inode);
+
+		/* update the group mask with the new mask */
+		if (dropped || do_group)
+			fsnotify_recalc_group_mask(group);
+	}
+
+	return ientry->wd;
+
+out_err:
+	/* see this isn't supposed to happen, just kill the watch */
+	if (entry) {
+		fsnotify_destroy_mark_by_entry(entry);
+		fsnotify_put_mark(entry);
+	}
+	return ret;
+}
+
+static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsigned int max_events)
+{
+	struct fsnotify_group *group;
+	unsigned int grp_num;
+
+	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
+	grp_num = (INOTIFY_GROUP_NUM - atomic_inc_return(&inotify_grp_num));
+	group = fsnotify_obtain_group(grp_num, 0, &inotify_fsnotify_ops);
+	if (IS_ERR(group))
+		return group;
+
+	group->max_events = max_events;
+
+	spin_lock_init(&group->inotify_data.idr_lock);
+	idr_init(&group->inotify_data.idr);
+	group->inotify_data.last_wd = 0;
+	group->inotify_data.user = user;
+	group->inotify_data.fa = NULL;
+
+	return group;
+}
+
+
+/* inotify syscalls */
 SYSCALL_DEFINE1(inotify_init1, int, flags)
 {
-	struct inotify_device *dev;
-	struct inotify_handle *ih;
+	struct fsnotify_group *group;
 	struct user_struct *user;
 	struct file *filp;
 	int fd, ret;
@@ -621,45 +591,27 @@ SYSCALL_DEFINE1(inotify_init1, int, flags)
 		goto out_free_uid;
 	}
 
-	dev = kmalloc(sizeof(struct inotify_device), GFP_KERNEL);
-	if (unlikely(!dev)) {
-		ret = -ENOMEM;
+	/* fsnotify_obtain_group took a reference to group, we put this when we kill the file in the end */
+	group = inotify_new_group(user, inotify_max_queued_events);
+	if (IS_ERR(group)) {
+		ret = PTR_ERR(group);
 		goto out_free_uid;
 	}
 
-	ih = inotify_init(&inotify_user_ops);
-	if (IS_ERR(ih)) {
-		ret = PTR_ERR(ih);
-		goto out_free_dev;
-	}
-	dev->ih = ih;
-	dev->fa = NULL;
-
 	filp->f_op = &inotify_fops;
 	filp->f_path.mnt = mntget(inotify_mnt);
 	filp->f_path.dentry = dget(inotify_mnt->mnt_root);
 	filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
 	filp->f_mode = FMODE_READ;
 	filp->f_flags = O_RDONLY | (flags & O_NONBLOCK);
-	filp->private_data = dev;
-
-	INIT_LIST_HEAD(&dev->events);
-	init_waitqueue_head(&dev->wq);
-	mutex_init(&dev->ev_mutex);
-	mutex_init(&dev->up_mutex);
-	dev->event_count = 0;
-	dev->queue_size = 0;
-	dev->max_events = inotify_max_queued_events;
-	dev->user = user;
-	atomic_set(&dev->count, 0);
-
-	get_inotify_dev(dev);
+	filp->private_data = group;
+
 	atomic_inc(&user->inotify_devs);
+
 	fd_install(fd, filp);
 
 	return fd;
-out_free_dev:
-	kfree(dev);
+
 out_free_uid:
 	free_uid(user);
 	put_filp(filp);
@@ -676,8 +628,8 @@ SYSCALL_DEFINE0(inotify_init)
 SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 		u32, mask)
 {
+	struct fsnotify_group *group;
 	struct inode *inode;
-	struct inotify_device *dev;
 	struct path path;
 	struct file *filp;
 	int ret, fput_needed;
@@ -698,20 +650,20 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 	if (mask & IN_ONLYDIR)
 		flags |= LOOKUP_DIRECTORY;
 
-	ret = find_inode(pathname, &path, flags);
-	if (unlikely(ret))
+	ret = inotify_find_inode(pathname, &path, flags);
+	if (ret)
 		goto fput_and_out;
 
-	/* inode held in place by reference to path; dev by fget on fd */
+	/* inode held in place by reference to path; group by fget on fd */
 	inode = path.dentry->d_inode;
-	dev = filp->private_data;
+	group = filp->private_data;
 
-	mutex_lock(&dev->up_mutex);
-	ret = inotify_find_update_watch(dev->ih, inode, mask);
-	if (ret == -ENOENT)
-		ret = create_watch(dev, inode, mask);
-	mutex_unlock(&dev->up_mutex);
+	/* create/update an inode mark */
+	ret = inotify_update_watch(group, inode, mask);
+	if (unlikely(ret))
+		goto path_put_and_out;
 
+path_put_and_out:
 	path_put(&path);
 fput_and_out:
 	fput_light(filp, fput_needed);
@@ -720,9 +672,10 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
 
 SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 {
+	struct fsnotify_group *group;
+	struct fsnotify_mark_entry *entry;
 	struct file *filp;
-	struct inotify_device *dev;
-	int ret, fput_needed;
+	int ret = 0, fput_needed;
 
 	filp = fget_light(fd, &fput_needed);
 	if (unlikely(!filp))
@@ -734,10 +687,20 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 		goto out;
 	}
 
-	dev = filp->private_data;
+	group = filp->private_data;
 
-	/* we free our watch data when we get IN_IGNORED */
-	ret = inotify_rm_wd(dev->ih, wd);
+	spin_lock(&group->inotify_data.idr_lock);
+	entry = idr_find(&group->inotify_data.idr, wd);
+	if (unlikely(!entry)) {
+		spin_unlock(&group->inotify_data.idr_lock);
+		ret = -EINVAL;
+		goto out;
+	}
+	fsnotify_get_mark(entry);
+	spin_unlock(&group->inotify_data.idr_lock);
+
+	inotify_destroy_mark_entry(entry, group);
+	fsnotify_put_mark(entry);
 
 out:
 	fput_light(filp, fput_needed);
@@ -753,9 +716,9 @@ inotify_get_sb(struct file_system_type *fs_type, int flags,
 }
 
 static struct file_system_type inotify_fs_type = {
-    .name           = "inotifyfs",
-    .get_sb         = inotify_get_sb,
-    .kill_sb        = kill_anon_super,
+    .name	= "inotifyfs",
+    .get_sb	= inotify_get_sb,
+    .kill_sb	= kill_anon_super,
 };
 
 /*
@@ -775,18 +738,16 @@ static int __init inotify_user_setup(void)
 	if (IS_ERR(inotify_mnt))
 		panic("inotify: kern_mount ret %ld!\n", PTR_ERR(inotify_mnt));
 
+	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
+	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
+	inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
+	if (!inotify_ignored_event)
+		panic("unable to allocate the inotify ignored event\n");
+
 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
 	inotify_max_user_watches = 8192;
 
-	watch_cachep = kmem_cache_create("inotify_watch_cache",
-					 sizeof(struct inotify_user_watch),
-					 0, SLAB_PANIC, NULL);
-	event_cachep = kmem_cache_create("inotify_event_cache",
-					 sizeof(struct inotify_kernel_event),
-					 0, SLAB_PANIC, NULL);
-
 	return 0;
 }
-
 module_init(inotify_user_setup);
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index d2c0ee30e6184..44848aa830dcd 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -9,6 +9,7 @@
 
 #ifdef __KERNEL__
 
+#include <linux/idr.h> /* inotify uses this */
 #include <linux/fs.h> /* struct inode */
 #include <linux/list.h>
 #include <linux/path.h> /* struct path */
@@ -59,6 +60,7 @@
 
 /* listeners that hard code group numbers near the top */
 #define DNOTIFY_GROUP_NUM	UINT_MAX
+#define INOTIFY_GROUP_NUM	(DNOTIFY_GROUP_NUM-1)
 
 struct fsnotify_group;
 struct fsnotify_event;
@@ -141,6 +143,15 @@ struct fsnotify_group {
 	/* groups can define private fields here or use the void *private */
 	union {
 		void *private;
+#ifdef CONFIG_INOTIFY_USER
+		struct inotify_group_private_data {
+			spinlock_t	idr_lock;
+			struct idr      idr;
+			u32             last_wd;
+			struct fasync_struct    *fa;    /* async notification */
+			struct user_struct      *user;
+		} inotify_data;
+#endif
 	};
 };
 
diff --git a/init/Kconfig b/init/Kconfig
index d4e9671347ee9..5de1c17c51ed0 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -302,7 +302,8 @@ config AUDITSYSCALL
 
 config AUDIT_TREE
 	def_bool y
-	depends on AUDITSYSCALL && INOTIFY
+	depends on AUDITSYSCALL
+	select INOTIFY
 
 menu "RCU Subsystem"
 
-- 
GitLab


From ff52cc2158b32b3b979ca7802b1fd7c70f36e13c Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: [PATCH 2103/2198] fsnotify: move events should indicate the event was
 on a child

fsnotify tells its listeners explicitly when an event happened on the given
inode verses on the child of the given inode.  (see __fsnotify_parent)
However, the semantics of fsnotify_move() are such that we deliver events
directly to the two parent directories in question (old_dir and new_dir)
directly without using the __fsnotify_parent() call.  fsnotify should be
adding FS_EVENT_ON_CHILD for the notifications to these parents.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 include/linux/fsnotify.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index c25b39ddd62ae..936f9aa8bb975 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -71,12 +71,11 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	struct inode *source = moved->d_inode;
 	u32 in_cookie = inotify_get_cookie();
 	u32 fs_cookie = fsnotify_get_cookie();
-	__u32 old_dir_mask = 0;
-	__u32 new_dir_mask = 0;
+	__u32 old_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_FROM);
+	__u32 new_dir_mask = (FS_EVENT_ON_CHILD | FS_MOVED_TO);
 
-	if (old_dir == new_dir) {
-		old_dir_mask = FS_DN_RENAME;
-	}
+	if (old_dir == new_dir)
+		old_dir_mask |= FS_DN_RENAME;
 
 	if (isdir) {
 		isdir = IN_ISDIR;
@@ -84,9 +83,6 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 		new_dir_mask |= FS_IN_ISDIR;
 	}
 
-	old_dir_mask |= FS_MOVED_FROM;
-	new_dir_mask |= FS_MOVED_TO;
-
 	inotify_inode_queue_event(old_dir, IN_MOVED_FROM|isdir, in_cookie, old_name,
 				  source);
 	inotify_inode_queue_event(new_dir, IN_MOVED_TO|isdir, in_cookie, new_name,
-- 
GitLab


From 5ac697b793a3c45005c568df692518da6e690390 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: [PATCH 2104/2198] dnotify: do not use ?true:false when assigning to a
 bool

dnotify_should send event assigned a bool using ?true:false when computing
a bit operation.  This is poitless and the bool type does this for us.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 12f9e6b1ffe2b..5134e898f60da 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -154,7 +154,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 		return false;
 
 	spin_lock(&entry->lock);
-	send = (mask & entry->mask) ? true : false;
+	send = (mask & entry->mask);
 	spin_unlock(&entry->lock);
 	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
 
-- 
GitLab


From ce61856bd2aadb064f595e5c0444376a2b117c41 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: [PATCH 2105/2198] dnotify: do not bother to lock entry->lock when
 reading mask

entry->lock is needed to make sure entry->mask does not change while
manipulating it.  In dnotify_should_send_event() we don't care if we get an
old or a new mask value out of this entry so there is no point it taking
the lock.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 5134e898f60da..ec459b6e8c641 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -153,9 +153,8 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	if (!entry)
 		return false;
 
-	spin_lock(&entry->lock);
 	send = (mask & entry->mask);
-	spin_unlock(&entry->lock);
+
 	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
 
 	return send;
-- 
GitLab


From e42e27736de80045f925564ea27a1d32957219e7 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:47 -0400
Subject: [PATCH 2106/2198] inotify/dnotify: should_send_event shouldn't match
 on FS_EVENT_ON_CHILD

inotify and dnotify will both indicate that they want any event which came
from a child inode.  The fix is to mask off FS_EVENT_ON_CHILD when deciding
if inotify or dnotify is interested in a given event.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c          | 1 +
 fs/notify/fsnotify.c                 | 8 +++++---
 fs/notify/inotify/inotify_fsnotify.c | 1 +
 3 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index ec459b6e8c641..98a751614c748 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -153,6 +153,7 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	if (!entry)
 		return false;
 
+	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (mask & entry->mask);
 
 	fsnotify_put_mark(entry); /* matches fsnotify_find_mark_entry */
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f11d75f023684..ec2f7bd768184 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -137,14 +137,16 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	struct fsnotify_group *group;
 	struct fsnotify_event *event = NULL;
 	int idx;
+	/* global tests shouldn't care about events on child only the specific event */
+	__u32 test_mask = (mask & ~FS_EVENT_ON_CHILD);
 
 	if (list_empty(&fsnotify_groups))
 		return;
 
-	if (!(mask & fsnotify_mask))
+	if (!(test_mask & fsnotify_mask))
 		return;
 
-	if (!(mask & to_tell->i_fsnotify_mask))
+	if (!(test_mask & to_tell->i_fsnotify_mask))
 		return;
 	/*
 	 * SRCU!!  the groups list is very very much read only and the path is
@@ -153,7 +155,7 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 	 */
 	idx = srcu_read_lock(&fsnotify_grp_srcu);
 	list_for_each_entry_rcu(group, &fsnotify_groups, group_list) {
-		if (mask & group->mask) {
+		if (test_mask & group->mask) {
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 160da54868398..7ef75b83247e9 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -95,6 +95,7 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	if (!entry)
 		return false;
 
+	mask = (mask & ~FS_EVENT_ON_CHILD);
 	send = (entry->mask & mask);
 
 	/* find took a reference */
-- 
GitLab


From a092ee20fd33d2df0990dcbf2235afc181612818 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Thu, 11 Jun 2009 11:09:48 -0400
Subject: [PATCH 2107/2198] fsnotify: allow groups to set freeing_mark to null

Most fsnotify listeners (all but inotify) do not care about marks being
freed.  Allow groups to set freeing_mark to null and do not call any
function if it is set that way.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/dnotify/dnotify.c | 8 +-------
 fs/notify/inode_mark.c      | 3 ++-
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c
index 98a751614c748..828a889be9095 100644
--- a/fs/notify/dnotify/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -161,12 +161,6 @@ static bool dnotify_should_send_event(struct fsnotify_group *group,
 	return send;
 }
 
-static void dnotify_freeing_mark(struct fsnotify_mark_entry *entry,
-				 struct fsnotify_group *group)
-{
-	/* dnotify doesn't care than an inode is on the way out */
-}
-
 static void dnotify_free_mark(struct fsnotify_mark_entry *entry)
 {
 	struct dnotify_mark_entry *dnentry = container_of(entry,
@@ -182,7 +176,7 @@ static struct fsnotify_ops dnotify_fsnotify_ops = {
 	.handle_event = dnotify_handle_event,
 	.should_send_event = dnotify_should_send_event,
 	.free_group_priv = NULL,
-	.freeing_mark = dnotify_freeing_mark,
+	.freeing_mark = NULL,
 	.free_event_priv = NULL,
 };
 
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index 0a499d2c61918..c8a07c65482b0 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -190,7 +190,8 @@ void fsnotify_destroy_mark_by_entry(struct fsnotify_mark_entry *entry)
 	 * callback to the group function to let it know that this entry
 	 * is being freed.
 	 */
-	group->ops->freeing_mark(entry, group);
+	if (group->ops->freeing_mark)
+		group->ops->freeing_mark(entry, group);
 
 	/*
 	 * __fsnotify_update_child_dentry_flags(inode);
-- 
GitLab


From 73422811d290c628b4ddbf6830e5cd6fa42e84f1 Mon Sep 17 00:00:00 2001
From: Jeff Mahoney <jeffm@suse.com>
Date: Sun, 10 May 2009 16:05:39 -0400
Subject: [PATCH 2108/2198] reiserfs: allow exposing privroot w/ xattrs enabled

This patch adds an -oexpose_privroot option to allow access to the privroot.

Signed-off-by: Jeff Mahoney <jeffm@suse.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/reiserfs/dir.c              | 10 ++++------
 fs/reiserfs/super.c            |  1 +
 fs/reiserfs/xattr.c            |  3 ++-
 include/linux/reiserfs_fs_sb.h |  2 ++
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 45ee3d357c70e..6d2668fdc3848 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -44,13 +44,11 @@ static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
 static inline bool is_privroot_deh(struct dentry *dir,
 				   struct reiserfs_de_head *deh)
 {
-	int ret = 0;
-#ifdef CONFIG_REISERFS_FS_XATTR
 	struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root;
-	ret = (dir == dir->d_parent && privroot->d_inode &&
-	       deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
-#endif
-	return ret;
+	if (reiserfs_expose_privroot(dir->d_sb))
+		return 0;
+	return (dir == dir->d_parent && privroot->d_inode &&
+	        deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid);
 }
 
 int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3567fb9e3fb12..9dbdcfb5d3148 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -898,6 +898,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options,	/* strin
 		{"conv",.setmask = 1 << REISERFS_CONVERT},
 		{"attrs",.setmask = 1 << REISERFS_ATTRS},
 		{"noattrs",.clrmask = 1 << REISERFS_ATTRS},
+		{"expose_privroot", .setmask = 1 << REISERFS_EXPOSE_PRIVROOT},
 #ifdef CONFIG_REISERFS_FS_XATTR
 		{"user_xattr",.setmask = 1 << REISERFS_XATTRS_USER},
 		{"nouser_xattr",.clrmask = 1 << REISERFS_XATTRS_USER},
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 8e7deb0e6964a..f3d47d8568482 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -981,7 +981,8 @@ int reiserfs_lookup_privroot(struct super_block *s)
 				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
 		REISERFS_SB(s)->priv_root = dentry;
-		s->s_root->d_op = &xattr_lookup_poison_ops;
+		if (!reiserfs_expose_privroot(s))
+			s->s_root->d_op = &xattr_lookup_poison_ops;
 		if (dentry->d_inode)
 			dentry->d_inode->i_flags |= S_PRIVATE;
 	} else
diff --git a/include/linux/reiserfs_fs_sb.h b/include/linux/reiserfs_fs_sb.h
index 6473650c28f13..dab68bbed6757 100644
--- a/include/linux/reiserfs_fs_sb.h
+++ b/include/linux/reiserfs_fs_sb.h
@@ -453,6 +453,7 @@ enum reiserfs_mount_options {
 	REISERFS_ATTRS,
 	REISERFS_XATTRS_USER,
 	REISERFS_POSIXACL,
+	REISERFS_EXPOSE_PRIVROOT,
 	REISERFS_BARRIER_NONE,
 	REISERFS_BARRIER_FLUSH,
 
@@ -490,6 +491,7 @@ enum reiserfs_mount_options {
 #define reiserfs_data_writeback(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_DATA_WRITEBACK))
 #define reiserfs_xattrs_user(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_XATTRS_USER))
 #define reiserfs_posixacl(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_POSIXACL))
+#define reiserfs_expose_privroot(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_EXPOSE_PRIVROOT))
 #define reiserfs_xattrs_optional(s) (reiserfs_xattrs_user(s) || reiserfs_posixacl(s))
 #define reiserfs_barrier_none(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_NONE))
 #define reiserfs_barrier_flush(s) (REISERFS_SB(s)->s_mount_opt & (1 << REISERFS_BARRIER_FLUSH))
-- 
GitLab


From 4e44b6852e03c915618ca6776b6697b436246b00 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:08:56 -0400
Subject: [PATCH 2109/2198] Get rid of path_lookup in autofs4

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c | 195 +++++++++++++----------------------------
 1 file changed, 60 insertions(+), 135 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 84168c0dcc2d2..f71dac9986f0b 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -192,77 +192,42 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 	return 0;
 }
 
-/*
- * Walk down the mount stack looking for an autofs mount that
- * has the requested device number (aka. new_encode_dev(sb->s_dev).
- */
-static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno)
+static int find_autofs_mount(const char *pathname,
+			     struct path *res,
+			     int test(struct path *path, void *data),
+			     void *data)
 {
-	struct dentry *dentry;
-	struct inode *inode;
-	struct super_block *sb;
-	dev_t s_dev;
-	unsigned int err;
-
+	struct path path;
+	int err = kern_path(pathname, 0, &path);
+	if (err)
+		return err;
 	err = -ENOENT;
-
-	/* Lookup the dentry name at the base of our mount point */
-	dentry = d_lookup(nd->path.dentry, &nd->last);
-	if (!dentry)
-		goto out;
-
-	dput(nd->path.dentry);
-	nd->path.dentry = dentry;
-
-	/* And follow the mount stack looking for our autofs mount */
-	while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
-		inode = nd->path.dentry->d_inode;
-		if (!inode)
-			break;
-
-		sb = inode->i_sb;
-		s_dev = new_encode_dev(sb->s_dev);
-		if (devno == s_dev) {
-			if (sb->s_magic == AUTOFS_SUPER_MAGIC) {
+	while (path.dentry == path.mnt->mnt_root) {
+		if (path.mnt->mnt_sb->s_magic == AUTOFS_SUPER_MAGIC) {
+			if (test(&path, data)) {
+				path_get(&path);
+				if (!err) /* already found some */
+					path_put(res);
+				*res = path;
 				err = 0;
-				break;
 			}
 		}
+		if (!follow_up(&path.mnt, &path.dentry))
+			break;
 	}
-out:
+	path_put(&path);
 	return err;
 }
 
-/*
- * Walk down the mount stack looking for an autofs mount that
- * has the requested mount type (ie. indirect, direct or offset).
- */
-static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type)
+static int test_by_dev(struct path *path, void *p)
 {
-	struct dentry *dentry;
-	struct autofs_info *ino;
-	unsigned int err;
-
-	err = -ENOENT;
-
-	/* Lookup the dentry name at the base of our mount point */
-	dentry = d_lookup(nd->path.dentry, &nd->last);
-	if (!dentry)
-		goto out;
-
-	dput(nd->path.dentry);
-	nd->path.dentry = dentry;
+	return path->mnt->mnt_sb->s_dev == *(dev_t *)p;
+}
 
-	/* And follow the mount stack looking for our autofs mount */
-	while (follow_down(&nd->path.mnt, &nd->path.dentry)) {
-		ino = autofs4_dentry_ino(nd->path.dentry);
-		if (ino && ino->sbi->type & type) {
-			err = 0;
-			break;
-		}
-	}
-out:
-	return err;
+static int test_by_type(struct path *path, void *p)
+{
+	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
+	return ino && ino->sbi->type & *(unsigned *)p;
 }
 
 static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
@@ -283,31 +248,25 @@ static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file)
  * Open a file descriptor on the autofs mount point corresponding
  * to the given path and device number (aka. new_encode_dev(sb->s_dev)).
  */
-static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid)
+static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid)
 {
-	struct file *filp;
-	struct nameidata nd;
 	int err, fd;
 
 	fd = get_unused_fd();
 	if (likely(fd >= 0)) {
-		/* Get nameidata of the parent directory */
-		err = path_lookup(path, LOOKUP_PARENT, &nd);
+		struct file *filp;
+		struct path path;
+
+		err = find_autofs_mount(name, &path, test_by_dev, &devid);
 		if (err)
 			goto out;
 
 		/*
-		 * Search down, within the parent, looking for an
-		 * autofs super block that has the device number
+		 * Find autofs super block that has the device number
 		 * corresponding to the autofs fs we want to open.
 		 */
-		err = autofs_dev_ioctl_find_super(&nd, devid);
-		if (err) {
-			path_put(&nd.path);
-			goto out;
-		}
 
-		filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY,
+		filp = dentry_open(path.dentry, path.mnt, O_RDONLY,
 				   current_cred());
 		if (IS_ERR(filp)) {
 			err = PTR_ERR(filp);
@@ -340,7 +299,7 @@ static int autofs_dev_ioctl_openmount(struct file *fp,
 	param->ioctlfd = -1;
 
 	path = param->path;
-	devid = param->openmount.devid;
+	devid = new_decode_dev(param->openmount.devid);
 
 	err = 0;
 	fd = autofs_dev_ioctl_open_mountpoint(path, devid);
@@ -475,8 +434,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 				      struct autofs_dev_ioctl *param)
 {
 	struct autofs_info *ino;
-	struct nameidata nd;
-	const char *path;
+	struct path path;
 	dev_t devid;
 	int err = -ENOENT;
 
@@ -485,32 +443,24 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 		goto out;
 	}
 
-	path = param->path;
-	devid = new_encode_dev(sbi->sb->s_dev);
+	devid = sbi->sb->s_dev;
 
 	param->requester.uid = param->requester.gid = -1;
 
-	/* Get nameidata of the parent directory */
-	err = path_lookup(path, LOOKUP_PARENT, &nd);
+	err = find_autofs_mount(param->path, &path, test_by_dev, &devid);
 	if (err)
 		goto out;
 
-	err = autofs_dev_ioctl_find_super(&nd, devid);
-	if (err)
-		goto out_release;
-
-	ino = autofs4_dentry_ino(nd.path.dentry);
+	ino = autofs4_dentry_ino(path.dentry);
 	if (ino) {
 		err = 0;
-		autofs4_expire_wait(nd.path.dentry);
+		autofs4_expire_wait(path.dentry);
 		spin_lock(&sbi->fs_lock);
 		param->requester.uid = ino->uid;
 		param->requester.gid = ino->gid;
 		spin_unlock(&sbi->fs_lock);
 	}
-
-out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return err;
 }
@@ -569,8 +519,8 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 					 struct autofs_sb_info *sbi,
 					 struct autofs_dev_ioctl *param)
 {
-	struct nameidata nd;
-	const char *path;
+	struct path path;
+	const char *name;
 	unsigned int type;
 	unsigned int devid, magic;
 	int err = -ENOENT;
@@ -580,71 +530,46 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 		goto out;
 	}
 
-	path = param->path;
+	name = param->path;
 	type = param->ismountpoint.in.type;
 
 	param->ismountpoint.out.devid = devid = 0;
 	param->ismountpoint.out.magic = magic = 0;
 
 	if (!fp || param->ioctlfd == -1) {
-		if (autofs_type_any(type)) {
-			struct super_block *sb;
-
-			err = path_lookup(path, LOOKUP_FOLLOW, &nd);
-			if (err)
-				goto out;
-
-			sb = nd.path.dentry->d_sb;
-			devid = new_encode_dev(sb->s_dev);
-		} else {
-			struct autofs_info *ino;
-
-			err = path_lookup(path, LOOKUP_PARENT, &nd);
-			if (err)
-				goto out;
-
-			err = autofs_dev_ioctl_find_sbi_type(&nd, type);
-			if (err)
-				goto out_release;
-
-			ino = autofs4_dentry_ino(nd.path.dentry);
-			devid = autofs4_get_dev(ino->sbi);
-		}
-
+		if (autofs_type_any(type))
+			err = kern_path(name, LOOKUP_FOLLOW, &path);
+		else
+			err = find_autofs_mount(name, &path, test_by_type, &type);
+		if (err)
+			goto out;
+		devid = new_encode_dev(path.mnt->mnt_sb->s_dev);
 		err = 0;
-		if (nd.path.dentry->d_inode &&
-		    nd.path.mnt->mnt_root == nd.path.dentry) {
+		if (path.dentry->d_inode &&
+		    path.mnt->mnt_root == path.dentry) {
 			err = 1;
-			magic = nd.path.dentry->d_inode->i_sb->s_magic;
+			magic = path.dentry->d_inode->i_sb->s_magic;
 		}
 	} else {
-		dev_t dev = autofs4_get_dev(sbi);
+		dev_t dev = sbi->sb->s_dev;
 
-		err = path_lookup(path, LOOKUP_PARENT, &nd);
+		err = find_autofs_mount(name, &path, test_by_dev, &dev);
 		if (err)
 			goto out;
 
-		err = autofs_dev_ioctl_find_super(&nd, dev);
-		if (err)
-			goto out_release;
-
-		devid = dev;
+		devid = new_encode_dev(dev);
 
-		err = have_submounts(nd.path.dentry);
+		err = have_submounts(path.dentry);
 
-		if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) {
-			if (follow_down(&nd.path.mnt, &nd.path.dentry)) {
-				struct inode *inode = nd.path.dentry->d_inode;
-				magic = inode->i_sb->s_magic;
-			}
+		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
+			if (follow_down(&path.mnt, &path.dentry))
+				magic = path.mnt->mnt_sb->s_magic;
 		}
 	}
 
 	param->ismountpoint.out.devid = devid;
 	param->ismountpoint.out.magic = magic;
-
-out_release:
-	path_put(&nd.path);
+	path_put(&path);
 out:
 	return err;
 }
-- 
GitLab


From 9b4a9b14a793bc69b505ed916051f6f32db13bb8 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:44:16 -0400
Subject: [PATCH 2110/2198] Preparations to caching root in path_walk()

Split do_path_lookup(), opencode the call from do_filp_open()
do_filp_open() is the only caller of do_path_lookup() that
cares about root afterwards (it keeps resolving symlinks on
O_CREAT path after it'd done LOOKUP_PARENT walk).  So when
we start caching fs->root in path_walk(), it'll need a different
treatment.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 31 +++++++++++++++++++++----------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c82805d088e1d..895733efc6b9b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1017,9 +1017,7 @@ static int path_walk(const char *name, struct nameidata *nd)
 	return link_path_walk(name, nd);
 }
 
-/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
-static int do_path_lookup(int dfd, const char *name,
-				unsigned int flags, struct nameidata *nd)
+static int path_init(int dfd, const char *name, unsigned int flags, struct nameidata *nd)
 {
 	int retval = 0;
 	int fput_needed;
@@ -1063,17 +1061,25 @@ static int do_path_lookup(int dfd, const char *name,
 
 		fput_light(file, fput_needed);
 	}
+	return 0;
 
-	retval = path_walk(name, nd);
+fput_fail:
+	fput_light(file, fput_needed);
+out_fail:
+	return retval;
+}
+
+/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */
+static int do_path_lookup(int dfd, const char *name,
+				unsigned int flags, struct nameidata *nd)
+{
+	int retval = path_init(dfd, name, flags, nd);
+	if (!retval)
+		retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
-out_fail:
 	return retval;
-
-fput_fail:
-	fput_light(file, fput_needed);
-	goto out_fail;
 }
 
 int path_lookup(const char *name, unsigned int flags,
@@ -1676,9 +1682,14 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	/*
 	 * Create - we need to know the parent.
 	 */
-	error = do_path_lookup(dfd, pathname, LOOKUP_PARENT, &nd);
+	error = path_init(dfd, pathname, LOOKUP_PARENT, &nd);
 	if (error)
 		return ERR_PTR(error);
+	error = path_walk(pathname, &nd);
+	if (error)
+		return ERR_PTR(error);
+	if (unlikely(!audit_dummy_context()))
+		audit_inode(pathname, nd.path.dentry);
 
 	/*
 	 * We have the parent and last component. First of all, check
-- 
GitLab


From 2a737871108de9ba8930f7650d549f1383767f8b Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:49:53 -0400
Subject: [PATCH 2111/2198] Cache root in nameidata

New field: nd->root.  When pathname resolution wants to know the root,
check if nd->root.mnt is non-NULL; use nd->root if it is, otherwise
copy current->fs->root there.  After path_walk() is finished, we check
if we'd got a cached value in nd->root and drop it.  Before calling
path_walk() we should either set nd->root.mnt to NULL *or* copy (and
pin down) some path to nd->root.  In the latter case we won't be
looking at current->fs->root at all.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c            | 53 +++++++++++++++++++++++++++----------------
 include/linux/namei.h |  1 +
 2 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 895733efc6b9b..88baaf2b91677 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -552,6 +552,17 @@ static __always_inline int link_path_walk(const char *name, struct nameidata *nd
 	return result;
 }
 
+static __always_inline void set_root(struct nameidata *nd)
+{
+	if (!nd->root.mnt) {
+		struct fs_struct *fs = current->fs;
+		read_lock(&fs->lock);
+		nd->root = fs->root;
+		path_get(&nd->root);
+		read_unlock(&fs->lock);
+	}
+}
+
 static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
 {
 	int res = 0;
@@ -560,14 +571,10 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
 		goto fail;
 
 	if (*link == '/') {
-		struct fs_struct *fs = current->fs;
-
+		set_root(nd);
 		path_put(&nd->path);
-
-		read_lock(&fs->lock);
-		nd->path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
+		nd->path = nd->root;
+		path_get(&nd->root);
 	}
 
 	res = link_path_walk(link, nd);
@@ -741,19 +748,16 @@ int follow_down(struct vfsmount **mnt, struct dentry **dentry)
 
 static __always_inline void follow_dotdot(struct nameidata *nd)
 {
-	struct fs_struct *fs = current->fs;
+	set_root(nd);
 
 	while(1) {
 		struct vfsmount *parent;
 		struct dentry *old = nd->path.dentry;
 
-                read_lock(&fs->lock);
-		if (nd->path.dentry == fs->root.dentry &&
-		    nd->path.mnt == fs->root.mnt) {
-                        read_unlock(&fs->lock);
+		if (nd->path.dentry == nd->root.dentry &&
+		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
-                read_unlock(&fs->lock);
 		spin_lock(&dcache_lock);
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			nd->path.dentry = dget(nd->path.dentry->d_parent);
@@ -1022,18 +1026,18 @@ static int path_init(int dfd, const char *name, unsigned int flags, struct namei
 	int retval = 0;
 	int fput_needed;
 	struct file *file;
-	struct fs_struct *fs = current->fs;
 
 	nd->last_type = LAST_ROOT; /* if there are only slashes... */
 	nd->flags = flags;
 	nd->depth = 0;
+	nd->root.mnt = NULL;
 
 	if (*name=='/') {
-		read_lock(&fs->lock);
-		nd->path = fs->root;
-		path_get(&fs->root);
-		read_unlock(&fs->lock);
+		set_root(nd);
+		nd->path = nd->root;
+		path_get(&nd->root);
 	} else if (dfd == AT_FDCWD) {
+		struct fs_struct *fs = current->fs;
 		read_lock(&fs->lock);
 		nd->path = fs->pwd;
 		path_get(&fs->pwd);
@@ -1079,6 +1083,10 @@ static int do_path_lookup(int dfd, const char *name,
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
+	if (nd->root.mnt) {
+		path_put(&nd->root);
+		nd->root.mnt = NULL;
+	}
 	return retval;
 }
 
@@ -1115,6 +1123,7 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	nd->last_type = LAST_ROOT;
 	nd->flags = flags;
 	nd->depth = 0;
+	nd->root.mnt = NULL;
 
 	nd->path.dentry = dentry;
 	nd->path.mnt = mnt;
@@ -1125,8 +1134,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
 
-	return retval;
+	if (nd->root.mnt) {
+		path_put(&nd->root);
+		nd->root.mnt = NULL;
+	}
 
+	return retval;
 }
 
 /**
@@ -1817,6 +1830,8 @@ struct file *do_filp_open(int dfd, const char *pathname,
 	if (!IS_ERR(nd.intent.open.file))
 		release_open_intent(&nd);
 exit_parent:
+	if (nd.root.mnt)
+		path_put(&nd.root);
 	path_put(&nd.path);
 	return ERR_PTR(error);
 
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 518098fe63afd..325dd3ad39a02 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -18,6 +18,7 @@ enum { MAX_NESTED_LINKS = 8 };
 struct nameidata {
 	struct path	path;
 	struct qstr	last;
+	struct path	root;
 	unsigned int	flags;
 	int		last_type;
 	unsigned	depth;
-- 
GitLab


From 5b857119538daac7118c1364d7ff3613f12b84d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 11:53:49 -0400
Subject: [PATCH 2112/2198] Make vfs_path_lookup() use starting point as root

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 88baaf2b91677..4379ef9897092 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1123,21 +1123,20 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt,
 	nd->last_type = LAST_ROOT;
 	nd->flags = flags;
 	nd->depth = 0;
-	nd->root.mnt = NULL;
 
 	nd->path.dentry = dentry;
 	nd->path.mnt = mnt;
 	path_get(&nd->path);
+	nd->root = nd->path;
+	path_get(&nd->root);
 
 	retval = path_walk(name, nd);
 	if (unlikely(!retval && !audit_dummy_context() && nd->path.dentry &&
 				nd->path.dentry->d_inode))
 		audit_inode(name, nd->path.dentry);
 
-	if (nd->root.mnt) {
-		path_put(&nd->root);
-		nd->root.mnt = NULL;
-	}
+	path_put(&nd->root);
+	nd->root.mnt = NULL;
 
 	return retval;
 }
-- 
GitLab


From dd5cae6e9772ecc62fd374f7a8ec10eb51c96c4d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 12:21:18 -0400
Subject: [PATCH 2113/2198] Don't bother with check_mnt() in do_add_mount() on
 shrinkable ones

These guys are what we add as submounts; checks for "is that attached in
our namespace" are simply irrelevant for those and counterproductive for
use of private vfsmount trees a-la what NFS folks want.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 134d494158d9d..88a904d5aa23c 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1698,7 +1698,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	       follow_down(&path->mnt, &path->dentry))
 		;
 	err = -EINVAL;
-	if (!check_mnt(path->mnt))
+	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
 		goto unlock;
 
 	/* Refuse the same filesystem on the same mount point */
-- 
GitLab


From 55430e2ecee574e729c12d4063b3ecabfa98fa82 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:04:46 -0400
Subject: [PATCH 2114/2198] nfsd struct path use: exp_get_by_name()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/export.c | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5839b229cd0ea..3f6d51b8c3efe 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -847,9 +847,8 @@ exp_get_fsid_key(svc_client *clp, int fsid)
 	return exp_find_key(clp, FSID_NUM, fsidv, NULL);
 }
 
-static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
-				   struct dentry *dentry,
-				   struct cache_req *reqp)
+static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
+				     struct cache_req *reqp)
 {
 	struct svc_export *exp, key;
 	int err;
@@ -858,8 +857,7 @@ static svc_export *exp_get_by_name(svc_client *clp, struct vfsmount *mnt,
 		return ERR_PTR(-ENOENT);
 
 	key.ex_client = clp;
-	key.ex_path.mnt = mnt;
-	key.ex_path.dentry = dentry;
+	key.ex_path = *path;
 
 	exp = svc_export_lookup(&key);
 	if (exp == NULL)
@@ -877,20 +875,19 @@ static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt,
 				     struct dentry *dentry,
 				     struct cache_req *reqp)
 {
+	struct path path = {.mnt = mnt, .dentry = dentry};
 	svc_export *exp;
 
-	dget(dentry);
-	exp = exp_get_by_name(clp, mnt, dentry, reqp);
-
-	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
-		struct dentry *parent;
+	dget(path.dentry);
+	exp = exp_get_by_name(clp, &path, reqp);
 
-		parent = dget_parent(dentry);
-		dput(dentry);
-		dentry = parent;
-		exp = exp_get_by_name(clp, mnt, dentry, reqp);
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path.dentry)) {
+		struct dentry *parent = dget_parent(path.dentry);
+		dput(path.dentry);
+		path.dentry = parent;
+		exp = exp_get_by_name(clp, &path, reqp);
 	}
-	dput(dentry);
+	dput(path.dentry);
 	return exp;
 }
 
@@ -1018,7 +1015,7 @@ exp_export(struct nfsctl_export *nxp)
 		goto out_put_clp;
 	err = -EINVAL;
 
-	exp = exp_get_by_name(clp, path.mnt, path.dentry, NULL);
+	exp = exp_get_by_name(clp, &path, NULL);
 
 	memset(&new, 0, sizeof(new));
 
@@ -1135,7 +1132,7 @@ exp_unexport(struct nfsctl_export *nxp)
 		goto out_domain;
 
 	err = -EINVAL;
-	exp = exp_get_by_name(dom, path.mnt, path.dentry, NULL);
+	exp = exp_get_by_name(dom, &path, NULL);
 	path_put(&path);
 	if (IS_ERR(exp))
 		goto out_domain;
@@ -1207,7 +1204,7 @@ static struct svc_export *exp_find(struct auth_domain *clp, int fsid_type,
 	if (IS_ERR(ek))
 		return ERR_CAST(ek);
 
-	exp = exp_get_by_name(clp, ek->ek_path.mnt, ek->ek_path.dentry, reqp);
+	exp = exp_get_by_name(clp, &ek->ek_path, reqp);
 	cache_put(&ek->h, &svc_expkey_cache);
 
 	if (IS_ERR(exp))
@@ -1251,12 +1248,13 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		struct dentry *dentry)
 {
 	struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
+	struct path path = {.mnt = mnt, .dentry = dentry};
 
 	if (rqstp->rq_client == NULL)
 		goto gss;
 
 	/* First try the auth_unix client: */
-	exp = exp_get_by_name(rqstp->rq_client, mnt, dentry,
+	exp = exp_get_by_name(rqstp->rq_client, &path,
 						&rqstp->rq_chandle);
 	if (PTR_ERR(exp) == -ENOENT)
 		goto gss;
@@ -1269,7 +1267,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
 	/* Otherwise, try falling back on gss client */
 	if (rqstp->rq_gssclient == NULL)
 		return exp;
-	gssexp = exp_get_by_name(rqstp->rq_gssclient, mnt, dentry,
+	gssexp = exp_get_by_name(rqstp->rq_gssclient, &path,
 						&rqstp->rq_chandle);
 	if (PTR_ERR(gssexp) == -ENOENT)
 		return exp;
-- 
GitLab


From 5bf3bd2b5cb68ba43c91f5bd0ac043543fba2558 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:14:32 -0400
Subject: [PATCH 2115/2198] switch exp_parent() to struct path

... and lose the always-NULL last argument (non-NULL case had been
split off a while ago).

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/export.c | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 3f6d51b8c3efe..5149dabde5551 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -871,23 +871,19 @@ static svc_export *exp_get_by_name(svc_client *clp, const struct path *path,
 /*
  * Find the export entry for a given dentry.
  */
-static struct svc_export *exp_parent(svc_client *clp, struct vfsmount *mnt,
-				     struct dentry *dentry,
-				     struct cache_req *reqp)
+static struct svc_export *exp_parent(svc_client *clp, struct path *path)
 {
-	struct path path = {.mnt = mnt, .dentry = dentry};
-	svc_export *exp;
-
-	dget(path.dentry);
-	exp = exp_get_by_name(clp, &path, reqp);
-
-	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path.dentry)) {
-		struct dentry *parent = dget_parent(path.dentry);
-		dput(path.dentry);
-		path.dentry = parent;
-		exp = exp_get_by_name(clp, &path, reqp);
+	struct dentry *saved = dget(path->dentry);
+	svc_export *exp = exp_get_by_name(clp, path, NULL);
+
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+		struct dentry *parent = dget_parent(path->dentry);
+		dput(path->dentry);
+		path->dentry = parent;
+		exp = exp_get_by_name(clp, path, NULL);
 	}
-	dput(path.dentry);
+	dput(path->dentry);
+	path->dentry = saved;
 	return exp;
 }
 
@@ -1174,7 +1170,7 @@ exp_rootfh(svc_client *clp, char *name, struct knfsd_fh *f, int maxsize)
 	dprintk("nfsd: exp_rootfh(%s [%p] %s:%s/%ld)\n",
 		 name, path.dentry, clp->name,
 		 inode->i_sb->s_id, inode->i_ino);
-	exp = exp_parent(clp, path.mnt, path.dentry, NULL);
+	exp = exp_parent(clp, &path);
 	if (IS_ERR(exp)) {
 		err = PTR_ERR(exp);
 		goto out;
-- 
GitLab


From 91c9fa8f75877c0c1e455c23e8f8206c91c8f77f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 02:42:05 -0400
Subject: [PATCH 2116/2198] switch rqst_exp_get_by_name()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/export.c            | 15 ++++++---------
 fs/nfsd/vfs.c               | 32 ++++++++++++++++----------------
 include/linux/nfsd/export.h |  3 +--
 3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 5149dabde5551..84f5e5cb08637 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1240,18 +1240,15 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp)
  * use exp_get_by_name() or exp_find().
  */
 struct svc_export *
-rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
-		struct dentry *dentry)
+rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path)
 {
 	struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT);
-	struct path path = {.mnt = mnt, .dentry = dentry};
 
 	if (rqstp->rq_client == NULL)
 		goto gss;
 
 	/* First try the auth_unix client: */
-	exp = exp_get_by_name(rqstp->rq_client, &path,
-						&rqstp->rq_chandle);
+	exp = exp_get_by_name(rqstp->rq_client, path, &rqstp->rq_chandle);
 	if (PTR_ERR(exp) == -ENOENT)
 		goto gss;
 	if (IS_ERR(exp))
@@ -1263,8 +1260,7 @@ rqst_exp_get_by_name(struct svc_rqst *rqstp, struct vfsmount *mnt,
 	/* Otherwise, try falling back on gss client */
 	if (rqstp->rq_gssclient == NULL)
 		return exp;
-	gssexp = exp_get_by_name(rqstp->rq_gssclient, &path,
-						&rqstp->rq_chandle);
+	gssexp = exp_get_by_name(rqstp->rq_gssclient, path, &rqstp->rq_chandle);
 	if (PTR_ERR(gssexp) == -ENOENT)
 		return exp;
 	if (!IS_ERR(exp))
@@ -1307,9 +1303,10 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		struct dentry *dentry)
 {
 	struct svc_export *exp;
+	struct path path = {.mnt = mnt, .dentry = dentry};
 
 	dget(dentry);
-	exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+	exp = rqst_exp_get_by_name(rqstp, &path);
 
 	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
 		struct dentry *parent;
@@ -1317,7 +1314,7 @@ rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
 		parent = dget_parent(dentry);
 		dput(dentry);
 		dentry = parent;
-		exp = rqst_exp_get_by_name(rqstp, mnt, dentry);
+		exp = rqst_exp_get_by_name(rqstp, &path);
 	}
 	dput(dentry);
 	return exp;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index bd584bcf1d9f9..d84c4eaa526b5 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -101,36 +101,36 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 {
 	struct svc_export *exp = *expp, *exp2 = NULL;
 	struct dentry *dentry = *dpp;
-	struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-	struct dentry *mounts = dget(dentry);
+	struct path path = {.mnt = mntget(exp->ex_path.mnt),
+			    .dentry = dget(dentry)};
 	int err = 0;
 
-	while (follow_down(&mnt,&mounts)&&d_mountpoint(mounts));
+	while (follow_down(&path.mnt, &path.dentry) &&
+	       d_mountpoint(path.dentry))
+		;
 
-	exp2 = rqst_exp_get_by_name(rqstp, mnt, mounts);
+	exp2 = rqst_exp_get_by_name(rqstp, &path);
 	if (IS_ERR(exp2)) {
 		if (PTR_ERR(exp2) != -ENOENT)
 			err = PTR_ERR(exp2);
-		dput(mounts);
-		mntput(mnt);
+		path_put(&path);
 		goto out;
 	}
 	if ((exp->ex_flags & NFSEXP_CROSSMOUNT) || EX_NOHIDE(exp2)) {
 		/* successfully crossed mount point */
 		/*
-		 * This is subtle: dentry is *not* under mnt at this point.
-		 * The only reason we are safe is that original mnt is pinned
-		 * down by exp, so we should dput before putting exp.
+		 * This is subtle: path.dentry is *not* on path.mnt
+		 * at this point.  The only reason we are safe is that
+		 * original mnt is pinned down by exp, so we should
+		 * put path *before* putting exp
 		 */
-		dput(dentry);
-		*dpp = mounts;
-		exp_put(exp);
+		*dpp = path.dentry;
+		path.dentry = dentry;
 		*expp = exp2;
-	} else {
-		exp_put(exp2);
-		dput(mounts);
+		exp2 = exp;
 	}
-	mntput(mnt);
+	path_put(&path);
+	exp_put(exp2);
 out:
 	return err;
 }
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index bcd0201589f83..98f6fd584d537 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -125,8 +125,7 @@ void			nfsd_export_flush(void);
 void			exp_readlock(void);
 void			exp_readunlock(void);
 struct svc_export *	rqst_exp_get_by_name(struct svc_rqst *,
-					     struct vfsmount *,
-					     struct dentry *);
+					     struct path *);
 struct svc_export *	rqst_exp_parent(struct svc_rqst *,
 					struct vfsmount *mnt,
 					struct dentry *dentry);
-- 
GitLab


From e64c390ca0b60fd2119331ef1fa888d7ea27e424 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:00:46 -0400
Subject: [PATCH 2117/2198] switch rqst_exp_parent()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nfsd/export.c            | 25 ++++++++++---------------
 fs/nfsd/vfs.c               | 23 ++++++++++++-----------
 include/linux/nfsd/export.h |  3 +--
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 84f5e5cb08637..8b1f8efb4690e 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1299,24 +1299,19 @@ rqst_exp_find(struct svc_rqst *rqstp, int fsid_type, u32 *fsidv)
 }
 
 struct svc_export *
-rqst_exp_parent(struct svc_rqst *rqstp, struct vfsmount *mnt,
-		struct dentry *dentry)
+rqst_exp_parent(struct svc_rqst *rqstp, struct path *path)
 {
-	struct svc_export *exp;
-	struct path path = {.mnt = mnt, .dentry = dentry};
-
-	dget(dentry);
-	exp = rqst_exp_get_by_name(rqstp, &path);
-
-	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(dentry)) {
-		struct dentry *parent;
+	struct dentry *saved = dget(path->dentry);
+	struct svc_export *exp = rqst_exp_get_by_name(rqstp, path);
 
-		parent = dget_parent(dentry);
-		dput(dentry);
-		dentry = parent;
-		exp = rqst_exp_get_by_name(rqstp, &path);
+	while (PTR_ERR(exp) == -ENOENT && !IS_ROOT(path->dentry)) {
+		struct dentry *parent = dget_parent(path->dentry);
+		dput(path->dentry);
+		path->dentry = parent;
+		exp = rqst_exp_get_by_name(rqstp, path);
 	}
-	dput(dentry);
+	dput(path->dentry);
+	path->dentry = saved;
 	return exp;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d84c4eaa526b5..9f1ea3127f5d4 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -169,28 +169,29 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			/* checking mountpoint crossing is very different when stepping up */
 			struct svc_export *exp2 = NULL;
 			struct dentry *dp;
-			struct vfsmount *mnt = mntget(exp->ex_path.mnt);
-			dentry = dget(dparent);
-			while(dentry == mnt->mnt_root && follow_up(&mnt, &dentry))
+			struct path path = {.mnt = mntget(exp->ex_path.mnt),
+					    .dentry = dget(dparent)};
+
+			while (path.dentry == path.mnt->mnt_root &&
+			       follow_up(&path.mnt, &path.dentry))
 				;
-			dp = dget_parent(dentry);
-			dput(dentry);
-			dentry = dp;
+			dp = dget_parent(path.dentry);
+			dput(path.dentry);
+			path.dentry = dp;
 
-			exp2 = rqst_exp_parent(rqstp, mnt, dentry);
+			exp2 = rqst_exp_parent(rqstp, &path);
 			if (PTR_ERR(exp2) == -ENOENT) {
-				dput(dentry);
 				dentry = dget(dparent);
 			} else if (IS_ERR(exp2)) {
 				host_err = PTR_ERR(exp2);
-				dput(dentry);
-				mntput(mnt);
+				path_put(&path);
 				goto out_nfserr;
 			} else {
+				dentry = dget(path.dentry);
 				exp_put(exp);
 				exp = exp2;
 			}
-			mntput(mnt);
+			path_put(&path);
 		}
 	} else {
 		fh_lock(fhp);
diff --git a/include/linux/nfsd/export.h b/include/linux/nfsd/export.h
index 98f6fd584d537..a6d9ef2bb34a9 100644
--- a/include/linux/nfsd/export.h
+++ b/include/linux/nfsd/export.h
@@ -127,8 +127,7 @@ void			exp_readunlock(void);
 struct svc_export *	rqst_exp_get_by_name(struct svc_rqst *,
 					     struct path *);
 struct svc_export *	rqst_exp_parent(struct svc_rqst *,
-					struct vfsmount *mnt,
-					struct dentry *dentry);
+					struct path *);
 int			exp_rootfh(struct auth_domain *, 
 					char *path, struct knfsd_fh *, int maxsize);
 __be32			exp_pseudoroot(struct svc_rqst *, struct svc_fh *);
-- 
GitLab


From bab77ebf51e3902f608ecf08c9d34a0a52ac35a9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:26:48 -0400
Subject: [PATCH 2118/2198] switch follow_up() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/autofs4/dev-ioctl.c |  2 +-
 fs/namei.c             | 16 ++++++++--------
 fs/nfsd/vfs.c          |  2 +-
 include/linux/namei.h  |  2 +-
 4 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index f71dac9986f0b..670407576b255 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -212,7 +212,7 @@ static int find_autofs_mount(const char *pathname,
 				err = 0;
 			}
 		}
-		if (!follow_up(&path.mnt, &path.dentry))
+		if (!follow_up(&path))
 			break;
 	}
 	path_put(&path);
diff --git a/fs/namei.c b/fs/namei.c
index 4379ef9897092..8c1f48ae68e7c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -675,23 +675,23 @@ static inline int do_follow_link(struct path *path, struct nameidata *nd)
 	return err;
 }
 
-int follow_up(struct vfsmount **mnt, struct dentry **dentry)
+int follow_up(struct path *path)
 {
 	struct vfsmount *parent;
 	struct dentry *mountpoint;
 	spin_lock(&vfsmount_lock);
-	parent=(*mnt)->mnt_parent;
-	if (parent == *mnt) {
+	parent = path->mnt->mnt_parent;
+	if (parent == path->mnt) {
 		spin_unlock(&vfsmount_lock);
 		return 0;
 	}
 	mntget(parent);
-	mountpoint=dget((*mnt)->mnt_mountpoint);
+	mountpoint = dget(path->mnt->mnt_mountpoint);
 	spin_unlock(&vfsmount_lock);
-	dput(*dentry);
-	*dentry = mountpoint;
-	mntput(*mnt);
-	*mnt = parent;
+	dput(path->dentry);
+	path->dentry = mountpoint;
+	mntput(path->mnt);
+	path->mnt = parent;
 	return 1;
 }
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9f1ea3127f5d4..7b2b3f775326c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -173,7 +173,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 					    .dentry = dget(dparent)};
 
 			while (path.dentry == path.mnt->mnt_root &&
-			       follow_up(&path.mnt, &path.dentry))
+			       follow_up(&path))
 				;
 			dp = dget_parent(path.dentry);
 			dput(path.dentry);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 325dd3ad39a02..9cd5a717be3b2 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -79,7 +79,7 @@ extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_noperm(const char *, struct dentry *);
 
 extern int follow_down(struct vfsmount **, struct dentry **);
-extern int follow_up(struct vfsmount **, struct dentry **);
+extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
 extern void unlock_rename(struct dentry *, struct dentry *);
-- 
GitLab


From 589ff870ed60a9ebdd5ec99ec3f5afe1282fe151 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 03:28:19 -0400
Subject: [PATCH 2119/2198] Switch collect_mounts() to struct path

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c      | 4 ++--
 include/linux/fs.h  | 2 +-
 kernel/audit_tree.c | 6 +++---
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 88a904d5aa23c..c85962206aad6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1253,11 +1253,11 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry,
 	return NULL;
 }
 
-struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *collect_mounts(struct path *path)
 {
 	struct vfsmount *tree;
 	down_write(&namespace_sem);
-	tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
+	tree = copy_tree(path->mnt, path->dentry, CL_COPY_ALL | CL_PRIVATE);
 	up_write(&namespace_sem);
 	return tree;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 323b5ce474c15..03fb2102b8f3a 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1800,7 +1800,7 @@ extern struct vfsmount *kern_mount_data(struct file_system_type *, void *data);
 extern int may_umount_tree(struct vfsmount *);
 extern int may_umount(struct vfsmount *);
 extern long do_mount(char *, char *, char *, unsigned long, void *);
-extern struct vfsmount *collect_mounts(struct vfsmount *, struct dentry *);
+extern struct vfsmount *collect_mounts(struct path *);
 extern void drop_collected_mounts(struct vfsmount *);
 
 extern int vfs_statfs(struct dentry *, struct kstatfs *);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 6e7351739a820..1f6396d766874 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -568,7 +568,7 @@ void audit_trim_trees(void)
 		if (err)
 			goto skip_it;
 
-		root_mnt = collect_mounts(path.mnt, path.dentry);
+		root_mnt = collect_mounts(&path);
 		path_put(&path);
 		if (!root_mnt)
 			goto skip_it;
@@ -660,7 +660,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
 	err = kern_path(tree->pathname, 0, &path);
 	if (err)
 		goto Err;
-	mnt = collect_mounts(path.mnt, path.dentry);
+	mnt = collect_mounts(&path);
 	path_put(&path);
 	if (!mnt) {
 		err = -ENOMEM;
@@ -720,7 +720,7 @@ int audit_tag_tree(char *old, char *new)
 	err = kern_path(new, 0, &path);
 	if (err)
 		return err;
-	tagged = collect_mounts(path.mnt, path.dentry);
+	tagged = collect_mounts(&path);
 	path_put(&path);
 	if (!tagged)
 		return -ENOMEM;
-- 
GitLab


From 9393bd07cf218ca51d0e627653f906a9d76a9131 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 13:58:15 -0400
Subject: [PATCH 2120/2198] switch follow_down()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/afs/mntpt.c         |  2 +-
 fs/autofs/dirhash.c    |  5 ++---
 fs/autofs4/autofs_i.h  |  6 +++---
 fs/autofs4/dev-ioctl.c |  2 +-
 fs/autofs4/expire.c    | 15 +++++++--------
 fs/autofs4/root.c      |  7 +++----
 fs/cifs/cifs_dfs_ref.c |  2 +-
 fs/namei.c             | 12 ++++++------
 fs/namespace.c         |  4 ++--
 fs/nfs/namespace.c     |  2 +-
 fs/nfsd/vfs.c          |  3 +--
 include/linux/namei.h  |  2 +-
 12 files changed, 29 insertions(+), 33 deletions(-)

diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 2b9e2d03a3902..c52be53f69462 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -244,7 +244,7 @@ static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
 		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path.mnt, &nd->path.dentry))
+		       follow_down(&nd->path))
 			;
 		err = 0;
 	default:
diff --git a/fs/autofs/dirhash.c b/fs/autofs/dirhash.c
index 4eb4d8dfb2f18..2316e944a109d 100644
--- a/fs/autofs/dirhash.c
+++ b/fs/autofs/dirhash.c
@@ -85,13 +85,12 @@ struct autofs_dir_ent *autofs_expire(struct super_block *sb,
 		}
 		path.mnt = mnt;
 		path_get(&path);
-		if (!follow_down(&path.mnt, &path.dentry)) {
+		if (!follow_down(&path)) {
 			path_put(&path);
 			DPRINTK(("autofs: not expirable (not a mounted directory): %s\n", ent->name));
 			continue;
 		}
-		while (d_mountpoint(path.dentry) &&
-		       follow_down(&path.mnt, &path.dentry))
+		while (d_mountpoint(path.dentry) && follow_down(&path));
 			;
 		umount_ok = may_umount(path.mnt);
 		path_put(&path);
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index b7ff33c631015..8f7cdde417335 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -223,12 +223,12 @@ int autofs4_wait(struct autofs_sb_info *,struct dentry *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *,autofs_wqt_t,int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
 
-static inline int autofs4_follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static inline int autofs4_follow_mount(struct path *path)
 {
 	int res = 0;
 
-	while (d_mountpoint(*dentry)) {
-		int followed = follow_down(mnt, dentry);
+	while (d_mountpoint(path->dentry)) {
+		int followed = follow_down(path);
 		if (!followed)
 			break;
 		res = 1;
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 670407576b255..f3da2eb51f56a 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -562,7 +562,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 		err = have_submounts(path.dentry);
 
 		if (path.mnt->mnt_mountpoint != path.mnt->mnt_root) {
-			if (follow_down(&path.mnt, &path.dentry))
+			if (follow_down(&path))
 				magic = path.mnt->mnt_sb->s_magic;
 		}
 	}
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 3077d8f165230..aa39ae83f0193 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -48,19 +48,19 @@ static inline int autofs4_can_expire(struct dentry *dentry,
 static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 {
 	struct dentry *top = dentry;
+	struct path path = {.mnt = mnt, .dentry = dentry};
 	int status = 1;
 
 	DPRINTK("dentry %p %.*s",
 		dentry, (int)dentry->d_name.len, dentry->d_name.name);
 
-	mntget(mnt);
-	dget(dentry);
+	path_get(&path);
 
-	if (!follow_down(&mnt, &dentry))
+	if (!follow_down(&path))
 		goto done;
 
-	if (is_autofs4_dentry(dentry)) {
-		struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	if (is_autofs4_dentry(path.dentry)) {
+		struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb);
 
 		/* This is an autofs submount, we can't expire it */
 		if (autofs_type_indirect(sbi->type))
@@ -70,7 +70,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 		 * Otherwise it's an offset mount and we need to check
 		 * if we can umount its mount, if there is one.
 		 */
-		if (!d_mountpoint(dentry)) {
+		if (!d_mountpoint(path.dentry)) {
 			status = 0;
 			goto done;
 		}
@@ -86,8 +86,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	status = 0;
 done:
 	DPRINTK("returning = %d", status);
-	dput(dentry);
-	mntput(mnt);
+	path_put(&path);
 	return status;
 }
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e383bf0334f17..b96a3c57359d7 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -181,7 +181,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		nd->flags);
 	/*
 	 * For an expire of a covered direct or offset mount we need
-	 * to beeak out of follow_down() at the autofs mount trigger
+	 * to break out of follow_down() at the autofs mount trigger
 	 * (d_mounted--), so we can see the expiring flag, and manage
 	 * the blocking and following here until the expire is completed.
 	 */
@@ -190,7 +190,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 		if (ino->flags & AUTOFS_INF_EXPIRING) {
 			spin_unlock(&sbi->fs_lock);
 			/* Follow down to our covering mount. */
-			if (!follow_down(&nd->path.mnt, &nd->path.dentry))
+			if (!follow_down(&nd->path))
 				goto done;
 			goto follow;
 		}
@@ -230,8 +230,7 @@ static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd)
 	 * to follow it.
 	 */
 	if (d_mountpoint(dentry)) {
-		if (!autofs4_follow_mount(&nd->path.mnt,
-					  &nd->path.dentry)) {
+		if (!autofs4_follow_mount(&nd->path)) {
 			status = -ENOENT;
 			goto out_error;
 		}
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 83d62759c7c7a..3bb11be8b6a82 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -275,7 +275,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd,
 	case -EBUSY:
 		/* someone else made a mount here whilst we were busy */
 		while (d_mountpoint(nd->path.dentry) &&
-		       follow_down(&nd->path.mnt, &nd->path.dentry))
+		       follow_down(&nd->path))
 			;
 		err = 0;
 	default:
diff --git a/fs/namei.c b/fs/namei.c
index 8c1f48ae68e7c..4d49a3eee6d43 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -731,16 +731,16 @@ static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
 /* no need for dcache_lock, as serialization is taken care in
  * namespace.c
  */
-int follow_down(struct vfsmount **mnt, struct dentry **dentry)
+int follow_down(struct path *path)
 {
 	struct vfsmount *mounted;
 
-	mounted = lookup_mnt(*mnt, *dentry);
+	mounted = lookup_mnt(path->mnt, path->dentry);
 	if (mounted) {
-		dput(*dentry);
-		mntput(*mnt);
-		*mnt = mounted;
-		*dentry = dget(mounted->mnt_root);
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
 		return 1;
 	}
 	return 0;
diff --git a/fs/namespace.c b/fs/namespace.c
index c85962206aad6..ba5237be1cf98 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1601,7 +1601,7 @@ static int do_move_mount(struct path *path, char *old_name)
 
 	down_write(&namespace_sem);
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
 	if (!check_mnt(path->mnt) || !check_mnt(old_path.mnt))
@@ -1695,7 +1695,7 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path,
 	down_write(&namespace_sem);
 	/* Something was mounted here while we slept */
 	while (d_mountpoint(path->dentry) &&
-	       follow_down(&path->mnt, &path->dentry))
+	       follow_down(path))
 		;
 	err = -EINVAL;
 	if (!(mnt_flags & MNT_SHRINKABLE) && !check_mnt(path->mnt))
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 64a288ee046d0..f01caec844634 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -154,7 +154,7 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd)
 	goto out;
 out_follow:
 	while (d_mountpoint(nd->path.dentry) &&
-	       follow_down(&nd->path.mnt, &nd->path.dentry))
+	       follow_down(&nd->path))
 		;
 	err = 0;
 	goto out;
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7b2b3f775326c..99f8357535969 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -105,8 +105,7 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp,
 			    .dentry = dget(dentry)};
 	int err = 0;
 
-	while (follow_down(&path.mnt, &path.dentry) &&
-	       d_mountpoint(path.dentry))
+	while (d_mountpoint(path.dentry) && follow_down(&path))
 		;
 
 	exp2 = rqst_exp_get_by_name(rqstp, &path);
diff --git a/include/linux/namei.h b/include/linux/namei.h
index 9cd5a717be3b2..d870ae2faedc7 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -78,7 +78,7 @@ extern void release_open_intent(struct nameidata *);
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
 extern struct dentry *lookup_one_noperm(const char *, struct dentry *);
 
-extern int follow_down(struct vfsmount **, struct dentry **);
+extern int follow_down(struct path *);
 extern int follow_up(struct path *);
 
 extern struct dentry *lock_rename(struct dentry *, struct dentry *);
-- 
GitLab


From 79ed0226198c628133530b179a90dbf42b1c2eba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 13:59:41 -0400
Subject: [PATCH 2121/2198] switch follow_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index 4d49a3eee6d43..c006bc61d1ea5 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -715,16 +715,16 @@ static int __follow_mount(struct path *path)
 	return res;
 }
 
-static void follow_mount(struct vfsmount **mnt, struct dentry **dentry)
+static void follow_mount(struct path *path)
 {
-	while (d_mountpoint(*dentry)) {
-		struct vfsmount *mounted = lookup_mnt(*mnt, *dentry);
+	while (d_mountpoint(path->dentry)) {
+		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
 		if (!mounted)
 			break;
-		dput(*dentry);
-		mntput(*mnt);
-		*mnt = mounted;
-		*dentry = dget(mounted->mnt_root);
+		dput(path->dentry);
+		mntput(path->mnt);
+		path->mnt = mounted;
+		path->dentry = dget(mounted->mnt_root);
 	}
 }
 
@@ -779,7 +779,7 @@ static __always_inline void follow_dotdot(struct nameidata *nd)
 		mntput(nd->path.mnt);
 		nd->path.mnt = parent;
 	}
-	follow_mount(&nd->path.mnt, &nd->path.dentry);
+	follow_mount(&nd->path);
 }
 
 /*
-- 
GitLab


From 1c755af4df75996b0dd4b7e6cacaf9d57a6ef2ef Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 18 Apr 2009 14:06:57 -0400
Subject: [PATCH 2122/2198] switch lookup_mnt()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c             | 6 +++---
 fs/namespace.c         | 4 ++--
 include/linux/dcache.h | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/namei.c b/fs/namei.c
index c006bc61d1ea5..527119afb6a5c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -702,7 +702,7 @@ static int __follow_mount(struct path *path)
 {
 	int res = 0;
 	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
+		struct vfsmount *mounted = lookup_mnt(path);
 		if (!mounted)
 			break;
 		dput(path->dentry);
@@ -718,7 +718,7 @@ static int __follow_mount(struct path *path)
 static void follow_mount(struct path *path)
 {
 	while (d_mountpoint(path->dentry)) {
-		struct vfsmount *mounted = lookup_mnt(path->mnt, path->dentry);
+		struct vfsmount *mounted = lookup_mnt(path);
 		if (!mounted)
 			break;
 		dput(path->dentry);
@@ -735,7 +735,7 @@ int follow_down(struct path *path)
 {
 	struct vfsmount *mounted;
 
-	mounted = lookup_mnt(path->mnt, path->dentry);
+	mounted = lookup_mnt(path);
 	if (mounted) {
 		dput(path->dentry);
 		mntput(path->mnt);
diff --git a/fs/namespace.c b/fs/namespace.c
index ba5237be1cf98..b94ad3d685ff4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -442,11 +442,11 @@ struct vfsmount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry,
  * lookup_mnt increments the ref count before returning
  * the vfsmount struct.
  */
-struct vfsmount *lookup_mnt(struct vfsmount *mnt, struct dentry *dentry)
+struct vfsmount *lookup_mnt(struct path *path)
 {
 	struct vfsmount *child_mnt;
 	spin_lock(&vfsmount_lock);
-	if ((child_mnt = __lookup_mnt(mnt, dentry, 1)))
+	if ((child_mnt = __lookup_mnt(path->mnt, path->dentry, 1)))
 		mntget(child_mnt);
 	spin_unlock(&vfsmount_lock);
 	return child_mnt;
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 97978004338db..72ce2ae88591e 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -370,7 +370,7 @@ static inline int d_mountpoint(struct dentry *dentry)
 	return dentry->d_mounted;
 }
 
-extern struct vfsmount *lookup_mnt(struct vfsmount *, struct dentry *);
+extern struct vfsmount *lookup_mnt(struct path *);
 extern struct dentry *lookup_create(struct nameidata *nd, int is_dir);
 
 extern int sysctl_vfs_cache_pressure;
-- 
GitLab


From 3174c21b74b56c6a53fddd41a30fd6f757a32bd0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 7 Apr 2009 13:19:18 -0400
Subject: [PATCH 2123/2198] Move junk from proc_fs.h to fs/proc/internal.h

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/proc/internal.h      | 25 +++++++++++++++++++++++++
 fs/proc/proc_devtree.c  |  1 +
 include/linux/proc_fs.h | 24 ------------------------
 3 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index f6db9618a8889..753ca37002c8c 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -92,3 +92,28 @@ struct pde_opener {
 	struct list_head lh;
 };
 void pde_users_dec(struct proc_dir_entry *pde);
+
+extern spinlock_t proc_subdir_lock;
+
+struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
+int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
+unsigned long task_vsize(struct mm_struct *);
+int task_statm(struct mm_struct *, int *, int *, int *, int *);
+void task_mem(struct seq_file *, struct mm_struct *);
+
+struct proc_dir_entry *de_get(struct proc_dir_entry *de);
+void de_put(struct proc_dir_entry *de);
+
+extern struct vfsmount *proc_mnt;
+int proc_fill_super(struct super_block *);
+struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
+
+/*
+ * These are generic /proc routines that use the internal
+ * "struct proc_dir_entry" tree to traverse the filesystem.
+ *
+ * The /proc root directory has extended versions to take care
+ * of the /proc/<pid> subdirectories.
+ */
+int proc_readdir(struct file *, void *, filldir_t);
+struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c
index de2bba5a3440a..fc6c3025befdd 100644
--- a/fs/proc/proc_devtree.c
+++ b/fs/proc/proc_devtree.c
@@ -11,6 +11,7 @@
 #include <linux/string.h>
 #include <asm/prom.h>
 #include <asm/uaccess.h>
+#include "internal.h"
 
 #ifndef HAVE_ARCH_DEVTREE_FIXUPS
 static inline void set_node_proc_entry(struct device_node *np,
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index fbfa3d44d33d8..e6e77d31c4188 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -93,20 +93,9 @@ struct vmcore {
 
 #ifdef CONFIG_PROC_FS
 
-extern spinlock_t proc_subdir_lock;
-
 extern void proc_root_init(void);
 
 void proc_flush_task(struct task_struct *task);
-struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *);
-int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir);
-unsigned long task_vsize(struct mm_struct *);
-int task_statm(struct mm_struct *, int *, int *, int *, int *);
-void task_mem(struct seq_file *, struct mm_struct *);
-void clear_refs_smap(struct mm_struct *mm);
-
-struct proc_dir_entry *de_get(struct proc_dir_entry *de);
-void de_put(struct proc_dir_entry *de);
 
 extern struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
 						struct proc_dir_entry *parent);
@@ -116,20 +105,7 @@ struct proc_dir_entry *proc_create_data(const char *name, mode_t mode,
 				void *data);
 extern void remove_proc_entry(const char *name, struct proc_dir_entry *parent);
 
-extern struct vfsmount *proc_mnt;
 struct pid_namespace;
-extern int proc_fill_super(struct super_block *);
-extern struct inode *proc_get_inode(struct super_block *, unsigned int, struct proc_dir_entry *);
-
-/*
- * These are generic /proc routines that use the internal
- * "struct proc_dir_entry" tree to traverse the filesystem.
- *
- * The /proc root directory has extended versions to take care
- * of the /proc/<pid> subdirectories.
- */
-extern int proc_readdir(struct file *, void *, filldir_t);
-extern struct dentry *proc_lookup(struct inode *, struct dentry *, struct nameidata *);
 
 extern int pid_ns_prepare_proc(struct pid_namespace *ns);
 extern void pid_ns_release_proc(struct pid_namespace *ns);
-- 
GitLab


From d3ef3d7351ccfbef3e5d926efc5ee332136f40d4 Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Sun, 26 Apr 2009 20:25:54 +1000
Subject: [PATCH 2124/2198] fs: mnt_want_write speedup

This patch speeds up lmbench lat_mmap test by about 8%. lat_mmap is set up
basically to mmap a 64MB file on tmpfs, fault in its pages, then unmap it.
A microbenchmark yes, but it exercises some important paths in the mm.

Before:
 avg = 501.9
 std = 14.7773

After:
 avg = 462.286
 std = 5.46106

(50 runs of each, stddev gives a reasonable confidence, but there is quite
a bit of variation there still)

It does this by removing the complex per-cpu locking and counter-cache and
replaces it with a percpu counter in struct vfsmount. This makes the code
much simpler, and avoids spinlocks (although the msync is still pretty
costly, unfortunately). It results in about 900 bytes smaller code too. It
does increase the size of a vfsmount, however.

It should also give a speedup on large systems if CPUs are frequently operating
on different mounts (because the existing scheme has to operate on an atomic in
the struct vfsmount when switching between mounts). But I'm most interested in
the single threaded path performance for the moment.

[AV: minor cleanup]

Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 268 ++++++++++++++----------------------------
 include/linux/mount.h |  21 +++-
 2 files changed, 106 insertions(+), 183 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index b94ad3d685ff4..22ae06ad751d9 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -131,10 +131,20 @@ struct vfsmount *alloc_vfsmnt(const char *name)
 		INIT_LIST_HEAD(&mnt->mnt_share);
 		INIT_LIST_HEAD(&mnt->mnt_slave_list);
 		INIT_LIST_HEAD(&mnt->mnt_slave);
-		atomic_set(&mnt->__mnt_writers, 0);
+#ifdef CONFIG_SMP
+		mnt->mnt_writers = alloc_percpu(int);
+		if (!mnt->mnt_writers)
+			goto out_free_devname;
+#else
+		mnt->mnt_writers = 0;
+#endif
 	}
 	return mnt;
 
+#ifdef CONFIG_SMP
+out_free_devname:
+	kfree(mnt->mnt_devname);
+#endif
 out_free_id:
 	mnt_free_id(mnt);
 out_free_cache:
@@ -171,65 +181,38 @@ int __mnt_is_readonly(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(__mnt_is_readonly);
 
-struct mnt_writer {
-	/*
-	 * If holding multiple instances of this lock, they
-	 * must be ordered by cpu number.
-	 */
-	spinlock_t lock;
-	struct lock_class_key lock_class; /* compiles out with !lockdep */
-	unsigned long count;
-	struct vfsmount *mnt;
-} ____cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
+static inline void inc_mnt_writers(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))++;
+#else
+	mnt->mnt_writers++;
+#endif
+}
 
-static int __init init_mnt_writers(void)
+static inline void dec_mnt_writers(struct vfsmount *mnt)
 {
-	int cpu;
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
-		spin_lock_init(&writer->lock);
-		lockdep_set_class(&writer->lock, &writer->lock_class);
-		writer->count = 0;
-	}
-	return 0;
+#ifdef CONFIG_SMP
+	(*per_cpu_ptr(mnt->mnt_writers, smp_processor_id()))--;
+#else
+	mnt->mnt_writers--;
+#endif
 }
-fs_initcall(init_mnt_writers);
 
-static void unlock_mnt_writers(void)
+static unsigned int count_mnt_writers(struct vfsmount *mnt)
 {
+#ifdef CONFIG_SMP
+	unsigned int count = 0;
 	int cpu;
-	struct mnt_writer *cpu_writer;
 
 	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_unlock(&cpu_writer->lock);
+		count += *per_cpu_ptr(mnt->mnt_writers, cpu);
 	}
-}
 
-static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
-{
-	if (!cpu_writer->mnt)
-		return;
-	/*
-	 * This is in case anyone ever leaves an invalid,
-	 * old ->mnt and a count of 0.
-	 */
-	if (!cpu_writer->count)
-		return;
-	atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
-	cpu_writer->count = 0;
-}
- /*
- * must hold cpu_writer->lock
- */
-static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
-					  struct vfsmount *mnt)
-{
-	if (cpu_writer->mnt == mnt)
-		return;
-	__clear_mnt_count(cpu_writer);
-	cpu_writer->mnt = mnt;
+	return count;
+#else
+	return mnt->mnt_writers;
+#endif
 }
 
 /*
@@ -253,75 +236,34 @@ static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
 int mnt_want_write(struct vfsmount *mnt)
 {
 	int ret = 0;
-	struct mnt_writer *cpu_writer;
 
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	/*
+	 * The store to inc_mnt_writers must be visible before we pass
+	 * MNT_WRITE_HOLD loop below, so that the slowpath can see our
+	 * incremented count after it has set MNT_WRITE_HOLD.
+	 */
+	smp_mb();
+	while (mnt->mnt_flags & MNT_WRITE_HOLD)
+		cpu_relax();
+	/*
+	 * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will
+	 * be set to match its requirements. So we must not load that until
+	 * MNT_WRITE_HOLD is cleared.
+	 */
+	smp_rmb();
 	if (__mnt_is_readonly(mnt)) {
+		dec_mnt_writers(mnt);
 		ret = -EROFS;
 		goto out;
 	}
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	cpu_writer->count++;
 out:
-	spin_unlock(&cpu_writer->lock);
-	put_cpu_var(mnt_writers);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
-static void lock_mnt_writers(void)
-{
-	int cpu;
-	struct mnt_writer *cpu_writer;
-
-	for_each_possible_cpu(cpu) {
-		cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		__clear_mnt_count(cpu_writer);
-		cpu_writer->mnt = NULL;
-	}
-}
-
-/*
- * These per-cpu write counts are not guaranteed to have
- * matched increments and decrements on any given cpu.
- * A file open()ed for write on one cpu and close()d on
- * another cpu will imbalance this count.  Make sure it
- * does not get too far out of whack.
- */
-static void handle_write_count_underflow(struct vfsmount *mnt)
-{
-	if (atomic_read(&mnt->__mnt_writers) >=
-	    MNT_WRITER_UNDERFLOW_LIMIT)
-		return;
-	/*
-	 * It isn't necessary to hold all of the locks
-	 * at the same time, but doing it this way makes
-	 * us share a lot more code.
-	 */
-	lock_mnt_writers();
-	/*
-	 * vfsmount_lock is for mnt_flags.
-	 */
-	spin_lock(&vfsmount_lock);
-	/*
-	 * If coalescing the per-cpu writer counts did not
-	 * get us back to a positive writer count, we have
-	 * a bug.
-	 */
-	if ((atomic_read(&mnt->__mnt_writers) < 0) &&
-	    !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
-		WARN(1, KERN_DEBUG "leak detected on mount(%p) writers "
-				"count: %d\n",
-			mnt, atomic_read(&mnt->__mnt_writers));
-		/* use the flag to keep the dmesg spam down */
-		mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
-	}
-	spin_unlock(&vfsmount_lock);
-	unlock_mnt_writers();
-}
-
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
@@ -332,37 +274,9 @@ static void handle_write_count_underflow(struct vfsmount *mnt)
  */
 void mnt_drop_write(struct vfsmount *mnt)
 {
-	int must_check_underflow = 0;
-	struct mnt_writer *cpu_writer;
-
-	cpu_writer = &get_cpu_var(mnt_writers);
-	spin_lock(&cpu_writer->lock);
-
-	use_cpu_writer_for_mount(cpu_writer, mnt);
-	if (cpu_writer->count > 0) {
-		cpu_writer->count--;
-	} else {
-		must_check_underflow = 1;
-		atomic_dec(&mnt->__mnt_writers);
-	}
-
-	spin_unlock(&cpu_writer->lock);
-	/*
-	 * Logically, we could call this each time,
-	 * but the __mnt_writers cacheline tends to
-	 * be cold, and makes this expensive.
-	 */
-	if (must_check_underflow)
-		handle_write_count_underflow(mnt);
-	/*
-	 * This could be done right after the spinlock
-	 * is taken because the spinlock keeps us on
-	 * the cpu, and disables preemption.  However,
-	 * putting it here bounds the amount that
-	 * __mnt_writers can underflow.  Without it,
-	 * we could theoretically wrap __mnt_writers.
-	 */
-	put_cpu_var(mnt_writers);
+	preempt_disable();
+	dec_mnt_writers(mnt);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(mnt_drop_write);
 
@@ -370,24 +284,41 @@ static int mnt_make_readonly(struct vfsmount *mnt)
 {
 	int ret = 0;
 
-	lock_mnt_writers();
+	spin_lock(&vfsmount_lock);
+	mnt->mnt_flags |= MNT_WRITE_HOLD;
 	/*
-	 * With all the locks held, this value is stable
+	 * After storing MNT_WRITE_HOLD, we'll read the counters. This store
+	 * should be visible before we do.
 	 */
-	if (atomic_read(&mnt->__mnt_writers) > 0) {
-		ret = -EBUSY;
-		goto out;
-	}
+	smp_mb();
+
 	/*
-	 * nobody can do a successful mnt_want_write() with all
-	 * of the counts in MNT_DENIED_WRITE and the locks held.
+	 * With writers on hold, if this value is zero, then there are
+	 * definitely no active writers (although held writers may subsequently
+	 * increment the count, they'll have to wait, and decrement it after
+	 * seeing MNT_READONLY).
+	 *
+	 * It is OK to have counter incremented on one CPU and decremented on
+	 * another: the sum will add up correctly. The danger would be when we
+	 * sum up each counter, if we read a counter before it is incremented,
+	 * but then read another CPU's count which it has been subsequently
+	 * decremented from -- we would see more decrements than we should.
+	 * MNT_WRITE_HOLD protects against this scenario, because
+	 * mnt_want_write first increments count, then smp_mb, then spins on
+	 * MNT_WRITE_HOLD, so it can't be decremented by another CPU while
+	 * we're counting up here.
 	 */
-	spin_lock(&vfsmount_lock);
-	if (!ret)
+	if (count_mnt_writers(mnt) > 0)
+		ret = -EBUSY;
+	else
 		mnt->mnt_flags |= MNT_READONLY;
+	/*
+	 * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers
+	 * that become unheld will see MNT_READONLY.
+	 */
+	smp_wmb();
+	mnt->mnt_flags &= ~MNT_WRITE_HOLD;
 	spin_unlock(&vfsmount_lock);
-out:
-	unlock_mnt_writers();
 	return ret;
 }
 
@@ -410,6 +341,9 @@ void free_vfsmnt(struct vfsmount *mnt)
 {
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
+#ifdef CONFIG_SMP
+	free_percpu(mnt->mnt_writers);
+#endif
 	kmem_cache_free(mnt_cache, mnt);
 }
 
@@ -604,38 +538,18 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
 
 static inline void __mntput(struct vfsmount *mnt)
 {
-	int cpu;
 	struct super_block *sb = mnt->mnt_sb;
-	/*
-	 * We don't have to hold all of the locks at the
-	 * same time here because we know that we're the
-	 * last reference to mnt and that no new writers
-	 * can come in.
-	 */
-	for_each_possible_cpu(cpu) {
-		struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
-		spin_lock(&cpu_writer->lock);
-		if (cpu_writer->mnt != mnt) {
-			spin_unlock(&cpu_writer->lock);
-			continue;
-		}
-		atomic_add(cpu_writer->count, &mnt->__mnt_writers);
-		cpu_writer->count = 0;
-		/*
-		 * Might as well do this so that no one
-		 * ever sees the pointer and expects
-		 * it to be valid.
-		 */
-		cpu_writer->mnt = NULL;
-		spin_unlock(&cpu_writer->lock);
-	}
 	/*
 	 * This probably indicates that somebody messed
 	 * up a mnt_want/drop_write() pair.  If this
 	 * happens, the filesystem was probably unable
 	 * to make r/w->r/o transitions.
 	 */
-	WARN_ON(atomic_read(&mnt->__mnt_writers));
+	/*
+	 * atomic_dec_and_lock() used to deal with ->mnt_count decrements
+	 * provides barriers, so count_mnt_writers() below is safe.  AV
+	 */
+	WARN_ON(count_mnt_writers(mnt));
 	dput(mnt->mnt_root);
 	free_vfsmnt(mnt);
 	deactivate_super(sb);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index 51f55f903aff5..ac49c1f8e5c09 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -30,7 +30,7 @@ struct mnt_namespace;
 #define MNT_STRICTATIME 0x80
 
 #define MNT_SHRINKABLE	0x100
-#define MNT_IMBALANCED_WRITE_COUNT	0x200 /* just for debugging */
+#define MNT_WRITE_HOLD	0x200
 
 #define MNT_SHARED	0x1000	/* if the vfsmount is a shared mount */
 #define MNT_UNBINDABLE	0x2000	/* if the vfsmount is a unbindable mount */
@@ -65,13 +65,22 @@ struct vfsmount {
 	int mnt_expiry_mark;		/* true if marked for expiry */
 	int mnt_pinned;
 	int mnt_ghosts;
-	/*
-	 * This value is not stable unless all of the mnt_writers[] spinlocks
-	 * are held, and all mnt_writer[]s on this mount have 0 as their ->count
-	 */
-	atomic_t __mnt_writers;
+#ifdef CONFIG_SMP
+	int *mnt_writers;
+#else
+	int mnt_writers;
+#endif
 };
 
+static inline int *get_mnt_writers_ptr(struct vfsmount *mnt)
+{
+#ifdef CONFIG_SMP
+	return mnt->mnt_writers;
+#else
+	return &mnt->mnt_writers;
+#endif
+}
+
 static inline struct vfsmount *mntget(struct vfsmount *mnt)
 {
 	if (mnt)
-- 
GitLab


From 96029c4e09ccbd73a6d0ed2b29e80bf2586ad7ef Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Sun, 26 Apr 2009 20:25:55 +1000
Subject: [PATCH 2125/2198] fs: introduce mnt_clone_write

This patch speeds up lmbench lat_mmap test by about another 2% after the
first patch.

Before:
 avg = 462.286
 std = 5.46106

After:
 avg = 453.12
 std = 9.58257

(50 runs of each, stddev gives a reasonable confidence)

It does this by introducing mnt_clone_write, which avoids some heavyweight
operations of mnt_want_write if called on a vfsmount which we know already
has a write count; and mnt_want_write_file, which can call mnt_clone_write
if the file is open for write.

After these two patches, mnt_want_write and mnt_drop_write go from 7% on
the profile down to 1.3% (including mnt_clone_write).

[AV: mnt_want_write_file() should take file alone and derive mnt from it;
not only all callers have that form, but that's the only mnt about which
we know that it's already held for write if file is opened for write]

Cc: Dave Hansen <haveblue@us.ibm.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c       |  2 +-
 fs/inode.c            |  2 +-
 fs/namespace.c        | 40 ++++++++++++++++++++++++++++++++++++++++
 fs/open.c             |  4 ++--
 fs/xattr.c            |  4 ++--
 include/linux/mount.h |  4 ++++
 6 files changed, 50 insertions(+), 6 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 54018fe488403..3d66dbcebef6e 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -214,7 +214,7 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
 	 */
 	if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
 		file_take_write(file);
-		error = mnt_want_write(mnt);
+		error = mnt_clone_write(mnt);
 		WARN_ON(error);
 	}
 	return error;
diff --git a/fs/inode.c b/fs/inode.c
index ca337014ae294..a88baebf77cf1 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1422,7 +1422,7 @@ void file_update_time(struct file *file)
 	if (IS_NOCMTIME(inode))
 		return;
 
-	err = mnt_want_write(file->f_path.mnt);
+	err = mnt_want_write_file(file);
 	if (err)
 		return;
 
diff --git a/fs/namespace.c b/fs/namespace.c
index 22ae06ad751d9..120b8a6b99ed6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -264,6 +264,46 @@ int mnt_want_write(struct vfsmount *mnt)
 }
 EXPORT_SYMBOL_GPL(mnt_want_write);
 
+/**
+ * mnt_clone_write - get write access to a mount
+ * @mnt: the mount on which to take a write
+ *
+ * This is effectively like mnt_want_write, except
+ * it must only be used to take an extra write reference
+ * on a mountpoint that we already know has a write reference
+ * on it. This allows some optimisation.
+ *
+ * After finished, mnt_drop_write must be called as usual to
+ * drop the reference.
+ */
+int mnt_clone_write(struct vfsmount *mnt)
+{
+	/* superblock may be r/o */
+	if (__mnt_is_readonly(mnt))
+		return -EROFS;
+	preempt_disable();
+	inc_mnt_writers(mnt);
+	preempt_enable();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mnt_clone_write);
+
+/**
+ * mnt_want_write_file - get write access to a file's mount
+ * @file: the file who's mount on which to take a write
+ *
+ * This is like mnt_want_write, but it takes a file and can
+ * do some optimisations if the file is open for write already
+ */
+int mnt_want_write_file(struct file *file)
+{
+	if (!(file->f_mode & FMODE_WRITE))
+		return mnt_want_write(file->f_path.mnt);
+	else
+		return mnt_clone_write(file->f_path.mnt);
+}
+EXPORT_SYMBOL_GPL(mnt_want_write_file);
+
 /**
  * mnt_drop_write - give up write access to a mount
  * @mnt: the mount on which to give up write access
diff --git a/fs/open.c b/fs/open.c
index bdfbf03615a48..7200e23d9258c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -612,7 +612,7 @@ SYSCALL_DEFINE2(fchmod, unsigned int, fd, mode_t, mode)
 
 	audit_inode(NULL, dentry);
 
-	err = mnt_want_write(file->f_path.mnt);
+	err = mnt_want_write_file(file);
 	if (err)
 		goto out_putf;
 	mutex_lock(&inode->i_mutex);
@@ -761,7 +761,7 @@ SYSCALL_DEFINE3(fchown, unsigned int, fd, uid_t, user, gid_t, group)
 	if (!file)
 		goto out;
 
-	error = mnt_want_write(file->f_path.mnt);
+	error = mnt_want_write_file(file);
 	if (error)
 		goto out_fput;
 	dentry = file->f_path.dentry;
diff --git a/fs/xattr.c b/fs/xattr.c
index d51b8f9db921c..1c3d0af59ddf8 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -297,7 +297,7 @@ SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name,
 		return error;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = mnt_want_write(f->f_path.mnt);
+	error = mnt_want_write_file(f);
 	if (!error) {
 		error = setxattr(dentry, name, value, size, flags);
 		mnt_drop_write(f->f_path.mnt);
@@ -524,7 +524,7 @@ SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name)
 		return error;
 	dentry = f->f_path.dentry;
 	audit_inode(NULL, dentry);
-	error = mnt_want_write(f->f_path.mnt);
+	error = mnt_want_write_file(f);
 	if (!error) {
 		error = removexattr(dentry, name);
 		mnt_drop_write(f->f_path.mnt);
diff --git a/include/linux/mount.h b/include/linux/mount.h
index ac49c1f8e5c09..5d52753648678 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -88,7 +88,11 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt)
 	return mnt;
 }
 
+struct file; /* forward dec */
+
 extern int mnt_want_write(struct vfsmount *mnt);
+extern int mnt_want_write_file(struct file *file);
+extern int mnt_clone_write(struct vfsmount *mnt);
 extern void mnt_drop_write(struct vfsmount *mnt);
 extern void mntput_no_expire(struct vfsmount *mnt);
 extern void mnt_pin(struct vfsmount *mnt);
-- 
GitLab


From 864d7c4c068f23642efe91b33be3a84afe5f71e0 Mon Sep 17 00:00:00 2001
From: "npiggin@suse.de" <npiggin@suse.de>
Date: Sun, 26 Apr 2009 20:25:56 +1000
Subject: [PATCH 2126/2198] fs: move mark_files_ro into file_table.c

This function walks the s_files lock, and operates primarily on the
files in a superblock, so it better belongs here (eg. see also
fs_may_remount_ro).

[AV: ... and it shouldn't be static after that move]

Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/file_table.c | 38 ++++++++++++++++++++++++++++++++++++++
 fs/internal.h   |  5 +++++
 fs/super.c      | 39 ---------------------------------------
 3 files changed, 43 insertions(+), 39 deletions(-)

diff --git a/fs/file_table.c b/fs/file_table.c
index 3d66dbcebef6e..334ce39881f8f 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -399,6 +399,44 @@ int fs_may_remount_ro(struct super_block *sb)
 	return 0;
 }
 
+/**
+ *	mark_files_ro - mark all files read-only
+ *	@sb: superblock in question
+ *
+ *	All files are marked read-only.  We don't care about pending
+ *	delete files so this should be used in 'force' mode only.
+ */
+void mark_files_ro(struct super_block *sb)
+{
+	struct file *f;
+
+retry:
+	file_list_lock();
+	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
+		struct vfsmount *mnt;
+		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
+		       continue;
+		if (!file_count(f))
+			continue;
+		if (!(f->f_mode & FMODE_WRITE))
+			continue;
+		f->f_mode &= ~FMODE_WRITE;
+		if (file_check_writeable(f) != 0)
+			continue;
+		file_release_write(f);
+		mnt = mntget(f->f_path.mnt);
+		file_list_unlock();
+		/*
+		 * This can sleep, so we can't hold
+		 * the file_list_lock() spinlock.
+		 */
+		mnt_drop_write(mnt);
+		mntput(mnt);
+		goto retry;
+	}
+	file_list_unlock();
+}
+
 void __init files_init(unsigned long mempages)
 { 
 	int n; 
diff --git a/fs/internal.h b/fs/internal.h
index b4dac4fb6b61f..6d4ef208ef659 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -66,3 +66,8 @@ extern void __init mnt_init(void);
  * fs_struct.c
  */
 extern void chroot_fs_refs(struct path *, struct path *);
+
+/*
+ * file_table.c
+ */
+extern void mark_files_ro(struct super_block *);
diff --git a/fs/super.c b/fs/super.c
index 1943fdf655fac..c170551c23fe0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -615,45 +615,6 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 	return err;
 }
 
-/**
- *	mark_files_ro - mark all files read-only
- *	@sb: superblock in question
- *
- *	All files are marked read-only.  We don't care about pending
- *	delete files so this should be used in 'force' mode only.
- */
-
-static void mark_files_ro(struct super_block *sb)
-{
-	struct file *f;
-
-retry:
-	file_list_lock();
-	list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
-		struct vfsmount *mnt;
-		if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
-		       continue;
-		if (!file_count(f))
-			continue;
-		if (!(f->f_mode & FMODE_WRITE))
-			continue;
-		f->f_mode &= ~FMODE_WRITE;
-		if (file_check_writeable(f) != 0)
-			continue;
-		file_release_write(f);
-		mnt = mntget(f->f_path.mnt);
-		file_list_unlock();
-		/*
-		 * This can sleep, so we can't hold
-		 * the file_list_lock() spinlock.
-		 */
-		mnt_drop_write(mnt);
-		mntput(mnt);
-		goto retry;
-	}
-	file_list_unlock();
-}
-
 /**
  *	do_remount_sb - asks filesystem to change mount options.
  *	@sb:	superblock in question
-- 
GitLab


From 876a9f76abbcb775f8d21cbc99fa161f9e5937f1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Apr 2009 18:05:55 +0200
Subject: [PATCH 2127/2198] remove s_async_list

Remove the unused s_async_list in the superblock, a leftover of the
broken async inode deletion code that leaked into mainline.  Having this
in the middle of the sync/unmount path is not helpful for the following
cleanups.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 8 --------
 include/linux/fs.h | 5 -----
 2 files changed, 13 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index c170551c23fe0..3d9f117dd2a39 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -38,7 +38,6 @@
 #include <linux/kobject.h>
 #include <linux/mutex.h>
 #include <linux/file.h>
-#include <linux/async.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -72,7 +71,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
 		INIT_HLIST_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
 		INIT_LIST_HEAD(&s->s_dentry_lru);
-		INIT_LIST_HEAD(&s->s_async_list);
 		init_rwsem(&s->s_umount);
 		mutex_init(&s->s_lock);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -342,11 +340,6 @@ void generic_shutdown_super(struct super_block *sb)
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
 
-		/*
-		 * wait for asynchronous fs operations to finish before going further
-		 */
-		async_synchronize_full_domain(&sb->s_async_list);
-
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
 		lock_kernel();
@@ -517,7 +510,6 @@ void sync_filesystems(int wait)
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		async_synchronize_full_domain(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 03fb2102b8f3a..36bcff7036efd 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1372,11 +1372,6 @@ struct super_block {
 	 * generic_show_options()
 	 */
 	char *s_options;
-
-	/*
-	 * storage for asynchronous operations
-	 */
-	struct list_head s_async_list;
 };
 
 extern struct timespec current_fs_time(struct super_block *sb);
-- 
GitLab


From 5a3e5cb8e08bd876e2542c1451c9a93dab1b0e39 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:48 +0200
Subject: [PATCH 2128/2198] vfs: Fix sys_sync() and fsync_super() reliability
 (version 4)

So far, do_sync() called:
  sync_inodes(0);
  sync_supers();
  sync_filesystems(0);
  sync_filesystems(1);
  sync_inodes(1);

This ordering makes it kind of hard for filesystems as sync_inodes(0) need not
submit all the IO (for example it skips inodes with I_SYNC set) so e.g. forcing
transaction to disk in ->sync_fs() is not really enough. Therefore sys_sync has
not been completely reliable on some filesystems (ext3, ext4, reiserfs, ocfs2
and others are hit by this) when racing e.g. with background writeback. A
similar problem hits also other filesystems (e.g. ext2) because of
write_supers() being called before the sync_inodes(1).

Change the ordering of calls in do_sync() - this requires a new function
sync_blockdevs() to preserve the property that block devices are always synced
after write_super() / sync_fs() call.

The same issue is fixed in __fsync_super() function used on umount /
remount read-only.

[AV: build fixes]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h |  9 +++++++++
 fs/super.c    | 29 ++++++++++++++++++++++++++++-
 fs/sync.c     |  4 +++-
 3 files changed, 40 insertions(+), 2 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index 6d4ef208ef659..343a537ab8097 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -71,3 +71,12 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
+
+/*
+ * super.c
+ */
+#ifdef CONFIG_BLOCK
+extern void sync_blockdevs(void);
+#else
+static inline void sync_blockdevs(void) { }
+#endif
diff --git a/fs/super.c b/fs/super.c
index 3d9f117dd2a39..18d159dc1e406 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -293,6 +293,7 @@ void __fsync_super(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
+	sync_inodes_sb(sb, 1);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
@@ -300,7 +301,6 @@ void __fsync_super(struct super_block *sb)
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, 1);
 	sync_blockdev(sb->s_bdev);
-	sync_inodes_sb(sb, 1);
 }
 
 /*
@@ -522,6 +522,33 @@ void sync_filesystems(int wait)
 	mutex_unlock(&mutex);
 }
 
+#ifdef CONFIG_BLOCK
+/*
+ *  Sync all block devices underlying some superblock
+ */
+void sync_blockdevs(void)
+{
+	struct super_block *sb;
+
+	spin_lock(&sb_lock);
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_bdev)
+			continue;
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			sync_blockdev(sb->s_bdev);
+		up_read(&sb->s_umount);
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+}
+#endif
+
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index 7abc65fbf21df..631fd5aece78d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -13,6 +13,7 @@
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include "internal.h"
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -26,10 +27,11 @@ static void do_sync(unsigned long wait)
 	wakeup_pdflush(0);
 	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
 	vfs_dq_sync(NULL);
+	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
 	sync_supers();		/* Write the superblocks */
 	sync_filesystems(0);	/* Start syncing the filesystems */
 	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
+	sync_blockdevs();
 	if (!wait)
 		printk("Emergency Sync complete\n");
 	if (unlikely(laptop_mode))
-- 
GitLab


From bfe881255c74800147523b59c85328a1a826ba21 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:49 +0200
Subject: [PATCH 2129/2198] vfs: Call ->sync_fs() even if s_dirt is 0 (version
 4)

sync_filesystems() has a condition that if wait == 0 and s_dirt == 0, then
->sync_fs() isn't called. This does not really make much sence since s_dirt is
generally used by a filesystem to mean that ->write_super() needs to be called.
But ->sync_fs() does different things. I even suspect that some filesystems
(btrfs?) sets s_dirt just to fool this logic.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/super.c b/fs/super.c
index 18d159dc1e406..fae91ba38e48d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -510,7 +510,7 @@ void sync_filesystems(int wait)
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		if (sb->s_root && (wait || sb->s_dirt))
+		if (sb->s_root)
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
-- 
GitLab


From 429479f031322a0cc5c921ffb2321a51718dc875 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:50 +0200
Subject: [PATCH 2130/2198] vfs: Make __fsync_super() a static function
 (version 4)

__fsync_super() does the same thing as fsync_super(). So change the only
caller to use fsync_super() and make __fsync_super() static. This removes
unnecessarily duplicated call to sync_blockdev() and prepares ground
for the changes to __fsync_super() in the following patches.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c     | 2 +-
 fs/super.c         | 7 +++----
 include/linux/fs.h | 1 -
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 931f6b8c4b2f4..fe47f72276186 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -241,7 +241,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 		sb->s_frozen = SB_FREEZE_WRITE;
 		smp_wmb();
 
-		__fsync_super(sb);
+		fsync_super(sb);
 
 		sb->s_frozen = SB_FREEZE_TRANS;
 		smp_wmb();
diff --git a/fs/super.c b/fs/super.c
index fae91ba38e48d..8dbe1ead9ddd5 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -289,7 +289,7 @@ EXPORT_SYMBOL(unlock_super);
  * device.  Takes the superblock lock.  Requires a second blkdev
  * flush by the caller to complete the operation.
  */
-void __fsync_super(struct super_block *sb)
+static int __fsync_super(struct super_block *sb)
 {
 	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
@@ -300,7 +300,7 @@ void __fsync_super(struct super_block *sb)
 	unlock_super(sb);
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, 1);
-	sync_blockdev(sb->s_bdev);
+	return sync_blockdev(sb->s_bdev);
 }
 
 /*
@@ -310,8 +310,7 @@ void __fsync_super(struct super_block *sb)
  */
 int fsync_super(struct super_block *sb)
 {
-	__fsync_super(sb);
-	return sync_blockdev(sb->s_bdev);
+	return __fsync_super(sb);
 }
 EXPORT_SYMBOL_GPL(fsync_super);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 36bcff7036efd..41a9907f342e5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2078,7 +2078,6 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void sync_supers(void);
 extern void sync_filesystems(int wait);
-extern void __fsync_super(struct super_block *sb);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
-- 
GitLab


From 5cee5815d1564bbbd505fea86f4550f1efdb5cd0 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:51 +0200
Subject: [PATCH 2131/2198] vfs: Make sys_sync() use fsync_super() (version 4)

It is unnecessarily fragile to have two places (fsync_super() and do_sync())
doing data integrity sync of the filesystem. Alter __fsync_super() to
accommodate needs of both callers and use it. So after this patch
__fsync_super() is the only place where we gather all the calls needed to
properly send all data on a filesystem to disk.

Nice bonus is that we get a complete livelock avoidance and write_supers()
is now only used for periodic writeback of superblocks.

sync_blockdevs() introduced a couple of patches ago is gone now.

[build fixes folded]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c            | 15 +++++---
 fs/fs-writeback.c         | 49 --------------------------
 fs/internal.h             | 16 ++++-----
 fs/super.c                | 72 +++++++++++++--------------------------
 fs/sync.c                 | 31 ++++++-----------
 include/linux/fs.h        |  2 +-
 include/linux/writeback.h |  1 -
 7 files changed, 51 insertions(+), 135 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe47f72276186..4b6a3b9d01eff 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -176,17 +176,22 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 				iov, offset, nr_segs, blkdev_get_blocks, NULL);
 }
 
+int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	if (!bdev)
+		return 0;
+	if (!wait)
+		return filemap_flush(bdev->bd_inode->i_mapping);
+	return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
+
 /*
  * Write out and wait upon all the dirty data associated with a block
  * device via its mapping.  Does not take the superblock lock.
  */
 int sync_blockdev(struct block_device *bdev)
 {
-	int ret = 0;
-
-	if (bdev)
-		ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
-	return ret;
+	return __sync_blockdev(bdev, 1);
 }
 EXPORT_SYMBOL(sync_blockdev);
 
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91013ff7dd531..e0fb2e7895985 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -678,55 +678,6 @@ void sync_inodes_sb(struct super_block *sb, int wait)
 	sync_sb_inodes(sb, &wbc);
 }
 
-/**
- * sync_inodes - writes all inodes to disk
- * @wait: wait for completion
- *
- * sync_inodes() goes through each super block's dirty inode list, writes the
- * inodes out, waits on the writeout and puts the inodes back on the normal
- * list.
- *
- * This is for sys_sync().  fsync_dev() uses the same algorithm.  The subtle
- * part of the sync functions is that the blockdev "superblock" is processed
- * last.  This is because the write_inode() function of a typical fs will
- * perform no I/O, but will mark buffers in the blockdev mapping as dirty.
- * What we want to do is to perform all that dirtying first, and then write
- * back all those inode blocks via the blockdev mapping in one sweep.  So the
- * additional (somewhat redundant) sync_blockdev() calls here are to make
- * sure that really happens.  Because if we call sync_inodes_sb(wait=1) with
- * outstanding dirty inodes, the writeback goes block-at-a-time within the
- * filesystem's write_inode().  This is extremely slow.
- */
-static void __sync_inodes(int wait)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root) {
-			sync_inodes_sb(sb, wait);
-			sync_blockdev(sb->s_bdev);
-		}
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-
-void sync_inodes(int wait)
-{
-	__sync_inodes(0);
-
-	if (wait)
-		__sync_inodes(1);
-}
-
 /**
  * write_inode_now	-	write an inode to disk
  * @inode: inode to write to disk
diff --git a/fs/internal.h b/fs/internal.h
index 343a537ab8097..dbec3cc28338f 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -25,6 +25,8 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 	return sb == blockdev_superblock;
 }
 
+extern int __sync_blockdev(struct block_device *bdev, int wait);
+
 #else
 static inline void bdev_cache_init(void)
 {
@@ -34,6 +36,11 @@ static inline int sb_is_blkdev_sb(struct super_block *sb)
 {
 	return 0;
 }
+
+static inline int __sync_blockdev(struct block_device *bdev, int wait)
+{
+	return 0;
+}
 #endif
 
 /*
@@ -71,12 +78,3 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
-
-/*
- * super.c
- */
-#ifdef CONFIG_BLOCK
-extern void sync_blockdevs(void);
-#else
-static inline void sync_blockdevs(void) { }
-#endif
diff --git a/fs/super.c b/fs/super.c
index 8dbe1ead9ddd5..c8ce5ed042498 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -284,23 +284,23 @@ EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
 
 /*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.  Requires a second blkdev
- * flush by the caller to complete the operation.
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
  */
-static int __fsync_super(struct super_block *sb)
+static int __fsync_super(struct super_block *sb, int wait)
 {
-	sync_inodes_sb(sb, 0);
 	vfs_dq_sync(sb);
-	sync_inodes_sb(sb, 1);
+	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
 	unlock_super(sb);
 	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, 1);
-	return sync_blockdev(sb->s_bdev);
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
 }
 
 /*
@@ -310,7 +310,12 @@ static int __fsync_super(struct super_block *sb)
  */
 int fsync_super(struct super_block *sb)
 {
-	return __fsync_super(sb);
+	int ret;
+
+	ret = __fsync_super(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __fsync_super(sb, 1);
 }
 EXPORT_SYMBOL_GPL(fsync_super);
 
@@ -469,20 +474,18 @@ void sync_supers(void)
 }
 
 /*
- * Call the ->sync_fs super_op against all filesystems which are r/w and
- * which implement it.
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
  *
  * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync_fs
+ * if two or more filesystems are being continuously dirtied.  s_need_sync
  * is used only here.  We set it against all filesystems and then clear it as
  * we sync them.  So redirtied filesystems are skipped.
  *
  * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync_fs
+ * calls sync_filesystems as well, process B will set all the s_need_sync
  * flags again, which will cause process A to resync everything.  Fix that with
  * a local mutex.
- *
- * (Fabian) Avoid sync_fs with clean fs & wait mode 0
  */
 void sync_filesystems(int wait)
 {
@@ -492,25 +495,23 @@ void sync_filesystems(int wait)
 	mutex_lock(&mutex);		/* Could be down_interruptible */
 	spin_lock(&sb_lock);
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_op->sync_fs)
-			continue;
 		if (sb->s_flags & MS_RDONLY)
 			continue;
-		sb->s_need_sync_fs = 1;
+		sb->s_need_sync = 1;
 	}
 
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync_fs)
+		if (!sb->s_need_sync)
 			continue;
-		sb->s_need_sync_fs = 0;
+		sb->s_need_sync = 0;
 		if (sb->s_flags & MS_RDONLY)
 			continue;	/* hm.  Was remounted r/o meanwhile */
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
 		if (sb->s_root)
-			sb->s_op->sync_fs(sb, wait);
+			__fsync_super(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
@@ -521,33 +522,6 @@ void sync_filesystems(int wait)
 	mutex_unlock(&mutex);
 }
 
-#ifdef CONFIG_BLOCK
-/*
- *  Sync all block devices underlying some superblock
- */
-void sync_blockdevs(void)
-{
-	struct super_block *sb;
-
-	spin_lock(&sb_lock);
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_bdev)
-			continue;
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			sync_blockdev(sb->s_bdev);
-		up_read(&sb->s_umount);
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-}
-#endif
-
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index 631fd5aece78d..be0798cc33d78 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -18,35 +18,24 @@
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
-/*
- * sync everything.  Start out by waking pdflush, because that writes back
- * all queues in parallel.
- */
-static void do_sync(unsigned long wait)
+SYSCALL_DEFINE0(sync)
 {
-	wakeup_pdflush(0);
-	sync_inodes(0);		/* All mappings, inodes and their blockdevs */
-	vfs_dq_sync(NULL);
-	sync_inodes(wait);	/* Mappings, inodes and blockdevs, again. */
-	sync_supers();		/* Write the superblocks */
-	sync_filesystems(0);	/* Start syncing the filesystems */
-	sync_filesystems(wait);	/* Waitingly sync the filesystems */
-	sync_blockdevs();
-	if (!wait)
-		printk("Emergency Sync complete\n");
+	sync_filesystems(0);
+	sync_filesystems(1);
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
-}
-
-SYSCALL_DEFINE0(sync)
-{
-	do_sync(1);
 	return 0;
 }
 
 static void do_sync_work(struct work_struct *work)
 {
-	do_sync(0);
+	/*
+	 * Sync twice to reduce the possibility we skipped some inodes / pages
+	 * because they were temporarily locked
+	 */
+	sync_filesystems(0);
+	sync_filesystems(0);
+	printk("Emergency Sync complete\n");
 	kfree(work);
 }
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 41a9907f342e5..f00df653cf2b7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1321,7 +1321,7 @@ struct super_block {
 	struct rw_semaphore	s_umount;
 	struct mutex		s_lock;
 	int			s_count;
-	int			s_need_sync_fs;
+	int			s_need_sync;
 	atomic_t		s_active;
 #ifdef CONFIG_SECURITY
 	void                    *s_security;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 93445477f86a7..3224820c85146 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -79,7 +79,6 @@ struct writeback_control {
 void writeback_inodes(struct writeback_control *wbc);
 int inode_wait(void *);
 void sync_inodes_sb(struct super_block *, int wait);
-void sync_inodes(int wait);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
 static inline void wait_on_inode(struct inode *inode)
-- 
GitLab


From c15c54f5f056ee4819da9fde59a5f2cd45445f23 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:52 +0200
Subject: [PATCH 2132/2198] vfs: Move syncing code from super.c to sync.c
 (version 4)

Move sync_filesystems(), __fsync_super(), fsync_super() from
super.c to sync.c where it fits better.

[build fixes folded]

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c         | 85 ----------------------------------------------
 fs/sync.c          | 85 ++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/fs.h |  3 +-
 3 files changed, 86 insertions(+), 87 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index c8ce5ed042498..f822c74f25bee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -283,42 +283,6 @@ void unlock_super(struct super_block * sb)
 EXPORT_SYMBOL(lock_super);
 EXPORT_SYMBOL(unlock_super);
 
-/*
- * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
- * just dirties buffers with inodes so we have to submit IO for these buffers
- * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
- * case write_inode() functions do sync_dirty_buffer() and thus effectively
- * write one block at a time.
- */
-static int __fsync_super(struct super_block *sb, int wait)
-{
-	vfs_dq_sync(sb);
-	sync_inodes_sb(sb, wait);
-	lock_super(sb);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
-	unlock_super(sb);
-	if (sb->s_op->sync_fs)
-		sb->s_op->sync_fs(sb, wait);
-	return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
- * Write out and wait upon all dirty data associated with this
- * superblock.  Filesystem data as well as the underlying block
- * device.  Takes the superblock lock.
- */
-int fsync_super(struct super_block *sb)
-{
-	int ret;
-
-	ret = __fsync_super(sb, 0);
-	if (ret < 0)
-		return ret;
-	return __fsync_super(sb, 1);
-}
-EXPORT_SYMBOL_GPL(fsync_super);
-
 /**
  *	generic_shutdown_super	-	common helper for ->kill_sb()
  *	@sb: superblock to kill
@@ -473,55 +437,6 @@ void sync_supers(void)
 	spin_unlock(&sb_lock);
 }
 
-/*
- * Sync all the data for all the filesystems (called by sys_sync() and
- * emergency sync)
- *
- * This operation is careful to avoid the livelock which could easily happen
- * if two or more filesystems are being continuously dirtied.  s_need_sync
- * is used only here.  We set it against all filesystems and then clear it as
- * we sync them.  So redirtied filesystems are skipped.
- *
- * But if process A is currently running sync_filesystems and then process B
- * calls sync_filesystems as well, process B will set all the s_need_sync
- * flags again, which will cause process A to resync everything.  Fix that with
- * a local mutex.
- */
-void sync_filesystems(int wait)
-{
-	struct super_block *sb;
-	static DEFINE_MUTEX(mutex);
-
-	mutex_lock(&mutex);		/* Could be down_interruptible */
-	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_flags & MS_RDONLY)
-			continue;
-		sb->s_need_sync = 1;
-	}
-
-restart:
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (!sb->s_need_sync)
-			continue;
-		sb->s_need_sync = 0;
-		if (sb->s_flags & MS_RDONLY)
-			continue;	/* hm.  Was remounted r/o meanwhile */
-		sb->s_count++;
-		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
-		if (sb->s_root)
-			__fsync_super(sb, wait);
-		up_read(&sb->s_umount);
-		/* restart only when sb is no longer on the list */
-		spin_lock(&sb_lock);
-		if (__put_super_and_need_restart(sb))
-			goto restart;
-	}
-	spin_unlock(&sb_lock);
-	mutex_unlock(&mutex);
-}
-
 /**
  *	get_super - get the superblock of a device
  *	@bdev: device to get the superblock for
diff --git a/fs/sync.c b/fs/sync.c
index be0798cc33d78..d5fa7b79982ed 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -18,6 +18,91 @@
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
 
+/*
+ * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0)
+ * just dirties buffers with inodes so we have to submit IO for these buffers
+ * via __sync_blockdev(). This also speeds up the wait == 1 case since in that
+ * case write_inode() functions do sync_dirty_buffer() and thus effectively
+ * write one block at a time.
+ */
+static int __fsync_super(struct super_block *sb, int wait)
+{
+	vfs_dq_sync(sb);
+	sync_inodes_sb(sb, wait);
+	lock_super(sb);
+	if (sb->s_dirt && sb->s_op->write_super)
+		sb->s_op->write_super(sb);
+	unlock_super(sb);
+	if (sb->s_op->sync_fs)
+		sb->s_op->sync_fs(sb, wait);
+	return __sync_blockdev(sb->s_bdev, wait);
+}
+
+/*
+ * Write out and wait upon all dirty data associated with this
+ * superblock.  Filesystem data as well as the underlying block
+ * device.  Takes the superblock lock.
+ */
+int fsync_super(struct super_block *sb)
+{
+	int ret;
+
+	ret = __fsync_super(sb, 0);
+	if (ret < 0)
+		return ret;
+	return __fsync_super(sb, 1);
+}
+EXPORT_SYMBOL_GPL(fsync_super);
+
+/*
+ * Sync all the data for all the filesystems (called by sys_sync() and
+ * emergency sync)
+ *
+ * This operation is careful to avoid the livelock which could easily happen
+ * if two or more filesystems are being continuously dirtied.  s_need_sync
+ * is used only here.  We set it against all filesystems and then clear it as
+ * we sync them.  So redirtied filesystems are skipped.
+ *
+ * But if process A is currently running sync_filesystems and then process B
+ * calls sync_filesystems as well, process B will set all the s_need_sync
+ * flags again, which will cause process A to resync everything.  Fix that with
+ * a local mutex.
+ */
+static void sync_filesystems(int wait)
+{
+	struct super_block *sb;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);		/* Could be down_interruptible */
+	spin_lock(&sb_lock);
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (sb->s_flags & MS_RDONLY)
+			continue;
+		sb->s_need_sync = 1;
+	}
+
+restart:
+	list_for_each_entry(sb, &super_blocks, s_list) {
+		if (!sb->s_need_sync)
+			continue;
+		sb->s_need_sync = 0;
+		if (sb->s_flags & MS_RDONLY)
+			continue;	/* hm.  Was remounted r/o meanwhile */
+		sb->s_count++;
+		spin_unlock(&sb_lock);
+		down_read(&sb->s_umount);
+		if (sb->s_root)
+			__fsync_super(sb, wait);
+		up_read(&sb->s_umount);
+		/* restart only when sb is no longer on the list */
+		spin_lock(&sb_lock);
+		if (__put_super_and_need_restart(sb))
+			goto restart;
+	}
+	spin_unlock(&sb_lock);
+	mutex_unlock(&mutex);
+}
+
 SYSCALL_DEFINE0(sync)
 {
 	sync_filesystems(0);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f00df653cf2b7..d3f7159993cf7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1942,7 +1942,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int fsync_super(struct super_block *);
 extern int fsync_no_super(struct block_device *);
 #else
 static inline void bd_forget(struct inode *inode) {}
@@ -1959,6 +1958,7 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 	return 0;
 }
 #endif
+extern int fsync_super(struct super_block *);
 extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
@@ -2077,7 +2077,6 @@ extern int filemap_fdatawrite_range(struct address_space *mapping,
 
 extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void sync_supers(void);
-extern void sync_filesystems(int wait);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
 extern int do_remount_sb(struct super_block *sb, int flags,
-- 
GitLab


From 60b0680fa236ac4e17ce31a50048c9d75f9ec831 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:53 +0200
Subject: [PATCH 2133/2198] vfs: Rename fsync_super() to sync_filesystem()
 (version 4)

Rename the function so that it better describe what it really does. Also
remove the unnecessary include of buffer_head.h.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/block_dev.c            |  4 ++--
 fs/cachefiles/interface.c |  2 +-
 fs/super.c                |  5 ++---
 fs/sync.c                 | 12 ++++++------
 include/linux/fs.h        |  2 +-
 5 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4b6a3b9d01eff..3a6d4fb2a329c 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -204,7 +204,7 @@ int fsync_bdev(struct block_device *bdev)
 {
 	struct super_block *sb = get_super(bdev);
 	if (sb) {
-		int res = fsync_super(sb);
+		int res = sync_filesystem(sb);
 		drop_super(sb);
 		return res;
 	}
@@ -246,7 +246,7 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 		sb->s_frozen = SB_FREEZE_WRITE;
 		smp_wmb();
 
-		fsync_super(sb);
+		sync_filesystem(sb);
 
 		sb->s_frozen = SB_FREEZE_TRANS;
 		smp_wmb();
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 1e962348d1114..dafd484d7bdac 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,7 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
 	/* make sure all pages pinned by operations on behalf of the netfs are
 	 * written to disc */
 	cachefiles_begin_secure(cache, &saved_cred);
-	ret = fsync_super(cache->mnt->mnt_sb);
+	ret = sync_filesystem(cache->mnt->mnt_sb);
 	cachefiles_end_secure(cache, saved_cred);
 
 	if (ret == -EIO)
diff --git a/fs/super.c b/fs/super.c
index f822c74f25bee..721236e6177ac 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -28,7 +28,6 @@
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
 #include <linux/namei.h>
-#include <linux/buffer_head.h>		/* for fsync_super() */
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/syscalls.h>
@@ -304,7 +303,7 @@ void generic_shutdown_super(struct super_block *sb)
 
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
-		fsync_super(sb);
+		sync_filesystem(sb);
 		lock_super(sb);
 		sb->s_flags &= ~MS_ACTIVE;
 
@@ -543,7 +542,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	if (flags & MS_RDONLY)
 		acct_auto_close(sb);
 	shrink_dcache_sb(sb);
-	fsync_super(sb);
+	sync_filesystem(sb);
 
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
diff --git a/fs/sync.c b/fs/sync.c
index d5fa7b79982ed..8aa870a4d406d 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -25,7 +25,7 @@
  * case write_inode() functions do sync_dirty_buffer() and thus effectively
  * write one block at a time.
  */
-static int __fsync_super(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb, int wait)
 {
 	vfs_dq_sync(sb);
 	sync_inodes_sb(sb, wait);
@@ -43,16 +43,16 @@ static int __fsync_super(struct super_block *sb, int wait)
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int fsync_super(struct super_block *sb)
+int sync_filesystem(struct super_block *sb)
 {
 	int ret;
 
-	ret = __fsync_super(sb, 0);
+	ret = __sync_filesystem(sb, 0);
 	if (ret < 0)
 		return ret;
-	return __fsync_super(sb, 1);
+	return __sync_filesystem(sb, 1);
 }
-EXPORT_SYMBOL_GPL(fsync_super);
+EXPORT_SYMBOL_GPL(sync_filesystem);
 
 /*
  * Sync all the data for all the filesystems (called by sys_sync() and
@@ -92,7 +92,7 @@ static void sync_filesystems(int wait)
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
 		if (sb->s_root)
-			__fsync_super(sb, wait);
+			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d3f7159993cf7..fb1822bed7c85 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1958,7 +1958,7 @@ static inline int thaw_bdev(struct block_device *bdev, struct super_block *sb)
 	return 0;
 }
 #endif
-extern int fsync_super(struct super_block *);
+extern int sync_filesystem(struct super_block *);
 extern const struct file_operations def_blk_fops;
 extern const struct file_operations def_chr_fops;
 extern const struct file_operations bad_sock_fops;
-- 
GitLab


From 850b201b087f5525a0a7278551c2bcd0423c3b26 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 27 Apr 2009 16:43:54 +0200
Subject: [PATCH 2134/2198] quota: cleanup dquota sync functions (version 4)

Currently the VFS calls vfs_dq_sync to sync out disk quotas for a given
superblock.  This is a small wrapper around sync_dquots which for the
case of a non-NULL superblock is a small wrapper around quota_sync_sb.

Just make quota_sync_sb global (rename it to sync_quota_sb) and call it
directly.  Also call it directly for those cases in quota.c that have a
superblock and leave sync_dquots purely an iterator over sync_quota_sb and
remove it's superblock argument.

To make this nicer move the check for the lack of a quota_sync method
from the callers into sync_quota_sb.

[folded build fix from Alexander Beregalov <a.beregalov@gmail.com>]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/quota/quota.c         | 25 ++++++++++++++-----------
 fs/sync.c                |  2 +-
 include/linux/quotaops.h | 11 +++--------
 3 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b7f5a468f076d..95c5b42384b29 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -159,10 +159,14 @@ static int check_quotactl_valid(struct super_block *sb, int type, int cmd,
 	return error;
 }
 
-static void quota_sync_sb(struct super_block *sb, int type)
+#ifdef CONFIG_QUOTA
+void sync_quota_sb(struct super_block *sb, int type)
 {
 	int cnt;
 
+	if (!sb->s_qcop->quota_sync)
+		return;
+
 	sb->s_qcop->quota_sync(sb, type);
 
 	if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
@@ -191,17 +195,13 @@ static void quota_sync_sb(struct super_block *sb, int type)
 	}
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 }
+#endif
 
-void sync_dquots(struct super_block *sb, int type)
+static void sync_dquots(int type)
 {
+	struct super_block *sb;
 	int cnt;
 
-	if (sb) {
-		if (sb->s_qcop->quota_sync)
-			quota_sync_sb(sb, type);
-		return;
-	}
-
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
@@ -222,8 +222,8 @@ void sync_dquots(struct super_block *sb, int type)
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		if (sb->s_root && sb->s_qcop->quota_sync)
-			quota_sync_sb(sb, type);
+		if (sb->s_root)
+			sync_quota_sb(sb, type);
 		up_read(&sb->s_umount);
 		spin_lock(&sb_lock);
 		if (__put_super_and_need_restart(sb))
@@ -301,7 +301,10 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 			return sb->s_qcop->set_dqblk(sb, type, id, &idq);
 		}
 		case Q_SYNC:
-			sync_dquots(sb, type);
+			if (sb)
+				sync_quota_sb(sb, type);
+			else
+				sync_dquots(type);
 			return 0;
 
 		case Q_XQUOTAON:
diff --git a/fs/sync.c b/fs/sync.c
index 8aa870a4d406d..d90ab77645550 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,7 +27,7 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
-	vfs_dq_sync(sb);
+	sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 36353d95c8dbf..047310fa22fb7 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -20,7 +20,7 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
 /*
  * declaration of quota_function calls in kernel.
  */
-void sync_dquots(struct super_block *sb, int type);
+void sync_quota_sb(struct super_block *sb, int type);
 
 int dquot_initialize(struct inode *inode, int type);
 int dquot_drop(struct inode *inode);
@@ -253,12 +253,7 @@ static inline void vfs_dq_free_inode(struct inode *inode)
 		inode->i_sb->dq_op->free_inode(inode, 1);
 }
 
-/* The following two functions cannot be called inside a transaction */
-static inline void vfs_dq_sync(struct super_block *sb)
-{
-	sync_dquots(sb, -1);
-}
-
+/* Cannot be called inside a transaction */
 static inline int vfs_dq_off(struct super_block *sb, int remount)
 {
 	int ret = -ENOSYS;
@@ -334,7 +329,7 @@ static inline void vfs_dq_free_inode(struct inode *inode)
 {
 }
 
-static inline void vfs_dq_sync(struct super_block *sb)
+static inline void sync_quota_sb(struct super_block *sb, int type)
 {
 }
 
-- 
GitLab


From c3f8a40c1cd5591b882497d1d00d43d0e5bb4698 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Mon, 27 Apr 2009 16:43:55 +0200
Subject: [PATCH 2135/2198] quota: Introduce writeout_quota_sb() (version 4)

Introduce this function which just writes all the quota structures but
avoids all the syncing and cache pruning work to expose quota structures
to userspace. Use this function from __sync_filesystem when wait == 0.

Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sync.c                | 6 +++++-
 include/linux/quotaops.h | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/fs/sync.c b/fs/sync.c
index d90ab77645550..4487b5560dc8e 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -27,7 +27,11 @@
  */
 static int __sync_filesystem(struct super_block *sb, int wait)
 {
-	sync_quota_sb(sb, -1);
+	/* Avoid doing twice syncing and cache pruning for quota sync */
+	if (!wait)
+		writeout_quota_sb(sb, -1);
+	else
+		sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
 	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index 047310fa22fb7..7bc457593684a 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -21,6 +21,11 @@ static inline struct quota_info *sb_dqopt(struct super_block *sb)
  * declaration of quota_function calls in kernel.
  */
 void sync_quota_sb(struct super_block *sb, int type);
+static inline void writeout_quota_sb(struct super_block *sb, int type)
+{
+	if (sb->s_qcop->quota_sync)
+		sb->s_qcop->quota_sync(sb, type);
+}
 
 int dquot_initialize(struct inode *inode, int type);
 int dquot_drop(struct inode *inode);
@@ -333,6 +338,10 @@ static inline void sync_quota_sb(struct super_block *sb, int type)
 {
 }
 
+static inline void writeout_quota_sb(struct super_block *sb, int type)
+{
+}
+
 static inline int vfs_dq_off(struct super_block *sb, int remount)
 {
 	return 0;
-- 
GitLab


From 59d697b70285c348c01cfc2695c3469ba71d7539 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:41 -0400
Subject: [PATCH 2136/2198] btrfs: remove ->write_super and stop maintaining
 ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 7 -------
 fs/btrfs/super.c | 8 --------
 2 files changed, 15 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5b68330f85852..8612b3a098111 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2322,7 +2322,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	btrfs_update_inode(trans, root, dir);
 	btrfs_drop_nlink(inode);
 	ret = btrfs_update_inode(trans, root, inode);
-	dir->i_sb->s_dirt = 1;
 out:
 	return ret;
 }
@@ -2806,7 +2805,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 				      pending_del_nr);
 	}
 	btrfs_free_path(path);
-	inode->i_sb->s_dirt = 1;
 	return ret;
 }
 
@@ -3768,7 +3766,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		init_special_inode(inode, inode->i_mode, rdev);
 		btrfs_update_inode(trans, root, inode);
 	}
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
@@ -3833,7 +3830,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 out_unlock:
@@ -3880,7 +3876,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
 	if (err)
 		drop_inode = 1;
 
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, dir);
 	err = btrfs_update_inode(trans, root, inode);
 
@@ -3962,7 +3957,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 
 	d_instantiate(dentry, inode);
 	drop_on_err = 0;
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 
@@ -4991,7 +4985,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &btrfs_file_inode_operations;
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
-	dir->i_sb->s_dirt = 1;
 	btrfs_update_inode_block_group(trans, inode);
 	btrfs_update_inode_block_group(trans, dir);
 	if (drop_inode)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 708ac06b953be..52d84522c2c2d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -397,7 +397,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	sb->s_dirt = 0;
 	if (!wait) {
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
@@ -408,7 +407,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 
 	trans = btrfs_start_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
-	sb->s_dirt = 0;
 	return ret;
 }
 
@@ -454,11 +452,6 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	return 0;
 }
 
-static void btrfs_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
 static int btrfs_test_super(struct super_block *s, void *data)
 {
 	struct btrfs_fs_devices *test_fs_devices = data;
@@ -689,7 +682,6 @@ static int btrfs_unfreeze(struct super_block *sb)
 static struct super_operations btrfs_super_ops = {
 	.delete_inode	= btrfs_delete_inode,
 	.put_super	= btrfs_put_super,
-	.write_super	= btrfs_write_super,
 	.sync_fs	= btrfs_sync_fs,
 	.show_options	= btrfs_show_options,
 	.write_inode	= btrfs_write_inode,
-- 
GitLab


From ca41f7b918294c2a17780e057568413dcbfc6d49 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:42 -0400
Subject: [PATCH 2137/2198] ext3: remove ->write_super and stop maintaining
 ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/balloc.c |  3 +--
 fs/ext3/ialloc.c |  3 +--
 fs/ext3/inode.c  |  1 -
 fs/ext3/resize.c |  2 --
 fs/ext3/super.c  | 22 ----------------------
 fs/ext3/xattr.c  |  1 -
 6 files changed, 2 insertions(+), 30 deletions(-)

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 225202db89746..27967f92e8201 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -649,7 +649,7 @@ void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
 		count = overflow;
 		goto do_more;
 	}
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, err);
@@ -1708,7 +1708,6 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
 	if (!fatal)
 		fatal = err;
 
-	sb->s_dirt = 1;
 	if (fatal)
 		goto out;
 
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index dd13d60d524bb..b399912851367 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -181,7 +181,7 @@ void ext3_free_inode (handle_t *handle, struct inode * inode)
 	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
 	if (!fatal)
 		fatal = err;
-	sb->s_dirt = 1;
+
 error_return:
 	brelse(bitmap_bh);
 	ext3_std_error(sb, fatal);
@@ -537,7 +537,6 @@ struct inode *ext3_new_inode(handle_t *handle, struct inode * dir, int mode)
 	percpu_counter_dec(&sbi->s_freeinodes_counter);
 	if (S_ISDIR(mode))
 		percpu_counter_inc(&sbi->s_dirs_counter);
-	sb->s_dirt = 1;
 
 	inode->i_uid = current_fsuid();
 	if (test_opt (sb, GRPID))
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index fcfa243618567..b0248c6d5d4c9 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2960,7 +2960,6 @@ static int ext3_do_update_inode(handle_t *handle,
 				ext3_update_dynamic_rev(sb);
 				EXT3_SET_RO_COMPAT_FEATURE(sb,
 					EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
-				sb->s_dirt = 1;
 				handle->h_sync = 1;
 				err = ext3_journal_dirty_metadata(handle,
 						EXT3_SB(sb)->s_sbh);
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 78fdf38363702..8a0b26340b54d 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -934,7 +934,6 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 			   EXT3_INODES_PER_GROUP(sb));
 
 	ext3_journal_dirty_metadata(handle, sbi->s_sbh);
-	sb->s_dirt = 1;
 
 exit_journal:
 	unlock_super(sb);
@@ -1066,7 +1065,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	}
 	es->s_blocks_count = cpu_to_le32(o_blocks_count + add);
 	ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
-	sb->s_dirt = 1;
 	unlock_super(sb);
 	ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count,
 		   o_blocks_count + add);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 3c70d52afb10b..1efd958687e97 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -67,7 +67,6 @@ static const char *ext3_decode_error(struct super_block * sb, int errno,
 static int ext3_remount (struct super_block * sb, int * flags, char * data);
 static int ext3_statfs (struct dentry * dentry, struct kstatfs * buf);
 static int ext3_unfreeze(struct super_block *sb);
-static void ext3_write_super (struct super_block * sb);
 static int ext3_freeze(struct super_block *sb);
 
 /*
@@ -761,7 +760,6 @@ static const struct super_operations ext3_sops = {
 	.dirty_inode	= ext3_dirty_inode,
 	.delete_inode	= ext3_delete_inode,
 	.put_super	= ext3_put_super,
-	.write_super	= ext3_write_super,
 	.sync_fs	= ext3_sync_fs,
 	.freeze_fs	= ext3_freeze,
 	.unfreeze_fs	= ext3_unfreeze,
@@ -1785,7 +1783,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 #else
 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
-		sb->s_dirt = 1;
 	}
 
 	if (sbi->s_blocks_per_group > blocksize * 8) {
@@ -2265,7 +2262,6 @@ static int ext3_load_journal(struct super_block *sb,
 	if (journal_devnum &&
 	    journal_devnum != le32_to_cpu(es->s_journal_dev)) {
 		es->s_journal_dev = cpu_to_le32(journal_devnum);
-		sb->s_dirt = 1;
 
 		/* Make sure we flush the recovery flag to disk. */
 		ext3_commit_super(sb, es, 1);
@@ -2308,7 +2304,6 @@ static int ext3_create_journal(struct super_block * sb,
 	EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL);
 
 	es->s_journal_inum = cpu_to_le32(journal_inum);
-	sb->s_dirt = 1;
 
 	/* Make sure we flush the recovery flag to disk. */
 	ext3_commit_super(sb, es, 1);
@@ -2354,7 +2349,6 @@ static void ext3_mark_recovery_complete(struct super_block * sb,
 	if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) &&
 	    sb->s_flags & MS_RDONLY) {
 		EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER);
-		sb->s_dirt = 0;
 		ext3_commit_super(sb, es, 1);
 	}
 	unlock_super(sb);
@@ -2413,29 +2407,14 @@ int ext3_force_commit(struct super_block *sb)
 		return 0;
 
 	journal = EXT3_SB(sb)->s_journal;
-	sb->s_dirt = 0;
 	ret = ext3_journal_force_commit(journal);
 	return ret;
 }
 
-/*
- * Ext3 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext3; eventual cleanup...)
- */
-static void ext3_write_super (struct super_block * sb)
-{
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
-}
-
 static int ext3_sync_fs(struct super_block *sb, int wait)
 {
 	tid_t target;
 
-	sb->s_dirt = 0;
 	if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) {
 		if (wait)
 			log_wait_commit(EXT3_SB(sb)->s_journal, target);
@@ -2451,7 +2430,6 @@ static int ext3_freeze(struct super_block *sb)
 {
 	int error = 0;
 	journal_t *journal;
-	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		journal = EXT3_SB(sb)->s_journal;
diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c
index 83b7be849bd50..545e37c4b91eb 100644
--- a/fs/ext3/xattr.c
+++ b/fs/ext3/xattr.c
@@ -463,7 +463,6 @@ static void ext3_xattr_update_super_block(handle_t *handle,
 
 	if (ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh) == 0) {
 		EXT3_SET_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_EXT_ATTR);
-		sb->s_dirt = 1;
 		ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh);
 	}
 }
-- 
GitLab


From b7d245de25d1f0bb75a0d04194b647762b30d3db Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:43 -0400
Subject: [PATCH 2138/2198] gfs2: remove ->write_super and stop maintaining
 ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/gfs2/log.c   |  2 --
 fs/gfs2/super.c | 13 -------------
 2 files changed, 15 deletions(-)

diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index aa62cf5976e80..f2e449c595b44 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -764,7 +764,6 @@ void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	gfs2_log_unlock(sdp);
 
-	sdp->sd_vfs->s_dirt = 0;
 	up_write(&sdp->sd_log_flush_lock);
 
 	kfree(ai);
@@ -823,7 +822,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	log_refund(sdp, tr);
 	buf_lo_incore_commit(sdp, tr);
 
-	sdp->sd_vfs->s_dirt = 1;
 	up_read(&sdp->sd_log_flush_lock);
 
 	gfs2_log_lock(sdp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 40bcc37e5a700..0a68013364708 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -787,17 +787,6 @@ static void gfs2_put_super(struct super_block *sb)
 	gfs2_sys_fs_del(sdp);
 }
 
-/**
- * gfs2_write_super
- * @sb: the superblock
- *
- */
-
-static void gfs2_write_super(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-
 /**
  * gfs2_sync_fs - sync the filesystem
  * @sb: the superblock
@@ -807,7 +796,6 @@ static void gfs2_write_super(struct super_block *sb)
 
 static int gfs2_sync_fs(struct super_block *sb, int wait)
 {
-	sb->s_dirt = 0;
 	if (wait && sb->s_fs_info)
 		gfs2_log_flush(sb->s_fs_info, NULL);
 	return 0;
@@ -1324,7 +1312,6 @@ const struct super_operations gfs2_super_ops = {
 	.write_inode		= gfs2_write_inode,
 	.delete_inode		= gfs2_delete_inode,
 	.put_super		= gfs2_put_super,
-	.write_super		= gfs2_write_super,
 	.sync_fs		= gfs2_sync_fs,
 	.freeze_fs 		= gfs2_freeze,
 	.unfreeze_fs		= gfs2_unfreeze,
-- 
GitLab


From 94cb993f2ee99f3a9318e7b4dbb383497c4bedea Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:44 -0400
Subject: [PATCH 2139/2198] ocfs2: remove ->write_super and stop maintaining
 ->s_dirt

Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Joel Becker <joel.becker@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ocfs2/super.c | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 5c6163f550393..3eb076ce4c073 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -126,7 +126,6 @@ static int ocfs2_get_sector(struct super_block *sb,
 			    struct buffer_head **bh,
 			    int block,
 			    int sect_size);
-static void ocfs2_write_super(struct super_block *sb);
 static struct inode *ocfs2_alloc_inode(struct super_block *sb);
 static void ocfs2_destroy_inode(struct inode *inode);
 static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
@@ -141,7 +140,6 @@ static const struct super_operations ocfs2_sops = {
 	.clear_inode	= ocfs2_clear_inode,
 	.delete_inode	= ocfs2_delete_inode,
 	.sync_fs	= ocfs2_sync_fs,
-	.write_super	= ocfs2_write_super,
 	.put_super	= ocfs2_put_super,
 	.remount_fs	= ocfs2_remount,
 	.show_options   = ocfs2_show_options,
@@ -365,24 +363,12 @@ static struct file_operations ocfs2_osb_debug_fops = {
 	.llseek =	generic_file_llseek,
 };
 
-/*
- * write_super and sync_fs ripped right out of ext3.
- */
-static void ocfs2_write_super(struct super_block *sb)
-{
-	if (mutex_trylock(&sb->s_lock) != 0)
-		BUG();
-	sb->s_dirt = 0;
-}
-
 static int ocfs2_sync_fs(struct super_block *sb, int wait)
 {
 	int status;
 	tid_t target;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
-	sb->s_dirt = 0;
-
 	if (ocfs2_is_hard_readonly(osb))
 		return -EROFS;
 
-- 
GitLab


From 517bfae28353e996160518add4d00033d3886e61 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Mon, 27 Apr 2009 09:46:45 -0400
Subject: [PATCH 2140/2198] qnx4: remove ->write_super

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/inode.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index fe1f0f31d11ca..95c12fc613f1f 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -70,14 +70,6 @@ static void qnx4_delete_inode(struct inode *inode)
 	unlock_kernel();
 }
 
-static void qnx4_write_super(struct super_block *sb)
-{
-	lock_kernel();
-	QNX4DEBUG(("qnx4: write_super\n"));
-	sb->s_dirt = 0;
-	unlock_kernel();
-}
-
 static int qnx4_write_inode(struct inode *inode, int unused)
 {
 	struct qnx4_inode_entry *raw_inode;
@@ -138,7 +130,6 @@ static const struct super_operations qnx4_sops =
 #ifdef CONFIG_QNX4FS_RW
 	.write_inode	= qnx4_write_inode,
 	.delete_inode	= qnx4_delete_inode,
-	.write_super	= qnx4_write_super,
 #endif
 };
 
-- 
GitLab


From 8c85e125124a473d6f3e9bb187b0b84207f81d91 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 28 Apr 2009 18:00:26 +0200
Subject: [PATCH 2141/2198] remove ->write_super call in generic_shutdown_super

We just did a full fs writeout using sync_filesystem before, and if
that's not enough for the filesystem it can perform it's own writeout
in ->put_super, which many filesystems already do.

Move a call to foofs_write_super into every foofs_put_super for now to
guarantee identical behaviour until it's cleaned up by the individual
filesystem maintainers.

Exceptions:

 - affs already has identical copy & pasted code at the beginning of
   affs_put_super so no need to do it twice.
 - xfs does the right thing without it and I have changes pending for
   the xfs tree touching this are so I don't really need conflicts
   here..

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bfs/inode.c      | 4 ++++
 fs/exofs/super.c    | 3 +++
 fs/ext2/super.c     | 3 +++
 fs/ext4/super.c     | 3 +++
 fs/fat/inode.c      | 3 +++
 fs/hfs/super.c      | 2 ++
 fs/hfsplus/super.c  | 2 ++
 fs/jffs2/super.c    | 3 +++
 fs/nilfs2/super.c   | 4 ++++
 fs/reiserfs/super.c | 3 +++
 fs/super.c          | 2 --
 fs/sysv/inode.c     | 3 +++
 fs/ufs/super.c      | 3 +++
 13 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index cc4062d12ca21..4cf3d269e2718 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -30,6 +30,7 @@ MODULE_LICENSE("GPL");
 #define dprintf(x...)
 #endif
 
+static void bfs_write_super(struct super_block *s);
 void dump_imap(const char *prefix, struct super_block *s);
 
 struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
@@ -216,6 +217,9 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
+	if (s->s_dirt)
+		bfs_write_super(s);
+
 	brelse(info->si_sbh);
 	mutex_destroy(&info->bfs_lock);
 	kfree(info->si_imap);
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f1985e857e25..3cdb761db8ad8 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -258,6 +258,9 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
+	if (sb->s_dirt)
+		exofs_write_super(sb);
+
 	/* make sure there are no pending commands */
 	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
 	     num_pend = atomic_read(&sbi->s_curr_pending)) {
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index e3c748faf2dbc..932a2bcb69081 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -114,6 +114,9 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
+	if (sb->s_dirt)
+		ext2_write_super(sb);
+
 	ext2_xattr_put_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		struct ext2_super_block *es = sbi->s_es;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f016707597a72..c7b8f8d9b7a83 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -576,6 +576,9 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	if (sb->s_dirt)
+		ext4_write_super(sb);
+
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
 	ext4_ext_release(sb);
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 296785a0dec80..4978621511bf7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,6 +451,9 @@ static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
+	if (sb->s_dirt)
+		fat_write_super(sb);
+
 	if (sbi->nls_disk) {
 		unload_nls(sbi->nls_disk);
 		sbi->nls_disk = NULL;
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index a36bb749926da..e071e6d064636 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -65,6 +65,8 @@ static void hfs_write_super(struct super_block *sb)
  */
 static void hfs_put_super(struct super_block *sb)
 {
+	if (sb->s_dirt)
+		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index f2a64020f42e1..40bdab79dae8a 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -199,6 +199,8 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
+	if (sb->s_dirt)
+		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
 		struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 4c4e18c54a511..5059e9633edbd 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -174,6 +174,9 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
+	if (sb->s_dirt)
+		jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 6989b03e97ab5..7901d8cbb9b1f 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -65,6 +65,7 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
 		   "(NILFS)");
 MODULE_LICENSE("GPL");
 
+static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
 static int test_exclusive_mount(struct file_system_type *fs_type,
 				struct block_device *bdev, int flags);
@@ -315,6 +316,9 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
+	if (sb->s_dirt)
+		nilfs_write_super(sb);
+
 	nilfs_detach_segment_constructor(sbi);
 
 	if (!(sb->s_flags & MS_RDONLY)) {
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 9dbdcfb5d3148..1b52daa351c56 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -468,6 +468,9 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	if (s->s_dirt)
+		reiserfs_write_super(s);
+
 	/* change file system state to current state if it was mounted with read-write permissions */
 	if (!(s->s_flags & MS_RDONLY)) {
 		if (!journal_begin(&th, s, 10)) {
diff --git a/fs/super.c b/fs/super.c
index 721236e6177ac..d9a29d5b1d280 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -311,8 +311,6 @@ void generic_shutdown_super(struct super_block *sb)
 		invalidate_inodes(sb);
 		lock_kernel();
 
-		if (sop->write_super && sb->s_dirt)
-			sop->write_super(sb);
 		if (sop->put_super)
 			sop->put_super(sb);
 
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index da20b48d350fd..cd80316302ccd 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -72,6 +72,9 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	if (sb->s_dirt)
+		sysv_write_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		/* XXX ext2 also updates the state here */
 		mark_buffer_dirty(sbi->s_bh1);
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 60359291761f2..74afb9fbf58e3 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1152,6 +1152,9 @@ static void ufs_put_super(struct super_block *sb)
 		
 	UFSD("ENTER\n");
 
+	if (sb->s_dirt)
+		ufs_write_super(sb);
+
 	if (!(sb->s_flags & MS_RDONLY))
 		ufs_put_super_internal(sb);
 	
-- 
GitLab


From f3da392e9ff14b9f388e74319e6d195848991c07 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 4 May 2009 03:32:03 +0400
Subject: [PATCH 2142/2198] dcache: extrace and use d_unlinked()

d_unlinked() will be used in middle-term to ban checkpointing when opened
but unlinked file is detected, and in long term, to detect such situation
and special case on it.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c            | 7 +++----
 fs/namespace.c         | 8 ++++----
 include/linux/dcache.h | 5 +++++
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 75659a6fd1f80..9e5cd3c3a6ba7 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1910,7 +1910,7 @@ char *__d_path(const struct path *path, struct path *root,
 
 	spin_lock(&vfsmount_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+	if (d_unlinked(dentry) &&
 		(prepend(&end, &buflen, " (deleted)", 10) != 0))
 			goto Elong;
 
@@ -2035,7 +2035,7 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
 
 	spin_lock(&dcache_lock);
 	prepend(&end, &buflen, "\0", 1);
-	if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
+	if (d_unlinked(dentry) &&
 		(prepend(&end, &buflen, "//deleted", 9) != 0))
 			goto Elong;
 	if (buflen < 1)
@@ -2097,9 +2097,8 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 	read_unlock(&current->fs->lock);
 
 	error = -ENOENT;
-	/* Has the current directory has been unlinked? */
 	spin_lock(&dcache_lock);
-	if (IS_ROOT(pwd.dentry) || !d_unhashed(pwd.dentry)) {
+	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
 		struct path tmp = root;
 		char * cwd;
diff --git a/fs/namespace.c b/fs/namespace.c
index 120b8a6b99ed6..7e537f0393b58 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1384,7 +1384,7 @@ static int graft_tree(struct vfsmount *mnt, struct path *path)
 		goto out_unlock;
 
 	err = -ENOENT;
-	if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
+	if (!d_unlinked(path->dentry))
 		err = attach_recursive_mnt(mnt, path, NULL);
 out_unlock:
 	mutex_unlock(&path->dentry->d_inode->i_mutex);
@@ -1566,7 +1566,7 @@ static int do_move_mount(struct path *path, char *old_name)
 	if (IS_DEADDIR(path->dentry->d_inode))
 		goto out1;
 
-	if (!IS_ROOT(path->dentry) && d_unhashed(path->dentry))
+	if (d_unlinked(path->dentry))
 		goto out1;
 
 	err = -EINVAL;
@@ -2129,9 +2129,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root,
 	error = -ENOENT;
 	if (IS_DEADDIR(new.dentry->d_inode))
 		goto out2;
-	if (d_unhashed(new.dentry) && !IS_ROOT(new.dentry))
+	if (d_unlinked(new.dentry))
 		goto out2;
-	if (d_unhashed(old.dentry) && !IS_ROOT(old.dentry))
+	if (d_unlinked(old.dentry))
 		goto out2;
 	error = -EBUSY;
 	if (new.mnt == root.mnt ||
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index 72ce2ae88591e..30b93b2a01a42 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -353,6 +353,11 @@ static inline int d_unhashed(struct dentry *dentry)
 	return (dentry->d_flags & DCACHE_UNHASHED);
 }
 
+static inline int d_unlinked(struct dentry *dentry)
+{
+	return d_unhashed(dentry) && !IS_ROOT(dentry);
+}
+
 static inline struct dentry *dget_parent(struct dentry *dentry)
 {
 	struct dentry *ret;
-- 
GitLab


From e5004753388dcf5e1b8a52ac0ab807d232340fbb Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 May 2009 16:08:56 +0200
Subject: [PATCH 2143/2198] cleanup sync_supers

Merge the write_super helper into sync_super and move the check for
->write_super earlier so that we can avoid grabbing a reference to
a superblock that doesn't have it.

While we're at it also add a little comment documenting sync_supers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 27 +++++++++++++++------------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index d9a29d5b1d280..cb19fffc7681f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -399,16 +399,14 @@ void drop_super(struct super_block *sb)
 
 EXPORT_SYMBOL(drop_super);
 
-static inline void write_super(struct super_block *sb)
-{
-	lock_super(sb);
-	if (sb->s_root && sb->s_dirt)
-		if (sb->s_op->write_super)
-			sb->s_op->write_super(sb);
-	unlock_super(sb);
-}
-
-/*
+/**
+ * sync_supers - helper for periodic superblock writeback
+ *
+ * Call the write_super method if present on all dirty superblocks in
+ * the system.  This is for the periodic writeback used by most older
+ * filesystems.  For data integrity superblock writeback use
+ * sync_filesystems() instead.
+ *
  * Note: check the dirty flag before waiting, so we don't
  * hold up the sync while mounting a device. (The newly
  * mounted device won't need syncing.)
@@ -420,12 +418,17 @@ void sync_supers(void)
 	spin_lock(&sb_lock);
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_dirt) {
+		if (sb->s_op->write_super && sb->s_dirt) {
 			sb->s_count++;
 			spin_unlock(&sb_lock);
+
 			down_read(&sb->s_umount);
-			write_super(sb);
+			lock_super(sb);
+			if (sb->s_root && sb->s_dirt)
+				sb->s_op->write_super(sb);
+			unlock_super(sb);
 			up_read(&sb->s_umount);
+
 			spin_lock(&sb_lock);
 			if (__put_super_and_need_restart(sb))
 				goto restart;
-- 
GitLab


From 5af7926ff33b68b3ba46531471c6e0564b285efc Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 May 2009 15:41:25 +0200
Subject: [PATCH 2144/2198] enforce ->sync_fs is only called for rw superblock

Make sure a superblock really is writeable by checking MS_RDONLY
under s_umount.  sync_filesystems needed some re-arragement for
that, but all but one sync_filesystem caller had the correct locking
already so that we could add that check there.  cachefiles grew
s_umount locking.

I've also added a WARN_ON to sync_filesystem to assert this for
future callers.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/super.c          |  3 ---
 fs/cachefiles/interface.c |  2 ++
 fs/reiserfs/super.c       | 21 +++++++++------------
 fs/sync.c                 | 23 ++++++++++++++++-------
 fs/ubifs/super.c          |  3 ---
 5 files changed, 27 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 52d84522c2c2d..9f179d4832d5c 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -394,9 +394,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;
 
-	if (sb->s_flags & MS_RDONLY)
-		return 0;
-
 	if (!wait) {
 		filemap_flush(root->fs_info->btree_inode->i_mapping);
 		return 0;
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index dafd484d7bdac..431accd475a73 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -354,7 +354,9 @@ static void cachefiles_sync_cache(struct fscache_cache *_cache)
 	/* make sure all pages pinned by operations on behalf of the netfs are
 	 * written to disc */
 	cachefiles_begin_secure(cache, &saved_cred);
+	down_read(&cache->mnt->mnt_sb->s_umount);
 	ret = sync_filesystem(cache->mnt->mnt_sb);
+	up_read(&cache->mnt->mnt_sb->s_umount);
 	cachefiles_end_secure(cache, saved_cred);
 
 	if (ret == -EIO)
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1b52daa351c56..3da0401c0a966 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -64,18 +64,15 @@ static int reiserfs_statfs(struct dentry *dentry, struct kstatfs *buf);
 
 static int reiserfs_sync_fs(struct super_block *s, int wait)
 {
-	if (!(s->s_flags & MS_RDONLY)) {
-		struct reiserfs_transaction_handle th;
-		reiserfs_write_lock(s);
-		if (!journal_begin(&th, s, 1))
-			if (!journal_end_sync(&th, s, 1))
-				reiserfs_flush_old_commits(s);
-		s->s_dirt = 0;	/* Even if it's not true.
-				 * We'll loop forever in sync_supers otherwise */
-		reiserfs_write_unlock(s);
-	} else {
-		s->s_dirt = 0;
-	}
+	struct reiserfs_transaction_handle th;
+
+	reiserfs_write_lock(s);
+	if (!journal_begin(&th, s, 1))
+		if (!journal_end_sync(&th, s, 1))
+			reiserfs_flush_old_commits(s);
+	s->s_dirt = 0;	/* Even if it's not true.
+			 * We'll loop forever in sync_supers otherwise */
+	reiserfs_write_unlock(s);
 	return 0;
 }
 
diff --git a/fs/sync.c b/fs/sync.c
index 4487b5560dc8e..89c37f732afad 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -51,6 +51,18 @@ int sync_filesystem(struct super_block *sb)
 {
 	int ret;
 
+	/*
+	 * We need to be protected against the filesystem going from
+	 * r/o to r/w or vice versa.
+	 */
+	WARN_ON(!rwsem_is_locked(&sb->s_umount));
+
+	/*
+	 * No point in syncing out anything if the filesystem is read-only.
+	 */
+	if (sb->s_flags & MS_RDONLY)
+		return 0;
+
 	ret = __sync_filesystem(sb, 0);
 	if (ret < 0)
 		return ret;
@@ -79,25 +91,22 @@ static void sync_filesystems(int wait)
 
 	mutex_lock(&mutex);		/* Could be down_interruptible */
 	spin_lock(&sb_lock);
-	list_for_each_entry(sb, &super_blocks, s_list) {
-		if (sb->s_flags & MS_RDONLY)
-			continue;
+	list_for_each_entry(sb, &super_blocks, s_list)
 		sb->s_need_sync = 1;
-	}
 
 restart:
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		if (!sb->s_need_sync)
 			continue;
 		sb->s_need_sync = 0;
-		if (sb->s_flags & MS_RDONLY)
-			continue;	/* hm.  Was remounted r/o meanwhile */
 		sb->s_count++;
 		spin_unlock(&sb_lock);
+
 		down_read(&sb->s_umount);
-		if (sb->s_root)
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root)
 			__sync_filesystem(sb, wait);
 		up_read(&sb->s_umount);
+
 		/* restart only when sb is no longer on the list */
 		spin_lock(&sb_lock);
 		if (__put_super_and_need_restart(sb))
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index e9f7a754c4f74..84f3c7fd15524 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -447,9 +447,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait)
 	if (!wait)
 		return 0;
 
-	if (sb->s_flags & MS_RDONLY)
-		return 0;
-
 	/*
 	 * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
 	 * pages, so synchronize them first, then commit the journal. Strictly
-- 
GitLab


From 443b94baaa16771e98b29ca7c24f1e305738ffca Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 May 2009 23:48:50 -0400
Subject: [PATCH 2145/2198] Make sure that all callers of remount hold s_umount
 exclusive

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/super.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index cb19fffc7681f..49f670cb9a830 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -579,7 +579,7 @@ static void do_emergency_remount(struct work_struct *work)
 	list_for_each_entry(sb, &super_blocks, s_list) {
 		sb->s_count++;
 		spin_unlock(&sb_lock);
-		down_read(&sb->s_umount);
+		down_write(&sb->s_umount);
 		if (sb->s_root && sb->s_bdev && !(sb->s_flags & MS_RDONLY)) {
 			/*
 			 * ->remount_fs needs lock_kernel().
@@ -590,7 +590,8 @@ static void do_emergency_remount(struct work_struct *work)
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
 			unlock_kernel();
 		}
-		drop_super(sb);
+		up_write(&sb->s_umount);
+		put_super(sb);
 		spin_lock(&sb_lock);
 	}
 	spin_unlock(&sb_lock);
-- 
GitLab


From 62c6943b4b1e818aea60c11c5a68a50785b83119 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 7 May 2009 03:12:29 -0400
Subject: [PATCH 2146/2198] Trim a bit of crap from fs.h

do_remount_sb() is fs/internal.h fodder, fsync_no_super() is long gone.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/internal.h      | 5 +++++
 include/linux/fs.h | 3 ---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/fs/internal.h b/fs/internal.h
index dbec3cc28338f..d55ef562f0bb5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -78,3 +78,8 @@ extern void chroot_fs_refs(struct path *, struct path *);
  * file_table.c
  */
 extern void mark_files_ro(struct super_block *);
+
+/*
+ * super.c
+ */
+extern int do_remount_sb(struct super_block *, int, void *, int);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fb1822bed7c85..e7833ef5d1d63 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1942,7 +1942,6 @@ extern struct super_block *freeze_bdev(struct block_device *);
 extern void emergency_thaw_all(void);
 extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
 extern int fsync_bdev(struct block_device *);
-extern int fsync_no_super(struct block_device *);
 #else
 static inline void bd_forget(struct inode *inode) {}
 static inline int sync_blockdev(struct block_device *bdev) { return 0; }
@@ -2079,8 +2078,6 @@ extern int vfs_fsync(struct file *file, struct dentry *dentry, int datasync);
 extern void sync_supers(void);
 extern void emergency_sync(void);
 extern void emergency_remount(void);
-extern int do_remount_sb(struct super_block *sb, int flags,
-			 void *data, int force);
 #ifdef CONFIG_BLOCK
 extern sector_t bmap(struct inode *, sector_t);
 #endif
-- 
GitLab


From a9e220f8322e2b0e0b8903fe00265461cffad3f0 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Tue, 5 May 2009 22:10:44 -0400
Subject: [PATCH 2147/2198] No need to do lock_super() for exclusion in
 generic_shutdown_super()

We can't run into contention on it.  All other callers of lock_super()
either hold s_umount (and we have it exclusive) or hold an active
reference to superblock in question, which prevents the call of
generic_shutdown_super() while the reference is held.  So we can
replace lock_super(s) with get_fs_excl() in generic_shutdown_super()
(and corresponding change for unlock_super(), of course).

Since ext4 expects s_lock held for its put_super, take lock_super()
into it.  The rest of filesystems do not care at all.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext4/super.c | 2 +-
 fs/super.c      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c7b8f8d9b7a83..0d3034c5e8a43 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -576,6 +576,7 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;
 
+	lock_super(sb);
 	if (sb->s_dirt)
 		ext4_write_super(sb);
 
@@ -645,7 +646,6 @@ static void ext4_put_super(struct super_block *sb)
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
-	lock_super(sb);
 	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
diff --git a/fs/super.c b/fs/super.c
index 49f670cb9a830..54fd331f0cabe 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -304,7 +304,7 @@ void generic_shutdown_super(struct super_block *sb)
 	if (sb->s_root) {
 		shrink_dcache_for_umount(sb);
 		sync_filesystem(sb);
-		lock_super(sb);
+		get_fs_excl();
 		sb->s_flags &= ~MS_ACTIVE;
 
 		/* bad name - it should be evict_inodes() */
@@ -322,7 +322,7 @@ void generic_shutdown_super(struct super_block *sb)
 		}
 
 		unlock_kernel();
-		unlock_super(sb);
+		put_fs_excl();
 	}
 	spin_lock(&sb_lock);
 	/* should be initialized for __put_super_and_need_restart() */
-- 
GitLab


From 6cfd0148425e528b859b26e436b01f23f6926224 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 5 May 2009 15:40:36 +0200
Subject: [PATCH 2148/2198] push BKL down into ->put_super

Move BKL into ->put_super from the only caller.  A couple of
filesystems had trivial enough ->put_super (only kfree and NULLing of
s_fs_info + stuff in there) to not get any locking: coda, cramfs, efs,
hugetlbfs, omfs, qnx4, shmem, all others got the full treatment.  Most
of them probably don't need it, but I'd rather sort that out individually.
Preferably after all the other BKL pushdowns in that area.

[AV: original used to move lock_super() down as well; these changes are
removed since we don't do lock_super() at all in generic_shutdown_super()
now]
[AV: fuse, btrfs and xfs are known to need no damn BKL, exempt]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/super.c          | 4 ++++
 fs/affs/super.c          | 5 ++++-
 fs/afs/super.c           | 4 ++++
 fs/befs/linuxvfs.c       | 5 ++++-
 fs/bfs/inode.c           | 4 ++++
 fs/cifs/cifsfs.c         | 6 +++++-
 fs/ecryptfs/super.c      | 5 +++++
 fs/exofs/super.c         | 4 ++++
 fs/ext2/super.c          | 4 +++-
 fs/ext3/super.c          | 5 ++++-
 fs/ext4/super.c          | 2 +-
 fs/fat/inode.c           | 4 ++++
 fs/freevxfs/vxfs_super.c | 4 ++++
 fs/gfs2/super.c          | 4 ++++
 fs/hfs/super.c           | 4 ++++
 fs/hfsplus/super.c       | 5 +++++
 fs/hpfs/super.c          | 5 +++++
 fs/isofs/inode.c         | 5 +++++
 fs/jffs2/super.c         | 4 ++++
 fs/jfs/super.c           | 5 +++++
 fs/minix/inode.c         | 4 +++-
 fs/ncpfs/inode.c         | 4 ++++
 fs/nilfs2/super.c        | 4 ++++
 fs/ntfs/super.c          | 6 +++++-
 fs/ocfs2/super.c         | 4 ++++
 fs/reiserfs/super.c      | 4 +++-
 fs/smbfs/inode.c         | 4 ++++
 fs/squashfs/super.c      | 4 ++++
 fs/super.c               | 3 ---
 fs/sysv/inode.c          | 4 ++++
 fs/ubifs/super.c         | 5 +++++
 fs/udf/super.c           | 5 +++++
 fs/ufs/super.c           | 6 ++++++
 33 files changed, 133 insertions(+), 12 deletions(-)

diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index dd9becca4241b..0ec5aaf47aa77 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -132,11 +132,15 @@ static void adfs_put_super(struct super_block *sb)
 	int i;
 	struct adfs_sb_info *asb = ADFS_SB(sb);
 
+	lock_kernel();
+
 	for (i = 0; i < asb->s_map_size; i++)
 		brelse(asb->s_map[i].dm_bh);
 	kfree(asb->s_map);
 	kfree(asb);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt)
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 63f5183f263b3..d7386462a8e7b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -29,6 +29,8 @@ affs_put_super(struct super_block *sb)
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 	pr_debug("AFFS: put_super()\n");
 
+	lock_kernel();
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1);
 		secs_to_datestamp(get_seconds(),
@@ -42,7 +44,8 @@ affs_put_super(struct super_block *sb)
 	affs_brelse(sbi->s_root_bh);
 	kfree(sbi);
 	sb->s_fs_info = NULL;
-	return;
+
+	unlock_kernel();
 }
 
 static void
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 76828e5f8a390..ad0514d0115f7 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -440,8 +440,12 @@ static void afs_put_super(struct super_block *sb)
 
 	_enter("");
 
+	lock_kernel();
+
 	afs_put_volume(as->volume);
 
+	unlock_kernel();
+
 	_leave("");
 }
 
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 76afd0d6b86c9..9367b6297d84c 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -737,6 +737,8 @@ parse_options(char *options, befs_mount_options * opts)
 static void
 befs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	kfree(BEFS_SB(sb)->mount_opts.iocharset);
 	BEFS_SB(sb)->mount_opts.iocharset = NULL;
 
@@ -747,7 +749,8 @@ befs_put_super(struct super_block *sb)
 
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
-	return;
+
+	unlock_kernel();
 }
 
 /* Allocate private field of the superblock, fill it.
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 4cf3d269e2718..3a9a1361fdc17 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -217,6 +217,8 @@ static void bfs_put_super(struct super_block *s)
 	if (!info)
 		return;
 
+	lock_kernel();
+
 	if (s->s_dirt)
 		bfs_write_super(s);
 
@@ -225,6 +227,8 @@ static void bfs_put_super(struct super_block *s)
 	kfree(info->si_imap);
 	kfree(info);
 	s->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0a10a59b63928..0d92114195ab6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -204,6 +204,9 @@ cifs_put_super(struct super_block *sb)
 		cFYI(1, ("Empty cifs superblock info passed to unmount"));
 		return;
 	}
+
+	lock_kernel();
+
 	rc = cifs_umount(sb, cifs_sb);
 	if (rc)
 		cERROR(1, ("cifs_umount failed with return code %d", rc));
@@ -216,7 +219,8 @@ cifs_put_super(struct super_block *sb)
 
 	unload_nls(cifs_sb->local_nls);
 	kfree(cifs_sb);
-	return;
+
+	unlock_kernel();
 }
 
 static int
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index fa4c7e7d15d99..12d649602d3a1 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -27,6 +27,7 @@
 #include <linux/mount.h>
 #include <linux/key.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <linux/file.h>
 #include <linux/crypto.h>
 #include "ecryptfs_kernel.h"
@@ -120,9 +121,13 @@ static void ecryptfs_put_super(struct super_block *sb)
 {
 	struct ecryptfs_sb_info *sb_info = ecryptfs_superblock_to_private(sb);
 
+	lock_kernel();
+
 	ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat);
 	kmem_cache_free(ecryptfs_sb_info_cache, sb_info);
 	ecryptfs_set_superblock_private(sb, NULL);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 3cdb761db8ad8..cd1f8b18a2186 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -258,6 +258,8 @@ static void exofs_put_super(struct super_block *sb)
 	int num_pend;
 	struct exofs_sb_info *sbi = sb->s_fs_info;
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		exofs_write_super(sb);
 
@@ -274,6 +276,8 @@ static void exofs_put_super(struct super_block *sb)
 	osduld_put_device(sbi->s_dev);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 /*
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 932a2bcb69081..a44963d8edbd4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -114,6 +114,8 @@ static void ext2_put_super (struct super_block * sb)
 	int i;
 	struct ext2_sb_info *sbi = EXT2_SB(sb);
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		ext2_write_super(sb);
 
@@ -138,7 +140,7 @@ static void ext2_put_super (struct super_block * sb)
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 
-	return;
+	unlock_kernel();
 }
 
 static struct kmem_cache * ext2_inode_cachep;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 1efd958687e97..546b8d732bf24 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -398,6 +398,8 @@ static void ext3_put_super (struct super_block * sb)
 	struct ext3_super_block *es = sbi->s_es;
 	int i, err;
 
+	lock_kernel();
+
 	ext3_xattr_put_super(sb);
 	err = journal_destroy(sbi->s_journal);
 	sbi->s_journal = NULL;
@@ -446,7 +448,8 @@ static void ext3_put_super (struct super_block * sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
-	return;
+
+	unlock_kernel();
 }
 
 static struct kmem_cache *ext3_inode_cachep;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0d3034c5e8a43..1d4180b867729 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -577,6 +577,7 @@ static void ext4_put_super(struct super_block *sb)
 	int i, err;
 
 	lock_super(sb);
+	lock_kernel();
 	if (sb->s_dirt)
 		ext4_write_super(sb);
 
@@ -646,7 +647,6 @@ static void ext4_put_super(struct super_block *sb)
 	unlock_super(sb);
 	kobject_put(&sbi->s_kobj);
 	wait_for_completion(&sbi->s_kobj_unregister);
-	lock_kernel();
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 }
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 4978621511bf7..2b88c93af227b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -451,6 +451,8 @@ static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		fat_write_super(sb);
 
@@ -470,6 +472,8 @@ static void fat_put_super(struct super_block *sb)
 
 	sb->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 static struct kmem_cache *fat_inode_cachep;
diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c
index 1dacda8315775..cdbd1654e4cde 100644
--- a/fs/freevxfs/vxfs_super.c
+++ b/fs/freevxfs/vxfs_super.c
@@ -80,12 +80,16 @@ vxfs_put_super(struct super_block *sbp)
 {
 	struct vxfs_sb_info	*infp = VXFS_SBI(sbp);
 
+	lock_kernel();
+
 	vxfs_put_fake_inode(infp->vsi_fship);
 	vxfs_put_fake_inode(infp->vsi_ilist);
 	vxfs_put_fake_inode(infp->vsi_stilist);
 
 	brelse(infp->vsi_bp);
 	kfree(infp);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 0a68013364708..c8930b31cdf0b 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -719,6 +719,8 @@ static void gfs2_put_super(struct super_block *sb)
 	int error;
 	struct gfs2_jdesc *jd;
 
+	lock_kernel();
+
 	/*  Unfreeze the filesystem, if we need to  */
 
 	mutex_lock(&sdp->sd_freeze_lock);
@@ -785,6 +787,8 @@ static void gfs2_put_super(struct super_block *sb)
 
 	/*  At this point, we're through participating in the lockspace  */
 	gfs2_sys_fs_del(sdp);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index e071e6d064636..9f5eaa01cc77b 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -65,11 +65,15 @@ static void hfs_write_super(struct super_block *sb)
  */
 static void hfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	if (sb->s_dirt)
 		hfs_write_super(sb);
 	hfs_mdb_close(sb);
 	/* release the MDB's resources */
 	hfs_mdb_put(sb);
+
+	unlock_kernel();
 }
 
 /*
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 40bdab79dae8a..9b292dcc39c83 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -199,6 +199,9 @@ static void hfsplus_put_super(struct super_block *sb)
 	dprint(DBG_SUPER, "hfsplus_put_super\n");
 	if (!sb->s_fs_info)
 		return;
+
+	lock_kernel();
+
 	if (sb->s_dirt)
 		hfsplus_write_super(sb);
 	if (!(sb->s_flags & MS_RDONLY) && HFSPLUS_SB(sb).s_vhdr) {
@@ -220,6 +223,8 @@ static void hfsplus_put_super(struct super_block *sb)
 		unload_nls(HFSPLUS_SB(sb).nls);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int hfsplus_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index fc77965be8415..437a32e9deacb 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -99,11 +99,16 @@ int hpfs_stop_cycles(struct super_block *s, int key, int *c1, int *c2,
 static void hpfs_put_super(struct super_block *s)
 {
 	struct hpfs_sb_info *sbi = hpfs_sb(s);
+
+	lock_kernel();
+
 	kfree(sbi->sb_cp_table);
 	kfree(sbi->sb_bmp_dir);
 	unmark_dirty(s);
 	s->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 unsigned hpfs_count_one_bitmap(struct super_block *s, secno secno)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index b4cbe9603c7d9..068b34b5a107b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -42,11 +42,16 @@ static int isofs_dentry_cmp_ms(struct dentry *dentry, struct qstr *a, struct qst
 static void isofs_put_super(struct super_block *sb)
 {
 	struct isofs_sb_info *sbi = ISOFS_SB(sb);
+
 #ifdef CONFIG_JOLIET
+	lock_kernel();
+
 	if (sbi->s_nls_iocharset) {
 		unload_nls(sbi->s_nls_iocharset);
 		sbi->s_nls_iocharset = NULL;
 	}
+
+	unlock_kernel();
 #endif
 
 	kfree(sbi);
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 5059e9633edbd..37b12125c127d 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -174,6 +174,8 @@ static void jffs2_put_super (struct super_block *sb)
 
 	D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n"));
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		jffs2_write_super(sb);
 
@@ -195,6 +197,8 @@ static void jffs2_put_super (struct super_block *sb)
 	if (c->mtd->sync)
 		c->mtd->sync(c->mtd);
 
+	unlock_kernel();
+
 	D1(printk(KERN_DEBUG "jffs2_put_super returning\n"));
 }
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index d9b0e92b36021..3eb13adf38623 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -183,6 +183,9 @@ static void jfs_put_super(struct super_block *sb)
 	int rc;
 
 	jfs_info("In jfs_put_super");
+
+	lock_kernel();
+
 	rc = jfs_umount(sb);
 	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
@@ -195,6 +198,8 @@ static void jfs_put_super(struct super_block *sb)
 	sbi->direct_inode = NULL;
 
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 enum {
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index daad3c2740dbd..7eb53970f4bc2 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -35,6 +35,8 @@ static void minix_put_super(struct super_block *sb)
 	int i;
 	struct minix_sb_info *sbi = minix_sb(sb);
 
+	lock_kernel();
+
 	if (!(sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_version != MINIX_V3)	 /* s_state is now out from V3 sb */
 			sbi->s_ms->s_state = sbi->s_mount_state;
@@ -49,7 +51,7 @@ static void minix_put_super(struct super_block *sb)
 	sb->s_fs_info = NULL;
 	kfree(sbi);
 
-	return;
+	unlock_kernel();
 }
 
 static struct kmem_cache * minix_inode_cachep;
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index d642f0e5b3653..b99ce205b1bd7 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -736,6 +736,8 @@ static void ncp_put_super(struct super_block *sb)
 {
 	struct ncp_server *server = NCP_SBP(sb);
 
+	lock_kernel();
+
 	ncp_lock_server(server);
 	ncp_disconnect(server);
 	ncp_unlock_server(server);
@@ -769,6 +771,8 @@ static void ncp_put_super(struct super_block *sb)
 	vfree(server->packet);
 	sb->s_fs_info = NULL;
 	kfree(server);
+
+	unlock_kernel();
 }
 
 static int ncp_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7901d8cbb9b1f..7262e8427c203 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -316,6 +316,8 @@ static void nilfs_put_super(struct super_block *sb)
 	struct nilfs_sb_info *sbi = NILFS_SB(sb);
 	struct the_nilfs *nilfs = sbi->s_nilfs;
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		nilfs_write_super(sb);
 
@@ -333,6 +335,8 @@ static void nilfs_put_super(struct super_block *sb)
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 6aa7c47135366..a9ec4e1084e4f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2246,6 +2246,9 @@ static void ntfs_put_super(struct super_block *sb)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering.");
+
+	lock_kernel();
+
 #ifdef NTFS_RW
 	/*
 	 * Commit all inodes while they are still open in case some of them
@@ -2444,7 +2447,8 @@ static void ntfs_put_super(struct super_block *sb)
 	}
 	sb->s_fs_info = NULL;
 	kfree(vol);
-	return;
+
+	unlock_kernel();
 }
 
 /**
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 3eb076ce4c073..02737596b597d 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1536,9 +1536,13 @@ static void ocfs2_put_super(struct super_block *sb)
 {
 	mlog_entry("(0x%p)\n", sb);
 
+	lock_kernel();
+
 	ocfs2_sync_blockdev(sb);
 	ocfs2_dismount_volume(sb, 0);
 
+	unlock_kernel();
+
 	mlog_exit_void();
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 3da0401c0a966..90dcb7b033ea2 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,6 +465,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	lock_kernel();
+
 	if (s->s_dirt)
 		reiserfs_write_super(s);
 
@@ -500,7 +502,7 @@ static void reiserfs_put_super(struct super_block *s)
 	kfree(s->s_fs_info);
 	s->s_fs_info = NULL;
 
-	return;
+	unlock_kernel();
 }
 
 static struct kmem_cache *reiserfs_inode_cachep;
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index fc27fbfc5397c..1402d2d54f523 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -474,6 +474,8 @@ smb_put_super(struct super_block *sb)
 {
 	struct smb_sb_info *server = SMB_SB(sb);
 
+	lock_kernel();
+
 	smb_lock_server(server);
 	server->state = CONN_INVALID;
 	smbiod_unregister_server(server);
@@ -489,6 +491,8 @@ smb_put_super(struct super_block *sb)
 	smb_unlock_server(server);
 	put_pid(server->conn_pid);
 	kfree(server);
+
+	unlock_kernel();
 }
 
 static int smb_fill_super(struct super_block *sb, void *raw_data, int silent)
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 0adc624c956f7..3b52770f46ffd 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -338,6 +338,8 @@ static int squashfs_remount(struct super_block *sb, int *flags, char *data)
 
 static void squashfs_put_super(struct super_block *sb)
 {
+	lock_kernel();
+
 	if (sb->s_fs_info) {
 		struct squashfs_sb_info *sbi = sb->s_fs_info;
 		squashfs_cache_delete(sbi->block_cache);
@@ -350,6 +352,8 @@ static void squashfs_put_super(struct super_block *sb)
 		kfree(sb->s_fs_info);
 		sb->s_fs_info = NULL;
 	}
+
+	unlock_kernel();
 }
 
 
diff --git a/fs/super.c b/fs/super.c
index 54fd331f0cabe..bdd7158b785e7 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -309,7 +309,6 @@ void generic_shutdown_super(struct super_block *sb)
 
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
-		lock_kernel();
 
 		if (sop->put_super)
 			sop->put_super(sb);
@@ -320,8 +319,6 @@ void generic_shutdown_super(struct super_block *sb)
 			   "Self-destruct in 5 seconds.  Have a nice day...\n",
 			   sb->s_id);
 		}
-
-		unlock_kernel();
 		put_fs_excl();
 	}
 	spin_lock(&sb_lock);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index cd80316302ccd..a8189864c2416 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -72,6 +72,8 @@ static void sysv_put_super(struct super_block *sb)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 
+	lock_kernel();
+
 	if (sb->s_dirt)
 		sysv_write_super(sb);
 
@@ -87,6 +89,8 @@ static void sysv_put_super(struct super_block *sb)
 		brelse(sbi->s_bh2);
 
 	kfree(sbi);
+
+	unlock_kernel();
 }
 
 static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 84f3c7fd15524..522c3fd7eb3ca 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1684,6 +1684,9 @@ static void ubifs_put_super(struct super_block *sb)
 
 	ubifs_msg("un-mount UBI device %d, volume %d", c->vi.ubi_num,
 		  c->vi.vol_id);
+
+	lock_kernel();
+
 	/*
 	 * The following asserts are only valid if there has not been a failure
 	 * of the media. For example, there will be dirty inodes if we failed
@@ -1750,6 +1753,8 @@ static void ubifs_put_super(struct super_block *sb)
 	ubi_close_volume(c->ubi);
 	mutex_unlock(&c->umount_mutex);
 	kfree(c);
+
+	unlock_kernel();
 }
 
 static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 0ba44107d8f10..04802cc39b18d 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -2062,6 +2062,9 @@ static void udf_put_super(struct super_block *sb)
 	struct udf_sb_info *sbi;
 
 	sbi = UDF_SB(sb);
+
+	lock_kernel();
+
 	if (sbi->s_vat_inode)
 		iput(sbi->s_vat_inode);
 	if (sbi->s_partitions)
@@ -2077,6 +2080,8 @@ static void udf_put_super(struct super_block *sb)
 	kfree(sbi->s_partmaps);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
+
+	unlock_kernel();
 }
 
 static int udf_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 74afb9fbf58e3..2b4d2b6234dfe 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -594,6 +594,9 @@ static void ufs_put_super_internal(struct super_block *sb)
 
 	
 	UFSD("ENTER\n");
+
+	lock_kernel();
+
 	ufs_put_cstotal(sb);
 	size = uspi->s_cssize;
 	blks = (size + uspi->s_fsize - 1) >> uspi->s_fshift;
@@ -621,6 +624,9 @@ static void ufs_put_super_internal(struct super_block *sb)
 		brelse (sbi->s_ucg[i]);
 	kfree (sbi->s_ucg);
 	kfree (base);
+
+	unlock_kernel();
+
 	UFSD("EXIT\n");
 }
 
-- 
GitLab


From bbd6851a3213a525128473e978b692ab6ac11aba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Wed, 6 May 2009 10:43:07 -0400
Subject: [PATCH 2149/2198] Push lock_super() into the ->remount_fs() of
 filesystems that care about it

Note that since we can't run into contention between remount_fs and write_super
(due to exclusion on s_umount), we have to care only about filesystems that
touch lock_super() on their own.  Out of those ext3, ext4, hpfs, sysv and ufs
do need it; fat doesn't since its ->remount_fs() only accesses assign-once
data (basically, it's "we have no atime on directories and only have atime on
files for vfat; force nodiratime and possibly noatime into *flags").

[folded a build fix from hch]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext3/super.c |  3 +++
 fs/ext4/super.c |  3 +++
 fs/hpfs/super.c |  3 +++
 fs/super.c      |  2 --
 fs/sysv/inode.c |  2 ++
 fs/ufs/super.c  | 11 ++++++++++-
 6 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 546b8d732bf24..e213a2613a561 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2491,6 +2491,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 #endif
 
 	/* Store the original options */
+	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
 	old_opts.s_resuid = sbi->s_resuid;
@@ -2598,6 +2599,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
 			kfree(old_opts.s_qf_names[i]);
 #endif
+	unlock_super(sb);
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2614,6 +2616,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
+	unlock_super(sb);
 	return err;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 1d4180b867729..a9c6834259293 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3421,6 +3421,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 
 	/* Store the original options */
+	lock_super(sb);
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
 	old_opts.s_resuid = sbi->s_resuid;
@@ -3554,6 +3555,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		    old_opts.s_qf_names[i] != sbi->s_qf_names[i])
 			kfree(old_opts.s_qf_names[i]);
 #endif
+	unlock_super(sb);
 	return 0;
 
 restore_opts:
@@ -3573,6 +3575,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 		sbi->s_qf_names[i] = old_opts.s_qf_names[i];
 	}
 #endif
+	unlock_super(sb);
 	return err;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index 437a32e9deacb..f68193cf08113 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -398,6 +398,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	
 	*flags |= MS_NOATIME;
 	
+	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
 	lowercase = sbi->sb_lowercase; conv = sbi->sb_conv;
@@ -430,9 +431,11 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 
 	replace_mount_options(s, new_opts);
 
+	unlock_super(s);
 	return 0;
 
 out_err:
+	unlock_super(s);
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/super.c b/fs/super.c
index bdd7158b785e7..2a49fed77453f 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -556,9 +556,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
-		lock_super(sb);
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		unlock_super(sb);
 		if (retval)
 			return retval;
 	}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index a8189864c2416..e0a39f1fb88e1 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -61,10 +61,12 @@ static void sysv_write_super(struct super_block *sb)
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
+	lock_super(sb);
 	if (sbi->s_forced_ro)
 		*flags |= MS_RDONLY;
 	if (!(*flags & MS_RDONLY))
 		sb->s_dirt = 1;
+	unlock_super(sb);
 	return 0;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 2b4d2b6234dfe..a5ecabfdc9761 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1181,6 +1181,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
 	
+	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
 	usb1 = ubh_get_usb_first(uspi);
@@ -1193,17 +1194,21 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE;
 	new_mount_opt = 0;
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
-	if (!ufs_parse_options (data, &new_mount_opt))
+	if (!ufs_parse_options (data, &new_mount_opt)) {
+		unlock_super(sb);
 		return -EINVAL;
+	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
 		new_mount_opt |= ufstype;
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
+		unlock_super(sb);
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
+		unlock_super(sb);
 		return 0;
 	}
 	
@@ -1228,6 +1233,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 #ifndef CONFIG_UFS_FS_WRITE
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
+		unlock_super(sb);
 		return -EINVAL;
 #else
 		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1236,16 +1242,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		    ufstype != UFS_MOUNT_UFSTYPE_SUNx86 &&
 		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 			printk("this ufstype is read-only supported\n");
+			unlock_super(sb);
 			return -EINVAL;
 		}
 		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
+			unlock_super(sb);
 			return -EPERM;
 		}
 		sb->s_flags &= ~MS_RDONLY;
 #endif
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
+	unlock_super(sb);
 	return 0;
 }
 
-- 
GitLab


From 6fac98dd218653c6aff8a0f56305c424930cea2a Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 May 2009 13:31:17 -0400
Subject: [PATCH 2150/2198] Push BKL into do_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/alpha/kernel/osf_sys.c | 3 ---
 fs/compat.c                 | 2 --
 fs/namespace.c              | 4 ++--
 3 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c
index 42ee05981e717..9a3334ae282e3 100644
--- a/arch/alpha/kernel/osf_sys.c
+++ b/arch/alpha/kernel/osf_sys.c
@@ -371,8 +371,6 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path,
 	int retval = -EINVAL;
 	char *name;
 
-	lock_kernel();
-
 	name = getname(path);
 	retval = PTR_ERR(name);
 	if (IS_ERR(name))
@@ -392,7 +390,6 @@ SYSCALL_DEFINE4(osf_mount, unsigned long, typenr, char __user *, path,
 	}
 	putname(name);
  out:
-	unlock_kernel();
 	return retval;
 }
 
diff --git a/fs/compat.c b/fs/compat.c
index bb2a9b2e81738..6aefb776dfeb6 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -812,10 +812,8 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name,
 		}
 	}
 
-	lock_kernel();
 	retval = do_mount((char*)dev_page, dir_page, (char*)type_page,
 			flags, (void*)data_page);
-	unlock_kernel();
 
  out4:
 	free_page(data_page);
diff --git a/fs/namespace.c b/fs/namespace.c
index 7e537f0393b58..4740f7bdb5562 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1921,6 +1921,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (retval)
 		goto dput_out;
 
+	lock_kernel();
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
@@ -1933,6 +1934,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
+	unlock_kernel();
 dput_out:
 	path_put(&path);
 	return retval;
@@ -2046,10 +2048,8 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (retval < 0)
 		goto out3;
 
-	lock_kernel();
 	retval = do_mount((char *)dev_page, dir_page, (char *)type_page,
 			  flags, (void *)data_page);
-	unlock_kernel();
 	free_page(data_page);
 
 out3:
-- 
GitLab


From 7f78d4cd4c5d01864943c22b79df1b6bde923129 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 May 2009 13:34:06 -0400
Subject: [PATCH 2151/2198] Push BKL down beyond VFS-only parts of do_mount()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 4740f7bdb5562..b94325f00c5a2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1515,8 +1515,11 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-	else
+	else {
+		lock_kernel();
 		err = do_remount_sb(sb, flags, data, 0);
+		unlock_kernel();
+	}
 	if (!err)
 		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
@@ -1630,7 +1633,9 @@ static int do_new_mount(struct path *path, char *type, int flags,
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
+	lock_kernel();
 	mnt = do_kern_mount(type, flags, name, data);
+	unlock_kernel();
 	if (IS_ERR(mnt))
 		return PTR_ERR(mnt);
 
@@ -1921,7 +1926,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	if (retval)
 		goto dput_out;
 
-	lock_kernel();
 	if (flags & MS_REMOUNT)
 		retval = do_remount(&path, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
@@ -1934,7 +1938,6 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 	else
 		retval = do_new_mount(&path, type_page, flags, mnt_flags,
 				      dev_name, data_page);
-	unlock_kernel();
 dput_out:
 	path_put(&path);
 	return retval;
-- 
GitLab


From 4aa98cf768b6f2ea4b204620d949a665959214f6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 8 May 2009 13:36:58 -0400
Subject: [PATCH 2152/2198] Push BKL down into do_remount_sb()

[folded fix from Jiri Slaby]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c | 10 ++--------
 fs/super.c     | 16 +++++++++++-----
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index b94325f00c5a2..2dd333b0fe7f6 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1060,11 +1060,8 @@ static int do_umount(struct vfsmount *mnt, int flags)
 		 * we just try to remount it readonly.
 		 */
 		down_write(&sb->s_umount);
-		if (!(sb->s_flags & MS_RDONLY)) {
-			lock_kernel();
+		if (!(sb->s_flags & MS_RDONLY))
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
-			unlock_kernel();
-		}
 		up_write(&sb->s_umount);
 		return retval;
 	}
@@ -1515,11 +1512,8 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-	else {
-		lock_kernel();
+	else
 		err = do_remount_sb(sb, flags, data, 0);
-		unlock_kernel();
-	}
 	if (!err)
 		path->mnt->mnt_flags = mnt_flags;
 	up_write(&sb->s_umount);
diff --git a/fs/super.c b/fs/super.c
index 2a49fed77453f..a64f362087975 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -542,25 +542,33 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
+	lock_kernel();
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
 		if (force)
 			mark_files_ro(sb);
-		else if (!fs_may_remount_ro(sb))
+		else if (!fs_may_remount_ro(sb)) {
+			unlock_kernel();
 			return -EBUSY;
+		}
 		retval = vfs_dq_off(sb, 1);
-		if (retval < 0 && retval != -ENOSYS)
+		if (retval < 0 && retval != -ENOSYS) {
+			unlock_kernel();
 			return -EBUSY;
+		}
 	}
 	remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
 
 	if (sb->s_op->remount_fs) {
 		retval = sb->s_op->remount_fs(sb, &flags, data);
-		if (retval)
+		if (retval) {
+			unlock_kernel();
 			return retval;
+		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
+	unlock_kernel();
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
 	return 0;
@@ -581,9 +589,7 @@ static void do_emergency_remount(struct work_struct *work)
 			 *
 			 * What lock protects sb->s_flags??
 			 */
-			lock_kernel();
 			do_remount_sb(sb, MS_RDONLY, NULL, 1);
-			unlock_kernel();
 		}
 		up_write(&sb->s_umount);
 		put_super(sb);
-- 
GitLab


From 01ba687577647beef6c5f2ea59bfb56fac9fcde2 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 May 2009 23:34:27 +0200
Subject: [PATCH 2153/2198] jffs2: move jffs2_write_super to super.c

jffs2_write_super is only called from super.c and doesn't use any
functionality from fs.c.  So move it over to super.c and make it
static there.

[should go in through the vfs tree as it is a requirement for the
 next patch]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/fs.c       | 15 ---------------
 fs/jffs2/os-linux.h |  1 -
 fs/jffs2/super.c    | 14 ++++++++++++++
 3 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 249305d65d5bd..237b27a3d5708 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -402,21 +402,6 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	return 0;
 }
 
-void jffs2_write_super (struct super_block *sb)
-{
-	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-	sb->s_dirt = 0;
-
-	if (sb->s_flags & MS_RDONLY)
-		return;
-
-	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-	jffs2_garbage_collect_trigger(c);
-	jffs2_erase_pending_blocks(c, 0);
-	jffs2_flush_wbuf_gc(c, 0);
-}
-
-
 /* jffs2_new_inode: allocate a new inode and inocache, add it to the hash,
    fill in the raw_inode while you're at it. */
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_inode *ri)
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 5e194a5c8e29c..2228380c47b9e 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -181,7 +181,6 @@ void jffs2_dirty_inode(struct inode *inode);
 struct inode *jffs2_new_inode (struct inode *dir_i, int mode,
 			       struct jffs2_raw_inode *ri);
 int jffs2_statfs (struct dentry *, struct kstatfs *);
-void jffs2_write_super (struct super_block *);
 int jffs2_remount_fs (struct super_block *, int *, char *);
 int jffs2_do_fill_super(struct super_block *sb, void *data, int silent);
 void jffs2_gc_release_inode(struct jffs2_sb_info *c,
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 37b12125c127d..a80a50e445e29 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -53,6 +53,20 @@ static void jffs2_i_init_once(void *foo)
 	inode_init_once(&f->vfs_inode);
 }
 
+static void jffs2_write_super(struct super_block *sb)
+{
+	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+	sb->s_dirt = 0;
+
+	if (sb->s_flags & MS_RDONLY)
+		return;
+
+	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+	jffs2_garbage_collect_trigger(c);
+	jffs2_erase_pending_blocks(c, 0);
+	jffs2_flush_wbuf_gc(c, 0);
+}
+
 static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
-- 
GitLab


From ebc1ac164560a241d9bf1b7519062910c3f90a01 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 11 May 2009 23:35:03 +0200
Subject: [PATCH 2154/2198] ->write_super lock_super pushdown

Push down lock_super into ->write_super instances and remove it from the
caller.

Following filesystem don't need ->s_lock in ->write_super and are skipped:

 * bfs, nilfs2 - no other uses of s_lock and have internal locks in
	->write_super
 * ext2 - uses BKL in ext2_write_super and has internal calls without s_lock
 * reiserfs - no other uses of s_lock as has reiserfs_write_lock (BKL) in
 	->write_super
 * xfs - no other uses of s_lock and uses internal lock (buffer lock on
	superblock buffer) to serialize ->write_super.  Also xfs_fs_write_super
	is superflous and will go away in the next merge window

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/super.c    |  2 ++
 fs/exofs/super.c   |  2 ++
 fs/ext4/super.c    |  4 +++-
 fs/fat/inode.c     |  2 ++
 fs/hfs/super.c     |  8 +++++---
 fs/hfsplus/super.c |  6 +++++-
 fs/jffs2/super.c   | 15 +++++++++------
 fs/super.c         |  2 --
 fs/sync.c          |  4 ----
 fs/sysv/inode.c    |  2 ++
 fs/ufs/super.c     |  2 ++
 11 files changed, 32 insertions(+), 17 deletions(-)

diff --git a/fs/affs/super.c b/fs/affs/super.c
index d7386462a8e7b..280d361af41f6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -54,6 +54,7 @@ affs_write_super(struct super_block *sb)
 	int clean = 2;
 	struct affs_sb_info *sbi = AFFS_SB(sb);
 
+	lock_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		//	if (sbi->s_bitmap[i].bm_bh) {
 		//		if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
@@ -66,6 +67,7 @@ affs_write_super(struct super_block *sb)
 		sb->s_dirt = !clean;	/* redo until bitmap synced */
 	} else
 		sb->s_dirt = 0;
+	unlock_super(sb);
 
 	pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
 }
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index cd1f8b18a2186..49e16af4e6194 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -214,6 +214,7 @@ static void exofs_write_super(struct super_block *sb)
 		return;
 	}
 
+	lock_super(sb);
 	lock_kernel();
 	sbi = sb->s_fs_info;
 	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
@@ -246,6 +247,7 @@ static void exofs_write_super(struct super_block *sb)
 	if (or)
 		osd_end_request(or);
 	unlock_kernel();
+	unlock_super(sb);
 	kfree(fscb);
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index a9c6834259293..c17200a42301a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -579,7 +579,7 @@ static void ext4_put_super(struct super_block *sb)
 	lock_super(sb);
 	lock_kernel();
 	if (sb->s_dirt)
-		ext4_write_super(sb);
+		ext4_commit_super(sb, 1);
 
 	ext4_release_system_zone(sb);
 	ext4_mb_release(sb);
@@ -3336,7 +3336,9 @@ int ext4_force_commit(struct super_block *sb)
 
 static void ext4_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	ext4_commit_super(sb, 1);
+	unlock_super(sb);
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2b88c93af227b..2292cbf7d364a 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -441,10 +441,12 @@ static void fat_clear_inode(struct inode *inode)
 
 static void fat_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	sb->s_dirt = 0;
 
 	if (!(sb->s_flags & MS_RDONLY))
 		fat_clusters_flush(sb);
+	unlock_super(sb);
 }
 
 static void fat_put_super(struct super_block *sb)
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 9f5eaa01cc77b..3aac417510301 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -49,11 +49,13 @@ MODULE_LICENSE("GPL");
  */
 static void hfs_write_super(struct super_block *sb)
 {
+	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		return;
+
 	/* sync everything to the buffers */
-	hfs_mdb_commit(sb);
+	if (!(sb->s_flags & MS_RDONLY))
+		hfs_mdb_commit(sb);
+	unlock_super(sb);
 }
 
 /*
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 9b292dcc39c83..1aab8aa7801e2 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -157,10 +157,12 @@ static void hfsplus_write_super(struct super_block *sb)
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
 	dprint(DBG_SUPER, "hfsplus_write_super\n");
+
+	lock_super(sb);
 	sb->s_dirt = 0;
 	if (sb->s_flags & MS_RDONLY)
 		/* warn? */
-		return;
+		goto out;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -192,6 +194,8 @@ static void hfsplus_write_super(struct super_block *sb)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
+ out:
+	unlock_super(sb);
 }
 
 static void hfsplus_put_super(struct super_block *sb)
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index a80a50e445e29..f7bfd3ac8bfa6 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -56,15 +56,18 @@ static void jffs2_i_init_once(void *foo)
 static void jffs2_write_super(struct super_block *sb)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
+
+	lock_super(sb);
 	sb->s_dirt = 0;
 
-	if (sb->s_flags & MS_RDONLY)
-		return;
+	if (!(sb->s_flags & MS_RDONLY)) {
+		D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
+		jffs2_garbage_collect_trigger(c);
+		jffs2_erase_pending_blocks(c, 0);
+		jffs2_flush_wbuf_gc(c, 0);
+	}
 
-	D1(printk(KERN_DEBUG "jffs2_write_super()\n"));
-	jffs2_garbage_collect_trigger(c);
-	jffs2_erase_pending_blocks(c, 0);
-	jffs2_flush_wbuf_gc(c, 0);
+	unlock_super(sb);
 }
 
 static int jffs2_sync_fs(struct super_block *sb, int wait)
diff --git a/fs/super.c b/fs/super.c
index a64f362087975..1905f4af01cc4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -420,10 +420,8 @@ void sync_supers(void)
 			spin_unlock(&sb_lock);
 
 			down_read(&sb->s_umount);
-			lock_super(sb);
 			if (sb->s_root && sb->s_dirt)
 				sb->s_op->write_super(sb);
-			unlock_super(sb);
 			up_read(&sb->s_umount);
 
 			spin_lock(&sb_lock);
diff --git a/fs/sync.c b/fs/sync.c
index 89c37f732afad..e9d56f6c0b74b 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -33,10 +33,8 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	else
 		sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
-	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
-	unlock_super(sb);
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
@@ -164,10 +162,8 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
 
 	/* sync the superblock to buffers */
 	sb = inode->i_sb;
-	lock_super(sb);
 	if (sb->s_dirt && sb->s_op->write_super)
 		sb->s_op->write_super(sb);
-	unlock_super(sb);
 
 	/* .. finally sync the buffers to disk */
 	err = sync_blockdev(sb->s_bdev);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index e0a39f1fb88e1..a3f45fc626a16 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -37,6 +37,7 @@ static void sysv_write_super(struct super_block *sb)
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 	unsigned long time = get_seconds(), old_time;
 
+	lock_super(sb);
 	lock_kernel();
 	if (sb->s_flags & MS_RDONLY)
 		goto clean;
@@ -56,6 +57,7 @@ static void sysv_write_super(struct super_block *sb)
 clean:
 	sb->s_dirt = 0;
 	unlock_kernel();
+	unlock_super(sb);
 }
 
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index a5ecabfdc9761..c97210ee0670c 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1131,6 +1131,7 @@ static void ufs_write_super(struct super_block *sb)
 	struct ufs_super_block_third * usb3;
 	unsigned flags;
 
+	lock_super(sb);
 	lock_kernel();
 	UFSD("ENTER\n");
 	flags = UFS_SB(sb)->s_flags;
@@ -1150,6 +1151,7 @@ static void ufs_write_super(struct super_block *sb)
 	sb->s_dirt = 0;
 	UFSD("EXIT\n");
 	unlock_kernel();
+	unlock_super(sb);
 }
 
 static void ufs_put_super(struct super_block *sb)
-- 
GitLab


From 9fd5746fd3d7838bf6ff991d50f1257057d1156f Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 21 May 2009 16:01:00 -0400
Subject: [PATCH 2155/2198] fs: Remove i_cindex from struct inode

The only user of the i_cindex element in the inode structure is used
is by the firewire drivers.  As part of an attempt to slim down the
inode structure to save memory --- since a typical Linux system will
have hundreds of thousands if not millions of inodes cached, a
reduction in the size inode has high leverage.

The firewire driver does not need i_cindex in any fast path, so it's
simple enough to calculate when it is needed, instead of wasting space
in the inode structure.

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: krh@redhat.com
Cc: stefanr@s5r6.in-berlin.de
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/ieee1394/dv1394.c        |  5 +++--
 drivers/ieee1394/ieee1394_core.h |  6 +++++-
 fs/char_dev.c                    | 14 +++++++++++++-
 include/linux/cdev.h             |  2 ++
 include/linux/fs.h               |  1 -
 5 files changed, 23 insertions(+), 5 deletions(-)

diff --git a/drivers/ieee1394/dv1394.c b/drivers/ieee1394/dv1394.c
index 823a6297a1afb..2cd00b5b45b46 100644
--- a/drivers/ieee1394/dv1394.c
+++ b/drivers/ieee1394/dv1394.c
@@ -1789,12 +1789,13 @@ static int dv1394_open(struct inode *inode, struct file *file)
 	} else {
 		/* look up the card by ID */
 		unsigned long flags;
+		int idx = ieee1394_file_to_instance(file);
 
 		spin_lock_irqsave(&dv1394_cards_lock, flags);
 		if (!list_empty(&dv1394_cards)) {
 			struct video_card *p;
 			list_for_each_entry(p, &dv1394_cards, list) {
-				if ((p->id) == ieee1394_file_to_instance(file)) {
+				if ((p->id) == idx) {
 					video = p;
 					break;
 				}
@@ -1803,7 +1804,7 @@ static int dv1394_open(struct inode *inode, struct file *file)
 		spin_unlock_irqrestore(&dv1394_cards_lock, flags);
 
 		if (!video) {
-			debug_printk("dv1394: OHCI card %d not found", ieee1394_file_to_instance(file));
+			debug_printk("dv1394: OHCI card %d not found", idx);
 			return -ENODEV;
 		}
 
diff --git a/drivers/ieee1394/ieee1394_core.h b/drivers/ieee1394/ieee1394_core.h
index 21d50f73a210c..28b9f58bafd25 100644
--- a/drivers/ieee1394/ieee1394_core.h
+++ b/drivers/ieee1394/ieee1394_core.h
@@ -5,6 +5,7 @@
 #include <linux/fs.h>
 #include <linux/list.h>
 #include <linux/types.h>
+#include <linux/cdev.h>
 #include <asm/atomic.h>
 
 #include "hosts.h"
@@ -155,7 +156,10 @@ void hpsb_packet_received(struct hpsb_host *host, quadlet_t *data, size_t size,
  */
 static inline unsigned char ieee1394_file_to_instance(struct file *file)
 {
-	return file->f_path.dentry->d_inode->i_cindex;
+	int idx = cdev_index(file->f_path.dentry->d_inode);
+	if (idx < 0)
+		idx = 0;
+	return idx;
 }
 
 extern int hpsb_disable_irm;
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 38f71222a552e..b7c9d5187a756 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -375,7 +375,6 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 		p = inode->i_cdev;
 		if (!p) {
 			inode->i_cdev = p = new;
-			inode->i_cindex = idx;
 			list_add(&inode->i_devices, &p->list);
 			new = NULL;
 		} else if (!cdev_get(p))
@@ -405,6 +404,18 @@ static int chrdev_open(struct inode *inode, struct file *filp)
 	return ret;
 }
 
+int cdev_index(struct inode *inode)
+{
+	int idx;
+	struct kobject *kobj;
+
+	kobj = kobj_lookup(cdev_map, inode->i_rdev, &idx);
+	if (!kobj)
+		return -1;
+	kobject_put(kobj);
+	return idx;
+}
+
 void cd_forget(struct inode *inode)
 {
 	spin_lock(&cdev_lock);
@@ -557,6 +568,7 @@ EXPORT_SYMBOL(cdev_init);
 EXPORT_SYMBOL(cdev_alloc);
 EXPORT_SYMBOL(cdev_del);
 EXPORT_SYMBOL(cdev_add);
+EXPORT_SYMBOL(cdev_index);
 EXPORT_SYMBOL(register_chrdev);
 EXPORT_SYMBOL(unregister_chrdev);
 EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/include/linux/cdev.h b/include/linux/cdev.h
index fb4591977b039..f389e319a454e 100644
--- a/include/linux/cdev.h
+++ b/include/linux/cdev.h
@@ -28,6 +28,8 @@ int cdev_add(struct cdev *, dev_t, unsigned);
 
 void cdev_del(struct cdev *);
 
+int cdev_index(struct inode *inode);
+
 void cd_forget(struct inode *);
 
 extern struct backing_dev_info directly_mappable_cdev_bdi;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index e7833ef5d1d63..bcd63706db87e 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -751,7 +751,6 @@ struct inode {
 		struct block_device	*i_bdev;
 		struct cdev		*i_cdev;
 	};
-	int			i_cindex;
 
 	__u32			i_generation;
 
-- 
GitLab


From 28ad0c118b0ed98b042d362acfe0017591921138 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Thu, 21 May 2009 16:01:02 -0400
Subject: [PATCH 2156/2198] fs: Rearrange inode structure elements to avoid
 waste due to padding

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/fs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index bcd63706db87e..d883aa1fc2eb5 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -729,8 +729,8 @@ struct inode {
 	struct timespec		i_atime;
 	struct timespec		i_mtime;
 	struct timespec		i_ctime;
-	unsigned int		i_blkbits;
 	blkcnt_t		i_blocks;
+	unsigned int		i_blkbits;
 	unsigned short          i_bytes;
 	umode_t			i_mode;
 	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
-- 
GitLab


From 13205fb9260c2377438599ef0773c6a3eaeb0b07 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 25 May 2009 09:30:45 +0200
Subject: [PATCH 2157/2198] ntfs: remove old debug check for dirty data in
 ntfs_put_super()

This should not trigger anymore, so kill it.

Acked-by: Anton Altaparmakov <aia21@cam.ac.uk>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ntfs/super.c | 33 +++------------------------------
 1 file changed, 3 insertions(+), 30 deletions(-)

diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index a9ec4e1084e4f..7a7b0d326395f 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2376,39 +2376,12 @@ static void ntfs_put_super(struct super_block *sb)
 		vol->mftmirr_ino = NULL;
 	}
 	/*
-	 * If any dirty inodes are left, throw away all mft data page cache
-	 * pages to allow a clean umount.  This should never happen any more
-	 * due to mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
-	 * the underlying mft records are written out and cleaned.  If it does,
-	 * happen anyway, we want to know...
+	 * We should have no dirty inodes left, due to
+	 * mft.c::ntfs_mft_writepage() cleaning all the dirty pages as
+	 * the underlying mft records are written out and cleaned.
 	 */
 	ntfs_commit_inode(vol->mft_ino);
 	write_inode_now(vol->mft_ino, 1);
-	if (sb_has_dirty_inodes(sb)) {
-		const char *s1, *s2;
-
-		mutex_lock(&vol->mft_ino->i_mutex);
-		truncate_inode_pages(vol->mft_ino->i_mapping, 0);
-		mutex_unlock(&vol->mft_ino->i_mutex);
-		write_inode_now(vol->mft_ino, 1);
-		if (sb_has_dirty_inodes(sb)) {
-			static const char *_s1 = "inodes";
-			static const char *_s2 = "";
-			s1 = _s1;
-			s2 = _s2;
-		} else {
-			static const char *_s1 = "mft pages";
-			static const char *_s2 = "They have been thrown "
-					"away.  ";
-			s1 = _s1;
-			s2 = _s2;
-		}
-		ntfs_error(sb, "Dirty %s found at umount time.  %sYou should "
-				"run chkdsk.  Please email "
-				"linux-ntfs-dev@lists.sourceforge.net and say "
-				"that you saw this message.  Thank you.", s1,
-				s2);
-	}
 #endif /* NTFS_RW */
 
 	iput(vol->mft_ino);
-- 
GitLab


From f95022161d23ee661a48af8f280472209f513a67 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Fri, 5 Jun 2009 12:26:23 +0200
Subject: [PATCH 2158/2198] xfs: remove ->write_super and stop maintaining
 ->s_dirt

the write_super method is used for

 (1) writing back the superblock periodically from pdflush
 (2) called just before ->sync_fs for data integerity syncs

We don't need (1) because we have our own peridoc writeout through xfssyncd,
and we don't need (2) because xfs_fs_sync_fs performs a proper synchronous
superblock writeout after all other data and metadata has been written out.

Also remove ->s_dirt tracking as it's only used to decide when too call
->write_super.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/xfs/linux-2.6/xfs_super.c | 12 ------------
 fs/xfs/xfs_trans.c           |  2 --
 2 files changed, 14 deletions(-)

diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index bb685269f832e..08d6bd9a39476 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -1104,15 +1104,6 @@ xfs_fs_put_super(
 	kfree(mp);
 }
 
-STATIC void
-xfs_fs_write_super(
-	struct super_block	*sb)
-{
-	if (!(sb->s_flags & MS_RDONLY))
-		xfs_sync_fsdata(XFS_M(sb), 0);
-	sb->s_dirt = 0;
-}
-
 STATIC int
 xfs_fs_sync_super(
 	struct super_block	*sb,
@@ -1137,7 +1128,6 @@ xfs_fs_sync_super(
 		error = xfs_quiesce_data(mp);
 	else
 		error = xfs_sync_fsdata(mp, 0);
-	sb->s_dirt = 0;
 
 	if (unlikely(laptop_mode)) {
 		int	prev_sync_seq = mp->m_sync_seq;
@@ -1443,7 +1433,6 @@ xfs_fs_fill_super(
 
 	XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
 
-	sb->s_dirt = 1;
 	sb->s_magic = XFS_SB_MAGIC;
 	sb->s_blocksize = mp->m_sb.sb_blocksize;
 	sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
@@ -1533,7 +1522,6 @@ static struct super_operations xfs_super_operations = {
 	.write_inode		= xfs_fs_write_inode,
 	.clear_inode		= xfs_fs_clear_inode,
 	.put_super		= xfs_fs_put_super,
-	.write_super		= xfs_fs_write_super,
 	.sync_fs		= xfs_fs_sync_super,
 	.freeze_fs		= xfs_fs_freeze,
 	.statfs			= xfs_fs_statfs,
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 8570b826fedd8..bcc39d358ad39 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -628,8 +628,6 @@ xfs_trans_apply_sb_deltas(
 		xfs_trans_log_buf(tp, bp, offsetof(xfs_dsb_t, sb_icount),
 				  offsetof(xfs_dsb_t, sb_frextents) +
 				  sizeof(sbp->sb_frextents) - 1);
-
-	tp->t_mountp->m_super->s_dirt = 1;
 }
 
 /*
-- 
GitLab


From 8688b8635266cf98f00c6b0350ea2dbe7c42c321 Mon Sep 17 00:00:00 2001
From: Mike Frysinger <vapier@gentoo.org>
Date: Tue, 26 May 2009 05:45:04 -0400
Subject: [PATCH 2159/2198] linux/magic.h: move cramfs magic out of cramfs_fs.h

Signed-off-by: Mike Frysinger <vapier@gentoo.org>
CC: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 include/linux/cramfs_fs.h | 3 +--
 include/linux/magic.h     | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/include/linux/cramfs_fs.h b/include/linux/cramfs_fs.h
index 3be4e5a27d82f..6fc2bed368b8c 100644
--- a/include/linux/cramfs_fs.h
+++ b/include/linux/cramfs_fs.h
@@ -2,9 +2,8 @@
 #define __CRAMFS_H
 
 #include <linux/types.h>
+#include <linux/magic.h>
 
-#define CRAMFS_MAGIC		0x28cd3d45	/* some random number */
-#define CRAMFS_MAGIC_WEND	0x453dcd28	/* magic number with the wrong endianess */
 #define CRAMFS_SIGNATURE	"Compressed ROMFS"
 
 /*
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 927138cf30502..1923327b98695 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -6,6 +6,8 @@
 #define AFS_SUPER_MAGIC                0x5346414F
 #define AUTOFS_SUPER_MAGIC	0x0187
 #define CODA_SUPER_MAGIC	0x73757245
+#define CRAMFS_MAGIC		0x28cd3d45	/* some random number */
+#define CRAMFS_MAGIC_WEND	0x453dcd28	/* magic number with the wrong endianess */
 #define DEBUGFS_MAGIC          0x64626720
 #define SYSFS_MAGIC		0x62656572
 #define SECURITYFS_MAGIC	0x73636673
-- 
GitLab


From 545b9fd3d737afc0bb5203b1e79194a471605acd Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Tue, 2 Jun 2009 12:07:47 +0200
Subject: [PATCH 2160/2198] fs: remove incorrect I_NEW warnings

Some filesystems can call in to sync an inode that is still in the
I_NEW state (eg. ext family, when mounted with -osync). This is OK
because the filesystem has sole access to the new inode, so it can
modify i_state without races (because no other thread should be
modifying it, by definition of I_NEW). Ie. a false positive, so
remove the warnings.

The races are described here 7ef0d7377cb287e08f3ae94cebc919448e1f5dff,
which is also where the warnings were introduced.

Reported-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e0fb2e7895985..efcedb6d9cbca 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -289,7 +289,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret;
 
 	BUG_ON(inode->i_state & I_SYNC);
-	WARN_ON(inode->i_state & I_NEW);
 
 	/* Set I_SYNC, reset I_DIRTY */
 	dirty = inode->i_state & I_DIRTY;
@@ -314,7 +313,6 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc)
 	}
 
 	spin_lock(&inode_lock);
-	WARN_ON(inode->i_state & I_NEW);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
 		if (!(inode->i_state & I_DIRTY) &&
-- 
GitLab


From 4195f73d1329e49727bcceb028e58cb38376c2b0 Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Thu, 28 May 2009 09:01:15 +0200
Subject: [PATCH 2161/2198] fs: block_dump missing dentry locking

I think the block_dump output in __mark_inode_dirty is missing dentry locking.
Surely the i_dentry list can change any time, so we may not even *get* a
dentry there. If we do get one by chance, then it would appear to be able to
go away or get renamed at any time...

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fs-writeback.c | 41 ++++++++++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 17 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index efcedb6d9cbca..40308e98c6a44 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -64,6 +64,28 @@ static void writeback_release(struct backing_dev_info *bdi)
 	clear_bit(BDI_pdflush, &bdi->state);
 }
 
+static noinline void block_dump___mark_inode_dirty(struct inode *inode)
+{
+	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
+		struct dentry *dentry;
+		const char *name = "?";
+
+		dentry = d_find_alias(inode);
+		if (dentry) {
+			spin_lock(&dentry->d_lock);
+			name = (const char *) dentry->d_name.name;
+		}
+		printk(KERN_DEBUG
+		       "%s(%d): dirtied inode %lu (%s) on %s\n",
+		       current->comm, task_pid_nr(current), inode->i_ino,
+		       name, inode->i_sb->s_id);
+		if (dentry) {
+			spin_unlock(&dentry->d_lock);
+			dput(dentry);
+		}
+	}
+}
+
 /**
  *	__mark_inode_dirty -	internal function
  *	@inode: inode to mark
@@ -114,23 +136,8 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	if ((inode->i_state & flags) == flags)
 		return;
 
-	if (unlikely(block_dump)) {
-		struct dentry *dentry = NULL;
-		const char *name = "?";
-
-		if (!list_empty(&inode->i_dentry)) {
-			dentry = list_entry(inode->i_dentry.next,
-					    struct dentry, d_alias);
-			if (dentry && dentry->d_name.name)
-				name = (const char *) dentry->d_name.name;
-		}
-
-		if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev"))
-			printk(KERN_DEBUG
-			       "%s(%d): dirtied inode %lu (%s) on %s\n",
-			       current->comm, task_pid_nr(current), inode->i_ino,
-			       name, inode->i_sb->s_id);
-	}
+	if (unlikely(block_dump))
+		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode_lock);
 	if ((inode->i_state & flags) != flags) {
-- 
GitLab


From 337eb00a2c3a421999c39c94ce7e33545ee8baa7 Mon Sep 17 00:00:00 2001
From: Alessio Igor Bogani <abogani@texware.it>
Date: Tue, 12 May 2009 15:10:54 +0200
Subject: [PATCH 2162/2198] Push BKL down into ->remount_fs()

[xfs, btrfs, capifs, shmem don't need BKL, exempt]

Signed-off-by: Alessio Igor Bogani <abogani@texware.it>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/usb/core/inode.c |  5 +++++
 fs/affs/super.c          |  7 ++++++-
 fs/ext2/super.c          | 12 ++++++++++--
 fs/ext3/super.c          |  4 ++++
 fs/ext4/super.c          |  4 ++++
 fs/hpfs/super.c          |  4 ++++
 fs/jffs2/fs.c            |  3 +++
 fs/jfs/super.c           | 22 ++++++++++++++++++----
 fs/nfs/super.c           |  2 ++
 fs/nilfs2/super.c        |  4 ++++
 fs/ntfs/super.c          | 15 ++++++++++++++-
 fs/ocfs2/super.c         |  4 ++++
 fs/reiserfs/super.c      |  4 ++++
 fs/super.c               |  2 --
 fs/ubifs/super.c         |  9 ++++++++-
 fs/udf/super.c           |  6 +++++-
 fs/ufs/super.c           | 11 ++++++++++-
 kernel/cgroup.c          |  3 +++
 18 files changed, 108 insertions(+), 13 deletions(-)

diff --git a/drivers/usb/core/inode.c b/drivers/usb/core/inode.c
index dff5760a37f61..ffe75e83787c2 100644
--- a/drivers/usb/core/inode.c
+++ b/drivers/usb/core/inode.c
@@ -39,6 +39,7 @@
 #include <linux/parser.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 #include <asm/byteorder.h>
 #include "usb.h"
 #include "hcd.h"
@@ -265,9 +266,13 @@ static int remount(struct super_block *sb, int *flags, char *data)
 		return -EINVAL;
 	}
 
+	lock_kernel();
+
 	if (usbfs_mount && usbfs_mount->mnt_sb)
 		update_sb(usbfs_mount->mnt_sb);
 
+	unlock_kernel();
+
 	return 0;
 }
 
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 280d361af41f6..c4814937c9685 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -16,6 +16,7 @@
 #include <linux/parser.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include "affs.h"
 
 extern struct timezone sys_tz;
@@ -512,6 +513,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 		kfree(new_opts);
 		return -EINVAL;
 	}
+	lock_kernel();
 	replace_mount_options(sb, new_opts);
 
 	sbi->s_flags = mount_flags;
@@ -519,8 +521,10 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	sbi->s_uid   = uid;
 	sbi->s_gid   = gid;
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		sb->s_dirt = 1;
 		while (sb->s_dirt)
@@ -529,6 +533,7 @@ affs_remount(struct super_block *sb, int *flags, char *data)
 	} else
 		res = affs_init_bitmap(sb, flags);
 
+	unlock_kernel();
 	return res;
 }
 
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index a44963d8edbd4..f8cbdf5691909 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1162,6 +1162,8 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	unsigned long old_sb_flags;
 	int err;
 
+	lock_kernel();
+
 	/* Store the old options */
 	old_sb_flags = sb->s_flags;
 	old_opts.s_mount_opt = sbi->s_mount_opt;
@@ -1197,12 +1199,16 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
 		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
 	}
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY) {
 		if (le16_to_cpu(es->s_state) & EXT2_VALID_FS ||
-		    !(sbi->s_mount_state & EXT2_VALID_FS))
+		    !(sbi->s_mount_state & EXT2_VALID_FS)) {
+			unlock_kernel();
 			return 0;
+		}
 		/*
 		 * OK, we are remounting a valid rw partition rdonly, so set
 		 * the rdonly flag and then mark the partition as valid again.
@@ -1229,12 +1235,14 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 			sb->s_flags &= ~MS_RDONLY;
 	}
 	ext2_sync_super(sb, es);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sbi->s_mount_opt = old_opts.s_mount_opt;
 	sbi->s_resuid = old_opts.s_resuid;
 	sbi->s_resgid = old_opts.s_resgid;
 	sb->s_flags = old_sb_flags;
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index e213a2613a561..26aa64dee6aac 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2490,6 +2490,8 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	int i;
 #endif
 
+	lock_kernel();
+
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -2600,6 +2602,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 restore_opts:
 	sb->s_flags = old_sb_flags;
@@ -2617,6 +2620,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 	}
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c17200a42301a..012c4251397e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3422,6 +3422,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int i;
 #endif
 
+	lock_kernel();
+
 	/* Store the original options */
 	lock_super(sb);
 	old_sb_flags = sb->s_flags;
@@ -3558,6 +3560,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 			kfree(old_opts.s_qf_names[i]);
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 
 restore_opts:
@@ -3578,6 +3581,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 #endif
 	unlock_super(sb);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index f68193cf08113..f2feaa06bf26c 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -13,6 +13,7 @@
 #include <linux/statfs.h>
 #include <linux/magic.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 
 /* Mark the filesystem dirty, so that chkdsk checks it when os/2 booted */
 
@@ -398,6 +399,7 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	
 	*flags |= MS_NOATIME;
 	
+	lock_kernel();
 	lock_super(s);
 	uid = sbi->sb_uid; gid = sbi->sb_gid;
 	umask = 0777 & ~sbi->sb_mode;
@@ -432,10 +434,12 @@ static int hpfs_remount_fs(struct super_block *s, int *flags, char *data)
 	replace_mount_options(s, new_opts);
 
 	unlock_super(s);
+	unlock_kernel();
 	return 0;
 
 out_err:
 	unlock_super(s);
+	unlock_kernel();
 	kfree(new_opts);
 	return -EINVAL;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 237b27a3d5708..3451a81b21428 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vfs.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 #include "nodelist.h"
 
 static int jffs2_flash_setup(struct jffs2_sb_info *c);
@@ -387,6 +388,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 	   This also catches the case where it was stopped and this
 	   is just a remount to restart it.
 	   Flush the writebuffer, if neccecary, else we loose it */
+	lock_kernel();
 	if (!(sb->s_flags & MS_RDONLY)) {
 		jffs2_stop_garbage_collect_thread(c);
 		mutex_lock(&c->alloc_sem);
@@ -399,6 +401,7 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data)
 
 	*flags |= MS_NOATIME;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 3eb13adf38623..09b1b6ee21861 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -32,6 +32,7 @@
 #include <linux/crc32.h>
 #include <asm/uaccess.h>
 #include <linux/seq_file.h>
+#include <linux/smp_lock.h>
 
 #include "jfs_incore.h"
 #include "jfs_filsys.h"
@@ -375,19 +376,24 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	s64 newLVSize = 0;
 	int rc = 0;
 	int flag = JFS_SBI(sb)->flag;
+	int ret;
 
 	if (!parse_options(data, sb, &newLVSize, &flag)) {
 		return -EINVAL;
 	}
+	lock_kernel();
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
 			printk(KERN_ERR
 		  "JFS: resize requires volume to be mounted read-write\n");
+			unlock_kernel();
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
-		if (rc)
+		if (rc) {
+			unlock_kernel();
 			return rc;
+		}
 	}
 
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
@@ -398,23 +404,31 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 		truncate_inode_pages(JFS_SBI(sb)->direct_inode->i_mapping, 0);
 
 		JFS_SBI(sb)->flag = flag;
-		return jfs_mount_rw(sb, 1);
+		ret = jfs_mount_rw(sb, 1);
+		unlock_kernel();
+		return ret;
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
+		unlock_kernel();
 		return rc;
 	}
 	if ((JFS_SBI(sb)->flag & JFS_NOINTEGRITY) != (flag & JFS_NOINTEGRITY))
 		if (!(sb->s_flags & MS_RDONLY)) {
 			rc = jfs_umount_rw(sb);
-			if (rc)
+			if (rc) {
+				unlock_kernel();
 				return rc;
+			}
 			JFS_SBI(sb)->flag = flag;
-			return jfs_mount_rw(sb, 1);
+			ret = jfs_mount_rw(sb, 1);
+			unlock_kernel();
+			return ret;
 		}
 	JFS_SBI(sb)->flag = flag;
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d2d67781c579f..26127b69a2755 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1813,6 +1813,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	if (data == NULL)
 		return -ENOMEM;
 
+	lock_kernel();
 	/* fill out struct with values from existing mount */
 	data->flags = nfss->flags;
 	data->rsize = nfss->rsize;
@@ -1837,6 +1838,7 @@ nfs_remount(struct super_block *sb, int *flags, char *raw_data)
 	error = nfs_compare_remount_data(nfss, data);
 out:
 	kfree(data);
+	unlock_kernel();
 	return error;
 }
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 7262e8427c203..11151eaa2c4a2 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -906,6 +906,8 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	struct nilfs_mount_options old_opts;
 	int err;
 
+	lock_kernel();
+
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -985,6 +987,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		up(&sb->s_bdev->bd_mount_sem);
 	}
  out:
+	unlock_kernel();
 	return 0;
 
  rw_remount_failed:
@@ -993,6 +996,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
 	sbi->s_snapshot_cno = old_opts.snapshot_cno;
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 7a7b0d326395f..abaaa1cbf8de4 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -443,6 +443,8 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 	ntfs_volume *vol = NTFS_SB(sb);
 
 	ntfs_debug("Entering with remount options string: %s", opt);
+
+	lock_kernel();
 #ifndef NTFS_RW
 	/* For read-only compiled driver, enforce read-only flag. */
 	*flags |= MS_RDONLY;
@@ -466,15 +468,18 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 		if (NVolErrors(vol)) {
 			ntfs_error(sb, "Volume has errors and is read-only%s",
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_IS_DIRTY) {
 			ntfs_error(sb, "Volume is dirty and read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MODIFIED_BY_CHKDSK) {
 			ntfs_error(sb, "Volume has been modified by chkdsk "
 					"and is read-only%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (vol->vol_flags & VOLUME_MUST_MOUNT_RO_MASK) {
@@ -482,11 +487,13 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 					"(0x%x) and is read-only%s",
 					(unsigned)le16_to_cpu(vol->vol_flags),
 					es);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (ntfs_set_volume_flags(vol, VOLUME_IS_DIRTY)) {
 			ntfs_error(sb, "Failed to set dirty bit in volume "
 					"information flags%s", es);
+			unlock_kernel();
 			return -EROFS;
 		}
 #if 0
@@ -506,18 +513,21 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 			ntfs_error(sb, "Failed to empty journal $LogFile%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_mark_quotas_out_of_date(vol)) {
 			ntfs_error(sb, "Failed to mark quotas out of date%s",
 					es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 		if (!ntfs_stamp_usnjrnl(vol)) {
 			ntfs_error(sb, "Failed to stamp transation log "
 					"($UsnJrnl)%s", es);
 			NVolSetErrors(vol);
+			unlock_kernel();
 			return -EROFS;
 		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
@@ -533,8 +543,11 @@ static int ntfs_remount(struct super_block *sb, int *flags, char *opt)
 
 	// TODO: Deal with *flags.
 
-	if (!parse_options(vol, opt))
+	if (!parse_options(vol, opt)) {
+		unlock_kernel();
 		return -EINVAL;
+	}
+	unlock_kernel();
 	ntfs_debug("Done.");
 	return 0;
 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 02737596b597d..201b40a441fe8 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -42,6 +42,7 @@
 #include <linux/mount.h>
 #include <linux/seq_file.h>
 #include <linux/quotaops.h>
+#include <linux/smp_lock.h>
 
 #define MLOG_MASK_PREFIX ML_SUPER
 #include <cluster/masklog.h>
@@ -581,6 +582,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 	struct mount_options parsed_options;
 	struct ocfs2_super *osb = OCFS2_SB(sb);
 
+	lock_kernel();
+
 	if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) {
 		ret = -EINVAL;
 		goto out;
@@ -684,6 +687,7 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
 			ocfs2_set_journal_params(osb);
 	}
 out:
+	unlock_kernel();
 	return ret;
 }
 
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 90dcb7b033ea2..2969773cfc22a 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -28,6 +28,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/crc32.h>
+#include <linux/smp_lock.h>
 
 struct file_system_type reiserfs_fs_type;
 
@@ -1196,6 +1197,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
 #endif
 
+	lock_kernel();
 	rs = SB_DISK_SUPER_BLOCK(s);
 
 	if (!reiserfs_parse_options
@@ -1318,10 +1320,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 
 out_ok:
 	replace_mount_options(s, new_opts);
+	unlock_kernel();
 	return 0;
 
 out_err:
 	kfree(new_opts);
+	unlock_kernel();
 	return err;
 }
 
diff --git a/fs/super.c b/fs/super.c
index 1905f4af01cc4..83b47416d0069 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -540,7 +540,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 	shrink_dcache_sb(sb);
 	sync_filesystem(sb);
 
-	lock_kernel();
 	/* If we are remounting RDONLY and current sb is read/write,
 	   make sure there are no rw files opened */
 	if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) {
@@ -566,7 +565,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 		}
 	}
 	sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
-	unlock_kernel();
 	if (remount_rw)
 		vfs_dq_quota_on_remount(sb);
 	return 0;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 522c3fd7eb3ca..3589eab02a2f5 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -36,6 +36,7 @@
 #include <linux/mount.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
+#include <linux/smp_lock.h>
 #include "ubifs.h"
 
 /*
@@ -1770,17 +1771,22 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 		return err;
 	}
 
+	lock_kernel();
 	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
+			unlock_kernel();
 			return -EROFS;
 		}
 		err = ubifs_remount_rw(c);
-		if (err)
+		if (err) {
+			unlock_kernel();
 			return err;
+		}
 	} else if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
 		if (c->ro_media) {
 			ubifs_msg("cannot re-mount due to prior errors");
+			unlock_kernel();
 			return -EROFS;
 		}
 		ubifs_remount_ro(c);
@@ -1795,6 +1801,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
 	}
 
 	ubifs_assert(c->lst.taken_empty_lebs > 0);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 04802cc39b18d..6832135159b68 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -568,6 +568,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 	if (!udf_parse_options(options, &uopt, true))
 		return -EINVAL;
 
+	lock_kernel();
 	sbi->s_flags = uopt.flags;
 	sbi->s_uid   = uopt.uid;
 	sbi->s_gid   = uopt.gid;
@@ -581,13 +582,16 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 			*flags |= MS_RDONLY;
 	}
 
-	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
+	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+		unlock_kernel();
 		return 0;
+	}
 	if (*flags & MS_RDONLY)
 		udf_close_lvid(sb);
 	else
 		udf_open_lvid(sb);
 
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index c97210ee0670c..6560dda7b18c7 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -263,6 +263,7 @@ void ufs_panic (struct super_block * sb, const char * function,
 	struct ufs_super_block_first * usb1;
 	va_list args;
 	
+	lock_kernel();
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	
@@ -1182,7 +1183,8 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	struct ufs_super_block_third * usb3;
 	unsigned new_mount_opt, ufstype;
 	unsigned flags;
-	
+
+	lock_kernel();
 	lock_super(sb);
 	uspi = UFS_SB(sb)->s_uspi;
 	flags = UFS_SB(sb)->s_flags;
@@ -1198,6 +1200,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	ufs_set_opt (new_mount_opt, ONERROR_LOCK);
 	if (!ufs_parse_options (data, &new_mount_opt)) {
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 	}
 	if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) {
@@ -1205,12 +1208,14 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	} else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) {
 		printk("ufstype can't be changed during remount\n");
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 	}
 
 	if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
 		UFS_SB(sb)->s_mount_opt = new_mount_opt;
 		unlock_super(sb);
+		unlock_kernel();
 		return 0;
 	}
 	
@@ -1236,6 +1241,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		printk("ufs was compiled with read-only support, "
 		"can't be mounted as read-write\n");
 		unlock_super(sb);
+		unlock_kernel();
 		return -EINVAL;
 #else
 		if (ufstype != UFS_MOUNT_UFSTYPE_SUN && 
@@ -1245,11 +1251,13 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 		    ufstype != UFS_MOUNT_UFSTYPE_UFS2) {
 			printk("this ufstype is read-only supported\n");
 			unlock_super(sb);
+			unlock_kernel();
 			return -EINVAL;
 		}
 		if (!ufs_read_cylinder_structures(sb)) {
 			printk("failed during remounting\n");
 			unlock_super(sb);
+			unlock_kernel();
 			return -EPERM;
 		}
 		sb->s_flags &= ~MS_RDONLY;
@@ -1257,6 +1265,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data)
 	}
 	UFS_SB(sb)->s_mount_opt = new_mount_opt;
 	unlock_super(sb);
+	unlock_kernel();
 	return 0;
 }
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index a7267bfd3765b..3fb789f6df943 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
 #include <linux/cgroupstats.h>
 #include <linux/hash.h>
 #include <linux/namei.h>
+#include <linux/smp_lock.h>
 
 #include <asm/atomic.h>
 
@@ -900,6 +901,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	struct cgroup *cgrp = &root->top_cgroup;
 	struct cgroup_sb_opts opts;
 
+	lock_kernel();
 	mutex_lock(&cgrp->dentry->d_inode->i_mutex);
 	mutex_lock(&cgroup_mutex);
 
@@ -927,6 +929,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+	unlock_kernel();
 	return ret;
 }
 
-- 
GitLab


From d5aacad548db1ff547adf35d0a77eb2a8ed4fe14 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 14:56:44 -0400
Subject: [PATCH 2163/2198] New helper - simple_fsync()

writes associated buffers, then does sync_inode() to write
the inode itself (and to make it clean).  Depends on
->write_inode() honouring the second argument.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/libfs.c         | 25 +++++++++++++++++++++++++
 include/linux/fs.h |  2 ++
 2 files changed, 27 insertions(+)

diff --git a/fs/libfs.c b/fs/libfs.c
index 80046ddf5063a..ddfa89948c3f1 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -9,6 +9,8 @@
 #include <linux/vfs.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
 
 #include <asm/uaccess.h>
 
@@ -807,6 +809,29 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
 
+int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+	struct writeback_control wbc = {
+		.sync_mode = WB_SYNC_ALL,
+		.nr_to_write = 0, /* metadata-only; caller takes care of data */
+	};
+	struct inode *inode = dentry->d_inode;
+	int err;
+	int ret;
+
+	ret = sync_mapping_buffers(inode->i_mapping);
+	if (!(inode->i_state & I_DIRTY))
+		return ret;
+	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
+		return ret;
+
+	err = sync_inode(inode, &wbc);
+	if (ret == 0)
+		ret = err;
+	return ret;
+}
+EXPORT_SYMBOL(simple_fsync);
+
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
 EXPORT_SYMBOL(dcache_dir_open);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index d883aa1fc2eb5..ede84fa7da5d0 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2345,6 +2345,8 @@ extern void simple_release_fs(struct vfsmount **mount, int *count);
 extern ssize_t simple_read_from_buffer(void __user *to, size_t count,
 			loff_t *ppos, const void *from, size_t available);
 
+extern int simple_fsync(struct file *, struct dentry *, int);
+
 #ifdef CONFIG_MIGRATION
 extern int buffer_migrate_page(struct address_space *,
 				struct page *, struct page *);
-- 
GitLab


From 79d25767583e4e086f8309bfd1f502660a64fe7f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 09:30:08 -0400
Subject: [PATCH 2164/2198] Sanitize qnx4 fsync handling

* have directory operations use mark_buffer_dirty_inode(),
  so that sync_mapping_buffers() would get those.
* make qnx4_write_inode() honour its last argument.
* get rid of insane copies of very ancient "walk the indirect blocks"
  in qnx4/fsync - they never matched the actual fs layout and, fortunately,
  never'd been called.  Again, all this junk is not needed; ->fsync()
  should just do sync_mapping_buffers + sync_inode (and if we implement
  block allocation for qnx4, we'll need to use mark_buffer_dirty_inode()
  for extent blocks)

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/Makefile        |   2 +-
 fs/qnx4/dir.c           |   2 +-
 fs/qnx4/file.c          |   2 +-
 fs/qnx4/fsync.c         | 169 ----------------------------------------
 fs/qnx4/inode.c         |  38 +++------
 fs/qnx4/namei.c         |   4 +-
 include/linux/qnx4_fs.h |   2 -
 7 files changed, 17 insertions(+), 202 deletions(-)
 delete mode 100644 fs/qnx4/fsync.c

diff --git a/fs/qnx4/Makefile b/fs/qnx4/Makefile
index 502d7fe98bab5..e4d408cc5473c 100644
--- a/fs/qnx4/Makefile
+++ b/fs/qnx4/Makefile
@@ -4,4 +4,4 @@
 
 obj-$(CONFIG_QNX4FS_FS) += qnx4.o
 
-qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o fsync.o
+qnx4-objs := inode.o dir.o namei.o file.o bitmap.o truncate.o
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ea9ffefb48add..ff6c1ba6c4e01 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -84,7 +84,7 @@ const struct file_operations qnx4_dir_operations =
 {
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index 867f42b02035d..e7033ea10e2fc 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -29,7 +29,7 @@ const struct file_operations qnx4_file_operations =
 #ifdef CONFIG_QNX4FS_RW
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
-	.fsync		= qnx4_sync_file,
+	.fsync		= simple_fsync,
 #endif
 };
 
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
deleted file mode 100644
index aa3b19544beea..0000000000000
--- a/fs/qnx4/fsync.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/* 
- * QNX4 file system, Linux implementation.
- * 
- * Version : 0.1
- * 
- * Using parts of the xiafs filesystem.
- * 
- * History :
- * 
- * 24-03-1998 by Richard Frowijn : first release.
- */
-
-#include <linux/errno.h>
-#include <linux/time.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/smp_lock.h>
-#include <linux/buffer_head.h>
-
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-
-#include <asm/system.h>
-
-/*
- * The functions for qnx4 fs file synchronization.
- */
-
-#ifdef CONFIG_QNX4FS_RW
-
-static int sync_block(struct inode *inode, unsigned short *block, int wait)
-{
-	struct buffer_head *bh;
-	unsigned short tmp;
-
-	if (!*block)
-		return 0;
-	tmp = *block;
-	bh = sb_find_get_block(inode->i_sb, *block);
-	if (!bh)
-		return 0;
-	if (*block != tmp) {
-		brelse(bh);
-		return 1;
-	}
-	if (wait && buffer_req(bh) && !buffer_uptodate(bh)) {
-		brelse(bh);
-		return -1;
-	}
-	if (wait || !buffer_uptodate(bh) || !buffer_dirty(bh)) {
-		brelse(bh);
-		return 0;
-	}
-	ll_rw_block(WRITE, 1, &bh);
-	atomic_dec(&bh->b_count);
-	return 0;
-}
-
-#ifdef WTF
-static int sync_iblock(struct inode *inode, unsigned short *iblock,
-		       struct buffer_head **bh, int wait)
-{
-	int rc;
-	unsigned short tmp;
-
-	*bh = NULL;
-	tmp = *iblock;
-	if (!tmp)
-		return 0;
-	rc = sync_block(inode, iblock, wait);
-	if (rc)
-		return rc;
-	*bh = sb_bread(inode->i_sb, tmp);
-	if (tmp != *iblock) {
-		brelse(*bh);
-		*bh = NULL;
-		return 1;
-	}
-	if (!*bh)
-		return -1;
-	return 0;
-}
-#endif
-
-static int sync_direct(struct inode *inode, int wait)
-{
-	int i;
-	int rc, err = 0;
-
-	for (i = 0; i < 7; i++) {
-		rc = sync_block(inode,
-				(unsigned short *) qnx4_raw_inode(inode)->di_first_xtnt.xtnt_blk + i, wait);
-		if (rc > 0)
-			break;
-		if (rc)
-			err = rc;
-	}
-	return err;
-}
-
-#ifdef WTF
-static int sync_indirect(struct inode *inode, unsigned short *iblock, int wait)
-{
-	int i;
-	struct buffer_head *ind_bh;
-	int rc, err = 0;
-
-	rc = sync_iblock(inode, iblock, &ind_bh, wait);
-	if (rc || !ind_bh)
-		return rc;
-
-	for (i = 0; i < 512; i++) {
-		rc = sync_block(inode,
-				((unsigned short *) ind_bh->b_data) + i,
-				wait);
-		if (rc > 0)
-			break;
-		if (rc)
-			err = rc;
-	}
-	brelse(ind_bh);
-	return err;
-}
-
-static int sync_dindirect(struct inode *inode, unsigned short *diblock,
-			  int wait)
-{
-	int i;
-	struct buffer_head *dind_bh;
-	int rc, err = 0;
-
-	rc = sync_iblock(inode, diblock, &dind_bh, wait);
-	if (rc || !dind_bh)
-		return rc;
-
-	for (i = 0; i < 512; i++) {
-		rc = sync_indirect(inode,
-				((unsigned short *) dind_bh->b_data) + i,
-				   wait);
-		if (rc > 0)
-			break;
-		if (rc)
-			err = rc;
-	}
-	brelse(dind_bh);
-	return err;
-}
-#endif
-
-int qnx4_sync_file(struct file *file, struct dentry *dentry, int unused)
-{
-        struct inode *inode = dentry->d_inode;
-	int wait, err = 0;
-        
-        (void) file;
-	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-	      S_ISLNK(inode->i_mode)))
-		return -EINVAL;
-
-	lock_kernel();
-	for (wait = 0; wait <= 1; wait++) {
-		err |= sync_direct(inode, wait);
-	}
-	err |= qnx4_sync_inode(inode);
-	unlock_kernel();
-	return (err < 0) ? -EIO : 0;
-}
-
-#endif
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 95c12fc613f1f..40712867b8a87 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -24,6 +24,7 @@
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
+#include <linux/writeback.h>
 #include <linux/vfs.h>
 #include <asm/uaccess.h>
 
@@ -34,31 +35,6 @@ static const struct super_operations qnx4_sops;
 
 #ifdef CONFIG_QNX4FS_RW
 
-int qnx4_sync_inode(struct inode *inode)
-{
-	int err = 0;
-# if 0
-	struct buffer_head *bh;
-
-   	bh = qnx4_update_inode(inode);
-	if (bh && buffer_dirty(bh))
-	{
-		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-		{
-			printk ("IO error syncing qnx4 inode [%s:%08lx]\n",
-				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
-		}
-	        brelse (bh);
-	} else if (!bh) {
-		err = -1;
-	}
-# endif
-
-	return err;
-}
-
 static void qnx4_delete_inode(struct inode *inode)
 {
 	QNX4DEBUG(("qnx4: deleting inode [%lu]\n", (unsigned long) inode->i_ino));
@@ -70,7 +46,7 @@ static void qnx4_delete_inode(struct inode *inode)
 	unlock_kernel();
 }
 
-static int qnx4_write_inode(struct inode *inode, int unused)
+static int qnx4_write_inode(struct inode *inode, int do_sync)
 {
 	struct qnx4_inode_entry *raw_inode;
 	int block, ino;
@@ -107,6 +83,16 @@ static int qnx4_write_inode(struct inode *inode, int unused)
 	raw_inode->di_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
 	raw_inode->di_first_xtnt.xtnt_size = cpu_to_le32(inode->i_blocks);
 	mark_buffer_dirty(bh);
+	if (do_sync) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
+			printk("qnx4: IO error syncing inode [%s:%08x]\n",
+					inode->i_sb->s_id, ino);
+			brelse(bh);
+			unlock_kernel();
+			return -EIO;
+		}
+	}
 	brelse(bh);
 	unlock_kernel();
 	return 0;
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 775eed3a4085f..123270c537607 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -187,7 +187,7 @@ int qnx4_rmdir(struct inode *dir, struct dentry *dentry)
 	de->di_status = 0;
 	memset(de->di_fname, 0, sizeof de->di_fname);
 	de->di_mode = 0;
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	clear_nlink(inode);
 	mark_inode_dirty(inode);
 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
@@ -232,7 +232,7 @@ int qnx4_unlink(struct inode *dir, struct dentry *dentry)
 	de->di_status = 0;
 	memset(de->di_fname, 0, sizeof de->di_fname);
 	de->di_mode = 0;
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
 	inode->i_ctime = dir->i_ctime;
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index 787d19ea9f46a..acbaec3524e09 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -126,8 +126,6 @@ extern void qnx4_truncate(struct inode *inode);
 extern void qnx4_free_inode(struct inode *inode);
 extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
 extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
-extern int qnx4_sync_file(struct file *file, struct dentry *dentry, int);
-extern int qnx4_sync_inode(struct inode *inode);
 
 static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
 {
-- 
GitLab


From 964f5369667b342994fe3f384e9ba41d404ee796 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 09:47:13 -0400
Subject: [PATCH 2165/2198] fs/qnx4: sanitize includes

fs-internal parts of qnx4_fs.h taken to fs/qnx4/qnx4.h, includes adjusted,
qnx4_fs.h doesn't need unifdef anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/qnx4/bitmap.c        |  7 +----
 fs/qnx4/dir.c           |  7 +----
 fs/qnx4/file.c          |  3 +--
 fs/qnx4/inode.c         | 11 +++-----
 fs/qnx4/namei.c         |  9 +------
 fs/qnx4/qnx4.h          | 57 +++++++++++++++++++++++++++++++++++++++
 fs/qnx4/truncate.c      |  6 +----
 include/linux/Kbuild    |  2 +-
 include/linux/qnx4_fs.h | 59 -----------------------------------------
 9 files changed, 66 insertions(+), 95 deletions(-)
 create mode 100644 fs/qnx4/qnx4.h

diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 8425cf6e96248..e1cd061a25f74 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -13,14 +13,9 @@
  * 28-06-1998 by Frank Denis : qnx4_free_inode (to be fixed) .
  */
 
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/stat.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
+#include "qnx4.h"
 
 #if 0
 int qnx4_new_block(struct super_block *sb)
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index ff6c1ba6c4e01..003c68f3238b4 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -11,14 +11,9 @@
  * 20-06-1998 by Frank Denis : Linux 2.1.99+ & dcache support.
  */
 
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/stat.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
-
+#include "qnx4.h"
 
 static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
 {
diff --git a/fs/qnx4/file.c b/fs/qnx4/file.c
index e7033ea10e2fc..09b170ac936ce 100644
--- a/fs/qnx4/file.c
+++ b/fs/qnx4/file.c
@@ -12,8 +12,7 @@
  * 27-06-1998 by Frank Denis : file overwriting.
  */
 
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
+#include "qnx4.h"
 
 /*
  * We have mostly NULL's here: the current defaults are ok for
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index 40712867b8a87..681df5fcd1614 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -13,20 +13,15 @@
  */
 
 #include <linux/module.h>
-#include <linux/types.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
 #include <linux/init.h>
+#include <linux/slab.h>
 #include <linux/highuid.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/vfs.h>
-#include <asm/uaccess.h>
+#include <linux/statfs.h>
+#include "qnx4.h"
 
 #define QNX4_VERSION  4
 #define QNX4_BMNAME   ".bitmap"
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index 123270c537607..5972ed2149372 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -12,16 +12,9 @@
  * 04-07-1998 by Frank Denis : first step for rmdir/unlink.
  */
 
-#include <linux/time.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
+#include "qnx4.h"
 
 
 /*
diff --git a/fs/qnx4/qnx4.h b/fs/qnx4/qnx4.h
new file mode 100644
index 0000000000000..9efc089454f6d
--- /dev/null
+++ b/fs/qnx4/qnx4.h
@@ -0,0 +1,57 @@
+#include <linux/fs.h>
+#include <linux/qnx4_fs.h>
+
+#define QNX4_DEBUG 0
+
+#if QNX4_DEBUG
+#define QNX4DEBUG(X) printk X
+#else
+#define QNX4DEBUG(X) (void) 0
+#endif
+
+struct qnx4_sb_info {
+	struct buffer_head	*sb_buf;	/* superblock buffer */
+	struct qnx4_super_block	*sb;		/* our superblock */
+	unsigned int		Version;	/* may be useful */
+	struct qnx4_inode_entry	*BitMap;	/* useful */
+};
+
+struct qnx4_inode_info {
+	struct qnx4_inode_entry raw;
+	loff_t mmu_private;
+	struct inode vfs_inode;
+};
+
+extern struct inode *qnx4_iget(struct super_block *, unsigned long);
+extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
+extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
+extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
+
+extern struct buffer_head *qnx4_bread(struct inode *, int, int);
+
+extern const struct inode_operations qnx4_file_inode_operations;
+extern const struct inode_operations qnx4_dir_inode_operations;
+extern const struct file_operations qnx4_file_operations;
+extern const struct file_operations qnx4_dir_operations;
+extern int qnx4_is_free(struct super_block *sb, long block);
+extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
+extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
+extern void qnx4_truncate(struct inode *inode);
+extern void qnx4_free_inode(struct inode *inode);
+extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
+extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
+
+static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
+{
+	return sb->s_fs_info;
+}
+
+static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
+{
+	return container_of(inode, struct qnx4_inode_info, vfs_inode);
+}
+
+static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
+{
+	return &qnx4_i(inode)->raw;
+}
diff --git a/fs/qnx4/truncate.c b/fs/qnx4/truncate.c
index 6437c1c3d1dd0..d94d9ee241fe5 100644
--- a/fs/qnx4/truncate.c
+++ b/fs/qnx4/truncate.c
@@ -10,12 +10,8 @@
  * 30-06-1998 by Frank DENIS : ugly filler.
  */
 
-#include <linux/types.h>
-#include <linux/errno.h>
-#include <linux/fs.h>
-#include <linux/qnx4_fs.h>
 #include <linux/smp_lock.h>
-#include <asm/uaccess.h>
+#include "qnx4.h"
 
 #ifdef CONFIG_QNX4FS_RW
 
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 3f0eaa397ef59..b3afd2219ad2e 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -135,6 +135,7 @@ header-y += posix_types.h
 header-y += ppdev.h
 header-y += prctl.h
 header-y += qnxtypes.h
+header-y += qnx4_fs.h
 header-y += radeonfb.h
 header-y += raw.h
 header-y += resource.h
@@ -308,7 +309,6 @@ unifdef-y += poll.h
 unifdef-y += ppp_defs.h
 unifdef-y += ppp-comp.h
 unifdef-y += ptrace.h
-unifdef-y += qnx4_fs.h
 unifdef-y += quota.h
 unifdef-y += random.h
 unifdef-y += irqnr.h
diff --git a/include/linux/qnx4_fs.h b/include/linux/qnx4_fs.h
index acbaec3524e09..8b9aee1a9ce34 100644
--- a/include/linux/qnx4_fs.h
+++ b/include/linux/qnx4_fs.h
@@ -85,63 +85,4 @@ struct qnx4_super_block {
 	struct qnx4_inode_entry AltBoot;
 };
 
-#ifdef __KERNEL__
-
-#define QNX4_DEBUG 0
-
-#if QNX4_DEBUG
-#define QNX4DEBUG(X) printk X
-#else
-#define QNX4DEBUG(X) (void) 0
-#endif
-
-struct qnx4_sb_info {
-	struct buffer_head	*sb_buf;	/* superblock buffer */
-	struct qnx4_super_block	*sb;		/* our superblock */
-	unsigned int		Version;	/* may be useful */
-	struct qnx4_inode_entry	*BitMap;	/* useful */
-};
-
-struct qnx4_inode_info {
-	struct qnx4_inode_entry raw;
-	loff_t mmu_private;
-	struct inode vfs_inode;
-};
-
-extern struct inode *qnx4_iget(struct super_block *, unsigned long);
-extern struct dentry *qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd);
-extern unsigned long qnx4_count_free_blocks(struct super_block *sb);
-extern unsigned long qnx4_block_map(struct inode *inode, long iblock);
-
-extern struct buffer_head *qnx4_bread(struct inode *, int, int);
-
-extern const struct inode_operations qnx4_file_inode_operations;
-extern const struct inode_operations qnx4_dir_inode_operations;
-extern const struct file_operations qnx4_file_operations;
-extern const struct file_operations qnx4_dir_operations;
-extern int qnx4_is_free(struct super_block *sb, long block);
-extern int qnx4_set_bitmap(struct super_block *sb, long block, int busy);
-extern int qnx4_create(struct inode *inode, struct dentry *dentry, int mode, struct nameidata *nd);
-extern void qnx4_truncate(struct inode *inode);
-extern void qnx4_free_inode(struct inode *inode);
-extern int qnx4_unlink(struct inode *dir, struct dentry *dentry);
-extern int qnx4_rmdir(struct inode *dir, struct dentry *dentry);
-
-static inline struct qnx4_sb_info *qnx4_sb(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-static inline struct qnx4_inode_info *qnx4_i(struct inode *inode)
-{
-	return container_of(inode, struct qnx4_inode_info, vfs_inode);
-}
-
-static inline struct qnx4_inode_entry *qnx4_raw_inode(struct inode *inode)
-{
-	return &qnx4_i(inode)->raw;
-}
-
-#endif				/* __KERNEL__ */
-
 #endif
-- 
GitLab


From b522412aeabadbb302fd4338eaabf09d10e2d29c Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 13:44:36 -0400
Subject: [PATCH 2166/2198] Sanitize ->fsync() for FAT

* mark directory data blocks as assoc. metadata
* add new inode to deal with FAT, mark FAT blocks as assoc. metadata of that
* now ->fsync() is trivial both for files and directories

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fat/dir.c         | 16 ++++++++--------
 fs/fat/fat.h         |  6 ++++++
 fs/fat/fatent.c      | 13 ++++++++-----
 fs/fat/file.c        | 14 +++++++++++++-
 fs/fat/inode.c       | 11 ++++++++++-
 fs/fat/namei_msdos.c |  4 ++--
 fs/fat/namei_vfat.c  |  4 ++--
 7 files changed, 49 insertions(+), 19 deletions(-)

diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 3a7f603b69824..f3500294eec58 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -840,7 +840,7 @@ const struct file_operations fat_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= fat_compat_dir_ioctl,
 #endif
-	.fsync		= file_fsync,
+	.fsync		= fat_file_fsync,
 };
 
 static int fat_get_short_entry(struct inode *dir, loff_t *pos,
@@ -967,7 +967,7 @@ static int __fat_remove_entries(struct inode *dir, loff_t pos, int nr_slots)
 			de++;
 			nr_slots--;
 		}
-		mark_buffer_dirty(bh);
+		mark_buffer_dirty_inode(bh, dir);
 		if (IS_DIRSYNC(dir))
 			err = sync_dirty_buffer(bh);
 		brelse(bh);
@@ -1001,7 +1001,7 @@ int fat_remove_entries(struct inode *dir, struct fat_slot_info *sinfo)
 		de--;
 		nr_slots--;
 	}
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	if (IS_DIRSYNC(dir))
 		err = sync_dirty_buffer(bh);
 	brelse(bh);
@@ -1051,7 +1051,7 @@ static int fat_zeroed_cluster(struct inode *dir, sector_t blknr, int nr_used,
 		}
 		memset(bhs[n]->b_data, 0, sb->s_blocksize);
 		set_buffer_uptodate(bhs[n]);
-		mark_buffer_dirty(bhs[n]);
+		mark_buffer_dirty_inode(bhs[n], dir);
 
 		n++;
 		blknr++;
@@ -1131,7 +1131,7 @@ int fat_alloc_new_dir(struct inode *dir, struct timespec *ts)
 	de[0].size = de[1].size = 0;
 	memset(de + 2, 0, sb->s_blocksize - 2 * sizeof(*de));
 	set_buffer_uptodate(bhs[0]);
-	mark_buffer_dirty(bhs[0]);
+	mark_buffer_dirty_inode(bhs[0], dir);
 
 	err = fat_zeroed_cluster(dir, blknr, 1, bhs, MAX_BUF_PER_PAGE);
 	if (err)
@@ -1193,7 +1193,7 @@ static int fat_add_new_entries(struct inode *dir, void *slots, int nr_slots,
 			slots += copy;
 			size -= copy;
 			set_buffer_uptodate(bhs[n]);
-			mark_buffer_dirty(bhs[n]);
+			mark_buffer_dirty_inode(bhs[n], dir);
 			if (!size)
 				break;
 			n++;
@@ -1293,7 +1293,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 		for (i = 0; i < long_bhs; i++) {
 			int copy = min_t(int, sb->s_blocksize - offset, size);
 			memcpy(bhs[i]->b_data + offset, slots, copy);
-			mark_buffer_dirty(bhs[i]);
+			mark_buffer_dirty_inode(bhs[i], dir);
 			offset = 0;
 			slots += copy;
 			size -= copy;
@@ -1304,7 +1304,7 @@ int fat_add_entries(struct inode *dir, void *slots, int nr_slots,
 			/* Fill the short name slot. */
 			int copy = min_t(int, sb->s_blocksize - offset, size);
 			memcpy(bhs[i]->b_data + offset, slots, copy);
-			mark_buffer_dirty(bhs[i]);
+			mark_buffer_dirty_inode(bhs[i], dir);
 			if (IS_DIRSYNC(dir))
 				err = sync_dirty_buffer(bhs[i]);
 		}
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index ea440d65819c8..e4d88527b5dd9 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -74,6 +74,7 @@ struct msdos_sb_info {
 
 	int fatent_shift;
 	struct fatent_operations *fatent_ops;
+	struct inode *fat_inode;
 
 	spinlock_t inode_hash_lock;
 	struct hlist_head inode_hashtable[FAT_HASH_SIZE];
@@ -251,6 +252,7 @@ struct fat_entry {
 	} u;
 	int nr_bhs;
 	struct buffer_head *bhs[2];
+	struct inode *fat_inode;
 };
 
 static inline void fatent_init(struct fat_entry *fatent)
@@ -259,6 +261,7 @@ static inline void fatent_init(struct fat_entry *fatent)
 	fatent->entry = 0;
 	fatent->u.ent32_p = NULL;
 	fatent->bhs[0] = fatent->bhs[1] = NULL;
+	fatent->fat_inode = NULL;
 }
 
 static inline void fatent_set_entry(struct fat_entry *fatent, int entry)
@@ -275,6 +278,7 @@ static inline void fatent_brelse(struct fat_entry *fatent)
 		brelse(fatent->bhs[i]);
 	fatent->nr_bhs = 0;
 	fatent->bhs[0] = fatent->bhs[1] = NULL;
+	fatent->fat_inode = NULL;
 }
 
 extern void fat_ent_access_init(struct super_block *sb);
@@ -296,6 +300,8 @@ extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
 extern void fat_truncate(struct inode *inode);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
+extern int fat_file_fsync(struct file *file, struct dentry *dentry,
+			  int datasync);
 
 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index da6eea47872f4..618f5305c2e4f 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -73,6 +73,8 @@ static int fat12_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	struct buffer_head **bhs = fatent->bhs;
 
 	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
+
 	bhs[0] = sb_bread(sb, blocknr);
 	if (!bhs[0])
 		goto err;
@@ -103,6 +105,7 @@ static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
 
 	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
+	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
 	fatent->bhs[0] = sb_bread(sb, blocknr);
 	if (!fatent->bhs[0]) {
 		printk(KERN_ERR "FAT: FAT read failed (blocknr %llu)\n",
@@ -167,9 +170,9 @@ static void fat12_ent_put(struct fat_entry *fatent, int new)
 	}
 	spin_unlock(&fat12_entry_lock);
 
-	mark_buffer_dirty(fatent->bhs[0]);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 	if (fatent->nr_bhs == 2)
-		mark_buffer_dirty(fatent->bhs[1]);
+		mark_buffer_dirty_inode(fatent->bhs[1], fatent->fat_inode);
 }
 
 static void fat16_ent_put(struct fat_entry *fatent, int new)
@@ -178,7 +181,7 @@ static void fat16_ent_put(struct fat_entry *fatent, int new)
 		new = EOF_FAT16;
 
 	*fatent->u.ent16_p = cpu_to_le16(new);
-	mark_buffer_dirty(fatent->bhs[0]);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 }
 
 static void fat32_ent_put(struct fat_entry *fatent, int new)
@@ -189,7 +192,7 @@ static void fat32_ent_put(struct fat_entry *fatent, int new)
 	WARN_ON(new & 0xf0000000);
 	new |= le32_to_cpu(*fatent->u.ent32_p) & ~0x0fffffff;
 	*fatent->u.ent32_p = cpu_to_le32(new);
-	mark_buffer_dirty(fatent->bhs[0]);
+	mark_buffer_dirty_inode(fatent->bhs[0], fatent->fat_inode);
 }
 
 static int fat12_ent_next(struct fat_entry *fatent)
@@ -381,7 +384,7 @@ static int fat_mirror_bhs(struct super_block *sb, struct buffer_head **bhs,
 			}
 			memcpy(c_bh->b_data, bhs[n]->b_data, sb->s_blocksize);
 			set_buffer_uptodate(c_bh);
-			mark_buffer_dirty(c_bh);
+			mark_buffer_dirty_inode(c_bh, sbi->fat_inode);
 			if (sb->s_flags & MS_SYNCHRONOUS)
 				err = sync_dirty_buffer(c_bh);
 			brelse(c_bh);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index 0a7f4a9918b34..e955a56b4e5ea 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -133,6 +133,18 @@ static int fat_file_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
+int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode *inode = dentry->d_inode;
+	int res, err;
+
+	res = simple_fsync(filp, dentry, datasync);
+	err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
+
+	return res ? res : err;
+}
+
+
 const struct file_operations fat_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -142,7 +154,7 @@ const struct file_operations fat_file_operations = {
 	.mmap		= generic_file_mmap,
 	.release	= fat_file_release,
 	.ioctl		= fat_generic_ioctl,
-	.fsync		= file_fsync,
+	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 2292cbf7d364a..476f80b175fe5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -458,6 +458,8 @@ static void fat_put_super(struct super_block *sb)
 	if (sb->s_dirt)
 		fat_write_super(sb);
 
+	iput(sbi->fat_inode);
+
 	if (sbi->nls_disk) {
 		unload_nls(sbi->nls_disk);
 		sbi->nls_disk = NULL;
@@ -1183,7 +1185,7 @@ static int fat_read_root(struct inode *inode)
 int fat_fill_super(struct super_block *sb, void *data, int silent,
 		   const struct inode_operations *fs_dir_inode_ops, int isvfat)
 {
-	struct inode *root_inode = NULL;
+	struct inode *root_inode = NULL, *fat_inode = NULL;
 	struct buffer_head *bh;
 	struct fat_boot_sector *b;
 	struct msdos_sb_info *sbi;
@@ -1423,6 +1425,11 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 	}
 
 	error = -ENOMEM;
+	fat_inode = new_inode(sb);
+	if (!fat_inode)
+		goto out_fail;
+	MSDOS_I(fat_inode)->i_pos = 0;
+	sbi->fat_inode = fat_inode;
 	root_inode = new_inode(sb);
 	if (!root_inode)
 		goto out_fail;
@@ -1448,6 +1455,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
 		       " on dev %s.\n", sb->s_id);
 
 out_fail:
+	if (fat_inode)
+		iput(fat_inode);
 	if (root_inode)
 		iput(root_inode);
 	if (sbi->nls_io)
diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c
index da3f361a37ddb..20f522861355a 100644
--- a/fs/fat/namei_msdos.c
+++ b/fs/fat/namei_msdos.c
@@ -544,7 +544,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		int start = MSDOS_I(new_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		if (IS_DIRSYNC(new_dir)) {
 			err = sync_dirty_buffer(dotdot_bh);
 			if (err)
@@ -586,7 +586,7 @@ static int do_msdos_rename(struct inode *old_dir, unsigned char *old_name,
 		int start = MSDOS_I(old_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		corrupt |= sync_dirty_buffer(dotdot_bh);
 	}
 error_inode:
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index a0e00e3a46e91..b50ecbe97f83d 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -965,7 +965,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		int start = MSDOS_I(new_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		if (IS_DIRSYNC(new_dir)) {
 			err = sync_dirty_buffer(dotdot_bh);
 			if (err)
@@ -1009,7 +1009,7 @@ static int vfat_rename(struct inode *old_dir, struct dentry *old_dentry,
 		int start = MSDOS_I(old_dir)->i_logstart;
 		dotdot_de->start = cpu_to_le16(start);
 		dotdot_de->starthi = cpu_to_le16(start >> 16);
-		mark_buffer_dirty(dotdot_bh);
+		mark_buffer_dirty_inode(dotdot_bh, old_inode);
 		corrupt |= sync_dirty_buffer(dotdot_bh);
 	}
 error_inode:
-- 
GitLab


From e1740a462ecb2eae213be15857b577cc6f6bb8b4 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:14:02 -0400
Subject: [PATCH 2167/2198] switch ext2 to simple_fsync()

kill ext2_sync_file() (along with ext2/fsync.c), get rid of
ext2_update_inode() - it's an alias of ext2_write_inode().

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/Makefile |  2 +-
 fs/ext2/dir.c    |  2 +-
 fs/ext2/ext2.h   |  3 ---
 fs/ext2/file.c   |  4 ++--
 fs/ext2/fsync.c  | 50 ------------------------------------------------
 fs/ext2/inode.c  | 11 ++---------
 6 files changed, 6 insertions(+), 66 deletions(-)
 delete mode 100644 fs/ext2/fsync.c

diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index e0b2b43c1fdb9..f42af45cfd88a 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_EXT2_FS) += ext2.o
 
-ext2-y := balloc.o dir.o file.o fsync.o ialloc.o inode.o \
+ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 	  ioctl.o namei.o super.o symlink.o
 
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c
index 2999d72153b7c..003500498c22e 100644
--- a/fs/ext2/dir.c
+++ b/fs/ext2/dir.c
@@ -720,5 +720,5 @@ const struct file_operations ext2_dir_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 3203042b36efa..b2bbf45039e08 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -113,9 +113,6 @@ extern int ext2_empty_dir (struct inode *);
 extern struct ext2_dir_entry_2 * ext2_dotdot (struct inode *, struct page **);
 extern void ext2_set_link(struct inode *, struct ext2_dir_entry_2 *, struct page *, struct inode *);
 
-/* fsync.c */
-extern int ext2_sync_file (struct file *, struct dentry *, int);
-
 /* ialloc.c */
 extern struct inode * ext2_new_inode (struct inode *, int);
 extern void ext2_free_inode (struct inode *);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 45ed071221821..2b9e47dc92229 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -55,7 +55,7 @@ const struct file_operations ext2_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 };
@@ -72,7 +72,7 @@ const struct file_operations ext2_xip_file_operations = {
 	.mmap		= xip_file_mmap,
 	.open		= generic_file_open,
 	.release	= ext2_release_file,
-	.fsync		= ext2_sync_file,
+	.fsync		= simple_fsync,
 };
 #endif
 
diff --git a/fs/ext2/fsync.c b/fs/ext2/fsync.c
deleted file mode 100644
index fc66c93fcb5c9..0000000000000
--- a/fs/ext2/fsync.c
+++ /dev/null
@@ -1,50 +0,0 @@
-/*
- *  linux/fs/ext2/fsync.c
- *
- *  Copyright (C) 1993  Stephen Tweedie (sct@dcs.ed.ac.uk)
- *  from
- *  Copyright (C) 1992  Remy Card (card@masi.ibp.fr)
- *                      Laboratoire MASI - Institut Blaise Pascal
- *                      Universite Pierre et Marie Curie (Paris VI)
- *  from
- *  linux/fs/minix/truncate.c   Copyright (C) 1991, 1992  Linus Torvalds
- * 
- *  ext2fs fsync primitive
- *
- *  Big-endian to little-endian byte-swapping/bitmaps by
- *        David S. Miller (davem@caip.rutgers.edu), 1995
- * 
- *  Removed unnecessary code duplication for little endian machines
- *  and excessive __inline__s. 
- *        Andi Kleen, 1997
- *
- * Major simplications and cleanup - we only need to do the metadata, because
- * we can depend on generic_block_fdatasync() to sync the data blocks.
- */
-
-#include "ext2.h"
-#include <linux/buffer_head.h>		/* for sync_mapping_buffers() */
-
-
-/*
- *	File may be NULL when we are called. Perhaps we shouldn't
- *	even pass file to fsync ?
- */
-
-int ext2_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = ext2_sync_inode(inode);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index acf6788311038..29ed682061f64 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -41,8 +41,6 @@ MODULE_AUTHOR("Remy Card and others");
 MODULE_DESCRIPTION("Second Extended Filesystem");
 MODULE_LICENSE("GPL");
 
-static int ext2_update_inode(struct inode * inode, int do_sync);
-
 /*
  * Test whether an inode is a fast symlink.
  */
@@ -66,7 +64,7 @@ void ext2_delete_inode (struct inode * inode)
 		goto no_delete;
 	EXT2_I(inode)->i_dtime	= get_seconds();
 	mark_inode_dirty(inode);
-	ext2_update_inode(inode, inode_needs_sync(inode));
+	ext2_write_inode(inode, inode_needs_sync(inode));
 
 	inode->i_size = 0;
 	if (inode->i_blocks)
@@ -1337,7 +1335,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 	return ERR_PTR(ret);
 }
 
-static int ext2_update_inode(struct inode * inode, int do_sync)
+int ext2_write_inode(struct inode *inode, int do_sync)
 {
 	struct ext2_inode_info *ei = EXT2_I(inode);
 	struct super_block *sb = inode->i_sb;
@@ -1442,11 +1440,6 @@ static int ext2_update_inode(struct inode * inode, int do_sync)
 	return err;
 }
 
-int ext2_write_inode(struct inode *inode, int wait)
-{
-	return ext2_update_inode(inode, wait);
-}
-
 int ext2_sync_inode(struct inode *inode)
 {
 	struct writeback_control wbc = {
-- 
GitLab


From 0d7916d7e985da52cdd2989c900485e17b035972 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:21:06 -0400
Subject: [PATCH 2168/2198] switch minix to simple_fsync()

* get minix_write_inode() to honour the second argument
* now we can use simple_fsync() for minixfs

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/minix/dir.c   |  2 +-
 fs/minix/file.c  | 20 +-------------------
 fs/minix/inode.c | 33 ++++++++++-----------------------
 fs/minix/minix.h |  2 --
 4 files changed, 12 insertions(+), 45 deletions(-)

diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index d4946c4c90e2f..e5f206467e402 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ static int minix_readdir(struct file *, void *, filldir_t);
 const struct file_operations minix_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= minix_readdir,
-	.fsync		= minix_sync_file,
+	.fsync		= simple_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 17765f697e50f..3eec3e607a878 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -6,15 +6,12 @@
  *  minix regular file handling primitives
  */
 
-#include <linux/buffer_head.h>		/* for fsync_inode_buffers() */
 #include "minix.h"
 
 /*
  * We have mostly NULLs here: the current defaults are OK for
  * the minix filesystem.
  */
-int minix_sync_file(struct file *, struct dentry *, int);
-
 const struct file_operations minix_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
@@ -22,7 +19,7 @@ const struct file_operations minix_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= minix_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -30,18 +27,3 @@ const struct inode_operations minix_file_inode_operations = {
 	.truncate	= minix_truncate,
 	.getattr	= minix_getattr,
 };
-
-int minix_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-	
-	err |= minix_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 7eb53970f4bc2..f91a236935974 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -556,38 +556,25 @@ static struct buffer_head * V2_minix_update_inode(struct inode * inode)
 	return bh;
 }
 
-static struct buffer_head *minix_update_inode(struct inode *inode)
-{
-	if (INODE_VERSION(inode) == MINIX_V1)
-		return V1_minix_update_inode(inode);
-	else
-		return V2_minix_update_inode(inode);
-}
-
-static int minix_write_inode(struct inode * inode, int wait)
-{
-	brelse(minix_update_inode(inode));
-	return 0;
-}
-
-int minix_sync_inode(struct inode * inode)
+static int minix_write_inode(struct inode *inode, int wait)
 {
 	int err = 0;
 	struct buffer_head *bh;
 
-	bh = minix_update_inode(inode);
-	if (bh && buffer_dirty(bh))
-	{
+	if (INODE_VERSION(inode) == MINIX_V1)
+		bh = V1_minix_update_inode(inode);
+	else
+		bh = V2_minix_update_inode(inode);
+	if (!bh)
+		return -EIO;
+	if (wait && buffer_dirty(bh)) {
 		sync_dirty_buffer(bh);
-		if (buffer_req(bh) && !buffer_uptodate(bh))
-		{
+		if (buffer_req(bh) && !buffer_uptodate(bh)) {
 			printk("IO error syncing minix inode [%s:%08lx]\n",
 				inode->i_sb->s_id, inode->i_ino);
-			err = -1;
+			err = -EIO;
 		}
 	}
-	else if (!bh)
-		err = -1;
 	brelse (bh);
 	return err;
 }
diff --git a/fs/minix/minix.h b/fs/minix/minix.h
index e6a0b193bea41..cb7fdd11f9a52 100644
--- a/fs/minix/minix.h
+++ b/fs/minix/minix.h
@@ -57,7 +57,6 @@ extern int __minix_write_begin(struct file *file, struct address_space *mapping,
 extern void V1_minix_truncate(struct inode *);
 extern void V2_minix_truncate(struct inode *);
 extern void minix_truncate(struct inode *);
-extern int minix_sync_inode(struct inode *);
 extern void minix_set_inode(struct inode *, dev_t);
 extern int V1_minix_get_block(struct inode *, long, struct buffer_head *, int);
 extern int V2_minix_get_block(struct inode *, long, struct buffer_head *, int);
@@ -72,7 +71,6 @@ extern int minix_empty_dir(struct inode*);
 extern void minix_set_link(struct minix_dir_entry*, struct page*, struct inode*);
 extern struct minix_dir_entry *minix_dotdot(struct inode*, struct page**);
 extern ino_t minix_inode_by_name(struct dentry*);
-extern int minix_sync_file(struct file *, struct dentry *, int);
 
 extern const struct inode_operations minix_file_inode_operations;
 extern const struct inode_operations minix_dir_inode_operations;
-- 
GitLab


From 05459ca81ac3064cb040d983342bc453cccec458 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:29:45 -0400
Subject: [PATCH 2169/2198] repair sysv_write_inode(), switch sysv to
 simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/dir.c   |  2 +-
 fs/sysv/file.c  | 17 +----------------
 fs/sysv/inode.c | 45 ++++++++++++++++-----------------------------
 fs/sysv/sysv.h  |  1 -
 4 files changed, 18 insertions(+), 47 deletions(-)

diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 56f655254bfed..c7798079e644f 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ static int sysv_readdir(struct file *, void *, filldir_t);
 const struct file_operations sysv_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= sysv_readdir,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 };
 
 static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 589be21d884e3..96340c01f4a7f 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= sysv_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -34,18 +34,3 @@ const struct inode_operations sysv_file_inode_operations = {
 	.truncate	= sysv_truncate,
 	.getattr	= sysv_getattr,
 };
-
-int sysv_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-	
-	err |= sysv_sync_inode(inode);
-	return err ? -EIO : 0;
-}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index a3f45fc626a16..425c976cfcd20 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -247,7 +247,7 @@ struct inode *sysv_iget(struct super_block *sb, unsigned int ino)
 	return ERR_PTR(-EIO);
 }
 
-static struct buffer_head * sysv_update_inode(struct inode * inode)
+int sysv_write_inode(struct inode *inode, int wait)
 {
 	struct super_block * sb = inode->i_sb;
 	struct sysv_sb_info * sbi = SYSV_SB(sb);
@@ -255,19 +255,21 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 	struct sysv_inode * raw_inode;
 	struct sysv_inode_info * si;
 	unsigned int ino, block;
+	int err = 0;
 
 	ino = inode->i_ino;
 	if (!ino || ino > sbi->s_ninodes) {
 		printk("Bad inode number on dev %s: %d is out of range\n",
 		       inode->i_sb->s_id, ino);
-		return NULL;
+		return -EIO;
 	}
 	raw_inode = sysv_raw_inode(sb, ino, &bh);
 	if (!raw_inode) {
 		printk("unable to read i-node block\n");
-		return NULL;
+		return -EIO;
 	}
 
+	lock_kernel();
 	raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode);
 	raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid));
 	raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid));
@@ -283,38 +285,23 @@ static struct buffer_head * sysv_update_inode(struct inode * inode)
 	for (block = 0; block < 10+1+1+1; block++)
 		write3byte(sbi, (u8 *)&si->i_data[block],
 			&raw_inode->i_data[3*block]);
+	unlock_kernel();
 	mark_buffer_dirty(bh);
-	return bh;
-}
-
-int sysv_write_inode(struct inode * inode, int wait)
-{
-	struct buffer_head *bh;
-	lock_kernel();
-	bh = sysv_update_inode(inode);
+	if (wait) {
+                sync_dirty_buffer(bh);
+                if (buffer_req(bh) && !buffer_uptodate(bh)) {
+                        printk ("IO error syncing sysv inode [%s:%08x]\n",
+                                sb->s_id, ino);
+                        err = -EIO;
+                }
+        }
 	brelse(bh);
-	unlock_kernel();
 	return 0;
 }
 
-int sysv_sync_inode(struct inode * inode)
+int sysv_sync_inode(struct inode *inode)
 {
-        int err = 0;
-        struct buffer_head *bh;
-
-        bh = sysv_update_inode(inode);
-        if (bh && buffer_dirty(bh)) {
-                sync_dirty_buffer(bh);
-                if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                        printk ("IO error syncing sysv inode [%s:%08lx]\n",
-                                inode->i_sb->s_id, inode->i_ino);
-                        err = -1;
-                }
-        }
-        else if (!bh)
-                err = -1;
-        brelse (bh);
-        return err;
+	return sysv_write_inode(inode, 1);
 }
 
 static void sysv_delete_inode(struct inode *inode)
diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h
index 5784a318c8836..53786eb5cf60a 100644
--- a/fs/sysv/sysv.h
+++ b/fs/sysv/sysv.h
@@ -144,7 +144,6 @@ extern int __sysv_write_begin(struct file *file, struct address_space *mapping,
 extern struct inode *sysv_iget(struct super_block *, unsigned int);
 extern int sysv_write_inode(struct inode *, int);
 extern int sysv_sync_inode(struct inode *);
-extern int sysv_sync_file(struct file *, struct dentry *, int);
 extern void sysv_set_inode(struct inode *, dev_t);
 extern int sysv_getattr(struct vfsmount *, struct dentry *, struct kstat *);
 extern int sysv_init_icache(void);
-- 
GitLab


From a932801543fe74050ebee07fde082234c46b624f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:35:18 -0400
Subject: [PATCH 2170/2198] switch ufs to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ufs/dir.c  |  2 +-
 fs/ufs/file.c | 23 +----------------------
 fs/ufs/ufs.h  |  1 -
 3 files changed, 2 insertions(+), 24 deletions(-)

diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 6321b797061b7..6f671f1ac2712 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ int ufs_empty_dir(struct inode * inode)
 const struct file_operations ufs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= ufs_readdir,
-	.fsync		= ufs_sync_file,
+	.fsync		= simple_fsync,
 	.llseek		= generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index 2bd3a1615714c..73655c61240a0 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,31 +24,10 @@
  */
 
 #include <linux/fs.h>
-#include <linux/buffer_head.h>	/* for sync_mapping_buffers() */
 
 #include "ufs_fs.h"
 #include "ufs.h"
 
-
-int ufs_sync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-	int ret;
-
-	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return ret;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return ret;
-
-	err = ufs_sync_inode(inode);
-	if (ret == 0)
-		ret = err;
-	return ret;
-}
-
-
 /*
  * We have mostly NULL's here: the current defaults are ok for
  * the ufs filesystem.
@@ -62,6 +41,6 @@ const struct file_operations ufs_file_operations = {
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
 	.open           = generic_file_open,
-	.fsync		= ufs_sync_file,
+	.fsync		= simple_fsync,
 	.splice_read	= generic_file_splice_read,
 };
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index d0c4acd4f1f36..644e77e135991 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -99,7 +99,6 @@ extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de,
 extern const struct inode_operations ufs_file_inode_operations;
 extern const struct file_operations ufs_file_operations;
 extern const struct address_space_operations ufs_aops;
-extern int ufs_sync_file(struct file *, struct dentry *, int);
 
 /* ialloc.c */
 extern void ufs_free_inode (struct inode *inode);
-- 
GitLab


From 90de066443a8632bb42fed0a8216313d7da07aba Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:40:27 -0400
Subject: [PATCH 2171/2198] switch udf to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/udf/Makefile  |  2 +-
 fs/udf/dir.c     |  2 +-
 fs/udf/file.c    |  2 +-
 fs/udf/fsync.c   | 52 ------------------------------------------------
 fs/udf/udfdecl.h |  3 ---
 5 files changed, 3 insertions(+), 58 deletions(-)
 delete mode 100644 fs/udf/fsync.c

diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index 0d4503f7446dc..eb880f66c23a7 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_UDF_FS) += udf.o
 
 udf-objs     := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
-		partition.o super.o truncate.o symlink.o fsync.o \
+		partition.o super.o truncate.o symlink.o \
 		directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 2efd4d5291b69..61d9a76a3a69d 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -210,5 +210,5 @@ const struct file_operations udf_dir_operations = {
 	.read			= generic_read_dir,
 	.readdir		= udf_readdir,
 	.ioctl			= udf_ioctl,
-	.fsync			= udf_fsync_file,
+	.fsync			= simple_fsync,
 };
diff --git a/fs/udf/file.c b/fs/udf/file.c
index eb91f3b703201..7464305382b5e 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -209,7 +209,7 @@ const struct file_operations udf_file_operations = {
 	.write			= do_sync_write,
 	.aio_write		= udf_file_aio_write,
 	.release		= udf_release_file,
-	.fsync			= udf_fsync_file,
+	.fsync			= simple_fsync,
 	.splice_read		= generic_file_splice_read,
 	.llseek			= generic_file_llseek,
 };
diff --git a/fs/udf/fsync.c b/fs/udf/fsync.c
deleted file mode 100644
index b2c472b733b8a..0000000000000
--- a/fs/udf/fsync.c
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * fsync.c
- *
- * PURPOSE
- *  Fsync handling routines for the OSTA-UDF(tm) filesystem.
- *
- * COPYRIGHT
- *  This file is distributed under the terms of the GNU General Public
- *  License (GPL). Copies of the GPL can be obtained from:
- *      ftp://prep.ai.mit.edu/pub/gnu/GPL
- *  Each contributing author retains all rights to their own work.
- *
- *  (C) 1999-2001 Ben Fennema
- *  (C) 1999-2000 Stelias Computing Inc
- *
- * HISTORY
- *
- *  05/22/99 blf  Created.
- */
-
-#include "udfdecl.h"
-
-#include <linux/fs.h>
-
-static int udf_fsync_inode(struct inode *, int);
-
-/*
- *	File may be NULL when we are called. Perhaps we shouldn't
- *	even pass file to fsync ?
- */
-
-int udf_fsync_file(struct file *file, struct dentry *dentry, int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-
-	return udf_fsync_inode(inode, datasync);
-}
-
-static int udf_fsync_inode(struct inode *inode, int datasync)
-{
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-
-	err |= udf_sync_inode(inode);
-
-	return err ? -EIO : 0;
-}
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index cac51b77a5d16..8d46f4294ee74 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -223,9 +223,6 @@ extern int udf_prealloc_blocks(struct super_block *, struct inode *, uint16_t,
 extern int udf_new_block(struct super_block *, struct inode *, uint16_t,
 			 uint32_t, int *);
 
-/* fsync.c */
-extern int udf_fsync_file(struct file *, struct dentry *, int);
-
 /* directory.c */
 extern struct fileIdentDesc *udf_fileident_read(struct inode *, loff_t *,
 						struct udf_fileident_bh *,
-- 
GitLab


From bea6b64c277f0824cdaea6190209b26a164419d6 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 7 Jun 2009 15:44:50 -0400
Subject: [PATCH 2172/2198] switch omfs to simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/omfs/file.c | 17 +----------------
 1 file changed, 1 insertion(+), 16 deletions(-)

diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 834b2331f6b3e..d17e774eaf456 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -11,21 +11,6 @@
 #include <linux/mpage.h>
 #include "omfs.h"
 
-static int omfs_sync_file(struct file *file, struct dentry *dentry,
-		int datasync)
-{
-	struct inode *inode = dentry->d_inode;
-	int err;
-
-	err = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
-		return err;
-	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
-		return err;
-	err |= omfs_sync_inode(inode);
-	return err ? -EIO : 0;
-}
-
 static u32 omfs_max_extents(struct omfs_sb_info *sbi, int offset)
 {
 	return (sbi->s_sys_blocksize - offset -
@@ -344,7 +329,7 @@ struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = omfs_sync_file,
+	.fsync = simple_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
-- 
GitLab


From ffdc9064f8b4fa9db37a7d5180f41cce2ea2b7ad Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 00:44:42 -0400
Subject: [PATCH 2173/2198] repair adfs ->write_inode(), switch to
 simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/adfs.h      |  4 +++-
 fs/adfs/dir.c       | 10 ++++++++--
 fs/adfs/dir_f.c     | 17 +++++++++++++++++
 fs/adfs/dir_fplus.c | 17 +++++++++++++++++
 fs/adfs/file.c      |  2 +-
 fs/adfs/inode.c     |  4 ++--
 6 files changed, 48 insertions(+), 6 deletions(-)

diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index e0a85dbeeb887..a6665f37f4560 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -53,6 +53,7 @@ struct adfs_dir_ops {
 	int	(*update)(struct adfs_dir *dir, struct object_info *obj);
 	int	(*create)(struct adfs_dir *dir, struct object_info *obj);
 	int	(*remove)(struct adfs_dir *dir, struct object_info *obj);
+	int	(*sync)(struct adfs_dir *dir);
 	void	(*free)(struct adfs_dir *dir);
 };
 
@@ -90,7 +91,8 @@ extern const struct dentry_operations adfs_dentry_operations;
 extern struct adfs_dir_ops adfs_f_dir_ops;
 extern struct adfs_dir_ops adfs_fplus_dir_ops;
 
-extern int adfs_dir_update(struct super_block *sb, struct object_info *obj);
+extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
+			   int wait);
 
 /* file.c */
 extern const struct inode_operations adfs_file_inode_operations;
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index e867ccf372466..4d4073447d1a7 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -83,7 +83,7 @@ adfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
 }
 
 int
-adfs_dir_update(struct super_block *sb, struct object_info *obj)
+adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_ADFS_FS_RW
@@ -106,6 +106,12 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj)
 	ret = ops->update(&dir, obj);
 	write_unlock(&adfs_dir_lock);
 
+	if (wait) {
+		int err = ops->sync(&dir);
+		if (!ret)
+			ret = err;
+	}
+
 	ops->free(&dir);
 out:
 #endif
@@ -199,7 +205,7 @@ const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= adfs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 };
 
 static int
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index ea7df21469211..31df6adf0de65 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -437,6 +437,22 @@ adfs_f_update(struct adfs_dir *dir, struct object_info *obj)
 #endif
 }
 
+static int
+adfs_f_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_f_free(struct adfs_dir *dir)
 {
@@ -456,5 +472,6 @@ struct adfs_dir_ops adfs_f_dir_ops = {
 	.setpos		= adfs_f_setpos,
 	.getnext	= adfs_f_getnext,
 	.update		= adfs_f_update,
+	.sync		= adfs_f_sync,
 	.free		= adfs_f_free
 };
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 1ec644e32df9a..139e0f345f181 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -161,6 +161,22 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj)
 	return ret;
 }
 
+static int
+adfs_fplus_sync(struct adfs_dir *dir)
+{
+	int err = 0;
+	int i;
+
+	for (i = dir->nr_buffers - 1; i >= 0; i--) {
+		struct buffer_head *bh = dir->bh[i];
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
+
+	return err;
+}
+
 static void
 adfs_fplus_free(struct adfs_dir *dir)
 {
@@ -175,5 +191,6 @@ struct adfs_dir_ops adfs_fplus_dir_ops = {
 	.read		= adfs_fplus_read,
 	.setpos		= adfs_fplus_setpos,
 	.getnext	= adfs_fplus_getnext,
+	.sync		= adfs_fplus_sync,
 	.free		= adfs_fplus_free
 };
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 36e381c6a99a6..8224d54a2afb4 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -30,7 +30,7 @@ const struct file_operations adfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index e647200262a20..05b3a677201d9 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -376,7 +376,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
  * The adfs-specific inode data has already been updated by
  * adfs_notify_change()
  */
-int adfs_write_inode(struct inode *inode, int unused)
+int adfs_write_inode(struct inode *inode, int wait)
 {
 	struct super_block *sb = inode->i_sb;
 	struct object_info obj;
@@ -391,7 +391,7 @@ int adfs_write_inode(struct inode *inode, int unused)
 	obj.attr	= ADFS_I(inode)->attr;
 	obj.size	= inode->i_size;
 
-	ret = adfs_dir_update(sb, &obj);
+	ret = adfs_dir_update(sb, &obj, wait);
 	unlock_kernel();
 	return ret;
 }
-- 
GitLab


From 224c886643e52e6b4c1143489cd0b289b6c03976 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 00:46:40 -0400
Subject: [PATCH 2174/2198] Fix adfs GET_FRAG_ID() on big-endian

Missing conversion to host-endian before doing shifts

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/adfs/map.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/adfs/map.c b/fs/adfs/map.c
index 92ab4fbc2031f..568081b93f732 100644
--- a/fs/adfs/map.c
+++ b/fs/adfs/map.c
@@ -62,7 +62,7 @@ static DEFINE_RWLOCK(adfs_map_lock);
 #define GET_FRAG_ID(_map,_start,_idmask)				\
 	({								\
 		unsigned char *_m = _map + (_start >> 3);		\
-		u32 _frag = get_unaligned((u32 *)_m);			\
+		u32 _frag = get_unaligned_le32(_m);			\
 		_frag >>= (_start & 7);					\
 		_frag & _idmask;					\
 	})
-- 
GitLab


From 4427f0c36e22e2cd6696b2fe7643e9756a14b3d3 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 01:15:58 -0400
Subject: [PATCH 2175/2198] repair bfs_write_inode(), switch bfs to
 simple_fsync()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bfs/dir.c   |  8 ++++----
 fs/bfs/inode.c | 12 +++++++++---
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 4dd1b623f9374..54bd07d44e687 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -79,7 +79,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= simple_fsync,
 	.llseek		= generic_file_llseek,
 };
 
@@ -205,7 +205,7 @@ static int bfs_unlink(struct inode *dir, struct dentry *dentry)
 		inode->i_nlink = 1;
 	}
 	de->ino = 0;
-	mark_buffer_dirty(bh);
+	mark_buffer_dirty_inode(bh, dir);
 	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
 	mark_inode_dirty(dir);
 	inode->i_ctime = dir->i_ctime;
@@ -267,7 +267,7 @@ static int bfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		new_inode->i_ctime = CURRENT_TIME_SEC;
 		inode_dec_link_count(new_inode);
 	}
-	mark_buffer_dirty(old_bh);
+	mark_buffer_dirty_inode(old_bh, old_dir);
 	error = 0;
 
 end_rename:
@@ -320,7 +320,7 @@ static int bfs_add_entry(struct inode *dir, const unsigned char *name,
 				for (i = 0; i < BFS_NAMELEN; i++)
 					de->name[i] =
 						(i < namelen) ? name[i] : 0;
-				mark_buffer_dirty(bh);
+				mark_buffer_dirty_inode(bh, dir);
 				brelse(bh);
 				return 0;
 			}
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 3a9a1361fdc17..d1d9d9088371d 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -98,14 +98,15 @@ struct inode *bfs_iget(struct super_block *sb, unsigned long ino)
 	return ERR_PTR(-EIO);
 }
 
-static int bfs_write_inode(struct inode *inode, int unused)
+static int bfs_write_inode(struct inode *inode, int wait)
 {
+	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
 	unsigned int ino = (u16)inode->i_ino;
         unsigned long i_sblock;
 	struct bfs_inode *di;
 	struct buffer_head *bh;
 	int block, off;
-	struct bfs_sb_info *info = BFS_SB(inode->i_sb);
+	int err = 0;
 
         dprintf("ino=%08x\n", ino);
 
@@ -146,9 +147,14 @@ static int bfs_write_inode(struct inode *inode, int unused)
 	di->i_eoffset = cpu_to_le32(i_sblock * BFS_BSIZE + inode->i_size - 1);
 
 	mark_buffer_dirty(bh);
+	if (wait) {
+		sync_dirty_buffer(bh);
+		if (buffer_req(bh) && !buffer_uptodate(bh))
+			err = -EIO;
+	}
 	brelse(bh);
 	mutex_unlock(&info->bfs_lock);
-	return 0;
+	return err;
 }
 
 static void bfs_delete_inode(struct inode *inode)
-- 
GitLab


From c475879556a8602bbe2faa9a06f6e5fcc8c05bb2 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 8 Jun 2009 01:22:00 -0400
Subject: [PATCH 2176/2198] sanitize ->fsync() for affs

unfortunately, for affs (especially for affs directories) we have
no real way to keep track of metadata ownership.  So we have to
do more or less what file_fsync() does, but we do *not* need to
call write_super() there.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/affs.h |  1 +
 fs/affs/dir.c  |  2 +-
 fs/affs/file.c | 14 +++++++++++++-
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 1a2d5e3c7f4eb..e511dc621a2e6 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -182,6 +182,7 @@ extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void		affs_free_prealloc(struct inode *inode);
 extern void	affs_truncate(struct inode *);
+int		affs_file_fsync(struct file *, struct dentry *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 7b36904dbeac6..8ca8f3a555992 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -21,7 +21,7 @@ const struct file_operations affs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= affs_readdir,
-	.fsync		= file_fsync,
+	.fsync		= affs_file_fsync,
 };
 
 /*
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 9246cb4aa018f..184e55c1c9ba4 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -34,7 +34,7 @@ const struct file_operations affs_file_operations = {
 	.mmap		= generic_file_mmap,
 	.open		= affs_file_open,
 	.release	= affs_file_release,
-	.fsync		= file_fsync,
+	.fsync		= affs_file_fsync,
 	.splice_read	= generic_file_splice_read,
 };
 
@@ -915,3 +915,15 @@ affs_truncate(struct inode *inode)
 	}
 	affs_free_prealloc(inode);
 }
+
+int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+{
+	struct inode * inode = dentry->d_inode;
+	int ret, err;
+
+	ret = write_inode_now(inode, 0);
+	err = sync_blockdev(inode->i_sb->s_bdev);
+	if (!ret)
+		ret = err;
+	return ret;
+}
-- 
GitLab


From e28964365faf3b9961695eb62b48cbc9f2a2a245 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:03:15 +0200
Subject: [PATCH 2177/2198] affs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs.  Factor out common code
between affs_put_super, affs_write_super and the new affs_sync_fs into
a helper.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/affs/super.c | 40 +++++++++++++++++++++++++++-------------
 1 file changed, 27 insertions(+), 13 deletions(-)

diff --git a/fs/affs/super.c b/fs/affs/super.c
index c4814937c9685..104fdcb3a7fca 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -24,6 +24,19 @@ extern struct timezone sys_tz;
 static int affs_statfs(struct dentry *dentry, struct kstatfs *buf);
 static int affs_remount (struct super_block *sb, int *flags, char *data);
 
+static void
+affs_commit_super(struct super_block *sb, int clean)
+{
+	struct affs_sb_info *sbi = AFFS_SB(sb);
+	struct buffer_head *bh = sbi->s_root_bh;
+	struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
+
+	tail->bm_flag = cpu_to_be32(clean);
+	secs_to_datestamp(get_seconds(), &tail->disk_change);
+	affs_fix_checksum(sb, bh);
+	mark_buffer_dirty(bh);
+}
+
 static void
 affs_put_super(struct super_block *sb)
 {
@@ -32,13 +45,8 @@ affs_put_super(struct super_block *sb)
 
 	lock_kernel();
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(1);
-		secs_to_datestamp(get_seconds(),
-				  &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
-		affs_fix_checksum(sb, sbi->s_root_bh);
-		mark_buffer_dirty(sbi->s_root_bh);
-	}
+	if (!(sb->s_flags & MS_RDONLY))
+		affs_commit_super(sb, 1);
 
 	kfree(sbi->s_prefix);
 	affs_free_bitmap(sb);
@@ -53,18 +61,13 @@ static void
 affs_write_super(struct super_block *sb)
 {
 	int clean = 2;
-	struct affs_sb_info *sbi = AFFS_SB(sb);
 
 	lock_super(sb);
 	if (!(sb->s_flags & MS_RDONLY)) {
 		//	if (sbi->s_bitmap[i].bm_bh) {
 		//		if (buffer_dirty(sbi->s_bitmap[i].bm_bh)) {
 		//			clean = 0;
-		AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->bm_flag = cpu_to_be32(clean);
-		secs_to_datestamp(get_seconds(),
-				  &AFFS_ROOT_TAIL(sb, sbi->s_root_bh)->disk_change);
-		affs_fix_checksum(sb, sbi->s_root_bh);
-		mark_buffer_dirty(sbi->s_root_bh);
+		affs_commit_super(sb, clean);
 		sb->s_dirt = !clean;	/* redo until bitmap synced */
 	} else
 		sb->s_dirt = 0;
@@ -73,6 +76,16 @@ affs_write_super(struct super_block *sb)
 	pr_debug("AFFS: write_super() at %lu, clean=%d\n", get_seconds(), clean);
 }
 
+static int
+affs_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
+	affs_commit_super(sb, 2);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+	return 0;
+}
+
 static struct kmem_cache * affs_inode_cachep;
 
 static struct inode *affs_alloc_inode(struct super_block *sb)
@@ -130,6 +143,7 @@ static const struct super_operations affs_sops = {
 	.clear_inode	= affs_clear_inode,
 	.put_super	= affs_put_super,
 	.write_super	= affs_write_super,
+	.sync_fs	= affs_sync_fs,
 	.statfs		= affs_statfs,
 	.remount_fs	= affs_remount,
 	.show_options	= generic_show_options,
-- 
GitLab


From 561e47ce7244168788d4ecef9a2271df204b3c89 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:03:38 +0200
Subject: [PATCH 2178/2198] bfs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/bfs/inode.c | 32 +++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index d1d9d9088371d..6f60336c66281 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -216,6 +216,26 @@ static void bfs_delete_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
+static int bfs_sync_fs(struct super_block *sb, int wait)
+{
+	struct bfs_sb_info *info = BFS_SB(sb);
+
+	mutex_lock(&info->bfs_lock);
+	mark_buffer_dirty(info->si_sbh);
+	sb->s_dirt = 0;
+	mutex_unlock(&info->bfs_lock);
+
+	return 0;
+}
+
+static void bfs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		bfs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
+}
+
 static void bfs_put_super(struct super_block *s)
 {
 	struct bfs_sb_info *info = BFS_SB(s);
@@ -254,17 +274,6 @@ static int bfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	return 0;
 }
 
-static void bfs_write_super(struct super_block *s)
-{
-	struct bfs_sb_info *info = BFS_SB(s);
-
-	mutex_lock(&info->bfs_lock);
-	if (!(s->s_flags & MS_RDONLY))
-		mark_buffer_dirty(info->si_sbh);
-	s->s_dirt = 0;
-	mutex_unlock(&info->bfs_lock);
-}
-
 static struct kmem_cache *bfs_inode_cachep;
 
 static struct inode *bfs_alloc_inode(struct super_block *sb)
@@ -312,6 +321,7 @@ static const struct super_operations bfs_sops = {
 	.delete_inode	= bfs_delete_inode,
 	.put_super	= bfs_put_super,
 	.write_super	= bfs_write_super,
+	.sync_fs	= bfs_sync_fs,
 	.statfs		= bfs_statfs,
 };
 
-- 
GitLab


From 80e09fb942d38beb19dcffbeb14d496beeb0a989 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:03:58 +0200
Subject: [PATCH 2179/2198] exofs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/exofs/super.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 49e16af4e6194..8216c5b77b532 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -200,18 +200,18 @@ static const struct export_operations exofs_export_ops;
 /*
  * Write the superblock to the OSD
  */
-static void exofs_write_super(struct super_block *sb)
+static int exofs_sync_fs(struct super_block *sb, int wait)
 {
 	struct exofs_sb_info *sbi;
 	struct exofs_fscb *fscb;
 	struct osd_request *or;
 	struct osd_obj_id obj;
-	int ret;
+	int ret = -ENOMEM;
 
 	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
 	if (!fscb) {
 		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
-		return;
+		return -ENOMEM;
 	}
 
 	lock_super(sb);
@@ -249,6 +249,15 @@ static void exofs_write_super(struct super_block *sb)
 	unlock_kernel();
 	unlock_super(sb);
 	kfree(fscb);
+	return ret;
+}
+
+static void exofs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		exofs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 /*
@@ -493,6 +502,7 @@ static const struct super_operations exofs_sops = {
 	.delete_inode   = exofs_delete_inode,
 	.put_super      = exofs_put_super,
 	.write_super    = exofs_write_super,
+	.sync_fs	= exofs_sync_fs,
 	.statfs         = exofs_statfs,
 };
 
-- 
GitLab


From 40f31dd47e7c3d15af1f9845eda0fa0c4c33f32f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:04:17 +0200
Subject: [PATCH 2180/2198] ext2: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ext2/super.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index f8cbdf5691909..458999638c3de 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -42,6 +42,7 @@ static void ext2_sync_super(struct super_block *sb,
 			    struct ext2_super_block *es);
 static int ext2_remount (struct super_block * sb, int * flags, char * data);
 static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf);
+static int ext2_sync_fs(struct super_block *sb, int wait);
 
 void ext2_error (struct super_block * sb, const char * function,
 		 const char * fmt, ...)
@@ -309,6 +310,7 @@ static const struct super_operations ext2_sops = {
 	.delete_inode	= ext2_delete_inode,
 	.put_super	= ext2_put_super,
 	.write_super	= ext2_write_super,
+	.sync_fs	= ext2_sync_fs,
 	.statfs		= ext2_statfs,
 	.remount_fs	= ext2_remount,
 	.clear_inode	= ext2_clear_inode,
@@ -1132,25 +1134,36 @@ static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es)
  * set s_state to EXT2_VALID_FS after some corrections.
  */
 
-void ext2_write_super (struct super_block * sb)
+static int ext2_sync_fs(struct super_block *sb, int wait)
 {
-	struct ext2_super_block * es;
+	struct ext2_super_block *es = EXT2_SB(sb)->s_es;
+
 	lock_kernel();
-	if (!(sb->s_flags & MS_RDONLY)) {
-		es = EXT2_SB(sb)->s_es;
-
-		if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
-			ext2_debug ("setting valid to 0\n");
-			es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
-			es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb));
-			es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb));
-			es->s_mtime = cpu_to_le32(get_seconds());
-			ext2_sync_super(sb, es);
-		} else
-			ext2_commit_super (sb, es);
+	if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) {
+		ext2_debug("setting valid to 0\n");
+		es->s_state &= cpu_to_le16(~EXT2_VALID_FS);
+		es->s_free_blocks_count =
+			cpu_to_le32(ext2_count_free_blocks(sb));
+		es->s_free_inodes_count =
+			cpu_to_le32(ext2_count_free_inodes(sb));
+		es->s_mtime = cpu_to_le32(get_seconds());
+		ext2_sync_super(sb, es);
+	} else {
+		ext2_commit_super(sb, es);
 	}
 	sb->s_dirt = 0;
 	unlock_kernel();
+
+	return 0;
+}
+
+
+void ext2_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ext2_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static int ext2_remount (struct super_block * sb, int * flags, char * data)
-- 
GitLab


From f83d6d46e7adf241a064a4a425e5cd8a8fd8925f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:04:35 +0200
Subject: [PATCH 2181/2198] fat: add ->sync_fs

Add a ->sync_fs method for data integrity syncs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/fat/inode.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 476f80b175fe5..51a5ecf9000ad 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -449,6 +449,16 @@ static void fat_write_super(struct super_block *sb)
 	unlock_super(sb);
 }
 
+static int fat_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
+	fat_clusters_flush(sb);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+
+	return 0;
+}
+
 static void fat_put_super(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
@@ -643,6 +653,7 @@ static const struct super_operations fat_sops = {
 	.delete_inode	= fat_delete_inode,
 	.put_super	= fat_put_super,
 	.write_super	= fat_write_super,
+	.sync_fs	= fat_sync_fs,
 	.statfs		= fat_statfs,
 	.clear_inode	= fat_clear_inode,
 	.remount_fs	= fat_remount,
-- 
GitLab


From 58bc5bbb873eb5d86126a3fd3ff02aaa69ec15d0 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:04:54 +0200
Subject: [PATCH 2182/2198] hfs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfs/super.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 3aac417510301..6f833dc8e9102 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -58,6 +58,16 @@ static void hfs_write_super(struct super_block *sb)
 	unlock_super(sb);
 }
 
+static int hfs_sync_fs(struct super_block *sb, int wait)
+{
+	lock_super(sb);
+	hfs_mdb_commit(sb);
+	sb->s_dirt = 0;
+	unlock_super(sb);
+
+	return 0;
+}
+
 /*
  * hfs_put_super()
  *
@@ -172,6 +182,7 @@ static const struct super_operations hfs_super_operations = {
 	.clear_inode	= hfs_clear_inode,
 	.put_super	= hfs_put_super,
 	.write_super	= hfs_write_super,
+	.sync_fs	= hfs_sync_fs,
 	.statfs		= hfs_statfs,
 	.remount_fs     = hfs_remount,
 	.show_options	= hfs_show_options,
-- 
GitLab


From 7fbc6df0e7a561a313f49faa77829d5de45a97f8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:05:12 +0200
Subject: [PATCH 2183/2198] hfsplus: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/hfsplus/super.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 1aab8aa7801e2..9fc3af0c0dab6 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -152,7 +152,7 @@ static void hfsplus_clear_inode(struct inode *inode)
 	}
 }
 
-static void hfsplus_write_super(struct super_block *sb)
+static int hfsplus_sync_fs(struct super_block *sb, int wait)
 {
 	struct hfsplus_vh *vhdr = HFSPLUS_SB(sb).s_vhdr;
 
@@ -160,9 +160,6 @@ static void hfsplus_write_super(struct super_block *sb)
 
 	lock_super(sb);
 	sb->s_dirt = 0;
-	if (sb->s_flags & MS_RDONLY)
-		/* warn? */
-		goto out;
 
 	vhdr->free_blocks = cpu_to_be32(HFSPLUS_SB(sb).free_blocks);
 	vhdr->next_alloc = cpu_to_be32(HFSPLUS_SB(sb).next_alloc);
@@ -194,8 +191,16 @@ static void hfsplus_write_super(struct super_block *sb)
 		}
 		HFSPLUS_SB(sb).flags &= ~HFSPLUS_SB_WRITEBACKUP;
 	}
- out:
 	unlock_super(sb);
+	return 0;
+}
+
+static void hfsplus_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		hfsplus_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static void hfsplus_put_super(struct super_block *sb)
@@ -290,6 +295,7 @@ static const struct super_operations hfsplus_sops = {
 	.clear_inode	= hfsplus_clear_inode,
 	.put_super	= hfsplus_put_super,
 	.write_super	= hfsplus_write_super,
+	.sync_fs	= hfsplus_sync_fs,
 	.statfs		= hfsplus_statfs,
 	.remount_fs	= hfsplus_remount,
 	.show_options	= hfsplus_show_options,
-- 
GitLab


From ad43ffdeea0a7bd3e6036c4aeec2e6699aef8ac7 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:07:45 +0200
Subject: [PATCH 2184/2198] sysv: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sysv/inode.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 425c976cfcd20..479923456a542 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -31,16 +31,13 @@
 #include <asm/byteorder.h>
 #include "sysv.h"
 
-/* This is only called on sync() and umount(), when s_dirt=1. */
-static void sysv_write_super(struct super_block *sb)
+static int sysv_sync_fs(struct super_block *sb, int wait)
 {
 	struct sysv_sb_info *sbi = SYSV_SB(sb);
 	unsigned long time = get_seconds(), old_time;
 
 	lock_super(sb);
 	lock_kernel();
-	if (sb->s_flags & MS_RDONLY)
-		goto clean;
 
 	/*
 	 * If we are going to write out the super block,
@@ -54,10 +51,19 @@ static void sysv_write_super(struct super_block *sb)
 		*sbi->s_sb_time = cpu_to_fs32(sbi, time);
 		mark_buffer_dirty(sbi->s_bh2);
 	}
-clean:
-	sb->s_dirt = 0;
+
 	unlock_kernel();
 	unlock_super(sb);
+
+	return 0;
+}
+
+static void sysv_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		sysv_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static int sysv_remount(struct super_block *sb, int *flags, char *data)
@@ -345,6 +351,7 @@ const struct super_operations sysv_sops = {
 	.delete_inode	= sysv_delete_inode,
 	.put_super	= sysv_put_super,
 	.write_super	= sysv_write_super,
+	.sync_fs	= sysv_sync_fs,
 	.remount_fs	= sysv_remount,
 	.statfs		= sysv_statfs,
 };
-- 
GitLab


From 8c8006564a58d0ea912bf7f2d0a758d97e4b464f Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:05 +0200
Subject: [PATCH 2185/2198] ufs: add ->sync_fs

Add a ->sync_fs method for data integrity syncs, and reimplement
->write_super ontop of it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/ufs/super.c | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 6560dda7b18c7..5faed7954d0a1 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1125,7 +1125,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent)
 	return -ENOMEM;
 }
 
-static void ufs_write_super(struct super_block *sb)
+static int ufs_sync_fs(struct super_block *sb, int wait)
 {
 	struct ufs_sb_private_info * uspi;
 	struct ufs_super_block_first * usb1;
@@ -1134,25 +1134,36 @@ static void ufs_write_super(struct super_block *sb)
 
 	lock_super(sb);
 	lock_kernel();
+
 	UFSD("ENTER\n");
+
 	flags = UFS_SB(sb)->s_flags;
 	uspi = UFS_SB(sb)->s_uspi;
 	usb1 = ubh_get_usb_first(uspi);
 	usb3 = ubh_get_usb_third(uspi);
 
-	if (!(sb->s_flags & MS_RDONLY)) {
-		usb1->fs_time = cpu_to_fs32(sb, get_seconds());
-		if ((flags & UFS_ST_MASK) == UFS_ST_SUN 
-		  || (flags & UFS_ST_MASK) == UFS_ST_SUNOS
-		  || (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
-			ufs_set_fs_state(sb, usb1, usb3,
-					UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
-		ufs_put_cstotal(sb);
-	}
+	usb1->fs_time = cpu_to_fs32(sb, get_seconds());
+	if ((flags & UFS_ST_MASK) == UFS_ST_SUN  ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNOS ||
+	    (flags & UFS_ST_MASK) == UFS_ST_SUNx86)
+		ufs_set_fs_state(sb, usb1, usb3,
+				UFS_FSOK - fs32_to_cpu(sb, usb1->fs_time));
+	ufs_put_cstotal(sb);
 	sb->s_dirt = 0;
+
 	UFSD("EXIT\n");
 	unlock_kernel();
 	unlock_super(sb);
+
+	return 0;
+}
+
+static void ufs_write_super(struct super_block *sb)
+{
+	if (!(sb->s_flags & MS_RDONLY))
+		ufs_sync_fs(sb, 1);
+	else
+		sb->s_dirt = 0;
 }
 
 static void ufs_put_super(struct super_block *sb)
@@ -1381,6 +1392,7 @@ static const struct super_operations ufs_super_ops = {
 	.delete_inode	= ufs_delete_inode,
 	.put_super	= ufs_put_super,
 	.write_super	= ufs_write_super,
+	.sync_fs	= ufs_sync_fs,
 	.statfs		= ufs_statfs,
 	.remount_fs	= ufs_remount,
 	.show_options   = ufs_show_options,
-- 
GitLab


From d579ed00aa96a7f7486978540a0d7cecaff742ae Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:21 +0200
Subject: [PATCH 2186/2198] jffs2: call jffs2_write_super from jffs2_sync_fs

The call to ->write_super from __sync_filesystem will go away, so make
sure jffs2 performs the same actions from inside ->sync_fs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/jffs2/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index f7bfd3ac8bfa6..07a22caf26878 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -74,6 +74,8 @@ static int jffs2_sync_fs(struct super_block *sb, int wait)
 {
 	struct jffs2_sb_info *c = JFFS2_SB_INFO(sb);
 
+	jffs2_write_super(sb);
+
 	mutex_lock(&c->alloc_sem);
 	jffs2_flush_wbuf_pad(c);
 	mutex_unlock(&c->alloc_sem);
-- 
GitLab


From d731e06323cb705003e4172ec209e469be4c18e1 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:36 +0200
Subject: [PATCH 2187/2198] nilfs2: call nilfs2_write_super from nilfs2_sync_fs

The call to ->write_super from __sync_filesystem will go away, so make
sure nilfs2 performs the same actions from inside ->sync_fs.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/super.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 11151eaa2c4a2..122dc1e489fb2 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -391,6 +391,8 @@ static int nilfs_sync_fs(struct super_block *sb, int wait)
 {
 	int err = 0;
 
+	nilfs_write_super(sb);
+
 	/* This function is called when super block should be written back */
 	if (wait)
 		err = nilfs_construct_segment(sb);
-- 
GitLab


From 0c95ee190e1dea60c55c834d14695341085c9b7b Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Mon, 8 Jun 2009 10:08:54 +0200
Subject: [PATCH 2188/2198] remove the call to ->write_super in
 __sync_filesystem

Now that all filesystems provide ->sync_fs methods we can change
__sync_filesystem to only call ->sync_fs.

This gives us a clear separation between periodic writeouts which
are driven by ->write_super and data integrity syncs that go
through ->sync_fs. (modulo file_fsync which is also going away)

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/sync.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/sync.c b/fs/sync.c
index e9d56f6c0b74b..dd200025af852 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -33,8 +33,6 @@ static int __sync_filesystem(struct super_block *sb, int wait)
 	else
 		sync_quota_sb(sb, -1);
 	sync_inodes_sb(sb, wait);
-	if (sb->s_dirt && sb->s_op->write_super)
-		sb->s_op->write_super(sb);
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
 	return __sync_blockdev(sb->s_bdev, wait);
-- 
GitLab


From 81fc20bd0e75ba6357bce2403767d7c2585d8f28 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:28 +0900
Subject: [PATCH 2189/2198] nilfs2: remove meaningless EBUSY case from
 nilfs_get_sb function

The following EBUSY case in nilfs_get_sb() is meaningless.  Indeed,
this error code is never returned to the caller.

    if (!s->s_root) {
          ...
    } else if (!(s->s_flags & MS_RDONLY)) {
        err = -EBUSY;
    }

This simply removes the else case.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/super.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 122dc1e489fb2..1c505d0e031ed 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1186,8 +1186,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
 		s->s_flags |= MS_ACTIVE;
 		need_to_close = 0;
-	} else if (!(s->s_flags & MS_RDONLY)) {
-		err = -EBUSY;
 	}
 
 	up(&sd.bdev->bd_mount_sem);
-- 
GitLab


From 33c8e57c86d1bd1548c12a4f7c4bceb94b862cca Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:29 +0900
Subject: [PATCH 2190/2198] nilfs2: get rid of sget use for acquiring nilfs
 object

This will change the way to obtain nilfs object in nilfs_get_sb()
function.

Previously, a preliminary sget() call was performed, and the nilfs
object was acquired from a super block instance found by the sget()
call.

This patch, instead, instroduces a new dedicated function
find_or_create_nilfs(); as the name implies, the function finds an
existent nilfs object from a global list or creates a new one if no
object is found on the device.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/super.c     | 80 ++++++++++---------------------------------
 fs/nilfs2/the_nilfs.c | 57 ++++++++++++++++++++++++++++--
 fs/nilfs2/the_nilfs.h |  4 ++-
 3 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1c505d0e031ed..3c9833e3e74a4 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1059,13 +1059,6 @@ static int nilfs_set_bdev_super(struct super_block *s, void *data)
 }
 
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
-{
-	struct nilfs_super_data *sd = data;
-
-	return s->s_bdev == sd->bdev;
-}
-
-static int nilfs_test_bdev_super2(struct super_block *s, void *data)
 {
 	struct nilfs_super_data *sd = data;
 	int ret;
@@ -1096,8 +1089,8 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	     const char *dev_name, void *data, struct vfsmount *mnt)
 {
 	struct nilfs_super_data sd;
-	struct super_block *s, *s2;
-	struct the_nilfs *nilfs = NULL;
+	struct super_block *s;
+	struct the_nilfs *nilfs;
 	int err, need_to_close = 1;
 
 	sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
@@ -1118,11 +1111,12 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
-	/*
-	 * once the super is inserted into the list by sget, s_umount
-	 * will protect the lockfs code from trying to start a snapshot
-	 * while we are mounting
-	 */
+	nilfs = find_or_create_nilfs(sd.bdev);
+	if (!nilfs) {
+		err = -ENOMEM;
+		goto failed;
+	}
+
 	down(&sd.bdev->bd_mount_sem);
 	if (!sd.cno &&
 	    (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
@@ -1131,51 +1125,22 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	/*
-	 * Phase-1: search any existent instance and get the_nilfs
+	 * Search specified snapshot or R/W mode super_block
 	 */
-	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
-	if (IS_ERR(s))
-		goto error_s;
-
-	if (!s->s_root) {
-		err = -ENOMEM;
-		nilfs = alloc_nilfs(sd.bdev);
-		if (!nilfs)
-			goto cancel_new;
-	} else {
-		struct nilfs_sb_info *sbi = NILFS_SB(s);
+	if (!sd.cno)
+		/* trying to get the latest checkpoint.  */
+		sd.cno = nilfs_last_cno(nilfs);
 
-		/*
-		 * s_umount protects super_block from unmount process;
-		 * It covers pointers of nilfs_sb_info and the_nilfs.
-		 */
-		nilfs = sbi->s_nilfs;
-		get_nilfs(nilfs);
-		up_write(&s->s_umount);
-
-		/*
-		 * Phase-2: search specified snapshot or R/W mode super_block
-		 */
-		if (!sd.cno)
-			/* trying to get the latest checkpoint.  */
-			sd.cno = nilfs_last_cno(nilfs);
-
-		s2 = sget(fs_type, nilfs_test_bdev_super2,
-			  nilfs_set_bdev_super, &sd);
-		deactivate_super(s);
-		/*
-		 * Although deactivate_super() invokes close_bdev_exclusive() at
-		 * kill_block_super().  Here, s is an existent mount; we need
-		 * one more close_bdev_exclusive() call.
-		 */
-		s = s2;
-		if (IS_ERR(s))
-			goto error_s;
+	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
+	if (IS_ERR(s)) {
+		err = PTR_ERR(s);
+		goto failed_unlock;
 	}
 
 	if (!s->s_root) {
 		char b[BDEVNAME_SIZE];
 
+		/* New superblock instance created */
 		s->s_flags = flags;
 		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(sd.bdev));
@@ -1195,15 +1160,9 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	simple_set_mnt(mnt, s);
 	return 0;
 
- error_s:
-	up(&sd.bdev->bd_mount_sem);
-	if (nilfs)
-		put_nilfs(nilfs);
-	close_bdev_exclusive(sd.bdev, flags);
-	return PTR_ERR(s);
-
  failed_unlock:
 	up(&sd.bdev->bd_mount_sem);
+	put_nilfs(nilfs);
  failed:
 	close_bdev_exclusive(sd.bdev, flags);
 
@@ -1212,8 +1171,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
  cancel_new:
 	/* Abandoning the newly allocated superblock */
 	up(&sd.bdev->bd_mount_sem);
-	if (nilfs)
-		put_nilfs(nilfs);
+	put_nilfs(nilfs);
 	up_write(&s->s_umount);
 	deactivate_super(s);
 	/*
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index a91f15b8673c3..45dbf6a61744d 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -35,6 +35,10 @@
 #include "seglist.h"
 #include "segbuf.h"
 
+
+static LIST_HEAD(nilfs_objects);
+static DEFINE_SPINLOCK(nilfs_lock);
+
 void nilfs_set_last_segment(struct the_nilfs *nilfs,
 			    sector_t start_blocknr, u64 seq, __u64 cno)
 {
@@ -55,7 +59,7 @@ void nilfs_set_last_segment(struct the_nilfs *nilfs,
  * Return Value: On success, pointer to the_nilfs is returned.
  * On error, NULL is returned.
  */
-struct the_nilfs *alloc_nilfs(struct block_device *bdev)
+static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 {
 	struct the_nilfs *nilfs;
 
@@ -69,6 +73,7 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
 	mutex_init(&nilfs->ns_writer_mutex);
+	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
 	spin_lock_init(&nilfs->ns_last_segment_lock);
 	nilfs->ns_gc_inodes_h = NULL;
@@ -77,6 +82,45 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	return nilfs;
 }
 
+/**
+ * find_or_create_nilfs - find or create nilfs object
+ * @bdev: block device to which the_nilfs is related
+ *
+ * find_nilfs() looks up an existent nilfs object created on the
+ * device and gets the reference count of the object.  If no nilfs object
+ * is found on the device, a new nilfs object is allocated.
+ *
+ * Return Value: On success, pointer to the nilfs object is returned.
+ * On error, NULL is returned.
+ */
+struct the_nilfs *find_or_create_nilfs(struct block_device *bdev)
+{
+	struct the_nilfs *nilfs, *new = NULL;
+
+ retry:
+	spin_lock(&nilfs_lock);
+	list_for_each_entry(nilfs, &nilfs_objects, ns_list) {
+		if (nilfs->ns_bdev == bdev) {
+			get_nilfs(nilfs);
+			spin_unlock(&nilfs_lock);
+			if (new)
+				put_nilfs(new);
+			return nilfs; /* existing object */
+		}
+	}
+	if (new) {
+		list_add_tail(&new->ns_list, &nilfs_objects);
+		spin_unlock(&nilfs_lock);
+		return new; /* new object */
+	}
+	spin_unlock(&nilfs_lock);
+
+	new = alloc_nilfs(bdev);
+	if (new)
+		goto retry;
+	return NULL; /* insufficient memory */
+}
+
 /**
  * put_nilfs - release a reference to the_nilfs
  * @nilfs: the_nilfs structure to be released
@@ -86,13 +130,20 @@ struct the_nilfs *alloc_nilfs(struct block_device *bdev)
  */
 void put_nilfs(struct the_nilfs *nilfs)
 {
-	if (!atomic_dec_and_test(&nilfs->ns_count))
+	spin_lock(&nilfs_lock);
+	if (!atomic_dec_and_test(&nilfs->ns_count)) {
+		spin_unlock(&nilfs_lock);
 		return;
+	}
+	list_del_init(&nilfs->ns_list);
+	spin_unlock(&nilfs_lock);
+
 	/*
-	 * Increment of ns_count never occur below because the caller
+	 * Increment of ns_count never occurs below because the caller
 	 * of get_nilfs() holds at least one reference to the_nilfs.
 	 * Thus its exclusion control is not required here.
 	 */
+
 	might_sleep();
 	if (nilfs_loaded(nilfs)) {
 		nilfs_mdt_clear(nilfs->ns_sufile);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 30fe58778d05a..116caf96e7f39 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -43,6 +43,7 @@ enum {
  * struct the_nilfs - struct to supervise multiple nilfs mount points
  * @ns_flags: flags
  * @ns_count: reference count
+ * @ns_list: list head for nilfs_list
  * @ns_bdev: block device
  * @ns_bdi: backing dev info
  * @ns_writer: back pointer to writable nilfs_sb_info
@@ -88,6 +89,7 @@ enum {
 struct the_nilfs {
 	unsigned long		ns_flags;
 	atomic_t		ns_count;
+	struct list_head	ns_list;
 
 	struct block_device    *ns_bdev;
 	struct backing_dev_info *ns_bdi;
@@ -191,7 +193,7 @@ THE_NILFS_FNS(DISCONTINUED, discontinued)
 #define NILFS_ALTSB_FREQ	60  /* spare superblock */
 
 void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
-struct the_nilfs *alloc_nilfs(struct block_device *);
+struct the_nilfs *find_or_create_nilfs(struct block_device *);
 void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
-- 
GitLab


From 3f82ff55168e92859119bf348e9e0bd6714d2fea Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:30 +0900
Subject: [PATCH 2191/2198] nilfs2: get rid of sget use for checking if current
 mount is present

This stops using sget() for checking if an r/w-mount or an r/o-mount
exists on the device.  This elimination uses a back pointer to the
current mount added to nilfs object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/super.c     | 92 +++++++++++++++----------------------------
 fs/nilfs2/the_nilfs.h |  3 ++
 2 files changed, 35 insertions(+), 60 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 3c9833e3e74a4..5a8c5e4731b39 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -67,8 +67,6 @@ MODULE_LICENSE("GPL");
 
 static void nilfs_write_super(struct super_block *sb);
 static int nilfs_remount(struct super_block *sb, int *flags, char *data);
-static int test_exclusive_mount(struct file_system_type *fs_type,
-				struct block_device *bdev, int flags);
 
 /**
  * nilfs_error() - report failure condition on a filesystem
@@ -329,6 +327,10 @@ static void nilfs_put_super(struct super_block *sb)
 		nilfs_commit_super(sbi, 1);
 		up_write(&nilfs->ns_sem);
 	}
+	down_write(&nilfs->ns_sem);
+	if (nilfs->ns_current == sbi)
+		nilfs->ns_current = NULL;
+	up_write(&nilfs->ns_sem);
 
 	nilfs_detach_checkpoint(sbi);
 	put_nilfs(sbi->s_nilfs);
@@ -880,6 +882,11 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		goto failed_root;
 	}
 
+	down_write(&nilfs->ns_sem);
+	if (!nilfs_test_opt(sbi, SNAPSHOT))
+		nilfs->ns_current = sbi;
+	up_write(&nilfs->ns_sem);
+
 	return 0;
 
  failed_root:
@@ -958,14 +965,16 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * by fsck since we originally mounted the partition.)
 		 */
 		down(&sb->s_bdev->bd_mount_sem);
-		/* Check existing RW-mount */
-		if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
+		down_read(&nilfs->ns_sem);
+		if (nilfs->ns_current && nilfs->ns_current != sbi) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
-			       "remount because a RW-mount exists.\n",
+			       "remount because an RW-mount exists.\n",
 			       sb->s_id);
+			up_read(&nilfs->ns_sem);
 			err = -EBUSY;
 			goto rw_remount_failed;
 		}
+		up_read(&nilfs->ns_sem);
 		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
 			       "remount because the current RO-mount is not "
@@ -984,6 +993,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
+		nilfs->ns_current = sbi;
 		up_write(&nilfs->ns_sem);
 
 		up(&sb->s_bdev->bd_mount_sem);
@@ -1118,10 +1128,23 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	down(&sd.bdev->bd_mount_sem);
-	if (!sd.cno &&
-	    (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
-		err = (err < 0) ? : -EBUSY;
-		goto failed_unlock;
+
+	if (!sd.cno) {
+		/*
+		 * Check if an exclusive mount exists or not.
+		 * Snapshot mounts coexist with a current mount
+		 * (i.e. rw-mount or ro-mount), whereas rw-mount and
+		 * ro-mount are mutually exclusive.
+		 */
+		down_read(&nilfs->ns_sem);
+		if (nilfs->ns_current &&
+		    ((nilfs->ns_current->s_super->s_flags ^ flags)
+		     & MS_RDONLY)) {
+			up_read(&nilfs->ns_sem);
+			err = -EBUSY;
+			goto failed_unlock;
+		}
+		up_read(&nilfs->ns_sem);
 	}
 
 	/*
@@ -1182,57 +1205,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	return err;
 }
 
-static int nilfs_test_bdev_super3(struct super_block *s, void *data)
-{
-	struct nilfs_super_data *sd = data;
-	int ret;
-
-	if (s->s_bdev != sd->bdev)
-		return 0;
-	if (down_read_trylock(&s->s_umount)) {
-		ret = (s->s_flags & MS_RDONLY) && s->s_root &&
-			nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
-		up_read(&s->s_umount);
-		if (ret)
-			return 0; /* ignore snapshot mounts */
-	}
-	return !((sd->flags ^ s->s_flags) & MS_RDONLY);
-}
-
-static int __false_bdev_super(struct super_block *s, void *data)
-{
-#if 0 /* XXX: workaround for lock debug. This is not good idea */
-	up_write(&s->s_umount);
-#endif
-	return -EFAULT;
-}
-
-/**
- * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
- * fs_type: filesystem type
- * bdev: block device
- * flag: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
- * res: pointer to an integer to store result
- *
- * This function must be called within a section protected by bd_mount_mutex.
- */
-static int test_exclusive_mount(struct file_system_type *fs_type,
-				struct block_device *bdev, int flags)
-{
-	struct super_block *s;
-	struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
-
-	s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
-	if (IS_ERR(s)) {
-		if (PTR_ERR(s) != -EFAULT)
-			return PTR_ERR(s);
-		return 0;  /* Not found */
-	}
-	up_write(&s->s_umount);
-	deactivate_super(s);
-	return 1;  /* Found */
-}
-
 struct file_system_type nilfs_fs_type = {
 	.owner    = THIS_MODULE,
 	.name     = "nilfs2",
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 116caf96e7f39..99f7e29a53351 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -50,6 +50,7 @@ enum {
  * @ns_sem: semaphore for shared states
  * @ns_writer_mutex: mutex protecting ns_writer attach/detach
  * @ns_writer_refcount: number of referrers on ns_writer
+ * @ns_current: back pointer to current mount
  * @ns_sbh: buffer heads of on-disk super blocks
  * @ns_sbp: pointers to super block data
  * @ns_sbwtime: previous write time of super blocks
@@ -98,6 +99,8 @@ struct the_nilfs {
 	struct mutex		ns_writer_mutex;
 	atomic_t		ns_writer_refcount;
 
+	struct nilfs_sb_info   *ns_current;
+
 	/*
 	 * used for
 	 * - loading the latest checkpoint exclusively.
-- 
GitLab


From 6dd4740662405a68bb229ac2b9e0aeaaf2188bf2 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:31 +0900
Subject: [PATCH 2192/2198] nilfs2: simplify remaining sget() use

This simplifies the test function passed on the remaining sget()
callsite in nilfs.

Instead of checking mount type (i.e. ro-mount/rw-mount/snapshot mount)
in the test function passed to sget(), this patch first looks up the
nilfs_sb_info struct which the given mount type matches, and then
acquires the super block instance holding the nilfs_sb_info.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/sb.h        |  1 +
 fs/nilfs2/super.c     | 42 +++++++++++++++---------------------
 fs/nilfs2/the_nilfs.c | 50 +++++++++++++++++++++++++++++++++++++++++++
 fs/nilfs2/the_nilfs.h |  7 ++++++
 4 files changed, 75 insertions(+), 25 deletions(-)

diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
index adccd4fc654e6..0776ccc2504a9 100644
--- a/fs/nilfs2/sb.h
+++ b/fs/nilfs2/sb.h
@@ -60,6 +60,7 @@ struct nilfs_sb_info {
 	struct super_block *s_super;	/* reverse pointer to super_block */
 	struct the_nilfs *s_nilfs;
 	struct list_head s_list;	/* list head for nilfs->ns_supers */
+	atomic_t s_count;		/* reference count */
 
 	/* Segment constructor */
 	struct list_head s_dirty_files;	/* dirty files list */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 5a8c5e4731b39..1d1b6e1251593 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -336,7 +336,7 @@ static void nilfs_put_super(struct super_block *sb)
 	put_nilfs(sbi->s_nilfs);
 	sbi->s_super = NULL;
 	sb->s_fs_info = NULL;
-	kfree(sbi);
+	nilfs_put_sbinfo(sbi);
 
 	unlock_kernel();
 }
@@ -785,6 +785,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 	get_nilfs(nilfs);
 	sbi->s_nilfs = nilfs;
 	sbi->s_super = sb;
+	atomic_set(&sbi->s_count, 1);
 
 	err = init_nilfs(nilfs, sbi, (char *)data);
 	if (err)
@@ -902,7 +903,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
  failed_sbi:
 	put_nilfs(nilfs);
 	sb->s_fs_info = NULL;
-	kfree(sbi);
+	nilfs_put_sbinfo(sbi);
 	return err;
 }
 
@@ -1014,6 +1015,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 struct nilfs_super_data {
 	struct block_device *bdev;
+	struct nilfs_sb_info *sbi;
 	__u64 cno;
 	int flags;
 };
@@ -1071,27 +1073,8 @@ static int nilfs_set_bdev_super(struct super_block *s, void *data)
 static int nilfs_test_bdev_super(struct super_block *s, void *data)
 {
 	struct nilfs_super_data *sd = data;
-	int ret;
-
-	if (s->s_bdev != sd->bdev)
-		return 0;
-
-	if (!((s->s_flags | sd->flags) & MS_RDONLY))
-		return 1; /* Reuse an old R/W-mode super_block */
-
-	if (s->s_flags & sd->flags & MS_RDONLY) {
-		if (down_read_trylock(&s->s_umount)) {
-			ret = s->s_root &&
-				(sd->cno == NILFS_SB(s)->s_snapshot_cno);
-			up_read(&s->s_umount);
-			/*
-			 * This path is locked with sb_lock by sget().
-			 * So, drop_super() causes deadlock.
-			 */
-			return ret;
-		}
-	}
-	return 0;
+
+	return sd->sbi && s->s_fs_info == (void *)sd->sbi;
 }
 
 static int
@@ -1112,7 +1095,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	 * much more information than normal filesystems to identify mount
 	 * instance.  For snapshot mounts, not only a mount type (ro-mount
 	 * or rw-mount) but also a checkpoint number is required.
-	 * The results are passed in sget() using nilfs_super_data.
 	 */
 	sd.cno = 0;
 	sd.flags = flags;
@@ -1148,13 +1130,23 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	}
 
 	/*
-	 * Search specified snapshot or R/W mode super_block
+	 * Find existing nilfs_sb_info struct
 	 */
+	sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno);
+
 	if (!sd.cno)
 		/* trying to get the latest checkpoint.  */
 		sd.cno = nilfs_last_cno(nilfs);
 
+	/*
+	 * Get super block instance holding the nilfs_sb_info struct.
+	 * A new instance is allocated if no existing mount is present or
+	 * existing instance has been unmounted.
+	 */
 	s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
+	if (sd.sbi)
+		nilfs_put_sbinfo(sd.sbi);
+
 	if (IS_ERR(s)) {
 		err = PTR_ERR(s);
 		goto failed_unlock;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 45dbf6a61744d..221953bfc8594 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -664,6 +664,56 @@ int nilfs_near_disk_full(struct the_nilfs *nilfs)
 	return ret;
 }
 
+/**
+ * nilfs_find_sbinfo - find existing nilfs_sb_info structure
+ * @nilfs: nilfs object
+ * @rw_mount: mount type (non-zero value for read/write mount)
+ * @cno: checkpoint number (zero for read-only mount)
+ *
+ * nilfs_find_sbinfo() returns the nilfs_sb_info structure which
+ * @rw_mount and @cno (in case of snapshots) matched.  If no instance
+ * was found, NULL is returned.  Although the super block instance can
+ * be unmounted after this function returns, the nilfs_sb_info struct
+ * is kept on memory until nilfs_put_sbinfo() is called.
+ */
+struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
+					int rw_mount, __u64 cno)
+{
+	struct nilfs_sb_info *sbi;
+
+	down_read(&nilfs->ns_sem);
+	/*
+	 * The SNAPSHOT flag and sb->s_flags are supposed to be
+	 * protected with nilfs->ns_sem.
+	 */
+	sbi = nilfs->ns_current;
+	if (rw_mount) {
+		if (sbi && !(sbi->s_super->s_flags & MS_RDONLY))
+			goto found; /* read/write mount */
+		else
+			goto out;
+	} else if (cno == 0) {
+		if (sbi && (sbi->s_super->s_flags & MS_RDONLY))
+			goto found; /* read-only mount */
+		else
+			goto out;
+	}
+
+	list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
+		if (nilfs_test_opt(sbi, SNAPSHOT) &&
+		    sbi->s_snapshot_cno == cno)
+			goto found; /* snapshot mount */
+	}
+ out:
+	up_read(&nilfs->ns_sem);
+	return NULL;
+
+ found:
+	atomic_inc(&sbi->s_count);
+	up_read(&nilfs->ns_sem);
+	return sbi;
+}
+
 int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 				int snapshot_mount)
 {
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index 99f7e29a53351..be4c040fd6299 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -201,6 +201,7 @@ void put_nilfs(struct the_nilfs *);
 int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
 int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
 int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
+struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *, int, __u64);
 int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
 int nilfs_near_disk_full(struct the_nilfs *);
 void nilfs_fall_back_super_block(struct the_nilfs *);
@@ -243,6 +244,12 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
 	mutex_unlock(&nilfs->ns_writer_mutex);
 }
 
+static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
+{
+	if (!atomic_dec_and_test(&sbi->s_count))
+		kfree(sbi);
+}
+
 static inline void
 nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
 			sector_t *seg_start, sector_t *seg_end)
-- 
GitLab


From e59399d0102c1813cec48db5cebe1750313f88a0 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:32 +0900
Subject: [PATCH 2193/2198] nilfs2: correct exclusion control in nilfs_remount
 function

nilfs_remount() changes mount state of a superblock instance.  Even
though nilfs accesses other superblock instances during mount or
remount, the mount state was not properly protected in
nilfs_remount().

Moreover, nilfs_remount() has a lock order reversal problem;
nilfs_get_sb() holds:

  1. bdev->bd_mount_sem
  2. sb->s_umount  (sget acquires)

and nilfs_remount() holds:

  1. sb->s_umount  (locked by the caller in vfs)
  2. bdev->bd_mount_sem

To avoid these problems, this patch divides a semaphore protecting
super block instances from nilfs->ns_sem, and applies it to the mount
state protection in nilfs_remount().

With this change, bd_mount_sem use is removed from nilfs_remount() and
the lock order reversal will be resolved.  And the new rw-semaphore,
nilfs->ns_super_sem will properly protect the mount state except the
modification from nilfs_error function.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/super.c     | 44 ++++++++++++++++++++-----------------------
 fs/nilfs2/the_nilfs.c | 13 +++++++------
 fs/nilfs2/the_nilfs.h |  7 ++++++-
 3 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 1d1b6e1251593..f02762fa8ea00 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -327,10 +327,10 @@ static void nilfs_put_super(struct super_block *sb)
 		nilfs_commit_super(sbi, 1);
 		up_write(&nilfs->ns_sem);
 	}
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	if (nilfs->ns_current == sbi)
 		nilfs->ns_current = NULL;
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	nilfs_detach_checkpoint(sbi);
 	put_nilfs(sbi->s_nilfs);
@@ -408,9 +408,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	struct buffer_head *bh_cp;
 	int err;
 
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	list_add(&sbi->s_list, &nilfs->ns_supers);
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	sbi->s_ifile = nilfs_mdt_new(
 		nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
@@ -448,9 +448,9 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
 
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	return err;
 }
@@ -462,9 +462,9 @@ void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
 	nilfs_mdt_clear(sbi->s_ifile);
 	nilfs_mdt_destroy(sbi->s_ifile);
 	sbi->s_ifile = NULL;
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	list_del_init(&sbi->s_list);
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 }
 
 static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
@@ -883,10 +883,10 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent,
 		goto failed_root;
 	}
 
-	down_write(&nilfs->ns_sem);
+	down_write(&nilfs->ns_super_sem);
 	if (!nilfs_test_opt(sbi, SNAPSHOT))
 		nilfs->ns_current = sbi;
-	up_write(&nilfs->ns_sem);
+	up_write(&nilfs->ns_super_sem);
 
 	return 0;
 
@@ -918,6 +918,7 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 	lock_kernel();
 
+	down_write(&nilfs->ns_super_sem);
 	old_sb_flags = sb->s_flags;
 	old_opts.mount_opt = sbi->s_mount_opt;
 	old_opts.snapshot_cno = sbi->s_snapshot_cno;
@@ -965,24 +966,20 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 		 * store the current valid flag.  (It may have been changed
 		 * by fsck since we originally mounted the partition.)
 		 */
-		down(&sb->s_bdev->bd_mount_sem);
-		down_read(&nilfs->ns_sem);
 		if (nilfs->ns_current && nilfs->ns_current != sbi) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
 			       "remount because an RW-mount exists.\n",
 			       sb->s_id);
-			up_read(&nilfs->ns_sem);
 			err = -EBUSY;
-			goto rw_remount_failed;
+			goto restore_opts;
 		}
-		up_read(&nilfs->ns_sem);
 		if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
 			printk(KERN_WARNING "NILFS (device %s): couldn't "
 			       "remount because the current RO-mount is not "
 			       "the latest one.\n",
 			       sb->s_id);
 			err = -EINVAL;
-			goto rw_remount_failed;
+			goto restore_opts;
 		}
 		sb->s_flags &= ~MS_RDONLY;
 		nilfs_clear_opt(sbi, SNAPSHOT);
@@ -990,25 +987,24 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data)
 
 		err = nilfs_attach_segment_constructor(sbi);
 		if (err)
-			goto rw_remount_failed;
+			goto restore_opts;
 
 		down_write(&nilfs->ns_sem);
 		nilfs_setup_super(sbi);
-		nilfs->ns_current = sbi;
 		up_write(&nilfs->ns_sem);
 
-		up(&sb->s_bdev->bd_mount_sem);
+		nilfs->ns_current = sbi;
 	}
  out:
+	up_write(&nilfs->ns_super_sem);
 	unlock_kernel();
 	return 0;
 
- rw_remount_failed:
-	up(&sb->s_bdev->bd_mount_sem);
  restore_opts:
 	sb->s_flags = old_sb_flags;
 	sbi->s_mount_opt = old_opts.mount_opt;
 	sbi->s_snapshot_cno = old_opts.snapshot_cno;
+	up_write(&nilfs->ns_super_sem);
 	unlock_kernel();
 	return err;
 }
@@ -1118,15 +1114,15 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		 * (i.e. rw-mount or ro-mount), whereas rw-mount and
 		 * ro-mount are mutually exclusive.
 		 */
-		down_read(&nilfs->ns_sem);
+		down_read(&nilfs->ns_super_sem);
 		if (nilfs->ns_current &&
 		    ((nilfs->ns_current->s_super->s_flags ^ flags)
 		     & MS_RDONLY)) {
-			up_read(&nilfs->ns_sem);
+			up_read(&nilfs->ns_super_sem);
 			err = -EBUSY;
 			goto failed_unlock;
 		}
-		up_read(&nilfs->ns_sem);
+		up_read(&nilfs->ns_super_sem);
 	}
 
 	/*
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 221953bfc8594..06e8dfd538d6f 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -72,6 +72,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_writer_refcount, -1);
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
+	init_rwsem(&nilfs->ns_super_sem);
 	mutex_init(&nilfs->ns_writer_mutex);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
@@ -681,10 +682,10 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
 {
 	struct nilfs_sb_info *sbi;
 
-	down_read(&nilfs->ns_sem);
+	down_read(&nilfs->ns_super_sem);
 	/*
 	 * The SNAPSHOT flag and sb->s_flags are supposed to be
-	 * protected with nilfs->ns_sem.
+	 * protected with nilfs->ns_super_sem.
 	 */
 	sbi = nilfs->ns_current;
 	if (rw_mount) {
@@ -705,12 +706,12 @@ struct nilfs_sb_info *nilfs_find_sbinfo(struct the_nilfs *nilfs,
 			goto found; /* snapshot mount */
 	}
  out:
-	up_read(&nilfs->ns_sem);
+	up_read(&nilfs->ns_super_sem);
 	return NULL;
 
  found:
 	atomic_inc(&sbi->s_count);
-	up_read(&nilfs->ns_sem);
+	up_read(&nilfs->ns_super_sem);
 	return sbi;
 }
 
@@ -720,7 +721,7 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 	struct nilfs_sb_info *sbi;
 	int ret = 0;
 
-	down_read(&nilfs->ns_sem);
+	down_read(&nilfs->ns_super_sem);
 	if (cno == 0 || cno > nilfs->ns_cno)
 		goto out_unlock;
 
@@ -737,6 +738,6 @@ int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
 		ret++;
 
  out_unlock:
-	up_read(&nilfs->ns_sem);
+	up_read(&nilfs->ns_super_sem);
 	return ret;
 }
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index be4c040fd6299..d0cf4fb7c9cee 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -48,6 +48,7 @@ enum {
  * @ns_bdi: backing dev info
  * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
+ * @ns_super_sem: semaphore for global operations across super block instances
  * @ns_writer_mutex: mutex protecting ns_writer attach/detach
  * @ns_writer_refcount: number of referrers on ns_writer
  * @ns_current: back pointer to current mount
@@ -96,10 +97,15 @@ struct the_nilfs {
 	struct backing_dev_info *ns_bdi;
 	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
+	struct rw_semaphore	ns_super_sem;
 	struct mutex		ns_writer_mutex;
 	atomic_t		ns_writer_refcount;
 
+	/*
+	 * components protected by ns_super_sem
+	 */
 	struct nilfs_sb_info   *ns_current;
+	struct list_head	ns_supers;
 
 	/*
 	 * used for
@@ -113,7 +119,6 @@ struct the_nilfs {
 	time_t			ns_sbwtime[2];
 	unsigned		ns_sbsize;
 	unsigned		ns_mount_state;
-	struct list_head	ns_supers;
 
 	/*
 	 * Following fields are dedicated to a writable FS-instance.
-- 
GitLab


From aa7dfb8954ccf49e026ba13d12991a4eb7defb96 Mon Sep 17 00:00:00 2001
From: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Date: Mon, 8 Jun 2009 01:39:33 +0900
Subject: [PATCH 2194/2198] nilfs2: get rid of bd_mount_sem use from nilfs

This will remove every bd_mount_sem use in nilfs.

The intended exclusion control was replaced by the previous patch
("nilfs2: correct exclusion control in nilfs_remount function") for
nilfs_remount(), and this patch will replace remains with a new mutex
that this inserts in nilfs object.

Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
Cc: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/nilfs2/cpfile.c    |  6 +++---
 fs/nilfs2/super.c     | 12 ++++++------
 fs/nilfs2/the_nilfs.c |  1 +
 fs/nilfs2/the_nilfs.h |  2 ++
 4 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
index 300f1cdfa8620..cadd36b14d070 100644
--- a/fs/nilfs2/cpfile.c
+++ b/fs/nilfs2/cpfile.c
@@ -864,11 +864,11 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 	case NILFS_CHECKPOINT:
 		/*
 		 * Check for protecting existing snapshot mounts:
-		 * bd_mount_sem is used to make this operation atomic and
+		 * ns_mount_mutex is used to make this operation atomic and
 		 * exclusive with a new mount job.  Though it doesn't cover
 		 * umount, it's enough for the purpose.
 		 */
-		down(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_lock(&nilfs->ns_mount_mutex);
 		if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
 			/* Current implementation does not have to protect
 			   plain read-only mounts since they are exclusive
@@ -877,7 +877,7 @@ int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
 			ret = -EBUSY;
 		} else
 			ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
-		up(&nilfs->ns_bdev->bd_mount_sem);
+		mutex_unlock(&nilfs->ns_mount_mutex);
 		return ret;
 	case NILFS_SNAPSHOT:
 		return nilfs_cpfile_set_snapshot(cpfile, cno);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index f02762fa8ea00..1777a3467bd29 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -764,7 +764,7 @@ int nilfs_store_magic_and_option(struct super_block *sb,
  * @silent: silent mode flag
  * @nilfs: the_nilfs struct
  *
- * This function is called exclusively by bd_mount_mutex.
+ * This function is called exclusively by nilfs->ns_mount_mutex.
  * So, the recovery process is protected from other simultaneous mounts.
  */
 static int
@@ -1105,7 +1105,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		goto failed;
 	}
 
-	down(&sd.bdev->bd_mount_sem);
+	mutex_lock(&nilfs->ns_mount_mutex);
 
 	if (!sd.cno) {
 		/*
@@ -1164,7 +1164,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 		need_to_close = 0;
 	}
 
-	up(&sd.bdev->bd_mount_sem);
+	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	if (need_to_close)
 		close_bdev_exclusive(sd.bdev, flags);
@@ -1172,7 +1172,7 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 	return 0;
 
  failed_unlock:
-	up(&sd.bdev->bd_mount_sem);
+	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
  failed:
 	close_bdev_exclusive(sd.bdev, flags);
@@ -1181,14 +1181,14 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags,
 
  cancel_new:
 	/* Abandoning the newly allocated superblock */
-	up(&sd.bdev->bd_mount_sem);
+	mutex_unlock(&nilfs->ns_mount_mutex);
 	put_nilfs(nilfs);
 	up_write(&s->s_umount);
 	deactivate_super(s);
 	/*
 	 * deactivate_super() invokes close_bdev_exclusive().
 	 * We must finish all post-cleaning before this call;
-	 * put_nilfs() and unlocking bd_mount_sem need the block device.
+	 * put_nilfs() needs the block device.
 	 */
 	return err;
 }
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 06e8dfd538d6f..e4e5c78bcc93f 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -73,6 +73,7 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev)
 	atomic_set(&nilfs->ns_ndirtyblks, 0);
 	init_rwsem(&nilfs->ns_sem);
 	init_rwsem(&nilfs->ns_super_sem);
+	mutex_init(&nilfs->ns_mount_mutex);
 	mutex_init(&nilfs->ns_writer_mutex);
 	INIT_LIST_HEAD(&nilfs->ns_list);
 	INIT_LIST_HEAD(&nilfs->ns_supers);
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
index d0cf4fb7c9cee..e8adbffc626f0 100644
--- a/fs/nilfs2/the_nilfs.h
+++ b/fs/nilfs2/the_nilfs.h
@@ -49,6 +49,7 @@ enum {
  * @ns_writer: back pointer to writable nilfs_sb_info
  * @ns_sem: semaphore for shared states
  * @ns_super_sem: semaphore for global operations across super block instances
+ * @ns_mount_mutex: mutex protecting mount process of nilfs
  * @ns_writer_mutex: mutex protecting ns_writer attach/detach
  * @ns_writer_refcount: number of referrers on ns_writer
  * @ns_current: back pointer to current mount
@@ -98,6 +99,7 @@ struct the_nilfs {
 	struct nilfs_sb_info   *ns_writer;
 	struct rw_semaphore	ns_sem;
 	struct rw_semaphore	ns_super_sem;
+	struct mutex		ns_mount_mutex;
 	struct mutex		ns_writer_mutex;
 	atomic_t		ns_writer_refcount;
 
-- 
GitLab


From e14112d1bd5e193166b54be19119cf6440470560 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell <sfr@canb.auug.org.au>
Date: Fri, 12 Jun 2009 10:14:22 +1000
Subject: [PATCH 2195/2198] perfcounters: remove powerpc definitions of
 perf_counter_do_pending

Commit 925d519ab82b6dd7aca9420d809ee83819c08db2 ("perf_counter:
unify and fix delayed counter wakeup") added global definitions.

Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Acked-by: Paul Mackerras <paulus@samba.org>
Acked-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/include/asm/hw_irq.h | 3 ---
 arch/powerpc/kernel/irq.c         | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index 20a44d0c9fdde..53512374e1c9d 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -156,8 +156,6 @@ static inline void clear_perf_counter_pending(void)
 		"i" (offsetof(struct paca_struct, perf_counter_pending)));
 }
 
-extern void perf_counter_do_pending(void);
-
 #else
 
 static inline unsigned long test_perf_counter_pending(void)
@@ -167,7 +165,6 @@ static inline unsigned long test_perf_counter_pending(void)
 
 static inline void set_perf_counter_pending(void) {}
 static inline void clear_perf_counter_pending(void) {}
-static inline void perf_counter_do_pending(void) {}
 #endif /* CONFIG_PERF_COUNTERS */
 
 #endif	/* __KERNEL__ */
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index feff792ed0f9f..844d3f882a150 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -53,6 +53,7 @@
 #include <linux/bootmem.h>
 #include <linux/pci.h>
 #include <linux/debugfs.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
-- 
GitLab


From 12274e96b42884f043dfaa87eb1e5e10281bfac3 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 11 Jun 2009 15:07:48 -0700
Subject: [PATCH 2196/2198] x86: use zalloc_cpumask_var in arch_early_irq_init

So we make sure MAXSMP gets a cleared cpumask

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/apic/io_apic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 94605e7f6a541..ef8d9290c7ea7 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -187,8 +187,8 @@ int __init arch_early_irq_init(void)
 	for (i = 0; i < count; i++) {
 		desc = irq_to_desc(i);
 		desc->chip_data = &cfg[i];
-		alloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
-		alloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
+		zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
+		zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
 		if (i < NR_IRQS_LEGACY)
 			cpumask_setall(cfg[i].domain);
 	}
-- 
GitLab


From aee74f3bb3f4fb5dbeae8c1947c6d8ebdc19ee01 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Thu, 11 Jun 2009 15:09:00 -0700
Subject: [PATCH 2197/2198] kvm: remove the duplicated cpumask_clear

zalloc_cpumask_var already cleared it.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 virt/kvm/kvm_main.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index e21194566b714..764554350ed86 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2604,7 +2604,6 @@ int kvm_init(void *opaque, unsigned int vcpu_size,
 		r = -ENOMEM;
 		goto out_free_0;
 	}
-	cpumask_clear(cpus_hardware_enabled);
 
 	r = kvm_arch_hardware_setup();
 	if (r < 0)
-- 
GitLab


From 8ebf975608aaebd7feb33d77f07ba21a6380e086 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 11 Jun 2009 20:00:41 -0700
Subject: [PATCH 2198/2198] block: fix kernel-doc in recent block/ changes

Fix kernel-doc warnings in recently changed block/ source code.

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 block/blk-core.c     | 21 +++++++++++----------
 block/blk-settings.c |  6 +++---
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index d17d71c71d4f1..f6452f692501a 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -884,9 +884,10 @@ EXPORT_SYMBOL(blk_get_request);
 
 /**
  * blk_make_request - given a bio, allocate a corresponding struct request.
- *
+ * @q: target request queue
  * @bio:  The bio describing the memory mappings that will be submitted for IO.
  *        It may be a chained-bio properly constructed by block/bio layer.
+ * @gfp_mask: gfp flags to be used for memory allocation
  *
  * blk_make_request is the parallel of generic_make_request for BLOCK_PC
  * type commands. Where the struct request needs to be farther initialized by
@@ -1872,14 +1873,14 @@ EXPORT_SYMBOL(blk_fetch_request);
 
 /**
  * blk_update_request - Special helper function for request stacking drivers
- * @rq:	      the request being processed
+ * @req:      the request being processed
  * @error:    %0 for success, < %0 for error
- * @nr_bytes: number of bytes to complete @rq
+ * @nr_bytes: number of bytes to complete @req
  *
  * Description:
- *     Ends I/O on a number of bytes attached to @rq, but doesn't complete
- *     the request structure even if @rq doesn't have leftover.
- *     If @rq has leftover, sets it up for the next range of segments.
+ *     Ends I/O on a number of bytes attached to @req, but doesn't complete
+ *     the request structure even if @req doesn't have leftover.
+ *     If @req has leftover, sets it up for the next range of segments.
  *
  *     This special helper function is only for request stacking drivers
  *     (e.g. request-based dm) so that they can handle partial completion.
@@ -2145,7 +2146,7 @@ EXPORT_SYMBOL_GPL(blk_end_request);
 /**
  * blk_end_request_all - Helper function for drives to finish the request.
  * @rq: the request to finish
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Completely finish @rq.
@@ -2166,7 +2167,7 @@ EXPORT_SYMBOL_GPL(blk_end_request_all);
 /**
  * blk_end_request_cur - Helper function to finish the current request chunk.
  * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.
@@ -2203,7 +2204,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request);
 /**
  * __blk_end_request_all - Helper function for drives to finish the request.
  * @rq: the request to finish
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Completely finish @rq.  Must be called with queue lock held.
@@ -2224,7 +2225,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request_all);
 /**
  * __blk_end_request_cur - Helper function to finish the current request chunk.
  * @rq: the request to finish the current chunk for
- * @err: %0 for success, < %0 for error
+ * @error: %0 for success, < %0 for error
  *
  * Description:
  *     Complete the current consecutively mapped chunk from @rq.  Must
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 1c4df9bf68136..d71cedc09c4e2 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -343,7 +343,7 @@ EXPORT_SYMBOL(blk_queue_physical_block_size);
 /**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q:	the request queue for the device
- * @alignment:	alignment offset in bytes
+ * @offset: alignment offset in bytes
  *
  * Description:
  *   Some devices are naturally misaligned to compensate for things like
@@ -362,7 +362,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
 /**
  * blk_queue_io_min - set minimum request size for the queue
  * @q:	the request queue for the device
- * @io_min:  smallest I/O size in bytes
+ * @min:  smallest I/O size in bytes
  *
  * Description:
  *   Some devices have an internal block size bigger than the reported
@@ -385,7 +385,7 @@ EXPORT_SYMBOL(blk_queue_io_min);
 /**
  * blk_queue_io_opt - set optimal request size for the queue
  * @q:	the request queue for the device
- * @io_opt:  optimal request size in bytes
+ * @opt:  optimal request size in bytes
  *
  * Description:
  *   Drivers can call this function to set the preferred I/O request
-- 
GitLab